site_classifier 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 532c257b74d762bf2bad6c07255e87a5c6e924dc
4
- data.tar.gz: 760547db13f4b9c75db9085b778c276fa184efe3
3
+ metadata.gz: 1a6185e3f8b41f26e38ba1787d9a73ec864960ae
4
+ data.tar.gz: d53cf863639a4841920e85ed1d68c1e1a6367c2e
5
5
  SHA512:
6
- metadata.gz: 760921ace478383eee08c03b06b041066d6b60bea86406380b75dd6379445b4ed58ebdda765ebdda16fbcc3f4dee8ff600b7cbd09309b9a86877ad7b8b61ad5d
7
- data.tar.gz: 3084a269a2416a40c472befdf20dc7f2b11f5a0554f2f7f2f08256c37448a54159bb8e575be5d3dd84daa5cc779d074bcb765ae9719172df5b12a3664e282651
6
+ metadata.gz: 5b1e6a8e583c46f23dfdf35c2ce8448ca7539154775acd905a9cf2b212e3a6136b485c6b806ba2dae57ab8727e546b31c82f7cd2e7c879c8d23e002e8ba4f90c
7
+ data.tar.gz: e7ff3a1003a2baecf41486efcab6768c3ef5ee41fc24a558bdbdaadf65a46da6bbe31f121235ab1851934831937130f1e3d3b09f3fa4ea1f4d2c1ec91e55ee9e
@@ -1,13 +1,18 @@
1
1
  module SiteClassifier
2
2
  class Configuration
3
- attr_accessor :translate, :google_translate_api_key
3
+ attr_accessor :translate, :google_translate_api_key, :debug
4
4
 
5
5
  # Instantiate a new class
6
6
  def initialize(options = {})
7
7
  @translate = options[:translate] || false
8
8
  @google_translate_api_key = options[:google_translate_api_key]
9
+ @debug = options[:debug] || false
9
10
  end
10
11
 
12
+ def debug?
13
+ self.debug == true
14
+ end
15
+
11
16
  # Configure by block
12
17
  def self.configure(&block)
13
18
  new_configuration = SiteClassifier::Configuration.new
@@ -12,6 +12,12 @@ module SiteClassifier
12
12
  @lang = lang.downcase
13
13
  end
14
14
 
15
+ def self.debug(string)
16
+ if SiteClassifier.configuration.debug?
17
+ puts "#{Time.now.to_i} - #{string}"
18
+ end
19
+ end
20
+
15
21
  # Normalize site language
16
22
  def validate_lang
17
23
  if EasyTranslate::LANGUAGES.keys.include?(@lang)
@@ -24,18 +30,34 @@ module SiteClassifier
24
30
  # Extract most significant tags
25
31
  def most_significant
26
32
  most_sig = []
27
- if !description.nil?
28
- if tags.any?
29
- most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
30
- else
31
- most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
32
- end
33
+ # if !description.nil?
34
+ # if tags.any?
35
+ # most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
36
+ # else
37
+ # most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
38
+ # end
39
+ # end
40
+
41
+ description.to_s.split.each do |word|
42
+ self.word_frequency[word] ||= 0
43
+ self.word_frequency[word] += 1
33
44
  end
34
45
 
35
46
  if most_sig.empty?
36
- most_sig = self.word_frequency.keys
47
+ most_sig = self.word_frequency.reject {|k,v| v < 3}.keys
48
+ most_sig.flatten!
37
49
  end
38
50
 
51
+ if description && tags.any?
52
+ tags.each do |tag|
53
+ if description.include?(tag)
54
+ most_sig << tag.singularize
55
+ end
56
+ end
57
+ end
58
+
59
+ most_sig.uniq!
60
+
39
61
  self.validate_lang
40
62
 
41
63
  if SiteClassifier.translate_tags?
@@ -65,6 +87,7 @@ module SiteClassifier
65
87
  def self.parse_site(url = "")
66
88
  return if url == "" || url.nil?
67
89
 
90
+ debug("getting #{url}")
68
91
  html = Nokogiri::HTML(self.get(url).parsed_response)
69
92
 
70
93
  tags = []
@@ -74,30 +97,71 @@ module SiteClassifier
74
97
 
75
98
  begin
76
99
  page_lang = html.search("html").first["lang"].to_s.slice(0..1)
100
+ debug("found lang in html tag - #{page_lang}")
77
101
  rescue
78
102
  end
79
103
 
80
104
  begin
81
105
  page_lang = html.search("html").first["xml:lang"].to_s.slice(0..1)
106
+ debug("found lang in html tag (xml:lang) - #{page_lang}")
82
107
  rescue
83
108
  end
84
109
 
85
110
  begin
86
111
  tags = html.search('meta[name="keywords"]').first["content"].split(",").collect(&:strip).collect(&:downcase)
112
+ debug("Tags - #{tags.inspect}")
113
+ rescue
114
+ debug("no tags found")
115
+ end
116
+
117
+ if tags.empty?
118
+ begin
119
+ tags = html.search('meta[property="keywords"]').first["content"].split(",").collect(&:strip).collect(&:downcase)
120
+ debug("Tags - #{tags.inspect}")
121
+ rescue
122
+ debug("no tags found")
123
+ end
124
+ end
125
+ begin
87
126
  description = html.search('meta[name="description"]').first["content"]
127
+ debug("Decription meta found")
88
128
  rescue
89
129
  end
90
130
 
131
+ if description.nil?
132
+ begin
133
+ description = html.search('meta[property="og:description"]').first["content"]
134
+ debug("Facebook og:description found")
135
+ rescue
136
+ end
137
+ end
138
+
139
+ if description.nil?
140
+ begin
141
+ description = html.search('meta[name="og:description"]').first["content"]
142
+ debug("Facebook og:description found")
143
+ rescue
144
+ end
145
+ end
146
+
91
147
  if tags.empty?
148
+ debug("no tags, parsing body")
92
149
  word_hash = Hash.new(0)
93
- all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
150
+ all_text = []
151
+ # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
152
+ # debug("p's extracts - #{all_text.inspect}")
94
153
  if all_text.empty?
95
154
  all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
155
+ debug("divs extracts - #{all_text.inspect}")
96
156
  end
97
- all_text.each do |word|
157
+ all_text += description.to_s.split
158
+
159
+ all_text.flatten.each do |word|
98
160
  word_hash[word] += 1
99
161
  end
100
- word_hash.reject! {|k,v| v < 2 }
162
+ debug("final word hash - #{word_hash.inspect}")
163
+ word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
164
+
101
165
  end
102
166
  self.new(url, tags, word_hash, description, page_lang)
103
167
  end
@@ -1,3 +1,3 @@
1
1
  module SiteClassifier
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elad Meidar