site_classifier 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1a6185e3f8b41f26e38ba1787d9a73ec864960ae
4
- data.tar.gz: d53cf863639a4841920e85ed1d68c1e1a6367c2e
3
+ metadata.gz: bdaa0867f7090d45492ae06ae60405f6cbb4af14
4
+ data.tar.gz: fb2a4af89c55d876d0a0786b2244ce64bdd06c09
5
5
  SHA512:
6
- metadata.gz: 5b1e6a8e583c46f23dfdf35c2ce8448ca7539154775acd905a9cf2b212e3a6136b485c6b806ba2dae57ab8727e546b31c82f7cd2e7c879c8d23e002e8ba4f90c
7
- data.tar.gz: e7ff3a1003a2baecf41486efcab6768c3ef5ee41fc24a558bdbdaadf65a46da6bbe31f121235ab1851934831937130f1e3d3b09f3fa4ea1f4d2c1ec91e55ee9e
6
+ metadata.gz: f3b5313c03e1b98496bb8e06ca886e8156afec5c4138090e4a4a43586b83b1e47db4f040e805d865ec05f5d3dbde2c1aef02410ee6d7f621216cfb7c355b2fb7
7
+ data.tar.gz: 38e2777f8386bdc950996d9d6dac82a394e5e2d7d2b9cb443719ce9d4b788d95aa24460dd5d26aba0f4e413406e8936a2013fdfc5027a394e17bef45a23541c7
@@ -22,4 +22,8 @@ module SiteClassifier
22
22
  def self.configuration
23
23
  @setup ||= SiteClassifier::Configuration.new
24
24
  end
25
+
26
+ def self.extract!(url)
27
+ SiteClassifier::Extractor.parse_site(url)
28
+ end
25
29
  end
@@ -30,29 +30,30 @@ module SiteClassifier
30
30
  # Extract most significant tags
31
31
  def most_significant
32
32
  most_sig = []
33
- # if !description.nil?
34
- # if tags.any?
35
- # most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
36
- # else
37
- # most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
38
- # end
39
- # end
40
33
 
41
- description.to_s.split.each do |word|
42
- self.word_frequency[word] ||= 0
43
- self.word_frequency[word] += 1
44
- end
34
+ # TODO: replace with NLP.
35
+ # description.to_s.split.each do |word|
36
+ # self.word_frequency[word] ||= 0
37
+ # self.word_frequency[word] += 1
38
+ # end
45
39
 
46
40
  if most_sig.empty?
47
- most_sig = self.word_frequency.reject {|k,v| v < 3}.keys
48
- most_sig.flatten!
49
- end
41
+ total = self.word_frequency.values.count
42
+
43
+ sum = 0
44
+ self.word_frequency.values.each do |counter|
45
+ sum += counter
46
+ end
50
47
 
51
- if description && tags.any?
52
- tags.each do |tag|
53
- if description.include?(tag)
54
- most_sig << tag.singularize
55
- end
48
+ avg = (sum.to_f / total.to_f).floor
49
+
50
+ self.tags.each do |tag|
51
+ self.word_frequency[tag] ||= 0
52
+ self.word_frequency[tag] += (avg * 2.0)
53
+ end
54
+ most_sig = self.word_frequency.reject {|k,v| v < (avg * 2.0).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
55
+ if most_sig.empty?
56
+ most_sig = self.word_frequency.reject {|k,v| v < (avg * 0.5).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
56
57
  end
57
58
  end
58
59
 
@@ -89,7 +90,8 @@ module SiteClassifier
89
90
 
90
91
  debug("getting #{url}")
91
92
  html = Nokogiri::HTML(self.get(url).parsed_response)
92
-
93
+ html.search("//style").remove
94
+ html.search("//script").remove
93
95
  tags = []
94
96
  description = nil
95
97
  word_hash = {}
@@ -144,25 +146,37 @@ module SiteClassifier
144
146
  end
145
147
  end
146
148
 
147
- if tags.empty?
148
- debug("no tags, parsing body")
149
- word_hash = Hash.new(0)
150
- all_text = []
151
- # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
152
- # debug("p's extracts - #{all_text.inspect}")
153
- if all_text.empty?
154
- all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
155
- debug("divs extracts - #{all_text.inspect}")
156
- end
157
- all_text += description.to_s.split
149
+ debug("no tags, parsing body")
150
+ word_hash = Hash.new(0)
151
+ all_text = []
152
+ # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
153
+ # debug("p's extracts - #{all_text.inspect}")
154
+ if all_text.empty?
155
+ all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
156
+ debug("divs extracts - #{all_text.inspect}")
157
+ end
158
+ all_text += description.to_s.split
158
159
 
159
- all_text.flatten.each do |word|
160
- word_hash[word] += 1
161
- end
162
- debug("final word hash - #{word_hash.inspect}")
163
- word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
164
-
160
+ all_text.flatten.each do |word|
161
+ word_hash[word] += 1
165
162
  end
163
+ debug("final word hash - #{word_hash.inspect}")
164
+ word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
165
+
166
+ # max_score = word_hash.values.max.to_i
167
+ # dmoz = HTTParty.get("http://www.dmoz.org/search", query: {q: url})
168
+ # dmoz_res = Nokogiri::HTML(dmoz.parsed_response)
169
+
170
+ # dmoz_classification = dmoz_res.search('//*[@id="bd-cross"]//ol[1]//li//strong').collect(&:text)
171
+ # if dmoz_classification.any?
172
+ # dmoz_classification.each do |dmoz_class|
173
+ # dmoz_class.split(": ").reject {|v| ["World"].include?(v)}.each do |tag|
174
+ # tag.split(" ").each do |plain_tag|
175
+ # word_hash[plain_tag] = max_score + 5
176
+ # end
177
+ # end
178
+ # end
179
+ # end
166
180
  self.new(url, tags, word_hash, description, page_lang)
167
181
  end
168
182
  end
@@ -1,3 +1,3 @@
1
1
  module SiteClassifier
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elad Meidar
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-01 00:00:00.000000000 Z
11
+ date: 2013-10-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler