site_classifier 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1a6185e3f8b41f26e38ba1787d9a73ec864960ae
4
- data.tar.gz: d53cf863639a4841920e85ed1d68c1e1a6367c2e
3
+ metadata.gz: bdaa0867f7090d45492ae06ae60405f6cbb4af14
4
+ data.tar.gz: fb2a4af89c55d876d0a0786b2244ce64bdd06c09
5
5
  SHA512:
6
- metadata.gz: 5b1e6a8e583c46f23dfdf35c2ce8448ca7539154775acd905a9cf2b212e3a6136b485c6b806ba2dae57ab8727e546b31c82f7cd2e7c879c8d23e002e8ba4f90c
7
- data.tar.gz: e7ff3a1003a2baecf41486efcab6768c3ef5ee41fc24a558bdbdaadf65a46da6bbe31f121235ab1851934831937130f1e3d3b09f3fa4ea1f4d2c1ec91e55ee9e
6
+ metadata.gz: f3b5313c03e1b98496bb8e06ca886e8156afec5c4138090e4a4a43586b83b1e47db4f040e805d865ec05f5d3dbde2c1aef02410ee6d7f621216cfb7c355b2fb7
7
+ data.tar.gz: 38e2777f8386bdc950996d9d6dac82a394e5e2d7d2b9cb443719ce9d4b788d95aa24460dd5d26aba0f4e413406e8936a2013fdfc5027a394e17bef45a23541c7
@@ -22,4 +22,8 @@ module SiteClassifier
22
22
  def self.configuration
23
23
  @setup ||= SiteClassifier::Configuration.new
24
24
  end
25
+
26
+ def self.extract!(url)
27
+ SiteClassifier::Extractor.parse_site(url)
28
+ end
25
29
  end
@@ -30,29 +30,30 @@ module SiteClassifier
30
30
  # Extract most significant tags
31
31
  def most_significant
32
32
  most_sig = []
33
- # if !description.nil?
34
- # if tags.any?
35
- # most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
36
- # else
37
- # most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
38
- # end
39
- # end
40
33
 
41
- description.to_s.split.each do |word|
42
- self.word_frequency[word] ||= 0
43
- self.word_frequency[word] += 1
44
- end
34
+ # TODO: replace with NLP.
35
+ # description.to_s.split.each do |word|
36
+ # self.word_frequency[word] ||= 0
37
+ # self.word_frequency[word] += 1
38
+ # end
45
39
 
46
40
  if most_sig.empty?
47
- most_sig = self.word_frequency.reject {|k,v| v < 3}.keys
48
- most_sig.flatten!
49
- end
41
+ total = self.word_frequency.values.count
42
+
43
+ sum = 0
44
+ self.word_frequency.values.each do |counter|
45
+ sum += counter
46
+ end
50
47
 
51
- if description && tags.any?
52
- tags.each do |tag|
53
- if description.include?(tag)
54
- most_sig << tag.singularize
55
- end
48
+ avg = (sum.to_f / total.to_f).floor
49
+
50
+ self.tags.each do |tag|
51
+ self.word_frequency[tag] ||= 0
52
+ self.word_frequency[tag] += (avg * 2.0)
53
+ end
54
+ most_sig = self.word_frequency.reject {|k,v| v < (avg * 2.0).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
55
+ if most_sig.empty?
56
+ most_sig = self.word_frequency.reject {|k,v| v < (avg * 0.5).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
56
57
  end
57
58
  end
58
59
 
@@ -89,7 +90,8 @@ module SiteClassifier
89
90
 
90
91
  debug("getting #{url}")
91
92
  html = Nokogiri::HTML(self.get(url).parsed_response)
92
-
93
+ html.search("//style").remove
94
+ html.search("//script").remove
93
95
  tags = []
94
96
  description = nil
95
97
  word_hash = {}
@@ -144,25 +146,37 @@ module SiteClassifier
144
146
  end
145
147
  end
146
148
 
147
- if tags.empty?
148
- debug("no tags, parsing body")
149
- word_hash = Hash.new(0)
150
- all_text = []
151
- # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
152
- # debug("p's extracts - #{all_text.inspect}")
153
- if all_text.empty?
154
- all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
155
- debug("divs extracts - #{all_text.inspect}")
156
- end
157
- all_text += description.to_s.split
149
+ debug("no tags, parsing body")
150
+ word_hash = Hash.new(0)
151
+ all_text = []
152
+ # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
153
+ # debug("p's extracts - #{all_text.inspect}")
154
+ if all_text.empty?
155
+ all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
156
+ debug("divs extracts - #{all_text.inspect}")
157
+ end
158
+ all_text += description.to_s.split
158
159
 
159
- all_text.flatten.each do |word|
160
- word_hash[word] += 1
161
- end
162
- debug("final word hash - #{word_hash.inspect}")
163
- word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
164
-
160
+ all_text.flatten.each do |word|
161
+ word_hash[word] += 1
165
162
  end
163
+ debug("final word hash - #{word_hash.inspect}")
164
+ word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
165
+
166
+ # max_score = word_hash.values.max.to_i
167
+ # dmoz = HTTParty.get("http://www.dmoz.org/search", query: {q: url})
168
+ # dmoz_res = Nokogiri::HTML(dmoz.parsed_response)
169
+
170
+ # dmoz_classification = dmoz_res.search('//*[@id="bd-cross"]//ol[1]//li//strong').collect(&:text)
171
+ # if dmoz_classification.any?
172
+ # dmoz_classification.each do |dmoz_class|
173
+ # dmoz_class.split(": ").reject {|v| ["World"].include?(v)}.each do |tag|
174
+ # tag.split(" ").each do |plain_tag|
175
+ # word_hash[plain_tag] = max_score + 5
176
+ # end
177
+ # end
178
+ # end
179
+ # end
166
180
  self.new(url, tags, word_hash, description, page_lang)
167
181
  end
168
182
  end
@@ -1,3 +1,3 @@
1
1
  module SiteClassifier
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elad Meidar
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-01 00:00:00.000000000 Z
11
+ date: 2013-10-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler