site_classifier 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/site_classifier.rb +4 -0
- data/lib/site_classifier/extractor.rb +51 -37
- data/lib/site_classifier/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bdaa0867f7090d45492ae06ae60405f6cbb4af14
+  data.tar.gz: fb2a4af89c55d876d0a0786b2244ce64bdd06c09
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f3b5313c03e1b98496bb8e06ca886e8156afec5c4138090e4a4a43586b83b1e47db4f040e805d865ec05f5d3dbde2c1aef02410ee6d7f621216cfb7c355b2fb7
+  data.tar.gz: 38e2777f8386bdc950996d9d6dac82a394e5e2d7d2b9cb443719ce9d4b788d95aa24460dd5d26aba0f4e413406e8936a2013fdfc5027a394e17bef45a23541c7
data/lib/site_classifier/extractor.rb
CHANGED
@@ -30,29 +30,30 @@ module SiteClassifier
     # Extract most significant tags
     def most_significant
       most_sig = []
-      # if !description.nil?
-      #   if tags.any?
-      #     most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
-      #   else
-      #     most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
-      #   end
-      # end
 
-
-
-
-
+      # TODO: replace with NLP.
+      # description.to_s.split.each do |word|
+      #   self.word_frequency[word] ||= 0
+      #   self.word_frequency[word] += 1
+      # end
 
       if most_sig.empty?
-
-
-
+        total = self.word_frequency.values.count
+
+        sum = 0
+        self.word_frequency.values.each do |counter|
+          sum += counter
+        end
 
-
-
-
-
-
+        avg = (sum.to_f / total.to_f).floor
+
+        self.tags.each do |tag|
+          self.word_frequency[tag] ||= 0
+          self.word_frequency[tag] += (avg * 2.0)
+        end
+        most_sig = self.word_frequency.reject {|k,v| v < (avg * 2.0).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
+        if most_sig.empty?
+          most_sig = self.word_frequency.reject {|k,v| v < (avg * 0.5).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
         end
       end
 
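The new branch above scores candidate tags by word frequency: it takes the floor of the average count, boosts each known tag by twice that average, keeps words whose score reaches the 2x threshold, and relaxes to a 0.5x threshold when nothing qualifies. A minimal standalone sketch of that selection step, with `word_frequency` and `tags` passed in as plain arguments instead of being read from the instance (an assumption made for illustration):

```ruby
# Illustrative sketch of the 0.0.6 selection logic, not the gem's public API.
# word_frequency: Hash of word => count; tags: Array of known tag strings.
def significant_words(word_frequency, tags)
  freq = word_frequency.dup
  total = freq.values.count
  return [] if total.zero?

  avg = (freq.values.sum.to_f / total).floor

  # Known tags are boosted by twice the average count.
  tags.each { |tag| freq[tag] = freq.fetch(tag, 0) + (avg * 2.0) }

  # Keep words scoring at least 2x the average, highest score first.
  picked = freq.reject { |_word, score| score < (avg * 2.0).floor }
  # Fall back to a looser 0.5x threshold when nothing clears the bar.
  picked = freq.reject { |_word, score| score < (avg * 0.5).floor } if picked.empty?

  picked.sort_by { |_word, score| -score }.map { |word, _score| word.downcase }
end

# significant_words({ "ruby" => 6, "gems" => 12, "the" => 2 }, ["ruby"])
# # => ["ruby", "gems"]
```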
@@ -89,7 +90,8 @@ module SiteClassifier
 
       debug("getting #{url}")
       html = Nokogiri::HTML(self.get(url).parsed_response)
-
+      html.search("//style").remove
+      html.search("//script").remove
       tags = []
       description = nil
       word_hash = {}
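Stripping `style` and `script` nodes before any text is read keeps CSS rules and JavaScript source out of the word counts. A small self-contained illustration of the same Nokogiri calls, using a made-up HTML snippet:

```ruby
require "nokogiri"

html = Nokogiri::HTML(<<~HTML)
  <html>
    <head><style>body { color: red; }</style></head>
    <body>
      <script>var tracker = 1;</script>
      <div>Ruby gems are packaged libraries.</div>
    </body>
  </html>
HTML

# Drop non-content nodes before extracting visible text.
html.search("//style").remove
html.search("//script").remove

puts html.text.split.join(" ")
# prints: Ruby gems are packaged libraries.
```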
@@ -144,25 +146,37 @@ module SiteClassifier
         end
       end
 
-
-
-
-
-
-
-
-
-
-
-      all_text += description.to_s.split
+      debug("no tags, parsing body")
+      word_hash = Hash.new(0)
+      all_text = []
+      # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
+      # debug("p's extracts - #{all_text.inspect}")
+      if all_text.empty?
+        all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
+        debug("divs extracts - #{all_text.inspect}")
+      end
+      all_text += description.to_s.split
 
-
-
-      end
-      debug("final word hash - #{word_hash.inspect}")
-      word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
-
+      all_text.flatten.each do |word|
+        word_hash[word] += 1
       end
+      debug("final word hash - #{word_hash.inspect}")
+      word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
+
+      # max_score = word_hash.values.max.to_i
+      # dmoz = HTTParty.get("http://www.dmoz.org/search", query: {q: url})
+      # dmoz_res = Nokogiri::HTML(dmoz.parsed_response)
+
+      # dmoz_classification = dmoz_res.search('//*[@id="bd-cross"]//ol[1]//li//strong').collect(&:text)
+      # if dmoz_classification.any?
+      #   dmoz_classification.each do |dmoz_class|
+      #     dmoz_class.split(": ").reject {|v| ["World"].include?(v)}.each do |tag|
+      #       tag.split(" ").each do |plain_tag|
+      #         word_hash[plain_tag] = max_score + 5
+      #       end
+      #     end
+      #   end
+      # end
       self.new(url, tags, word_hash, description, page_lang)
     end
   end
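With no usable tags, the extractor now builds its word counts from `div` text (tokens shorter than four characters are dropped), folds in the meta description, and then prunes rare or punctuation-bearing tokens; the DMOZ lookup remains commented out. A condensed sketch of that fallback, where `html` is assumed to be a parsed Nokogiri document and `description` a string:

```ruby
# Sketch of the no-tags fallback; `html` (a Nokogiri::HTML::Document) and
# `description` (a String, possibly empty) are assumed inputs for illustration.
def fallback_word_counts(html, description)
  word_hash = Hash.new(0)

  # Collect words from every <div>, skipping very short tokens.
  all_text = html.search("div")
                 .flat_map { |div| div.text.strip.split }
                 .reject { |word| word.size < 4 }
  all_text += description.to_s.split

  all_text.each { |word| word_hash[word] += 1 }

  # Keep words seen at least three times that contain no obvious punctuation.
  punctuation = [".", "'", "(", ":", "]"]
  word_hash.reject! do |word, count|
    count < 3 || word.size == 1 || punctuation.any? { |mark| word.include?(mark) }
  end
  word_hash
end
```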
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: site_classifier
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Elad Meidar
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-10-
+date: 2013-10-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler