site_classifier 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/site_classifier.rb +4 -0
- data/lib/site_classifier/extractor.rb +51 -37
- data/lib/site_classifier/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bdaa0867f7090d45492ae06ae60405f6cbb4af14
|
4
|
+
data.tar.gz: fb2a4af89c55d876d0a0786b2244ce64bdd06c09
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3b5313c03e1b98496bb8e06ca886e8156afec5c4138090e4a4a43586b83b1e47db4f040e805d865ec05f5d3dbde2c1aef02410ee6d7f621216cfb7c355b2fb7
|
7
|
+
data.tar.gz: 38e2777f8386bdc950996d9d6dac82a394e5e2d7d2b9cb443719ce9d4b788d95aa24460dd5d26aba0f4e413406e8936a2013fdfc5027a394e17bef45a23541c7
|
data/lib/site_classifier/extractor.rb
CHANGED
@@ -30,29 +30,30 @@ module SiteClassifier
|
|
30
30
|
# Extract most significant tags
|
31
31
|
def most_significant
|
32
32
|
most_sig = []
|
33
|
-
# if !description.nil?
|
34
|
-
# if tags.any?
|
35
|
-
# most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
|
36
|
-
# else
|
37
|
-
# most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
|
38
|
-
# end
|
39
|
-
# end
|
40
33
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
34
|
+
# TODO: replace with NLP.
|
35
|
+
# description.to_s.split.each do |word|
|
36
|
+
# self.word_frequency[word] ||= 0
|
37
|
+
# self.word_frequency[word] += 1
|
38
|
+
# end
|
45
39
|
|
46
40
|
if most_sig.empty?
|
47
|
-
|
48
|
-
|
49
|
-
|
41
|
+
total = self.word_frequency.values.count
|
42
|
+
|
43
|
+
sum = 0
|
44
|
+
self.word_frequency.values.each do |counter|
|
45
|
+
sum += counter
|
46
|
+
end
|
50
47
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
48
|
+
avg = (sum.to_f / total.to_f).floor
|
49
|
+
|
50
|
+
self.tags.each do |tag|
|
51
|
+
self.word_frequency[tag] ||= 0
|
52
|
+
self.word_frequency[tag] += (avg * 2.0)
|
53
|
+
end
|
54
|
+
most_sig = self.word_frequency.reject {|k,v| v < (avg * 2.0).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
|
55
|
+
if most_sig.empty?
|
56
|
+
most_sig = self.word_frequency.reject {|k,v| v < (avg * 0.5).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
|
56
57
|
end
|
57
58
|
end
|
58
59
|
|
@@ -89,7 +90,8 @@ module SiteClassifier
|
|
89
90
|
|
90
91
|
debug("getting #{url}")
|
91
92
|
html = Nokogiri::HTML(self.get(url).parsed_response)
|
92
|
-
|
93
|
+
html.search("//style").remove
|
94
|
+
html.search("//script").remove
|
93
95
|
tags = []
|
94
96
|
description = nil
|
95
97
|
word_hash = {}
|
@@ -144,25 +146,37 @@ module SiteClassifier
|
|
144
146
|
end
|
145
147
|
end
|
146
148
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
all_text += description.to_s.split
|
149
|
+
debug("no tags, parsing body")
|
150
|
+
word_hash = Hash.new(0)
|
151
|
+
all_text = []
|
152
|
+
# all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
|
153
|
+
# debug("p's extracts - #{all_text.inspect}")
|
154
|
+
if all_text.empty?
|
155
|
+
all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
|
156
|
+
debug("divs extracts - #{all_text.inspect}")
|
157
|
+
end
|
158
|
+
all_text += description.to_s.split
|
158
159
|
|
159
|
-
|
160
|
-
|
161
|
-
end
|
162
|
-
debug("final word hash - #{word_hash.inspect}")
|
163
|
-
word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
|
164
|
-
|
160
|
+
all_text.flatten.each do |word|
|
161
|
+
word_hash[word] += 1
|
165
162
|
end
|
163
|
+
debug("final word hash - #{word_hash.inspect}")
|
164
|
+
word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
|
165
|
+
|
166
|
+
# max_score = word_hash.values.max.to_i
|
167
|
+
# dmoz = HTTParty.get("http://www.dmoz.org/search", query: {q: url})
|
168
|
+
# dmoz_res = Nokogiri::HTML(dmoz.parsed_response)
|
169
|
+
|
170
|
+
# dmoz_classification = dmoz_res.search('//*[@id="bd-cross"]//ol[1]//li//strong').collect(&:text)
|
171
|
+
# if dmoz_classification.any?
|
172
|
+
# dmoz_classification.each do |dmoz_class|
|
173
|
+
# dmoz_class.split(": ").reject {|v| ["World"].include?(v)}.each do |tag|
|
174
|
+
# tag.split(" ").each do |plain_tag|
|
175
|
+
# word_hash[plain_tag] = max_score + 5
|
176
|
+
# end
|
177
|
+
# end
|
178
|
+
# end
|
179
|
+
# end
|
166
180
|
self.new(url, tags, word_hash, description, page_lang)
|
167
181
|
end
|
168
182
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: site_classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elad Meidar
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-10-
|
11
|
+
date: 2013-10-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|