site_classifier 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/site_classifier.rb +0 -4
- data/lib/site_classifier/extractor.rb +37 -51
- data/lib/site_classifier/version.rb +1 -1
- data/site_classifier.gemspec +4 -4
- metadata +18 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 425b9df31050d7e9a4f2e957a4e7bcf85668c4a2
|
4
|
+
data.tar.gz: 79757d84732bcfe3ca38f968af8587d6d5af47e0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: df676399410f5aae1a2b7509a8eea697c033b127705ef146f86575b21ae69256b4f02c3949146e13e2312c689a407c0652ffad6ae268dc239f4d522adcb55440
|
7
|
+
data.tar.gz: df848b49799f73aa15124e71da2e33c2e7b4dd1bfec5380ab5bb65b92161956578701fbf51e46ccc6d3279a8120f0e6157ed1244cd9690364752c4f8eeaebd79
|
data/lib/site_classifier.rb
CHANGED
@@ -30,30 +30,29 @@ module SiteClassifier
|
|
30
30
|
# Extract most significant tags
|
31
31
|
def most_significant
|
32
32
|
most_sig = []
|
33
|
-
|
34
|
-
#
|
35
|
-
# description.
|
36
|
-
#
|
37
|
-
#
|
33
|
+
# if !description.nil?
|
34
|
+
# if tags.any?
|
35
|
+
# most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
|
36
|
+
# else
|
37
|
+
# most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
|
38
|
+
# end
|
38
39
|
# end
|
39
40
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
self.word_frequency.values.each do |counter|
|
45
|
-
sum += counter
|
46
|
-
end
|
41
|
+
description.to_s.split.each do |word|
|
42
|
+
self.word_frequency[word] ||= 0
|
43
|
+
self.word_frequency[word] += 1
|
44
|
+
end
|
47
45
|
|
48
|
-
|
46
|
+
if most_sig.empty?
|
47
|
+
most_sig = self.word_frequency.reject {|k,v| v < 3}.keys
|
48
|
+
most_sig.flatten!
|
49
|
+
end
|
49
50
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
if most_sig.empty?
|
56
|
-
most_sig = self.word_frequency.reject {|k,v| v < (avg * 0.5).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
|
51
|
+
if description && tags.any?
|
52
|
+
tags.each do |tag|
|
53
|
+
if description.include?(tag)
|
54
|
+
most_sig << tag.singularize
|
55
|
+
end
|
57
56
|
end
|
58
57
|
end
|
59
58
|
|
@@ -90,8 +89,7 @@ module SiteClassifier
|
|
90
89
|
|
91
90
|
debug("getting #{url}")
|
92
91
|
html = Nokogiri::HTML(self.get(url).parsed_response)
|
93
|
-
|
94
|
-
html.search("//script").remove
|
92
|
+
|
95
93
|
tags = []
|
96
94
|
description = nil
|
97
95
|
word_hash = {}
|
@@ -146,37 +144,25 @@ module SiteClassifier
|
|
146
144
|
end
|
147
145
|
end
|
148
146
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
all_text
|
156
|
-
|
157
|
-
|
158
|
-
|
147
|
+
if tags.empty?
|
148
|
+
debug("no tags, parsing body")
|
149
|
+
word_hash = Hash.new(0)
|
150
|
+
all_text = []
|
151
|
+
# all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
|
152
|
+
# debug("p's extracts - #{all_text.inspect}")
|
153
|
+
if all_text.empty?
|
154
|
+
all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
|
155
|
+
debug("divs extracts - #{all_text.inspect}")
|
156
|
+
end
|
157
|
+
all_text += description.to_s.split
|
159
158
|
|
160
|
-
|
161
|
-
|
159
|
+
all_text.flatten.each do |word|
|
160
|
+
word_hash[word] += 1
|
161
|
+
end
|
162
|
+
debug("final word hash - #{word_hash.inspect}")
|
163
|
+
word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
|
164
|
+
|
162
165
|
end
|
163
|
-
debug("final word hash - #{word_hash.inspect}")
|
164
|
-
word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
|
165
|
-
|
166
|
-
# max_score = word_hash.values.max.to_i
|
167
|
-
# dmoz = HTTParty.get("http://www.dmoz.org/search", query: {q: url})
|
168
|
-
# dmoz_res = Nokogiri::HTML(dmoz.parsed_response)
|
169
|
-
|
170
|
-
# dmoz_classification = dmoz_res.search('//*[@id="bd-cross"]//ol[1]//li//strong').collect(&:text)
|
171
|
-
# if dmoz_classification.any?
|
172
|
-
# dmoz_classification.each do |dmoz_class|
|
173
|
-
# dmoz_class.split(": ").reject {|v| ["World"].include?(v)}.each do |tag|
|
174
|
-
# tag.split(" ").each do |plain_tag|
|
175
|
-
# word_hash[plain_tag] = max_score + 5
|
176
|
-
# end
|
177
|
-
# end
|
178
|
-
# end
|
179
|
-
# end
|
180
166
|
self.new(url, tags, word_hash, description, page_lang)
|
181
167
|
end
|
182
168
|
end
|
data/site_classifier.gemspec
CHANGED
@@ -21,8 +21,8 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.add_development_dependency "bundler", "~> 1.3"
|
22
22
|
spec.add_development_dependency "rake"
|
23
23
|
spec.add_development_dependency "rspec"
|
24
|
-
spec.add_dependency "httparty"
|
25
|
-
spec.add_dependency "nokogiri"
|
26
|
-
spec.add_dependency "easy_translate"
|
27
|
-
spec.add_dependency "activesupport", "4.
|
24
|
+
spec.add_dependency "httparty"
|
25
|
+
spec.add_dependency "nokogiri"
|
26
|
+
spec.add_dependency "easy_translate"
|
27
|
+
spec.add_dependency "activesupport", "4.2.3"
|
28
28
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: site_classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elad Meidar
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-10-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -56,58 +56,58 @@ dependencies:
|
|
56
56
|
name: httparty
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - '
|
59
|
+
- - '>='
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: 0
|
61
|
+
version: '0'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - '
|
66
|
+
- - '>='
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: 0
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - '
|
73
|
+
- - '>='
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: '0'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - '
|
80
|
+
- - '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: easy_translate
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - '
|
87
|
+
- - '>='
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0
|
89
|
+
version: '0'
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - '
|
94
|
+
- - '>='
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: 0
|
96
|
+
version: '0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: activesupport
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
101
|
- - '='
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: 4.
|
103
|
+
version: 4.2.3
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - '='
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: 4.
|
110
|
+
version: 4.2.3
|
111
111
|
description: Return a tag list for submitted urls
|
112
112
|
email:
|
113
113
|
- elad@eizesus.com
|
@@ -150,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
150
150
|
version: '0'
|
151
151
|
requirements: []
|
152
152
|
rubyforge_project:
|
153
|
-
rubygems_version: 2.
|
153
|
+
rubygems_version: 2.4.8
|
154
154
|
signing_key:
|
155
155
|
specification_version: 4
|
156
156
|
summary: This gem extracts a list of english tags for a given url
|
@@ -159,3 +159,4 @@ test_files:
|
|
159
159
|
- spec/models/extractor_spec.rb
|
160
160
|
- spec/models/site_classifier_spec.rb
|
161
161
|
- spec/spec_helper.rb
|
162
|
+
has_rdoc:
|