site_classifier 0.0.6 → 0.0.7

This diff shows the differences between the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bdaa0867f7090d45492ae06ae60405f6cbb4af14
4
- data.tar.gz: fb2a4af89c55d876d0a0786b2244ce64bdd06c09
3
+ metadata.gz: 425b9df31050d7e9a4f2e957a4e7bcf85668c4a2
4
+ data.tar.gz: 79757d84732bcfe3ca38f968af8587d6d5af47e0
5
5
  SHA512:
6
- metadata.gz: f3b5313c03e1b98496bb8e06ca886e8156afec5c4138090e4a4a43586b83b1e47db4f040e805d865ec05f5d3dbde2c1aef02410ee6d7f621216cfb7c355b2fb7
7
- data.tar.gz: 38e2777f8386bdc950996d9d6dac82a394e5e2d7d2b9cb443719ce9d4b788d95aa24460dd5d26aba0f4e413406e8936a2013fdfc5027a394e17bef45a23541c7
6
+ metadata.gz: df676399410f5aae1a2b7509a8eea697c033b127705ef146f86575b21ae69256b4f02c3949146e13e2312c689a407c0652ffad6ae268dc239f4d522adcb55440
7
+ data.tar.gz: df848b49799f73aa15124e71da2e33c2e7b4dd1bfec5380ab5bb65b92161956578701fbf51e46ccc6d3279a8120f0e6157ed1244cd9690364752c4f8eeaebd79
@@ -22,8 +22,4 @@ module SiteClassifier
22
22
  def self.configuration
23
23
  @setup ||= SiteClassifier::Configuration.new
24
24
  end
25
-
26
- def self.extract!(url)
27
- SiteClassifier::Extractor.parse_site(url)
28
- end
29
25
  end
@@ -30,30 +30,29 @@ module SiteClassifier
30
30
  # Extract most significant tags
31
31
  def most_significant
32
32
  most_sig = []
33
-
34
- # TODO: replace with NLP.
35
- # description.to_s.split.each do |word|
36
- # self.word_frequency[word] ||= 0
37
- # self.word_frequency[word] += 1
33
+ # if !description.nil?
34
+ # if tags.any?
35
+ # most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
36
+ # else
37
+ # most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
38
+ # end
38
39
  # end
39
40
 
40
- if most_sig.empty?
41
- total = self.word_frequency.values.count
42
-
43
- sum = 0
44
- self.word_frequency.values.each do |counter|
45
- sum += counter
46
- end
41
+ description.to_s.split.each do |word|
42
+ self.word_frequency[word] ||= 0
43
+ self.word_frequency[word] += 1
44
+ end
47
45
 
48
- avg = (sum.to_f / total.to_f).floor
46
+ if most_sig.empty?
47
+ most_sig = self.word_frequency.reject {|k,v| v < 3}.keys
48
+ most_sig.flatten!
49
+ end
49
50
 
50
- self.tags.each do |tag|
51
- self.word_frequency[tag] ||= 0
52
- self.word_frequency[tag] += (avg * 2.0)
53
- end
54
- most_sig = self.word_frequency.reject {|k,v| v < (avg * 2.0).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
55
- if most_sig.empty?
56
- most_sig = self.word_frequency.reject {|k,v| v < (avg * 0.5).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
51
+ if description && tags.any?
52
+ tags.each do |tag|
53
+ if description.include?(tag)
54
+ most_sig << tag.singularize
55
+ end
57
56
  end
58
57
  end
59
58
 
@@ -90,8 +89,7 @@ module SiteClassifier
90
89
 
91
90
  debug("getting #{url}")
92
91
  html = Nokogiri::HTML(self.get(url).parsed_response)
93
- html.search("//style").remove
94
- html.search("//script").remove
92
+
95
93
  tags = []
96
94
  description = nil
97
95
  word_hash = {}
@@ -146,37 +144,25 @@ module SiteClassifier
146
144
  end
147
145
  end
148
146
 
149
- debug("no tags, parsing body")
150
- word_hash = Hash.new(0)
151
- all_text = []
152
- # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
153
- # debug("p's extracts - #{all_text.inspect}")
154
- if all_text.empty?
155
- all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
156
- debug("divs extracts - #{all_text.inspect}")
157
- end
158
- all_text += description.to_s.split
147
+ if tags.empty?
148
+ debug("no tags, parsing body")
149
+ word_hash = Hash.new(0)
150
+ all_text = []
151
+ # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
152
+ # debug("p's extracts - #{all_text.inspect}")
153
+ if all_text.empty?
154
+ all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
155
+ debug("divs extracts - #{all_text.inspect}")
156
+ end
157
+ all_text += description.to_s.split
159
158
 
160
- all_text.flatten.each do |word|
161
- word_hash[word] += 1
159
+ all_text.flatten.each do |word|
160
+ word_hash[word] += 1
161
+ end
162
+ debug("final word hash - #{word_hash.inspect}")
163
+ word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
164
+
162
165
  end
163
- debug("final word hash - #{word_hash.inspect}")
164
- word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
165
-
166
- # max_score = word_hash.values.max.to_i
167
- # dmoz = HTTParty.get("http://www.dmoz.org/search", query: {q: url})
168
- # dmoz_res = Nokogiri::HTML(dmoz.parsed_response)
169
-
170
- # dmoz_classification = dmoz_res.search('//*[@id="bd-cross"]//ol[1]//li//strong').collect(&:text)
171
- # if dmoz_classification.any?
172
- # dmoz_classification.each do |dmoz_class|
173
- # dmoz_class.split(": ").reject {|v| ["World"].include?(v)}.each do |tag|
174
- # tag.split(" ").each do |plain_tag|
175
- # word_hash[plain_tag] = max_score + 5
176
- # end
177
- # end
178
- # end
179
- # end
180
166
  self.new(url, tags, word_hash, description, page_lang)
181
167
  end
182
168
  end
@@ -1,3 +1,3 @@
1
1
  module SiteClassifier
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
@@ -21,8 +21,8 @@ Gem::Specification.new do |spec|
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
23
  spec.add_development_dependency "rspec"
24
- spec.add_dependency "httparty", "0.11.0"
25
- spec.add_dependency "nokogiri", "1.6.0"
26
- spec.add_dependency "easy_translate", "0.3.3"
27
- spec.add_dependency "activesupport", "4.0.0"
24
+ spec.add_dependency "httparty"
25
+ spec.add_dependency "nokogiri"
26
+ spec.add_dependency "easy_translate"
27
+ spec.add_dependency "activesupport", "4.2.3"
28
28
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elad Meidar
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-02 00:00:00.000000000 Z
11
+ date: 2016-10-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -56,58 +56,58 @@ dependencies:
56
56
  name: httparty
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '='
59
+ - - '>='
60
60
  - !ruby/object:Gem::Version
61
- version: 0.11.0
61
+ version: '0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - '='
66
+ - - '>='
67
67
  - !ruby/object:Gem::Version
68
- version: 0.11.0
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: nokogiri
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '='
73
+ - - '>='
74
74
  - !ruby/object:Gem::Version
75
- version: 1.6.0
75
+ version: '0'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '='
80
+ - - '>='
81
81
  - !ruby/object:Gem::Version
82
- version: 1.6.0
82
+ version: '0'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: easy_translate
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - '='
87
+ - - '>='
88
88
  - !ruby/object:Gem::Version
89
- version: 0.3.3
89
+ version: '0'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - '='
94
+ - - '>='
95
95
  - !ruby/object:Gem::Version
96
- version: 0.3.3
96
+ version: '0'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: activesupport
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - '='
102
102
  - !ruby/object:Gem::Version
103
- version: 4.0.0
103
+ version: 4.2.3
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - '='
109
109
  - !ruby/object:Gem::Version
110
- version: 4.0.0
110
+ version: 4.2.3
111
111
  description: Return a tag list for submitted urls
112
112
  email:
113
113
  - elad@eizesus.com
@@ -150,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
150
150
  version: '0'
151
151
  requirements: []
152
152
  rubyforge_project:
153
- rubygems_version: 2.0.6
153
+ rubygems_version: 2.4.8
154
154
  signing_key:
155
155
  specification_version: 4
156
156
  summary: This gem extracts a list of english tags for a given url
@@ -159,3 +159,4 @@ test_files:
159
159
  - spec/models/extractor_spec.rb
160
160
  - spec/models/site_classifier_spec.rb
161
161
  - spec/spec_helper.rb
162
+ has_rdoc: