site_classifier 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bdaa0867f7090d45492ae06ae60405f6cbb4af14
4
- data.tar.gz: fb2a4af89c55d876d0a0786b2244ce64bdd06c09
3
+ metadata.gz: 425b9df31050d7e9a4f2e957a4e7bcf85668c4a2
4
+ data.tar.gz: 79757d84732bcfe3ca38f968af8587d6d5af47e0
5
5
  SHA512:
6
- metadata.gz: f3b5313c03e1b98496bb8e06ca886e8156afec5c4138090e4a4a43586b83b1e47db4f040e805d865ec05f5d3dbde2c1aef02410ee6d7f621216cfb7c355b2fb7
7
- data.tar.gz: 38e2777f8386bdc950996d9d6dac82a394e5e2d7d2b9cb443719ce9d4b788d95aa24460dd5d26aba0f4e413406e8936a2013fdfc5027a394e17bef45a23541c7
6
+ metadata.gz: df676399410f5aae1a2b7509a8eea697c033b127705ef146f86575b21ae69256b4f02c3949146e13e2312c689a407c0652ffad6ae268dc239f4d522adcb55440
7
+ data.tar.gz: df848b49799f73aa15124e71da2e33c2e7b4dd1bfec5380ab5bb65b92161956578701fbf51e46ccc6d3279a8120f0e6157ed1244cd9690364752c4f8eeaebd79
@@ -22,8 +22,4 @@ module SiteClassifier
22
22
  def self.configuration
23
23
  @setup ||= SiteClassifier::Configuration.new
24
24
  end
25
-
26
- def self.extract!(url)
27
- SiteClassifier::Extractor.parse_site(url)
28
- end
29
25
  end
@@ -30,30 +30,29 @@ module SiteClassifier
30
30
  # Extract most significant tags
31
31
  def most_significant
32
32
  most_sig = []
33
-
34
- # TODO: replace with NLP.
35
- # description.to_s.split.each do |word|
36
- # self.word_frequency[word] ||= 0
37
- # self.word_frequency[word] += 1
33
+ # if !description.nil?
34
+ # if tags.any?
35
+ # most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
36
+ # else
37
+ # most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
38
+ # end
38
39
  # end
39
40
 
40
- if most_sig.empty?
41
- total = self.word_frequency.values.count
42
-
43
- sum = 0
44
- self.word_frequency.values.each do |counter|
45
- sum += counter
46
- end
41
+ description.to_s.split.each do |word|
42
+ self.word_frequency[word] ||= 0
43
+ self.word_frequency[word] += 1
44
+ end
47
45
 
48
- avg = (sum.to_f / total.to_f).floor
46
+ if most_sig.empty?
47
+ most_sig = self.word_frequency.reject {|k,v| v < 3}.keys
48
+ most_sig.flatten!
49
+ end
49
50
 
50
- self.tags.each do |tag|
51
- self.word_frequency[tag] ||= 0
52
- self.word_frequency[tag] += (avg * 2.0)
53
- end
54
- most_sig = self.word_frequency.reject {|k,v| v < (avg * 2.0).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
55
- if most_sig.empty?
56
- most_sig = self.word_frequency.reject {|k,v| v < (avg * 0.5).floor }.to_a.uniq.sort_by {|_key, v| v}.reverse.collect(&:first).collect(&:downcase)
51
+ if description && tags.any?
52
+ tags.each do |tag|
53
+ if description.include?(tag)
54
+ most_sig << tag.singularize
55
+ end
57
56
  end
58
57
  end
59
58
 
@@ -90,8 +89,7 @@ module SiteClassifier
90
89
 
91
90
  debug("getting #{url}")
92
91
  html = Nokogiri::HTML(self.get(url).parsed_response)
93
- html.search("//style").remove
94
- html.search("//script").remove
92
+
95
93
  tags = []
96
94
  description = nil
97
95
  word_hash = {}
@@ -146,37 +144,25 @@ module SiteClassifier
146
144
  end
147
145
  end
148
146
 
149
- debug("no tags, parsing body")
150
- word_hash = Hash.new(0)
151
- all_text = []
152
- # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
153
- # debug("p's extracts - #{all_text.inspect}")
154
- if all_text.empty?
155
- all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
156
- debug("divs extracts - #{all_text.inspect}")
157
- end
158
- all_text += description.to_s.split
147
+ if tags.empty?
148
+ debug("no tags, parsing body")
149
+ word_hash = Hash.new(0)
150
+ all_text = []
151
+ # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
152
+ # debug("p's extracts - #{all_text.inspect}")
153
+ if all_text.empty?
154
+ all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
155
+ debug("divs extracts - #{all_text.inspect}")
156
+ end
157
+ all_text += description.to_s.split
159
158
 
160
- all_text.flatten.each do |word|
161
- word_hash[word] += 1
159
+ all_text.flatten.each do |word|
160
+ word_hash[word] += 1
161
+ end
162
+ debug("final word hash - #{word_hash.inspect}")
163
+ word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
164
+
162
165
  end
163
- debug("final word hash - #{word_hash.inspect}")
164
- word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
165
-
166
- # max_score = word_hash.values.max.to_i
167
- # dmoz = HTTParty.get("http://www.dmoz.org/search", query: {q: url})
168
- # dmoz_res = Nokogiri::HTML(dmoz.parsed_response)
169
-
170
- # dmoz_classification = dmoz_res.search('//*[@id="bd-cross"]//ol[1]//li//strong').collect(&:text)
171
- # if dmoz_classification.any?
172
- # dmoz_classification.each do |dmoz_class|
173
- # dmoz_class.split(": ").reject {|v| ["World"].include?(v)}.each do |tag|
174
- # tag.split(" ").each do |plain_tag|
175
- # word_hash[plain_tag] = max_score + 5
176
- # end
177
- # end
178
- # end
179
- # end
180
166
  self.new(url, tags, word_hash, description, page_lang)
181
167
  end
182
168
  end
@@ -1,3 +1,3 @@
1
1
  module SiteClassifier
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
@@ -21,8 +21,8 @@ Gem::Specification.new do |spec|
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
23
  spec.add_development_dependency "rspec"
24
- spec.add_dependency "httparty", "0.11.0"
25
- spec.add_dependency "nokogiri", "1.6.0"
26
- spec.add_dependency "easy_translate", "0.3.3"
27
- spec.add_dependency "activesupport", "4.0.0"
24
+ spec.add_dependency "httparty"
25
+ spec.add_dependency "nokogiri"
26
+ spec.add_dependency "easy_translate"
27
+ spec.add_dependency "activesupport", "4.2.3"
28
28
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elad Meidar
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-02 00:00:00.000000000 Z
11
+ date: 2016-10-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -56,58 +56,58 @@ dependencies:
56
56
  name: httparty
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '='
59
+ - - '>='
60
60
  - !ruby/object:Gem::Version
61
- version: 0.11.0
61
+ version: '0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - '='
66
+ - - '>='
67
67
  - !ruby/object:Gem::Version
68
- version: 0.11.0
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: nokogiri
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '='
73
+ - - '>='
74
74
  - !ruby/object:Gem::Version
75
- version: 1.6.0
75
+ version: '0'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '='
80
+ - - '>='
81
81
  - !ruby/object:Gem::Version
82
- version: 1.6.0
82
+ version: '0'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: easy_translate
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - '='
87
+ - - '>='
88
88
  - !ruby/object:Gem::Version
89
- version: 0.3.3
89
+ version: '0'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - '='
94
+ - - '>='
95
95
  - !ruby/object:Gem::Version
96
- version: 0.3.3
96
+ version: '0'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: activesupport
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - '='
102
102
  - !ruby/object:Gem::Version
103
- version: 4.0.0
103
+ version: 4.2.3
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - '='
109
109
  - !ruby/object:Gem::Version
110
- version: 4.0.0
110
+ version: 4.2.3
111
111
  description: Return a tag list for submitted urls
112
112
  email:
113
113
  - elad@eizesus.com
@@ -150,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
150
150
  version: '0'
151
151
  requirements: []
152
152
  rubyforge_project:
153
- rubygems_version: 2.0.6
153
+ rubygems_version: 2.4.8
154
154
  signing_key:
155
155
  specification_version: 4
156
156
  summary: This gem extracts a list of english tags for a given url
@@ -159,3 +159,4 @@ test_files:
159
159
  - spec/models/extractor_spec.rb
160
160
  - spec/models/site_classifier_spec.rb
161
161
  - spec/spec_helper.rb
162
+ has_rdoc: