site_classifier 0.0.4 → 0.0.5

Sign up to get free protection for your applications and access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 532c257b74d762bf2bad6c07255e87a5c6e924dc
4
- data.tar.gz: 760547db13f4b9c75db9085b778c276fa184efe3
3
+ metadata.gz: 1a6185e3f8b41f26e38ba1787d9a73ec864960ae
4
+ data.tar.gz: d53cf863639a4841920e85ed1d68c1e1a6367c2e
5
5
  SHA512:
6
- metadata.gz: 760921ace478383eee08c03b06b041066d6b60bea86406380b75dd6379445b4ed58ebdda765ebdda16fbcc3f4dee8ff600b7cbd09309b9a86877ad7b8b61ad5d
7
- data.tar.gz: 3084a269a2416a40c472befdf20dc7f2b11f5a0554f2f7f2f08256c37448a54159bb8e575be5d3dd84daa5cc779d074bcb765ae9719172df5b12a3664e282651
6
+ metadata.gz: 5b1e6a8e583c46f23dfdf35c2ce8448ca7539154775acd905a9cf2b212e3a6136b485c6b806ba2dae57ab8727e546b31c82f7cd2e7c879c8d23e002e8ba4f90c
7
+ data.tar.gz: e7ff3a1003a2baecf41486efcab6768c3ef5ee41fc24a558bdbdaadf65a46da6bbe31f121235ab1851934831937130f1e3d3b09f3fa4ea1f4d2c1ec91e55ee9e
@@ -1,13 +1,18 @@
1
1
  module SiteClassifier
2
2
  class Configuration
3
- attr_accessor :translate, :google_translate_api_key
3
+ attr_accessor :translate, :google_translate_api_key, :debug
4
4
 
5
5
  # Instantiate a new class
6
6
  def initialize(options = {})
7
7
  @translate = options[:translate] || false
8
8
  @google_translate_api_key = options[:google_translate_api_key]
9
+ @debug = options[:debug] || false
9
10
  end
10
11
 
12
+ def debug?
13
+ self.debug == true
14
+ end
15
+
11
16
  # Configure by block
12
17
  def self.configure(&block)
13
18
  new_configuration = SiteClassifier::Configuration.new
@@ -12,6 +12,12 @@ module SiteClassifier
12
12
  @lang = lang.downcase
13
13
  end
14
14
 
15
+ def self.debug(string)
16
+ if SiteClassifier.configuration.debug?
17
+ puts "#{Time.now.to_i} - #{string}"
18
+ end
19
+ end
20
+
15
21
  # Normalize site language
16
22
  def validate_lang
17
23
  if EasyTranslate::LANGUAGES.keys.include?(@lang)
@@ -24,18 +30,34 @@ module SiteClassifier
24
30
  # Extract most significant tags
25
31
  def most_significant
26
32
  most_sig = []
27
- if !description.nil?
28
- if tags.any?
29
- most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
30
- else
31
- most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
32
- end
33
+ # if !description.nil?
34
+ # if tags.any?
35
+ # most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
36
+ # else
37
+ # most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
38
+ # end
39
+ # end
40
+
41
+ description.to_s.split.each do |word|
42
+ self.word_frequency[word] ||= 0
43
+ self.word_frequency[word] += 1
33
44
  end
34
45
 
35
46
  if most_sig.empty?
36
- most_sig = self.word_frequency.keys
47
+ most_sig = self.word_frequency.reject {|k,v| v < 3}.keys
48
+ most_sig.flatten!
37
49
  end
38
50
 
51
+ if description && tags.any?
52
+ tags.each do |tag|
53
+ if description.include?(tag)
54
+ most_sig << tag.singularize
55
+ end
56
+ end
57
+ end
58
+
59
+ most_sig.uniq!
60
+
39
61
  self.validate_lang
40
62
 
41
63
  if SiteClassifier.translate_tags?
@@ -65,6 +87,7 @@ module SiteClassifier
65
87
  def self.parse_site(url = "")
66
88
  return if url == "" || url.nil?
67
89
 
90
+ debug("getting #{url}")
68
91
  html = Nokogiri::HTML(self.get(url).parsed_response)
69
92
 
70
93
  tags = []
@@ -74,30 +97,71 @@ module SiteClassifier
74
97
 
75
98
  begin
76
99
  page_lang = html.search("html").first["lang"].to_s.slice(0..1)
100
+ debug("found lang in html tag - #{page_lang}")
77
101
  rescue
78
102
  end
79
103
 
80
104
  begin
81
105
  page_lang = html.search("html").first["xml:lang"].to_s.slice(0..1)
106
+ debug("found lang in html tag (xml:lang) - #{page_lang}")
82
107
  rescue
83
108
  end
84
109
 
85
110
  begin
86
111
  tags = html.search('meta[name="keywords"]').first["content"].split(",").collect(&:strip).collect(&:downcase)
112
+ debug("Tags - #{tags.inspect}")
113
+ rescue
114
+ debug("no tags found")
115
+ end
116
+
117
+ if tags.empty?
118
+ begin
119
+ tags = html.search('meta[property="keywords"]').first["content"].split(",").collect(&:strip).collect(&:downcase)
120
+ debug("Tags - #{tags.inspect}")
121
+ rescue
122
+ debug("no tags found")
123
+ end
124
+ end
125
+ begin
87
126
  description = html.search('meta[name="description"]').first["content"]
127
+ debug("Decription meta found")
88
128
  rescue
89
129
  end
90
130
 
131
+ if description.nil?
132
+ begin
133
+ description = html.search('meta[property="og:description"]').first["content"]
134
+ debug("Facebook og:description found")
135
+ rescue
136
+ end
137
+ end
138
+
139
+ if description.nil?
140
+ begin
141
+ description = html.search('meta[name="og:description"]').first["content"]
142
+ debug("Facebook og:description found")
143
+ rescue
144
+ end
145
+ end
146
+
91
147
  if tags.empty?
148
+ debug("no tags, parsing body")
92
149
  word_hash = Hash.new(0)
93
- all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
150
+ all_text = []
151
+ # all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
152
+ # debug("p's extracts - #{all_text.inspect}")
94
153
  if all_text.empty?
95
154
  all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
155
+ debug("divs extracts - #{all_text.inspect}")
96
156
  end
97
- all_text.each do |word|
157
+ all_text += description.to_s.split
158
+
159
+ all_text.flatten.each do |word|
98
160
  word_hash[word] += 1
99
161
  end
100
- word_hash.reject! {|k,v| v < 2 }
162
+ debug("final word hash - #{word_hash.inspect}")
163
+ word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
164
+
101
165
  end
102
166
  self.new(url, tags, word_hash, description, page_lang)
103
167
  end
@@ -1,3 +1,3 @@
1
1
  module SiteClassifier
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elad Meidar