site_classifier 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/site_classifier/configuration.rb +6 -1
- data/lib/site_classifier/extractor.rb +74 -10
- data/lib/site_classifier/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1a6185e3f8b41f26e38ba1787d9a73ec864960ae
|
4
|
+
data.tar.gz: d53cf863639a4841920e85ed1d68c1e1a6367c2e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5b1e6a8e583c46f23dfdf35c2ce8448ca7539154775acd905a9cf2b212e3a6136b485c6b806ba2dae57ab8727e546b31c82f7cd2e7c879c8d23e002e8ba4f90c
|
7
|
+
data.tar.gz: e7ff3a1003a2baecf41486efcab6768c3ef5ee41fc24a558bdbdaadf65a46da6bbe31f121235ab1851934831937130f1e3d3b09f3fa4ea1f4d2c1ec91e55ee9e
|
@@ -1,13 +1,18 @@
|
|
1
1
|
module SiteClassifier
|
2
2
|
class Configuration
|
3
|
-
attr_accessor :translate, :google_translate_api_key
|
3
|
+
attr_accessor :translate, :google_translate_api_key, :debug
|
4
4
|
|
5
5
|
# Instantiate a new class
|
6
6
|
def initialize(options = {})
|
7
7
|
@translate = options[:translate] || false
|
8
8
|
@google_translate_api_key = options[:google_translate_api_key]
|
9
|
+
@debug = options[:debug] || false
|
9
10
|
end
|
10
11
|
|
12
|
+
def debug?
|
13
|
+
self.debug == true
|
14
|
+
end
|
15
|
+
|
11
16
|
# Configure by block
|
12
17
|
def self.configure(&block)
|
13
18
|
new_configuration = SiteClassifier::Configuration.new
|
@@ -12,6 +12,12 @@ module SiteClassifier
|
|
12
12
|
@lang = lang.downcase
|
13
13
|
end
|
14
14
|
|
15
|
+
def self.debug(string)
|
16
|
+
if SiteClassifier.configuration.debug?
|
17
|
+
puts "#{Time.now.to_i} - #{string}"
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
15
21
|
# Normalize site language
|
16
22
|
def validate_lang
|
17
23
|
if EasyTranslate::LANGUAGES.keys.include?(@lang)
|
@@ -24,18 +30,34 @@ module SiteClassifier
|
|
24
30
|
# Extract most significant tags
|
25
31
|
def most_significant
|
26
32
|
most_sig = []
|
27
|
-
if !description.nil?
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
+
# if !description.nil?
|
34
|
+
# if tags.any?
|
35
|
+
# most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
|
36
|
+
# else
|
37
|
+
# most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
|
38
|
+
# end
|
39
|
+
# end
|
40
|
+
|
41
|
+
description.to_s.split.each do |word|
|
42
|
+
self.word_frequency[word] ||= 0
|
43
|
+
self.word_frequency[word] += 1
|
33
44
|
end
|
34
45
|
|
35
46
|
if most_sig.empty?
|
36
|
-
most_sig = self.word_frequency.keys
|
47
|
+
most_sig = self.word_frequency.reject {|k,v| v < 3}.keys
|
48
|
+
most_sig.flatten!
|
37
49
|
end
|
38
50
|
|
51
|
+
if description && tags.any?
|
52
|
+
tags.each do |tag|
|
53
|
+
if description.include?(tag)
|
54
|
+
most_sig << tag.singularize
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
most_sig.uniq!
|
60
|
+
|
39
61
|
self.validate_lang
|
40
62
|
|
41
63
|
if SiteClassifier.translate_tags?
|
@@ -65,6 +87,7 @@ module SiteClassifier
|
|
65
87
|
def self.parse_site(url = "")
|
66
88
|
return if url == "" || url.nil?
|
67
89
|
|
90
|
+
debug("getting #{url}")
|
68
91
|
html = Nokogiri::HTML(self.get(url).parsed_response)
|
69
92
|
|
70
93
|
tags = []
|
@@ -74,30 +97,71 @@ module SiteClassifier
|
|
74
97
|
|
75
98
|
begin
|
76
99
|
page_lang = html.search("html").first["lang"].to_s.slice(0..1)
|
100
|
+
debug("found lang in html tag - #{page_lang}")
|
77
101
|
rescue
|
78
102
|
end
|
79
103
|
|
80
104
|
begin
|
81
105
|
page_lang = html.search("html").first["xml:lang"].to_s.slice(0..1)
|
106
|
+
debug("found lang in html tag (xml:lang) - #{page_lang}")
|
82
107
|
rescue
|
83
108
|
end
|
84
109
|
|
85
110
|
begin
|
86
111
|
tags = html.search('meta[name="keywords"]').first["content"].split(",").collect(&:strip).collect(&:downcase)
|
112
|
+
debug("Tags - #{tags.inspect}")
|
113
|
+
rescue
|
114
|
+
debug("no tags found")
|
115
|
+
end
|
116
|
+
|
117
|
+
if tags.empty?
|
118
|
+
begin
|
119
|
+
tags = html.search('meta[property="keywords"]').first["content"].split(",").collect(&:strip).collect(&:downcase)
|
120
|
+
debug("Tags - #{tags.inspect}")
|
121
|
+
rescue
|
122
|
+
debug("no tags found")
|
123
|
+
end
|
124
|
+
end
|
125
|
+
begin
|
87
126
|
description = html.search('meta[name="description"]').first["content"]
|
127
|
+
debug("Decription meta found")
|
88
128
|
rescue
|
89
129
|
end
|
90
130
|
|
131
|
+
if description.nil?
|
132
|
+
begin
|
133
|
+
description = html.search('meta[property="og:description"]').first["content"]
|
134
|
+
debug("Facebook og:description found")
|
135
|
+
rescue
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
if description.nil?
|
140
|
+
begin
|
141
|
+
description = html.search('meta[name="og:description"]').first["content"]
|
142
|
+
debug("Facebook og:description found")
|
143
|
+
rescue
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
91
147
|
if tags.empty?
|
148
|
+
debug("no tags, parsing body")
|
92
149
|
word_hash = Hash.new(0)
|
93
|
-
all_text =
|
150
|
+
all_text = []
|
151
|
+
# all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
|
152
|
+
# debug("p's extracts - #{all_text.inspect}")
|
94
153
|
if all_text.empty?
|
95
154
|
all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
|
155
|
+
debug("divs extracts - #{all_text.inspect}")
|
96
156
|
end
|
97
|
-
all_text
|
157
|
+
all_text += description.to_s.split
|
158
|
+
|
159
|
+
all_text.flatten.each do |word|
|
98
160
|
word_hash[word] += 1
|
99
161
|
end
|
100
|
-
|
162
|
+
debug("final word hash - #{word_hash.inspect}")
|
163
|
+
word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
|
164
|
+
|
101
165
|
end
|
102
166
|
self.new(url, tags, word_hash, description, page_lang)
|
103
167
|
end
|