site_classifier 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/site_classifier/configuration.rb +6 -1
- data/lib/site_classifier/extractor.rb +74 -10
- data/lib/site_classifier/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1a6185e3f8b41f26e38ba1787d9a73ec864960ae
|
4
|
+
data.tar.gz: d53cf863639a4841920e85ed1d68c1e1a6367c2e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5b1e6a8e583c46f23dfdf35c2ce8448ca7539154775acd905a9cf2b212e3a6136b485c6b806ba2dae57ab8727e546b31c82f7cd2e7c879c8d23e002e8ba4f90c
|
7
|
+
data.tar.gz: e7ff3a1003a2baecf41486efcab6768c3ef5ee41fc24a558bdbdaadf65a46da6bbe31f121235ab1851934831937130f1e3d3b09f3fa4ea1f4d2c1ec91e55ee9e
|
@@ -1,13 +1,18 @@
|
|
1
1
|
module SiteClassifier
|
2
2
|
class Configuration
|
3
|
-
attr_accessor :translate, :google_translate_api_key
|
3
|
+
attr_accessor :translate, :google_translate_api_key, :debug
|
4
4
|
|
5
5
|
# Instantiate a new class
|
6
6
|
def initialize(options = {})
|
7
7
|
@translate = options[:translate] || false
|
8
8
|
@google_translate_api_key = options[:google_translate_api_key]
|
9
|
+
@debug = options[:debug] || false
|
9
10
|
end
|
10
11
|
|
12
|
+
def debug?
|
13
|
+
self.debug == true
|
14
|
+
end
|
15
|
+
|
11
16
|
# Configure by block
|
12
17
|
def self.configure(&block)
|
13
18
|
new_configuration = SiteClassifier::Configuration.new
|
@@ -12,6 +12,12 @@ module SiteClassifier
|
|
12
12
|
@lang = lang.downcase
|
13
13
|
end
|
14
14
|
|
15
|
+
def self.debug(string)
|
16
|
+
if SiteClassifier.configuration.debug?
|
17
|
+
puts "#{Time.now.to_i} - #{string}"
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
15
21
|
# Normalize site language
|
16
22
|
def validate_lang
|
17
23
|
if EasyTranslate::LANGUAGES.keys.include?(@lang)
|
@@ -24,18 +30,34 @@ module SiteClassifier
|
|
24
30
|
# Extract most significant tags
|
25
31
|
def most_significant
|
26
32
|
most_sig = []
|
27
|
-
if !description.nil?
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
+
# if !description.nil?
|
34
|
+
# if tags.any?
|
35
|
+
# most_sig = tags.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
|
36
|
+
# else
|
37
|
+
# most_sig = word_frequency.keys.select {|tag| self.description.downcase.include?(tag)}.collect {|tag| tag.singularize }
|
38
|
+
# end
|
39
|
+
# end
|
40
|
+
|
41
|
+
description.to_s.split.each do |word|
|
42
|
+
self.word_frequency[word] ||= 0
|
43
|
+
self.word_frequency[word] += 1
|
33
44
|
end
|
34
45
|
|
35
46
|
if most_sig.empty?
|
36
|
-
most_sig = self.word_frequency.keys
|
47
|
+
most_sig = self.word_frequency.reject {|k,v| v < 3}.keys
|
48
|
+
most_sig.flatten!
|
37
49
|
end
|
38
50
|
|
51
|
+
if description && tags.any?
|
52
|
+
tags.each do |tag|
|
53
|
+
if description.include?(tag)
|
54
|
+
most_sig << tag.singularize
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
most_sig.uniq!
|
60
|
+
|
39
61
|
self.validate_lang
|
40
62
|
|
41
63
|
if SiteClassifier.translate_tags?
|
@@ -65,6 +87,7 @@ module SiteClassifier
|
|
65
87
|
def self.parse_site(url = "")
|
66
88
|
return if url == "" || url.nil?
|
67
89
|
|
90
|
+
debug("getting #{url}")
|
68
91
|
html = Nokogiri::HTML(self.get(url).parsed_response)
|
69
92
|
|
70
93
|
tags = []
|
@@ -74,30 +97,71 @@ module SiteClassifier
|
|
74
97
|
|
75
98
|
begin
|
76
99
|
page_lang = html.search("html").first["lang"].to_s.slice(0..1)
|
100
|
+
debug("found lang in html tag - #{page_lang}")
|
77
101
|
rescue
|
78
102
|
end
|
79
103
|
|
80
104
|
begin
|
81
105
|
page_lang = html.search("html").first["xml:lang"].to_s.slice(0..1)
|
106
|
+
debug("found lang in html tag (xml:lang) - #{page_lang}")
|
82
107
|
rescue
|
83
108
|
end
|
84
109
|
|
85
110
|
begin
|
86
111
|
tags = html.search('meta[name="keywords"]').first["content"].split(",").collect(&:strip).collect(&:downcase)
|
112
|
+
debug("Tags - #{tags.inspect}")
|
113
|
+
rescue
|
114
|
+
debug("no tags found")
|
115
|
+
end
|
116
|
+
|
117
|
+
if tags.empty?
|
118
|
+
begin
|
119
|
+
tags = html.search('meta[property="keywords"]').first["content"].split(",").collect(&:strip).collect(&:downcase)
|
120
|
+
debug("Tags - #{tags.inspect}")
|
121
|
+
rescue
|
122
|
+
debug("no tags found")
|
123
|
+
end
|
124
|
+
end
|
125
|
+
begin
|
87
126
|
description = html.search('meta[name="description"]').first["content"]
|
127
|
+
debug("Decription meta found")
|
88
128
|
rescue
|
89
129
|
end
|
90
130
|
|
131
|
+
if description.nil?
|
132
|
+
begin
|
133
|
+
description = html.search('meta[property="og:description"]').first["content"]
|
134
|
+
debug("Facebook og:description found")
|
135
|
+
rescue
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
if description.nil?
|
140
|
+
begin
|
141
|
+
description = html.search('meta[name="og:description"]').first["content"]
|
142
|
+
debug("Facebook og:description found")
|
143
|
+
rescue
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
91
147
|
if tags.empty?
|
148
|
+
debug("no tags, parsing body")
|
92
149
|
word_hash = Hash.new(0)
|
93
|
-
all_text =
|
150
|
+
all_text = []
|
151
|
+
# all_text = html.search("p").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
|
152
|
+
# debug("p's extracts - #{all_text.inspect}")
|
94
153
|
if all_text.empty?
|
95
154
|
all_text = html.search("div").collect {|p| p.text.strip }.collect {|text| text.split.collect(&:strip)}.flatten.reject {|word| word.size < 4}
|
155
|
+
debug("divs extracts - #{all_text.inspect}")
|
96
156
|
end
|
97
|
-
all_text
|
157
|
+
all_text += description.to_s.split
|
158
|
+
|
159
|
+
all_text.flatten.each do |word|
|
98
160
|
word_hash[word] += 1
|
99
161
|
end
|
100
|
-
|
162
|
+
debug("final word hash - #{word_hash.inspect}")
|
163
|
+
word_hash.reject! {|k,v| v < 3 || k.size == 1 || k.include?(".") || k.include?("'") || k.include?("(") || k.include?(":") || k.include?("]")}
|
164
|
+
|
101
165
|
end
|
102
166
|
self.new(url, tags, word_hash, description, page_lang)
|
103
167
|
end
|