site_analyzer 0.3.9 → 0.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/site_analyzer/page.rb +99 -70
- data/lib/site_analyzer/site.rb +22 -10
- data/lib/site_analyzer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d800ec488fdf5882e08d325ceb904af2b8559e0d
|
4
|
+
data.tar.gz: 7e963e3c8f71905e0e1dafadabd8d81c43bcb2a2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6bdb11a2a9c98b982c4f901e62e9a8000cdd4750f986a976414d8474065d695db8482f7ba8742237a28aa0d3d41e3e9fa233bb402695ad4b709c348675666fdb
|
7
|
+
data.tar.gz: d5c457238252264ae2722a1b1f361e8ab3f6be2210295542a347a468cbe16c807dd0a0d9a97264856b4bd347b3f4862494d2dc982ba4b50e4793853590ba981f
|
data/Gemfile.lock
CHANGED
data/lib/site_analyzer/page.rb
CHANGED
@@ -7,7 +7,7 @@ module SiteAnalyzer
|
|
7
7
|
attr_reader :page_url, :titles, :page
|
8
8
|
def initialize(url)
|
9
9
|
@page_url = url
|
10
|
-
@page = get_page
|
10
|
+
@page = get_page(url)
|
11
11
|
@site_url = get_domain url
|
12
12
|
@titles = all_titles
|
13
13
|
end
|
@@ -17,142 +17,171 @@ module SiteAnalyzer
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def get_page(url)
|
20
|
-
|
21
|
-
|
22
|
-
|
20
|
+
begin
|
21
|
+
timeout(30) { Nokogiri::HTML(open(url)) }
|
22
|
+
rescue Timeout::Error, EOFError, OpenURI::HTTPError
|
23
|
+
return nil
|
24
|
+
end
|
23
25
|
end
|
24
26
|
|
25
27
|
def get_domain(url)
|
26
|
-
timeout(30) { Addressable::URI.parse(url).host }
|
27
|
-
rescue
|
28
|
-
'Error with parsing by Addressable'
|
28
|
+
timeout(30) { Addressable::URI.parse(url).host } rescue nil
|
29
29
|
end
|
30
30
|
|
31
31
|
def title_good?
|
32
|
-
@page.css('title').size == 1 &&
|
32
|
+
@page.css('title').size == 1 && @page.css('title').text.size < 70 if @page
|
33
33
|
end
|
34
34
|
# true if title and h1 have no dublicates
|
35
35
|
def title_and_h1_good?
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
if @page
|
37
|
+
arr = []
|
38
|
+
@page.css('h1').each { |node| arr << node.text }
|
39
|
+
@page.css('title').size == 1 && arr.uniq.size == arr.size
|
40
|
+
end
|
39
41
|
end
|
40
42
|
# true if metadescription less then 200 symbols
|
41
43
|
def metadescription_good?
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
44
|
+
if @page
|
45
|
+
tags = @page.css("meta[name='description']")
|
46
|
+
return false if tags.size == 0
|
47
|
+
tags.each do |t|
|
48
|
+
unless t['value'].nil?
|
49
|
+
return false if t['content'].size == 0 || t['content'].size > 200
|
50
|
+
end
|
47
51
|
end
|
52
|
+
true
|
48
53
|
end
|
49
|
-
true
|
50
54
|
end
|
51
55
|
# true if keywords less then 600 symbols
|
52
56
|
def keywords_good?
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
57
|
+
if @page
|
58
|
+
tags = @page.css("meta[name='keywords']")
|
59
|
+
return false if tags.size == 0
|
60
|
+
tags.each do |t|
|
61
|
+
unless t['value'].nil?
|
62
|
+
return false if t['content'].size == 0 || t['content'].size > 600
|
63
|
+
end
|
58
64
|
end
|
65
|
+
true
|
59
66
|
end
|
60
|
-
true
|
61
67
|
end
|
62
68
|
# true if code less then text
|
63
69
|
def code_less?
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
70
|
+
if @page
|
71
|
+
sum = 0
|
72
|
+
page_text = @page.text.size
|
73
|
+
@page.css('script').each do |tag|
|
74
|
+
sum += tag.text.size
|
75
|
+
end
|
76
|
+
sum < page_text / 2
|
68
77
|
end
|
69
|
-
sum < page_text / 2
|
70
78
|
end
|
71
79
|
|
72
80
|
def collect_metadates
|
73
|
-
@page.css('meta')
|
81
|
+
@page.css('meta') if @page
|
74
82
|
end
|
75
83
|
|
76
84
|
def metadates_good?
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
85
|
+
if @page
|
86
|
+
meta_tags = collect_metadates
|
87
|
+
return false if @page.css('title').size > 1 || meta_tags.nil?
|
88
|
+
node_names = []
|
89
|
+
meta_tags.each { |node| node_names << node['name'] }
|
90
|
+
return false if node_names.compact!.size < 1
|
91
|
+
node_names.uniq.size == node_names.size
|
92
|
+
end
|
83
93
|
end
|
84
94
|
# return hash with all titles, h1 and h2
|
85
95
|
def all_titles_h1_h2
|
86
|
-
|
87
|
-
|
88
|
-
|
96
|
+
if @page
|
97
|
+
out = []
|
98
|
+
out << @page.css('title').text << { @page_url => @page.css('h1').text }
|
99
|
+
out << { @page_url => @page.css('h2').text }
|
100
|
+
end
|
89
101
|
end
|
90
102
|
|
91
103
|
def home_a
|
92
|
-
|
93
|
-
|
94
|
-
|
104
|
+
if @page
|
105
|
+
home_a = []
|
106
|
+
all_a_tags_href.each do |link|
|
107
|
+
home_a << link if link.include? @site_url
|
108
|
+
end
|
109
|
+
home_a
|
95
110
|
end
|
96
|
-
home_a
|
97
111
|
end
|
98
112
|
|
99
113
|
def remote_a
|
100
|
-
|
101
|
-
|
102
|
-
|
114
|
+
if @page
|
115
|
+
remote_a = []
|
116
|
+
all_a_tags_href.uniq.each do |link|
|
117
|
+
remote_a << link unless link.include? @site_url
|
118
|
+
end
|
119
|
+
remote_a
|
103
120
|
end
|
104
|
-
remote_a
|
105
121
|
end
|
106
122
|
|
107
123
|
def all_a_tags_href
|
108
|
-
|
109
|
-
|
110
|
-
|
124
|
+
if @page
|
125
|
+
tags = []
|
126
|
+
@page.css('a').each do |node|
|
127
|
+
tags << node['href']
|
128
|
+
end
|
129
|
+
tags.compact
|
111
130
|
end
|
112
|
-
tags.compact
|
113
131
|
end
|
114
132
|
|
115
133
|
def wrong_a
|
116
|
-
|
117
|
-
|
118
|
-
|
134
|
+
if @page
|
135
|
+
wrong_a = []
|
136
|
+
all_a_tags_href.each do |link|
|
137
|
+
wrong_a << link if link.include? '?meta='
|
138
|
+
end
|
139
|
+
wrong_a
|
119
140
|
end
|
120
|
-
wrong_a
|
121
141
|
end
|
122
142
|
|
123
143
|
def h2?
|
124
|
-
@page.css('h2').size > 0
|
144
|
+
@page.css('h2').size > 0 if @page
|
125
145
|
end
|
126
146
|
|
127
147
|
def page_text_size
|
128
|
-
@page.text.size
|
148
|
+
@page.text.size if @page
|
129
149
|
end
|
130
150
|
|
131
151
|
def all_a_tags
|
132
|
-
|
133
|
-
|
134
|
-
|
152
|
+
if @page
|
153
|
+
tags = []
|
154
|
+
@page.css('a').each do |node|
|
155
|
+
tags << [node['href'], node['target'], node['rel']]
|
156
|
+
end
|
157
|
+
tags.compact
|
135
158
|
end
|
136
|
-
tags.compact
|
137
159
|
end
|
138
160
|
|
139
161
|
def all_titles
|
140
|
-
|
141
|
-
|
142
|
-
|
162
|
+
if @page
|
163
|
+
titles = []
|
164
|
+
@page.css('title').each { |tag| titles << tag.text }
|
165
|
+
titles
|
166
|
+
end
|
143
167
|
end
|
144
168
|
|
145
169
|
def all_meta_description_content
|
146
|
-
|
147
|
-
|
148
|
-
|
170
|
+
if @page
|
171
|
+
tags = []
|
172
|
+
@page.css("meta[name='description']").each do |t|
|
173
|
+
tags << t['content']
|
174
|
+
end
|
175
|
+
tags
|
149
176
|
end
|
150
|
-
tags
|
151
177
|
end
|
178
|
+
|
152
179
|
def h2
|
153
|
-
|
154
|
-
|
155
|
-
|
180
|
+
if @page
|
181
|
+
h2s = []
|
182
|
+
@page.css('h2').each { |tag| h2s << tag.text }
|
183
|
+
h2s
|
184
|
+
end
|
156
185
|
end
|
157
186
|
end
|
158
187
|
end
|
data/lib/site_analyzer/site.rb
CHANGED
@@ -36,9 +36,13 @@ module SiteAnalyzer
|
|
36
36
|
|
37
37
|
def add_pages_for_scan!
|
38
38
|
@pages_for_scan = []
|
39
|
+
@bad_pages = []
|
39
40
|
@pages.each do |page|
|
40
|
-
page.
|
41
|
-
|
41
|
+
@bad_pages << page.page_url unless page.page
|
42
|
+
if page.page
|
43
|
+
page.home_a.each do |link|
|
44
|
+
@pages_for_scan << link unless link.nil? || @scanned_pages.include?(link) || link.start_with?('mailto:') || link.start_with?('skype:') || link.end_with?('.jpg')
|
45
|
+
end
|
42
46
|
end
|
43
47
|
end
|
44
48
|
@pages_for_scan.clear if @pages_for_scan.size == 0
|
@@ -58,7 +62,9 @@ module SiteAnalyzer
|
|
58
62
|
def all_titles
|
59
63
|
result = []
|
60
64
|
@pages.each do |page|
|
61
|
-
|
65
|
+
if page.page
|
66
|
+
result << [page.page_url, page.titles]
|
67
|
+
end
|
62
68
|
end
|
63
69
|
result
|
64
70
|
end
|
@@ -66,7 +72,9 @@ module SiteAnalyzer
|
|
66
72
|
def all_descriptions
|
67
73
|
result = []
|
68
74
|
@pages.each do |page|
|
69
|
-
|
75
|
+
if page.page
|
76
|
+
result << [page.page_url, page.all_meta_description_content]
|
77
|
+
end
|
70
78
|
end
|
71
79
|
result
|
72
80
|
end
|
@@ -74,7 +82,9 @@ module SiteAnalyzer
|
|
74
82
|
def all_h2
|
75
83
|
result = []
|
76
84
|
@pages.each do |page|
|
77
|
-
|
85
|
+
if page.page
|
86
|
+
result << [page.page_url, page.h2]
|
87
|
+
end
|
78
88
|
end
|
79
89
|
result
|
80
90
|
end
|
@@ -82,11 +92,13 @@ module SiteAnalyzer
|
|
82
92
|
def all_a
|
83
93
|
result = []
|
84
94
|
@pages.each do |page|
|
85
|
-
page.
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
95
|
+
if page.page
|
96
|
+
page.all_a_tags.compact.each do |tag|
|
97
|
+
tag[0] = '-' unless tag[0]
|
98
|
+
tag[1] = '-' unless tag[1]
|
99
|
+
tag[2] = '-' unless tag[2]
|
100
|
+
result << [page.page_url, tag[0], tag[1], tag[2]]
|
101
|
+
end
|
90
102
|
end
|
91
103
|
end
|
92
104
|
result.compact
|