site_analyzer 0.3.14 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/SiteAnalyzer.iml +8 -1
- data/lib/site_analyzer/page.rb +84 -68
- data/lib/site_analyzer/report.rb +8 -8
- data/lib/site_analyzer/site.rb +22 -30
- data/lib/site_analyzer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3fea66e1d0fa5a9b34e70289b4e475ff53a2afa2
|
4
|
+
data.tar.gz: 66f0019cbee985a9b5568c5f7dee4c9a1aabd984
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04e6f0c4e25ffc460bd92fa40a2154129a503247c185eb87b7d24c02e5ff0400712983ddc296945f3f3ba0b907a526270417c9595658b1ea0f1a90ad4aad7dda
|
7
|
+
data.tar.gz: 403386d6cafffcbbb1034c133337ecfc27dea00c482a61b11f5f6aefb6bcacf18a6edcdcedbceb82411e1afaf8d1eeeeeed05f2adc9faf09e2c0b31783d205e3
|
data/README.md
CHANGED
@@ -26,7 +26,7 @@ require 'site_analyzer'<br>
|
|
26
26
|
|
27
27
|
SiteAnalyzer::Report.create site: 'http://savchuk.space', pages: 10, robot: false, console: true<br>
|
28
28
|
Return hash with report.<br>
|
29
|
-
arguments: site - url
|
29
|
+
arguments: site - url, must start with http or https; pages - number of pages to scan; robot - whether or not to use the robots.txt file; console - whether to output the report to the console
|
30
30
|
|
31
31
|
<br>
|
32
32
|
<b>Author</b>
|
data/SiteAnalyzer.iml
CHANGED
@@ -6,15 +6,22 @@
|
|
6
6
|
<orderEntry type="inheritedJdk" />
|
7
7
|
<orderEntry type="sourceFolder" forTests="false" />
|
8
8
|
<orderEntry type="library" scope="PROVIDED" name="addressable (v2.3.8, rbenv: 2.2.2) [gem]" level="application" />
|
9
|
-
<orderEntry type="library" scope="PROVIDED" name="bundler (v1.10.
|
9
|
+
<orderEntry type="library" scope="PROVIDED" name="bundler (v1.10.6, rbenv: 2.2.2) [gem]" level="application" />
|
10
|
+
<orderEntry type="library" scope="PROVIDED" name="coderay (v1.1.0, rbenv: 2.2.2) [gem]" level="application" />
|
10
11
|
<orderEntry type="library" scope="PROVIDED" name="diff-lcs (v1.2.5, rbenv: 2.2.2) [gem]" level="application" />
|
11
12
|
<orderEntry type="library" scope="PROVIDED" name="ffi (v1.9.10, rbenv: 2.2.2) [gem]" level="application" />
|
12
13
|
<orderEntry type="library" scope="PROVIDED" name="formatador (v0.2.5, rbenv: 2.2.2) [gem]" level="application" />
|
14
|
+
<orderEntry type="library" scope="PROVIDED" name="guard (v2.13.0, rbenv: 2.2.2) [gem]" level="application" />
|
13
15
|
<orderEntry type="library" scope="PROVIDED" name="guard-compat (v1.2.1, rbenv: 2.2.2) [gem]" level="application" />
|
16
|
+
<orderEntry type="library" scope="PROVIDED" name="listen (v3.0.3, rbenv: 2.2.2) [gem]" level="application" />
|
17
|
+
<orderEntry type="library" scope="PROVIDED" name="lumberjack (v1.0.9, rbenv: 2.2.2) [gem]" level="application" />
|
14
18
|
<orderEntry type="library" scope="PROVIDED" name="mini_portile (v0.6.2, rbenv: 2.2.2) [gem]" level="application" />
|
15
19
|
<orderEntry type="library" scope="PROVIDED" name="nenv (v0.2.0, rbenv: 2.2.2) [gem]" level="application" />
|
16
20
|
<orderEntry type="library" scope="PROVIDED" name="nokogiri (v1.6.6.2, rbenv: 2.2.2) [gem]" level="application" />
|
21
|
+
<orderEntry type="library" scope="PROVIDED" name="notiffany (v0.0.7, rbenv: 2.2.2) [gem]" level="application" />
|
22
|
+
<orderEntry type="library" scope="PROVIDED" name="pry (v0.10.1, rbenv: 2.2.2) [gem]" level="application" />
|
17
23
|
<orderEntry type="library" scope="PROVIDED" name="rake (v10.4.2, rbenv: 2.2.2) [gem]" level="application" />
|
24
|
+
<orderEntry type="library" scope="PROVIDED" name="rb-fsevent (v0.9.5, rbenv: 2.2.2) [gem]" level="application" />
|
18
25
|
<orderEntry type="library" scope="PROVIDED" name="rb-inotify (v0.9.5, rbenv: 2.2.2) [gem]" level="application" />
|
19
26
|
<orderEntry type="library" scope="PROVIDED" name="robotstxt (v0.5.4, rbenv: 2.2.2) [gem]" level="application" />
|
20
27
|
<orderEntry type="library" scope="PROVIDED" name="rspec (v3.3.0, rbenv: 2.2.2) [gem]" level="application" />
|
data/lib/site_analyzer/page.rb
CHANGED
@@ -1,30 +1,76 @@
|
|
1
1
|
module SiteAnalyzer
|
2
|
-
# Get site page and provide
|
2
|
+
# Get a site page and provide data for future analysis
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'addressable/uri'
|
5
5
|
require 'timeout'
|
6
6
|
require 'stringex_lite'
|
7
7
|
require 'open-uri'
|
8
8
|
class Page
|
9
|
-
attr_reader :page_url, :
|
9
|
+
attr_reader :page_url, :page_path, :site_domain,
|
10
|
+
:all_titles, :title_good, :title_and_h1_good,
|
11
|
+
:meta_description_good, :meta_keywords, :code_less,
|
12
|
+
:meta_data, :meta_title_duplicates, :title_h1_h2,
|
13
|
+
:have_h2, :page_text_size, :page_a_tags,
|
14
|
+
:meta_desc_content, :h2_text, :hlu
|
15
|
+
# create page object, fill data and clear unneeded elements
|
10
16
|
def initialize(url)
|
11
17
|
@page_url = url
|
12
|
-
@page = []
|
13
|
-
@site_domain = ''
|
14
|
-
@page_path = ''
|
15
|
-
@titles = []
|
16
18
|
get_page(url)
|
17
|
-
fill_data_field!
|
19
|
+
fill_data_field! if @page
|
20
|
+
clear!
|
18
21
|
end
|
19
|
-
|
20
|
-
def fill_data_field!
|
21
|
-
@titles = all_titles
|
22
|
-
end
|
23
|
-
|
22
|
+
# to_s for report
|
24
23
|
def to_s
|
25
24
|
"Page url: #{@page_url} Site url: #{@site_domain}"
|
26
25
|
end
|
26
|
+
# get all home urls (those on this site) on the page
|
27
|
+
def home_a
|
28
|
+
if @page_a_tags
|
29
|
+
home_a = []
|
30
|
+
@page_a_tags.uniq.each do |link|
|
31
|
+
uri = URI(link[0].to_ascii) rescue nil #TODO: write additional logic for link to image
|
32
|
+
if uri && @site_domain
|
33
|
+
home_a << link[0] if uri.host == @site_domain
|
34
|
+
end
|
35
|
+
end
|
36
|
+
home_a
|
37
|
+
end
|
38
|
+
end
|
39
|
+
# get all remote links on the page
|
40
|
+
def remote_a
|
41
|
+
if @page_a_tags
|
42
|
+
remote_a = []
|
43
|
+
@page_a_tags.uniq.each do |link|
|
44
|
+
uri = URI(link[0].to_ascii)
|
45
|
+
if uri && @site_domain
|
46
|
+
remote_a << link[0] unless uri.host == @site_domain
|
47
|
+
end
|
48
|
+
end
|
49
|
+
remote_a
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
27
54
|
|
55
|
+
# fill Page instance with data for the report
|
56
|
+
def fill_data_field!
|
57
|
+
@all_titles = titles
|
58
|
+
@meta_data = collect_metadates
|
59
|
+
@title_h1_h2 = all_titles_h1_h2
|
60
|
+
@page_text_size = text_size
|
61
|
+
@page_a_tags = all_a_tags
|
62
|
+
@meta_desc_content = all_meta_description_content
|
63
|
+
@h2_text = h2
|
64
|
+
@hlu = bad_url
|
65
|
+
@title_good = title_good?
|
66
|
+
@title_and_h1_good = title_and_h1_good?
|
67
|
+
@meta_description_good = metadescription_good?
|
68
|
+
@meta_keywords = keywords_good?
|
69
|
+
@code_less = code_less?
|
70
|
+
@meta_title_duplicates = metadates_good?
|
71
|
+
@have_h2 = h2?
|
72
|
+
end
|
73
|
+
# get page with open-uri, then parse it with Nokogiri. Get site domain and path from URI
|
28
74
|
def get_page(url)
|
29
75
|
begin
|
30
76
|
timeout(30) do
|
@@ -37,11 +83,11 @@ module SiteAnalyzer
|
|
37
83
|
return nil
|
38
84
|
end
|
39
85
|
end
|
40
|
-
|
86
|
+
# check that there is exactly one title and it is shorter than 70 symbols
|
41
87
|
def title_good?
|
42
88
|
@page.css('title').size == 1 && @page.css('title').text.size < 70 if @page
|
43
89
|
end
|
44
|
-
# true if title and h1 have no
|
90
|
+
# true if title and h1 have no duplicates
|
45
91
|
def title_and_h1_good?
|
46
92
|
if @page
|
47
93
|
arr = []
|
@@ -75,7 +121,7 @@ module SiteAnalyzer
|
|
75
121
|
true
|
76
122
|
end
|
77
123
|
end
|
78
|
-
# true if code less then text
|
124
|
+
# true if the page has less code than text
|
79
125
|
def code_less?
|
80
126
|
if @page
|
81
127
|
sum = 0
|
@@ -86,19 +132,20 @@ module SiteAnalyzer
|
|
86
132
|
sum < page_text / 2
|
87
133
|
end
|
88
134
|
end
|
89
|
-
|
135
|
+
# collect meta tags for future report
|
90
136
|
def collect_metadates
|
91
|
-
|
137
|
+
meta = []
|
138
|
+
meta = @page.css('meta') if @page
|
139
|
+
meta
|
92
140
|
end
|
93
|
-
|
141
|
+
# check meta and title tags duplicates
|
94
142
|
def metadates_good?
|
95
143
|
if @page
|
96
|
-
|
97
|
-
return false if @page.css('title').size > 1 || meta_tags.nil?
|
144
|
+
return false if @all_titles.size > 1 || @meta_data.empty?
|
98
145
|
node_names = []
|
99
|
-
|
100
|
-
|
101
|
-
node_names.uniq.size == node_names.size
|
146
|
+
@meta_data.each { |node| node_names << node['name'] }
|
147
|
+
node_names.compact!
|
148
|
+
node_names.uniq.size == node_names.size unless node_names.nil? || node_names.size < 1
|
102
149
|
end
|
103
150
|
end
|
104
151
|
# return hash with all titles, h1 and h2
|
@@ -107,53 +154,18 @@ module SiteAnalyzer
|
|
107
154
|
out = []
|
108
155
|
out << @page.css('title').text << { @page_url => @page.css('h1').text }
|
109
156
|
out << { @page_url => @page.css('h2').text }
|
157
|
+
out
|
110
158
|
end
|
111
159
|
end
|
112
|
-
|
113
|
-
def home_a
|
114
|
-
if @page
|
115
|
-
home_a = []
|
116
|
-
all_a_tags_href.uniq.each do |link|
|
117
|
-
uri = URI(link.to_ascii) rescue nil #TODO: write additional logic for link to image
|
118
|
-
if uri && @site_domain
|
119
|
-
home_a << link if uri.host == @site_domain
|
120
|
-
end
|
121
|
-
end
|
122
|
-
home_a
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
def remote_a
|
127
|
-
if @page
|
128
|
-
remote_a = []
|
129
|
-
all_a_tags_href.uniq.each do |link|
|
130
|
-
uri = URI(link.to_ascii)
|
131
|
-
if uri && @site_domain
|
132
|
-
remote_a << link unless uri.host == @site_domain
|
133
|
-
end
|
134
|
-
end
|
135
|
-
remote_a
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
|
-
def all_a_tags_href
|
140
|
-
if @page
|
141
|
-
tags = []
|
142
|
-
@page.css('a').each do |node|
|
143
|
-
tags << node['href']
|
144
|
-
end
|
145
|
-
tags.compact
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
160
|
+
# check if page have h2 tags
|
149
161
|
def h2?
|
150
162
|
@page.css('h2').size > 0 if @page
|
151
163
|
end
|
152
|
-
|
153
|
-
def
|
164
|
+
# return page size in symbols
|
165
|
+
def text_size
|
154
166
|
@page.text.size if @page
|
155
167
|
end
|
156
|
-
|
168
|
+
# get all a tags
|
157
169
|
def all_a_tags
|
158
170
|
if @page
|
159
171
|
tags = []
|
@@ -163,15 +175,15 @@ module SiteAnalyzer
|
|
163
175
|
tags.compact
|
164
176
|
end
|
165
177
|
end
|
166
|
-
|
167
|
-
def
|
178
|
+
# return all page titles
|
179
|
+
def titles
|
168
180
|
if @page
|
169
181
|
titles = []
|
170
182
|
@page.css('title').each { |tag| titles << tag.text }
|
171
183
|
titles
|
172
184
|
end
|
173
185
|
end
|
174
|
-
|
186
|
+
# return all meta description content
|
175
187
|
def all_meta_description_content
|
176
188
|
if @page
|
177
189
|
tags = []
|
@@ -181,7 +193,7 @@ module SiteAnalyzer
|
|
181
193
|
tags
|
182
194
|
end
|
183
195
|
end
|
184
|
-
|
196
|
+
# return all h2 tags text
|
185
197
|
def h2
|
186
198
|
if @page
|
187
199
|
h2s = []
|
@@ -189,9 +201,13 @@ module SiteAnalyzer
|
|
189
201
|
h2s
|
190
202
|
end
|
191
203
|
end
|
192
|
-
|
204
|
+
# check the page url, which must be an HLU
|
193
205
|
def bad_url
|
194
206
|
@page_url if @page_path.size > 1 unless @page_path =~ /^[\w.\-\/]+$/i
|
195
207
|
end
|
208
|
+
# clear unneeded information from the page
|
209
|
+
def clear!
|
210
|
+
@page = nil
|
211
|
+
end
|
196
212
|
end
|
197
213
|
end
|
data/lib/site_analyzer/report.rb
CHANGED
@@ -11,7 +11,7 @@ module SiteAnalyzer
|
|
11
11
|
@use_robot = use_robot
|
12
12
|
@site = Site.new(@site_domain, @max_pages, @use_robot)
|
13
13
|
end
|
14
|
-
|
14
|
+
# Entry point for gem. Create and show report. return array, show in console if select
|
15
15
|
def self.create(options)
|
16
16
|
options[:robot] = false if options[:robot] == 'false'
|
17
17
|
options[:console] = false if options[:console] == 'false'
|
@@ -40,7 +40,7 @@ module SiteAnalyzer
|
|
40
40
|
|
41
41
|
def to_s
|
42
42
|
return 'Report is empty' if @report.nil? || @report.empty?
|
43
|
-
header = Terminal::Table.new title: "Report for #{@site_domain} with #{@max_pages} pages
|
43
|
+
header = Terminal::Table.new title: "Report for #{@site_domain} with #{@max_pages} pages and robot.txt check is #{@use_robot}"
|
44
44
|
puts header
|
45
45
|
@report.each_pair do |key, value|
|
46
46
|
rows = []
|
@@ -57,7 +57,7 @@ module SiteAnalyzer
|
|
57
57
|
def check_titles_text_less_than_70
|
58
58
|
result = []
|
59
59
|
@site.pages.each do |page|
|
60
|
-
result << page.page_url unless page.title_good
|
60
|
+
result << page.page_url unless page.title_good
|
61
61
|
end
|
62
62
|
result
|
63
63
|
end
|
@@ -65,7 +65,7 @@ module SiteAnalyzer
|
|
65
65
|
def check_title_and_h1_for_doubles
|
66
66
|
result = []
|
67
67
|
@site.pages.each do |page|
|
68
|
-
result << page.page_url unless page.title_and_h1_good
|
68
|
+
result << page.page_url unless page.title_and_h1_good
|
69
69
|
end
|
70
70
|
result
|
71
71
|
end
|
@@ -73,7 +73,7 @@ module SiteAnalyzer
|
|
73
73
|
def check_meta_description
|
74
74
|
result = []
|
75
75
|
@site.pages.each do |page|
|
76
|
-
result << page.page_url unless page.
|
76
|
+
result << page.page_url unless page.meta_description_good
|
77
77
|
end
|
78
78
|
result
|
79
79
|
end
|
@@ -81,7 +81,7 @@ module SiteAnalyzer
|
|
81
81
|
def check_meta_keywords_tags
|
82
82
|
result = []
|
83
83
|
@site.pages.each do |page|
|
84
|
-
result << page.page_url unless page.
|
84
|
+
result << page.page_url unless page.meta_keywords
|
85
85
|
end
|
86
86
|
result
|
87
87
|
end
|
@@ -89,7 +89,7 @@ module SiteAnalyzer
|
|
89
89
|
def check_h2
|
90
90
|
result = []
|
91
91
|
@site.pages.each do |page|
|
92
|
-
result << page.page_url unless page.
|
92
|
+
result << page.page_url unless page.have_h2
|
93
93
|
end
|
94
94
|
result
|
95
95
|
end
|
@@ -105,7 +105,7 @@ module SiteAnalyzer
|
|
105
105
|
def code_more
|
106
106
|
result = []
|
107
107
|
@site.pages.each do |page|
|
108
|
-
result << page.page_url unless page.code_less
|
108
|
+
result << page.page_url unless page.code_less
|
109
109
|
end
|
110
110
|
result
|
111
111
|
end
|
data/lib/site_analyzer/site.rb
CHANGED
@@ -16,7 +16,7 @@ module SiteAnalyzer
|
|
16
16
|
@pages << Page.new(convert_to_valid(@main_url))
|
17
17
|
scan_site!
|
18
18
|
end
|
19
|
-
|
19
|
+
# check if the page is blocked by robots.txt
|
20
20
|
def robot_txt_allowed?(url)
|
21
21
|
if @use_robot_txt
|
22
22
|
Robotstxt.allowed?(url, '*') rescue nil
|
@@ -24,7 +24,7 @@ module SiteAnalyzer
|
|
24
24
|
true
|
25
25
|
end
|
26
26
|
end
|
27
|
-
|
27
|
+
# scan pages: add pages to scan; while scanning is still possible, do it, add new pages for scanning from them and optimize the array of links
|
28
28
|
def scan_site!
|
29
29
|
add_pages_for_scan!
|
30
30
|
while @pages_for_scan.size > 0
|
@@ -38,20 +38,20 @@ module SiteAnalyzer
|
|
38
38
|
end
|
39
39
|
end
|
40
40
|
end
|
41
|
-
|
41
|
+
# add pages for scan array, also add bad pages to bad_pages array
|
42
42
|
def add_pages_for_scan!
|
43
43
|
@pages_for_scan = []
|
44
44
|
@bad_pages = []
|
45
45
|
@pages.each do |page|
|
46
|
-
@bad_pages << page.page_url unless page.
|
47
|
-
if page.
|
46
|
+
@bad_pages << page.page_url unless page.page_a_tags
|
47
|
+
if page.page_a_tags
|
48
48
|
page.home_a.each do |link|
|
49
49
|
@pages_for_scan << link
|
50
50
|
end
|
51
51
|
end
|
52
52
|
end
|
53
53
|
end
|
54
|
-
|
54
|
+
# create Page and add it to the site
|
55
55
|
def add_page(url)
|
56
56
|
unless robot_txt_allowed?(url)
|
57
57
|
@scanned_pages << url
|
@@ -61,42 +61,42 @@ module SiteAnalyzer
|
|
61
61
|
@pages << page
|
62
62
|
@scanned_pages << url
|
63
63
|
end
|
64
|
-
|
64
|
+
# get all titles on site and return array of them
|
65
65
|
def all_titles
|
66
66
|
result = []
|
67
67
|
@pages.each do |page|
|
68
|
-
if page.
|
69
|
-
result << [page.page_url, page.
|
68
|
+
if page.page_a_tags
|
69
|
+
result << [page.page_url, page.all_titles]
|
70
70
|
end
|
71
71
|
end
|
72
72
|
result
|
73
73
|
end
|
74
|
-
|
74
|
+
# get all meta description tags content and return it as array
|
75
75
|
def all_descriptions
|
76
76
|
result = []
|
77
77
|
@pages.each do |page|
|
78
|
-
if page.
|
79
|
-
result << [page.page_url, page.
|
78
|
+
if page.page_a_tags
|
79
|
+
result << [page.page_url, page.meta_desc_content]
|
80
80
|
end
|
81
81
|
end
|
82
82
|
result
|
83
83
|
end
|
84
|
-
|
84
|
+
# get all h2 tags and return array of it
|
85
85
|
def all_h2
|
86
86
|
result = []
|
87
87
|
@pages.each do |page|
|
88
|
-
unless page.
|
89
|
-
result << [page.page_url, page.
|
88
|
+
unless page.page_a_tags
|
89
|
+
result << [page.page_url, page.h2_text]
|
90
90
|
end
|
91
91
|
end
|
92
92
|
result
|
93
93
|
end
|
94
|
-
|
94
|
+
# get all a tags and return array of it
|
95
95
|
def all_a
|
96
96
|
result = []
|
97
97
|
@pages.each do |page|
|
98
|
-
if page.
|
99
|
-
page.
|
98
|
+
if page.page_a_tags
|
99
|
+
page.page_a_tags.compact.each do |tag|
|
100
100
|
tag[0] = '-' unless tag[0]
|
101
101
|
tag[1] = '-' unless tag[1]
|
102
102
|
tag[2] = '-' unless tag[2]
|
@@ -106,29 +106,21 @@ module SiteAnalyzer
|
|
106
106
|
end
|
107
107
|
result.compact
|
108
108
|
end
|
109
|
-
|
110
|
-
def pages_url
|
111
|
-
result = []
|
112
|
-
@pages.each do |page|
|
113
|
-
result << page.page_url if page.page
|
114
|
-
end
|
115
|
-
result
|
116
|
-
end
|
117
|
-
|
109
|
+
# get all non HLU url and return array
|
118
110
|
def bad_urls
|
119
111
|
result = []
|
120
112
|
@pages.each do |page|
|
121
|
-
result << page.
|
113
|
+
result << page.hlu
|
122
114
|
end
|
123
115
|
result.compact!
|
124
116
|
end
|
125
|
-
|
117
|
+
# get a new array of pages for scanning and compact it
|
126
118
|
def optimize_scan!
|
127
119
|
@pages_for_scan = @pages_for_scan.compact.uniq
|
128
120
|
@scanned_pages = @scanned_pages.compact.uniq
|
129
121
|
@pages_for_scan = @pages_for_scan - @scanned_pages
|
130
122
|
end
|
131
|
-
|
123
|
+
# check the url and try to convert it to a valid one: remove .jpg links, add scheme to the url
|
132
124
|
def convert_to_valid(url)
|
133
125
|
return nil if url =~ /.jpg$/i
|
134
126
|
url.insert(0, @main_url.first(5)) if url.start_with? '//'
|