site_analyzer 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +5 -2
- data/.rbenv-gemsets +1 -0
- data/Gemfile.lock +3 -3
- data/Guardfile +5 -5
- data/Rakefile +1 -1
- data/lib/site_analyzer/open-uri-patching.rb +3 -4
- data/lib/site_analyzer/page.rb +75 -91
- data/lib/site_analyzer/report.rb +4 -5
- data/lib/site_analyzer/site.rb +19 -28
- data/lib/site_analyzer/version.rb +1 -1
- data/site_analyzer.gemspec +3 -3
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cc00445497ff1b19d012ade1eef41f9ec4ddad99
+  data.tar.gz: 21a5e0ecfe14c07069ff8fb35a2d56d60d3062ea
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 13edaa969a406d7eb64c5965b037caefad6361bf3704e4e288e7b595f21bf1cd3743f95c407fe6d993b0054714cbcbe8164322b440f7f9752f438353dce1e27a
+  data.tar.gz: 9ce13e82b5bb231cd4d84dd14e14a3aee42dd2b3675b64eba53a4f933e230951e980341d8fa766849fc88c3d1e9d51a320a47e29b2390f69463829b3708d6557
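For reference, these are the SHA1 and SHA512 digests RubyGems records for the gem's two inner archives, metadata.gz and data.tar.gz. A minimal sketch of recomputing them locally, assuming the .gem file (a plain tar archive) has already been unpacked so both files sit in the current directory:

    require 'digest'

    # Recompute the digests listed in checksums.yaml for the unpacked archives.
    puts "SHA1   metadata.gz: #{Digest::SHA1.file('metadata.gz').hexdigest}"
    puts "SHA512 data.tar.gz: #{Digest::SHA512.file('data.tar.gz').hexdigest}"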
data/.gitignore
CHANGED
data/.rbenv-gemsets
ADDED
@@ -0,0 +1 @@
+.gems
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    site_analyzer (0.3.
+    site_analyzer (0.3.16)
       addressable (~> 2.3)
       nokogiri (~> 1.6)
       robotstxt (~> 0.5)
@@ -26,7 +26,7 @@ GEM
       shellany (~> 0.0)
       thor (>= 0.18.1)
     guard-compat (1.2.1)
-    guard-rspec (4.6.
+    guard-rspec (4.6.4)
       guard (~> 2.1)
       guard-compat (~> 1.1)
       rspec (>= 2.99.0, < 4.0)
@@ -81,4 +81,4 @@ DEPENDENCIES
   site_analyzer!

 BUNDLED WITH
-   1.10.
+   1.10.6
data/Guardfile
CHANGED
@@ -47,9 +47,9 @@ guard :rspec, cmd: 'bundle exec rspec' do

   watch(rails.controllers) do |m|
     [
-      rspec.spec.("routing/#{m[1]}_routing"),
-      rspec.spec.("controllers/#{m[1]}_controller"),
-      rspec.spec.("acceptance/#{m[1]}")
+      rspec.spec.call("routing/#{m[1]}_routing"),
+      rspec.spec.call("controllers/#{m[1]}_controller"),
+      rspec.spec.call("acceptance/#{m[1]}")
     ]
   end

@@ -59,8 +59,8 @@ guard :rspec, cmd: 'bundle exec rspec' do
   watch(rails.app_controller) { "#{rspec.spec_dir}/controllers" }

   # Capybara features specs
-  watch(rails.view_dirs) { |m| rspec.spec.("features/#{m[1]}") }
-  watch(rails.layouts)   { |m| rspec.spec.("features/#{m[1]}") }
+  watch(rails.view_dirs) { |m| rspec.spec.call("features/#{m[1]}") }
+  watch(rails.layouts)   { |m| rspec.spec.call("features/#{m[1]}") }

   # Turnip features and steps
   watch(%r{^spec/acceptance/(.+)\.feature$})
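The Guardfile change is purely syntactic: `rspec.spec` (a lambda in the stock guard-rspec template) was invoked with the bare `.()` shorthand, which is now written out as `.call`. The two forms are equivalent; a quick illustration with a stand-in lambda:

    spec = ->(name) { "spec/#{name}_spec.rb" }

    spec.call('controllers/posts_controller') # => "spec/controllers/posts_controller_spec.rb"
    spec.('controllers/posts_controller')     # .() is sugar for #call and returns the same string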
data/Rakefile
CHANGED
data/lib/site_analyzer/open-uri-patching.rb
CHANGED
@@ -1,6 +1,5 @@
 module OpenURI
-
-  def OpenURI.redirectable?(uri1, uri2) # :nodoc:
+  def self.redirectable?(uri1, uri2) # :nodoc:
     # This test is intended to forbid a redirection from http://... to
     # file:///etc/passwd, file:///dev/zero, etc. CVE-2011-1521
     # https to http redirect is also forbidden intentionally.
@@ -8,6 +7,6 @@ module OpenURI
     # (RFC 2109 4.3.1, RFC 2965 3.3, RFC 2616 15.1.3)
     # However this is ad hoc. It should be extensible/configurable.
     uri1.scheme.downcase == uri2.scheme.downcase ||
-
+      (/\A(?:http|ftp|https)\z/i =~ uri1.scheme && /\A(?:http|ftp|https)\z/i =~ uri2.scheme)
   end
-  end
+end
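For context: stock open-uri refuses to follow a redirect that changes scheme (for example http to https), raising a "redirection forbidden" error, which breaks crawling sites that upgrade to HTTPS. The patched `redirectable?` above treats http, https and ftp as mutually redirectable. A usage sketch; the require path follows this gem's lib/ layout and the URL is only illustrative:

    require 'open-uri'
    require 'site_analyzer/open-uri-patching' # applies the patch shown above

    # Without the patch an http URL that redirects to https raises
    # "redirection forbidden"; with it the redirect is followed.
    page = open('http://github.com/') # URI.open on current Rubies
    puts page.base_uri                # final location after redirects, e.g. https://github.com/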
data/lib/site_analyzer/page.rb
CHANGED
@@ -1,5 +1,5 @@
+# Get site page and provide data for future analyse
 module SiteAnalyzer
-  # Get site page and provide data for future analyse
   require 'nokogiri'
   require 'addressable/uri'
   require 'timeout'
@@ -25,29 +25,25 @@
     end
     # get all home (that on this site) url on page
     def home_a
-
-
-
-
-
-      home_a << link[0] if uri.host == @site_domain
-      end
-      end
-      home_a
+      return unless @page_a_tags
+      home_a = []
+      @page_a_tags.uniq.each do |link|
+        uri = URI(link[0].to_ascii) rescue nil # TODO: write additional logic for link to image
+        home_a << link[0] if uri.host == @site_domain if uri && @site_domain
       end
+      home_a
     end
     # get all remote link on page
    def remote_a
-
-
-
-
-
-
-      end
+      return unless @page_a_tags
+      remote_a = []
+      @page_a_tags.uniq.each do |link|
+        uri = URI(link[0].to_ascii)
+        if uri && @site_domain
+          remote_a << link[0] unless uri.host == @site_domain
        end
-      remote_a
      end
+      remote_a
    end

    private
@@ -72,16 +68,14 @@ module SiteAnalyzer
    end
    # get page with open-uri, then parse it with Nokogiri. Get site domain and path from URI
    def get_page(url)
-
-
-
-
-
-        @page = Nokogiri::HTML(page)
-      end
-    rescue Timeout::Error, EOFError, OpenURI::HTTPError, Errno::ENOENT, TypeError
-      return nil
+      timeout(30) do
+        page = open(url)
+        @site_domain = page.base_uri.host
+        @page_path = page.base_uri.request_uri
+        @page = Nokogiri::HTML(page)
      end
+    rescue Timeout::Error, EOFError, OpenURI::HTTPError, Errno::ENOENT, TypeError
+      return nil
    end
    # check that title is one and less then 70 symbols
    def title_good?
@@ -89,48 +83,44 @@ module SiteAnalyzer
    end
    # true if title and h1 have no duplicates
    def title_and_h1_good?
-
-
-
-
-    end
+      return unless @page
+      arr = []
+      @page.css('h1').each { |node| arr << node.text }
+      @page.css('title').size == 1 && arr.uniq.size == arr.size
    end
    # true if metadescription less then 200 symbols
    def metadescription_good?
-
-
-
-
-
-
-      end
+      return unless @page
+      tags = @page.css("meta[name='description']")
+      return false if tags.size == 0
+      tags.each do |t|
+        unless t['value'].nil?
+          return false if t['content'].size == 0 || t['content'].size > 200
        end
-      true
      end
+      true
    end
    # true if keywords less then 600 symbols
    def keywords_good?
-
-
-
-
-
-
-      end
+      return unless @page
+      tags = @page.css("meta[name='keywords']")
+      return false if tags.size == 0
+      tags.each do |t|
+        unless t['value'].nil?
+          return false if t['content'].size == 0 || t['content'].size > 600
        end
-      true
      end
+      true
    end
    # true if code of page less then text on it
    def code_less?
-
-
-
-
-
-      end
-      sum < page_text / 2
+      return unless @page
+      sum = 0
+      page_text = @page.text.size
+      @page.css('script').each do |tag|
+        sum += tag.text.size
      end
+      sum < page_text / 2
    end
    # collect meta tags for future report
    def collect_metadates
@@ -140,22 +130,20 @@ module SiteAnalyzer
    end
    # check meta and title tags duplicates
    def metadates_good?
-
-
-
-
-
-
-      end
+      return unless @page
+      return false if @all_titles.size > 1 || @meta_data.empty?
+      node_names = []
+      @meta_data.each { |node| node_names << node['name'] }
+      node_names.compact!
+      node_names.uniq.size == node_names.size unless node_names.nil? || node_names.size < 1
    end
    # return hash with all titles, h1 and h2
    def all_titles_h1_h2
-
-
-
-
-
-      end
+      return unless @page
+      out = []
+      out << @page.css('title').text << { @page_url => @page.css('h1').text }
+      out << { @page_url => @page.css('h2').text }
+      out
    end
    # check if page have h2 tags
    def h2?
@@ -167,43 +155,39 @@ module SiteAnalyzer
    end
    # get all a tags
    def all_a_tags
-
-
-
-
-      end
-      tags.compact
+      return unless @page
+      tags = []
+      @page.css('a').each do |node|
+        tags << [node['href'], node['target'], node['rel']]
      end
+      tags.compact
    end
    # return all page titles
    def titles
-
-
-
-
-    end
+      return unless @page
+      titles = []
+      @page.css('title').each { |tag| titles << tag.text }
+      titles
    end
    # return all meta description content
    def all_meta_description_content
-
-
-
-
-      end
-      tags
+      return unless @page
+      tags = []
+      @page.css("meta[name='description']").each do |t|
+        tags << t['content']
      end
+      tags
    end
    # return all h2 tags text
    def h2
-
-
-
-
-    end
+      return unless @page
+      h2s = []
+      @page.css('h2').each { |tag| h2s << tag.text }
+      h2s
    end
    # check url of page that is must be HLU
    def bad_url
-      @page_url if @page_path.size > 1 unless @page_path =~ /^[\w.\-\/]+$/i
+      @page_url if @page_path.size > 1 unless @page_path =~ %r(/^[\w.\-\/]+$/i)
    end
    # clear page from don't needed information
    def clear!
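Most of the page.rb rewrite replaces nested `if @page ... end` bodies with early `return unless @page` guard clauses, leaving behaviour unchanged when the page was fetched. The reworked `get_page` also spells out the fetch flow: open-uri fetches the URL inside a 30-second timeout, Nokogiri parses the body, and any of the rescued errors turns into a nil result. A condensed, stand-alone sketch of that flow (method name and return shape are illustrative, not the gem's API):

    require 'open-uri'
    require 'timeout'
    require 'nokogiri'

    # Fetch a URL, parse it, and report the host/path it finally resolved to.
    # Returns nil on timeout or any of the listed fetch errors, mirroring Page#get_page.
    def fetch_page(url)
      Timeout.timeout(30) do
        page = open(url) # Kernel#open via open-uri (URI.open on current Rubies)
        [Nokogiri::HTML(page), page.base_uri.host, page.base_uri.request_uri]
      end
    rescue Timeout::Error, EOFError, OpenURI::HTTPError, Errno::ENOENT, TypeError
      nil
    end

    doc, host, path = fetch_page('http://example.com') || []
    puts "#{host}#{path}: #{doc.css('title').text}" if doc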
data/lib/site_analyzer/report.rb
CHANGED
@@ -97,7 +97,7 @@ module SiteAnalyzer
    def pages_size
      result = []
      @site.pages.each do |page|
-        result << [page.page_url
+        result << [page.page_url, page.page_text_size]
      end
      result
    end
@@ -144,10 +144,9 @@ module SiteAnalyzer
      counter = {}
      result = []
      in_array.compact.each do |url_desc_cont|
-
-
-
-      end
+        next unless url_desc_cont[1][0]
+        url_desc_cont[1][0].scan(/\w+/).each do |word|
+          all_words << word
        end
      end
      all_words.each do |word|
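The rewritten loop first skips entries that have no description text (`next unless url_desc_cont[1][0]`) and then splits each description into words with `String#scan`. The word-splitting step, shown on illustrative data:

    all_words = []
    descriptions = ['Fast site analyzer', nil, 'Analyzer for SEO reports']

    descriptions.each do |text|
      next unless text # skip missing descriptions, as in the diff
      text.scan(/\w+/).each { |word| all_words << word }
    end

    all_words # => ["Fast", "site", "analyzer", "Analyzer", "for", "SEO", "reports"]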
data/lib/site_analyzer/site.rb
CHANGED
@@ -29,13 +29,12 @@
      add_pages_for_scan!
      while @pages_for_scan.size > 0
        page = convert_to_valid @pages_for_scan.pop
-
-
-
-
-
-
-        end
+        next unless page
+        @max_pages -= 1
+        add_page convert_to_valid(page)
+        return if @max_pages <= 0
+        add_pages_for_scan!
+        optimize_scan!
      end
    end
    # add pages for scan array, also add bad pages to bad_pages array
@@ -44,10 +43,9 @@ module SiteAnalyzer
      @bad_pages = []
      @pages.each do |page|
        @bad_pages << page.page_url unless page.page_a_tags
-
-
-
-        end
+        next unless page.page_a_tags
+        page.home_a.each do |link|
+          @pages_for_scan << link
        end
      end
    end
@@ -65,9 +63,7 @@ module SiteAnalyzer
    def all_titles
      result = []
      @pages.each do |page|
-        if page.page_a_tags
-          result << [page.page_url, page.all_titles]
-        end
+        result << [page.page_url, page.all_titles] if page.page_a_tags
      end
      result
    end
@@ -75,9 +71,7 @@ module SiteAnalyzer
    def all_descriptions
      result = []
      @pages.each do |page|
-        if page.page_a_tags
-          result << [page.page_url, page.meta_desc_content]
-        end
+        result << [page.page_url, page.meta_desc_content] if page.page_a_tags
      end
      result
    end
@@ -85,9 +79,7 @@ module SiteAnalyzer
    def all_h2
      result = []
      @pages.each do |page|
-        unless page.page_a_tags
-          result << [page.page_url, page.h2_text]
-        end
+        result << [page.page_url, page.h2_text] unless page.page_a_tags
      end
      result
    end
@@ -95,13 +87,12 @@ module SiteAnalyzer
    def all_a
      result = []
      @pages.each do |page|
-
-
-
-
-
-
-        end
+        next unless page.page_a_tags
+        page.page_a_tags.compact.each do |tag|
+          tag[0] = '-' unless tag[0]
+          tag[1] = '-' unless tag[1]
+          tag[2] = '-' unless tag[2]
+          result << [page.page_url, tag[0], tag[1], tag[2]]
        end
      end
      result.compact
@@ -118,7 +109,7 @@ module SiteAnalyzer
    def optimize_scan!
      @pages_for_scan = @pages_for_scan.compact.uniq
      @scanned_pages = @scanned_pages.compact.uniq
-      @pages_for_scan
+      @pages_for_scan -= @scanned_pages
    end
    # check url and try to convert it to valid, remove .jpg links, add scheme to url
    def convert_to_valid(url)
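Besides the same guard-clause cleanup as in page.rb, the one behavioural change here is the last line of `optimize_scan!`: the scan queue now subtracts already scanned URLs with `Array#-` instead of merely deduplicating, so a page is never queued twice. For example:

    pages_for_scan = ['/a', '/b', '/b', nil, '/c'].compact.uniq # => ["/a", "/b", "/c"]
    scanned_pages  = ['/b']

    pages_for_scan -= scanned_pages
    pages_for_scan # => ["/a", "/c"]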
data/site_analyzer.gemspec
CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
  spec.authors = ['Denis Savchuk']
  spec.email = ['mordorreal@gmail.com']
  spec.date = '2015-07-01'
-  spec.summary =
-  spec.description =
+  spec.summary = 'Make report for SEO. Analyse site like SEOs like. '
+  spec.description = 'Create site report for SEO many options.'
  spec.homepage = 'https://github.com/Mordorreal/SiteAnalyzer'
  spec.license = 'MIT'
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'bundler', '~> 1.10'
  spec.add_development_dependency 'rake', '~> 10.4'
  spec.add_development_dependency 'rspec', '~> 3.3'
-  spec.add_development_dependency 'guard-rspec', '~>
+  spec.add_development_dependency 'guard-rspec', '~> 4.6'
  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
  spec.add_runtime_dependency 'addressable', '~> 2.3'
  spec.add_runtime_dependency 'robotstxt', '~> 0.5'
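The previously empty summary and description strings are filled in, and guard-rspec gets the pessimistic constraint '~> 4.6', i.e. any 4.x release at or above 4.6 but nothing from 5.0 on. A quick check of what that operator accepts:

    require 'rubygems'

    req = Gem::Requirement.new('~> 4.6')
    req.satisfied_by?(Gem::Version.new('4.6.4')) # => true
    req.satisfied_by?(Gem::Version.new('4.9.0')) # => true
    req.satisfied_by?(Gem::Version.new('5.0.0')) # => false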
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: site_analyzer
 version: !ruby/object:Gem::Version
-  version: 0.3.
+  version: 0.3.17
 platform: ruby
 authors:
 - Denis Savchuk
@@ -58,14 +58,14 @@ dependencies:
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
-        version: '
+        version: '4.6'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
-        version: '
+        version: '4.6'
 - !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
@@ -144,6 +144,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".rbenv-gemsets"
 - ".rspec"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md