site_analyzer 0.3.16 → 0.3.17
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +5 -2
- data/.rbenv-gemsets +1 -0
- data/Gemfile.lock +3 -3
- data/Guardfile +5 -5
- data/Rakefile +1 -1
- data/lib/site_analyzer/open-uri-patching.rb +3 -4
- data/lib/site_analyzer/page.rb +75 -91
- data/lib/site_analyzer/report.rb +4 -5
- data/lib/site_analyzer/site.rb +19 -28
- data/lib/site_analyzer/version.rb +1 -1
- data/site_analyzer.gemspec +3 -3
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cc00445497ff1b19d012ade1eef41f9ec4ddad99
|
4
|
+
data.tar.gz: 21a5e0ecfe14c07069ff8fb35a2d56d60d3062ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 13edaa969a406d7eb64c5965b037caefad6361bf3704e4e288e7b595f21bf1cd3743f95c407fe6d993b0054714cbcbe8164322b440f7f9752f438353dce1e27a
|
7
|
+
data.tar.gz: 9ce13e82b5bb231cd4d84dd14e14a3aee42dd2b3675b64eba53a4f933e230951e980341d8fa766849fc88c3d1e9d51a320a47e29b2390f69463829b3708d6557
|
data/.gitignore
CHANGED
data/.rbenv-gemsets
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
.gems
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
site_analyzer (0.3.
|
4
|
+
site_analyzer (0.3.16)
|
5
5
|
addressable (~> 2.3)
|
6
6
|
nokogiri (~> 1.6)
|
7
7
|
robotstxt (~> 0.5)
|
@@ -26,7 +26,7 @@ GEM
|
|
26
26
|
shellany (~> 0.0)
|
27
27
|
thor (>= 0.18.1)
|
28
28
|
guard-compat (1.2.1)
|
29
|
-
guard-rspec (4.6.
|
29
|
+
guard-rspec (4.6.4)
|
30
30
|
guard (~> 2.1)
|
31
31
|
guard-compat (~> 1.1)
|
32
32
|
rspec (>= 2.99.0, < 4.0)
|
@@ -81,4 +81,4 @@ DEPENDENCIES
|
|
81
81
|
site_analyzer!
|
82
82
|
|
83
83
|
BUNDLED WITH
|
84
|
-
1.10.
|
84
|
+
1.10.6
|
data/Guardfile
CHANGED
@@ -47,9 +47,9 @@ guard :rspec, cmd: 'bundle exec rspec' do
|
|
47
47
|
|
48
48
|
watch(rails.controllers) do |m|
|
49
49
|
[
|
50
|
-
rspec.spec.("routing/#{m[1]}_routing"),
|
51
|
-
rspec.spec.("controllers/#{m[1]}_controller"),
|
52
|
-
rspec.spec.("acceptance/#{m[1]}")
|
50
|
+
rspec.spec.call("routing/#{m[1]}_routing"),
|
51
|
+
rspec.spec.call("controllers/#{m[1]}_controller"),
|
52
|
+
rspec.spec.call("acceptance/#{m[1]}")
|
53
53
|
]
|
54
54
|
end
|
55
55
|
|
@@ -59,8 +59,8 @@ guard :rspec, cmd: 'bundle exec rspec' do
|
|
59
59
|
watch(rails.app_controller) { "#{rspec.spec_dir}/controllers" }
|
60
60
|
|
61
61
|
# Capybara features specs
|
62
|
-
watch(rails.view_dirs) { |m| rspec.spec.("features/#{m[1]}") }
|
63
|
-
watch(rails.layouts) { |m| rspec.spec.("features/#{m[1]}") }
|
62
|
+
watch(rails.view_dirs) { |m| rspec.spec.call("features/#{m[1]}") }
|
63
|
+
watch(rails.layouts) { |m| rspec.spec.call("features/#{m[1]}") }
|
64
64
|
|
65
65
|
# Turnip features and steps
|
66
66
|
watch(%r{^spec/acceptance/(.+)\.feature$})
|
data/Rakefile
CHANGED
data/lib/site_analyzer/open-uri-patching.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
module OpenURI
|
2
|
-
|
3
|
-
def OpenURI.redirectable?(uri1, uri2) # :nodoc:
|
2
|
+
def self.redirectable?(uri1, uri2) # :nodoc:
|
4
3
|
# This test is intended to forbid a redirection from http://... to
|
5
4
|
# file:///etc/passwd, file:///dev/zero, etc. CVE-2011-1521
|
6
5
|
# https to http redirect is also forbidden intentionally.
|
@@ -8,6 +7,6 @@ module OpenURI
|
|
8
7
|
# (RFC 2109 4.3.1, RFC 2965 3.3, RFC 2616 15.1.3)
|
9
8
|
# However this is ad hoc. It should be extensible/configurable.
|
10
9
|
uri1.scheme.downcase == uri2.scheme.downcase ||
|
11
|
-
|
10
|
+
(/\A(?:http|ftp|https)\z/i =~ uri1.scheme && /\A(?:http|ftp|https)\z/i =~ uri2.scheme)
|
12
11
|
end
|
13
|
-
end
|
12
|
+
end
|
data/lib/site_analyzer/page.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
+
# Get site page and provide data for future analyse
|
1
2
|
module SiteAnalyzer
|
2
|
-
# Get site page and provide data for future analyse
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'addressable/uri'
|
5
5
|
require 'timeout'
|
@@ -25,29 +25,25 @@ module SiteAnalyzer
|
|
25
25
|
end
|
26
26
|
# get all home (that on this site) url on page
|
27
27
|
def home_a
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
home_a << link[0] if uri.host == @site_domain
|
34
|
-
end
|
35
|
-
end
|
36
|
-
home_a
|
28
|
+
return unless @page_a_tags
|
29
|
+
home_a = []
|
30
|
+
@page_a_tags.uniq.each do |link|
|
31
|
+
uri = URI(link[0].to_ascii) rescue nil # TODO: write additional logic for link to image
|
32
|
+
home_a << link[0] if uri.host == @site_domain if uri && @site_domain
|
37
33
|
end
|
34
|
+
home_a
|
38
35
|
end
|
39
36
|
# get all remote link on page
|
40
37
|
def remote_a
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
end
|
38
|
+
return unless @page_a_tags
|
39
|
+
remote_a = []
|
40
|
+
@page_a_tags.uniq.each do |link|
|
41
|
+
uri = URI(link[0].to_ascii)
|
42
|
+
if uri && @site_domain
|
43
|
+
remote_a << link[0] unless uri.host == @site_domain
|
48
44
|
end
|
49
|
-
remote_a
|
50
45
|
end
|
46
|
+
remote_a
|
51
47
|
end
|
52
48
|
|
53
49
|
private
|
@@ -72,16 +68,14 @@ module SiteAnalyzer
|
|
72
68
|
end
|
73
69
|
# get page with open-uri, then parse it with Nokogiri. Get site domain and path from URI
|
74
70
|
def get_page(url)
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
@page = Nokogiri::HTML(page)
|
81
|
-
end
|
82
|
-
rescue Timeout::Error, EOFError, OpenURI::HTTPError, Errno::ENOENT, TypeError
|
83
|
-
return nil
|
71
|
+
timeout(30) do
|
72
|
+
page = open(url)
|
73
|
+
@site_domain = page.base_uri.host
|
74
|
+
@page_path = page.base_uri.request_uri
|
75
|
+
@page = Nokogiri::HTML(page)
|
84
76
|
end
|
77
|
+
rescue Timeout::Error, EOFError, OpenURI::HTTPError, Errno::ENOENT, TypeError
|
78
|
+
return nil
|
85
79
|
end
|
86
80
|
# check that title is one and less then 70 symbols
|
87
81
|
def title_good?
|
@@ -89,48 +83,44 @@ module SiteAnalyzer
|
|
89
83
|
end
|
90
84
|
# true if title and h1 have no duplicates
|
91
85
|
def title_and_h1_good?
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
end
|
86
|
+
return unless @page
|
87
|
+
arr = []
|
88
|
+
@page.css('h1').each { |node| arr << node.text }
|
89
|
+
@page.css('title').size == 1 && arr.uniq.size == arr.size
|
97
90
|
end
|
98
91
|
# true if metadescription less then 200 symbols
|
99
92
|
def metadescription_good?
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
end
|
93
|
+
return unless @page
|
94
|
+
tags = @page.css("meta[name='description']")
|
95
|
+
return false if tags.size == 0
|
96
|
+
tags.each do |t|
|
97
|
+
unless t['value'].nil?
|
98
|
+
return false if t['content'].size == 0 || t['content'].size > 200
|
107
99
|
end
|
108
|
-
true
|
109
100
|
end
|
101
|
+
true
|
110
102
|
end
|
111
103
|
# true if keywords less then 600 symbols
|
112
104
|
def keywords_good?
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
end
|
105
|
+
return unless @page
|
106
|
+
tags = @page.css("meta[name='keywords']")
|
107
|
+
return false if tags.size == 0
|
108
|
+
tags.each do |t|
|
109
|
+
unless t['value'].nil?
|
110
|
+
return false if t['content'].size == 0 || t['content'].size > 600
|
120
111
|
end
|
121
|
-
true
|
122
112
|
end
|
113
|
+
true
|
123
114
|
end
|
124
115
|
# true if code of page less then text on it
|
125
116
|
def code_less?
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
end
|
132
|
-
sum < page_text / 2
|
117
|
+
return unless @page
|
118
|
+
sum = 0
|
119
|
+
page_text = @page.text.size
|
120
|
+
@page.css('script').each do |tag|
|
121
|
+
sum += tag.text.size
|
133
122
|
end
|
123
|
+
sum < page_text / 2
|
134
124
|
end
|
135
125
|
# collect meta tags for future report
|
136
126
|
def collect_metadates
|
@@ -140,22 +130,20 @@ module SiteAnalyzer
|
|
140
130
|
end
|
141
131
|
# check meta and title tags duplicates
|
142
132
|
def metadates_good?
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
end
|
133
|
+
return unless @page
|
134
|
+
return false if @all_titles.size > 1 || @meta_data.empty?
|
135
|
+
node_names = []
|
136
|
+
@meta_data.each { |node| node_names << node['name'] }
|
137
|
+
node_names.compact!
|
138
|
+
node_names.uniq.size == node_names.size unless node_names.nil? || node_names.size < 1
|
150
139
|
end
|
151
140
|
# return hash with all titles, h1 and h2
|
152
141
|
def all_titles_h1_h2
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
end
|
142
|
+
return unless @page
|
143
|
+
out = []
|
144
|
+
out << @page.css('title').text << { @page_url => @page.css('h1').text }
|
145
|
+
out << { @page_url => @page.css('h2').text }
|
146
|
+
out
|
159
147
|
end
|
160
148
|
# check if page have h2 tags
|
161
149
|
def h2?
|
@@ -167,43 +155,39 @@ module SiteAnalyzer
|
|
167
155
|
end
|
168
156
|
# get all a tags
|
169
157
|
def all_a_tags
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
end
|
175
|
-
tags.compact
|
158
|
+
return unless @page
|
159
|
+
tags = []
|
160
|
+
@page.css('a').each do |node|
|
161
|
+
tags << [node['href'], node['target'], node['rel']]
|
176
162
|
end
|
163
|
+
tags.compact
|
177
164
|
end
|
178
165
|
# return all page titles
|
179
166
|
def titles
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
end
|
167
|
+
return unless @page
|
168
|
+
titles = []
|
169
|
+
@page.css('title').each { |tag| titles << tag.text }
|
170
|
+
titles
|
185
171
|
end
|
186
172
|
# return all meta description content
|
187
173
|
def all_meta_description_content
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
end
|
193
|
-
tags
|
174
|
+
return unless @page
|
175
|
+
tags = []
|
176
|
+
@page.css("meta[name='description']").each do |t|
|
177
|
+
tags << t['content']
|
194
178
|
end
|
179
|
+
tags
|
195
180
|
end
|
196
181
|
# return all h2 tags text
|
197
182
|
def h2
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
end
|
183
|
+
return unless @page
|
184
|
+
h2s = []
|
185
|
+
@page.css('h2').each { |tag| h2s << tag.text }
|
186
|
+
h2s
|
203
187
|
end
|
204
188
|
# check url of page that is must be HLU
|
205
189
|
def bad_url
|
206
|
-
@page_url if @page_path.size > 1 unless @page_path =~ /^[\w.\-\/]+$/i
|
190
|
+
@page_url if @page_path.size > 1 unless @page_path =~ %r(/^[\w.\-\/]+$/i)
|
207
191
|
end
|
208
192
|
# clear page from don't needed information
|
209
193
|
def clear!
|
data/lib/site_analyzer/report.rb
CHANGED
@@ -97,7 +97,7 @@ module SiteAnalyzer
|
|
97
97
|
def pages_size
|
98
98
|
result = []
|
99
99
|
@site.pages.each do |page|
|
100
|
-
result << [page.page_url
|
100
|
+
result << [page.page_url, page.page_text_size]
|
101
101
|
end
|
102
102
|
result
|
103
103
|
end
|
@@ -144,10 +144,9 @@ module SiteAnalyzer
|
|
144
144
|
counter = {}
|
145
145
|
result = []
|
146
146
|
in_array.compact.each do |url_desc_cont|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
end
|
147
|
+
next unless url_desc_cont[1][0]
|
148
|
+
url_desc_cont[1][0].scan(/\w+/).each do |word|
|
149
|
+
all_words << word
|
151
150
|
end
|
152
151
|
end
|
153
152
|
all_words.each do |word|
|
data/lib/site_analyzer/site.rb
CHANGED
@@ -29,13 +29,12 @@ module SiteAnalyzer
|
|
29
29
|
add_pages_for_scan!
|
30
30
|
while @pages_for_scan.size > 0
|
31
31
|
page = convert_to_valid @pages_for_scan.pop
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
end
|
32
|
+
next unless page
|
33
|
+
@max_pages -= 1
|
34
|
+
add_page convert_to_valid(page)
|
35
|
+
return if @max_pages <= 0
|
36
|
+
add_pages_for_scan!
|
37
|
+
optimize_scan!
|
39
38
|
end
|
40
39
|
end
|
41
40
|
# add pages for scan array, also add bad pages to bad_pages array
|
@@ -44,10 +43,9 @@ module SiteAnalyzer
|
|
44
43
|
@bad_pages = []
|
45
44
|
@pages.each do |page|
|
46
45
|
@bad_pages << page.page_url unless page.page_a_tags
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
end
|
46
|
+
next unless page.page_a_tags
|
47
|
+
page.home_a.each do |link|
|
48
|
+
@pages_for_scan << link
|
51
49
|
end
|
52
50
|
end
|
53
51
|
end
|
@@ -65,9 +63,7 @@ module SiteAnalyzer
|
|
65
63
|
def all_titles
|
66
64
|
result = []
|
67
65
|
@pages.each do |page|
|
68
|
-
if page.page_a_tags
|
69
|
-
result << [page.page_url, page.all_titles]
|
70
|
-
end
|
66
|
+
result << [page.page_url, page.all_titles] if page.page_a_tags
|
71
67
|
end
|
72
68
|
result
|
73
69
|
end
|
@@ -75,9 +71,7 @@ module SiteAnalyzer
|
|
75
71
|
def all_descriptions
|
76
72
|
result = []
|
77
73
|
@pages.each do |page|
|
78
|
-
if page.page_a_tags
|
79
|
-
result << [page.page_url, page.meta_desc_content]
|
80
|
-
end
|
74
|
+
result << [page.page_url, page.meta_desc_content] if page.page_a_tags
|
81
75
|
end
|
82
76
|
result
|
83
77
|
end
|
@@ -85,9 +79,7 @@ module SiteAnalyzer
|
|
85
79
|
def all_h2
|
86
80
|
result = []
|
87
81
|
@pages.each do |page|
|
88
|
-
unless page.page_a_tags
|
89
|
-
result << [page.page_url, page.h2_text]
|
90
|
-
end
|
82
|
+
result << [page.page_url, page.h2_text] unless page.page_a_tags
|
91
83
|
end
|
92
84
|
result
|
93
85
|
end
|
@@ -95,13 +87,12 @@ module SiteAnalyzer
|
|
95
87
|
def all_a
|
96
88
|
result = []
|
97
89
|
@pages.each do |page|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
end
|
90
|
+
next unless page.page_a_tags
|
91
|
+
page.page_a_tags.compact.each do |tag|
|
92
|
+
tag[0] = '-' unless tag[0]
|
93
|
+
tag[1] = '-' unless tag[1]
|
94
|
+
tag[2] = '-' unless tag[2]
|
95
|
+
result << [page.page_url, tag[0], tag[1], tag[2]]
|
105
96
|
end
|
106
97
|
end
|
107
98
|
result.compact
|
@@ -118,7 +109,7 @@ module SiteAnalyzer
|
|
118
109
|
def optimize_scan!
|
119
110
|
@pages_for_scan = @pages_for_scan.compact.uniq
|
120
111
|
@scanned_pages = @scanned_pages.compact.uniq
|
121
|
-
@pages_for_scan
|
112
|
+
@pages_for_scan -= @scanned_pages
|
122
113
|
end
|
123
114
|
# check url and try to convert it to valid, remove .jpg links, add scheme to url
|
124
115
|
def convert_to_valid(url)
|
data/site_analyzer.gemspec
CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ['Denis Savchuk']
|
10
10
|
spec.email = ['mordorreal@gmail.com']
|
11
11
|
spec.date = '2015-07-01'
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
12
|
+
spec.summary = 'Make report for SEO. Analyse site like SEOs like. '
|
13
|
+
spec.description = 'Create site report for SEO many options.'
|
14
14
|
spec.homepage = 'https://github.com/Mordorreal/SiteAnalyzer'
|
15
15
|
spec.license = 'MIT'
|
16
16
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.add_development_dependency 'bundler', '~> 1.10'
|
22
22
|
spec.add_development_dependency 'rake', '~> 10.4'
|
23
23
|
spec.add_development_dependency 'rspec', '~> 3.3'
|
24
|
-
spec.add_development_dependency 'guard-rspec', '~>
|
24
|
+
spec.add_development_dependency 'guard-rspec', '~> 4.6'
|
25
25
|
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
26
26
|
spec.add_runtime_dependency 'addressable', '~> 2.3'
|
27
27
|
spec.add_runtime_dependency 'robotstxt', '~> 0.5'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: site_analyzer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.16
|
4
|
+
version: 0.3.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Denis Savchuk
|
@@ -58,14 +58,14 @@ dependencies:
|
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '4.6'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '4.6'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -144,6 +144,7 @@ extensions: []
|
|
144
144
|
extra_rdoc_files: []
|
145
145
|
files:
|
146
146
|
- ".gitignore"
|
147
|
+
- ".rbenv-gemsets"
|
147
148
|
- ".rspec"
|
148
149
|
- ".travis.yml"
|
149
150
|
- CODE_OF_CONDUCT.md
|