validate-website 1.0.5 → 1.1.0
- checksums.yaml +4 -4
- data/Rakefile +3 -3
- data/lib/validate_website.rb +1 -0
- data/lib/validate_website/core.rb +33 -157
- data/lib/validate_website/crawl.rb +78 -0
- data/lib/validate_website/option_parser.rb +64 -59
- data/lib/validate_website/runner.rb +3 -3
- data/lib/validate_website/static.rb +102 -0
- data/lib/validate_website/validator.rb +44 -33
- data/lib/validate_website/version.rb +3 -0
- data/spec/core_spec.rb +3 -118
- data/spec/crawler_spec.rb +91 -0
- data/spec/data/w3.org-xhtml1-strict-errors.html +544 -0
- data/spec/spec_helper.rb +2 -1
- data/spec/static_spec.rb +38 -0
- data/spec/validator_spec.rb +40 -23
- data/spec/webmock_helper.rb +4 -3
- metadata +30 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 134cbdd3da2da6847c525ffe5a2ee68f1a380ae2
+  data.tar.gz: 678ff514b9f6f368bbb78e93a8dc42f38a35a803
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dd90c5dec7d0c80ea9b94abcb1a38a425ee59be32c73fe9fc97620a5a00ee4c9cf2dae52aa099509c8573f2dfca117377ed15373bce2d93ed7c25a43e6ed067e
+  data.tar.gz: 4d24ac9b1dccd744a3b7bd7ebbb18b946c05c4043a0c76898eafc543892e886715e2d99670ea91f3b9d0c3203425cf0de420445d041360c44fb67a934c6165c2
data/Rakefile
CHANGED
@@ -4,8 +4,8 @@ require 'rake/testtask'
 task default: [:test]
 
 RDoc::Task.new do |rd|
-  rd.main =
-  rd.rdoc_files.include(
+  rd.main = 'README.rdoc'
+  rd.rdoc_files.include('README.rdoc', 'lib/**/*.rb')
 end
 
 # install asciidoc libxml2-utils xmlto docbook-xsl docbook-xml
@@ -15,6 +15,6 @@ task :manpage do
 end
 
 Rake::TestTask.new do |t|
-  t.pattern =
+  t.pattern = 'spec/*_spec.rb'
 end
 task spec: :test
data/lib/validate_website.rb
CHANGED

data/lib/validate_website/core.rb
CHANGED
@@ -1,5 +1,3 @@
-# encoding: utf-8
-
 require 'set'
 require 'open-uri'
 require 'webrick/cookie'
@@ -10,7 +8,11 @@ require 'validate_website/colorful_messages'
 
 require 'spidr'
 
+# Base module ValidateWebsite
 module ValidateWebsite
+  autoload :Crawl, 'validate_website/crawl'
+  autoload :Static, 'validate_website/static'
+
   # Core class for static or website validation
   class Core
     attr_accessor :site
@@ -23,55 +25,16 @@ module ValidateWebsite
     EXIT_FAILURE_NOT_FOUND = 65
     EXIT_FAILURE_MARKUP_NOT_FOUND = 66
 
-
-
-    def initialize(options = {}, validation_type = :crawl)
+    def initialize(options = {}, validation_type)
       @not_founds_count = 0
       @errors_count = 0
-      @options = Parser.parse(options, validation_type)
+      @options = Parser.parse(options, validation_type).to_h
       @site = @options[:site]
-      @service_url = @options[:
+      @service_url = @options[:html5_validator_service_url]
       Validator.html5_validator_service_url = @service_url if @service_url
       puts color(:note, "validating #{@site}\n", @options[:color])
     end
 
-    # @param [Hash] options
-    # :color [Boolean] color output (true, false)
-    # :exclude [String] a String used by Regexp.new
-    # :markup [Boolean] Check the markup validity
-    # :not_found [Boolean] Check for not found page (404)
-    #
-    def crawl(options = {})
-      @options = @options.to_hash.merge(options)
-      @options.merge!(ignore_links: @options[:exclude]) if @options[:exclude]
-      puts color(:warning, "No internet connection") unless internet_connection?
-
-      @crawler = spidr_crawler(@site, @options)
-      print_status_line(@crawler.history.size,
-                        @crawler.failures.size,
-                        @not_founds_count,
-                        @errors_count)
-    end
-
-    # @param [Hash] options
-    #
-    def crawl_static(options = {})
-      @options = @options.to_hash.merge(options)
-      @site = @options[:site]
-
-      files = Dir.glob(@options[:pattern])
-      files.each do |f|
-        next unless File.file?(f)
-
-        response = fake_httpresponse(open(f).read)
-        page = Spidr::Page.new(URI.join(@site, URI.encode(f)), response)
-
-        validate(page.doc, page.body, f) if @options[:markup]
-        check_static_not_found(page.links) if @options[:not_found]
-      end
-      print_status_line(files.size, 0, @not_founds_count, @errors_count)
-    end
-
     def errors?
       @errors_count > 0
     end
@@ -99,71 +62,33 @@ module ValidateWebsite
       end
     end
 
-    private
-
-    def internet_connection?
-      true if open(ValidateWebsite::Core::PING_URL)
-    rescue
-      false
-    end
-
-    def static_site_link(l)
-      link = URI.parse(URI.encode(l))
-      link = URI.join(@site, link) if link.host.nil?
-      link
-    end
-
-    def in_static_domain?(site, link)
-      URI.parse(site).host == link.host
-    end
-
-    # check files linked on static document
-    # see lib/validate_website/runner.rb
-    def check_static_not_found(links)
-      links.each_with_object(Set[]) do |l, result|
-        next if l.include?('#')
-        link = static_site_link(l)
-        next unless in_static_domain?(@site, link)
-        file_path = URI.parse(File.join(Dir.getwd, link.path || '/')).path
-        not_found_error(file_path) && next unless File.exist?(file_path)
-        # Check CSS url()
-        if File.extname(file_path) == '.css'
-          response = fake_httpresponse(open(file_path).read, ['text/css'])
-          css_page = Spidr::Page.new(l, response)
-          result.merge extract_urls_from_css(css_page)
-        end
-      end
-    end
-
-    def not_found_error(location)
-      puts "\n"
-      puts color(:error, "#{location} linked but not exist", @options[:color])
-      @not_founds_count += 1
-    end
-
     # Extract urls from CSS page
     #
     # @param [Spidr::Page] an Spidr::Page object
     # @return [Array] Lists of urls
     #
-    def extract_urls_from_css(page)
-      page.body.scan(
-        url = url.first.gsub("'",
-        abs = page.to_absolute(
-        result << abs
+    def self.extract_urls_from_css(page)
+      page.body.scan(%r{url\((['".\/\w-]+)\)}).reduce(Set[]) do |result, url|
+        url = url.first.gsub("'", '').gsub('"', '')
+        abs = page.to_absolute(url)
+        result << abs.to_s
       end
     end
 
-
-
-
-
-
-
-
-
-
-
+    private
+
+    def print_status_line(total, failures, not_founds, errors)
+      puts "\n\n"
+      puts color(:info, ["#{total} visited",
+                         "#{failures} failures",
+                         "#{not_founds} not founds",
+                         "#{errors} errors"].join(', '), options[:color])
+    end
+
+    def not_found_error(location)
+      puts "\n"
+      puts color(:error, "#{location} linked but not exist", options[:color])
+      @not_founds_count += 1
    end
 
     ##
@@ -177,65 +102,16 @@ module ValidateWebsite
       if validator.valid?
         print color(:success, '.', options[:color]) # rspec style
       else
-
-        puts "\n"
-        puts color(:error, "* #{url}", options[:color])
-        if options[:verbose]
-          puts color(:error, validator.errors.join(', '), options[:color])
-        end
-      end
-    end
-
-    # Fake http response for Spidr static crawling
-    # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
-    #
-    # @param [String] response body
-    # @param [Array] content types
-    # @return [Net::HTTPResponse] fake http response
-    def fake_httpresponse(body, content_types = ['text/html', 'text/xhtml+xml'])
-      response = Net::HTTPResponse.new '1.1', 200, 'OK'
-      response.instance_variable_set(:@read, true)
-      response.body = body
-      content_types.each do |c|
-        response.add_field('content-type', c)
+        handle_validation_error(validator, url)
       end
-      response
     end
 
-    def
-
-      puts
-
-
-
-    end
-
-    def spidr_crawler(site, options)
-      @host = URI(site).host
-      Spidr.site(site, options) do |crawler|
-        crawler.cookies[@host] = default_cookies if options[:cookies]
-        crawler.every_css_page do |page|
-          extract_urls_from_css(page).each do |u|
-            crawler.enqueue(u)
-          end
-        end
-
-        crawler.every_html_page do |page|
-          extract_imgs_from_page(page).each do |i|
-            crawler.enqueue(i)
-          end
-
-          if options[:markup] && page.html?
-            validate(page.doc, page.body, page.url, options[:ignore])
-          end
-        end
-
-        if options[:not_found]
-          crawler.every_failed_url do |url|
-            not_found_error(url)
-          end
-        end
-      end
+    def handle_validation_error(validator, url)
+      @errors_count += 1
+      puts "\n"
+      puts color(:error, "* #{url}", options[:color])
+      return unless options[:verbose]
+      puts color(:error, validator.errors.join(', '), options[:color])
     end
   end
 end
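One detail worth calling out: extract_urls_from_css becomes a class method (Core.extract_urls_from_css) so the new Crawl class below can reuse it. A minimal sketch of what its regexp/reduce logic does, using an illustrative CSS string and skipping the page.to_absolute resolution step:

require 'set'

css = %q{body { background: url("/img/bg.png") } .logo { background: url('logo-small.gif') }}
urls = css.scan(%r{url\((['".\/\w-]+)\)}).reduce(Set[]) do |result, url|
  # strip the quotes captured around each url() argument
  result << url.first.gsub("'", '').gsub('"', '')
end
urls.to_a # => ["/img/bg.png", "logo-small.gif"]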
data/lib/validate_website/crawl.rb
ADDED
@@ -0,0 +1,78 @@
+require 'validate_website/core'
+
+module ValidateWebsite
+  # Class for http website validation
+  class Crawl < Core
+    def initialize(options = {}, validation_type = :crawl)
+      super
+    end
+
+    # @param [Hash] options
+    # :color [Boolean] color output (true, false)
+    # :exclude [String] a String used by Regexp.new
+    # :markup [Boolean] Check the markup validity
+    # :not_found [Boolean] Check for not found page (404)
+    #
+    def crawl(options = {})
+      @options = @options.merge(options)
+      @options.merge!(ignore_links: @options[:exclude]) if @options[:exclude]
+
+      @crawler = spidr_crawler(@site, @options)
+      print_status_line(@crawler.history.size,
+                        @crawler.failures.size,
+                        @not_founds_count,
+                        @errors_count)
+    end
+
+    private
+
+    # Extract imgs urls from page
+    #
+    # @param [Spidr::Page] an Spidr::Page object
+    # @return [Array] Lists of urls
+    #
+    def extract_imgs_from_page(page)
+      page.doc.search('//img[@src]').reduce(Set[]) do |result, elem|
+        u = elem.attributes['src']
+        result << page.to_absolute(URI.parse(u))
+      end
+    end
+
+    def spidr_crawler(site, options)
+      @host = URI(site).host
+      Spidr.site(site, options) do |crawler|
+        crawler.cookies[@host] = default_cookies if options[:cookies]
+        on_every_css_page(crawler)
+        on_every_html_page(crawler)
+        on_every_failed_url(crawler)
+      end
+    end
+
+    def on_every_css_page(crawler)
+      crawler.every_css_page do |page|
+        ValidateWebsite::Core.extract_urls_from_css(page).each do |u|
+          crawler.enqueue(u)
+        end
+      end
+    end
+
+    def on_every_html_page(crawler)
+      crawler.every_html_page do |page|
+        extract_imgs_from_page(page).each do |i|
+          crawler.enqueue(i)
+        end
+
+        if options[:markup] && page.html?
+          validate(page.doc, page.body, page.url, options[:ignore])
+        end
+      end
+    end
+
+    def on_every_failed_url(crawler)
+      return unless options[:not_found]
+      crawler.every_failed_url do |url|
+        not_found_error(url)
+      end
+    end
+  end
+end
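For orientation, a hedged usage sketch of the new Crawl class, based only on the methods visible in this diff; the real entry point is data/lib/validate_website/runner.rb (not shown here), and the site URL and option values below are illustrative:

require 'validate_website/crawl'

# validation_type defaults to :crawl; the options Hash goes through
# Parser.parse, so keys follow Parser::DEFAULT_OPTIONS
crawl = ValidateWebsite::Crawl.new(site: 'http://localhost:4567/', color: true)
crawl.crawl(markup: true, not_found: true)
exit 1 if crawl.errors?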
data/lib/validate_website/option_parser.rb
CHANGED
@@ -4,38 +4,69 @@ require 'slop'
 module ValidateWebsite
   # Internal class for parse command line args
   class Parser
+    VALID_TYPES = [:crawl, :static].freeze
+
     DEFAULT_OPTIONS = {
+      site: 'http://localhost/',
+      pattern: '**/*.html',
+      exclude: nil,
+      user_agent: nil,
       markup: true,
       # crawler: log not found url (404 status code)
       # static: log not found url (not on filesystem, `pwd` considered
-      # as root
+      # as root " / ")
       not_found: false,
       file: nil,
       # regex to ignore certain validation errors
       ignore: nil,
       color: true,
       # internal verbose for ValidateWebsite
-      verbose: false
+      verbose: false
     }
 
-
-      site: 'http://localhost:3000/',
-      exclude: nil,
-      user_agent: nil,
-    }.merge(DEFAULT_OPTIONS)
-
-    DEFAULT_OPTIONS_STATIC = {
-      site: 'http://www.example.com/',
-      pattern: '**/*.html',
-    }.merge(DEFAULT_OPTIONS)
-
+    # Generic parse method for crawl or static options
     def self.parse(options, type)
-
-
-      if Array
+      fail ArgumentError unless VALID_TYPES.include?(type)
+      # We are in command line (ARGV)
+      if options.is_a?(Array)
        send("command_line_parse_#{type}", options)
      else
-
+        # for testing or Ruby usage with a Hash
+        DEFAULT_OPTIONS.merge(options)
+      end
+    end
+
+    def self.default_args
+      Slop.parse do |o|
+        yield o if block_given?
+        boolean_options(o)
+        o.regexp('-i', '--ignore',
+                 'Validation errors to ignore (ex: "valign|autocorrect")')
+        o.string('-5', '--html5-validator-service-url',
+                 'Change default html5 validator service URL')
+        verbose_help_options(o)
+      end
+    end
+
+    def self.boolean_options(o)
+      o.bool('-m', '--markup',
+             "Markup validation (default: #{DEFAULT_OPTIONS[:markup]})",
+             default: DEFAULT_OPTIONS[:markup])
+      o.bool('-n', '--not-found',
+             "Log not found url (default: #{DEFAULT_OPTIONS[:not_found]})",
+             default: DEFAULT_OPTIONS[:not_found])
+      o.bool('--color',
+             "Show colored output (default: #{DEFAULT_OPTIONS[:color]})",
+             default: DEFAULT_OPTIONS[:color])
+    end
+
+    def self.verbose_help_options(o)
+      o.bool('-v', '--verbose',
+             "Show validator errors (default: #{DEFAULT_OPTIONS[:verbose]})",
+             default: DEFAULT_OPTIONS[:verbose])
+      o.on('-h', '--help', 'Display this help message.') do
+        puts o
+        exit
       end
     end
 
@@ -43,28 +74,15 @@ module ValidateWebsite
     # @params [ARGV]
     # @return [Hash]
     def self.command_line_parse_crawl(_args)
-
-
-
-
-
-
-
-
-
-      on("c", "cookies=", "Set defaults cookies")
-      on("m", "markup", "Markup validation",
-         default: DEFAULT_OPTIONS_CRAWL[:markup])
-      on("i", "ignore=", "Validation errors to ignore",
-         type: :regexp)
-      on(:n, :not_found, "Log not found url",
-         default: DEFAULT_OPTIONS_CRAWL[:not_found])
-      on("color", "Show colored output",
-         default: DEFAULT_OPTIONS_CRAWL[:color])
-      on("5", "html5-validator-service-url=",
-         "Change default html5 validator service URL")
-      on("v", "verbose", "Show validator errors",
-         default: DEFAULT_OPTIONS_CRAWL[:verbose])
+      default_args do |o|
+        o.string('-s', '--site',
+                 "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})",
+                 default: DEFAULT_OPTIONS[:site])
+        o.string('-u', '--user-agent',
+                 'Change user agent',
+                 default: DEFAULT_OPTIONS[:user_agent])
+        o.regexp('-e', '--exclude', 'Url to exclude (ex: "redirect|news")')
+        o.string('-c', '--cookies', 'Set defaults cookies')
       end
     end
 
@@ -72,26 +90,13 @@ module ValidateWebsite
     # @params [ARGV]
     # @return [Hash]
     def self.command_line_parse_static(_args)
-
-
-
-
-
-
-
-      on("c", "cookies=", "Set defaults cookies")
-      on("m", "markup", "Markup validation",
-         default: DEFAULT_OPTIONS_STATIC[:markup])
-      on("i", "ignore=", "Validation errors to ignore",
-         type: :regexp)
-      on(:n, :not_found, "Log not found url",
-         default: DEFAULT_OPTIONS_STATIC[:not_found])
-      on("color", "Show colored output",
-         default: DEFAULT_OPTIONS_STATIC[:color])
-      on("5", "html5-validator-service-url=",
-         "Change default html5 validator service URL")
-      on("v", "verbose", "Show validator errors",
-         default: DEFAULT_OPTIONS_STATIC[:verbose])
+      default_args do |o|
+        o.string('-s', '--site',
+                 "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})",
+                 default: DEFAULT_OPTIONS[:site])
+        o.regexp('-p', '--pattern',
+                 "Filename pattern (default: #{DEFAULT_OPTIONS[:pattern]})",
+                 default: DEFAULT_OPTIONS[:pattern])
       end
     end
   end
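The rewritten Parser keeps a single DEFAULT_OPTIONS hash and dispatches on the argument type: a Hash (test or library usage) is merged over the defaults, while an Array switches to the Slop-based command-line parsers (note that command_line_parse_crawl/_static receive _args but call Slop.parse without them, so Slop reads ARGV directly). A short sketch of the Hash path, with illustrative values:

require 'validate_website/option_parser'

opts = ValidateWebsite::Parser.parse({ site: 'http://example.com/', not_found: true }, :static)
opts[:site]      # => "http://example.com/"
opts[:pattern]   # => "**/*.html" (from DEFAULT_OPTIONS)
opts[:not_found] # => true

# Unknown types are rejected up front:
ValidateWebsite::Parser.parse({}, :spider) # raises ArgumentError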
|