validate-website 1.0.5 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +3 -3
- data/lib/validate_website.rb +1 -0
- data/lib/validate_website/core.rb +33 -157
- data/lib/validate_website/crawl.rb +78 -0
- data/lib/validate_website/option_parser.rb +64 -59
- data/lib/validate_website/runner.rb +3 -3
- data/lib/validate_website/static.rb +102 -0
- data/lib/validate_website/validator.rb +44 -33
- data/lib/validate_website/version.rb +3 -0
- data/spec/core_spec.rb +3 -118
- data/spec/crawler_spec.rb +91 -0
- data/spec/data/w3.org-xhtml1-strict-errors.html +544 -0
- data/spec/spec_helper.rb +2 -1
- data/spec/static_spec.rb +38 -0
- data/spec/validator_spec.rb +40 -23
- data/spec/webmock_helper.rb +4 -3
- metadata +30 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 134cbdd3da2da6847c525ffe5a2ee68f1a380ae2
+  data.tar.gz: 678ff514b9f6f368bbb78e93a8dc42f38a35a803
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dd90c5dec7d0c80ea9b94abcb1a38a425ee59be32c73fe9fc97620a5a00ee4c9cf2dae52aa099509c8573f2dfca117377ed15373bce2d93ed7c25a43e6ed067e
+  data.tar.gz: 4d24ac9b1dccd744a3b7bd7ebbb18b946c05c4043a0c76898eafc543892e886715e2d99670ea91f3b9d0c3203425cf0de420445d041360c44fb67a934c6165c2
data/Rakefile
CHANGED
@@ -4,8 +4,8 @@ require 'rake/testtask'
 task default: [:test]
 
 RDoc::Task.new do |rd|
-  rd.main =
-  rd.rdoc_files.include(
+  rd.main = 'README.rdoc'
+  rd.rdoc_files.include('README.rdoc', 'lib/**/*.rb')
 end
 
 # install asciidoc libxml2-utils xmlto docbook-xsl docbook-xml
@@ -15,6 +15,6 @@ task :manpage do
 end
 
 Rake::TestTask.new do |t|
-  t.pattern =
+  t.pattern = 'spec/*_spec.rb'
 end
 task spec: :test
data/lib/validate_website.rb
CHANGED

data/lib/validate_website/core.rb
CHANGED
@@ -1,5 +1,3 @@
-# encoding: utf-8
-
 require 'set'
 require 'open-uri'
 require 'webrick/cookie'
@@ -10,7 +8,11 @@ require 'validate_website/colorful_messages'
 
 require 'spidr'
 
+# Base module ValidateWebsite
 module ValidateWebsite
+  autoload :Crawl, 'validate_website/crawl'
+  autoload :Static, 'validate_website/static'
+
   # Core class for static or website validation
   class Core
     attr_accessor :site
@@ -23,55 +25,16 @@ module ValidateWebsite
     EXIT_FAILURE_NOT_FOUND = 65
     EXIT_FAILURE_MARKUP_NOT_FOUND = 66
 
-
-
-    def initialize(options = {}, validation_type = :crawl)
+    def initialize(options = {}, validation_type)
       @not_founds_count = 0
       @errors_count = 0
-      @options = Parser.parse(options, validation_type)
+      @options = Parser.parse(options, validation_type).to_h
       @site = @options[:site]
-      @service_url = @options[:
+      @service_url = @options[:html5_validator_service_url]
       Validator.html5_validator_service_url = @service_url if @service_url
       puts color(:note, "validating #{@site}\n", @options[:color])
     end
 
-    # @param [Hash] options
-    #   :color [Boolean] color output (true, false)
-    #   :exclude [String] a String used by Regexp.new
-    #   :markup [Boolean] Check the markup validity
-    #   :not_found [Boolean] Check for not found page (404)
-    #
-    def crawl(options = {})
-      @options = @options.to_hash.merge(options)
-      @options.merge!(ignore_links: @options[:exclude]) if @options[:exclude]
-      puts color(:warning, "No internet connection") unless internet_connection?
-
-      @crawler = spidr_crawler(@site, @options)
-      print_status_line(@crawler.history.size,
-                        @crawler.failures.size,
-                        @not_founds_count,
-                        @errors_count)
-    end
-
-    # @param [Hash] options
-    #
-    def crawl_static(options = {})
-      @options = @options.to_hash.merge(options)
-      @site = @options[:site]
-
-      files = Dir.glob(@options[:pattern])
-      files.each do |f|
-        next unless File.file?(f)
-
-        response = fake_httpresponse(open(f).read)
-        page = Spidr::Page.new(URI.join(@site, URI.encode(f)), response)
-
-        validate(page.doc, page.body, f) if @options[:markup]
-        check_static_not_found(page.links) if @options[:not_found]
-      end
-      print_status_line(files.size, 0, @not_founds_count, @errors_count)
-    end
-
     def errors?
       @errors_count > 0
     end
@@ -99,71 +62,33 @@ module ValidateWebsite
       end
     end
 
-    private
-
-    def internet_connection?
-      true if open(ValidateWebsite::Core::PING_URL)
-    rescue
-      false
-    end
-
-    def static_site_link(l)
-      link = URI.parse(URI.encode(l))
-      link = URI.join(@site, link) if link.host.nil?
-      link
-    end
-
-    def in_static_domain?(site, link)
-      URI.parse(site).host == link.host
-    end
-
-    # check files linked on static document
-    # see lib/validate_website/runner.rb
-    def check_static_not_found(links)
-      links.each_with_object(Set[]) do |l, result|
-        next if l.include?('#')
-        link = static_site_link(l)
-        next unless in_static_domain?(@site, link)
-        file_path = URI.parse(File.join(Dir.getwd, link.path || '/')).path
-        not_found_error(file_path) && next unless File.exist?(file_path)
-        # Check CSS url()
-        if File.extname(file_path) == '.css'
-          response = fake_httpresponse(open(file_path).read, ['text/css'])
-          css_page = Spidr::Page.new(l, response)
-          result.merge extract_urls_from_css(css_page)
-        end
-      end
-    end
-
-    def not_found_error(location)
-      puts "\n"
-      puts color(:error, "#{location} linked but not exist", @options[:color])
-      @not_founds_count += 1
-    end
-
     # Extract urls from CSS page
     #
     # @param [Spidr::Page] an Spidr::Page object
     # @return [Array] Lists of urls
     #
-    def extract_urls_from_css(page)
-      page.body.scan(
-      url = url.first.gsub("'",
-      abs = page.to_absolute(
-      result << abs
+    def self.extract_urls_from_css(page)
+      page.body.scan(%r{url\((['".\/\w-]+)\)}).reduce(Set[]) do |result, url|
+        url = url.first.gsub("'", '').gsub('"', '')
+        abs = page.to_absolute(url)
+        result << abs.to_s
       end
     end
 
-
-
-
-
-
-
-
-
-
-
+    private
+
+    def print_status_line(total, failures, not_founds, errors)
+      puts "\n\n"
+      puts color(:info, ["#{total} visited",
+                         "#{failures} failures",
+                         "#{not_founds} not founds",
+                         "#{errors} errors"].join(', '), options[:color])
+    end
+
+    def not_found_error(location)
+      puts "\n"
+      puts color(:error, "#{location} linked but not exist", options[:color])
+      @not_founds_count += 1
     end
 
     ##
@@ -177,65 +102,16 @@ module ValidateWebsite
       if validator.valid?
         print color(:success, '.', options[:color]) # rspec style
       else
-
-        puts "\n"
-        puts color(:error, "* #{url}", options[:color])
-        if options[:verbose]
-          puts color(:error, validator.errors.join(', '), options[:color])
-        end
-      end
-    end
-
-    # Fake http response for Spidr static crawling
-    # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
-    #
-    # @param [String] response body
-    # @param [Array] content types
-    # @return [Net::HTTPResponse] fake http response
-    def fake_httpresponse(body, content_types = ['text/html', 'text/xhtml+xml'])
-      response = Net::HTTPResponse.new '1.1', 200, 'OK'
-      response.instance_variable_set(:@read, true)
-      response.body = body
-      content_types.each do |c|
-        response.add_field('content-type', c)
+        handle_validation_error(validator, url)
       end
-      response
     end
 
-    def
-
-      puts
-
-
-
-    end
-
-    def spidr_crawler(site, options)
-      @host = URI(site).host
-      Spidr.site(site, options) do |crawler|
-        crawler.cookies[@host] = default_cookies if options[:cookies]
-        crawler.every_css_page do |page|
-          extract_urls_from_css(page).each do |u|
-            crawler.enqueue(u)
-          end
-        end
-
-        crawler.every_html_page do |page|
-          extract_imgs_from_page(page).each do |i|
-            crawler.enqueue(i)
-          end
-
-          if options[:markup] && page.html?
-            validate(page.doc, page.body, page.url, options[:ignore])
-          end
-        end
-
-        if options[:not_found]
-          crawler.every_failed_url do |url|
-            not_found_error(url)
-          end
-        end
-      end
+    def handle_validation_error(validator, url)
+      @errors_count += 1
+      puts "\n"
+      puts color(:error, "* #{url}", options[:color])
+      return unless options[:verbose]
+      puts color(:error, validator.errors.join(', '), options[:color])
     end
   end
 end
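
The refactored core.rb above keeps only the shared pieces: extract_urls_from_css becomes a class method, status reporting and error handling become private helpers, and the crawl/static entry points move to the new Crawl and Static subclasses. A minimal sketch of the url() extraction that self.extract_urls_from_css performs, run here on a plain CSS string instead of a Spidr::Page (the stylesheet content is made up):

    require 'set'

    css = "body { background: url('/img/bg.png') } .logo { background: url(\"logo.svg\") }"

    # Same scan/reduce shape as Core.extract_urls_from_css, minus the
    # page.to_absolute call that needs a real Spidr::Page.
    urls = css.scan(%r{url\((['".\/\w-]+)\)}).reduce(Set[]) do |result, url|
      result << url.first.gsub("'", '').gsub('"', '')
    end

    urls.to_a # => ["/img/bg.png", "logo.svg"]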

data/lib/validate_website/crawl.rb
ADDED
@@ -0,0 +1,78 @@
+require 'validate_website/core'
+
+module ValidateWebsite
+  # Class for http website validation
+  class Crawl < Core
+    def initialize(options = {}, validation_type = :crawl)
+      super
+    end
+
+    # @param [Hash] options
+    #   :color [Boolean] color output (true, false)
+    #   :exclude [String] a String used by Regexp.new
+    #   :markup [Boolean] Check the markup validity
+    #   :not_found [Boolean] Check for not found page (404)
+    #
+    def crawl(options = {})
+      @options = @options.merge(options)
+      @options.merge!(ignore_links: @options[:exclude]) if @options[:exclude]
+
+      @crawler = spidr_crawler(@site, @options)
+      print_status_line(@crawler.history.size,
+                        @crawler.failures.size,
+                        @not_founds_count,
+                        @errors_count)
+    end
+
+    private
+
+    # Extract imgs urls from page
+    #
+    # @param [Spidr::Page] an Spidr::Page object
+    # @return [Array] Lists of urls
+    #
+    def extract_imgs_from_page(page)
+      page.doc.search('//img[@src]').reduce(Set[]) do |result, elem|
+        u = elem.attributes['src']
+        result << page.to_absolute(URI.parse(u))
+      end
+    end
+
+    def spidr_crawler(site, options)
+      @host = URI(site).host
+      Spidr.site(site, options) do |crawler|
+        crawler.cookies[@host] = default_cookies if options[:cookies]
+        on_every_css_page(crawler)
+        on_every_html_page(crawler)
+        on_every_failed_url(crawler)
+      end
+    end
+
+    def on_every_css_page(crawler)
+      crawler.every_css_page do |page|
+        ValidateWebsite::Core.extract_urls_from_css(page).each do |u|
+          crawler.enqueue(u)
+        end
+      end
+    end
+
+    def on_every_html_page(crawler)
+      crawler.every_html_page do |page|
+        extract_imgs_from_page(page).each do |i|
+          crawler.enqueue(i)
+        end
+
+        if options[:markup] && page.html?
+          validate(page.doc, page.body, page.url, options[:ignore])
+        end
+      end
+    end
+
+    def on_every_failed_url(crawler)
+      return unless options[:not_found]
+      crawler.every_failed_url do |url|
+        not_found_error(url)
+      end
+    end
+  end
+end
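
With the Crawl subclass in place, programmatic crawling would presumably look like the sketch below; the localhost URL is an assumption (crawl drives a real Spidr crawl, so the site must be reachable), and errors? comes from Core above:

    require 'validate_website/crawl'

    # A Hash here goes through Parser.parse, which merges it over
    # DEFAULT_OPTIONS (see option_parser.rb below).
    validator = ValidateWebsite::Crawl.new(site: 'http://localhost:3000/', # assumed local server
                                           markup: true,
                                           not_found: true)
    validator.crawl
    puts 'validation errors found' if validator.errors?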

data/lib/validate_website/option_parser.rb
CHANGED
@@ -4,38 +4,69 @@ require 'slop'
 module ValidateWebsite
   # Internal class for parse command line args
   class Parser
+    VALID_TYPES = [:crawl, :static].freeze
+
     DEFAULT_OPTIONS = {
+      site: 'http://localhost/',
+      pattern: '**/*.html',
+      exclude: nil,
+      user_agent: nil,
       markup: true,
       # crawler: log not found url (404 status code)
       # static: log not found url (not on filesystem, `pwd` considered
-      # as root
+      # as root " / ")
       not_found: false,
       file: nil,
       # regex to ignore certain validation errors
       ignore: nil,
       color: true,
       # internal verbose for ValidateWebsite
-      verbose: false
+      verbose: false
     }
 
-
-      site: 'http://localhost:3000/',
-      exclude: nil,
-      user_agent: nil,
-    }.merge(DEFAULT_OPTIONS)
-
-    DEFAULT_OPTIONS_STATIC = {
-      site: 'http://www.example.com/',
-      pattern: '**/*.html',
-    }.merge(DEFAULT_OPTIONS)
-
+    # Generic parse method for crawl or static options
     def self.parse(options, type)
-
-
-      if Array
+      fail ArgumentError unless VALID_TYPES.include?(type)
+      # We are in command line (ARGV)
+      if options.is_a?(Array)
         send("command_line_parse_#{type}", options)
       else
-
+        # for testing or Ruby usage with a Hash
+        DEFAULT_OPTIONS.merge(options)
+      end
+    end
+
+    def self.default_args
+      Slop.parse do |o|
+        yield o if block_given?
+        boolean_options(o)
+        o.regexp('-i', '--ignore',
+                 'Validation errors to ignore (ex: "valign|autocorrect")')
+        o.string('-5', '--html5-validator-service-url',
+                 'Change default html5 validator service URL')
+        verbose_help_options(o)
+      end
+    end
+
+    def self.boolean_options(o)
+      o.bool('-m', '--markup',
+             "Markup validation (default: #{DEFAULT_OPTIONS[:markup]})",
+             default: DEFAULT_OPTIONS[:markup])
+      o.bool('-n', '--not-found',
+             "Log not found url (default: #{DEFAULT_OPTIONS[:not_found]})",
+             default: DEFAULT_OPTIONS[:not_found])
+      o.bool('--color',
+             "Show colored output (default: #{DEFAULT_OPTIONS[:color]})",
+             default: DEFAULT_OPTIONS[:color])
+    end
+
+    def self.verbose_help_options(o)
+      o.bool('-v', '--verbose',
+             "Show validator errors (default: #{DEFAULT_OPTIONS[:verbose]})",
+             default: DEFAULT_OPTIONS[:verbose])
+      o.on('-h', '--help', 'Display this help message.') do
+        puts o
+        exit
       end
     end
 
@@ -43,28 +74,15 @@ module ValidateWebsite
     # @params [ARGV]
     # @return [Hash]
     def self.command_line_parse_crawl(_args)
-
-
-
-
-
-
-
-
-
-      on("c", "cookies=", "Set defaults cookies")
-      on("m", "markup", "Markup validation",
-         default: DEFAULT_OPTIONS_CRAWL[:markup])
-      on("i", "ignore=", "Validation errors to ignore",
-         type: :regexp)
-      on(:n, :not_found, "Log not found url",
-         default: DEFAULT_OPTIONS_CRAWL[:not_found])
-      on("color", "Show colored output",
-         default: DEFAULT_OPTIONS_CRAWL[:color])
-      on("5", "html5-validator-service-url=",
-         "Change default html5 validator service URL")
-      on("v", "verbose", "Show validator errors",
-         default: DEFAULT_OPTIONS_CRAWL[:verbose])
+      default_args do |o|
+        o.string('-s', '--site',
+                 "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})",
+                 default: DEFAULT_OPTIONS[:site])
+        o.string('-u', '--user-agent',
+                 'Change user agent',
+                 default: DEFAULT_OPTIONS[:user_agent])
+        o.regexp('-e', '--exclude', 'Url to exclude (ex: "redirect|news")')
+        o.string('-c', '--cookies', 'Set defaults cookies')
       end
     end
 
@@ -72,26 +90,13 @@ module ValidateWebsite
     # @params [ARGV]
     # @return [Hash]
     def self.command_line_parse_static(_args)
-
-
-
-
-
-
-
-      on("c", "cookies=", "Set defaults cookies")
-      on("m", "markup", "Markup validation",
-         default: DEFAULT_OPTIONS_STATIC[:markup])
-      on("i", "ignore=", "Validation errors to ignore",
-         type: :regexp)
-      on(:n, :not_found, "Log not found url",
-         default: DEFAULT_OPTIONS_STATIC[:not_found])
-      on("color", "Show colored output",
-         default: DEFAULT_OPTIONS_STATIC[:color])
-      on("5", "html5-validator-service-url=",
-         "Change default html5 validator service URL")
-      on("v", "verbose", "Show validator errors",
-         default: DEFAULT_OPTIONS_STATIC[:verbose])
+      default_args do |o|
+        o.string('-s', '--site',
+                 "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})",
+                 default: DEFAULT_OPTIONS[:site])
+        o.regexp('-p', '--pattern',
+                 "Filename pattern (default: #{DEFAULT_OPTIONS[:pattern]})",
+                 default: DEFAULT_OPTIONS[:pattern])
+      end
     end
   end
 end
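
Per the rewritten Parser above, parse now accepts either a plain Hash (merged over DEFAULT_OPTIONS, the path used from Ruby and in tests) or an ARGV-style array (handed to the Slop-based command_line_parse_* methods, which return a Slop result that Core converts with to_h), and anything outside VALID_TYPES raises. A short sketch with illustrative values:

    require 'validate_website/option_parser'

    # Hash form: defaults plus overrides, returned as a Hash.
    opts = ValidateWebsite::Parser.parse({ site: 'http://localhost:8080/' }, :crawl)
    opts[:site]   # => "http://localhost:8080/"
    opts[:markup] # => true (from DEFAULT_OPTIONS)

    # ARGV form: parsed by Slop; Core calls .to_h on the result.
    cli = ValidateWebsite::Parser.parse(%w[--site http://localhost:8080/ --not-found], :crawl)
    cli.to_h[:not_found] # => true

    # Unknown validation types are rejected.
    ValidateWebsite::Parser.parse({}, :ftp) # raises ArgumentError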
|