validate-website 1.0.5 → 1.1.0
- checksums.yaml +4 -4
- data/Rakefile +3 -3
- data/lib/validate_website.rb +1 -0
- data/lib/validate_website/core.rb +33 -157
- data/lib/validate_website/crawl.rb +78 -0
- data/lib/validate_website/option_parser.rb +64 -59
- data/lib/validate_website/runner.rb +3 -3
- data/lib/validate_website/static.rb +102 -0
- data/lib/validate_website/validator.rb +44 -33
- data/lib/validate_website/version.rb +3 -0
- data/spec/core_spec.rb +3 -118
- data/spec/crawler_spec.rb +91 -0
- data/spec/data/w3.org-xhtml1-strict-errors.html +544 -0
- data/spec/spec_helper.rb +2 -1
- data/spec/static_spec.rb +38 -0
- data/spec/validator_spec.rb +40 -23
- data/spec/webmock_helper.rb +4 -3
- metadata +30 -8
data/lib/validate_website/runner.rb
CHANGED
@@ -12,15 +12,15 @@ module ValidateWebsite
 
     def self.run_crawl(args)
       trap_interrupt
-      validate_website = ValidateWebsite::
+      validate_website = ValidateWebsite::Crawl.new(args)
       validate_website.crawl
       validate_website.exit_status
     end
 
     def self.run_static(args)
       trap_interrupt
-      validate_website = ValidateWebsite::
-      validate_website.
+      validate_website = ValidateWebsite::Static.new(args)
+      validate_website.crawl
       validate_website.exit_status
     end
   end
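Note on usage: the runner now dispatches to two dedicated classes, ValidateWebsite::Crawl for live sites and ValidateWebsite::Static for files on disk, instead of a single Core object configured with a validation type. Below is a minimal sketch of a command-line entry point built on it; the bin/ scripts are not part of this diff, so the file name and the assumption that the runner is handed raw ARGV are illustrative only.

#!/usr/bin/env ruby
# Hypothetical bin/validate-website entry point (not shown in this diff).
# Runner.run_crawl traps SIGINT, builds ValidateWebsite::Crawl from the
# given arguments, crawls the site and returns an exit status.
require 'validate_website/runner'

exit ValidateWebsite::Runner.run_crawl(ARGV)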
data/lib/validate_website/static.rb
ADDED
@@ -0,0 +1,102 @@
+require 'validate_website/core'
+
+module ValidateWebsite
+  # Class for validation Static website
+  class Static < Core
+    CONTENT_TYPES = ['text/html', 'text/xhtml+xml']
+
+    def initialize(options = {}, validation_type = :static)
+      super
+    end
+
+    # @param [Hash] options
+    #
+    def crawl(options = {})
+      @options = @options.merge(options)
+      @site = @options[:site]
+
+      files = Dir.glob(@options[:pattern])
+      files.each do |f|
+        next unless File.file?(f)
+        check_static_file(f)
+      end
+      print_status_line(files.size, 0, @not_founds_count, @errors_count)
+    end
+
+    private
+
+    def generate_static_page(f)
+      response = self.class.fake_httpresponse(open(f).read)
+      Spidr::Page.new(URI.join(@site, URI.encode(f)), response)
+    end
+
+    def check_static_file(f)
+      page = generate_static_page(f)
+      validate(page.doc, page.body, f, @options[:ignore]) if @options[:markup]
+      check_static_not_found(page.links) if @options[:not_found]
+    end
+
+    StaticLink = Struct.new(:link, :site) do
+      def link_uri
+        @link_uri = URI.parse(URI.encode(link))
+        @link_uri = URI.join(site, @link_uri) if @link_uri.host.nil?
+        @link_uri
+      end
+
+      def in_static_domain?
+        URI.parse(site).host == link_uri.host
+      end
+
+      def extract_urls_from_fake_css_response
+        response = ValidateWebsite::Static.fake_httpresponse(
+          open(file_path).read,
+          ['text/css'])
+        css_page = Spidr::Page.new(link_uri, response)
+        ValidateWebsite::Core.extract_urls_from_css(css_page)
+      end
+
+      def file_path
+        @file_path ||= URI.parse(
+          File.join(Dir.getwd, link_uri.path || '/')
+        ).path
+      end
+
+      def extname
+        @extname ||= File.extname(file_path)
+      end
+
+      def check?
+        !link.include?('#') && in_static_domain?
+      end
+    end
+
+    # check files linked on static document
+    # see lib/validate_website/runner.rb
+    def check_static_not_found(links)
+      static_links = links.map { |l| StaticLink.new(l, @site) }
+      static_links.each do |static_link|
+        next unless static_link.check?
+        not_found_error(static_link.file_path) &&
+          next unless File.exist?(static_link.file_path)
+        next unless static_link.extname == '.css'
+        check_static_not_found static_link.extract_urls_from_fake_css_response
+      end
+    end
+
+    # Fake http response for Spidr static crawling
+    # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
+    #
+    # @param [String] response body
+    # @param [Array] content types
+    # @return [Net::HTTPResponse] fake http response
+    def self.fake_httpresponse(body, content_types = CONTENT_TYPES)
+      response = Net::HTTPResponse.new '1.1', 200, 'OK'
+      response.instance_variable_set(:@read, true)
+      response.body = body
+      content_types.each do |c|
+        response.add_field('content-type', c)
+      end
+      response
+    end
+  end
+end
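A rough usage sketch for the new Static class: crawl globs the local files matching options[:pattern], wraps each one in a fake 200 Net::HTTPResponse (fake_httpresponse) so Spidr can treat it as a crawled page, then validates the markup and/or checks that every same-domain link resolves to an existing file, recursing into CSS url() references. The option names below come from the code above; the concrete values are made up for illustration.

require 'validate_website/static'

# Hypothetical invocation: :site is the base URI used to resolve relative
# links, :pattern selects the files to check from the current directory.
static = ValidateWebsite::Static.new(site: 'http://www.example.com/',
                                     pattern: '**/*.html',
                                     markup: true,
                                     not_found: true)
static.crawl
exit static.exit_status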
data/lib/validate_website/validator.rb
CHANGED
@@ -1,6 +1,8 @@
 # encoding: utf-8
 require 'uri'
 require 'nokogiri'
+require 'net/http'
+require 'multipart_body'
 
 module ValidateWebsite
   # Document validation from DTD or XSD (webservice for html5)
@@ -12,7 +14,7 @@ module ValidateWebsite
       attr_accessor :html5_validator_service_url
     end
 
-    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd
+    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd
 
     ##
     # @param [Nokogiri::HTML::Document] original_doc
@@ -24,18 +26,19 @@ module ValidateWebsite
       @body = body
       @ignore = ignore
       @dtd = @original_doc.internal_subset
-      init_namespace(@dtd)
-      @errors = []
+      @namespace = init_namespace(@dtd)
     end
 
     ##
     # @return [Boolean]
     def valid?
+      find_errors
       errors.length == 0
     end
 
+    # @return [Array] of errors
     def errors
-
+      @errors.map!(&:to_s)
       @ignore ? @errors.reject { |e| @ignore =~ e } : @errors
     end
 
@@ -47,7 +50,7 @@ module ValidateWebsite
       return unless dtd_uri.path
       @dtd_uri = dtd_uri
       # http://www.w3.org/TR/xhtml1/#dtds
-
+      File.basename(@dtd_uri.path, '.dtd')
     end
 
     def document
@@ -59,51 +62,59 @@ module ValidateWebsite
       end
     end
 
-
-
-
-        cfg.noent.dtdload.dtdvalid
-      }
-    end
-
-    # http://www.w3.org/TR/xhtml1-schema/
-    @xsd = Dir.chdir(XHTML_PATH) do
+    # http://www.w3.org/TR/xhtml1-schema/
+    def xsd
+      @xsd ||= Dir.chdir(XHTML_PATH) do
         if @namespace && File.exist?(@namespace + '.xsd')
           Nokogiri::XML::Schema(File.read(@namespace + '.xsd'))
         end
       end
+    end
 
-
-
-
-
+    # @return [Array] contain result errors
+    def validate(xml_doc, document_body)
+      if xsd
+        xsd.validate(xml_doc)
+      elsif document_body =~ /^\<!DOCTYPE html\>/i
+        html5_validate(document_body)
       else
         # dont have xsd fall back to dtd
-
+        Dir.chdir(XHTML_PATH) do
           Nokogiri::HTML.parse(document)
-        end
-        @errors = @doc.errors
+        end.errors
       end
+    end
 
+    # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
+    def find_errors
+      doc = Dir.chdir(XHTML_PATH) do
+        Nokogiri::XML(document) { |cfg| cfg.noent.dtdload.dtdvalid }
+      end
+      @errors = validate(doc, document)
    rescue Nokogiri::XML::SyntaxError => e
-      # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
      @errors << e
    end
 
-    def
-
-
+    def html5_headers(multipart)
+      {
+        'Content-Type' => "multipart/form-data; boundary=#{multipart.boundary}",
+        'Content-Length' => multipart.to_s.bytesize.to_s
+      }
+    end
+
+    def html5_body(document)
       url = URI.parse(self.class.html5_validator_service_url)
       multipart = MultipartBody.new(content: document)
       http = Net::HTTP.new(url.host, url.port)
-
-
-
-
-
-
-
-
+      http.start do |con|
+        con.post(url.path, multipart.to_s, html5_headers(multipart))
+      end.body
+    end
+
+    def html5_validate(document)
+      validator_document = Nokogiri::HTML(html5_body(document))
+      errors = validator_document.css('h2.invalid').map(&:content)
+      errors.concat validator_document.css('ol li.error').map(&:content)
     end
   end
 end
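The reworked Validator computes errors lazily: valid? calls find_errors, which parses the body with DTD loading enabled and hands the result to validate, which in turn picks the XHTML XSD schema, the HTML5 validator web service (for a <!DOCTYPE html> body), or plain DTD-based parsing. A minimal sketch of standalone use follows; the constructor signature (original_doc, body, ignore) is inferred from the assignments in the hunk above and may differ in detail.

require 'nokogiri'
require 'validate_website/validator'

body = File.read('index.html')   # any local document, for illustration
doc  = Nokogiri::HTML(body)

validator = ValidateWebsite::Validator.new(doc, body)
if validator.valid?
  puts 'valid'
else
  puts validator.errors           # errors are returned as strings
end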
data/spec/core_spec.rb
CHANGED
@@ -1,125 +1,10 @@
-
-require File.expand_path('../spec_helper', __FILE__)
+require_relative 'spec_helper'
 
 describe ValidateWebsite::Core do
-  before do
-    WebMock.reset!
-    stub_request(:get, ValidateWebsite::Core::PING_URL).to_return(status: 200)
-    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
-    @validate_website = ValidateWebsite::Core.new(color: false)
-  end
-
   describe 'invalid options' do
     it 'raise ArgumentError on wrong validation_type' do
-      proc {
-
-      }.must_raise ArgumentError
-    end
-  end
-
-  describe 'options' do
-    it 'can change user-agent' do
-      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
-Gecko/20100101 Firefox/29.0}
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, user_agent: ua },
-                                    :crawl)
-      v.crawl
-      v.crawler.user_agent.must_equal ua
-    end
-
-    it 'can change html5 validator service url' do
-      s = 'http://localhost:8888/'
-      ValidateWebsite::Core.new({ site: SPEC_DOMAIN,
-                                  :'html5-validator-service-url' => s })
-      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
-    end
-  end
-
-  describe('cookies') do
-    it 'can set cookies' do
-      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, cookies: cookies },
-                                    :crawl)
-      v.crawl
-      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
-    end
-  end
-
-  describe('html') do
-    it "extract url" do
-      name = 'xhtml1-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it 'extract link' do
-      name = 'html4-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 98
-    end
-  end
-
-  describe('css') do
-    it "crawl css and extract url" do
-      page = FakePage.new('test.css',
-                          body: '.t {background-image: url(pouet);}
-                                 .t {background-image: url(/image/pouet.png)}
-                                 .t {background-image: url(/image/pouet_42.png)}
-                                 .t {background-image: url(/image/pouet)}',
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it "should extract url with single quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url('pouet');}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-
-    it "should extract url with double quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url(\"pouet\");}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-  end
-
-  describe('static') do
-    it 'no space in directory name' do
-      pattern = File.join(File.dirname(__FILE__), 'example/**/*.html')
-      @validate_website.crawl_static(pattern: pattern,
-                                     site: 'http://dev.af83.com/',
-                                     markup: false,
-                                     not_found: false)
-      @validate_website.not_founds_count.must_equal 0
-    end
-
-    it 'not found' do
-      pattern = File.join(File.dirname(__FILE__), '**/*.html')
-      Dir.chdir('spec/data') do
-        @validate_website.crawl_static(pattern: pattern,
-                                       site: 'https://linuxfr.org/',
-                                       markup: false,
-                                       not_found: true)
-        @validate_website.not_founds_count.must_equal 448
-      end
+      proc { ValidateWebsite::Core.new({ color: false }, :fail) }
+        .must_raise ArgumentError
     end
   end
 end
data/spec/crawler_spec.rb
ADDED
@@ -0,0 +1,91 @@
+require_relative 'spec_helper'
+
+describe ValidateWebsite::Crawl do
+  before do
+    WebMock.reset!
+    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
+    @validate_website = ValidateWebsite::Crawl.new(color: false)
+  end
+
+  describe 'options' do
+    it 'can change user-agent' do
+      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
+Gecko/20100101 Firefox/29.0}
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, user_agent: ua)
+      v.crawl
+      v.crawler.user_agent.must_equal ua
+    end
+
+    it 'can change html5 validator service url' do
+      s = 'http://localhost:8888/'
+      ValidateWebsite::Crawl.new(site: SPEC_DOMAIN,
+                                 html5_validator_service_url: s)
+      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
+    end
+  end
+
+  describe('cookies') do
+    it 'can set cookies' do
+      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, cookies: cookies)
+      v.crawl
+      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
+    end
+  end
+
+  describe('html') do
+    it 'extract url' do
+      name = 'xhtml1-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'extract link' do
+      name = 'html4-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 98
+    end
+  end
+
+  describe('css') do
+    it 'crawl css and extract url' do
+      page = FakePage.new('test.css',
+                          body: '.t {background-image: url(pouet);}
+                                 .t {background-image: url(/image/pouet.png)}
+                                 .t {background-image: url(/image/pouet_42.png)}
+                                 .t {background-image: url(/image/pouet)}',
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'should extract url with single quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url('pouet');}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+
+    it 'should extract url with double quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url(\"pouet\");}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+  end
+end