validate-website 1.0.5 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +3 -3
- data/lib/validate_website.rb +1 -0
- data/lib/validate_website/core.rb +33 -157
- data/lib/validate_website/crawl.rb +78 -0
- data/lib/validate_website/option_parser.rb +64 -59
- data/lib/validate_website/runner.rb +3 -3
- data/lib/validate_website/static.rb +102 -0
- data/lib/validate_website/validator.rb +44 -33
- data/lib/validate_website/version.rb +3 -0
- data/spec/core_spec.rb +3 -118
- data/spec/crawler_spec.rb +91 -0
- data/spec/data/w3.org-xhtml1-strict-errors.html +544 -0
- data/spec/spec_helper.rb +2 -1
- data/spec/static_spec.rb +38 -0
- data/spec/validator_spec.rb +40 -23
- data/spec/webmock_helper.rb +4 -3
- metadata +30 -8
data/lib/validate_website/runner.rb
CHANGED
@@ -12,15 +12,15 @@ module ValidateWebsite
 
   def self.run_crawl(args)
     trap_interrupt
-    validate_website = ValidateWebsite::Core.new(args, :crawl)
+    validate_website = ValidateWebsite::Crawl.new(args)
    validate_website.crawl
     validate_website.exit_status
   end
 
   def self.run_static(args)
     trap_interrupt
-    validate_website = ValidateWebsite::Core.new(args, :static)
-    validate_website.crawl_static
+    validate_website = ValidateWebsite::Static.new(args)
+    validate_website.crawl
     validate_website.exit_status
   end
 end
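Both entry points now share one shape: build a validator object, call `crawl`, return `exit_status`. A hedged sketch of calling them directly; the `ValidateWebsite::Runner` constant and the option keys are assumptions drawn from the file name and the specs further down, since the hunk above only shows the method bodies:

```ruby
# Hypothetical direct use of the 1.1.0 runner entry points.
# Runner constant and option keys (:site, :pattern) are assumptions.
require 'validate_website/runner'

crawl_status  = ValidateWebsite::Runner.run_crawl(site: 'http://localhost/')
static_status = ValidateWebsite::Runner.run_static(site: 'http://localhost/',
                                                   pattern: '**/*.html')
exit [crawl_status, static_status].max
```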
data/lib/validate_website/static.rb
ADDED
@@ -0,0 +1,102 @@
+require 'validate_website/core'
+
+module ValidateWebsite
+  # Class for validation Static website
+  class Static < Core
+    CONTENT_TYPES = ['text/html', 'text/xhtml+xml']
+
+    def initialize(options = {}, validation_type = :static)
+      super
+    end
+
+    # @param [Hash] options
+    #
+    def crawl(options = {})
+      @options = @options.merge(options)
+      @site = @options[:site]
+
+      files = Dir.glob(@options[:pattern])
+      files.each do |f|
+        next unless File.file?(f)
+        check_static_file(f)
+      end
+      print_status_line(files.size, 0, @not_founds_count, @errors_count)
+    end
+
+    private
+
+    def generate_static_page(f)
+      response = self.class.fake_httpresponse(open(f).read)
+      Spidr::Page.new(URI.join(@site, URI.encode(f)), response)
+    end
+
+    def check_static_file(f)
+      page = generate_static_page(f)
+      validate(page.doc, page.body, f, @options[:ignore]) if @options[:markup]
+      check_static_not_found(page.links) if @options[:not_found]
+    end
+
+    StaticLink = Struct.new(:link, :site) do
+      def link_uri
+        @link_uri = URI.parse(URI.encode(link))
+        @link_uri = URI.join(site, @link_uri) if @link_uri.host.nil?
+        @link_uri
+      end
+
+      def in_static_domain?
+        URI.parse(site).host == link_uri.host
+      end
+
+      def extract_urls_from_fake_css_response
+        response = ValidateWebsite::Static.fake_httpresponse(
+          open(file_path).read,
+          ['text/css'])
+        css_page = Spidr::Page.new(link_uri, response)
+        ValidateWebsite::Core.extract_urls_from_css(css_page)
+      end
+
+      def file_path
+        @file_path ||= URI.parse(
+          File.join(Dir.getwd, link_uri.path || '/')
+        ).path
+      end
+
+      def extname
+        @extname ||= File.extname(file_path)
+      end
+
+      def check?
+        !link.include?('#') && in_static_domain?
+      end
+    end
+
+    # check files linked on static document
+    # see lib/validate_website/runner.rb
+    def check_static_not_found(links)
+      static_links = links.map { |l| StaticLink.new(l, @site) }
+      static_links.each do |static_link|
+        next unless static_link.check?
+        not_found_error(static_link.file_path) &&
+          next unless File.exist?(static_link.file_path)
+        next unless static_link.extname == '.css'
+        check_static_not_found static_link.extract_urls_from_fake_css_response
+      end
+    end
+
+    # Fake http response for Spidr static crawling
+    # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
+    #
+    # @param [String] response body
+    # @param [Array] content types
+    # @return [Net::HTTPResponse] fake http response
+    def self.fake_httpresponse(body, content_types = CONTENT_TYPES)
+      response = Net::HTTPResponse.new '1.1', 200, 'OK'
+      response.instance_variable_set(:@read, true)
+      response.body = body
+      content_types.each do |c|
+        response.add_field('content-type', c)
+      end
+      response
+    end
+  end
+end
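The piece that makes static validation work is `fake_httpresponse`: it wraps an on-disk file in a hand-built `Net::HTTPResponse` so Spidr's page and link extraction runs without a web server. A condensed sketch of what `generate_static_page` does, assuming a local `index.html` and an arbitrary base URL:

```ruby
require 'uri'
require 'spidr'
require 'validate_website/static'

# Fake a 200 OK response whose body is the file on disk ...
response = ValidateWebsite::Static.fake_httpresponse(File.read('index.html'))
# ... then hand it to Spidr as if it had been fetched from the target site.
page = Spidr::Page.new(URI.join('http://example.com/', 'index.html'), response)
page.links.each { |link| puts link } # links extracted with no HTTP traffic
```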
data/lib/validate_website/validator.rb
CHANGED
@@ -1,6 +1,8 @@
 # encoding: utf-8
 require 'uri'
 require 'nokogiri'
+require 'net/http'
+require 'multipart_body'
 
 module ValidateWebsite
   # Document validation from DTD or XSD (webservice for html5)
@@ -12,7 +14,7 @@ module ValidateWebsite
       attr_accessor :html5_validator_service_url
     end
 
-    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd
+    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd
 
     ##
     # @param [Nokogiri::HTML::Document] original_doc
@@ -24,18 +26,19 @@ module ValidateWebsite
       @body = body
       @ignore = ignore
       @dtd = @original_doc.internal_subset
-      init_namespace(@dtd)
-      @errors = []
+      @namespace = init_namespace(@dtd)
     end
 
     ##
     # @return [Boolean]
     def valid?
+      find_errors
       errors.length == 0
     end
 
+    # @return [Array] of errors
     def errors
-
+      @errors.map!(&:to_s)
       @ignore ? @errors.reject { |e| @ignore =~ e } : @errors
     end
 
@@ -47,7 +50,7 @@ module ValidateWebsite
       return unless dtd_uri.path
       @dtd_uri = dtd_uri
       # http://www.w3.org/TR/xhtml1/#dtds
-
+      File.basename(@dtd_uri.path, '.dtd')
     end
 
     def document
@@ -59,51 +62,59 @@ module ValidateWebsite
       end
     end
 
-
-
-
-          cfg.noent.dtdload.dtdvalid
-        }
-      end
-
-      # http://www.w3.org/TR/xhtml1-schema/
-      @xsd = Dir.chdir(XHTML_PATH) do
+    # http://www.w3.org/TR/xhtml1-schema/
+    def xsd
+      @xsd ||= Dir.chdir(XHTML_PATH) do
         if @namespace && File.exist?(@namespace + '.xsd')
           Nokogiri::XML::Schema(File.read(@namespace + '.xsd'))
         end
       end
+    end
 
-
-
-
-
+    # @return [Array] contain result errors
+    def validate(xml_doc, document_body)
+      if xsd
+        xsd.validate(xml_doc)
+      elsif document_body =~ /^\<!DOCTYPE html\>/i
+        html5_validate(document_body)
       else
         # dont have xsd fall back to dtd
-
+        Dir.chdir(XHTML_PATH) do
           Nokogiri::HTML.parse(document)
-        end
-      @errors = @doc.errors
+        end.errors
       end
+    end
 
+    # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
+    def find_errors
+      doc = Dir.chdir(XHTML_PATH) do
+        Nokogiri::XML(document) { |cfg| cfg.noent.dtdload.dtdvalid }
+      end
+      @errors = validate(doc, document)
     rescue Nokogiri::XML::SyntaxError => e
-      # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
       @errors << e
     end
 
-    def
-
-
+    def html5_headers(multipart)
+      {
+        'Content-Type' => "multipart/form-data; boundary=#{multipart.boundary}",
+        'Content-Length' => multipart.to_s.bytesize.to_s
+      }
+    end
+
+    def html5_body(document)
       url = URI.parse(self.class.html5_validator_service_url)
       multipart = MultipartBody.new(content: document)
       http = Net::HTTP.new(url.host, url.port)
-
-
-
-
-
-
-
-
+      http.start do |con|
+        con.post(url.path, multipart.to_s, html5_headers(multipart))
+      end.body
+    end
+
+    def html5_validate(document)
+      validator_document = Nokogiri::HTML(html5_body(document))
+      errors = validator_document.css('h2.invalid').map(&:content)
+      errors.concat validator_document.css('ol li.error').map(&:content)
     end
   end
 end
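The HTML5 path now posts the whole document to the configured validator service as multipart form data and scrapes the error list out of the returned HTML. The same round trip, reduced to standalone net/http and multipart_body calls (the endpoint URL below is a placeholder; the gem reads it from `Validator.html5_validator_service_url`):

```ruby
require 'net/http'
require 'nokogiri'
require 'multipart_body'

document = File.read('page.html')                 # any HTML5 document
url = URI.parse('http://validator.example/check') # placeholder endpoint

multipart = MultipartBody.new(content: document)
headers = {
  'Content-Type'   => "multipart/form-data; boundary=#{multipart.boundary}",
  'Content-Length' => multipart.to_s.bytesize.to_s
}
body = Net::HTTP.new(url.host, url.port).start do |con|
  con.post(url.path, multipart.to_s, headers)
end.body

# Same scraping as html5_validate: headline error plus per-line errors.
result = Nokogiri::HTML(body)
errors = result.css('h2.invalid').map(&:content)
errors.concat result.css('ol li.error').map(&:content)
puts errors
```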
data/spec/core_spec.rb
CHANGED
@@ -1,125 +1,10 @@
-
-require File.expand_path('../spec_helper', __FILE__)
+require_relative 'spec_helper'
 
 describe ValidateWebsite::Core do
-  before do
-    WebMock.reset!
-    stub_request(:get, ValidateWebsite::Core::PING_URL).to_return(status: 200)
-    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
-    @validate_website = ValidateWebsite::Core.new(color: false)
-  end
-
   describe 'invalid options' do
     it 'raise ArgumentError on wrong validation_type' do
-      proc {
-        ValidateWebsite::Core.new({ color: false }, :fail)
-      }.must_raise ArgumentError
-    end
-  end
-
-  describe 'options' do
-    it 'can change user-agent' do
-      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
-Gecko/20100101 Firefox/29.0}
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, user_agent: ua },
-                                    :crawl)
-      v.crawl
-      v.crawler.user_agent.must_equal ua
-    end
-
-    it 'can change html5 validator service url' do
-      s = 'http://localhost:8888/'
-      ValidateWebsite::Core.new({ site: SPEC_DOMAIN,
-                                  :'html5-validator-service-url' => s })
-      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
-    end
-  end
-
-  describe('cookies') do
-    it 'can set cookies' do
-      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, cookies: cookies },
-                                    :crawl)
-      v.crawl
-      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
-    end
-  end
-
-  describe('html') do
-    it "extract url" do
-      name = 'xhtml1-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it 'extract link' do
-      name = 'html4-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 98
-    end
-  end
-
-  describe('css') do
-    it "crawl css and extract url" do
-      page = FakePage.new('test.css',
-                          body: '.t {background-image: url(pouet);}
-.t {background-image: url(/image/pouet.png)}
-.t {background-image: url(/image/pouet_42.png)}
-.t {background-image: url(/image/pouet)}',
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it "should extract url with single quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url('pouet');}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-
-    it "should extract url with double quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url(\"pouet\");}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-  end
-
-  describe('static') do
-    it 'no space in directory name' do
-      pattern = File.join(File.dirname(__FILE__), 'example/**/*.html')
-      @validate_website.crawl_static(pattern: pattern,
-                                     site: 'http://dev.af83.com/',
-                                     markup: false,
-                                     not_found: false)
-      @validate_website.not_founds_count.must_equal 0
-    end
-
-    it 'not found' do
-      pattern = File.join(File.dirname(__FILE__), '**/*.html')
-      Dir.chdir('spec/data') do
-        @validate_website.crawl_static(pattern: pattern,
-                                       site: 'https://linuxfr.org/',
-                                       markup: false,
-                                       not_found: true)
-        @validate_website.not_founds_count.must_equal 448
-      end
+      proc { ValidateWebsite::Core.new({ color: false }, :fail) }
+        .must_raise ArgumentError
     end
   end
 end
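The single spec left behind pins down the constructor guard: any unsupported `validation_type` raises `ArgumentError`. The same behavior in isolation:

```ruby
require 'validate_website/core'

begin
  ValidateWebsite::Core.new({ color: false }, :fail)
rescue ArgumentError
  puts 'rejected unknown validation_type :fail'
end
```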
data/spec/crawler_spec.rb
ADDED
@@ -0,0 +1,91 @@
+require_relative 'spec_helper'
+
+describe ValidateWebsite::Crawl do
+  before do
+    WebMock.reset!
+    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
+    @validate_website = ValidateWebsite::Crawl.new(color: false)
+  end
+
+  describe 'options' do
+    it 'can change user-agent' do
+      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
+Gecko/20100101 Firefox/29.0}
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, user_agent: ua)
+      v.crawl
+      v.crawler.user_agent.must_equal ua
+    end
+
+    it 'can change html5 validator service url' do
+      s = 'http://localhost:8888/'
+      ValidateWebsite::Crawl.new(site: SPEC_DOMAIN,
+                                 html5_validator_service_url: s)
+      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
+    end
+  end
+
+  describe('cookies') do
+    it 'can set cookies' do
+      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, cookies: cookies)
+      v.crawl
+      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
+    end
+  end
+
+  describe('html') do
+    it 'extract url' do
+      name = 'xhtml1-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'extract link' do
+      name = 'html4-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 98
+    end
+  end
+
+  describe('css') do
+    it 'crawl css and extract url' do
+      page = FakePage.new('test.css',
+                          body: '.t {background-image: url(pouet);}
+.t {background-image: url(/image/pouet.png)}
+.t {background-image: url(/image/pouet_42.png)}
+.t {background-image: url(/image/pouet)}',
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'should extract url with single quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url('pouet');}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+
+    it 'should extract url with double quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url(\"pouet\");}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+  end
+end
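None of these specs touch the network: the `before` block routes every GET under SPEC_DOMAIN through WebMock, and `FakePage` (defined in the spec helpers) registers bodies the same way. Stand-alone, the stubbing boils down to the following, with an assumed SPEC_DOMAIN value since spec_helper.rb is not shown in full here:

```ruby
require 'webmock'
require 'net/http'
include WebMock::API

WebMock.enable!
SPEC_DOMAIN = 'http://www.example.com/' # assumption; really set in spec_helper.rb
stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)

# Any GET under the stubbed domain now answers 200 without network access.
puts Net::HTTP.get_response(URI("#{SPEC_DOMAIN}whatever")).code # => "200"
```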