validate-website 1.0.5 → 1.1.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -12,15 +12,15 @@ module ValidateWebsite
 
     def self.run_crawl(args)
       trap_interrupt
-      validate_website = ValidateWebsite::Core.new(args, :crawl)
+      validate_website = ValidateWebsite::Crawl.new(args)
       validate_website.crawl
       validate_website.exit_status
     end
 
     def self.run_static(args)
       trap_interrupt
-      validate_website = ValidateWebsite::Core.new(args, :static)
-      validate_website.crawl_static
+      validate_website = ValidateWebsite::Static.new(args)
+      validate_website.crawl
       validate_website.exit_status
     end
   end
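
The runner no longer passes a validation type to ValidateWebsite::Core; each mode gets its own class. A minimal sketch of the new entry points follows. The require paths and the sample site value are assumptions (they only mirror the require 'validate_website/core' convention visible in the new file below); the class names and methods come from this diff:

  require 'validate_website/crawl'   # assumed require path
  require 'validate_website/static'  # assumed require path

  # Crawl mode: spider a live site and validate each fetched page.
  crawler = ValidateWebsite::Crawl.new(site: 'http://localhost:3000/')
  crawler.crawl
  crawler.exit_status  # returned by the runner above

  # Static mode: validate files already generated on disk.
  static = ValidateWebsite::Static.new(site: 'http://localhost:3000/')
  static.crawl(pattern: '**/*.html')
  static.exit_status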
@@ -0,0 +1,102 @@
+require 'validate_website/core'
+
+module ValidateWebsite
+  # Class for validation Static website
+  class Static < Core
+    CONTENT_TYPES = ['text/html', 'text/xhtml+xml']
+
+    def initialize(options = {}, validation_type = :static)
+      super
+    end
+
+    # @param [Hash] options
+    #
+    def crawl(options = {})
+      @options = @options.merge(options)
+      @site = @options[:site]
+
+      files = Dir.glob(@options[:pattern])
+      files.each do |f|
+        next unless File.file?(f)
+        check_static_file(f)
+      end
+      print_status_line(files.size, 0, @not_founds_count, @errors_count)
+    end
+
+    private
+
+    def generate_static_page(f)
+      response = self.class.fake_httpresponse(open(f).read)
+      Spidr::Page.new(URI.join(@site, URI.encode(f)), response)
+    end
+
+    def check_static_file(f)
+      page = generate_static_page(f)
+      validate(page.doc, page.body, f, @options[:ignore]) if @options[:markup]
+      check_static_not_found(page.links) if @options[:not_found]
+    end
+
+    StaticLink = Struct.new(:link, :site) do
+      def link_uri
+        @link_uri = URI.parse(URI.encode(link))
+        @link_uri = URI.join(site, @link_uri) if @link_uri.host.nil?
+        @link_uri
+      end
+
+      def in_static_domain?
+        URI.parse(site).host == link_uri.host
+      end
+
+      def extract_urls_from_fake_css_response
+        response = ValidateWebsite::Static.fake_httpresponse(
+          open(file_path).read,
+          ['text/css'])
+        css_page = Spidr::Page.new(link_uri, response)
+        ValidateWebsite::Core.extract_urls_from_css(css_page)
+      end
+
+      def file_path
+        @file_path ||= URI.parse(
+          File.join(Dir.getwd, link_uri.path || '/')
+        ).path
+      end
+
+      def extname
+        @extname ||= File.extname(file_path)
+      end
+
+      def check?
+        !link.include?('#') && in_static_domain?
+      end
+    end
+
+    # check files linked on static document
+    # see lib/validate_website/runner.rb
+    def check_static_not_found(links)
+      static_links = links.map { |l| StaticLink.new(l, @site) }
+      static_links.each do |static_link|
+        next unless static_link.check?
+        not_found_error(static_link.file_path) &&
+          next unless File.exist?(static_link.file_path)
+        next unless static_link.extname == '.css'
+        check_static_not_found static_link.extract_urls_from_fake_css_response
+      end
+    end
+
+    # Fake http response for Spidr static crawling
+    # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
+    #
+    # @param [String] response body
+    # @param [Array] content types
+    # @return [Net::HTTPResponse] fake http response
+    def self.fake_httpresponse(body, content_types = CONTENT_TYPES)
+      response = Net::HTTPResponse.new '1.1', 200, 'OK'
+      response.instance_variable_set(:@read, true)
+      response.body = body
+      content_types.each do |c|
+        response.add_field('content-type', c)
+      end
+      response
+    end
+  end
+end
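
Static subclasses Core and replaces the old crawl_static path: crawl globs the files matching :pattern, wraps each one in a fake HTTP response so Spidr can parse it, then runs the markup and not-found checks. A rough usage sketch mirroring the options read above; the require path and the option values are illustrative assumptions:

  require 'validate_website/static'  # assumed require path

  static = ValidateWebsite::Static.new(color: false)
  static.crawl(site: 'http://www.example.com/',
               pattern: '**/*.html',
               markup: true,      # validate each file's markup
               not_found: true)   # report links that do not resolve to files on disk
  static.exit_status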
@@ -1,6 +1,8 @@
 # encoding: utf-8
 require 'uri'
 require 'nokogiri'
+require 'net/http'
+require 'multipart_body'
 
 module ValidateWebsite
   # Document validation from DTD or XSD (webservice for html5)
@@ -12,7 +14,7 @@ module ValidateWebsite
       attr_accessor :html5_validator_service_url
     end
 
-    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd, :errors
+    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd
 
     ##
     # @param [Nokogiri::HTML::Document] original_doc
@@ -24,18 +26,19 @@ module ValidateWebsite
       @body = body
       @ignore = ignore
       @dtd = @original_doc.internal_subset
-      init_namespace(@dtd)
-      @errors = []
+      @namespace = init_namespace(@dtd)
     end
 
     ##
     # @return [Boolean]
     def valid?
+      find_errors
       errors.length == 0
     end
 
+    # @return [Array] of errors
     def errors
-      find_errors
+      @errors.map!(&:to_s)
       @ignore ? @errors.reject { |e| @ignore =~ e } : @errors
     end
 
@@ -47,7 +50,7 @@ module ValidateWebsite
       return unless dtd_uri.path
       @dtd_uri = dtd_uri
       # http://www.w3.org/TR/xhtml1/#dtds
-      @namespace = File.basename(@dtd_uri.path, '.dtd')
+      File.basename(@dtd_uri.path, '.dtd')
     end
 
     def document
@@ -59,51 +62,59 @@ module ValidateWebsite
       end
     end
 
-    def find_errors
-      @doc = Dir.chdir(XHTML_PATH) do
-        Nokogiri::XML(document) { |cfg|
-          cfg.noent.dtdload.dtdvalid
-        }
-      end
-
-      # http://www.w3.org/TR/xhtml1-schema/
-      @xsd = Dir.chdir(XHTML_PATH) do
+    # http://www.w3.org/TR/xhtml1-schema/
+    def xsd
+      @xsd ||= Dir.chdir(XHTML_PATH) do
         if @namespace && File.exist?(@namespace + '.xsd')
           Nokogiri::XML::Schema(File.read(@namespace + '.xsd'))
         end
       end
+    end
 
-      if @xsd
-        @errors = @xsd.validate(@doc)
-      elsif document =~ /^\<!DOCTYPE html\>/i
-        html5_validate(document)
+    # @return [Array] contain result errors
+    def validate(xml_doc, document_body)
+      if xsd
+        xsd.validate(xml_doc)
+      elsif document_body =~ /^\<!DOCTYPE html\>/i
+        html5_validate(document_body)
       else
         # dont have xsd fall back to dtd
-        @doc = Dir.chdir(XHTML_PATH) do
+        Dir.chdir(XHTML_PATH) do
           Nokogiri::HTML.parse(document)
-        end
-        @errors = @doc.errors
+        end.errors
       end
+    end
 
+    # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
+    def find_errors
+      doc = Dir.chdir(XHTML_PATH) do
+        Nokogiri::XML(document) { |cfg| cfg.noent.dtdload.dtdvalid }
+      end
+      @errors = validate(doc, document)
     rescue Nokogiri::XML::SyntaxError => e
-      # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
       @errors << e
     end
 
-    def html5_validate(document)
-      require 'net/http'
-      require 'multipart_body'
+    def html5_headers(multipart)
+      {
+        'Content-Type' => "multipart/form-data; boundary=#{multipart.boundary}",
+        'Content-Length' => multipart.to_s.bytesize.to_s
+      }
+    end
+
+    def html5_body(document)
       url = URI.parse(self.class.html5_validator_service_url)
       multipart = MultipartBody.new(content: document)
       http = Net::HTTP.new(url.host, url.port)
-      headers = {
-        'Content-Type' => "multipart/form-data; boundary=#{multipart.boundary}",
-        'Content-Length' => multipart.to_s.bytesize.to_s,
-      }
-      res = http.start { |con| con.post(url.path, multipart.to_s, headers) }
-      validator_document = Nokogiri::HTML(res.body)
-      @errors = validator_document.css('h2.invalid').map(&:content)
-      @errors.concat validator_document.css('ol li.error').map(&:content)
+      http.start do |con|
+        con.post(url.path, multipart.to_s, html5_headers(multipart))
+      end.body
+    end
+
+    def html5_validate(document)
+      validator_document = Nokogiri::HTML(html5_body(document))
+      errors = validator_document.css('h2.invalid').map(&:content)
+      errors.concat validator_document.css('ol li.error').map(&:content)
     end
   end
 end
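
Validation is now lazy: valid? triggers find_errors, which parses the document once and delegates to validate, which in turn picks the XSD schema, the HTML5 validator web service, or the DTD fallback; errors then stringifies the results and applies the :ignore filter. A rough sketch of driving the class directly; the positional arguments (original_doc, body, ignore) are only inferred from the @param tag and instance variables above, so treat the require path and the signature as assumptions:

  require 'validate_website/validator'  # assumed require path
  require 'nokogiri'

  body = File.read('index.html')
  doc = Nokogiri::HTML(body)
  validator = ValidateWebsite::Validator.new(doc, body)  # assumed signature

  puts validator.errors unless validator.valid?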
@@ -0,0 +1,3 @@
+module ValidateWebsite
+  VERSION = '1.1.0'.freeze
+end
@@ -1,125 +1,10 @@
-# encoding: UTF-8
-require File.expand_path('../spec_helper', __FILE__)
+require_relative 'spec_helper'
 
 describe ValidateWebsite::Core do
-  before do
-    WebMock.reset!
-    stub_request(:get, ValidateWebsite::Core::PING_URL).to_return(status: 200)
-    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
-    @validate_website = ValidateWebsite::Core.new(color: false)
-  end
-
   describe 'invalid options' do
     it 'raise ArgumentError on wrong validation_type' do
-      proc {
-        ValidateWebsite::Core.new({ color: false }, :fail)
-      }.must_raise ArgumentError
-    end
-  end
-
-  describe 'options' do
-    it 'can change user-agent' do
-      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
-Gecko/20100101 Firefox/29.0}
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, user_agent: ua },
-                                    :crawl)
-      v.crawl
-      v.crawler.user_agent.must_equal ua
-    end
-
-    it 'can change html5 validator service url' do
-      s = 'http://localhost:8888/'
-      ValidateWebsite::Core.new({ site: SPEC_DOMAIN,
-                                  :'html5-validator-service-url' => s })
-      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
-    end
-  end
-
-  describe('cookies') do
-    it 'can set cookies' do
-      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, cookies: cookies },
-                                    :crawl)
-      v.crawl
-      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
-    end
-  end
-
-  describe('html') do
-    it "extract url" do
-      name = 'xhtml1-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it 'extract link' do
-      name = 'html4-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 98
-    end
-  end
-
-  describe('css') do
-    it "crawl css and extract url" do
-      page = FakePage.new('test.css',
-                          body: '.t {background-image: url(pouet);}
-                          .t {background-image: url(/image/pouet.png)}
-                          .t {background-image: url(/image/pouet_42.png)}
-                          .t {background-image: url(/image/pouet)}',
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it "should extract url with single quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url('pouet');}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-
-    it "should extract url with double quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url(\"pouet\");}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-  end
-
-  describe('static') do
-    it 'no space in directory name' do
-      pattern = File.join(File.dirname(__FILE__), 'example/**/*.html')
-      @validate_website.crawl_static(pattern: pattern,
-                                     site: 'http://dev.af83.com/',
-                                     markup: false,
-                                     not_found: false)
-      @validate_website.not_founds_count.must_equal 0
-    end
-
-    it 'not found' do
-      pattern = File.join(File.dirname(__FILE__), '**/*.html')
-      Dir.chdir('spec/data') do
-        @validate_website.crawl_static(pattern: pattern,
-                                       site: 'https://linuxfr.org/',
-                                       markup: false,
-                                       not_found: true)
-        @validate_website.not_founds_count.must_equal 448
-      end
+      proc { ValidateWebsite::Core.new({ color: false }, :fail) }
+        .must_raise ArgumentError
     end
   end
 end
@@ -0,0 +1,91 @@
+require_relative 'spec_helper'
+
+describe ValidateWebsite::Crawl do
+  before do
+    WebMock.reset!
+    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
+    @validate_website = ValidateWebsite::Crawl.new(color: false)
+  end
+
+  describe 'options' do
+    it 'can change user-agent' do
+      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
+Gecko/20100101 Firefox/29.0}
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, user_agent: ua)
+      v.crawl
+      v.crawler.user_agent.must_equal ua
+    end
+
+    it 'can change html5 validator service url' do
+      s = 'http://localhost:8888/'
+      ValidateWebsite::Crawl.new(site: SPEC_DOMAIN,
+                                 html5_validator_service_url: s)
+      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
+    end
+  end
+
+  describe('cookies') do
+    it 'can set cookies' do
+      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, cookies: cookies)
+      v.crawl
+      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
+    end
+  end
+
+  describe('html') do
+    it 'extract url' do
+      name = 'xhtml1-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'extract link' do
+      name = 'html4-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 98
+    end
+  end
+
+  describe('css') do
+    it 'crawl css and extract url' do
+      page = FakePage.new('test.css',
+                          body: '.t {background-image: url(pouet);}
+                          .t {background-image: url(/image/pouet.png)}
+                          .t {background-image: url(/image/pouet_42.png)}
+                          .t {background-image: url(/image/pouet)}',
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'should extract url with single quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url('pouet');}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+
+    it 'should extract url with double quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url(\"pouet\");}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+  end
+end