validate-website 1.0.5 → 1.1.0

@@ -12,15 +12,15 @@ module ValidateWebsite
 
     def self.run_crawl(args)
       trap_interrupt
-      validate_website = ValidateWebsite::Core.new(args, :crawl)
+      validate_website = ValidateWebsite::Crawl.new(args)
       validate_website.crawl
       validate_website.exit_status
     end
 
     def self.run_static(args)
       trap_interrupt
-      validate_website = ValidateWebsite::Core.new(args, :static)
-      validate_website.crawl_static
+      validate_website = ValidateWebsite::Static.new(args)
+      validate_website.crawl
       validate_website.exit_status
     end
   end
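
Note on the hunk above: the runner no longer instantiates a single ValidateWebsite::Core with a validation type; crawling and static checking now go through dedicated ValidateWebsite::Crawl and ValidateWebsite::Static classes, and both expose the same #crawl / #exit_status interface. A minimal usage sketch under that assumption (the require paths and option keys are inferred from the other hunks in this diff, not guaranteed verbatim):

    require 'validate_website/crawl'    # assumed file layout, mirroring the new static.rb below
    require 'validate_website/static'

    # Crawl a running site and validate the markup it serves
    crawler = ValidateWebsite::Crawl.new(site: 'http://localhost:4567/', markup: true)
    crawler.crawl
    puts crawler.exit_status   # assumed to be 0 when no errors were recorded

    # Check files already on disk against the same rules
    static = ValidateWebsite::Static.new(site: 'http://www.example.com/', pattern: '**/*.html')
    static.crawl               # replaces the old Core#crawl_static entry point
    puts static.exit_status
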
@@ -0,0 +1,102 @@
+require 'validate_website/core'
+
+module ValidateWebsite
+  # Class for validation Static website
+  class Static < Core
+    CONTENT_TYPES = ['text/html', 'text/xhtml+xml']
+
+    def initialize(options = {}, validation_type = :static)
+      super
+    end
+
+    # @param [Hash] options
+    #
+    def crawl(options = {})
+      @options = @options.merge(options)
+      @site = @options[:site]
+
+      files = Dir.glob(@options[:pattern])
+      files.each do |f|
+        next unless File.file?(f)
+        check_static_file(f)
+      end
+      print_status_line(files.size, 0, @not_founds_count, @errors_count)
+    end
+
+    private
+
+    def generate_static_page(f)
+      response = self.class.fake_httpresponse(open(f).read)
+      Spidr::Page.new(URI.join(@site, URI.encode(f)), response)
+    end
+
+    def check_static_file(f)
+      page = generate_static_page(f)
+      validate(page.doc, page.body, f, @options[:ignore]) if @options[:markup]
+      check_static_not_found(page.links) if @options[:not_found]
+    end
+
+    StaticLink = Struct.new(:link, :site) do
+      def link_uri
+        @link_uri = URI.parse(URI.encode(link))
+        @link_uri = URI.join(site, @link_uri) if @link_uri.host.nil?
+        @link_uri
+      end
+
+      def in_static_domain?
+        URI.parse(site).host == link_uri.host
+      end
+
+      def extract_urls_from_fake_css_response
+        response = ValidateWebsite::Static.fake_httpresponse(
+          open(file_path).read,
+          ['text/css'])
+        css_page = Spidr::Page.new(link_uri, response)
+        ValidateWebsite::Core.extract_urls_from_css(css_page)
+      end
+
+      def file_path
+        @file_path ||= URI.parse(
+          File.join(Dir.getwd, link_uri.path || '/')
+        ).path
+      end
+
+      def extname
+        @extname ||= File.extname(file_path)
+      end
+
+      def check?
+        !link.include?('#') && in_static_domain?
+      end
+    end
+
+    # check files linked on static document
+    # see lib/validate_website/runner.rb
+    def check_static_not_found(links)
+      static_links = links.map { |l| StaticLink.new(l, @site) }
+      static_links.each do |static_link|
+        next unless static_link.check?
+        not_found_error(static_link.file_path) &&
+          next unless File.exist?(static_link.file_path)
+        next unless static_link.extname == '.css'
+        check_static_not_found static_link.extract_urls_from_fake_css_response
+      end
+    end
+
+    # Fake http response for Spidr static crawling
+    # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
+    #
+    # @param [String] response body
+    # @param [Array] content types
+    # @return [Net::HTTPResponse] fake http response
+    def self.fake_httpresponse(body, content_types = CONTENT_TYPES)
+      response = Net::HTTPResponse.new '1.1', 200, 'OK'
+      response.instance_variable_set(:@read, true)
+      response.body = body
+      content_types.each do |c|
+        response.add_field('content-type', c)
+      end
+      response
+    end
+  end
+end
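
The new Static class reuses Spidr's page parsing for files on disk by wrapping each file's contents in a hand-built Net::HTTPResponse (fake_httpresponse) and handing it to Spidr::Page. A small sketch of that mechanism, assuming the spidr gem's Spidr::Page.new(url, response) signature that the code above relies on; the URL and body here are illustrative only:

    require 'spidr'
    require 'validate_website/static'   # assumed require path for the new file

    body = '<!DOCTYPE html><html><body><a href="/about.html">About</a></body></html>'
    response = ValidateWebsite::Static.fake_httpresponse(body)
    page = Spidr::Page.new(URI('http://www.example.com/index.html'), response)

    page.doc    # Nokogiri document, because the fake response is tagged text/html
    page.links  # roughly ["/about.html"], extracted as if the file had been served over HTTP
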
@@ -1,6 +1,8 @@
 # encoding: utf-8
 require 'uri'
 require 'nokogiri'
+require 'net/http'
+require 'multipart_body'
 
 module ValidateWebsite
   # Document validation from DTD or XSD (webservice for html5)
@@ -12,7 +14,7 @@ module ValidateWebsite
       attr_accessor :html5_validator_service_url
     end
 
-    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd, :errors
+    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd
 
     ##
     # @param [Nokogiri::HTML::Document] original_doc
@@ -24,18 +26,19 @@ module ValidateWebsite
       @body = body
       @ignore = ignore
       @dtd = @original_doc.internal_subset
-      init_namespace(@dtd)
-      @errors = []
+      @namespace = init_namespace(@dtd)
     end
 
     ##
     # @return [Boolean]
     def valid?
+      find_errors
       errors.length == 0
     end
 
+    # @return [Array] of errors
     def errors
-      find_errors
+      @errors.map!(&:to_s)
       @ignore ? @errors.reject { |e| @ignore =~ e } : @errors
     end
 
@@ -47,7 +50,7 @@ module ValidateWebsite
       return unless dtd_uri.path
       @dtd_uri = dtd_uri
       # http://www.w3.org/TR/xhtml1/#dtds
-      @namespace = File.basename(@dtd_uri.path, '.dtd')
+      File.basename(@dtd_uri.path, '.dtd')
     end
 
     def document
@@ -59,51 +62,59 @@ module ValidateWebsite
       end
     end
 
-    def find_errors
-      @doc = Dir.chdir(XHTML_PATH) do
-        Nokogiri::XML(document) { |cfg|
-          cfg.noent.dtdload.dtdvalid
-        }
-      end
-
-      # http://www.w3.org/TR/xhtml1-schema/
-      @xsd = Dir.chdir(XHTML_PATH) do
+    # http://www.w3.org/TR/xhtml1-schema/
+    def xsd
+      @xsd ||= Dir.chdir(XHTML_PATH) do
         if @namespace && File.exist?(@namespace + '.xsd')
           Nokogiri::XML::Schema(File.read(@namespace + '.xsd'))
         end
       end
+    end
 
-      if @xsd
-        @errors = @xsd.validate(@doc)
-      elsif document =~ /^\<!DOCTYPE html\>/i
-        html5_validate(document)
+    # @return [Array] contain result errors
+    def validate(xml_doc, document_body)
+      if xsd
+        xsd.validate(xml_doc)
+      elsif document_body =~ /^\<!DOCTYPE html\>/i
+        html5_validate(document_body)
       else
         # dont have xsd fall back to dtd
-        @doc = Dir.chdir(XHTML_PATH) do
+        Dir.chdir(XHTML_PATH) do
           Nokogiri::HTML.parse(document)
-        end
-        @errors = @doc.errors
+        end.errors
       end
+    end
 
+    # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
+    def find_errors
+      doc = Dir.chdir(XHTML_PATH) do
+        Nokogiri::XML(document) { |cfg| cfg.noent.dtdload.dtdvalid }
+      end
+      @errors = validate(doc, document)
     rescue Nokogiri::XML::SyntaxError => e
-      # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
       @errors << e
     end
 
-    def html5_validate(document)
-      require 'net/http'
-      require 'multipart_body'
+    def html5_headers(multipart)
+      {
+        'Content-Type' => "multipart/form-data; boundary=#{multipart.boundary}",
+        'Content-Length' => multipart.to_s.bytesize.to_s
+      }
+    end
+
+    def html5_body(document)
       url = URI.parse(self.class.html5_validator_service_url)
       multipart = MultipartBody.new(content: document)
       http = Net::HTTP.new(url.host, url.port)
-      headers = {
-        'Content-Type' => "multipart/form-data; boundary=#{multipart.boundary}",
-        'Content-Length' => multipart.to_s.bytesize.to_s,
-      }
-      res = http.start { |con| con.post(url.path, multipart.to_s, headers) }
-      validator_document = Nokogiri::HTML(res.body)
-      @errors = validator_document.css('h2.invalid').map(&:content)
-      @errors.concat validator_document.css('ol li.error').map(&:content)
+      http.start do |con|
+        con.post(url.path, multipart.to_s, html5_headers(multipart))
+      end.body
+    end
+
+    def html5_validate(document)
+      validator_document = Nokogiri::HTML(html5_body(document))
+      errors = validator_document.css('h2.invalid').map(&:content)
+      errors.concat validator_document.css('ol li.error').map(&:content)
     end
   end
 end
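
The Validator refactor above splits the old monolithic find_errors into smaller pieces: xsd memoizes the schema lookup, validate chooses between XSD, the HTML5 web service and the DTD fallback, valid? now triggers find_errors itself, and errors stringifies the result and filters it through the ignore pattern. A sketch of the resulting flow, assuming a Validator.new(doc, body, ignore = nil) constructor as suggested by the earlier hunk; `page` stands for any crawled or static page:

    validator = ValidateWebsite::Validator.new(page.doc, page.body)
    if validator.valid?                 # runs find_errors, then checks errors.length == 0
      puts 'markup OK'
    else
      validator.errors.each do |error|  # errors are mapped to strings and filtered by the ignore regexp
        puts error
      end
    end
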
@@ -0,0 +1,3 @@
+module ValidateWebsite
+  VERSION = '1.1.0'.freeze
+end
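
The gem version now lives in its own file. A sketch of how the constant can be referenced, assuming the conventional lib/validate_website/version.rb layout for the new file:

    require 'validate_website/version'   # assumed require path

    puts ValidateWebsite::VERSION        # => "1.1.0"
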
@@ -1,125 +1,10 @@
-# encoding: UTF-8
-require File.expand_path('../spec_helper', __FILE__)
+require_relative 'spec_helper'
 
 describe ValidateWebsite::Core do
-  before do
-    WebMock.reset!
-    stub_request(:get, ValidateWebsite::Core::PING_URL).to_return(status: 200)
-    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
-    @validate_website = ValidateWebsite::Core.new(color: false)
-  end
-
   describe 'invalid options' do
     it 'raise ArgumentError on wrong validation_type' do
-      proc {
-        ValidateWebsite::Core.new({ color: false }, :fail)
-      }.must_raise ArgumentError
-    end
-  end
-
-  describe 'options' do
-    it 'can change user-agent' do
-      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
-Gecko/20100101 Firefox/29.0}
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, user_agent: ua },
-                                    :crawl)
-      v.crawl
-      v.crawler.user_agent.must_equal ua
-    end
-
-    it 'can change html5 validator service url' do
-      s = 'http://localhost:8888/'
-      ValidateWebsite::Core.new({ site: SPEC_DOMAIN,
-                                  :'html5-validator-service-url' => s })
-      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
-    end
-  end
-
-  describe('cookies') do
-    it 'can set cookies' do
-      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, cookies: cookies },
-                                    :crawl)
-      v.crawl
-      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
-    end
-  end
-
-  describe('html') do
-    it "extract url" do
-      name = 'xhtml1-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it 'extract link' do
-      name = 'html4-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 98
-    end
-  end
-
-  describe('css') do
-    it "crawl css and extract url" do
-      page = FakePage.new('test.css',
-                          body: '.t {background-image: url(pouet);}
-                                 .t {background-image: url(/image/pouet.png)}
-                                 .t {background-image: url(/image/pouet_42.png)}
-                                 .t {background-image: url(/image/pouet)}',
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it "should extract url with single quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url('pouet');}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-
-    it "should extract url with double quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url(\"pouet\");}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-  end
-
-  describe('static') do
-    it 'no space in directory name' do
-      pattern = File.join(File.dirname(__FILE__), 'example/**/*.html')
-      @validate_website.crawl_static(pattern: pattern,
-                                     site: 'http://dev.af83.com/',
-                                     markup: false,
-                                     not_found: false)
-      @validate_website.not_founds_count.must_equal 0
-    end
-
-    it 'not found' do
-      pattern = File.join(File.dirname(__FILE__), '**/*.html')
-      Dir.chdir('spec/data') do
-        @validate_website.crawl_static(pattern: pattern,
-                                       site: 'https://linuxfr.org/',
-                                       markup: false,
-                                       not_found: true)
-        @validate_website.not_founds_count.must_equal 448
-      end
+      proc { ValidateWebsite::Core.new({ color: false }, :fail) }
+        .must_raise ArgumentError
     end
   end
 end
@@ -0,0 +1,91 @@
+require_relative 'spec_helper'
+
+describe ValidateWebsite::Crawl do
+  before do
+    WebMock.reset!
+    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
+    @validate_website = ValidateWebsite::Crawl.new(color: false)
+  end
+
+  describe 'options' do
+    it 'can change user-agent' do
+      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
+Gecko/20100101 Firefox/29.0}
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, user_agent: ua)
+      v.crawl
+      v.crawler.user_agent.must_equal ua
+    end
+
+    it 'can change html5 validator service url' do
+      s = 'http://localhost:8888/'
+      ValidateWebsite::Crawl.new(site: SPEC_DOMAIN,
+                                 html5_validator_service_url: s)
+      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
+    end
+  end
+
+  describe('cookies') do
+    it 'can set cookies' do
+      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, cookies: cookies)
+      v.crawl
+      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
+    end
+  end
+
+  describe('html') do
+    it 'extract url' do
+      name = 'xhtml1-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'extract link' do
+      name = 'html4-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 98
+    end
+  end
+
+  describe('css') do
+    it 'crawl css and extract url' do
+      page = FakePage.new('test.css',
+                          body: '.t {background-image: url(pouet);}
+                                 .t {background-image: url(/image/pouet.png)}
+                                 .t {background-image: url(/image/pouet_42.png)}
+                                 .t {background-image: url(/image/pouet)}',
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'should extract url with single quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url('pouet');}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+
+    it 'should extract url with double quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url(\"pouet\");}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+  end
+end