validate-website 1.0.5 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +3 -3
- data/lib/validate_website.rb +1 -0
- data/lib/validate_website/core.rb +33 -157
- data/lib/validate_website/crawl.rb +78 -0
- data/lib/validate_website/option_parser.rb +64 -59
- data/lib/validate_website/runner.rb +3 -3
- data/lib/validate_website/static.rb +102 -0
- data/lib/validate_website/validator.rb +44 -33
- data/lib/validate_website/version.rb +3 -0
- data/spec/core_spec.rb +3 -118
- data/spec/crawler_spec.rb +91 -0
- data/spec/data/w3.org-xhtml1-strict-errors.html +544 -0
- data/spec/spec_helper.rb +2 -1
- data/spec/static_spec.rb +38 -0
- data/spec/validator_spec.rb +40 -23
- data/spec/webmock_helper.rb +4 -3
- metadata +30 -8
data/lib/validate_website/runner.rb
CHANGED
@@ -12,15 +12,15 @@ module ValidateWebsite
 
   def self.run_crawl(args)
     trap_interrupt
-    validate_website = ValidateWebsite::Core.new(args, :crawl)
+    validate_website = ValidateWebsite::Crawl.new(args)
    validate_website.crawl
     validate_website.exit_status
   end
 
   def self.run_static(args)
     trap_interrupt
-    validate_website = ValidateWebsite::Core.new(args, :static)
-    validate_website.crawl_static
+    validate_website = ValidateWebsite::Static.new(args)
+    validate_website.crawl
     validate_website.exit_status
   end
 end
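Both entry points now share one shape: build a validator object, call `crawl`, return `exit_status`. A hedged sketch of calling them directly; the `ValidateWebsite::Runner` constant and the option keys are assumptions drawn from the file name and the specs further down, since the hunk above only shows the method bodies:

```ruby
# Hypothetical direct use of the 1.1.0 runner entry points.
# Runner constant and option keys (:site, :pattern) are assumptions.
require 'validate_website/runner'

crawl_status  = ValidateWebsite::Runner.run_crawl(site: 'http://localhost/')
static_status = ValidateWebsite::Runner.run_static(site: 'http://localhost/',
                                                   pattern: '**/*.html')
exit [crawl_status, static_status].max
```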
data/lib/validate_website/static.rb
ADDED
@@ -0,0 +1,102 @@
+require 'validate_website/core'
+
+module ValidateWebsite
+  # Class for validation Static website
+  class Static < Core
+    CONTENT_TYPES = ['text/html', 'text/xhtml+xml']
+
+    def initialize(options = {}, validation_type = :static)
+      super
+    end
+
+    # @param [Hash] options
+    #
+    def crawl(options = {})
+      @options = @options.merge(options)
+      @site = @options[:site]
+
+      files = Dir.glob(@options[:pattern])
+      files.each do |f|
+        next unless File.file?(f)
+        check_static_file(f)
+      end
+      print_status_line(files.size, 0, @not_founds_count, @errors_count)
+    end
+
+    private
+
+    def generate_static_page(f)
+      response = self.class.fake_httpresponse(open(f).read)
+      Spidr::Page.new(URI.join(@site, URI.encode(f)), response)
+    end
+
+    def check_static_file(f)
+      page = generate_static_page(f)
+      validate(page.doc, page.body, f, @options[:ignore]) if @options[:markup]
+      check_static_not_found(page.links) if @options[:not_found]
+    end
+
+    StaticLink = Struct.new(:link, :site) do
+      def link_uri
+        @link_uri = URI.parse(URI.encode(link))
+        @link_uri = URI.join(site, @link_uri) if @link_uri.host.nil?
+        @link_uri
+      end
+
+      def in_static_domain?
+        URI.parse(site).host == link_uri.host
+      end
+
+      def extract_urls_from_fake_css_response
+        response = ValidateWebsite::Static.fake_httpresponse(
+          open(file_path).read,
+          ['text/css'])
+        css_page = Spidr::Page.new(link_uri, response)
+        ValidateWebsite::Core.extract_urls_from_css(css_page)
+      end
+
+      def file_path
+        @file_path ||= URI.parse(
+          File.join(Dir.getwd, link_uri.path || '/')
+        ).path
+      end
+
+      def extname
+        @extname ||= File.extname(file_path)
+      end
+
+      def check?
+        !link.include?('#') && in_static_domain?
+      end
+    end
+
+    # check files linked on static document
+    # see lib/validate_website/runner.rb
+    def check_static_not_found(links)
+      static_links = links.map { |l| StaticLink.new(l, @site) }
+      static_links.each do |static_link|
+        next unless static_link.check?
+        not_found_error(static_link.file_path) &&
+          next unless File.exist?(static_link.file_path)
+        next unless static_link.extname == '.css'
+        check_static_not_found static_link.extract_urls_from_fake_css_response
+      end
+    end
+
+    # Fake http response for Spidr static crawling
+    # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
+    #
+    # @param [String] response body
+    # @param [Array] content types
+    # @return [Net::HTTPResponse] fake http response
+    def self.fake_httpresponse(body, content_types = CONTENT_TYPES)
+      response = Net::HTTPResponse.new '1.1', 200, 'OK'
+      response.instance_variable_set(:@read, true)
+      response.body = body
+      content_types.each do |c|
+        response.add_field('content-type', c)
+      end
+      response
+    end
+  end
+end
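The piece that makes static validation work is `fake_httpresponse`: it wraps an on-disk file in a hand-built `Net::HTTPResponse` so Spidr's page and link extraction runs without a web server. A condensed sketch of what `generate_static_page` does, assuming a local `index.html` and an arbitrary base URL:

```ruby
require 'uri'
require 'spidr'
require 'validate_website/static'

# Fake a 200 OK response whose body is the file on disk ...
response = ValidateWebsite::Static.fake_httpresponse(File.read('index.html'))
# ... then hand it to Spidr as if it had been fetched from the target site.
page = Spidr::Page.new(URI.join('http://example.com/', 'index.html'), response)
page.links.each { |link| puts link } # links extracted with no HTTP traffic
```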
data/lib/validate_website/validator.rb
CHANGED
@@ -1,6 +1,8 @@
 # encoding: utf-8
 require 'uri'
 require 'nokogiri'
+require 'net/http'
+require 'multipart_body'
 
 module ValidateWebsite
   # Document validation from DTD or XSD (webservice for html5)
@@ -12,7 +14,7 @@ module ValidateWebsite
       attr_accessor :html5_validator_service_url
     end
 
-    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd
+    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :xsd
 
     ##
     # @param [Nokogiri::HTML::Document] original_doc
@@ -24,18 +26,19 @@ module ValidateWebsite
       @body = body
       @ignore = ignore
       @dtd = @original_doc.internal_subset
-      init_namespace(@dtd)
-      @errors = []
+      @namespace = init_namespace(@dtd)
     end
 
     ##
     # @return [Boolean]
     def valid?
+      find_errors
       errors.length == 0
     end
 
+    # @return [Array] of errors
     def errors
-
+      @errors.map!(&:to_s)
       @ignore ? @errors.reject { |e| @ignore =~ e } : @errors
     end
 
@@ -47,7 +50,7 @@ module ValidateWebsite
       return unless dtd_uri.path
       @dtd_uri = dtd_uri
       # http://www.w3.org/TR/xhtml1/#dtds
-
+      File.basename(@dtd_uri.path, '.dtd')
     end
 
     def document
@@ -59,51 +62,59 @@ module ValidateWebsite
       end
     end
 
-
-
-
-          cfg.noent.dtdload.dtdvalid
-        }
-      end
-
-      # http://www.w3.org/TR/xhtml1-schema/
-      @xsd = Dir.chdir(XHTML_PATH) do
+    # http://www.w3.org/TR/xhtml1-schema/
+    def xsd
+      @xsd ||= Dir.chdir(XHTML_PATH) do
         if @namespace && File.exist?(@namespace + '.xsd')
           Nokogiri::XML::Schema(File.read(@namespace + '.xsd'))
         end
       end
+    end
 
-
-
-
-
+    # @return [Array] contain result errors
+    def validate(xml_doc, document_body)
+      if xsd
+        xsd.validate(xml_doc)
+      elsif document_body =~ /^\<!DOCTYPE html\>/i
+        html5_validate(document_body)
       else
         # dont have xsd fall back to dtd
-
+        Dir.chdir(XHTML_PATH) do
           Nokogiri::HTML.parse(document)
-        end
-      @errors = @doc.errors
+        end.errors
       end
+    end
 
+    # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
+    def find_errors
+      doc = Dir.chdir(XHTML_PATH) do
+        Nokogiri::XML(document) { |cfg| cfg.noent.dtdload.dtdvalid }
+      end
+      @errors = validate(doc, document)
     rescue Nokogiri::XML::SyntaxError => e
-      # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
       @errors << e
     end
 
-    def
-
-
+    def html5_headers(multipart)
+      {
+        'Content-Type' => "multipart/form-data; boundary=#{multipart.boundary}",
+        'Content-Length' => multipart.to_s.bytesize.to_s
+      }
+    end
+
+    def html5_body(document)
       url = URI.parse(self.class.html5_validator_service_url)
       multipart = MultipartBody.new(content: document)
       http = Net::HTTP.new(url.host, url.port)
-
-
-
-
-
-
-
-
+      http.start do |con|
+        con.post(url.path, multipart.to_s, html5_headers(multipart))
+      end.body
+    end
+
+    def html5_validate(document)
+      validator_document = Nokogiri::HTML(html5_body(document))
+      errors = validator_document.css('h2.invalid').map(&:content)
+      errors.concat validator_document.css('ol li.error').map(&:content)
     end
   end
 end
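The HTML5 path now posts the whole document to the configured validator service as multipart form data and scrapes the error list out of the returned HTML. The same round trip, reduced to standalone net/http and multipart_body calls (the endpoint URL below is a placeholder; the gem reads it from `Validator.html5_validator_service_url`):

```ruby
require 'net/http'
require 'nokogiri'
require 'multipart_body'

document = File.read('page.html')                 # any HTML5 document
url = URI.parse('http://validator.example/check') # placeholder endpoint

multipart = MultipartBody.new(content: document)
headers = {
  'Content-Type'   => "multipart/form-data; boundary=#{multipart.boundary}",
  'Content-Length' => multipart.to_s.bytesize.to_s
}
body = Net::HTTP.new(url.host, url.port).start do |con|
  con.post(url.path, multipart.to_s, headers)
end.body

# Same scraping as html5_validate: headline error plus per-line errors.
result = Nokogiri::HTML(body)
errors = result.css('h2.invalid').map(&:content)
errors.concat result.css('ol li.error').map(&:content)
puts errors
```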
data/spec/core_spec.rb
CHANGED
@@ -1,125 +1,10 @@
-
-require File.expand_path('../spec_helper', __FILE__)
+require_relative 'spec_helper'
 
 describe ValidateWebsite::Core do
-  before do
-    WebMock.reset!
-    stub_request(:get, ValidateWebsite::Core::PING_URL).to_return(status: 200)
-    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
-    @validate_website = ValidateWebsite::Core.new(color: false)
-  end
-
   describe 'invalid options' do
     it 'raise ArgumentError on wrong validation_type' do
-      proc {
-        ValidateWebsite::Core.new({ color: false }, :fail)
-      }.must_raise ArgumentError
-    end
-  end
-
-  describe 'options' do
-    it 'can change user-agent' do
-      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
-Gecko/20100101 Firefox/29.0}
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, user_agent: ua },
-                                    :crawl)
-      v.crawl
-      v.crawler.user_agent.must_equal ua
-    end
-
-    it 'can change html5 validator service url' do
-      s = 'http://localhost:8888/'
-      ValidateWebsite::Core.new({ site: SPEC_DOMAIN,
-                                  :'html5-validator-service-url' => s })
-      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
-    end
-  end
-
-  describe('cookies') do
-    it 'can set cookies' do
-      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
-      v = ValidateWebsite::Core.new({ site: SPEC_DOMAIN, cookies: cookies },
-                                    :crawl)
-      v.crawl
-      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
-    end
-  end
-
-  describe('html') do
-    it "extract url" do
-      name = 'xhtml1-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it 'extract link' do
-      name = 'html4-strict'
-      file = File.join('spec', 'data', "#{name}.html")
-      page = FakePage.new(name,
-                          body: open(file).read,
-                          content_type: 'text/html')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 98
-    end
-  end
-
-  describe('css') do
-    it "crawl css and extract url" do
-      page = FakePage.new('test.css',
-                          body: '.t {background-image: url(pouet);}
-.t {background-image: url(/image/pouet.png)}
-.t {background-image: url(/image/pouet_42.png)}
-.t {background-image: url(/image/pouet)}',
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 5
-    end
-
-    it "should extract url with single quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url('pouet');}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-
-    it "should extract url with double quote" do
-      page = FakePage.new('test.css',
-                          body: ".test {background-image: url(\"pouet\");}",
-                          content_type: 'text/css')
-      @validate_website.site = page.url
-      @validate_website.crawl
-      @validate_website.crawler.history.size.must_equal 2
-    end
-  end
-
-  describe('static') do
-    it 'no space in directory name' do
-      pattern = File.join(File.dirname(__FILE__), 'example/**/*.html')
-      @validate_website.crawl_static(pattern: pattern,
-                                     site: 'http://dev.af83.com/',
-                                     markup: false,
-                                     not_found: false)
-      @validate_website.not_founds_count.must_equal 0
-    end
-
-    it 'not found' do
-      pattern = File.join(File.dirname(__FILE__), '**/*.html')
-      Dir.chdir('spec/data') do
-        @validate_website.crawl_static(pattern: pattern,
-                                       site: 'https://linuxfr.org/',
-                                       markup: false,
-                                       not_found: true)
-        @validate_website.not_founds_count.must_equal 448
-      end
+      proc { ValidateWebsite::Core.new({ color: false }, :fail) }
+        .must_raise ArgumentError
     end
   end
 end
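The single spec left behind pins down the constructor guard: any unsupported `validation_type` raises `ArgumentError`. The same behavior in isolation:

```ruby
require 'validate_website/core'

begin
  ValidateWebsite::Core.new({ color: false }, :fail)
rescue ArgumentError
  puts 'rejected unknown validation_type :fail'
end
```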
data/spec/crawler_spec.rb
ADDED
@@ -0,0 +1,91 @@
+require_relative 'spec_helper'
+
+describe ValidateWebsite::Crawl do
+  before do
+    WebMock.reset!
+    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)
+    @validate_website = ValidateWebsite::Crawl.new(color: false)
+  end
+
+  describe 'options' do
+    it 'can change user-agent' do
+      ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \
+Gecko/20100101 Firefox/29.0}
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, user_agent: ua)
+      v.crawl
+      v.crawler.user_agent.must_equal ua
+    end
+
+    it 'can change html5 validator service url' do
+      s = 'http://localhost:8888/'
+      ValidateWebsite::Crawl.new(site: SPEC_DOMAIN,
+                                 html5_validator_service_url: s)
+      ValidateWebsite::Validator.html5_validator_service_url.must_equal s
+    end
+  end
+
+  describe('cookies') do
+    it 'can set cookies' do
+      cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq'
+      v = ValidateWebsite::Crawl.new(site: SPEC_DOMAIN, cookies: cookies)
+      v.crawl
+      v.crawler.cookies.cookies_for_host(v.host).must_equal v.default_cookies
+    end
+  end
+
+  describe('html') do
+    it 'extract url' do
+      name = 'xhtml1-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'extract link' do
+      name = 'html4-strict'
+      file = File.join('spec', 'data', "#{name}.html")
+      page = FakePage.new(name,
+                          body: open(file).read,
+                          content_type: 'text/html')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 98
+    end
+  end
+
+  describe('css') do
+    it 'crawl css and extract url' do
+      page = FakePage.new('test.css',
+                          body: '.t {background-image: url(pouet);}
+.t {background-image: url(/image/pouet.png)}
+.t {background-image: url(/image/pouet_42.png)}
+.t {background-image: url(/image/pouet)}',
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 5
+    end
+
+    it 'should extract url with single quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url('pouet');}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+
+    it 'should extract url with double quote' do
+      page = FakePage.new('test.css',
+                          body: ".test {background-image: url(\"pouet\");}",
+                          content_type: 'text/css')
+      @validate_website.site = page.url
+      @validate_website.crawl
+      @validate_website.crawler.history.size.must_equal 2
+    end
+  end
+end
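None of these specs touch the network: the `before` block routes every GET under SPEC_DOMAIN through WebMock, and `FakePage` (defined in the spec helpers) registers bodies the same way. Stand-alone, the stubbing boils down to the following, with an assumed SPEC_DOMAIN value since spec_helper.rb is not shown in full here:

```ruby
require 'webmock'
require 'net/http'
include WebMock::API

WebMock.enable!
SPEC_DOMAIN = 'http://www.example.com/' # assumption; really set in spec_helper.rb
stub_request(:get, /#{SPEC_DOMAIN}/).to_return(status: 200)

# Any GET under the stubbed domain now answers 200 without network access.
puts Net::HTTP.get_response(URI("#{SPEC_DOMAIN}whatever")).code # => "200"
```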