validate-website 0.3.1 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -17,6 +17,8 @@
  --auth=user,pass \ # http auth
  -e 'redirect|news' \ # exclude regex
  -n # log not found (404)
+ -c "name=val;name2=val2"
+ -v # verbose

  == REQUIREMENTS:

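The two flags added above are the cookie and verbose options introduced in 0.3.5. A hedged example invocation using only flags shown in this diff (the crawled site defaults to http://localhost:3000/, per lib/validate_website.rb below):

  validate-website -c "name=val;name2=val2" -v -n
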
data/Rakefile CHANGED
@@ -1,4 +1,3 @@
- require 'rake/testtask'
  require 'rake/packagetask'
  require 'rake/rdoctask'
  require 'rake'
@@ -7,10 +6,10 @@ require 'find'
  # Globals

  PKG_NAME = 'validate-website'
- PKG_VERSION = '0.3.1'
+ PKG_VERSION = '0.3.5'

  PKG_FILES = ['README.rdoc', 'Rakefile']
- Find.find('lib/', 'bin/') do |f|
+ Find.find('lib/', 'bin/', 'spec/') do |f|
  if FileTest.directory?(f) and f =~ /\.svn|\.git/
  Find.prune
  else
@@ -22,11 +21,6 @@ end

  task :default => [:clean, :repackage]

- #Rake::TestTask.new do |t|
- #t.libs << "test"
- #t.test_files = FileList['test/tc_*.rb']
- #end
-
  Rake::RDocTask.new do |rd|
  f = []
  require 'find'
@@ -61,6 +55,8 @@ spec = Gem::Specification.new do |s|
  s.requirements << 'spk-anemone' << 'rainbow'
  s.add_dependency('spk-anemone', '>= 0.4.0')
  s.add_dependency('rainbow', '>= 1.1')
+ s.add_development_dependency('rspec', '>= 1.3.0')
+ s.add_development_dependency('fakeweb', '>= 1.3.0')
  s.require_path = 'lib'
  s.bindir = 'bin'
  s.executables << 'validate-website'
data/bin/validate-website CHANGED
@@ -5,47 +5,17 @@ developer_mode = false
  developer_mode = true if __FILE__ == $0
  require 'rubygems' if developer_mode

- require 'validator'
- require 'anemone'
- require 'colorful_messages'
  require 'validate_website'

- include ColorfulMessages
-
  validate_website = ValidateWebsite.new(ARGV)
  options = validate_website.options

- exit_code = 0
-
- Anemone.crawl(options[:site],
- :user_agent => options[:useragent],
- :authorization => options[:auth]) do |anemone|
-
- anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
-
- anemone.on_every_page { |page|
- url = page.url.to_s
- print info(url)
-
- # validate html/html+xml
- if page.html? && page.fetched?
- validator = Validator.new(page)
- msg = " well formed? %s" % validator.valid?
- if validator.valid?
- puts success(msg)
- else
- exit_code = 1
- puts error(msg)
- validate_website.to_file(url)
- end
- end

- if options[:not_found] && page.not_found?
- exit_code = 1
- puts error("%s linked in %s but not exist" % [url, page.referer])
- validate_website.to_file(url)
- end
- }
- end
+ exit_code = validate_website.crawl options[:site],
+ :user_agent => options[:useragent],
+ :authorization => options[:auth],
+ :cookies => options[:cookies],
+ :accept_cookies => options[:accept_cookies],
+ :verbose => options[:verbose]

  exit(exit_code)
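The executable now delegates crawling to the new ValidateWebsite#crawl API instead of driving Anemone directly. A minimal sketch of calling that API from Ruby, using only methods visible in this diff (CLI-style args to #initialize, the #options reader, #crawl returning an exit code, and the #anemone reader added below); the flag values are illustrative:

  require 'rubygems'
  require 'validate_website'

  # Same option parser the executable uses; the site defaults to http://localhost:3000/.
  vw = ValidateWebsite.new(['-v', '-n'])
  code = vw.crawl(vw.options[:site],
                  :user_agent => vw.options[:useragent],
                  :cookies    => vw.options[:cookies],
                  :verbose    => vw.options[:verbose])
  # The Anemone instance is exposed after the crawl (see the specs below).
  puts vw.anemone.pages.size
  exit(code)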
data/lib/validate_website.rb CHANGED
@@ -1,11 +1,18 @@
  require 'optparse'
  require 'open-uri'
+ require 'validator'
+ require 'anemone'
+ require 'colorful_messages'
+
+ include ColorfulMessages

  class ValidateWebsite

  attr_reader :options

- def initialize(args)
+ attr_reader :anemone
+
+ def initialize(args=[])
  @options = {
  :site => 'http://localhost:3000/',
  :useragent => Anemone::Core::DEFAULT_OPTS[:user_agent],
@@ -14,6 +21,9 @@ class ValidateWebsite
  :auth => nil,
  # log not found url (404 status code)
  :not_found => false,
+ :cookies => nil,
+ :accept_cookies => true,
+ :verbose => false,
  }
  parse(args)

@@ -42,6 +52,8 @@ class ValidateWebsite
  o.on("--auth=[user,pass]", Array,
  "Basic http authentification") { |v| @options[:auth] = v }
  o.on("-n", "--not-found", "Log not found url") { |v| @options[:not_found] = v }
+ o.on("-c", "--cookies=val", "Set defaults cookies") { |v| @options[:cookies] = v }
+ o.on("-v", "--verbose", "Verbose") { |v| @options[:verbose] = v }

  o.separator ""
  o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
@@ -49,7 +61,74 @@ class ValidateWebsite
  opts.parse!(args)
  end

+ def get_url(page, elem, attrname)
+ u = elem.attributes[attrname] if elem.attributes[attrname]
+ return if u.nil?
+ begin
+ abs = page.to_absolute(URI(u))
+ rescue
+ abs = nil
+ end
+ return abs if abs && page.in_domain?(abs)
+ end
+
  def to_file(msg)
  open(options[:file], 'a').write("#{msg}\n") if options[:file]
  end
+
+ def crawl(site, opts={})
+ exit_code = 0
+
+ @anemone = Anemone.crawl(site, opts) do |anemone|
+ anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
+
+ anemone.focus_crawl { |p|
+ links = []
+ if p.html?
+ p.doc.css('img, script, iframe').each do |elem|
+ url = get_url(p, elem, "src")
+ links << url unless url.nil?
+ end
+ p.doc.css('link').each do |link|
+ url = get_url(p, link, "href")
+ links << url unless url.nil?
+ end
+ end
+ if p.content_type == 'text/css'
+ p.body.scan(/url\((['".\/\w-]+)\)/).each do |url|
+ url = url.to_s.gsub("'", "").gsub('"', '')
+ abs = p.to_absolute(URI(url))
+ links << abs
+ end
+ end
+ links.uniq!
+ p.links.concat(links)
+ }
+
+ anemone.on_every_page { |page|
+ url = page.url.to_s
+
+ # validate html/html+xml
+ if page.html? && page.fetched?
+ print info(url)
+ validator = Validator.new(page)
+ msg = " well formed? %s" % validator.valid?
+ if validator.valid?
+ puts success(msg)
+ else
+ exit_code = 1
+ puts error(msg)
+ to_file(url)
+ end
+ end
+
+ if options[:not_found] && page.not_found?
+ exit_code = 1
+ puts error("%s linked in %s but not exist" % [url, page.referer])
+ to_file(url)
+ end
+ }
+ end
+ exit_code
+ end
  end
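The focus_crawl block above is what makes the crawler follow img/script/iframe/link assets and url() references inside stylesheets. A standalone illustration of the same url() pattern used in #crawl (the sample stylesheet string is made up; the paths mirror the specs below):

  css = ".a {background-image: url('/image/pouet.png')} .b {background-image: url(\"pouet\")}"
  css.scan(/url\((['".\/\w-]+)\)/).flatten.each do |u|
    puts u.gsub("'", "").gsub('"', '')   # => /image/pouet.png, then pouet
  end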
data/spec/css_spec.rb ADDED
@@ -0,0 +1,53 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ describe ValidateWebsite do
+
+ before(:each) do
+ FakeWeb.clean_registry
+ end
+
+ it "should crawl css and extract url" do
+ pages = []
+ pages << FakePage.new('test.css',
+ :body => ".test {background-image: url(pouet);}
+ .tests {background-image: url(/image/pouet.png)}
+ .tests {background-image: url(/image/pouet_42.png)}
+ .tests {background-image: url(/image/pouet)}",
+ :content_type => 'text/css')
+ pages << FakePage.new('pouet',
+ :content_type => 'image/png')
+ pages << FakePage.new('image/pouet',
+ :content_type => 'image/png')
+ pages << FakePage.new('image/pouet.png',
+ :content_type => 'image/png')
+ pages << FakePage.new('image/pouet_42.png',
+ :content_type => 'image/png')
+ validate_website = ValidateWebsite.new
+ validate_website.crawl(pages[0].url)
+ validate_website.anemone.should have(5).pages
+ end
+
+ it "should extract url with single quote" do
+ pages = []
+ pages << FakePage.new('test.css',
+ :body => ".test {background-image: url('pouet');}",
+ :content_type => 'text/css')
+ pages << FakePage.new('pouet',
+ :content_type => 'image/png')
+ validate_website = ValidateWebsite.new
+ validate_website.crawl(pages[0].url)
+ validate_website.anemone.should have(2).pages
+ end
+
+ it "should extract url with double quote" do
+ pages = []
+ pages << FakePage.new('test.css',
+ :body => ".test {background-image: url(\"pouet\");}",
+ :content_type => 'text/css')
+ pages << FakePage.new('pouet',
+ :content_type => 'image/png')
+ validate_website = ValidateWebsite.new
+ validate_website.crawl(pages[0].url)
+ validate_website.anemone.should have(2).pages
+ end
+ end
data/spec/fakeweb_helper.rb ADDED
@@ -0,0 +1,61 @@
+ begin
+ require 'fakeweb'
+ rescue LoadError
+ warn "You need the 'fakeweb' gem installed to test ValidateWebsite"
+ exit
+ end
+
+ FakeWeb.allow_net_connect = false
+
+ class FakePage
+ attr_accessor :links
+ attr_accessor :hrefs
+ attr_accessor :body
+
+ def initialize(name = '', options = {})
+ @name = name
+ @links = [options[:links]].flatten if options.has_key?(:links)
+ @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
+ @redirect = options[:redirect] if options.has_key?(:redirect)
+ @content_type = options[:content_type] || "text/html"
+ @body = options[:body]
+
+ create_body unless @body
+ add_to_fakeweb
+ end
+
+ def url
+ SPEC_DOMAIN + @name
+ end
+
+ private
+
+ def create_body
+ @body = "<html><body>"
+ @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
+ @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
+ @body += "</body></html>"
+ end
+
+ def add_to_fakeweb
+ options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
+
+ if @redirect
+ options[:status] = [301, "Permanently Moved"]
+
+ # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
+ redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
+ options[:location] = redirect_url
+
+ # register the page this one redirects to
+ FakeWeb.register_uri(:get, redirect_url, {:body => '',
+ :content_type => @content_type,
+ :status => [200, "OK"]})
+ end
+
+ FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
+ end
+ end
+
+ #default root
+ #ValidateWebSiteTest::FakePage.new
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,8 @@
+ require 'rubygems'
+ require File.dirname(__FILE__) + '/fakeweb_helper'
+
+ $:.unshift(File.dirname(__FILE__) + '/../lib/')
+ require 'anemone'
+ require 'validate_website'
+
+ SPEC_DOMAIN = 'http://www.example.com/'
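A hedged note on running the new suite: this release adds no rake task for the specs, so with RSpec 1.3 and FakeWeb installed they would typically be run from the gem root with the RSpec 1.x runner (for example: spec spec/).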
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: validate-website
  version: !ruby/object:Gem::Version
- hash: 17
+ hash: 25
  prerelease: false
  segments:
  - 0
  - 3
- - 1
- version: 0.3.1
+ - 5
+ version: 0.3.5
  platform: ruby
  authors:
  - Laurent Arnoud
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-08-18 00:00:00 +02:00
+ date: 2010-08-25 00:00:00 +02:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -49,6 +49,38 @@ dependencies:
  version: "1.1"
  type: :runtime
  version_requirements: *id002
+ - !ruby/object:Gem::Dependency
+ name: rspec
+ prerelease: false
+ requirement: &id003 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 27
+ segments:
+ - 1
+ - 3
+ - 0
+ version: 1.3.0
+ type: :development
+ version_requirements: *id003
+ - !ruby/object:Gem::Dependency
+ name: fakeweb
+ prerelease: false
+ requirement: &id004 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 27
+ segments:
+ - 1
+ - 3
+ - 0
+ version: 1.3.0
+ type: :development
+ version_requirements: *id004
  description: Web crawler for checking the validity of your documents
  email: laurent@spkdev.net
  executables:
@@ -150,6 +182,9 @@ files:
  - lib/xhtml/xhtml-ruby-1.xsd
  - lib/validate_website.rb
  - bin/validate-website
+ - spec/spec_helper.rb
+ - spec/css_spec.rb
+ - spec/fakeweb_helper.rb
  has_rdoc: true
  homepage: http://github.com/spk/validate-website
  licenses: []