validate-website 0.3.1 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -17,6 +17,8 @@
  --auth=user,pass \ # http auth
  -e 'redirect|news' \ # exclude regex
  -n # log not found (404)
+ -c "name=val;name2=val2"
+ -v # verbose

  == REQUIREMENTS:

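The two flags added above are the cookie and verbose options introduced in 0.3.5. A hedged example invocation using only flags shown in this diff (the crawled site defaults to http://localhost:3000/, per lib/validate_website.rb below):

  validate-website -c "name=val;name2=val2" -v -n
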
data/Rakefile CHANGED
@@ -1,4 +1,3 @@
- require 'rake/testtask'
  require 'rake/packagetask'
  require 'rake/rdoctask'
  require 'rake'
@@ -7,10 +6,10 @@ require 'find'
  # Globals

  PKG_NAME = 'validate-website'
- PKG_VERSION = '0.3.1'
+ PKG_VERSION = '0.3.5'

  PKG_FILES = ['README.rdoc', 'Rakefile']
- Find.find('lib/', 'bin/') do |f|
+ Find.find('lib/', 'bin/', 'spec/') do |f|
  if FileTest.directory?(f) and f =~ /\.svn|\.git/
  Find.prune
  else
@@ -22,11 +21,6 @@ end

  task :default => [:clean, :repackage]

- #Rake::TestTask.new do |t|
- #t.libs << "test"
- #t.test_files = FileList['test/tc_*.rb']
- #end
-
  Rake::RDocTask.new do |rd|
  f = []
  require 'find'
@@ -61,6 +55,8 @@ spec = Gem::Specification.new do |s|
  s.requirements << 'spk-anemone' << 'rainbow'
  s.add_dependency('spk-anemone', '>= 0.4.0')
  s.add_dependency('rainbow', '>= 1.1')
+ s.add_development_dependency('rspec', '>= 1.3.0')
+ s.add_development_dependency('fakeweb', '>= 1.3.0')
  s.require_path = 'lib'
  s.bindir = 'bin'
  s.executables << 'validate-website'
data/bin/validate-website CHANGED
@@ -5,47 +5,17 @@ developer_mode = false
  developer_mode = true if __FILE__ == $0
  require 'rubygems' if developer_mode

- require 'validator'
- require 'anemone'
- require 'colorful_messages'
  require 'validate_website'

- include ColorfulMessages
-
  validate_website = ValidateWebsite.new(ARGV)
  options = validate_website.options

- exit_code = 0
-
- Anemone.crawl(options[:site],
- :user_agent => options[:useragent],
- :authorization => options[:auth]) do |anemone|
-
- anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
-
- anemone.on_every_page { |page|
- url = page.url.to_s
- print info(url)
-
- # validate html/html+xml
- if page.html? && page.fetched?
- validator = Validator.new(page)
- msg = " well formed? %s" % validator.valid?
- if validator.valid?
- puts success(msg)
- else
- exit_code = 1
- puts error(msg)
- validate_website.to_file(url)
- end
- end

- if options[:not_found] && page.not_found?
- exit_code = 1
- puts error("%s linked in %s but not exist" % [url, page.referer])
- validate_website.to_file(url)
- end
- }
- end
+ exit_code = validate_website.crawl options[:site],
+ :user_agent => options[:useragent],
+ :authorization => options[:auth],
+ :cookies => options[:cookies],
+ :accept_cookies => options[:accept_cookies],
+ :verbose => options[:verbose]

  exit(exit_code)
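The executable now delegates crawling to the new ValidateWebsite#crawl API instead of driving Anemone directly. A minimal sketch of calling that API from Ruby, using only methods visible in this diff (CLI-style args to #initialize, the #options reader, #crawl returning an exit code, and the #anemone reader added below); the flag values are illustrative:

  require 'rubygems'
  require 'validate_website'

  # Same option parser the executable uses; the site defaults to http://localhost:3000/.
  vw = ValidateWebsite.new(['-v', '-n'])
  code = vw.crawl(vw.options[:site],
                  :user_agent => vw.options[:useragent],
                  :cookies    => vw.options[:cookies],
                  :verbose    => vw.options[:verbose])
  # The Anemone instance is exposed after the crawl (see the specs below).
  puts vw.anemone.pages.size
  exit(code)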
data/lib/validate_website.rb CHANGED
@@ -1,11 +1,18 @@
  require 'optparse'
  require 'open-uri'
+ require 'validator'
+ require 'anemone'
+ require 'colorful_messages'
+
+ include ColorfulMessages

  class ValidateWebsite

  attr_reader :options

- def initialize(args)
+ attr_reader :anemone
+
+ def initialize(args=[])
  @options = {
  :site => 'http://localhost:3000/',
  :useragent => Anemone::Core::DEFAULT_OPTS[:user_agent],
@@ -14,6 +21,9 @@ class ValidateWebsite
  :auth => nil,
  # log not found url (404 status code)
  :not_found => false,
+ :cookies => nil,
+ :accept_cookies => true,
+ :verbose => false,
  }
  parse(args)

@@ -42,6 +52,8 @@ class ValidateWebsite
  o.on("--auth=[user,pass]", Array,
  "Basic http authentification") { |v| @options[:auth] = v }
  o.on("-n", "--not-found", "Log not found url") { |v| @options[:not_found] = v }
+ o.on("-c", "--cookies=val", "Set defaults cookies") { |v| @options[:cookies] = v }
+ o.on("-v", "--verbose", "Verbose") { |v| @options[:verbose] = v }

  o.separator ""
  o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
@@ -49,7 +61,74 @@ class ValidateWebsite
  opts.parse!(args)
  end

+ def get_url(page, elem, attrname)
+ u = elem.attributes[attrname] if elem.attributes[attrname]
+ return if u.nil?
+ begin
+ abs = page.to_absolute(URI(u))
+ rescue
+ abs = nil
+ end
+ return abs if abs && page.in_domain?(abs)
+ end
+
  def to_file(msg)
  open(options[:file], 'a').write("#{msg}\n") if options[:file]
  end
+
+ def crawl(site, opts={})
+ exit_code = 0
+
+ @anemone = Anemone.crawl(site, opts) do |anemone|
+ anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
+
+ anemone.focus_crawl { |p|
+ links = []
+ if p.html?
+ p.doc.css('img, script, iframe').each do |elem|
+ url = get_url(p, elem, "src")
+ links << url unless url.nil?
+ end
+ p.doc.css('link').each do |link|
+ url = get_url(p, link, "href")
+ links << url unless url.nil?
+ end
+ end
+ if p.content_type == 'text/css'
+ p.body.scan(/url\((['".\/\w-]+)\)/).each do |url|
+ url = url.to_s.gsub("'", "").gsub('"', '')
+ abs = p.to_absolute(URI(url))
+ links << abs
+ end
+ end
+ links.uniq!
+ p.links.concat(links)
+ }
+
+ anemone.on_every_page { |page|
+ url = page.url.to_s
+
+ # validate html/html+xml
+ if page.html? && page.fetched?
+ print info(url)
+ validator = Validator.new(page)
+ msg = " well formed? %s" % validator.valid?
+ if validator.valid?
+ puts success(msg)
+ else
+ exit_code = 1
+ puts error(msg)
+ to_file(url)
+ end
+ end
+
+ if options[:not_found] && page.not_found?
+ exit_code = 1
+ puts error("%s linked in %s but not exist" % [url, page.referer])
+ to_file(url)
+ end
+ }
+ end
+ exit_code
+ end
  end
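The focus_crawl block above is what makes the crawler follow img/script/iframe/link assets and url() references inside stylesheets. A standalone illustration of the same url() pattern used in #crawl (the sample stylesheet string is made up; the paths mirror the specs below):

  css = ".a {background-image: url('/image/pouet.png')} .b {background-image: url(\"pouet\")}"
  css.scan(/url\((['".\/\w-]+)\)/).flatten.each do |u|
    puts u.gsub("'", "").gsub('"', '')   # => /image/pouet.png, then pouet
  end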
data/spec/css_spec.rb ADDED
@@ -0,0 +1,53 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ describe ValidateWebsite do
+
+ before(:each) do
+ FakeWeb.clean_registry
+ end
+
+ it "should crawl css and extract url" do
+ pages = []
+ pages << FakePage.new('test.css',
+ :body => ".test {background-image: url(pouet);}
+ .tests {background-image: url(/image/pouet.png)}
+ .tests {background-image: url(/image/pouet_42.png)}
+ .tests {background-image: url(/image/pouet)}",
+ :content_type => 'text/css')
+ pages << FakePage.new('pouet',
+ :content_type => 'image/png')
+ pages << FakePage.new('image/pouet',
+ :content_type => 'image/png')
+ pages << FakePage.new('image/pouet.png',
+ :content_type => 'image/png')
+ pages << FakePage.new('image/pouet_42.png',
+ :content_type => 'image/png')
+ validate_website = ValidateWebsite.new
+ validate_website.crawl(pages[0].url)
+ validate_website.anemone.should have(5).pages
+ end
+
+ it "should extract url with single quote" do
+ pages = []
+ pages << FakePage.new('test.css',
+ :body => ".test {background-image: url('pouet');}",
+ :content_type => 'text/css')
+ pages << FakePage.new('pouet',
+ :content_type => 'image/png')
+ validate_website = ValidateWebsite.new
+ validate_website.crawl(pages[0].url)
+ validate_website.anemone.should have(2).pages
+ end
+
+ it "should extract url with double quote" do
+ pages = []
+ pages << FakePage.new('test.css',
+ :body => ".test {background-image: url(\"pouet\");}",
+ :content_type => 'text/css')
+ pages << FakePage.new('pouet',
+ :content_type => 'image/png')
+ validate_website = ValidateWebsite.new
+ validate_website.crawl(pages[0].url)
+ validate_website.anemone.should have(2).pages
+ end
+ end
data/spec/fakeweb_helper.rb ADDED
@@ -0,0 +1,61 @@
+ begin
+ require 'fakeweb'
+ rescue LoadError
+ warn "You need the 'fakeweb' gem installed to test ValidateWebsite"
+ exit
+ end
+
+ FakeWeb.allow_net_connect = false
+
+ class FakePage
+ attr_accessor :links
+ attr_accessor :hrefs
+ attr_accessor :body
+
+ def initialize(name = '', options = {})
+ @name = name
+ @links = [options[:links]].flatten if options.has_key?(:links)
+ @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
+ @redirect = options[:redirect] if options.has_key?(:redirect)
+ @content_type = options[:content_type] || "text/html"
+ @body = options[:body]
+
+ create_body unless @body
+ add_to_fakeweb
+ end
+
+ def url
+ SPEC_DOMAIN + @name
+ end
+
+ private
+
+ def create_body
+ @body = "<html><body>"
+ @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
+ @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
+ @body += "</body></html>"
+ end
+
+ def add_to_fakeweb
+ options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
+
+ if @redirect
+ options[:status] = [301, "Permanently Moved"]
+
+ # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
+ redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
+ options[:location] = redirect_url
+
+ # register the page this one redirects to
+ FakeWeb.register_uri(:get, redirect_url, {:body => '',
+ :content_type => @content_type,
+ :status => [200, "OK"]})
+ end
+
+ FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
+ end
+ end
+
+ #default root
+ #ValidateWebSiteTest::FakePage.new
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,8 @@
+ require 'rubygems'
+ require File.dirname(__FILE__) + '/fakeweb_helper'
+
+ $:.unshift(File.dirname(__FILE__) + '/../lib/')
+ require 'anemone'
+ require 'validate_website'
+
+ SPEC_DOMAIN = 'http://www.example.com/'
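A hedged note on running the new suite: this release adds no rake task for the specs, so with RSpec 1.3 and FakeWeb installed they would typically be run from the gem root with the RSpec 1.x runner (for example: spec spec/).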
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: validate-website
  version: !ruby/object:Gem::Version
- hash: 17
+ hash: 25
  prerelease: false
  segments:
  - 0
  - 3
- - 1
- version: 0.3.1
+ - 5
+ version: 0.3.5
  platform: ruby
  authors:
  - Laurent Arnoud
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-08-18 00:00:00 +02:00
+ date: 2010-08-25 00:00:00 +02:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -49,6 +49,38 @@ dependencies:
  version: "1.1"
  type: :runtime
  version_requirements: *id002
+ - !ruby/object:Gem::Dependency
+ name: rspec
+ prerelease: false
+ requirement: &id003 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 27
+ segments:
+ - 1
+ - 3
+ - 0
+ version: 1.3.0
+ type: :development
+ version_requirements: *id003
+ - !ruby/object:Gem::Dependency
+ name: fakeweb
+ prerelease: false
+ requirement: &id004 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 27
+ segments:
+ - 1
+ - 3
+ - 0
+ version: 1.3.0
+ type: :development
+ version_requirements: *id004
  description: Web crawler for checking the validity of your documents
  email: laurent@spkdev.net
  executables:
@@ -150,6 +182,9 @@ files:
  - lib/xhtml/xhtml-ruby-1.xsd
  - lib/validate_website.rb
  - bin/validate-website
+ - spec/spec_helper.rb
+ - spec/css_spec.rb
+ - spec/fakeweb_helper.rb
  has_rdoc: true
  homepage: http://github.com/spk/validate-website
  licenses: []