validate-website 0.3.1 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -17,6 +17,8 @@
17
17
  --auth=user,pass \ # http auth
18
18
  -e 'redirect|news' \ # exclude regex
19
19
  -n # log not found (404)
20
+ -c "name=val;name2=val2"
21
+ -v # verbose
20
22
 
21
23
  == REQUIREMENTS:
22
24
 
data/Rakefile CHANGED
@@ -1,4 +1,3 @@
1
- require 'rake/testtask'
2
1
  require 'rake/packagetask'
3
2
  require 'rake/rdoctask'
4
3
  require 'rake'
@@ -7,10 +6,10 @@ require 'find'
7
6
  # Globals
8
7
 
9
8
  PKG_NAME = 'validate-website'
10
- PKG_VERSION = '0.3.1'
9
+ PKG_VERSION = '0.3.5'
11
10
 
12
11
  PKG_FILES = ['README.rdoc', 'Rakefile']
13
- Find.find('lib/', 'bin/') do |f|
12
+ Find.find('lib/', 'bin/', 'spec/') do |f|
14
13
  if FileTest.directory?(f) and f =~ /\.svn|\.git/
15
14
  Find.prune
16
15
  else
@@ -22,11 +21,6 @@ end
22
21
 
23
22
  task :default => [:clean, :repackage]
24
23
 
25
- #Rake::TestTask.new do |t|
26
- #t.libs << "test"
27
- #t.test_files = FileList['test/tc_*.rb']
28
- #end
29
-
30
24
  Rake::RDocTask.new do |rd|
31
25
  f = []
32
26
  require 'find'
@@ -61,6 +55,8 @@ spec = Gem::Specification.new do |s|
61
55
  s.requirements << 'spk-anemone' << 'rainbow'
62
56
  s.add_dependency('spk-anemone', '>= 0.4.0')
63
57
  s.add_dependency('rainbow', '>= 1.1')
58
+ s.add_development_dependency('rspec', '>= 1.3.0')
59
+ s.add_development_dependency('fakeweb', '>= 1.3.0')
64
60
  s.require_path = 'lib'
65
61
  s.bindir = 'bin'
66
62
  s.executables << 'validate-website'
@@ -5,47 +5,17 @@ developer_mode = false
5
5
  developer_mode = true if __FILE__ == $0
6
6
  require 'rubygems' if developer_mode
7
7
 
8
- require 'validator'
9
- require 'anemone'
10
- require 'colorful_messages'
11
8
  require 'validate_website'
12
9
 
13
- include ColorfulMessages
14
-
15
10
  validate_website = ValidateWebsite.new(ARGV)
16
11
  options = validate_website.options
17
12
 
18
- exit_code = 0
19
-
20
- Anemone.crawl(options[:site],
21
- :user_agent => options[:useragent],
22
- :authorization => options[:auth]) do |anemone|
23
-
24
- anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
25
-
26
- anemone.on_every_page { |page|
27
- url = page.url.to_s
28
- print info(url)
29
-
30
- # validate html/html+xml
31
- if page.html? && page.fetched?
32
- validator = Validator.new(page)
33
- msg = " well formed? %s" % validator.valid?
34
- if validator.valid?
35
- puts success(msg)
36
- else
37
- exit_code = 1
38
- puts error(msg)
39
- validate_website.to_file(url)
40
- end
41
- end
42
13
 
43
- if options[:not_found] && page.not_found?
44
- exit_code = 1
45
- puts error("%s linked in %s but not exist" % [url, page.referer])
46
- validate_website.to_file(url)
47
- end
48
- }
49
- end
14
+ exit_code = validate_website.crawl options[:site],
15
+ :user_agent => options[:useragent],
16
+ :authorization => options[:auth],
17
+ :cookies => options[:cookies],
18
+ :accept_cookies => options[:accept_cookies],
19
+ :verbose => options[:verbose]
50
20
 
51
21
  exit(exit_code)
@@ -1,11 +1,18 @@
1
1
  require 'optparse'
2
2
  require 'open-uri'
3
+ require 'validator'
4
+ require 'anemone'
5
+ require 'colorful_messages'
6
+
7
+ include ColorfulMessages
3
8
 
4
9
  class ValidateWebsite
5
10
 
6
11
  attr_reader :options
7
12
 
8
- def initialize(args)
13
+ attr_reader :anemone
14
+
15
+ def initialize(args=[])
9
16
  @options = {
10
17
  :site => 'http://localhost:3000/',
11
18
  :useragent => Anemone::Core::DEFAULT_OPTS[:user_agent],
@@ -14,6 +21,9 @@ class ValidateWebsite
14
21
  :auth => nil,
15
22
  # log not found url (404 status code)
16
23
  :not_found => false,
24
+ :cookies => nil,
25
+ :accept_cookies => true,
26
+ :verbose => false,
17
27
  }
18
28
  parse(args)
19
29
 
@@ -42,6 +52,8 @@ class ValidateWebsite
42
52
  o.on("--auth=[user,pass]", Array,
43
53
  "Basic http authentification") { |v| @options[:auth] = v }
44
54
  o.on("-n", "--not-found", "Log not found url") { |v| @options[:not_found] = v }
55
+ o.on("-c", "--cookies=val", "Set defaults cookies") { |v| @options[:cookies] = v }
56
+ o.on("-v", "--verbose", "Verbose") { |v| @options[:verbose] = v }
45
57
 
46
58
  o.separator ""
47
59
  o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
@@ -49,7 +61,74 @@ class ValidateWebsite
49
61
  opts.parse!(args)
50
62
  end
51
63
 
64
+ def get_url(page, elem, attrname)
65
+ u = elem.attributes[attrname] if elem.attributes[attrname]
66
+ return if u.nil?
67
+ begin
68
+ abs = page.to_absolute(URI(u))
69
+ rescue
70
+ abs = nil
71
+ end
72
+ return abs if abs && page.in_domain?(abs)
73
+ end
74
+
52
75
  def to_file(msg)
53
76
  open(options[:file], 'a').write("#{msg}\n") if options[:file]
54
77
  end
78
+
79
+ def crawl(site, opts={})
80
+ exit_code = 0
81
+
82
+ @anemone = Anemone.crawl(site, opts) do |anemone|
83
+ anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
84
+
85
+ anemone.focus_crawl { |p|
86
+ links = []
87
+ if p.html?
88
+ p.doc.css('img, script, iframe').each do |elem|
89
+ url = get_url(p, elem, "src")
90
+ links << url unless url.nil?
91
+ end
92
+ p.doc.css('link').each do |link|
93
+ url = get_url(p, link, "href")
94
+ links << url unless url.nil?
95
+ end
96
+ end
97
+ if p.content_type == 'text/css'
98
+ p.body.scan(/url\((['".\/\w-]+)\)/).each do |url|
99
+ url = url.to_s.gsub("'", "").gsub('"', '')
100
+ abs = p.to_absolute(URI(url))
101
+ links << abs
102
+ end
103
+ end
104
+ links.uniq!
105
+ p.links.concat(links)
106
+ }
107
+
108
+ anemone.on_every_page { |page|
109
+ url = page.url.to_s
110
+
111
+ # validate html/html+xml
112
+ if page.html? && page.fetched?
113
+ print info(url)
114
+ validator = Validator.new(page)
115
+ msg = " well formed? %s" % validator.valid?
116
+ if validator.valid?
117
+ puts success(msg)
118
+ else
119
+ exit_code = 1
120
+ puts error(msg)
121
+ to_file(url)
122
+ end
123
+ end
124
+
125
+ if options[:not_found] && page.not_found?
126
+ exit_code = 1
127
+ puts error("%s linked in %s but not exist" % [url, page.referer])
128
+ to_file(url)
129
+ end
130
+ }
131
+ end
132
+ exit_code
133
+ end
55
134
  end
@@ -0,0 +1,53 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ describe ValidateWebsite do
4
+
5
+ before(:each) do
6
+ FakeWeb.clean_registry
7
+ end
8
+
9
+ it "should crawl css and extract url" do
10
+ pages = []
11
+ pages << FakePage.new('test.css',
12
+ :body => ".test {background-image: url(pouet);}
13
+ .tests {background-image: url(/image/pouet.png)}
14
+ .tests {background-image: url(/image/pouet_42.png)}
15
+ .tests {background-image: url(/image/pouet)}",
16
+ :content_type => 'text/css')
17
+ pages << FakePage.new('pouet',
18
+ :content_type => 'image/png')
19
+ pages << FakePage.new('image/pouet',
20
+ :content_type => 'image/png')
21
+ pages << FakePage.new('image/pouet.png',
22
+ :content_type => 'image/png')
23
+ pages << FakePage.new('image/pouet_42.png',
24
+ :content_type => 'image/png')
25
+ validate_website = ValidateWebsite.new
26
+ validate_website.crawl(pages[0].url)
27
+ validate_website.anemone.should have(5).pages
28
+ end
29
+
30
+ it "should extract url with single quote" do
31
+ pages = []
32
+ pages << FakePage.new('test.css',
33
+ :body => ".test {background-image: url('pouet');}",
34
+ :content_type => 'text/css')
35
+ pages << FakePage.new('pouet',
36
+ :content_type => 'image/png')
37
+ validate_website = ValidateWebsite.new
38
+ validate_website.crawl(pages[0].url)
39
+ validate_website.anemone.should have(2).pages
40
+ end
41
+
42
+ it "should extract url with double quote" do
43
+ pages = []
44
+ pages << FakePage.new('test.css',
45
+ :body => ".test {background-image: url(\"pouet\");}",
46
+ :content_type => 'text/css')
47
+ pages << FakePage.new('pouet',
48
+ :content_type => 'image/png')
49
+ validate_website = ValidateWebsite.new
50
+ validate_website.crawl(pages[0].url)
51
+ validate_website.anemone.should have(2).pages
52
+ end
53
+ end
@@ -0,0 +1,61 @@
1
+ begin
2
+ require 'fakeweb'
3
+ rescue LoadError
4
+ warn "You need the 'fakeweb' gem installed to test ValidateWebsite"
5
+ exit
6
+ end
7
+
8
+ FakeWeb.allow_net_connect = false
9
+
10
+ class FakePage
11
+ attr_accessor :links
12
+ attr_accessor :hrefs
13
+ attr_accessor :body
14
+
15
+ def initialize(name = '', options = {})
16
+ @name = name
17
+ @links = [options[:links]].flatten if options.has_key?(:links)
18
+ @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
19
+ @redirect = options[:redirect] if options.has_key?(:redirect)
20
+ @content_type = options[:content_type] || "text/html"
21
+ @body = options[:body]
22
+
23
+ create_body unless @body
24
+ add_to_fakeweb
25
+ end
26
+
27
+ def url
28
+ SPEC_DOMAIN + @name
29
+ end
30
+
31
+ private
32
+
33
+ def create_body
34
+ @body = "<html><body>"
35
+ @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
36
+ @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
37
+ @body += "</body></html>"
38
+ end
39
+
40
+ def add_to_fakeweb
41
+ options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
42
+
43
+ if @redirect
44
+ options[:status] = [301, "Permanently Moved"]
45
+
46
+ # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
47
+ redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
48
+ options[:location] = redirect_url
49
+
50
+ # register the page this one redirects to
51
+ FakeWeb.register_uri(:get, redirect_url, {:body => '',
52
+ :content_type => @content_type,
53
+ :status => [200, "OK"]})
54
+ end
55
+
56
+ FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
57
+ end
58
+ end
59
+
60
+ #default root
61
+ #ValidateWebSiteTest::FakePage.new
@@ -0,0 +1,8 @@
1
+ require 'rubygems'
2
+ require File.dirname(__FILE__) + '/fakeweb_helper'
3
+
4
+ $:.unshift(File.dirname(__FILE__) + '/../lib/')
5
+ require 'anemone'
6
+ require 'validate_website'
7
+
8
+ SPEC_DOMAIN = 'http://www.example.com/'
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: validate-website
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 1
10
- version: 0.3.1
9
+ - 5
10
+ version: 0.3.5
11
11
  platform: ruby
12
12
  authors:
13
13
  - Laurent Arnoud
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-18 00:00:00 +02:00
18
+ date: 2010-08-25 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -49,6 +49,38 @@ dependencies:
49
49
  version: "1.1"
50
50
  type: :runtime
51
51
  version_requirements: *id002
52
+ - !ruby/object:Gem::Dependency
53
+ name: rspec
54
+ prerelease: false
55
+ requirement: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 27
61
+ segments:
62
+ - 1
63
+ - 3
64
+ - 0
65
+ version: 1.3.0
66
+ type: :development
67
+ version_requirements: *id003
68
+ - !ruby/object:Gem::Dependency
69
+ name: fakeweb
70
+ prerelease: false
71
+ requirement: &id004 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ hash: 27
77
+ segments:
78
+ - 1
79
+ - 3
80
+ - 0
81
+ version: 1.3.0
82
+ type: :development
83
+ version_requirements: *id004
52
84
  description: Web crawler for checking the validity of your documents
53
85
  email: laurent@spkdev.net
54
86
  executables:
@@ -150,6 +182,9 @@ files:
150
182
  - lib/xhtml/xhtml-ruby-1.xsd
151
183
  - lib/validate_website.rb
152
184
  - bin/validate-website
185
+ - spec/spec_helper.rb
186
+ - spec/css_spec.rb
187
+ - spec/fakeweb_helper.rb
153
188
  has_rdoc: true
154
189
  homepage: http://github.com/spk/validate-website
155
190
  licenses: []