validate-website 0.3.1 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +2 -0
- data/Rakefile +4 -8
- data/bin/validate-website +6 -36
- data/lib/validate_website.rb +80 -1
- data/spec/css_spec.rb +53 -0
- data/spec/fakeweb_helper.rb +61 -0
- data/spec/spec_helper.rb +8 -0
- metadata +39 -4
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'rake/testtask'
|
2
1
|
require 'rake/packagetask'
|
3
2
|
require 'rake/rdoctask'
|
4
3
|
require 'rake'
|
@@ -7,10 +6,10 @@ require 'find'
|
|
7
6
|
# Globals
|
8
7
|
|
9
8
|
PKG_NAME = 'validate-website'
|
10
|
-
PKG_VERSION = '0.3.1'
|
9
|
+
PKG_VERSION = '0.3.5'
|
11
10
|
|
12
11
|
PKG_FILES = ['README.rdoc', 'Rakefile']
|
13
|
-
Find.find('lib/', 'bin/') do |f|
|
12
|
+
Find.find('lib/', 'bin/', 'spec/') do |f|
|
14
13
|
if FileTest.directory?(f) and f =~ /\.svn|\.git/
|
15
14
|
Find.prune
|
16
15
|
else
|
@@ -22,11 +21,6 @@ end
|
|
22
21
|
|
23
22
|
task :default => [:clean, :repackage]
|
24
23
|
|
25
|
-
#Rake::TestTask.new do |t|
|
26
|
-
#t.libs << "test"
|
27
|
-
#t.test_files = FileList['test/tc_*.rb']
|
28
|
-
#end
|
29
|
-
|
30
24
|
Rake::RDocTask.new do |rd|
|
31
25
|
f = []
|
32
26
|
require 'find'
|
@@ -61,6 +55,8 @@ spec = Gem::Specification.new do |s|
|
|
61
55
|
s.requirements << 'spk-anemone' << 'rainbow'
|
62
56
|
s.add_dependency('spk-anemone', '>= 0.4.0')
|
63
57
|
s.add_dependency('rainbow', '>= 1.1')
|
58
|
+
s.add_development_dependency('rspec', '>= 1.3.0')
|
59
|
+
s.add_development_dependency('fakeweb', '>= 1.3.0')
|
64
60
|
s.require_path = 'lib'
|
65
61
|
s.bindir = 'bin'
|
66
62
|
s.executables << 'validate-website'
|
data/bin/validate-website
CHANGED
@@ -5,47 +5,17 @@ developer_mode = false
|
|
5
5
|
developer_mode = true if __FILE__ == $0
|
6
6
|
require 'rubygems' if developer_mode
|
7
7
|
|
8
|
-
require 'validator'
|
9
|
-
require 'anemone'
|
10
|
-
require 'colorful_messages'
|
11
8
|
require 'validate_website'
|
12
9
|
|
13
|
-
include ColorfulMessages
|
14
|
-
|
15
10
|
validate_website = ValidateWebsite.new(ARGV)
|
16
11
|
options = validate_website.options
|
17
12
|
|
18
|
-
exit_code = 0
|
19
|
-
|
20
|
-
Anemone.crawl(options[:site],
|
21
|
-
:user_agent => options[:useragent],
|
22
|
-
:authorization => options[:auth]) do |anemone|
|
23
|
-
|
24
|
-
anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
|
25
|
-
|
26
|
-
anemone.on_every_page { |page|
|
27
|
-
url = page.url.to_s
|
28
|
-
print info(url)
|
29
|
-
|
30
|
-
# validate html/html+xml
|
31
|
-
if page.html? && page.fetched?
|
32
|
-
validator = Validator.new(page)
|
33
|
-
msg = " well formed? %s" % validator.valid?
|
34
|
-
if validator.valid?
|
35
|
-
puts success(msg)
|
36
|
-
else
|
37
|
-
exit_code = 1
|
38
|
-
puts error(msg)
|
39
|
-
validate_website.to_file(url)
|
40
|
-
end
|
41
|
-
end
|
42
13
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
end
|
14
|
+
exit_code = validate_website.crawl options[:site],
|
15
|
+
:user_agent => options[:useragent],
|
16
|
+
:authorization => options[:auth],
|
17
|
+
:cookies => options[:cookies],
|
18
|
+
:accept_cookies => options[:accept_cookies],
|
19
|
+
:verbose => options[:verbose]
|
50
20
|
|
51
21
|
exit(exit_code)
|
data/lib/validate_website.rb
CHANGED
@@ -1,11 +1,18 @@
|
|
1
1
|
require 'optparse'
|
2
2
|
require 'open-uri'
|
3
|
+
require 'validator'
|
4
|
+
require 'anemone'
|
5
|
+
require 'colorful_messages'
|
6
|
+
|
7
|
+
include ColorfulMessages
|
3
8
|
|
4
9
|
class ValidateWebsite
|
5
10
|
|
6
11
|
attr_reader :options
|
7
12
|
|
8
|
-
|
13
|
+
attr_reader :anemone
|
14
|
+
|
15
|
+
def initialize(args=[])
|
9
16
|
@options = {
|
10
17
|
:site => 'http://localhost:3000/',
|
11
18
|
:useragent => Anemone::Core::DEFAULT_OPTS[:user_agent],
|
@@ -14,6 +21,9 @@ class ValidateWebsite
|
|
14
21
|
:auth => nil,
|
15
22
|
# log not found url (404 status code)
|
16
23
|
:not_found => false,
|
24
|
+
:cookies => nil,
|
25
|
+
:accept_cookies => true,
|
26
|
+
:verbose => false,
|
17
27
|
}
|
18
28
|
parse(args)
|
19
29
|
|
@@ -42,6 +52,8 @@ class ValidateWebsite
|
|
42
52
|
o.on("--auth=[user,pass]", Array,
|
43
53
|
"Basic http authentification") { |v| @options[:auth] = v }
|
44
54
|
o.on("-n", "--not-found", "Log not found url") { |v| @options[:not_found] = v }
|
55
|
+
o.on("-c", "--cookies=val", "Set defaults cookies") { |v| @options[:cookies] = v }
|
56
|
+
o.on("-v", "--verbose", "Verbose") { |v| @options[:verbose] = v }
|
45
57
|
|
46
58
|
o.separator ""
|
47
59
|
o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
|
@@ -49,7 +61,74 @@ class ValidateWebsite
|
|
49
61
|
opts.parse!(args)
|
50
62
|
end
|
51
63
|
|
64
|
+
def get_url(page, elem, attrname)
|
65
|
+
u = elem.attributes[attrname] if elem.attributes[attrname]
|
66
|
+
return if u.nil?
|
67
|
+
begin
|
68
|
+
abs = page.to_absolute(URI(u))
|
69
|
+
rescue
|
70
|
+
abs = nil
|
71
|
+
end
|
72
|
+
return abs if abs && page.in_domain?(abs)
|
73
|
+
end
|
74
|
+
|
52
75
|
def to_file(msg)
|
53
76
|
open(options[:file], 'a').write("#{msg}\n") if options[:file]
|
54
77
|
end
|
78
|
+
|
79
|
+
def crawl(site, opts={})
|
80
|
+
exit_code = 0
|
81
|
+
|
82
|
+
@anemone = Anemone.crawl(site, opts) do |anemone|
|
83
|
+
anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
|
84
|
+
|
85
|
+
anemone.focus_crawl { |p|
|
86
|
+
links = []
|
87
|
+
if p.html?
|
88
|
+
p.doc.css('img, script, iframe').each do |elem|
|
89
|
+
url = get_url(p, elem, "src")
|
90
|
+
links << url unless url.nil?
|
91
|
+
end
|
92
|
+
p.doc.css('link').each do |link|
|
93
|
+
url = get_url(p, link, "href")
|
94
|
+
links << url unless url.nil?
|
95
|
+
end
|
96
|
+
end
|
97
|
+
if p.content_type == 'text/css'
|
98
|
+
p.body.scan(/url\((['".\/\w-]+)\)/).each do |url|
|
99
|
+
url = url.to_s.gsub("'", "").gsub('"', '')
|
100
|
+
abs = p.to_absolute(URI(url))
|
101
|
+
links << abs
|
102
|
+
end
|
103
|
+
end
|
104
|
+
links.uniq!
|
105
|
+
p.links.concat(links)
|
106
|
+
}
|
107
|
+
|
108
|
+
anemone.on_every_page { |page|
|
109
|
+
url = page.url.to_s
|
110
|
+
|
111
|
+
# validate html/html+xml
|
112
|
+
if page.html? && page.fetched?
|
113
|
+
print info(url)
|
114
|
+
validator = Validator.new(page)
|
115
|
+
msg = " well formed? %s" % validator.valid?
|
116
|
+
if validator.valid?
|
117
|
+
puts success(msg)
|
118
|
+
else
|
119
|
+
exit_code = 1
|
120
|
+
puts error(msg)
|
121
|
+
to_file(url)
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
if options[:not_found] && page.not_found?
|
126
|
+
exit_code = 1
|
127
|
+
puts error("%s linked in %s but not exist" % [url, page.referer])
|
128
|
+
to_file(url)
|
129
|
+
end
|
130
|
+
}
|
131
|
+
end
|
132
|
+
exit_code
|
133
|
+
end
|
55
134
|
end
|
data/spec/css_spec.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
describe ValidateWebsite do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
FakeWeb.clean_registry
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should crawl css and extract url" do
|
10
|
+
pages = []
|
11
|
+
pages << FakePage.new('test.css',
|
12
|
+
:body => ".test {background-image: url(pouet);}
|
13
|
+
.tests {background-image: url(/image/pouet.png)}
|
14
|
+
.tests {background-image: url(/image/pouet_42.png)}
|
15
|
+
.tests {background-image: url(/image/pouet)}",
|
16
|
+
:content_type => 'text/css')
|
17
|
+
pages << FakePage.new('pouet',
|
18
|
+
:content_type => 'image/png')
|
19
|
+
pages << FakePage.new('image/pouet',
|
20
|
+
:content_type => 'image/png')
|
21
|
+
pages << FakePage.new('image/pouet.png',
|
22
|
+
:content_type => 'image/png')
|
23
|
+
pages << FakePage.new('image/pouet_42.png',
|
24
|
+
:content_type => 'image/png')
|
25
|
+
validate_website = ValidateWebsite.new
|
26
|
+
validate_website.crawl(pages[0].url)
|
27
|
+
validate_website.anemone.should have(5).pages
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should extract url with single quote" do
|
31
|
+
pages = []
|
32
|
+
pages << FakePage.new('test.css',
|
33
|
+
:body => ".test {background-image: url('pouet');}",
|
34
|
+
:content_type => 'text/css')
|
35
|
+
pages << FakePage.new('pouet',
|
36
|
+
:content_type => 'image/png')
|
37
|
+
validate_website = ValidateWebsite.new
|
38
|
+
validate_website.crawl(pages[0].url)
|
39
|
+
validate_website.anemone.should have(2).pages
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should extract url with double quote" do
|
43
|
+
pages = []
|
44
|
+
pages << FakePage.new('test.css',
|
45
|
+
:body => ".test {background-image: url(\"pouet\");}",
|
46
|
+
:content_type => 'text/css')
|
47
|
+
pages << FakePage.new('pouet',
|
48
|
+
:content_type => 'image/png')
|
49
|
+
validate_website = ValidateWebsite.new
|
50
|
+
validate_website.crawl(pages[0].url)
|
51
|
+
validate_website.anemone.should have(2).pages
|
52
|
+
end
|
53
|
+
end
|
data/spec/fakeweb_helper.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
begin
|
2
|
+
require 'fakeweb'
|
3
|
+
rescue LoadError
|
4
|
+
warn "You need the 'fakeweb' gem installed to test ValidateWebsite"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
FakeWeb.allow_net_connect = false
|
9
|
+
|
10
|
+
class FakePage
|
11
|
+
attr_accessor :links
|
12
|
+
attr_accessor :hrefs
|
13
|
+
attr_accessor :body
|
14
|
+
|
15
|
+
def initialize(name = '', options = {})
|
16
|
+
@name = name
|
17
|
+
@links = [options[:links]].flatten if options.has_key?(:links)
|
18
|
+
@hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
|
19
|
+
@redirect = options[:redirect] if options.has_key?(:redirect)
|
20
|
+
@content_type = options[:content_type] || "text/html"
|
21
|
+
@body = options[:body]
|
22
|
+
|
23
|
+
create_body unless @body
|
24
|
+
add_to_fakeweb
|
25
|
+
end
|
26
|
+
|
27
|
+
def url
|
28
|
+
SPEC_DOMAIN + @name
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def create_body
|
34
|
+
@body = "<html><body>"
|
35
|
+
@links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
|
36
|
+
@hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
|
37
|
+
@body += "</body></html>"
|
38
|
+
end
|
39
|
+
|
40
|
+
def add_to_fakeweb
|
41
|
+
options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
|
42
|
+
|
43
|
+
if @redirect
|
44
|
+
options[:status] = [301, "Permanently Moved"]
|
45
|
+
|
46
|
+
# only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
|
47
|
+
redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
|
48
|
+
options[:location] = redirect_url
|
49
|
+
|
50
|
+
# register the page this one redirects to
|
51
|
+
FakeWeb.register_uri(:get, redirect_url, {:body => '',
|
52
|
+
:content_type => @content_type,
|
53
|
+
:status => [200, "OK"]})
|
54
|
+
end
|
55
|
+
|
56
|
+
FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
#default root
|
61
|
+
#ValidateWebSiteTest::FakePage.new
|
data/spec/spec_helper.rb
ADDED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: validate-website
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 1
|
10
|
-
version: 0.3.1
|
9
|
+
- 5
|
10
|
+
version: 0.3.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Laurent Arnoud
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-25 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -49,6 +49,38 @@ dependencies:
|
|
49
49
|
version: "1.1"
|
50
50
|
type: :runtime
|
51
51
|
version_requirements: *id002
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: rspec
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 27
|
61
|
+
segments:
|
62
|
+
- 1
|
63
|
+
- 3
|
64
|
+
- 0
|
65
|
+
version: 1.3.0
|
66
|
+
type: :development
|
67
|
+
version_requirements: *id003
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: fakeweb
|
70
|
+
prerelease: false
|
71
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
hash: 27
|
77
|
+
segments:
|
78
|
+
- 1
|
79
|
+
- 3
|
80
|
+
- 0
|
81
|
+
version: 1.3.0
|
82
|
+
type: :development
|
83
|
+
version_requirements: *id004
|
52
84
|
description: Web crawler for checking the validity of your documents
|
53
85
|
email: laurent@spkdev.net
|
54
86
|
executables:
|
@@ -150,6 +182,9 @@ files:
|
|
150
182
|
- lib/xhtml/xhtml-ruby-1.xsd
|
151
183
|
- lib/validate_website.rb
|
152
184
|
- bin/validate-website
|
185
|
+
- spec/spec_helper.rb
|
186
|
+
- spec/css_spec.rb
|
187
|
+
- spec/fakeweb_helper.rb
|
153
188
|
has_rdoc: true
|
154
189
|
homepage: http://github.com/spk/validate-website
|
155
190
|
licenses: []
|