validate-website 0.3.1 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +2 -0
- data/Rakefile +4 -8
- data/bin/validate-website +6 -36
- data/lib/validate_website.rb +80 -1
- data/spec/css_spec.rb +53 -0
- data/spec/fakeweb_helper.rb +61 -0
- data/spec/spec_helper.rb +8 -0
- metadata +39 -4
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'rake/testtask'
|
2
1
|
require 'rake/packagetask'
|
3
2
|
require 'rake/rdoctask'
|
4
3
|
require 'rake'
|
@@ -7,10 +6,10 @@ require 'find'
|
|
7
6
|
# Globals
|
8
7
|
|
9
8
|
PKG_NAME = 'validate-website'
|
10
|
-
PKG_VERSION = '0.3.1'
|
9
|
+
PKG_VERSION = '0.3.5'
|
11
10
|
|
12
11
|
PKG_FILES = ['README.rdoc', 'Rakefile']
|
13
|
-
Find.find('lib/', 'bin/') do |f|
|
12
|
+
Find.find('lib/', 'bin/', 'spec/') do |f|
|
14
13
|
if FileTest.directory?(f) and f =~ /\.svn|\.git/
|
15
14
|
Find.prune
|
16
15
|
else
|
@@ -22,11 +21,6 @@ end
|
|
22
21
|
|
23
22
|
task :default => [:clean, :repackage]
|
24
23
|
|
25
|
-
#Rake::TestTask.new do |t|
|
26
|
-
#t.libs << "test"
|
27
|
-
#t.test_files = FileList['test/tc_*.rb']
|
28
|
-
#end
|
29
|
-
|
30
24
|
Rake::RDocTask.new do |rd|
|
31
25
|
f = []
|
32
26
|
require 'find'
|
@@ -61,6 +55,8 @@ spec = Gem::Specification.new do |s|
|
|
61
55
|
s.requirements << 'spk-anemone' << 'rainbow'
|
62
56
|
s.add_dependency('spk-anemone', '>= 0.4.0')
|
63
57
|
s.add_dependency('rainbow', '>= 1.1')
|
58
|
+
s.add_development_dependency('rspec', '>= 1.3.0')
|
59
|
+
s.add_development_dependency('fakeweb', '>= 1.3.0')
|
64
60
|
s.require_path = 'lib'
|
65
61
|
s.bindir = 'bin'
|
66
62
|
s.executables << 'validate-website'
|
data/bin/validate-website
CHANGED
@@ -5,47 +5,17 @@ developer_mode = false
|
|
5
5
|
developer_mode = true if __FILE__ == $0
|
6
6
|
require 'rubygems' if developer_mode
|
7
7
|
|
8
|
-
require 'validator'
|
9
|
-
require 'anemone'
|
10
|
-
require 'colorful_messages'
|
11
8
|
require 'validate_website'
|
12
9
|
|
13
|
-
include ColorfulMessages
|
14
|
-
|
15
10
|
validate_website = ValidateWebsite.new(ARGV)
|
16
11
|
options = validate_website.options
|
17
12
|
|
18
|
-
exit_code = 0
|
19
|
-
|
20
|
-
Anemone.crawl(options[:site],
|
21
|
-
:user_agent => options[:useragent],
|
22
|
-
:authorization => options[:auth]) do |anemone|
|
23
|
-
|
24
|
-
anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
|
25
|
-
|
26
|
-
anemone.on_every_page { |page|
|
27
|
-
url = page.url.to_s
|
28
|
-
print info(url)
|
29
|
-
|
30
|
-
# validate html/html+xml
|
31
|
-
if page.html? && page.fetched?
|
32
|
-
validator = Validator.new(page)
|
33
|
-
msg = " well formed? %s" % validator.valid?
|
34
|
-
if validator.valid?
|
35
|
-
puts success(msg)
|
36
|
-
else
|
37
|
-
exit_code = 1
|
38
|
-
puts error(msg)
|
39
|
-
validate_website.to_file(url)
|
40
|
-
end
|
41
|
-
end
|
42
13
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
end
|
14
|
+
exit_code = validate_website.crawl options[:site],
|
15
|
+
:user_agent => options[:useragent],
|
16
|
+
:authorization => options[:auth],
|
17
|
+
:cookies => options[:cookies],
|
18
|
+
:accept_cookies => options[:accept_cookies],
|
19
|
+
:verbose => options[:verbose]
|
50
20
|
|
51
21
|
exit(exit_code)
|
data/lib/validate_website.rb
CHANGED
@@ -1,11 +1,18 @@
|
|
1
1
|
require 'optparse'
|
2
2
|
require 'open-uri'
|
3
|
+
require 'validator'
|
4
|
+
require 'anemone'
|
5
|
+
require 'colorful_messages'
|
6
|
+
|
7
|
+
include ColorfulMessages
|
3
8
|
|
4
9
|
class ValidateWebsite
|
5
10
|
|
6
11
|
attr_reader :options
|
7
12
|
|
8
|
-
|
13
|
+
attr_reader :anemone
|
14
|
+
|
15
|
+
def initialize(args=[])
|
9
16
|
@options = {
|
10
17
|
:site => 'http://localhost:3000/',
|
11
18
|
:useragent => Anemone::Core::DEFAULT_OPTS[:user_agent],
|
@@ -14,6 +21,9 @@ class ValidateWebsite
|
|
14
21
|
:auth => nil,
|
15
22
|
# log not found url (404 status code)
|
16
23
|
:not_found => false,
|
24
|
+
:cookies => nil,
|
25
|
+
:accept_cookies => true,
|
26
|
+
:verbose => false,
|
17
27
|
}
|
18
28
|
parse(args)
|
19
29
|
|
@@ -42,6 +52,8 @@ class ValidateWebsite
|
|
42
52
|
o.on("--auth=[user,pass]", Array,
|
43
53
|
"Basic http authentification") { |v| @options[:auth] = v }
|
44
54
|
o.on("-n", "--not-found", "Log not found url") { |v| @options[:not_found] = v }
|
55
|
+
o.on("-c", "--cookies=val", "Set defaults cookies") { |v| @options[:cookies] = v }
|
56
|
+
o.on("-v", "--verbose", "Verbose") { |v| @options[:verbose] = v }
|
45
57
|
|
46
58
|
o.separator ""
|
47
59
|
o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
|
@@ -49,7 +61,74 @@ class ValidateWebsite
|
|
49
61
|
opts.parse!(args)
|
50
62
|
end
|
51
63
|
|
64
|
+
def get_url(page, elem, attrname)
|
65
|
+
u = elem.attributes[attrname] if elem.attributes[attrname]
|
66
|
+
return if u.nil?
|
67
|
+
begin
|
68
|
+
abs = page.to_absolute(URI(u))
|
69
|
+
rescue
|
70
|
+
abs = nil
|
71
|
+
end
|
72
|
+
return abs if abs && page.in_domain?(abs)
|
73
|
+
end
|
74
|
+
|
52
75
|
def to_file(msg)
|
53
76
|
open(options[:file], 'a').write("#{msg}\n") if options[:file]
|
54
77
|
end
|
78
|
+
|
79
|
+
def crawl(site, opts={})
|
80
|
+
exit_code = 0
|
81
|
+
|
82
|
+
@anemone = Anemone.crawl(site, opts) do |anemone|
|
83
|
+
anemone.skip_links_like Regexp.new(options[:exclude]) if options[:exclude]
|
84
|
+
|
85
|
+
anemone.focus_crawl { |p|
|
86
|
+
links = []
|
87
|
+
if p.html?
|
88
|
+
p.doc.css('img, script, iframe').each do |elem|
|
89
|
+
url = get_url(p, elem, "src")
|
90
|
+
links << url unless url.nil?
|
91
|
+
end
|
92
|
+
p.doc.css('link').each do |link|
|
93
|
+
url = get_url(p, link, "href")
|
94
|
+
links << url unless url.nil?
|
95
|
+
end
|
96
|
+
end
|
97
|
+
if p.content_type == 'text/css'
|
98
|
+
p.body.scan(/url\((['".\/\w-]+)\)/).each do |url|
|
99
|
+
url = url.to_s.gsub("'", "").gsub('"', '')
|
100
|
+
abs = p.to_absolute(URI(url))
|
101
|
+
links << abs
|
102
|
+
end
|
103
|
+
end
|
104
|
+
links.uniq!
|
105
|
+
p.links.concat(links)
|
106
|
+
}
|
107
|
+
|
108
|
+
anemone.on_every_page { |page|
|
109
|
+
url = page.url.to_s
|
110
|
+
|
111
|
+
# validate html/html+xml
|
112
|
+
if page.html? && page.fetched?
|
113
|
+
print info(url)
|
114
|
+
validator = Validator.new(page)
|
115
|
+
msg = " well formed? %s" % validator.valid?
|
116
|
+
if validator.valid?
|
117
|
+
puts success(msg)
|
118
|
+
else
|
119
|
+
exit_code = 1
|
120
|
+
puts error(msg)
|
121
|
+
to_file(url)
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
if options[:not_found] && page.not_found?
|
126
|
+
exit_code = 1
|
127
|
+
puts error("%s linked in %s but not exist" % [url, page.referer])
|
128
|
+
to_file(url)
|
129
|
+
end
|
130
|
+
}
|
131
|
+
end
|
132
|
+
exit_code
|
133
|
+
end
|
55
134
|
end
|
data/spec/css_spec.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
describe ValidateWebsite do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
FakeWeb.clean_registry
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should crawl css and extract url" do
|
10
|
+
pages = []
|
11
|
+
pages << FakePage.new('test.css',
|
12
|
+
:body => ".test {background-image: url(pouet);}
|
13
|
+
.tests {background-image: url(/image/pouet.png)}
|
14
|
+
.tests {background-image: url(/image/pouet_42.png)}
|
15
|
+
.tests {background-image: url(/image/pouet)}",
|
16
|
+
:content_type => 'text/css')
|
17
|
+
pages << FakePage.new('pouet',
|
18
|
+
:content_type => 'image/png')
|
19
|
+
pages << FakePage.new('image/pouet',
|
20
|
+
:content_type => 'image/png')
|
21
|
+
pages << FakePage.new('image/pouet.png',
|
22
|
+
:content_type => 'image/png')
|
23
|
+
pages << FakePage.new('image/pouet_42.png',
|
24
|
+
:content_type => 'image/png')
|
25
|
+
validate_website = ValidateWebsite.new
|
26
|
+
validate_website.crawl(pages[0].url)
|
27
|
+
validate_website.anemone.should have(5).pages
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should extract url with single quote" do
|
31
|
+
pages = []
|
32
|
+
pages << FakePage.new('test.css',
|
33
|
+
:body => ".test {background-image: url('pouet');}",
|
34
|
+
:content_type => 'text/css')
|
35
|
+
pages << FakePage.new('pouet',
|
36
|
+
:content_type => 'image/png')
|
37
|
+
validate_website = ValidateWebsite.new
|
38
|
+
validate_website.crawl(pages[0].url)
|
39
|
+
validate_website.anemone.should have(2).pages
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should extract url with double quote" do
|
43
|
+
pages = []
|
44
|
+
pages << FakePage.new('test.css',
|
45
|
+
:body => ".test {background-image: url(\"pouet\");}",
|
46
|
+
:content_type => 'text/css')
|
47
|
+
pages << FakePage.new('pouet',
|
48
|
+
:content_type => 'image/png')
|
49
|
+
validate_website = ValidateWebsite.new
|
50
|
+
validate_website.crawl(pages[0].url)
|
51
|
+
validate_website.anemone.should have(2).pages
|
52
|
+
end
|
53
|
+
end
|
data/spec/fakeweb_helper.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
begin
|
2
|
+
require 'fakeweb'
|
3
|
+
rescue LoadError
|
4
|
+
warn "You need the 'fakeweb' gem installed to test ValidateWebsite"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
FakeWeb.allow_net_connect = false
|
9
|
+
|
10
|
+
class FakePage
|
11
|
+
attr_accessor :links
|
12
|
+
attr_accessor :hrefs
|
13
|
+
attr_accessor :body
|
14
|
+
|
15
|
+
def initialize(name = '', options = {})
|
16
|
+
@name = name
|
17
|
+
@links = [options[:links]].flatten if options.has_key?(:links)
|
18
|
+
@hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
|
19
|
+
@redirect = options[:redirect] if options.has_key?(:redirect)
|
20
|
+
@content_type = options[:content_type] || "text/html"
|
21
|
+
@body = options[:body]
|
22
|
+
|
23
|
+
create_body unless @body
|
24
|
+
add_to_fakeweb
|
25
|
+
end
|
26
|
+
|
27
|
+
def url
|
28
|
+
SPEC_DOMAIN + @name
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def create_body
|
34
|
+
@body = "<html><body>"
|
35
|
+
@links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
|
36
|
+
@hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
|
37
|
+
@body += "</body></html>"
|
38
|
+
end
|
39
|
+
|
40
|
+
def add_to_fakeweb
|
41
|
+
options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
|
42
|
+
|
43
|
+
if @redirect
|
44
|
+
options[:status] = [301, "Permanently Moved"]
|
45
|
+
|
46
|
+
# only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
|
47
|
+
redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
|
48
|
+
options[:location] = redirect_url
|
49
|
+
|
50
|
+
# register the page this one redirects to
|
51
|
+
FakeWeb.register_uri(:get, redirect_url, {:body => '',
|
52
|
+
:content_type => @content_type,
|
53
|
+
:status => [200, "OK"]})
|
54
|
+
end
|
55
|
+
|
56
|
+
FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
#default root
|
61
|
+
#ValidateWebSiteTest::FakePage.new
|
data/spec/spec_helper.rb
ADDED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: validate-website
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 1
|
10
|
-
version: 0.3.1
|
9
|
+
- 5
|
10
|
+
version: 0.3.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Laurent Arnoud
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-25 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -49,6 +49,38 @@ dependencies:
|
|
49
49
|
version: "1.1"
|
50
50
|
type: :runtime
|
51
51
|
version_requirements: *id002
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: rspec
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 27
|
61
|
+
segments:
|
62
|
+
- 1
|
63
|
+
- 3
|
64
|
+
- 0
|
65
|
+
version: 1.3.0
|
66
|
+
type: :development
|
67
|
+
version_requirements: *id003
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: fakeweb
|
70
|
+
prerelease: false
|
71
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
hash: 27
|
77
|
+
segments:
|
78
|
+
- 1
|
79
|
+
- 3
|
80
|
+
- 0
|
81
|
+
version: 1.3.0
|
82
|
+
type: :development
|
83
|
+
version_requirements: *id004
|
52
84
|
description: Web crawler for checking the validity of your documents
|
53
85
|
email: laurent@spkdev.net
|
54
86
|
executables:
|
@@ -150,6 +182,9 @@ files:
|
|
150
182
|
- lib/xhtml/xhtml-ruby-1.xsd
|
151
183
|
- lib/validate_website.rb
|
152
184
|
- bin/validate-website
|
185
|
+
- spec/spec_helper.rb
|
186
|
+
- spec/css_spec.rb
|
187
|
+
- spec/fakeweb_helper.rb
|
153
188
|
has_rdoc: true
|
154
189
|
homepage: http://github.com/spk/validate-website
|
155
190
|
licenses: []
|