validate-website 0.1
- data/README +39 -0
- data/Rakefile +71 -0
- data/bin/validate-website +77 -0
- data/lib/colorful_messages.rb +30 -0
- data/lib/spkspider.rb +147 -0
- metadata +59 -0
data/README
ADDED
@@ -0,0 +1,39 @@
+== validate-website
+
+== DESCRIPTION
+
+Web crawler that prints whether each page is valid against its DTD.
+Compatible with Ruby 1.9.
+
+== SYNOPSIS
+
+validate-website --help
+validate-website -s "http://localhost:4567/" -u "Mozilla 5.0" -f not-well-formed.txt --auth=user,pass -e 'redirect|news'
+
+== REQUIREMENTS
+
+libxml-ruby >= 1.1.3
+
+== LICENSE
+
+(The MIT License)
+
+Copyright (c) 2009 spk
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
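For orientation before the code below: the gem's flow is simply "crawl the site, then ask libxml-ruby whether each fetched page is well formed". A minimal stand-alone sketch of that check for a single page follows; the URL is a placeholder and this snippet is not part of the gem, it only illustrates the libxml-ruby calls the bin script relies on.

  require 'open-uri'
  require 'xml'

  # Same libxml-ruby switches the CLI sets before parsing.
  XML.default_validity_checking = true
  XML.default_load_external_dtd = true

  url = 'http://localhost:4567/'   # placeholder URL
  html = URI.parse(url).read

  parser = XML::Parser.string(html)
  begin
    parser.parse
    puts "#{url} well formed? #{parser.context.well_formed?}"
  rescue XML::Error => e
    puts "#{url} not well formed: #{e.message}"
  end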
data/Rakefile
ADDED
@@ -0,0 +1,71 @@
+require 'rake/testtask'
+require 'rake/packagetask'
+require 'rake/rdoctask'
+require 'rake'
+require 'find'
+
+# Globals
+
+PKG_NAME = 'validate-website'
+PKG_VERSION = '0.1'
+
+PKG_FILES = ['README', 'Rakefile']
+Find.find('lib/', 'bin/') do |f|
+  if FileTest.directory?(f) and f =~ /\.svn|\.git/
+    Find.prune
+  else
+    PKG_FILES << f
+  end
+end
+
+# Tasks
+
+task :default => [:clean, :repackage]
+
+#Rake::TestTask.new do |t|
+  #t.libs << "test"
+  #t.test_files = FileList['test/tc_*.rb']
+#end
+
+Rake::RDocTask.new do |rd|
+  f = []
+  require 'find'
+  Find.find('lib/') do |file|
+    # Skip hidden files (.svn/ directories and Vim swapfiles)
+    if file.split(/\//).last =~ /^\./
+      Find.prune
+    else
+      f << file if not FileTest.directory?(file)
+    end
+  end
+  rd.rdoc_files.include(f)
+  rd.options << '--all'
+end
+
+Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |p|
+  p.need_tar = true
+  p.package_files = PKG_FILES
+end
+
+# "Gem" part of the Rakefile
+require 'rake/gempackagetask'
+
+spec = Gem::Specification.new do |s|
+  s.author = 'spk'
+  s.email = 'spk@tuxfamily.org'
+  s.platform = Gem::Platform::RUBY
+  s.summary = "Web crawler for testing webpage validity"
+  s.name = PKG_NAME
+  s.version = PKG_VERSION
+  s.requirements << 'libxml-ruby'
+  s.require_path = 'lib'
+  s.bindir = 'bin'
+  s.executables << 'validate-website'
+  s.files = PKG_FILES
+  s.description = "Web crawler that prints whether the page is valid against its DTD"
+end
+
+Rake::GemPackageTask.new(spec) do |pkg|
+  pkg.need_zip = true
+  pkg.need_tar = true
+end
data/bin/validate-website
ADDED
@@ -0,0 +1,77 @@
+#!/usr/bin/env ruby
+$:.unshift '../lib'
+require 'spkspider'
+require 'colorful_messages'
+require 'open-uri'
+require 'xml'
+require 'optparse'
+
+include ColorfulMessages
+
+XML.default_validity_checking = true
+XML.default_load_external_dtd = true
+
+# default options
+OPTIONS = {
+  :site => 'http://localhost:3000/',
+  :useragent => '',
+  :exclude => nil,
+  :file => nil,
+  :auth => nil,
+}
+
+ARGV.options do |o|
+  script_name = File.basename($0)
+  o.set_summary_indent('  ')
+  o.banner = "Usage: #{script_name} [OPTIONS]"
+  o.define_head "validate website"
+  o.separator ""
+
+  o.on("-s", "--site=val", String,
+       "Default: #{OPTIONS[:site]}") { |v| OPTIONS[:site] = v }
+  o.on("-u", "--useragent=val", String,
+       "Default: #{OPTIONS[:useragent]}") { |v| OPTIONS[:useragent] = v }
+  o.on("-e", "--exclude=val", String,
+       "Url to exclude") { |v| OPTIONS[:exclude] = v }
+  o.on("-f", "--file=val", String,
+       "Save not well formed urls") { |v| OPTIONS[:file] = v }
+  o.on("--auth=[user,pass]", Array,
+       "Basic http authentication") { |v| OPTIONS[:auth] = v }
+
+  o.separator ""
+  o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
+  o.parse!
+end
+
+spider = SpkSpider.new(OPTIONS[:site])
+spider.user_agent = OPTIONS[:useragent]
+spider.exclude = Regexp.new(OPTIONS[:exclude]) if OPTIONS[:exclude]
+spider.basic_auth = OPTIONS[:auth]
+
+if OPTIONS[:file]
+  file = OPTIONS[:file]
+  File.open(file, 'w') {} # truncate the report file
+end
+
+spider.crawl do |url, document|
+  begin
+    xp = XML::Parser.string(document)
+    exception = nil
+    XML::Error.set_handler do |error|
+      exception = error
+    end
+
+    xp.parse
+
+    msg = " well formed? %s" % xp.context.well_formed?
+    if xp.context.well_formed?
+      print success(msg)
+    else
+      print error(msg)
+      File.open(file, 'a') { |f| f.write(url + "\n") } if OPTIONS[:file]
+    end
+  rescue => e
+    print error(" not well formed: #{e.message}")
+    File.open(file, 'a') { |f| f.write(url + "\n") } if OPTIONS[:file]
+  end
+end
data/lib/colorful_messages.rb
ADDED
@@ -0,0 +1,30 @@
+module ColorfulMessages
+
+  # red
+  def error(message)
+    "\033[1;31m#{message}\033[0m"
+  end
+
+  # yellow
+  def warning(message)
+    "\033[1;33m#{message}\033[0m"
+  end
+
+  # green
+  def success(message)
+    "\033[1;32m#{message}\033[0m"
+  end
+
+  alias_method :message, :success
+
+  # magenta
+  def note(message)
+    "\033[1;35m#{message}\033[0m"
+  end
+
+  # blue
+  def info(message)
+    "\033[1;34m#{message}\033[0m"
+  end
+
+end
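bin/validate-website mixes this module in and feeds its return values to print; a tiny usage sketch (the message strings are made up, only the helpers come from the gem):

  require 'colorful_messages'
  include ColorfulMessages

  # Each helper only wraps its argument in an ANSI color escape sequence.
  puts success(' well formed? true')    # green
  puts error(' well formed? false')     # red
  puts info('crawl finished')           # blue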
data/lib/spkspider.rb
ADDED
@@ -0,0 +1,147 @@
+# encoding: utf-8
+require 'open-uri'
+# SpkSpider is a ruby crawler
+
+class SpkSpider
+  VERSION = '0.0.5'
+
+  attr_accessor :links_to_visit, :site, :user_agent, :basic_auth
+  attr_accessor :parser, :exclude
+  attr_reader :visited_links, :external_links, :errors
+
+  # initialize takes the site to crawl as its argument
+  def initialize(site)
+    puts "SpkSpider #{VERSION} initializing..."
+    @site = URI.parse(site) || raise("You didn't give me a site to crawl")
+    @user_agent = "SpkSpr/#{VERSION}"
+    @links_to_visit = Array.new
+    @visited_links = Array.new
+    @external_links = Array.new
+    @errors = Hash.new
+    @links_to_visit << site
+    @parser = 'xml'
+    puts "Ready to crawl"
+  end
+
+  def init_xml_parser(doc)
+    require 'xml'
+    xp = XML::HTMLParser.string(doc, {:options => XML::HTMLParser::Options::RECOVER | XML::HTMLParser::Options::NOERROR | XML::HTMLParser::Options::NOWARNING })
+    XML::Error.set_handler do |error|
+      exception = error
+    end
+    document = xp.parse
+    links = document.find("//a[@href]")
+  end
+
+  def fetch_links(doc)
+    case @parser
+    when 'xml'
+      init_xml_parser(doc)
+    when 'hpricot'
+      require 'hpricot'
+      Hpricot.buffer_size = 204800
+      Hpricot(doc).search("//a[@href]")
+    else
+      init_xml_parser(doc)
+    end
+  rescue
+    init_xml_parser(doc)
+  end
+
+  # download the document
+  def fetch_html(url)
+    uri = URI.parse(url)
+    print "Visiting: #{url}"
+    begin
+      @document = uri.read('User-Agent' => @user_agent, 'Referer' => url, :http_basic_authentication => @basic_auth)
+    rescue
+      # OpenURI::HTTPError
+    end
+    @visited_links << url
+    @document
+  end
+
+  # read the document and extract the URLs
+  def read_document(document, url)
+    if document
+      case document.content_type
+      when "text/html"
+        link_extractor(document, url)
+      else
+        print " ... not text/html, skipping ..."
+      end
+    else
+      print " ... document does not exist, skipping ..."
+    end
+  end
+
+  # extract the links and make them absolute
+  def link_extractor(document, document_url)
+    links = fetch_links(document)
+    links.each do |link|
+      href = link.attributes['href']
+      if href && href.length > 0 && (@exclude && !href.match(@exclude) || @exclude.nil?)
+        begin
+          url = href
+          uri = URI.parse(url)
+          document_uri = URI.parse(document_url)
+        rescue
+          #print " #{url} skip this link"
+          next
+        end
+      else
+        #print " skip this link"
+        next
+      end
+
+      # Make relative links absolute
+      if uri.relative?
+        url = document_uri.merge(url).to_s if url[0,1] == '?'
+        url = @site.merge(url).to_s
+        uri = URI.parse(url)
+      end
+
+      # skip anchor links
+      if url.include?('#')
+        #print '... Anchor link found, skipping ...'
+        next
+      end
+
+      # Check the domain: keep the link if it is on the same host, otherwise record it as external
+      if uri.host != @site.host
+        @external_links << url
+        @external_links.uniq!
+        next
+      end
+
+      # Find out if we've seen this link already
+      if (@visited_links.include? url) || (@links_to_visit.include? url)
+        next
+      end
+
+      @links_to_visit << url
+    end
+  end
+
+  # launch the crawl
+  def crawl
+    while !@links_to_visit.empty?
+      # get the first element of links_to_visit
+      url = @links_to_visit.shift
+      document = fetch_html(url)
+      read_document(document, url)
+      if block_given?
+        yield(url, document)
+      end
+      puts ' done!'
+    end
+  end
+end
+
+if __FILE__ == $0
+  site = 'http://localhost:4567/'
+  site = ARGV[0] if ARGV[0]
+  spider = SpkSpider.new(site)
+  spider.user_agent = ''
+  spider.crawl
+end
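The class can also be driven directly, without the bundled CLI; a minimal sketch under the same defaults shown above (the site, user agent and the printed counters are illustrative, the accessors and the crawl block come from the class):

  require 'spkspider'

  spider = SpkSpider.new('http://localhost:4567/')
  spider.user_agent = 'Mozilla 5.0'
  spider.exclude = Regexp.new('redirect|news')

  spider.crawl do |url, document|
    # each fetched page is yielded here, exactly as in bin/validate-website
  end

  puts "visited pages:  #{spider.visited_links.size}"
  puts "external links: #{spider.external_links.size}"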
metadata
ADDED
@@ -0,0 +1,59 @@
+--- !ruby/object:Gem::Specification
+name: validate-website
+version: !ruby/object:Gem::Version
+  version: "0.1"
+platform: ruby
+authors:
+- spk
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2009-10-24 00:00:00 +02:00
+default_executable:
+dependencies: []
+
+description: Web crawler that prints whether the page is valid against its DTD
+email: spk@tuxfamily.org
+executables:
+- validate-website
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- README
+- Rakefile
+- lib/colorful_messages.rb
+- lib/spkspider.rb
+- bin/validate-website
+has_rdoc: true
+homepage:
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements:
+- libxml-ruby
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Web crawler for testing webpage validity
+test_files: []