validate-website 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +39 -0
- data/Rakefile +71 -0
- data/bin/validate-website +77 -0
- data/lib/colorful_messages.rb +30 -0
- data/lib/spkspider.rb +147 -0
- metadata +59 -0
data/README
ADDED
@@ -0,0 +1,39 @@
+== validate-website
+
+== DESCRIPTION
+
+Web crawler that print if the page is valid with the dtd.
+compatible ruby 1.9
+
+== SYNOPSIS
+
+validate-website --help
+validate-website -s "http://localhost:4567/" -u "Mozilla 5.0" -f not-well-formed.txt --auth=user,pass -e 'redirect|news'
+
+== REQUIREMENTS:
+
+libxml-ruby >= 1.1.3
+
+== LICENSE
+(The MIT License)
+
+Copyright (c) 2009 spk
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
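The DTD check named in DESCRIPTION and REQUIREMENTS is performed with libxml-ruby. The sketch below is not part of the gem; it shows the same check against a single page the way the bundled bin/validate-website script (further down in this diff) does, assuming libxml-ruby >= 1.1.3 is installed and a server is answering on the example URL.

  # Hedged sketch: validate one page with libxml-ruby, mirroring the calls
  # made by bin/validate-website later in this diff.
  require 'open-uri'
  require 'xml'

  XML.default_validity_checking = true   # check the document against its DTD
  XML.default_load_external_dtd = true   # fetch the DTD it declares

  html = open('http://localhost:4567/').read
  parser = XML::Parser.string(html)
  XML::Error.set_handler { |err| }       # keep libxml from printing to stderr
  parser.parse rescue nil                # a malformed page raises; ignore here
  puts "well formed? #{parser.context.well_formed?}"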
data/Rakefile
ADDED
@@ -0,0 +1,71 @@
+require 'rake/testtask'
+require 'rake/packagetask'
+require 'rake/rdoctask'
+require 'rake'
+require 'find'
+
+# Globals
+
+PKG_NAME = 'validate-website'
+PKG_VERSION = '0.1'
+
+PKG_FILES = ['README', 'Rakefile']
+Find.find('lib/', 'bin/') do |f|
+  if FileTest.directory?(f) and f =~ /\.svn|\.git/
+    Find.prune
+  else
+    PKG_FILES << f
+  end
+end
+
+# Tasks
+
+task :default => [:clean, :repackage]
+
+#Rake::TestTask.new do |t|
+#t.libs << "test"
+#t.test_files = FileList['test/tc_*.rb']
+#end
+
+Rake::RDocTask.new do |rd|
+  f = []
+  require 'find'
+  Find.find('lib/') do |file|
+    # Skip hidden files (.svn/ directories and Vim swapfiles)
+    if file.split(/\//).last =~ /^\./
+      Find.prune
+    else
+      f << file if not FileTest.directory?(file)
+    end
+  end
+  rd.rdoc_files.include(f)
+  rd.options << '--all'
+end
+
+Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |p|
+  p.need_tar = true
+  p.package_files = PKG_FILES
+end
+
+# "Gem" part of the Rakefile
+require 'rake/gempackagetask'
+
+spec = Gem::Specification.new do |s|
+  s.author = 'spk'
+  s.email = 'spk@tuxfamily.org'
+  s.platform = Gem::Platform::RUBY
+  s.summary = "Web crawler for testing webpage validity"
+  s.name = PKG_NAME
+  s.version = PKG_VERSION
+  s.requirements << 'libxml-ruby'
+  s.require_path = 'lib'
+  s.bindir = 'bin'
+  s.executables << 'validate-website'
+  s.files = PKG_FILES
+  s.description = "Web crawler that print if the page is valid with the dtd"
+end
+
+Rake::GemPackageTask.new(spec) do |pkg|
+  pkg.need_zip = true
+  pkg.need_tar = true
+end
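The gem metadata lives inline in this Rakefile and is built through Rake::PackageTask and Rake::GemPackageTask. For comparison only, here is a minimal sketch of the same specification pulled out into a standalone file; the filename validate-website.gemspec and the `gem build` workflow are assumptions, not part of this release.

  # Hypothetical validate-website.gemspec with the same fields as the spec
  # block above; buildable with `gem build validate-website.gemspec`.
  Gem::Specification.new do |s|
    s.name        = 'validate-website'
    s.version     = '0.1'
    s.author      = 'spk'
    s.email       = 'spk@tuxfamily.org'
    s.platform    = Gem::Platform::RUBY
    s.summary     = 'Web crawler for testing webpage validity'
    s.description = 'Web crawler that print if the page is valid with the dtd'
    s.requirements << 'libxml-ruby'
    s.require_path = 'lib'
    s.bindir       = 'bin'
    s.executables  = ['validate-website']
    s.files        = ['README', 'Rakefile', 'bin/validate-website',
                      'lib/colorful_messages.rb', 'lib/spkspider.rb']
  end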
data/bin/validate-website
ADDED
@@ -0,0 +1,77 @@
+#!/usr/bin/env ruby
+$:.unshift '../lib'
+require 'spkspider'
+require 'colorful_messages'
+require 'open-uri'
+require 'xml'
+require 'optparse'
+
+include ColorfulMessages
+
+XML.default_validity_checking = true
+XML.default_load_external_dtd = true
+
+# default options
+OPTIONS = {
+  :site => 'http://localhost:3000/',
+  :useragent => '',
+  :exclude => nil,
+  :file => nil,
+  :auth => nil,
+}
+
+ARGV.options do |o|
+  script_name = File.basename($0)
+  o.set_summary_indent(' ')
+  o.banner = "Usage: #{script_name} [OPTIONS]"
+  o.define_head "validate website"
+  o.separator ""
+
+  o.on("-s", "--site=val", String,
+       "Default: #{OPTIONS[:site]}") { |OPTIONS[:site]| }
+  o.on("-u", "--useragent=val", String,
+       "Default: #{OPTIONS[:useragent]}") { |OPTIONS[:useragent]| }
+  o.on("-e", "--exclude=val", String,
+       "Url to exclude") { |OPTIONS[:exclude]| }
+  o.on("-f", "--file=val", String,
+       "save not well formed urls") { |OPTIONS[:file]| }
+  o.on("--auth=[user,pass]", Array,
+       "Basic http authentification") { |OPTIONS[:auth]| }
+
+  o.separator ""
+  o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
+  o.parse!
+end
+
+spider = SpkSpider.new(OPTIONS[:site])
+spider.user_agent = OPTIONS[:useragent]
+spider.exclude = Regexp.new(OPTIONS[:exclude]) if OPTIONS[:exclude]
+spider.basic_auth = OPTIONS[:auth]
+
+if OPTIONS[:file]
+  file = OPTIONS[:file]
+  open(file, 'w').write('')
+end
+
+spider.crawl do |url, document|
+  begin
+    xp = XML::Parser.string(document)
+    exception = nil
+    XML::Error.set_handler do |error|
+      exception = error
+    end
+
+    doc = xp.parse
+
+    msg = " well formed? %s" % xp.context.well_formed?
+    if xp.context.well_formed?
+      print success(msg)
+    else
+      print error(msg)
+      open(file, 'a').write(url+"\n") if OPTIONS[:file]
+    end
+  rescue
+    print error(msg)
+    open(file, 'a').write(url+"\n") if OPTIONS[:file]
+  end
+end
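One detail worth flagging in the option parsing above: blocks such as `{ |OPTIONS[:site]| }` use the Ruby 1.8 idiom of naming a hash element as the block parameter, so OptionParser's parsed value is assigned straight into OPTIONS. Ruby 1.9, which the README lists as a target, rejects that form, so a 1.9-safe equivalent would assign explicitly. A minimal sketch of that variant (illustrative only, not the shipped code):

  require 'optparse'

  OPTIONS = { :site => 'http://localhost:3000/' }

  ARGV.options do |o|
    # Bind the parsed value to a plain block variable, then store it yourself.
    o.on("-s", "--site=val", String,
         "Default: #{OPTIONS[:site]}") { |v| OPTIONS[:site] = v }
    o.parse!
  end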
data/lib/colorful_messages.rb
ADDED
@@ -0,0 +1,30 @@
+module ColorfulMessages
+
+  # red
+  def error(message)
+    "\033[1;31m#{message}\033[0m"
+  end
+
+  # yellow
+  def warning(message)
+    "\033[1;33m#{message}\033[0m"
+  end
+
+  # green
+  def success(message)
+    "\033[1;32m#{message}\033[0m"
+  end
+
+  alias_method :message, :success
+
+  # magenta
+  def note(message)
+    "\033[1;35m#{message}\033[0m"
+  end
+
+  # blue
+  def info(message)
+    "\033[1;34m#{message}\033[0m"
+  end
+
+end
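These helpers only wrap the message in ANSI colour escape sequences and return the string; callers decide whether to print it, as bin/validate-website does with print. A minimal usage sketch (assuming lib/ is on the load path):

  require 'colorful_messages'

  include ColorfulMessages

  puts success('well formed')       # green
  puts error('not well formed')     # red
  puts warning('skipped redirect')  # yellow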
data/lib/spkspider.rb
ADDED
@@ -0,0 +1,147 @@
+# encoding: utf-8
+require 'open-uri'
+# SpkSpider is a ruby crawler
+
+class SpkSpider
+  VERSION = '0.0.5'
+
+  attr_accessor :links_to_visit, :site, :user_agent, :basic_auth
+  attr_accessor :parser, :exclude
+  attr_reader :visited_links, :external_links, :errors
+
+  # initialize method take the site to crawl in argument
+  def initialize(site)
+    puts "SpkSpider #{VERSION} initializing..."
+    @site = URI.parse(site) || raise("You didn't give me a site to crawl")
+    @user_agent = "SpkSpr/#{VERSION}"
+    @links_to_visit = Array.new
+    @visited_links = Array.new
+    @external_links = Array.new
+    @errors = Hash.new
+    @links_to_visit << site
+    @parser = 'xml'
+    puts "Ready to crawl"
+  end
+
+  def init_xml_parser(doc)
+    require 'xml'
+    xp = XML::HTMLParser.string(doc, {:options => XML::HTMLParser::Options::RECOVER | XML::HTMLParser::Options::NOERROR | XML::HTMLParser::Options::NOWARNING })
+    XML::Error.set_handler do |error|
+      exception = error
+    end
+    document = xp.parse
+    links = document.find("//a[@href]")
+  end
+
+  def fetch_links(doc)
+    case @parser
+    when 'xml'
+      init_xml_parser(doc)
+    when 'hpricot'
+      require 'hpricot'
+      Hpricot.buffer_size = 204800
+      Hpricot(doc).search("//a[@href]")
+    else
+      init_xml_parser(doc)
+    end
+  rescue
+    init_xml_parser(doc)
+  end
+
+  # download the document
+  def fetch_html(url)
+    uri = URI.parse(url)
+    print "Visiting: #{url}"
+    begin
+      @document = uri.read('User-Agent' => @user_agent, 'Referer' => url, :http_basic_authentication => @basic_auth)
+    rescue
+      # OpenURI::HTTPError
+    end
+    @visited_links << url
+    @document
+  end
+
+  # reading the document and extract the urls
+  def read_document(document, url)
+    if document
+      case document.content_type
+      when "text/html"
+        link_extractor(document, url)
+      else
+        print " ... not text/html, skipping ..."
+      end
+    else
+      print " ... document does not exist, skipping ..."
+    end
+  end
+
+  # extract the link and un-relative
+  def link_extractor(document, document_url)
+    links = fetch_links(document)
+    links.each do |link|
+      href = link.attributes['href']
+      if href && href.length > 0 && (@exclude && !href.match(@exclude) || @exclude.nil?)
+        begin
+          url = href
+          uri = URI.parse(url)
+          document_uri = URI.parse(document_url)
+        rescue
+          #print " #{url} skip this link"
+          next
+        end
+      else
+        #print " skip this link"
+        next
+      end
+
+      # Derelativeize links if necessary
+      if uri.relative?
+        url = document_uri.merge(url).to_s if url[0,1] == '?'
+        url = @site.merge(url).to_s
+        uri = URI.parse(url)
+      end
+
+      # skip anchor link
+      if url.include?('#')
+        #print '... Anchor link found, skipping ...'
+        next
+      end
+
+      # Check domain, if in same domain, keep link, else trash it
+      if uri.host != @site.host
+        @external_links << url
+        @external_links.uniq!
+        next
+      end
+
+      # Find out if we've seen this link already
+      if (@visited_links.include? url) || (@links_to_visit.include? url)
+        next
+      end
+
+      @links_to_visit << url
+    end
+  end
+
+  # lunch the crawling
+  def crawl
+    while !@links_to_visit.empty?
+      # get the first element of the links_to_visit
+      url = @links_to_visit.shift
+      document = fetch_html(url)
+      read_document(document, url)
+      if block_given?
+        yield(url, document)
+      end
+      puts ' done!'
+    end
+  end
+end
+
+if __FILE__ == $0
+  site = 'http://localhost:4567/'
+  site = ARGV[0] if ARGV[0]
+  spider = SpkSpider.new(site)
+  spider.user_agent = ''
+  spider.crawl
+end
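Besides the `__FILE__ == $0` demo at the bottom of the file, the crawler is meant to be driven through its accessors and a block: crawl yields each visited URL together with the fetched document, exclude filters hrefs by regexp, and basic_auth is handed to open-uri. A minimal sketch of that use (host, pattern and credentials are placeholders):

  require 'spkspider'   # assumes lib/ is on the load path

  spider = SpkSpider.new('http://localhost:4567/')
  spider.user_agent = 'Mozilla 5.0'
  spider.exclude    = Regexp.new('redirect|news')  # skip hrefs matching this pattern
  spider.basic_auth = ['user', 'pass']             # passed to open-uri as basic auth

  pages = 0
  spider.crawl do |url, document|
    pages += 1   # each same-host page is yielded once with its raw body
  end

  puts "crawled #{pages} pages, saw #{spider.external_links.size} external links"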
metadata
ADDED
@@ -0,0 +1,59 @@
+--- !ruby/object:Gem::Specification
+name: validate-website
+version: !ruby/object:Gem::Version
+  version: "0.1"
+platform: ruby
+authors:
+- spk
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2009-10-24 00:00:00 +02:00
+default_executable:
+dependencies: []
+
+description: Web crawler that print if the page is valid with the dtd
+email: spk@tuxfamily.org
+executables:
+- validate-website
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- README
+- Rakefile
+- lib/colorful_messages.rb
+- lib/spkspider.rb
+- bin/validate-website
+has_rdoc: true
+homepage:
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements:
+- libxml-ruby
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Web crawler for testing webpage validity
+test_files: []
+