validate-website 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +15 -4
- data/Rakefile +6 -4
- data/bin/validate-website +33 -36
- data/lib/colorful_messages.rb +7 -10
- data/lib/validator.rb +37 -0
- data/lib/xhtml/xframes-1.xsd +166 -0
- data/lib/xhtml/xhtml-access-1.xsd +43 -0
- data/lib/xhtml/xhtml-applet-1.xsd +66 -0
- data/lib/xhtml/xhtml-attribs-1.xsd +67 -0
- data/lib/xhtml/xhtml-base-1.xsd +31 -0
- data/lib/xhtml/xhtml-basic-form-1.xsd +195 -0
- data/lib/xhtml/xhtml-basic-table-1.xsd +169 -0
- data/lib/xhtml/xhtml-basic10-model-1.xsd +385 -0
- data/lib/xhtml/xhtml-basic10-module-redefines-1.xsd +61 -0
- data/lib/xhtml/xhtml-basic10-modules-1.xsd +233 -0
- data/lib/xhtml/xhtml-basic10.xsd +99 -0
- data/lib/xhtml/xhtml-basic11-model-1.xsd +622 -0
- data/lib/xhtml/xhtml-basic11-modules-1.xsd +508 -0
- data/lib/xhtml/xhtml-basic11.xsd +105 -0
- data/lib/xhtml/xhtml-bdo-1.xsd +72 -0
- data/lib/xhtml/xhtml-blkphras-1.xsd +155 -0
- data/lib/xhtml/xhtml-blkpres-1.xsd +32 -0
- data/lib/xhtml/xhtml-blkstruct-1.xsd +44 -0
- data/lib/xhtml/xhtml-charent-1.xsd +38 -0
- data/lib/xhtml/xhtml-copyright-1.xsd +29 -0
- data/lib/xhtml/xhtml-csismap-1.xsd +91 -0
- data/lib/xhtml/xhtml-datatypes-1.xsd +177 -0
- data/lib/xhtml/xhtml-edit-1.xsd +34 -0
- data/lib/xhtml/xhtml-events-1.xsd +130 -0
- data/lib/xhtml/xhtml-form-1.xsd +321 -0
- data/lib/xhtml/xhtml-frames-1.xsd +113 -0
- data/lib/xhtml/xhtml-framework-1.xsd +62 -0
- data/lib/xhtml/xhtml-hypertext-1.xsd +47 -0
- data/lib/xhtml/xhtml-iframe-1.xsd +68 -0
- data/lib/xhtml/xhtml-image-1.xsd +40 -0
- data/lib/xhtml/xhtml-inlphras-1.xsd +158 -0
- data/lib/xhtml/xhtml-inlpres-1.xsd +34 -0
- data/lib/xhtml/xhtml-inlstruct-1.xsd +45 -0
- data/lib/xhtml/xhtml-inlstyle-1.xsd +22 -0
- data/lib/xhtml/xhtml-inputmode-1.xsd +35 -0
- data/lib/xhtml/xhtml-lat1.ent +196 -0
- data/lib/xhtml/xhtml-legacy-1.xsd +97 -0
- data/lib/xhtml/xhtml-link-1.xsd +45 -0
- data/lib/xhtml/xhtml-list-1.xsd +94 -0
- data/lib/xhtml/xhtml-meta-1.xsd +54 -0
- data/lib/xhtml/xhtml-metaAttributes-1.xsd +39 -0
- data/lib/xhtml/xhtml-misc-1.xsd +441 -0
- data/lib/xhtml/xhtml-nameident-1.xsd +63 -0
- data/lib/xhtml/xhtml-notations-1.xsd +69 -0
- data/lib/xhtml/xhtml-object-1.xsd +71 -0
- data/lib/xhtml/xhtml-param-1.xsd +46 -0
- data/lib/xhtml/xhtml-pres-1.xsd +46 -0
- data/lib/xhtml/xhtml-print-1.xsd +85 -0
- data/lib/xhtml/xhtml-print-model-1.xsd +604 -0
- data/lib/xhtml/xhtml-print-modules-1.xsd +422 -0
- data/lib/xhtml/xhtml-rdfa-1.dtd +438 -0
- data/lib/xhtml/xhtml-rdfa-1.xsd +116 -0
- data/lib/xhtml/xhtml-rdfa-model-1.xsd +461 -0
- data/lib/xhtml/xhtml-rdfa-modules-1.xsd +548 -0
- data/lib/xhtml/xhtml-ruby-1.xsd +170 -0
- data/lib/xhtml/xhtml-ruby-basic-1.xsd +84 -0
- data/lib/xhtml/xhtml-script-1.xsd +65 -0
- data/lib/xhtml/xhtml-special.ent +80 -0
- data/lib/xhtml/xhtml-ssismap-1.xsd +38 -0
- data/lib/xhtml/xhtml-struct-1.xsd +85 -0
- data/lib/xhtml/xhtml-style-1.xsd +47 -0
- data/lib/xhtml/xhtml-symbol.ent +237 -0
- data/lib/xhtml/xhtml-table-1.xsd +267 -0
- data/lib/xhtml/xhtml-target-1.xsd +49 -0
- data/lib/xhtml/xhtml-text-1.xsd +62 -0
- data/lib/xhtml/xhtml1-frameset.dtd +1235 -0
- data/lib/xhtml/xhtml1-frameset.xsd +2847 -0
- data/lib/xhtml/xhtml1-strict.dtd +978 -0
- data/lib/xhtml/xhtml1-strict.xsd +2211 -0
- data/lib/xhtml/xhtml1-transitional.dtd +1201 -0
- data/lib/xhtml/xhtml1-transitional.xsd +2755 -0
- data/lib/xhtml/xhtml11-model-1.xsd +715 -0
- data/lib/xhtml/xhtml11-module-redefines-1.xsd +335 -0
- data/lib/xhtml/xhtml11-modules-1.xsd +605 -0
- data/lib/xhtml/xhtml11.xsd +107 -0
- data/lib/xhtml/xhtml2.xsd +21 -0
- data/lib/xhtml/xml-events-1.xsd +73 -0
- data/lib/xhtml/xml-events-2.xsd +73 -0
- data/lib/xhtml/xml-events-attribs-1.xsd +73 -0
- data/lib/xhtml/xml-events-attribs-2.xsd +75 -0
- data/lib/xhtml/xml-events-copyright-1.xsd +34 -0
- data/lib/xhtml/xml-events-copyright-2.xsd +34 -0
- data/lib/xhtml/xml-handlers-1.xsd +136 -0
- data/lib/xhtml/xml-handlers-2.xsd +98 -0
- data/lib/xhtml/xml-script-1.xsd +38 -0
- data/lib/xhtml/xml.xsd +286 -0
- metadata +114 -8
- data/lib/spkspider.rb +0 -147
data/lib/spkspider.rb
DELETED
@@ -1,147 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'open-uri'
|
3
|
-
# SpkSpider is a small single-site web crawler.
#
# Starting from one URL it downloads each queued page, extracts the
# <a href> links of every text/html document and queues each link on the
# same host that has not been seen yet. Links pointing at another host are
# collected in #external_links and are not followed.
#
#   spider = SpkSpider.new('http://localhost:4567/')
#   spider.crawl { |url, document| puts url }
class SpkSpider
  VERSION = '0.0.5'

  # links_to_visit: queue of URL strings still to download
  # site:           URI of the crawled site (its host decides internal/external)
  # user_agent:     value sent as the User-Agent header
  # basic_auth:     value passed to open-uri as :http_basic_authentication
  attr_accessor :links_to_visit, :site, :user_agent, :basic_auth
  # parser:  'hpricot' selects Hpricot, anything else the 'xml' parser
  # exclude: pattern; hrefs matching it are ignored (nil disables the filter)
  attr_accessor :parser, :exclude
  # visited_links:  URL strings already fetched (successfully or not)
  # external_links: unique URL strings whose host differs from the site host
  # errors:         Hash of url => exception for every download that failed
  attr_reader :visited_links, :external_links, :errors

  # Build a crawler for +site+ (a URL string) and queue it as the first page.
  #
  # Raises ArgumentError when +site+ is nil or empty, and whatever URI.parse
  # raises when it is not a valid URI.
  def initialize(site)
    puts "SpkSpider #{VERSION} initializing..."
    # URI.parse never returns nil, so a missing site has to be detected here.
    raise ArgumentError, "You didn't give me a site to crawl" if site.to_s.empty?

    @site = URI.parse(site)
    @user_agent = "SpkSpr/#{VERSION}"
    @links_to_visit = [site]
    @visited_links = []
    @external_links = []
    @errors = {}
    @parser = 'xml'
    puts "Ready to crawl"
  end

  # Parse +doc+ with the 'xml' library's HTML parser (presumably libxml-ruby;
  # confirm against the gem dependencies) and return every <a> element that
  # carries an href attribute.
  def init_xml_parser(doc)
    require 'xml' # loaded lazily: only needed when this parser is actually used
    options = XML::HTMLParser::Options::RECOVER |
              XML::HTMLParser::Options::NOERROR |
              XML::HTMLParser::Options::NOWARNING
    html_parser = XML::HTMLParser.string(doc, { :options => options })
    # Install a handler that ignores every error it is given.
    XML::Error.set_handler { |_error| }
    html_parser.parse.find("//a[@href]")
  end

  # Return the <a href> elements of +doc+ using the configured parser.
  #
  # Any failure of the selected parser, including a parser library that
  # cannot be loaded, falls back to the XML parser.
  def fetch_links(doc)
    if @parser == 'hpricot'
      require 'hpricot'
      Hpricot.buffer_size = 204800
      Hpricot(doc).search("//a[@href]")
    else
      init_xml_parser(doc)
    end
  rescue StandardError, LoadError
    # LoadError is not a StandardError, so it must be named explicitly for a
    # missing hpricot to reach this fallback.
    init_xml_parser(doc)
  end

  # Download +url+ and return the document, or nil when the download failed.
  #
  # The URL is recorded in #visited_links either way; a failure is stored in
  # #errors under the URL.
  def fetch_html(url)
    uri = URI.parse(url)
    print "Visiting: #{url}"
    # Forget the previous page first, otherwise a failed download would hand
    # back the document of the page fetched before this one.
    @document = nil
    begin
      @document = uri.read('User-Agent' => @user_agent, 'Referer' => url, :http_basic_authentication => @basic_auth)
    rescue StandardError => e
      @errors[url] = e
    end
    @visited_links << url
    @document
  end

  # Extract the links of +document+ (fetched from +url+) when it is a
  # text/html document; otherwise only print why it is skipped.
  def read_document(document, url)
    unless document
      print " ... document does not exist, skipping ..."
      return
    end

    if document.content_type == "text/html"
      link_extractor(document, url)
    else
      print " ... not text/html, skipping ..."
    end
  end

  # Queue every new same-host link found in +document+.
  #
  # Relative hrefs are resolved against +document_url+, the address of the
  # page that contains them. Empty, excluded, unparsable and fragment ('#')
  # links are skipped; links to another host go to #external_links.
  # Returns the collection of link elements that was examined.
  def link_extractor(document, document_url)
    links = fetch_links(document)
    document_uri = parse_uri(document_url)
    return links unless document_uri

    links.each do |link|
      href = link.attributes['href']
      next if href.nil? || href.empty?
      next if @exclude && href.match(@exclude)

      uri = parse_uri(href)
      next unless uri

      url = href
      if uri.relative?
        begin
          uri = document_uri.merge(uri)
        rescue URI::Error
          next # cannot be resolved (e.g. the base itself is relative)
        end
        url = uri.to_s
      end

      # skip anchor link
      next if url.include?('#')

      # Check domain, if in same domain, keep link, else trash it
      if uri.host != @site.host
        @external_links << url unless @external_links.include?(url)
        next
      end

      # Find out if we've seen this link already
      next if @visited_links.include?(url) || @links_to_visit.include?(url)

      @links_to_visit << url
    end
  end

  # Launch the crawl: fetch queued pages until the queue is empty.
  #
  # When a block is given it is called with (url, document) after each page;
  # document is nil for a page that could not be downloaded.
  def crawl
    until @links_to_visit.empty?
      # get the first element of the links_to_visit
      url = @links_to_visit.shift
      document = fetch_html(url)
      read_document(document, url)
      yield(url, document) if block_given?
      puts ' done!'
    end
  end

  private

  # Parse +text+ as a URI; return nil instead of raising when it is invalid.
  def parse_uri(text)
    URI.parse(text)
  rescue StandardError
    nil
  end
end
|
140
|
-
|
141
|
-
# Command-line entry point: when this file is run directly, crawl the site
# given as the first argument, or the local default when none is given.
if $PROGRAM_NAME == __FILE__
  spider = SpkSpider.new(ARGV[0] || 'http://localhost:4567/')
  spider.user_agent = ''
  spider.crawl
end
|