validate-website 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +15 -4
- data/Rakefile +6 -4
- data/bin/validate-website +33 -36
- data/lib/colorful_messages.rb +7 -10
- data/lib/validator.rb +37 -0
- data/lib/xhtml/xframes-1.xsd +166 -0
- data/lib/xhtml/xhtml-access-1.xsd +43 -0
- data/lib/xhtml/xhtml-applet-1.xsd +66 -0
- data/lib/xhtml/xhtml-attribs-1.xsd +67 -0
- data/lib/xhtml/xhtml-base-1.xsd +31 -0
- data/lib/xhtml/xhtml-basic-form-1.xsd +195 -0
- data/lib/xhtml/xhtml-basic-table-1.xsd +169 -0
- data/lib/xhtml/xhtml-basic10-model-1.xsd +385 -0
- data/lib/xhtml/xhtml-basic10-module-redefines-1.xsd +61 -0
- data/lib/xhtml/xhtml-basic10-modules-1.xsd +233 -0
- data/lib/xhtml/xhtml-basic10.xsd +99 -0
- data/lib/xhtml/xhtml-basic11-model-1.xsd +622 -0
- data/lib/xhtml/xhtml-basic11-modules-1.xsd +508 -0
- data/lib/xhtml/xhtml-basic11.xsd +105 -0
- data/lib/xhtml/xhtml-bdo-1.xsd +72 -0
- data/lib/xhtml/xhtml-blkphras-1.xsd +155 -0
- data/lib/xhtml/xhtml-blkpres-1.xsd +32 -0
- data/lib/xhtml/xhtml-blkstruct-1.xsd +44 -0
- data/lib/xhtml/xhtml-charent-1.xsd +38 -0
- data/lib/xhtml/xhtml-copyright-1.xsd +29 -0
- data/lib/xhtml/xhtml-csismap-1.xsd +91 -0
- data/lib/xhtml/xhtml-datatypes-1.xsd +177 -0
- data/lib/xhtml/xhtml-edit-1.xsd +34 -0
- data/lib/xhtml/xhtml-events-1.xsd +130 -0
- data/lib/xhtml/xhtml-form-1.xsd +321 -0
- data/lib/xhtml/xhtml-frames-1.xsd +113 -0
- data/lib/xhtml/xhtml-framework-1.xsd +62 -0
- data/lib/xhtml/xhtml-hypertext-1.xsd +47 -0
- data/lib/xhtml/xhtml-iframe-1.xsd +68 -0
- data/lib/xhtml/xhtml-image-1.xsd +40 -0
- data/lib/xhtml/xhtml-inlphras-1.xsd +158 -0
- data/lib/xhtml/xhtml-inlpres-1.xsd +34 -0
- data/lib/xhtml/xhtml-inlstruct-1.xsd +45 -0
- data/lib/xhtml/xhtml-inlstyle-1.xsd +22 -0
- data/lib/xhtml/xhtml-inputmode-1.xsd +35 -0
- data/lib/xhtml/xhtml-lat1.ent +196 -0
- data/lib/xhtml/xhtml-legacy-1.xsd +97 -0
- data/lib/xhtml/xhtml-link-1.xsd +45 -0
- data/lib/xhtml/xhtml-list-1.xsd +94 -0
- data/lib/xhtml/xhtml-meta-1.xsd +54 -0
- data/lib/xhtml/xhtml-metaAttributes-1.xsd +39 -0
- data/lib/xhtml/xhtml-misc-1.xsd +441 -0
- data/lib/xhtml/xhtml-nameident-1.xsd +63 -0
- data/lib/xhtml/xhtml-notations-1.xsd +69 -0
- data/lib/xhtml/xhtml-object-1.xsd +71 -0
- data/lib/xhtml/xhtml-param-1.xsd +46 -0
- data/lib/xhtml/xhtml-pres-1.xsd +46 -0
- data/lib/xhtml/xhtml-print-1.xsd +85 -0
- data/lib/xhtml/xhtml-print-model-1.xsd +604 -0
- data/lib/xhtml/xhtml-print-modules-1.xsd +422 -0
- data/lib/xhtml/xhtml-rdfa-1.dtd +438 -0
- data/lib/xhtml/xhtml-rdfa-1.xsd +116 -0
- data/lib/xhtml/xhtml-rdfa-model-1.xsd +461 -0
- data/lib/xhtml/xhtml-rdfa-modules-1.xsd +548 -0
- data/lib/xhtml/xhtml-ruby-1.xsd +170 -0
- data/lib/xhtml/xhtml-ruby-basic-1.xsd +84 -0
- data/lib/xhtml/xhtml-script-1.xsd +65 -0
- data/lib/xhtml/xhtml-special.ent +80 -0
- data/lib/xhtml/xhtml-ssismap-1.xsd +38 -0
- data/lib/xhtml/xhtml-struct-1.xsd +85 -0
- data/lib/xhtml/xhtml-style-1.xsd +47 -0
- data/lib/xhtml/xhtml-symbol.ent +237 -0
- data/lib/xhtml/xhtml-table-1.xsd +267 -0
- data/lib/xhtml/xhtml-target-1.xsd +49 -0
- data/lib/xhtml/xhtml-text-1.xsd +62 -0
- data/lib/xhtml/xhtml1-frameset.dtd +1235 -0
- data/lib/xhtml/xhtml1-frameset.xsd +2847 -0
- data/lib/xhtml/xhtml1-strict.dtd +978 -0
- data/lib/xhtml/xhtml1-strict.xsd +2211 -0
- data/lib/xhtml/xhtml1-transitional.dtd +1201 -0
- data/lib/xhtml/xhtml1-transitional.xsd +2755 -0
- data/lib/xhtml/xhtml11-model-1.xsd +715 -0
- data/lib/xhtml/xhtml11-module-redefines-1.xsd +335 -0
- data/lib/xhtml/xhtml11-modules-1.xsd +605 -0
- data/lib/xhtml/xhtml11.xsd +107 -0
- data/lib/xhtml/xhtml2.xsd +21 -0
- data/lib/xhtml/xml-events-1.xsd +73 -0
- data/lib/xhtml/xml-events-2.xsd +73 -0
- data/lib/xhtml/xml-events-attribs-1.xsd +73 -0
- data/lib/xhtml/xml-events-attribs-2.xsd +75 -0
- data/lib/xhtml/xml-events-copyright-1.xsd +34 -0
- data/lib/xhtml/xml-events-copyright-2.xsd +34 -0
- data/lib/xhtml/xml-handlers-1.xsd +136 -0
- data/lib/xhtml/xml-handlers-2.xsd +98 -0
- data/lib/xhtml/xml-script-1.xsd +38 -0
- data/lib/xhtml/xml.xsd +286 -0
- metadata +114 -8
- data/lib/spkspider.rb +0 -147
data/lib/spkspider.rb
DELETED
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
# encoding: utf-8
|
|
2
|
-
require 'open-uri'
|
|
3
|
-
# SpkSpider is a small single-site web crawler.
#
# Starting from one URL it downloads every queued page with open-uri,
# extracts the <a href> links of text/html documents, queues the links that
# point to the same host and records the links that point elsewhere.
#
# State exposed to callers:
# * links_to_visit - queue (Array of String) of URLs still to download
# * visited_links  - URLs already handed to fetch_html
# * external_links - unique URLs whose host differs from the crawled site
# * errors         - Hash mapping a URL to the message of the exception
#                    raised while downloading it
# * parser         - 'xml' (libxml-ruby, default) or 'hpricot'
# * exclude        - optional pattern; hrefs matching it are ignored
# * basic_auth     - value passed to open-uri as :http_basic_authentication
class SpkSpider
  VERSION = '0.0.5'

  attr_accessor :links_to_visit, :site, :user_agent, :basic_auth
  attr_accessor :parser, :exclude
  attr_reader :visited_links, :external_links, :errors

  # Prepare a crawl of +site+ (a URL string) and queue it as the first page.
  #
  # Raises ArgumentError when +site+ is nil or empty. URI.parse never returns
  # nil, so the missing-site case has to be checked explicitly.
  def initialize(site)
    puts "SpkSpider #{VERSION} initializing..."
    raise ArgumentError, "You didn't give me a site to crawl" if site.nil? || site.to_s.empty?

    @site = URI.parse(site)
    @user_agent = "SpkSpr/#{VERSION}"
    @links_to_visit = [site]
    @visited_links = []
    @external_links = []
    @errors = {}
    @parser = 'xml'
    puts "Ready to crawl"
  end

  # Parse +doc+ (HTML source) with libxml-ruby and return the nodes matching
  # //a[@href]. The library is loaded lazily so that it is only required
  # when this parser is actually used.
  def init_xml_parser(doc)
    require 'xml'
    html_parser = XML::HTMLParser.string(doc, {:options => XML::HTMLParser::Options::RECOVER | XML::HTMLParser::Options::NOERROR | XML::HTMLParser::Options::NOWARNING })
    # Install a handler that ignores the errors libxml reports while parsing.
    XML::Error.set_handler { |_error| }
    html_parser.parse.find("//a[@href]")
  end

  # Return the <a href> elements of +doc+ using the parser named by @parser.
  #
  # Anything other than 'hpricot' uses libxml. When hpricot cannot be loaded
  # (LoadError is not a StandardError, so it is rescued explicitly) or fails,
  # the libxml parser is used as a fallback.
  def fetch_links(doc)
    return init_xml_parser(doc) unless @parser == 'hpricot'

    begin
      require 'hpricot'
      Hpricot.buffer_size = 204800
      Hpricot(doc).search("//a[@href]")
    rescue LoadError, StandardError
      init_xml_parser(doc)
    end
  end

  # Download +url+ and return the document, or nil when the download failed.
  #
  # The URL is always appended to visited_links. A failure is recorded in
  # errors[url] instead of being dropped, and nil is returned so that the
  # content of a previously fetched page is never reused for a failing URL.
  def fetch_html(url)
    print "Visiting: #{url}"
    document = nil
    begin
      document = URI.parse(url).read('User-Agent' => @user_agent, 'Referer' => url, :http_basic_authentication => @basic_auth)
    rescue StandardError => e
      @errors[url] = e.message
    end
    @visited_links << url
    document
  end

  # Extract the links of +document+ (fetched from +url+) when it is a
  # text/html document; otherwise print why it is skipped.
  def read_document(document, url)
    if !document
      print " ... document does not exist, skipping ..."
    elsif document.content_type == "text/html"
      link_extractor(document, url)
    else
      print " ... not text/html, skipping ..."
    end
  end

  # Walk the links of +document+ and sort them into the crawler's lists.
  #
  # For every href that is non-empty and not matched by @exclude:
  # * relative references are resolved against +document_url+
  # * URLs containing a '#' are skipped
  # * URLs on another host are added once to external_links
  # * remaining URLs are queued unless already visited or queued
  def link_extractor(document, document_url)
    fetch_links(document).each do |link|
      href = link.attributes['href']
      next if href.nil? || href.empty?
      next if @exclude && href.match(@exclude)

      url, uri = resolve_link(href, document_url)
      next if url.nil?

      # skip anchor link
      next if url.include?('#')

      # Check domain, if in same domain, keep link, else record it as external
      if uri.host != @site.host
        @external_links << url unless @external_links.include?(url)
        next
      end

      # Find out if we've seen this link already
      next if @visited_links.include?(url) || @links_to_visit.include?(url)

      @links_to_visit << url
    end
  end

  # Launch the crawl: download queued URLs until the queue is empty.
  # When a block is given it is called with (url, document) for every
  # visited URL; document is nil when the download failed.
  def crawl
    until @links_to_visit.empty?
      # get the first element of the links_to_visit
      url = @links_to_visit.shift
      document = fetch_html(url)
      read_document(document, url)
      yield(url, document) if block_given?
      puts ' done!'
    end
  end

  private

  # Turn +href+ into an absolute URL. Returns [url_string, uri], or nil when
  # the href (or +document_url+) cannot be parsed or merged; such links are
  # deliberately skipped rather than aborting the crawl.
  def resolve_link(href, document_url)
    uri = URI.parse(href)
    return [href, uri] unless uri.relative?

    # A relative reference is relative to the page that contains it,
    # not to the root of the crawled site.
    merged = URI.parse(document_url).merge(href)
    [merged.to_s, merged]
  rescue StandardError
    nil
  end
end
|
|
140
|
-
|
|
141
|
-
# Command-line entry point: crawl the URL given as the first argument, or
# the local development server when no argument is supplied.
if $PROGRAM_NAME == __FILE__
  start_url = ARGV[0] || 'http://localhost:4567/'
  crawler = SpkSpider.new(start_url)
  # The default user agent is replaced by an empty one for this run.
  crawler.user_agent = ''
  crawler.crawl
end
|