validate-website 0.5.7 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +5 -4
- data/Rakefile +3 -3
- data/bin/validate-website +9 -11
- data/bin/validate-website-static +7 -18
- data/lib/validate_website.rb +1 -210
- data/lib/validate_website/colorful_messages.rb +28 -0
- data/lib/validate_website/core.rb +141 -0
- data/lib/validate_website/option_parser.rb +133 -0
- data/lib/validate_website/runner.rb +35 -0
- data/lib/validate_website/validator.rb +69 -0
- data/man/man1/validate-website-static.1 +82 -0
- data/{lib/xhtml → share/schemas}/frameset.dtd +0 -0
- data/{lib/xhtml → share/schemas}/loose.dtd +0 -0
- data/{lib/xhtml → share/schemas}/strict.dtd +0 -0
- data/{lib/xhtml → share/schemas}/xframes-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-access-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-applet-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-attribs-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-base-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic-form-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic-table-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic10-model-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic10-module-redefines-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic10-modules-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic10.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic11-model-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic11-modules-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic11.dtd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-basic11.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-bdo-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-blkphras-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-blkpres-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-blkstruct-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-charent-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-copyright-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-csismap-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-datatypes-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-edit-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-events-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-form-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-frames-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-framework-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-hypertext-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-iframe-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-image-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-inlphras-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-inlpres-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-inlstruct-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-inlstyle-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-inputmode-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-lat1.ent +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-legacy-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-link-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-list-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-meta-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-metaAttributes-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-misc-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-nameident-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-notations-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-object-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-param-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-pres-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-print-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-print-model-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-print-modules-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-rdfa-1.dtd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-rdfa-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-rdfa-model-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-rdfa-modules-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-ruby-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-ruby-basic-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-script-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-special.ent +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-ssismap-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-struct-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-style-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-symbol.ent +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-table-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-target-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml-text-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml1-frameset.dtd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml1-frameset.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml1-strict.dtd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml1-strict.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml1-transitional.dtd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml1-transitional.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml11-model-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml11-module-redefines-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml11-modules-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml11.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xhtml2.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml-events-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml-events-2.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml-events-attribs-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml-events-attribs-2.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml-events-copyright-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml-events-copyright-2.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml-handlers-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml-handlers-2.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml-script-1.xsd +0 -0
- data/{lib/xhtml → share/schemas}/xml.xsd +0 -0
- data/spec/core_spec.rb +56 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/validator_spec.rb +3 -1
- metadata +102 -99
- data/lib/colorful_messages.rb +0 -28
- data/lib/validator.rb +0 -67
- data/spec/data/index.cs.html +0 -243
- data/spec/validate_website_spec.rb +0 -54
data/README.rdoc
CHANGED
@@ -12,14 +12,16 @@
|
|
12
12
|
|
13
13
|
== SYNOPSIS
|
14
14
|
|
15
|
-
validate-website [OPTIONS]
|
15
|
+
validate-website [OPTIONS]
|
16
|
+
validate-website-static [OPTIONS]
|
16
17
|
|
17
18
|
== DESCRIPTION
|
18
19
|
|
19
20
|
validate-website is a web crawler for checking the markup validity and not
|
20
21
|
found urls.
|
22
|
+
validate-website-static checks the markup validity of your local documents.
|
21
23
|
|
22
|
-
== OPTIONS
|
24
|
+
== VALIDATE WEBSITE OPTIONS
|
23
25
|
|
24
26
|
-s, --site SITE
|
25
27
|
Website to crawl (Default: http://localhost:3000/)
|
@@ -56,9 +58,8 @@ found urls.
|
|
56
58
|
|
57
59
|
== REQUIREMENTS
|
58
60
|
|
59
|
-
*
|
61
|
+
* anemone, '>= 0.5.0'
|
60
62
|
* rainbow, '>= 1.1'
|
61
|
-
* html5, '= 0.10.0'
|
62
63
|
|
63
64
|
== CREDITS
|
64
65
|
|
data/Rakefile
CHANGED
@@ -7,10 +7,10 @@ require "rspec/core/rake_task" # RSpec 2.0
|
|
7
7
|
# Globals
|
8
8
|
|
9
9
|
PKG_NAME = 'validate-website'
|
10
|
-
PKG_VERSION = '0.5.7'
|
10
|
+
PKG_VERSION = '0.6.0'
|
11
11
|
|
12
12
|
PKG_FILES = ['README.rdoc', 'Rakefile', 'LICENSE']
|
13
|
-
Find.find('bin/', 'lib/', 'man/', 'spec/') do |f|
|
13
|
+
Find.find('bin/', 'lib/', 'man/', 'spec/', 'share/') do |f|
|
14
14
|
if FileTest.directory?(f) and f =~ /\.svn|\.git/
|
15
15
|
Find.prune
|
16
16
|
else
|
@@ -64,7 +64,7 @@ end
|
|
64
64
|
|
65
65
|
desc 'Update manpage from asciidoc file'
|
66
66
|
task :manpage do
|
67
|
-
system('a2x -f manpage -D man/man1
|
67
|
+
system('find doc/ -type f -exec a2x -f manpage -D man/man1 {} \;')
|
68
68
|
end
|
69
69
|
|
70
70
|
# RSpec 2.0
|
data/bin/validate-website
CHANGED
@@ -1,16 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# encoding: utf-8
|
3
3
|
|
4
|
-
lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
|
5
|
-
$:.unshift(File.expand_path(lib_dir))
|
6
|
-
|
7
4
|
developer_mode = false
|
8
5
|
developer_mode = true if __FILE__ == $0
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
6
|
+
if developer_mode
|
7
|
+
lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
|
8
|
+
$:.unshift(File.expand_path(lib_dir))
|
9
|
+
require 'rubygems'
|
10
|
+
end
|
11
|
+
|
12
|
+
require 'validate_website/runner'
|
13
|
+
exit_status = ValidateWebsite::Runner.run_crawl(ARGV)
|
14
|
+
exit(exit_status)
|
data/bin/validate-website-static
CHANGED
@@ -1,25 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# encoding: utf-8
|
3
3
|
|
4
|
-
lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
|
5
|
-
$:.unshift(File.expand_path(lib_dir))
|
6
|
-
|
7
4
|
developer_mode = false
|
8
5
|
developer_mode = true if __FILE__ == $0
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
validate_website = ValidateWebsite.new(ARGV)
|
14
|
-
|
15
|
-
files = Dir.glob(File.join("**", "*.html"))
|
16
|
-
files.each do |f|
|
17
|
-
next unless File.file?(f)
|
18
|
-
|
19
|
-
body = open(f).read
|
20
|
-
doc = Nokogiri::HTML(body)
|
21
|
-
|
22
|
-
validate_website.validate(doc, body, f)
|
6
|
+
if developer_mode
|
7
|
+
lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
|
8
|
+
$:.unshift(File.expand_path(lib_dir))
|
9
|
+
require 'rubygems'
|
23
10
|
end
|
24
11
|
|
25
|
-
|
12
|
+
require 'validate_website/runner'
|
13
|
+
exit_status = ValidateWebsite::Runner.run_static(ARGV)
|
14
|
+
exit(exit_status)
|
data/lib/validate_website.rb
CHANGED
@@ -1,211 +1,2 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'open-uri'
|
5
|
-
|
6
|
-
require 'validator'
|
7
|
-
require 'colorful_messages'
|
8
|
-
|
9
|
-
require 'anemone'
|
10
|
-
|
11
|
-
class ValidateWebsite
|
12
|
-
|
13
|
-
attr_accessor :site
|
14
|
-
attr_reader :options, :anemone
|
15
|
-
|
16
|
-
include ColorfulMessages
|
17
|
-
|
18
|
-
EXIT_SUCCESS = 0
|
19
|
-
EXIT_FAILURE_MARKUP = 64
|
20
|
-
EXIT_FAILURE_NOT_FOUND = 65
|
21
|
-
EXIT_FAILURE_MARKUP_NOT_FOUND = 66
|
22
|
-
|
23
|
-
def initialize(args=[], validation_type = :crawl)
|
24
|
-
@markup_error = nil
|
25
|
-
@not_found_error = nil
|
26
|
-
|
27
|
-
@options_crawl = {
|
28
|
-
:site => 'http://localhost:3000/',
|
29
|
-
:markup_validation => true,
|
30
|
-
:exclude => nil,
|
31
|
-
:file => nil,
|
32
|
-
# log not found url (404 status code)
|
33
|
-
:not_found => false,
|
34
|
-
# internal verbose for ValidateWebsite
|
35
|
-
:validate_verbose => false,
|
36
|
-
:quiet => false,
|
37
|
-
|
38
|
-
# Anemone options see anemone/lib/anemone/core.rb
|
39
|
-
:verbose => false,
|
40
|
-
:user_agent => Anemone::Core::DEFAULT_OPTS[:user_agent],
|
41
|
-
:cookies => nil,
|
42
|
-
:accept_cookies => true,
|
43
|
-
:redirect_limit => 0,
|
44
|
-
}
|
45
|
-
send("parse_#{validation_type}_options", args)
|
46
|
-
|
47
|
-
@file = @options[:file]
|
48
|
-
if @file
|
49
|
-
# truncate file
|
50
|
-
open(@file, 'w').write('')
|
51
|
-
end
|
52
|
-
|
53
|
-
@site = @options[:site]
|
54
|
-
end
|
55
|
-
|
56
|
-
def parse_crawl_options(args)
|
57
|
-
@options = @options_crawl
|
58
|
-
|
59
|
-
opts = OptionParser.new do |o|
|
60
|
-
o.set_summary_indent(' ')
|
61
|
-
o.banner = 'Usage: validate-website [OPTIONS]'
|
62
|
-
o.define_head 'validate-website - Web crawler for checking the validity'+
|
63
|
-
' of your documents'
|
64
|
-
o.separator ''
|
65
|
-
|
66
|
-
o.on("-s", "--site 'SITE'", String,
|
67
|
-
"Website to crawl (Default: #{@options[:site]})") { |v|
|
68
|
-
@options[:site] = v
|
69
|
-
}
|
70
|
-
o.on("-u", "--user-agent 'USERAGENT'", String,
|
71
|
-
"Change user agent (Default: #{@options[:user_agent]})") { |v|
|
72
|
-
@options[:user_agent] = v
|
73
|
-
}
|
74
|
-
o.on("-e", "--exclude 'EXCLUDE'", String,
|
75
|
-
"Url to exclude (ex: 'redirect|news')") { |v|
|
76
|
-
@options[:exclude] = v
|
77
|
-
}
|
78
|
-
o.on("-f", "--file 'FILE'", String,
|
79
|
-
"Save not well formed or not found urls") { |v| @options[:file] = v }
|
80
|
-
|
81
|
-
o.on("-c", "--cookies 'COOKIES'", String,
|
82
|
-
"Set defaults cookies") { |v| @options[:cookies] = v }
|
83
|
-
|
84
|
-
o.on("-m", "--[no-]markup-validation",
|
85
|
-
"Markup validation (Default: #{@options[:markup_validation]})") { |v|
|
86
|
-
@options[:markup_validation] = v
|
87
|
-
}
|
88
|
-
o.on("-n", "--not-found",
|
89
|
-
"Log not found url (Default: #{@options[:not_found]})") { |v|
|
90
|
-
@options[:not_found] = v
|
91
|
-
}
|
92
|
-
o.on("-v", "--verbose",
|
93
|
-
"Show validator errors (Default: #{@options[:validate_verbose]})") { |v|
|
94
|
-
@options[:validate_verbose] = v
|
95
|
-
}
|
96
|
-
o.on("-q", "--quiet",
|
97
|
-
"Only report errors (Default: #{@options[:quiet]})") { |v|
|
98
|
-
@options[:quiet] = v
|
99
|
-
}
|
100
|
-
o.on("-d", "--debug",
|
101
|
-
"Show anemone log (Default: #{@options[:verbose]})") { |v|
|
102
|
-
@options[:verbose] = v
|
103
|
-
}
|
104
|
-
|
105
|
-
o.separator ""
|
106
|
-
o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
|
107
|
-
end
|
108
|
-
opts.parse!(args)
|
109
|
-
end
|
110
|
-
|
111
|
-
def validate(doc, body, url, opts={})
|
112
|
-
opts = @options.merge(opts)
|
113
|
-
validator = Validator.new(doc, body)
|
114
|
-
msg = " well formed? %s" % validator.valid?
|
115
|
-
if validator.valid?
|
116
|
-
unless opts[:quiet]
|
117
|
-
print info(url)
|
118
|
-
puts success(msg)
|
119
|
-
end
|
120
|
-
else
|
121
|
-
@markup_error = true
|
122
|
-
print info(url)
|
123
|
-
puts error(msg)
|
124
|
-
puts error(validator.errors.join(", ")) if opts[:validate_verbose]
|
125
|
-
to_file(url)
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
def crawl(opts={})
|
130
|
-
opts = @options.merge(opts)
|
131
|
-
puts note("Validating #{@site}") if opts[:validate_verbose]
|
132
|
-
|
133
|
-
@anemone = Anemone.crawl(@site, opts) do |anemone|
|
134
|
-
anemone.skip_links_like Regexp.new(opts[:exclude]) if opts[:exclude]
|
135
|
-
|
136
|
-
# select the links on each page to follow (iframe, link, css url)
|
137
|
-
anemone.focus_crawl { |p|
|
138
|
-
links = []
|
139
|
-
if p.html?
|
140
|
-
p.doc.css('img, script, iframe').each do |elem|
|
141
|
-
url = get_url(p, elem, "src")
|
142
|
-
links << url unless url.nil?
|
143
|
-
end
|
144
|
-
p.doc.css('link').each do |link|
|
145
|
-
url = get_url(p, link, "href")
|
146
|
-
links << url unless url.nil?
|
147
|
-
end
|
148
|
-
end
|
149
|
-
if p.content_type == 'text/css'
|
150
|
-
p.body.scan(/url\((['".\/\w-]+)\)/).each do |url|
|
151
|
-
url = url.first.gsub("'", "").gsub('"', '')
|
152
|
-
abs = p.to_absolute(URI(url))
|
153
|
-
links << abs
|
154
|
-
end
|
155
|
-
end
|
156
|
-
links.uniq!
|
157
|
-
p.links.concat(links)
|
158
|
-
}
|
159
|
-
|
160
|
-
anemone.on_every_page { |page|
|
161
|
-
url = page.url.to_s
|
162
|
-
|
163
|
-
if opts[:markup_validation]
|
164
|
-
# validate html/html+xml
|
165
|
-
if page.html? && page.fetched?
|
166
|
-
validate(page.doc, page.body, url, opts)
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
170
|
-
if opts[:not_found] && page.not_found?
|
171
|
-
@not_found_error = true
|
172
|
-
puts error("%s linked in %s but not exist" % [url, page.referer])
|
173
|
-
to_file(url)
|
174
|
-
end
|
175
|
-
|
176
|
-
# throw away the page (hope this saves memory)
|
177
|
-
page = nil
|
178
|
-
}
|
179
|
-
end
|
180
|
-
end
|
181
|
-
|
182
|
-
def exit_status
|
183
|
-
if @markup_error && @not_found_error
|
184
|
-
EXIT_FAILURE_MARKUP_NOT_FOUND
|
185
|
-
elsif @markup_error
|
186
|
-
EXIT_FAILURE_MARKUP
|
187
|
-
elsif @not_found_error
|
188
|
-
EXIT_FAILURE_NOT_FOUND
|
189
|
-
else
|
190
|
-
EXIT_SUCCESS
|
191
|
-
end
|
192
|
-
end
|
193
|
-
|
194
|
-
private
|
195
|
-
def to_file(msg)
|
196
|
-
if @file && File.exist?(@file)
|
197
|
-
open(@file, 'a').write("#{msg}\n")
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
def get_url(page, elem, attrname)
|
202
|
-
u = elem.attributes[attrname] if elem.attributes[attrname]
|
203
|
-
return if u.nil?
|
204
|
-
begin
|
205
|
-
abs = page.to_absolute(URI(u))
|
206
|
-
rescue
|
207
|
-
abs = nil
|
208
|
-
end
|
209
|
-
return abs if abs && page.in_domain?(abs)
|
210
|
-
end
|
211
|
-
end
|
2
|
+
require 'validate_website/core'
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'rainbow'
|
3
|
+
|
4
|
+
module ValidateWebsite
|
5
|
+
module ColorfulMessages
|
6
|
+
def error(message)
|
7
|
+
message.to_s.foreground(:red)
|
8
|
+
end
|
9
|
+
|
10
|
+
def warning(message)
|
11
|
+
message.to_s.foreground(:yellow)
|
12
|
+
end
|
13
|
+
|
14
|
+
def success(message)
|
15
|
+
message.to_s.foreground(:green)
|
16
|
+
end
|
17
|
+
|
18
|
+
alias_method :message, :success
|
19
|
+
|
20
|
+
def note(message)
|
21
|
+
message.to_s.foreground(:magenta)
|
22
|
+
end
|
23
|
+
|
24
|
+
def info(message)
|
25
|
+
message.to_s.foreground(:blue)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
require 'validate_website/option_parser'
|
6
|
+
require 'validate_website/validator'
|
7
|
+
require 'validate_website/colorful_messages'
|
8
|
+
|
9
|
+
require 'anemone'
|
10
|
+
|
11
|
+
module ValidateWebsite
|
12
|
+
|
13
|
+
class Core
|
14
|
+
|
15
|
+
attr_accessor :site
|
16
|
+
attr_reader :options, :anemone
|
17
|
+
|
18
|
+
include ColorfulMessages
|
19
|
+
|
20
|
+
EXIT_SUCCESS = 0
|
21
|
+
EXIT_FAILURE_MARKUP = 64
|
22
|
+
EXIT_FAILURE_NOT_FOUND = 65
|
23
|
+
EXIT_FAILURE_MARKUP_NOT_FOUND = 66
|
24
|
+
|
25
|
+
def initialize(options={}, validation_type = :crawl)
|
26
|
+
@markup_error = nil
|
27
|
+
@not_found_error = nil
|
28
|
+
|
29
|
+
@options = Parser.parse(options, validation_type)
|
30
|
+
|
31
|
+
@file = @options[:file]
|
32
|
+
if @file
|
33
|
+
# truncate file
|
34
|
+
open(@file, 'w').write('')
|
35
|
+
end
|
36
|
+
|
37
|
+
@site = @options[:site]
|
38
|
+
end
|
39
|
+
|
40
|
+
def validate(doc, body, url, opts={})
|
41
|
+
opts = @options.merge(opts)
|
42
|
+
validator = Validator.new(doc, body)
|
43
|
+
msg = " well formed? %s" % validator.valid?
|
44
|
+
if validator.valid?
|
45
|
+
unless opts[:quiet]
|
46
|
+
print info(url)
|
47
|
+
puts success(msg)
|
48
|
+
end
|
49
|
+
else
|
50
|
+
@markup_error = true
|
51
|
+
print info(url)
|
52
|
+
puts error(msg)
|
53
|
+
puts error(validator.errors.join(", ")) if opts[:validate_verbose]
|
54
|
+
to_file(url)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def crawl(opts={})
|
59
|
+
opts = @options.merge(opts)
|
60
|
+
puts note("Validating #{@site}") if opts[:validate_verbose]
|
61
|
+
|
62
|
+
@anemone = Anemone.crawl(@site, opts) do |anemone|
|
63
|
+
anemone.skip_links_like Regexp.new(opts[:exclude]) if opts[:exclude]
|
64
|
+
|
65
|
+
# select the links on each page to follow (iframe, link, css url)
|
66
|
+
anemone.focus_crawl { |p|
|
67
|
+
links = []
|
68
|
+
if p.html?
|
69
|
+
p.doc.css('img, script, iframe').each do |elem|
|
70
|
+
url = get_url(p, elem, "src")
|
71
|
+
links << url unless url.nil?
|
72
|
+
end
|
73
|
+
p.doc.css('link').each do |link|
|
74
|
+
url = get_url(p, link, "href")
|
75
|
+
links << url unless url.nil?
|
76
|
+
end
|
77
|
+
end
|
78
|
+
if p.content_type == 'text/css'
|
79
|
+
p.body.scan(/url\((['".\/\w-]+)\)/).each do |url|
|
80
|
+
url = url.first.gsub("'", "").gsub('"', '')
|
81
|
+
abs = p.to_absolute(URI(url))
|
82
|
+
links << abs
|
83
|
+
end
|
84
|
+
end
|
85
|
+
links.uniq!
|
86
|
+
p.links.concat(links)
|
87
|
+
}
|
88
|
+
|
89
|
+
anemone.on_every_page { |page|
|
90
|
+
url = page.url.to_s
|
91
|
+
|
92
|
+
if opts[:markup_validation]
|
93
|
+
# validate html/html+xml
|
94
|
+
if page.html? && page.fetched?
|
95
|
+
validate(page.doc, page.body, url, opts)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
if opts[:not_found] && page.not_found?
|
100
|
+
@not_found_error = true
|
101
|
+
puts error("%s linked in %s but not exist" % [url, page.referer])
|
102
|
+
to_file(url)
|
103
|
+
end
|
104
|
+
|
105
|
+
# throw away the page (hope this saves memory)
|
106
|
+
page = nil
|
107
|
+
}
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def exit_status
|
112
|
+
if @markup_error && @not_found_error
|
113
|
+
EXIT_FAILURE_MARKUP_NOT_FOUND
|
114
|
+
elsif @markup_error
|
115
|
+
EXIT_FAILURE_MARKUP
|
116
|
+
elsif @not_found_error
|
117
|
+
EXIT_FAILURE_NOT_FOUND
|
118
|
+
else
|
119
|
+
EXIT_SUCCESS
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
private
|
124
|
+
def to_file(msg)
|
125
|
+
if @file && File.exist?(@file)
|
126
|
+
open(@file, 'a').write("#{msg}\n")
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
def get_url(page, elem, attrname)
|
131
|
+
u = elem.attributes[attrname] if elem.attributes[attrname]
|
132
|
+
return if u.nil?
|
133
|
+
begin
|
134
|
+
abs = page.to_absolute(URI(u))
|
135
|
+
rescue
|
136
|
+
abs = nil
|
137
|
+
end
|
138
|
+
return abs if abs && page.in_domain?(abs)
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|