html-proofer 2.6.4 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/{htmlproof → htmlproofer} +31 -24
- data/lib/html-proofer.rb +47 -0
- data/lib/html-proofer/cache.rb +153 -0
- data/lib/html-proofer/check.rb +63 -0
- data/lib/{html/proofer/checks → html-proofer/check}/favicon.rb +2 -8
- data/lib/html-proofer/check/html.rb +21 -0
- data/lib/html-proofer/check/images.rb +47 -0
- data/lib/{html/proofer/checks → html-proofer/check}/links.rb +40 -48
- data/lib/html-proofer/check/scripts.rb +28 -0
- data/lib/html-proofer/configuration.rb +56 -0
- data/lib/html-proofer/element.rb +165 -0
- data/lib/{html/proofer/check_runner → html-proofer}/issue.rb +8 -10
- data/lib/html-proofer/log.rb +49 -0
- data/lib/html-proofer/runner.rb +160 -0
- data/lib/html-proofer/url_validator.rb +218 -0
- data/lib/html-proofer/utils.rb +40 -0
- data/lib/html-proofer/version.rb +3 -0
- metadata +20 -20
- data/lib/html/proofer.rb +0 -191
- data/lib/html/proofer/cache.rb +0 -141
- data/lib/html/proofer/check_runner.rb +0 -70
- data/lib/html/proofer/checkable.rb +0 -168
- data/lib/html/proofer/checks/html.rb +0 -46
- data/lib/html/proofer/checks/images.rb +0 -54
- data/lib/html/proofer/checks/scripts.rb +0 -40
- data/lib/html/proofer/configuration.rb +0 -48
- data/lib/html/proofer/log.rb +0 -42
- data/lib/html/proofer/url_validator.rb +0 -222
- data/lib/html/proofer/utils.rb +0 -42
- data/lib/html/proofer/version.rb +0 -5
- data/lib/html/proofer/xpathfunctions.rb +0 -9
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'yell'
|
2
|
+
require 'colored'
|
3
|
+
|
4
|
+
module HTMLProofer
  # Thin wrapper around Yell that sends debug/info/warn output to STDOUT
  # and error/fatal output to STDERR, colorizing messages only when both
  # streams are attached to a terminal.
  class Log
    include Yell::Loggable

    # Default terminal color for each log level.
    LEVEL_COLORS = {
      :debug => :light_blue,
      :info => :blue,
      :warn => :yellow,
      :error => :red,
      :fatal => :red
    }.freeze

    # log_level - minimum severity to emit, e.g. :info ("gte" = greater
    # than or equal, per Yell's level syntax).
    def initialize(log_level)
      @logger = Yell.new(:format => false,
                         :name => 'HTMLProofer',
                         :level => "gte.#{log_level}") do |l|
        l.adapter :stdout, :level => [:debug, :info, :warn]
        l.adapter :stderr, :level => [:error, :fatal]
      end
    end

    # Log a message at the given level using that level's default color.
    def log(level, message)
      log_with_color(level, LEVEL_COLORS[level], message)
    end

    # Log a message at the given level with an explicit color override.
    def log_with_color(level, color, message)
      @logger.send level, colorize(color, message)
    end

    # Wrap the message in ANSI color codes when writing to a TTY; return
    # it untouched for pipes, files, and CI logs.
    def colorize(color, message)
      return message unless $stdout.isatty && $stderr.isatty
      Colored.colorize(message, foreground: color)
    end

    # dumb override to play nice with Typhoeus/Ethon, which may call
    # logger.debug with no argument
    def debug(message = nil)
      log(:debug, message) unless message.nil?
    end
  end
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
module HTMLProofer
  # Orchestrates an entire proofing run: merges user options over the
  # Configuration defaults, discovers the files (or links) to examine,
  # runs each enabled check, and reports any failures.
  class Runner
    include HTMLProofer::Utils

    attr_reader :options, :external_urls

    # src  - a path, array of paths, or array of links, depending on :type
    # opts - user options, merged over HTMLProofer::Configuration defaults
    def initialize(src, opts = {})
      @src = src

      @options = HTMLProofer::Configuration::PROOFER_DEFAULTS.merge(opts)

      # Nested option hashes get their own defaults merged independently,
      # so a partial override doesn't discard sibling default keys.
      @options[:typhoeus] = HTMLProofer::Configuration::TYPHOEUS_DEFAULTS.merge(opts[:typhoeus] || {})
      @options[:hydra] = HTMLProofer::Configuration::HYDRA_DEFAULTS.merge(opts[:hydra] || {})

      @options[:parallel] = HTMLProofer::Configuration::PARALLEL_DEFAULTS.merge(opts[:parallel] || {})
      @options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.merge(opts[:validation] || {})
      @options[:cache] = HTMLProofer::Configuration::CACHE_DEFAULTS.merge(opts[:cache] || {})

      # :type drives file discovery (see #files); delete it so it doesn't
      # leak into the option hash handed to each check.
      @type = @options.delete(:type)
      @logger = HTMLProofer::Log.new(@options[:log_level])

      # The cache needs its storage directory to exist before first write.
      if !@options[:cache].empty? && !File.exist?(STORAGE_DIR)
        FileUtils.mkdir_p(STORAGE_DIR)
      end

      @failures = []
    end

    # Entry point. Checks either a list of links or a set of files, then
    # prints a success message or raises with the failure report.
    def run
      @logger.log :info, "Running #{checks} on #{@src} on *#{@options[:extension]}... \n\n"

      if @type == :links
        check_list_of_links unless @options[:disable_external]
      else
        check_files
        file_text = pluralize(files.length, 'file', 'files')
        @logger.log :info, "Ran on #{file_text}!\n\n"
      end

      if @failures.empty?
        @logger.log_with_color :info, :green, 'HTML-Proofer finished successfully.'
      else
        print_failed_tests
      end
    end

    # Validates @src as a bare list of external URLs (no file parsing),
    # applying any :url_swap substitutions first.
    def check_list_of_links
      if @options[:url_swap]
        @src = @src.map do |url|
          swap(url, @options[:url_swap])
        end
      end
      # Build a url => nil hash; nil stands in for "no source filenames".
      @external_urls = Hash[*@src.map { |s| [s, nil] }.flatten]
      validate_urls
    end

    # Collects any external URLs found in a directory of files. Also collects
    # every failed test from process_files.
    # Sends the external URLs to Typhoeus for batch processing.
    def check_files
      @external_urls = {}

      process_files.each do |item|
        @external_urls.merge!(item[:external_urls])
        @failures.concat(item[:failures])
      end

      # TODO: lazy. if we're checking only external links,
      # we'll just trash all the failed tests. really, we should
      # just not run those other checks at all.
      if @options[:external_only]
        @failures = []
        validate_urls
      elsif !@options[:disable_external]
        validate_urls
      end
    end

    # Walks over each implemented check and runs them on the files, in parallel.
    def process_files
      if @options[:parallel].empty?
        files.map { |path| check_path(path) }
      else
        Parallel.map(files, @options[:parallel]) { |path| check_path(path) }
      end
    end

    # Runs every enabled check against a single parsed file and returns a
    # hash of its discovered external URLs and failed tests.
    def check_path(path)
      result = { :external_urls => {}, :failures => [] }
      html = create_nokogiri(path)

      # Normalize a single-file source into an array for uniform iteration.
      @src = [@src] if @type == :file

      @src.each do |src|
        checks.each do |klass|
          @logger.log :debug, "Checking #{klass.to_s.downcase} on #{path} ..."
          # `checks` holds class *names*; resolve each to its class here.
          check = Object.const_get(klass).new(src, path, html, @options)
          check.run
          result[:external_urls].merge!(check.external_urls)
          result[:failures].concat(check.issues)
        end
      end
      result
    end

    # Hands the collected external URLs to UrlValidator and absorbs its
    # failures and (possibly reduced) URL set.
    def validate_urls
      url_validator = HTMLProofer::UrlValidator.new(@logger, @external_urls, @options)
      @failures.concat(url_validator.run)
      @external_urls = url_validator.external_urls
    end

    # Memoized list of files to proof, depending on @type:
    # :directory globs for *#{extension} files; :file takes @src itself
    # when its extension matches; anything else yields no files.
    def files
      @files ||= if @type == :directory
                   @src.map do |src|
                     pattern = File.join(src, '**', "*#{@options[:extension]}")
                     files = Dir.glob(pattern).select { |fn| File.file? fn }
                     files.reject { |f| ignore_file?(f) }
                   end.flatten
                 elsif @type == :file && File.extname(@src) == @options[:extension]
                   [@src].reject { |f| ignore_file?(f) }
                 else
                   []
                 end
    end

    # True when the file matches any :file_ignore entry — exact string
    # match or regexp match, depending on the pattern's type.
    def ignore_file?(file)
      @options[:file_ignore].each do |pattern|
        return true if pattern.is_a?(String) && pattern == file
        return true if pattern.is_a?(Regexp) && pattern =~ file
      end

      false
    end

    # Memoized list of check class names: every Check subclass, minus
    # favicon/html checks unless enabled, minus :checks_to_ignore.
    def checks
      return @checks unless @checks.nil?
      @checks = HTMLProofer::Check.subchecks.map(&:name)
      @checks.delete('FaviconCheck') unless @options[:check_favicon]
      @checks.delete('HtmlCheck') unless @options[:check_html]
      @options[:checks_to_ignore].each { |ignored| @checks.delete(ignored) }
      @checks
    end

    # String representations of every failure, for reporting.
    def failed_tests
      result = []
      return result if @failures.empty?
      @failures.each { |f| result << f.to_s }
      result
    end

    # Sorts and prints the failure report, then raises (via `fail`) with a
    # red summary line so the process exits non-zero.
    def print_failed_tests
      sorted_failures = SortedIssues.new(@failures, @options[:error_sort], @logger)

      sorted_failures.sort_and_report
      count = @failures.length
      failure_text = pluralize(count, 'failure', 'failures')
      fail @logger.colorize :red, "HTML-Proofer found #{failure_text}!"
    end
  end
end
|
@@ -0,0 +1,218 @@
|
|
1
|
+
require 'typhoeus'
|
2
|
+
require 'uri'
|
3
|
+
require_relative './utils'
|
4
|
+
require_relative './cache'
|
5
|
+
|
6
|
+
module HTMLProofer
  # Batch-checks external URLs with Typhoeus/Hydra, consulting and
  # updating a Cache of previous results. Collects failures as Issues.
  class UrlValidator
    include HTMLProofer::Utils

    attr_reader :external_urls

    # logger        - an HTMLProofer::Log
    # external_urls - hash of url => filenames (or nil when checking bare links)
    # options       - the merged proofer options hash
    def initialize(logger, external_urls, options)
      @logger = logger
      @external_urls = external_urls
      @failed_tests = []
      @options = options
      @hydra = Typhoeus::Hydra.new(@options[:hydra])
      @cache = Cache.new(@logger, @options[:cache])
    end

    # Runs the validation pass and returns the array of failed Issues.
    def run
      @external_urls = remove_query_values

      if @cache.use_cache?
        # Only re-check URLs the cache considers stale/unknown, then persist.
        urls_to_check = load_cache
        external_link_checker(urls_to_check)
        @cache.write
      else
        external_link_checker(@external_urls)
      end

      @failed_tests
    end

    # Deduplicates URLs that differ only in query values: for a given
    # host+path, only the first URL per distinct query-key set is kept.
    def remove_query_values
      return nil if @external_urls.nil?
      paths_with_queries = {}
      # Duplicate first — we delete from the copy while iterating the original.
      iterable_external_urls = @external_urls.dup
      @external_urls.keys.each do |url|
        uri = begin
                Addressable::URI.parse(url)
              rescue URI::Error, Addressable::URI::InvalidURIError
                @logger.log :error, "#{url} is an invalid URL"
                nil
              end
        next if uri.nil? || uri.query.nil?
        iterable_external_urls.delete(url) unless new_url_query_values?(uri, paths_with_queries)
      end
      iterable_external_urls
    end

    # remember queries we've seen, ignore future ones
    # Returns true if this uri's query-key combination is new for its
    # host+path (recording it as a side effect), false if already seen.
    def new_url_query_values?(uri, paths_with_queries)
      queries = uri.query_values.keys.join('-')
      domain_path = extract_domain_path(uri)
      if paths_with_queries[domain_path].nil?
        paths_with_queries[domain_path] = [queries]
        true
      elsif !paths_with_queries[domain_path].include?(queries)
        paths_with_queries[domain_path] << queries
        true
      else
        false
      end
    end

    # "example.com/some/path" — the dedup key for query-value filtering.
    def extract_domain_path(uri)
      uri.host + uri.path
    end

    # Reports the cache size and returns the subset of @external_urls
    # that the cache says still needs checking.
    def load_cache
      cache_count = @cache.size
      cache_text = pluralize(cache_count, 'link', 'links')

      @logger.log :info, "Found #{cache_text} in the cache..."

      @cache.retrieve_urls(@external_urls)
    end

    # Proofer runs faster if we pull out all the external URLs and run the checks
    # at the end. Otherwise, we're halting the consuming process for every file during
    # `process_files`.
    #
    # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
    #
    # Finally, we'll first make a HEAD request, rather than GETing all the contents.
    # If the HEAD fails, we'll fall back to GET, as some servers are not configured
    # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
    # not available as an option.
    def external_link_checker(external_urls)
      external_urls = Hash[external_urls.sort]

      count = external_urls.length
      check_text = pluralize(count, 'external link', 'external links')
      @logger.log :info, "Checking #{check_text}..."

      # Route log from Typhoeus/Ethon to our own logger
      Ethon.logger = @logger

      establish_queue(external_urls)

      # Blocks until every queued request (including GET retries queued
      # from response_handler) has completed.
      @hydra.run
    end

    # Queues one request per URL on the hydra, choosing GET when a hash
    # fragment must be verified against the page body, HEAD otherwise.
    def establish_queue(external_urls)
      external_urls.each_pair do |url, filenames|
        url = begin
                clean_url(url)
              rescue URI::Error, Addressable::URI::InvalidURIError
                # Record the bad URL and skip to the next one.
                add_external_issue(filenames, "#{url} is an invalid URL")
                next
              end

        method = if hash?(url) && @options[:check_external_hash]
                   :get
                 else
                   :head
                 end
        queue_request(method, url, filenames)
      end
    end

    # Normalized Addressable::URI for the href (raises on invalid input).
    def clean_url(href)
      Addressable::URI.parse(href).normalize
    end

    # Builds a Typhoeus request with the configured options and wires its
    # completion callback before queueing it on the hydra.
    def queue_request(method, href, filenames)
      opts = @options[:typhoeus].merge({ :method => method })
      request = Typhoeus::Request.new(href, opts)
      request.on_complete { |response| response_handler(response, filenames) }
      @hydra.queue request
    end

    # Completion callback for every request: caches successes, records
    # failures/timeouts, and retries failed HEADs as GETs.
    def response_handler(response, filenames)
      effective_url = response.options[:effective_url]
      href = response.request.base_url.to_s
      method = response.request.options[:method]
      response_code = response.code

      debug_msg = "Received a #{response_code} for #{href}"
      debug_msg << " in #{filenames.join(' ')}" unless filenames.nil?
      @logger.log :debug, debug_msg

      return if @options[:http_status_ignore].include?(response_code)

      if response_code.between?(200, 299)
        # Only cache as success if the hash check didn't record a failure.
        unless check_hash_in_2xx_response(href, effective_url, response, filenames)
          @cache.add(href, filenames, response_code)
        end
      elsif response.timed_out?
        handle_timeout(href, filenames, response_code)
      elsif response_code == 0
        # libcurl uses code 0 for connection-level failures (DNS, refused, etc.).
        handle_failure(href, filenames, response_code)
      elsif method == :head
        # Some servers reject HEAD; retry the same URL with GET.
        queue_request(:get, href, filenames)
      else
        return if @options[:only_4xx] && !response_code.between?(400, 499)
        # Received a non-successful http response.
        msg = "External link #{href} failed: #{response_code} #{response.return_message}"
        add_external_issue(filenames, msg, response_code)
        @cache.add(href, filenames, response_code, msg)
      end
    end

    # Even though the response was a success, we may have been asked to check
    # if the hash on the URL exists on the page
    # Returns true only when a missing-hash failure was recorded; any
    # falsy return (false or nil) means the caller should cache a success.
    def check_hash_in_2xx_response(href, effective_url, response, filenames)
      return false if @options[:only_4xx]
      return false unless @options[:check_external_hash]
      return false unless (hash = hash?(href))

      body_doc = create_nokogiri(response.body)

      # user-content is a special addition by GitHub.
      xpath = %(//*[@name="#{hash}"]|//*[@id="#{hash}"])
      if URI.parse(href).host.match(/github\.com/i)
        xpath << %(|//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])
      end

      # Hash target found on the page — return nil (falsy) so the caller
      # records the response as a success.
      return unless body_doc.xpath(xpath).empty?

      msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
      add_external_issue(filenames, msg, response.code)
      @cache.add(href, filenames, response.code, msg)
      true
    end

    # Records a timeout; cached with code 0, skipped under :only_4xx.
    def handle_timeout(href, filenames, response_code)
      msg = "External link #{href} failed: got a time out (response code #{response_code})"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]
      add_external_issue(filenames, msg, response_code)
    end

    # Records a connection-level failure (response code 0).
    def handle_failure(href, filenames, response_code)
      msg = "External link #{href} failed: response code #{response_code} means something's wrong"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]
      add_external_issue(filenames, msg, response_code)
    end

    # Appends one Issue per source filename (or a single filename-less
    # Issue when filenames is nil).
    def add_external_issue(filenames, desc, status = nil)
      # possible if we're checking an array of links
      if filenames.nil?
        @failed_tests << Issue.new('', desc, status: status)
      else
        filenames.each { |f| @failed_tests << Issue.new(f, desc, status: status) }
      end
    end

    # Does the URL have a hash?
    # Returns the fragment string (truthy) or nil; false on unparseable URLs.
    def hash?(url)
      URI.parse(url).fragment
    rescue URI::InvalidURIError
      false
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module HTMLProofer
  # Small shared helpers mixed into Runner/UrlValidator and also exposed
  # as module functions for standalone use.
  module Utils
    # Where the link cache is persisted (relative to the working directory).
    STORAGE_DIR = File.join('tmp', '.htmlproofer')

    # "1 file" / "3 files" — picks the singular form only for exactly 1.
    def pluralize(count, single, plural)
      "#{count} #{count == 1 ? single : plural}"
    end

    # Parses `path` into a Nokogiri document. Accepts either a path to an
    # existing file or a raw HTML string (e.g. a response body).
    def create_nokogiri(path)
      # File.read closes the handle; the previous File.open(path).read
      # leaked an open file descriptor per parsed document.
      content = File.exist?(path) ? File.read(path) : path

      Nokogiri::HTML(clean_content(content))
    end
    module_function :create_nokogiri

    # Applies each :url_swap substitution (pattern => replacement) to href.
    def swap(href, replacement)
      replacement.each do |link, replace|
        href = href.gsub(link, replace)
      end
      href
    end
    module_function :swap

    # address a problem with Nokogiri's parsing URL entities
    # problem from http://git.io/vBYU1
    # solution from http://git.io/vBYUi
    def clean_content(string)
      string.gsub(%r{https?://([^>]+)}i) do |url|
        # Escape bare ampersands inside URLs so Nokogiri doesn't mangle
        # them; leave already-escaped '&amp;' alone. (The previous body
        # replaced '&' with '&' — a no-op that left URLs unescaped.)
        url.gsub(/&(?!amp;)/, '&amp;')
      end
    end
    module_function :clean_content
  end
end
|