html-proofer 2.6.4 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/{htmlproof → htmlproofer} +31 -24
- data/lib/html-proofer.rb +47 -0
- data/lib/html-proofer/cache.rb +153 -0
- data/lib/html-proofer/check.rb +63 -0
- data/lib/{html/proofer/checks → html-proofer/check}/favicon.rb +2 -8
- data/lib/html-proofer/check/html.rb +21 -0
- data/lib/html-proofer/check/images.rb +47 -0
- data/lib/{html/proofer/checks → html-proofer/check}/links.rb +40 -48
- data/lib/html-proofer/check/scripts.rb +28 -0
- data/lib/html-proofer/configuration.rb +56 -0
- data/lib/html-proofer/element.rb +165 -0
- data/lib/{html/proofer/check_runner → html-proofer}/issue.rb +8 -10
- data/lib/html-proofer/log.rb +49 -0
- data/lib/html-proofer/runner.rb +160 -0
- data/lib/html-proofer/url_validator.rb +218 -0
- data/lib/html-proofer/utils.rb +40 -0
- data/lib/html-proofer/version.rb +3 -0
- metadata +20 -20
- data/lib/html/proofer.rb +0 -191
- data/lib/html/proofer/cache.rb +0 -141
- data/lib/html/proofer/check_runner.rb +0 -70
- data/lib/html/proofer/checkable.rb +0 -168
- data/lib/html/proofer/checks/html.rb +0 -46
- data/lib/html/proofer/checks/images.rb +0 -54
- data/lib/html/proofer/checks/scripts.rb +0 -40
- data/lib/html/proofer/configuration.rb +0 -48
- data/lib/html/proofer/log.rb +0 -42
- data/lib/html/proofer/url_validator.rb +0 -222
- data/lib/html/proofer/utils.rb +0 -42
- data/lib/html/proofer/version.rb +0 -5
- data/lib/html/proofer/xpathfunctions.rb +0 -9
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'yell'
|
2
|
+
require 'colored'
|
3
|
+
|
4
|
+
module HTMLProofer
  # Thin wrapper around a Yell logger: :debug/:info/:warn go to STDOUT,
  # :error/:fatal go to STDERR, and messages are colored only when both
  # streams are terminals.
  class Log
    include Yell::Loggable

    # Foreground color conventionally used for each log level; unknown
    # levels map to nil (no color), matching the original case expression.
    LEVEL_COLORS = {
      :debug => :light_blue,
      :info  => :blue,
      :warn  => :yellow,
      :error => :red,
      :fatal => :red
    }.freeze

    # log_level - a level name accepted by Yell's "gte." threshold filter.
    def initialize(log_level)
      @logger = Yell.new(:format => false, \
                         :name => 'HTMLProofer', \
                         :level => "gte.#{log_level}") do |l|
        l.adapter :stdout, :level => [:debug, :info, :warn]
        l.adapter :stderr, :level => [:error, :fatal]
      end
    end

    # Logs +message+ at +level+, picking that level's conventional color.
    def log(level, message)
      log_with_color(level, LEVEL_COLORS[level], message)
    end

    # Logs +message+ at +level+ using an explicit +color+.
    def log_with_color(level, color, message)
      @logger.send level, colorize(color, message)
    end

    # Wraps +message+ in ANSI color codes, but only when both stdout and
    # stderr are TTYs, so piped/redirected output stays plain.
    def colorize(color, message)
      return message unless $stdout.isatty && $stderr.isatty
      Colored.colorize(message, foreground: color)
    end

    # dumb override to play nice with Typhoeus/Ethon
    def debug(message = nil)
      log(:debug, message) unless message.nil?
    end
  end
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
module HTMLProofer
  # Orchestrates a proofing run: collects the files (or links) to check,
  # runs every enabled Check against them, batch-validates external URLs,
  # and reports failures (raising when any exist).
  class Runner
    include HTMLProofer::Utils

    attr_reader :options, :external_urls

    # src  - a file path, a directory path, or an array of URLs, depending
    #        on opts[:type] (:file, :directory, or :links).
    # opts - user options, layered over the library-wide defaults.
    def initialize(src, opts = {})
      @src = src

      @options = HTMLProofer::Configuration::PROOFER_DEFAULTS.merge(opts)

      # Nested option hashes get their own defaults merged independently,
      # so a partial user hash doesn't wipe out the remaining defaults.
      @options[:typhoeus] = HTMLProofer::Configuration::TYPHOEUS_DEFAULTS.merge(opts[:typhoeus] || {})
      @options[:hydra] = HTMLProofer::Configuration::HYDRA_DEFAULTS.merge(opts[:hydra] || {})

      @options[:parallel] = HTMLProofer::Configuration::PARALLEL_DEFAULTS.merge(opts[:parallel] || {})
      @options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.merge(opts[:validation] || {})
      @options[:cache] = HTMLProofer::Configuration::CACHE_DEFAULTS.merge(opts[:cache] || {})

      @type = @options.delete(:type)
      @logger = HTMLProofer::Log.new(@options[:log_level])

      # Caching writes to disk, so ensure the storage dir exists up front.
      if !@options[:cache].empty? && !File.exist?(STORAGE_DIR)
        FileUtils.mkdir_p(STORAGE_DIR)
      end

      @failures = []
    end

    # Entry point. Checks either a plain list of links or a set of files,
    # then either logs success or reports failures (which raises, via
    # print_failed_tests).
    def run
      @logger.log :info, "Running #{checks} on #{@src} on *#{@options[:extension]}... \n\n"

      if @type == :links
        check_list_of_links unless @options[:disable_external]
      else
        check_files
        file_text = pluralize(files.length, 'file', 'files')
        @logger.log :info, "Ran on #{file_text}!\n\n"
      end

      if @failures.empty?
        @logger.log_with_color :info, :green, 'HTML-Proofer finished successfully.'
      else
        print_failed_tests
      end
    end

    # Validates @src as a bare list of URLs, after applying any :url_swap
    # substitutions.
    def check_list_of_links
      if @options[:url_swap]
        @src = @src.map do |url|
          swap(url, @options[:url_swap])
        end
      end
      @external_urls = Hash[*@src.map { |s| [s, nil] }.flatten]
      validate_urls
    end

    # Collects any external URLs found in a directory of files. Also collects
    # every failed test from process_files.
    # Sends the external URLs to Typhoeus for batch processing.
    def check_files
      @external_urls = {}

      process_files.each do |item|
        @external_urls.merge!(item[:external_urls])
        @failures.concat(item[:failures])
      end

      # TODO: lazy. if we're checking only external links,
      # we'll just trash all the failed tests. really, we should
      # just not run those other checks at all.
      if @options[:external_only]
        @failures = []
        validate_urls
      elsif !@options[:disable_external]
        validate_urls
      end
    end

    # Walks over each implemented check and runs them on the files, in parallel.
    def process_files
      if @options[:parallel].empty?
        files.map { |path| check_path(path) }
      else
        Parallel.map(files, @options[:parallel]) { |path| check_path(path) }
      end
    end

    # Runs every enabled check against a single file. Returns a hash with
    # the external URLs found (:external_urls) and issues raised (:failures).
    def check_path(path)
      result = { :external_urls => {}, :failures => [] }
      html = create_nokogiri(path)

      # Normalize the single-file case so the loop below always sees an array.
      @src = [@src] if @type == :file

      @src.each do |src|
        checks.each do |klass|
          @logger.log :debug, "Checking #{klass.to_s.downcase} on #{path} ..."
          # `checks` holds class *names*; resolve each to its class here.
          check = Object.const_get(klass).new(src, path, html, @options)
          check.run
          result[:external_urls].merge!(check.external_urls)
          result[:failures].concat(check.issues)
        end
      end
      result
    end

    # Batch-validates all collected external URLs, appending any failures.
    def validate_urls
      url_validator = HTMLProofer::UrlValidator.new(@logger, @external_urls, @options)
      @failures.concat(url_validator.run)
      @external_urls = url_validator.external_urls
    end

    # The list of files to proof, filtered by :extension and :file_ignore.
    # Memoized; an unrecognized @type yields an empty list.
    def files
      @files ||= if @type == :directory
                   @src.map do |src|
                     pattern = File.join(src, '**', "*#{@options[:extension]}")
                     files = Dir.glob(pattern).select { |fn| File.file? fn }
                     files.reject { |f| ignore_file?(f) }
                   end.flatten
                 elsif @type == :file && File.extname(@src) == @options[:extension]
                   [@src].reject { |f| ignore_file?(f) }
                 else
                   []
                 end
    end

    # True when +file+ matches any :file_ignore entry (exact string match
    # or regexp match, depending on the entry's type).
    def ignore_file?(file)
      @options[:file_ignore].each do |pattern|
        return true if pattern.is_a?(String) && pattern == file
        return true if pattern.is_a?(Regexp) && pattern =~ file
      end

      false
    end

    # The check class names to run: every Check subclass, minus the favicon
    # and HTML checks when disabled, minus anything in :checks_to_ignore.
    # Memoized.
    def checks
      return @checks unless @checks.nil?
      @checks = HTMLProofer::Check.subchecks.map(&:name)
      @checks.delete('FaviconCheck') unless @options[:check_favicon]
      @checks.delete('HtmlCheck') unless @options[:check_html]
      @options[:checks_to_ignore].each { |ignored| @checks.delete(ignored) }
      @checks
    end

    # All failures as strings, for external reporting.
    def failed_tests
      result = []
      return result if @failures.empty?
      @failures.each { |f| result << f.to_s }
      result
    end

    # Sorts and reports every failure, then raises to signal a failed run.
    def print_failed_tests
      sorted_failures = SortedIssues.new(@failures, @options[:error_sort], @logger)

      sorted_failures.sort_and_report
      count = @failures.length
      failure_text = pluralize(count, 'failure', 'failures')
      fail @logger.colorize :red, "HTML-Proofer found #{failure_text}!"
    end
  end
end
|
@@ -0,0 +1,218 @@
|
|
1
|
+
require 'typhoeus'
|
2
|
+
require 'uri'
|
3
|
+
require_relative './utils'
|
4
|
+
require_relative './cache'
|
5
|
+
|
6
|
+
module HTMLProofer
  # Validates a batch of external URLs through Typhoeus/Hydra, with optional
  # on-disk caching and optional verification that a URL's #hash fragment
  # actually exists on the target page.
  class UrlValidator
    include HTMLProofer::Utils

    attr_reader :external_urls

    # external_urls - Hash of url => filenames referencing it (filenames may
    #                 be nil when proofing a raw list of links).
    def initialize(logger, external_urls, options)
      @logger = logger
      @external_urls = external_urls
      @failed_tests = []
      @options = options
      @hydra = Typhoeus::Hydra.new(@options[:hydra])
      @cache = Cache.new(@logger, @options[:cache])
    end

    # Runs validation (through the cache when enabled) and returns the
    # array of failed Issues.
    def run
      @external_urls = remove_query_values

      if @cache.use_cache?
        urls_to_check = load_cache
        external_link_checker(urls_to_check)
        @cache.write
      else
        external_link_checker(@external_urls)
      end

      @failed_tests
    end

    # Drops URLs that differ from an already-seen URL on the same host/path
    # only by their query *values*, so each query-key combination is checked
    # at most once. Unparseable URLs are logged and left in the set.
    def remove_query_values
      return nil if @external_urls.nil?
      paths_with_queries = {}
      # Dup so we can delete while iterating the original's keys.
      iterable_external_urls = @external_urls.dup
      @external_urls.keys.each do |url|
        uri = begin
                Addressable::URI.parse(url)
              rescue URI::Error, Addressable::URI::InvalidURIError
                @logger.log :error, "#{url} is an invalid URL"
                nil
              end
        next if uri.nil? || uri.query.nil?
        iterable_external_urls.delete(url) unless new_url_query_values?(uri, paths_with_queries)
      end
      iterable_external_urls
    end

    # remember queries we've seen, ignore future ones
    def new_url_query_values?(uri, paths_with_queries)
      # Key on the sorted-joined query *names*, not their values.
      queries = uri.query_values.keys.join('-')
      domain_path = extract_domain_path(uri)
      if paths_with_queries[domain_path].nil?
        paths_with_queries[domain_path] = [queries]
        true
      elsif !paths_with_queries[domain_path].include?(queries)
        paths_with_queries[domain_path] << queries
        true
      else
        false
      end
    end

    # "host + path" key used to group URLs that share everything but queries.
    def extract_domain_path(uri)
      uri.host + uri.path
    end

    # Logs how many links were found in the cache and returns only the URLs
    # that still need re-checking.
    def load_cache
      cache_count = @cache.size
      cache_text = pluralize(cache_count, 'link', 'links')

      @logger.log :info, "Found #{cache_text} in the cache..."

      @cache.retrieve_urls(@external_urls)
    end

    # Proofer runs faster if we pull out all the external URLs and run the checks
    # at the end. Otherwise, we're halting the consuming process for every file during
    # `process_files`.
    #
    # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
    #
    # Finally, we'll first make a HEAD request, rather than GETing all the contents.
    # If the HEAD fails, we'll fall back to GET, as some servers are not configured
    # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
    # not available as an option.
    def external_link_checker(external_urls)
      external_urls = Hash[external_urls.sort]

      count = external_urls.length
      check_text = pluralize(count, 'external link', 'external links')
      @logger.log :info, "Checking #{check_text}..."

      # Route log from Typhoeus/Ethon to our own logger
      Ethon.logger = @logger

      establish_queue(external_urls)

      @hydra.run
    end

    # Queues one request per URL: GET when a hash fragment must be verified
    # (HEAD has no body to inspect), HEAD otherwise.
    def establish_queue(external_urls)
      external_urls.each_pair do |url, filenames|
        url = begin
                clean_url(url)
              rescue URI::Error, Addressable::URI::InvalidURIError
                add_external_issue(filenames, "#{url} is an invalid URL")
                next
              end

        method = if hash?(url) && @options[:check_external_hash]
                   :get
                 else
                   :head
                 end
        queue_request(method, url, filenames)
      end
    end

    # Normalized form of +href+; raises on unparseable URLs.
    def clean_url(href)
      Addressable::URI.parse(href).normalize
    end

    # Adds a Typhoeus request for +href+ to the Hydra queue, using the
    # configured :typhoeus options plus the given HTTP +method+.
    def queue_request(method, href, filenames)
      opts = @options[:typhoeus].merge({ :method => method })
      request = Typhoeus::Request.new(href, opts)
      request.on_complete { |response| response_handler(response, filenames) }
      @hydra.queue request
    end

    # Completion callback: caches 2xx successes (after the optional hash
    # check), records timeouts and status-0 failures, retries failed HEADs
    # as GETs, and records every other status as a failure.
    def response_handler(response, filenames)
      effective_url = response.options[:effective_url]
      href = response.request.base_url.to_s
      method = response.request.options[:method]
      response_code = response.code

      debug_msg = "Received a #{response_code} for #{href}"
      debug_msg << " in #{filenames.join(' ')}" unless filenames.nil?
      @logger.log :debug, debug_msg

      return if @options[:http_status_ignore].include?(response_code)

      if response_code.between?(200, 299)
        unless check_hash_in_2xx_response(href, effective_url, response, filenames)
          @cache.add(href, filenames, response_code)
        end
      elsif response.timed_out?
        handle_timeout(href, filenames, response_code)
      elsif response_code == 0
        handle_failure(href, filenames, response_code)
      elsif method == :head
        # Some servers reject HEAD; retry the same URL with a full GET.
        queue_request(:get, href, filenames)
      else
        return if @options[:only_4xx] && !response_code.between?(400, 499)
        # Received a non-successful http response.
        msg = "External link #{href} failed: #{response_code} #{response.return_message}"
        add_external_issue(filenames, msg, response_code)
        @cache.add(href, filenames, response_code, msg)
      end
    end

    # Even though the response was a success, we may have been asked to check
    # if the hash on the URL exists on the page.
    # Returns true when a missing-hash failure was recorded; false/nil when
    # the check was skipped or the hash was found (so the caller caches).
    def check_hash_in_2xx_response(href, effective_url, response, filenames)
      return false if @options[:only_4xx]
      return false unless @options[:check_external_hash]
      return false unless (hash = hash?(href))

      body_doc = create_nokogiri(response.body)

      # user-content is a special addition by GitHub.
      xpath = %(//*[@name="#{hash}"]|//*[@id="#{hash}"])
      if URI.parse(href).host.match(/github\.com/i)
        xpath << %(|//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])
      end

      # Hash found in the page: nothing to report (nil return is falsy).
      return unless body_doc.xpath(xpath).empty?

      msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
      add_external_issue(filenames, msg, response.code)
      @cache.add(href, filenames, response.code, msg)
      true
    end

    # Records a timeout. Cached with status 0; only reported as an issue
    # unless :only_4xx is set.
    def handle_timeout(href, filenames, response_code)
      msg = "External link #{href} failed: got a time out (response code #{response_code})"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]
      add_external_issue(filenames, msg, response_code)
    end

    # Records a failure where no HTTP status was received (response code 0).
    def handle_failure(href, filenames, response_code)
      msg = "External link #{href} failed: response code #{response_code} means something's wrong"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]
      add_external_issue(filenames, msg, response_code)
    end

    # Appends one Issue per affected filename, or a single file-less Issue.
    def add_external_issue(filenames, desc, status = nil)
      # possible if we're checking an array of links
      if filenames.nil?
        @failed_tests << Issue.new('', desc, status: status)
      else
        filenames.each { |f| @failed_tests << Issue.new(f, desc, status: status) }
      end
    end

    # Does the URL have a hash?
    # Returns the fragment string (truthy) when present, nil when absent,
    # and false for unparseable URLs.
    def hash?(url)
      URI.parse(url).fragment
    rescue URI::InvalidURIError
      false
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module HTMLProofer
  # Shared helpers mixed into Runner/UrlValidator, with several methods also
  # exposed as module functions (e.g. HTMLProofer::Utils.swap).
  module Utils
    # Directory used to persist the external-link cache between runs.
    STORAGE_DIR = File.join('tmp', '.htmlproofer')

    # Returns "<count> <single|plural>", e.g. pluralize(2, 'file', 'files')
    # => "2 files".
    def pluralize(count, single, plural)
      "#{count} " << (count == 1 ? single : plural)
    end

    # Builds a Nokogiri document from +path+, which may be either a file
    # path or a raw HTML string.
    def create_nokogiri(path)
      # File.read closes the handle immediately; the previous
      # File.open(path).read leaked the descriptor until GC.
      content = File.exist?(path) ? File.read(path) : path

      Nokogiri::HTML(clean_content(content))
    end
    module_function :create_nokogiri

    # Applies every link => replacement pair in +replacement+ to +href+,
    # in order (used for the :url_swap option).
    def swap(href, replacement)
      replacement.each do |link, replace|
        href = href.gsub(link, replace)
      end
      href
    end
    module_function :swap

    # address a problem with Nokogiri's parsing URL entities
    # problem from http://git.io/vBYU1
    # solution from http://git.io/vBYUi
    # Escapes bare ampersands inside http(s) URLs as &amp; so Nokogiri
    # does not mangle them; already-escaped &amp; is left alone.
    def clean_content(string)
      string.gsub(%r{https?://([^>]+)}i) do |url|
        # Replacing & with a literal & was a no-op; the fix is to
        # escape it as the &amp; entity.
        url.gsub(/&(?!amp;)/, '&amp;')
      end
    end
    module_function :clean_content
  end
end
|