html-proofer 3.19.4 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/htmlproofer +44 -59
- data/lib/html-proofer.rb +1 -54
- data/lib/html_proofer/attribute/url.rb +251 -0
- data/lib/html_proofer/attribute.rb +15 -0
- data/lib/html_proofer/cache.rb +292 -0
- data/lib/html_proofer/check/favicon.rb +43 -0
- data/lib/html_proofer/check/images.rb +99 -0
- data/lib/html_proofer/check/links.rb +135 -0
- data/lib/html_proofer/check/open_graph.rb +42 -0
- data/lib/html_proofer/check/scripts.rb +49 -0
- data/lib/html_proofer/check.rb +94 -0
- data/lib/html_proofer/configuration.rb +91 -0
- data/lib/html_proofer/element.rb +144 -0
- data/lib/html_proofer/failure.rb +17 -0
- data/lib/{html-proofer → html_proofer}/log.rb +19 -19
- data/lib/html_proofer/reporter/cli.rb +33 -0
- data/lib/html_proofer/reporter.rb +23 -0
- data/lib/html_proofer/runner.rb +244 -0
- data/lib/html_proofer/url_validator/external.rb +193 -0
- data/lib/html_proofer/url_validator/internal.rb +97 -0
- data/lib/html_proofer/url_validator.rb +16 -0
- data/lib/{html-proofer → html_proofer}/utils.rb +9 -12
- data/lib/{html-proofer → html_proofer}/version.rb +1 -1
- data/lib/html_proofer/xpath_functions.rb +10 -0
- data/lib/html_proofer.rb +59 -0
- metadata +42 -22
- data/lib/html-proofer/cache.rb +0 -194
- data/lib/html-proofer/check/favicon.rb +0 -29
- data/lib/html-proofer/check/html.rb +0 -37
- data/lib/html-proofer/check/images.rb +0 -48
- data/lib/html-proofer/check/links.rb +0 -182
- data/lib/html-proofer/check/opengraph.rb +0 -46
- data/lib/html-proofer/check/scripts.rb +0 -42
- data/lib/html-proofer/check.rb +0 -75
- data/lib/html-proofer/configuration.rb +0 -88
- data/lib/html-proofer/element.rb +0 -265
- data/lib/html-proofer/issue.rb +0 -65
- data/lib/html-proofer/middleware.rb +0 -82
- data/lib/html-proofer/runner.rb +0 -249
- data/lib/html-proofer/url_validator.rb +0 -237
|
# frozen_string_literal: true

module HTMLProofer
  # Coordinates a whole proofer run: gathers files (or a list of links),
  # executes every enabled check, collects internal/external URLs, hands them
  # to the URL validators, and reports the accumulated failures.
  class Runner
    include HTMLProofer::Utils

    attr_reader :options, :cache, :logger, :internal_urls, :external_urls, :checked_paths, :current_check
    attr_accessor :current_filename, :current_source, :reporter

    # The two URL categories tracked per file; drives the ivar merging in #check_files.
    URL_TYPES = [:external, :internal].freeze

    # @param src [String, Array] file path, directory list, or link list, depending on :type
    # @param opts [Hash] user options, merged over Configuration defaults
    def initialize(src, opts = {})
      @options = HTMLProofer::Configuration.generate_defaults(opts)

      # :type (:file, :directory, :links) is consumed here and not passed on.
      @type = @options.delete(:type)
      @source = src

      @logger = HTMLProofer::Log.new(@options[:log_level])
      @cache = Cache.new(self, @options[:cache])

      @external_urls = {}
      @internal_urls = {}
      @failures = []

      @before_request = []

      @checked_paths = {}

      @current_check = nil
      @current_source = nil
      @current_filename = nil

      @reporter = Reporter::Cli.new(logger: @logger)
    end

    # Entry point: runs either the link-list check or the file checks, writes
    # the cache, and reports. Exits the process (via #report_failed_checks)
    # when any failure remains.
    def run
      check_text = pluralize(checks.length, "check", "checks")

      if @type == :links
        @logger.log(:info, "Running #{check_text} (#{format_checks_list(checks)}) on #{@source} ... \n\n")
        check_list_of_links unless @options[:disable_external]
      else
        @logger.log(:info,
          "Running #{check_text} (#{format_checks_list(checks)}) in #{@source} on *#{@options[:extensions].join(", ")} files...\n\n")

        check_files
        @logger.log(:info, "Ran on #{pluralize(files.length, "file", "files")}!\n\n")
      end

      @cache.write

      @reporter.failures = @failures

      if @failures.empty?
        @logger.log(:info, "HTML-Proofer finished successfully.")
      else
        @failures.uniq!
        report_failed_checks
      end
    end

    # For :links runs: treats @source as a plain array of links, normalizes
    # each through Attribute::Url (no base URL), and validates them externally.
    def check_list_of_links
      @external_urls = @source.uniq.each_with_object({}) do |link, hash|
        url = Attribute::Url.new(self, link, base_url: nil).to_s

        # No per-file metadata exists for a bare link list.
        hash[url] = []
      end

      validate_external_urls
    end

    # Walks over each implemented check and runs them on the files, in parallel.
    # Sends the collected external URLs to Typhoeus for batch processing.
    def check_files
      process_files.each do |result|
        URL_TYPES.each do |url_type|
          type = :"#{url_type}_urls"
          ivar_name = "@#{type}"
          ivar = instance_variable_get(ivar_name)

          if ivar.empty?
            # First result: adopt its hash wholesale.
            instance_variable_set(ivar_name, result[type])
          else
            # Subsequent results: merge metadata lists per URL.
            result[type].each do |url, metadata|
              ivar[url] = [] if ivar[url].nil?
              ivar[url].concat(metadata)
            end
          end
        end
        @failures.concat(result[:failures])
      end

      validate_external_urls unless @options[:disable_external]

      validate_internal_urls
    end

    # Loads and checks every file, optionally via the parallel gem when
    # options[:parallel][:enable] is set. Returns one result hash per file.
    def process_files
      if @options[:parallel][:enable]
        Parallel.map(files, @options[:parallel]) { |file| load_file(file[:path], file[:source]) }
      else
        files.map do |file|
          load_file(file[:path], file[:source])
        end
      end
    end

    # Parses one file into a Nokogiri document and runs all checks on it.
    def load_file(path, source)
      @html = create_nokogiri(path)
      check_parsed(path, source)
    end

    # Runs every enabled check class against the parsed document. Collects
    # the internal/external URLs and failures each check produced into a
    # single result hash for the caller (process_files) to merge.
    def check_parsed(path, source)
      result = { internal_urls: {}, external_urls: {}, failures: [] }

      checks.each do |klass|
        @current_source = source
        @current_filename = path

        # checks holds class *names* (strings); resolve to the class here.
        check = Object.const_get(klass).new(self, @html)
        @logger.log(:debug, "Running #{check.short_name} in #{path}")

        @current_check = check

        check.run

        # Same URL reported by several checks: concatenate metadata lists.
        result[:external_urls].merge!(check.external_urls) { |_key, old, current| old.concat(current) }
        result[:internal_urls].merge!(check.internal_urls) { |_key, old, current| old.concat(current) }
        result[:failures].concat(check.failures)
      end
      result
    end

    # Runs the external URL validator over the collected external URLs,
    # forwarding any registered before_request callbacks.
    def validate_external_urls
      external_url_validator = HTMLProofer::UrlValidator::External.new(self, @external_urls)
      external_url_validator.before_request = @before_request
      @failures.concat(external_url_validator.validate)
    end

    # Runs the internal URL validator over the collected internal URLs.
    def validate_internal_urls
      internal_link_validator = HTMLProofer::UrlValidator::Internal.new(self, @internal_urls)
      @failures.concat(internal_link_validator.validate)
    end

    # Resolves @source into [{ source:, path: }] entries, filtered by the
    # configured extensions and ignore patterns. Memoized.
    def files
      @files ||= if @type == :directory
        @source.map do |src|
          pattern = File.join(src, "**", "*{#{@options[:extensions].join(",")}}")
          Dir.glob(pattern).select do |f|
            File.file?(f) && !ignore_file?(f)
          end.map { |f| { source: src, path: f } }
        end.flatten
      elsif @type == :file && @options[:extensions].include?(File.extname(@source))
        [@source].reject { |f| ignore_file?(f) }.map { |f| { source: f, path: f } }
      else
        []
      end
    end

    # True when +file+ matches any :ignore_files entry (exact string match
    # or regexp match).
    def ignore_file?(file)
      @options[:ignore_files].each do |pattern|
        return true if pattern.is_a?(String) && pattern == file
        return true if pattern.is_a?(Regexp) && pattern =~ file
      end

      false
    end

    # Whether subresource-integrity checking is enabled.
    def check_sri?
      @options[:check_sri]
    end

    # Whether http:// links should be reported as failures.
    def enforce_https?
      @options[:enforce_https]
    end

    # The enabled check class names for this run. For :links runs only the
    # link check applies; otherwise the set is derived from the options.
    # Memoized after first computation.
    def checks
      return @checks if defined?(@checks) && !@checks.nil?

      return (@checks = ["LinkCheck"]) if @type == :links

      @checks = HTMLProofer::Check.subchecks(@options).map(&:name)

      @checks
    end

    # Only the Failure instances out of whatever the reporter holds.
    def failed_checks
      @reporter.failures.flatten.select { |f| f.is_a?(Failure) }
    end

    # Prints the report, logs a fatal summary, and terminates the process
    # with a non-zero exit status.
    def report_failed_checks
      @reporter.report

      failure_text = pluralize(@failures.length, "failure", "failures")
      @logger.log(:fatal, "\nHTML-Proofer found #{failure_text}!")
      exit(1)
    end

    # Set before_request callback.
    #
    # @example Set before_request.
    #   request.before_request { |request| p "yay" }
    #
    # @param [ Block ] block The block to execute.
    #
    # @yield [ Typhoeus::Request ]
    #
    # @return [ Array<Block> ] All before_request blocks.
    def before_request(&block)
      @before_request ||= [] # defensive; initialize already sets this
      @before_request << block if block
      @before_request
    end

    # Cached internal URLs still needing a check (see #load_cache).
    def load_internal_cache
      load_cache(:internal)
    end

    # Cached external URLs still needing a check (see #load_cache).
    def load_external_cache
      load_cache(:external)
    end

    # Logs the cache hit count for +type+ (:internal or :external) and asks
    # the cache which of the collected URLs still need to be re-checked.
    private def load_cache(type)
      ivar = instance_variable_get("@#{type}_urls")

      existing_urls_count = @cache.size(type)
      cache_text = pluralize(existing_urls_count, "#{type} link", "#{type} links")
      @logger.log(:debug, "Found #{cache_text} in the cache")

      urls_to_check = @cache.retrieve_urls(ivar, type)

      urls_to_check
    end

    # Human-readable, comma-separated check names without the namespace prefix.
    private def format_checks_list(checks)
      checks.map do |check|
        check.sub(/HTMLProofer::Check::/, "")
      end.join(", ")
    end
  end
end
|
# frozen_string_literal: true

require "typhoeus"
require "uri"

module HTMLProofer
  class UrlValidator
    # Validates external URLs in bulk via Typhoeus::Hydra, caching results
    # and recording Failure objects for anything unreachable or broken.
    class External < UrlValidator
      include HTMLProofer::Utils

      attr_reader :external_urls
      attr_writer :before_request

      # @param runner [HTMLProofer::Runner]
      # @param external_urls [Hash{String => Array<Hash>}] URL => metadata list
      def initialize(runner, external_urls)
        super(runner)

        @external_urls = external_urls
        @hydra = Typhoeus::Hydra.new(@runner.options[:hydra])
        @before_request = []

        # domain+path => seen query-key signatures; see #new_url_query_values?
        @paths_with_queries = {}
      end

      # Checks every external URL (or only the cache's stale subset when the
      # external cache is enabled) and returns the accumulated Failure list.
      def validate
        urls_to_check = @cache.external_enabled? ? @runner.load_external_cache : @external_urls
        urls_detected = pluralize(urls_to_check.count, "external link", "external links")
        @logger.log(:info, "Checking #{urls_detected}")

        run_external_link_checker(urls_to_check)

        @failed_checks
      end

      # Proofer runs faster if we pull out all the external URLs and run the checks
      # at the end. Otherwise, we're halting the consuming process for every file during
      # `process_files`.
      #
      # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
      #
      # Finally, we'll first make a HEAD request, rather than GETing all the contents.
      # If the HEAD fails, we'll fall back to GET, as some servers are not configured
      # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
      # not available as an option.
      def run_external_link_checker(external_urls)
        # Route log from Typhoeus/Ethon to our own logger
        Ethon.logger = @logger

        external_urls.each_pair do |external_url, metadata|
          url = Attribute::Url.new(@runner, external_url, base_url: nil)

          unless url.valid?
            add_failure(metadata, "#{url} is an invalid URL", 0)
            next
          end

          # Skip URLs whose domain+path+query-key combination was already seen.
          next unless new_url_query_values?(url)

          # Hash checking requires the body, so force GET in that case.
          method = if @runner.options[:check_external_hash] && url.hash?
            :get
          else
            :head
          end

          queue_request(method, url, metadata)
        end

        # Blocks until every queued request (including GET retries queued from
        # response handlers) has completed.
        @hydra.run
      end

      # Builds a Typhoeus request for +url+, runs the registered
      # before_request callbacks on it, wires up the completion handler, and
      # queues it on the hydra.
      def queue_request(method, url, filenames)
        opts = @runner.options[:typhoeus].merge(method: method)
        request = Typhoeus::Request.new(url.url, opts)
        @before_request.each do |callback|
          callback.call(request)
        end
        request.on_complete { |response| response_handler(response, url, filenames) }
        @hydra.queue(request)
      end

      # Classifies one completed response: success (with optional hash
      # verification), timeout, connection failure, HEAD-unsupported retry,
      # or plain HTTP failure. Records to cache and failures accordingly.
      def response_handler(response, url, filenames)
        method = response.request.options[:method]
        href = response.request.base_url.to_s
        response_code = response.code
        # Strip NUL bytes so downstream parsing/logging can't choke on them.
        response.body.delete!("\x00")

        @logger.log(:debug, "Received a #{response_code} for #{href}")

        return if @runner.options[:ignore_status_codes].include?(response_code)

        if response_code.between?(200, 299)
          # Only cache as OK when the hash check (if requested) didn't fail;
          # a failed hash check caches its own failure entry.
          @cache.add_external(href, filenames, response_code, "OK", true) unless check_hash_in_2xx_response(href, url,
            response, filenames)
        elsif response.timed_out?
          handle_timeout(href, filenames, response_code)
        elsif response_code.zero?
          # libcurl reports code 0 when no HTTP exchange happened at all.
          handle_connection_failure(href, filenames, response_code, response.status_message)
        elsif method == :head # some servers don't support HEAD
          queue_request(:get, url, filenames)
        else
          return if @runner.options[:only_4xx] && !response_code.between?(400, 499)

          # Received a non-successful http response.
          status_message = blank?(response.status_message) ? "" : ": #{response.status_message}"
          msg = "External link #{href} failed#{status_message}"
          add_failure(filenames, msg, response_code)
          @cache.add_external(href, filenames, response_code, msg, false)
        end
      end

      # Even though the response was a success, we may have been asked to check
      # if the hash on the URL exists on the page.
      # Returns true when a hash check was performed AND failed (so the caller
      # skips the OK cache entry); false/nil otherwise.
      def check_hash_in_2xx_response(href, url, response, filenames)
        return false if @runner.options[:only_4xx]
        return false unless @runner.options[:check_external_hash]
        return false unless url.hash?

        hash = url.hash

        body_doc = create_nokogiri(response.body)

        # Look for the fragment both as given and percent-decoded, by name or id.
        unencoded_hash = Addressable::URI.unescape(hash)
        xpath = [%(//*[@name="#{hash}"]|/*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
        # user-content is a special addition by GitHub.
        if url.host =~ /github\.com/i
          xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])]
          # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
          # will be identified as a linkable portion
          # NOTE(review): the (L\d)+ capture group retains only the final
          # repetition it matched (e.g. "L1" for "L12") -- confirm this is the
          # intended anchor id.
          xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d)+/
        end

        # Fragment found: nothing to report (returns nil, treated as falsy).
        return unless body_doc.xpath(xpath.join("|")).empty?

        msg = "External link #{href} failed: #{url.sans_hash} exists, but the hash '#{hash}' does not"
        add_failure(filenames, msg, response.code)
        @cache.add_external(href, filenames, response.code, msg, false)
        true
      end

      # Caches a timeout result (always) and records a failure unless the
      # run is restricted to 4xx-only reporting.
      def handle_timeout(href, filenames, response_code)
        msg = "External link #{href} failed: got a time out (response code #{response_code})"
        @cache.add_external(href, filenames, 0, msg, false)
        return if @runner.options[:only_4xx]

        add_failure(filenames, msg, response_code)
      end

      # Caches and (unless 4xx-only) reports a connection-level failure,
      # appending the server's status message when one exists.
      def handle_connection_failure(href, metadata, response_code, status_message)
        msgs = [<<~MSG,
          External link #{href} failed with something very wrong.
          It's possible libcurl couldn't connect to the server, or perhaps the request timed out.
          Sometimes, making too many requests at once also breaks things.
        MSG
        ]

        msgs << "Either way, the return message from the server is: #{status_message}" unless blank?(status_message)

        msg = msgs.join("\n").chomp

        @cache.add_external(href, metadata, 0, msg, false)
        return if @runner.options[:only_4xx]

        add_failure(metadata, msg, response_code)
      end

      # Records a Failure for every metadata entry, or a single file-less
      # Failure when no metadata is available.
      def add_failure(metadata, description, status = nil)
        if blank?(metadata) # possible if we're checking an array of links
          @failed_checks << Failure.new("", "Links > External", description, status: status)
        else
          metadata.each do |m|
            @failed_checks << Failure.new(m[:filename], "Links > External", description, line: m[:line], status: status)
          end
        end
      end

      # remember queries we've seen, ignore future ones
      private def new_url_query_values?(url)
        return true if (query_values = url.query_values).nil?

        # Signature is the joined query *keys*, not values, per domain+path.
        queries = query_values.keys.join("-")
        domain_path = url.domain_path
        if @paths_with_queries[domain_path].nil?
          @paths_with_queries[domain_path] = [queries]
          true
        elsif !@paths_with_queries[domain_path].include?(queries)
          @paths_with_queries[domain_path] << queries
          true
        else
          false
        end
      end
    end
  end
end
|
# frozen_string_literal: true

module HTMLProofer
  class UrlValidator
    # Validates internal links: checks that the target file exists on disk
    # and, optionally, that the fragment (#hash) resolves to an element in it.
    class Internal < UrlValidator
      attr_reader :internal_urls

      # @param runner [HTMLProofer::Runner]
      # @param internal_urls [Hash{String => Array<Hash>}] link => metadata list
      def initialize(runner, internal_urls)
        super(runner)

        @internal_urls = internal_urls
      end

      # Checks every internal URL (or only the cache's stale subset when the
      # internal cache is enabled) and returns the accumulated Failure list.
      def validate
        urls_to_check = @cache.internal_enabled? ? @runner.load_internal_cache : @internal_urls
        urls_detected = pluralize(urls_to_check.count, "internal link", "internal links")
        @logger.log(:info, "Checking #{urls_detected}")

        run_internal_link_checker(urls_to_check)

        @failed_checks
      end

      # For each link occurrence: verify the file exists, then the hash (if
      # any); record a Failure for each problem. Cache writes are deferred to
      # after the loop (see the comment below).
      def run_internal_link_checker(links)
        to_add = []
        links.each_pair do |link, matched_files|
          matched_files.each do |metadata|
            url = HTMLProofer::Attribute::Url.new(@runner, link, base_url: metadata[:base_url])

            # Keep the runner's notion of "current" file in sync for helpers
            # that read it (e.g. URL resolution).
            @runner.current_source = metadata[:source]
            @runner.current_filename = metadata[:filename]

            unless file_exists?(url)
              @failed_checks << Failure.new(@runner.current_filename, "Links > Internal",
                "internally linking to #{url}, which does not exist", line: metadata[:line], status: nil, content: nil)
              to_add << [url, metadata, false]
              next
            end

            unless hash_exists?(url)
              @failed_checks << Failure.new(@runner.current_filename, "Links > Internal",
                "internally linking to #{url}; the file exists, but the hash '#{url.hash}' does not", line: metadata[:line], status: nil, content: nil)
              to_add << [url, metadata, false]
              next
            end

            to_add << [url, metadata, true]
          end
        end

        # adding directly to the cache above results in an endless loop
        to_add.each do |(url, metadata, exists)|
          @cache.add_internal(url.to_s, metadata, exists)
        end

        @failed_checks
      end

      # Whether the link's target file exists, memoized on the runner's
      # checked_paths so each path hits the filesystem at most once.
      private def file_exists?(url)
        absolute_path = url.absolute_path
        return @runner.checked_paths[url.absolute_path] if @runner.checked_paths.key?(absolute_path)

        @runner.checked_paths[url.absolute_path] = File.exist?(absolute_path)
      end

      # verify the target hash
      private def hash_exists?(url)
        href_hash = url.hash
        return true if blank?(href_hash)
        return true unless @runner.options[:check_internal_hash]

        # prevents searching files we didn't ask about
        return false unless url.known_extension?
        return false unless url.has_hash?

        # Search for both the raw and the percent-decoded fragment id.
        decoded_href_hash = Addressable::URI.unescape(href_hash)
        fragment_ids = [href_hash, decoded_href_hash]
        # https://www.w3.org/TR/html5/single-page.html#scroll-to-fragid
        fragment_ids.include?("top") || !find_fragments(fragment_ids, url).empty?
      end

      # Parses the target file and returns every element whose id or name
      # matches one of +fragment_ids+ (case-sensitive, via XpathFunctions).
      private def find_fragments(fragment_ids, url)
        xpaths = fragment_ids.uniq.flat_map do |frag_id|
          # Split on single quotes so fragments containing ' can still be
          # expressed inside an XPath concat() expression.
          escaped_frag_id = "'#{frag_id.split("'").join("', \"'\", '")}', ''"
          [
            "//*[case_sensitive_equals(@id, concat(#{escaped_frag_id}))]",
            "//*[case_sensitive_equals(@name, concat(#{escaped_frag_id}))]",
          ]
        end
        # Last argument to Nokogiri's #xpath provides the custom function handler.
        xpaths << XpathFunctions.new

        html = create_nokogiri(url.absolute_path)
        html.xpath(*xpaths)
      end
    end
  end
end
|
# frozen_string_literal: true

module HTMLProofer
  # Common base class for the Internal and External URL validators. It
  # captures the collaborators every validator needs -- the active runner,
  # its cache and logger -- and starts an empty list of failed checks.
  class UrlValidator
    include HTMLProofer::Utils

    # @param runner [HTMLProofer::Runner] the run this validator belongs to
    def initialize(runner)
      @runner = runner
      @cache = runner.cache
      @logger = runner.logger
      @failed_checks = []
    end
  end
end
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require
|
|
3
|
+
require "nokogiri"
|
|
4
4
|
|
|
5
5
|
module HTMLProofer
|
|
6
6
|
module Utils
|
|
@@ -8,21 +8,18 @@ module HTMLProofer
|
|
|
8
8
|
"#{count} #{count == 1 ? single : plural}"
|
|
9
9
|
end
|
|
10
10
|
|
|
11
|
+
# True when +obj+ is nil, or when it responds to #empty? and is empty
# (covers "", [], {}). Mirrors the familiar Rails-style predicate.
def blank?(obj)
  return true if obj.nil?

  obj.empty?
end
|
|
14
|
+
|
|
11
15
|
# Builds a Nokogiri HTML5 document from +path+. When +path+ names an
# existing regular file (not a directory) its contents are read; otherwise
# +path+ itself is treated as raw HTML markup. max_errors: -1 disables the
# parser's error cap.
def create_nokogiri(path)
  content = path
  content = File.read(path) if File.exist?(path) && !File.directory?(path)

  Nokogiri::HTML5(content, max_errors: -1)
end
|
|
20
|
-
|
|
21
|
-
def swap(href, replacement)
|
|
22
|
-
replacement.each do |link, replace|
|
|
23
|
-
href = href.gsub(link, replace)
|
|
24
|
-
end
|
|
25
|
-
href
|
|
26
|
-
end
|
|
27
24
|
end
|
|
28
25
|
end
|
data/lib/html_proofer.rb
ADDED
|
# frozen_string_literal: true

require "zeitwerk"
lib_dir = File.join(File.dirname(__dir__), "lib")
gem_loader = Zeitwerk::Loader.for_gem
# Map the html_proofer directory/file names to the HTMLProofer constant
# (default inflection would produce HtmlProofer).
gem_loader.inflector.inflect(
  "html_proofer" => "HTMLProofer"
)
# The legacy hyphenated entry file is kept for compatibility only; keep it
# out of autoloading.
gem_loader.ignore(File.join(lib_dir, "html-proofer.rb"))
gem_loader.setup

require "html_proofer/version"

require "parallel"
require "fileutils"

# NOTE: ENV values are strings, so any set DEBUG value -- even "false" --
# is truthy and enables the debugging requires.
if ENV.fetch("DEBUG", false)
  require "awesome_print"
  require "debug"
end
+
module HTMLProofer
  # Public entry points. Each helper validates its argument's shape, tags
  # the options hash with the run's :type, and returns a configured Runner
  # (callers invoke Runner#run themselves).

  # Builds a Runner for a single file.
  #
  # Raises ArgumentError when +file+ is not a String or does not exist.
  def self.check_file(file, options = {})
    raise ArgumentError unless file.is_a?(String)
    raise ArgumentError, "#{file} does not exist" unless File.exist?(file)

    options[:type] = :file
    HTMLProofer::Runner.new(file, options)
  end

  # Builds a Runner for one directory (wrapped in an Array for the Runner).
  #
  # Raises ArgumentError when +directory+ is not a String or does not exist.
  def self.check_directory(directory, options = {})
    raise ArgumentError unless directory.is_a?(String)
    raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)

    options[:type] = :directory
    HTMLProofer::Runner.new([directory], options)
  end

  # Builds a Runner spanning several directories.
  #
  # Raises ArgumentError when +directories+ is not an Array, or naming the
  # first entry that does not exist on disk.
  def self.check_directories(directories, options = {})
    raise ArgumentError unless directories.is_a?(Array)

    options[:type] = :directory
    missing = directories.find { |directory| !Dir.exist?(directory) }
    raise ArgumentError, "#{missing} does not exist" unless missing.nil?

    HTMLProofer::Runner.new(directories, options)
  end

  # Builds a Runner that checks a plain Array of links (no files involved).
  #
  # Raises ArgumentError when +links+ is not an Array.
  def self.check_links(links, options = {})
    raise ArgumentError unless links.is_a?(Array)

    options[:type] = :links
    HTMLProofer::Runner.new(links, options)
  end
end
|
|
58
|
+
|
|
59
|
+
# Eagerly load every autoloadable constant now, so all check classes are
# defined before any run begins.
gem_loader.eager_load
|