html-proofer 3.19.4 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/bin/htmlproofer +30 -57
  3. data/lib/html-proofer.rb +1 -54
  4. data/lib/html_proofer/attribute/url.rb +231 -0
  5. data/lib/html_proofer/attribute.rb +15 -0
  6. data/lib/html_proofer/cache.rb +234 -0
  7. data/lib/html_proofer/check/favicon.rb +35 -0
  8. data/lib/html_proofer/check/images.rb +62 -0
  9. data/lib/html_proofer/check/links.rb +118 -0
  10. data/lib/html_proofer/check/open_graph.rb +34 -0
  11. data/lib/html_proofer/check/scripts.rb +38 -0
  12. data/lib/html_proofer/check.rb +91 -0
  13. data/lib/{html-proofer → html_proofer}/configuration.rb +30 -31
  14. data/lib/html_proofer/element.rb +122 -0
  15. data/lib/html_proofer/failure.rb +17 -0
  16. data/lib/{html-proofer → html_proofer}/log.rb +0 -0
  17. data/lib/html_proofer/reporter/cli.rb +29 -0
  18. data/lib/html_proofer/reporter.rb +23 -0
  19. data/lib/html_proofer/runner.rb +245 -0
  20. data/lib/html_proofer/url_validator/external.rb +189 -0
  21. data/lib/html_proofer/url_validator/internal.rb +86 -0
  22. data/lib/html_proofer/url_validator.rb +16 -0
  23. data/lib/{html-proofer → html_proofer}/utils.rb +5 -8
  24. data/lib/{html-proofer → html_proofer}/version.rb +1 -1
  25. data/lib/html_proofer/xpath_functions.rb +10 -0
  26. data/lib/html_proofer.rb +56 -0
  27. metadata +46 -27
  28. data/lib/html-proofer/cache.rb +0 -194
  29. data/lib/html-proofer/check/favicon.rb +0 -29
  30. data/lib/html-proofer/check/html.rb +0 -37
  31. data/lib/html-proofer/check/images.rb +0 -48
  32. data/lib/html-proofer/check/links.rb +0 -182
  33. data/lib/html-proofer/check/opengraph.rb +0 -46
  34. data/lib/html-proofer/check/scripts.rb +0 -42
  35. data/lib/html-proofer/check.rb +0 -75
  36. data/lib/html-proofer/element.rb +0 -265
  37. data/lib/html-proofer/issue.rb +0 -65
  38. data/lib/html-proofer/middleware.rb +0 -82
  39. data/lib/html-proofer/runner.rb +0 -249
  40. data/lib/html-proofer/url_validator.rb +0 -237
data/lib/html-proofer/runner.rb (removed)
@@ -1,249 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module HTMLProofer
4
- class Runner
5
- include HTMLProofer::Utils
6
-
7
- attr_reader :options, :internal_urls, :external_urls, :failures
8
-
9
- def initialize(src, opts = {})
10
- @src = src
11
-
12
- @options = HTMLProofer::Configuration::PROOFER_DEFAULTS.merge(opts)
13
-
14
- @options[:typhoeus] = HTMLProofer::Configuration::TYPHOEUS_DEFAULTS.merge(opts[:typhoeus] || {})
15
- @options[:hydra] = HTMLProofer::Configuration::HYDRA_DEFAULTS.merge(opts[:hydra] || {})
16
-
17
- @options[:parallel] = HTMLProofer::Configuration::PARALLEL_DEFAULTS.merge(opts[:parallel] || {})
18
- @options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.merge(opts[:validation] || {})
19
- @options[:cache] = HTMLProofer::Configuration::CACHE_DEFAULTS.merge(opts[:cache] || {})
20
-
21
- @type = @options.delete(:type)
22
- @logger = HTMLProofer::Log.new(@options[:log_level])
23
- @cache = Cache.new(@logger, @options[:cache])
24
- @internal_link_checks = nil
25
-
26
- # Add swap patterns for internal domains
27
- unless @options[:internal_domains].empty?
28
- @options[:internal_domains].each do |dom|
29
- @options[:url_swap][Regexp.new("^http://#{dom}")] = ''
30
- @options[:url_swap][Regexp.new("^https://#{dom}")] = ''
31
- @options[:url_swap][Regexp.new("^//#{dom}")] = ''
32
- end
33
- end
34
-
35
- @internal_urls = {}
36
- @internal_urls_to_paths = {}
37
- @external_urls = {}
38
- @failures = []
39
- @before_request = []
40
- end
41
-
42
- def run
43
- if @type == :links
44
- @logger.log :info, "Running #{checks} on #{@src}... \n\n"
45
- check_list_of_links unless @options[:disable_external]
46
- else
47
- @logger.log :info, "Running #{checks} on #{@src} on *#{@options[:extension]}... \n\n"
48
- check_files
49
- file_text = pluralize(files.length, 'file', 'files')
50
- @logger.log :info, "Ran on #{file_text}!\n\n"
51
- end
52
-
53
- if @failures.empty?
54
- @logger.log :info, 'HTML-Proofer finished successfully.'
55
- else
56
- @failures.uniq!
57
- print_failed_tests
58
- end
59
- end
60
-
61
- def check_list_of_links
62
- if @options[:url_swap]
63
- @src = @src.map do |url|
64
- swap(url, @options[:url_swap])
65
- end
66
- end
67
- @external_urls = @src.each_with_object({}) do |url, hash|
68
- hash[url] = nil
69
- end
70
- validate_external_urls
71
- end
72
-
73
- # Collects any external URLs found in a directory of files. Also collectes
74
- # every failed test from process_files.
75
- # Sends the external URLs to Typhoeus for batch processing.
76
- def check_files
77
- process_files.each do |item|
78
- @external_urls.merge!(item[:external_urls])
79
- @failures.concat(item[:failures])
80
- end
81
-
82
- # TODO: lazy. if we're checking only external links,
83
- # we'll just trash all the failed tests. really, we should
84
- # just not run those other checks at all.
85
- if @options[:external_only]
86
- @failures = []
87
- validate_external_urls
88
- elsif !@options[:disable_external]
89
- validate_external_urls
90
- validate_internal_urls
91
- else
92
- validate_internal_urls
93
- end
94
- end
95
-
96
- # Walks over each implemented check and runs them on the files, in parallel.
97
- def process_files
98
- if @options[:parallel].empty?
99
- files.map { |path| check_path(path) }
100
- else
101
- Parallel.map(files, @options[:parallel]) { |path| check_path(path) }
102
- end
103
- end
104
-
105
- def check_parsed(html, path)
106
- result = { external_urls: {}, failures: [] }
107
-
108
- @src = [@src] if @type == :file
109
-
110
- @src.each do |src|
111
- checks.each do |klass|
112
- @logger.log :debug, "Checking #{klass.to_s.downcase} on #{path} ..."
113
- check = Object.const_get(klass).new(src, path, html, @logger, @cache, @options)
114
- check.run
115
-
116
- if klass == 'LinkCheck'
117
- @internal_link_checks = check
118
- check.internal_urls.each_pair do |url, internal_urls|
119
- if @internal_urls_to_paths[url]
120
- @internal_urls_to_paths[url].concat(internal_urls.map(&:path))
121
- else
122
- @internal_urls_to_paths[url] = internal_urls.map(&:path)
123
- end
124
- end
125
- @internal_urls.merge!(check.internal_urls)
126
- end
127
-
128
- external_urls = check.external_urls
129
- external_urls = check.external_urls.transform_keys { |url| swap(url, @options[:url_swap]) } if @options[:url_swap]
130
- result[:external_urls].merge!(external_urls)
131
- result[:failures].concat(check.issues)
132
- end
133
- end
134
- result
135
- end
136
-
137
- def check_path(path)
138
- check_parsed(create_nokogiri(path), path)
139
- end
140
-
141
- def validate_external_urls
142
- url_validator = HTMLProofer::UrlValidator.new(@logger, @cache, @external_urls, @options)
143
- url_validator.before_request = @before_request
144
- @failures.concat(url_validator.run)
145
- @external_urls = url_validator.external_urls
146
- end
147
-
148
- def validate_internal_urls
149
- if @cache.use_cache?
150
- urls_to_check = load_internal_cache
151
-
152
- urls_to_check.each_pair do |url, internal_urls|
153
- # pulled from cache
154
- internal_urls = @internal_urls[url] unless internal_urls.first.is_a?(LinkCheck::InternalLink)
155
-
156
- result = @internal_link_checks.check_internal_link(internal_urls.first.link, internal_urls.first.path, internal_urls.first.line, internal_urls.first.content)
157
- code = result ? 200 : 404
158
- @cache.add(url, @internal_urls_to_paths[url].sort, code, '') # TODO: blank msg for now
159
- end
160
- @cache.write
161
- else
162
- @internal_urls.values.flatten.each do |internal_url|
163
- result = @internal_link_checks.check_internal_link(internal_url.link, internal_url.path, internal_url.line, internal_url.content)
164
- next if result
165
-
166
- @failures.concat(@internal_link_checks.issues) unless @internal_link_checks.issues.length.zero?
167
- end
168
- end
169
- end
170
-
171
- def files
172
- @files ||= if @type == :directory
173
- @src.map do |src|
174
- pattern = File.join(src, '**', "*#{@options[:extension]}")
175
- files = Dir.glob(pattern).select { |fn| File.file? fn }
176
- files.reject { |f| ignore_file?(f) }
177
- end.flatten
178
- elsif @type == :file && File.extname(@src) == @options[:extension]
179
- [@src].reject { |f| ignore_file?(f) }
180
- else
181
- []
182
- end
183
- end
184
-
185
- def ignore_file?(file)
186
- @options[:file_ignore].each do |pattern|
187
- return true if pattern.is_a?(String) && pattern == file
188
- return true if pattern.is_a?(Regexp) && pattern =~ file
189
- end
190
-
191
- false
192
- end
193
-
194
- def checks
195
- return @checks if defined?(@checks) && !@checks.nil?
196
-
197
- return (@checks = ['LinkCheck']) if @type == :links
198
-
199
- @checks = HTMLProofer::Check.subchecks.map(&:name)
200
- @checks.delete('FaviconCheck') unless @options[:check_favicon]
201
- @checks.delete('HtmlCheck') unless @options[:check_html]
202
- @checks.delete('OpenGraphCheck') unless @options[:check_opengraph]
203
- @options[:checks_to_ignore].each { |ignored| @checks.delete(ignored) }
204
- @checks
205
- end
206
-
207
- def failed_tests
208
- result = []
209
- return result if @failures.empty?
210
-
211
- @failures.each { |f| result << f.to_s }
212
- result
213
- end
214
-
215
- def print_failed_tests
216
- sorted_failures = SortedIssues.new(@failures, @options[:error_sort], @logger)
217
-
218
- sorted_failures.sort_and_report
219
- count = @failures.length
220
- failure_text = pluralize(count, 'failure', 'failures')
221
- @logger.log :fatal, "\nHTML-Proofer found #{failure_text}!"
222
- exit 1
223
- end
224
-
225
- # Set before_request callback.
226
- #
227
- # @example Set before_request.
228
- # request.before_request { |request| p "yay" }
229
- #
230
- # @param [ Block ] block The block to execute.
231
- #
232
- # @yield [ Typhoeus::Request ]
233
- #
234
- # @return [ Array<Block> ] All before_request blocks.
235
- def before_request(&block)
236
- @before_request ||= []
237
- @before_request << block if block
238
- @before_request
239
- end
240
-
241
- def load_internal_cache
242
- urls_to_check = @cache.retrieve_urls(@internal_urls, :internal)
243
- cache_text = pluralize(urls_to_check.count, 'internal link', 'internal links')
244
- @logger.log :info, "Found #{cache_text} in the cache..."
245
-
246
- urls_to_check
247
- end
248
- end
249
- end
data/lib/html-proofer/url_validator.rb (removed)
@@ -1,237 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'typhoeus'
4
- require 'uri'
5
- require_relative './utils'
6
- require_relative './cache'
7
-
8
- module HTMLProofer
9
- class UrlValidator
10
- include HTMLProofer::Utils
11
-
12
- attr_reader :external_urls
13
- attr_writer :before_request
14
-
15
- def initialize(logger, cache, external_urls, options)
16
- @logger = logger
17
- @external_urls = external_urls
18
- @failed_tests = []
19
- @options = options
20
- @hydra = Typhoeus::Hydra.new(@options[:hydra])
21
- @cache = cache
22
- @before_request = []
23
- end
24
-
25
- def run
26
- @external_urls = remove_query_values
27
-
28
- if @cache.use_cache?
29
- urls_to_check = @cache.retrieve_urls(@external_urls, :external)
30
- external_link_checker(urls_to_check)
31
- @cache.write
32
- else
33
- external_link_checker(@external_urls)
34
- end
35
-
36
- @failed_tests
37
- end
38
-
39
- def remove_query_values
40
- return nil if @external_urls.nil?
41
-
42
- paths_with_queries = {}
43
- iterable_external_urls = @external_urls.dup
44
- @external_urls.each_key do |url|
45
- uri = begin
46
- Addressable::URI.parse(url)
47
- rescue URI::Error, Addressable::URI::InvalidURIError
48
- @logger.log :error, "#{url} is an invalid URL"
49
- nil
50
- end
51
- next if uri.nil? || uri.query.nil?
52
-
53
- iterable_external_urls.delete(url) unless new_url_query_values?(uri, paths_with_queries)
54
- end
55
- iterable_external_urls
56
- end
57
-
58
- # remember queries we've seen, ignore future ones
59
- def new_url_query_values?(uri, paths_with_queries)
60
- queries = uri.query_values.keys.join('-')
61
- domain_path = extract_domain_path(uri)
62
- if paths_with_queries[domain_path].nil?
63
- paths_with_queries[domain_path] = [queries]
64
- true
65
- elsif !paths_with_queries[domain_path].include?(queries)
66
- paths_with_queries[domain_path] << queries
67
- true
68
- else
69
- false
70
- end
71
- end
72
-
73
- def extract_domain_path(uri)
74
- uri.host + uri.path
75
- end
76
-
77
- # Proofer runs faster if we pull out all the external URLs and run the checks
78
- # at the end. Otherwise, we're halting the consuming process for every file during
79
- # `process_files`.
80
- #
81
- # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
82
- #
83
- # Finally, we'll first make a HEAD request, rather than GETing all the contents.
84
- # If the HEAD fails, we'll fall back to GET, as some servers are not configured
85
- # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
86
- # not available as an option.
87
- def external_link_checker(external_urls)
88
- external_urls = external_urls.sort.to_h
89
-
90
- count = external_urls.length
91
- check_text = pluralize(count, 'external link', 'external links')
92
- @logger.log :info, "Checking #{check_text}..."
93
-
94
- # Route log from Typhoeus/Ethon to our own logger
95
- Ethon.logger = @logger
96
-
97
- establish_queue(external_urls)
98
-
99
- @hydra.run
100
- end
101
-
102
- def establish_queue(external_urls)
103
- external_urls.each_pair do |url, filenames|
104
- url = begin
105
- clean_url(url)
106
- rescue URI::Error, Addressable::URI::InvalidURIError
107
- add_external_issue(filenames, "#{url} is an invalid URL")
108
- next
109
- end
110
-
111
- method = if hash?(url) && @options[:check_external_hash]
112
- :get
113
- else
114
- :head
115
- end
116
- queue_request(method, url, filenames)
117
- end
118
- end
119
-
120
- def clean_url(href)
121
- # catch any obvious issues, like strings in port numbers
122
- parsed = Addressable::URI.parse(href)
123
- if href =~ /^([!#{Regexp.last_match(0)}-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/
124
- href
125
- else
126
- parsed.normalize
127
- end
128
- end
129
-
130
- def queue_request(method, href, filenames)
131
- opts = @options[:typhoeus].merge(method: method)
132
- request = Typhoeus::Request.new(href, opts)
133
- @before_request.each do |callback|
134
- callback.call(request)
135
- end
136
- request.on_complete { |response| response_handler(response, filenames) }
137
- @hydra.queue request
138
- end
139
-
140
- def response_handler(response, filenames)
141
- effective_url = response.options[:effective_url]
142
- href = response.request.base_url.to_s
143
- method = response.request.options[:method]
144
- response_code = response.code
145
- response.body.delete!("\x00")
146
-
147
- debug_msg = if filenames.nil?
148
- "Received a #{response_code} for #{href}"
149
- else
150
- "Received a #{response_code} for #{href} in #{filenames.join(' ')}"
151
- end
152
-
153
- @logger.log :debug, debug_msg
154
-
155
- return if @options[:http_status_ignore].include?(response_code)
156
-
157
- if response_code.between?(200, 299)
158
- @cache.add(href, filenames, response_code) unless check_hash_in_2xx_response(href, effective_url, response, filenames)
159
- elsif response.timed_out?
160
- handle_timeout(href, filenames, response_code)
161
- elsif response_code.zero?
162
- handle_failure(effective_url, filenames, response_code, response.return_message)
163
- elsif method == :head
164
- queue_request(:get, href, filenames)
165
- else
166
- return if @options[:only_4xx] && !response_code.between?(400, 499)
167
-
168
- # Received a non-successful http response.
169
- msg = "External link #{href} failed: #{response_code} #{response.return_message}"
170
- add_external_issue(filenames, msg, response_code)
171
- @cache.add(href, filenames, response_code, msg)
172
- end
173
- end
174
-
175
- # Even though the response was a success, we may have been asked to check
176
- # if the hash on the URL exists on the page
177
- def check_hash_in_2xx_response(href, effective_url, response, filenames)
178
- return false if @options[:only_4xx]
179
- return false unless @options[:check_external_hash]
180
- return false unless (hash = hash?(href))
181
-
182
- body_doc = create_nokogiri(response.body)
183
-
184
- unencoded_hash = Addressable::URI.unescape(hash)
185
- xpath = [%(//*[@name="#{hash}"]|/*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
186
- # user-content is a special addition by GitHub.
187
- if URI.parse(href).host =~ /github\.com/i
188
- xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])]
189
- # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
190
- # will be identified as a linkable portion
191
- xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d)+/
192
- end
193
-
194
- return unless body_doc.xpath(xpath.join('|')).empty?
195
-
196
- msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
197
- add_external_issue(filenames, msg, response.code)
198
- @cache.add(href, filenames, response.code, msg)
199
- true
200
- end
201
-
202
- def handle_timeout(href, filenames, response_code)
203
- msg = "External link #{href} failed: got a time out (response code #{response_code})"
204
- @cache.add(href, filenames, 0, msg)
205
- return if @options[:only_4xx]
206
-
207
- add_external_issue(filenames, msg, response_code)
208
- end
209
-
210
- def handle_failure(href, filenames, response_code, return_message)
211
- msg = "External link #{href} failed: response code #{response_code} means something's wrong.
212
- It's possible libcurl couldn't connect to the server or perhaps the request timed out.
213
- Sometimes, making too many requests at once also breaks things.
214
- Either way, the return message (if any) from the server is: #{return_message}"
215
- @cache.add(href, filenames, 0, msg)
216
- return if @options[:only_4xx]
217
-
218
- add_external_issue(filenames, msg, response_code)
219
- end
220
-
221
- def add_external_issue(filenames, desc, status = nil)
222
- # possible if we're checking an array of links
223
- if filenames.nil?
224
- @failed_tests << Issue.new('', desc, status: status)
225
- else
226
- filenames.each { |f| @failed_tests << Issue.new(f, desc, status: status) }
227
- end
228
- end
229
-
230
- # Does the URL have a hash?
231
- def hash?(url)
232
- URI.parse(url).fragment
233
- rescue URI::InvalidURIError
234
- false
235
- end
236
- end
237
- end