html-proofer 3.19.2 → 4.0.0.rc3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/bin/htmlproofer +31 -57
  3. data/lib/html-proofer.rb +1 -54
  4. data/lib/html_proofer/attribute/url.rb +231 -0
  5. data/lib/html_proofer/attribute.rb +15 -0
  6. data/lib/html_proofer/cache.rb +236 -0
  7. data/lib/html_proofer/check/favicon.rb +35 -0
  8. data/lib/html_proofer/check/images.rb +62 -0
  9. data/lib/html_proofer/check/links.rb +118 -0
  10. data/lib/html_proofer/check/open_graph.rb +34 -0
  11. data/lib/html_proofer/check/scripts.rb +38 -0
  12. data/lib/html_proofer/check.rb +91 -0
  13. data/lib/{html-proofer → html_proofer}/configuration.rb +30 -30
  14. data/lib/html_proofer/element.rb +122 -0
  15. data/lib/html_proofer/failure.rb +17 -0
  16. data/lib/{html-proofer → html_proofer}/log.rb +0 -0
  17. data/lib/html_proofer/reporter/cli.rb +29 -0
  18. data/lib/html_proofer/reporter.rb +23 -0
  19. data/lib/html_proofer/runner.rb +245 -0
  20. data/lib/html_proofer/url_validator/external.rb +189 -0
  21. data/lib/html_proofer/url_validator/internal.rb +86 -0
  22. data/lib/html_proofer/url_validator.rb +16 -0
  23. data/lib/{html-proofer → html_proofer}/utils.rb +6 -9
  24. data/lib/{html-proofer → html_proofer}/version.rb +1 -1
  25. data/lib/html_proofer/xpath_functions.rb +10 -0
  26. data/lib/html_proofer.rb +55 -0
  27. metadata +51 -30
  28. data/lib/html-proofer/cache.rb +0 -194
  29. data/lib/html-proofer/check/favicon.rb +0 -29
  30. data/lib/html-proofer/check/html.rb +0 -37
  31. data/lib/html-proofer/check/images.rb +0 -48
  32. data/lib/html-proofer/check/links.rb +0 -182
  33. data/lib/html-proofer/check/opengraph.rb +0 -46
  34. data/lib/html-proofer/check/scripts.rb +0 -42
  35. data/lib/html-proofer/check.rb +0 -75
  36. data/lib/html-proofer/element.rb +0 -261
  37. data/lib/html-proofer/issue.rb +0 -65
  38. data/lib/html-proofer/middleware.rb +0 -82
  39. data/lib/html-proofer/runner.rb +0 -248
  40. data/lib/html-proofer/url_validator.rb +0 -237
@@ -1,248 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module HTMLProofer
4
- class Runner
5
- include HTMLProofer::Utils
6
-
7
- attr_reader :options, :internal_urls, :external_urls, :failures
8
-
9
# Builds a runner for +src+.
#
# @param src  the input to proof; its shape depends on the :type option
#             (see how #files and #check_list_of_links read @src).
# @param opts [Hash] user options, merged over the configuration defaults.
#             The nested :typhoeus, :hydra, :parallel, :validation and :cache
#             hashes are each merged over their own defaults.
def initialize(src, opts = {})
  @src = src

  @options = HTMLProofer::Configuration::PROOFER_DEFAULTS.merge(opts)

  {
    typhoeus: HTMLProofer::Configuration::TYPHOEUS_DEFAULTS,
    hydra: HTMLProofer::Configuration::HYDRA_DEFAULTS,
    parallel: HTMLProofer::Configuration::PARALLEL_DEFAULTS,
    validation: HTMLProofer::Configuration::VALIDATION_DEFAULTS,
    cache: HTMLProofer::Configuration::CACHE_DEFAULTS
  }.each do |key, defaults|
    @options[key] = defaults.merge(opts[key] || {})
  end

  @type = @options.delete(:type)
  @logger = HTMLProofer::Log.new(@options[:log_level])
  @cache = Cache.new(@logger, @options[:cache])
  @internal_link_checks = nil

  # Add swap patterns for internal domains.
  unless @options[:internal_domains].empty?
    # Hash#merge is shallow, so :url_swap is still the caller's hash (or the
    # shared defaults object). Work on a copy so adding patterns here does not
    # leak into the caller's options or into later Runner instances.
    @options[:url_swap] = (@options[:url_swap] || {}).dup

    # NOTE(review): the domain is interpolated unescaped, so "." matches any
    # character. Left as-is in case callers pass regex fragments; confirm.
    @options[:internal_domains].each do |dom|
      ['http://', 'https://', '//'].each do |prefix|
        @options[:url_swap][Regexp.new("^#{prefix}#{dom}")] = ''
      end
    end
  end

  @internal_urls = {}
  @internal_urls_to_paths = {}
  @external_urls = {}
  @failures = []
  @before_request = []
end
41
-
42
# Runs the configured checks against the source and reports the outcome.
#
# A :links source only gets external validation (skipped when
# :disable_external is set); any other type is processed as files.
# Logs a success message when no failures were collected, otherwise
# delegates to print_failed_tests.
def run
  case @type
  when :links
    @logger.log :info, "Running #{checks} on #{@src}... \n\n"
    check_list_of_links unless @options[:disable_external]
  else
    @logger.log :info, "Running #{checks} on #{@src} on *#{@options[:extension]}... \n\n"
    check_files
    summary = pluralize(files.length, 'file', 'files')
    @logger.log :info, "Ran on #{summary}!\n\n"
  end

  return print_failed_tests unless @failures.empty?

  @logger.log :info, 'HTML-Proofer finished successfully.'
end
59
-
60
# Treats @src as a plain list of URLs: applies the configured URL swaps,
# registers every URL as an external URL (with no source file attached)
# and validates them.
def check_list_of_links
  swaps = @options[:url_swap]
  @src = @src.map { |url| swap(url, swaps) } if swaps

  @external_urls = @src.map { |url| [url, nil] }.to_h
  validate_external_urls
end
71
-
72
- # Collects any external URLs found in a directory of files. Also collects
73
- # every failed test from process_files.
74
- # Sends the external URLs to Typhoeus for batch processing.
75
def check_files
  # Fold each file's result into the runner-wide collections.
  process_files.each do |file_result|
    @external_urls.merge!(file_result[:external_urls])
    @failures.concat(file_result[:failures])
  end

  # TODO: lazy. if we're checking only external links,
  # we'll just trash all the failed tests. really, we should
  # just not run those other checks at all.
  if @options[:external_only]
    @failures = []
    return validate_external_urls
  end

  # Internal links are always validated here; external ones only when
  # they have not been disabled.
  validate_external_urls unless @options[:disable_external]
  validate_internal_urls
end
94
-
95
- # Walks over each implemented check and runs them on the files, in parallel.
96
def process_files
  parallel_opts = @options[:parallel]

  # With no parallel options the files are checked sequentially in-process.
  return files.map { |path| check_path(path) } if parallel_opts.empty?

  Parallel.map(files, parallel_opts) { |path| check_path(path) }
end
103
-
104
# Runs every enabled check against an already-parsed document.
#
# @param html the parsed document handed to each check
# @param path the path of the file the document came from
# @return [Hash] { external_urls: Hash, failures: Array } gathered from all
#   checks that ran
def check_parsed(html, path)
  result = { external_urls: {}, failures: [] }

  # A :file source is a single value; wrap it locally rather than
  # reassigning @src, so repeated calls do not nest it ([[src]]) and other
  # readers of @src keep seeing the original value.
  sources = @type == :file ? Array(@src) : @src

  sources.each do |src|
    checks.each do |klass|
      @logger.log :debug, "Checking #{klass.to_s.downcase} on #{path} ..."
      check = Object.const_get(klass).new(src, path, html, @logger, @cache, @options)
      check.run

      if klass == 'LinkCheck'
        # Keep the check around: internal links are validated later through it.
        @internal_link_checks = check
        check.internal_urls.each_pair do |url, internal_urls|
          (@internal_urls_to_paths[url] ||= []).concat(internal_urls.map(&:path))
        end
        @internal_urls.merge!(check.internal_urls)
      end

      external_urls = check.external_urls
      external_urls = external_urls.transform_keys { |url| swap(url, @options[:url_swap]) } if @options[:url_swap]
      result[:external_urls].merge!(external_urls)
      result[:failures].concat(check.issues)
    end
  end

  result
end
135
-
136
# Parses the file at +path+ and runs the checks on the resulting document.
def check_path(path)
  document = create_nokogiri(path)
  check_parsed(document, path)
end
139
-
140
# Hands the collected external URLs to a UrlValidator, appends whatever
# failures it returns and adopts the validator's view of the external URLs.
def validate_external_urls
  validator = HTMLProofer::UrlValidator.new(@logger, @cache, @external_urls, @options)
  validator.before_request = @before_request

  external_failures = validator.run
  @failures.concat(external_failures)
  @external_urls = validator.external_urls
end
146
-
147
# Validates the internal links gathered while checking files, through the
# LinkCheck instance remembered in @internal_link_checks.
#
# With the cache enabled, each URL returned by load_internal_cache is
# re-checked and stored in the cache as 200 or 404. Without it, every
# collected internal link is checked and issues are appended to @failures.
def validate_internal_urls
  if @cache.use_cache?
    load_internal_cache.each_pair do |url, internal_urls|
      # Entries pulled from the cache are not InternalLink objects; fall back
      # to the links collected during this run.
      internal_urls = @internal_urls[url] unless internal_urls.first.is_a?(LinkCheck::InternalLink)

      link = internal_urls.first
      found = @internal_link_checks.check_internal_link(link.link, link.path, link.line, link.content)
      status = found ? 200 : 404
      @cache.add(url, @internal_urls_to_paths[url].sort, status, '') # TODO: blank msg for now
    end
    @cache.write
  else
    @internal_urls.values.flatten.each do |internal_url|
      found = @internal_link_checks.check_internal_link(internal_url.link, internal_url.path, internal_url.line, internal_url.content)
      next if found

      # NOTE(review): the whole issues list is appended on every failing link;
      # if the check accumulates issues across calls this may add duplicates.
      # Confirm against LinkCheck#issues.
      issues = @internal_link_checks.issues
      @failures.concat(issues) unless issues.length.zero?
    end
  end
end
169
-
170
# Memoized list of files to check.
#
# For :directory, every regular file under each source directory whose name
# ends with the configured extension; for :file, the single source when its
# extension matches. Ignored files are dropped; any other type yields [].
def files
  @files ||= case @type
             when :directory
               @src.flat_map do |dir|
                 glob = File.join(dir, '**', "*#{@options[:extension]}")
                 Dir.glob(glob)
                    .select { |candidate| File.file?(candidate) }
                    .reject { |candidate| ignore_file?(candidate) }
               end
             when :file
               if File.extname(@src) == @options[:extension]
                 [@src].reject { |candidate| ignore_file?(candidate) }
               else
                 []
               end
             else
               []
             end
end
183
-
184
# True when +file+ matches an entry of the :file_ignore option: a String
# entry must equal the path exactly, a Regexp entry must match it.
def ignore_file?(file)
  @options[:file_ignore].any? do |pattern|
    case pattern
    when String then pattern == file
    when Regexp then pattern =~ file
    end
  end
end
192
-
193
# Memoized names of the checks to run.
#
# A :links run only uses LinkCheck. Otherwise all registered subchecks are
# used, minus the opt-in ones whose option is off and anything listed in
# :checks_to_ignore.
def checks
  return @checks unless @checks.nil?

  if @type == :links
    @checks = ['LinkCheck']
    return @checks
  end

  @checks = HTMLProofer::Check.subchecks.map(&:name)

  opt_in = {
    'FaviconCheck' => :check_favicon,
    'HtmlCheck' => :check_html,
    'OpenGraphCheck' => :check_opengraph
  }
  opt_in.each { |check_name, option| @checks.delete(check_name) unless @options[option] }

  @options[:checks_to_ignore].each { |ignored| @checks.delete(ignored) }
  @checks
end
205
-
206
# Returns the collected failures rendered as strings (empty when none).
def failed_tests
  @failures.map(&:to_s)
end
213
-
214
# Reports the failures sorted by the :error_sort option, logs the total at
# fatal level and terminates the process with exit status 1.
def print_failed_tests
  SortedIssues.new(@failures, @options[:error_sort], @logger).sort_and_report

  failure_text = pluralize(@failures.length, 'failure', 'failures')
  @logger.log :fatal, "\nHTML-Proofer found #{failure_text}!"
  exit 1
end
223
-
224
- # Set before_request callback.
225
- #
226
- # @example Set before_request.
227
- # request.before_request { |request| p "yay" }
228
- #
229
- # @param [ Block ] block The block to execute.
230
- #
231
- # @yield [ Typhoeus::Request ]
232
- #
233
- # @return [ Array<Block> ] All before_request blocks.
234
def before_request(&block)
  # Lazily create the callback list, register the block when one is given,
  # and always hand back the full list.
  callbacks = (@before_request ||= [])
  callbacks.push(block) unless block.nil?
  callbacks
end
239
-
240
# Asks the cache which internal URLs it returns for this run, logs how many
# there are and returns them.
def load_internal_cache
  @cache.retrieve_urls(@internal_urls, :internal).tap do |cached_urls|
    summary = pluralize(cached_urls.count, 'internal link', 'internal links')
    @logger.log :info, "Found #{summary} in the cache..."
  end
end
247
- end
248
- end
@@ -1,237 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'typhoeus'
4
- require 'uri'
5
- require_relative './utils'
6
- require_relative './cache'
7
-
8
- module HTMLProofer
9
- class UrlValidator
10
- include HTMLProofer::Utils
11
-
12
- attr_reader :external_urls
13
- attr_writer :before_request
14
-
15
# @param logger        receives log messages
# @param cache         cache consulted and updated while validating
# @param external_urls [Hash] URL => filenames that reference it (or nil)
# @param options       [Hash] proofer options; :hydra configures the Hydra
def initialize(logger, cache, external_urls, options)
  @logger = logger
  @cache = cache
  @options = options
  @external_urls = external_urls

  @hydra = Typhoeus::Hydra.new(@options[:hydra])
  @failed_tests = []
  @before_request = []
end
24
-
25
# Validates the external URLs and returns the list of recorded failures.
#
# URLs are first de-duplicated by query signature. With the cache enabled
# only the URLs the cache returns are checked, and the cache is written
# afterwards.
def run
  @external_urls = remove_query_values

  unless @cache.use_cache?
    external_link_checker(@external_urls)
    return @failed_tests
  end

  external_link_checker(@cache.retrieve_urls(@external_urls, :external))
  @cache.write

  @failed_tests
end
38
-
39
# Returns a copy of @external_urls without URLs whose query-key signature
# was already seen for the same host and path (nil when there are no URLs).
# Unparseable URLs are logged and kept.
def remove_query_values
  return nil if @external_urls.nil?

  seen_queries = {}
  @external_urls.each_key.with_object(@external_urls.dup) do |url, remaining|
    parsed = begin
      Addressable::URI.parse(url)
    rescue URI::Error, Addressable::URI::InvalidURIError
      @logger.log :error, "#{url} is an invalid URL"
      nil
    end
    next if parsed.nil? || parsed.query.nil?

    remaining.delete(url) unless new_url_query_values?(parsed, seen_queries)
  end
end
57
-
58
- # remember queries we've seen, ignore future ones
59
def new_url_query_values?(uri, paths_with_queries)
  # The signature is the query keys joined with "-"; values are ignored.
  signature = uri.query_values.keys.join('-')
  known = (paths_with_queries[extract_domain_path(uri)] ||= [])
  return false if known.include?(signature)

  known << signature
  true
end
72
-
73
# Returns the URI's host immediately followed by its path.
#
# Interpolation is used instead of String#+ so a URI without a host (or
# path) contributes an empty string instead of raising NoMethodError.
def extract_domain_path(uri)
  "#{uri.host}#{uri.path}"
end
76
-
77
- # Proofer runs faster if we pull out all the external URLs and run the checks
78
- # at the end. Otherwise, we're halting the consuming process for every file during
79
- # `process_files`.
80
- #
81
- # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
82
- #
83
- # Finally, we'll first make a HEAD request, rather than GETing all the contents.
84
- # If the HEAD fails, we'll fall back to GET, as some servers are not configured
85
- # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
86
- # not available as an option.
87
def external_link_checker(external_urls)
  sorted_urls = external_urls.sort.to_h

  summary = pluralize(sorted_urls.length, 'external link', 'external links')
  @logger.log :info, "Checking #{summary}..."

  # Route log from Typhoeus/Ethon to our own logger
  Ethon.logger = @logger

  establish_queue(sorted_urls)
  @hydra.run
end
101
-
102
# Queues one request per URL. URLs that cannot be cleaned are recorded as
# issues and skipped. GET is used when the URL has a hash and external
# hashes are being checked; HEAD otherwise.
def establish_queue(external_urls)
  external_urls.each_pair do |raw_url, filenames|
    begin
      cleaned = clean_url(raw_url)
    rescue URI::Error, Addressable::URI::InvalidURIError
      add_external_issue(filenames, "#{raw_url} is an invalid URL")
      next
    end

    verb = hash?(cleaned) && @options[:check_external_hash] ? :get : :head
    queue_request(verb, cleaned, filenames)
  end
end
119
-
120
# Returns +href+ unchanged when it consists solely of characters that are
# already safe in a URL (or %XX escapes); otherwise returns the normalized
# Addressable::URI.
#
# Raises whatever Addressable::URI.parse raises for obviously broken input.
def clean_url(href)
  # catch any obvious issues, like strings in port numbers
  parsed = Addressable::URI.parse(href)

  # The "#" is escaped on purpose: an unescaped "#$&" inside a regexp literal
  # is interpolation of the last match, not the literal characters "#", "$"
  # and "&", which silently widened the class to the range "!"-";" (letting
  # through e.g. double quotes and bare "%").
  if href =~ /^([!\#$&-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/
    href
  else
    parsed.normalize
  end
end
129
-
130
# Builds a request for +href+ with the configured Typhoeus options plus the
# given HTTP method, runs the before_request callbacks on it, wires the
# response handler and queues it on the hydra.
def queue_request(method, href, filenames)
  request = Typhoeus::Request.new(href, @options[:typhoeus].merge(method: method))
  @before_request.each { |callback| callback.call(request) }
  request.on_complete { |response| response_handler(response, filenames) }
  @hydra.queue request
end
139
-
140
# Handles a completed request.
#
# 2xx responses are cached unless the hash check reports a missing anchor;
# timeouts and code 0 go to their dedicated handlers; a failed HEAD is
# retried as GET; anything else is recorded as an issue and cached (only
# 4xx codes when :only_4xx is set). Codes listed in :http_status_ignore are
# dropped.
def response_handler(response, filenames)
  final_url = response.options[:effective_url]
  href = response.request.base_url.to_s
  verb = response.request.options[:method]
  status = response.code
  # Strip NUL bytes from the body in place before it is used further.
  response.body.delete!("\x00")

  location = filenames.nil? ? '' : " in #{filenames.join(' ')}"
  @logger.log :debug, "Received a #{status} for #{href}#{location}"

  return if @options[:http_status_ignore].include?(status)

  if status.between?(200, 299)
    hash_missing = check_hash_in_2xx_response(href, final_url, response, filenames)
    @cache.add(href, filenames, status) unless hash_missing
  elsif response.timed_out?
    handle_timeout(href, filenames, status)
  elsif status.zero?
    handle_failure(final_url, filenames, status, response.return_message)
  elsif verb == :head
    queue_request(:get, href, filenames)
  else
    return if @options[:only_4xx] && !status.between?(400, 499)

    # Received a non-successful http response.
    msg = "External link #{href} failed: #{status} #{response.return_message}"
    add_external_issue(filenames, msg, status)
    @cache.add(href, filenames, status, msg)
  end
end
174
-
175
- # Even though the response was a success, we may have been asked to check
176
- # if the hash on the URL exists on the page
177
def check_hash_in_2xx_response(href, effective_url, response, filenames)
  # Returns true when the hash is missing (an issue was recorded and the
  # failure cached) and false in every other case, including when hash
  # checking is disabled or the URL has no hash.
  return false if @options[:only_4xx]
  return false unless @options[:check_external_hash]
  return false unless (hash = hash?(href))

  body_doc = create_nokogiri(response.body)

  # Look for the anchor by name or id, both as written and percent-decoded.
  # Every branch uses "//" so elements at any depth are considered.
  unencoded_hash = Addressable::URI.unescape(hash)
  xpath = [
    %(//*[@name="#{hash}"]),
    %(//*[@name="#{unencoded_hash}"]),
    %(//*[@id="#{hash}"]),
    %(//*[@id="#{unencoded_hash}"])
  ]

  # user-content is a special addition by GitHub.
  if URI.parse(href).host =~ /github\.com/i
    xpath << %(//*[@name="user-content-#{hash}"])
    xpath << %(//*[@id="user-content-#{hash}"])
    # when linking to a file on GitHub, like #L12-L34, only the first "L"
    # portion will be identified as a linkable portion. Capture the whole
    # line number ("L12"), not just its first digit.
    xpath << %(//td[@id="#{Regexp.last_match(1)}"]) if hash =~ /\A(L\d+)/
  end

  return false unless body_doc.xpath(xpath.join('|')).empty?

  msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
  add_external_issue(filenames, msg, response.code)
  @cache.add(href, filenames, response.code, msg)
  true
end
201
-
202
# Caches the timeout under status 0 and, unless only 4xx errors are being
# reported, records it as an issue for every referencing file.
def handle_timeout(href, filenames, response_code)
  msg = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add(href, filenames, 0, msg)
  add_external_issue(filenames, msg, response_code) unless @options[:only_4xx]
end
209
-
210
# Caches a connection-level failure under status 0 and, unless only 4xx
# errors are being reported, records it as an issue.
def handle_failure(href, filenames, response_code, return_message)
  msg = "External link #{href} failed: response code #{response_code} means something's wrong.
It's possible libcurl couldn't connect to the server or perhaps the request timed out.
Sometimes, making too many requests at once also breaks things.
Either way, the return message (if any) from the server is: #{return_message}"
  @cache.add(href, filenames, 0, msg)
  add_external_issue(filenames, msg, response_code) unless @options[:only_4xx]
end
220
-
221
# Records +desc+ as a failed test once per referencing file, or once with an
# empty path when there are no filenames.
def add_external_issue(filenames, desc, status = nil)
  # possible if we're checking an array of links
  return @failed_tests << Issue.new('', desc, status: status) if filenames.nil?

  filenames.each do |filename|
    @failed_tests << Issue.new(filename, desc, status: status)
  end
end
229
-
230
- # Does the URL have a hash?
231
def hash?(url)
  # Returns the fragment (nil when absent), or false when the URL is invalid.
  begin
    parsed = URI.parse(url)
  rescue URI::InvalidURIError
    return false
  end

  parsed.fragment
end
236
- end
237
- end