html-proofer 3.19.4 → 4.0.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/bin/htmlproofer +30 -57
  3. data/lib/html-proofer.rb +1 -54
  4. data/lib/html_proofer/attribute/url.rb +231 -0
  5. data/lib/html_proofer/attribute.rb +15 -0
  6. data/lib/html_proofer/cache.rb +234 -0
  7. data/lib/html_proofer/check/favicon.rb +35 -0
  8. data/lib/html_proofer/check/images.rb +62 -0
  9. data/lib/html_proofer/check/links.rb +118 -0
  10. data/lib/html_proofer/check/open_graph.rb +34 -0
  11. data/lib/html_proofer/check/scripts.rb +38 -0
  12. data/lib/html_proofer/check.rb +91 -0
  13. data/lib/{html-proofer → html_proofer}/configuration.rb +30 -31
  14. data/lib/html_proofer/element.rb +122 -0
  15. data/lib/html_proofer/failure.rb +17 -0
  16. data/lib/{html-proofer → html_proofer}/log.rb +0 -0
  17. data/lib/html_proofer/reporter/cli.rb +29 -0
  18. data/lib/html_proofer/reporter.rb +23 -0
  19. data/lib/html_proofer/runner.rb +245 -0
  20. data/lib/html_proofer/url_validator/external.rb +189 -0
  21. data/lib/html_proofer/url_validator/internal.rb +86 -0
  22. data/lib/html_proofer/url_validator.rb +16 -0
  23. data/lib/{html-proofer → html_proofer}/utils.rb +5 -8
  24. data/lib/{html-proofer → html_proofer}/version.rb +1 -1
  25. data/lib/html_proofer/xpath_functions.rb +10 -0
  26. data/lib/html_proofer.rb +56 -0
  27. metadata +46 -27
  28. data/lib/html-proofer/cache.rb +0 -194
  29. data/lib/html-proofer/check/favicon.rb +0 -29
  30. data/lib/html-proofer/check/html.rb +0 -37
  31. data/lib/html-proofer/check/images.rb +0 -48
  32. data/lib/html-proofer/check/links.rb +0 -182
  33. data/lib/html-proofer/check/opengraph.rb +0 -46
  34. data/lib/html-proofer/check/scripts.rb +0 -42
  35. data/lib/html-proofer/check.rb +0 -75
  36. data/lib/html-proofer/element.rb +0 -265
  37. data/lib/html-proofer/issue.rb +0 -65
  38. data/lib/html-proofer/middleware.rb +0 -82
  39. data/lib/html-proofer/runner.rb +0 -249
  40. data/lib/html-proofer/url_validator.rb +0 -237
@@ -1,249 +0,0 @@
1
# frozen_string_literal: true

module HTMLProofer
  # Orchestrates an HTML-Proofer run: collects the targets to check (a
  # directory of files, a single file, or a bare list of links, depending on
  # the :type option), runs every enabled Check against them, then validates
  # the gathered internal and external URLs and reports any failures.
  class Runner
    include HTMLProofer::Utils

    attr_reader :options, :internal_urls, :external_urls, :failures

    # src  - what to proof: a directory path, a file path, or an array of
    #        URLs, interpreted according to opts[:type] (:directory, :file,
    #        or :links). TODO confirm the exact src shape per type with callers.
    # opts - user options; merged over the library-wide configuration
    #        defaults, with nested option hashes merged individually below.
    def initialize(src, opts = {})
      @src = src

      @options = HTMLProofer::Configuration::PROOFER_DEFAULTS.merge(opts)

      # Nested option hashes get their own defaults merged so a caller
      # overriding one key does not wipe out the rest of the defaults.
      @options[:typhoeus] = HTMLProofer::Configuration::TYPHOEUS_DEFAULTS.merge(opts[:typhoeus] || {})
      @options[:hydra] = HTMLProofer::Configuration::HYDRA_DEFAULTS.merge(opts[:hydra] || {})

      @options[:parallel] = HTMLProofer::Configuration::PARALLEL_DEFAULTS.merge(opts[:parallel] || {})
      @options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.merge(opts[:validation] || {})
      @options[:cache] = HTMLProofer::Configuration::CACHE_DEFAULTS.merge(opts[:cache] || {})

      # :type is consumed here (deleted from @options) so it is not passed
      # down to the individual checks.
      @type = @options.delete(:type)
      @logger = HTMLProofer::Log.new(@options[:log_level])
      @cache = Cache.new(@logger, @options[:cache])
      @internal_link_checks = nil

      # Add swap patterns for internal domains: URLs on these domains are
      # rewritten to root-relative form so they are checked as internal links.
      unless @options[:internal_domains].empty?
        @options[:internal_domains].each do |dom|
          @options[:url_swap][Regexp.new("^http://#{dom}")] = ''
          @options[:url_swap][Regexp.new("^https://#{dom}")] = ''
          @options[:url_swap][Regexp.new("^//#{dom}")] = ''
        end
      end

      @internal_urls = {}
      @internal_urls_to_paths = {}
      @external_urls = {}
      @failures = []
      @before_request = []
    end

    # Entry point. Runs either the link-list check (:links mode) or the
    # full file checks, then reports success or prints failures (which
    # exits the process with status 1 via print_failed_tests).
    def run
      if @type == :links
        @logger.log :info, "Running #{checks} on #{@src}... \n\n"
        check_list_of_links unless @options[:disable_external]
      else
        @logger.log :info, "Running #{checks} on #{@src} on *#{@options[:extension]}... \n\n"
        check_files
        file_text = pluralize(files.length, 'file', 'files')
        @logger.log :info, "Ran on #{file_text}!\n\n"
      end

      if @failures.empty?
        @logger.log :info, 'HTML-Proofer finished successfully.'
      else
        @failures.uniq!
        print_failed_tests
      end
    end

    # :links mode: treat @src as an array of URLs, apply any configured
    # URL swaps, and hand them straight to the external URL validator.
    def check_list_of_links
      if @options[:url_swap]
        @src = @src.map do |url|
          swap(url, @options[:url_swap])
        end
      end
      @external_urls = @src.each_with_object({}) do |url, hash|
        hash[url] = nil
      end
      validate_external_urls
    end

    # Collects any external URLs found in a directory of files. Also collects
    # every failed test from process_files.
    # Sends the external URLs to Typhoeus for batch processing.
    def check_files
      process_files.each do |item|
        @external_urls.merge!(item[:external_urls])
        @failures.concat(item[:failures])
      end

      # TODO: lazy. if we're checking only external links,
      # we'll just trash all the failed tests. really, we should
      # just not run those other checks at all.
      if @options[:external_only]
        @failures = []
        validate_external_urls
      elsif !@options[:disable_external]
        validate_external_urls
        validate_internal_urls
      else
        validate_internal_urls
      end
    end

    # Walks over each implemented check and runs them on the files, in parallel.
    # Returns an array of per-file result hashes (:external_urls, :failures).
    def process_files
      if @options[:parallel].empty?
        files.map { |path| check_path(path) }
      else
        Parallel.map(files, @options[:parallel]) { |path| check_path(path) }
      end
    end

    # Runs every enabled check class against an already-parsed document.
    # html - a Nokogiri document; path - the file it came from.
    # Returns { external_urls: Hash, failures: Array }.
    def check_parsed(html, path)
      result = { external_urls: {}, failures: [] }

      # NOTE(review): this re-wraps @src on every call in :file mode;
      # harmless because [[x]] never occurs (only runs once per file),
      # but worth confirming.
      @src = [@src] if @type == :file

      @src.each do |src|
        checks.each do |klass|
          @logger.log :debug, "Checking #{klass.to_s.downcase} on #{path} ..."
          check = Object.const_get(klass).new(src, path, html, @logger, @cache, @options)
          check.run

          # LinkCheck is special-cased: its internal URLs are accumulated
          # here for later validation by validate_internal_urls.
          if klass == 'LinkCheck'
            @internal_link_checks = check
            check.internal_urls.each_pair do |url, internal_urls|
              if @internal_urls_to_paths[url]
                @internal_urls_to_paths[url].concat(internal_urls.map(&:path))
              else
                @internal_urls_to_paths[url] = internal_urls.map(&:path)
              end
            end
            @internal_urls.merge!(check.internal_urls)
          end

          external_urls = check.external_urls
          external_urls = check.external_urls.transform_keys { |url| swap(url, @options[:url_swap]) } if @options[:url_swap]
          result[:external_urls].merge!(external_urls)
          result[:failures].concat(check.issues)
        end
      end
      result
    end

    # Parses the file at path with Nokogiri and runs all checks on it.
    def check_path(path)
      check_parsed(create_nokogiri(path), path)
    end

    # Delegates external-URL checking to UrlValidator and folds its
    # failures (and possibly-reduced URL set) back into this runner.
    def validate_external_urls
      url_validator = HTMLProofer::UrlValidator.new(@logger, @cache, @external_urls, @options)
      url_validator.before_request = @before_request
      @failures.concat(url_validator.run)
      @external_urls = url_validator.external_urls
    end

    # Validates the accumulated internal URLs, either via the cache
    # (recording fresh 200/404 results back into it) or directly.
    def validate_internal_urls
      if @cache.use_cache?
        urls_to_check = load_internal_cache

        urls_to_check.each_pair do |url, internal_urls|
          # pulled from cache
          internal_urls = @internal_urls[url] unless internal_urls.first.is_a?(LinkCheck::InternalLink)

          result = @internal_link_checks.check_internal_link(internal_urls.first.link, internal_urls.first.path, internal_urls.first.line, internal_urls.first.content)
          code = result ? 200 : 404
          @cache.add(url, @internal_urls_to_paths[url].sort, code, '') # TODO: blank msg for now
        end
        @cache.write
      else
        @internal_urls.values.flatten.each do |internal_url|
          result = @internal_link_checks.check_internal_link(internal_url.link, internal_url.path, internal_url.line, internal_url.content)
          next if result

          # NOTE(review): concats the checker's full issue list on every
          # failing link; duplicates are later removed by @failures.uniq!
          # in #run.
          @failures.concat(@internal_link_checks.issues) unless @internal_link_checks.issues.length.zero?
        end
      end
    end

    # The list of files to proof, resolved once and memoized:
    # every matching file under @src in :directory mode, @src itself in
    # :file mode (when its extension matches), otherwise empty.
    def files
      @files ||= if @type == :directory
                   @src.map do |src|
                     pattern = File.join(src, '**', "*#{@options[:extension]}")
                     files = Dir.glob(pattern).select { |fn| File.file? fn }
                     files.reject { |f| ignore_file?(f) }
                   end.flatten
                 elsif @type == :file && File.extname(@src) == @options[:extension]
                   [@src].reject { |f| ignore_file?(f) }
                 else
                   []
                 end
    end

    # True when file matches any :file_ignore pattern (exact string or regexp).
    def ignore_file?(file)
      @options[:file_ignore].each do |pattern|
        return true if pattern.is_a?(String) && pattern == file
        return true if pattern.is_a?(Regexp) && pattern =~ file
      end

      false
    end

    # The check class names to run, memoized. :links mode runs only
    # LinkCheck; otherwise all Check subclasses, pruned by the
    # check_favicon/check_html/check_opengraph flags and :checks_to_ignore.
    def checks
      return @checks if defined?(@checks) && !@checks.nil?

      return (@checks = ['LinkCheck']) if @type == :links

      @checks = HTMLProofer::Check.subchecks.map(&:name)
      @checks.delete('FaviconCheck') unless @options[:check_favicon]
      @checks.delete('HtmlCheck') unless @options[:check_html]
      @checks.delete('OpenGraphCheck') unless @options[:check_opengraph]
      @options[:checks_to_ignore].each { |ignored| @checks.delete(ignored) }
      @checks
    end

    # String representations of every recorded failure (empty when none).
    def failed_tests
      result = []
      return result if @failures.empty?

      @failures.each { |f| result << f.to_s }
      result
    end

    # Sorts and prints all failures, then terminates the process with
    # exit status 1.
    def print_failed_tests
      sorted_failures = SortedIssues.new(@failures, @options[:error_sort], @logger)

      sorted_failures.sort_and_report
      count = @failures.length
      failure_text = pluralize(count, 'failure', 'failures')
      @logger.log :fatal, "\nHTML-Proofer found #{failure_text}!"
      exit 1
    end

    # Set before_request callback.
    #
    # @example Set before_request.
    #   request.before_request { |request| p "yay" }
    #
    # @param [ Block ] block The block to execute.
    #
    # @yield [ Typhoeus::Request ]
    #
    # @return [ Array<Block> ] All before_request blocks.
    def before_request(&block)
      @before_request ||= []
      @before_request << block if block
      @before_request
    end

    # Fetches the internal URLs recorded in the cache and logs how many
    # were found; returns the hash of URLs still needing a check.
    def load_internal_cache
      urls_to_check = @cache.retrieve_urls(@internal_urls, :internal)
      cache_text = pluralize(urls_to_check.count, 'internal link', 'internal links')
      @logger.log :info, "Found #{cache_text} in the cache..."

      urls_to_check
    end
  end
end
@@ -1,237 +0,0 @@
1
# frozen_string_literal: true

require 'typhoeus'
require 'uri'
require_relative './utils'
require_relative './cache'

module HTMLProofer
  # Validates a batch of external URLs over HTTP using Typhoeus/Hydra.
  # Deduplicates query-only URL variants, consults/updates the cache,
  # and records each problem as an Issue in @failed_tests.
  class UrlValidator
    include HTMLProofer::Utils

    attr_reader :external_urls
    attr_writer :before_request

    # logger        - HTMLProofer::Log instance.
    # cache         - HTMLProofer::Cache instance.
    # external_urls - Hash of url => filenames (filenames may be nil when
    #                 checking a bare list of links).
    # options       - the merged proofer options hash.
    def initialize(logger, cache, external_urls, options)
      @logger = logger
      @external_urls = external_urls
      @failed_tests = []
      @options = options
      @hydra = Typhoeus::Hydra.new(@options[:hydra])
      @cache = cache
      @before_request = []
    end

    # Runs the validation pass and returns the array of failed Issues.
    def run
      @external_urls = remove_query_values

      if @cache.use_cache?
        urls_to_check = @cache.retrieve_urls(@external_urls, :external)
        external_link_checker(urls_to_check)
        @cache.write
      else
        external_link_checker(@external_urls)
      end

      @failed_tests
    end

    # Drops URLs that differ from an already-seen URL only by query values
    # (same host+path, same set of query keys), so each variant is checked
    # once. Unparseable URLs are logged and kept in the returned hash.
    def remove_query_values
      return nil if @external_urls.nil?

      paths_with_queries = {}
      iterable_external_urls = @external_urls.dup
      @external_urls.each_key do |url|
        uri = begin
          Addressable::URI.parse(url)
        rescue URI::Error, Addressable::URI::InvalidURIError
          @logger.log :error, "#{url} is an invalid URL"
          nil
        end
        next if uri.nil? || uri.query.nil?

        iterable_external_urls.delete(url) unless new_url_query_values?(uri, paths_with_queries)
      end
      iterable_external_urls
    end

    # remember queries we've seen, ignore future ones
    def new_url_query_values?(uri, paths_with_queries)
      queries = uri.query_values.keys.join('-')
      domain_path = extract_domain_path(uri)
      if paths_with_queries[domain_path].nil?
        paths_with_queries[domain_path] = [queries]
        true
      elsif !paths_with_queries[domain_path].include?(queries)
        paths_with_queries[domain_path] << queries
        true
      else
        false
      end
    end

    # host + path, used as the dedup key for query-value filtering.
    def extract_domain_path(uri)
      uri.host + uri.path
    end

    # Proofer runs faster if we pull out all the external URLs and run the checks
    # at the end. Otherwise, we're halting the consuming process for every file during
    # `process_files`.
    #
    # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
    #
    # Finally, we'll first make a HEAD request, rather than GETing all the contents.
    # If the HEAD fails, we'll fall back to GET, as some servers are not configured
    # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
    # not available as an option.
    def external_link_checker(external_urls)
      external_urls = external_urls.sort.to_h

      count = external_urls.length
      check_text = pluralize(count, 'external link', 'external links')
      @logger.log :info, "Checking #{check_text}..."

      # Route log from Typhoeus/Ethon to our own logger
      Ethon.logger = @logger

      establish_queue(external_urls)

      @hydra.run
    end

    # Queues one request per URL onto the hydra, choosing GET when the URL
    # has a fragment and hash checking is enabled (HEAD has no body to
    # inspect), HEAD otherwise. Invalid URLs become issues immediately.
    def establish_queue(external_urls)
      external_urls.each_pair do |url, filenames|
        url = begin
          clean_url(url)
        rescue URI::Error, Addressable::URI::InvalidURIError
          add_external_issue(filenames, "#{url} is an invalid URL")
          next
        end

        method = if hash?(url) && @options[:check_external_hash]
                   :get
                 else
                   :head
                 end
        queue_request(method, url, filenames)
      end
    end

    # Returns href unchanged when it already looks like a safely-encoded
    # URL, otherwise its normalized form. Raises on unparseable input
    # (caught by establish_queue).
    # NOTE(review): the `#{Regexp.last_match(0)}` interpolation inside the
    # character class looks like an over-eager lint rewrite of a literal
    # `$&` range (`[!#$&-;...]`) — confirm against upstream history.
    def clean_url(href)
      # catch any obvious issues, like strings in port numbers
      parsed = Addressable::URI.parse(href)
      if href =~ /^([!#{Regexp.last_match(0)}-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/
        href
      else
        parsed.normalize
      end
    end

    # Builds a Typhoeus request for href, runs the before_request
    # callbacks on it, wires up the completion handler, and queues it.
    def queue_request(method, href, filenames)
      opts = @options[:typhoeus].merge(method: method)
      request = Typhoeus::Request.new(href, opts)
      @before_request.each do |callback|
        callback.call(request)
      end
      request.on_complete { |response| response_handler(response, filenames) }
      @hydra.queue request
    end

    # Dispatches on the response: 2xx (optionally hash-checked, cached),
    # timeout, code 0 (connection-level failure), failed HEAD (retried as
    # GET), or any other status (recorded as a failure unless ignored).
    def response_handler(response, filenames)
      effective_url = response.options[:effective_url]
      href = response.request.base_url.to_s
      method = response.request.options[:method]
      response_code = response.code
      # strip NUL bytes so the body is safe to parse/log
      response.body.delete!("\x00")

      debug_msg = if filenames.nil?
                    "Received a #{response_code} for #{href}"
                  else
                    "Received a #{response_code} for #{href} in #{filenames.join(' ')}"
                  end

      @logger.log :debug, debug_msg

      return if @options[:http_status_ignore].include?(response_code)

      if response_code.between?(200, 299)
        @cache.add(href, filenames, response_code) unless check_hash_in_2xx_response(href, effective_url, response, filenames)
      elsif response.timed_out?
        handle_timeout(href, filenames, response_code)
      elsif response_code.zero?
        handle_failure(effective_url, filenames, response_code, response.return_message)
      elsif method == :head
        # some servers don't support HEAD; retry with GET before failing
        queue_request(:get, href, filenames)
      else
        return if @options[:only_4xx] && !response_code.between?(400, 499)

        # Received a non-successful http response.
        msg = "External link #{href} failed: #{response_code} #{response.return_message}"
        add_external_issue(filenames, msg, response_code)
        @cache.add(href, filenames, response_code, msg)
      end
    end

    # Even though the response was a success, we may have been asked to check
    # if the hash on the URL exists on the page
    # Returns true (and records an issue) when the fragment is missing;
    # falsy otherwise.
    def check_hash_in_2xx_response(href, effective_url, response, filenames)
      return false if @options[:only_4xx]
      return false unless @options[:check_external_hash]
      return false unless (hash = hash?(href))

      body_doc = create_nokogiri(response.body)

      # match the fragment against both name= and id= anchors, in both
      # escaped and unescaped form
      unencoded_hash = Addressable::URI.unescape(hash)
      xpath = [%(//*[@name="#{hash}"]|/*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
      # user-content is a special addition by GitHub.
      if URI.parse(href).host =~ /github\.com/i
        xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])]
        # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
        # will be identified as a linkable portion
        xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d)+/
      end

      return unless body_doc.xpath(xpath.join('|')).empty?

      msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
      add_external_issue(filenames, msg, response.code)
      @cache.add(href, filenames, response.code, msg)
      true
    end

    # Caches a timeout as code 0 and, unless only_4xx is set, records it
    # as an issue.
    def handle_timeout(href, filenames, response_code)
      msg = "External link #{href} failed: got a time out (response code #{response_code})"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]

      add_external_issue(filenames, msg, response_code)
    end

    # Caches a connection-level failure (code 0) and, unless only_4xx is
    # set, records it as an issue with libcurl's return message.
    def handle_failure(href, filenames, response_code, return_message)
      msg = "External link #{href} failed: response code #{response_code} means something's wrong.
             It's possible libcurl couldn't connect to the server or perhaps the request timed out.
             Sometimes, making too many requests at once also breaks things.
             Either way, the return message (if any) from the server is: #{return_message}"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]

      add_external_issue(filenames, msg, response_code)
    end

    # Records one Issue per affected filename (or a single pathless Issue
    # when filenames is nil).
    def add_external_issue(filenames, desc, status = nil)
      # possible if we're checking an array of links
      if filenames.nil?
        @failed_tests << Issue.new('', desc, status: status)
      else
        filenames.each { |f| @failed_tests << Issue.new(f, desc, status: status) }
      end
    end

    # Does the URL have a hash?
    # Returns the fragment string (truthy) or nil; false on unparseable URLs.
    def hash?(url)
      URI.parse(url).fragment
    rescue URI::InvalidURIError
      false
    end
  end
end