html-proofer 3.19.1 → 4.0.0.rc2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40):
  1. checksums.yaml +4 -4
  2. data/bin/htmlproofer +31 -57
  3. data/lib/html-proofer.rb +1 -54
  4. data/lib/html_proofer/attribute/url.rb +231 -0
  5. data/lib/html_proofer/attribute.rb +15 -0
  6. data/lib/html_proofer/cache.rb +236 -0
  7. data/lib/html_proofer/check/favicon.rb +35 -0
  8. data/lib/html_proofer/check/images.rb +62 -0
  9. data/lib/html_proofer/check/links.rb +118 -0
  10. data/lib/html_proofer/check/open_graph.rb +34 -0
  11. data/lib/html_proofer/check/scripts.rb +38 -0
  12. data/lib/html_proofer/check.rb +91 -0
  13. data/lib/{html-proofer → html_proofer}/configuration.rb +30 -30
  14. data/lib/html_proofer/element.rb +122 -0
  15. data/lib/html_proofer/failure.rb +17 -0
  16. data/lib/{html-proofer → html_proofer}/log.rb +0 -0
  17. data/lib/html_proofer/reporter/cli.rb +29 -0
  18. data/lib/html_proofer/reporter.rb +23 -0
  19. data/lib/html_proofer/runner.rb +245 -0
  20. data/lib/html_proofer/url_validator/external.rb +189 -0
  21. data/lib/html_proofer/url_validator/internal.rb +86 -0
  22. data/lib/html_proofer/url_validator.rb +16 -0
  23. data/lib/{html-proofer → html_proofer}/utils.rb +6 -9
  24. data/lib/{html-proofer → html_proofer}/version.rb +1 -1
  25. data/lib/html_proofer/xpath_functions.rb +10 -0
  26. data/lib/html_proofer.rb +56 -0
  27. metadata +44 -37
  28. data/lib/html-proofer/cache.rb +0 -194
  29. data/lib/html-proofer/check/favicon.rb +0 -29
  30. data/lib/html-proofer/check/html.rb +0 -37
  31. data/lib/html-proofer/check/images.rb +0 -48
  32. data/lib/html-proofer/check/links.rb +0 -182
  33. data/lib/html-proofer/check/opengraph.rb +0 -46
  34. data/lib/html-proofer/check/scripts.rb +0 -42
  35. data/lib/html-proofer/check.rb +0 -75
  36. data/lib/html-proofer/element.rb +0 -261
  37. data/lib/html-proofer/issue.rb +0 -65
  38. data/lib/html-proofer/middleware.rb +0 -82
  39. data/lib/html-proofer/runner.rb +0 -248
  40. data/lib/html-proofer/url_validator.rb +0 -237
@@ -1,248 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module HTMLProofer
4
- class Runner
5
- include HTMLProofer::Utils
6
-
7
- attr_reader :options, :internal_urls, :external_urls, :failures
8
-
9
- def initialize(src, opts = {})
10
- @src = src
11
-
12
- @options = HTMLProofer::Configuration::PROOFER_DEFAULTS.merge(opts)
13
-
14
- @options[:typhoeus] = HTMLProofer::Configuration::TYPHOEUS_DEFAULTS.merge(opts[:typhoeus] || {})
15
- @options[:hydra] = HTMLProofer::Configuration::HYDRA_DEFAULTS.merge(opts[:hydra] || {})
16
-
17
- @options[:parallel] = HTMLProofer::Configuration::PARALLEL_DEFAULTS.merge(opts[:parallel] || {})
18
- @options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.merge(opts[:validation] || {})
19
- @options[:cache] = HTMLProofer::Configuration::CACHE_DEFAULTS.merge(opts[:cache] || {})
20
-
21
- @type = @options.delete(:type)
22
- @logger = HTMLProofer::Log.new(@options[:log_level])
23
- @cache = Cache.new(@logger, @options[:cache])
24
- @internal_link_checks = nil
25
-
26
- # Add swap patterns for internal domains
27
- unless @options[:internal_domains].empty?
28
- @options[:internal_domains].each do |dom|
29
- @options[:url_swap][Regexp.new("^http://#{dom}")] = ''
30
- @options[:url_swap][Regexp.new("^https://#{dom}")] = ''
31
- @options[:url_swap][Regexp.new("^//#{dom}")] = ''
32
- end
33
- end
34
-
35
- @internal_urls = {}
36
- @internal_urls_to_paths = {}
37
- @external_urls = {}
38
- @failures = []
39
- @before_request = []
40
- end
41
-
42
- def run
43
- if @type == :links
44
- @logger.log :info, "Running #{checks} on #{@src}... \n\n"
45
- check_list_of_links unless @options[:disable_external]
46
- else
47
- @logger.log :info, "Running #{checks} on #{@src} on *#{@options[:extension]}... \n\n"
48
- check_files
49
- file_text = pluralize(files.length, 'file', 'files')
50
- @logger.log :info, "Ran on #{file_text}!\n\n"
51
- end
52
-
53
- if @failures.empty?
54
- @logger.log :info, 'HTML-Proofer finished successfully.'
55
- else
56
- print_failed_tests
57
- end
58
- end
59
-
60
- def check_list_of_links
61
- if @options[:url_swap]
62
- @src = @src.map do |url|
63
- swap(url, @options[:url_swap])
64
- end
65
- end
66
- @external_urls = @src.each_with_object({}) do |url, hash|
67
- hash[url] = nil
68
- end
69
- validate_external_urls
70
- end
71
-
72
- # Collects any external URLs found in a directory of files. Also collectes
73
- # every failed test from process_files.
74
- # Sends the external URLs to Typhoeus for batch processing.
75
- def check_files
76
- process_files.each do |item|
77
- @external_urls.merge!(item[:external_urls])
78
- @failures.concat(item[:failures])
79
- end
80
-
81
- # TODO: lazy. if we're checking only external links,
82
- # we'll just trash all the failed tests. really, we should
83
- # just not run those other checks at all.
84
- if @options[:external_only]
85
- @failures = []
86
- validate_external_urls
87
- elsif !@options[:disable_external]
88
- validate_external_urls
89
- validate_internal_urls
90
- else
91
- validate_internal_urls
92
- end
93
- end
94
-
95
- # Walks over each implemented check and runs them on the files, in parallel.
96
- def process_files
97
- if @options[:parallel].empty?
98
- files.map { |path| check_path(path) }
99
- else
100
- Parallel.map(files, @options[:parallel]) { |path| check_path(path) }
101
- end
102
- end
103
-
104
- def check_parsed(html, path)
105
- result = { external_urls: {}, failures: [] }
106
-
107
- @src = [@src] if @type == :file
108
-
109
- @src.each do |src|
110
- checks.each do |klass|
111
- @logger.log :debug, "Checking #{klass.to_s.downcase} on #{path} ..."
112
- check = Object.const_get(klass).new(src, path, html, @logger, @cache, @options)
113
- check.run
114
-
115
- if klass == 'LinkCheck'
116
- @internal_link_checks = check
117
- check.internal_urls.each_pair do |url, internal_urls|
118
- if @internal_urls_to_paths[url]
119
- @internal_urls_to_paths[url].concat(internal_urls.map(&:path))
120
- else
121
- @internal_urls_to_paths[url] = internal_urls.map(&:path)
122
- end
123
- end
124
- @internal_urls.merge!(check.internal_urls)
125
- end
126
-
127
- external_urls = check.external_urls
128
- external_urls = check.external_urls.map { |url, file| [swap(url, @options[:url_swap]), file] }.to_h if @options[:url_swap]
129
- result[:external_urls].merge!(external_urls)
130
- result[:failures].concat(check.issues)
131
- end
132
- end
133
- result
134
- end
135
-
136
- def check_path(path)
137
- check_parsed(create_nokogiri(path), path)
138
- end
139
-
140
- def validate_external_urls
141
- url_validator = HTMLProofer::UrlValidator.new(@logger, @cache, @external_urls, @options)
142
- url_validator.before_request = @before_request
143
- @failures.concat(url_validator.run)
144
- @external_urls = url_validator.external_urls
145
- end
146
-
147
- def validate_internal_urls
148
- if @cache.use_cache?
149
- urls_to_check = load_internal_cache
150
-
151
- urls_to_check.each_pair do |url, internal_urls|
152
- # pulled from cache
153
- internal_urls = @internal_urls[url] unless internal_urls.first.is_a?(LinkCheck::InternalLink)
154
-
155
- result = @internal_link_checks.check_internal_link(internal_urls.first.link, internal_urls.first.path, internal_urls.first.line, internal_urls.first.content)
156
- code = result ? 200 : 404
157
- @cache.add(url, @internal_urls_to_paths[url].sort, code, '') # TODO: blank msg for now
158
- end
159
- @cache.write
160
- else
161
- @internal_urls.values.flatten.each do |internal_url|
162
- result = @internal_link_checks.check_internal_link(internal_url.link, internal_url.path, internal_url.line, internal_url.content)
163
- next if result
164
-
165
- @failures.concat(@internal_link_checks.issues) unless @internal_link_checks.issues.length.zero?
166
- end
167
- end
168
- end
169
-
170
- def files
171
- @files ||= if @type == :directory
172
- @src.map do |src|
173
- pattern = File.join(src, '**', "*#{@options[:extension]}")
174
- files = Dir.glob(pattern).select { |fn| File.file? fn }
175
- files.reject { |f| ignore_file?(f) }
176
- end.flatten
177
- elsif @type == :file && File.extname(@src) == @options[:extension]
178
- [@src].reject { |f| ignore_file?(f) }
179
- else
180
- []
181
- end
182
- end
183
-
184
- def ignore_file?(file)
185
- @options[:file_ignore].each do |pattern|
186
- return true if pattern.is_a?(String) && pattern == file
187
- return true if pattern.is_a?(Regexp) && pattern =~ file
188
- end
189
-
190
- false
191
- end
192
-
193
- def checks
194
- return @checks if defined?(@checks) && !@checks.nil?
195
-
196
- return (@checks = ['LinkCheck']) if @type == :links
197
-
198
- @checks = HTMLProofer::Check.subchecks.map(&:name)
199
- @checks.delete('FaviconCheck') unless @options[:check_favicon]
200
- @checks.delete('HtmlCheck') unless @options[:check_html]
201
- @checks.delete('OpenGraphCheck') unless @options[:check_opengraph]
202
- @options[:checks_to_ignore].each { |ignored| @checks.delete(ignored) }
203
- @checks
204
- end
205
-
206
- def failed_tests
207
- result = []
208
- return result if @failures.empty?
209
-
210
- @failures.each { |f| result << f.to_s }
211
- result
212
- end
213
-
214
- def print_failed_tests
215
- sorted_failures = SortedIssues.new(@failures, @options[:error_sort], @logger)
216
-
217
- sorted_failures.sort_and_report
218
- count = @failures.length
219
- failure_text = pluralize(count, 'failure', 'failures')
220
- @logger.log :fatal, "\nHTML-Proofer found #{failure_text}!"
221
- exit 1
222
- end
223
-
224
- # Set before_request callback.
225
- #
226
- # @example Set before_request.
227
- # request.before_request { |request| p "yay" }
228
- #
229
- # @param [ Block ] block The block to execute.
230
- #
231
- # @yield [ Typhoeus::Request ]
232
- #
233
- # @return [ Array<Block> ] All before_request blocks.
234
- def before_request(&block)
235
- @before_request ||= []
236
- @before_request << block if block
237
- @before_request
238
- end
239
-
240
- def load_internal_cache
241
- urls_to_check = @cache.retrieve_urls(@internal_urls, :internal)
242
- cache_text = pluralize(urls_to_check.count, 'internal link', 'internal links')
243
- @logger.log :info, "Found #{cache_text} in the cache..."
244
-
245
- urls_to_check
246
- end
247
- end
248
- end
@@ -1,237 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'typhoeus'
4
- require 'uri'
5
- require_relative './utils'
6
- require_relative './cache'
7
-
8
- module HTMLProofer
9
- class UrlValidator
10
- include HTMLProofer::Utils
11
-
12
- attr_reader :external_urls
13
- attr_writer :before_request
14
-
15
- def initialize(logger, cache, external_urls, options)
16
- @logger = logger
17
- @external_urls = external_urls
18
- @failed_tests = []
19
- @options = options
20
- @hydra = Typhoeus::Hydra.new(@options[:hydra])
21
- @cache = cache
22
- @before_request = []
23
- end
24
-
25
- def run
26
- @external_urls = remove_query_values
27
-
28
- if @cache.use_cache?
29
- urls_to_check = @cache.retrieve_urls(@external_urls, :external)
30
- external_link_checker(urls_to_check)
31
- @cache.write
32
- else
33
- external_link_checker(@external_urls)
34
- end
35
-
36
- @failed_tests
37
- end
38
-
39
- def remove_query_values
40
- return nil if @external_urls.nil?
41
-
42
- paths_with_queries = {}
43
- iterable_external_urls = @external_urls.dup
44
- @external_urls.each_key do |url|
45
- uri = begin
46
- Addressable::URI.parse(url)
47
- rescue URI::Error, Addressable::URI::InvalidURIError
48
- @logger.log :error, "#{url} is an invalid URL"
49
- nil
50
- end
51
- next if uri.nil? || uri.query.nil?
52
-
53
- iterable_external_urls.delete(url) unless new_url_query_values?(uri, paths_with_queries)
54
- end
55
- iterable_external_urls
56
- end
57
-
58
- # remember queries we've seen, ignore future ones
59
- def new_url_query_values?(uri, paths_with_queries)
60
- queries = uri.query_values.keys.join('-')
61
- domain_path = extract_domain_path(uri)
62
- if paths_with_queries[domain_path].nil?
63
- paths_with_queries[domain_path] = [queries]
64
- true
65
- elsif !paths_with_queries[domain_path].include?(queries)
66
- paths_with_queries[domain_path] << queries
67
- true
68
- else
69
- false
70
- end
71
- end
72
-
73
- def extract_domain_path(uri)
74
- uri.host + uri.path
75
- end
76
-
77
- # Proofer runs faster if we pull out all the external URLs and run the checks
78
- # at the end. Otherwise, we're halting the consuming process for every file during
79
- # `process_files`.
80
- #
81
- # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
82
- #
83
- # Finally, we'll first make a HEAD request, rather than GETing all the contents.
84
- # If the HEAD fails, we'll fall back to GET, as some servers are not configured
85
- # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
86
- # not available as an option.
87
- def external_link_checker(external_urls)
88
- external_urls = external_urls.sort.to_h
89
-
90
- count = external_urls.length
91
- check_text = pluralize(count, 'external link', 'external links')
92
- @logger.log :info, "Checking #{check_text}..."
93
-
94
- # Route log from Typhoeus/Ethon to our own logger
95
- Ethon.logger = @logger
96
-
97
- establish_queue(external_urls)
98
-
99
- @hydra.run
100
- end
101
-
102
- def establish_queue(external_urls)
103
- external_urls.each_pair do |url, filenames|
104
- url = begin
105
- clean_url(url)
106
- rescue URI::Error, Addressable::URI::InvalidURIError
107
- add_external_issue(filenames, "#{url} is an invalid URL")
108
- next
109
- end
110
-
111
- method = if hash?(url) && @options[:check_external_hash]
112
- :get
113
- else
114
- :head
115
- end
116
- queue_request(method, url, filenames)
117
- end
118
- end
119
-
120
- def clean_url(href)
121
- # catch any obvious issues, like strings in port numbers
122
- parsed = Addressable::URI.parse(href)
123
- if href =~ /^([!#{Regexp.last_match(0)}-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/
124
- href
125
- else
126
- parsed.normalize
127
- end
128
- end
129
-
130
- def queue_request(method, href, filenames)
131
- opts = @options[:typhoeus].merge(method: method)
132
- request = Typhoeus::Request.new(href, opts)
133
- @before_request.each do |callback|
134
- callback.call(request)
135
- end
136
- request.on_complete { |response| response_handler(response, filenames) }
137
- @hydra.queue request
138
- end
139
-
140
- def response_handler(response, filenames)
141
- effective_url = response.options[:effective_url]
142
- href = response.request.base_url.to_s
143
- method = response.request.options[:method]
144
- response_code = response.code
145
- response.body.delete!("\x00")
146
-
147
- debug_msg = if filenames.nil?
148
- "Received a #{response_code} for #{href}"
149
- else
150
- "Received a #{response_code} for #{href} in #{filenames.join(' ')}"
151
- end
152
-
153
- @logger.log :debug, debug_msg
154
-
155
- return if @options[:http_status_ignore].include?(response_code)
156
-
157
- if response_code.between?(200, 299)
158
- @cache.add(href, filenames, response_code) unless check_hash_in_2xx_response(href, effective_url, response, filenames)
159
- elsif response.timed_out?
160
- handle_timeout(href, filenames, response_code)
161
- elsif response_code.zero?
162
- handle_failure(effective_url, filenames, response_code, response.return_message)
163
- elsif method == :head
164
- queue_request(:get, href, filenames)
165
- else
166
- return if @options[:only_4xx] && !response_code.between?(400, 499)
167
-
168
- # Received a non-successful http response.
169
- msg = "External link #{href} failed: #{response_code} #{response.return_message}"
170
- add_external_issue(filenames, msg, response_code)
171
- @cache.add(href, filenames, response_code, msg)
172
- end
173
- end
174
-
175
- # Even though the response was a success, we may have been asked to check
176
- # if the hash on the URL exists on the page
177
- def check_hash_in_2xx_response(href, effective_url, response, filenames)
178
- return false if @options[:only_4xx]
179
- return false unless @options[:check_external_hash]
180
- return false unless (hash = hash?(href))
181
-
182
- body_doc = create_nokogiri(response.body)
183
-
184
- unencoded_hash = Addressable::URI.unescape(hash)
185
- xpath = [%(//*[@name="#{hash}"]|/*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
186
- # user-content is a special addition by GitHub.
187
- if URI.parse(href).host =~ /github\.com/i
188
- xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])]
189
- # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
190
- # will be identified as a linkable portion
191
- xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d)+/
192
- end
193
-
194
- return unless body_doc.xpath(xpath.join('|')).empty?
195
-
196
- msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
197
- add_external_issue(filenames, msg, response.code)
198
- @cache.add(href, filenames, response.code, msg)
199
- true
200
- end
201
-
202
- def handle_timeout(href, filenames, response_code)
203
- msg = "External link #{href} failed: got a time out (response code #{response_code})"
204
- @cache.add(href, filenames, 0, msg)
205
- return if @options[:only_4xx]
206
-
207
- add_external_issue(filenames, msg, response_code)
208
- end
209
-
210
- def handle_failure(href, filenames, response_code, return_message)
211
- msg = "External link #{href} failed: response code #{response_code} means something's wrong.
212
- It's possible libcurl couldn't connect to the server or perhaps the request timed out.
213
- Sometimes, making too many requests at once also breaks things.
214
- Either way, the return message (if any) from the server is: #{return_message}"
215
- @cache.add(href, filenames, 0, msg)
216
- return if @options[:only_4xx]
217
-
218
- add_external_issue(filenames, msg, response_code)
219
- end
220
-
221
- def add_external_issue(filenames, desc, status = nil)
222
- # possible if we're checking an array of links
223
- if filenames.nil?
224
- @failed_tests << Issue.new('', desc, status: status)
225
- else
226
- filenames.each { |f| @failed_tests << Issue.new(f, desc, status: status) }
227
- end
228
- end
229
-
230
- # Does the URL have a hash?
231
- def hash?(url)
232
- URI.parse(url).fragment
233
- rescue URI::InvalidURIError
234
- false
235
- end
236
- end
237
- end