html-proofer 2.6.4 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
+ require 'yell'
2
+ require 'colored'
3
+
4
module HTMLProofer
  # Thin wrapper around Yell that writes colorized messages, sending
  # debug/info/warn output to stdout and error/fatal output to stderr.
  class Log
    include Yell::Loggable

    # Foreground color used for each log level. :error and :fatal share
    # :red, mirroring their shared stderr adapter below.
    LEVEL_COLORS = {
      :debug => :light_blue,
      :info  => :blue,
      :warn  => :yellow,
      :error => :red,
      :fatal => :red
    }.freeze

    # log_level - the minimum severity to emit, e.g. :debug or :info
    def initialize(log_level)
      @logger = Yell.new(:format => false, :name => 'HTMLProofer', :level => "gte.#{log_level}") do |l|
        l.adapter :stdout, :level => [:debug, :info, :warn]
        l.adapter :stderr, :level => [:error, :fatal]
      end
    end

    # Emits message at the given level, colored per LEVEL_COLORS.
    def log(level, message)
      log_with_color(level, LEVEL_COLORS[level], message)
    end

    # Emits message at the given level using an explicitly chosen color.
    def log_with_color(level, color, message)
      @logger.send level, colorize(color, message)
    end

    # Wraps message in ANSI color codes, but only when both output
    # streams are TTYs so that redirected output stays plain text.
    def colorize(color, message)
      return message unless $stdout.isatty && $stderr.isatty
      Colored.colorize(message, foreground: color)
    end

    # dumb override to play nice with Typhoeus/Ethon
    def debug(message = nil)
      log(:debug, message) unless message.nil?
    end
  end
end
@@ -0,0 +1,160 @@
1
module HTMLProofer
  # Orchestrates a proofer run. Depending on the :type option it either
  # validates a plain list of links or walks files/directories, running
  # every enabled check and collecting failures for the final report.
  class Runner
    include HTMLProofer::Utils

    attr_reader :options, :external_urls

    # src  - the thing to check: directories, a single file, or an array
    #        of links, matching the :type option
    # opts - user-supplied options, merged over the shipped defaults
    def initialize(src, opts = {})
      @src = src

      @options = HTMLProofer::Configuration::PROOFER_DEFAULTS.merge(opts)

      # nested option hashes get their own defaults merged independently
      @options[:typhoeus] = HTMLProofer::Configuration::TYPHOEUS_DEFAULTS.merge(opts[:typhoeus] || {})
      @options[:hydra] = HTMLProofer::Configuration::HYDRA_DEFAULTS.merge(opts[:hydra] || {})

      @options[:parallel] = HTMLProofer::Configuration::PARALLEL_DEFAULTS.merge(opts[:parallel] || {})
      @options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.merge(opts[:validation] || {})
      @options[:cache] = HTMLProofer::Configuration::CACHE_DEFAULTS.merge(opts[:cache] || {})

      @type = @options.delete(:type)
      @logger = HTMLProofer::Log.new(@options[:log_level])

      # the cache needs a directory on disk to persist between runs
      if !@options[:cache].empty? && !File.exist?(STORAGE_DIR)
        FileUtils.mkdir_p(STORAGE_DIR)
      end

      @failures = []
    end

    # Runs the configured checks and reports the outcome. Raises (via
    # `fail` in print_failed_tests) when any failures were recorded.
    def run
      @logger.log :info, "Running #{checks} on #{@src} on *#{@options[:extension]}... \n\n"

      if @type == :links
        check_list_of_links unless @options[:disable_external]
      else
        check_files
        file_text = pluralize(files.length, 'file', 'files')
        @logger.log :info, "Ran on #{file_text}!\n\n"
      end

      if @failures.empty?
        @logger.log_with_color :info, :green, 'HTML-Proofer finished successfully.'
      else
        print_failed_tests
      end
    end

    # Validates @src as a plain list of URLs (after optional :url_swap
    # rewriting), with no file parsing involved.
    def check_list_of_links
      if @options[:url_swap]
        @src = @src.map do |url|
          swap(url, @options[:url_swap])
        end
      end
      # url => nil: there are no source filenames for a bare link list
      @external_urls = Hash[*@src.map { |s| [s, nil] }.flatten]
      validate_urls
    end

    # Collects any external URLs found in a directory of files. Also collects
    # every failed test from process_files.
    # Sends the external URLs to Typhoeus for batch processing.
    def check_files
      @external_urls = {}

      process_files.each do |item|
        @external_urls.merge!(item[:external_urls])
        @failures.concat(item[:failures])
      end

      # TODO: lazy. if we're checking only external links,
      # we'll just trash all the failed tests. really, we should
      # just not run those other checks at all.
      if @options[:external_only]
        @failures = []
        validate_urls
      elsif !@options[:disable_external]
        validate_urls
      end
    end

    # Walks over each implemented check and runs them on the files, in parallel.
    def process_files
      if @options[:parallel].empty?
        files.map { |path| check_path(path) }
      else
        Parallel.map(files, @options[:parallel]) { |path| check_path(path) }
      end
    end

    # Runs every enabled check against one file. Returns a hash with the
    # external URLs discovered and the failures produced for that file.
    def check_path(path)
      result = { :external_urls => {}, :failures => [] }
      html = create_nokogiri(path)

      # normalize the single-file case so the loop below always iterates
      @src = [@src] if @type == :file

      @src.each do |src|
        checks.each do |klass|
          @logger.log :debug, "Checking #{klass.to_s.downcase} on #{path} ..."
          # checks are stored as class names; resolve and instantiate here
          check = Object.const_get(klass).new(src, path, html, @options)
          check.run
          result[:external_urls].merge!(check.external_urls)
          result[:failures].concat(check.issues)
        end
      end
      result
    end

    # Batch-checks all collected external URLs, folding the resulting
    # issues into @failures and refreshing @external_urls.
    def validate_urls
      url_validator = HTMLProofer::UrlValidator.new(@logger, @external_urls, @options)
      @failures.concat(url_validator.run)
      @external_urls = url_validator.external_urls
    end

    # The list of files to proof, honoring :extension and :file_ignore.
    # Memoized; empty for link-only runs or a non-matching single file.
    def files
      @files ||= if @type == :directory
                   @src.map do |src|
                     pattern = File.join(src, '**', "*#{@options[:extension]}")
                     files = Dir.glob(pattern).select { |fn| File.file? fn }
                     files.reject { |f| ignore_file?(f) }
                   end.flatten
                 elsif @type == :file && File.extname(@src) == @options[:extension]
                   [@src].reject { |f| ignore_file?(f) }
                 else
                   []
                 end
    end

    # True when file matches any :file_ignore entry — exact match for
    # String entries, regexp match for Regexp entries.
    def ignore_file?(file)
      @options[:file_ignore].each do |pattern|
        return true if pattern.is_a?(String) && pattern == file
        return true if pattern.is_a?(Regexp) && pattern =~ file
      end

      false
    end

    # Class names of the checks to run, filtered by the favicon/html
    # toggles and :checks_to_ignore. Memoized.
    def checks
      return @checks unless @checks.nil?
      @checks = HTMLProofer::Check.subchecks.map(&:name)
      @checks.delete('FaviconCheck') unless @options[:check_favicon]
      @checks.delete('HtmlCheck') unless @options[:check_html]
      @options[:checks_to_ignore].each { |ignored| @checks.delete(ignored) }
      @checks
    end

    # Stringified copies of every recorded failure.
    def failed_tests
      result = []
      return result if @failures.empty?
      @failures.each { |f| result << f.to_s }
      result
    end

    # Sorts and prints all failures, then aborts the run with a colored
    # summary line via Kernel#fail.
    def print_failed_tests
      sorted_failures = SortedIssues.new(@failures, @options[:error_sort], @logger)

      sorted_failures.sort_and_report
      count = @failures.length
      failure_text = pluralize(count, 'failure', 'failures')
      fail @logger.colorize :red, "HTML-Proofer found #{failure_text}!"
    end
  end
end
@@ -0,0 +1,218 @@
1
+ require 'typhoeus'
2
+ require 'uri'
3
+ require_relative './utils'
4
+ require_relative './cache'
5
+
6
module HTMLProofer
  # Checks every collected external URL over HTTP via Typhoeus/Hydra,
  # recording failures as Issues and caching outcomes when enabled.
  class UrlValidator
    include HTMLProofer::Utils

    attr_reader :external_urls

    # logger        - an HTMLProofer::Log
    # external_urls - Hash of url => filenames (nil filenames are allowed
    #                 when checking a bare list of links)
    # options       - the merged proofer options
    def initialize(logger, external_urls, options)
      @logger = logger
      @external_urls = external_urls
      @failed_tests = []
      @options = options
      @hydra = Typhoeus::Hydra.new(@options[:hydra])
      @cache = Cache.new(@logger, @options[:cache])
    end

    # Runs the external-link check (through the cache when enabled) and
    # returns the accumulated failures.
    def run
      @external_urls = remove_query_values

      if @cache.use_cache?
        urls_to_check = load_cache
        external_link_checker(urls_to_check)
        @cache.write
      else
        external_link_checker(@external_urls)
      end

      @failed_tests
    end

    # Drops URLs that differ from an already-seen URL only by their query
    # values, so each host/path/query-key combination is checked once.
    # Unparseable URLs are logged and left in the returned hash.
    def remove_query_values
      return nil if @external_urls.nil?
      paths_with_queries = {}
      # dup so we can delete while iterating the original's keys
      iterable_external_urls = @external_urls.dup
      @external_urls.keys.each do |url|
        uri = begin
                Addressable::URI.parse(url)
              rescue URI::Error, Addressable::URI::InvalidURIError
                @logger.log :error, "#{url} is an invalid URL"
                nil
              end
        next if uri.nil? || uri.query.nil?
        iterable_external_urls.delete(url) unless new_url_query_values?(uri, paths_with_queries)
      end
      iterable_external_urls
    end

    # remember queries we've seen, ignore future ones
    # Returns true when this host/path has not yet been seen with this
    # set of query keys (and records it), false otherwise.
    def new_url_query_values?(uri, paths_with_queries)
      queries = uri.query_values.keys.join('-')
      domain_path = extract_domain_path(uri)
      if paths_with_queries[domain_path].nil?
        paths_with_queries[domain_path] = [queries]
        true
      elsif !paths_with_queries[domain_path].include?(queries)
        paths_with_queries[domain_path] << queries
        true
      else
        false
      end
    end

    # host + path, used as the key for query de-duplication above.
    def extract_domain_path(uri)
      uri.host + uri.path
    end

    # Reports the cache size, then returns only the URLs that still need
    # a live check according to the cache.
    def load_cache
      cache_count = @cache.size
      cache_text = pluralize(cache_count, 'link', 'links')

      @logger.log :info, "Found #{cache_text} in the cache..."

      @cache.retrieve_urls(@external_urls)
    end

    # Proofer runs faster if we pull out all the external URLs and run the checks
    # at the end. Otherwise, we're halting the consuming process for every file during
    # `process_files`.
    #
    # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
    #
    # Finally, we'll first make a HEAD request, rather than GETing all the contents.
    # If the HEAD fails, we'll fall back to GET, as some servers are not configured
    # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
    # not available as an option.
    def external_link_checker(external_urls)
      external_urls = Hash[external_urls.sort]

      count = external_urls.length
      check_text = pluralize(count, 'external link', 'external links')
      @logger.log :info, "Checking #{check_text}..."

      # Route log from Typhoeus/Ethon to our own logger
      Ethon.logger = @logger

      establish_queue(external_urls)

      @hydra.run
    end

    # Queues one request per URL: GET when a hash fragment must be
    # verified, HEAD otherwise. Invalid URLs become issues and are skipped.
    def establish_queue(external_urls)
      external_urls.each_pair do |url, filenames|
        url = begin
                clean_url(url)
              rescue URI::Error, Addressable::URI::InvalidURIError
                add_external_issue(filenames, "#{url} is an invalid URL")
                next
              end

        method = if hash?(url) && @options[:check_external_hash]
                   :get
                 else
                   :head
                 end
        queue_request(method, url, filenames)
      end
    end

    # Normalized form of href (percent-encoding, case, etc. per Addressable).
    def clean_url(href)
      Addressable::URI.parse(href).normalize
    end

    # Builds a Typhoeus request for href and adds it to the hydra queue;
    # response_handler runs on completion.
    def queue_request(method, href, filenames)
      opts = @options[:typhoeus].merge({ :method => method })
      request = Typhoeus::Request.new(href, opts)
      request.on_complete { |response| response_handler(response, filenames) }
      @hydra.queue request
    end

    # Classifies a completed response: 2xx success (with optional hash
    # verification), timeout, connection-level failure, HEAD→GET retry,
    # or a recorded HTTP failure. Ignored status codes short-circuit.
    def response_handler(response, filenames)
      effective_url = response.options[:effective_url]
      href = response.request.base_url.to_s
      method = response.request.options[:method]
      response_code = response.code

      debug_msg = "Received a #{response_code} for #{href}"
      debug_msg << " in #{filenames.join(' ')}" unless filenames.nil?
      @logger.log :debug, debug_msg

      return if @options[:http_status_ignore].include?(response_code)

      if response_code.between?(200, 299)
        # check_hash_in_2xx_response returns truthy when it already
        # recorded (and cached) a hash failure; only cache success here
        unless check_hash_in_2xx_response(href, effective_url, response, filenames)
          @cache.add(href, filenames, response_code)
        end
      elsif response.timed_out?
        handle_timeout(href, filenames, response_code)
      elsif response_code == 0
        # NOTE(review): a 0 code typically means no HTTP response was
        # received at all (connection-level failure) — confirm w/ libcurl
        handle_failure(href, filenames, response_code)
      elsif method == :head
        # some servers don't support HEAD; retry the same URL with GET
        queue_request(:get, href, filenames)
      else
        return if @options[:only_4xx] && !response_code.between?(400, 499)
        # Received a non-successful http response.
        msg = "External link #{href} failed: #{response_code} #{response.return_message}"
        add_external_issue(filenames, msg, response_code)
        @cache.add(href, filenames, response_code, msg)
      end
    end

    # Even though the response was a success, we may have been asked to check
    # if the hash on the URL exists on the page
    # Returns true when a hash failure was recorded; false/nil otherwise
    # (the early `return unless ... empty?` yields nil, which the caller
    # treats as falsy — i.e. "hash found, no failure").
    def check_hash_in_2xx_response(href, effective_url, response, filenames)
      return false if @options[:only_4xx]
      return false unless @options[:check_external_hash]
      return false unless (hash = hash?(href))

      body_doc = create_nokogiri(response.body)

      # user-content is a special addition by GitHub.
      xpath = %(//*[@name="#{hash}"]|//*[@id="#{hash}"])
      if URI.parse(href).host.match(/github\.com/i)
        xpath << %(|//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])
      end

      return unless body_doc.xpath(xpath).empty?

      msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
      add_external_issue(filenames, msg, response.code)
      @cache.add(href, filenames, response.code, msg)
      true
    end

    # Records (and caches, under code 0) a timed-out request.
    def handle_timeout(href, filenames, response_code)
      msg = "External link #{href} failed: got a time out (response code #{response_code})"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]
      add_external_issue(filenames, msg, response_code)
    end

    # Records (and caches, under code 0) a connection-level failure.
    def handle_failure(href, filenames, response_code)
      msg = "External link #{href} failed: response code #{response_code} means something's wrong"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]
      add_external_issue(filenames, msg, response_code)
    end

    # Appends one Issue per affected filename (or a single filename-less
    # Issue when filenames is nil).
    def add_external_issue(filenames, desc, status = nil)
      # possible if we're checking an array of links
      if filenames.nil?
        @failed_tests << Issue.new('', desc, status: status)
      else
        filenames.each { |f| @failed_tests << Issue.new(f, desc, status: status) }
      end
    end

    # Does the URL have a hash?
    # Returns the fragment string (truthy), nil when absent, or false
    # when the URL cannot be parsed.
    def hash?(url)
      URI.parse(url).fragment
    rescue URI::InvalidURIError
      false
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require 'nokogiri'
2
+
3
module HTMLProofer
  # Helper methods mixed into (and callable directly on) several
  # HTMLProofer classes.
  module Utils
    # Directory where cache data is persisted between runs.
    STORAGE_DIR = File.join('tmp', '.htmlproofer')

    # Returns "<count> <noun>", using the singular noun only when count
    # is exactly 1.
    def pluralize(count, single, plural)
      noun = count == 1 ? single : plural
      "#{count} " << noun
    end

    # Builds a Nokogiri document from a file path, or from the raw
    # markup itself when the argument is not an existing file.
    def create_nokogiri(path)
      content = File.exist?(path) ? File.open(path).read : path
      Nokogiri::HTML(clean_content(content))
    end
    module_function :create_nokogiri

    # Applies every link => replace pair in replacement to href via gsub
    # and returns the rewritten string.
    def swap(href, replacement)
      replacement.reduce(href) do |result, (link, replace)|
        result.gsub(link, replace)
      end
    end
    module_function :swap

    # address a problem with Nokogiri's parsing URL entities
    # problem from http://git.io/vBYU1
    # solution from http://git.io/vBYUi
    def clean_content(string)
      string.gsub(%r{https?://([^>]+)}i) do |match|
        match.gsub(/&(?!amp;)/, '&amp;')
      end
    end
    module_function :clean_content
  end
end