html-proofer 2.6.4 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,49 @@
1
+ require 'yell'
2
+ require 'colored'
3
+
4
module HTMLProofer
  # Thin wrapper around Yell that colorizes messages by severity and splits
  # output between stdout (debug/info/warn) and stderr (error/fatal).
  class Log
    include Yell::Loggable

    # log_level - minimum severity to emit (e.g. :info); anything below it
    #             is filtered by Yell's "gte" level expression.
    def initialize(log_level)
      @logger = Yell.new(:format => false,
                         :name => 'HTMLProofer',
                         :level => "gte.#{log_level}") do |l|
        l.adapter :stdout, :level => [:debug, :info, :warn]
        l.adapter :stderr, :level => [:error, :fatal]
      end
    end

    # Logs `message` at `level`, choosing a color per severity.
    def log(level, message)
      level_colors = {
        :debug => :light_blue,
        :info  => :blue,
        :warn  => :yellow,
        :error => :red,
        :fatal => :red
      }
      log_with_color(level, level_colors[level], message)
    end

    # Logs `message` at `level` with an explicitly chosen color.
    def log_with_color(level, color, message)
      @logger.send level, colorize(color, message)
    end

    # Wraps `message` in ANSI color codes, but only when both output streams
    # are interactive terminals (so piped/redirected output stays plain).
    def colorize(color, message)
      interactive = $stdout.isatty && $stderr.isatty
      return message unless interactive
      Colored.colorize(message, foreground: color)
    end

    # dumb override to play nice with Typhoeus/Ethon
    def debug(message = nil)
      return if message.nil?
      log(:debug, message)
    end
  end
end
@@ -0,0 +1,160 @@
1
module HTMLProofer
  # Drives a full proofer run: builds the file list, executes every enabled
  # check class against each file, and hands the collected external URLs to
  # UrlValidator for batch HTTP checking.
  class Runner
    include HTMLProofer::Utils

    attr_reader :options, :external_urls

    # src  - a directory list, a single file, or an array of links,
    #        depending on opts[:type].
    # opts - user options, merged over the configuration defaults.
    def initialize(src, opts = {})
      @src = src

      @options = HTMLProofer::Configuration::PROOFER_DEFAULTS.merge(opts)

      # Nested option hashes get their defaults merged separately so a
      # partial user hash doesn't clobber the remaining default keys.
      @options[:typhoeus] = HTMLProofer::Configuration::TYPHOEUS_DEFAULTS.merge(opts[:typhoeus] || {})
      @options[:hydra] = HTMLProofer::Configuration::HYDRA_DEFAULTS.merge(opts[:hydra] || {})

      @options[:parallel] = HTMLProofer::Configuration::PARALLEL_DEFAULTS.merge(opts[:parallel] || {})
      @options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.merge(opts[:validation] || {})
      @options[:cache] = HTMLProofer::Configuration::CACHE_DEFAULTS.merge(opts[:cache] || {})

      @type = @options.delete(:type)
      @logger = HTMLProofer::Log.new(@options[:log_level])

      # The link cache needs somewhere on disk to write before the first run.
      if !@options[:cache].empty? && !File.exist?(STORAGE_DIR)
        FileUtils.mkdir_p(STORAGE_DIR)
      end

      @failures = []
    end

    # Runs the configured checks and either reports success or raises with a
    # summary of every failure.
    def run
      @logger.log :info, "Running #{checks} on #{@src} on *#{@options[:extension]}... \n\n"

      if @type == :links
        check_list_of_links unless @options[:disable_external]
      else
        check_files
        file_text = pluralize(files.length, 'file', 'files')
        @logger.log :info, "Ran on #{file_text}!\n\n"
      end

      if @failures.empty?
        @logger.log_with_color :info, :green, 'HTML-Proofer finished successfully.'
      else
        print_failed_tests
      end
    end

    # Validates a plain array of URLs (the :links run type), applying any
    # configured :url_swap substitutions first.
    def check_list_of_links
      if @options[:url_swap]
        @src = @src.map do |url|
          swap(url, @options[:url_swap])
        end
      end
      @external_urls = Hash[*@src.map { |s| [s, nil] }.flatten]
      validate_urls
    end

    # Collects any external URLs found in a directory of files. Also collects
    # every failed test from process_files.
    # Sends the external URLs to Typhoeus for batch processing.
    def check_files
      @external_urls = {}

      process_files.each do |item|
        @external_urls.merge!(item[:external_urls])
        @failures.concat(item[:failures])
      end

      # TODO: lazy. if we're checking only external links,
      # we'll just trash all the failed tests. really, we should
      # just not run those other checks at all.
      if @options[:external_only]
        @failures = []
        validate_urls
      elsif !@options[:disable_external]
        validate_urls
      end
    end

    # Walks over each implemented check and runs them on the files, in parallel.
    def process_files
      if @options[:parallel].empty?
        files.map { |path| check_path(path) }
      else
        Parallel.map(files, @options[:parallel]) { |path| check_path(path) }
      end
    end

    # Runs every enabled check class against a single file; returns a hash
    # with the external URLs found and the failures raised.
    def check_path(path)
      result = { :external_urls => {}, :failures => [] }
      html = create_nokogiri(path)

      # :file runs arrive with a bare string source; normalize to an array.
      @src = [@src] if @type == :file

      @src.each do |src|
        checks.each do |klass|
          @logger.log :debug, "Checking #{klass.to_s.downcase} on #{path} ..."
          check = Object.const_get(klass).new(src, path, html, @options)
          check.run
          result[:external_urls].merge!(check.external_urls)
          result[:failures].concat(check.issues)
        end
      end
      result
    end

    # Batch-checks the collected external URLs and appends any failures.
    def validate_urls
      url_validator = HTMLProofer::UrlValidator.new(@logger, @external_urls, @options)
      @failures.concat(url_validator.run)
      @external_urls = url_validator.external_urls
    end

    # The list of files to check, filtered by extension and the ignore list.
    # Memoized for the lifetime of the run.
    def files
      @files ||= if @type == :directory
                   @src.map do |src|
                     pattern = File.join(src, '**', "*#{@options[:extension]}")
                     files = Dir.glob(pattern).select { |fn| File.file? fn }
                     files.reject { |f| ignore_file?(f) }
                   end.flatten
                 elsif @type == :file && File.extname(@src) == @options[:extension]
                   [@src].reject { |f| ignore_file?(f) }
                 else
                   []
                 end
    end

    # True when `file` matches any :file_ignore pattern (exact string match
    # or regexp match).
    def ignore_file?(file)
      @options[:file_ignore].any? do |pattern|
        (pattern.is_a?(String) && pattern == file) ||
          (pattern.is_a?(Regexp) && pattern =~ file)
      end
    end

    # The check class names to run, honoring :check_favicon, :check_html and
    # :checks_to_ignore. Memoized.
    def checks
      @checks ||= begin
        list = HTMLProofer::Check.subchecks.map(&:name)
        list.delete('FaviconCheck') unless @options[:check_favicon]
        list.delete('HtmlCheck') unless @options[:check_html]
        @options[:checks_to_ignore].each { |ignored| list.delete(ignored) }
        list
      end
    end

    # Every collected failure rendered as a string.
    def failed_tests
      @failures.map(&:to_s)
    end

    # Reports each failure (sorted per :error_sort) and aborts the run by
    # raising with a pluralized summary.
    def print_failed_tests
      sorted_failures = SortedIssues.new(@failures, @options[:error_sort], @logger)

      sorted_failures.sort_and_report
      count = @failures.length
      failure_text = pluralize(count, 'failure', 'failures')
      fail @logger.colorize :red, "HTML-Proofer found #{failure_text}!"
    end
  end
end
@@ -0,0 +1,218 @@
1
+ require 'typhoeus'
2
+ require 'uri'
3
+ require_relative './utils'
4
+ require_relative './cache'
5
+
6
module HTMLProofer
  # Batch-checks external URLs over HTTP via Typhoeus, recording results in
  # the cache and collecting failures as Issue objects.
  class UrlValidator
    include HTMLProofer::Utils

    attr_reader :external_urls

    # logger        - an HTMLProofer::Log
    # external_urls - Hash of url => filenames (filenames may be nil when
    #                 checking a bare array of links)
    # options       - the merged proofer options
    def initialize(logger, external_urls, options)
      @logger = logger
      @external_urls = external_urls
      @failed_tests = []
      @options = options
      @hydra = Typhoeus::Hydra.new(@options[:hydra])
      @cache = Cache.new(@logger, @options[:cache])
    end

    # Checks every external URL (deduplicated by query string) and returns
    # the accumulated failures.
    def run
      @external_urls = remove_query_values

      if @cache.use_cache?
        urls_to_check = load_cache
        external_link_checker(urls_to_check)
        @cache.write
      else
        external_link_checker(@external_urls)
      end

      @failed_tests
    end

    # Drops URLs whose query values duplicate an already-seen URL for the
    # same domain+path; returns the trimmed hash.
    def remove_query_values
      return nil if @external_urls.nil?
      paths_with_queries = {}
      # Iterate a copy so deletions don't disturb the traversal.
      iterable_external_urls = @external_urls.dup
      @external_urls.keys.each do |url|
        uri = begin
          Addressable::URI.parse(url)
        rescue URI::Error, Addressable::URI::InvalidURIError
          @logger.log :error, "#{url} is an invalid URL"
          nil
        end
        next if uri.nil? || uri.query.nil?
        iterable_external_urls.delete(url) unless new_url_query_values?(uri, paths_with_queries)
      end
      iterable_external_urls
    end

    # remember queries we've seen, ignore future ones
    def new_url_query_values?(uri, paths_with_queries)
      queries = uri.query_values.keys.join('-')
      domain_path = extract_domain_path(uri)
      if paths_with_queries[domain_path].nil?
        paths_with_queries[domain_path] = [queries]
        true
      elsif !paths_with_queries[domain_path].include?(queries)
        paths_with_queries[domain_path] << queries
        true
      else
        false
      end
    end

    # Key used for query deduplication: host plus path, no scheme/query.
    def extract_domain_path(uri)
      uri.host + uri.path
    end

    # Reports the cache size and returns only the URLs still needing a check.
    def load_cache
      cache_count = @cache.size
      cache_text = pluralize(cache_count, 'link', 'links')

      @logger.log :info, "Found #{cache_text} in the cache..."

      @cache.retrieve_urls(@external_urls)
    end

    # Proofer runs faster if we pull out all the external URLs and run the checks
    # at the end. Otherwise, we're halting the consuming process for every file during
    # `process_files`.
    #
    # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
    #
    # Finally, we'll first make a HEAD request, rather than GETing all the contents.
    # If the HEAD fails, we'll fall back to GET, as some servers are not configured
    # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
    # not available as an option.
    def external_link_checker(external_urls)
      external_urls = Hash[external_urls.sort]

      count = external_urls.length
      check_text = pluralize(count, 'external link', 'external links')
      @logger.log :info, "Checking #{check_text}..."

      # Route log from Typhoeus/Ethon to our own logger
      Ethon.logger = @logger

      establish_queue(external_urls)

      @hydra.run
    end

    # Queues one request per URL; invalid URLs are recorded as issues and
    # skipped.
    def establish_queue(external_urls)
      external_urls.each_pair do |url, filenames|
        url = begin
          clean_url(url)
        rescue URI::Error, Addressable::URI::InvalidURIError
          add_external_issue(filenames, "#{url} is an invalid URL")
          next
        end

        # Hash checks must GET the body; everything else starts with HEAD.
        method = if hash?(url) && @options[:check_external_hash]
                   :get
                 else
                   :head
                 end
        queue_request(method, url, filenames)
      end
    end

    # Normalized (percent-encoding, case, etc.) form of `href`; may raise
    # on unparseable input.
    def clean_url(href)
      Addressable::URI.parse(href).normalize
    end

    def queue_request(method, href, filenames)
      opts = @options[:typhoeus].merge({ :method => method })
      request = Typhoeus::Request.new(href, opts)
      request.on_complete { |response| response_handler(response, filenames) }
      @hydra.queue request
    end

    # Typhoeus completion callback: classifies the response and records a
    # cache entry and/or failure.
    def response_handler(response, filenames)
      effective_url = response.options[:effective_url]
      href = response.request.base_url.to_s
      method = response.request.options[:method]
      response_code = response.code

      debug_msg = "Received a #{response_code} for #{href}"
      debug_msg << " in #{filenames.join(' ')}" unless filenames.nil?
      @logger.log :debug, debug_msg

      return if @options[:http_status_ignore].include?(response_code)

      if response_code.between?(200, 299)
        unless check_hash_in_2xx_response(href, effective_url, response, filenames)
          @cache.add(href, filenames, response_code)
        end
      elsif response.timed_out?
        handle_timeout(href, filenames, response_code)
      elsif response_code.zero?
        # libcurl reports 0 when no HTTP response was received at all.
        handle_failure(href, filenames, response_code)
      elsif method == :head
        # Some servers mishandle HEAD; retry once with GET before failing.
        queue_request(:get, href, filenames)
      else
        return if @options[:only_4xx] && !response_code.between?(400, 499)
        # Received a non-successful http response.
        msg = "External link #{href} failed: #{response_code} #{response.return_message}"
        add_external_issue(filenames, msg, response_code)
        @cache.add(href, filenames, response_code, msg)
      end
    end

    # Even though the response was a success, we may have been asked to check
    # if the hash on the URL exists on the page
    #
    # Returns true only when a hash check was performed AND the hash was
    # missing (an issue was recorded); false otherwise.
    def check_hash_in_2xx_response(href, effective_url, response, filenames)
      return false if @options[:only_4xx]
      return false unless @options[:check_external_hash]
      return false unless (hash = hash?(href))

      body_doc = create_nokogiri(response.body)

      # user-content is a special addition by GitHub.
      xpath = %(//*[@name="#{hash}"]|//*[@id="#{hash}"])
      if URI.parse(href).host.match(/github\.com/i)
        xpath << %(|//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])
      end

      # Keep the predicate boolean: the original returned bare nil here.
      return false unless body_doc.xpath(xpath).empty?

      msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
      add_external_issue(filenames, msg, response.code)
      @cache.add(href, filenames, response.code, msg)
      true
    end

    def handle_timeout(href, filenames, response_code)
      msg = "External link #{href} failed: got a time out (response code #{response_code})"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]
      add_external_issue(filenames, msg, response_code)
    end

    def handle_failure(href, filenames, response_code)
      msg = "External link #{href} failed: response code #{response_code} means something's wrong"
      @cache.add(href, filenames, 0, msg)
      return if @options[:only_4xx]
      add_external_issue(filenames, msg, response_code)
    end

    def add_external_issue(filenames, desc, status = nil)
      # possible if we're checking an array of links
      if filenames.nil?
        @failed_tests << Issue.new('', desc, status: status)
      else
        filenames.each { |f| @failed_tests << Issue.new(f, desc, status: status) }
      end
    end

    # Does the URL have a hash?
    # Returns the fragment string (truthy) when present, nil when absent,
    # false when the URL is unparseable.
    def hash?(url)
      URI.parse(url).fragment
    rescue URI::InvalidURIError
      false
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require 'nokogiri'
2
+
3
module HTMLProofer
  # Small helpers shared by the runner, checks and URL validator.
  module Utils
    # Directory where the link cache is persisted between runs.
    STORAGE_DIR = File.join('tmp', '.htmlproofer')

    # Returns "<count> <word>", e.g. pluralize(2, 'file', 'files') => "2 files".
    def pluralize(count, single, plural)
      "#{count} " << (count == 1 ? single : plural)
    end

    # Parses `path` into a Nokogiri HTML document. If `path` names an
    # existing file its contents are read; otherwise `path` itself is
    # treated as raw markup.
    def create_nokogiri(path)
      # File.read closes the handle immediately; the previous
      # File.open(path).read leaked it until garbage collection.
      content = File.exist?(path) ? File.read(path) : path

      Nokogiri::HTML(clean_content(content))
    end
    module_function :create_nokogiri

    # Applies each `link => replace` substitution in `replacement` to `href`
    # (string or regexp keys both work via gsub) and returns the result.
    def swap(href, replacement)
      replacement.each do |link, replace|
        href = href.gsub(link, replace)
      end
      href
    end
    module_function :swap

    # address a problem with Nokogiri's parsing URL entities
    # problem from http://git.io/vBYU1
    # solution from http://git.io/vBYUi
    def clean_content(string)
      string.gsub(%r{https?://([^>]+)}i) do |url|
        url.gsub(/&(?!amp;)/, '&amp;')
      end
    end
    module_function :clean_content
  end
end