html-proofer 2.6.4 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,168 +0,0 @@
1
- require 'addressable/uri'
2
- require_relative './utils'
3
-
4
module HTML
  class Proofer
    # Represents the superclass from which all checks derive.
    #
    # A Checkable wraps one parsed node. Every attribute on the node is copied
    # into an instance variable (with `-`, `:` and `.` replaced by `_`), so an
    # `href` attribute becomes `@href`, `data-proofer-ignore` becomes
    # `@data_proofer_ignore`, and so on. The remaining methods answer questions
    # about the URL that node points at.
    class Checkable
      include HTML::Proofer::Utils

      attr_reader :line

      # obj   - the node; must respond to #attributes (a name => attr mapping
      #         whose values respond to #value), #content and #line.
      # check - the check that owns this node; its #options, #src, #path and
      #         the various *_ignores readers are consulted by the methods below.
      def initialize(obj, check)
        obj.attributes.each_pair do |attribute, value|
          store_attribute(attribute, value.value)
        end

        @text = obj.content
        @check = check
        @checked_paths = {}
        @type = self.class.name
        @line = obj.line

        if @href && @check.options[:href_swap]
          # `swap` is not defined in this class; presumably provided by Utils.
          @href = swap(@href, @check.options[:href_swap])
        end

        # fix up missing protocols
        @href.insert 0, 'http:' if protocol_relative?(@href)
        @src.insert 0, 'http:' if protocol_relative?(@src)
      end

      # The URL this node refers to: src, then srcset, then href, else ''.
      def url
        @src || @srcset || @href || ''
      end

      # True when the URL could be parsed.
      def valid?
        !parts.nil?
      end

      # The parsed URL, or nil when parsing raised. A failed parse is not
      # memoized, so it is attempted again on the next call.
      def parts
        @parts ||= Addressable::URI.parse url
      rescue URI::Error, Addressable::URI::InvalidURIError
        @parts = nil
      end

      # The percent-decoded path component, or nil for an unparsable URL.
      def path
        Addressable::URI.unencode parts.path unless parts.nil?
      end

      # The URL fragment, or nil.
      #
      # NOTE(review): this shadows Object#hash, so instances are not usable as
      # Hash keys. The name is kept because callers may depend on it.
      def hash
        parts.fragment unless parts.nil?
      end

      # The URL scheme, or nil.
      def scheme
        parts.scheme unless parts.nil?
      end

      # path is to an external server
      def remote?
        %w( http https ).include? scheme
      end

      # True for URLs that have a scheme other than http/https.
      def non_http_remote?
        !scheme.nil? && !remote?
      end

      # Whether this node should be skipped entirely. Always returns true or
      # false.
      def ignore?
        return true if @data_proofer_ignore

        # ignore base64 encoded images
        if %w(ImageCheckable FaviconCheckable).include? @type
          return true if url.start_with?('data:image')
        end

        # ignore user defined URLs
        return true if ignores_pattern_check(@check.url_ignores)

        # ignore user defined hrefs
        if 'LinkCheckable' == @type
          return true if ignores_pattern_check(@check.href_ignores)
        end

        # ignore user defined alts
        if 'ImageCheckable' == @type
          return true if ignores_pattern_check(@check.alt_ignores)
        end

        false
      end

      def ignore_empty_alt?
        @check.empty_alt_ignore
      end

      def allow_hash_href?
        @check.allow_hash_href
      end

      # path is external to the file
      def external?
        !internal?
      end

      # path is an anchor or a query
      def internal?
        url.start_with? '#', '?'
      end

      # Resolves the URL's path to a location on disk, or nil when the URL has
      # no usable path. Directories get the configured index file appended
      # unless they count as an "unslashed directory" (see #unslashed_directory?).
      def file_path
        relative = path
        return if relative.nil?

        if relative.start_with?('/') # path relative to root
          base = File.directory?(@check.src) ? @check.src : File.dirname(@check.src)
        elsif File.exist?(File.expand_path relative, @check.src) # relative links, path is a file
          base = File.dirname @check.path
        elsif File.exist?(File.join(File.dirname(@check.path), relative)) # relative links in nested dir, path is a file
          base = File.dirname @check.path
        else # relative link, path is a directory
          base = @check.path
        end

        file = File.join base, relative

        # implicit index support
        if File.directory?(file) && !unslashed_directory?(file)
          file = File.join file, @check.options[:directory_index_file]
        end

        file
      end

      # checks if a file exists relative to the current pwd
      def exists?
        return @checked_paths[absolute_path] if @checked_paths.key? absolute_path
        @checked_paths[absolute_path] = File.exist? absolute_path
      end

      # The resolved file path (or the checked file itself when the URL has no
      # path), expanded against the current working directory.
      def absolute_path
        path = file_path || @check.path
        File.expand_path path, Dir.pwd
      end

      # True when the URL equals one of the String entries or matches one of
      # the Regexp entries in links. Entries of any other type are skipped.
      def ignores_pattern_check(links)
        links.each do |ignore|
          if ignore.is_a? String
            return true if ignore == url
          elsif ignore.is_a? Regexp
            return true if ignore =~ url
          end
        end

        false
      end

      # True for an existing directory that is referenced without a trailing
      # separator while redirects are not being followed.
      def unslashed_directory?(file)
        File.directory?(file) && !file.end_with?(File::SEPARATOR) && !follow_location?
      end

      def follow_location?
        @check.typhoeus_opts && @check.typhoeus_opts[:followlocation]
      end

      private

      # Returns attr as a String, or nil when it is nil or empty.
      def real_attr(attr)
        attr.to_s unless attr.nil? || attr.empty?
      end

      # Copies one node attribute into an instance variable. Attribute names
      # that cannot form a legal instance variable name (for example `@click`
      # or `(click)`) are skipped instead of aborting the whole check.
      def store_attribute(name, value)
        instance_variable_set("@#{name.tr('-:.', '_')}".to_sym, value)
      rescue NameError
        nil
      end

      # True when value is a String that begins with `//`. Only the very start
      # of the string counts, since the protocol is prepended at index 0.
      def protocol_relative?(value)
        value.is_a?(String) && value.start_with?('//')
      end
    end
  end
end
@@ -1,46 +0,0 @@
1
- # encoding: utf-8
2
-
3
# Reports the parse errors recorded on @html, minus the ones that are only
# raised because the parser does not know newer tags.
class HtmlCheck < ::HTML::Proofer::CheckRunner
  # new html5 tags (source: http://www.w3schools.com/html/html5_new_elements.asp)
  # and svg child tags (source: https://developer.mozilla.org/en-US/docs/Web/SVG/Element)
  HTML5_TAGS = %w(article aside bdi details dialog figcaption
                  figure footer header main mark menuitem meter
                  nav progress rp rt ruby section summary
                  time wbr datalist keygen output color date
                  datetime datetime-local email month number
                  range search tel time url week canvas
                  svg audio embed source track video
                  altGlyph altGlyphDef altGlyphItem animate
                  animateColor animateMotion animateTransform
                  circle clipPath color-profile cursor defs
                  desc ellipse feBlend feColorMatrix
                  feComponentTransfer feComposite feConvolveMatrix
                  feDiffuseLighting feDisplacementMap feDistantLight
                  feFlood feFuncA feFuncB feFuncG feFuncR feGaussianBlur
                  feImage feMerge feMergeNode feMorphology feOffset
                  fePointLight feSpecularLighting feSpotLight feTile
                  feTurbulence filter font font-face font-face-format
                  font-face-name font-face-src font-face-uri
                  foreignObject g glyph glyphRef hkern image line
                  linearGradient marker mask metadata missing-glyph
                  mpath path pattern polygon polyline radialGradient
                  rect set stop switch symbol text textPath tref tspan use
                  view vkern)

  SCRIPT_EMBEDS_MSG = /Element script embeds close tag/

  # Adds one issue per parse error, skipping the two kinds described inline.
  def run
    @html.errors.each do |error|
      text = error.message

      # Nokogiri (or rather libxml2 under the hood) only recognizes html4 tags,
      # so errors caused by the new tags in html5 are skipped
      flagged_tag = text[/Tag ([\w-]+) invalid/o, 1]
      next if HTML5_TAGS.include?(flagged_tag)

      # tags embedded in scripts are used in templating languages: http://git.io/vOovv
      if @validation_opts[:ignore_script_embeds]
        next if text =~ SCRIPT_EMBEDS_MSG
      end

      add_issue(text, error.line)
    end
  end
end
@@ -1,54 +0,0 @@
1
- # encoding: utf-8
2
-
3
# An <img> node, with helpers for its alt text and its source.
class ImageCheckable < ::HTML::Proofer::Checkable
  SCREEN_SHOT_REGEX = /Screen(?: |%20)Shot(?: |%20)\d+-\d+-\d+(?: |%20)at(?: |%20)\d+.\d+.\d+/

  attr_reader :alt

  # True when the alt text is nothing but whitespace. Raises if alt is nil,
  # so callers are expected to check for nil first.
  def empty_alt_tag?
    alt.strip == ''
  end

  # Match position when the source matches SCREEN_SHOT_REGEX, otherwise nil.
  def terrible_filename?
    SCREEN_SHOT_REGEX =~ src
  end

  # The non-empty src attribute, falling back to a non-empty srcset; nil when
  # neither is present.
  def src
    from_src = real_attr(@src)
    return from_src if from_src

    real_attr(@srcset)
  end

  # True when neither src nor srcset carries a value.
  def missing_src?
    src.nil?
  end
end
24
-
25
# Checks every <img> in the document: filename, source and alt text.
class ImageCheck < ::HTML::Proofer::CheckRunner
  # Records issues for each image and returns external_urls.
  def run
    @html.css('img').each do |node|
      img = ImageCheckable.new(node, self)
      line = node.line

      next if img.ignore?

      # a screenshot-style filename is reported and ends the checks for this image
      if img.terrible_filename?
        add_issue("image has a terrible filename (#{img.src})", line)
        next
      end

      # does the image exist?
      if img.missing_src?
        add_issue('image has no src or srcset attribute', line)
      elsif img.remote?
        add_to_external_urls(img.src, line)
      elsif !img.exists?
        add_issue("internal image #{img.src} does not exist", line)
      end

      # a missing alt always counts; a blank one only when empty alts are not ignored
      alt_problem = img.alt.nil? || (img.empty_alt_tag? && !img.ignore_empty_alt?)
      add_issue("image #{img.src} does not have an alt attribute", line) if alt_problem
    end

    external_urls
  end
end
@@ -1,40 +0,0 @@
1
- # encoding: utf-8
2
-
3
# A <script> node, with helpers for its source and inline content.
class ScriptCheckable < ::HTML::Proofer::Checkable
  # The src attribute as a String, or nil when it is absent or empty.
  def src
    real_attr(@src)
  end

  # True when there is no usable src attribute.
  def missing_src?
    src.nil?
  end

  # True when the node's text content is empty or only whitespace.
  def blank?
    @text.strip.empty?
  end
end
18
-
19
# Checks every <script> that has no inline content.
class ScriptCheck < ::HTML::Proofer::CheckRunner
  # Records issues for each script and returns external_urls.
  def run
    @html.css('script').each do |node|
      script = ScriptCheckable.new(node, self)
      line = node.line

      # ignored scripts and scripts with inline content are not inspected
      next if script.ignore? || !script.blank?

      # does the script exist?
      if script.missing_src?
        add_issue('script is empty and has no src attribute', line)
      elsif script.remote?
        add_to_external_urls(script.src, line)
      elsif !script.exists?
        add_issue("internal script #{script.src} does not exist", line)
      end
    end

    external_urls
  end
end
@@ -1,48 +0,0 @@
1
module HTML
  class Proofer
    # Default option sets and a helper for turning option strings into patterns.
    module Configuration
      require_relative 'version'

      # Default proofer options.
      PROOFER_DEFAULTS = {
        :allow_hash_href => false,
        :alt_ignore => [],
        :check_external_hash => false,
        :check_favicon => false,
        :check_html => false,
        :checks_to_ignore => [],
        :directory_index_file => 'index.html',
        :disable_external => false,
        :empty_alt_ignore => false,
        :enforce_https => false,
        :error_sort => :path,
        :ext => '.html',
        :external_only => false,
        :file_ignore => [],
        :href_ignore => [],
        :href_swap => [],
        :only_4xx => false,
        :url_ignore => [],
        :verbose => false
      }

      # Default request options: follow redirects and send an identifying
      # User-Agent header.
      TYPHOEUS_DEFAULTS = {
        :followlocation => true,
        :headers => {
          'User-Agent' => "Mozilla/5.0 (compatible; HTML Proofer/#{HTML::Proofer::VERSION}; +https://github.com/gjtorikian/html-proofer)"
        }
      }

      # Default Hydra options.
      HYDRA_DEFAULTS = {
        :max_concurrency => 50
      }

      # Converts a value written as `/pattern/` into a Regexp; anything else is
      # returned unchanged. Despite the trailing `?` this is not a predicate.
      #
      # A lone "/" is returned as the literal string: it has no pattern between
      # its slashes, and compiling it would give an empty Regexp that matches
      # everything. Values that do not respond to #start_with? (such as an
      # existing Regexp) are passed through instead of raising.
      def self.to_regex?(item)
        return item unless item.respond_to?(:start_with?)

        if item.length > 1 && item.start_with?('/') && item.end_with?('/')
          Regexp.new item[1...-1]
        else
          item
        end
      end
    end
  end
end
@@ -1,42 +0,0 @@
1
- require 'yell'
2
- require 'colored'
3
-
4
module HTML
  class Proofer
    # Thin wrapper around a Yell logger that colorizes messages on a terminal.
    class Log
      include Yell::Loggable

      # verbose   - when verbosity is nil, picks :debug (truthy) or :info.
      # verbosity - explicit level; used as-is whenever it is not nil.
      def initialize(verbose, verbosity = nil)
        log_level = verbosity
        log_level = (verbose ? :debug : :info) if log_level.nil?

        # debug/info/warn are written to stdout, error/fatal to stderr
        @logger = Yell.new(:format => false,
                           :name => 'HTML::Proofer',
                           :level => "gte.#{log_level}") do |l|
          l.adapter :stdout, :level => [:debug, :info, :warn]
          l.adapter :stderr, :level => [:error, :fatal]
        end
      end

      # Sends the (possibly colorized) message to the logger at the given level.
      def log(level, color, message)
        @logger.send level, colorize(color, message)
      end

      # Colors the message only when both stdout and stderr are terminals.
      def colorize(color, message)
        return message unless $stdout.isatty && $stderr.isatty

        Colored.colorize(message, foreground: color)
      end

      # dumb override to play nice with Typhoeus/Ethon
      def debug(message = nil)
        return if message.nil?

        log(:debug, :yellow, message)
      end
    end
  end
end
@@ -1,222 +0,0 @@
1
- require 'typhoeus'
2
- require 'uri'
3
- require_relative './utils'
4
- require_relative './cache'
5
-
6
module HTML
  class Proofer
    # Requests the collected external URLs through a Typhoeus::Hydra and
    # gathers the failures as CheckRunner::Issue objects. Results are also
    # recorded in a Cache.
    class UrlValidator
      include HTML::Proofer::Utils

      attr_accessor :logger, :external_urls, :iterable_external_urls, :hydra

      # logger        - object responding to #log(level, color, message).
      # external_urls - Hash of url => filenames (filenames may be nil).
      # options       - proofer options; :cache, :check_external_hash and
      #                 :only_4xx are read here.
      # typhoeus_opts - options merged into every request.
      # hydra_opts    - options handed to Typhoeus::Hydra.new.
      def initialize(logger, external_urls, options, typhoeus_opts, hydra_opts)
        @logger = logger
        @external_urls = external_urls
        @iterable_external_urls = {}
        @failed_tests = []
        @options = options
        @hydra = Typhoeus::Hydra.new(hydra_opts)
        @typhoeus_opts = typhoeus_opts
        @external_domain_paths_with_queries = {}
        @cache = Cache.new(@logger, @options[:cache])
      end

      # Checks the URLs (consulting the cache when one exists and loads),
      # writes the cache, and returns the list of failures.
      def run
        @iterable_external_urls = remove_query_values

        if @cache.exists && @cache.load
          cache_count = @cache.cache_log.length
          cache_text = pluralize(cache_count, 'link', 'links')

          logger.log :info, :blue, "Found #{cache_text} in the cache..."

          urls_to_check = @cache.detect_url_changes(@iterable_external_urls)

          @cache.cache_log.each_pair do |url, cache|
            if @cache.within_timeframe?(cache['time'])
              next if cache['message'].empty? # these were successes to skip
              urls_to_check[url] = cache['filenames'] # these are failures to retry
            else
              urls_to_check[url] = cache['filenames'] # pass or fail, recheck expired links
            end
          end

          external_link_checker(urls_to_check)
        else
          external_link_checker(@iterable_external_urls)
        end

        @cache.write
        @failed_tests
      end

      # Returns a copy of @external_urls without the URLs whose domain, path
      # and query keys have already been seen; nil when @external_urls is nil.
      # Unparsable URLs are logged and left in the copy.
      def remove_query_values
        return nil if @external_urls.nil?
        iterable_external_urls = @external_urls.dup
        @external_urls.each_key do |url|
          uri = begin
                  Addressable::URI.parse(url)
                rescue URI::Error, Addressable::URI::InvalidURIError
                  @logger.log :error, :red, "#{url} is an invalid URL"
                  nil
                end
          next if uri.nil? || uri.query.nil?
          iterable_external_urls.delete(url) unless new_url_query_values?(uri)
        end
        iterable_external_urls
      end

      # remember queries we've seen, ignore future ones
      def new_url_query_values?(uri)
        queries = uri.query_values.keys.join('-')
        domain_path = extract_domain_path(uri)
        if @external_domain_paths_with_queries[domain_path].nil?
          @external_domain_paths_with_queries[domain_path] = [queries]
          true
        elsif !@external_domain_paths_with_queries[domain_path].include?(queries)
          @external_domain_paths_with_queries[domain_path] << queries
          true
        else
          false
        end
      end

      # Host and path joined into one String. Interpolation is used so that a
      # URI without a host yields just the path instead of raising on nil.
      def extract_domain_path(uri)
        "#{uri.host}#{uri.path}"
      end

      # Proofer runs faster if we pull out all the external URLs and run the checks
      # at the end. Otherwise, we're halting the consuming process for every file during
      # the check_directory_of_files process.
      #
      # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
      #
      # Finally, we'll first make a HEAD request, rather than GETing all the contents.
      # If the HEAD fails, we'll fall back to GET, as some servers are not configured
      # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
      # not an option.
      def external_link_checker(external_urls)
        external_urls = Hash[external_urls.sort]

        count = external_urls.length
        check_text = pluralize(count, 'external link', 'external links')
        logger.log :info, :blue, "Checking #{check_text}..."

        Ethon.logger = logger # log from Typhoeus/Ethon

        url_processor(external_urls)

        logger.log :debug, :yellow, "Running requests for:"
        logger.log :debug, :yellow, "###\n" + external_urls.keys.join("\n") + "\n###"

        hydra.run
      end

      # Queues one request per URL: GET when the URL has a fragment and
      # external hashes are being checked, HEAD otherwise. URLs that cannot be
      # normalized are reported as issues and skipped.
      def url_processor(external_urls)
        external_urls.each_pair do |href, filenames|
          href = begin
                   clean_url(href)
                 rescue URI::Error, Addressable::URI::InvalidURIError
                   add_external_issue(filenames, "#{href} is an invalid URL")
                   next
                 end

          if hash?(href) && @options[:check_external_hash]
            queue_request(:get, href, filenames)
          else
            queue_request(:head, href, filenames)
          end
        end
      end

      # Parses and normalizes href with Addressable.
      def clean_url(href)
        Addressable::URI.parse(href).normalize
      end

      # Builds a request for href and queues it on the hydra; the response is
      # passed to #response_handler on completion.
      def queue_request(method, href, filenames)
        request = Typhoeus::Request.new(href, @typhoeus_opts.merge({ :method => method }))
        request.on_complete { |response| response_handler(response, filenames) }
        hydra.queue request
      end

      # Classifies a completed response: 2xx is a success (subject to the
      # optional hash check), timeouts and code 0 are failures, and any other
      # result of a HEAD request is retried as a GET before being reported.
      def response_handler(response, filenames)
        effective_url = response.options[:effective_url]
        href = response.request.base_url.to_s
        method = response.request.options[:method]
        response_code = response.code

        debug_msg = "Received a #{response_code} for #{href}"
        debug_msg << " in #{filenames.join(' ')}" unless filenames.nil?
        logger.log :debug, :yellow, debug_msg

        if response_code.between?(200, 299)
          check_hash_in_2xx_response(href, effective_url, response, filenames)
          @cache.add(href, filenames, response_code)
        elsif response.timed_out?
          handle_timeout(href, filenames, response_code)
        elsif response_code == 0
          handle_failure(href, filenames, response_code)
        elsif method == :head
          queue_request(:get, href, filenames)
        else
          return if @options[:only_4xx] && !response_code.between?(400, 499)
          # Received a non-successful http response.
          msg = "External link #{href} failed: #{response_code} #{response.return_message}"
          add_external_issue(filenames, msg, response_code)
          @cache.add(href, filenames, response_code, msg)
        end
      end

      # Even though the response was a success, we may have been asked to check
      # if the hash on the URL exists on the page
      def check_hash_in_2xx_response(href, effective_url, response, filenames)
        return if @options[:only_4xx]
        return unless @options[:check_external_hash]
        return unless (hash = hash?(href))

        body_doc = create_nokogiri(response.body)

        # user-content is a special addition by GitHub.
        xpath = %(//*[@name="#{hash}"]|//*[@id="#{hash}"])
        host = URI.parse(href).host
        # a URL without a host simply skips the GitHub-specific lookup
        if host && host.match(/github\.com/i)
          xpath << %(|//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])
        end

        return unless body_doc.xpath(xpath).empty?

        msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
        add_external_issue(filenames, msg, response.code)
        @cache.add(href, filenames, response.code, msg)
      end

      # Caches the timeout with status 0 and reports it unless only 4xx
      # errors are wanted.
      def handle_timeout(href, filenames, response_code)
        msg = "External link #{href} failed: got a time out (response code #{response_code})"
        @cache.add(href, filenames, 0, msg)
        return if @options[:only_4xx]
        add_external_issue(filenames, msg, response_code)
      end

      # Caches the failure with status 0 and reports it unless only 4xx
      # errors are wanted.
      def handle_failure(href, filenames, response_code)
        msg = "External link #{href} failed: response code #{response_code} means something's wrong"
        @cache.add(href, filenames, 0, msg)
        return if @options[:only_4xx]
        add_external_issue(filenames, msg, response_code)
      end

      # Records one issue per filename, or a single issue with an empty path
      # when filenames is nil.
      def add_external_issue(filenames, desc, status = nil)
        if filenames.nil?
          @failed_tests << CheckRunner::Issue.new('', desc, nil, status)
        else
          filenames.each { |f| @failed_tests << CheckRunner::Issue.new(f, desc, nil, status) }
        end
      end

      # The fragment of url, or nil when there is none or url is not a valid
      # URI. Despite the trailing `?` the fragment itself is returned.
      def hash?(url)
        URI.parse(url).fragment
      rescue URI::InvalidURIError
        nil
      end
    end
  end
end