html-proofer 2.6.4 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,168 +0,0 @@
1
- require 'addressable/uri'
2
- require_relative './utils'
3
-
4
module HTML
  class Proofer
    # Represents the superclass from which all checks derive.
    #
    # Wraps one parsed node: every attribute of the node is copied into an
    # instance variable (with '-', ':' and '.' replaced by '_'), and helpers
    # are provided to inspect the node's URL and resolve it on disk.
    class Checkable
      include HTML::Proofer::Utils

      attr_reader :line

      # obj   - the node being checked; must respond to #attributes, #content
      #         and #line.
      # check - the check that created this object; its options, ignore lists
      #         and paths are consulted by the helpers below.
      def initialize(obj, check)
        obj.attributes.each_pair do |attribute, value|
          ivar = "@#{attribute.tr('-:.', '_')}".to_sym
          attr_value = value.value
          begin
            instance_variable_set(ivar, attr_value)
          rescue NameError
            # The attribute name (e.g. "@click" or "[attr]") is not a legal
            # instance variable name; skip it rather than aborting the check.
            next
          end
        end

        @text = obj.content
        @check = check
        @checked_paths = {}
        @type = self.class.name
        @line = obj.line

        if @href && @check.options[:href_swap]
          @href = swap(@href, @check.options[:href_swap])
        end

        # fix up missing protocols (build new strings instead of mutating the
        # attribute values in place)
        @href = "http:#{@href}" if @href =~ %r{^//}
        @src = "http:#{@src}" if @src =~ %r{^//}
      end

      # The first of src, srcset or href that is set, or '' when none is.
      def url
        @src || @srcset || @href || ''
      end

      # True when the URL could be parsed.
      def valid?
        !parts.nil?
      end

      # The parsed URL, or nil when it cannot be parsed.
      def parts
        @parts ||= Addressable::URI.parse url
      rescue URI::Error, Addressable::URI::InvalidURIError
        @parts = nil
      end

      # The unencoded path component of the URL, or nil for an unparsable URL.
      def path
        Addressable::URI.unencode parts.path unless parts.nil?
      end

      # The fragment of the URL, or nil for an unparsable URL.
      #
      # NOTE(review): this shadows Object#hash, so instances should not be
      # used as Hash keys. Kept as is because callers rely on the name.
      def hash
        parts.fragment unless parts.nil?
      end

      # The scheme of the URL, or nil for an unparsable URL.
      def scheme
        parts.scheme unless parts.nil?
      end

      # path is to an external server
      def remote?
        %w( http https ).include? scheme
      end

      # The URL has a scheme, but it is neither http nor https.
      def non_http_remote?
        !scheme.nil? && !remote?
      end

      # True when this node must be skipped; always returns true or false.
      def ignore?
        return true if @data_proofer_ignore

        # ignore base64 encoded images
        if %w(ImageCheckable FaviconCheckable).include? @type
          return true if url.match(/^data:image/)
        end

        # ignore user defined URLs
        return true if ignores_pattern_check(@check.url_ignores)

        # ignore user defined hrefs
        if 'LinkCheckable' == @type
          return true if ignores_pattern_check(@check.href_ignores)
        end

        # ignore user defined alts
        if 'ImageCheckable' == @type
          return true if ignores_pattern_check(@check.alt_ignores)
        end

        false
      end

      def ignore_empty_alt?
        @check.empty_alt_ignore
      end

      def allow_hash_href?
        @check.allow_hash_href
      end

      # path is external to the file
      def external?
        !internal?
      end

      # path is an anchor or a query
      def internal?
        url.start_with? '#', '?'
      end

      # The location on disk that the URL's path points at, or nil when the
      # URL has no usable path.
      def file_path
        relative = path
        return if relative.nil?

        base = if relative =~ %r{^/} # path relative to root
                 File.directory?(@check.src) ? @check.src : File.dirname(@check.src)
               elsif File.exist?(File.expand_path(relative, @check.src)) # relative links, path is a file
                 File.dirname @check.path
               elsif File.exist?(File.join(File.dirname(@check.path), relative)) # relative links in nested dir, path is a file
                 File.dirname @check.path
               else # relative link, path is a directory
                 @check.path
               end

        file = File.join base, relative

        # implicit index support
        if File.directory?(file) && !unslashed_directory?(file)
          file = File.join file, @check.options[:directory_index_file]
        end

        file
      end

      # checks if a file exists relative to the current pwd
      # (results are memoized per absolute path)
      def exists?
        target = absolute_path
        return @checked_paths[target] if @checked_paths.key? target
        @checked_paths[target] = File.exist? target
      end

      # file_path (or the checked file itself) expanded against Dir.pwd.
      def absolute_path
        path = file_path || @check.path
        File.expand_path path, Dir.pwd
      end

      # True when the URL equals one of the String entries of links or is
      # matched by one of its Regexp entries; other entry types are skipped.
      def ignores_pattern_check(links)
        links.each do |ignore|
          if ignore.is_a? String
            return true if ignore == url
          elsif ignore.is_a? Regexp
            return true if ignore =~ url
          end
        end

        false
      end

      # A directory referenced without a trailing separator while redirects
      # are not being followed.
      def unslashed_directory?(file)
        File.directory?(file) && !file.end_with?(File::SEPARATOR) && !follow_location?
      end

      def follow_location?
        @check.typhoeus_opts && @check.typhoeus_opts[:followlocation]
      end

      private

      # The attribute as a String, or nil when it is nil or empty.
      def real_attr(attr)
        attr.to_s unless attr.nil? || attr.empty?
      end
    end
  end
end
@@ -1,46 +0,0 @@
1
- # encoding: utf-8
2
-
3
# Reports the parser errors attached to the document, except those caused by
# tags listed in HTML5_TAGS and (optionally) tags embedded in scripts.
class HtmlCheck < ::HTML::Proofer::CheckRunner
  # new html5 tags (source: http://www.w3schools.com/html/html5_new_elements.asp)
  # and svg child tags (source: https://developer.mozilla.org/en-US/docs/Web/SVG/Element)
  HTML5_TAGS = %w(article aside bdi details dialog figcaption
                  figure footer header main mark menuitem meter
                  nav progress rp rt ruby section summary
                  time wbr datalist keygen output color date
                  datetime datetime-local email month number
                  range search tel time url week canvas
                  svg audio embed source track video
                  altGlyph altGlyphDef altGlyphItem animate
                  animateColor animateMotion animateTransform
                  circle clipPath color-profile cursor defs
                  desc ellipse feBlend feColorMatrix
                  feComponentTransfer feComposite feConvolveMatrix
                  feDiffuseLighting feDisplacementMap feDistantLight
                  feFlood feFuncA feFuncB feFuncG feFuncR feGaussianBlur
                  feImage feMerge feMergeNode feMorphology feOffset
                  fePointLight feSpecularLighting feSpotLight feTile
                  feTurbulence filter font font-face font-face-format
                  font-face-name font-face-src font-face-uri
                  foreignObject g glyph glyphRef hkern image line
                  linearGradient marker mask metadata missing-glyph
                  mpath path pattern polygon polyline radialGradient
                  rect set stop switch symbol text textPath tref tspan use
                  view vkern).freeze

  SCRIPT_EMBEDS_MSG = /Element script embeds close tag/

  # Captures the tag name out of an "invalid tag" parser message.
  INVALID_TAG_MSG = /Tag ([\w-]+) invalid/

  def run
    @html.errors.each do |error|
      message = error.message
      line = error.line
      # Nokogiri (or rather libxml2 underhood) only recognizes html4 tags,
      # so we need to skip errors caused by the new tags in html5
      next if HTML5_TAGS.include? message[INVALID_TAG_MSG, 1]

      # tags embedded in scripts are used in templating languages: http://git.io/vOovv
      next if @validation_opts[:ignore_script_embeds] && message =~ SCRIPT_EMBEDS_MSG

      add_issue(message, line)
    end
  end
end
@@ -1,54 +0,0 @@
1
- # encoding: utf-8
2
-
3
# Checkable wrapper around an image node.
class ImageCheckable < ::HTML::Proofer::Checkable
  SCREEN_SHOT_REGEX = /Screen(?: |%20)Shot(?: |%20)\d+-\d+-\d+(?: |%20)at(?: |%20)\d+.\d+.\d+/

  attr_reader :alt

  # True when the alt text is missing or contains only whitespace.
  # (to_s keeps this from raising when the node has no alt attribute at all.)
  def empty_alt_tag?
    alt.to_s.strip.empty?
  end

  # Truthy when the source matches SCREEN_SHOT_REGEX.
  def terrible_filename?
    src =~ SCREEN_SHOT_REGEX
  end

  # The src attribute, falling back to srcset; nil when both are blank.
  def src
    real_attr(@src) || real_attr(@srcset)
  end

  def missing_src?
    !src
  end
end
24
-
25
# Walks every img node, recording issues for bad filenames, missing sources,
# missing internal files and missing alt text. Remote images are queued as
# external URLs, which are returned at the end.
class ImageCheck < ::HTML::Proofer::CheckRunner
  def run
    @html.css('img').each do |node|
      img = ImageCheckable.new(node, self)
      line = node.line

      next if img.ignore?

      # screenshot filenames should return because of terrible names
      if img.terrible_filename?
        add_issue("image has a terrible filename (#{img.src})", line)
        next
      end

      # does the image exist?
      if img.missing_src?
        add_issue('image has no src or srcset attribute', line)
      elsif img.remote?
        add_to_external_urls(img.src, line)
      elsif !img.exists?
        add_issue("internal image #{img.src} does not exist", line)
      end

      lacks_alt = img.alt.nil? || (img.empty_alt_tag? && !img.ignore_empty_alt?)
      add_issue("image #{img.src} does not have an alt attribute", line) if lacks_alt
    end

    external_urls
  end
end
@@ -1,40 +0,0 @@
1
- # encoding: utf-8
2
-
3
# Checkable wrapper around a script node.
class ScriptCheckable < ::HTML::Proofer::Checkable
  # The src attribute, or nil when it is nil or empty.
  def src
    real_attr(@src)
  end

  # True when there is no usable src attribute.
  def missing_src?
    !src
  end

  # True when the node's text content is empty once stripped.
  def blank?
    stripped = @text.strip
    stripped.empty?
  end
end
18
-
19
# Inspects every script node that has no inline content: it must carry a src,
# and an internal src must exist on disk. Remote sources are queued as
# external URLs, which are returned at the end.
class ScriptCheck < ::HTML::Proofer::CheckRunner
  def run
    @html.css('script').each do |node|
      script = ScriptCheckable.new(node, self)
      line = node.line

      # ignored scripts and scripts with inline content need no further checks
      next if script.ignore? || !script.blank?

      # does the script exist?
      if script.missing_src?
        add_issue('script is empty and has no src attribute', line)
      elsif script.remote?
        add_to_external_urls(script.src, line)
      elsif !script.exists?
        add_issue("internal script #{script.src} does not exist", line)
      end
    end

    external_urls
  end
end
@@ -1,48 +0,0 @@
1
module HTML
  class Proofer
    # Default option sets plus a helper for turning "/.../"-style strings
    # into regular expressions.
    module Configuration
      require_relative 'version'

      PROOFER_DEFAULTS = {
        :allow_hash_href => false,
        :alt_ignore => [],
        :check_external_hash => false,
        :check_favicon => false,
        :check_html => false,
        :checks_to_ignore => [],
        :directory_index_file => 'index.html',
        :disable_external => false,
        :empty_alt_ignore => false,
        :enforce_https => false,
        :error_sort => :path,
        :ext => '.html',
        :external_only => false,
        :file_ignore => [],
        :href_ignore => [],
        :href_swap => [],
        :only_4xx => false,
        :url_ignore => [],
        :verbose => false
      }

      TYPHOEUS_DEFAULTS = {
        :followlocation => true,
        :headers => {
          'User-Agent' => "Mozilla/5.0 (compatible; HTML Proofer/#{HTML::Proofer::VERSION}; +https://github.com/gjtorikian/html-proofer)"
        }
      }

      HYDRA_DEFAULTS = {
        :max_concurrency => 50
      }

      # Converts a value written as "/pattern/" into a Regexp built from the
      # text between the slashes; anything else is returned unchanged.
      #
      # item - usually a String. Values that do not respond to start_with?
      #        (for instance an existing Regexp) are passed through.
      #
      # A lone "/" is kept as a String: its single character would otherwise
      # count as both delimiters and yield an empty, match-everything Regexp.
      def self.to_regex?(item)
        return item unless item.respond_to?(:start_with?)

        if item.length > 1 && item.start_with?('/') && item.end_with?('/')
          Regexp.new item[1...-1]
        else
          item
        end
      end
    end
  end
end
@@ -1,42 +0,0 @@
1
- require 'yell'
2
- require 'colored'
3
-
4
module HTML
  class Proofer
    # Wrapper around a Yell logger that sends debug/info/warn to stdout and
    # error/fatal to stderr, colorizing messages when both streams are TTYs.
    class Log
      include Yell::Loggable

      # verbose   - when verbosity is nil, truthy selects :debug, else :info.
      # verbosity - explicit level; takes precedence over verbose when given.
      def initialize(verbose, verbosity = nil)
        log_level = verbosity
        log_level = (verbose ? :debug : :info) if verbosity.nil?

        yell_opts = {
          :format => false,
          :name => 'HTML::Proofer',
          :level => "gte.#{log_level}"
        }

        @logger = Yell.new(yell_opts) do |l|
          l.adapter :stdout, :level => [:debug, :info, :warn]
          l.adapter :stderr, :level => [:error, :fatal]
        end
      end

      # Sends the (possibly colorized) message to the logger at the given level.
      def log(level, color, message)
        @logger.send level, colorize(color, message)
      end

      # Returns message wrapped in the given foreground color, or untouched
      # when stdout or stderr is not a TTY.
      def colorize(color, message)
        return message unless $stdout.isatty && $stderr.isatty

        Colored.colorize(message, foreground: color)
      end

      # dumb override to play nice with Typhoeus/Ethon
      def debug(message = nil)
        return if message.nil?

        log(:debug, :yellow, message)
      end
    end
  end
end
@@ -1,222 +0,0 @@
1
- require 'typhoeus'
2
- require 'uri'
3
- require_relative './utils'
4
- require_relative './cache'
5
-
6
module HTML
  class Proofer
    # Checks a collection of external URLs over HTTP, consulting and updating
    # a cache, and collects the failures as CheckRunner::Issue objects.
    class UrlValidator
      include HTML::Proofer::Utils

      attr_accessor :logger, :external_urls, :iterable_external_urls, :hydra

      # logger        - object responding to #log(level, color, message).
      # external_urls - Hash of URL => filenames that reference it (may be nil).
      # options       - proofer options (:cache, :check_external_hash,
      #                 :only_4xx are read here).
      # typhoeus_opts - options merged into every Typhoeus request.
      # hydra_opts    - options handed to Typhoeus::Hydra.
      def initialize(logger, external_urls, options, typhoeus_opts, hydra_opts)
        @logger = logger
        @external_urls = external_urls
        @iterable_external_urls = {}
        @failed_tests = []
        @options = options
        @hydra = Typhoeus::Hydra.new(hydra_opts)
        @typhoeus_opts = typhoeus_opts
        @external_domain_paths_with_queries = {}
        @cache = Cache.new(@logger, @options[:cache])
      end

      # Runs all checks, writes the cache and returns the list of failures.
      def run
        @iterable_external_urls = remove_query_values

        if @cache.exists && @cache.load
          cache_count = @cache.cache_log.length
          cache_text = pluralize(cache_count, 'link', 'links')

          logger.log :info, :blue, "Found #{cache_text} in the cache..."

          urls_to_check = @cache.detect_url_changes(@iterable_external_urls)

          @cache.cache_log.each_pair do |url, cache|
            if @cache.within_timeframe?(cache['time'])
              next if cache['message'].empty? # these were successes to skip
              urls_to_check[url] = cache['filenames'] # these are failures to retry
            else
              urls_to_check[url] = cache['filenames'] # pass or fail, recheck expired links
            end
          end

          external_link_checker(urls_to_check)
        else
          external_link_checker(@iterable_external_urls)
        end

        @cache.write
        @failed_tests
      end

      # Returns a copy of the external URLs in which URLs that differ from an
      # already seen URL only by their query values are dropped. Returns nil
      # when there are no external URLs at all.
      def remove_query_values
        return nil if @external_urls.nil?
        iterable_external_urls = @external_urls.dup
        @external_urls.each_key do |url|
          uri = begin
                  Addressable::URI.parse(url)
                rescue URI::Error, Addressable::URI::InvalidURIError
                  @logger.log :error, :red, "#{url} is an invalid URL"
                  nil
                end
          next if uri.nil? || uri.query.nil?
          iterable_external_urls.delete(url) unless new_url_query_values?(uri)
        end
        iterable_external_urls
      end

      # remember queries we've seen, ignore future ones
      def new_url_query_values?(uri)
        queries = uri.query_values.keys.join('-')
        domain_path = extract_domain_path(uri)
        seen = @external_domain_paths_with_queries[domain_path]
        if seen.nil?
          @external_domain_paths_with_queries[domain_path] = [queries]
          true
        elsif !seen.include?(queries)
          seen << queries
          true
        else
          false
        end
      end

      # Host and path joined into one key; interpolation keeps a URI without
      # a host from raising on nil.
      def extract_domain_path(uri)
        "#{uri.host}#{uri.path}"
      end

      # Proofer runs faster if we pull out all the external URLs and run the checks
      # at the end. Otherwise, we're halting the consuming process for every file during
      # the check_directory_of_files process.
      #
      # In addition, sorting the list lets libcurl keep connections to the same hosts alive.
      #
      # Finally, we'll first make a HEAD request, rather than GETing all the contents.
      # If the HEAD fails, we'll fall back to GET, as some servers are not configured
      # for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
      # not an option.
      def external_link_checker(external_urls)
        # remove_query_values may hand us nil; treat that as "nothing to check"
        external_urls = Hash[(external_urls || {}).sort]

        count = external_urls.length
        check_text = pluralize(count, 'external link', 'external links')
        logger.log :info, :blue, "Checking #{check_text}..."

        Ethon.logger = logger # log from Typhoeus/Ethon

        url_processor(external_urls)

        logger.log :debug, :yellow, "Running requests for:"
        logger.log :debug, :yellow, "###\n" + external_urls.keys.join("\n") + "\n###"

        hydra.run
      end

      # Queues one request per URL: GET when a hash has to be verified,
      # HEAD otherwise. Invalid URLs are reported as issues and skipped.
      def url_processor(external_urls)
        external_urls.each_pair do |href, filenames|
          href = begin
                   clean_url(href)
                 rescue URI::Error, Addressable::URI::InvalidURIError
                   add_external_issue(filenames, "#{href} is an invalid URL")
                   next
                 end

          if hash?(href) && @options[:check_external_hash]
            queue_request(:get, href, filenames)
          else
            queue_request(:head, href, filenames)
          end
        end
      end

      def clean_url(href)
        Addressable::URI.parse(href).normalize
      end

      def queue_request(method, href, filenames)
        request = Typhoeus::Request.new(href, @typhoeus_opts.merge({ :method => method }))
        request.on_complete { |response| response_handler(response, filenames) }
        hydra.queue request
      end

      # Classifies a completed response: success, timeout, connection failure,
      # failed HEAD (retried as GET) or failed GET (reported).
      def response_handler(response, filenames)
        effective_url = response.options[:effective_url]
        href = response.request.base_url.to_s
        method = response.request.options[:method]
        response_code = response.code

        debug_msg = "Received a #{response_code} for #{href}"
        debug_msg << " in #{filenames.join(' ')}" unless filenames.nil?
        logger.log :debug, :yellow, debug_msg

        if response_code.between?(200, 299)
          # A missing hash has already been cached as a failure by the check;
          # adding the success entry afterwards would replace it and the link
          # would be skipped as a success on the next cached run.
          hash_missing = check_hash_in_2xx_response(href, effective_url, response, filenames)
          @cache.add(href, filenames, response_code) unless hash_missing
        elsif response.timed_out?
          handle_timeout(href, filenames, response_code)
        elsif response_code == 0
          handle_failure(href, filenames, response_code)
        elsif method == :head
          queue_request(:get, href, filenames)
        else
          return if @options[:only_4xx] && !response_code.between?(400, 499)
          # Received a non-successful http response.
          msg = "External link #{href} failed: #{response_code} #{response.return_message}"
          add_external_issue(filenames, msg, response_code)
          @cache.add(href, filenames, response_code, msg)
        end
      end

      # Even though the response was a success, we may have been asked to check
      # if the hash on the URL exists on the page
      #
      # Returns true when the hash was looked for and not found (an issue and
      # a failing cache entry are recorded in that case), false otherwise.
      def check_hash_in_2xx_response(href, effective_url, response, filenames)
        return false if @options[:only_4xx]
        return false unless @options[:check_external_hash]
        return false unless (hash = hash?(href))

        body_doc = create_nokogiri(response.body)

        # user-content is a special addition by GitHub.
        xpath = %(//*[@name="#{hash}"]|//*[@id="#{hash}"])
        # to_s guards against a URL without a host
        if URI.parse(href).host.to_s.match(/github\.com/i)
          xpath << %(|//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])
        end

        return false unless body_doc.xpath(xpath).empty?

        msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
        add_external_issue(filenames, msg, response.code)
        @cache.add(href, filenames, response.code, msg)
        true
      end

      # Caches the timeout and, unless only 4xx errors are wanted, reports it.
      def handle_timeout(href, filenames, response_code)
        msg = "External link #{href} failed: got a time out (response code #{response_code})"
        @cache.add(href, filenames, 0, msg)
        return if @options[:only_4xx]
        add_external_issue(filenames, msg, response_code)
      end

      # Caches the failure and, unless only 4xx errors are wanted, reports it.
      def handle_failure(href, filenames, response_code)
        msg = "External link #{href} failed: response code #{response_code} means something's wrong"
        @cache.add(href, filenames, 0, msg)
        return if @options[:only_4xx]
        add_external_issue(filenames, msg, response_code)
      end

      # Records one issue per referencing file, or a single issue with an
      # empty path when no filenames are known.
      def add_external_issue(filenames, desc, status = nil)
        if filenames.nil?
          @failed_tests << CheckRunner::Issue.new('', desc, nil, status)
        else
          filenames.each { |f| @failed_tests << CheckRunner::Issue.new(f, desc, nil, status) }
        end
      end

      # The fragment of the URL, or nil when it has none or cannot be parsed.
      def hash?(url)
        URI.parse(url).fragment
      rescue URI::InvalidURIError
        nil
      end
    end
  end
end