html-proofer 4.0.0.rc3 → 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/htmlproofer +6 -3
- data/lib/html-proofer.rb +1 -1
- data/lib/html_proofer/attribute/url.rb +186 -174
- data/lib/html_proofer/cache.rb +128 -85
- data/lib/html_proofer/check/favicon.rb +29 -24
- data/lib/html_proofer/check/images.rb +87 -47
- data/lib/html_proofer/check/links.rb +109 -98
- data/lib/html_proofer/check/open_graph.rb +30 -25
- data/lib/html_proofer/check/scripts.rb +36 -28
- data/lib/html_proofer/check.rb +11 -10
- data/lib/html_proofer/configuration.rb +16 -15
- data/lib/html_proofer/element.rb +41 -19
- data/lib/html_proofer/log.rb +19 -19
- data/lib/html_proofer/reporter/cli.rb +22 -18
- data/lib/html_proofer/reporter.rb +3 -3
- data/lib/html_proofer/runner.rb +45 -44
- data/lib/html_proofer/url_validator/external.rb +157 -152
- data/lib/html_proofer/url_validator/internal.rb +72 -62
- data/lib/html_proofer/utils.rb +5 -5
- data/lib/html_proofer/version.rb +1 -1
- data/lib/html_proofer.rb +11 -9
- metadata +8 -7
data/lib/html_proofer/cache.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
3
|
+
require "date"
|
4
|
+
require "json"
|
5
|
+
require "uri"
|
6
6
|
|
7
7
|
module HTMLProofer
|
8
8
|
class Cache
|
@@ -10,8 +10,8 @@ module HTMLProofer
|
|
10
10
|
|
11
11
|
CACHE_VERSION = 2
|
12
12
|
|
13
|
-
DEFAULT_STORAGE_DIR = File.join(
|
14
|
-
DEFAULT_CACHE_FILE_NAME =
|
13
|
+
DEFAULT_STORAGE_DIR = File.join("tmp", ".htmlproofer")
|
14
|
+
DEFAULT_CACHE_FILE_NAME = "cache.json"
|
15
15
|
|
16
16
|
URI_REGEXP = URI::DEFAULT_PARSER.make_regexp
|
17
17
|
|
@@ -21,7 +21,7 @@ module HTMLProofer
|
|
21
21
|
@runner = runner
|
22
22
|
@logger = @runner.logger
|
23
23
|
|
24
|
-
@cache_datetime =
|
24
|
+
@cache_datetime = Time.now
|
25
25
|
@cache_time = @cache_datetime.to_time
|
26
26
|
|
27
27
|
if blank?(options)
|
@@ -29,28 +29,25 @@ module HTMLProofer
|
|
29
29
|
else
|
30
30
|
define_singleton_method(:enabled?) { true }
|
31
31
|
setup_cache!(options)
|
32
|
-
@parsed_timeframe = parsed_timeframe(options[:timeframe])
|
33
|
-
end
|
34
|
-
end
|
35
32
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
time = Time.parse(time) if time.is_a?(String)
|
40
|
-
(@parsed_timeframe..@cache_time).cover?(time)
|
33
|
+
@external_timeframe = parsed_timeframe(options[:timeframe][:external])
|
34
|
+
@internal_timeframe = parsed_timeframe(options[:timeframe][:internal])
|
35
|
+
end
|
41
36
|
end
|
42
37
|
|
43
38
|
def parsed_timeframe(timeframe)
|
39
|
+
return nil if timeframe.nil?
|
40
|
+
|
44
41
|
time, date = timeframe.match(/(\d+)(\D)/).captures
|
45
42
|
time = time.to_i
|
46
43
|
case date
|
47
|
-
when
|
44
|
+
when "M"
|
48
45
|
time_ago(time, :months)
|
49
|
-
when
|
46
|
+
when "w"
|
50
47
|
time_ago(time, :weeks)
|
51
|
-
when
|
48
|
+
when "d"
|
52
49
|
time_ago(time, :days)
|
53
|
-
when
|
50
|
+
when "h"
|
54
51
|
time_ago(time, :hours)
|
55
52
|
else
|
56
53
|
raise ArgumentError, "#{date} is not a valid timeframe!"
|
@@ -71,7 +68,8 @@ module HTMLProofer
|
|
71
68
|
found = status_code.between?(200, 299)
|
72
69
|
|
73
70
|
clean_url = cleaned_url(url)
|
74
|
-
@cache_log[:external][clean_url] =
|
71
|
+
@cache_log[:external][clean_url] =
|
72
|
+
{ time: @cache_time.to_s, found: found, status_code: status_code, message: msg, metadata: filenames }
|
75
73
|
end
|
76
74
|
|
77
75
|
def detect_url_changes(urls_detected, type)
|
@@ -82,39 +80,104 @@ module HTMLProofer
|
|
82
80
|
additions
|
83
81
|
end
|
84
82
|
|
83
|
+
def write
|
84
|
+
return unless enabled?
|
85
|
+
|
86
|
+
File.write(@cache_file, @cache_log.to_json)
|
87
|
+
end
|
88
|
+
|
89
|
+
def retrieve_urls(urls_detected, type)
|
90
|
+
# if there are no urls, bail
|
91
|
+
return {} if urls_detected.empty?
|
92
|
+
|
93
|
+
urls_detected = urls_detected.transform_keys do |url|
|
94
|
+
cleaned_url(url)
|
95
|
+
end
|
96
|
+
|
97
|
+
urls_to_check = detect_url_changes(urls_detected, type)
|
98
|
+
|
99
|
+
@cache_log[type].each_pair do |url, cache|
|
100
|
+
within_timeframe = type == :external ? within_external_timeframe?(cache[:time]) : within_internal_timeframe?(cache[:time])
|
101
|
+
next if within_timeframe
|
102
|
+
|
103
|
+
urls_to_check[url] = cache[:metadata] # recheck expired links
|
104
|
+
end
|
105
|
+
|
106
|
+
urls_to_check
|
107
|
+
end
|
108
|
+
|
109
|
+
def within_external_timeframe?(time)
|
110
|
+
within_timeframe?(time, @external_timeframe)
|
111
|
+
end
|
112
|
+
|
113
|
+
def within_internal_timeframe?(time)
|
114
|
+
within_timeframe?(time, @internal_timeframe)
|
115
|
+
end
|
116
|
+
|
117
|
+
def empty?
|
118
|
+
blank?(@cache_log) || (@cache_log[:internal].empty? && @cache_log[:external].empty?)
|
119
|
+
end
|
120
|
+
|
121
|
+
def size(type)
|
122
|
+
@cache_log[type].size
|
123
|
+
end
|
124
|
+
|
85
125
|
private def construct_internal_link_metadata(metadata, found)
|
86
126
|
{
|
87
127
|
source: metadata[:source],
|
88
|
-
|
128
|
+
filename: metadata[:filename],
|
89
129
|
line: metadata[:line],
|
90
130
|
base_url: metadata[:base_url],
|
91
|
-
found: found
|
131
|
+
found: found,
|
92
132
|
}
|
93
133
|
end
|
94
134
|
|
95
135
|
# prepare to add new URLs detected
|
96
136
|
private def determine_additions(urls_detected, type)
|
97
|
-
additions = urls_detected
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
137
|
+
additions = type == :external ? determine_external_additions(urls_detected) : determine_internal_additions(urls_detected)
|
138
|
+
|
139
|
+
new_link_count = additions.length
|
140
|
+
new_link_text = pluralize(new_link_count, "new #{type} link", "new #{type} links")
|
141
|
+
@logger.log(:debug, "Adding #{new_link_text} to the cache")
|
142
|
+
|
143
|
+
additions
|
144
|
+
end
|
145
|
+
|
146
|
+
private def determine_external_additions(urls_detected)
|
147
|
+
urls_detected.reject do |url, _metadata|
|
148
|
+
if @cache_log[:external].include?(url)
|
149
|
+
@cache_log[:external][url][:found] # if this is false, we're trying again
|
107
150
|
else
|
108
|
-
@logger.log
|
151
|
+
@logger.log(:debug, "Adding #{url} to external cache")
|
109
152
|
false
|
110
153
|
end
|
111
154
|
end
|
155
|
+
end
|
112
156
|
|
113
|
-
|
114
|
-
|
115
|
-
|
157
|
+
private def determine_internal_additions(urls_detected)
|
158
|
+
urls_detected.each_with_object({}) do |(url, metadata), hsh|
|
159
|
+
# url is not even in cache
|
160
|
+
if @cache_log[:internal][url].nil?
|
161
|
+
hsh[url] = metadata
|
162
|
+
next
|
163
|
+
end
|
116
164
|
|
117
|
-
|
165
|
+
cache_metadata = @cache_log[:internal][url][:metadata]
|
166
|
+
incoming_metadata = urls_detected[url].each_with_object([]) do |incoming_url, arr|
|
167
|
+
existing_cache_metadata = cache_metadata.find { |k, _| k[:filename] == incoming_url[:filename] }
|
168
|
+
|
169
|
+
# cache for this url, from an existing path, exists as found
|
170
|
+
if !existing_cache_metadata.nil? && !existing_cache_metadata.empty? && existing_cache_metadata[:found]
|
171
|
+
metadata.find { |m| m[:filename] == existing_cache_metadata[:filename] }[:found] = true
|
172
|
+
next
|
173
|
+
end
|
174
|
+
|
175
|
+
@logger.log(:debug, "Adding #{incoming_url} to internal cache")
|
176
|
+
arr << incoming_url
|
177
|
+
end
|
178
|
+
|
179
|
+
hsh[url] = incoming_metadata
|
180
|
+
end
|
118
181
|
end
|
119
182
|
|
120
183
|
# remove from cache URLs that no longer exist
|
@@ -125,54 +188,21 @@ module HTMLProofer
|
|
125
188
|
if urls_detected.include?(url)
|
126
189
|
false
|
127
190
|
elsif url_matches_type?(url, type)
|
128
|
-
@logger.log
|
191
|
+
@logger.log(:debug, "Removing #{url} from #{type} cache")
|
129
192
|
deletions += 1
|
130
193
|
true
|
131
194
|
end
|
132
195
|
end
|
133
196
|
|
134
197
|
del_link_text = pluralize(deletions, "outdated #{type} link", "outdated #{type} links")
|
135
|
-
@logger.log
|
136
|
-
end
|
137
|
-
|
138
|
-
def write
|
139
|
-
return unless enabled?
|
140
|
-
|
141
|
-
File.write(@cache_file, @cache_log.to_json)
|
142
|
-
end
|
143
|
-
|
144
|
-
def retrieve_urls(urls_detected, type)
|
145
|
-
# if there are no urls, bail
|
146
|
-
return {} if urls_detected.empty?
|
147
|
-
|
148
|
-
urls_detected = urls_detected.transform_keys do |url|
|
149
|
-
cleaned_url(url)
|
150
|
-
end
|
151
|
-
|
152
|
-
urls_to_check = detect_url_changes(urls_detected, type)
|
153
|
-
|
154
|
-
@cache_log[type].each_pair do |url, cache|
|
155
|
-
next if within_timeframe?(cache[:time])
|
156
|
-
|
157
|
-
urls_to_check[url] = cache[:metadata] # recheck expired links
|
158
|
-
end
|
159
|
-
|
160
|
-
urls_to_check
|
161
|
-
end
|
162
|
-
|
163
|
-
def empty?
|
164
|
-
blank?(@cache_log) || (@cache_log[:internal].empty? && @cache_log[:external].empty?)
|
165
|
-
end
|
166
|
-
|
167
|
-
def size(type)
|
168
|
-
@cache_log[type].size
|
198
|
+
@logger.log(:debug, "Removing #{del_link_text} from the cache")
|
169
199
|
end
|
170
200
|
|
171
201
|
private def setup_cache!(options)
|
172
202
|
default_structure = {
|
173
203
|
version: CACHE_VERSION,
|
174
204
|
internal: {},
|
175
|
-
external: {}
|
205
|
+
external: {},
|
176
206
|
}
|
177
207
|
|
178
208
|
@storage_dir = options[:storage_dir] || DEFAULT_STORAGE_DIR
|
@@ -193,26 +223,32 @@ module HTMLProofer
|
|
193
223
|
|
194
224
|
old_cache = (cache_version = log[:version]).nil?
|
195
225
|
@cache_log = if old_cache # previous cache version, create a new one
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
226
|
+
default_structure
|
227
|
+
elsif cache_version != CACHE_VERSION
|
228
|
+
# if cache version is newer...do something
|
229
|
+
else
|
230
|
+
log[:internal] = log[:internal].transform_keys(&:to_s)
|
231
|
+
log[:external] = log[:external].transform_keys(&:to_s)
|
232
|
+
log
|
233
|
+
end
|
204
234
|
end
|
205
235
|
|
236
|
+
# https://github.com/rails/rails/blob/3872bc0e54d32e8bf3a6299b0bfe173d94b072fc/activesupport/lib/active_support/duration.rb#L112-L117
|
237
|
+
SECONDS_PER_HOUR = 3600
|
238
|
+
SECONDS_PER_DAY = 86400
|
239
|
+
SECONDS_PER_WEEK = 604800
|
240
|
+
SECONDS_PER_MONTH = 2629746 # 1/12 of a gregorian year
|
241
|
+
|
206
242
|
private def time_ago(measurement, unit)
|
207
243
|
case unit
|
208
244
|
when :months
|
209
|
-
@cache_datetime
|
245
|
+
@cache_datetime - (SECONDS_PER_MONTH * measurement)
|
210
246
|
when :weeks
|
211
|
-
@cache_datetime - (
|
247
|
+
@cache_datetime - (SECONDS_PER_WEEK * measurement)
|
212
248
|
when :days
|
213
|
-
@cache_datetime - measurement
|
249
|
+
@cache_datetime - (SECONDS_PER_DAY * measurement)
|
214
250
|
when :hours
|
215
|
-
@cache_datetime - Rational(
|
251
|
+
@cache_datetime - Rational(SECONDS_PER_HOUR * measurement)
|
216
252
|
end.to_time
|
217
253
|
end
|
218
254
|
|
@@ -224,7 +260,7 @@ module HTMLProofer
|
|
224
260
|
private def cleaned_url(url)
|
225
261
|
cleaned_url = escape_unescape(url)
|
226
262
|
|
227
|
-
return cleaned_url unless cleaned_url.end_with?(
|
263
|
+
return cleaned_url unless cleaned_url.end_with?("/", "#", "?") && cleaned_url.length > 1
|
228
264
|
|
229
265
|
cleaned_url[0..-2]
|
230
266
|
end
|
@@ -232,5 +268,12 @@ module HTMLProofer
|
|
232
268
|
private def escape_unescape(url)
|
233
269
|
Addressable::URI.parse(url).normalize.to_s
|
234
270
|
end
|
271
|
+
|
272
|
+
private def within_timeframe?(current_time, parsed_timeframe)
|
273
|
+
return false if current_time.nil? || parsed_timeframe.nil?
|
274
|
+
|
275
|
+
current_time = Time.parse(current_time) if current_time.is_a?(String)
|
276
|
+
(parsed_timeframe..@cache_time).cover?(current_time)
|
277
|
+
end
|
235
278
|
end
|
236
279
|
end
|
@@ -1,35 +1,40 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
3
|
+
module HTMLProofer
|
4
|
+
class Check
|
5
|
+
class Favicon < HTMLProofer::Check
|
6
|
+
def run
|
7
|
+
found = false
|
8
|
+
@html.css("link").each do |node|
|
9
|
+
@favicon = create_element(node)
|
8
10
|
|
9
|
-
|
11
|
+
next if @favicon.ignore?
|
10
12
|
|
11
|
-
|
12
|
-
|
13
|
+
break if (found = @favicon.node["rel"].split.last.eql?("icon"))
|
14
|
+
end
|
13
15
|
|
14
|
-
|
16
|
+
return if immediate_redirect?
|
15
17
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
if found
|
19
|
+
if @favicon.url.remote?
|
20
|
+
add_to_external_urls(@favicon.url, @favicon.line)
|
21
|
+
elsif !@favicon.url.exists?
|
22
|
+
add_failure("internal favicon #{@favicon.url.raw_attribute} does not exist", line: @favicon.line,
|
23
|
+
content: @favicon.content)
|
24
|
+
end
|
25
|
+
else
|
26
|
+
add_failure("no favicon provided")
|
27
|
+
end
|
21
28
|
end
|
22
|
-
else
|
23
|
-
add_failure('no favicon provided')
|
24
|
-
end
|
25
|
-
end
|
26
29
|
|
27
|
-
|
30
|
+
private
|
28
31
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
32
|
+
# allow any instant-redirect meta tag
|
33
|
+
def immediate_redirect?
|
34
|
+
@html.xpath("//meta[@http-equiv='refresh']").attribute("content").value.start_with?("0;")
|
35
|
+
rescue StandardError
|
36
|
+
false
|
37
|
+
end
|
38
|
+
end
|
34
39
|
end
|
35
40
|
end
|
@@ -1,62 +1,102 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
3
|
+
module HTMLProofer
|
4
|
+
class Check
|
5
|
+
class Images < HTMLProofer::Check
|
6
|
+
SCREEN_SHOT_REGEX = /Screen(?: |%20)Shot(?: |%20)\d+-\d+-\d+(?: |%20)at(?: |%20)\d+.\d+.\d+/.freeze
|
7
|
+
|
8
|
+
def run
|
9
|
+
@html.css("img").each do |node|
|
10
|
+
@img = create_element(node)
|
11
|
+
|
12
|
+
next if @img.ignore?
|
13
|
+
|
14
|
+
# screenshot filenames should return because of terrible names
|
15
|
+
add_failure("image has a terrible filename (#{@img.url.raw_attribute})", line: @img.line,
|
16
|
+
content: @img.content) if terrible_filename?
|
17
|
+
|
18
|
+
# does the image exist?
|
19
|
+
if missing_src?
|
20
|
+
add_failure("image has no src or srcset attribute", line: @img.line, content: @img.content)
|
21
|
+
elsif @img.url.remote?
|
22
|
+
add_to_external_urls(@img.url, @img.line)
|
23
|
+
elsif !@img.url.exists? && !@img.multiple_srcsets? && !@img.multiple_sizes?
|
24
|
+
add_failure("internal image #{@img.url.raw_attribute} does not exist", line: @img.line,
|
25
|
+
content: @img.content)
|
26
|
+
elsif @img.multiple_srcsets?
|
27
|
+
@img.srcsets.each do |srcset|
|
28
|
+
srcset_url = HTMLProofer::Attribute::Url.new(@runner, srcset, base_url: @img.base_url)
|
29
|
+
|
30
|
+
if srcset_url.remote?
|
31
|
+
add_to_external_urls(srcset_url.url, @img.line)
|
32
|
+
elsif !srcset_url.exists?
|
33
|
+
add_failure("internal image #{srcset} does not exist", line: @img.line, content: @img.content)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
elsif @img.multiple_sizes?
|
37
|
+
@img.srcsets_wo_sizes.each do |srcset|
|
38
|
+
srcset_url = HTMLProofer::Attribute::Url.new(@runner, srcset, base_url: @img.base_url)
|
39
|
+
|
40
|
+
if srcset_url.remote?
|
41
|
+
add_to_external_urls(srcset_url.url, @img.line)
|
42
|
+
elsif !srcset_url.exists?
|
43
|
+
add_failure("internal image #{srcset} does not exist", line: @img.line, content: @img.content)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
unless ignore_element?
|
49
|
+
if missing_alt_tag? && !ignore_missing_alt?
|
50
|
+
add_failure("image #{@img.url.raw_attribute} does not have an alt attribute", line: @img.line,
|
51
|
+
content: @img.content)
|
52
|
+
elsif (empty_alt_tag? || alt_all_spaces?) && !ignore_empty_alt?
|
53
|
+
add_failure("image #{@img.url.raw_attribute} has an alt attribute, but no content", line: @img.line,
|
54
|
+
content: @img.content)
|
55
|
+
end
|
31
56
|
end
|
57
|
+
|
58
|
+
add_failure("image #{@img.url.raw_attribute} uses the http scheme", line: @img.line,
|
59
|
+
content: @img.content) if @runner.enforce_https? && @img.url.http?
|
32
60
|
end
|
61
|
+
|
62
|
+
external_urls
|
33
63
|
end
|
34
64
|
|
35
|
-
|
65
|
+
def ignore_missing_alt?
|
66
|
+
@runner.options[:ignore_missing_alt]
|
67
|
+
end
|
36
68
|
|
37
|
-
|
38
|
-
|
69
|
+
def ignore_empty_alt?
|
70
|
+
@runner.options[:ignore_empty_alt]
|
71
|
+
end
|
39
72
|
|
40
|
-
|
41
|
-
|
73
|
+
def ignore_element?
|
74
|
+
@img.url.ignore? || @img.aria_hidden?
|
75
|
+
end
|
42
76
|
|
43
|
-
|
44
|
-
|
45
|
-
|
77
|
+
def missing_alt_tag?
|
78
|
+
@img.node["alt"].nil?
|
79
|
+
end
|
46
80
|
|
47
|
-
|
48
|
-
|
49
|
-
|
81
|
+
def empty_alt_tag?
|
82
|
+
!missing_alt_tag? && @img.node["alt"].empty?
|
83
|
+
end
|
50
84
|
|
51
|
-
|
52
|
-
|
53
|
-
|
85
|
+
def empty_whitespace_alt_tag?
|
86
|
+
!missing_alt_tag? && @img.node["alt"].strip.empty?
|
87
|
+
end
|
54
88
|
|
55
|
-
|
56
|
-
|
57
|
-
|
89
|
+
def alt_all_spaces?
|
90
|
+
!missing_alt_tag? && @img.node["alt"].split.all?(" ")
|
91
|
+
end
|
92
|
+
|
93
|
+
def terrible_filename?
|
94
|
+
@img.url.to_s =~ SCREEN_SHOT_REGEX
|
95
|
+
end
|
58
96
|
|
59
|
-
|
60
|
-
|
97
|
+
def missing_src?
|
98
|
+
blank?(@img.url.to_s)
|
99
|
+
end
|
100
|
+
end
|
61
101
|
end
|
62
102
|
end
|