html-proofer 3.19.0 → 4.0.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/htmlproofer +31 -57
- data/lib/html-proofer.rb +1 -54
- data/lib/html_proofer/attribute/url.rb +231 -0
- data/lib/html_proofer/attribute.rb +15 -0
- data/lib/html_proofer/cache.rb +234 -0
- data/lib/html_proofer/check/favicon.rb +35 -0
- data/lib/html_proofer/check/images.rb +62 -0
- data/lib/html_proofer/check/links.rb +118 -0
- data/lib/html_proofer/check/open_graph.rb +34 -0
- data/lib/html_proofer/check/scripts.rb +38 -0
- data/lib/html_proofer/check.rb +91 -0
- data/lib/{html-proofer → html_proofer}/configuration.rb +32 -32
- data/lib/html_proofer/element.rb +122 -0
- data/lib/html_proofer/failure.rb +17 -0
- data/lib/{html-proofer → html_proofer}/log.rb +0 -0
- data/lib/html_proofer/reporter/cli.rb +29 -0
- data/lib/html_proofer/reporter.rb +23 -0
- data/lib/html_proofer/runner.rb +245 -0
- data/lib/html_proofer/url_validator/external.rb +189 -0
- data/lib/html_proofer/url_validator/internal.rb +86 -0
- data/lib/html_proofer/url_validator.rb +16 -0
- data/lib/{html-proofer → html_proofer}/utils.rb +6 -9
- data/lib/{html-proofer → html_proofer}/version.rb +1 -1
- data/lib/html_proofer/xpath_functions.rb +10 -0
- data/lib/html_proofer.rb +56 -0
- metadata +52 -31
- data/lib/html-proofer/cache.rb +0 -194
- data/lib/html-proofer/check/favicon.rb +0 -29
- data/lib/html-proofer/check/html.rb +0 -37
- data/lib/html-proofer/check/images.rb +0 -48
- data/lib/html-proofer/check/links.rb +0 -182
- data/lib/html-proofer/check/opengraph.rb +0 -46
- data/lib/html-proofer/check/scripts.rb +0 -42
- data/lib/html-proofer/check.rb +0 -75
- data/lib/html-proofer/element.rb +0 -261
- data/lib/html-proofer/issue.rb +0 -65
- data/lib/html-proofer/middleware.rb +0 -82
- data/lib/html-proofer/runner.rb +0 -248
- data/lib/html-proofer/url_validator.rb +0 -237
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bec55c40cc2d01b65496b138570cf434e533d045476470e4ce1e6b0daf3d5408
|
4
|
+
data.tar.gz: dd77aaf59adf3eaaa48a6b20dab59adbb0a5974b0a4ded5f9fd51e1fc9ba3684
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5036e6f46c4e0ac32bd9a6f4bd891244f25cb43670c40eee7c7421c661f9a3b6e24edd15507b6290fd102f37988529250cecf5800b63b6ed1f8622dd983c76ec
|
7
|
+
data.tar.gz: cb9fdec8ec8774e8a9607d9b92b859767b6726e45b0bc54d6b1eee993de03c1a0a81a50035fe32543e96a7f1ee1d420366076d1d5956761abd1201086e35f057
|
data/bin/htmlproofer
CHANGED
@@ -15,43 +15,32 @@ Mercenary.program(:htmlproofer) do |p|
|
|
15
15
|
|
16
16
|
p.description 'Runs the HTML-Proofer suite on the files in PATH. For more details, see the README.'
|
17
17
|
|
18
|
-
p.option '
|
19
|
-
p.option '
|
18
|
+
p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, assumes `href="#"` anchors are valid'
|
19
|
+
p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href`. In HTML5, this is technically allowed, but could also be human error.'
|
20
20
|
p.option 'as_links', '--as-links', 'Assumes that `PATH` is a comma-separated array of links to check.'
|
21
|
-
p.option '
|
22
|
-
p.option '
|
23
|
-
p.option '
|
24
|
-
p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists). This slows the checker down (default: `false`).'
|
25
|
-
p.option 'check_favicon', '--check-favicon', 'Enables the favicon checker (default: `false`).'
|
26
|
-
p.option 'check_html', '--check-html', 'Enables HTML validation errors from Nokogumbo (default: `false`).'
|
27
|
-
p.option 'check_img_http', '--check-img-http', 'Fails an image if it\'s marked as `http` (default: `false`).'
|
28
|
-
p.option 'check_opengraph', '--check-opengraph', 'Enables the Open Graph checker (default: `false`).'
|
21
|
+
p.option 'assume_extension', '--assume-extension <ext>', 'Automatically add specified extension to files for internal links, to allow extensionless URLs (as supported by most servers) (default: `.html`).'
|
22
|
+
p.option 'checks', '--checks check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you want to run (default: `["Links", "Images", "Scripts"]`)'
|
23
|
+
p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists) (default: `true`).'
|
29
24
|
p.option 'check_sri', '--check-sri', 'Check that `<link>` and `<script>` external resources use SRI (default: `false`).'
|
30
25
|
p.option 'directory_index_file', '--directory-index-file <filename>', String, 'Sets the file to look for when a link refers to a directory. (default: `index.html`)'
|
31
|
-
p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker
|
32
|
-
p.option '
|
33
|
-
p.option '
|
34
|
-
p.option '
|
35
|
-
p.option '
|
36
|
-
p.option '
|
37
|
-
p.option '
|
38
|
-
p.option '
|
39
|
-
p.option 'internal_domains', '--internal-domains domain1,[domain2,...]', Array, 'A comma-separated list of Strings containing domains that will be treated as internal urls.'
|
40
|
-
p.option 'report_invalid_tags', '--report-invalid-tags', 'When `check_html` is enabled, HTML markup that is unknown to Nokogumbo are reported as errors (default: `false`)'
|
41
|
-
p.option 'report_missing_names', '--report-missing-names', 'When `check_html` is enabled, HTML markup that are missing entity names are reported as errors (default: `false`)'
|
42
|
-
p.option 'report_script_embeds', '--report-script-embeds', 'When `check_html` is enabled, `script` tags containing markup are reported as errors (default: `false`)'
|
43
|
-
p.option 'report_missing_doctype', '--report-missing-doctype', 'When `check_html` is enabled, HTML markup with missing or out-of-order `DOCTYPE` are reported as errors (default: `false`)'
|
44
|
-
p.option 'report_eof_tags', '--report-eof-tags', 'When `check_html` is enabled, HTML markup with tags that are malformed are reported as errors (default: `false`)'
|
45
|
-
p.option 'report_mismatched_tags', '--report-mismatched-tags', 'When `check_html` is enabled, HTML markup with mismatched tags are reported as errors (default: `false`)'
|
26
|
+
p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker (default: `false`)'
|
27
|
+
p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `true`).'
|
28
|
+
p.option 'extensions', '--extensions ext1,[ext2,...[', Array, 'A comma-separated list of Strings indicating the file extensions you would like to check (including the dot) (default: `.html`)'
|
29
|
+
p.option 'ignore_files', '--ignore-files file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
|
30
|
+
p.option 'ignore_empty_mailto', '--ignore-empty-mailto', 'If `true`, allows `mailto:` `href`s which do not contain an email address'
|
31
|
+
p.option 'ignore_missing_alt', '--empty-alt-ignore', 'If `true`, ignores images with empty/missing alt tags'
|
32
|
+
p.option 'ignore_status_codes', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
|
33
|
+
p.option 'ignore_urls', '--ignore-urls link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. This affects all HTML attributes, such as `alt` tags on images.'
|
46
34
|
p.option 'log_level', '--log-level <level>', String, 'Sets the logging level, as determined by Yell. One of `:debug`, `:info`, `:warn`, `:error`, or `:fatal`. (default: `:info`)'
|
47
35
|
p.option 'only_4xx', '--only-4xx', 'Only reports errors for links that fall within the 4xx status code range'
|
48
|
-
p.option 'storage_dir', '--storage-dir PATH', String, 'Directory where to store the cache log (default: "tmp/.htmlproofer")'
|
49
|
-
p.option 'timeframe', '--timeframe <time>', String, 'A string representing the caching timeframe.'
|
50
|
-
p.option 'typhoeus_config', '--typhoeus-config CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
|
51
|
-
p.option 'hydra_config', '--hydra-config CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
|
52
|
-
p.option 'url_ignore', '--url-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. It affects all HTML attributes. Note that non-HTTP(S) URIs are always ignored'
|
53
|
-
p.option 'url_swap', '--url-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
|
54
36
|
p.option 'root_dir', '--root-dir PATH', String, 'The absolute path to the directory serving your html-files.'
|
37
|
+
p.option 'swap_attributes', '--swap-attributes CONFIG', String, 'JSON-formatted config that maps element names to the preferred attribute to check (default: `{}`).'
|
38
|
+
p.option 'swap_urls', '--swap-urls re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
|
39
|
+
|
40
|
+
p.option 'typhoeus', '--typhoeus CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
|
41
|
+
p.option 'hydra', '--hydra CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
|
42
|
+
p.option 'parallel', '--parallel CONFIG', String, 'JSON-formatted string of Parallel config. Will override the html-proofer defaults.'
|
43
|
+
p.option 'cache', '--cache CONFIG', String, 'JSON-formatted string of cache config. Will override the html-proofer defaults.'
|
55
44
|
|
56
45
|
p.action do |args, opts|
|
57
46
|
args = ['.'] if args.empty?
|
@@ -66,46 +55,31 @@ Mercenary.program(:htmlproofer) do |p|
|
|
66
55
|
end
|
67
56
|
|
68
57
|
# some minor manipulation of a special option
|
69
|
-
unless opts['
|
70
|
-
options[:
|
71
|
-
opts['
|
58
|
+
unless opts['swap_urls'].nil?
|
59
|
+
options[:swap_urls] = {}
|
60
|
+
opts['swap_urls'].each do |s|
|
72
61
|
splt = s.split(/(?<!\\):/, 2)
|
73
62
|
|
74
63
|
re = splt[0].gsub(/\\:/, ':')
|
75
64
|
string = splt[1].gsub(/\\:/, ':')
|
76
|
-
options[:
|
65
|
+
options[:swap_urls][Regexp.new(re)] = string
|
77
66
|
end
|
78
67
|
end
|
79
68
|
|
80
|
-
options[:error_sort] = opts['error-sort'].to_sym unless opts['error-sort'].nil?
|
81
69
|
options[:log_level] = opts['log_level'].to_sym unless opts['log_level'].nil?
|
82
70
|
|
83
|
-
options[:
|
84
|
-
options[:
|
85
|
-
options[:
|
86
|
-
options[:
|
87
|
-
options[:validation][:report_missing_doctype] = opts['report_missing_doctype'] unless opts['report_missing_doctype'].nil?
|
88
|
-
options[:validation][:report_eof_tags] = opts['report_eof_tags'] unless opts['report_eof_tags'].nil?
|
89
|
-
options[:validation][:report_mismatched_tags] = opts['report_mismatched_tags'] unless opts['report_mismatched_tags'].nil?
|
90
|
-
|
91
|
-
options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus_config', opts['typhoeus_config']) unless opts['typhoeus_config'].nil?
|
92
|
-
options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra_config', opts['hydra_config']) unless opts['hydra_config'].nil?
|
71
|
+
options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus', opts['typhoeus'], symbolize_names: false) unless opts['typhoeus'].nil?
|
72
|
+
options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra', opts['hydra']) unless opts['hydra'].nil?
|
73
|
+
options[:parallel] = HTMLProofer::Configuration.parse_json_option('parallel', opts['parallel']) unless opts['parallel'].nil?
|
74
|
+
options[:cache] = HTMLProofer::Configuration.parse_json_option('cache', opts['cache']) unless opts['cache'].nil?
|
93
75
|
|
94
|
-
unless opts['
|
95
|
-
options[:cache] ||= {}
|
96
|
-
options[:cache][:timeframe] = opts['timeframe'] unless opts['timeframe'].nil?
|
97
|
-
end
|
98
|
-
|
99
|
-
unless opts['storage_dir'].nil?
|
100
|
-
options[:cache] ||= {}
|
101
|
-
options[:cache][:storage_dir] = opts['storage_dir'] unless opts['storage_dir'].nil?
|
102
|
-
end
|
76
|
+
options[:swap_attributes] = HTMLProofer::Configuration.parse_json_option('swap_attributes', opts['swap_attributes'], symbolize_names: false) unless opts['swap_attributes'].nil?
|
103
77
|
|
104
|
-
options[:
|
78
|
+
options[:ignore_status_codes] = Array(options[:ignore_status_codes]).map(&:to_i)
|
105
79
|
|
106
80
|
paths = path.split(',')
|
107
81
|
if opts['as_links']
|
108
|
-
links = path.
|
82
|
+
links = path.split(',').map(&:strip)
|
109
83
|
HTMLProofer.check_links(links, options).run
|
110
84
|
elsif File.directory?(paths.first)
|
111
85
|
HTMLProofer.check_directories(paths, options).run
|
data/lib/html-proofer.rb
CHANGED
@@ -1,56 +1,3 @@
|
|
1
|
-
# rubocop:disable Naming/FileName
|
2
1
|
# frozen_string_literal: true
|
3
2
|
|
4
|
-
|
5
|
-
dir = File.join(File.dirname(__FILE__), path)
|
6
|
-
Dir[File.join(dir, '*.rb')].sort.each do |f|
|
7
|
-
require f
|
8
|
-
end
|
9
|
-
end
|
10
|
-
|
11
|
-
require_relative 'html-proofer/utils'
|
12
|
-
require_all 'html-proofer'
|
13
|
-
require_all 'html-proofer/check'
|
14
|
-
|
15
|
-
require 'parallel'
|
16
|
-
require 'fileutils'
|
17
|
-
|
18
|
-
begin
|
19
|
-
require 'awesome_print'
|
20
|
-
require 'pry-byebug'
|
21
|
-
rescue LoadError; end # rubocop:disable Lint/SuppressedException
|
22
|
-
module HTMLProofer
|
23
|
-
def self.check_file(file, options = {})
|
24
|
-
raise ArgumentError unless file.is_a?(String)
|
25
|
-
raise ArgumentError, "#{file} does not exist" unless File.exist?(file)
|
26
|
-
|
27
|
-
options[:type] = :file
|
28
|
-
HTMLProofer::Runner.new(file, options)
|
29
|
-
end
|
30
|
-
|
31
|
-
def self.check_directory(directory, options = {})
|
32
|
-
raise ArgumentError unless directory.is_a?(String)
|
33
|
-
raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
|
34
|
-
|
35
|
-
options[:type] = :directory
|
36
|
-
HTMLProofer::Runner.new([directory], options)
|
37
|
-
end
|
38
|
-
|
39
|
-
def self.check_directories(directories, options = {})
|
40
|
-
raise ArgumentError unless directories.is_a?(Array)
|
41
|
-
|
42
|
-
options[:type] = :directory
|
43
|
-
directories.each do |directory|
|
44
|
-
raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
|
45
|
-
end
|
46
|
-
HTMLProofer::Runner.new(directories, options)
|
47
|
-
end
|
48
|
-
|
49
|
-
def self.check_links(links, options = {})
|
50
|
-
raise ArgumentError unless links.is_a?(Array)
|
51
|
-
|
52
|
-
options[:type] = :links
|
53
|
-
HTMLProofer::Runner.new(links, options)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
# rubocop:enable Naming/FileName
|
3
|
+
require_relative 'html_proofer'
|
@@ -0,0 +1,231 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Wraps a URL-valued attribute and answers questions about it: whether it is
# remote or internal, which parts it has, and which file on disk it refers to.
#
# The runner handed to the constructor is consulted for `options`,
# `checked_paths`, `current_path` and `current_source`.
class HTMLProofer::Attribute::Url < HTMLProofer::Attribute
  attr_reader :url

  REMOTE_SCHEMES = %w[http https].freeze

  # Builds the working URL from the raw attribute value.
  #
  # @param runner the runner whose options drive swapping/ignoring
  # @param link_attribute [String, nil] raw attribute value; nil leaves `url` nil
  # @param base_url [String, nil] when not blank, the URL is joined onto it
  def initialize(runner, link_attribute, base_url: nil)
    super

    if @raw_attribute.nil?
      @url = nil
    else
      # drop zero-width spaces (U+200B) and surrounding whitespace
      @url = @raw_attribute.delete("\u200b").strip
      @url = Addressable::URI.join(base_url, @url).to_s unless blank?(base_url)

      swap_urls!
      clean_url!

      # convert "//" links to "https://"
      @url = "https:#{@url}" if @url.start_with?('//')
    end
  end

  # @return [String, nil] the processed URL
  def to_s
    @url
  end

  # True for hash links, and for paths whose extension (or, when the path has
  # none, the configured `:assume_extension`) is listed in `:extensions`.
  def known_extension?
    return true if hash_link?

    ext = File.extname(path)

    # no extension means we use the assumed one
    return @runner.options[:extensions].include?(@runner.options[:assume_extension]) if blank?(ext)

    @runner.options[:extensions].include?(ext)
  end

  def unknown_extension?
    !known_extension?
  end

  # True for `javascript:` URLs and for raw attributes matching `:ignore_urls`.
  #
  # @return [Boolean]
  def ignore?
    return true if /^javascript:/.match?(@url)

    ignores_pattern?(@runner.options[:ignore_urls])
  end

  # @return [Boolean] whether the URL could be parsed
  def valid?
    !parts.nil?
  end

  # @return [Boolean] whether the parsed URL has both a host and a path;
  #   false when the URL could not be parsed
  def path?
    return false if parts.nil?

    !parts.host.nil? && !parts.path.nil?
  end

  # Parsed form of the URL, or nil when parsing raises.
  def parts
    @parts ||= Addressable::URI.parse @url
  rescue URI::Error, Addressable::URI::InvalidURIError
    @parts = nil
  end

  # @return [String, nil] the unencoded path component
  def path
    Addressable::URI.unencode parts.path unless parts.nil?
  end

  # The URL fragment.
  #
  # NOTE(review): this shadows Object#hash, so instances should not be relied
  # on as Hash keys or Set members.
  def hash
    parts&.fragment
  end

  # Does the URL have a hash?
  def hash?
    !blank?(hash)
  end

  def scheme
    parts&.scheme
  end

  def remote?
    REMOTE_SCHEMES.include?(scheme)
  end

  def http?
    scheme == 'http'
  end

  def https?
    scheme == 'https'
  end

  # True when a scheme is present but it is neither http nor https.
  def non_http_remote?
    !scheme.nil? && !remote?
  end

  def host
    parts&.host
  end

  # Host and path concatenated; a missing host or path contributes nothing.
  #
  # @return [String]
  def domain_path
    "#{host}#{path}"
  end

  def query_values
    parts&.query_values
  end

  # checks if a file exists relative to the current pwd
  #
  # Results are memoized per absolute path in `@runner.checked_paths`.
  def exists?
    return true if base64?

    target = absolute_path
    return @runner.checked_paths[target] if @runner.checked_paths.key?(target)

    @runner.checked_paths[target] = File.exist?(target)
  end

  # True when the raw attribute is an inline `data:image` URI.
  def base64?
    /^data:image/.match?(@raw_attribute)
  end

  # The file this URL refers to (or the current path when the URL has no
  # path), expanded against the process working directory.
  def absolute_path
    path = file_path || @runner.current_path

    File.expand_path(path, Dir.pwd)
  end

  # Resolves the URL's path to a location on disk, applying `:root_dir`,
  # `:assume_extension` and `:directory_index_file`.
  #
  # @return [String, nil] nil when the URL has no path
  def file_path
    return if path.nil? || path.empty?

    assumed_ext = @runner.options[:assume_extension]
    path_dot_ext = blank?(assumed_ext) ? '' : path + assumed_ext

    base = if absolute_path?(path) # path relative to root
             # either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname
             @runner.options[:root_dir] || (File.directory?(@runner.current_source) ? @runner.current_source : File.dirname(@runner.current_source))
           # relative links (also in a nested dir), path is a file
           elsif File.exist?(File.expand_path(path, @runner.current_source)) ||
                 File.exist?(File.expand_path(path_dot_ext, @runner.current_source)) ||
                 File.exist?(File.join(File.dirname(@runner.current_path), path)) ||
                 File.exist?(File.join(File.dirname(@runner.current_path), path_dot_ext))
             File.dirname(@runner.current_path)
           # relative link, path is a directory
           else
             @runner.current_path
           end

    file = File.join(base, path)

    if assumed_ext && File.file?("#{file}#{assumed_ext}")
      file = "#{file}#{assumed_ext}"
    elsif File.directory?(file) && !unslashed_directory?(file) # implicit index support
      file = File.join file, @runner.options[:directory_index_file]
    end

    file
  end

  # True when `file` is a directory whose name lacks a trailing separator.
  def unslashed_directory?(file)
    File.directory?(file) && !file.end_with?(File::SEPARATOR)
  end

  def absolute_path?(path)
    path.start_with?('/')
  end

  # path is external to the file
  def external?
    !internal?
  end

  def internal?
    relative_link? || internal_absolute_link? || hash_link?
  end

  def internal_absolute_link?
    url.start_with?('/')
  end

  # Truthy (not strictly boolean) when the URL is not http(s) and looks relative.
  def relative_link?
    return false if remote?

    hash_link? || param_link? || url.start_with?('.') || url =~ /^\S/
  end

  # True when the URL only carries a fragment or a query string.
  def link_points_to_same_page?
    hash_link? || param_link?
  end

  def hash_link?
    url.start_with?('#')
  end

  def param_link?
    url.start_with?('?')
  end

  # The URL with its fragment removed.
  #
  # A plain-string pattern is used so that fragments containing regexp
  # metacharacters (".", "+", "(" ...) are matched literally.
  def sans_hash
    @url.to_s.sub("##{hash}", '')
  end

  # catch any obvious issues, like strings in port numbers
  #
  # URLs made up only of the accepted characters and %XX escapes are left
  # untouched; everything else is normalized through Addressable.
  #
  # NOTE(review): the character class used to interpolate
  # `Regexp.last_match(0)`, which is nil on entry to a method, so it effectively
  # began with the range `!-;`. That effective class is spelled out here. The
  # interpolation looks like an automated rewrite of a literal `#$&`; confirm
  # whether `"` and a bare `%` were really meant to be accepted.
  private def clean_url!
    return if /^([!-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/.match?(@url)

    @url = Addressable::URI.parse(@url).normalize.to_s
  end

  # Applies each `pattern => replacement` pair from `:swap_urls` via `gsub`.
  private def swap_urls!
    return @url if blank?(replacements = @runner.options[:swap_urls])

    replacements.each do |link, replace|
      @url = @url.gsub(link, replace)
    end
  end

  # Whether the raw attribute equals one of the given Strings or matches one
  # of the given Regexps; entries of any other type are skipped.
  #
  # @param links_to_ignore [Array<String, Regexp>, Object] non-Arrays yield false
  # @return [Boolean]
  private def ignores_pattern?(links_to_ignore)
    return false unless links_to_ignore.is_a?(Array)

    links_to_ignore.any? do |link_to_ignore|
      case link_to_ignore
      when String then link_to_ignore == @raw_attribute
      when Regexp then link_to_ignore.match?(@raw_attribute)
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module HTMLProofer
  # Base wrapper around one raw attribute value, paired with the runner that
  # is processing it.
  class Attribute
    include HTMLProofer::Utils

    attr_reader :raw_attribute

    # Stores the runner and the untouched attribute value. Any keyword
    # arguments are accepted and discarded.
    def initialize(runner, raw_attribute, **_)
      @runner, @raw_attribute = runner, raw_attribute
    end
  end
end
|
@@ -0,0 +1,234 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'date'
|
4
|
+
require 'json'
|
5
|
+
require 'uri'
|
6
|
+
|
7
|
+
module HTMLProofer
  # JSON-backed log of previously checked internal and external URLs, used to
  # decide which URLs need to be checked again on this run.
  class Cache
    include HTMLProofer::Utils

    CACHE_VERSION = 2

    DEFAULT_STORAGE_DIR = File.join('tmp', '.htmlproofer')
    DEFAULT_CACHE_FILE_NAME = 'cache.json'

    URI_REGEXP = URI::DEFAULT_PARSER.make_regexp

    attr_reader :exists, :cache_log, :storage_dir, :cache_file

    # @param runner the runner; only its `logger` is read here
    # @param options [Hash, nil] cache options (`:timeframe`, `:storage_dir`,
    #   `:cache_file`); a blank value disables the cache
    # @raise [ArgumentError] when the cache is enabled and `:timeframe` is
    #   missing or malformed
    def initialize(runner, options)
      @runner = runner
      @logger = @runner.logger

      @cache_datetime = DateTime.now
      @cache_time = @cache_datetime.to_time

      @enabled = !blank?(options)
      return unless @enabled

      setup_cache!(options)
      @parsed_timeframe = parsed_timeframe(options[:timeframe])
    end

    # @return [Boolean] whether cache options were supplied
    def enabled?
      @enabled
    end

    # Whether `time` lies between the start of the configured timeframe and
    # the moment this cache object was created.
    #
    # NOTE(review): String values go through `Time.parse`, which comes from the
    # `time` stdlib; this file only requires date/json/uri, so presumably it is
    # loaded elsewhere - confirm.
    #
    # @param time [Time, String, nil]
    # @return [Boolean]
    def within_timeframe?(time)
      return false if time.nil?

      time = Time.parse(time) if time.is_a?(String)
      (@parsed_timeframe..@cache_time).cover?(time)
    end

    # Converts a timeframe such as "30d" into the Time that far in the past.
    # Units: `M` months, `w` weeks, `d` days, `h` hours.
    #
    # @param timeframe [String]
    # @return [Time]
    # @raise [ArgumentError] when the value has no "<number><unit>" shape or
    #   the unit is unknown
    def parsed_timeframe(timeframe)
      match = timeframe.to_s.match(/(\d+)(\D)/)
      # a missing/malformed timeframe used to surface as NoMethodError on nil
      raise ArgumentError, "#{timeframe.inspect} is not a valid timeframe!" if match.nil?

      amount = match[1].to_i
      unit = match[2]

      case unit
      when 'M' then time_ago(amount, :months)
      when 'w' then time_ago(amount, :weeks)
      when 'd' then time_ago(amount, :days)
      when 'h' then time_ago(amount, :hours)
      else
        raise ArgumentError, "#{unit} is not a valid timeframe!"
      end
    end

    # Appends one occurrence of an internal URL to the log, creating the
    # entry on first sight. No-op when the cache is disabled.
    def add_internal(url, metadata, found)
      return unless enabled?

      @cache_log[:internal][url] = { time: @cache_time, metadata: [] } if @cache_log[:internal][url].nil?

      @cache_log[:internal][url][:metadata] << construct_internal_link_metadata(metadata, found)
    end

    # Records (or overwrites) the result for an external URL; a 2xx status
    # counts as found. No-op when the cache is disabled.
    def add_external(url, filenames, status_code, msg)
      return unless enabled?

      found = status_code.between?(200, 299)

      clean_url = cleaned_url(url)
      @cache_log[:external][clean_url] = { time: @cache_time.to_s, found: found, status_code: status_code, message: msg, metadata: filenames }
    end

    # Prunes log entries that were not detected on this run and returns the
    # detected URLs that still need checking.
    def detect_url_changes(urls_detected, type)
      additions = determine_additions(urls_detected, type)

      determine_deletions(urls_detected, type)

      additions
    end

    private def construct_internal_link_metadata(metadata, found)
      {
        source: metadata[:source],
        current_path: metadata[:current_path],
        line: metadata[:line],
        base_url: metadata[:base_url],
        found: found
      }
    end

    # prepare to add new URLs detected
    #
    # A detected URL is dropped from the result when the log already has it
    # and either (external) it was found, or (internal) none of the freshly
    # stored metadata entries is marked found. Everything else is returned.
    private def determine_additions(urls_detected, type)
      additions = urls_detected.reject do |url, metadata|
        url = cleaned_url(url)

        if @cache_log[type].include?(url)
          @cache_log[type][url][:metadata] = metadata

          # if this is false, we're trying again
          if type == :external
            @cache_log[type][url][:found]
          else
            @cache_log[type][url][:metadata].none? { |m| m[:found] }
          end
        else
          @logger.log :debug, "Adding #{url} to #{type} cache"
          false
        end
      end

      new_link_count = additions.length
      new_link_text = pluralize(new_link_count, "new #{type} link", "new #{type} links")
      @logger.log :debug, "Adding #{new_link_text} to the cache"

      additions
    end

    # remove from cache URLs that no longer exist
    private def determine_deletions(urls_detected, type)
      deletions = 0

      @cache_log[type].delete_if do |url, _|
        url = cleaned_url(url)

        if urls_detected.include?(url)
          false
        elsif url_matches_type?(url, type)
          @logger.log :debug, "Removing #{url} from #{type} cache"
          deletions += 1
          true
        end
      end

      del_link_text = pluralize(deletions, "outdated #{type} link", "outdated #{type} links")
      @logger.log :debug, "Removing #{del_link_text} from the cache"
    end

    # Serializes the log as JSON into the cache file. No-op when disabled.
    def write
      return unless enabled?

      File.write(@cache_file, @cache_log.to_json)
    end

    # URLs to check on this run: the ones `detect_url_changes` returns plus
    # every logged entry whose timestamp has fallen outside the timeframe.
    def retrieve_urls(urls_detected, type)
      # if there are no urls, bail
      return {} if urls_detected.empty?

      urls_to_check = detect_url_changes(urls_detected, type)

      @cache_log[type].each_pair do |url, cache|
        next if within_timeframe?(cache[:time])

        urls_to_check[url] = cache[:metadata] # recheck expired links
      end

      urls_to_check
    end

    def empty?
      blank?(@cache_log) || (@cache_log[:internal].empty? && @cache_log[:external].empty?)
    end

    def size(type)
      @cache_log[type].size
    end

    # Prepares the storage directory and loads the cache file into
    # `@cache_log`, falling back to an empty structure when there is nothing
    # usable on disk.
    private def setup_cache!(options)
      default_structure = {
        version: CACHE_VERSION,
        internal: {},
        external: {}
      }

      @storage_dir = options[:storage_dir] || DEFAULT_STORAGE_DIR

      FileUtils.mkdir_p(storage_dir) unless Dir.exist?(storage_dir)

      cache_file_name = options[:cache_file] || DEFAULT_CACHE_FILE_NAME

      @cache_file = File.join(storage_dir, cache_file_name)

      return (@cache_log = default_structure) unless File.exist?(@cache_file)

      contents = File.read(@cache_file)

      return (@cache_log = default_structure) if blank?(contents)

      log = JSON.parse(contents, symbolize_names: true)

      # A cache without a version (previous format) or with any other version
      # is discarded. The different-version case used to leave @cache_log nil,
      # which broke every later lookup.
      return (@cache_log = default_structure) if log[:version] != CACHE_VERSION

      # symbolize_names turned the URL keys into symbols; restore strings
      log[:internal] = log[:internal].transform_keys(&:to_s)
      log[:external] = log[:external].transform_keys(&:to_s)
      @cache_log = log
    end

    # The Time `measurement` units before this cache object was created.
    private def time_ago(measurement, unit)
      case unit
      when :months
        @cache_datetime >> -measurement
      when :weeks
        @cache_datetime - (measurement * 7)
      when :days
        @cache_datetime - measurement
      when :hours
        @cache_datetime - Rational(measurement / 24.0)
      end.to_time
    end

    # Internal entries are those that do NOT match URI_REGEXP, external
    # entries are those that do.
    #
    # @return [Boolean]
    private def url_matches_type?(url, type)
      looks_like_uri = URI_REGEXP.match?(url)

      (type == :internal && !looks_like_uri) || (type == :external && looks_like_uri)
    end

    # Normalizes the URL after dropping a single trailing "/", "#" or "?"
    # (unless that character is the whole URL).
    private def cleaned_url(url)
      return escape_unescape(url) unless url.end_with?('/', '#', '?') && url.length > 1

      escape_unescape(url[0..-2])
    end

    private def escape_unescape(url)
      Addressable::URI.parse(url).normalize.to_s
    end
  end
end
|