html-proofer 3.19.2 → 4.0.0.rc3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/bin/htmlproofer +31 -57
  3. data/lib/html-proofer.rb +1 -54
  4. data/lib/html_proofer/attribute/url.rb +231 -0
  5. data/lib/html_proofer/attribute.rb +15 -0
  6. data/lib/html_proofer/cache.rb +236 -0
  7. data/lib/html_proofer/check/favicon.rb +35 -0
  8. data/lib/html_proofer/check/images.rb +62 -0
  9. data/lib/html_proofer/check/links.rb +118 -0
  10. data/lib/html_proofer/check/open_graph.rb +34 -0
  11. data/lib/html_proofer/check/scripts.rb +38 -0
  12. data/lib/html_proofer/check.rb +91 -0
  13. data/lib/{html-proofer → html_proofer}/configuration.rb +30 -30
  14. data/lib/html_proofer/element.rb +122 -0
  15. data/lib/html_proofer/failure.rb +17 -0
  16. data/lib/{html-proofer → html_proofer}/log.rb +0 -0
  17. data/lib/html_proofer/reporter/cli.rb +29 -0
  18. data/lib/html_proofer/reporter.rb +23 -0
  19. data/lib/html_proofer/runner.rb +245 -0
  20. data/lib/html_proofer/url_validator/external.rb +189 -0
  21. data/lib/html_proofer/url_validator/internal.rb +86 -0
  22. data/lib/html_proofer/url_validator.rb +16 -0
  23. data/lib/{html-proofer → html_proofer}/utils.rb +6 -9
  24. data/lib/{html-proofer → html_proofer}/version.rb +1 -1
  25. data/lib/html_proofer/xpath_functions.rb +10 -0
  26. data/lib/html_proofer.rb +55 -0
  27. metadata +51 -30
  28. data/lib/html-proofer/cache.rb +0 -194
  29. data/lib/html-proofer/check/favicon.rb +0 -29
  30. data/lib/html-proofer/check/html.rb +0 -37
  31. data/lib/html-proofer/check/images.rb +0 -48
  32. data/lib/html-proofer/check/links.rb +0 -182
  33. data/lib/html-proofer/check/opengraph.rb +0 -46
  34. data/lib/html-proofer/check/scripts.rb +0 -42
  35. data/lib/html-proofer/check.rb +0 -75
  36. data/lib/html-proofer/element.rb +0 -261
  37. data/lib/html-proofer/issue.rb +0 -65
  38. data/lib/html-proofer/middleware.rb +0 -82
  39. data/lib/html-proofer/runner.rb +0 -248
  40. data/lib/html-proofer/url_validator.rb +0 -237
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d06dbda8bf9baad3be96b5565fcb86de0892d4241c6b7ede08c3c2d7203f6752
4
- data.tar.gz: da895d696b7b9d1f3ca9c2e504e0c375ce9942f53dd5c2174f80570d32e3ad5b
3
+ metadata.gz: 1c88c30b96819085add5b171ae6c2b61ed24637d4acf25a0258b19fed2e6aa6b
4
+ data.tar.gz: 14d8383987c15edcb2868f35e7d6a56369cadd296bf0e31d5e0ed5d3ccd0f6d3
5
5
  SHA512:
6
- metadata.gz: 693f677cb91b9b0e79135ef27ed2771d34d7e5fee5bd368b20b5791a4e25a468ba769d299c2cd1417eccd989d9ac1290cfaf9dfd90b9534fa0af0440382e4e0f
7
- data.tar.gz: c6c33f309e3f8b00dc1721653f2e2099c647c9bd8d3b4ecaab4ec7b97161925911d1e6e3c8cabb30b9b9eb2274fed7238ed338da7d85d25b7c6981d537f6435d
6
+ metadata.gz: 3b2f276abff0c540bb08fc26d6267bacefc8e1a9bf2f5e2aa1dfb3294398b9a63fbcfe906baa24b013dd9bf19052f826ce5d47bfce53572381aba2e948b8f918
7
+ data.tar.gz: fff8dcd929eed104ab69543f49e7cba4005165f909505d82bd857d1791545593d61aa988a2ef9d91c9887b2a2c57aab8bffeb3e474f8b01521827a6abbf8d475
data/bin/htmlproofer CHANGED
@@ -15,43 +15,32 @@ Mercenary.program(:htmlproofer) do |p|
15
15
 
16
16
  p.description 'Runs the HTML-Proofer suite on the files in PATH. For more details, see the README.'
17
17
 
18
- p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href` (this is the default for HTML5).'
19
- p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, ignores the `href="#"`'
18
+ p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, assumes `href="#"` anchors are valid'
19
+ p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href`. In HTML5, this is technically allowed, but could also be human error.'
20
20
  p.option 'as_links', '--as-links', 'Assumes that `PATH` is a comma-separated array of links to check.'
21
- p.option 'alt_ignore', '--alt-ignore image1,[image2,...]', Array, 'A comma-separated list of Strings or RegExps containing `img`s whose missing `alt` tags are safe to ignore'
22
- p.option 'assume_extension', '--assume-extension', 'Automatically add extension (e.g. `.html`) to file paths, to allow extensionless URLs (as supported by Jekyll 3 and GitHub Pages) (default: `false`).'
23
- p.option 'checks_to_ignore', '--checks-to-ignore check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you do not want to run (default: `[]`)'
24
- p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists). This slows the checker down (default: `false`).'
25
- p.option 'check_favicon', '--check-favicon', 'Enables the favicon checker (default: `false`).'
26
- p.option 'check_html', '--check-html', 'Enables HTML validation errors from Nokogumbo (default: `false`).'
27
- p.option 'check_img_http', '--check-img-http', 'Fails an image if it\'s marked as `http` (default: `false`).'
28
- p.option 'check_opengraph', '--check-opengraph', 'Enables the Open Graph checker (default: `false`).'
21
+ p.option 'assume_extension', '--assume-extension <ext>', 'Automatically add specified extension to files for internal links, to allow extensionless URLs (as supported by most servers) (default: `.html`).'
22
+ p.option 'checks', '--checks check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you want to run (default: `["Links", "Images", "Scripts"]`)'
23
+ p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists) (default: `true`).'
29
24
  p.option 'check_sri', '--check-sri', 'Check that `<link>` and `<script>` external resources use SRI (default: `false`).'
30
25
  p.option 'directory_index_file', '--directory-index-file <filename>', String, 'Sets the file to look for when a link refers to a directory. (default: `index.html`)'
31
- p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker, which can take a lot of time (default: `false`)'
32
- p.option 'empty_alt_ignore', '--empty-alt-ignore', 'If `true`, ignores images with empty alt tags'
33
- p.option 'error_sort', '--error-sort <sort>', String, 'Defines the sort order for error output. Can be `:path`, `:desc`, or `:status` (default: `:path`).'
34
- p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `false`).'
35
- p.option 'extension', '--extension <ext>', String, 'The extension of your HTML files including the dot. (default: `.html`)'
36
- p.option 'external_only', '--external_only', 'Only checks problems with external references'
37
- p.option 'file_ignore', '--file-ignore file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
38
- p.option 'http_status_ignore', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
39
- p.option 'internal_domains', '--internal-domains domain1,[domain2,...]', Array, 'A comma-separated list of Strings containing domains that will be treated as internal urls.'
40
- p.option 'report_invalid_tags', '--report-invalid-tags', 'When `check_html` is enabled, HTML markup that is unknown to Nokogumbo are reported as errors (default: `false`)'
41
- p.option 'report_missing_names', '--report-missing-names', 'When `check_html` is enabled, HTML markup that are missing entity names are reported as errors (default: `false`)'
42
- p.option 'report_script_embeds', '--report-script-embeds', 'When `check_html` is enabled, `script` tags containing markup are reported as errors (default: `false`)'
43
- p.option 'report_missing_doctype', '--report-missing-doctype', 'When `check_html` is enabled, HTML markup with missing or out-of-order `DOCTYPE` are reported as errors (default: `false`)'
44
- p.option 'report_eof_tags', '--report-eof-tags', 'When `check_html` is enabled, HTML markup with tags that are malformed are reported as errors (default: `false`)'
45
- p.option 'report_mismatched_tags', '--report-mismatched-tags', 'When `check_html` is enabled, HTML markup with mismatched tags are reported as errors (default: `false`)'
26
+ p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker (default: `false`)'
27
+ p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `true`).'
28
+ p.option 'extensions', '--extensions ext1,[ext2,...[', Array, 'A comma-separated list of Strings indicating the file extensions you would like to check (including the dot) (default: `.html`)'
29
+ p.option 'ignore_files', '--ignore-files file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
30
+ p.option 'ignore_empty_mailto', '--ignore-empty-mailto', 'If `true`, allows `mailto:` `href`s which do not contain an email address'
31
+ p.option 'ignore_missing_alt', '--empty-alt-ignore', 'If `true`, ignores images with empty/missing alt tags'
32
+ p.option 'ignore_status_codes', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
33
+ p.option 'ignore_urls', '--ignore-urls link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. This affects all HTML attributes, such as `alt` tags on images.'
46
34
  p.option 'log_level', '--log-level <level>', String, 'Sets the logging level, as determined by Yell. One of `:debug`, `:info`, `:warn`, `:error`, or `:fatal`. (default: `:info`)'
47
35
  p.option 'only_4xx', '--only-4xx', 'Only reports errors for links that fall within the 4xx status code range'
48
- p.option 'storage_dir', '--storage-dir PATH', String, 'Directory where to store the cache log (default: "tmp/.htmlproofer")'
49
- p.option 'timeframe', '--timeframe <time>', String, 'A string representing the caching timeframe.'
50
- p.option 'typhoeus_config', '--typhoeus-config CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
51
- p.option 'hydra_config', '--hydra-config CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
52
- p.option 'url_ignore', '--url-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. It affects all HTML attributes. Note that non-HTTP(S) URIs are always ignored'
53
- p.option 'url_swap', '--url-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
54
36
  p.option 'root_dir', '--root-dir PATH', String, 'The absolute path to the directory serving your html-files.'
37
+ p.option 'swap_attributes', '--swap-attributes CONFIG', String, 'JSON-formatted config that maps element names to the preferred attribute to check (default: `{}`).'
38
+ p.option 'swap_urls', '--swap-urls re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
39
+
40
+ p.option 'typhoeus', '--typhoeus CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
41
+ p.option 'hydra', '--hydra CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
42
+ p.option 'parallel', '--parallel CONFIG', String, 'JSON-formatted string of Parallel config. Will override the html-proofer defaults.'
43
+ p.option 'cache', '--cache CONFIG', String, 'JSON-formatted string of cache config. Will override the html-proofer defaults.'
55
44
 
56
45
  p.action do |args, opts|
57
46
  args = ['.'] if args.empty?
@@ -66,46 +55,31 @@ Mercenary.program(:htmlproofer) do |p|
66
55
  end
67
56
 
68
57
  # some minor manipulation of a special option
69
- unless opts['url_swap'].nil?
70
- options[:url_swap] = {}
71
- opts['url_swap'].each do |s|
58
+ unless opts['swap_urls'].nil?
59
+ options[:swap_urls] = {}
60
+ opts['swap_urls'].each do |s|
72
61
  splt = s.split(/(?<!\\):/, 2)
73
62
 
74
63
  re = splt[0].gsub(/\\:/, ':')
75
64
  string = splt[1].gsub(/\\:/, ':')
76
- options[:url_swap][Regexp.new(re)] = string
65
+ options[:swap_urls][Regexp.new(re)] = string
77
66
  end
78
67
  end
79
68
 
80
- options[:error_sort] = opts['error-sort'].to_sym unless opts['error-sort'].nil?
81
69
  options[:log_level] = opts['log_level'].to_sym unless opts['log_level'].nil?
82
70
 
83
- options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.dup
84
- options[:validation][:report_script_embeds] = opts['report_script_embeds'] unless opts['report_script_embeds'].nil?
85
- options[:validation][:report_missing_names] = opts['report_missing_names'] unless opts['report_missing_names'].nil?
86
- options[:validation][:report_invalid_tags] = opts['report_invalid_tags'] unless opts['report_invalid_tags'].nil?
87
- options[:validation][:report_missing_doctype] = opts['report_missing_doctype'] unless opts['report_missing_doctype'].nil?
88
- options[:validation][:report_eof_tags] = opts['report_eof_tags'] unless opts['report_eof_tags'].nil?
89
- options[:validation][:report_mismatched_tags] = opts['report_mismatched_tags'] unless opts['report_mismatched_tags'].nil?
90
-
91
- options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus_config', opts['typhoeus_config'], symbolize_names: false) unless opts['typhoeus_config'].nil?
92
- options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra_config', opts['hydra_config']) unless opts['hydra_config'].nil?
71
+ options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus', opts['typhoeus'], symbolize_names: false) unless opts['typhoeus'].nil?
72
+ options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra', opts['hydra']) unless opts['hydra'].nil?
73
+ options[:parallel] = HTMLProofer::Configuration.parse_json_option('parallel', opts['parallel']) unless opts['parallel'].nil?
74
+ options[:cache] = HTMLProofer::Configuration.parse_json_option('cache', opts['cache']) unless opts['cache'].nil?
93
75
 
94
- unless opts['timeframe'].nil?
95
- options[:cache] ||= {}
96
- options[:cache][:timeframe] = opts['timeframe'] unless opts['timeframe'].nil?
97
- end
98
-
99
- unless opts['storage_dir'].nil?
100
- options[:cache] ||= {}
101
- options[:cache][:storage_dir] = opts['storage_dir'] unless opts['storage_dir'].nil?
102
- end
76
+ options[:swap_attributes] = HTMLProofer::Configuration.parse_json_option('swap_attributes', opts['swap_attributes'], symbolize_names: false) unless opts['swap_attributes'].nil?
103
77
 
104
- options[:http_status_ignore] = Array(options[:http_status_ignore]).map(&:to_i)
78
+ options[:ignore_status_codes] = Array(options[:ignore_status_codes]).map(&:to_i)
105
79
 
106
80
  paths = path.split(',')
107
81
  if opts['as_links']
108
- links = path.delete(' ').split(',')
82
+ links = path.split(',').map(&:strip)
109
83
  HTMLProofer.check_links(links, options).run
110
84
  elsif File.directory?(paths.first)
111
85
  HTMLProofer.check_directories(paths, options).run
data/lib/html-proofer.rb CHANGED
@@ -1,56 +1,3 @@
1
- # rubocop:disable Naming/FileName
2
1
  # frozen_string_literal: true
3
2
 
4
- def require_all(path)
5
- dir = File.join(File.dirname(__FILE__), path)
6
- Dir[File.join(dir, '*.rb')].sort.each do |f|
7
- require f
8
- end
9
- end
10
-
11
- require_relative 'html-proofer/utils'
12
- require_all 'html-proofer'
13
- require_all 'html-proofer/check'
14
-
15
- require 'parallel'
16
- require 'fileutils'
17
-
18
- begin
19
- require 'awesome_print'
20
- require 'pry-byebug'
21
- rescue LoadError; end # rubocop:disable Lint/SuppressedException
22
- module HTMLProofer
23
- def self.check_file(file, options = {})
24
- raise ArgumentError unless file.is_a?(String)
25
- raise ArgumentError, "#{file} does not exist" unless File.exist?(file)
26
-
27
- options[:type] = :file
28
- HTMLProofer::Runner.new(file, options)
29
- end
30
-
31
- def self.check_directory(directory, options = {})
32
- raise ArgumentError unless directory.is_a?(String)
33
- raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
34
-
35
- options[:type] = :directory
36
- HTMLProofer::Runner.new([directory], options)
37
- end
38
-
39
- def self.check_directories(directories, options = {})
40
- raise ArgumentError unless directories.is_a?(Array)
41
-
42
- options[:type] = :directory
43
- directories.each do |directory|
44
- raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
45
- end
46
- HTMLProofer::Runner.new(directories, options)
47
- end
48
-
49
- def self.check_links(links, options = {})
50
- raise ArgumentError unless links.is_a?(Array)
51
-
52
- options[:type] = :links
53
- HTMLProofer::Runner.new(links, options)
54
- end
55
- end
56
- # rubocop:enable Naming/FileName
3
+ require_relative 'html_proofer'
data/lib/html_proofer/attribute/url.rb ADDED
@@ -0,0 +1,231 @@
1
+ # frozen_string_literal: true
2
+
3
+ class HTMLProofer::Attribute::Url < HTMLProofer::Attribute
4
+ attr_reader :url
5
+
6
+ REMOTE_SCHEMES = %w[http https].freeze
7
+
8
+ def initialize(runner, link_attribute, base_url: nil)
9
+ super
10
+
11
+ if @raw_attribute.nil?
12
+ @url = nil
13
+ else
14
+ @url = @raw_attribute.delete("\u200b").strip
15
+ @url = Addressable::URI.join(base_url, @url).to_s unless blank?(base_url)
16
+
17
+ swap_urls!
18
+ clean_url!
19
+
20
+ # convert "//" links to "https://"
21
+ @url.start_with?('//') ? @url = "https:#{@url}" : @url
22
+ end
23
+ end
24
+
25
+ def to_s
26
+ @url
27
+ end
28
+
29
+ def known_extension?
30
+ return true if hash_link?
31
+
32
+ ext = File.extname(path)
33
+
34
+ # no extension means we use the assumed one
35
+ return @runner.options[:extensions].include?(@runner.options[:assume_extension]) if blank?(ext)
36
+
37
+ @runner.options[:extensions].include?(ext)
38
+ end
39
+
40
+ def unknown_extension?
41
+ !known_extension?
42
+ end
43
+
44
+ def ignore?
45
+ return true if (/^javascript:/).match?(@url)
46
+ return true if ignores_pattern?(@runner.options[:ignore_urls])
47
+ end
48
+
49
+ def valid?
50
+ !parts.nil?
51
+ end
52
+
53
+ def path?
54
+ !parts.host.nil? && !parts.path.nil?
55
+ end
56
+
57
+ def parts
58
+ @parts ||= Addressable::URI.parse @url
59
+ rescue URI::Error, Addressable::URI::InvalidURIError
60
+ @parts = nil
61
+ end
62
+
63
+ def path
64
+ Addressable::URI.unencode parts.path unless parts.nil?
65
+ end
66
+
67
+ def hash
68
+ parts&.fragment
69
+ end
70
+
71
+ # Does the URL have a hash?
72
+ def hash?
73
+ !blank?(hash)
74
+ end
75
+
76
+ def scheme
77
+ parts&.scheme
78
+ end
79
+
80
+ def remote?
81
+ REMOTE_SCHEMES.include?(scheme)
82
+ end
83
+
84
+ def http?
85
+ scheme == 'http'
86
+ end
87
+
88
+ def https?
89
+ scheme == 'https'
90
+ end
91
+
92
+ def non_http_remote?
93
+ !scheme.nil? && !remote?
94
+ end
95
+
96
+ def host
97
+ parts&.host
98
+ end
99
+
100
+ def domain_path
101
+ (host || '') + path
102
+ end
103
+
104
+ def query_values
105
+ parts&.query_values
106
+ end
107
+
108
+ # checks if a file exists relative to the current pwd
109
+ def exists?
110
+ return true if base64?
111
+
112
+ return @runner.checked_paths[absolute_path] if @runner.checked_paths.key?(absolute_path)
113
+
114
+ @runner.checked_paths[absolute_path] = File.exist?(absolute_path)
115
+ end
116
+
117
+ def base64?
118
+ /^data:image/.match?(@raw_attribute)
119
+ end
120
+
121
+ def absolute_path
122
+ path = file_path || @runner.current_path
123
+
124
+ File.expand_path(path, Dir.pwd)
125
+ end
126
+
127
+ def file_path
128
+ return if path.nil? || path.empty?
129
+
130
+ path_dot_ext = ''
131
+
132
+ path_dot_ext = path + @runner.options[:assume_extension] unless blank?(@runner.options[:assume_extension])
133
+
134
+ base = if absolute_path?(path) # path relative to root
135
+ # either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname
136
+ @runner.options[:root_dir] || (File.directory?(@runner.current_source) ? @runner.current_source : File.dirname(@runner.current_source))
137
+ # relative links, path is a file
138
+ elsif File.exist?(File.expand_path(path, @runner.current_source)) || File.exist?(File.expand_path(path_dot_ext, @runner.current_source))
139
+ File.dirname(@runner.current_path)
140
+ # relative links in nested dir, path is a file
141
+ elsif File.exist?(File.join(File.dirname(@runner.current_path), path)) || File.exist?(File.join(File.dirname(@runner.current_path), path_dot_ext)) # rubocop:disable Lint/DuplicateBranch
142
+ File.dirname(@runner.current_path)
143
+ # relative link, path is a directory
144
+ else
145
+ @runner.current_path
146
+ end
147
+
148
+ file = File.join(base, path)
149
+
150
+ if @runner.options[:assume_extension] && File.file?("#{file}#{@runner.options[:assume_extension]}")
151
+ file = "#{file}#{@runner.options[:assume_extension]}"
152
+ elsif File.directory?(file) && !unslashed_directory?(file) # implicit index support
153
+ file = File.join file, @runner.options[:directory_index_file]
154
+ end
155
+
156
+ file
157
+ end
158
+
159
+ def unslashed_directory?(file)
160
+ File.directory?(file) && !file.end_with?(File::SEPARATOR)
161
+ end
162
+
163
+ def absolute_path?(path)
164
+ path.start_with?('/')
165
+ end
166
+
167
+ # path is external to the file
168
+ def external?
169
+ !internal?
170
+ end
171
+
172
+ def internal?
173
+ relative_link? || internal_absolute_link? || hash_link?
174
+ end
175
+
176
+ def internal_absolute_link?
177
+ url.start_with?('/')
178
+ end
179
+
180
+ def relative_link?
181
+ return false if remote?
182
+
183
+ hash_link? || param_link? || url.start_with?('.') || url =~ /^\S/
184
+ end
185
+
186
+ def link_points_to_same_page?
187
+ hash_link || param_link
188
+ end
189
+
190
+ def hash_link?
191
+ url.start_with?('#')
192
+ end
193
+
194
+ def param_link?
195
+ url.start_with?('?')
196
+ end
197
+
198
+ def sans_hash
199
+ @url.to_s.sub(/##{hash}/, '')
200
+ end
201
+
202
+ # catch any obvious issues, like strings in port numbers
203
+ private def clean_url!
204
+ return if @url =~ /^([!#{Regexp.last_match(0)}-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/
205
+
206
+ @url = Addressable::URI.parse(@url).normalize.to_s
207
+ end
208
+
209
+ private def swap_urls!
210
+ return @url if blank?(replacements = @runner.options[:swap_urls])
211
+
212
+ replacements.each do |link, replace|
213
+ @url = @url.gsub(link, replace)
214
+ end
215
+ end
216
+
217
+ private def ignores_pattern?(links_to_ignore)
218
+ return false unless links_to_ignore.is_a?(Array)
219
+
220
+ links_to_ignore.each do |link_to_ignore|
221
+ case link_to_ignore
222
+ when String
223
+ return true if link_to_ignore == @raw_attribute
224
+ when Regexp
225
+ return true if link_to_ignore&.match?(@raw_attribute)
226
+ end
227
+ end
228
+
229
+ false
230
+ end
231
+ end
data/lib/html_proofer/attribute.rb ADDED
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module HTMLProofer
4
+ # Represents an element currently being processed
5
+ class Attribute
6
+ include HTMLProofer::Utils
7
+
8
+ attr_reader :raw_attribute
9
+
10
+ def initialize(runner, raw_attribute, **_)
11
+ @runner = runner
12
+ @raw_attribute = raw_attribute
13
+ end
14
+ end
15
+ end
data/lib/html_proofer/cache.rb ADDED
@@ -0,0 +1,236 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'date'
4
+ require 'json'
5
+ require 'uri'
6
+
7
+ module HTMLProofer
8
+ class Cache
9
+ include HTMLProofer::Utils
10
+
11
+ CACHE_VERSION = 2
12
+
13
+ DEFAULT_STORAGE_DIR = File.join('tmp', '.htmlproofer')
14
+ DEFAULT_CACHE_FILE_NAME = 'cache.json'
15
+
16
+ URI_REGEXP = URI::DEFAULT_PARSER.make_regexp
17
+
18
+ attr_reader :exists, :cache_log, :storage_dir, :cache_file
19
+
20
+ def initialize(runner, options)
21
+ @runner = runner
22
+ @logger = @runner.logger
23
+
24
+ @cache_datetime = DateTime.now
25
+ @cache_time = @cache_datetime.to_time
26
+
27
+ if blank?(options)
28
+ define_singleton_method(:enabled?) { false }
29
+ else
30
+ define_singleton_method(:enabled?) { true }
31
+ setup_cache!(options)
32
+ @parsed_timeframe = parsed_timeframe(options[:timeframe])
33
+ end
34
+ end
35
+
36
+ def within_timeframe?(time)
37
+ return false if time.nil?
38
+
39
+ time = Time.parse(time) if time.is_a?(String)
40
+ (@parsed_timeframe..@cache_time).cover?(time)
41
+ end
42
+
43
+ def parsed_timeframe(timeframe)
44
+ time, date = timeframe.match(/(\d+)(\D)/).captures
45
+ time = time.to_i
46
+ case date
47
+ when 'M'
48
+ time_ago(time, :months)
49
+ when 'w'
50
+ time_ago(time, :weeks)
51
+ when 'd'
52
+ time_ago(time, :days)
53
+ when 'h'
54
+ time_ago(time, :hours)
55
+ else
56
+ raise ArgumentError, "#{date} is not a valid timeframe!"
57
+ end
58
+ end
59
+
60
+ def add_internal(url, metadata, found)
61
+ return unless enabled?
62
+
63
+ @cache_log[:internal][url] = { time: @cache_time, metadata: [] } if @cache_log[:internal][url].nil?
64
+
65
+ @cache_log[:internal][url][:metadata] << construct_internal_link_metadata(metadata, found)
66
+ end
67
+
68
+ def add_external(url, filenames, status_code, msg)
69
+ return unless enabled?
70
+
71
+ found = status_code.between?(200, 299)
72
+
73
+ clean_url = cleaned_url(url)
74
+ @cache_log[:external][clean_url] = { time: @cache_time.to_s, found: found, status_code: status_code, message: msg, metadata: filenames }
75
+ end
76
+
77
+ def detect_url_changes(urls_detected, type)
78
+ additions = determine_additions(urls_detected, type)
79
+
80
+ determine_deletions(urls_detected, type)
81
+
82
+ additions
83
+ end
84
+
85
+ private def construct_internal_link_metadata(metadata, found)
86
+ {
87
+ source: metadata[:source],
88
+ current_path: metadata[:current_path],
89
+ line: metadata[:line],
90
+ base_url: metadata[:base_url],
91
+ found: found
92
+ }
93
+ end
94
+
95
+ # prepare to add new URLs detected
96
+ private def determine_additions(urls_detected, type)
97
+ additions = urls_detected.reject do |url, metadata|
98
+ if @cache_log[type].include?(url)
99
+ @cache_log[type][url][:metadata] = metadata
100
+
101
+ # if this is false, we're trying again
102
+ if type == :external
103
+ @cache_log[type][url][:found]
104
+ else
105
+ @cache_log[type][url][:metadata].none? { |m| m[:found] }
106
+ end
107
+ else
108
+ @logger.log :debug, "Adding #{url} to #{type} cache"
109
+ false
110
+ end
111
+ end
112
+
113
+ new_link_count = additions.length
114
+ new_link_text = pluralize(new_link_count, "new #{type} link", "new #{type} links")
115
+ @logger.log :debug, "Adding #{new_link_text} to the cache"
116
+
117
+ additions
118
+ end
119
+
120
+ # remove from cache URLs that no longer exist
121
+ private def determine_deletions(urls_detected, type)
122
+ deletions = 0
123
+
124
+ @cache_log[type].delete_if do |url, _|
125
+ if urls_detected.include?(url)
126
+ false
127
+ elsif url_matches_type?(url, type)
128
+ @logger.log :debug, "Removing #{url} from #{type} cache"
129
+ deletions += 1
130
+ true
131
+ end
132
+ end
133
+
134
+ del_link_text = pluralize(deletions, "outdated #{type} link", "outdated #{type} links")
135
+ @logger.log :debug, "Removing #{del_link_text} from the cache"
136
+ end
137
+
138
+ def write
139
+ return unless enabled?
140
+
141
+ File.write(@cache_file, @cache_log.to_json)
142
+ end
143
+
144
+ def retrieve_urls(urls_detected, type)
145
+ # if there are no urls, bail
146
+ return {} if urls_detected.empty?
147
+
148
+ urls_detected = urls_detected.transform_keys do |url|
149
+ cleaned_url(url)
150
+ end
151
+
152
+ urls_to_check = detect_url_changes(urls_detected, type)
153
+
154
+ @cache_log[type].each_pair do |url, cache|
155
+ next if within_timeframe?(cache[:time])
156
+
157
+ urls_to_check[url] = cache[:metadata] # recheck expired links
158
+ end
159
+
160
+ urls_to_check
161
+ end
162
+
163
+ def empty?
164
+ blank?(@cache_log) || (@cache_log[:internal].empty? && @cache_log[:external].empty?)
165
+ end
166
+
167
+ def size(type)
168
+ @cache_log[type].size
169
+ end
170
+
171
+ private def setup_cache!(options)
172
+ default_structure = {
173
+ version: CACHE_VERSION,
174
+ internal: {},
175
+ external: {}
176
+ }
177
+
178
+ @storage_dir = options[:storage_dir] || DEFAULT_STORAGE_DIR
179
+
180
+ FileUtils.mkdir_p(storage_dir) unless Dir.exist?(storage_dir)
181
+
182
+ cache_file_name = options[:cache_file] || DEFAULT_CACHE_FILE_NAME
183
+
184
+ @cache_file = File.join(storage_dir, cache_file_name)
185
+
186
+ return (@cache_log = default_structure) unless File.exist?(@cache_file)
187
+
188
+ contents = File.read(@cache_file)
189
+
190
+ return (@cache_log = default_structure) if blank?(contents)
191
+
192
+ log = JSON.parse(contents, symbolize_names: true)
193
+
194
+ old_cache = (cache_version = log[:version]).nil?
195
+ @cache_log = if old_cache # previous cache version, create a new one
196
+ default_structure
197
+ elsif cache_version != CACHE_VERSION
198
+ # if cache version is newer...do something
199
+ else
200
+ log[:internal] = log[:internal].transform_keys(&:to_s)
201
+ log[:external] = log[:external].transform_keys(&:to_s)
202
+ log
203
+ end
204
+ end
205
+
206
+ private def time_ago(measurement, unit)
207
+ case unit
208
+ when :months
209
+ @cache_datetime >> -measurement
210
+ when :weeks
211
+ @cache_datetime - (measurement * 7)
212
+ when :days
213
+ @cache_datetime - measurement
214
+ when :hours
215
+ @cache_datetime - Rational(measurement / 24.0)
216
+ end.to_time
217
+ end
218
+
219
+ private def url_matches_type?(url, type)
220
+ return true if type == :internal && url !~ URI_REGEXP
221
+ return true if type == :external && url =~ URI_REGEXP
222
+ end
223
+
224
+ private def cleaned_url(url)
225
+ cleaned_url = escape_unescape(url)
226
+
227
+ return cleaned_url unless cleaned_url.end_with?('/', '#', '?') && cleaned_url.length > 1
228
+
229
+ cleaned_url[0..-2]
230
+ end
231
+
232
+ private def escape_unescape(url)
233
+ Addressable::URI.parse(url).normalize.to_s
234
+ end
235
+ end
236
+ end