html-proofer 3.19.4 → 4.0.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/bin/htmlproofer +30 -57
  3. data/lib/html-proofer.rb +1 -54
  4. data/lib/html_proofer/attribute/url.rb +231 -0
  5. data/lib/html_proofer/attribute.rb +15 -0
  6. data/lib/html_proofer/cache.rb +234 -0
  7. data/lib/html_proofer/check/favicon.rb +35 -0
  8. data/lib/html_proofer/check/images.rb +62 -0
  9. data/lib/html_proofer/check/links.rb +118 -0
  10. data/lib/html_proofer/check/open_graph.rb +34 -0
  11. data/lib/html_proofer/check/scripts.rb +38 -0
  12. data/lib/html_proofer/check.rb +91 -0
  13. data/lib/{html-proofer → html_proofer}/configuration.rb +30 -31
  14. data/lib/html_proofer/element.rb +122 -0
  15. data/lib/html_proofer/failure.rb +17 -0
  16. data/lib/{html-proofer → html_proofer}/log.rb +0 -0
  17. data/lib/html_proofer/reporter/cli.rb +29 -0
  18. data/lib/html_proofer/reporter.rb +23 -0
  19. data/lib/html_proofer/runner.rb +245 -0
  20. data/lib/html_proofer/url_validator/external.rb +189 -0
  21. data/lib/html_proofer/url_validator/internal.rb +86 -0
  22. data/lib/html_proofer/url_validator.rb +16 -0
  23. data/lib/{html-proofer → html_proofer}/utils.rb +5 -8
  24. data/lib/{html-proofer → html_proofer}/version.rb +1 -1
  25. data/lib/html_proofer/xpath_functions.rb +10 -0
  26. data/lib/html_proofer.rb +56 -0
  27. metadata +46 -27
  28. data/lib/html-proofer/cache.rb +0 -194
  29. data/lib/html-proofer/check/favicon.rb +0 -29
  30. data/lib/html-proofer/check/html.rb +0 -37
  31. data/lib/html-proofer/check/images.rb +0 -48
  32. data/lib/html-proofer/check/links.rb +0 -182
  33. data/lib/html-proofer/check/opengraph.rb +0 -46
  34. data/lib/html-proofer/check/scripts.rb +0 -42
  35. data/lib/html-proofer/check.rb +0 -75
  36. data/lib/html-proofer/element.rb +0 -265
  37. data/lib/html-proofer/issue.rb +0 -65
  38. data/lib/html-proofer/middleware.rb +0 -82
  39. data/lib/html-proofer/runner.rb +0 -249
  40. data/lib/html-proofer/url_validator.rb +0 -237
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f27b5c50ae5c1c77d5fbe36dbbdca327bcb96302912b726f7f955f643d1dfc48
4
- data.tar.gz: f09405cd0c70f1d2dc98f904c388bcab594f79107fdbe441c63f934821bef1b0
3
+ metadata.gz: bec55c40cc2d01b65496b138570cf434e533d045476470e4ce1e6b0daf3d5408
4
+ data.tar.gz: dd77aaf59adf3eaaa48a6b20dab59adbb0a5974b0a4ded5f9fd51e1fc9ba3684
5
5
  SHA512:
6
- metadata.gz: 53a8c98438f2056e7e2d926e926e10a6d0aa840b1b6f790860631912a2146dc20c68ca2b303d799a8fbfa723476e0e95dd5bc89695ceddf09ecede6f9acafbd1
7
- data.tar.gz: f68269ba70facf5ede07452d1029f49d17baadff8c6b4fd1d9de520c0ede91ff360bacb0cf46b5c719c0ae35c50ad61c6ce5b36171867f6a1c9d8c675d805ebc
6
+ metadata.gz: 5036e6f46c4e0ac32bd9a6f4bd891244f25cb43670c40eee7c7421c661f9a3b6e24edd15507b6290fd102f37988529250cecf5800b63b6ed1f8622dd983c76ec
7
+ data.tar.gz: cb9fdec8ec8774e8a9607d9b92b859767b6726e45b0bc54d6b1eee993de03c1a0a81a50035fe32543e96a7f1ee1d420366076d1d5956761abd1201086e35f057
data/bin/htmlproofer CHANGED
@@ -15,44 +15,32 @@ Mercenary.program(:htmlproofer) do |p|
15
15
 
16
16
  p.description 'Runs the HTML-Proofer suite on the files in PATH. For more details, see the README.'
17
17
 
18
- p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href` (this is the default for HTML5).'
19
- p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, ignores the `href="#"`'
18
+ p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, assumes `href="#"` anchors are valid'
19
+ p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href`. In HTML5, this is technically allowed, but could also be human error.'
20
20
  p.option 'as_links', '--as-links', 'Assumes that `PATH` is a comma-separated array of links to check.'
21
- p.option 'alt_ignore', '--alt-ignore image1,[image2,...]', Array, 'A comma-separated list of Strings or RegExps containing `img`s whose missing `alt` tags are safe to ignore'
22
- p.option 'assume_extension', '--assume-extension', 'Automatically add extension (e.g. `.html`) to file paths, to allow extensionless URLs (as supported by Jekyll 3 and GitHub Pages) (default: `false`).'
23
- p.option 'checks_to_ignore', '--checks-to-ignore check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you do not want to run (default: `[]`)'
24
- p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists). This slows the checker down (default: `false`).'
25
- p.option 'check_favicon', '--check-favicon', 'Enables the favicon checker (default: `false`).'
26
- p.option 'check_html', '--check-html', 'Enables HTML validation errors from Nokogumbo (default: `false`).'
27
- p.option 'check_img_http', '--check-img-http', 'Fails an image if it\'s marked as `http` (default: `false`).'
28
- p.option 'check_opengraph', '--check-opengraph', 'Enables the Open Graph checker (default: `false`).'
21
+ p.option 'assume_extension', '--assume-extension <ext>', 'Automatically add specified extension to files for internal links, to allow extensionless URLs (as supported by most servers) (default: `.html`).'
22
+ p.option 'checks', '--checks check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you want to run (default: `["Links", "Images", "Scripts"]`)'
23
+ p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists) (default: `true`).'
29
24
  p.option 'check_sri', '--check-sri', 'Check that `<link>` and `<script>` external resources use SRI (default: `false`).'
30
25
  p.option 'directory_index_file', '--directory-index-file <filename>', String, 'Sets the file to look for when a link refers to a directory. (default: `index.html`)'
31
- p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker, which can take a lot of time (default: `false`)'
32
- p.option 'empty_alt_ignore', '--empty-alt-ignore', 'If `true`, ignores images with empty alt tags'
33
- p.option 'error_sort', '--error-sort <sort>', String, 'Defines the sort order for error output. Can be `:path`, `:desc`, or `:status` (default: `:path`).'
34
- p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `false`).'
35
- p.option 'extension', '--extension <ext>', String, 'The extension of your HTML files including the dot. (default: `.html`)'
36
- p.option 'external_only', '--external_only', 'Only checks problems with external references'
37
- p.option 'file_ignore', '--file-ignore file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
38
- p.option 'http_status_ignore', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
39
- p.option 'internal_domains', '--internal-domains domain1,[domain2,...]', Array, 'A comma-separated list of Strings containing domains that will be treated as internal urls.'
26
+ p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker (default: `false`)'
27
+ p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `true`).'
28
+ p.option 'extensions', '--extensions ext1,[ext2,...]', Array, 'A comma-separated list of Strings indicating the file extensions you would like to check (including the dot) (default: `.html`)'
29
+ p.option 'ignore_files', '--ignore-files file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
40
30
  p.option 'ignore_empty_mailto', '--ignore-empty-mailto', 'If `true`, allows `mailto:` `href`s which do not contain an email address'
41
- p.option 'report_invalid_tags', '--report-invalid-tags', 'When `check_html` is enabled, HTML markup that is unknown to Nokogumbo are reported as errors (default: `false`)'
42
- p.option 'report_missing_names', '--report-missing-names', 'When `check_html` is enabled, HTML markup that are missing entity names are reported as errors (default: `false`)'
43
- p.option 'report_script_embeds', '--report-script-embeds', 'When `check_html` is enabled, `script` tags containing markup are reported as errors (default: `false`)'
44
- p.option 'report_missing_doctype', '--report-missing-doctype', 'When `check_html` is enabled, HTML markup with missing or out-of-order `DOCTYPE` are reported as errors (default: `false`)'
45
- p.option 'report_eof_tags', '--report-eof-tags', 'When `check_html` is enabled, HTML markup with tags that are malformed are reported as errors (default: `false`)'
46
- p.option 'report_mismatched_tags', '--report-mismatched-tags', 'When `check_html` is enabled, HTML markup with mismatched tags are reported as errors (default: `false`)'
31
+ p.option 'ignore_missing_alt', '--empty-alt-ignore', 'If `true`, ignores images with empty/missing alt tags'
32
+ p.option 'ignore_status_codes', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
33
+ p.option 'ignore_urls', '--ignore-urls link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. This affects all HTML attributes, such as `alt` tags on images.'
47
34
  p.option 'log_level', '--log-level <level>', String, 'Sets the logging level, as determined by Yell. One of `:debug`, `:info`, `:warn`, `:error`, or `:fatal`. (default: `:info`)'
48
35
  p.option 'only_4xx', '--only-4xx', 'Only reports errors for links that fall within the 4xx status code range'
49
- p.option 'storage_dir', '--storage-dir PATH', String, 'Directory where to store the cache log (default: "tmp/.htmlproofer")'
50
- p.option 'timeframe', '--timeframe <time>', String, 'A string representing the caching timeframe.'
51
- p.option 'typhoeus_config', '--typhoeus-config CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
52
- p.option 'hydra_config', '--hydra-config CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
53
- p.option 'url_ignore', '--url-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. It affects all HTML attributes. Note that non-HTTP(S) URIs are always ignored'
54
- p.option 'url_swap', '--url-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
55
36
  p.option 'root_dir', '--root-dir PATH', String, 'The absolute path to the directory serving your html-files.'
37
+ p.option 'swap_attributes', '--swap-attributes CONFIG', String, 'JSON-formatted config that maps element names to the preferred attribute to check (default: `{}`).'
38
+ p.option 'swap_urls', '--swap-urls re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
39
+
40
+ p.option 'typhoeus', '--typhoeus CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
41
+ p.option 'hydra', '--hydra CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
42
+ p.option 'parallel', '--parallel CONFIG', String, 'JSON-formatted string of Parallel config. Will override the html-proofer defaults.'
43
+ p.option 'cache', '--cache CONFIG', String, 'JSON-formatted string of cache config. Will override the html-proofer defaults.'
56
44
 
57
45
  p.action do |args, opts|
58
46
  args = ['.'] if args.empty?
@@ -67,46 +55,31 @@ Mercenary.program(:htmlproofer) do |p|
67
55
  end
68
56
 
69
57
  # some minor manipulation of a special option
70
- unless opts['url_swap'].nil?
71
- options[:url_swap] = {}
72
- opts['url_swap'].each do |s|
58
+ unless opts['swap_urls'].nil?
59
+ options[:swap_urls] = {}
60
+ opts['swap_urls'].each do |s|
73
61
  splt = s.split(/(?<!\\):/, 2)
74
62
 
75
63
  re = splt[0].gsub(/\\:/, ':')
76
64
  string = splt[1].gsub(/\\:/, ':')
77
- options[:url_swap][Regexp.new(re)] = string
65
+ options[:swap_urls][Regexp.new(re)] = string
78
66
  end
79
67
  end
80
68
 
81
- options[:error_sort] = opts['error-sort'].to_sym unless opts['error-sort'].nil?
82
69
  options[:log_level] = opts['log_level'].to_sym unless opts['log_level'].nil?
83
70
 
84
- options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.dup
85
- options[:validation][:report_script_embeds] = opts['report_script_embeds'] unless opts['report_script_embeds'].nil?
86
- options[:validation][:report_missing_names] = opts['report_missing_names'] unless opts['report_missing_names'].nil?
87
- options[:validation][:report_invalid_tags] = opts['report_invalid_tags'] unless opts['report_invalid_tags'].nil?
88
- options[:validation][:report_missing_doctype] = opts['report_missing_doctype'] unless opts['report_missing_doctype'].nil?
89
- options[:validation][:report_eof_tags] = opts['report_eof_tags'] unless opts['report_eof_tags'].nil?
90
- options[:validation][:report_mismatched_tags] = opts['report_mismatched_tags'] unless opts['report_mismatched_tags'].nil?
91
-
92
- options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus_config', opts['typhoeus_config'], symbolize_names: false) unless opts['typhoeus_config'].nil?
93
- options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra_config', opts['hydra_config']) unless opts['hydra_config'].nil?
71
+ options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus', opts['typhoeus'], symbolize_names: false) unless opts['typhoeus'].nil?
72
+ options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra', opts['hydra']) unless opts['hydra'].nil?
73
+ options[:parallel] = HTMLProofer::Configuration.parse_json_option('parallel', opts['parallel']) unless opts['parallel'].nil?
74
+ options[:cache] = HTMLProofer::Configuration.parse_json_option('cache', opts['cache']) unless opts['cache'].nil?
94
75
 
95
- unless opts['timeframe'].nil?
96
- options[:cache] ||= {}
97
- options[:cache][:timeframe] = opts['timeframe'] unless opts['timeframe'].nil?
98
- end
99
-
100
- unless opts['storage_dir'].nil?
101
- options[:cache] ||= {}
102
- options[:cache][:storage_dir] = opts['storage_dir'] unless opts['storage_dir'].nil?
103
- end
76
+ options[:swap_attributes] = HTMLProofer::Configuration.parse_json_option('swap_attributes', opts['swap_attributes'], symbolize_names: false) unless opts['swap_attributes'].nil?
104
77
 
105
- options[:http_status_ignore] = Array(options[:http_status_ignore]).map(&:to_i)
78
+ options[:ignore_status_codes] = Array(options[:ignore_status_codes]).map(&:to_i)
106
79
 
107
80
  paths = path.split(',')
108
81
  if opts['as_links']
109
- links = path.delete(' ').split(',')
82
+ links = path.split(',').map(&:strip)
110
83
  HTMLProofer.check_links(links, options).run
111
84
  elsif File.directory?(paths.first)
112
85
  HTMLProofer.check_directories(paths, options).run
data/lib/html-proofer.rb CHANGED
@@ -1,56 +1,3 @@
1
- # rubocop:disable Naming/FileName
2
1
  # frozen_string_literal: true
3
2
 
4
- def require_all(path)
5
- dir = File.join(File.dirname(__FILE__), path)
6
- Dir[File.join(dir, '*.rb')].sort.each do |f|
7
- require f
8
- end
9
- end
10
-
11
- require_relative 'html-proofer/utils'
12
- require_all 'html-proofer'
13
- require_all 'html-proofer/check'
14
-
15
- require 'parallel'
16
- require 'fileutils'
17
-
18
- begin
19
- require 'awesome_print'
20
- require 'pry-byebug'
21
- rescue LoadError; end # rubocop:disable Lint/SuppressedException
22
- module HTMLProofer
23
- def self.check_file(file, options = {})
24
- raise ArgumentError unless file.is_a?(String)
25
- raise ArgumentError, "#{file} does not exist" unless File.exist?(file)
26
-
27
- options[:type] = :file
28
- HTMLProofer::Runner.new(file, options)
29
- end
30
-
31
- def self.check_directory(directory, options = {})
32
- raise ArgumentError unless directory.is_a?(String)
33
- raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
34
-
35
- options[:type] = :directory
36
- HTMLProofer::Runner.new([directory], options)
37
- end
38
-
39
- def self.check_directories(directories, options = {})
40
- raise ArgumentError unless directories.is_a?(Array)
41
-
42
- options[:type] = :directory
43
- directories.each do |directory|
44
- raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
45
- end
46
- HTMLProofer::Runner.new(directories, options)
47
- end
48
-
49
- def self.check_links(links, options = {})
50
- raise ArgumentError unless links.is_a?(Array)
51
-
52
- options[:type] = :links
53
- HTMLProofer::Runner.new(links, options)
54
- end
55
- end
56
- # rubocop:enable Naming/FileName
3
+ require_relative 'html_proofer'
@@ -0,0 +1,231 @@
1
+ # frozen_string_literal: true
2
+
3
+ class HTMLProofer::Attribute::Url < HTMLProofer::Attribute
4
+ attr_reader :url
5
+
6
+ REMOTE_SCHEMES = %w[http https].freeze
7
+
8
+ def initialize(runner, link_attribute, base_url: nil)
9
+ super
10
+
11
+ if @raw_attribute.nil?
12
+ @url = nil
13
+ else
14
+ @url = @raw_attribute.delete("\u200b").strip
15
+ @url = Addressable::URI.join(base_url, @url).to_s unless blank?(base_url)
16
+
17
+ swap_urls!
18
+ clean_url!
19
+
20
+ # convert "//" links to "https://"
21
+ @url.start_with?('//') ? @url = "https:#{@url}" : @url
22
+ end
23
+ end
24
+
25
+ def to_s
26
+ @url
27
+ end
28
+
29
+ def known_extension?
30
+ return true if hash_link?
31
+
32
+ ext = File.extname(path)
33
+
34
+ # no extension means we use the assumed one
35
+ return @runner.options[:extensions].include?(@runner.options[:assume_extension]) if blank?(ext)
36
+
37
+ @runner.options[:extensions].include?(ext)
38
+ end
39
+
40
+ def unknown_extension?
41
+ !known_extension?
42
+ end
43
+
44
+ def ignore?
45
+ return true if (/^javascript:/).match?(@url)
46
+ return true if ignores_pattern?(@runner.options[:ignore_urls])
47
+ end
48
+
49
+ def valid?
50
+ !parts.nil?
51
+ end
52
+
53
+ def path?
54
+ !parts.host.nil? && !parts.path.nil?
55
+ end
56
+
57
+ def parts
58
+ @parts ||= Addressable::URI.parse @url
59
+ rescue URI::Error, Addressable::URI::InvalidURIError
60
+ @parts = nil
61
+ end
62
+
63
+ def path
64
+ Addressable::URI.unencode parts.path unless parts.nil?
65
+ end
66
+
67
+ def hash
68
+ parts&.fragment
69
+ end
70
+
71
+ # Does the URL have a hash?
72
+ def hash?
73
+ !blank?(hash)
74
+ end
75
+
76
+ def scheme
77
+ parts&.scheme
78
+ end
79
+
80
+ def remote?
81
+ REMOTE_SCHEMES.include?(scheme)
82
+ end
83
+
84
+ def http?
85
+ scheme == 'http'
86
+ end
87
+
88
+ def https?
89
+ scheme == 'https'
90
+ end
91
+
92
+ def non_http_remote?
93
+ !scheme.nil? && !remote?
94
+ end
95
+
96
+ def host
97
+ parts&.host
98
+ end
99
+
100
+ def domain_path
101
+ (host || '') + path
102
+ end
103
+
104
+ def query_values
105
+ parts&.query_values
106
+ end
107
+
108
+ # checks if a file exists relative to the current pwd
109
+ def exists?
110
+ return true if base64?
111
+
112
+ return @runner.checked_paths[absolute_path] if @runner.checked_paths.key?(absolute_path)
113
+
114
+ @runner.checked_paths[absolute_path] = File.exist?(absolute_path)
115
+ end
116
+
117
+ def base64?
118
+ /^data:image/.match?(@raw_attribute)
119
+ end
120
+
121
+ def absolute_path
122
+ path = file_path || @runner.current_path
123
+
124
+ File.expand_path(path, Dir.pwd)
125
+ end
126
+
127
+ def file_path
128
+ return if path.nil? || path.empty?
129
+
130
+ path_dot_ext = ''
131
+
132
+ path_dot_ext = path + @runner.options[:assume_extension] unless blank?(@runner.options[:assume_extension])
133
+
134
+ base = if absolute_path?(path) # path relative to root
135
+ # either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname
136
+ @runner.options[:root_dir] || (File.directory?(@runner.current_source) ? @runner.current_source : File.dirname(@runner.current_source))
137
+ # relative links, path is a file
138
+ elsif File.exist?(File.expand_path(path, @runner.current_source)) || File.exist?(File.expand_path(path_dot_ext, @runner.current_source))
139
+ File.dirname(@runner.current_path)
140
+ # relative links in nested dir, path is a file
141
+ elsif File.exist?(File.join(File.dirname(@runner.current_path), path)) || File.exist?(File.join(File.dirname(@runner.current_path), path_dot_ext)) # rubocop:disable Lint/DuplicateBranch
142
+ File.dirname(@runner.current_path)
143
+ # relative link, path is a directory
144
+ else
145
+ @runner.current_path
146
+ end
147
+
148
+ file = File.join(base, path)
149
+
150
+ if @runner.options[:assume_extension] && File.file?("#{file}#{@runner.options[:assume_extension]}")
151
+ file = "#{file}#{@runner.options[:assume_extension]}"
152
+ elsif File.directory?(file) && !unslashed_directory?(file) # implicit index support
153
+ file = File.join file, @runner.options[:directory_index_file]
154
+ end
155
+
156
+ file
157
+ end
158
+
159
+ def unslashed_directory?(file)
160
+ File.directory?(file) && !file.end_with?(File::SEPARATOR)
161
+ end
162
+
163
+ def absolute_path?(path)
164
+ path.start_with?('/')
165
+ end
166
+
167
+ # path is external to the file
168
+ def external?
169
+ !internal?
170
+ end
171
+
172
+ def internal?
173
+ relative_link? || internal_absolute_link? || hash_link?
174
+ end
175
+
176
+ def internal_absolute_link?
177
+ url.start_with?('/')
178
+ end
179
+
180
+ def relative_link?
181
+ return false if remote?
182
+
183
+ hash_link? || param_link? || url.start_with?('.') || url =~ /^\S/
184
+ end
185
+
186
+ def link_points_to_same_page?
187
+ hash_link || param_link
188
+ end
189
+
190
+ def hash_link?
191
+ url.start_with?('#')
192
+ end
193
+
194
+ def param_link?
195
+ url.start_with?('?')
196
+ end
197
+
198
+ def sans_hash
199
+ @url.to_s.sub(/##{hash}/, '')
200
+ end
201
+
202
+ # catch any obvious issues, like strings in port numbers
203
+ private def clean_url!
204
+ return if @url =~ /^([!#{Regexp.last_match(0)}-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/
205
+
206
+ @url = Addressable::URI.parse(@url).normalize.to_s
207
+ end
208
+
209
+ private def swap_urls!
210
+ return @url if blank?(replacements = @runner.options[:swap_urls])
211
+
212
+ replacements.each do |link, replace|
213
+ @url = @url.gsub(link, replace)
214
+ end
215
+ end
216
+
217
+ private def ignores_pattern?(links_to_ignore)
218
+ return false unless links_to_ignore.is_a?(Array)
219
+
220
+ links_to_ignore.each do |link_to_ignore|
221
+ case link_to_ignore
222
+ when String
223
+ return true if link_to_ignore == @raw_attribute
224
+ when Regexp
225
+ return true if link_to_ignore&.match?(@raw_attribute)
226
+ end
227
+ end
228
+
229
+ false
230
+ end
231
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module HTMLProofer
4
+ # Base class holding a raw attribute value alongside the runner
5
+ class Attribute
6
+ include HTMLProofer::Utils
7
+
8
+ attr_reader :raw_attribute
9
+
10
+ def initialize(runner, raw_attribute, **_)
11
+ @runner = runner
12
+ @raw_attribute = raw_attribute
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,234 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'date'
4
+ require 'json'
5
+ require 'uri'
6
+
7
+ module HTMLProofer
8
+ class Cache
9
+ include HTMLProofer::Utils
10
+
11
+ CACHE_VERSION = 2
12
+
13
+ DEFAULT_STORAGE_DIR = File.join('tmp', '.htmlproofer')
14
+ DEFAULT_CACHE_FILE_NAME = 'cache.json'
15
+
16
+ URI_REGEXP = URI::DEFAULT_PARSER.make_regexp
17
+
18
+ attr_reader :exists, :cache_log, :storage_dir, :cache_file
19
+
20
+ def initialize(runner, options)
21
+ @runner = runner
22
+ @logger = @runner.logger
23
+
24
+ @cache_datetime = DateTime.now
25
+ @cache_time = @cache_datetime.to_time
26
+
27
+ if blank?(options)
28
+ define_singleton_method(:enabled?) { false }
29
+ else
30
+ define_singleton_method(:enabled?) { true }
31
+ setup_cache!(options)
32
+ @parsed_timeframe = parsed_timeframe(options[:timeframe])
33
+ end
34
+ end
35
+
36
+ def within_timeframe?(time)
37
+ return false if time.nil?
38
+
39
+ time = Time.parse(time) if time.is_a?(String)
40
+ (@parsed_timeframe..@cache_time).cover?(time)
41
+ end
42
+
43
+ def parsed_timeframe(timeframe)
44
+ time, date = timeframe.match(/(\d+)(\D)/).captures
45
+ time = time.to_i
46
+ case date
47
+ when 'M'
48
+ time_ago(time, :months)
49
+ when 'w'
50
+ time_ago(time, :weeks)
51
+ when 'd'
52
+ time_ago(time, :days)
53
+ when 'h'
54
+ time_ago(time, :hours)
55
+ else
56
+ raise ArgumentError, "#{date} is not a valid timeframe!"
57
+ end
58
+ end
59
+
60
+ def add_internal(url, metadata, found)
61
+ return unless enabled?
62
+
63
+ @cache_log[:internal][url] = { time: @cache_time, metadata: [] } if @cache_log[:internal][url].nil?
64
+
65
+ @cache_log[:internal][url][:metadata] << construct_internal_link_metadata(metadata, found)
66
+ end
67
+
68
+ def add_external(url, filenames, status_code, msg)
69
+ return unless enabled?
70
+
71
+ found = status_code.between?(200, 299)
72
+
73
+ clean_url = cleaned_url(url)
74
+ @cache_log[:external][clean_url] = { time: @cache_time.to_s, found: found, status_code: status_code, message: msg, metadata: filenames }
75
+ end
76
+
77
+ def detect_url_changes(urls_detected, type)
78
+ additions = determine_additions(urls_detected, type)
79
+
80
+ determine_deletions(urls_detected, type)
81
+
82
+ additions
83
+ end
84
+
85
+ private def construct_internal_link_metadata(metadata, found)
86
+ {
87
+ source: metadata[:source],
88
+ current_path: metadata[:current_path],
89
+ line: metadata[:line],
90
+ base_url: metadata[:base_url],
91
+ found: found
92
+ }
93
+ end
94
+
95
+ # prepare to add new URLs detected
96
+ private def determine_additions(urls_detected, type)
97
+ additions = urls_detected.reject do |url, metadata|
98
+ url = cleaned_url(url)
99
+
100
+ if @cache_log[type].include?(url)
101
+ @cache_log[type][url][:metadata] = metadata
102
+
103
+ # if this is false, we're trying again
104
+ if type == :external
105
+ @cache_log[type][url][:found]
106
+ else
107
+ @cache_log[type][url][:metadata].none? { |m| m[:found] }
108
+ end
109
+ else
110
+ @logger.log :debug, "Adding #{url} to #{type} cache"
111
+ false
112
+ end
113
+ end
114
+
115
+ new_link_count = additions.length
116
+ new_link_text = pluralize(new_link_count, "new #{type} link", "new #{type} links")
117
+ @logger.log :debug, "Adding #{new_link_text} to the cache"
118
+
119
+ additions
120
+ end
121
+
122
+ # remove from cache URLs that no longer exist
123
+ private def determine_deletions(urls_detected, type)
124
+ deletions = 0
125
+
126
+ @cache_log[type].delete_if do |url, _|
127
+ url = cleaned_url(url)
128
+
129
+ if urls_detected.include?(url)
130
+ false
131
+ elsif url_matches_type?(url, type)
132
+ @logger.log :debug, "Removing #{url} from #{type} cache"
133
+ deletions += 1
134
+ true
135
+ end
136
+ end
137
+
138
+ del_link_text = pluralize(deletions, "outdated #{type} link", "outdated #{type} links")
139
+ @logger.log :debug, "Removing #{del_link_text} from the cache"
140
+ end
141
+
142
+ def write
143
+ return unless enabled?
144
+
145
+ File.write(@cache_file, @cache_log.to_json)
146
+ end
147
+
148
+ def retrieve_urls(urls_detected, type)
149
+ # if there are no urls, bail
150
+ return {} if urls_detected.empty?
151
+
152
+ urls_to_check = detect_url_changes(urls_detected, type)
153
+
154
+ @cache_log[type].each_pair do |url, cache|
155
+ next if within_timeframe?(cache[:time])
156
+
157
+ urls_to_check[url] = cache[:metadata] # recheck expired links
158
+ end
159
+
160
+ urls_to_check
161
+ end
162
+
163
+ def empty?
164
+ blank?(@cache_log) || (@cache_log[:internal].empty? && @cache_log[:external].empty?)
165
+ end
166
+
167
+ def size(type)
168
+ @cache_log[type].size
169
+ end
170
+
171
+ private def setup_cache!(options)
172
+ default_structure = {
173
+ version: CACHE_VERSION,
174
+ internal: {},
175
+ external: {}
176
+ }
177
+
178
+ @storage_dir = options[:storage_dir] || DEFAULT_STORAGE_DIR
179
+
180
+ FileUtils.mkdir_p(storage_dir) unless Dir.exist?(storage_dir)
181
+
182
+ cache_file_name = options[:cache_file] || DEFAULT_CACHE_FILE_NAME
183
+
184
+ @cache_file = File.join(storage_dir, cache_file_name)
185
+
186
+ return (@cache_log = default_structure) unless File.exist?(@cache_file)
187
+
188
+ contents = File.read(@cache_file)
189
+
190
+ return (@cache_log = default_structure) if blank?(contents)
191
+
192
+ log = JSON.parse(contents, symbolize_names: true)
193
+
194
+ old_cache = (cache_version = log[:version]).nil?
195
+ @cache_log = if old_cache # previous cache version, create a new one
196
+ default_structure
197
+ elsif cache_version != CACHE_VERSION
198
+ # cache version differs from CACHE_VERSION: not handled yet, so @cache_log is left nil
199
+ else
200
+ log[:internal] = log[:internal].transform_keys(&:to_s)
201
+ log[:external] = log[:external].transform_keys(&:to_s)
202
+ log
203
+ end
204
+ end
205
+
206
+ private def time_ago(measurement, unit)
207
+ case unit
208
+ when :months
209
+ @cache_datetime >> -measurement
210
+ when :weeks
211
+ @cache_datetime - (measurement * 7)
212
+ when :days
213
+ @cache_datetime - measurement
214
+ when :hours
215
+ @cache_datetime - Rational(measurement / 24.0)
216
+ end.to_time
217
+ end
218
+
219
+ private def url_matches_type?(url, type)
220
+ return true if type == :internal && url !~ URI_REGEXP
221
+ return true if type == :external && url =~ URI_REGEXP
222
+ end
223
+
224
+ private def cleaned_url(url)
225
+ return escape_unescape(url) unless url.end_with?('/', '#', '?') && url.length > 1
226
+
227
+ escape_unescape(url[0..-2])
228
+ end
229
+
230
+ private def escape_unescape(url)
231
+ Addressable::URI.parse(url).normalize.to_s
232
+ end
233
+ end
234
+ end