html-proofer 3.19.4 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/bin/htmlproofer +30 -57
  3. data/lib/html-proofer.rb +1 -54
  4. data/lib/html_proofer/attribute/url.rb +231 -0
  5. data/lib/html_proofer/attribute.rb +15 -0
  6. data/lib/html_proofer/cache.rb +234 -0
  7. data/lib/html_proofer/check/favicon.rb +35 -0
  8. data/lib/html_proofer/check/images.rb +62 -0
  9. data/lib/html_proofer/check/links.rb +118 -0
  10. data/lib/html_proofer/check/open_graph.rb +34 -0
  11. data/lib/html_proofer/check/scripts.rb +38 -0
  12. data/lib/html_proofer/check.rb +91 -0
  13. data/lib/{html-proofer → html_proofer}/configuration.rb +30 -31
  14. data/lib/html_proofer/element.rb +122 -0
  15. data/lib/html_proofer/failure.rb +17 -0
  16. data/lib/{html-proofer → html_proofer}/log.rb +0 -0
  17. data/lib/html_proofer/reporter/cli.rb +29 -0
  18. data/lib/html_proofer/reporter.rb +23 -0
  19. data/lib/html_proofer/runner.rb +245 -0
  20. data/lib/html_proofer/url_validator/external.rb +189 -0
  21. data/lib/html_proofer/url_validator/internal.rb +86 -0
  22. data/lib/html_proofer/url_validator.rb +16 -0
  23. data/lib/{html-proofer → html_proofer}/utils.rb +5 -8
  24. data/lib/{html-proofer → html_proofer}/version.rb +1 -1
  25. data/lib/html_proofer/xpath_functions.rb +10 -0
  26. data/lib/html_proofer.rb +56 -0
  27. metadata +46 -27
  28. data/lib/html-proofer/cache.rb +0 -194
  29. data/lib/html-proofer/check/favicon.rb +0 -29
  30. data/lib/html-proofer/check/html.rb +0 -37
  31. data/lib/html-proofer/check/images.rb +0 -48
  32. data/lib/html-proofer/check/links.rb +0 -182
  33. data/lib/html-proofer/check/opengraph.rb +0 -46
  34. data/lib/html-proofer/check/scripts.rb +0 -42
  35. data/lib/html-proofer/check.rb +0 -75
  36. data/lib/html-proofer/element.rb +0 -265
  37. data/lib/html-proofer/issue.rb +0 -65
  38. data/lib/html-proofer/middleware.rb +0 -82
  39. data/lib/html-proofer/runner.rb +0 -249
  40. data/lib/html-proofer/url_validator.rb +0 -237
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f27b5c50ae5c1c77d5fbe36dbbdca327bcb96302912b726f7f955f643d1dfc48
4
- data.tar.gz: f09405cd0c70f1d2dc98f904c388bcab594f79107fdbe441c63f934821bef1b0
3
+ metadata.gz: bec55c40cc2d01b65496b138570cf434e533d045476470e4ce1e6b0daf3d5408
4
+ data.tar.gz: dd77aaf59adf3eaaa48a6b20dab59adbb0a5974b0a4ded5f9fd51e1fc9ba3684
5
5
  SHA512:
6
- metadata.gz: 53a8c98438f2056e7e2d926e926e10a6d0aa840b1b6f790860631912a2146dc20c68ca2b303d799a8fbfa723476e0e95dd5bc89695ceddf09ecede6f9acafbd1
7
- data.tar.gz: f68269ba70facf5ede07452d1029f49d17baadff8c6b4fd1d9de520c0ede91ff360bacb0cf46b5c719c0ae35c50ad61c6ce5b36171867f6a1c9d8c675d805ebc
6
+ metadata.gz: 5036e6f46c4e0ac32bd9a6f4bd891244f25cb43670c40eee7c7421c661f9a3b6e24edd15507b6290fd102f37988529250cecf5800b63b6ed1f8622dd983c76ec
7
+ data.tar.gz: cb9fdec8ec8774e8a9607d9b92b859767b6726e45b0bc54d6b1eee993de03c1a0a81a50035fe32543e96a7f1ee1d420366076d1d5956761abd1201086e35f057
data/bin/htmlproofer CHANGED
@@ -15,44 +15,32 @@ Mercenary.program(:htmlproofer) do |p|
15
15
 
16
16
  p.description 'Runs the HTML-Proofer suite on the files in PATH. For more details, see the README.'
17
17
 
18
- p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href` (this is the default for HTML5).'
19
- p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, ignores the `href="#"`'
18
+ p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, assumes `href="#"` anchors are valid'
19
+ p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href`. In HTML5, this is technically allowed, but could also be human error.'
20
20
  p.option 'as_links', '--as-links', 'Assumes that `PATH` is a comma-separated array of links to check.'
21
- p.option 'alt_ignore', '--alt-ignore image1,[image2,...]', Array, 'A comma-separated list of Strings or RegExps containing `img`s whose missing `alt` tags are safe to ignore'
22
- p.option 'assume_extension', '--assume-extension', 'Automatically add extension (e.g. `.html`) to file paths, to allow extensionless URLs (as supported by Jekyll 3 and GitHub Pages) (default: `false`).'
23
- p.option 'checks_to_ignore', '--checks-to-ignore check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you do not want to run (default: `[]`)'
24
- p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists). This slows the checker down (default: `false`).'
25
- p.option 'check_favicon', '--check-favicon', 'Enables the favicon checker (default: `false`).'
26
- p.option 'check_html', '--check-html', 'Enables HTML validation errors from Nokogumbo (default: `false`).'
27
- p.option 'check_img_http', '--check-img-http', 'Fails an image if it\'s marked as `http` (default: `false`).'
28
- p.option 'check_opengraph', '--check-opengraph', 'Enables the Open Graph checker (default: `false`).'
21
+ p.option 'assume_extension', '--assume-extension <ext>', 'Automatically add specified extension to files for internal links, to allow extensionless URLs (as supported by most servers) (default: `.html`).'
22
+ p.option 'checks', '--checks check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you want to run (default: `["Links", "Images", "Scripts"]`)'
23
+ p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists) (default: `true`).'
29
24
  p.option 'check_sri', '--check-sri', 'Check that `<link>` and `<script>` external resources use SRI (default: `false`).'
30
25
  p.option 'directory_index_file', '--directory-index-file <filename>', String, 'Sets the file to look for when a link refers to a directory. (default: `index.html`)'
31
- p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker, which can take a lot of time (default: `false`)'
32
- p.option 'empty_alt_ignore', '--empty-alt-ignore', 'If `true`, ignores images with empty alt tags'
33
- p.option 'error_sort', '--error-sort <sort>', String, 'Defines the sort order for error output. Can be `:path`, `:desc`, or `:status` (default: `:path`).'
34
- p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `false`).'
35
- p.option 'extension', '--extension <ext>', String, 'The extension of your HTML files including the dot. (default: `.html`)'
36
- p.option 'external_only', '--external_only', 'Only checks problems with external references'
37
- p.option 'file_ignore', '--file-ignore file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
38
- p.option 'http_status_ignore', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
39
- p.option 'internal_domains', '--internal-domains domain1,[domain2,...]', Array, 'A comma-separated list of Strings containing domains that will be treated as internal urls.'
26
+ p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker (default: `false`)'
27
+ p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `true`).'
28
+ p.option 'extensions', '--extensions ext1,[ext2,...[', Array, 'A comma-separated list of Strings indicating the file extensions you would like to check (including the dot) (default: `.html`)'
29
+ p.option 'ignore_files', '--ignore-files file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
40
30
  p.option 'ignore_empty_mailto', '--ignore-empty-mailto', 'If `true`, allows `mailto:` `href`s which do not contain an email address'
41
- p.option 'report_invalid_tags', '--report-invalid-tags', 'When `check_html` is enabled, HTML markup that is unknown to Nokogumbo are reported as errors (default: `false`)'
42
- p.option 'report_missing_names', '--report-missing-names', 'When `check_html` is enabled, HTML markup that are missing entity names are reported as errors (default: `false`)'
43
- p.option 'report_script_embeds', '--report-script-embeds', 'When `check_html` is enabled, `script` tags containing markup are reported as errors (default: `false`)'
44
- p.option 'report_missing_doctype', '--report-missing-doctype', 'When `check_html` is enabled, HTML markup with missing or out-of-order `DOCTYPE` are reported as errors (default: `false`)'
45
- p.option 'report_eof_tags', '--report-eof-tags', 'When `check_html` is enabled, HTML markup with tags that are malformed are reported as errors (default: `false`)'
46
- p.option 'report_mismatched_tags', '--report-mismatched-tags', 'When `check_html` is enabled, HTML markup with mismatched tags are reported as errors (default: `false`)'
31
+ p.option 'ignore_missing_alt', '--empty-alt-ignore', 'If `true`, ignores images with empty/missing alt tags'
32
+ p.option 'ignore_status_codes', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
33
+ p.option 'ignore_urls', '--ignore-urls link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. This affects all HTML attributes, such as `alt` tags on images.'
47
34
  p.option 'log_level', '--log-level <level>', String, 'Sets the logging level, as determined by Yell. One of `:debug`, `:info`, `:warn`, `:error`, or `:fatal`. (default: `:info`)'
48
35
  p.option 'only_4xx', '--only-4xx', 'Only reports errors for links that fall within the 4xx status code range'
49
- p.option 'storage_dir', '--storage-dir PATH', String, 'Directory where to store the cache log (default: "tmp/.htmlproofer")'
50
- p.option 'timeframe', '--timeframe <time>', String, 'A string representing the caching timeframe.'
51
- p.option 'typhoeus_config', '--typhoeus-config CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
52
- p.option 'hydra_config', '--hydra-config CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
53
- p.option 'url_ignore', '--url-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. It affects all HTML attributes. Note that non-HTTP(S) URIs are always ignored'
54
- p.option 'url_swap', '--url-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
55
36
  p.option 'root_dir', '--root-dir PATH', String, 'The absolute path to the directory serving your html-files.'
37
+ p.option 'swap_attributes', '--swap-attributes CONFIG', String, 'JSON-formatted config that maps element names to the preferred attribute to check (default: `{}`).'
38
+ p.option 'swap_urls', '--swap-urls re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
39
+
40
+ p.option 'typhoeus', '--typhoeus CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
41
+ p.option 'hydra', '--hydra CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
42
+ p.option 'parallel', '--parallel CONFIG', String, 'JSON-formatted string of Parallel config. Will override the html-proofer defaults.'
43
+ p.option 'cache', '--cache CONFIG', String, 'JSON-formatted string of cache config. Will override the html-proofer defaults.'
56
44
 
57
45
  p.action do |args, opts|
58
46
  args = ['.'] if args.empty?
@@ -67,46 +55,31 @@ Mercenary.program(:htmlproofer) do |p|
67
55
  end
68
56
 
69
57
  # some minor manipulation of a special option
70
- unless opts['url_swap'].nil?
71
- options[:url_swap] = {}
72
- opts['url_swap'].each do |s|
58
+ unless opts['swap_urls'].nil?
59
+ options[:swap_urls] = {}
60
+ opts['swap_urls'].each do |s|
73
61
  splt = s.split(/(?<!\\):/, 2)
74
62
 
75
63
  re = splt[0].gsub(/\\:/, ':')
76
64
  string = splt[1].gsub(/\\:/, ':')
77
- options[:url_swap][Regexp.new(re)] = string
65
+ options[:swap_urls][Regexp.new(re)] = string
78
66
  end
79
67
  end
80
68
 
81
- options[:error_sort] = opts['error-sort'].to_sym unless opts['error-sort'].nil?
82
69
  options[:log_level] = opts['log_level'].to_sym unless opts['log_level'].nil?
83
70
 
84
- options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.dup
85
- options[:validation][:report_script_embeds] = opts['report_script_embeds'] unless opts['report_script_embeds'].nil?
86
- options[:validation][:report_missing_names] = opts['report_missing_names'] unless opts['report_missing_names'].nil?
87
- options[:validation][:report_invalid_tags] = opts['report_invalid_tags'] unless opts['report_invalid_tags'].nil?
88
- options[:validation][:report_missing_doctype] = opts['report_missing_doctype'] unless opts['report_missing_doctype'].nil?
89
- options[:validation][:report_eof_tags] = opts['report_eof_tags'] unless opts['report_eof_tags'].nil?
90
- options[:validation][:report_mismatched_tags] = opts['report_mismatched_tags'] unless opts['report_mismatched_tags'].nil?
91
-
92
- options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus_config', opts['typhoeus_config'], symbolize_names: false) unless opts['typhoeus_config'].nil?
93
- options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra_config', opts['hydra_config']) unless opts['hydra_config'].nil?
71
+ options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus', opts['typhoeus'], symbolize_names: false) unless opts['typhoeus'].nil?
72
+ options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra', opts['hydra']) unless opts['hydra'].nil?
73
+ options[:parallel] = HTMLProofer::Configuration.parse_json_option('parallel', opts['parallel']) unless opts['parallel'].nil?
74
+ options[:cache] = HTMLProofer::Configuration.parse_json_option('cache', opts['cache']) unless opts['cache'].nil?
94
75
 
95
- unless opts['timeframe'].nil?
96
- options[:cache] ||= {}
97
- options[:cache][:timeframe] = opts['timeframe'] unless opts['timeframe'].nil?
98
- end
99
-
100
- unless opts['storage_dir'].nil?
101
- options[:cache] ||= {}
102
- options[:cache][:storage_dir] = opts['storage_dir'] unless opts['storage_dir'].nil?
103
- end
76
+ options[:swap_attributes] = HTMLProofer::Configuration.parse_json_option('swap_attributes', opts['swap_attributes'], symbolize_names: false) unless opts['swap_attributes'].nil?
104
77
 
105
- options[:http_status_ignore] = Array(options[:http_status_ignore]).map(&:to_i)
78
+ options[:ignore_status_codes] = Array(options[:ignore_status_codes]).map(&:to_i)
106
79
 
107
80
  paths = path.split(',')
108
81
  if opts['as_links']
109
- links = path.delete(' ').split(',')
82
+ links = path.split(',').map(&:strip)
110
83
  HTMLProofer.check_links(links, options).run
111
84
  elsif File.directory?(paths.first)
112
85
  HTMLProofer.check_directories(paths, options).run
data/lib/html-proofer.rb CHANGED
@@ -1,56 +1,3 @@
1
- # rubocop:disable Naming/FileName
2
1
  # frozen_string_literal: true
3
2
 
4
- def require_all(path)
5
- dir = File.join(File.dirname(__FILE__), path)
6
- Dir[File.join(dir, '*.rb')].sort.each do |f|
7
- require f
8
- end
9
- end
10
-
11
- require_relative 'html-proofer/utils'
12
- require_all 'html-proofer'
13
- require_all 'html-proofer/check'
14
-
15
- require 'parallel'
16
- require 'fileutils'
17
-
18
- begin
19
- require 'awesome_print'
20
- require 'pry-byebug'
21
- rescue LoadError; end # rubocop:disable Lint/SuppressedException
22
- module HTMLProofer
23
- def self.check_file(file, options = {})
24
- raise ArgumentError unless file.is_a?(String)
25
- raise ArgumentError, "#{file} does not exist" unless File.exist?(file)
26
-
27
- options[:type] = :file
28
- HTMLProofer::Runner.new(file, options)
29
- end
30
-
31
- def self.check_directory(directory, options = {})
32
- raise ArgumentError unless directory.is_a?(String)
33
- raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
34
-
35
- options[:type] = :directory
36
- HTMLProofer::Runner.new([directory], options)
37
- end
38
-
39
- def self.check_directories(directories, options = {})
40
- raise ArgumentError unless directories.is_a?(Array)
41
-
42
- options[:type] = :directory
43
- directories.each do |directory|
44
- raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
45
- end
46
- HTMLProofer::Runner.new(directories, options)
47
- end
48
-
49
- def self.check_links(links, options = {})
50
- raise ArgumentError unless links.is_a?(Array)
51
-
52
- options[:type] = :links
53
- HTMLProofer::Runner.new(links, options)
54
- end
55
- end
56
- # rubocop:enable Naming/FileName
3
+ require_relative 'html_proofer'
@@ -0,0 +1,231 @@
1
+ # frozen_string_literal: true
2
+
3
+ class HTMLProofer::Attribute::Url < HTMLProofer::Attribute
4
+ attr_reader :url
5
+
6
+ REMOTE_SCHEMES = %w[http https].freeze
7
+
8
+ def initialize(runner, link_attribute, base_url: nil)
9
+ super
10
+
11
+ if @raw_attribute.nil?
12
+ @url = nil
13
+ else
14
+ @url = @raw_attribute.delete("\u200b").strip
15
+ @url = Addressable::URI.join(base_url, @url).to_s unless blank?(base_url)
16
+
17
+ swap_urls!
18
+ clean_url!
19
+
20
+ # convert "//" links to "https://"
21
+ @url.start_with?('//') ? @url = "https:#{@url}" : @url
22
+ end
23
+ end
24
+
25
+ def to_s
26
+ @url
27
+ end
28
+
29
+ def known_extension?
30
+ return true if hash_link?
31
+
32
+ ext = File.extname(path)
33
+
34
+ # no extension means we use the assumed one
35
+ return @runner.options[:extensions].include?(@runner.options[:assume_extension]) if blank?(ext)
36
+
37
+ @runner.options[:extensions].include?(ext)
38
+ end
39
+
40
+ def unknown_extension?
41
+ !known_extension?
42
+ end
43
+
44
+ def ignore?
45
+ return true if (/^javascript:/).match?(@url)
46
+ return true if ignores_pattern?(@runner.options[:ignore_urls])
47
+ end
48
+
49
+ def valid?
50
+ !parts.nil?
51
+ end
52
+
53
+ def path?
54
+ !parts.host.nil? && !parts.path.nil?
55
+ end
56
+
57
+ def parts
58
+ @parts ||= Addressable::URI.parse @url
59
+ rescue URI::Error, Addressable::URI::InvalidURIError
60
+ @parts = nil
61
+ end
62
+
63
+ def path
64
+ Addressable::URI.unencode parts.path unless parts.nil?
65
+ end
66
+
67
+ def hash
68
+ parts&.fragment
69
+ end
70
+
71
+ # Does the URL have a hash?
72
+ def hash?
73
+ !blank?(hash)
74
+ end
75
+
76
+ def scheme
77
+ parts&.scheme
78
+ end
79
+
80
+ def remote?
81
+ REMOTE_SCHEMES.include?(scheme)
82
+ end
83
+
84
+ def http?
85
+ scheme == 'http'
86
+ end
87
+
88
+ def https?
89
+ scheme == 'https'
90
+ end
91
+
92
+ def non_http_remote?
93
+ !scheme.nil? && !remote?
94
+ end
95
+
96
+ def host
97
+ parts&.host
98
+ end
99
+
100
+ def domain_path
101
+ (host || '') + path
102
+ end
103
+
104
+ def query_values
105
+ parts&.query_values
106
+ end
107
+
108
+ # checks if a file exists relative to the current pwd
109
+ def exists?
110
+ return true if base64?
111
+
112
+ return @runner.checked_paths[absolute_path] if @runner.checked_paths.key?(absolute_path)
113
+
114
+ @runner.checked_paths[absolute_path] = File.exist?(absolute_path)
115
+ end
116
+
117
+ def base64?
118
+ /^data:image/.match?(@raw_attribute)
119
+ end
120
+
121
+ def absolute_path
122
+ path = file_path || @runner.current_path
123
+
124
+ File.expand_path(path, Dir.pwd)
125
+ end
126
+
127
+ def file_path
128
+ return if path.nil? || path.empty?
129
+
130
+ path_dot_ext = ''
131
+
132
+ path_dot_ext = path + @runner.options[:assume_extension] unless blank?(@runner.options[:assume_extension])
133
+
134
+ base = if absolute_path?(path) # path relative to root
135
+ # either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname
136
+ @runner.options[:root_dir] || (File.directory?(@runner.current_source) ? @runner.current_source : File.dirname(@runner.current_source))
137
+ # relative links, path is a file
138
+ elsif File.exist?(File.expand_path(path, @runner.current_source)) || File.exist?(File.expand_path(path_dot_ext, @runner.current_source))
139
+ File.dirname(@runner.current_path)
140
+ # relative links in nested dir, path is a file
141
+ elsif File.exist?(File.join(File.dirname(@runner.current_path), path)) || File.exist?(File.join(File.dirname(@runner.current_path), path_dot_ext)) # rubocop:disable Lint/DuplicateBranch
142
+ File.dirname(@runner.current_path)
143
+ # relative link, path is a directory
144
+ else
145
+ @runner.current_path
146
+ end
147
+
148
+ file = File.join(base, path)
149
+
150
+ if @runner.options[:assume_extension] && File.file?("#{file}#{@runner.options[:assume_extension]}")
151
+ file = "#{file}#{@runner.options[:assume_extension]}"
152
+ elsif File.directory?(file) && !unslashed_directory?(file) # implicit index support
153
+ file = File.join file, @runner.options[:directory_index_file]
154
+ end
155
+
156
+ file
157
+ end
158
+
159
+ def unslashed_directory?(file)
160
+ File.directory?(file) && !file.end_with?(File::SEPARATOR)
161
+ end
162
+
163
+ def absolute_path?(path)
164
+ path.start_with?('/')
165
+ end
166
+
167
+ # path is external to the file
168
+ def external?
169
+ !internal?
170
+ end
171
+
172
+ def internal?
173
+ relative_link? || internal_absolute_link? || hash_link?
174
+ end
175
+
176
+ def internal_absolute_link?
177
+ url.start_with?('/')
178
+ end
179
+
180
+ def relative_link?
181
+ return false if remote?
182
+
183
+ hash_link? || param_link? || url.start_with?('.') || url =~ /^\S/
184
+ end
185
+
186
+ def link_points_to_same_page?
187
+ hash_link || param_link
188
+ end
189
+
190
+ def hash_link?
191
+ url.start_with?('#')
192
+ end
193
+
194
+ def param_link?
195
+ url.start_with?('?')
196
+ end
197
+
198
+ def sans_hash
199
+ @url.to_s.sub(/##{hash}/, '')
200
+ end
201
+
202
+ # catch any obvious issues, like strings in port numbers
203
+ private def clean_url!
204
+ return if @url =~ /^([!#{Regexp.last_match(0)}-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/
205
+
206
+ @url = Addressable::URI.parse(@url).normalize.to_s
207
+ end
208
+
209
+ private def swap_urls!
210
+ return @url if blank?(replacements = @runner.options[:swap_urls])
211
+
212
+ replacements.each do |link, replace|
213
+ @url = @url.gsub(link, replace)
214
+ end
215
+ end
216
+
217
+ private def ignores_pattern?(links_to_ignore)
218
+ return false unless links_to_ignore.is_a?(Array)
219
+
220
+ links_to_ignore.each do |link_to_ignore|
221
+ case link_to_ignore
222
+ when String
223
+ return true if link_to_ignore == @raw_attribute
224
+ when Regexp
225
+ return true if link_to_ignore&.match?(@raw_attribute)
226
+ end
227
+ end
228
+
229
+ false
230
+ end
231
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module HTMLProofer
4
+ # Represents an element currently being processed
5
+ class Attribute
6
+ include HTMLProofer::Utils
7
+
8
+ attr_reader :raw_attribute
9
+
10
+ def initialize(runner, raw_attribute, **_)
11
+ @runner = runner
12
+ @raw_attribute = raw_attribute
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,234 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'date'
4
+ require 'json'
5
+ require 'uri'
6
+
7
+ module HTMLProofer
8
+ class Cache
9
+ include HTMLProofer::Utils
10
+
11
+ CACHE_VERSION = 2
12
+
13
+ DEFAULT_STORAGE_DIR = File.join('tmp', '.htmlproofer')
14
+ DEFAULT_CACHE_FILE_NAME = 'cache.json'
15
+
16
+ URI_REGEXP = URI::DEFAULT_PARSER.make_regexp
17
+
18
+ attr_reader :exists, :cache_log, :storage_dir, :cache_file
19
+
20
+ def initialize(runner, options)
21
+ @runner = runner
22
+ @logger = @runner.logger
23
+
24
+ @cache_datetime = DateTime.now
25
+ @cache_time = @cache_datetime.to_time
26
+
27
+ if blank?(options)
28
+ define_singleton_method(:enabled?) { false }
29
+ else
30
+ define_singleton_method(:enabled?) { true }
31
+ setup_cache!(options)
32
+ @parsed_timeframe = parsed_timeframe(options[:timeframe])
33
+ end
34
+ end
35
+
36
+ def within_timeframe?(time)
37
+ return false if time.nil?
38
+
39
+ time = Time.parse(time) if time.is_a?(String)
40
+ (@parsed_timeframe..@cache_time).cover?(time)
41
+ end
42
+
43
+ def parsed_timeframe(timeframe)
44
+ time, date = timeframe.match(/(\d+)(\D)/).captures
45
+ time = time.to_i
46
+ case date
47
+ when 'M'
48
+ time_ago(time, :months)
49
+ when 'w'
50
+ time_ago(time, :weeks)
51
+ when 'd'
52
+ time_ago(time, :days)
53
+ when 'h'
54
+ time_ago(time, :hours)
55
+ else
56
+ raise ArgumentError, "#{date} is not a valid timeframe!"
57
+ end
58
+ end
59
+
60
+ def add_internal(url, metadata, found)
61
+ return unless enabled?
62
+
63
+ @cache_log[:internal][url] = { time: @cache_time, metadata: [] } if @cache_log[:internal][url].nil?
64
+
65
+ @cache_log[:internal][url][:metadata] << construct_internal_link_metadata(metadata, found)
66
+ end
67
+
68
+ def add_external(url, filenames, status_code, msg)
69
+ return unless enabled?
70
+
71
+ found = status_code.between?(200, 299)
72
+
73
+ clean_url = cleaned_url(url)
74
+ @cache_log[:external][clean_url] = { time: @cache_time.to_s, found: found, status_code: status_code, message: msg, metadata: filenames }
75
+ end
76
+
77
+ def detect_url_changes(urls_detected, type)
78
+ additions = determine_additions(urls_detected, type)
79
+
80
+ determine_deletions(urls_detected, type)
81
+
82
+ additions
83
+ end
84
+
85
+ private def construct_internal_link_metadata(metadata, found)
86
+ {
87
+ source: metadata[:source],
88
+ current_path: metadata[:current_path],
89
+ line: metadata[:line],
90
+ base_url: metadata[:base_url],
91
+ found: found
92
+ }
93
+ end
94
+
95
+ # prepare to add new URLs detected
96
+ private def determine_additions(urls_detected, type)
97
+ additions = urls_detected.reject do |url, metadata|
98
+ url = cleaned_url(url)
99
+
100
+ if @cache_log[type].include?(url)
101
+ @cache_log[type][url][:metadata] = metadata
102
+
103
+ # if this is false, we're trying again
104
+ if type == :external
105
+ @cache_log[type][url][:found]
106
+ else
107
+ @cache_log[type][url][:metadata].none? { |m| m[:found] }
108
+ end
109
+ else
110
+ @logger.log :debug, "Adding #{url} to #{type} cache"
111
+ false
112
+ end
113
+ end
114
+
115
+ new_link_count = additions.length
116
+ new_link_text = pluralize(new_link_count, "new #{type} link", "new #{type} links")
117
+ @logger.log :debug, "Adding #{new_link_text} to the cache"
118
+
119
+ additions
120
+ end
121
+
122
+ # remove from cache URLs that no longer exist
123
+ private def determine_deletions(urls_detected, type)
124
+ deletions = 0
125
+
126
+ @cache_log[type].delete_if do |url, _|
127
+ url = cleaned_url(url)
128
+
129
+ if urls_detected.include?(url)
130
+ false
131
+ elsif url_matches_type?(url, type)
132
+ @logger.log :debug, "Removing #{url} from #{type} cache"
133
+ deletions += 1
134
+ true
135
+ end
136
+ end
137
+
138
+ del_link_text = pluralize(deletions, "outdated #{type} link", "outdated #{type} links")
139
+ @logger.log :debug, "Removing #{del_link_text} from the cache"
140
+ end
141
+
142
+ def write
143
+ return unless enabled?
144
+
145
+ File.write(@cache_file, @cache_log.to_json)
146
+ end
147
+
148
+ def retrieve_urls(urls_detected, type)
149
+ # if there are no urls, bail
150
+ return {} if urls_detected.empty?
151
+
152
+ urls_to_check = detect_url_changes(urls_detected, type)
153
+
154
+ @cache_log[type].each_pair do |url, cache|
155
+ next if within_timeframe?(cache[:time])
156
+
157
+ urls_to_check[url] = cache[:metadata] # recheck expired links
158
+ end
159
+
160
+ urls_to_check
161
+ end
162
+
163
+ def empty?
164
+ blank?(@cache_log) || (@cache_log[:internal].empty? && @cache_log[:external].empty?)
165
+ end
166
+
167
+ def size(type)
168
+ @cache_log[type].size
169
+ end
170
+
171
+ private def setup_cache!(options)
172
+ default_structure = {
173
+ version: CACHE_VERSION,
174
+ internal: {},
175
+ external: {}
176
+ }
177
+
178
+ @storage_dir = options[:storage_dir] || DEFAULT_STORAGE_DIR
179
+
180
+ FileUtils.mkdir_p(storage_dir) unless Dir.exist?(storage_dir)
181
+
182
+ cache_file_name = options[:cache_file] || DEFAULT_CACHE_FILE_NAME
183
+
184
+ @cache_file = File.join(storage_dir, cache_file_name)
185
+
186
+ return (@cache_log = default_structure) unless File.exist?(@cache_file)
187
+
188
+ contents = File.read(@cache_file)
189
+
190
+ return (@cache_log = default_structure) if blank?(contents)
191
+
192
+ log = JSON.parse(contents, symbolize_names: true)
193
+
194
+ old_cache = (cache_version = log[:version]).nil?
195
+ @cache_log = if old_cache # previous cache version, create a new one
196
+ default_structure
197
+ elsif cache_version != CACHE_VERSION
198
+ # if cache version is newer...do something
199
+ else
200
+ log[:internal] = log[:internal].transform_keys(&:to_s)
201
+ log[:external] = log[:external].transform_keys(&:to_s)
202
+ log
203
+ end
204
+ end
205
+
206
+ private def time_ago(measurement, unit)
207
+ case unit
208
+ when :months
209
+ @cache_datetime >> -measurement
210
+ when :weeks
211
+ @cache_datetime - (measurement * 7)
212
+ when :days
213
+ @cache_datetime - measurement
214
+ when :hours
215
+ @cache_datetime - Rational(measurement / 24.0)
216
+ end.to_time
217
+ end
218
+
219
+ private def url_matches_type?(url, type)
220
+ return true if type == :internal && url !~ URI_REGEXP
221
+ return true if type == :external && url =~ URI_REGEXP
222
+ end
223
+
224
+ private def cleaned_url(url)
225
+ return escape_unescape(url) unless url.end_with?('/', '#', '?') && url.length > 1
226
+
227
+ escape_unescape(url[0..-2])
228
+ end
229
+
230
+ private def escape_unescape(url)
231
+ Addressable::URI.parse(url).normalize.to_s
232
+ end
233
+ end
234
+ end