RubyGems - html-proofer - Versions diffs - 3.19.4 → 4.0.0.rc1 - Mend

html-proofer 3.19.4 → 4.0.0.rc1

Files changed (40)

checksums.yaml +4 -4
data/bin/htmlproofer +30 -57
data/lib/html-proofer.rb +1 -54
data/lib/html_proofer/attribute/url.rb +231 -0
data/lib/html_proofer/attribute.rb +15 -0
data/lib/html_proofer/cache.rb +234 -0
data/lib/html_proofer/check/favicon.rb +35 -0
data/lib/html_proofer/check/images.rb +62 -0
data/lib/html_proofer/check/links.rb +118 -0
data/lib/html_proofer/check/open_graph.rb +34 -0
data/lib/html_proofer/check/scripts.rb +38 -0
data/lib/html_proofer/check.rb +91 -0
data/lib/{html-proofer → html_proofer}/configuration.rb +30 -31
data/lib/html_proofer/element.rb +122 -0
data/lib/html_proofer/failure.rb +17 -0
data/lib/{html-proofer → html_proofer}/log.rb +0 -0
data/lib/html_proofer/reporter/cli.rb +29 -0
data/lib/html_proofer/reporter.rb +23 -0
data/lib/html_proofer/runner.rb +245 -0
data/lib/html_proofer/url_validator/external.rb +189 -0
data/lib/html_proofer/url_validator/internal.rb +86 -0
data/lib/html_proofer/url_validator.rb +16 -0
data/lib/{html-proofer → html_proofer}/utils.rb +5 -8
data/lib/{html-proofer → html_proofer}/version.rb +1 -1
data/lib/html_proofer/xpath_functions.rb +10 -0
data/lib/html_proofer.rb +56 -0
metadata +46 -27
data/lib/html-proofer/cache.rb +0 -194
data/lib/html-proofer/check/favicon.rb +0 -29
data/lib/html-proofer/check/html.rb +0 -37
data/lib/html-proofer/check/images.rb +0 -48
data/lib/html-proofer/check/links.rb +0 -182
data/lib/html-proofer/check/opengraph.rb +0 -46
data/lib/html-proofer/check/scripts.rb +0 -42
data/lib/html-proofer/check.rb +0 -75
data/lib/html-proofer/element.rb +0 -265
data/lib/html-proofer/issue.rb +0 -65
data/lib/html-proofer/middleware.rb +0 -82
data/lib/html-proofer/runner.rb +0 -249
data/lib/html-proofer/url_validator.rb +0 -237

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f27b5c50ae5c1c77d5fbe36dbbdca327bcb96302912b726f7f955f643d1dfc48
-  data.tar.gz: f09405cd0c70f1d2dc98f904c388bcab594f79107fdbe441c63f934821bef1b0
+  metadata.gz: bec55c40cc2d01b65496b138570cf434e533d045476470e4ce1e6b0daf3d5408
+  data.tar.gz: dd77aaf59adf3eaaa48a6b20dab59adbb0a5974b0a4ded5f9fd51e1fc9ba3684
 SHA512:
-  metadata.gz: 53a8c98438f2056e7e2d926e926e10a6d0aa840b1b6f790860631912a2146dc20c68ca2b303d799a8fbfa723476e0e95dd5bc89695ceddf09ecede6f9acafbd1
-  data.tar.gz: f68269ba70facf5ede07452d1029f49d17baadff8c6b4fd1d9de520c0ede91ff360bacb0cf46b5c719c0ae35c50ad61c6ce5b36171867f6a1c9d8c675d805ebc
+  metadata.gz: 5036e6f46c4e0ac32bd9a6f4bd891244f25cb43670c40eee7c7421c661f9a3b6e24edd15507b6290fd102f37988529250cecf5800b63b6ed1f8622dd983c76ec
+  data.tar.gz: cb9fdec8ec8774e8a9607d9b92b859767b6726e45b0bc54d6b1eee993de03c1a0a81a50035fe32543e96a7f1ee1d420366076d1d5956761abd1201086e35f057

data/bin/htmlproofer CHANGED

@@ -15,44 +15,32 @@ Mercenary.program(:htmlproofer) do |p|
   p.description 'Runs the HTML-Proofer suite on the files in PATH. For more details, see the README.'
-  p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href` (this is the default for HTML5).'
-  p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, ignores the `href="#"`'
+  p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, assumes `href="#"` anchors are valid'
+  p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href`. In HTML5, this is technically allowed, but could also be human error.'
   p.option 'as_links', '--as-links', 'Assumes that `PATH` is a comma-separated array of links to check.'
-  p.option 'alt_ignore', '--alt-ignore image1,[image2,...]', Array, 'A comma-separated list of Strings or RegExps containing `img`s whose missing `alt` tags are safe to ignore'
-  p.option 'assume_extension', '--assume-extension', 'Automatically add extension (e.g. `.html`) to file paths, to allow extensionless URLs (as supported by Jekyll 3 and GitHub Pages) (default: `false`).'
-  p.option 'checks_to_ignore', '--checks-to-ignore check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you do not want to run (default: `[]`)'
-  p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists). This slows the checker down (default: `false`).'
-  p.option 'check_favicon', '--check-favicon', 'Enables the favicon checker (default: `false`).'
-  p.option 'check_html', '--check-html', 'Enables HTML validation errors from Nokogumbo (default: `false`).'
-  p.option 'check_img_http', '--check-img-http', 'Fails an image if it\'s marked as `http` (default: `false`).'
-  p.option 'check_opengraph', '--check-opengraph', 'Enables the Open Graph checker (default: `false`).'
+  p.option 'assume_extension', '--assume-extension <ext>', 'Automatically add specified extension to files for internal links, to allow extensionless URLs (as supported by most servers) (default: `.html`).'
+  p.option 'checks', '--checks check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you want to run (default: `["Links", "Images", "Scripts"]`)'
+  p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists) (default: `true`).'
   p.option 'check_sri', '--check-sri', 'Check that `<link>` and `<script>` external resources use SRI (default: `false`).'
   p.option 'directory_index_file', '--directory-index-file <filename>', String, 'Sets the file to look for when a link refers to a directory. (default: `index.html`)'
-  p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker, which can take a lot of time (default: `false`)'
-  p.option 'empty_alt_ignore', '--empty-alt-ignore', 'If `true`, ignores images with empty alt tags'
-  p.option 'error_sort', '--error-sort <sort>', String, 'Defines the sort order for error output. Can be `:path`, `:desc`, or `:status` (default: `:path`).'
-  p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `false`).'
-  p.option 'extension', '--extension <ext>', String, 'The extension of your HTML files including the dot. (default: `.html`)'
-  p.option 'external_only', '--external_only', 'Only checks problems with external references'
-  p.option 'file_ignore', '--file-ignore file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
-  p.option 'http_status_ignore', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
-  p.option 'internal_domains', '--internal-domains domain1,[domain2,...]', Array, 'A comma-separated list of Strings containing domains that will be treated as internal urls.'
+  p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker (default: `false`)'
+  p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `true`).'
+  p.option 'extensions', '--extensions ext1,[ext2,...[', Array, 'A comma-separated list of Strings indicating the file extensions you would like to check (including the dot) (default: `.html`)'
+  p.option 'ignore_files', '--ignore-files file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
   p.option 'ignore_empty_mailto', '--ignore-empty-mailto', 'If `true`, allows `mailto:` `href`s which do not contain an email address'
-  p.option 'report_invalid_tags', '--report-invalid-tags', 'When `check_html` is enabled, HTML markup that is unknown to Nokogumbo are reported as errors (default: `false`)'
-  p.option 'report_missing_names', '--report-missing-names', 'When `check_html` is enabled, HTML markup that are missing entity names are reported as errors (default: `false`)'
-  p.option 'report_script_embeds', '--report-script-embeds', 'When `check_html` is enabled, `script` tags containing markup are reported as errors (default: `false`)'
-  p.option 'report_missing_doctype', '--report-missing-doctype', 'When `check_html` is enabled, HTML markup with missing or out-of-order `DOCTYPE` are reported as errors (default: `false`)'
-  p.option 'report_eof_tags', '--report-eof-tags', 'When `check_html` is enabled, HTML markup with tags that are malformed are reported as errors (default: `false`)'
-  p.option 'report_mismatched_tags', '--report-mismatched-tags', 'When `check_html` is enabled, HTML markup with mismatched tags are reported as errors (default: `false`)'
+  p.option 'ignore_missing_alt', '--empty-alt-ignore', 'If `true`, ignores images with empty/missing alt tags'
+  p.option 'ignore_status_codes', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
+  p.option 'ignore_urls', '--ignore-urls link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. This affects all HTML attributes, such as `alt` tags on images.'
   p.option 'log_level', '--log-level <level>', String, 'Sets the logging level, as determined by Yell. One of `:debug`, `:info`, `:warn`, `:error`, or `:fatal`. (default: `:info`)'
   p.option 'only_4xx', '--only-4xx', 'Only reports errors for links that fall within the 4xx status code range'
-  p.option 'storage_dir', '--storage-dir PATH', String, 'Directory where to store the cache log (default: "tmp/.htmlproofer")'
-  p.option 'timeframe', '--timeframe <time>', String, 'A string representing the caching timeframe.'
-  p.option 'typhoeus_config', '--typhoeus-config CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
-  p.option 'hydra_config', '--hydra-config CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
-  p.option 'url_ignore', '--url-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. It affects all HTML attributes. Note that non-HTTP(S) URIs are always ignored'
-  p.option 'url_swap', '--url-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
   p.option 'root_dir', '--root-dir PATH', String, 'The absolute path to the directory serving your html-files.'
+  p.option 'swap_attributes', '--swap-attributes CONFIG', String, 'JSON-formatted config that maps element names to the preferred attribute to check (default: `{}`).'
+  p.option 'swap_urls', '--swap-urls re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
+  p.option 'typhoeus', '--typhoeus CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
+  p.option 'hydra', '--hydra CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
+  p.option 'parallel', '--parallel CONFIG', String, 'JSON-formatted string of Parallel config. Will override the html-proofer defaults.'
+  p.option 'cache', '--cache CONFIG', String, 'JSON-formatted string of cache config. Will override the html-proofer defaults.'
   p.action do |args, opts|
     args = ['.'] if args.empty?
@@ -67,46 +55,31 @@ Mercenary.program(:htmlproofer) do |p|
     end
     # some minor manipulation of a special option
-    unless opts['url_swap'].nil?
-      options[:url_swap] = {}
-      opts['url_swap'].each do |s|
+    unless opts['swap_urls'].nil?
+      options[:swap_urls] = {}
+      opts['swap_urls'].each do |s|
         splt = s.split(/(?<!\\):/, 2)
         re = splt[0].gsub(/\\:/, ':')
         string = splt[1].gsub(/\\:/, ':')
-        options[:url_swap][Regexp.new(re)] = string
+        options[:swap_urls][Regexp.new(re)] = string
       end
     end
-    options[:error_sort] = opts['error-sort'].to_sym unless opts['error-sort'].nil?
     options[:log_level] = opts['log_level'].to_sym unless opts['log_level'].nil?
-    options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.dup
-    options[:validation][:report_script_embeds] = opts['report_script_embeds'] unless opts['report_script_embeds'].nil?
-    options[:validation][:report_missing_names] = opts['report_missing_names'] unless opts['report_missing_names'].nil?
-    options[:validation][:report_invalid_tags] = opts['report_invalid_tags'] unless opts['report_invalid_tags'].nil?
-    options[:validation][:report_missing_doctype] = opts['report_missing_doctype'] unless opts['report_missing_doctype'].nil?
-    options[:validation][:report_eof_tags] = opts['report_eof_tags'] unless opts['report_eof_tags'].nil?
-    options[:validation][:report_mismatched_tags] = opts['report_mismatched_tags'] unless opts['report_mismatched_tags'].nil?
-    options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus_config', opts['typhoeus_config'], symbolize_names: false) unless opts['typhoeus_config'].nil?
-    options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra_config', opts['hydra_config']) unless opts['hydra_config'].nil?
+    options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus', opts['typhoeus'], symbolize_names: false) unless opts['typhoeus'].nil?
+    options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra', opts['hydra']) unless opts['hydra'].nil?
+    options[:parallel] = HTMLProofer::Configuration.parse_json_option('parallel', opts['parallel']) unless opts['parallel'].nil?
+    options[:cache] = HTMLProofer::Configuration.parse_json_option('cache', opts['cache']) unless opts['cache'].nil?
-    unless opts['timeframe'].nil?
-      options[:cache] ||= {}
-      options[:cache][:timeframe] = opts['timeframe'] unless opts['timeframe'].nil?
-    end
-    unless opts['storage_dir'].nil?
-      options[:cache] ||= {}
-      options[:cache][:storage_dir] = opts['storage_dir'] unless opts['storage_dir'].nil?
-    end
+    options[:swap_attributes] = HTMLProofer::Configuration.parse_json_option('swap_attributes', opts['swap_attributes'], symbolize_names: false) unless opts['swap_attributes'].nil?
-    options[:http_status_ignore] = Array(options[:http_status_ignore]).map(&:to_i)
+    options[:ignore_status_codes] = Array(options[:ignore_status_codes]).map(&:to_i)
     paths = path.split(',')
     if opts['as_links']
-      links = path.delete(' ').split(',')
+      links = path.split(',').map(&:strip)
       HTMLProofer.check_links(links, options).run
     elsif File.directory?(paths.first)
       HTMLProofer.check_directories(paths, options).run

data/lib/html-proofer.rb CHANGED

@@ -1,56 +1,3 @@
-# rubocop:disable Naming/FileName
 # frozen_string_literal: true
-def require_all(path)
-  dir = File.join(File.dirname(__FILE__), path)
-  Dir[File.join(dir, '*.rb')].sort.each do |f|
-    require f
-  end
-end
-require_relative 'html-proofer/utils'
-require_all 'html-proofer'
-require_all 'html-proofer/check'
-require 'parallel'
-require 'fileutils'
-begin
-  require 'awesome_print'
-  require 'pry-byebug'
-rescue LoadError; end # rubocop:disable Lint/SuppressedException
-module HTMLProofer
-  def self.check_file(file, options = {})
-    raise ArgumentError unless file.is_a?(String)
-    raise ArgumentError, "#{file} does not exist" unless File.exist?(file)
-    options[:type] = :file
-    HTMLProofer::Runner.new(file, options)
-  end
-  def self.check_directory(directory, options = {})
-    raise ArgumentError unless directory.is_a?(String)
-    raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
-    options[:type] = :directory
-    HTMLProofer::Runner.new([directory], options)
-  end
-  def self.check_directories(directories, options = {})
-    raise ArgumentError unless directories.is_a?(Array)
-    options[:type] = :directory
-    directories.each do |directory|
-      raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
-    end
-    HTMLProofer::Runner.new(directories, options)
-  end
-  def self.check_links(links, options = {})
-    raise ArgumentError unless links.is_a?(Array)
-    options[:type] = :links
-    HTMLProofer::Runner.new(links, options)
-  end
-end
-# rubocop:enable Naming/FileName
+require_relative 'html_proofer'

data/lib/html_proofer/attribute/url.rb ADDED

@@ -0,0 +1,231 @@
+# frozen_string_literal: true
+class HTMLProofer::Attribute::Url < HTMLProofer::Attribute
+  attr_reader :url
+  REMOTE_SCHEMES = %w[http https].freeze
+  def initialize(runner, link_attribute, base_url: nil)
+    super
+    if @raw_attribute.nil?
+      @url = nil
+    else
+      @url = @raw_attribute.delete("\u200b").strip
+      @url = Addressable::URI.join(base_url, @url).to_s unless blank?(base_url)
+      swap_urls!
+      clean_url!
+      # convert "//" links to "https://"
+      @url.start_with?('//') ? @url = "https:#{@url}" : @url
+    end
+  end
+  def to_s
+    @url
+  end
+  def known_extension?
+    return true if hash_link?
+    ext = File.extname(path)
+    # no extension means we use the assumed one
+    return @runner.options[:extensions].include?(@runner.options[:assume_extension]) if blank?(ext)
+    @runner.options[:extensions].include?(ext)
+  end
+  def unknown_extension?
+    !known_extension?
+  end
+  def ignore?
+    return true if (/^javascript:/).match?(@url)
+    return true if ignores_pattern?(@runner.options[:ignore_urls])
+  end
+  def valid?
+    !parts.nil?
+  end
+  def path?
+    !parts.host.nil? && !parts.path.nil?
+  end
+  def parts
+    @parts ||= Addressable::URI.parse @url
+  rescue URI::Error, Addressable::URI::InvalidURIError
+    @parts = nil
+  end
+  def path
+    Addressable::URI.unencode parts.path unless parts.nil?
+  end
+  def hash
+    parts&.fragment
+  end
+  # Does the URL have a hash?
+  def hash?
+    !blank?(hash)
+  end
+  def scheme
+    parts&.scheme
+  end
+  def remote?
+    REMOTE_SCHEMES.include?(scheme)
+  end
+  def http?
+    scheme == 'http'
+  end
+  def https?
+    scheme == 'https'
+  end
+  def non_http_remote?
+    !scheme.nil? && !remote?
+  end
+  def host
+    parts&.host
+  end
+  def domain_path
+    (host || '') + path
+  end
+  def query_values
+    parts&.query_values
+  end
+  # checks if a file exists relative to the current pwd
+  def exists?
+    return true if base64?
+    return @runner.checked_paths[absolute_path] if @runner.checked_paths.key?(absolute_path)
+    @runner.checked_paths[absolute_path] = File.exist?(absolute_path)
+  end
+  def base64?
+    /^data:image/.match?(@raw_attribute)
+  end
+  def absolute_path
+    path = file_path || @runner.current_path
+    File.expand_path(path, Dir.pwd)
+  end
+  def file_path
+    return if path.nil? || path.empty?
+    path_dot_ext = ''
+    path_dot_ext = path + @runner.options[:assume_extension] unless blank?(@runner.options[:assume_extension])
+    base = if absolute_path?(path) # path relative to root
+             # either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname
+             @runner.options[:root_dir] || (File.directory?(@runner.current_source) ? @runner.current_source : File.dirname(@runner.current_source))
+           # relative links, path is a file
+           elsif File.exist?(File.expand_path(path, @runner.current_source)) || File.exist?(File.expand_path(path_dot_ext, @runner.current_source))
+             File.dirname(@runner.current_path)
+           # relative links in nested dir, path is a file
+           elsif File.exist?(File.join(File.dirname(@runner.current_path), path)) || File.exist?(File.join(File.dirname(@runner.current_path), path_dot_ext)) # rubocop:disable Lint/DuplicateBranch
+             File.dirname(@runner.current_path)
+           # relative link, path is a directory
+           else
+             @runner.current_path
+           end
+    file = File.join(base, path)
+    if @runner.options[:assume_extension] && File.file?("#{file}#{@runner.options[:assume_extension]}")
+      file = "#{file}#{@runner.options[:assume_extension]}"
+    elsif File.directory?(file) && !unslashed_directory?(file) # implicit index support
+      file = File.join file, @runner.options[:directory_index_file]
+    end
+    file
+  end
+  def unslashed_directory?(file)
+    File.directory?(file) && !file.end_with?(File::SEPARATOR)
+  end
+  def absolute_path?(path)
+    path.start_with?('/')
+  end
+  # path is external to the file
+  def external?
+    !internal?
+  end
+  def internal?
+    relative_link? || internal_absolute_link? || hash_link?
+  end
+  def internal_absolute_link?
+    url.start_with?('/')
+  end
+  def relative_link?
+    return false if remote?
+    hash_link? || param_link? || url.start_with?('.') || url =~ /^\S/
+  end
+  def link_points_to_same_page?
+    hash_link || param_link
+  end
+  def hash_link?
+    url.start_with?('#')
+  end
+  def param_link?
+    url.start_with?('?')
+  end
+  def sans_hash
+    @url.to_s.sub(/##{hash}/, '')
+  end
+  # catch any obvious issues, like strings in port numbers
+  private def clean_url!
+    return if @url =~ /^([!#{Regexp.last_match(0)}-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/
+    @url = Addressable::URI.parse(@url).normalize.to_s
+  end
+  private def swap_urls!
+    return @url if blank?(replacements = @runner.options[:swap_urls])
+    replacements.each do |link, replace|
+      @url = @url.gsub(link, replace)
+    end
+  end
+  private def ignores_pattern?(links_to_ignore)
+    return false unless links_to_ignore.is_a?(Array)
+    links_to_ignore.each do |link_to_ignore|
+      case link_to_ignore
+      when String
+        return true if link_to_ignore == @raw_attribute
+      when Regexp
+        return true if link_to_ignore&.match?(@raw_attribute)
+      end
+    end
+    false
+  end
+end

data/lib/html_proofer/attribute.rb ADDED

@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+module HTMLProofer
+  # Represents an element currently being processed
+  class Attribute
+    include HTMLProofer::Utils
+    attr_reader :raw_attribute
+    def initialize(runner, raw_attribute, **_)
+      @runner = runner
+      @raw_attribute = raw_attribute
+    end
+  end
+end

data/lib/html_proofer/cache.rb ADDED

@@ -0,0 +1,234 @@
+# frozen_string_literal: true
+require 'date'
+require 'json'
+require 'uri'
+module HTMLProofer
+  class Cache
+    include HTMLProofer::Utils
+    CACHE_VERSION = 2
+    DEFAULT_STORAGE_DIR = File.join('tmp', '.htmlproofer')
+    DEFAULT_CACHE_FILE_NAME = 'cache.json'
+    URI_REGEXP = URI::DEFAULT_PARSER.make_regexp
+    attr_reader :exists, :cache_log, :storage_dir, :cache_file
+    def initialize(runner, options)
+      @runner = runner
+      @logger = @runner.logger
+      @cache_datetime = DateTime.now
+      @cache_time = @cache_datetime.to_time
+      if blank?(options)
+        define_singleton_method(:enabled?) { false }
+      else
+        define_singleton_method(:enabled?) { true }
+        setup_cache!(options)
+        @parsed_timeframe = parsed_timeframe(options[:timeframe])
+      end
+    end
+    def within_timeframe?(time)
+      return false if time.nil?
+      time = Time.parse(time) if time.is_a?(String)
+      (@parsed_timeframe..@cache_time).cover?(time)
+    end
+    def parsed_timeframe(timeframe)
+      time, date = timeframe.match(/(\d+)(\D)/).captures
+      time = time.to_i
+      case date
+      when 'M'
+        time_ago(time, :months)
+      when 'w'
+        time_ago(time, :weeks)
+      when 'd'
+        time_ago(time, :days)
+      when 'h'
+        time_ago(time, :hours)
+      else
+        raise ArgumentError, "#{date} is not a valid timeframe!"
+      end
+    end
+    def add_internal(url, metadata, found)
+      return unless enabled?
+      @cache_log[:internal][url] = { time: @cache_time, metadata: [] } if @cache_log[:internal][url].nil?
+      @cache_log[:internal][url][:metadata] << construct_internal_link_metadata(metadata, found)
+    end
+    def add_external(url, filenames, status_code, msg)
+      return unless enabled?
+      found = status_code.between?(200, 299)
+      clean_url = cleaned_url(url)
+      @cache_log[:external][clean_url] = { time: @cache_time.to_s, found: found, status_code: status_code, message: msg, metadata: filenames }
+    end
+    def detect_url_changes(urls_detected, type)
+      additions = determine_additions(urls_detected, type)
+      determine_deletions(urls_detected, type)
+      additions
+    end
+    private def construct_internal_link_metadata(metadata, found)
+      {
+        source: metadata[:source],
+        current_path: metadata[:current_path],
+        line: metadata[:line],
+        base_url: metadata[:base_url],
+        found: found
+      }
+    end
+    # prepare to add new URLs detected
+    private def determine_additions(urls_detected, type)
+      additions = urls_detected.reject do |url, metadata|
+        url = cleaned_url(url)
+        if @cache_log[type].include?(url)
+          @cache_log[type][url][:metadata] = metadata
+          # if this is false, we're trying again
+          if type == :external
+            @cache_log[type][url][:found]
+          else
+            @cache_log[type][url][:metadata].none? { |m| m[:found] }
+          end
+        else
+          @logger.log :debug, "Adding #{url} to #{type} cache"
+          false
+        end
+      end
+      new_link_count = additions.length
+      new_link_text = pluralize(new_link_count, "new #{type} link", "new #{type} links")
+      @logger.log :debug, "Adding #{new_link_text} to the cache"
+      additions
+    end
+    # remove from cache URLs that no longer exist
+    private def determine_deletions(urls_detected, type)
+      deletions = 0
+      @cache_log[type].delete_if do |url, _|
+        url = cleaned_url(url)
+        if urls_detected.include?(url)
+          false
+        elsif url_matches_type?(url, type)
+          @logger.log :debug, "Removing #{url} from #{type} cache"
+          deletions += 1
+          true
+        end
+      end
+      del_link_text = pluralize(deletions, "outdated #{type} link", "outdated #{type} links")
+      @logger.log :debug, "Removing #{del_link_text} from the cache"
+    end
+    def write
+      return unless enabled?
+      File.write(@cache_file, @cache_log.to_json)
+    end
+    def retrieve_urls(urls_detected, type)
+      # if there are no urls, bail
+      return {} if urls_detected.empty?
+      urls_to_check = detect_url_changes(urls_detected, type)
+      @cache_log[type].each_pair do |url, cache|
+        next if within_timeframe?(cache[:time])
+        urls_to_check[url] = cache[:metadata] # recheck expired links
+      end
+      urls_to_check
+    end
+    def empty?
+      blank?(@cache_log) || (@cache_log[:internal].empty? && @cache_log[:external].empty?)
+    end
+    def size(type)
+      @cache_log[type].size
+    end
+    private def setup_cache!(options)
+      default_structure = {
+        version: CACHE_VERSION,
+        internal: {},
+        external: {}
+      }
+      @storage_dir = options[:storage_dir] || DEFAULT_STORAGE_DIR
+      FileUtils.mkdir_p(storage_dir) unless Dir.exist?(storage_dir)
+      cache_file_name = options[:cache_file] || DEFAULT_CACHE_FILE_NAME
+      @cache_file = File.join(storage_dir, cache_file_name)
+      return (@cache_log = default_structure) unless File.exist?(@cache_file)
+      contents = File.read(@cache_file)
+      return (@cache_log = default_structure) if blank?(contents)
+      log = JSON.parse(contents, symbolize_names: true)
+      old_cache = (cache_version = log[:version]).nil?
+      @cache_log = if old_cache # previous cache version, create a new one
+                     default_structure
+                   elsif cache_version != CACHE_VERSION
+                   # if cache version is newer...do something
+                   else
+                     log[:internal] = log[:internal].transform_keys(&:to_s)
+                     log[:external] = log[:external].transform_keys(&:to_s)
+                     log
+                   end
+    end
+    private def time_ago(measurement, unit)
+      case unit
+      when :months
+        @cache_datetime >> -measurement
+      when :weeks
+        @cache_datetime - (measurement * 7)
+      when :days
+        @cache_datetime - measurement
+      when :hours
+        @cache_datetime - Rational(measurement / 24.0)
+      end.to_time
+    end
+    private def url_matches_type?(url, type)
+      return true if type == :internal && url !~ URI_REGEXP
+      return true if type == :external && url =~ URI_REGEXP
+    end
+    private def cleaned_url(url)
+      return escape_unescape(url) unless url.end_with?('/', '#', '?') && url.length > 1
+      escape_unescape(url[0..-2])
+    end
+    private def escape_unescape(url)
+      Addressable::URI.parse(url).normalize.to_s
+    end
+  end
+end