RubyGems - html-proofer - Versions diffs - 3.19.2 → 4.0.0.rc3 - Mend

html-proofer 3.19.2 → 4.0.0.rc3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +4 -4
data/bin/htmlproofer +31 -57
data/lib/html-proofer.rb +1 -54
data/lib/html_proofer/attribute/url.rb +231 -0
data/lib/html_proofer/attribute.rb +15 -0
data/lib/html_proofer/cache.rb +236 -0
data/lib/html_proofer/check/favicon.rb +35 -0
data/lib/html_proofer/check/images.rb +62 -0
data/lib/html_proofer/check/links.rb +118 -0
data/lib/html_proofer/check/open_graph.rb +34 -0
data/lib/html_proofer/check/scripts.rb +38 -0
data/lib/html_proofer/check.rb +91 -0
data/lib/{html-proofer → html_proofer}/configuration.rb +30 -30
data/lib/html_proofer/element.rb +122 -0
data/lib/html_proofer/failure.rb +17 -0
data/lib/{html-proofer → html_proofer}/log.rb +0 -0
data/lib/html_proofer/reporter/cli.rb +29 -0
data/lib/html_proofer/reporter.rb +23 -0
data/lib/html_proofer/runner.rb +245 -0
data/lib/html_proofer/url_validator/external.rb +189 -0
data/lib/html_proofer/url_validator/internal.rb +86 -0
data/lib/html_proofer/url_validator.rb +16 -0
data/lib/{html-proofer → html_proofer}/utils.rb +6 -9
data/lib/{html-proofer → html_proofer}/version.rb +1 -1
data/lib/html_proofer/xpath_functions.rb +10 -0
data/lib/html_proofer.rb +55 -0
metadata +51 -30
data/lib/html-proofer/cache.rb +0 -194
data/lib/html-proofer/check/favicon.rb +0 -29
data/lib/html-proofer/check/html.rb +0 -37
data/lib/html-proofer/check/images.rb +0 -48
data/lib/html-proofer/check/links.rb +0 -182
data/lib/html-proofer/check/opengraph.rb +0 -46
data/lib/html-proofer/check/scripts.rb +0 -42
data/lib/html-proofer/check.rb +0 -75
data/lib/html-proofer/element.rb +0 -261
data/lib/html-proofer/issue.rb +0 -65
data/lib/html-proofer/middleware.rb +0 -82
data/lib/html-proofer/runner.rb +0 -248
data/lib/html-proofer/url_validator.rb +0 -237

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d06dbda8bf9baad3be96b5565fcb86de0892d4241c6b7ede08c3c2d7203f6752
-  data.tar.gz: da895d696b7b9d1f3ca9c2e504e0c375ce9942f53dd5c2174f80570d32e3ad5b
+  metadata.gz: 1c88c30b96819085add5b171ae6c2b61ed24637d4acf25a0258b19fed2e6aa6b
+  data.tar.gz: 14d8383987c15edcb2868f35e7d6a56369cadd296bf0e31d5e0ed5d3ccd0f6d3
 SHA512:
-  metadata.gz: 693f677cb91b9b0e79135ef27ed2771d34d7e5fee5bd368b20b5791a4e25a468ba769d299c2cd1417eccd989d9ac1290cfaf9dfd90b9534fa0af0440382e4e0f
-  data.tar.gz: c6c33f309e3f8b00dc1721653f2e2099c647c9bd8d3b4ecaab4ec7b97161925911d1e6e3c8cabb30b9b9eb2274fed7238ed338da7d85d25b7c6981d537f6435d
+  metadata.gz: 3b2f276abff0c540bb08fc26d6267bacefc8e1a9bf2f5e2aa1dfb3294398b9a63fbcfe906baa24b013dd9bf19052f826ce5d47bfce53572381aba2e948b8f918
+  data.tar.gz: fff8dcd929eed104ab69543f49e7cba4005165f909505d82bd857d1791545593d61aa988a2ef9d91c9887b2a2c57aab8bffeb3e474f8b01521827a6abbf8d475

data/bin/htmlproofer CHANGED Viewed

@@ -15,43 +15,32 @@ Mercenary.program(:htmlproofer) do |p|
   p.description 'Runs the HTML-Proofer suite on the files in PATH. For more details, see the README.'
-  p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href` (this is the default for HTML5).'
-  p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, ignores the `href="#"`'
+  p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, assumes `href="#"` anchors are valid'
+  p.option 'allow_missing_href', '--allow-missing-href', 'If `true`, does not flag `a` tags missing `href`. In HTML5, this is technically allowed, but could also be human error.'
   p.option 'as_links', '--as-links', 'Assumes that `PATH` is a comma-separated array of links to check.'
-  p.option 'alt_ignore', '--alt-ignore image1,[image2,...]', Array, 'A comma-separated list of Strings or RegExps containing `img`s whose missing `alt` tags are safe to ignore'
-  p.option 'assume_extension', '--assume-extension', 'Automatically add extension (e.g. `.html`) to file paths, to allow extensionless URLs (as supported by Jekyll 3 and GitHub Pages) (default: `false`).'
-  p.option 'checks_to_ignore', '--checks-to-ignore check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you do not want to run (default: `[]`)'
-  p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists). This slows the checker down (default: `false`).'
-  p.option 'check_favicon', '--check-favicon', 'Enables the favicon checker (default: `false`).'
-  p.option 'check_html', '--check-html', 'Enables HTML validation errors from Nokogumbo (default: `false`).'
-  p.option 'check_img_http', '--check-img-http', 'Fails an image if it\'s marked as `http` (default: `false`).'
-  p.option 'check_opengraph', '--check-opengraph', 'Enables the Open Graph checker (default: `false`).'
+  p.option 'assume_extension', '--assume-extension <ext>', 'Automatically add specified extension to files for internal links, to allow extensionless URLs (as supported by most servers) (default: `.html`).'
+  p.option 'checks', '--checks check1,[check2,...]', Array, 'A comma-separated list of Strings indicating which checks you want to run (default: `["Links", "Images", "Scripts"]`)'
+  p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the webpage exists) (default: `true`).'
   p.option 'check_sri', '--check-sri', 'Check that `<link>` and `<script>` external resources use SRI (default: `false`).'
   p.option 'directory_index_file', '--directory-index-file <filename>', String, 'Sets the file to look for when a link refers to a directory. (default: `index.html`)'
-  p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker, which can take a lot of time (default: `false`)'
-  p.option 'empty_alt_ignore', '--empty-alt-ignore', 'If `true`, ignores images with empty alt tags'
-  p.option 'error_sort', '--error-sort <sort>', String, 'Defines the sort order for error output. Can be `:path`, `:desc`, or `:status` (default: `:path`).'
-  p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `false`).'
-  p.option 'extension', '--extension <ext>', String, 'The extension of your HTML files including the dot. (default: `.html`)'
-  p.option 'external_only', '--external_only', 'Only checks problems with external references'
-  p.option 'file_ignore', '--file-ignore file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
-  p.option 'http_status_ignore', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
-  p.option 'internal_domains', '--internal-domains domain1,[domain2,...]', Array, 'A comma-separated list of Strings containing domains that will be treated as internal urls.'
-  p.option 'report_invalid_tags', '--report-invalid-tags', 'When `check_html` is enabled, HTML markup that is unknown to Nokogumbo are reported as errors (default: `false`)'
-  p.option 'report_missing_names', '--report-missing-names', 'When `check_html` is enabled, HTML markup that are missing entity names are reported as errors (default: `false`)'
-  p.option 'report_script_embeds', '--report-script-embeds', 'When `check_html` is enabled, `script` tags containing markup are reported as errors (default: `false`)'
-  p.option 'report_missing_doctype', '--report-missing-doctype', 'When `check_html` is enabled, HTML markup with missing or out-of-order `DOCTYPE` are reported as errors (default: `false`)'
-  p.option 'report_eof_tags', '--report-eof-tags', 'When `check_html` is enabled, HTML markup with tags that are malformed are reported as errors (default: `false`)'
-  p.option 'report_mismatched_tags', '--report-mismatched-tags', 'When `check_html` is enabled, HTML markup with mismatched tags are reported as errors (default: `false`)'
+  p.option 'disable_external', '--disable-external', 'If `true`, does not run the external link checker (default: `false`)'
+  p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `true`).'
+  p.option 'extensions', '--extensions ext1,[ext2,...[', Array, 'A comma-separated list of Strings indicating the file extensions you would like to check (including the dot) (default: `.html`)'
+  p.option 'ignore_files', '--ignore-files file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
+  p.option 'ignore_empty_mailto', '--ignore-empty-mailto', 'If `true`, allows `mailto:` `href`s which do not contain an email address'
+  p.option 'ignore_missing_alt', '--empty-alt-ignore', 'If `true`, ignores images with empty/missing alt tags'
+  p.option 'ignore_status_codes', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
+  p.option 'ignore_urls', '--ignore-urls link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. This affects all HTML attributes, such as `alt` tags on images.'
   p.option 'log_level', '--log-level <level>', String, 'Sets the logging level, as determined by Yell. One of `:debug`, `:info`, `:warn`, `:error`, or `:fatal`. (default: `:info`)'
   p.option 'only_4xx', '--only-4xx', 'Only reports errors for links that fall within the 4xx status code range'
-  p.option 'storage_dir', '--storage-dir PATH', String, 'Directory where to store the cache log (default: "tmp/.htmlproofer")'
-  p.option 'timeframe', '--timeframe <time>', String, 'A string representing the caching timeframe.'
-  p.option 'typhoeus_config', '--typhoeus-config CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
-  p.option 'hydra_config', '--hydra-config CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
-  p.option 'url_ignore', '--url-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. It affects all HTML attributes. Note that non-HTTP(S) URIs are always ignored'
-  p.option 'url_swap', '--url-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
   p.option 'root_dir', '--root-dir PATH', String, 'The absolute path to the directory serving your html-files.'
+  p.option 'swap_attributes', '--swap-attributes CONFIG', String, 'JSON-formatted config that maps element names to the preferred attribute to check (default: `{}`).'
+  p.option 'swap_urls', '--swap-urls re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`. The escape sequences `\\:` should be used to produce literal `:`s.'
+  p.option 'typhoeus', '--typhoeus CONFIG', String, 'JSON-formatted string of Typhoeus config. Will override the html-proofer defaults.'
+  p.option 'hydra', '--hydra CONFIG', String, 'JSON-formatted string of Hydra config. Will override the html-proofer defaults.'
+  p.option 'parallel', '--parallel CONFIG', String, 'JSON-formatted string of Parallel config. Will override the html-proofer defaults.'
+  p.option 'cache', '--cache CONFIG', String, 'JSON-formatted string of cache config. Will override the html-proofer defaults.'
   p.action do |args, opts|
     args = ['.'] if args.empty?
@@ -66,46 +55,31 @@ Mercenary.program(:htmlproofer) do |p|
     end
     # some minor manipulation of a special option
-    unless opts['url_swap'].nil?
-      options[:url_swap] = {}
-      opts['url_swap'].each do |s|
+    unless opts['swap_urls'].nil?
+      options[:swap_urls] = {}
+      opts['swap_urls'].each do |s|
         splt = s.split(/(?<!\\):/, 2)
         re = splt[0].gsub(/\\:/, ':')
         string = splt[1].gsub(/\\:/, ':')
-        options[:url_swap][Regexp.new(re)] = string
+        options[:swap_urls][Regexp.new(re)] = string
       end
     end
-    options[:error_sort] = opts['error-sort'].to_sym unless opts['error-sort'].nil?
     options[:log_level] = opts['log_level'].to_sym unless opts['log_level'].nil?
-    options[:validation] = HTMLProofer::Configuration::VALIDATION_DEFAULTS.dup
-    options[:validation][:report_script_embeds] = opts['report_script_embeds'] unless opts['report_script_embeds'].nil?
-    options[:validation][:report_missing_names] = opts['report_missing_names'] unless opts['report_missing_names'].nil?
-    options[:validation][:report_invalid_tags] = opts['report_invalid_tags'] unless opts['report_invalid_tags'].nil?
-    options[:validation][:report_missing_doctype] = opts['report_missing_doctype'] unless opts['report_missing_doctype'].nil?
-    options[:validation][:report_eof_tags] = opts['report_eof_tags'] unless opts['report_eof_tags'].nil?
-    options[:validation][:report_mismatched_tags] = opts['report_mismatched_tags'] unless opts['report_mismatched_tags'].nil?
-    options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus_config', opts['typhoeus_config'], symbolize_names: false) unless opts['typhoeus_config'].nil?
-    options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra_config', opts['hydra_config']) unless opts['hydra_config'].nil?
+    options[:typhoeus] = HTMLProofer::Configuration.parse_json_option('typhoeus', opts['typhoeus'], symbolize_names: false) unless opts['typhoeus'].nil?
+    options[:hydra] = HTMLProofer::Configuration.parse_json_option('hydra', opts['hydra']) unless opts['hydra'].nil?
+    options[:parallel] = HTMLProofer::Configuration.parse_json_option('parallel', opts['parallel']) unless opts['parallel'].nil?
+    options[:cache] = HTMLProofer::Configuration.parse_json_option('cache', opts['cache']) unless opts['cache'].nil?
-    unless opts['timeframe'].nil?
-      options[:cache] ||= {}
-      options[:cache][:timeframe] = opts['timeframe'] unless opts['timeframe'].nil?
-    end
-    unless opts['storage_dir'].nil?
-      options[:cache] ||= {}
-      options[:cache][:storage_dir] = opts['storage_dir'] unless opts['storage_dir'].nil?
-    end
+    options[:swap_attributes] = HTMLProofer::Configuration.parse_json_option('swap_attributes', opts['swap_attributes'], symbolize_names: false) unless opts['swap_attributes'].nil?
-    options[:http_status_ignore] = Array(options[:http_status_ignore]).map(&:to_i)
+    options[:ignore_status_codes] = Array(options[:ignore_status_codes]).map(&:to_i)
     paths = path.split(',')
     if opts['as_links']
-      links = path.delete(' ').split(',')
+      links = path.split(',').map(&:strip)
       HTMLProofer.check_links(links, options).run
     elsif File.directory?(paths.first)
       HTMLProofer.check_directories(paths, options).run

data/lib/html-proofer.rb CHANGED Viewed

@@ -1,56 +1,3 @@
-# rubocop:disable Naming/FileName
 # frozen_string_literal: true
-def require_all(path)
-  dir = File.join(File.dirname(__FILE__), path)
-  Dir[File.join(dir, '*.rb')].sort.each do |f|
-    require f
-  end
-end
-require_relative 'html-proofer/utils'
-require_all 'html-proofer'
-require_all 'html-proofer/check'
-require 'parallel'
-require 'fileutils'
-begin
-  require 'awesome_print'
-  require 'pry-byebug'
-rescue LoadError; end # rubocop:disable Lint/SuppressedException
-module HTMLProofer
-  def self.check_file(file, options = {})
-    raise ArgumentError unless file.is_a?(String)
-    raise ArgumentError, "#{file} does not exist" unless File.exist?(file)
-    options[:type] = :file
-    HTMLProofer::Runner.new(file, options)
-  end
-  def self.check_directory(directory, options = {})
-    raise ArgumentError unless directory.is_a?(String)
-    raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
-    options[:type] = :directory
-    HTMLProofer::Runner.new([directory], options)
-  end
-  def self.check_directories(directories, options = {})
-    raise ArgumentError unless directories.is_a?(Array)
-    options[:type] = :directory
-    directories.each do |directory|
-      raise ArgumentError, "#{directory} does not exist" unless Dir.exist?(directory)
-    end
-    HTMLProofer::Runner.new(directories, options)
-  end
-  def self.check_links(links, options = {})
-    raise ArgumentError unless links.is_a?(Array)
-    options[:type] = :links
-    HTMLProofer::Runner.new(links, options)
-  end
-end
-# rubocop:enable Naming/FileName
+require_relative 'html_proofer'

data/lib/html_proofer/attribute/url.rb ADDED Viewed

@@ -0,0 +1,231 @@
+# frozen_string_literal: true
+# Wraps one URL-valued attribute and answers questions about it: whether it is
+# remote or internal, which file it resolves to on disk, and whether it should
+# be ignored according to the runner's options.
+#
+# The constructor derives +url+ from the raw attribute by removing zero-width
+# spaces, trimming whitespace, joining with +base_url+ (when given), applying
+# the runner's +:swap_urls+ replacements, normalizing, and finally rewriting
+# protocol-relative links to HTTPS. +url+ is +nil+ when the raw attribute is +nil+.
+class HTMLProofer::Attribute::Url < HTMLProofer::Attribute
+  attr_reader :url
+  REMOTE_SCHEMES = %w[http https].freeze
+  # URLs made only of these characters (or %XX escapes) are left un-normalized.
+  # The `#` is escaped because `#$&` inside a regexp literal interpolates the
+  # last-match global; the previous pattern interpolated Regexp.last_match(0),
+  # which made the character class depend on whichever regexp ran last.
+  # NOTE(review): the literal `#`, `$`, `&-;` class is presumably what was intended - confirm.
+  CLEAN_URL_PATTERN = /\A([!\#$&-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+\z/.freeze
+  # @param runner the runner; its +options+, +checked_paths+, +current_path+ and +current_source+ are read
+  # @param link_attribute [String, nil] the raw attribute value
+  # @param base_url [String, nil] when not blank, the URL is joined onto it
+  def initialize(runner, link_attribute, base_url: nil)
+    super
+    @url = nil
+    return if @raw_attribute.nil?
+    @url = @raw_attribute.delete("\u200b").strip
+    @url = Addressable::URI.join(base_url, @url).to_s unless blank?(base_url)
+    swap_urls!
+    clean_url!
+    # convert "//" links to "https://"
+    @url = "https:#{@url}" if @url.start_with?('//')
+  end
+  def to_s
+    @url
+  end
+  # True for hash-only links, or when the path's extension (or the assumed
+  # extension, if the path has none) is listed in the +:extensions+ option.
+  def known_extension?
+    return true if hash_link?
+    ext = File.extname(path)
+    # no extension means we use the assumed one
+    return @runner.options[:extensions].include?(@runner.options[:assume_extension]) if blank?(ext)
+    @runner.options[:extensions].include?(ext)
+  end
+  def unknown_extension?
+    !known_extension?
+  end
+  # True for `javascript:` URLs and for raw attributes matched by +:ignore_urls+.
+  # Always returns a boolean.
+  def ignore?
+    return true if /^javascript:/.match?(@url)
+    ignores_pattern?(@runner.options[:ignore_urls])
+  end
+  def valid?
+    !parts.nil?
+  end
+  # True when the URL parses and has both a host and a path; false (rather
+  # than an exception) when the URL cannot be parsed.
+  def path?
+    uri = parts
+    return false if uri.nil?
+    !uri.host.nil? && !uri.path.nil?
+  end
+  # The parsed URI, or +nil+ when parsing fails.
+  def parts
+    @parts ||= Addressable::URI.parse @url
+  rescue URI::Error, Addressable::URI::InvalidURIError
+    @parts = nil
+  end
+  # The unencoded path component, or +nil+ when the URL cannot be parsed.
+  def path
+    Addressable::URI.unencode parts.path unless parts.nil?
+  end
+  # The URL fragment.
+  # NOTE(review): this shadows Object#hash, so instances should not be used as Hash keys.
+  def hash
+    parts&.fragment
+  end
+  # Does the URL have a hash?
+  def hash?
+    !blank?(hash)
+  end
+  def scheme
+    parts&.scheme
+  end
+  def remote?
+    REMOTE_SCHEMES.include?(scheme)
+  end
+  def http?
+    scheme == 'http'
+  end
+  def https?
+    scheme == 'https'
+  end
+  def non_http_remote?
+    !scheme.nil? && !remote?
+  end
+  def host
+    parts&.host
+  end
+  # Host followed by path; either part is treated as empty when missing, so
+  # an unparsable URL yields '' instead of raising.
+  def domain_path
+    "#{host}#{path}"
+  end
+  def query_values
+    parts&.query_values
+  end
+  # checks if a file exists relative to the current pwd
+  # Results are memoized per absolute path in the runner's +checked_paths+.
+  def exists?
+    return true if base64?
+    # resolve once: absolute_path consults the filesystem
+    target = absolute_path
+    return @runner.checked_paths[target] if @runner.checked_paths.key?(target)
+    @runner.checked_paths[target] = File.exist?(target)
+  end
+  def base64?
+    /^data:image/.match?(@raw_attribute)
+  end
+  # The resolved file path (or the runner's current path when the URL has no
+  # path), expanded against the working directory.
+  def absolute_path
+    path = file_path || @runner.current_path
+    File.expand_path(path, Dir.pwd)
+  end
+  # Maps the URL path to a file on disk, honouring +:root_dir+,
+  # +:assume_extension+ and +:directory_index_file+. Returns +nil+ when the
+  # URL has no path.
+  def file_path
+    return if path.nil? || path.empty?
+    assumed_ext = @runner.options[:assume_extension]
+    path_dot_ext = ''
+    path_dot_ext = path + assumed_ext unless blank?(assumed_ext)
+    base = if absolute_path?(path) # path relative to root
+             # either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname
+             @runner.options[:root_dir] || (File.directory?(@runner.current_source) ? @runner.current_source : File.dirname(@runner.current_source))
+           # relative links, path is a file
+           elsif File.exist?(File.expand_path(path, @runner.current_source)) || File.exist?(File.expand_path(path_dot_ext, @runner.current_source))
+             File.dirname(@runner.current_path)
+           # relative links in nested dir, path is a file
+           elsif File.exist?(File.join(File.dirname(@runner.current_path), path)) || File.exist?(File.join(File.dirname(@runner.current_path), path_dot_ext)) # rubocop:disable Lint/DuplicateBranch
+             File.dirname(@runner.current_path)
+           # relative link, path is a directory
+           else
+             @runner.current_path
+           end
+    file = File.join(base, path)
+    if assumed_ext && File.file?("#{file}#{assumed_ext}")
+      file = "#{file}#{assumed_ext}"
+    elsif File.directory?(file) && !unslashed_directory?(file) # implicit index support
+      file = File.join file, @runner.options[:directory_index_file]
+    end
+    file
+  end
+  def unslashed_directory?(file)
+    File.directory?(file) && !file.end_with?(File::SEPARATOR)
+  end
+  def absolute_path?(path)
+    path.start_with?('/')
+  end
+  # path is external to the file
+  def external?
+    !internal?
+  end
+  def internal?
+    relative_link? || internal_absolute_link? || hash_link?
+  end
+  def internal_absolute_link?
+    url.start_with?('/')
+  end
+  def relative_link?
+    return false if remote?
+    hash_link? || param_link? || url.start_with?('.') || url.match?(/^\S/)
+  end
+  # True when the link is only a fragment or only a query string.
+  # (Previously called the undefined `hash_link` / `param_link` and raised NameError.)
+  def link_points_to_same_page?
+    hash_link? || param_link?
+  end
+  def hash_link?
+    url.start_with?('#')
+  end
+  def param_link?
+    url.start_with?('?')
+  end
+  # The URL with its first "#fragment" removed. A plain string pattern is used
+  # so regexp metacharacters in the fragment are matched literally.
+  def sans_hash
+    @url.to_s.sub("##{hash}", '')
+  end
+  # catch any obvious issues, like strings in port numbers
+  private def clean_url!
+    return if CLEAN_URL_PATTERN.match?(@url)
+    @url = Addressable::URI.parse(@url).normalize.to_s
+  end
+  # Applies every `pattern => replacement` pair from +:swap_urls+ to the URL via gsub.
+  private def swap_urls!
+    replacements = @runner.options[:swap_urls]
+    return @url if blank?(replacements)
+    replacements.each do |link, replace|
+      @url = @url.gsub(link, replace)
+    end
+  end
+  # True when the raw attribute equals one of the given Strings or matches one
+  # of the given Regexps; false when +links_to_ignore+ is not an Array.
+  private def ignores_pattern?(links_to_ignore)
+    return false unless links_to_ignore.is_a?(Array)
+    links_to_ignore.any? do |link_to_ignore|
+      case link_to_ignore
+      when String
+        link_to_ignore == @raw_attribute
+      when Regexp
+        link_to_ignore.match?(@raw_attribute)
+      end
+    end
+  end
+end

data/lib/html_proofer/attribute.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+module HTMLProofer
+  # Pairs a raw attribute value with the runner that is processing it.
+  class Attribute
+    include HTMLProofer::Utils
+    attr_reader :raw_attribute
+    # Keeps the runner and the untouched attribute value. Extra keyword
+    # arguments are accepted and discarded.
+    def initialize(runner, raw_attribute, **_ignored)
+      @raw_attribute = raw_attribute
+      @runner = runner
+    end
+  end
+end

data/lib/html_proofer/cache.rb ADDED Viewed

@@ -0,0 +1,236 @@
+# frozen_string_literal: true
+require 'date'
+require 'json'
+require 'uri'
+module HTMLProofer
+  # JSON-backed record of previously checked internal and external URLs.
+  #
+  # The cache is enabled only when non-blank options are given. When enabled,
+  # the log is read from `<storage_dir>/<cache_file>` and entries older than
+  # the configured +:timeframe+ are scheduled for re-checking by #retrieve_urls.
+  class Cache
+    include HTMLProofer::Utils
+    CACHE_VERSION = 2
+    DEFAULT_STORAGE_DIR = File.join('tmp', '.htmlproofer')
+    DEFAULT_CACHE_FILE_NAME = 'cache.json'
+    URI_REGEXP = URI::DEFAULT_PARSER.make_regexp
+    attr_reader :exists, :cache_log, :storage_dir, :cache_file
+    # @param runner the runner; only its +logger+ is read here
+    # @param options [Hash, nil] cache options (+:timeframe+, +:storage_dir+,
+    #   +:cache_file+); blank options disable the cache
+    # @raise [ArgumentError] when the cache is enabled and +:timeframe+ is missing or malformed
+    def initialize(runner, options)
+      @runner = runner
+      @logger = @runner.logger
+      @cache_datetime = DateTime.now
+      @cache_time = @cache_datetime.to_time
+      @enabled = !blank?(options)
+      return unless @enabled
+      setup_cache!(options)
+      @parsed_timeframe = parsed_timeframe(options[:timeframe])
+    end
+    # Whether caching was requested at construction time.
+    def enabled?
+      @enabled
+    end
+    # True when +time+ (a Time or a parseable String) lies between the start
+    # of the configured timeframe and the moment this cache was created.
+    def within_timeframe?(time)
+      return false if time.nil?
+      time = Time.parse(time) if time.is_a?(String)
+      (@parsed_timeframe..@cache_time).cover?(time)
+    end
+    # Converts a timeframe such as "2w" or "6h" into the Time that many units
+    # before the cache was created. Units: M (months), w, d, h.
+    # @raise [ArgumentError] when the value has no "<digits><unit>" part or the unit is unknown
+    def parsed_timeframe(timeframe)
+      match = timeframe.to_s.match(/(\d+)(\D)/)
+      # a nil or malformed timeframe used to surface as NoMethodError
+      raise ArgumentError, "#{timeframe.inspect} is not a valid timeframe!" if match.nil?
+      time, date = match.captures
+      time = time.to_i
+      case date
+      when 'M'
+        time_ago(time, :months)
+      when 'w'
+        time_ago(time, :weeks)
+      when 'd'
+        time_ago(time, :days)
+      when 'h'
+        time_ago(time, :hours)
+      else
+        raise ArgumentError, "#{date} is not a valid timeframe!"
+      end
+    end
+    # Appends one occurrence of an internal URL (with whether it was found) to the log.
+    def add_internal(url, metadata, found)
+      return unless enabled?
+      @cache_log[:internal][url] = { time: @cache_time, metadata: [] } if @cache_log[:internal][url].nil?
+      @cache_log[:internal][url][:metadata] << construct_internal_link_metadata(metadata, found)
+    end
+    # Records the outcome of an external check; any 2xx status counts as found.
+    def add_external(url, filenames, status_code, msg)
+      return unless enabled?
+      found = status_code.between?(200, 299)
+      clean_url = cleaned_url(url)
+      @cache_log[:external][clean_url] = { time: @cache_time.to_s, found: found, status_code: status_code, message: msg, metadata: filenames }
+    end
+    # Drops cached URLs that were not detected this run and returns the
+    # detected URLs that still need checking.
+    def detect_url_changes(urls_detected, type)
+      additions = determine_additions(urls_detected, type)
+      determine_deletions(urls_detected, type)
+      additions
+    end
+    private def construct_internal_link_metadata(metadata, found)
+      {
+        source: metadata[:source],
+        current_path: metadata[:current_path],
+        line: metadata[:line],
+        base_url: metadata[:base_url],
+        found: found
+      }
+    end
+    # prepare to add new URLs detected
+    private def determine_additions(urls_detected, type)
+      additions = urls_detected.reject do |url, metadata|
+        if @cache_log[type].include?(url)
+          @cache_log[type][url][:metadata] = metadata
+          # if this is false, we're trying again
+          if type == :external
+            @cache_log[type][url][:found]
+          else
+            @cache_log[type][url][:metadata].none? { |m| m[:found] }
+          end
+        else
+          @logger.log :debug, "Adding #{url} to #{type} cache"
+          false
+        end
+      end
+      new_link_count = additions.length
+      new_link_text = pluralize(new_link_count, "new #{type} link", "new #{type} links")
+      @logger.log :debug, "Adding #{new_link_text} to the cache"
+      additions
+    end
+    # remove from cache URLs that no longer exist
+    private def determine_deletions(urls_detected, type)
+      deletions = 0
+      @cache_log[type].delete_if do |url, _|
+        if urls_detected.include?(url)
+          false
+        elsif url_matches_type?(url, type)
+          @logger.log :debug, "Removing #{url} from #{type} cache"
+          deletions += 1
+          true
+        end
+      end
+      del_link_text = pluralize(deletions, "outdated #{type} link", "outdated #{type} links")
+      @logger.log :debug, "Removing #{del_link_text} from the cache"
+    end
+    # Serializes the log as JSON to the cache file; a no-op when disabled.
+    def write
+      return unless enabled?
+      File.write(@cache_file, @cache_log.to_json)
+    end
+    # Returns the subset of +urls_detected+ that must be checked this run:
+    # URLs that need (re)checking plus cached entries outside the timeframe.
+    def retrieve_urls(urls_detected, type)
+      # if there are no urls, bail
+      return {} if urls_detected.empty?
+      urls_detected = urls_detected.transform_keys do |url|
+        cleaned_url(url)
+      end
+      urls_to_check = detect_url_changes(urls_detected, type)
+      @cache_log[type].each_pair do |url, cache|
+        next if within_timeframe?(cache[:time])
+        urls_to_check[url] = cache[:metadata] # recheck expired links
+      end
+      urls_to_check
+    end
+    def empty?
+      blank?(@cache_log) || (@cache_log[:internal].empty? && @cache_log[:external].empty?)
+    end
+    def size(type)
+      @cache_log[type].size
+    end
+    # Resolves the cache file location and loads the existing log. A missing
+    # or blank file, or a log whose version is absent or differs from
+    # CACHE_VERSION, yields a fresh empty structure. (A version mismatch used
+    # to leave @cache_log nil, which broke every later lookup.)
+    private def setup_cache!(options)
+      default_structure = {
+        version: CACHE_VERSION,
+        internal: {},
+        external: {}
+      }
+      @storage_dir = options[:storage_dir] || DEFAULT_STORAGE_DIR
+      FileUtils.mkdir_p(storage_dir) unless Dir.exist?(storage_dir)
+      cache_file_name = options[:cache_file] || DEFAULT_CACHE_FILE_NAME
+      @cache_file = File.join(storage_dir, cache_file_name)
+      return (@cache_log = default_structure) unless File.exist?(@cache_file)
+      contents = File.read(@cache_file)
+      return (@cache_log = default_structure) if blank?(contents)
+      log = JSON.parse(contents, symbolize_names: true)
+      # unversioned or differently-versioned caches are discarded
+      return (@cache_log = default_structure) unless log[:version] == CACHE_VERSION
+      # symbolize_names turned the URL keys into symbols; restore them to strings
+      log[:internal] = log[:internal].transform_keys(&:to_s)
+      log[:external] = log[:external].transform_keys(&:to_s)
+      @cache_log = log
+    end
+    # The Time +measurement+ units before the cache's creation time.
+    private def time_ago(measurement, unit)
+      case unit
+      when :months
+        @cache_datetime >> -measurement
+      when :weeks
+        @cache_datetime - (measurement * 7)
+      when :days
+        @cache_datetime - measurement
+      when :hours
+        @cache_datetime - Rational(measurement / 24.0)
+      end.to_time
+    end
+    # Internal entries are those that do not look like a URI; external ones do.
+    private def url_matches_type?(url, type)
+      return true if type == :internal && url !~ URI_REGEXP
+      return true if type == :external && url =~ URI_REGEXP
+      false
+    end
+    # Normalizes the URL and drops a single trailing '/', '#' or '?' (unless
+    # that character is the whole URL).
+    private def cleaned_url(url)
+      normalized = escape_unescape(url)
+      return normalized unless normalized.end_with?('/', '#', '?') && normalized.length > 1
+      normalized[0..-2]
+    end
+    private def escape_unescape(url)
+      Addressable::URI.parse(url).normalize.to_s
+    end
+  end
+end