RubyGems - html-proofer - Versions diffs - 2.6.4 → 3.0.0 - Mend

html-proofer 2.6.4 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

checksums.yaml +4 -4
data/bin/{htmlproof → htmlproofer} +31 -24
data/lib/html-proofer.rb +47 -0
data/lib/html-proofer/cache.rb +153 -0
data/lib/html-proofer/check.rb +63 -0
data/lib/{html/proofer/checks → html-proofer/check}/favicon.rb +2 -8
data/lib/html-proofer/check/html.rb +21 -0
data/lib/html-proofer/check/images.rb +47 -0
data/lib/{html/proofer/checks → html-proofer/check}/links.rb +40 -48
data/lib/html-proofer/check/scripts.rb +28 -0
data/lib/html-proofer/configuration.rb +56 -0
data/lib/html-proofer/element.rb +165 -0
data/lib/{html/proofer/check_runner → html-proofer}/issue.rb +8 -10
data/lib/html-proofer/log.rb +49 -0
data/lib/html-proofer/runner.rb +160 -0
data/lib/html-proofer/url_validator.rb +218 -0
data/lib/html-proofer/utils.rb +40 -0
data/lib/html-proofer/version.rb +3 -0
metadata +20 -20
data/lib/html/proofer.rb +0 -191
data/lib/html/proofer/cache.rb +0 -141
data/lib/html/proofer/check_runner.rb +0 -70
data/lib/html/proofer/checkable.rb +0 -168
data/lib/html/proofer/checks/html.rb +0 -46
data/lib/html/proofer/checks/images.rb +0 -54
data/lib/html/proofer/checks/scripts.rb +0 -40
data/lib/html/proofer/configuration.rb +0 -48
data/lib/html/proofer/log.rb +0 -42
data/lib/html/proofer/url_validator.rb +0 -222
data/lib/html/proofer/utils.rb +0 -42
data/lib/html/proofer/version.rb +0 -5
data/lib/html/proofer/xpathfunctions.rb +0 -9

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 949542fac56daeafe93fbf17014e96cba9deba3c
-  data.tar.gz: b42e8375b5886aa310fcb1920ca0c676e9dedcc3
+  metadata.gz: 1ce6c96e75683a8da4de60754d83aef138221d02
+  data.tar.gz: b341f1ee5b7237e58cba8e775dfaa152eaeeb0a5
 SHA512:
-  metadata.gz: 6d2f2741cc3c6c7ca32220f1e338986148d5bf4c026e1bd5b5e7955652f6a0807d9037a4d71a0aaec7c81e3c9d58ae09ef649388c5be589c08991d27c2a88c33
-  data.tar.gz: 5a478a174241a58cf29192bb62a964703ecb5e90060ac702183067c42a894858dbad4cf5a7591cb216d92785b9216f1eaca2232bcfa248da2af20d206ac1daf6
+  metadata.gz: 000ff0816076d1e8b738526f380e88309fdc04d93fa203a43d62960696064062b7942647a5f79be0f8e7239ca69e67f08f669866352cabf67152a6f95beb643b
+  data.tar.gz: 1863ee52843f2d5071170aaed79ce2ae9f6902ffb1a99d93021b746eb9d3d792d460d4caff2f703907291915b0bc8861af8967cd4f5a394e0a6604aa23e0dec0

data/bin/{htmlproof → htmlproofer} RENAMED

@@ -3,19 +3,20 @@ STDOUT.sync = true
 $LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w( .. lib ))
-require 'html/proofer'
+require 'html-proofer'
 require 'mercenary'
-Mercenary.program(:htmlproof) do |p|
-  p.version HTML::Proofer::VERSION
+Mercenary.program(:htmlproofer) do |p|
+  p.version HTMLProofer::VERSION
   p.description %(Test your rendered HTML files to make sure they're accurate.)
-  p.syntax 'htmlproof PATH [options]'
+  p.syntax 'htmlproofer PATH [options]'
   p.description 'Runs the HTML-Proofer suite on the files in PATH. For more details, see the README.'
   p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, ignores the `href` `#`'
   p.option 'as_links', '--as-links', 'Assumes that `PATH` is a comma-separated array of links to check.'
   p.option 'alt_ignore', '--alt-ignore image1,[image2,...]', Array, 'A comma-separated list of Strings or RegExps containing `img`s whose missing `alt` tags are safe to ignore'
+  p.option 'assume_extension', '--assume-extension', 'Automatically add extension (e.g. `.html`) to file paths, to allow extensionless URLs (as supported by Jekyll 3 and GitHub Pages) (default: `false`).'
   p.option 'checks_to_ignore', '--checks-to-ignore check1,[check2,...]', Array, ' An array of Strings indicating which checks you\'d like to not perform.'
   p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the website exists). This slows the checker down (default: `false`).'
   p.option 'check_favicon', '--check-favicon', 'Enables the favicon checker (default: `false`).'
@@ -25,16 +26,17 @@ Mercenary.program(:htmlproof) do |p|
   p.option 'empty_alt_ignore', '--empty-alt-ignore', 'If `true`, ignores images with empty alt tags'
   p.option 'error_sort', '--error-sort SORT', 'Defines the sort order for error output. Can be `:path`, `:desc`, or `:status` (default: `path`).'
   p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `false`).'
-  p.option 'ext', '--ext EXT', String, 'The extension of your HTML files including the dot. (default: `.html`)'
+  p.option 'extension', '--extension EXT', String, 'The extension of your HTML files including the dot. (default: `.html`)'
   p.option 'external_only', '--external_only', 'Only checks problems with external references'
   p.option 'file_ignore', '--file-ignore file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
-  p.option 'href_ignore', '--href-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing `href`s that are safe to ignore. Note that non-HTTP(S) URIs are always ignored. **Will be renamed in a future release.**'
-  p.option 'href_swap', '--href-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms links that match `RegExp` into `String` via `gsub`. **Will be renamed in a future release.**'
-  p.option 'ignore_script_embeds', '--ignore-script-embeds', 'Ignore `check_html` errors associated with `script`s (default: `false`)'
+  p.option 'http_status_ignore', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
+  p.option 'report_invalid_tags', '--report-invalid-tags', 'Ignore `check_html` errors associated with unknown markup (default: `false`)'
+  p.option 'report_script_embeds', '--report-script-embeds', 'Ignore `check_html` errors associated with `script`s (default: `false`)'
+  p.option 'log_level', '--log-level <level>', String, 'Sets the logging level, as determined by Yell'
   p.option 'only_4xx', '--only-4xx', 'Only reports errors for links that fall within the 4xx status code range'
+  p.option 'timeframe', '--timeframe <time>', String, 'A string representing the caching timeframe.'
   p.option 'url_ignore', '--url-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. It affects all HTML attributes. Note that non-HTTP(S) URIs are always ignored'
-  p.option 'verbose', '--verbose', 'If `true`, outputs extra information as the checking happens. Useful for debugging. **Will be deprecated in a future release.**'
-  p.option 'verbosity', '--verbosity', String, 'Sets the logging level, as determined by Yell'
+  p.option 'url_swap', '--url-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`.'
   p.action do |args, opts|
     args = ['.'] if args.empty?
@@ -45,34 +47,39 @@ Mercenary.program(:htmlproof) do |p|
     # prepare everything to go to proofer
     p.options.select { |o| !opts[o.config_key].nil? }.each do |option|
       if option.return_type.to_s == 'Array' # TODO: is_a? doesn't work here?
-        opts[option.config_key] = opts[option.config_key].map { |i| HTML::Proofer::Configuration.to_regex?(i) }
+        opts[option.config_key] = opts[option.config_key].map { |i| HTMLProofer::Configuration.to_regex?(i) }
       end
       options[option.config_key.to_sym] = opts[option.config_key]
     end
     # some minor manipulation of a special option
-    unless opts['href_swap'].nil?
-      options[:href_swap] = {}
-      opts['href_swap'].each do |s|
+    unless opts['url_swap'].nil?
+      options[:url_swap] = {}
+      opts['url_swap'].each do |s|
         pair = s.split(':', 2)
-        options[:href_swap][Regexp.new(pair[0])] = pair[1]
+        options[:url_swap][Regexp.new(pair[0])] = pair[1]
       end
     end
-    # check for ignore_scripts_embeds as it should be set in :validation
-    unless opts['ignore_script_embeds'].nil?
-      options[:validation] = { :ignore_script_embeds => true }
-    end
     options[:error_sort] = opts['error-sort'].to_sym unless opts['error-sort'].nil?
-    options[:verbosity] = opts['verbosity'].to_sym unless opts['verbosity'].nil?
+    options[:log_level] = opts['log_level'].to_sym unless opts['log_level'].nil?
     # FIXME: this is gross
     options[:validation] = {}
-    options[:validation][:ignore_script_embeds] = opts['ignore_script_embeds']
+    options[:validation][:report_script_embeds] = opts['report_script_embeds']
+    options[:validation][:report_invalid_tags] = opts['report_invalid_tags']
-    path = path.delete(' ').split(',') if opts['as_links']
+    options[:cache] = {}
+    options[:cache][:timeframe] = opts['timeframe'] unless opts['timeframe'].nil?
-    HTML::Proofer.new(path, options).run
+    paths = path.split(',')
+    if opts['as_links']
+      links = path.delete(' ').split(',')
+      HTMLProofer.check_links(links, options).run
+    elsif File.directory?(paths.first)
+      HTMLProofer.check_directories(paths, options).run
+    else
+      HTMLProofer.check_file(path, options).run
+    end
   end
 end

data/lib/html-proofer.rb ADDED

@@ -0,0 +1,47 @@
+def require_all(path)
+  glob = File.join(File.dirname(__FILE__), path, '*.rb')
+  Dir[glob].each do |f|
+    require f
+  end
+end
+require_all 'html-proofer'
+require_all 'html-proofer/check'
+require 'parallel'
+require 'fileutils'
+begin
+  require 'awesome_print'
+rescue LoadError; end
+module HTMLProofer
+  def check_file(file, options = {})
+    fail ArgumentError unless file.is_a?(String)
+    options[:type] = :file
+    HTMLProofer::Runner.new(file, options)
+  end
+  module_function :check_file
+  def check_directory(directory, options = {})
+    fail ArgumentError unless directory.is_a?(String)
+    options[:type] = :directory
+    HTMLProofer::Runner.new([directory], options)
+  end
+  module_function :check_directory
+  def check_directories(directories, options = {})
+    fail ArgumentError unless directories.is_a?(Array)
+    options[:type] = :directory
+    HTMLProofer::Runner.new(directories, options)
+  end
+  module_function :check_directories
+  def check_links(links, options = {})
+    fail ArgumentError unless links.is_a?(Array)
+    options[:type] = :links
+    HTMLProofer::Runner.new(links, options)
+  end
+  module_function :check_links
+end

data/lib/html-proofer/cache.rb ADDED

@@ -0,0 +1,153 @@
+require_relative 'utils'
+require 'json'
+require 'active_support/core_ext/string'
+require 'active_support/core_ext/date'
+require 'active_support/core_ext/numeric/time'
+module HTMLProofer
+  class Cache
+    include HTMLProofer::Utils
+    CACHE_LOG = File.join(STORAGE_DIR, 'cache.log')
+    attr_reader :exists, :cache_log
+    def initialize(logger, options)
+      @logger = logger
+      @cache_log = {}
+      if options.nil? || options.empty?
+        define_singleton_method('use_cache?') { false }
+      else
+        define_singleton_method('use_cache?') { true }
+        @parsed_timeframe = parsed_timeframe(options[:timeframe])
+      end
+      @cache_time = Time.now
+      if File.exist?(CACHE_LOG)
+        contents = File.read(CACHE_LOG)
+        @cache_log = contents.empty? ? {} : JSON.parse(contents)
+      end
+    end
+    def within_timeframe?(time)
+      (@parsed_timeframe..@cache_time).cover?(time)
+    end
+    def urls
+      @cache_log['urls'] || []
+    end
+    def size
+      @cache_log.length
+    end
+    def parsed_timeframe(timeframe)
+      time, date = timeframe.match(/(\d+)(\D)/).captures
+      time = time.to_f
+      case date
+      when 'M'
+        time.months.ago
+      when 'w'
+        time.weeks.ago
+      when 'd'
+        time.days.ago
+      when 'h'
+        time.hours.ago
+      else
+        fail ArgumentError, "#{date} is not a valid timeframe!"
+      end
+    end
+    def add(url, filenames, status, msg = '')
+      data = {
+                :time => @cache_time,
+                :filenames => filenames,
+                :status => status,
+                :message => msg
+             }
+      @cache_log[clean_url(url)] = data
+    end
+    def detect_url_changes(found)
+      existing_urls = @cache_log.keys.map { |url| clean_url(url) }
+      found_urls = found.keys.map { |url| clean_url(url) }
+      # prepare to add new URLs detected
+      additions = found.reject do |url, _|
+        url = clean_url(url)
+        if existing_urls.include?(url)
+          true
+        else
+          @logger.log :debug, "Adding #{url} to cache check"
+          false
+        end
+      end
+      new_link_count = additions.length
+      new_link_text = pluralize(new_link_count, 'link', 'links')
+      @logger.log :info, "Adding #{new_link_text} to the cache..."
+      # remove from cache URLs that no longer exist
+      del = 0
+      @cache_log.delete_if do |url, _|
+        url = clean_url(url)
+        if !found_urls.include?(url)
+          @logger.log :debug, "Removing #{url} from cache check"
+          del += 1
+          true
+        else
+          false
+        end
+      end
+      del_link_text = pluralize(del, 'link', 'links')
+      @logger.log :info, "Removing #{del_link_text} from the cache..."
+      additions
+    end
+    def write
+      File.write(CACHE_LOG, @cache_log.to_json)
+    end
+    def load?
+      @load.nil?
+    end
+    def retrieve_urls(external_urls)
+      urls_to_check = detect_url_changes(external_urls)
+      @cache_log.each_pair do |url, cache|
+        if within_timeframe?(cache['time'])
+          next if cache['message'].empty? # these were successes to skip
+          urls_to_check[url] = cache['filenames'] # these are failures to retry
+        else
+          urls_to_check[url] = cache['filenames'] # pass or fail, recheck expired links
+        end
+      end
+      urls_to_check
+    end
+    # FIXME: there seems to be some discrepenacy where Typhoeus occasionally adds
+    # a trailing slash to URL strings, which causes issues with the cache
+    def slashless_url(url)
+      url.chomp('/')
+    end
+    # FIXME: it seems that Typhoeus actually acts on escaped URLs,
+    # but there's no way to get at that information, and the cache
+    # stores unescaped URLs. Because of this, some links, such as
+    # github.com/search/issues?q=is:open+is:issue+fig are not matched
+    # as github.com/search/issues?q=is%3Aopen+is%3Aissue+fig
+    def unescape_url(url)
+      Addressable::URI.unescape(url)
+    end
+    def clean_url(url)
+      slashless_url(unescape_url(url))
+    end
+  end
+end

data/lib/html-proofer/check.rb ADDED

@@ -0,0 +1,63 @@
+module HTMLProofer
+  # Mostly handles issue management and collecting of external URLs.
+  class Check
+    attr_reader :node, :element, :src, :path, :options, :issues, :external_urls
+    def initialize(src, path, html, options)
+      @src    = src
+      @path   = path
+      @html   = remove_ignored(html)
+      @options = options
+      @issues = []
+      @external_urls = {}
+    end
+    def create_element(node)
+      @node = node
+      Element.new(node, self)
+    end
+    def run
+      fail NotImplementedError, 'HTMLProofer::Check subclasses must implement #run'
+    end
+    def add_issue(desc, line: nil, status: -1)
+      @issues << Issue.new(@path, desc, line: line, status: status)
+    end
+    def add_to_external_urls(url, _)
+      return if @external_urls[url]
+      add_path_for_url(url)
+    end
+    def add_path_for_url(url)
+      if @external_urls[url]
+        @external_urls[url] << @path
+      else
+        @external_urls[url] = [@path]
+      end
+    end
+    def self.subchecks
+      classes = []
+      ObjectSpace.each_object(Class) do |c|
+        next unless c.superclass == self
+        classes << c
+      end
+      classes
+    end
+    def blank?(attr)
+      attr.nil? || attr.empty?
+    end
+    private
+    def remove_ignored(html)
+      html.css('code, pre, tt').each(&:unlink)
+      html
+    end
+  end
+end

data/lib/{html/proofer/checks → html-proofer/check}/favicon.rb RENAMED

@@ -1,14 +1,8 @@
-# encoding: utf-8
-class FaviconCheckable < ::HTML::Proofer::Checkable
-  attr_reader :rel
-end
-class FaviconCheck < ::HTML::Proofer::CheckRunner
+class FaviconCheck < ::HTMLProofer::Check
   def run
     found = false
     @html.xpath('//link[not(ancestor::pre or ancestor::code)]').each do |node|
-      favicon = FaviconCheckable.new(node, self)
+      favicon = create_element(node)
       next if favicon.ignore?
       found = true if favicon.rel.split(' ').last.eql? 'icon'
       break if found

data/lib/html-proofer/check/html.rb ADDED

@@ -0,0 +1,21 @@
+class HtmlCheck < ::HTMLProofer::Check
+  SCRIPT_EMBEDS_MSG = /Element script embeds close tag/
+  INVALID_TAG_MSG = /Tag ([\w\-:]+) invalid/
+  INVALID_PREFIX = /Namespace prefix/
+  def run
+    @html.errors.each do |error|
+      message = error.message
+      line    = error.line
+      if message =~ INVALID_TAG_MSG || message =~ INVALID_PREFIX
+        next unless options[:validation][:report_invalid_tags]
+      end
+      # tags embedded in scripts are used in templating languages: http://git.io/vOovv
+      next if !options[:validation][:report_script_embeds] && message =~ SCRIPT_EMBEDS_MSG
+      add_issue(message, line: line)
+    end
+  end
+end

data/lib/html-proofer/check/images.rb ADDED

@@ -0,0 +1,47 @@
+class ImageCheck < ::HTMLProofer::Check
+  SCREEN_SHOT_REGEX = /Screen(?: |%20)Shot(?: |%20)\d+-\d+-\d+(?: |%20)at(?: |%20)\d+.\d+.\d+/
+  def empty_alt_tag?
+    @img.alt.strip.empty?
+  end
+  def terrible_filename?
+    @img.url =~ SCREEN_SHOT_REGEX
+  end
+  def missing_src?
+    blank?(@img.url)
+  end
+  def run
+    @html.css('img').each do |node|
+      @img = create_element(node)
+      line = node.line
+      next if @img.ignore?
+      # screenshot filenames should return because of terrible names
+      if terrible_filename?
+        add_issue("image has a terrible filename (#{@img.url})", line: line)
+        next
+      end
+      # does the image exist?
+      if missing_src?
+        add_issue('image has no src or srcset attribute', line: line)
+      else
+        if @img.remote?
+          add_to_external_urls(@img.url, line)
+        elsif !@img.exists?
+          add_issue("internal image #{@img.url} does not exist", line: line)
+        end
+      end
+      if @img.alt.nil? || (empty_alt_tag? && !@img.ignore_empty_alt?)
+        add_issue("image #{@img.url} does not have an alt attribute", line: line)
+      end
+    end
+    external_urls
+  end
+end