html-proofer 2.6.4 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 949542fac56daeafe93fbf17014e96cba9deba3c
4
- data.tar.gz: b42e8375b5886aa310fcb1920ca0c676e9dedcc3
3
+ metadata.gz: 1ce6c96e75683a8da4de60754d83aef138221d02
4
+ data.tar.gz: b341f1ee5b7237e58cba8e775dfaa152eaeeb0a5
5
5
  SHA512:
6
- metadata.gz: 6d2f2741cc3c6c7ca32220f1e338986148d5bf4c026e1bd5b5e7955652f6a0807d9037a4d71a0aaec7c81e3c9d58ae09ef649388c5be589c08991d27c2a88c33
7
- data.tar.gz: 5a478a174241a58cf29192bb62a964703ecb5e90060ac702183067c42a894858dbad4cf5a7591cb216d92785b9216f1eaca2232bcfa248da2af20d206ac1daf6
6
+ metadata.gz: 000ff0816076d1e8b738526f380e88309fdc04d93fa203a43d62960696064062b7942647a5f79be0f8e7239ca69e67f08f669866352cabf67152a6f95beb643b
7
+ data.tar.gz: 1863ee52843f2d5071170aaed79ce2ae9f6902ffb1a99d93021b746eb9d3d792d460d4caff2f703907291915b0bc8861af8967cd4f5a394e0a6604aa23e0dec0
@@ -3,19 +3,20 @@ STDOUT.sync = true
3
3
 
4
4
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w( .. lib ))
5
5
 
6
- require 'html/proofer'
6
+ require 'html-proofer'
7
7
  require 'mercenary'
8
8
 
9
- Mercenary.program(:htmlproof) do |p|
10
- p.version HTML::Proofer::VERSION
9
+ Mercenary.program(:htmlproofer) do |p|
10
+ p.version HTMLProofer::VERSION
11
11
  p.description %(Test your rendered HTML files to make sure they're accurate.)
12
- p.syntax 'htmlproof PATH [options]'
12
+ p.syntax 'htmlproofer PATH [options]'
13
13
 
14
14
  p.description 'Runs the HTML-Proofer suite on the files in PATH. For more details, see the README.'
15
15
 
16
16
  p.option 'allow_hash_href', '--allow-hash-href', 'If `true`, ignores the `href` `#`'
17
17
  p.option 'as_links', '--as-links', 'Assumes that `PATH` is a comma-separated array of links to check.'
18
18
  p.option 'alt_ignore', '--alt-ignore image1,[image2,...]', Array, 'A comma-separated list of Strings or RegExps containing `img`s whose missing `alt` tags are safe to ignore'
19
+ p.option 'assume_extension', '--assume-extension', 'Automatically add extension (e.g. `.html`) to file paths, to allow extensionless URLs (as supported by Jekyll 3 and GitHub Pages) (default: `false`).'
19
20
  p.option 'checks_to_ignore', '--checks-to-ignore check1,[check2,...]', Array, ' An array of Strings indicating which checks you\'d like to not perform.'
20
21
  p.option 'check_external_hash', '--check-external-hash', 'Checks whether external hashes exist (even if the website exists). This slows the checker down (default: `false`).'
21
22
  p.option 'check_favicon', '--check-favicon', 'Enables the favicon checker (default: `false`).'
@@ -25,16 +26,17 @@ Mercenary.program(:htmlproof) do |p|
25
26
  p.option 'empty_alt_ignore', '--empty-alt-ignore', 'If `true`, ignores images with empty alt tags'
26
27
  p.option 'error_sort', '--error-sort SORT', 'Defines the sort order for error output. Can be `:path`, `:desc`, or `:status` (default: `path`).'
27
28
  p.option 'enforce_https', '--enforce-https', 'Fails a link if it\'s not marked as `https` (default: `false`).'
28
- p.option 'ext', '--ext EXT', String, 'The extension of your HTML files including the dot. (default: `.html`)'
29
+ p.option 'extension', '--extension EXT', String, 'The extension of your HTML files including the dot. (default: `.html`)'
29
30
  p.option 'external_only', '--external_only', 'Only checks problems with external references'
30
31
  p.option 'file_ignore', '--file-ignore file1,[file2,...]', Array, 'A comma-separated list of Strings or RegExps containing file paths that are safe to ignore'
31
- p.option 'href_ignore', '--href-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing `href`s that are safe to ignore. Note that non-HTTP(S) URIs are always ignored. **Will be renamed in a future release.**'
32
- p.option 'href_swap', '--href-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms links that match `RegExp` into `String` via `gsub`. **Will be renamed in a future release.**'
33
- p.option 'ignore_script_embeds', '--ignore-script-embeds', 'Ignore `check_html` errors associated with `script`s (default: `false`)'
32
+ p.option 'http_status_ignore', '--http-status-ignore 123,[xxx, ...]', Array, 'A comma-separated list of numbers representing status codes to ignore.'
33
+ p.option 'report_invalid_tags', '--report-invalid-tags', 'Ignore `check_html` errors associated with unknown markup (default: `false`)'
34
+ p.option 'report_script_embeds', '--report-script-embeds', 'Ignore `check_html` errors associated with `script`s (default: `false`)'
35
+ p.option 'log_level', '--log-level <level>', String, 'Sets the logging level, as determined by Yell'
34
36
  p.option 'only_4xx', '--only-4xx', 'Only reports errors for links that fall within the 4xx status code range'
37
+ p.option 'timeframe', '--timeframe <time>', String, 'A string representing the caching timeframe.'
35
38
  p.option 'url_ignore', '--url-ignore link1,[link2,...]', Array, 'A comma-separated list of Strings or RegExps containing URLs that are safe to ignore. It affects all HTML attributes. Note that non-HTTP(S) URIs are always ignored'
36
- p.option 'verbose', '--verbose', 'If `true`, outputs extra information as the checking happens. Useful for debugging. **Will be deprecated in a future release.**'
37
- p.option 'verbosity', '--verbosity', String, 'Sets the logging level, as determined by Yell'
39
+ p.option 'url_swap', '--url-swap re:string,[re:string,...]', Array, 'A comma-separated list containing key-value pairs of `RegExp => String`. It transforms URLs that match `RegExp` into `String` via `gsub`.'
38
40
 
39
41
  p.action do |args, opts|
40
42
  args = ['.'] if args.empty?
@@ -45,34 +47,39 @@ Mercenary.program(:htmlproof) do |p|
45
47
  # prepare everything to go to proofer
46
48
  p.options.select { |o| !opts[o.config_key].nil? }.each do |option|
47
49
  if option.return_type.to_s == 'Array' # TODO: is_a? doesn't work here?
48
- opts[option.config_key] = opts[option.config_key].map { |i| HTML::Proofer::Configuration.to_regex?(i) }
50
+ opts[option.config_key] = opts[option.config_key].map { |i| HTMLProofer::Configuration.to_regex?(i) }
49
51
  end
50
52
  options[option.config_key.to_sym] = opts[option.config_key]
51
53
  end
52
54
 
53
55
  # some minor manipulation of a special option
54
- unless opts['href_swap'].nil?
55
- options[:href_swap] = {}
56
- opts['href_swap'].each do |s|
56
+ unless opts['url_swap'].nil?
57
+ options[:url_swap] = {}
58
+ opts['url_swap'].each do |s|
57
59
  pair = s.split(':', 2)
58
- options[:href_swap][Regexp.new(pair[0])] = pair[1]
60
+ options[:url_swap][Regexp.new(pair[0])] = pair[1]
59
61
  end
60
62
  end
61
63
 
62
- # check for ignore_scripts_embeds as it should be set in :validation
63
- unless opts['ignore_script_embeds'].nil?
64
- options[:validation] = { :ignore_script_embeds => true }
65
- end
66
-
67
64
  options[:error_sort] = opts['error-sort'].to_sym unless opts['error-sort'].nil?
68
- options[:verbosity] = opts['verbosity'].to_sym unless opts['verbosity'].nil?
65
+ options[:log_level] = opts['log_level'].to_sym unless opts['log_level'].nil?
69
66
 
70
67
  # FIXME: this is gross
71
68
  options[:validation] = {}
72
- options[:validation][:ignore_script_embeds] = opts['ignore_script_embeds']
69
+ options[:validation][:report_script_embeds] = opts['report_script_embeds']
70
+ options[:validation][:report_invalid_tags] = opts['report_invalid_tags']
73
71
 
74
- path = path.delete(' ').split(',') if opts['as_links']
72
+ options[:cache] = {}
73
+ options[:cache][:timeframe] = opts['timeframe'] unless opts['timeframe'].nil?
75
74
 
76
- HTML::Proofer.new(path, options).run
75
+ paths = path.split(',')
76
+ if opts['as_links']
77
+ links = path.delete(' ').split(',')
78
+ HTMLProofer.check_links(links, options).run
79
+ elsif File.directory?(paths.first)
80
+ HTMLProofer.check_directories(paths, options).run
81
+ else
82
+ HTMLProofer.check_file(path, options).run
83
+ end
77
84
  end
78
85
  end
@@ -0,0 +1,47 @@
1
# Requires every `*.rb` file found directly inside +path+, where +path+ is
# resolved relative to the directory containing this file.
#
# The file list is sorted before loading so the load order is the same on
# every platform; `Dir[]` only guarantees sorted results from Ruby 3.0 on.
#
# @param path [String] directory, relative to this file, whose Ruby files
#   should be loaded (sub-directories are not descended into)
# @return [Array<String>] the file paths that were passed to `require`
def require_all(path)
  pattern = File.join(File.dirname(__FILE__), path, '*.rb')
  Dir[pattern].sort.each { |file| require file }
end
7
+
8
+ require_all 'html-proofer'
9
+ require_all 'html-proofer/check'
10
+
11
+ require 'parallel'
12
+ require 'fileutils'
13
+
14
+ begin
15
+ require 'awesome_print'
16
+ rescue LoadError; end
17
+
18
# Entry points for building an HTMLProofer::Runner. Each helper validates the
# shape of its first argument, tags the options with the kind of source being
# checked (+:type+), and returns the Runner without starting it.
#
# The caller's +options+ hash is never modified: the +:type+ entry is added to
# a copy, so one hash can safely be reused across calls (and may be frozen).
module HTMLProofer
  module_function

  # Builds a runner for a single HTML file.
  #
  # @param file [String] path of the file to check
  # @param options [Hash] runner options
  # @return [HTMLProofer::Runner]
  # @raise [ArgumentError] if +file+ is not a String
  def check_file(file, options = {})
    fail ArgumentError, "expected a String file path, got #{file.class}" unless file.is_a?(String)
    HTMLProofer::Runner.new(file, options.merge(:type => :file))
  end

  # Builds a runner for a single directory; the directory is handed to the
  # runner wrapped in a one-element array.
  #
  # @param directory [String] path of the directory to check
  # @param options [Hash] runner options
  # @return [HTMLProofer::Runner]
  # @raise [ArgumentError] if +directory+ is not a String
  def check_directory(directory, options = {})
    fail ArgumentError, "expected a String directory path, got #{directory.class}" unless directory.is_a?(String)
    HTMLProofer::Runner.new([directory], options.merge(:type => :directory))
  end

  # Builds a runner for several directories.
  #
  # @param directories [Array<String>] paths of the directories to check
  # @param options [Hash] runner options
  # @return [HTMLProofer::Runner]
  # @raise [ArgumentError] if +directories+ is not an Array
  def check_directories(directories, options = {})
    fail ArgumentError, "expected an Array of directories, got #{directories.class}" unless directories.is_a?(Array)
    HTMLProofer::Runner.new(directories, options.merge(:type => :directory))
  end

  # Builds a runner for a list of links.
  #
  # @param links [Array<String>] links to check
  # @param options [Hash] runner options
  # @return [HTMLProofer::Runner]
  # @raise [ArgumentError] if +links+ is not an Array
  def check_links(links, options = {})
    fail ArgumentError, "expected an Array of links, got #{links.class}" unless links.is_a?(Array)
    HTMLProofer::Runner.new(links, options.merge(:type => :links))
  end
end
@@ -0,0 +1,153 @@
1
+ require_relative 'utils'
2
+
3
+ require 'json'
4
+ require 'active_support/core_ext/string'
5
+ require 'active_support/core_ext/date'
6
+ require 'active_support/core_ext/numeric/time'
7
+
8
module HTMLProofer
  # Keeps a JSON log (CACHE_LOG) of previously checked URLs, keyed by a
  # normalised form of each URL, so later runs can decide which URLs need to
  # be checked again.
  class Cache
    include HTMLProofer::Utils

    # STORAGE_DIR is not defined in this class; presumably it comes from
    # HTMLProofer::Utils -- TODO confirm.
    CACHE_LOG = File.join(STORAGE_DIR, 'cache.log')

    attr_reader :exists, :cache_log

    # Sets up the cache and loads CACHE_LOG from disk when it exists.
    #
    # @param logger [#log] object that receives `log(level, message)` calls
    # @param options [Hash, nil] cache options; nil or empty disables caching
    #   (`use_cache?` returns false), otherwise `options[:timeframe]` is parsed
    # @raise [ArgumentError] if caching is enabled and the timeframe is
    #   missing or malformed
    def initialize(logger, options)
      @logger = logger
      @cache_log = {}

      if options.nil? || options.empty?
        define_singleton_method('use_cache?') { false }
      else
        define_singleton_method('use_cache?') { true }
        @parsed_timeframe = parsed_timeframe(options[:timeframe])
      end

      @cache_time = Time.now

      if File.exist?(CACHE_LOG)
        contents = File.read(CACHE_LOG)
        @cache_log = contents.empty? ? {} : JSON.parse(contents)
      end
    end

    # Tells whether +time+ lies between the start of the configured timeframe
    # and the moment this cache was created.
    #
    # Entries loaded from CACHE_LOG carry their timestamp as a String (they
    # went through JSON), and a Time range never covers a String, so Strings
    # are converted to Time first. A missing timeframe, a missing timestamp,
    # or an unparseable timestamp all count as "not within the timeframe".
    #
    # @param time [Time, String, nil] timestamp of a cached entry
    # @return [Boolean]
    def within_timeframe?(time)
      return false if @parsed_timeframe.nil?

      time = parse_cached_time(time) if time.is_a?(String)
      return false if time.nil?

      (@parsed_timeframe..@cache_time).cover?(time)
    end

    # @return [Object] the value stored under the 'urls' key of the log, or an
    #   empty array when there is none
    def urls
      @cache_log['urls'] || []
    end

    # @return [Integer] number of entries in the cache log
    def size
      @cache_log.length
    end

    # Converts a timeframe such as '30d' into the point in time that far back
    # from now. The unit is the first non-digit character after the number:
    # 'M' (months), 'w' (weeks), 'd' (days) or 'h' (hours).
    #
    # @param timeframe [String] a number immediately followed by a unit
    # @return [Object] the result of ActiveSupport's `<duration>.ago`
    # @raise [ArgumentError] if +timeframe+ has no "<digits><unit>" part or
    #   the unit is not one of the four above
    def parsed_timeframe(timeframe)
      parts = timeframe.to_s.match(/(\d+)(\D)/)
      fail ArgumentError, "#{timeframe.inspect} is not a valid timeframe!" if parts.nil?

      time, date = parts.captures
      time = time.to_f
      case date
      when 'M'
        time.months.ago
      when 'w'
        time.weeks.ago
      when 'd'
        time.days.ago
      when 'h'
        time.hours.ago
      else
        fail ArgumentError, "#{date} is not a valid timeframe!"
      end
    end

    # Records the result of checking +url+, stamped with this cache's
    # creation time, under the cleaned form of the URL.
    #
    # NOTE(review): entries are stored here with Symbol keys, whereas
    # #retrieve_urls reads String keys (the shape entries have after being
    # written to and re-read from CACHE_LOG) -- confirm this is intended.
    #
    # @param url [String]
    # @param filenames [Object] files the URL was found in
    # @param status [Object] status of the check
    # @param msg [String] failure message; empty for a success
    # @return [Hash] the stored entry
    def add(url, filenames, status, msg = '')
      data = {
        :time => @cache_time,
        :filenames => filenames,
        :status => status,
        :message => msg
      }

      @cache_log[clean_url(url)] = data
    end

    # Reconciles the cache with the URLs found in this run: entries whose URL
    # is no longer present in +found+ are deleted from the cache, and the
    # subset of +found+ that is not yet cached is returned. Both steps are
    # reported through the logger.
    #
    # @param found [Hash] found URLs; only its keys are compared
    # @return [Hash] the entries of +found+ that are new to the cache
    def detect_url_changes(found)
      existing_urls = @cache_log.keys.map { |url| clean_url(url) }
      found_urls = found.keys.map { |url| clean_url(url) }

      # prepare to add new URLs detected
      additions = found.reject do |url, _|
        url = clean_url(url)
        if existing_urls.include?(url)
          true
        else
          @logger.log :debug, "Adding #{url} to cache check"
          false
        end
      end

      new_link_count = additions.length
      new_link_text = pluralize(new_link_count, 'link', 'links')
      @logger.log :info, "Adding #{new_link_text} to the cache..."

      # remove from cache URLs that no longer exist
      del = 0
      @cache_log.delete_if do |url, _|
        url = clean_url(url)
        if !found_urls.include?(url)
          @logger.log :debug, "Removing #{url} from cache check"
          del += 1
          true
        else
          false
        end
      end

      del_link_text = pluralize(del, 'link', 'links')
      @logger.log :info, "Removing #{del_link_text} from the cache..."

      additions
    end

    # Serialises the whole cache log to CACHE_LOG as JSON.
    def write
      File.write(CACHE_LOG, @cache_log.to_json)
    end

    # @return [Boolean] true while @load is unset; nothing in this class
    #   assigns it
    def load?
      @load.nil?
    end

    # Works out which URLs have to be checked in this run: every URL that is
    # new to the cache, every cached failure, and every cached entry whose
    # timestamp is outside the timeframe. Cached successes that are still
    # within the timeframe are skipped.
    #
    # @param external_urls [Hash] URLs found in this run
    # @return [Hash] URLs to check, mapped to the files they appear in
    def retrieve_urls(external_urls)
      urls_to_check = detect_url_changes(external_urls)
      @cache_log.each_pair do |url, cache|
        if within_timeframe?(cache['time'])
          next if cache['message'].empty? # these were successes to skip
          urls_to_check[url] = cache['filenames'] # these are failures to retry
        else
          urls_to_check[url] = cache['filenames'] # pass or fail, recheck expired links
        end
      end
      urls_to_check
    end

    # FIXME: there seems to be some discrepancy where Typhoeus occasionally adds
    # a trailing slash to URL strings, which causes issues with the cache
    def slashless_url(url)
      url.chomp('/')
    end

    # FIXME: it seems that Typhoeus actually acts on escaped URLs,
    # but there's no way to get at that information, and the cache
    # stores unescaped URLs. Because of this, some links, such as
    # github.com/search/issues?q=is:open+is:issue+fig are not matched
    # as github.com/search/issues?q=is%3Aopen+is%3Aissue+fig
    def unescape_url(url)
      Addressable::URI.unescape(url)
    end

    # Normalises a URL for use as a cache key: unescaped, and with one
    # trailing slash removed.
    def clean_url(url)
      slashless_url(unescape_url(url))
    end

    private

    # Turns a timestamp String read back from CACHE_LOG into a Time using
    # ActiveSupport's String#to_time.
    #
    # @return [Time, nil] nil when the String cannot be interpreted as a time
    def parse_cached_time(value)
      value.to_time
    rescue ArgumentError
      nil
    end
  end
end
@@ -0,0 +1,63 @@
1
module HTMLProofer
  # Common base for every check: stores the document being inspected, gathers
  # the issues a check reports, and records which external URLs were seen.
  class Check
    attr_reader :node, :element, :src, :path, :options, :issues, :external_urls

    # @param src [Object] source the runner was started with
    # @param path [String] path of the document being checked
    # @param html [#css] parsed document; its `code`, `pre` and `tt` nodes are
    #   unlinked before any check looks at it
    # @param options [Hash] proofer options
    def initialize(src, path, html, options)
      @src           = src
      @path          = path
      @html          = remove_ignored(html)
      @options       = options
      @issues        = []
      @external_urls = {}
    end

    # Remembers +node+ as the node currently being examined and wraps it in
    # an Element bound to this check.
    def create_element(node)
      @node = node
      Element.new(node, self)
    end

    # Subclasses provide the actual check by overriding this method.
    #
    # @raise [NotImplementedError] always, in the base class
    def run
      raise NotImplementedError, 'HTMLProofer::Check subclasses must implement #run'
    end

    # Appends a new Issue for the current path to the issue list.
    def add_issue(desc, line: nil, status: -1)
      @issues.push(Issue.new(@path, desc, line: line, status: status))
    end

    # Registers +url+ for the current path unless the URL is already known.
    # The second argument is accepted but not used.
    def add_to_external_urls(url, _)
      add_path_for_url(url) unless @external_urls[url]
    end

    # Adds the current path to the list of paths recorded for +url+,
    # starting that list if the URL has not been seen yet.
    def add_path_for_url(url)
      (@external_urls[url] ||= []) << @path
    end

    # @return [Array<Class>] the classes whose direct superclass is the
    #   receiver
    def self.subchecks
      ObjectSpace.each_object(Class).select { |klass| klass.superclass == self }
    end

    # True for nil and for anything that reports itself as empty.
    def blank?(attr)
      attr.nil? ? true : attr.empty?
    end

    private

    # Detaches every `code`, `pre` and `tt` node from +html+ and hands the
    # same document back.
    def remove_ignored(html)
      html.tap { |doc| doc.css('code, pre, tt').each { |ignored| ignored.unlink } }
    end
  end
end
@@ -1,14 +1,8 @@
1
- # encoding: utf-8
2
-
3
- class FaviconCheckable < ::HTML::Proofer::Checkable
4
- attr_reader :rel
5
- end
6
-
7
- class FaviconCheck < ::HTML::Proofer::CheckRunner
1
+ class FaviconCheck < ::HTMLProofer::Check
8
2
  def run
9
3
  found = false
10
4
  @html.xpath('//link[not(ancestor::pre or ancestor::code)]').each do |node|
11
- favicon = FaviconCheckable.new(node, self)
5
+ favicon = create_element(node)
12
6
  next if favicon.ignore?
13
7
  found = true if favicon.rel.split(' ').last.eql? 'icon'
14
8
  break if found
@@ -0,0 +1,21 @@
1
# Turns the errors attached to the parsed document (`@html.errors`) into
# issues, leaving out the categories that `options[:validation]` has not
# asked to be reported.
class HtmlCheck < ::HTMLProofer::Check
  SCRIPT_EMBEDS_MSG = /Element script embeds close tag/
  INVALID_TAG_MSG = /Tag ([\w\-:]+) invalid/
  INVALID_PREFIX = /Namespace prefix/

  # Adds one issue per document error, except that:
  # * invalid-tag and namespace-prefix errors are dropped unless
  #   `:report_invalid_tags` is set, and
  # * script-embed errors are dropped unless `:report_script_embeds` is set.
  def run
    @html.errors.each do |error|
      text = error.message
      line_no = error.line
      validation = options[:validation]

      unknown_markup = (text =~ INVALID_TAG_MSG) || (text =~ INVALID_PREFIX)
      next if unknown_markup && !validation[:report_invalid_tags]

      # tags embedded in scripts are used in templating languages: http://git.io/vOovv
      unless validation[:report_script_embeds]
        next if text =~ SCRIPT_EMBEDS_MSG
      end

      add_issue(text, line: line_no)
    end
  end
end
@@ -0,0 +1,47 @@
1
# Inspects every `img` element of the document: flags screenshot-style file
# names, missing sources, missing internal files and missing alt text, and
# collects remote image URLs for later checking.
class ImageCheck < ::HTMLProofer::Check
  SCREEN_SHOT_REGEX = /Screen(?: |%20)Shot(?: |%20)\d+-\d+-\d+(?: |%20)at(?: |%20)\d+.\d+.\d+/

  # True when the current image's alt text is empty once stripped.
  def empty_alt_tag?
    @img.alt.strip.empty?
  end

  # Truthy when the current image's URL matches SCREEN_SHOT_REGEX.
  def terrible_filename?
    @img.url =~ SCREEN_SHOT_REGEX
  end

  # True when the current image has no URL at all.
  def missing_src?
    blank?(@img.url)
  end

  # Runs the image checks over the document.
  #
  # @return [Hash] the external URLs collected so far
  def run
    @html.css('img').each do |node|
      @img = create_element(node)
      line_no = node.line

      next if @img.ignore?

      # a screenshot-style name is reported on its own; nothing else is
      # checked for that image
      if terrible_filename?
        add_issue("image has a terrible filename (#{@img.url})", line: line_no)
        next
      end

      # does the image exist?
      if missing_src?
        add_issue('image has no src or srcset attribute', line: line_no)
      elsif @img.remote?
        add_to_external_urls(@img.url, line_no)
      elsif !@img.exists?
        add_issue("internal image #{@img.url} does not exist", line: line_no)
      end

      alt_missing = @img.alt.nil? || (empty_alt_tag? && !@img.ignore_empty_alt?)
      add_issue("image #{@img.url} does not have an alt attribute", line: line_no) if alt_missing
    end

    external_urls
  end
end