broken_link_finder 0.9.3 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Class responsible for handling the link collection logic.
5
+ class LinkManager
6
+ # Used for mapping pages to broken links.
7
+ attr_reader :broken_links
8
+
9
+ # Used for mapping pages to ignored links.
10
+ attr_reader :ignored_links
11
+
12
+ # Used to record crawl statistics e.g. duration etc.
13
+ attr_reader :crawl_stats
14
+
15
+ # Used to map a link (as is) to its absolute (crawlable) form.
16
+ attr_reader :broken_link_map
17
+
18
+ # Used to prevent crawling a broken link twice.
19
+ attr_reader :all_broken_links
20
+
21
+ # Used to prevent crawling an intact link twice.
22
+ attr_reader :all_intact_links
23
+
24
+ # Used for building crawl statistics.
25
+ attr_reader :all_ignored_links
26
+
27
+ # Returns a new LinkManager instance with empty link collections.
28
+ def initialize(sort)
29
+ raise "Sort by either :page or :link, not #{sort}" \
30
+ unless %i[page link].include?(sort)
31
+
32
+ @sort = sort
33
+ @lock = Mutex.new
34
+
35
+ empty # Initialises the link collections.
36
+ end
37
+
38
+ # Initialise/empty the link collection objects.
39
+ def empty
40
+ @broken_links = {}
41
+ @ignored_links = {}
42
+ @crawl_stats = {}
43
+ @broken_link_map = {}
44
+ @all_broken_links = Set.new
45
+ @all_intact_links = Set.new
46
+ @all_ignored_links = Set.new
47
+ end
48
+
49
+ # Append key => [value] to the broken link collections.
50
+ # If map: true, then the link will also be recorded in @broken_link_map.
51
+ def append_broken_link(doc, link, map: true)
52
+ key, value = get_key_value(doc.url, link)
53
+
54
+ @lock.synchronize do
55
+ @broken_links[key] = [] unless @broken_links[key]
56
+ @broken_links[key] << value
57
+
58
+ @all_broken_links << link
59
+
60
+ @broken_link_map[link] = link.make_absolute(doc) if map
61
+ end
62
+ end
63
+
64
+ # Remove the broken link from the necessary collections.
65
+ def remove_broken_link(link)
66
+ @lock.synchronize do
67
+ if @sort == :page
68
+ @broken_links.each { |_k, links| links.delete(link) }
69
+ @broken_links.delete_if { |_k, links| links.empty? }
70
+ else
71
+ @broken_links.delete(link)
72
+ end
73
+
74
+ @all_broken_links.delete(link)
75
+ @all_intact_links << link
76
+ end
77
+ end
78
+
79
+ # Append key => [value] to the ignored link collections.
80
+ def append_ignored_link(url, link)
81
+ key, value = get_key_value(url, link)
82
+
83
+ @lock.synchronize do
84
+ @ignored_links[key] = [] unless @ignored_links[key]
85
+ @ignored_links[key] << value
86
+
87
+ @all_ignored_links << link
88
+ end
89
+ end
90
+
91
+ # Append link to @all_intact_links.
92
+ def append_intact_link(link)
93
+ @lock.synchronize { @all_intact_links << link }
94
+ end
95
+
96
+ # Sorts the link collection's keys and values alphabetically.
97
+ def sort
98
+ @broken_links.values.map(&:uniq!)
99
+ @ignored_links.values.map(&:uniq!)
100
+
101
+ @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
102
+ @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
103
+
104
+ @broken_links.each { |_k, v| v.sort! }
105
+ @ignored_links.each { |_k, v| v.sort! }
106
+ end
107
+
108
+ # Tally's up various statistics about the crawl and its links.
109
+ def tally(url:, pages_crawled:, start:)
110
+ @crawl_stats[:url] = url
111
+ @crawl_stats[:pages_crawled] = pages_crawled
112
+ @crawl_stats[:num_pages] = pages_crawled.size
113
+ @crawl_stats[:num_links] = (
114
+ @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
115
+ )
116
+ @crawl_stats[:num_broken_links] = @all_broken_links.size
117
+ @crawl_stats[:num_intact_links] = @all_intact_links.size
118
+ @crawl_stats[:num_ignored_links] = @all_ignored_links.size
119
+ @crawl_stats[:duration] = Time.now - start
120
+ end
121
+
122
+ private
123
+
124
+ # Returns the correct key value depending on the @sort type.
125
+ # @sort == :page ? [url, link] : [link, url]
126
+ def get_key_value(url, link)
127
+ case @sort
128
+ when :page
129
+ [url, link]
130
+ when :link
131
+ [link, url]
132
+ else
133
+ raise "Unsupported sort type: #{sort}"
134
+ end
135
+ end
136
+ end
137
+ end
@@ -0,0 +1,137 @@
# frozen_string_literal: true

module BrokenLinkFinder
  # Class responsible for reporting in an HTML format.
  class HTMLReporter < Reporter
    # Returns a new HTMLReporter instance.
    # stream is any Object that responds to :puts and :print.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      super
    end

    # Pretty print a report detailing the full link summary.
    def call(broken_verbose: true, ignored_verbose: false)
      puts '<div class="broken_link_finder_report">'

      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      puts '</div>'

      nil
    end

    private

    # Report a summary of the overall crawl.
    def report_crawl_summary
      url      = @crawl_stats[:url]
      duration = @crawl_stats[:duration]&.truncate(2)

      puts format(
        '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
        url,
        url,
        @crawl_stats[:num_pages],
        @crawl_stats[:num_links],
        duration
      )
    end

    # Report a summary of the broken links.
    def report_broken_links(verbose: true)
      puts '<div class="broken_links">'

      if @broken_links.empty?
        puts_summary 'Good news, there are no broken links!', type: :broken
      else
        num_pages, num_links = get_hash_stats(@broken_links)
        puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken

        @broken_links.each do |key, values|
          report_group(key, values, verbose: verbose, type: :broken,
                                    hint: 'remove --concise to see them all')
        end
      end

      puts '</div>'
    end

    # Report a summary of the ignored links.
    def report_ignored_links(verbose: false)
      puts '<div class="ignored_links">'

      if @ignored_links.any?
        num_pages, num_links = get_hash_stats(@ignored_links)
        puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored

        @ignored_links.each do |key, values|
          report_group(key, values, verbose: verbose, type: :ignored,
                                    hint: 'use --verbose to see them all')
        end
      end

      puts '</div>'
    end

    # Report one key => values group as an HTML <p> element, summarising the
    # values past NUM_VALUES unless verbose is true.
    def report_group(key, values, verbose:, type:, hint:)
      puts_group(key, type: type) # Puts the opening <p> element.

      if verbose || (values.length <= NUM_VALUES)
        values.each { |value| puts_group_item value, type: type }
      else # Only print N values and summarise the rest.
        NUM_VALUES.times { |i| puts_group_item values[i], type: type }

        objects = sort_by_page? ? 'link(s)' : 'page(s)'
        puts "+ #{values.length - NUM_VALUES} other #{objects}, #{hint}<br />"
      end

      puts '</p>'
    end

    # Puts a one line summary <p> element for the given collection type.
    def puts_summary(text, type:)
      klass = type == :broken ? 'broken_links_summary' : 'ignored_links_summary'
      puts %(<p class="#{klass}">#{text}</p>)
    end

    # Puts the opening <p> element and heading for a link group.
    def puts_group(link, type:)
      a_element = %(<a href="#{build_url(link)}">#{link}</a>)

      case type
      when :broken
        klass = 'broken_links_group'
        msg = if sort_by_page?
                "The following broken links were found on '#{a_element}':"
              else
                "The broken link '#{a_element}' was found on the following pages:"
              end
      when :ignored
        klass = 'ignored_links_group'
        msg = if sort_by_page?
                "The following links were ignored on '#{a_element}':"
              else
                "The link '#{a_element}' was ignored on the following pages:"
              end
      else
        raise "type: must be :broken or :ignored, not: #{type}"
      end

      puts %(<p class="#{klass}">)
      puts "#{msg}<br />"
    end

    # Puts a single value of a group as an anchor element.
    def puts_group_item(value, type:)
      klass = type == :broken ? 'broken_links_group_item' : 'ignored_links_group_item'
      puts %(<a class="#{klass}" href="#{build_url(value)}">#{value}</a><br />)
    end

    # Returns the link's absolute form (if recorded), otherwise link as is.
    def build_url(link)
      @broken_link_map[link] || link
    end

    alias_method :report, :call
  end
end
@@ -0,0 +1,76 @@
# frozen_string_literal: true

module BrokenLinkFinder
  # Generic reporter class to be inherited from by format specific reporters.
  class Reporter
    # The amount of pages/links to display when verbose is false.
    NUM_VALUES = 3

    # Returns a new Reporter instance.
    # stream is any Object that responds to :puts and :print.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      streamable = %i[puts print].all? { |method| stream.respond_to?(method) }
      raise 'stream must respond_to? :puts and :print' unless streamable
      raise "sort by either :page or :link, not #{sort}" \
        unless %i[page link].include?(sort)

      @stream          = stream
      @sort            = sort
      @broken_links    = broken_links
      @ignored_links   = ignored_links
      @broken_link_map = broken_link_map
      @crawl_stats     = crawl_stats
    end

    # Pretty print a report detailing the full link summary.
    # Must be implemented by a format specific subclass.
    def call(broken_verbose: true, ignored_verbose: false)
      raise 'Not implemented by parent class'
    end

    protected

    # Return true if the sort is by page.
    def sort_by_page?
      @sort == :page
    end

    # Returns the key/value statistics of hash e.g. the number of keys and
    # combined values. The hash should be of the format: { 'str' => [...] }.
    # Use like: `num_pages, num_links = get_hash_stats(links)`.
    def get_hash_stats(hash)
      key_count   = hash.keys.length
      value_count = hash.values.flatten.uniq.length

      stats = [key_count, value_count]
      sort_by_page? ? stats : stats.reverse
    end

    # Prints the text. Defaults to a blank line.
    def print(text = '')
      @stream.print(text)
    end

    # Prints the text + \n. Defaults to a blank line.
    def puts(text = '')
      @stream.puts(text)
    end

    # Prints text + \n\n.
    def putsn(text)
      puts(text)
      puts
    end

    # Prints \n + text + \n.
    def nputs(text)
      puts
      puts(text)
    end

    alias_method :report, :call
  end
end
@@ -0,0 +1,88 @@
# frozen_string_literal: true

module BrokenLinkFinder
  # Class responsible for reporting in a text format.
  class TextReporter < Reporter
    # Returns a new TextReporter instance.
    # stream is any Object that responds to :puts and :print.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      super
    end

    # Pretty print a report detailing the full link summary.
    def call(broken_verbose: true, ignored_verbose: false)
      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      nil
    end

    private

    # Report a summary of the overall crawl.
    def report_crawl_summary
      puts "Crawled #{@crawl_stats[:url]}"
      putsn format(
        '%s page(s) containing %s unique link(s) in %s seconds',
        @crawl_stats[:num_pages],
        @crawl_stats[:num_links],
        @crawl_stats[:duration]&.truncate(2)
      )
    end

    # Report a summary of the broken links.
    def report_broken_links(verbose: true)
      if @broken_links.empty?
        puts 'Good news, there are no broken links!'
        return
      end

      num_pages, num_links = get_hash_stats(@broken_links)
      puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"

      @broken_links.each do |key, values|
        heading = if sort_by_page?
                    "The following broken links were found on '#{key}':"
                  else
                    "The broken link '#{key}' was found on the following pages:"
                  end
        report_group(heading, values, verbose: verbose,
                                      hint: 'remove --concise to see them all')
      end
    end

    # Report a summary of the ignored links.
    def report_ignored_links(verbose: false)
      return unless @ignored_links.any?

      num_pages, num_links = get_hash_stats(@ignored_links)
      nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"

      @ignored_links.each do |key, values|
        heading = if sort_by_page?
                    "The following links were ignored on '#{key}':"
                  else
                    "The link '#{key}' was ignored on the following pages:"
                  end
        report_group(heading, values, verbose: verbose,
                                      hint: 'use --verbose to see them all')
      end
    end

    # Print one group: its heading plus its values, summarising the values
    # past NUM_VALUES unless verbose is true.
    def report_group(heading, values, verbose:, hint:)
      nputs heading

      if verbose || (values.length <= NUM_VALUES)
        values.each { |value| puts value }
      else # Only print N values and summarise the rest.
        NUM_VALUES.times { |i| puts values[i] }

        objects = sort_by_page? ? 'link(s)' : 'page(s)'
        puts "+ #{values.length - NUM_VALUES} other #{objects}, #{hint}"
      end
    end

    alias_method :report, :call
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- VERSION = '0.9.3'
4
+ VERSION = '0.11.1'
5
5
  end
@@ -1,11 +1,31 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # We extract all the Document's links, not just the links to other webpages.
4
- Wgit::Document.define_extension(
3
+ # Define a method on each doc for recording unparsable links.
4
+ # Unparsable links are recorded as broken links by Finder.
5
+ class Wgit::Document
6
+ def unparsable_links
7
+ @unparsable_links ||= []
8
+ end
9
+ end
10
+
11
+ # Returns a Wgit::Url or nil (if link is unparsable).
12
+ # A proc is preferable to a function to avoid polluting the global namespace.
13
+ parse_link = lambda do |doc, link|
14
+ Wgit::Url.new(link)
15
+ rescue StandardError
16
+ doc.unparsable_links << link
17
+ nil
18
+ end
19
+
20
+ # We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
21
+ Wgit::Document.define_extractor(
5
22
  :all_links,
6
- '//*/@href | //*/@src', # Any element with a href or src attribute.
23
+ '//*/@href | //*/@src', # Any element's href or src attribute URL.
7
24
  singleton: false,
8
25
  text_content_only: true
9
- ) do |links|
10
- links.uniq.to_urls
26
+ ) do |links, doc|
27
+ links
28
+ .uniq
29
+ .map { |link| parse_link.call(doc, link) }
30
+ .compact
11
31
  end