broken_link_finder 0.9.3 → 0.11.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,137 @@
1
# frozen_string_literal: true

module BrokenLinkFinder
  # Class responsible for handling the link collection logic.
  class LinkManager
    # Used for mapping pages to broken links.
    attr_reader :broken_links

    # Used for mapping pages to ignored links.
    attr_reader :ignored_links

    # Used to record crawl statistics e.g. duration etc.
    attr_reader :crawl_stats

    # Used to map a link (as is) to its absolute (crawlable) form.
    attr_reader :broken_link_map

    # Used to prevent crawling a broken link twice.
    attr_reader :all_broken_links

    # Used to prevent crawling an intact link twice.
    attr_reader :all_intact_links

    # Used for building crawl statistics.
    attr_reader :all_ignored_links

    # Returns a new LinkManager instance with empty link collections.
    #
    # sort - either :page or :link; controls the orientation of the link
    #        hashes (page => [links] vs link => [pages]).
    def initialize(sort)
      raise "Sort by either :page or :link, not #{sort}" \
      unless %i[page link].include?(sort)

      @sort = sort
      @lock = Mutex.new # Guards all collection mutation (crawls are threaded).

      empty # Initialises the link collections.
    end

    # Initialise/empty the link collection objects.
    def empty
      @broken_links      = {}
      @ignored_links     = {}
      @crawl_stats       = {}
      @broken_link_map   = {}
      @all_broken_links  = Set.new
      @all_intact_links  = Set.new
      @all_ignored_links = Set.new
    end

    # Append key => [value] to the broken link collections.
    # If map: true, then the link will also be recorded in @broken_link_map.
    def append_broken_link(doc, link, map: true)
      key, value = get_key_value(doc.url, link)

      @lock.synchronize do
        (@broken_links[key] ||= []) << value
        @all_broken_links << link

        @broken_link_map[link] = link.make_absolute(doc) if map
      end
    end

    # Remove the broken link from the necessary collections.
    def remove_broken_link(link)
      @lock.synchronize do
        if @sort == :page
          # Keys are pages; drop the link from every page's value array.
          @broken_links.each { |_k, links| links.delete(link) }
          @broken_links.delete_if { |_k, links| links.empty? }
        else
          # Keys are links; drop the link's entry wholesale.
          @broken_links.delete(link)
        end

        @all_broken_links.delete(link)
        @all_intact_links << link
      end
    end

    # Append key => [value] to the ignored link collections.
    def append_ignored_link(url, link)
      key, value = get_key_value(url, link)

      @lock.synchronize do
        (@ignored_links[key] ||= []) << value
        @all_ignored_links << link
      end
    end

    # Append link to @all_intact_links.
    def append_intact_link(link)
      @lock.synchronize { @all_intact_links << link }
    end

    # Sorts the link collection's keys and values alphabetically.
    def sort
      # each, not map: the destructive uniq! return values are discarded.
      @broken_links.values.each(&:uniq!)
      @ignored_links.values.each(&:uniq!)

      @broken_links  = @broken_links.sort_by  { |k, _v| k }.to_h
      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h

      @broken_links.each  { |_k, v| v.sort! }
      @ignored_links.each { |_k, v| v.sort! }
    end

    # Tallies up various statistics about the crawl and its links.
    def tally(url:, pages_crawled:, start:)
      @crawl_stats[:url]               = url
      @crawl_stats[:pages_crawled]     = pages_crawled
      @crawl_stats[:num_pages]         = pages_crawled.size
      @crawl_stats[:num_links]         = @all_broken_links.size +
                                         @all_intact_links.size +
                                         @all_ignored_links.size
      @crawl_stats[:num_broken_links]  = @all_broken_links.size
      @crawl_stats[:num_intact_links]  = @all_intact_links.size
      @crawl_stats[:num_ignored_links] = @all_ignored_links.size
      @crawl_stats[:duration]          = Time.now - start
    end

    private

    # Returns the correct key value depending on the @sort type.
    # @sort == :page ? [url, link] : [link, url]
    def get_key_value(url, link)
      case @sort
      when :page
        [url, link]
      when :link
        [link, url]
      else
        # Fixed: previously interpolated bare `sort`, which invoked the
        # public #sort method (mutating the collections mid-raise) rather
        # than reporting the offending @sort value.
        raise "Unsupported sort type: #{@sort}"
      end
    end
  end
end
@@ -0,0 +1,137 @@
1
# frozen_string_literal: true

module BrokenLinkFinder
  # Class responsible for reporting in a HTML format.
  #
  # The constructor is inherited from Reporter; the previous explicit
  # initialize only forwarded every argument to super and was redundant.
  class HTMLReporter < Reporter
    # Pretty print a report detailing the full link summary.
    # All output is written to the stream given at construction; returns nil.
    #
    # NOTE(review): link URLs/text are interpolated into the HTML without
    # escaping - confirm report output is never rendered with untrusted
    # link content before serving it to browsers.
    def call(broken_verbose: true, ignored_verbose: false)
      puts '<div class="broken_link_finder_report">'

      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      puts '</div>'

      nil
    end

    private

    # Report a summary of the overall crawl.
    def report_crawl_summary
      puts format(
        '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
        @crawl_stats[:url],
        @crawl_stats[:url],
        @crawl_stats[:num_pages],
        @crawl_stats[:num_links],
        @crawl_stats[:duration]&.truncate(2)
      )
    end

    # Report a summary of the broken links.
    def report_broken_links(verbose: true)
      puts '<div class="broken_links">'

      if @broken_links.empty?
        puts_summary 'Good news, there are no broken links!', type: :broken
      else
        num_pages, num_links = get_hash_stats(@broken_links)
        puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken

        @broken_links.each do |key, values|
          puts_group(key, type: :broken) # Puts the opening <p> element.
          puts_group_values(values, type: :broken, verbose: verbose, hint: 'remove --concise')
          puts '</p>'
        end
      end

      puts '</div>'
    end

    # Report a summary of the ignored links.
    def report_ignored_links(verbose: false)
      puts '<div class="ignored_links">'

      if @ignored_links.any?
        num_pages, num_links = get_hash_stats(@ignored_links)
        puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored

        @ignored_links.each do |key, values|
          puts_group(key, type: :ignored) # Puts the opening <p> element.
          puts_group_values(values, type: :ignored, verbose: verbose, hint: 'use --verbose')
          puts '</p>'
        end
      end

      puts '</div>'
    end

    # Print a group's values, truncated to NUM_VALUES unless verbose.
    # hint is the CLI-flag advice shown on the truncation summary line.
    def puts_group_values(values, type:, verbose:, hint:)
      if verbose || (values.length <= NUM_VALUES)
        values.each { |value| puts_group_item value, type: type }
      else # Only print N values and summarise the rest.
        NUM_VALUES.times { |i| puts_group_item values[i], type: type }

        objects = sort_by_page? ? 'link(s)' : 'page(s)'
        puts "+ #{values.length - NUM_VALUES} other #{objects}, #{hint} to see them all<br />"
      end
    end

    # Print the summary <p> element for the given report type.
    def puts_summary(text, type:)
      klass = (type == :broken) ? 'broken_links_summary' : 'ignored_links_summary'
      puts "<p class=\"#{klass}\">#{text}</p>"
    end

    # Print the opening <p> element and heading line for a link group.
    def puts_group(link, type:)
      href = build_url(link)
      a_element = "<a href=\"#{href}\">#{link}</a>"

      case type
      when :broken
        msg = if sort_by_page?
                "The following broken links were found on '#{a_element}':"
              else
                "The broken link '#{a_element}' was found on the following pages:"
              end
        klass = 'broken_links_group'
      when :ignored
        msg = if sort_by_page?
                "The following links were ignored on '#{a_element}':"
              else
                "The link '#{a_element}' was ignored on the following pages:"
              end
        klass = 'ignored_links_group'
      else
        raise "type: must be :broken or :ignored, not: #{type}"
      end

      puts "<p class=\"#{klass}\">"
      puts msg + '<br />'
    end

    # Print a single link/page item within a group.
    def puts_group_item(value, type:)
      klass = (type == :broken) ? 'broken_links_group_item' : 'ignored_links_group_item'
      puts "<a class=\"#{klass}\" href=\"#{build_url(value)}\">#{value}</a><br />"
    end

    # Resolve a link to its absolute URL where known, else the link itself.
    def build_url(link)
      @broken_link_map[link] || link
    end

    alias_method :report, :call
  end
end
@@ -0,0 +1,76 @@
1
# frozen_string_literal: true

module BrokenLinkFinder
  # Generic reporter class to be inherited from by format specific reporters.
  class Reporter
    # The amount of pages/links to display when verbose is false.
    NUM_VALUES = 3

    # Returns a new Reporter instance.
    #
    # stream          - any Object that responds to :puts and :print.
    # sort            - :page or :link, matching how the link hashes are keyed.
    # broken_links    - Hash of page => [links] (or link => [pages]).
    # ignored_links   - Hash of page => [links] (or link => [pages]).
    # broken_link_map - Hash mapping a link to its absolute URL.
    # crawl_stats     - Hash of crawl statistics e.g. :url, :num_pages etc.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      unless stream.respond_to?(:puts) && stream.respond_to?(:print)
        raise 'stream must respond_to? :puts and :print'
      end
      # Capitalised to match the equivalent validation in LinkManager.
      raise "Sort by either :page or :link, not #{sort}" \
        unless %i[page link].include?(sort)

      @stream          = stream
      @sort            = sort
      @broken_links    = broken_links
      @ignored_links   = ignored_links
      @broken_link_map = broken_link_map
      @crawl_stats     = crawl_stats
    end

    # Pretty print a report detailing the full link summary.
    # Abstract - must be implemented by a subclass.
    def call(broken_verbose: true, ignored_verbose: false)
      # NotImplementedError is the conventional signal for an abstract
      # method (previously a plain RuntimeError string).
      raise NotImplementedError, 'Not implemented by parent class'
    end

    protected

    # Return true if the sort is by page.
    def sort_by_page?
      @sort == :page
    end

    # Returns the key/value statistics of hash e.g. the number of keys and
    # combined values. The hash should be of the format: { 'str' => [...] }.
    # Use like: `num_pages, num_links = get_hash_stats(links)`.
    def get_hash_stats(hash)
      num_keys   = hash.keys.length
      num_values = hash.values.flatten.uniq.length

      sort_by_page? ? [num_keys, num_values] : [num_values, num_keys]
    end

    # Prints the text. Defaults to a blank line.
    def print(text = '')
      @stream.print(text)
    end

    # Prints the text + \n. Defaults to a blank line.
    def puts(text = '')
      @stream.puts(text)
    end

    # Prints text + \n\n.
    def putsn(text)
      puts(text)
      puts
    end

    # Prints \n + text + \n.
    def nputs(text)
      puts
      puts(text)
    end

    alias_method :report, :call
  end
end
@@ -0,0 +1,88 @@
1
# frozen_string_literal: true

module BrokenLinkFinder
  # Class responsible for reporting in a text format.
  #
  # The constructor is inherited from Reporter; the previous explicit
  # initialize only forwarded every argument to super and was redundant.
  class TextReporter < Reporter
    # Pretty print a report detailing the full link summary.
    # All output is written to the stream given at construction; returns nil.
    def call(broken_verbose: true, ignored_verbose: false)
      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      nil
    end

    private

    # Report a summary of the overall crawl.
    def report_crawl_summary
      puts "Crawled #{@crawl_stats[:url]}"
      putsn format(
        '%s page(s) containing %s unique link(s) in %s seconds',
        @crawl_stats[:num_pages],
        @crawl_stats[:num_links],
        @crawl_stats[:duration]&.truncate(2)
      )
    end

    # Report a summary of the broken links.
    def report_broken_links(verbose: true)
      if @broken_links.empty?
        puts 'Good news, there are no broken links!'
        return
      end

      num_pages, num_links = get_hash_stats(@broken_links)
      puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"

      @broken_links.each do |key, values|
        msg = if sort_by_page?
                "The following broken links were found on '#{key}':"
              else
                "The broken link '#{key}' was found on the following pages:"
              end
        nputs msg

        puts_values(values, verbose: verbose, hint: 'remove --concise')
      end
    end

    # Report a summary of the ignored links.
    def report_ignored_links(verbose: false)
      return if @ignored_links.empty?

      num_pages, num_links = get_hash_stats(@ignored_links)
      nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"

      @ignored_links.each do |key, values|
        msg = if sort_by_page?
                "The following links were ignored on '#{key}':"
              else
                "The link '#{key}' was ignored on the following pages:"
              end
        nputs msg

        puts_values(values, verbose: verbose, hint: 'use --verbose')
      end
    end

    # Print a group's values, truncated to NUM_VALUES unless verbose.
    # hint is the CLI-flag advice shown on the truncation summary line.
    def puts_values(values, verbose:, hint:)
      if verbose || (values.length <= NUM_VALUES)
        values.each { |value| puts value }
      else # Only print N values and summarise the rest.
        NUM_VALUES.times { |i| puts values[i] }

        objects = sort_by_page? ? 'link(s)' : 'page(s)'
        puts "+ #{values.length - NUM_VALUES} other #{objects}, #{hint} to see them all"
      end
    end

    alias_method :report, :call
  end
end
# frozen_string_literal: true

module BrokenLinkFinder
  # The current version of the broken_link_finder gem.
  VERSION = '0.11.1'
end
# frozen_string_literal: true

# Define a method on each doc for recording unparsable links.
# Unparsable links are recorded as broken links by Finder.
class Wgit::Document
  def unparsable_links
    @unparsable_links ||= []
  end
end

# Returns a Wgit::Url or nil (if the link is unparsable).
# A lambda is preferable to a function to avoid polluting the global namespace.
to_url = lambda do |doc, raw_link|
  Wgit::Url.new(raw_link)
rescue StandardError
  # Record the failure on the doc so Finder can flag it as broken.
  doc.unparsable_links << raw_link
  nil
end

# We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
Wgit::Document.define_extractor(
  :all_links,
  '//*/@href | //*/@src', # Any element's href or src attribute URL.
  singleton: false,
  text_content_only: true
) do |links, doc|
  links.uniq.map { |raw_link| to_url.call(doc, raw_link) }.compact
end