broken_link_finder 0.9.3 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +51 -0
- data/Gemfile.lock +44 -33
- data/README.md +28 -19
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +12 -3
- data/lib/broken_link_finder.rb +6 -1
- data/lib/broken_link_finder/finder.rb +134 -141
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- metadata +18 -13
- data/lib/broken_link_finder/reporter.rb +0 -116
# frozen_string_literal: true

module BrokenLinkFinder
  # Class responsible for handling the link collection logic.
  #
  # All mutating operations are guarded by a Mutex because Finder crawls
  # pages concurrently and appends results from several threads.
  class LinkManager
    # Used for mapping pages to broken links.
    attr_reader :broken_links

    # Used for mapping pages to ignored links.
    attr_reader :ignored_links

    # Used to record crawl statistics e.g. duration etc.
    attr_reader :crawl_stats

    # Used to map a link (as is) to its absolute (crawlable) form.
    attr_reader :broken_link_map

    # Used to prevent crawling a broken link twice.
    attr_reader :all_broken_links

    # Used to prevent crawling an intact link twice.
    attr_reader :all_intact_links

    # Used for building crawl statistics.
    attr_reader :all_ignored_links

    # Returns a new LinkManager instance with empty link collections.
    # sort must be :page or :link and controls the key => [values]
    # orientation of the collections (see #get_key_value).
    def initialize(sort)
      raise "Sort by either :page or :link, not #{sort}" \
      unless %i[page link].include?(sort)

      @sort = sort
      @lock = Mutex.new

      empty # Initialises the link collections.
    end

    # Initialise/empty the link collection objects.
    def empty
      @broken_links      = {}
      @ignored_links     = {}
      @crawl_stats       = {}
      @broken_link_map   = {}
      @all_broken_links  = Set.new
      @all_intact_links  = Set.new
      @all_ignored_links = Set.new
    end

    # Append key => [value] to the broken link collections.
    # If map: true, then the link will also be recorded in @broken_link_map.
    def append_broken_link(doc, link, map: true)
      key, value = get_key_value(doc.url, link)

      @lock.synchronize do
        @broken_links[key] = [] unless @broken_links[key]
        @broken_links[key] << value

        @all_broken_links << link

        @broken_link_map[link] = link.make_absolute(doc) if map
      end
    end

    # Remove the broken link from the necessary collections, recording it
    # as intact instead (e.g. after a successful retry).
    def remove_broken_link(link)
      @lock.synchronize do
        if @sort == :page
          @broken_links.each { |_k, links| links.delete(link) }
          @broken_links.delete_if { |_k, links| links.empty? }
        else
          @broken_links.delete(link)
        end

        @all_broken_links.delete(link)
        @all_intact_links << link
      end
    end

    # Append key => [value] to the ignored link collections.
    def append_ignored_link(url, link)
      key, value = get_key_value(url, link)

      @lock.synchronize do
        @ignored_links[key] = [] unless @ignored_links[key]
        @ignored_links[key] << value

        @all_ignored_links << link
      end
    end

    # Append link to @all_intact_links.
    def append_intact_link(link)
      @lock.synchronize { @all_intact_links << link }
    end

    # Sorts the link collection's keys and values alphabetically,
    # de-duplicating each value list first.
    def sort
      @broken_links.values.map(&:uniq!)
      @ignored_links.values.map(&:uniq!)

      @broken_links  = @broken_links.sort_by  { |k, _v| k }.to_h
      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h

      @broken_links.each  { |_k, v| v.sort! }
      @ignored_links.each { |_k, v| v.sort! }
    end

    # Tally's up various statistics about the crawl and its links.
    def tally(url:, pages_crawled:, start:)
      @crawl_stats[:url]           = url
      @crawl_stats[:pages_crawled] = pages_crawled
      @crawl_stats[:num_pages]     = pages_crawled.size
      @crawl_stats[:num_links] = (
        @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
      )
      @crawl_stats[:num_broken_links]  = @all_broken_links.size
      @crawl_stats[:num_intact_links]  = @all_intact_links.size
      @crawl_stats[:num_ignored_links] = @all_ignored_links.size
      @crawl_stats[:duration]          = Time.now - start
    end

    private

    # Returns the correct key value depending on the @sort type.
    # @sort == :page ? [url, link] : [link, url]
    def get_key_value(url, link)
      case @sort
      when :page
        [url, link]
      when :link
        [link, url]
      else
        # Bug fix: previously interpolated `sort` (the public sorting
        # method, which mutates the collections and returns a Hash)
        # instead of the @sort ivar being validated.
        raise "Unsupported sort type: #{@sort}"
      end
    end
  end
end
# frozen_string_literal: true

module BrokenLinkFinder
  # Class responsible for reporting in a HTML format.
  #
  # NOTE: the redundant pass-through initialize (which only called `super`
  # with identical arguments) has been removed; Reporter#initialize is
  # inherited unchanged. stream is any Object responding to :puts and :print.
  class HTMLReporter < Reporter
    # Pretty print a report detailing the full link summary.
    # Writes HTML to the stream given at construction and returns nil.
    def call(broken_verbose: true, ignored_verbose: false)
      puts '<div class="broken_link_finder_report">'

      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      puts '</div>'

      nil
    end

    private

    # Report a summary of the overall crawl.
    def report_crawl_summary
      puts format(
        '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
        @crawl_stats[:url],
        @crawl_stats[:url],
        @crawl_stats[:num_pages],
        @crawl_stats[:num_links],
        @crawl_stats[:duration]&.truncate(2)
      )
    end

    # Report a summary of the broken links.
    def report_broken_links(verbose: true)
      puts '<div class="broken_links">'

      if @broken_links.empty?
        puts_summary 'Good news, there are no broken links!', type: :broken
      else
        num_pages, num_links = get_hash_stats(@broken_links)
        puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken

        @broken_links.each do |key, values|
          puts_group(key, type: :broken) # Puts the opening <p> element.
          puts_group_values(values, type: :broken, verbose: verbose,
                                    hint: 'remove --concise to see them all')
          puts '</p>'
        end
      end

      puts '</div>'
    end

    # Report a summary of the ignored links.
    def report_ignored_links(verbose: false)
      puts '<div class="ignored_links">'

      if @ignored_links.any?
        num_pages, num_links = get_hash_stats(@ignored_links)
        puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored

        @ignored_links.each do |key, values|
          puts_group(key, type: :ignored) # Puts the opening <p> element.
          puts_group_values(values, type: :ignored, verbose: verbose,
                                    hint: 'use --verbose to see them all')
          puts '</p>'
        end
      end

      puts '</div>'
    end

    # Print a group's values, truncating to NUM_VALUES unless verbose.
    # hint is appended to the truncation summary line. Extracted to remove
    # the duplicated logic between the broken and ignored report paths.
    def puts_group_values(values, type:, verbose:, hint:)
      if verbose || (values.length <= NUM_VALUES)
        values.each { |value| puts_group_item value, type: type }
      else # Only print N values and summarise the rest.
        NUM_VALUES.times { |i| puts_group_item values[i], type: type }

        objects = sort_by_page? ? 'link(s)' : 'page(s)'
        puts "+ #{values.length - NUM_VALUES} other #{objects}, #{hint}<br />"
      end
    end

    # Print the one-line summary <p> element for the given group type.
    def puts_summary(text, type:)
      klass = (type == :broken) ? 'broken_links_summary' : 'ignored_links_summary'
      puts "<p class=\"#{klass}\">#{text}</p>"
    end

    # Print the opening <p> element and heading message for a link group.
    def puts_group(link, type:)
      href = build_url(link)
      a_element = "<a href=\"#{href}\">#{link}</a>"

      case type
      when :broken
        msg = sort_by_page? ?
          "The following broken links were found on '#{a_element}':" :
          "The broken link '#{a_element}' was found on the following pages:"
        klass = 'broken_links_group'
      when :ignored
        msg = sort_by_page? ?
          "The following links were ignored on '#{a_element}':" :
          "The link '#{a_element}' was ignored on the following pages:"
        klass = 'ignored_links_group'
      else
        raise "type: must be :broken or :ignored, not: #{type}"
      end

      puts "<p class=\"#{klass}\">"
      puts msg + '<br />'
    end

    # Print a single value within a group as an anchor element.
    def puts_group_item(value, type:)
      klass = (type == :broken) ? 'broken_links_group_item' : 'ignored_links_group_item'
      puts "<a class=\"#{klass}\" href=\"#{build_url(value)}\">#{value}</a><br />"
    end

    # Resolve a link to its absolute URL where known, else the link as-is.
    def build_url(link)
      href = @broken_link_map[link]
      href || link
    end

    alias_method :report, :call
  end
end
# frozen_string_literal: true

module BrokenLinkFinder
  # Generic reporter class to be inherited from by format specific reporters.
  class Reporter
    # The amount of pages/links to display when verbose is false.
    NUM_VALUES = 3

    # Returns a new Reporter instance.
    # stream is any Object that responds to :puts and :print.
    # sort is :page or :link, controlling the key => [values] orientation
    # of broken_links/ignored_links.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      unless stream.respond_to?(:puts) && stream.respond_to?(:print)
        raise 'stream must respond_to? :puts and :print'
      end
      # Message capitalised for consistency with LinkManager's equivalent check.
      raise "Sort by either :page or :link, not #{sort}" \
      unless %i[page link].include?(sort)

      @stream          = stream
      @sort            = sort
      @broken_links    = broken_links
      @ignored_links   = ignored_links
      @broken_link_map = broken_link_map
      @crawl_stats     = crawl_stats
    end

    # Pretty print a report detailing the full link summary.
    # Abstract - raises NotImplementedError (the Ruby idiom for an
    # abstract method) unless overridden by a format specific subclass.
    def call(broken_verbose: true, ignored_verbose: false)
      raise NotImplementedError, 'Not implemented by parent class'
    end

    protected

    # Return true if the sort is by page.
    def sort_by_page?
      @sort == :page
    end

    # Returns the key/value statistics of hash e.g. the number of keys and
    # combined values. The hash should be of the format: { 'str' => [...] }.
    # Use like: `num_pages, num_links = get_hash_stats(links)`.
    def get_hash_stats(hash)
      num_keys   = hash.keys.length
      num_values = hash.values.flatten.uniq.length

      sort_by_page? ?
        [num_keys, num_values] :
        [num_values, num_keys]
    end

    # Prints the text. Defaults to a blank line.
    def print(text = '')
      @stream.print(text)
    end

    # Prints the text + \n. Defaults to a blank line.
    def puts(text = '')
      @stream.puts(text)
    end

    # Prints text + \n\n.
    def putsn(text)
      puts(text)
      puts
    end

    # Prints \n + text + \n.
    def nputs(text)
      puts
      puts(text)
    end

    alias_method :report, :call
  end
end
# frozen_string_literal: true

module BrokenLinkFinder
  # Class responsible for reporting in a text format.
  #
  # NOTE: the redundant pass-through initialize (which only called `super`
  # with identical arguments) has been removed; Reporter#initialize is
  # inherited unchanged. stream is any Object responding to :puts and :print.
  class TextReporter < Reporter
    # Pretty print a report detailing the full link summary.
    # Writes plain text to the stream given at construction and returns nil.
    def call(broken_verbose: true, ignored_verbose: false)
      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      nil
    end

    private

    # Report a summary of the overall crawl.
    def report_crawl_summary
      puts "Crawled #{@crawl_stats[:url]}"
      putsn format(
        '%s page(s) containing %s unique link(s) in %s seconds',
        @crawl_stats[:num_pages],
        @crawl_stats[:num_links],
        @crawl_stats[:duration]&.truncate(2)
      )
    end

    # Report a summary of the broken links.
    def report_broken_links(verbose: true)
      if @broken_links.empty?
        puts 'Good news, there are no broken links!'
      else
        num_pages, num_links = get_hash_stats(@broken_links)
        puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"

        @broken_links.each do |key, values|
          msg = sort_by_page? ?
            "The following broken links were found on '#{key}':" :
            "The broken link '#{key}' was found on the following pages:"
          nputs msg

          puts_values(values, verbose: verbose,
                              hint: 'remove --concise to see them all')
        end
      end
    end

    # Report a summary of the ignored links.
    def report_ignored_links(verbose: false)
      if @ignored_links.any?
        num_pages, num_links = get_hash_stats(@ignored_links)
        nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"

        @ignored_links.each do |key, values|
          msg = sort_by_page? ?
            "The following links were ignored on '#{key}':" :
            "The link '#{key}' was ignored on the following pages:"
          nputs msg

          puts_values(values, verbose: verbose,
                              hint: 'use --verbose to see them all')
        end
      end
    end

    # Print a group's values, truncating to NUM_VALUES unless verbose.
    # hint is appended to the truncation summary line. Extracted to remove
    # the duplicated logic between the broken and ignored report paths.
    def puts_values(values, verbose:, hint:)
      if verbose || (values.length <= NUM_VALUES)
        values.each { |value| puts value }
      else # Only print N values and summarise the rest.
        NUM_VALUES.times { |i| puts values[i] }

        objects = sort_by_page? ? 'link(s)' : 'page(s)'
        puts "+ #{values.length - NUM_VALUES} other #{objects}, #{hint}"
      end
    end

    alias_method :report, :call
  end
end
# frozen_string_literal: true

# Define a method on each doc for recording unparsable links.
# Unparsable links are recorded as broken links by Finder.
class Wgit::Document
  # Lazily initialised list of links that failed to parse as Wgit::Urls.
  def unparsable_links
    @unparsable_links ||= []
  end
end

# Returns a Wgit::Url or nil (if link is unparsable).
# A proc is preferable to a function to avoid polluting the global namespace.
# On parse failure the raw link is recorded on the doc and nil is returned,
# which the extractor block below strips out via compact.
parse_link = lambda do |doc, link|
  Wgit::Url.new(link)
rescue StandardError
  doc.unparsable_links << link
  nil
end

# We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
Wgit::Document.define_extractor(
  :all_links,
  '//*/@href | //*/@src', # Any element's href or src attribute URL.
  singleton: false,
  text_content_only: true
) do |links, doc|
  links
    .uniq
    .map { |link| parse_link.call(doc, link) }
    .compact
end