broken_link_finder 0.9.4 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +52 -0
- data/Gemfile.lock +51 -38
- data/README.md +65 -29
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +14 -3
- data/lib/broken_link_finder.rb +8 -2
- data/lib/broken_link_finder/finder.rb +131 -132
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- data/lib/broken_link_finder/xpath.rb +14 -0
- metadata +21 -15
- data/lib/broken_link_finder/reporter.rb +0 -116
@@ -0,0 +1,137 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'cgi' # Standard library; provides CGI.escapeHTML for escaping report values.

module BrokenLinkFinder
  # Class responsible for reporting in an HTML format.
  #
  # Page URLs and link values are HTML escaped before being written to the
  # stream, so a value containing characters such as `"`, `<` or `&` cannot
  # break out of the surrounding attribute/element in the generated markup.
  class HTMLReporter < Reporter
    # Returns a new HTMLReporter instance.
    # stream is any Object that responds to :puts and :print.
    # All arguments are passed unchanged to Reporter#initialize.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      super
    end

    # Pretty print a report detailing the full link summary, wrapped in a
    # single <div class="broken_link_finder_report"> element.
    #
    # broken_verbose  - when false, only NUM_VALUES broken values are listed
    #                   per group.
    # ignored_verbose - when false, only NUM_VALUES ignored values are listed
    #                   per group.
    #
    # Always returns nil.
    def call(broken_verbose: true, ignored_verbose: false)
      puts '<div class="broken_link_finder_report">'

      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      puts '</div>'

      nil
    end

    private

    # Report a summary of the overall crawl.
    def report_crawl_summary
      url = escape(@crawl_stats[:url])

      puts format(
        '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
        url,
        url,
        @crawl_stats[:num_pages],
        @crawl_stats[:num_links],
        @crawl_stats[:duration]&.truncate(2)
      )
    end

    # Report a summary of the broken links.
    def report_broken_links(verbose: true)
      puts '<div class="broken_links">'

      if @broken_links.empty?
        puts_summary 'Good news, there are no broken links!', type: :broken
      else
        num_pages, num_links = get_hash_stats(@broken_links)
        puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken

        @broken_links.each do |key, values|
          puts_group(key, type: :broken) # Puts the opening <p> element.
          puts_group_items(values, type: :broken, verbose: verbose,
                                   hint: 'remove --concise to see them all')
          puts '</p>'
        end
      end

      puts '</div>'
    end

    # Report a summary of the ignored links.
    def report_ignored_links(verbose: false)
      puts '<div class="ignored_links">'

      if @ignored_links.any?
        num_pages, num_links = get_hash_stats(@ignored_links)
        puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored

        @ignored_links.each do |key, values|
          puts_group(key, type: :ignored) # Puts the opening <p> element.
          puts_group_items(values, type: :ignored, verbose: verbose,
                                   hint: 'use --verbose to see them all')
          puts '</p>'
        end
      end

      puts '</div>'
    end

    # Puts the items of a group. All values are printed when verbose is true
    # or there are no more than NUM_VALUES of them; otherwise only the first
    # NUM_VALUES are printed followed by a line counting the rest (ending
    # with the given hint).
    def puts_group_items(values, type:, verbose:, hint:)
      if verbose || (values.length <= NUM_VALUES)
        values.each { |value| puts_group_item value, type: type }
        return
      end

      # Only print N values and summarise the rest.
      values.first(NUM_VALUES).each { |value| puts_group_item value, type: type }

      objects = sort_by_page? ? 'link(s)' : 'page(s)'
      puts "+ #{values.length - NUM_VALUES} other #{objects}, #{hint}<br />"
    end

    # Puts a summary paragraph. The text is emitted as is, so callers must
    # only pass markup-safe text.
    def puts_summary(text, type:)
      klass = type == :broken ? 'broken_links_summary' : 'ignored_links_summary'
      puts "<p class=\"#{klass}\">#{text}</p>"
    end

    # Puts the opening <p> element of a group along with its heading line.
    # The caller is responsible for closing the element.
    # Raises a RuntimeError unless type is :broken or :ignored.
    def puts_group(link, type:)
      href = escape(build_url(link))
      a_element = "<a href=\"#{href}\">#{escape(link)}</a>"

      case type
      when :broken
        msg = if sort_by_page?
                "The following broken links were found on '#{a_element}':"
              else
                "The broken link '#{a_element}' was found on the following pages:"
              end
        klass = 'broken_links_group'
      when :ignored
        msg = if sort_by_page?
                "The following links were ignored on '#{a_element}':"
              else
                "The link '#{a_element}' was ignored on the following pages:"
              end
        klass = 'ignored_links_group'
      else
        raise "type: must be :broken or :ignored, not: #{type}"
      end

      puts "<p class=\"#{klass}\">"
      puts "#{msg}<br />"
    end

    # Puts a single group item as an anchor followed by a line break.
    def puts_group_item(value, type:)
      klass = type == :broken ? 'broken_links_group_item' : 'ignored_links_group_item'
      href = escape(build_url(value))

      puts "<a class=\"#{klass}\" href=\"#{href}\">#{escape(value)}</a><br />"
    end

    # Returns the URL mapped to link in @broken_link_map, falling back to the
    # link itself when there is no (truthy) mapping.
    def build_url(link)
      @broken_link_map[link] || link
    end

    # Returns value (converted with #to_s) with the HTML special characters
    # escaped, making it safe for element content and quoted attributes.
    def escape(value)
      CGI.escapeHTML(value.to_s)
    end

    alias_method :report, :call
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BrokenLinkFinder
  # Base class for the format specific reporters, which inherit from it and
  # provide their own #call implementation.
  class Reporter
    # The number of pages/links to display when verbose is false.
    NUM_VALUES = 3

    # Returns a new Reporter instance.
    # stream is any Object that responds to :puts and :print.
    # sort must be either :page or :link.
    # Raises a RuntimeError when stream or sort is invalid.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      writable = stream.respond_to?(:puts) && stream.respond_to?(:print)
      raise 'stream must respond_to? :puts and :print' unless writable

      unless %i[page link].include?(sort)
        raise "sort by either :page or :link, not #{sort}"
      end

      @stream          = stream
      @sort            = sort
      @broken_links    = broken_links
      @ignored_links   = ignored_links
      @broken_link_map = broken_link_map
      @crawl_stats     = crawl_stats
    end

    # Pretty print a report detailing the full link summary.
    # This parent implementation always raises; subclasses override it.
    def call(broken_verbose: true, ignored_verbose: false)
      raise 'Not implemented by parent class'
    end

    protected

    # Return true if the sort is by page.
    def sort_by_page?
      @sort == :page
    end

    # Returns a two element Array built from the number of keys in hash and
    # the number of unique (flattened) values in hash. When sorting by page
    # the order is [keys, values], otherwise it is [values, keys].
    # The hash should be of the format: { 'str' => [...] }.
    # Use like: `num_pages, num_links = get_hash_stats(links)`.
    def get_hash_stats(hash)
      key_count   = hash.keys.size
      value_count = hash.values.flatten.uniq.size

      if sort_by_page?
        [key_count, value_count]
      else
        [value_count, key_count]
      end
    end

    # Prints the text to the stream. Defaults to an empty String.
    def print(text = '')
      @stream.print(text)
    end

    # Puts the text to the stream. Defaults to an empty String.
    def puts(text = '')
      @stream.puts(text)
    end

    # Puts the text followed by an empty puts.
    def putsn(text)
      puts(text)
      puts
    end

    # Puts an empty line followed by the text.
    def nputs(text)
      puts
      puts(text)
    end

    alias_method :report, :call
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BrokenLinkFinder
  # Reporter which writes the link summary to the stream as plain text.
  class TextReporter < Reporter
    # Returns a new TextReporter instance.
    # stream is any Object that responds to :puts and :print.
    # Every argument is handed to Reporter#initialize.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      super(stream, sort, broken_links, ignored_links, broken_link_map, crawl_stats)
    end

    # Writes the crawl summary, then the broken links, then the ignored
    # links. The verbose flags control whether each group is listed in full
    # or cut off after NUM_VALUES entries. Always returns nil.
    def call(broken_verbose: true, ignored_verbose: false)
      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      nil
    end

    private

    # Writes the crawled URL followed by the page/link/duration totals.
    def report_crawl_summary
      stats = @crawl_stats
      puts "Crawled #{stats[:url]}"

      totals = format(
        '%s page(s) containing %s unique link(s) in %s seconds',
        stats[:num_pages],
        stats[:num_links],
        stats[:duration]&.truncate(2)
      )
      putsn totals
    end

    # Writes the broken links, grouped by key, or a success message when
    # there aren't any.
    def report_broken_links(verbose: true)
      if @broken_links.empty?
        puts 'Good news, there are no broken links!'
      else
        num_pages, num_links = get_hash_stats(@broken_links)
        puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"

        @broken_links.each do |key, values|
          heading =
            if sort_by_page?
              "The following broken links were found on '#{key}':"
            else
              "The broken link '#{key}' was found on the following pages:"
            end
          nputs heading

          if !verbose && values.length > NUM_VALUES
            # Concise mode: list the first N values and count the remainder.
            values.take(NUM_VALUES).each { |item| puts item }

            noun = sort_by_page? ? 'link(s)' : 'page(s)'
            hidden = values.length - NUM_VALUES
            puts "+ #{hidden} other #{noun}, remove --concise to see them all"
          else
            values.each { |item| puts item }
          end
        end
      end
    end

    # Writes the ignored links, grouped by key. Nothing is written when
    # there aren't any.
    def report_ignored_links(verbose: false)
      if @ignored_links.any?
        num_pages, num_links = get_hash_stats(@ignored_links)
        nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"

        @ignored_links.each do |key, values|
          heading =
            if sort_by_page?
              "The following links were ignored on '#{key}':"
            else
              "The link '#{key}' was ignored on the following pages:"
            end
          nputs heading

          if !verbose && values.length > NUM_VALUES
            # Non-verbose mode: list the first N values and count the remainder.
            values.take(NUM_VALUES).each { |item| puts item }

            noun = sort_by_page? ? 'link(s)' : 'page(s)'
            hidden = values.length - NUM_VALUES
            puts "+ #{hidden} other #{noun}, use --verbose to see them all"
          else
            values.each { |item| puts item }
          end
        end
      end
    end

    alias_method :report, :call
  end
end
|
@@ -1,11 +1,31 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
#
|
4
|
-
|
3
|
+
# Give every Wgit::Document a lazily created Array in which unparsable links
# can be collected. Unparsable links are recorded as broken links by Finder.
Wgit::Document.class_eval do
  # Returns the Array of unparsable links for this doc, creating it on first use.
  def unparsable_links
    @unparsable_links ||= []
  end
end
|
10
|
+
|
11
|
+
# Builds a Wgit::Url from link and returns it. If that raises a StandardError
# the link is appended to doc.unparsable_links and nil is returned instead.
# A proc is preferable to a function to avoid polluting the global namespace.
parse_link = lambda do |doc, link|
  begin
    Wgit::Url.new(link)
  rescue StandardError
    doc.unparsable_links.push(link)
    nil
  end
end
|
19
|
+
|
20
|
+
# Define a custom :all_links extractor for the page links we're interested in
# checking. The xpath is supplied through a lambda which reads
# BrokenLinkFinder.link_xpath each time it is called. The extracted links are
# de-duplicated and run through parse_link, and nil results are dropped.
Wgit::Document.define_extractor(
  :all_links,
  -> { BrokenLinkFinder.link_xpath },
  singleton: false,
  text_content_only: true
) do |links, doc|
  unique_links = links.uniq
  parsed_links = unique_links.map { |link| parse_link.call(doc, link) }
  parsed_links.compact
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BrokenLinkFinder
  # The default xpath; it selects every href and src attribute found beneath
  # the Document's <body> e.g. on <a>, <img>, <script> etc.
  DEFAULT_LINK_XPATH = '/html/body//*/@href | /html/body//*/@src'

  @link_xpath = DEFAULT_LINK_XPATH

  # Returns the xpath used to extract links from a crawled page.
  def self.link_xpath
    @link_xpath
  end

  # Overrides the xpath used to extract links from a crawled page.
  def self.link_xpath=(xpath)
    @link_xpath = xpath
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-04-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -72,14 +72,14 @@ dependencies:
|
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '13.0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '13.0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: webmock
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,42 +100,42 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: 0.20
|
103
|
+
version: '0.20'
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: 0.20
|
110
|
+
version: '0.20'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: thread
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: 0.2
|
117
|
+
version: '0.2'
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: 0.2
|
124
|
+
version: '0.2'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
126
|
name: wgit
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
129
|
- - "~>"
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: 0.
|
131
|
+
version: '0.10'
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: 0.
|
138
|
+
version: '0.10'
|
139
139
|
description: Finds a website's broken links using the 'wgit' gem and reports back
|
140
140
|
to you with a summary.
|
141
141
|
email: michael.telford@live.com
|
@@ -159,15 +159,22 @@ files:
|
|
159
159
|
- exe/broken_link_finder
|
160
160
|
- lib/broken_link_finder.rb
|
161
161
|
- lib/broken_link_finder/finder.rb
|
162
|
-
- lib/broken_link_finder/
|
162
|
+
- lib/broken_link_finder/link_manager.rb
|
163
|
+
- lib/broken_link_finder/reporter/html_reporter.rb
|
164
|
+
- lib/broken_link_finder/reporter/reporter.rb
|
165
|
+
- lib/broken_link_finder/reporter/text_reporter.rb
|
163
166
|
- lib/broken_link_finder/version.rb
|
164
167
|
- lib/broken_link_finder/wgit_extensions.rb
|
168
|
+
- lib/broken_link_finder/xpath.rb
|
165
169
|
- load.rb
|
166
170
|
homepage: https://github.com/michaeltelford/broken-link-finder
|
167
171
|
licenses:
|
168
172
|
- MIT
|
169
173
|
metadata:
|
170
174
|
source_code_uri: https://github.com/michaeltelford/broken-link-finder
|
175
|
+
changelog_uri: https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md
|
176
|
+
bug_tracker_uri: https://github.com/michaeltelford/broken-link-finder/issues
|
177
|
+
documentation_uri: https://www.rubydoc.info/gems/broken_link_finder
|
171
178
|
allowed_push_host: https://rubygems.org
|
172
179
|
post_install_message: Added the executable 'broken_link_finder' to $PATH
|
173
180
|
rdoc_options: []
|
@@ -184,9 +191,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
184
191
|
- !ruby/object:Gem::Version
|
185
192
|
version: '0'
|
186
193
|
requirements: []
|
187
|
-
|
188
|
-
|
189
|
-
signing_key:
|
194
|
+
rubygems_version: 3.1.2
|
195
|
+
signing_key:
|
190
196
|
specification_version: 4
|
191
197
|
summary: Finds a website's broken links and reports back to you with a summary.
|
192
198
|
test_files: []
|