broken_link_finder 0.9.4 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +52 -0
- data/Gemfile.lock +51 -38
- data/README.md +65 -29
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +14 -3
- data/lib/broken_link_finder.rb +8 -2
- data/lib/broken_link_finder/finder.rb +131 -132
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- data/lib/broken_link_finder/xpath.rb +14 -0
- metadata +21 -15
- data/lib/broken_link_finder/reporter.rb +0 -116
@@ -0,0 +1,137 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BrokenLinkFinder
|
4
|
+
# Class responsible for reporting in an HTML format.
|
5
|
+
class HTMLReporter < Reporter
|
6
|
+
# Returns a new HTMLReporter instance.
|
7
|
+
# stream is any Object that responds to :puts and :print.
|
8
|
+
def initialize(stream, sort,
|
9
|
+
broken_links, ignored_links,
|
10
|
+
broken_link_map, crawl_stats)
|
11
|
+
super
|
12
|
+
end
|
13
|
+
|
14
|
+
# Pretty print a report detailing the full link summary.
|
15
|
+
def call(broken_verbose: true, ignored_verbose: false)
|
16
|
+
puts '<div class="broken_link_finder_report">'
|
17
|
+
|
18
|
+
report_crawl_summary
|
19
|
+
report_broken_links(verbose: broken_verbose)
|
20
|
+
report_ignored_links(verbose: ignored_verbose)
|
21
|
+
|
22
|
+
puts '</div>'
|
23
|
+
|
24
|
+
nil
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
# Report a summary of the overall crawl.
|
30
|
+
def report_crawl_summary
|
31
|
+
puts format(
|
32
|
+
'<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
|
33
|
+
@crawl_stats[:url],
|
34
|
+
@crawl_stats[:url],
|
35
|
+
@crawl_stats[:num_pages],
|
36
|
+
@crawl_stats[:num_links],
|
37
|
+
@crawl_stats[:duration]&.truncate(2)
|
38
|
+
)
|
39
|
+
end
|
40
|
+
|
41
|
+
# Report a summary of the broken links.
|
42
|
+
def report_broken_links(verbose: true)
|
43
|
+
puts '<div class="broken_links">'
|
44
|
+
|
45
|
+
if @broken_links.empty?
|
46
|
+
puts_summary 'Good news, there are no broken links!', type: :broken
|
47
|
+
else
|
48
|
+
num_pages, num_links = get_hash_stats(@broken_links)
|
49
|
+
puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken
|
50
|
+
|
51
|
+
@broken_links.each do |key, values|
|
52
|
+
puts_group(key, type: :broken) # Puts the opening <p> element.
|
53
|
+
|
54
|
+
if verbose || (values.length <= NUM_VALUES)
|
55
|
+
values.each { |value| puts_group_item value, type: :broken }
|
56
|
+
else # Only print N values and summarise the rest.
|
57
|
+
NUM_VALUES.times { |i| puts_group_item values[i], type: :broken }
|
58
|
+
|
59
|
+
objects = sort_by_page? ? 'link(s)' : 'page(s)'
|
60
|
+
puts "+ #{values.length - NUM_VALUES} other #{objects}, remove --concise to see them all<br />"
|
61
|
+
end
|
62
|
+
|
63
|
+
puts '</p>'
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
puts '</div>'
|
68
|
+
end
|
69
|
+
|
70
|
+
# Report a summary of the ignored links.
|
71
|
+
def report_ignored_links(verbose: false)
|
72
|
+
puts '<div class="ignored_links">'
|
73
|
+
|
74
|
+
if @ignored_links.any?
|
75
|
+
num_pages, num_links = get_hash_stats(@ignored_links)
|
76
|
+
puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
|
77
|
+
|
78
|
+
@ignored_links.each do |key, values|
|
79
|
+
puts_group(key, type: :ignored) # Puts the opening <p> element.
|
80
|
+
|
81
|
+
if verbose || (values.length <= NUM_VALUES)
|
82
|
+
values.each { |value| puts_group_item value, type: :ignored }
|
83
|
+
else # Only print N values and summarise the rest.
|
84
|
+
NUM_VALUES.times { |i| puts_group_item values[i], type: :ignored }
|
85
|
+
|
86
|
+
objects = sort_by_page? ? 'link(s)' : 'page(s)'
|
87
|
+
puts "+ #{values.length - NUM_VALUES} other #{objects}, use --verbose to see them all<br />"
|
88
|
+
end
|
89
|
+
|
90
|
+
puts '</p>'
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
puts '</div>'
|
95
|
+
end
|
96
|
+
|
97
|
+
def puts_summary(text, type:)
|
98
|
+
klass = (type == :broken) ? 'broken_links_summary' : 'ignored_links_summary'
|
99
|
+
puts "<p class=\"#{klass}\">#{text}</p>"
|
100
|
+
end
|
101
|
+
|
102
|
+
def puts_group(link, type:)
|
103
|
+
href = build_url(link)
|
104
|
+
a_element = "<a href=\"#{href}\">#{link}</a>"
|
105
|
+
|
106
|
+
case type
|
107
|
+
when :broken
|
108
|
+
msg = sort_by_page? ?
|
109
|
+
"The following broken links were found on '#{a_element}':" :
|
110
|
+
"The broken link '#{a_element}' was found on the following pages:"
|
111
|
+
klass = 'broken_links_group'
|
112
|
+
when :ignored
|
113
|
+
msg = sort_by_page? ?
|
114
|
+
"The following links were ignored on '#{a_element}':" :
|
115
|
+
"The link '#{a_element}' was ignored on the following pages:"
|
116
|
+
klass = 'ignored_links_group'
|
117
|
+
else
|
118
|
+
raise "type: must be :broken or :ignored, not: #{type}"
|
119
|
+
end
|
120
|
+
|
121
|
+
puts "<p class=\"#{klass}\">"
|
122
|
+
puts msg + '<br />'
|
123
|
+
end
|
124
|
+
|
125
|
+
def puts_group_item(value, type:)
|
126
|
+
klass = (type == :broken) ? 'broken_links_group_item' : 'ignored_links_group_item'
|
127
|
+
puts "<a class=\"#{klass}\" href=\"#{build_url(value)}\">#{value}</a><br />"
|
128
|
+
end
|
129
|
+
|
130
|
+
def build_url(link)
|
131
|
+
href = @broken_link_map[link]
|
132
|
+
href || link
|
133
|
+
end
|
134
|
+
|
135
|
+
alias_method :report, :call
|
136
|
+
end
|
137
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BrokenLinkFinder
|
4
|
+
# Generic reporter class to be inherited from by format specific reporters.
|
5
|
+
class Reporter
|
6
|
+
# The number of pages/links to display when verbose is false.
|
7
|
+
NUM_VALUES = 3
|
8
|
+
|
9
|
+
# Returns a new Reporter instance.
|
10
|
+
# stream is any Object that responds to :puts and :print.
|
11
|
+
def initialize(stream, sort,
|
12
|
+
broken_links, ignored_links,
|
13
|
+
broken_link_map, crawl_stats)
|
14
|
+
unless stream.respond_to?(:puts) && stream.respond_to?(:print)
|
15
|
+
raise 'stream must respond_to? :puts and :print'
|
16
|
+
end
|
17
|
+
raise "sort by either :page or :link, not #{sort}" \
|
18
|
+
unless %i[page link].include?(sort)
|
19
|
+
|
20
|
+
@stream = stream
|
21
|
+
@sort = sort
|
22
|
+
@broken_links = broken_links
|
23
|
+
@ignored_links = ignored_links
|
24
|
+
@broken_link_map = broken_link_map
|
25
|
+
@crawl_stats = crawl_stats
|
26
|
+
end
|
27
|
+
|
28
|
+
# Pretty print a report detailing the full link summary.
|
29
|
+
def call(broken_verbose: true, ignored_verbose: false)
|
30
|
+
raise 'Not implemented by parent class'
|
31
|
+
end
|
32
|
+
|
33
|
+
protected
|
34
|
+
|
35
|
+
# Return true if the sort is by page.
|
36
|
+
def sort_by_page?
|
37
|
+
@sort == :page
|
38
|
+
end
|
39
|
+
|
40
|
+
# Returns the key/value statistics of hash e.g. the number of keys and
|
41
|
+
# combined values. The hash should be of the format: { 'str' => [...] }.
|
42
|
+
# Use like: `num_pages, num_links = get_hash_stats(links)`.
|
43
|
+
def get_hash_stats(hash)
|
44
|
+
num_keys = hash.keys.length
|
45
|
+
num_values = hash.values.flatten.uniq.length
|
46
|
+
|
47
|
+
sort_by_page? ?
|
48
|
+
[num_keys, num_values] :
|
49
|
+
[num_values, num_keys]
|
50
|
+
end
|
51
|
+
|
52
|
+
# Prints the text as-is via the stream's print. Defaults to an empty string.
|
53
|
+
def print(text = '')
|
54
|
+
@stream.print(text)
|
55
|
+
end
|
56
|
+
|
57
|
+
# Prints the text + \n. Defaults to a blank line.
|
58
|
+
def puts(text = '')
|
59
|
+
@stream.puts(text)
|
60
|
+
end
|
61
|
+
|
62
|
+
# Prints text + \n\n.
|
63
|
+
def putsn(text)
|
64
|
+
puts(text)
|
65
|
+
puts
|
66
|
+
end
|
67
|
+
|
68
|
+
# Prints \n + text + \n.
|
69
|
+
def nputs(text)
|
70
|
+
puts
|
71
|
+
puts(text)
|
72
|
+
end
|
73
|
+
|
74
|
+
alias_method :report, :call
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BrokenLinkFinder
|
4
|
+
# Class responsible for reporting in a text format.
|
5
|
+
class TextReporter < Reporter
|
6
|
+
# Returns a new TextReporter instance.
|
7
|
+
# stream is any Object that responds to :puts and :print.
|
8
|
+
def initialize(stream, sort,
|
9
|
+
broken_links, ignored_links,
|
10
|
+
broken_link_map, crawl_stats)
|
11
|
+
super
|
12
|
+
end
|
13
|
+
|
14
|
+
# Pretty print a report detailing the full link summary.
|
15
|
+
def call(broken_verbose: true, ignored_verbose: false)
|
16
|
+
report_crawl_summary
|
17
|
+
report_broken_links(verbose: broken_verbose)
|
18
|
+
report_ignored_links(verbose: ignored_verbose)
|
19
|
+
|
20
|
+
nil
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
# Report a summary of the overall crawl.
|
26
|
+
def report_crawl_summary
|
27
|
+
puts "Crawled #{@crawl_stats[:url]}"
|
28
|
+
putsn format(
|
29
|
+
'%s page(s) containing %s unique link(s) in %s seconds',
|
30
|
+
@crawl_stats[:num_pages],
|
31
|
+
@crawl_stats[:num_links],
|
32
|
+
@crawl_stats[:duration]&.truncate(2)
|
33
|
+
)
|
34
|
+
end
|
35
|
+
|
36
|
+
# Report a summary of the broken links.
|
37
|
+
def report_broken_links(verbose: true)
|
38
|
+
if @broken_links.empty?
|
39
|
+
puts 'Good news, there are no broken links!'
|
40
|
+
else
|
41
|
+
num_pages, num_links = get_hash_stats(@broken_links)
|
42
|
+
puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"
|
43
|
+
|
44
|
+
@broken_links.each do |key, values|
|
45
|
+
msg = sort_by_page? ?
|
46
|
+
"The following broken links were found on '#{key}':" :
|
47
|
+
"The broken link '#{key}' was found on the following pages:"
|
48
|
+
nputs msg
|
49
|
+
|
50
|
+
if verbose || (values.length <= NUM_VALUES)
|
51
|
+
values.each { |value| puts value }
|
52
|
+
else # Only print N values and summarise the rest.
|
53
|
+
NUM_VALUES.times { |i| puts values[i] }
|
54
|
+
|
55
|
+
objects = sort_by_page? ? 'link(s)' : 'page(s)'
|
56
|
+
puts "+ #{values.length - NUM_VALUES} other #{objects}, remove --concise to see them all"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
# Report a summary of the ignored links.
|
63
|
+
def report_ignored_links(verbose: false)
|
64
|
+
if @ignored_links.any?
|
65
|
+
num_pages, num_links = get_hash_stats(@ignored_links)
|
66
|
+
nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"
|
67
|
+
|
68
|
+
@ignored_links.each do |key, values|
|
69
|
+
msg = sort_by_page? ?
|
70
|
+
"The following links were ignored on '#{key}':" :
|
71
|
+
"The link '#{key}' was ignored on the following pages:"
|
72
|
+
nputs msg
|
73
|
+
|
74
|
+
if verbose || (values.length <= NUM_VALUES)
|
75
|
+
values.each { |value| puts value }
|
76
|
+
else # Only print N values and summarise the rest.
|
77
|
+
NUM_VALUES.times { |i| puts values[i] }
|
78
|
+
|
79
|
+
objects = sort_by_page? ? 'link(s)' : 'page(s)'
|
80
|
+
puts "+ #{values.length - NUM_VALUES} other #{objects}, use --verbose to see them all"
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
alias_method :report, :call
|
87
|
+
end
|
88
|
+
end
|
@@ -1,11 +1,31 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
#
|
4
|
-
|
3
|
+
# Define a method on each doc for recording unparsable links.
|
4
|
+
# Unparsable links are recorded as broken links by Finder.
|
5
|
+
class Wgit::Document
|
6
|
+
def unparsable_links
|
7
|
+
@unparsable_links ||= []
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
# Returns a Wgit::Url or nil (if link is unparsable).
|
12
|
+
# A proc is preferable to a function to avoid polluting the global namespace.
|
13
|
+
parse_link = lambda do |doc, link|
|
14
|
+
Wgit::Url.new(link)
|
15
|
+
rescue StandardError
|
16
|
+
doc.unparsable_links << link
|
17
|
+
nil
|
18
|
+
end
|
19
|
+
|
20
|
+
# Define a custom extractor for all page links we're interested in checking.
|
21
|
+
Wgit::Document.define_extractor(
|
5
22
|
:all_links,
|
6
|
-
|
23
|
+
lambda { BrokenLinkFinder::link_xpath },
|
7
24
|
singleton: false,
|
8
25
|
text_content_only: true
|
9
|
-
) do |links|
|
10
|
-
links
|
26
|
+
) do |links, doc|
|
27
|
+
links
|
28
|
+
.uniq
|
29
|
+
.map { |link| parse_link.call(doc, link) }
|
30
|
+
.compact
|
11
31
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BrokenLinkFinder
|
4
|
+
# Extract all the Document's <body> links e.g. <a>, <img>, <script> etc.
|
5
|
+
DEFAULT_LINK_XPATH = '/html/body//*/@href | /html/body//*/@src'
|
6
|
+
|
7
|
+
@link_xpath = DEFAULT_LINK_XPATH
|
8
|
+
|
9
|
+
class << self
|
10
|
+
# The xpath used to extract links from a crawled page.
|
11
|
+
# Can be overridden as required.
|
12
|
+
attr_accessor :link_xpath
|
13
|
+
end
|
14
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.4
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-04-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -72,14 +72,14 @@ dependencies:
|
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '13.0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '13.0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: webmock
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,42 +100,42 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: 0.20
|
103
|
+
version: '0.20'
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: 0.20
|
110
|
+
version: '0.20'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: thread
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: 0.2
|
117
|
+
version: '0.2'
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: 0.2
|
124
|
+
version: '0.2'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
126
|
name: wgit
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
129
|
- - "~>"
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: 0.
|
131
|
+
version: '0.10'
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: 0.
|
138
|
+
version: '0.10'
|
139
139
|
description: Finds a website's broken links using the 'wgit' gem and reports back
|
140
140
|
to you with a summary.
|
141
141
|
email: michael.telford@live.com
|
@@ -159,15 +159,22 @@ files:
|
|
159
159
|
- exe/broken_link_finder
|
160
160
|
- lib/broken_link_finder.rb
|
161
161
|
- lib/broken_link_finder/finder.rb
|
162
|
-
- lib/broken_link_finder/reporter.rb
|
162
|
+
- lib/broken_link_finder/link_manager.rb
|
163
|
+
- lib/broken_link_finder/reporter/html_reporter.rb
|
164
|
+
- lib/broken_link_finder/reporter/reporter.rb
|
165
|
+
- lib/broken_link_finder/reporter/text_reporter.rb
|
163
166
|
- lib/broken_link_finder/version.rb
|
164
167
|
- lib/broken_link_finder/wgit_extensions.rb
|
168
|
+
- lib/broken_link_finder/xpath.rb
|
165
169
|
- load.rb
|
166
170
|
homepage: https://github.com/michaeltelford/broken-link-finder
|
167
171
|
licenses:
|
168
172
|
- MIT
|
169
173
|
metadata:
|
170
174
|
source_code_uri: https://github.com/michaeltelford/broken-link-finder
|
175
|
+
changelog_uri: https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md
|
176
|
+
bug_tracker_uri: https://github.com/michaeltelford/broken-link-finder/issues
|
177
|
+
documentation_uri: https://www.rubydoc.info/gems/broken_link_finder
|
171
178
|
allowed_push_host: https://rubygems.org
|
172
179
|
post_install_message: Added the executable 'broken_link_finder' to $PATH
|
173
180
|
rdoc_options: []
|
@@ -184,9 +191,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
184
191
|
- !ruby/object:Gem::Version
|
185
192
|
version: '0'
|
186
193
|
requirements: []
|
187
|
-
|
188
|
-
|
189
|
-
signing_key:
|
194
|
+
rubygems_version: 3.1.2
|
195
|
+
signing_key:
|
190
196
|
specification_version: 4
|
191
197
|
summary: Finds a website's broken links and reports back to you with a summary.
|
192
198
|
test_files: []
|