broken_link_finder 0.10.0 → 0.11.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Gemfile.lock +9 -9
- data/README.md +4 -3
- data/bin/console +8 -6
- data/broken_link_finder.gemspec +1 -1
- data/exe/broken_link_finder +7 -1
- data/lib/broken_link_finder/finder.rb +80 -47
- data/lib/broken_link_finder/reporter/html_reporter.rb +7 -5
- data/lib/broken_link_finder/reporter/reporter.rb +1 -2
- data/lib/broken_link_finder/reporter/text_reporter.rb +5 -4
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +24 -4
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77094cfe9d0790770b5c34b86bc578fc65d0e425dc089c4fda41a3c587af6e00
+  data.tar.gz: 40f7f59411744bcd010c46bf4bdc17e59dbd4bd191bc33613c2d2bf269ba79ba
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4f3f4b7720d24c393fb844ed62159870bde4dd4222a8b0ec69b4fff7b96086b909df63834ef56a1b71e2d68e4ec319357f208273b3be79d81c982602b7a53b8a
+  data.tar.gz: c5af07c99199765688672ca396e19db9093ca0cd32c5a9e37810909787892c5070c729b275fcc6a126ea71bf2bdab4c5616b98643f74a3266775d112d4a8c274
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,18 @@
 - ...
 ---
 
+## v0.11.0
+### Added
+- Additional crawl statistics.
+- Exit code handling to the executable. `0` for success, `1` for an error scenario.
+### Changed/Removed
+- Updated the report formats slightly, bringing various improvements such as the total number of links crawled etc.
+### Fixed
+- Bug in the HTML report; the summary URL is now an `<a>` link.
+- Bug in `Finder@broken_link_map` URLs and the `Finder#crawl_stats[:url]` URL during redirects.
+- Bug causing an error when crawling unparsable/invalid URLs.
+---
+
 ## v0.10.0
 ### Added
 - A `--html` flag to the `crawl` executable command which produces a HTML report (instead of text).
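The new crawl statistics are available programmatically as well as in the reports. A minimal sketch of reading them after a crawl, assuming `crawl_stats` is exposed as a reader (as the changelog's `Finder#crawl_stats[:url]` reference suggests); the stat keys come from the `set_crawl_stats` change to `finder.rb` below:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site 'http://txti.es' # Returns true if broken links were found.

finder.crawl_stats[:num_links]        # Total unique links crawled.
finder.crawl_stats[:num_broken_links] # How many of those were broken.
finder.crawl_stats[:duration]         # Crawl duration in seconds.
```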
data/Gemfile.lock
CHANGED
@@ -1,33 +1,33 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.10.0)
+    broken_link_finder (0.11.0)
       thor (~> 0.20)
       thread (~> 0.2)
-      wgit (~> 0.
+      wgit (~> 0.8)
 
 GEM
   remote: https://rubygems.org/
   specs:
     addressable (2.6.0)
       public_suffix (>= 2.0.2, < 4.0)
-    bson (4.
+    bson (4.7.1)
     byebug (11.0.1)
     coderay (1.1.2)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
     ethon (0.12.0)
       ffi (>= 1.3.0)
-    ffi (1.
+    ffi (1.12.1)
     hashdiff (1.0.0)
     maxitest (3.4.0)
       minitest (>= 5.0.0, < 5.13.0)
     method_source (0.9.2)
     mini_portile2 (2.4.0)
     minitest (5.12.2)
-    mongo (2.11.
-      bson (>= 4.
-    nokogiri (1.10.
+    mongo (2.11.3)
+      bson (>= 4.4.2, < 5.0.0)
+    nokogiri (1.10.7)
       mini_portile2 (~> 2.4.0)
     pry (0.12.2)
       coderay (~> 1.1.0)
@@ -43,7 +43,7 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff (>= 0.4.0, < 2.0.0)
-    wgit (0.
+    wgit (0.8.0)
       addressable (~> 2.6)
       mongo (~> 2.9)
       nokogiri (~> 1.10)
@@ -65,4 +65,4 @@ RUBY VERSION
    ruby 2.5.3p105
 
 BUNDLED WITH
-   2.
+   2.1.4
data/README.md
CHANGED
@@ -91,9 +91,10 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
 If broken links are found then the output will look something like:
 
 ```text
-Crawled http://txti.es
+Crawled http://txti.es
+7 page(s) containing 32 unique link(s) in 6.82 seconds
 
-Found 6 broken link(s) across 2 page(s):
+Found 6 unique broken link(s) across 2 page(s):
 
 The following broken links were found on 'http://txti.es/about':
 http://twitter.com/thebarrytone
@@ -105,7 +106,7 @@ The following broken links were found on 'http://txti.es/how':
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
 
-Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
 
 The following links were ignored on 'http://txti.es':
 tel:+13174562564
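For reference, a minimal sketch that produces the report above, using the README's own example URL and the `report` defaults from the `finder.rb` changes below:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site 'http://txti.es'
finder.report # Writes the text report (shown above) to STDOUT.
```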
data/bin/console
CHANGED
@@ -23,12 +23,14 @@ end
 # You can add fixtures and/or initialization code here...
 reload
 
-url
-by_page
-by_link
-finder
+def url; @url ||= 'http://txti.es/'; end
+def by_page; @by_page ||= Finder.new; end
+def by_link; @by_link ||= Finder.new(sort: :link); end
+def finder; @finder ||= by_page; end
 
 # Start the console.
-puts
+puts
+puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+puts
 
-
+Pry.start
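With those helpers defined, a console session might look like the following sketch (hypothetical usage inside the Pry session, building on the `url`, `by_link` and `finder` methods just added):

```ruby
# Inside the bin/console Pry session:
finder.crawl_site(url)  # Crawl http://txti.es/ with results sorted by page.
by_link.crawl_site(url) # Or crawl with results sorted by link instead.
finder.report           # Print the text report to STDOUT.
```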
data/broken_link_finder.gemspec
CHANGED
data/exe/broken_link_finder
CHANGED
@@ -29,13 +29,19 @@ class BrokenLinkFinderCLI < Thor
       broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
-
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end
 
   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end
 
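The new exit codes make the executable easy to script against. A small sketch, assuming the gem's executable is installed on the PATH:

```ruby
# `system` returns true when the command exits with status 0, false otherwise.
ok = system('broken_link_finder crawl http://txti.es')

puts ok ? 'Crawl succeeded' : "Crawl failed (exit status #{$?.exitstatus})"
```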
data/lib/broken_link_finder/finder.rb
CHANGED
@@ -2,6 +2,7 @@
 
 module BrokenLinkFinder
   DEFAULT_MAX_THREADS = 100
+  SERVER_WAIT_TIME = 0.5
 
   # Alias for BrokenLinkFinder::Finder.new.
   def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
@@ -24,25 +25,28 @@ module BrokenLinkFinder
     reset_crawl
   end
 
-  # Clear/empty the link collection
+  # Clear/empty the link collection objects.
   def reset_crawl
-    @broken_links
-    @ignored_links
-    @all_broken_links
-    @all_intact_links
-    @
-    @
+    @broken_links = {} # Used for mapping pages to broken links.
+    @ignored_links = {} # Used for mapping pages to ignored links.
+    @all_broken_links = Set.new # Used to prevent crawling a broken link twice.
+    @all_intact_links = Set.new # Used to prevent crawling an intact link twice.
+    @all_ignored_links = Set.new # Used for building crawl statistics.
+    @broken_link_map = {} # Maps a link to its absolute (crawlable) form.
+    @crawl_stats = {} # Records crawl stats e.g. duration etc.
   end
 
-  # Finds broken links within a single page and
-  #
+  # Finds broken links within a single page and records them.
+  # Returns true if at least one broken link was found.
   # Access the broken links afterwards with Finder#broken_links.
   def crawl_url(url)
     reset_crawl
 
     start = Time.now
     url = url.to_url
-
+
+    # We dup the url to avoid recording any redirects.
+    doc = @crawler.crawl(url.dup)
 
     # Ensure the given page url is valid.
     raise "Invalid or broken URL: #{url}" unless doc
@@ -57,9 +61,8 @@ module BrokenLinkFinder
     @broken_links.any?
   end
 
-  # Finds broken links within an entire site and
-  #
-  # at least one broken link was found and an Array of all pages crawled.
+  # Finds broken links within an entire site and records them.
+  # Returns true if at least one broken link was found.
   # Access the broken links afterwards with Finder#broken_links.
   def crawl_site(url)
     reset_crawl
@@ -70,7 +73,8 @@ module BrokenLinkFinder
     crawled = Set.new
 
     # Crawl the site's HTML web pages looking for links.
-
+    # We dup the url to avoid recording any redirects.
+    externals = @crawler.crawl_site(url.dup) do |doc|
       crawled << doc.url
       next unless doc
 
@@ -91,22 +95,23 @@ module BrokenLinkFinder
     @broken_links.any?
   end
 
-  #
+  # Outputs the link report into a stream e.g. STDOUT or a file,
   # anything that respond_to? :puts. Defaults to STDOUT.
-  def report(stream = STDOUT,
-
+  def report(stream = STDOUT, type: :text,
+             broken_verbose: true, ignored_verbose: false)
     klass = case type
             when :text
               BrokenLinkFinder::TextReporter
             when :html
              BrokenLinkFinder::HTMLReporter
            else
-              raise "type: must be :text or :html, not: :#{type}"
+              raise "The type: must be :text or :html, not: :#{type}"
            end
 
-    reporter = klass.new(stream, @sort,
-    @
-
+    reporter = klass.new(stream, @sort,
+                         @broken_links, @ignored_links,
+                         @broken_link_map, @crawl_stats)
+    reporter.call(broken_verbose: broken_verbose,
                   ignored_verbose: ignored_verbose)
   end
 
@@ -114,25 +119,28 @@ module BrokenLinkFinder
 
   # Finds which links are unsupported or broken and records the details.
   def find_broken_links(page)
+    process_unparsable_links(page) # Record them as broken.
+
     links = get_supported_links(page)
 
     # Iterate over the supported links checking if they're broken or not.
     links.each do |link|
-      # Skip if the link has been
+      # Skip if the link has been encountered previously.
      next if @all_intact_links.include?(link)
 
      if @all_broken_links.include?(link)
-
+        # The link has already been proven broken so simply record it.
+        append_broken_link(page, link, map: false)
        next
      end
 
-      # The link hasn't been
+      # The link hasn't been encountered before so we crawl it.
      link_doc = crawl_link(page, link)
 
-      # Determine if the crawled link is broken or not.
+      # Determine if the crawled link is broken or not and record it.
      if link_broken?(link_doc)
-        append_broken_link(page
-      else
+        append_broken_link(page, link)
+      else # Record it as being intact.
        @lock.synchronize { @all_intact_links << link }
      end
    end
@@ -140,14 +148,31 @@ module BrokenLinkFinder
     nil
   end
 
+  # Record each unparsable link as a broken link.
+  def process_unparsable_links(doc)
+    doc.unparsable_links.each do |link|
+      append_broken_link(doc, link, map: false)
+      @broken_link_map[link] = link
+    end
+  end
+
   # Implements a retry mechanism for each of the broken links found.
   # Removes any broken links found to be working OK.
   def retry_broken_links
-    sleep(
+    sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
 
-    @broken_link_map.
-
-
+    @broken_link_map.select! do |link, href|
+      # Don't retry unparsable links (which are Strings).
+      next(true) unless href.is_a?(Wgit::Url)
+
+      doc = @crawler.crawl(href.dup)
+
+      if link_broken?(doc)
+        true
+      else
+        remove_broken_link(link)
+        false
+      end
    end
  end
 
@@ -166,7 +191,7 @@ module BrokenLinkFinder
   # Make the link absolute and crawl it, returning its Wgit::Document.
   def crawl_link(doc, link)
     link = link.prefix_base(doc)
-    @crawler.crawl(link)
+    @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
   end
 
   # Return if the crawled link is broken or not.
@@ -175,8 +200,9 @@ module BrokenLinkFinder
   end
 
   # Returns true if the link is/contains a broken anchor/fragment.
+  # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
   def has_broken_anchor(doc)
-    raise 'link document is nil' unless doc
+    raise 'The link document is nil' unless doc
 
     fragment = doc.url.fragment
     return false if fragment.nil? || fragment.empty?
@@ -184,22 +210,22 @@ module BrokenLinkFinder
     doc.xpath("//*[@id='#{fragment}']").empty?
   end
 
-  # Append key => [value] to
-  # If
-  def append_broken_link(
-    key, value = get_key_value(url, link)
+  # Append key => [value] to the broken link collections.
+  # If map: true, then the link will also be recorded in @broken_link_map.
+  def append_broken_link(doc, link, map: true)
+    key, value = get_key_value(doc.url, link)
 
     @lock.synchronize do
       @broken_links[key] = [] unless @broken_links[key]
       @broken_links[key] << value
 
-      @all_broken_links
+      @all_broken_links << link
 
-      @broken_link_map[link] = link.prefix_base(doc) if
+      @broken_link_map[link] = link.prefix_base(doc) if map
     end
   end
 
-  # Remove the
+  # Remove the broken link from the necessary collections.
   def remove_broken_link(link)
     @lock.synchronize do
       if @sort == :page
@@ -214,13 +240,15 @@ module BrokenLinkFinder
     end
   end
 
-  # Append key => [value] to
+  # Append key => [value] to the ignored link collections.
   def append_ignored_link(url, link)
     key, value = get_key_value(url, link)
 
     @lock.synchronize do
       @ignored_links[key] = [] unless @ignored_links[key]
       @ignored_links[key] << value
+
+      @all_ignored_links << link
     end
   end
 
@@ -249,13 +277,18 @@ module BrokenLinkFinder
     @ignored_links.each { |_k, v| v.sort! }
   end
 
-  # Sets
+  # Sets various statistics about the crawl and its links.
   def set_crawl_stats(url:, pages_crawled:, start:)
-    @crawl_stats[:url]
-    @crawl_stats[:pages_crawled]
-    @crawl_stats[:num_pages]
-    @crawl_stats[:num_links]
-
+    @crawl_stats[:url] = url
+    @crawl_stats[:pages_crawled] = pages_crawled
+    @crawl_stats[:num_pages] = pages_crawled.size
+    @crawl_stats[:num_links] = (
+      @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+    )
+    @crawl_stats[:num_broken_links] = @all_broken_links.size
+    @crawl_stats[:num_intact_links] = @all_intact_links.size
+    @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+    @crawl_stats[:duration] = Time.now - start
  end
 
 alias crawl_page crawl_url
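Putting the updated `Finder#report` signature to use, here is a sketch that writes the HTML report to a file; any stream that responds to `:puts` works, per the method's own comment (the file name is illustrative):

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site 'http://txti.es'

File.open('report.html', 'w') do |file|
  finder.report(file, type: :html, broken_verbose: true, ignored_verbose: false)
end
```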
data/lib/broken_link_finder/reporter/html_reporter.rb
CHANGED
@@ -28,9 +28,11 @@ module BrokenLinkFinder
   # Report a summary of the overall crawl.
   def report_crawl_summary
     puts format(
-      '<p class="crawl_summary">Crawled %s (%s
+      '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
+      @crawl_stats[:url],
       @crawl_stats[:url],
       @crawl_stats[:num_pages],
+      @crawl_stats[:num_links],
       @crawl_stats[:duration]&.truncate(2)
     )
   end
@@ -43,7 +45,7 @@ module BrokenLinkFinder
       puts_summary 'Good news, there are no broken links!', type: :broken
     else
       num_pages, num_links = get_hash_stats(@broken_links)
-      puts_summary "Found #{num_links} broken link(s) across #{num_pages} page(s):", type: :broken
+      puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken
 
       @broken_links.each do |key, values|
         puts_group(key, type: :broken) # Puts the opening <p> element.
@@ -70,7 +72,7 @@ module BrokenLinkFinder
 
     if @ignored_links.any?
       num_pages, num_links = get_hash_stats(@ignored_links)
-      puts_summary "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
+      puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
 
       @ignored_links.each do |key, values|
         puts_group(key, type: :ignored) # Puts the opening <p> element.
@@ -125,8 +127,8 @@ module BrokenLinkFinder
   end
 
   def build_url(link)
-
-
+    href = @broken_link_map[link]
+    href || link
   end
 
   alias_method :report, :call
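The `build_url` change resolves each broken link against `@broken_link_map`, so relative links render as absolute, clickable URLs in the report. A standalone sketch of the lookup with hypothetical data:

```ruby
# Maps each broken link, as found on the page, to its absolute form.
broken_link_map = { 'contact.html' => 'http://txti.es/contact.html' }

def build_url(link, map)
  href = map[link]
  href || link # Fall back to the raw link when there is no mapping.
end

build_url('contact.html', broken_link_map) # => "http://txti.es/contact.html"
build_url('no-entry.html', broken_link_map) # => "no-entry.html" (fallback)
```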
data/lib/broken_link_finder/reporter/reporter.rb
CHANGED
@@ -42,8 +42,7 @@ module BrokenLinkFinder
   # Use like: `num_pages, num_links = get_hash_stats(links)`.
   def get_hash_stats(hash)
     num_keys = hash.keys.length
-
-    num_values = sort_by_page? ? values.length : values.uniq.length
+    num_values = hash.values.flatten.uniq.length
 
     sort_by_page? ?
       [num_keys, num_values] :
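The reworked `get_hash_stats` now counts links uniquely across all pages, matching the new "unique ... link(s)" wording in both reporters. A worked example with hypothetical data:

```ruby
broken_links = {
  'http://txti.es/about' => ['http://twitter.com/thebarrytone', 'http://imgur.com'],
  'http://txti.es/how'   => ['http://imgur.com']
}

num_pages = broken_links.keys.length                # => 2
num_links = broken_links.values.flatten.uniq.length # => 2 (imgur.com counted once)
```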
data/lib/broken_link_finder/reporter/text_reporter.rb
CHANGED
@@ -23,10 +23,11 @@ module BrokenLinkFinder
 
   # Report a summary of the overall crawl.
   def report_crawl_summary
+    puts "Crawled #{@crawl_stats[:url]}"
     putsn format(
-      '
-      @crawl_stats[:url],
+      '%s page(s) containing %s unique link(s) in %s seconds',
       @crawl_stats[:num_pages],
+      @crawl_stats[:num_links],
       @crawl_stats[:duration]&.truncate(2)
     )
   end
@@ -37,7 +38,7 @@ module BrokenLinkFinder
       puts 'Good news, there are no broken links!'
     else
       num_pages, num_links = get_hash_stats(@broken_links)
-      puts "Found #{num_links} broken link(s) across #{num_pages} page(s):"
+      puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"
 
       @broken_links.each do |key, values|
         msg = sort_by_page? ?
@@ -61,7 +62,7 @@ module BrokenLinkFinder
   def report_ignored_links(verbose: false)
     if @ignored_links.any?
       num_pages, num_links = get_hash_stats(@ignored_links)
-      nputs "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:"
+      nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"
 
       @ignored_links.each do |key, values|
         msg = sort_by_page? ?
data/lib/broken_link_finder/wgit_extensions.rb
CHANGED
@@ -1,11 +1,31 @@
 # frozen_string_literal: true
 
-#
+# Define a method on each doc for recording unparsable links.
+# Unparsable links are recorded as broken links by Finder.
+class Wgit::Document
+  def unparsable_links
+    @unparsable_links ||= []
+  end
+end
+
+# Returns a Wgit::Url or nil (if link is unparsable).
+# A proc is preferable to a function to avoid polluting the global namespace.
+parse_link = lambda do |doc, link|
+  Wgit::Url.new(link)
+rescue StandardError
+  doc.unparsable_links << link
+  nil
+end
+
+# We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
 Wgit::Document.define_extension(
   :all_links,
-  '//*/@href | //*/@src', # Any element
+  '//*/@href | //*/@src', # Any element's href or src attribute URL.
   singleton: false,
   text_content_only: true
-) do |links|
-  links
+) do |links, doc|
+  links
+    .uniq
+    .map { |link| parse_link.call(doc, link) }
+    .compact
 end
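A sketch of the extension's behaviour (the URL, HTML and `Wgit::Document.new(url, html)` call are illustrative assumptions): any href that raises on `Wgit::Url.new` ends up in `unparsable_links` instead of `all_links`.

```ruby
require 'wgit'
require 'broken_link_finder' # Assumed to load the extension defined above.

url  = Wgit::Url.new('http://example.com')
html = '<a href="http://example.com/about">About</a>'
doc  = Wgit::Document.new(url, html)

doc.all_links        # => The page's unique href/src values, as Wgit::Urls.
doc.unparsable_links # => Any links that Wgit::Url.new failed to parse.
```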
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: broken_link_finder
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.11.0
 platform: ruby
 authors:
 - Michael Telford
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2020-01-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -128,14 +128,14 @@ dependencies:
   requirements:
   - - "~>"
   - !ruby/object:Gem::Version
-    version: '0.
+    version: '0.8'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
   requirements:
   - - "~>"
   - !ruby/object:Gem::Version
-    version: '0.
+    version: '0.8'
 description: Finds a website's broken links using the 'wgit' gem and reports back
   to you with a summary.
 email: michael.telford@live.com