broken_link_finder 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Gemfile.lock +9 -9
- data/README.md +4 -3
- data/bin/console +8 -6
- data/broken_link_finder.gemspec +1 -1
- data/exe/broken_link_finder +7 -1
- data/lib/broken_link_finder/finder.rb +80 -47
- data/lib/broken_link_finder/reporter/html_reporter.rb +7 -5
- data/lib/broken_link_finder/reporter/reporter.rb +1 -2
- data/lib/broken_link_finder/reporter/text_reporter.rb +5 -4
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +24 -4
- metadata +4 -4
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77094cfe9d0790770b5c34b86bc578fc65d0e425dc089c4fda41a3c587af6e00
+  data.tar.gz: 40f7f59411744bcd010c46bf4bdc17e59dbd4bd191bc33613c2d2bf269ba79ba
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4f3f4b7720d24c393fb844ed62159870bde4dd4222a8b0ec69b4fff7b96086b909df63834ef56a1b71e2d68e4ec319357f208273b3be79d81c982602b7a53b8a
+  data.tar.gz: c5af07c99199765688672ca396e19db9093ca0cd32c5a9e37810909787892c5070c729b275fcc6a126ea71bf2bdab4c5616b98643f74a3266775d112d4a8c274
data/CHANGELOG.md
CHANGED

@@ -9,6 +9,18 @@
 - ...
 ---
 
+## v0.11.0
+### Added
+- Additional crawl statistics.
+- Exit code handling to executable. `0` for success, `1` for an error scenario.
+### Changed/Removed
+- Updated the report formats slightly bringing various improvements such as the total number of links crawled etc.
+### Fixed
+- Bug in html report, summary url is now an `<a>` link.
+- Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
+- Bug causing an error on crawling unparsable/invalid URL's.
+---
+
 ## v0.10.0
 ### Added
 - A `--html` flag to the `crawl` executable command which produces a HTML report (instead of text).
data/Gemfile.lock
CHANGED

@@ -1,33 +1,33 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.10.0)
+    broken_link_finder (0.11.0)
       thor (~> 0.20)
       thread (~> 0.2)
-      wgit (~> 0.
+      wgit (~> 0.8)
 
 GEM
   remote: https://rubygems.org/
   specs:
     addressable (2.6.0)
       public_suffix (>= 2.0.2, < 4.0)
-    bson (4.
+    bson (4.7.1)
     byebug (11.0.1)
     coderay (1.1.2)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
     ethon (0.12.0)
       ffi (>= 1.3.0)
-    ffi (1.
+    ffi (1.12.1)
     hashdiff (1.0.0)
     maxitest (3.4.0)
       minitest (>= 5.0.0, < 5.13.0)
     method_source (0.9.2)
     mini_portile2 (2.4.0)
     minitest (5.12.2)
-    mongo (2.11.
-      bson (>= 4.
-    nokogiri (1.10.
+    mongo (2.11.3)
+      bson (>= 4.4.2, < 5.0.0)
+    nokogiri (1.10.7)
       mini_portile2 (~> 2.4.0)
     pry (0.12.2)
       coderay (~> 1.1.0)

@@ -43,7 +43,7 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff (>= 0.4.0, < 2.0.0)
-    wgit (0.
+    wgit (0.8.0)
       addressable (~> 2.6)
       mongo (~> 2.9)
       nokogiri (~> 1.10)

@@ -65,4 +65,4 @@ RUBY VERSION
    ruby 2.5.3p105
 
 BUNDLED WITH
-   2.
+   2.1.4
data/README.md
CHANGED

@@ -91,9 +91,10 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
 If broken links are found then the output will look something like:
 
 ```text
-Crawled http://txti.es
+Crawled http://txti.es
+7 page(s) containing 32 unique link(s) in 6.82 seconds
 
-Found 6 broken link(s) across 2 page(s):
+Found 6 unique broken link(s) across 2 page(s):
 
 The following broken links were found on 'http://txti.es/about':
 http://twitter.com/thebarrytone

@@ -105,7 +106,7 @@ The following broken links were found on 'http://txti.es/how':
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
 
-Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
 
 The following links were ignored on 'http://txti.es':
 tel:+13174562564
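For context, a minimal library-level sketch that produces a report like the one above. This is an assumed usage example, not part of the diff; the method names (`crawl_site`, `report`) match those defined in `finder.rb` further down:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new        # Defaults to sort: :page.
finder.crawl_site('http://txti.es')  # Or crawl_url/crawl_page for a single page.
finder.report                        # Writes the text report shown above to STDOUT.
```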
data/bin/console
CHANGED

@@ -23,12 +23,14 @@ end
 # You can add fixtures and/or initialization code here...
 reload
 
-url
-by_page
-by_link
-finder
+def url; @url ||= 'http://txti.es/'; end
+def by_page; @by_page ||= Finder.new; end
+def by_link; @by_link ||= Finder.new(sort: :link); end
+def finder; @finder ||= by_page; end
 
 # Start the console.
-puts
+puts
+puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+puts
 
-
+Pry.start
data/broken_link_finder.gemspec
CHANGED
data/exe/broken_link_finder
CHANGED

@@ -29,13 +29,19 @@ class BrokenLinkFinderCLI < Thor
       broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
-
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end
 
   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end
 
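With the exit codes added above, the executable becomes scriptable; a short sketch of consuming them from Ruby (the `crawl` subcommand is the one documented in the v0.10.0 changelog entry, and the URL is illustrative):

```ruby
# Kernel#system returns true when the child process exits 0, false otherwise,
# so v0.11.0's `exit 0`/`exit 1` let a wrapper branch on crawl success.
ok = system('broken_link_finder', 'crawl', 'http://txti.es')
puts ok ? 'Crawl completed without errors' : "Crawl failed (exit #{$?.exitstatus})"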
data/lib/broken_link_finder/finder.rb
CHANGED

@@ -2,6 +2,7 @@
 
 module BrokenLinkFinder
   DEFAULT_MAX_THREADS = 100
+  SERVER_WAIT_TIME = 0.5
 
   # Alias for BrokenLinkFinder::Finder.new.
   def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)

@@ -24,25 +25,28 @@ module BrokenLinkFinder
       reset_crawl
     end
 
-    # Clear/empty the link collection
+    # Clear/empty the link collection objects.
     def reset_crawl
-      @broken_links
-      @ignored_links
-      @all_broken_links
-      @all_intact_links
-      @
-      @
+      @broken_links = {} # Used for mapping pages to broken links.
+      @ignored_links = {} # Used for mapping pages to ignored links.
+      @all_broken_links = Set.new # Used to prevent crawling a broken link twice.
+      @all_intact_links = Set.new # Used to prevent crawling an intact link twice.
+      @all_ignored_links = Set.new # Used for building crawl statistics.
+      @broken_link_map = {} # Maps a link to its absolute (crawlable) form.
+      @crawl_stats = {} # Records crawl stats e.g. duration etc.
     end
 
-    # Finds broken links within a single page and
-    #
+    # Finds broken links within a single page and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
    def crawl_url(url)
       reset_crawl
 
       start = Time.now
       url = url.to_url
-
+
+      # We dup the url to avoid recording any redirects.
+      doc = @crawler.crawl(url.dup)
 
       # Ensure the given page url is valid.
       raise "Invalid or broken URL: #{url}" unless doc

@@ -57,9 +61,8 @@ module BrokenLinkFinder
       @broken_links.any?
     end
 
-    # Finds broken links within an entire site and
-    #
-    # at least one broken link was found and an Array of all pages crawled.
+    # Finds broken links within an entire site and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
     def crawl_site(url)
       reset_crawl

@@ -70,7 +73,8 @@ module BrokenLinkFinder
       crawled = Set.new
 
       # Crawl the site's HTML web pages looking for links.
-
+      # We dup the url to avoid recording any redirects.
+      externals = @crawler.crawl_site(url.dup) do |doc|
         crawled << doc.url
         next unless doc
 

@@ -91,22 +95,23 @@ module BrokenLinkFinder
       @broken_links.any?
     end
 
-    #
+    # Outputs the link report into a stream e.g. STDOUT or a file,
     # anything that respond_to? :puts. Defaults to STDOUT.
-    def report(stream = STDOUT,
-
+    def report(stream = STDOUT, type: :text,
+               broken_verbose: true, ignored_verbose: false)
       klass = case type
               when :text
                 BrokenLinkFinder::TextReporter
               when :html
                 BrokenLinkFinder::HTMLReporter
               else
-                raise "type: must be :text or :html, not: :#{type}"
+                raise "The type: must be :text or :html, not: :#{type}"
              end
 
-      reporter = klass.new(stream, @sort,
-      @
-
+      reporter = klass.new(stream, @sort,
+                           @broken_links, @ignored_links,
+                           @broken_link_map, @crawl_stats)
+      reporter.call(broken_verbose: broken_verbose,
                     ignored_verbose: ignored_verbose)
     end
 

@@ -114,25 +119,28 @@ module BrokenLinkFinder
 
     # Finds which links are unsupported or broken and records the details.
     def find_broken_links(page)
+      process_unparsable_links(page) # Record them as broken.
+
       links = get_supported_links(page)
 
       # Iterate over the supported links checking if they're broken or not.
       links.each do |link|
-        # Skip if the link has been
+        # Skip if the link has been encountered previously.
         next if @all_intact_links.include?(link)
 
         if @all_broken_links.include?(link)
-
+          # The link has already been proven broken so simply record it.
+          append_broken_link(page, link, map: false)
           next
         end
 
-        # The link hasn't been
+        # The link hasn't been encountered before so we crawl it.
         link_doc = crawl_link(page, link)
 
-        # Determine if the crawled link is broken or not.
+        # Determine if the crawled link is broken or not and record it.
        if link_broken?(link_doc)
-          append_broken_link(page
-        else
+          append_broken_link(page, link)
+        else # Record it as being intact.
           @lock.synchronize { @all_intact_links << link }
         end
       end

@@ -140,14 +148,31 @@ module BrokenLinkFinder
       nil
     end
 
+    # Record each unparsable link as a broken link.
+    def process_unparsable_links(doc)
+      doc.unparsable_links.each do |link|
+        append_broken_link(doc, link, map: false)
+        @broken_link_map[link] = link
+      end
+    end
+
     # Implements a retry mechanism for each of the broken links found.
     # Removes any broken links found to be working OK.
     def retry_broken_links
-      sleep(
+      sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
 
-      @broken_link_map.
-
-
+      @broken_link_map.select! do |link, href|
+        # Don't retry unparsable links (which are Strings).
+        next(true) unless href.is_a?(Wgit::Url)
+
+        doc = @crawler.crawl(href.dup)
+
+        if link_broken?(doc)
+          true
+        else
+          remove_broken_link(link)
+          false
+        end
       end
     end
 

@@ -166,7 +191,7 @@ module BrokenLinkFinder
     # Make the link absolute and crawl it, returning its Wgit::Document.
     def crawl_link(doc, link)
       link = link.prefix_base(doc)
-      @crawler.crawl(link)
+      @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
     end
 
     # Return if the crawled link is broken or not.

@@ -175,8 +200,9 @@ module BrokenLinkFinder
     end
 
     # Returns true if the link is/contains a broken anchor/fragment.
+    # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
     def has_broken_anchor(doc)
-      raise 'link document is nil' unless doc
+      raise 'The link document is nil' unless doc
 
       fragment = doc.url.fragment
       return false if fragment.nil? || fragment.empty?

@@ -184,22 +210,22 @@ module BrokenLinkFinder
       doc.xpath("//*[@id='#{fragment}']").empty?
     end
 
-    # Append key => [value] to
-    # If
-    def append_broken_link(
-      key, value = get_key_value(url, link)
+    # Append key => [value] to the broken link collections.
+    # If map: true, then the link will also be recorded in @broken_link_map.
+    def append_broken_link(doc, link, map: true)
+      key, value = get_key_value(doc.url, link)
 
       @lock.synchronize do
         @broken_links[key] = [] unless @broken_links[key]
         @broken_links[key] << value
 
-        @all_broken_links
+        @all_broken_links << link
 
-        @broken_link_map[link] = link.prefix_base(doc) if
+        @broken_link_map[link] = link.prefix_base(doc) if map
       end
     end
 
-    # Remove the
+    # Remove the broken link from the necessary collections.
     def remove_broken_link(link)
       @lock.synchronize do
         if @sort == :page

@@ -214,13 +240,15 @@ module BrokenLinkFinder
       end
     end
 
-    # Append key => [value] to
+    # Append key => [value] to the ignored link collections.
     def append_ignored_link(url, link)
       key, value = get_key_value(url, link)
 
       @lock.synchronize do
         @ignored_links[key] = [] unless @ignored_links[key]
         @ignored_links[key] << value
+
+        @all_ignored_links << link
       end
     end
 

@@ -249,13 +277,18 @@ module BrokenLinkFinder
       @ignored_links.each { |_k, v| v.sort! }
     end
 
-    # Sets
+    # Sets various statistics about the crawl and its links.
     def set_crawl_stats(url:, pages_crawled:, start:)
-      @crawl_stats[:url]
-      @crawl_stats[:pages_crawled]
-      @crawl_stats[:num_pages]
-      @crawl_stats[:num_links]
-
+      @crawl_stats[:url] = url
+      @crawl_stats[:pages_crawled] = pages_crawled
+      @crawl_stats[:num_pages] = pages_crawled.size
+      @crawl_stats[:num_links] = (
+        @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+      )
+      @crawl_stats[:num_broken_links] = @all_broken_links.size
+      @crawl_stats[:num_intact_links] = @all_intact_links.size
+      @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+      @crawl_stats[:duration] = Time.now - start
     end
 
     alias crawl_page crawl_url
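The expanded `set_crawl_stats` above is what feeds the new "Additional crawl statistics" changelog entry. A sketch of reading the stats after a crawl (the key names are those set above; `Finder#crawl_stats` is the reader referenced in the changelog):

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_url('http://txti.es')

stats = finder.crawl_stats
puts "Crawled #{stats[:url]}"
puts "#{stats[:num_pages]} page(s) containing #{stats[:num_links]} " \
     "unique link(s) in #{stats[:duration]&.truncate(2)} seconds"
puts "Broken: #{stats[:num_broken_links]}, intact: #{stats[:num_intact_links]}, " \
     "ignored: #{stats[:num_ignored_links]}"
```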
data/lib/broken_link_finder/reporter/html_reporter.rb
CHANGED

@@ -28,9 +28,11 @@ module BrokenLinkFinder
     # Report a summary of the overall crawl.
     def report_crawl_summary
       puts format(
-        '<p class="crawl_summary">Crawled %s (%s
+        '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
+        @crawl_stats[:url],
         @crawl_stats[:url],
         @crawl_stats[:num_pages],
+        @crawl_stats[:num_links],
         @crawl_stats[:duration]&.truncate(2)
       )
     end

@@ -43,7 +45,7 @@ module BrokenLinkFinder
         puts_summary 'Good news, there are no broken links!', type: :broken
       else
         num_pages, num_links = get_hash_stats(@broken_links)
-        puts_summary "Found #{num_links} broken link(s) across #{num_pages} page(s):", type: :broken
+        puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken
 
         @broken_links.each do |key, values|
           puts_group(key, type: :broken) # Puts the opening <p> element.

@@ -70,7 +72,7 @@ module BrokenLinkFinder
 
       if @ignored_links.any?
         num_pages, num_links = get_hash_stats(@ignored_links)
-        puts_summary "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
+        puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
 
         @ignored_links.each do |key, values|
           puts_group(key, type: :ignored) # Puts the opening <p> element.

@@ -125,8 +127,8 @@ module BrokenLinkFinder
     end
 
     def build_url(link)
-
-
+      href = @broken_link_map[link]
+      href || link
     end
 
     alias_method :report, :call
data/lib/broken_link_finder/reporter/reporter.rb
CHANGED

@@ -42,8 +42,7 @@ module BrokenLinkFinder
     # Use like: `num_pages, num_links = get_hash_stats(links)`.
     def get_hash_stats(hash)
       num_keys = hash.keys.length
-
-      num_values = sort_by_page? ? values.length : values.uniq.length
+      num_values = hash.values.flatten.uniq.length
 
       sort_by_page? ?
         [num_keys, num_values] :
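The new `num_values` line is why the reports now say "unique": links are deduplicated across pages before counting. A small illustration with made-up data:

```ruby
# Hypothetical hash mapping each page to the broken links found on it.
hash = {
  'http://txti.es/about' => ['http://twitter.com/thebarrytone', 'http://dodgy.link'],
  'http://txti.es/how'   => ['http://dodgy.link']
}

num_keys   = hash.keys.length                # => 2 page(s)
num_values = hash.values.flatten.uniq.length # => 2 unique link(s), not 3
```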
data/lib/broken_link_finder/reporter/text_reporter.rb
CHANGED

@@ -23,10 +23,11 @@ module BrokenLinkFinder
 
     # Report a summary of the overall crawl.
     def report_crawl_summary
+      puts "Crawled #{@crawl_stats[:url]}"
       putsn format(
-        '
-        @crawl_stats[:url],
+        '%s page(s) containing %s unique link(s) in %s seconds',
         @crawl_stats[:num_pages],
+        @crawl_stats[:num_links],
         @crawl_stats[:duration]&.truncate(2)
       )
     end

@@ -37,7 +38,7 @@ module BrokenLinkFinder
         puts 'Good news, there are no broken links!'
       else
         num_pages, num_links = get_hash_stats(@broken_links)
-        puts "Found #{num_links} broken link(s) across #{num_pages} page(s):"
+        puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"
 
         @broken_links.each do |key, values|
           msg = sort_by_page? ?

@@ -61,7 +62,7 @@ module BrokenLinkFinder
     def report_ignored_links(verbose: false)
       if @ignored_links.any?
         num_pages, num_links = get_hash_stats(@ignored_links)
-        nputs "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:"
+        nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"
 
         @ignored_links.each do |key, values|
           msg = sort_by_page? ?
data/lib/broken_link_finder/wgit_extensions.rb
CHANGED

@@ -1,11 +1,31 @@
 # frozen_string_literal: true
 
-#
+# Define a method on each doc for recording unparsable links.
+# Unparsable links are recorded as broken links by Finder.
+class Wgit::Document
+  def unparsable_links
+    @unparsable_links ||= []
+  end
+end
+
+# Returns a Wgit::Url or nil (if link is unparsable).
+# A proc is preferrable to a function to avoid polluting the global namespace.
+parse_link = lambda do |doc, link|
+  Wgit::Url.new(link)
+rescue StandardError
+  doc.unparsable_links << link
+  nil
+end
+
+# We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
 Wgit::Document.define_extension(
   :all_links,
-  '//*/@href | //*/@src', # Any element
+  '//*/@href | //*/@src', # Any element's href or src attribute URL.
   singleton: false,
   text_content_only: true
-) do |links|
-  links
+) do |links, doc|
+  links
+    .uniq
+    .map { |link| parse_link.call(doc, link) }
+    .compact
 end
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: broken_link_finder
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.11.0
 platform: ruby
 authors:
 - Michael Telford
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2020-01-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler

@@ -128,14 +128,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.8'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.8'
 description: Finds a website's broken links using the 'wgit' gem and reports back
   to you with a summary.
 email: michael.telford@live.com