broken_link_finder 0.10.0 → 0.11.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7a53784c1bd2f75c18b3492ea782b4cc2e229a94f89afcf33b60ef633512554e
- data.tar.gz: 393dca220b7f00d72314c93e7b877e0412afdf784fa2e563bbecb2dc6c6b29f7
+ metadata.gz: 77094cfe9d0790770b5c34b86bc578fc65d0e425dc089c4fda41a3c587af6e00
+ data.tar.gz: 40f7f59411744bcd010c46bf4bdc17e59dbd4bd191bc33613c2d2bf269ba79ba
  SHA512:
- metadata.gz: c0d304e5b0a9258265c5c084c0a6e5819c169ba8eb02b3c6317a37784a9ca12982b0fc520c3cca1060fde60126ee936708d7891c69133c5d72c9c0287a79b3f5
- data.tar.gz: c21a4aec2c077e2617fb625debad28f746148ad98229a27a590a4412601e30759c709aa3a6e6d80e81c16160e16968fc0392181fc9c75e4da06578452f7c5ab6
+ metadata.gz: 4f3f4b7720d24c393fb844ed62159870bde4dd4222a8b0ec69b4fff7b96086b909df63834ef56a1b71e2d68e4ec319357f208273b3be79d81c982602b7a53b8a
+ data.tar.gz: c5af07c99199765688672ca396e19db9093ca0cd32c5a9e37810909787892c5070c729b275fcc6a126ea71bf2bdab4c5616b98643f74a3266775d112d4a8c274
@@ -9,6 +9,18 @@
  - ...
  ---

+ ## v0.11.0
+ ### Added
+ - Additional crawl statistics.
+ - Exit code handling for the executable: `0` for success, `1` for an error scenario.
+ ### Changed/Removed
+ - Updated the report formats slightly, bringing various improvements such as the total number of links crawled.
+ ### Fixed
+ - Bug in the HTML report; the summary URL is now an `<a>` link.
+ - Bug in `Finder@broken_link_map` URLs and the `Finder#crawl_stats[:url]` URL during redirects.
+ - Bug causing an error when crawling unparsable/invalid URLs.
+ ---
+
  ## v0.10.0
  ### Added
  - A `--html` flag to the `crawl` executable command which produces an HTML report (instead of text).
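The crawl statistics and exit-code changes listed above can also be exercised from Ruby. A minimal usage sketch, assuming the `Finder` API shown in the `finder.rb` hunks further down (the target URL is the README's example site):

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new       # sort: :page and max_threads: 100 by default
finder.crawl_site 'http://txti.es'  # or finder.crawl_page for a single page

# The additional statistics added in this release live in the crawl_stats Hash.
puts finder.crawl_stats[:num_links]         # total unique links crawled
puts finder.crawl_stats[:num_broken_links]  # how many of those were broken

finder.report  # text report to STDOUT by default
```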
@@ -1,33 +1,33 @@
  PATH
  remote: .
  specs:
- broken_link_finder (0.10.0)
+ broken_link_finder (0.11.0)
  thor (~> 0.20)
  thread (~> 0.2)
- wgit (~> 0.5)
+ wgit (~> 0.8)

  GEM
  remote: https://rubygems.org/
  specs:
  addressable (2.6.0)
  public_suffix (>= 2.0.2, < 4.0)
- bson (4.6.0)
+ bson (4.7.1)
  byebug (11.0.1)
  coderay (1.1.2)
  crack (0.4.3)
  safe_yaml (~> 1.0.0)
  ethon (0.12.0)
  ffi (>= 1.3.0)
- ffi (1.11.3)
+ ffi (1.12.1)
  hashdiff (1.0.0)
  maxitest (3.4.0)
  minitest (>= 5.0.0, < 5.13.0)
  method_source (0.9.2)
  mini_portile2 (2.4.0)
  minitest (5.12.2)
- mongo (2.11.1)
- bson (>= 4.6.0, < 5.0.0)
- nokogiri (1.10.5)
+ mongo (2.11.3)
+ bson (>= 4.4.2, < 5.0.0)
+ nokogiri (1.10.7)
  mini_portile2 (~> 2.4.0)
  pry (0.12.2)
  coderay (~> 1.1.0)
@@ -43,7 +43,7 @@ GEM
  addressable (>= 2.3.6)
  crack (>= 0.3.2)
  hashdiff (>= 0.4.0, < 2.0.0)
- wgit (0.5.1)
+ wgit (0.8.0)
  addressable (~> 2.6)
  mongo (~> 2.9)
  nokogiri (~> 1.10)
@@ -65,4 +65,4 @@ RUBY VERSION
  ruby 2.5.3p105

  BUNDLED WITH
- 2.0.2
+ 2.1.4
data/README.md CHANGED
@@ -91,9 +91,10 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
  If broken links are found then the output will look something like:

  ```text
- Crawled http://txti.es (7 page(s) in 7.88 seconds)
+ Crawled http://txti.es
+ 7 page(s) containing 32 unique link(s) in 6.82 seconds

- Found 6 broken link(s) across 2 page(s):
+ Found 6 unique broken link(s) across 2 page(s):

  The following broken links were found on 'http://txti.es/about':
  http://twitter.com/thebarrytone
@@ -105,7 +106,7 @@ The following broken links were found on 'http://txti.es/how':
  http://en.wikipedia.org/wiki/Markdown
  http://imgur.com

- Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+ Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:

  The following links were ignored on 'http://txti.es':
  tel:+13174562564
@@ -23,12 +23,14 @@ end
  # You can add fixtures and/or initialization code here...
  reload

- url = 'http://txti.es/'
- by_page = Finder.new
- by_link = Finder.new sort: :link
- finder = by_page
+ def url; @url ||= 'http://txti.es/'; end
+ def by_page; @by_page ||= Finder.new; end
+ def by_link; @by_link ||= Finder.new(sort: :link); end
+ def finder; @finder ||= by_page; end

  # Start the console.
- puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+ puts
+ puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+ puts

- binding.pry
+ Pry.start
@@ -49,5 +49,5 @@ Gem::Specification.new do |spec|

  spec.add_runtime_dependency 'thor', '~> 0.20'
  spec.add_runtime_dependency 'thread', '~> 0.2'
- spec.add_runtime_dependency 'wgit', '~> 0.5'
+ spec.add_runtime_dependency 'wgit', '~> 0.8'
  end
@@ -29,13 +29,19 @@ class BrokenLinkFinderCLI < Thor
  broken_verbose: broken_verbose,
  ignored_verbose: ignored_verbose
  )
- rescue Exception => e
+
+ exit 0
+ rescue StandardError => e
  puts "An error has occurred: #{e.message}"
+
+ exit 1
  end

  desc 'version', 'Display the currently installed version'
  def version
  puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+ exit 0
  end
  end
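With the hunk above, the executable now exits `0` on success and `1` when an error is rescued. A hedged sketch of branching on that status from Ruby; the `broken_link_finder crawl` command name is taken from the README, and the rest is illustrative:

```ruby
# Kernel#system returns true for exit status 0 and false for a non-zero status.
ok = system('broken_link_finder', 'crawl', 'http://txti.es')

if ok
  puts 'crawl finished (exit 0) - inspect the report for broken links'
else
  puts 'crawl errored (exit 1)'
end
```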
 
@@ -2,6 +2,7 @@

  module BrokenLinkFinder
  DEFAULT_MAX_THREADS = 100
+ SERVER_WAIT_TIME = 0.5

  # Alias for BrokenLinkFinder::Finder.new.
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
@@ -24,25 +25,28 @@ module BrokenLinkFinder
  reset_crawl
  end

- # Clear/empty the link collection Hashes.
+ # Clear/empty the link collection objects.
  def reset_crawl
- @broken_links = {}
- @ignored_links = {}
- @all_broken_links = Set.new # Used to prevent crawling a link twice.
- @all_intact_links = Set.new # "
- @broken_link_map = {} # Maps a link to its absolute form.
- @crawl_stats = {} # Records crawl stats e.g. duration etc.
+ @broken_links = {} # Used for mapping pages to broken links.
+ @ignored_links = {} # Used for mapping pages to ignored links.
+ @all_broken_links = Set.new # Used to prevent crawling a broken link twice.
+ @all_intact_links = Set.new # Used to prevent crawling an intact link twice.
+ @all_ignored_links = Set.new # Used for building crawl statistics.
+ @broken_link_map = {} # Maps a link to its absolute (crawlable) form.
+ @crawl_stats = {} # Records crawl stats e.g. duration etc.
  end

- # Finds broken links within a single page and appends them to the
- # @broken_links array. Returns true if at least one broken link was found.
+ # Finds broken links within a single page and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
  def crawl_url(url)
  reset_crawl

  start = Time.now
  url = url.to_url
- doc = @crawler.crawl(url)
+
+ # We dup the url to avoid recording any redirects.
+ doc = @crawler.crawl(url.dup)

  # Ensure the given page url is valid.
  raise "Invalid or broken URL: #{url}" unless doc
@@ -57,9 +61,8 @@ module BrokenLinkFinder
  @broken_links.any?
  end

- # Finds broken links within an entire site and appends them to the
- # @broken_links array. Returns a tuple containing a Boolean of true if
- # at least one broken link was found and an Array of all pages crawled.
+ # Finds broken links within an entire site and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
  def crawl_site(url)
  reset_crawl
@@ -70,7 +73,8 @@ module BrokenLinkFinder
  crawled = Set.new

  # Crawl the site's HTML web pages looking for links.
- externals = @crawler.crawl_site(url) do |doc|
+ # We dup the url to avoid recording any redirects.
+ externals = @crawler.crawl_site(url.dup) do |doc|
  crawled << doc.url
  next unless doc

@@ -91,22 +95,23 @@ module BrokenLinkFinder
  @broken_links.any?
  end

- # Pretty prints the link report into a stream e.g. STDOUT or a file,
+ # Outputs the link report into a stream e.g. STDOUT or a file,
  # anything that respond_to? :puts. Defaults to STDOUT.
- def report(stream = STDOUT,
- type: :text, broken_verbose: true, ignored_verbose: false)
+ def report(stream = STDOUT, type: :text,
+ broken_verbose: true, ignored_verbose: false)
  klass = case type
  when :text
  BrokenLinkFinder::TextReporter
  when :html
  BrokenLinkFinder::HTMLReporter
  else
- raise "type: must be :text or :html, not: :#{type}"
+ raise "The type: must be :text or :html, not: :#{type}"
  end

- reporter = klass.new(stream, @sort, @broken_links,
- @ignored_links, @broken_link_map, @crawl_stats)
- reporter.call(broken_verbose: broken_verbose,
+ reporter = klass.new(stream, @sort,
+ @broken_links, @ignored_links,
+ @broken_link_map, @crawl_stats)
+ reporter.call(broken_verbose: broken_verbose,
  ignored_verbose: ignored_verbose)
  end
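Because `report` writes to anything that responds to `puts`, the `:html` type can be sent straight to a file. A small sketch using the signature above; the file name is arbitrary:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_page 'http://txti.es'

# Any object responding to :puts works as the stream, so a File handle is fine.
File.open('report.html', 'w') do |file|
  finder.report(file, type: :html, broken_verbose: true, ignored_verbose: false)
end
```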
 
@@ -114,25 +119,28 @@ module BrokenLinkFinder

  # Finds which links are unsupported or broken and records the details.
  def find_broken_links(page)
+ process_unparsable_links(page) # Record them as broken.
+
  links = get_supported_links(page)

  # Iterate over the supported links checking if they're broken or not.
  links.each do |link|
- # Skip if the link has been processed previously.
+ # Skip if the link has been encountered previously.
  next if @all_intact_links.include?(link)

  if @all_broken_links.include?(link)
- append_broken_link(page.url, link) # Record on which page.
+ # The link has already been proven broken so simply record it.
+ append_broken_link(page, link, map: false)
  next
  end

- # The link hasn't been processed before so we crawl it.
+ # The link hasn't been encountered before so we crawl it.
  link_doc = crawl_link(page, link)

- # Determine if the crawled link is broken or not.
+ # Determine if the crawled link is broken or not and record it.
  if link_broken?(link_doc)
- append_broken_link(page.url, link, doc: page)
- else
+ append_broken_link(page, link)
+ else # Record it as being intact.
  @lock.synchronize { @all_intact_links << link }
  end
  end
@@ -140,14 +148,31 @@ module BrokenLinkFinder
  nil
  end

+ # Record each unparsable link as a broken link.
+ def process_unparsable_links(doc)
+ doc.unparsable_links.each do |link|
+ append_broken_link(doc, link, map: false)
+ @broken_link_map[link] = link
+ end
+ end
+
  # Implements a retry mechanism for each of the broken links found.
  # Removes any broken links found to be working OK.
  def retry_broken_links
- sleep(0.5) # Give the servers a break, then retry the links.
+ sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.

- @broken_link_map.each do |link, href|
- doc = @crawler.crawl(href)
- remove_broken_link(link) unless link_broken?(doc)
+ @broken_link_map.select! do |link, href|
+ # Don't retry unparsable links (which are Strings).
+ next(true) unless href.is_a?(Wgit::Url)
+
+ doc = @crawler.crawl(href.dup)
+
+ if link_broken?(doc)
+ true
+ else
+ remove_broken_link(link)
+ false
+ end
  end
  end

@@ -166,7 +191,7 @@ module BrokenLinkFinder
  # Make the link absolute and crawl it, returning its Wgit::Document.
  def crawl_link(doc, link)
  link = link.prefix_base(doc)
- @crawler.crawl(link)
+ @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
  end

  # Return if the crawled link is broken or not.
@@ -175,8 +200,9 @@ module BrokenLinkFinder
  end

  # Returns true if the link is/contains a broken anchor/fragment.
+ # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
  def has_broken_anchor(doc)
- raise 'link document is nil' unless doc
+ raise 'The link document is nil' unless doc

  fragment = doc.url.fragment
  return false if fragment.nil? || fragment.empty?
@@ -184,22 +210,22 @@ module BrokenLinkFinder
  doc.xpath("//*[@id='#{fragment}']").empty?
  end

- # Append key => [value] to @broken_links.
- # If doc: is provided then the link will be recorded in absolute form.
- def append_broken_link(url, link, doc: nil)
- key, value = get_key_value(url, link)
+ # Append key => [value] to the broken link collections.
+ # If map: true, then the link will also be recorded in @broken_link_map.
+ def append_broken_link(doc, link, map: true)
+ key, value = get_key_value(doc.url, link)

  @lock.synchronize do
  @broken_links[key] = [] unless @broken_links[key]
  @broken_links[key] << value

- @all_broken_links << link
+ @all_broken_links << link

- @broken_link_map[link] = link.prefix_base(doc) if doc
+ @broken_link_map[link] = link.prefix_base(doc) if map
  end
  end

- # Remove the broken_link from the necessary collections.
+ # Remove the broken link from the necessary collections.
  def remove_broken_link(link)
  @lock.synchronize do
  if @sort == :page
@@ -214,13 +240,15 @@ module BrokenLinkFinder
  end
  end

- # Append key => [value] to @ignored_links.
+ # Append key => [value] to the ignored link collections.
  def append_ignored_link(url, link)
  key, value = get_key_value(url, link)

  @lock.synchronize do
  @ignored_links[key] = [] unless @ignored_links[key]
  @ignored_links[key] << value
+
+ @all_ignored_links << link
  end
  end

@@ -249,13 +277,18 @@ module BrokenLinkFinder
  @ignored_links.each { |_k, v| v.sort! }
  end

- # Sets and returns the total number of links crawled.
+ # Sets various statistics about the crawl and its links.
  def set_crawl_stats(url:, pages_crawled:, start:)
- @crawl_stats[:url] = url
- @crawl_stats[:pages_crawled] = pages_crawled
- @crawl_stats[:num_pages] = pages_crawled.size
- @crawl_stats[:num_links] = @all_broken_links.size + @all_intact_links.size
- @crawl_stats[:duration] = Time.now - start
+ @crawl_stats[:url] = url
+ @crawl_stats[:pages_crawled] = pages_crawled
+ @crawl_stats[:num_pages] = pages_crawled.size
+ @crawl_stats[:num_links] = (
+ @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+ )
+ @crawl_stats[:num_broken_links] = @all_broken_links.size
+ @crawl_stats[:num_intact_links] = @all_intact_links.size
+ @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+ @crawl_stats[:duration] = Time.now - start
  end

  alias crawl_page crawl_url
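For reference, the rough shape of `Finder#crawl_stats` once the method above has run; the values simply mirror the README's sample output and `pages_crawled` is truncated here, so treat it as illustrative only:

```ruby
stats = {
  url:               'http://txti.es',
  pages_crawled:     ['http://txti.es', 'http://txti.es/about'], # 7 entries in the sample run
  num_pages:         7,
  num_links:         32, # broken + intact + ignored
  num_broken_links:  6,
  num_intact_links:  23,
  num_ignored_links: 3,
  duration:          6.82
}
```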
@@ -28,9 +28,11 @@ module BrokenLinkFinder
  # Report a summary of the overall crawl.
  def report_crawl_summary
  puts format(
- '<p class="crawl_summary">Crawled %s (%s page(s) in %s seconds)</p>',
+ '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
+ @crawl_stats[:url],
  @crawl_stats[:url],
  @crawl_stats[:num_pages],
+ @crawl_stats[:num_links],
  @crawl_stats[:duration]&.truncate(2)
  )
  end
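Plugging the README's sample numbers into the new format string gives an idea of the rendered summary; this is just the `format` call evaluated by hand, not output captured from the gem:

```ruby
format(
  '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
  'http://txti.es', 'http://txti.es', 7, 32, 6.82
)
# => '<p class="crawl_summary">Crawled <a href="http://txti.es">http://txti.es</a><br />7 page(s) containing 32 unique link(s) in 6.82 seconds</p>'
```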
@@ -43,7 +45,7 @@ module BrokenLinkFinder
  puts_summary 'Good news, there are no broken links!', type: :broken
  else
  num_pages, num_links = get_hash_stats(@broken_links)
- puts_summary "Found #{num_links} broken link(s) across #{num_pages} page(s):", type: :broken
+ puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken

  @broken_links.each do |key, values|
  puts_group(key, type: :broken) # Puts the opening <p> element.
@@ -70,7 +72,7 @@ module BrokenLinkFinder

  if @ignored_links.any?
  num_pages, num_links = get_hash_stats(@ignored_links)
- puts_summary "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
+ puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored

  @ignored_links.each do |key, values|
  puts_group(key, type: :ignored) # Puts the opening <p> element.
@@ -125,8 +127,8 @@ module BrokenLinkFinder
  end

  def build_url(link)
- return link if link.to_url.absolute?
- @broken_link_map.fetch(link)
+ href = @broken_link_map[link]
+ href || link
  end

  alias_method :report, :call
@@ -42,8 +42,7 @@ module BrokenLinkFinder
  # Use like: `num_pages, num_links = get_hash_stats(links)`.
  def get_hash_stats(hash)
  num_keys = hash.keys.length
- values = hash.values.flatten
- num_values = sort_by_page? ? values.length : values.uniq.length
+ num_values = hash.values.flatten.uniq.length

  sort_by_page? ?
  [num_keys, num_values] :
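The change above means a link that appears on several pages is now only counted once, whichever sort is in use. A tiny sketch of the new counting with a hypothetical `@broken_links` style Hash (URLs borrowed from the README output):

```ruby
broken_links = {
  'http://txti.es/about' => ['http://twitter.com/thebarrytone', 'http://imgur.com'],
  'http://txti.es/how'   => ['http://imgur.com']
}

num_pages = broken_links.keys.length                 # => 2 page(s)
num_links = broken_links.values.flatten.uniq.length  # => 2 unique link(s); imgur counted once
```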
@@ -23,10 +23,11 @@ module BrokenLinkFinder

  # Report a summary of the overall crawl.
  def report_crawl_summary
+ puts "Crawled #{@crawl_stats[:url]}"
  putsn format(
- 'Crawled %s (%s page(s) in %s seconds)',
- @crawl_stats[:url],
+ '%s page(s) containing %s unique link(s) in %s seconds',
  @crawl_stats[:num_pages],
+ @crawl_stats[:num_links],
  @crawl_stats[:duration]&.truncate(2)
  )
  end
@@ -37,7 +38,7 @@ module BrokenLinkFinder
  puts 'Good news, there are no broken links!'
  else
  num_pages, num_links = get_hash_stats(@broken_links)
- puts "Found #{num_links} broken link(s) across #{num_pages} page(s):"
+ puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"

  @broken_links.each do |key, values|
  msg = sort_by_page? ?
@@ -61,7 +62,7 @@ module BrokenLinkFinder
  def report_ignored_links(verbose: false)
  if @ignored_links.any?
  num_pages, num_links = get_hash_stats(@ignored_links)
- nputs "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:"
+ nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"

  @ignored_links.each do |key, values|
  msg = sort_by_page? ?
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module BrokenLinkFinder
- VERSION = '0.10.0'
+ VERSION = '0.11.0'
  end
@@ -1,11 +1,31 @@
  # frozen_string_literal: true

- # We extract all the Document's links, not just the links to other webpages.
+ # Define a method on each doc for recording unparsable links.
+ # Unparsable links are recorded as broken links by Finder.
+ class Wgit::Document
+ def unparsable_links
+ @unparsable_links ||= []
+ end
+ end
+
+ # Returns a Wgit::Url or nil (if link is unparsable).
+ # A proc is preferrable to a function to avoid polluting the global namespace.
+ parse_link = lambda do |doc, link|
+ Wgit::Url.new(link)
+ rescue StandardError
+ doc.unparsable_links << link
+ nil
+ end
+
+ # We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
  Wgit::Document.define_extension(
  :all_links,
- '//*/@href | //*/@src', # Any element with a href or src attribute.
+ '//*/@href | //*/@src', # Any element's href or src attribute URL.
  singleton: false,
  text_content_only: true
- ) do |links|
- links.uniq.to_urls
+ ) do |links, doc|
+ links
+ .uniq
+ .map { |link| parse_link.call(doc, link) }
+ .compact
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: broken_link_finder
  version: !ruby/object:Gem::Version
- version: 0.10.0
+ version: 0.11.0
  platform: ruby
  authors:
  - Michael Telford
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-11-28 00:00:00.000000000 Z
+ date: 2020-01-27 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -128,14 +128,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.5'
+ version: '0.8'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.5'
+ version: '0.8'
  description: Finds a website's broken links using the 'wgit' gem and reports back
  to you with a summary.
  email: michael.telford@live.com