broken_link_finder 0.11.0 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 77094cfe9d0790770b5c34b86bc578fc65d0e425dc089c4fda41a3c587af6e00
4
- data.tar.gz: 40f7f59411744bcd010c46bf4bdc17e59dbd4bd191bc33613c2d2bf269ba79ba
3
+ metadata.gz: 42e88495f7e7742db433223408b4a380c1d48e98a5a43e6da5303d3e7b024454
4
+ data.tar.gz: eae7fc953f0d8aa1bb1f9d5b53183cd68a15a9f83ab341f51023744b2d148063
5
5
  SHA512:
6
- metadata.gz: 4f3f4b7720d24c393fb844ed62159870bde4dd4222a8b0ec69b4fff7b96086b909df63834ef56a1b71e2d68e4ec319357f208273b3be79d81c982602b7a53b8a
7
- data.tar.gz: c5af07c99199765688672ca396e19db9093ca0cd32c5a9e37810909787892c5070c729b275fcc6a126ea71bf2bdab4c5616b98643f74a3266775d112d4a8c274
6
+ metadata.gz: 4496db994bfba83deeb14a1b870f43e2cfd2afa94f30b6596ee610f23103b55ae0d84a6443a3204b02ed8875c0daf0d8e9c565aaebd21173d5c4353509dac3c8
7
+ data.tar.gz: 2d70ee94d7128e6e212bc385e1045fd465c121f58b9a0d036d392ae1cbb5cd9ef5ea47e29eda85b6f17a0b0f5547902ca818967b3ffb4ad87c7d0b271da5323a
@@ -1 +1 @@
1
- 2.5.3
1
+ 2.7.0
@@ -9,6 +9,15 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.11.1
13
+ ### Added
14
+ - ...
15
+ ### Changed/Removed
16
+ - Updated wgit gem to version 0.9.0 which contains improvements and bug fixes.
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
12
21
  ## v0.11.0
13
22
  ### Added
14
23
  - Additional crawl statistics.
@@ -1,50 +1,61 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- broken_link_finder (0.11.0)
4
+ broken_link_finder (0.11.1)
5
5
  thor (~> 0.20)
6
6
  thread (~> 0.2)
7
- wgit (~> 0.8)
7
+ wgit (~> 0.9)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
- addressable (2.6.0)
13
- public_suffix (>= 2.0.2, < 4.0)
14
- bson (4.7.1)
15
- byebug (11.0.1)
16
- coderay (1.1.2)
12
+ addressable (2.7.0)
13
+ public_suffix (>= 2.0.2, < 5.0)
14
+ bson (4.10.0)
15
+ byebug (11.1.3)
16
+ cliver (0.3.2)
17
+ coderay (1.1.3)
18
+ concurrent-ruby (1.1.6)
17
19
  crack (0.4.3)
18
20
  safe_yaml (~> 1.0.0)
19
21
  ethon (0.12.0)
20
22
  ffi (>= 1.3.0)
21
- ffi (1.12.1)
22
- hashdiff (1.0.0)
23
- maxitest (3.4.0)
24
- minitest (>= 5.0.0, < 5.13.0)
25
- method_source (0.9.2)
23
+ ferrum (0.9)
24
+ addressable (~> 2.5)
25
+ cliver (~> 0.3)
26
+ concurrent-ruby (~> 1.1)
27
+ websocket-driver (>= 0.6, < 0.8)
28
+ ffi (1.13.1)
29
+ hashdiff (1.0.1)
30
+ maxitest (3.6.0)
31
+ minitest (>= 5.0.0, < 5.14.0)
32
+ method_source (1.0.0)
26
33
  mini_portile2 (2.4.0)
27
- minitest (5.12.2)
28
- mongo (2.11.3)
29
- bson (>= 4.4.2, < 5.0.0)
30
- nokogiri (1.10.7)
34
+ minitest (5.13.0)
35
+ mongo (2.13.0)
36
+ bson (>= 4.8.2, < 5.0.0)
37
+ nokogiri (1.10.10)
31
38
  mini_portile2 (~> 2.4.0)
32
- pry (0.12.2)
33
- coderay (~> 1.1.0)
34
- method_source (~> 0.9.0)
35
- public_suffix (3.1.0)
36
- rake (10.5.0)
39
+ pry (0.13.1)
40
+ coderay (~> 1.1)
41
+ method_source (~> 1.0)
42
+ public_suffix (4.0.5)
43
+ rake (13.0.1)
37
44
  safe_yaml (1.0.5)
38
45
  thor (0.20.3)
39
46
  thread (0.2.2)
40
- typhoeus (1.3.1)
47
+ typhoeus (1.4.0)
41
48
  ethon (>= 0.9.0)
42
- webmock (3.7.6)
49
+ webmock (3.8.3)
43
50
  addressable (>= 2.3.6)
44
51
  crack (>= 0.3.2)
45
52
  hashdiff (>= 0.4.0, < 2.0.0)
46
- wgit (0.8.0)
53
+ websocket-driver (0.7.3)
54
+ websocket-extensions (>= 0.1.0)
55
+ websocket-extensions (0.1.5)
56
+ wgit (0.9.0)
47
57
  addressable (~> 2.6)
58
+ ferrum (~> 0.8)
48
59
  mongo (~> 2.9)
49
60
  nokogiri (~> 1.10)
50
61
  typhoeus (~> 1.3)
@@ -58,11 +69,11 @@ DEPENDENCIES
58
69
  byebug (~> 11.0)
59
70
  maxitest (~> 3.3)
60
71
  pry (~> 0.12)
61
- rake (~> 10.0)
72
+ rake (~> 13.0)
62
73
  webmock (~> 3.6)
63
74
 
64
75
  RUBY VERSION
65
- ruby 2.5.3p105
76
+ ruby 2.7.0p0
66
77
 
67
78
  BUNDLED WITH
68
79
  2.1.4
data/README.md CHANGED
@@ -1,10 +1,10 @@
1
1
  # Broken Link Finder
2
2
 
3
- Does what it says on the tin; Finds a website's broken links.
3
+ Does what it says on the tin - finds a website's broken links.
4
4
 
5
- Simply point it at a website and it will crawl all of its webpages searching for and identifing any broken links. You will then be presented with a concise summary of the broken links found.
5
+ Simply point it at a website and it will crawl all of its webpages searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
6
6
 
7
- Because `libcurl` is used under the hood, Broken Link Finder is fast!
7
+ Broken Link Finder is multi-threaded and uses `libcurl` under the hood — it's fast!
8
8
 
9
9
  ## How It Works
10
10
 
data/bin/setup CHANGED
@@ -5,4 +5,4 @@ set -vx
5
5
 
6
6
  bundle install
7
7
 
8
- # Do any other automated setup that you need to do here
8
+ # Do any other automated setup that you need to do here...
@@ -44,10 +44,10 @@ Gem::Specification.new do |spec|
44
44
  spec.add_development_dependency 'byebug', '~> 11.0'
45
45
  spec.add_development_dependency 'maxitest', '~> 3.3'
46
46
  spec.add_development_dependency 'pry', '~> 0.12'
47
- spec.add_development_dependency 'rake', '~> 10.0'
47
+ spec.add_development_dependency 'rake', '~> 13.0'
48
48
  spec.add_development_dependency 'webmock', '~> 3.6'
49
49
 
50
50
  spec.add_runtime_dependency 'thor', '~> 0.20'
51
51
  spec.add_runtime_dependency 'thread', '~> 0.2'
52
- spec.add_runtime_dependency 'wgit', '~> 0.8'
52
+ spec.add_runtime_dependency 'wgit', '~> 0.9'
53
53
  end
@@ -7,6 +7,7 @@ require 'set'
7
7
 
8
8
  require_relative './broken_link_finder/wgit_extensions'
9
9
  require_relative './broken_link_finder/version'
10
+ require_relative './broken_link_finder/link_manager'
10
11
  require_relative './broken_link_finder/reporter/reporter'
11
12
  require_relative './broken_link_finder/reporter/text_reporter'
12
13
  require_relative './broken_link_finder/reporter/html_reporter'
@@ -1,46 +1,53 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- DEFAULT_MAX_THREADS = 100
5
- SERVER_WAIT_TIME = 0.5
4
+ DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
5
+ SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.
6
6
 
7
7
  # Alias for BrokenLinkFinder::Finder.new.
8
8
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
9
9
  Finder.new(sort: sort, max_threads: max_threads)
10
10
  end
11
11
 
12
+ # Class responsible for finding broken links on a page or site.
12
13
  class Finder
13
- attr_reader :sort, :max_threads, :broken_links, :ignored_links, :crawl_stats
14
+ # The collection key - either :page or :link.
15
+ attr_reader :sort
14
16
 
15
- # Creates a new Finder instance.
16
- def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
17
+ # The max number of threads created during #crawl_site - one thread per page.
18
+ attr_reader :max_threads
19
+
20
+ # Returns a new Finder instance.
21
+ def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
17
22
  raise "Sort by either :page or :link, not #{sort}" \
18
23
  unless %i[page link].include?(sort)
19
24
 
20
25
  @sort = sort
21
26
  @max_threads = max_threads
22
- @lock = Mutex.new
23
27
  @crawler = Wgit::Crawler.new
28
+ @manager = BrokenLinkFinder::LinkManager.new(@sort)
29
+ end
30
+
31
+ # Returns the current broken links.
32
+ def broken_links
33
+ @manager.broken_links
34
+ end
24
35
 
25
- reset_crawl
36
+ # Returns the current ignored links.
37
+ def ignored_links
38
+ @manager.ignored_links
26
39
  end
27
40
 
28
- # Clear/empty the link collection objects.
29
- def reset_crawl
30
- @broken_links = {} # Used for mapping pages to broken links.
31
- @ignored_links = {} # Used for mapping pages to ignored links.
32
- @all_broken_links = Set.new # Used to prevent crawling a broken link twice.
33
- @all_intact_links = Set.new # Used to prevent crawling an intact link twice.
34
- @all_ignored_links = Set.new # Used for building crawl statistics.
35
- @broken_link_map = {} # Maps a link to its absolute (crawlable) form.
36
- @crawl_stats = {} # Records crawl stats e.g. duration etc.
41
+ # Returns the current crawl stats.
42
+ def crawl_stats
43
+ @manager.crawl_stats
37
44
  end
38
45
 
39
46
  # Finds broken links within a single page and records them.
40
47
  # Returns true if at least one broken link was found.
41
48
  # Access the broken links afterwards with Finder#broken_links.
42
49
  def crawl_url(url)
43
- reset_crawl
50
+ @manager.empty
44
51
 
45
52
  start = Time.now
46
53
  url = url.to_url
@@ -55,17 +62,17 @@ module BrokenLinkFinder
55
62
  find_broken_links(doc)
56
63
  retry_broken_links
57
64
 
58
- sort_links
59
- set_crawl_stats(url: url, pages_crawled: [url], start: start)
65
+ @manager.sort
66
+ @manager.tally(url: url, pages_crawled: [url], start: start)
60
67
 
61
- @broken_links.any?
68
+ broken_links.any?
62
69
  end
63
70
 
64
71
  # Finds broken links within an entire site and records them.
65
72
  # Returns true if at least one broken link was found.
66
73
  # Access the broken links afterwards with Finder#broken_links.
67
- def crawl_site(url)
68
- reset_crawl
74
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil)
75
+ @manager.empty
69
76
 
70
77
  start = Time.now
71
78
  url = url.to_url
@@ -74,7 +81,8 @@ module BrokenLinkFinder
74
81
 
75
82
  # Crawl the site's HTML web pages looking for links.
76
83
  # We dup the url to avoid recording any redirects.
77
- externals = @crawler.crawl_site(url.dup) do |doc|
84
+ paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
85
+ externals = @crawler.crawl_site(url.dup, **paths) do |doc|
78
86
  crawled << doc.url
79
87
  next unless doc
80
88
 
@@ -82,17 +90,20 @@ module BrokenLinkFinder
82
90
  pool.process { find_broken_links(doc) }
83
91
  end
84
92
 
93
+ # Wait for all threads to finish, even if url was invalid.
94
+ pool.shutdown
95
+
85
96
  # Ensure the given website url is valid.
86
97
  raise "Invalid or broken URL: #{url}" unless externals
87
98
 
88
- # Wait for all threads to finish.
89
- pool.shutdown
90
99
  retry_broken_links
91
100
 
92
- sort_links
93
- set_crawl_stats(url: url, pages_crawled: crawled.to_a, start: start)
101
+ @manager.sort
102
+ @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)
94
103
 
95
- @broken_links.any?
104
+ broken_links.any?
105
+ ensure
106
+ pool.shutdown if defined?(pool)
96
107
  end
97
108
 
98
109
  # Outputs the link report into a stream e.g. STDOUT or a file,
@@ -109,8 +120,8 @@ module BrokenLinkFinder
109
120
  end
110
121
 
111
122
  reporter = klass.new(stream, @sort,
112
- @broken_links, @ignored_links,
113
- @broken_link_map, @crawl_stats)
123
+ broken_links, ignored_links,
124
+ @manager.broken_link_map, crawl_stats)
114
125
  reporter.call(broken_verbose: broken_verbose,
115
126
  ignored_verbose: ignored_verbose)
116
127
  end
@@ -119,18 +130,18 @@ module BrokenLinkFinder
119
130
 
120
131
  # Finds which links are unsupported or broken and records the details.
121
132
  def find_broken_links(page)
122
- process_unparsable_links(page) # Record them as broken.
133
+ record_unparsable_links(page) # Record them as broken.
123
134
 
124
135
  links = get_supported_links(page)
125
136
 
126
137
  # Iterate over the supported links checking if they're broken or not.
127
138
  links.each do |link|
128
139
  # Skip if the link has been encountered previously.
129
- next if @all_intact_links.include?(link)
140
+ next if @manager.all_intact_links.include?(link)
130
141
 
131
- if @all_broken_links.include?(link)
142
+ if @manager.all_broken_links.include?(link)
132
143
  # The link has already been proven broken so simply record it.
133
- append_broken_link(page, link, map: false)
144
+ @manager.append_broken_link(page, link, map: false)
134
145
  next
135
146
  end
136
147
 
@@ -139,29 +150,21 @@ module BrokenLinkFinder
139
150
 
140
151
  # Determine if the crawled link is broken or not and record it.
141
152
  if link_broken?(link_doc)
142
- append_broken_link(page, link)
143
- else # Record it as being intact.
144
- @lock.synchronize { @all_intact_links << link }
153
+ @manager.append_broken_link(page, link)
154
+ else
155
+ @manager.append_intact_link(link)
145
156
  end
146
157
  end
147
158
 
148
159
  nil
149
160
  end
150
161
 
151
- # Record each unparsable link as a broken link.
152
- def process_unparsable_links(doc)
153
- doc.unparsable_links.each do |link|
154
- append_broken_link(doc, link, map: false)
155
- @broken_link_map[link] = link
156
- end
157
- end
158
-
159
162
  # Implements a retry mechanism for each of the broken links found.
160
163
  # Removes any broken links found to be working OK.
161
164
  def retry_broken_links
162
165
  sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
163
166
 
164
- @broken_link_map.select! do |link, href|
167
+ @manager.broken_link_map.select! do |link, href|
165
168
  # Don't retry unparsable links (which are Strings).
166
169
  next(true) unless href.is_a?(Wgit::Url)
167
170
 
@@ -170,27 +173,35 @@ module BrokenLinkFinder
170
173
  if link_broken?(doc)
171
174
  true
172
175
  else
173
- remove_broken_link(link)
176
+ @manager.remove_broken_link(link)
174
177
  false
175
178
  end
176
179
  end
177
180
  end
178
181
 
182
+ # Record each unparsable link as a broken link.
183
+ def record_unparsable_links(doc)
184
+ doc.unparsable_links.each do |link|
185
+ # We map the link ourselves because link is a String, not a Wgit::Url.
186
+ @manager.append_broken_link(doc, link, map: false)
187
+ @manager.broken_link_map[link] = link
188
+ end
189
+ end
190
+
179
191
  # Report and reject any non supported links. Any link that is absolute and
180
192
  # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
181
193
  def get_supported_links(doc)
182
- doc.all_links
183
- .reject do |link|
184
- if link.is_absolute? && !link.start_with?('http')
185
- append_ignored_link(doc.url, link)
186
- true
187
- end
188
- end
194
+ doc.all_links.reject do |link|
195
+ if link.is_absolute? && !link.start_with?('http')
196
+ @manager.append_ignored_link(doc.url, link)
197
+ true
198
+ end
199
+ end
189
200
  end
190
201
 
191
202
  # Make the link absolute and crawl it, returning its Wgit::Document.
192
203
  def crawl_link(doc, link)
193
- link = link.prefix_base(doc)
204
+ link = link.make_absolute(doc)
194
205
  @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
195
206
  end
196
207
 
@@ -210,87 +221,6 @@ module BrokenLinkFinder
210
221
  doc.xpath("//*[@id='#{fragment}']").empty?
211
222
  end
212
223
 
213
- # Append key => [value] to the broken link collections.
214
- # If map: true, then the link will also be recorded in @broken_link_map.
215
- def append_broken_link(doc, link, map: true)
216
- key, value = get_key_value(doc.url, link)
217
-
218
- @lock.synchronize do
219
- @broken_links[key] = [] unless @broken_links[key]
220
- @broken_links[key] << value
221
-
222
- @all_broken_links << link
223
-
224
- @broken_link_map[link] = link.prefix_base(doc) if map
225
- end
226
- end
227
-
228
- # Remove the broken link from the necessary collections.
229
- def remove_broken_link(link)
230
- @lock.synchronize do
231
- if @sort == :page
232
- @broken_links.each { |_k, links| links.delete(link) }
233
- @broken_links.delete_if { |_k, links| links.empty? }
234
- else
235
- @broken_links.delete(link)
236
- end
237
-
238
- @all_broken_links.delete(link)
239
- @all_intact_links << link
240
- end
241
- end
242
-
243
- # Append key => [value] to the ignored link collections.
244
- def append_ignored_link(url, link)
245
- key, value = get_key_value(url, link)
246
-
247
- @lock.synchronize do
248
- @ignored_links[key] = [] unless @ignored_links[key]
249
- @ignored_links[key] << value
250
-
251
- @all_ignored_links << link
252
- end
253
- end
254
-
255
- # Returns the correct key value depending on the @sort type.
256
- # @sort == :page ? [url, link] : [link, url]
257
- def get_key_value(url, link)
258
- case @sort
259
- when :page
260
- [url, link]
261
- when :link
262
- [link, url]
263
- else
264
- raise "Unsupported sort type: #{sort}"
265
- end
266
- end
267
-
268
- # Sort keys and values alphabetically.
269
- def sort_links
270
- @broken_links.values.map(&:uniq!)
271
- @ignored_links.values.map(&:uniq!)
272
-
273
- @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
274
- @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
275
-
276
- @broken_links.each { |_k, v| v.sort! }
277
- @ignored_links.each { |_k, v| v.sort! }
278
- end
279
-
280
- # Sets various statistics about the crawl and its links.
281
- def set_crawl_stats(url:, pages_crawled:, start:)
282
- @crawl_stats[:url] = url
283
- @crawl_stats[:pages_crawled] = pages_crawled
284
- @crawl_stats[:num_pages] = pages_crawled.size
285
- @crawl_stats[:num_links] = (
286
- @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
287
- )
288
- @crawl_stats[:num_broken_links] = @all_broken_links.size
289
- @crawl_stats[:num_intact_links] = @all_intact_links.size
290
- @crawl_stats[:num_ignored_links] = @all_ignored_links.size
291
- @crawl_stats[:duration] = Time.now - start
292
- end
293
-
294
224
  alias crawl_page crawl_url
295
225
  alias crawl_r crawl_site
296
226
  end
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Class responsible for handling the link collection logic.
5
+ class LinkManager
6
+ # Used for mapping pages to broken links.
7
+ attr_reader :broken_links
8
+
9
+ # Used for mapping pages to ignored links.
10
+ attr_reader :ignored_links
11
+
12
+ # Used to record crawl statistics e.g. duration etc.
13
+ attr_reader :crawl_stats
14
+
15
+ # Used to map a link (as is) to its absolute (crawlable) form.
16
+ attr_reader :broken_link_map
17
+
18
+ # Used to prevent crawling a broken link twice.
19
+ attr_reader :all_broken_links
20
+
21
+ # Used to prevent crawling an intact link twice.
22
+ attr_reader :all_intact_links
23
+
24
+ # Used for building crawl statistics.
25
+ attr_reader :all_ignored_links
26
+
27
+ # Returns a new LinkManager instance with empty link collections.
28
+ def initialize(sort)
29
+ raise "Sort by either :page or :link, not #{sort}" \
30
+ unless %i[page link].include?(sort)
31
+
32
+ @sort = sort
33
+ @lock = Mutex.new
34
+
35
+ empty # Initialises the link collections.
36
+ end
37
+
38
+ # Initialise/empty the link collection objects.
39
+ def empty
40
+ @broken_links = {}
41
+ @ignored_links = {}
42
+ @crawl_stats = {}
43
+ @broken_link_map = {}
44
+ @all_broken_links = Set.new
45
+ @all_intact_links = Set.new
46
+ @all_ignored_links = Set.new
47
+ end
48
+
49
+ # Append key => [value] to the broken link collections.
50
+ # If map: true, then the link will also be recorded in @broken_link_map.
51
+ def append_broken_link(doc, link, map: true)
52
+ key, value = get_key_value(doc.url, link)
53
+
54
+ @lock.synchronize do
55
+ @broken_links[key] = [] unless @broken_links[key]
56
+ @broken_links[key] << value
57
+
58
+ @all_broken_links << link
59
+
60
+ @broken_link_map[link] = link.make_absolute(doc) if map
61
+ end
62
+ end
63
+
64
+ # Remove the broken link from the necessary collections.
65
+ def remove_broken_link(link)
66
+ @lock.synchronize do
67
+ if @sort == :page
68
+ @broken_links.each { |_k, links| links.delete(link) }
69
+ @broken_links.delete_if { |_k, links| links.empty? }
70
+ else
71
+ @broken_links.delete(link)
72
+ end
73
+
74
+ @all_broken_links.delete(link)
75
+ @all_intact_links << link
76
+ end
77
+ end
78
+
79
+ # Append key => [value] to the ignored link collections.
80
+ def append_ignored_link(url, link)
81
+ key, value = get_key_value(url, link)
82
+
83
+ @lock.synchronize do
84
+ @ignored_links[key] = [] unless @ignored_links[key]
85
+ @ignored_links[key] << value
86
+
87
+ @all_ignored_links << link
88
+ end
89
+ end
90
+
91
+ # Append link to @all_intact_links.
92
+ def append_intact_link(link)
93
+ @lock.synchronize { @all_intact_links << link }
94
+ end
95
+
96
+ # Sorts the link collection's keys and values alphabetically.
97
+ def sort
98
+ @broken_links.values.map(&:uniq!)
99
+ @ignored_links.values.map(&:uniq!)
100
+
101
+ @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
102
+ @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
103
+
104
+ @broken_links.each { |_k, v| v.sort! }
105
+ @ignored_links.each { |_k, v| v.sort! }
106
+ end
107
+
108
+ # Tallies up various statistics about the crawl and its links.
109
+ def tally(url:, pages_crawled:, start:)
110
+ @crawl_stats[:url] = url
111
+ @crawl_stats[:pages_crawled] = pages_crawled
112
+ @crawl_stats[:num_pages] = pages_crawled.size
113
+ @crawl_stats[:num_links] = (
114
+ @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
115
+ )
116
+ @crawl_stats[:num_broken_links] = @all_broken_links.size
117
+ @crawl_stats[:num_intact_links] = @all_intact_links.size
118
+ @crawl_stats[:num_ignored_links] = @all_ignored_links.size
119
+ @crawl_stats[:duration] = Time.now - start
120
+ end
121
+
122
+ private
123
+
124
+ # Returns the correct key value depending on the @sort type.
125
+ # @sort == :page ? [url, link] : [link, url]
126
+ def get_key_value(url, link)
127
+ case @sort
128
+ when :page
129
+ [url, link]
130
+ when :link
131
+ [link, url]
132
+ else
133
+ raise "Unsupported sort type: #{sort}"
134
+ end
135
+ end
136
+ end
137
+ end
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
+ # Class responsible for reporting in a HTML format.
4
5
  class HTMLReporter < Reporter
5
- # Creates a new HTMLReporter instance.
6
+ # Returns a new HTMLReporter instance.
6
7
  # stream is any Object that responds to :puts and :print.
7
8
  def initialize(stream, sort,
8
9
  broken_links, ignored_links,
@@ -6,7 +6,7 @@ module BrokenLinkFinder
6
6
  # The amount of pages/links to display when verbose is false.
7
7
  NUM_VALUES = 3
8
8
 
9
- # Creates a new Reporter instance.
9
+ # Returns a new Reporter instance.
10
10
  # stream is any Object that responds to :puts and :print.
11
11
  def initialize(stream, sort,
12
12
  broken_links, ignored_links,
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
+ # Class responsible for reporting in a text format.
4
5
  class TextReporter < Reporter
5
- # Creates a new TextReporter instance.
6
+ # Returns a new TextReporter instance.
6
7
  # stream is any Object that responds to :puts and :print.
7
8
  def initialize(stream, sort,
8
9
  broken_links, ignored_links,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- VERSION = '0.11.0'
4
+ VERSION = '0.11.1'
5
5
  end
@@ -18,7 +18,7 @@ rescue StandardError
18
18
  end
19
19
 
20
20
  # We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
21
- Wgit::Document.define_extension(
21
+ Wgit::Document.define_extractor(
22
22
  :all_links,
23
23
  '//*/@href | //*/@src', # Any element's href or src attribute URL.
24
24
  singleton: false,
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: broken_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.11.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-01-27 00:00:00.000000000 Z
11
+ date: 2020-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -72,14 +72,14 @@ dependencies:
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '10.0'
75
+ version: '13.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '10.0'
82
+ version: '13.0'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: webmock
85
85
  requirement: !ruby/object:Gem::Requirement
@@ -128,14 +128,14 @@ dependencies:
128
128
  requirements:
129
129
  - - "~>"
130
130
  - !ruby/object:Gem::Version
131
- version: '0.8'
131
+ version: '0.9'
132
132
  type: :runtime
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
- version: '0.8'
138
+ version: '0.9'
139
139
  description: Finds a website's broken links using the 'wgit' gem and reports back
140
140
  to you with a summary.
141
141
  email: michael.telford@live.com
@@ -159,6 +159,7 @@ files:
159
159
  - exe/broken_link_finder
160
160
  - lib/broken_link_finder.rb
161
161
  - lib/broken_link_finder/finder.rb
162
+ - lib/broken_link_finder/link_manager.rb
162
163
  - lib/broken_link_finder/reporter/html_reporter.rb
163
164
  - lib/broken_link_finder/reporter/reporter.rb
164
165
  - lib/broken_link_finder/reporter/text_reporter.rb
@@ -189,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
189
190
  - !ruby/object:Gem::Version
190
191
  version: '0'
191
192
  requirements: []
192
- rubygems_version: 3.0.6
193
+ rubygems_version: 3.1.2
193
194
  signing_key:
194
195
  specification_version: 4
195
196
  summary: Finds a website's broken links and reports back to you with a summary.