broken_link_finder 0.11.0 → 0.11.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 77094cfe9d0790770b5c34b86bc578fc65d0e425dc089c4fda41a3c587af6e00
- data.tar.gz: 40f7f59411744bcd010c46bf4bdc17e59dbd4bd191bc33613c2d2bf269ba79ba
+ metadata.gz: 42e88495f7e7742db433223408b4a380c1d48e98a5a43e6da5303d3e7b024454
+ data.tar.gz: eae7fc953f0d8aa1bb1f9d5b53183cd68a15a9f83ab341f51023744b2d148063
  SHA512:
- metadata.gz: 4f3f4b7720d24c393fb844ed62159870bde4dd4222a8b0ec69b4fff7b96086b909df63834ef56a1b71e2d68e4ec319357f208273b3be79d81c982602b7a53b8a
- data.tar.gz: c5af07c99199765688672ca396e19db9093ca0cd32c5a9e37810909787892c5070c729b275fcc6a126ea71bf2bdab4c5616b98643f74a3266775d112d4a8c274
+ metadata.gz: 4496db994bfba83deeb14a1b870f43e2cfd2afa94f30b6596ee610f23103b55ae0d84a6443a3204b02ed8875c0daf0d8e9c565aaebd21173d5c4353509dac3c8
+ data.tar.gz: 2d70ee94d7128e6e212bc385e1045fd465c121f58b9a0d036d392ae1cbb5cd9ef5ea47e29eda85b6f17a0b0f5547902ca818967b3ffb4ad87c7d0b271da5323a
@@ -1 +1 @@
- 2.5.3
+ 2.7.0
@@ -9,6 +9,15 @@
  - ...
  ---

+ ## v0.11.1
+ ### Added
+ - ...
+ ### Changed/Removed
+ - Updated wgit gem to version 0.9.0 which contains improvements and bug fixes.
+ ### Fixed
+ - ...
+ ---
+
  ## v0.11.0
  ### Added
  - Additional crawl statistics.
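The "Additional crawl statistics" entry above (v0.11.0) is readable via `Finder#crawl_stats`. The following is a minimal sketch, not taken from the gem's docs: the URL is made up, and the stat keys are the ones populated by `LinkManager#tally` further down this diff.

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site('http://example.com') # Hypothetical URL.

# These keys mirror those set by LinkManager#tally later in this diff.
stats = finder.crawl_stats
puts "Crawled #{stats[:num_pages]} page(s) in #{stats[:duration]} seconds"
puts "Links found: #{stats[:num_links]} " \
     "(#{stats[:num_broken_links]} broken, #{stats[:num_ignored_links]} ignored)"
```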
@@ -1,50 +1,61 @@
  PATH
  remote: .
  specs:
- broken_link_finder (0.11.0)
+ broken_link_finder (0.11.1)
  thor (~> 0.20)
  thread (~> 0.2)
- wgit (~> 0.8)
+ wgit (~> 0.9)

  GEM
  remote: https://rubygems.org/
  specs:
- addressable (2.6.0)
- public_suffix (>= 2.0.2, < 4.0)
- bson (4.7.1)
- byebug (11.0.1)
- coderay (1.1.2)
+ addressable (2.7.0)
+ public_suffix (>= 2.0.2, < 5.0)
+ bson (4.10.0)
+ byebug (11.1.3)
+ cliver (0.3.2)
+ coderay (1.1.3)
+ concurrent-ruby (1.1.6)
  crack (0.4.3)
  safe_yaml (~> 1.0.0)
  ethon (0.12.0)
  ffi (>= 1.3.0)
- ffi (1.12.1)
- hashdiff (1.0.0)
- maxitest (3.4.0)
- minitest (>= 5.0.0, < 5.13.0)
- method_source (0.9.2)
+ ferrum (0.9)
+ addressable (~> 2.5)
+ cliver (~> 0.3)
+ concurrent-ruby (~> 1.1)
+ websocket-driver (>= 0.6, < 0.8)
+ ffi (1.13.1)
+ hashdiff (1.0.1)
+ maxitest (3.6.0)
+ minitest (>= 5.0.0, < 5.14.0)
+ method_source (1.0.0)
  mini_portile2 (2.4.0)
- minitest (5.12.2)
- mongo (2.11.3)
- bson (>= 4.4.2, < 5.0.0)
- nokogiri (1.10.7)
+ minitest (5.13.0)
+ mongo (2.13.0)
+ bson (>= 4.8.2, < 5.0.0)
+ nokogiri (1.10.10)
  mini_portile2 (~> 2.4.0)
- pry (0.12.2)
- coderay (~> 1.1.0)
- method_source (~> 0.9.0)
- public_suffix (3.1.0)
- rake (10.5.0)
+ pry (0.13.1)
+ coderay (~> 1.1)
+ method_source (~> 1.0)
+ public_suffix (4.0.5)
+ rake (13.0.1)
  safe_yaml (1.0.5)
  thor (0.20.3)
  thread (0.2.2)
- typhoeus (1.3.1)
+ typhoeus (1.4.0)
  ethon (>= 0.9.0)
- webmock (3.7.6)
+ webmock (3.8.3)
  addressable (>= 2.3.6)
  crack (>= 0.3.2)
  hashdiff (>= 0.4.0, < 2.0.0)
- wgit (0.8.0)
+ websocket-driver (0.7.3)
+ websocket-extensions (>= 0.1.0)
+ websocket-extensions (0.1.5)
+ wgit (0.9.0)
  addressable (~> 2.6)
+ ferrum (~> 0.8)
  mongo (~> 2.9)
  nokogiri (~> 1.10)
  typhoeus (~> 1.3)
@@ -58,11 +69,11 @@ DEPENDENCIES
  byebug (~> 11.0)
  maxitest (~> 3.3)
  pry (~> 0.12)
- rake (~> 10.0)
+ rake (~> 13.0)
  webmock (~> 3.6)

  RUBY VERSION
- ruby 2.5.3p105
+ ruby 2.7.0p0

  BUNDLED WITH
  2.1.4
data/README.md CHANGED
@@ -1,10 +1,10 @@
  # Broken Link Finder

- Does what it says on the tin; Finds a website's broken links.
+ Does what it says on the tin - finds a website's broken links.

- Simply point it at a website and it will crawl all of its webpages searching for and identifing any broken links. You will then be presented with a concise summary of the broken links found.
+ Simply point it at a website and it will crawl all of its webpages searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.

- Because `libcurl` is used under the hood, Broken Link Finder is fast!
+ Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!

  ## How It Works

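To make the README's "point it at a website" description concrete, here is a minimal library-usage sketch assembled from the Finder API visible in this diff (`BrokenLinkFinder.new`, `crawl_site`, `broken_links`). The target URL is illustrative and the gem's own README example may differ.

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new(sort: :page) # or sort: :link
finder.crawl_site('http://example.com')    # Hypothetical URL.

# With sort: :page, broken_links maps each crawled page to the broken
# links found on it (with sort: :link the mapping is inverted).
finder.broken_links.each do |page, links|
  puts page
  links.each { |link| puts "  #{link}" }
end
```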
data/bin/setup CHANGED
@@ -5,4 +5,4 @@ set -vx

  bundle install

- # Do any other automated setup that you need to do here
+ # Do any other automated setup that you need to do here...
@@ -44,10 +44,10 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'byebug', '~> 11.0'
  spec.add_development_dependency 'maxitest', '~> 3.3'
  spec.add_development_dependency 'pry', '~> 0.12'
- spec.add_development_dependency 'rake', '~> 10.0'
+ spec.add_development_dependency 'rake', '~> 13.0'
  spec.add_development_dependency 'webmock', '~> 3.6'

  spec.add_runtime_dependency 'thor', '~> 0.20'
  spec.add_runtime_dependency 'thread', '~> 0.2'
- spec.add_runtime_dependency 'wgit', '~> 0.8'
+ spec.add_runtime_dependency 'wgit', '~> 0.9'
  end
@@ -7,6 +7,7 @@ require 'set'

  require_relative './broken_link_finder/wgit_extensions'
  require_relative './broken_link_finder/version'
+ require_relative './broken_link_finder/link_manager'
  require_relative './broken_link_finder/reporter/reporter'
  require_relative './broken_link_finder/reporter/text_reporter'
  require_relative './broken_link_finder/reporter/html_reporter'
@@ -1,46 +1,53 @@
  # frozen_string_literal: true

  module BrokenLinkFinder
- DEFAULT_MAX_THREADS = 100
- SERVER_WAIT_TIME = 0.5
+ DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+ SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.

  # Alias for BrokenLinkFinder::Finder.new.
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  Finder.new(sort: sort, max_threads: max_threads)
  end

+ # Class responsible for finding broken links on a page or site.
  class Finder
- attr_reader :sort, :max_threads, :broken_links, :ignored_links, :crawl_stats
+ # The collection key - either :page or :link.
+ attr_reader :sort

- # Creates a new Finder instance.
- def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
+ # The max number of threads created during #crawl_site - one thread per page.
+ attr_reader :max_threads
+
+ # Returns a new Finder instance.
+ def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  raise "Sort by either :page or :link, not #{sort}" \
  unless %i[page link].include?(sort)

  @sort = sort
  @max_threads = max_threads
- @lock = Mutex.new
  @crawler = Wgit::Crawler.new
+ @manager = BrokenLinkFinder::LinkManager.new(@sort)
+ end
+
+ # Returns the current broken links.
+ def broken_links
+ @manager.broken_links
+ end

- reset_crawl
+ # Returns the current ignored links.
+ def ignored_links
+ @manager.ignored_links
  end

- # Clear/empty the link collection objects.
- def reset_crawl
- @broken_links = {} # Used for mapping pages to broken links.
- @ignored_links = {} # Used for mapping pages to ignored links.
- @all_broken_links = Set.new # Used to prevent crawling a broken link twice.
- @all_intact_links = Set.new # Used to prevent crawling an intact link twice.
- @all_ignored_links = Set.new # Used for building crawl statistics.
- @broken_link_map = {} # Maps a link to its absolute (crawlable) form.
- @crawl_stats = {} # Records crawl stats e.g. duration etc.
+ # Returns the current crawl stats.
+ def crawl_stats
+ @manager.crawl_stats
  end

  # Finds broken links within a single page and records them.
  # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
  def crawl_url(url)
- reset_crawl
+ @manager.empty

  start = Time.now
  url = url.to_url
@@ -55,17 +62,17 @@ module BrokenLinkFinder
  find_broken_links(doc)
  retry_broken_links

- sort_links
- set_crawl_stats(url: url, pages_crawled: [url], start: start)
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: [url], start: start)

- @broken_links.any?
+ broken_links.any?
  end

  # Finds broken links within an entire site and records them.
  # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
- def crawl_site(url)
- reset_crawl
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+ @manager.empty

  start = Time.now
  url = url.to_url
@@ -74,7 +81,8 @@

  # Crawl the site's HTML web pages looking for links.
  # We dup the url to avoid recording any redirects.
- externals = @crawler.crawl_site(url.dup) do |doc|
+ paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+ externals = @crawler.crawl_site(url.dup, **paths) do |doc|
  crawled << doc.url
  next unless doc

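`crawl_site` now accepts `allow_paths:` and `disallow_paths:` and forwards them straight to `Wgit::Crawler#crawl_site`, as shown above. A hedged usage sketch follows; the path patterns are made up and their exact matching semantics belong to wgit, not this diff.

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site(
  'http://example.com',              # Hypothetical URL.
  allow_paths:    ['blog/*'],        # Illustrative: only crawl /blog pages...
  disallow_paths: ['blog/drafts/*']  # ...but skip drafts.
)
puts finder.broken_links.keys
```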
@@ -82,17 +90,20 @@
  pool.process { find_broken_links(doc) }
  end

+ # Wait for all threads to finish, even if url was invalid.
+ pool.shutdown
+
  # Ensure the given website url is valid.
  raise "Invalid or broken URL: #{url}" unless externals

- # Wait for all threads to finish.
- pool.shutdown
  retry_broken_links

- sort_links
- set_crawl_stats(url: url, pages_crawled: crawled.to_a, start: start)
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)

- @broken_links.any?
+ broken_links.any?
+ ensure
+ pool.shutdown if defined?(pool)
  end

  # Outputs the link report into a stream e.g. STDOUT or a file,
@@ -109,8 +120,8 @@
  end

  reporter = klass.new(stream, @sort,
- @broken_links, @ignored_links,
- @broken_link_map, @crawl_stats)
+ broken_links, ignored_links,
+ @manager.broken_link_map, crawl_stats)
  reporter.call(broken_verbose: broken_verbose,
  ignored_verbose: ignored_verbose)
  end
@@ -119,18 +130,18 @@

  # Finds which links are unsupported or broken and records the details.
  def find_broken_links(page)
- process_unparsable_links(page) # Record them as broken.
+ record_unparsable_links(page) # Record them as broken.

  links = get_supported_links(page)

  # Iterate over the supported links checking if they're broken or not.
  links.each do |link|
  # Skip if the link has been encountered previously.
- next if @all_intact_links.include?(link)
+ next if @manager.all_intact_links.include?(link)

- if @all_broken_links.include?(link)
+ if @manager.all_broken_links.include?(link)
  # The link has already been proven broken so simply record it.
- append_broken_link(page, link, map: false)
+ @manager.append_broken_link(page, link, map: false)
  next
  end

@@ -139,29 +150,21 @@

  # Determine if the crawled link is broken or not and record it.
  if link_broken?(link_doc)
- append_broken_link(page, link)
- else # Record it as being intact.
- @lock.synchronize { @all_intact_links << link }
+ @manager.append_broken_link(page, link)
+ else
+ @manager.append_intact_link(link)
  end
  end

  nil
  end

- # Record each unparsable link as a broken link.
- def process_unparsable_links(doc)
- doc.unparsable_links.each do |link|
- append_broken_link(doc, link, map: false)
- @broken_link_map[link] = link
- end
- end
-
  # Implements a retry mechanism for each of the broken links found.
  # Removes any broken links found to be working OK.
  def retry_broken_links
  sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.

- @broken_link_map.select! do |link, href|
+ @manager.broken_link_map.select! do |link, href|
  # Don't retry unparsable links (which are Strings).
  next(true) unless href.is_a?(Wgit::Url)

@@ -170,27 +173,35 @@ module BrokenLinkFinder
  if link_broken?(doc)
  true
  else
- remove_broken_link(link)
+ @manager.remove_broken_link(link)
  false
  end
  end
  end

+ # Record each unparsable link as a broken link.
+ def record_unparsable_links(doc)
+ doc.unparsable_links.each do |link|
+ # We map the link ourselves because link is a String, not a Wgit::Url.
+ @manager.append_broken_link(doc, link, map: false)
+ @manager.broken_link_map[link] = link
+ end
+ end
+
  # Report and reject any non supported links. Any link that is absolute and
  # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
  def get_supported_links(doc)
- doc.all_links
- .reject do |link|
- if link.is_absolute? && !link.start_with?('http')
- append_ignored_link(doc.url, link)
- true
- end
- end
+ doc.all_links.reject do |link|
+ if link.is_absolute? && !link.start_with?('http')
+ @manager.append_ignored_link(doc.url, link)
+ true
+ end
+ end
  end

  # Make the link absolute and crawl it, returning its Wgit::Document.
  def crawl_link(doc, link)
- link = link.prefix_base(doc)
+ link = link.make_absolute(doc)
  @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
  end

@@ -210,87 +221,6 @@ module BrokenLinkFinder
  doc.xpath("//*[@id='#{fragment}']").empty?
  end

- # Append key => [value] to the broken link collections.
- # If map: true, then the link will also be recorded in @broken_link_map.
- def append_broken_link(doc, link, map: true)
- key, value = get_key_value(doc.url, link)
-
- @lock.synchronize do
- @broken_links[key] = [] unless @broken_links[key]
- @broken_links[key] << value
-
- @all_broken_links << link
-
- @broken_link_map[link] = link.prefix_base(doc) if map
- end
- end
-
- # Remove the broken link from the necessary collections.
- def remove_broken_link(link)
- @lock.synchronize do
- if @sort == :page
- @broken_links.each { |_k, links| links.delete(link) }
- @broken_links.delete_if { |_k, links| links.empty? }
- else
- @broken_links.delete(link)
- end
-
- @all_broken_links.delete(link)
- @all_intact_links << link
- end
- end
-
- # Append key => [value] to the ignored link collections.
- def append_ignored_link(url, link)
- key, value = get_key_value(url, link)
-
- @lock.synchronize do
- @ignored_links[key] = [] unless @ignored_links[key]
- @ignored_links[key] << value
-
- @all_ignored_links << link
- end
- end
-
- # Returns the correct key value depending on the @sort type.
- # @sort == :page ? [url, link] : [link, url]
- def get_key_value(url, link)
- case @sort
- when :page
- [url, link]
- when :link
- [link, url]
- else
- raise "Unsupported sort type: #{sort}"
- end
- end
-
- # Sort keys and values alphabetically.
- def sort_links
- @broken_links.values.map(&:uniq!)
- @ignored_links.values.map(&:uniq!)
-
- @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
- @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
-
- @broken_links.each { |_k, v| v.sort! }
- @ignored_links.each { |_k, v| v.sort! }
- end
-
- # Sets various statistics about the crawl and its links.
- def set_crawl_stats(url:, pages_crawled:, start:)
- @crawl_stats[:url] = url
- @crawl_stats[:pages_crawled] = pages_crawled
- @crawl_stats[:num_pages] = pages_crawled.size
- @crawl_stats[:num_links] = (
- @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
- )
- @crawl_stats[:num_broken_links] = @all_broken_links.size
- @crawl_stats[:num_intact_links] = @all_intact_links.size
- @crawl_stats[:num_ignored_links] = @all_ignored_links.size
- @crawl_stats[:duration] = Time.now - start
- end
-
  alias crawl_page crawl_url
  alias crawl_r crawl_site
  end
@@ -0,0 +1,137 @@
+ # frozen_string_literal: true
+
+ module BrokenLinkFinder
+ # Class responsible for handling the link collection logic.
+ class LinkManager
+ # Used for mapping pages to broken links.
+ attr_reader :broken_links
+
+ # Used for mapping pages to ignored links.
+ attr_reader :ignored_links
+
+ # Used to record crawl statistics e.g. duration etc.
+ attr_reader :crawl_stats
+
+ # Used to map a link (as is) to its absolute (crawlable) form.
+ attr_reader :broken_link_map
+
+ # Used to prevent crawling a broken link twice.
+ attr_reader :all_broken_links
+
+ # Used to prevent crawling an intact link twice.
+ attr_reader :all_intact_links
+
+ # Used for building crawl statistics.
+ attr_reader :all_ignored_links
+
+ # Returns a new LinkManager instance with empty link collections.
+ def initialize(sort)
+ raise "Sort by either :page or :link, not #{sort}" \
+ unless %i[page link].include?(sort)
+
+ @sort = sort
+ @lock = Mutex.new
+
+ empty # Initialises the link collections.
+ end
+
+ # Initialise/empty the link collection objects.
+ def empty
+ @broken_links = {}
+ @ignored_links = {}
+ @crawl_stats = {}
+ @broken_link_map = {}
+ @all_broken_links = Set.new
+ @all_intact_links = Set.new
+ @all_ignored_links = Set.new
+ end
+
+ # Append key => [value] to the broken link collections.
+ # If map: true, then the link will also be recorded in @broken_link_map.
+ def append_broken_link(doc, link, map: true)
+ key, value = get_key_value(doc.url, link)
+
+ @lock.synchronize do
+ @broken_links[key] = [] unless @broken_links[key]
+ @broken_links[key] << value
+
+ @all_broken_links << link
+
+ @broken_link_map[link] = link.make_absolute(doc) if map
+ end
+ end
+
+ # Remove the broken link from the necessary collections.
+ def remove_broken_link(link)
+ @lock.synchronize do
+ if @sort == :page
+ @broken_links.each { |_k, links| links.delete(link) }
+ @broken_links.delete_if { |_k, links| links.empty? }
+ else
+ @broken_links.delete(link)
+ end
+
+ @all_broken_links.delete(link)
+ @all_intact_links << link
+ end
+ end
+
+ # Append key => [value] to the ignored link collections.
+ def append_ignored_link(url, link)
+ key, value = get_key_value(url, link)
+
+ @lock.synchronize do
+ @ignored_links[key] = [] unless @ignored_links[key]
+ @ignored_links[key] << value
+
+ @all_ignored_links << link
+ end
+ end
+
+ # Append link to @all_intact_links.
+ def append_intact_link(link)
+ @lock.synchronize { @all_intact_links << link }
+ end
+
+ # Sorts the link collection's keys and values alphabetically.
+ def sort
+ @broken_links.values.map(&:uniq!)
+ @ignored_links.values.map(&:uniq!)
+
+ @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
+ @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+
+ @broken_links.each { |_k, v| v.sort! }
+ @ignored_links.each { |_k, v| v.sort! }
+ end
+
+ # Tally's up various statistics about the crawl and its links.
+ def tally(url:, pages_crawled:, start:)
+ @crawl_stats[:url] = url
+ @crawl_stats[:pages_crawled] = pages_crawled
+ @crawl_stats[:num_pages] = pages_crawled.size
+ @crawl_stats[:num_links] = (
+ @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+ )
+ @crawl_stats[:num_broken_links] = @all_broken_links.size
+ @crawl_stats[:num_intact_links] = @all_intact_links.size
+ @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+ @crawl_stats[:duration] = Time.now - start
+ end
+
+ private
+
+ # Returns the correct key value depending on the @sort type.
+ # @sort == :page ? [url, link] : [link, url]
+ def get_key_value(url, link)
+ case @sort
+ when :page
+ [url, link]
+ when :link
+ [link, url]
+ else
+ raise "Unsupported sort type: #{sort}"
+ end
+ end
+ end
+ end
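Finder now delegates all of its collection bookkeeping to this new LinkManager. The sketch below strings together the calls Finder makes elsewhere in this diff (`empty`, the `append_*` methods, `sort`, `tally`); it is not a documented standalone workflow, and the values passed to `tally` are placeholders.

```ruby
require 'broken_link_finder'

manager = BrokenLinkFinder::LinkManager.new(:page)
start   = Time.now

manager.empty # Reset every link collection (done before each crawl).

# During a crawl, Finder records each link via one of:
#   manager.append_broken_link(doc, link)
#   manager.append_intact_link(link)
#   manager.append_ignored_link(doc.url, link)

manager.sort # Sort the collections' keys and values alphabetically.
manager.tally(url: 'http://example.com',             # Placeholder values.
              pages_crawled: ['http://example.com'],
              start: start)

puts manager.crawl_stats[:num_links] # => 0, since nothing was recorded above.
```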
@@ -1,8 +1,9 @@
  # frozen_string_literal: true

  module BrokenLinkFinder
+ # Class responsible for reporting in a HTML format.
  class HTMLReporter < Reporter
- # Creates a new HTMLReporter instance.
+ # Returns a new HTMLReporter instance.
  # stream is any Object that responds to :puts and :print.
  def initialize(stream, sort,
  broken_links, ignored_links,
@@ -6,7 +6,7 @@ module BrokenLinkFinder
  # The amount of pages/links to display when verbose is false.
  NUM_VALUES = 3

- # Creates a new Reporter instance.
+ # Returns a new Reporter instance.
  # stream is any Object that responds to :puts and :print.
  def initialize(stream, sort,
  broken_links, ignored_links,
@@ -1,8 +1,9 @@
  # frozen_string_literal: true

  module BrokenLinkFinder
+ # Class responsible for reporting in a text format.
  class TextReporter < Reporter
- # Creates a new TextReporter instance.
+ # Returns a new TextReporter instance.
  # stream is any Object that responds to :puts and :print.
  def initialize(stream, sort,
  broken_links, ignored_links,
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module BrokenLinkFinder
- VERSION = '0.11.0'
+ VERSION = '0.11.1'
  end
@@ -18,7 +18,7 @@ rescue StandardError
  end

  # We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
- Wgit::Document.define_extension(
+ Wgit::Document.define_extractor(
  :all_links,
  '//*/@href | //*/@src', # Any element's href or src attribute URL.
  singleton: false,
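wgit 0.9 renames `Wgit::Document.define_extension` to `define_extractor`; the extractor above exposes every `href`/`src` URL on a crawled page as `doc.all_links`, which `Finder#get_supported_links` then filters. A rough sketch of consuming that extractor directly, with an illustrative URL:

```ruby
require 'broken_link_finder' # Loads wgit and defines the :all_links extractor above.

crawler = Wgit::Crawler.new
doc = crawler.crawl('http://example.com'.to_url) # Hypothetical URL.

doc&.all_links&.each do |link|
  # Mirror Finder's rule: absolute links not starting with 'http'
  # (e.g. mailto:, tel:) are unsupported and get ignored.
  next if link.is_absolute? && !link.start_with?('http')
  puts link
end
```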
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: broken_link_finder
  version: !ruby/object:Gem::Version
- version: 0.11.0
+ version: 0.11.1
  platform: ruby
  authors:
  - Michael Telford
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-01-27 00:00:00.000000000 Z
+ date: 2020-07-31 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -72,14 +72,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.0'
+ version: '13.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.0'
+ version: '13.0'
  - !ruby/object:Gem::Dependency
  name: webmock
  requirement: !ruby/object:Gem::Requirement
@@ -128,14 +128,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.8'
+ version: '0.9'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.8'
+ version: '0.9'
  description: Finds a website's broken links using the 'wgit' gem and reports back
  to you with a summary.
  email: michael.telford@live.com
@@ -159,6 +159,7 @@ files:
  - exe/broken_link_finder
  - lib/broken_link_finder.rb
  - lib/broken_link_finder/finder.rb
+ - lib/broken_link_finder/link_manager.rb
  - lib/broken_link_finder/reporter/html_reporter.rb
  - lib/broken_link_finder/reporter/reporter.rb
  - lib/broken_link_finder/reporter/text_reporter.rb
@@ -189,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.0.6
+ rubygems_version: 3.1.2
  signing_key:
  specification_version: 4
  summary: Finds a website's broken links and reports back to you with a summary.