broken_link_finder 0.9.4 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +52 -0
- data/Gemfile.lock +51 -38
- data/README.md +65 -29
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +14 -3
- data/lib/broken_link_finder.rb +8 -2
- data/lib/broken_link_finder/finder.rb +131 -132
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- data/lib/broken_link_finder/xpath.rb +14 -0
- metadata +21 -15
- data/lib/broken_link_finder/reporter.rb +0 -116
data/exe/broken_link_finder
CHANGED
```diff
@@ -9,30 +9,41 @@ class BrokenLinkFinderCLI < Thor
   desc 'crawl [URL]', 'Find broken links at the URL'
   option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
   option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+  option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
+  option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
   option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
   option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
   option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
   def crawl(url)
     url = "http://#{url}" unless url.start_with?('http')

+    report_type = options[:html] ? :html : :text
     sort_by = options[:sort_by_link] ? :link : :page
     max_threads = options[:threads]
     broken_verbose = !options[:concise]
     ignored_verbose = options[:verbose]

+    BrokenLinkFinder.link_xpath = options[:xpath]
     finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
-    finder.pretty_print_link_report(
-      broken_verbose: broken_verbose,
+    finder.report(
+      type: report_type,
+      broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
-  rescue … => e
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end

   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end
```
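For orientation, the new CLI flags map one-to-one onto the library API shown in the `finder.rb` diff below. A minimal sketch of the Ruby equivalent of `broken_link_finder crawl example.com -r --html -t 50` (the URL and thread count are illustrative):

```ruby
require 'broken_link_finder'

BrokenLinkFinder.link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH # The -x flag.

finder = BrokenLinkFinder::Finder.new(sort: :page, max_threads: 50)
finder.crawl_site('http://example.com') # -r given; use crawl_page(url) without it.
finder.report(type: :html, broken_verbose: true, ignored_verbose: false)
```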
data/lib/broken_link_finder.rb
CHANGED
```diff
@@ -2,8 +2,14 @@

 require 'wgit'
 require 'wgit/core_ext'
+require 'thread/pool'
+require 'set'

-require_relative './broken_link_finder/wgit_extensions'
 require_relative './broken_link_finder/version'
-require_relative './broken_link_finder/reporter'
+require_relative './broken_link_finder/xpath'
+require_relative './broken_link_finder/wgit_extensions'
+require_relative './broken_link_finder/link_manager'
+require_relative './broken_link_finder/reporter/reporter'
+require_relative './broken_link_finder/reporter/text_reporter'
+require_relative './broken_link_finder/reporter/html_reporter'
 require_relative './broken_link_finder/finder'
```
data/lib/broken_link_finder/finder.rb
CHANGED

```diff
@@ -1,60 +1,70 @@
 # frozen_string_literal: true

-require_relative 'reporter'
-require 'thread/pool'
-require 'set'
-
 module BrokenLinkFinder
-  DEFAULT_MAX_THREADS = 100
+  DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+  SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.

   # Alias for BrokenLinkFinder::Finder.new.
   def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
     Finder.new(sort: sort, max_threads: max_threads)
   end

+  # Class responsible for finding broken links on a page or site.
   class Finder
-    …
+    # The collection key - either :page or :link.
+    attr_reader :sort
+
+    # The max number of threads created during #crawl_site - one thread per page.
+    attr_reader :max_threads

-    # …
-    def initialize(sort: :page, max_threads: …
+    # Returns a new Finder instance.
+    def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
       raise "Sort by either :page or :link, not #{sort}" \
         unless %i[page link].include?(sort)

       @sort = sort
       @max_threads = max_threads
-      @lock = Mutex.new
       @crawler = Wgit::Crawler.new
+      @manager = BrokenLinkFinder::LinkManager.new(@sort)
+    end

-      …
+    # Returns the current broken links.
+    def broken_links
+      @manager.broken_links
     end

-    # …
-    def …
-      @…
-      @ignored_links = {}
-      @total_links_crawled = 0
-      @all_broken_links = Set.new
-      @all_intact_links = Set.new
+    # Returns the current ignored links.
+    def ignored_links
+      @manager.ignored_links
     end

-    # …
-    …
+    # Returns the current crawl stats.
+    def crawl_stats
+      @manager.crawl_stats
+    end
+
+    # Finds broken links within a single page and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
     def crawl_url(url)
-      …
+      @manager.empty
+
+      start = Time.now
+      url = url.to_url

-      url…
-      doc = @crawler.crawl(url)
+      # We dup the url to avoid recording any redirects.
+      doc = @crawler.crawl(url.dup)

       # Ensure the given page url is valid.
       raise "Invalid or broken URL: #{url}" unless doc

       # Get all page links and determine which are broken.
       find_broken_links(doc)
+      retry_broken_links

-      …
-      …
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: [url], start: start)

-      …
+      broken_links.any?
     end

```
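Since the accessors now delegate to `LinkManager`, every crawl also exposes statistics via `Finder#crawl_stats`. A sketch of reading them after a single-page crawl (the URL is illustrative; the stats keys come from `LinkManager#tally`, shown further below):

```ruby
finder = BrokenLinkFinder.new(sort: :page)
finder.crawl_page('http://example.com') # Alias for crawl_url.

finder.broken_links # => Hash mapping each page URL to its broken links.
finder.crawl_stats  # => { url:, pages_crawled:, num_pages:, num_links:,
                    #      num_broken_links:, num_intact_links:,
                    #      num_ignored_links:, duration: }
```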
```diff
@@ -61,32 +71,38 @@
-    # Finds broken links within an entire site and …
-    # …
-    # at least one broken link was found and an Array of all pages crawled.
+    # Finds broken links within an entire site and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
-    def crawl_site(url)
-      …
+    def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+      @manager.empty

-      …
-      …
-      …
+      start = Time.now
+      url = url.to_url
+      pool = Thread.pool(@max_threads)
+      crawled = Set.new

       # Crawl the site's HTML web pages looking for links.
-      …
-      …
+      # We dup the url to avoid recording any redirects.
+      paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      externals = @crawler.crawl_site(url.dup, **paths) do |doc|
+        crawled << doc.url
         next unless doc

         # Start a thread for each page, checking for broken links.
         pool.process { find_broken_links(doc) }
       end

+      # Wait for all threads to finish, even if url was invalid.
+      pool.shutdown
+
       # Ensure the given website url is valid.
       raise "Invalid or broken URL: #{url}" unless externals

-      …
-      pool.shutdown
+      retry_broken_links

-      …
-      …
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)

-      …
+      broken_links.any?
+    ensure
+      pool.shutdown if defined?(pool)
     end

```
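`crawl_site` now accepts path filters, which it forwards straight to `Wgit::Crawler#crawl_site`, and it checks each crawled page on a `thread/pool` worker. A sketch of the new keywords, assuming Wgit accepts glob-like path strings (the paths and URL are illustrative):

```ruby
finder = BrokenLinkFinder.new(max_threads: 30)

# Only check pages under /blog, skipping the archive.
finder.crawl_site(
  'http://example.com',
  allow_paths:    'blog/**',
  disallow_paths: 'blog/archive/**'
)
```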
```diff
@@ -93,19 +109,20 @@
-    # …
+    # Outputs the link report into a stream e.g. STDOUT or a file,
     # anything that respond_to? :puts. Defaults to STDOUT.
-    … (15 lines: the old pretty_print_link_report implementation)
+    def report(stream = STDOUT, type: :text,
+               broken_verbose: true, ignored_verbose: false)
+      klass = case type
+              when :text
+                BrokenLinkFinder::TextReporter
+              when :html
+                BrokenLinkFinder::HTMLReporter
+              else
+                raise "The type: must be :text or :html, not: :#{type}"
+              end
+
+      reporter = klass.new(stream, @sort,
+                           broken_links, ignored_links,
+                           @manager.broken_link_map, crawl_stats)
+      reporter.call(broken_verbose: broken_verbose,
+                    ignored_verbose: ignored_verbose)
     end

```
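Because the stream argument only needs to `respond_to? :puts`, the new reporter classes can target a file as easily as STDOUT. A minimal sketch:

```ruby
finder.report # Text report to STDOUT, the defaults.

File.open('report.html', 'w') do |file|
  finder.report(file, type: :html)
end
```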
```diff
@@ -112,117 +129,99 @@
     private

     # Finds which links are unsupported or broken and records the details.
     def find_broken_links(page)
+      record_unparsable_links(page) # Record them as broken.
+
       links = get_supported_links(page)

       # Iterate over the supported links checking if they're broken or not.
       links.each do |link|
-        # …
-        next if @all_intact_links.include?(link)
+        # Skip if the link has been encountered previously.
+        next if @manager.all_intact_links.include?(link)

-        if @all_broken_links.include?(link)
-          …
+        if @manager.all_broken_links.include?(link)
+          # The link has already been proven broken so simply record it.
+          @manager.append_broken_link(page, link, map: false)
           next
         end

-        # The link hasn't been …
+        # The link hasn't been encountered before so we crawl it.
         link_doc = crawl_link(page, link)

-        # Determine if the crawled link is broken or not.
-        if …
-           …
-           has_broken_anchor(link_doc)
-          append_broken_link(page.url, link)
+        # Determine if the crawled link is broken or not and record it.
+        if link_broken?(link_doc)
+          @manager.append_broken_link(page, link)
         else
-          @…
+          @manager.append_intact_link(link)
         end
       end

       nil
     end

-    # …
-    # …
-    def …
-      …
-      ….reject do |link|
-        if link.is_absolute? && !link.start_with?('http')
-          append_ignored_link(doc.url, link)
-          true
-        end
-      end
-    end
-
-    # Makes the link absolute and crawls it, returning its Wgit::Document.
-    def crawl_link(doc, link)
-      link = link.prefix_base(doc)
-      @crawler.crawl(link)
-    end
+    # Implements a retry mechanism for each of the broken links found.
+    # Removes any broken links found to be working OK.
+    def retry_broken_links
+      sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.

-      …
-      …
-      …
+      @manager.broken_link_map.select! do |link, href|
+        # Don't retry unparsable links (which are Strings).
+        next(true) unless href.is_a?(Wgit::Url)

-      …
-      return false if fragment.nil? || fragment.empty?
+        doc = @crawler.crawl(href.dup)

-      …
+        if link_broken?(doc)
+          true
+        else
+          @manager.remove_broken_link(link)
+          false
+        end
+      end
     end

-    # …
-    def …
-      …
-      …
-      …
-      @…
-      @broken_links[key] << value
-      …
-      @all_broken_links << link
+    # Record each unparsable link as a broken link.
+    def record_unparsable_links(doc)
+      doc.unparsable_links.each do |link|
+        # We map the link ourselves because link is a String, not a Wgit::Url.
+        @manager.append_broken_link(doc, link, map: false)
+        @manager.broken_link_map[link] = link
       end
     end

-    # …
-    … (6 lines)
+    # Report and reject any non supported links. Any link that is absolute and
+    # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
+    def get_supported_links(doc)
+      doc.all_links.reject do |link|
+        if link.is_absolute? && !link.start_with?('http')
+          @manager.append_ignored_link(doc.url, link)
+          true
+        end
       end
     end

-    # …
-    …
-    …
-    …
-      when :page
-        [url, link]
-      when :link
-        [link, url]
-      else
-        raise "Unsupported sort type: #{sort}"
-      end
+    # Make the link absolute and crawl it, returning its Wgit::Document.
+    def crawl_link(doc, link)
+      link = link.make_absolute(doc)
+      @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
     end

-    # …
-    def …
-      @…
-      …
+    # Return if the crawled link is broken or not.
+    def link_broken?(doc)
+      doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
+    end

-    …
-    …
+    # Returns true if the link is/contains a broken anchor/fragment.
+    # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
+    def has_broken_anchor(doc)
+      raise 'The link document is nil' unless doc

-      …
-      …
-      end
+      fragment = doc.url.fragment
+      return false if fragment.nil? || fragment.empty?

-      …
-    def set_total_links_crawled
-      @total_links_crawled = @all_broken_links.size + @all_intact_links.size
+      doc.xpath("//*[@id='#{fragment}']").empty?
     end

-    alias crawl_page …
-    alias crawl_r …
-    alias pretty_print_link_summary pretty_print_link_report
+    alias crawl_page crawl_url
+    alias crawl_r crawl_site
   end
 end
```
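The fragment check above is plain XPath over the crawled document. The same idea can be tried standalone with Wgit's own API, a sketch (the URL is illustrative, and it assumes the crawl succeeds and returns a document):

```ruby
require 'broken_link_finder' # Pulls in wgit and its core extensions.

crawler = Wgit::Crawler.new
doc     = crawler.crawl('http://example.com/about#contact'.to_url)

fragment = doc.url.fragment                # => "contact"
doc.xpath("//*[@id='#{fragment}']").empty? # => true means a broken anchor.
```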
data/lib/broken_link_finder/link_manager.rb
ADDED

```diff
@@ -0,0 +1,137 @@
+# frozen_string_literal: true
+
+module BrokenLinkFinder
+  # Class responsible for handling the link collection logic.
+  class LinkManager
+    # Used for mapping pages to broken links.
+    attr_reader :broken_links
+
+    # Used for mapping pages to ignored links.
+    attr_reader :ignored_links
+
+    # Used to record crawl statistics e.g. duration etc.
+    attr_reader :crawl_stats
+
+    # Used to map a link (as is) to its absolute (crawlable) form.
+    attr_reader :broken_link_map
+
+    # Used to prevent crawling a broken link twice.
+    attr_reader :all_broken_links
+
+    # Used to prevent crawling an intact link twice.
+    attr_reader :all_intact_links
+
+    # Used for building crawl statistics.
+    attr_reader :all_ignored_links
+
+    # Returns a new LinkManager instance with empty link collections.
+    def initialize(sort)
+      raise "Sort by either :page or :link, not #{sort}" \
+        unless %i[page link].include?(sort)
+
+      @sort = sort
+      @lock = Mutex.new
+
+      empty # Initialises the link collections.
+    end
+
+    # Initialise/empty the link collection objects.
+    def empty
+      @broken_links = {}
+      @ignored_links = {}
+      @crawl_stats = {}
+      @broken_link_map = {}
+      @all_broken_links = Set.new
+      @all_intact_links = Set.new
+      @all_ignored_links = Set.new
+    end
+
+    # Append key => [value] to the broken link collections.
+    # If map: true, then the link will also be recorded in @broken_link_map.
+    def append_broken_link(doc, link, map: true)
+      key, value = get_key_value(doc.url, link)
+
+      @lock.synchronize do
+        @broken_links[key] = [] unless @broken_links[key]
+        @broken_links[key] << value
+
+        @all_broken_links << link
+
+        @broken_link_map[link] = link.make_absolute(doc) if map
+      end
+    end
+
+    # Remove the broken link from the necessary collections.
+    def remove_broken_link(link)
+      @lock.synchronize do
+        if @sort == :page
+          @broken_links.each { |_k, links| links.delete(link) }
+          @broken_links.delete_if { |_k, links| links.empty? }
+        else
+          @broken_links.delete(link)
+        end
+
+        @all_broken_links.delete(link)
+        @all_intact_links << link
+      end
+    end
+
+    # Append key => [value] to the ignored link collections.
+    def append_ignored_link(url, link)
+      key, value = get_key_value(url, link)
+
+      @lock.synchronize do
+        @ignored_links[key] = [] unless @ignored_links[key]
+        @ignored_links[key] << value
+
+        @all_ignored_links << link
+      end
+    end
+
+    # Append link to @all_intact_links.
+    def append_intact_link(link)
+      @lock.synchronize { @all_intact_links << link }
+    end
+
+    # Sorts the link collection's keys and values alphabetically.
+    def sort
+      @broken_links.values.map(&:uniq!)
+      @ignored_links.values.map(&:uniq!)
+
+      @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
+      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+
+      @broken_links.each { |_k, v| v.sort! }
+      @ignored_links.each { |_k, v| v.sort! }
+    end
+
+    # Tally's up various statistics about the crawl and its links.
+    def tally(url:, pages_crawled:, start:)
+      @crawl_stats[:url] = url
+      @crawl_stats[:pages_crawled] = pages_crawled
+      @crawl_stats[:num_pages] = pages_crawled.size
+      @crawl_stats[:num_links] = (
+        @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+      )
+      @crawl_stats[:num_broken_links] = @all_broken_links.size
+      @crawl_stats[:num_intact_links] = @all_intact_links.size
+      @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+      @crawl_stats[:duration] = Time.now - start
+    end
+
+    private
+
+    # Returns the correct key value depending on the @sort type.
+    # @sort == :page ? [url, link] : [link, url]
+    def get_key_value(url, link)
+      case @sort
+      when :page
+        [url, link]
+      when :link
+        [link, url]
+      else
+        raise "Unsupported sort type: #{sort}"
+      end
+    end
+  end
+end
```
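`get_key_value` is what flips the report's orientation: with `sort: :page` the collections map each page to the broken links found on it, with `sort: :link` they map each broken link to the pages containing it. Illustrative shapes of `broken_links` (the data is made up):

```ruby
# sort: :page - pages mapped to the broken links found on them:
{ 'http://example.com/about' => ['/missing.png', 'http://dead.example'] }

# sort: :link - broken links mapped to the pages containing them:
{ '/missing.png' => ['http://example.com/about', 'http://example.com/contact'] }
```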