broken_link_finder 0.9.4 → 0.12.0
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +52 -0
- data/Gemfile.lock +51 -38
- data/README.md +65 -29
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +14 -3
- data/lib/broken_link_finder.rb +8 -2
- data/lib/broken_link_finder/finder.rb +131 -132
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- data/lib/broken_link_finder/xpath.rb +14 -0
- metadata +21 -15
- data/lib/broken_link_finder/reporter.rb +0 -116
data/exe/broken_link_finder
CHANGED
@@ -9,30 +9,41 @@ class BrokenLinkFinderCLI < Thor
   desc 'crawl [URL]', 'Find broken links at the URL'
   option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
   option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+  option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
+  option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
   option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
   option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
   option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
   def crawl(url)
     url = "http://#{url}" unless url.start_with?('http')

+    report_type = options[:html] ? :html : :text
     sort_by = options[:sort_by_link] ? :link : :page
     max_threads = options[:threads]
     broken_verbose = !options[:concise]
     ignored_verbose = options[:verbose]

+    BrokenLinkFinder.link_xpath = options[:xpath]
     finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
-    finder.pretty_print_link_report(
-      broken_verbose: broken_verbose,
+    finder.report(
+      type: report_type,
+      broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
-  rescue StandardError => e
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end

   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end

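The two new crawl options (--xpath and --html) slot in alongside the new explicit exit codes (0 on success, 1 on error), which makes the command easier to script against. A minimal usage sketch (the URL and XPath values are hypothetical; assumes the gem's executable is on your PATH):

  broken_link_finder crawl example.com
  broken_link_finder crawl -r --html example.com > report.html
  broken_link_finder crawl -x '//img/@src' example.com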
data/lib/broken_link_finder.rb
CHANGED
@@ -2,8 +2,14 @@

 require 'wgit'
 require 'wgit/core_ext'
+require 'thread/pool'
+require 'set'

-require_relative './broken_link_finder/wgit_extensions'
 require_relative './broken_link_finder/version'
-require_relative './broken_link_finder/reporter'
+require_relative './broken_link_finder/xpath'
+require_relative './broken_link_finder/wgit_extensions'
+require_relative './broken_link_finder/link_manager'
+require_relative './broken_link_finder/reporter/reporter'
+require_relative './broken_link_finder/reporter/text_reporter'
+require_relative './broken_link_finder/reporter/html_reporter'
 require_relative './broken_link_finder/finder'
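Besides moving thread/pool and set up to the top-level file, the requires now mirror the restructured lib/ layout: xpath.rb, link_manager.rb and a reporter/ directory with one reporter per output format. A minimal library-side sketch of the new xpath hook, as used by the CLI above (the XPath value is hypothetical):

  require 'broken_link_finder'

  # Check e.g. image sources instead of the default link set.
  BrokenLinkFinder.link_xpath = '//img/@src'

  finder = BrokenLinkFinder.new(sort: :link) # Alias for Finder.new.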
data/lib/broken_link_finder/finder.rb
CHANGED
@@ -1,228 +1,227 @@
 # frozen_string_literal: true

-require_relative 'reporter'
-require 'thread/pool'
-require 'set'
-
 module BrokenLinkFinder
-  DEFAULT_MAX_THREADS = 100
+  DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+  SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.

   # Alias for BrokenLinkFinder::Finder.new.
   def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
     Finder.new(sort: sort, max_threads: max_threads)
   end

+  # Class responsible for finding broken links on a page or site.
   class Finder
-
+    # The collection key - either :page or :link.
+    attr_reader :sort
+
+    # The max number of threads created during #crawl_site - one thread per page.
+    attr_reader :max_threads

-    #
-    def initialize(sort: :page, max_threads:
+    # Returns a new Finder instance.
+    def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
       raise "Sort by either :page or :link, not #{sort}" \
       unless %i[page link].include?(sort)

       @sort = sort
       @max_threads = max_threads
-      @lock = Mutex.new
       @crawler = Wgit::Crawler.new
+      @manager = BrokenLinkFinder::LinkManager.new(@sort)
+    end

-
+    # Returns the current broken links.
+    def broken_links
+      @manager.broken_links
     end

-    #
-    def
-      @
-      @ignored_links = {}
-      @total_links_crawled = 0
-      @all_broken_links = Set.new
-      @all_intact_links = Set.new
+    # Returns the current ignored links.
+    def ignored_links
+      @manager.ignored_links
     end

-    #
-
+    # Returns the current crawl stats.
+    def crawl_stats
+      @manager.crawl_stats
+    end
+
+    # Finds broken links within a single page and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
     def crawl_url(url)
-
+      @manager.empty
+
+      start = Time.now
+      url = url.to_url

-      url
-      doc = @crawler.crawl(url)
+      # We dup the url to avoid recording any redirects.
+      doc = @crawler.crawl(url.dup)

       # Ensure the given page url is valid.
       raise "Invalid or broken URL: #{url}" unless doc

       # Get all page links and determine which are broken.
       find_broken_links(doc)
+      retry_broken_links

-
-
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: [url], start: start)

-
+      broken_links.any?
     end

-    # Finds broken links within an entire site and
-    #
-    # at least one broken link was found and an Array of all pages crawled.
+    # Finds broken links within an entire site and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
-    def crawl_site(url)
-
+    def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+      @manager.empty

-
-
-
+      start = Time.now
+      url = url.to_url
+      pool = Thread.pool(@max_threads)
+      crawled = Set.new

       # Crawl the site's HTML web pages looking for links.
-
-
+      # We dup the url to avoid recording any redirects.
+      paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      externals = @crawler.crawl_site(url.dup, **paths) do |doc|
+        crawled << doc.url
         next unless doc

         # Start a thread for each page, checking for broken links.
         pool.process { find_broken_links(doc) }
       end

+      # Wait for all threads to finish, even if url was invalid.
+      pool.shutdown
+
       # Ensure the given website url is valid.
       raise "Invalid or broken URL: #{url}" unless externals

-
-      pool.shutdown
+      retry_broken_links

-
-
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)

-
+      broken_links.any?
+    ensure
+      pool.shutdown if defined?(pool)
     end

-    #
+    # Outputs the link report into a stream e.g. STDOUT or a file,
     # anything that respond_to? :puts. Defaults to STDOUT.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def report(stream = STDOUT, type: :text,
+               broken_verbose: true, ignored_verbose: false)
+      klass = case type
+              when :text
+                BrokenLinkFinder::TextReporter
+              when :html
+                BrokenLinkFinder::HTMLReporter
+              else
+                raise "The type: must be :text or :html, not: :#{type}"
+              end
+
+      reporter = klass.new(stream, @sort,
+                           broken_links, ignored_links,
+                           @manager.broken_link_map, crawl_stats)
+      reporter.call(broken_verbose: broken_verbose,
+                    ignored_verbose: ignored_verbose)
     end

     private

     # Finds which links are unsupported or broken and records the details.
     def find_broken_links(page)
+      record_unparsable_links(page) # Record them as broken.
+
       links = get_supported_links(page)

       # Iterate over the supported links checking if they're broken or not.
       links.each do |link|
-        #
-        next if @all_intact_links.include?(link)
+        # Skip if the link has been encountered previously.
+        next if @manager.all_intact_links.include?(link)

-        if @all_broken_links.include?(link)
-
+        if @manager.all_broken_links.include?(link)
+          # The link has already been proven broken so simply record it.
+          @manager.append_broken_link(page, link, map: false)
           next
         end

-        # The link hasn't been
+        # The link hasn't been encountered before so we crawl it.
         link_doc = crawl_link(page, link)

-        # Determine if the crawled link is broken or not.
-        if
-
-           has_broken_anchor(link_doc)
-          append_broken_link(page.url, link)
+        # Determine if the crawled link is broken or not and record it.
+        if link_broken?(link_doc)
+          @manager.append_broken_link(page, link)
         else
-          @
+          @manager.append_intact_link(link)
         end
       end

       nil
     end

-    #
-    #
-    def
-
-      .reject do |link|
-        if link.is_absolute? && !link.start_with?('http')
-          append_ignored_link(doc.url, link)
-          true
-        end
-      end
-    end
-
-    # Makes the link absolute and crawls it, returning its Wgit::Document.
-    def crawl_link(doc, link)
-      link = link.prefix_base(doc)
-      @crawler.crawl(link)
-    end
+    # Implements a retry mechanism for each of the broken links found.
+    # Removes any broken links found to be working OK.
+    def retry_broken_links
+      sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.

-
-
-
+      @manager.broken_link_map.select! do |link, href|
+        # Don't retry unparsable links (which are Strings).
+        next(true) unless href.is_a?(Wgit::Url)

-
-      return false if fragment.nil? || fragment.empty?
+        doc = @crawler.crawl(href.dup)

-
+        if link_broken?(doc)
+          true
+        else
+          @manager.remove_broken_link(link)
+          false
+        end
+      end
     end

-    #
-    def
-
-
-
-      @
-      @broken_links[key] << value
-
-      @all_broken_links << link
+    # Record each unparsable link as a broken link.
+    def record_unparsable_links(doc)
+      doc.unparsable_links.each do |link|
+        # We map the link ourselves because link is a String, not a Wgit::Url.
+        @manager.append_broken_link(doc, link, map: false)
+        @manager.broken_link_map[link] = link
       end
     end

-    #
-
-
-
-
-
-
+    # Report and reject any non supported links. Any link that is absolute and
+    # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
+    def get_supported_links(doc)
+      doc.all_links.reject do |link|
+        if link.is_absolute? && !link.start_with?('http')
+          @manager.append_ignored_link(doc.url, link)
+          true
+        end
       end
     end

-    #
-
-
-
-      when :page
-        [url, link]
-      when :link
-        [link, url]
-      else
-        raise "Unsupported sort type: #{sort}"
-      end
+    # Make the link absolute and crawl it, returning its Wgit::Document.
+    def crawl_link(doc, link)
+      link = link.make_absolute(doc)
+      @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
     end

-    #
-    def
-      @
-
+    # Return if the crawled link is broken or not.
+    def link_broken?(doc)
+      doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
+    end

-
-
+    # Returns true if the link is/contains a broken anchor/fragment.
+    # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
+    def has_broken_anchor(doc)
+      raise 'The link document is nil' unless doc

-
-
-    end
+      fragment = doc.url.fragment
+      return false if fragment.nil? || fragment.empty?

-
-    def set_total_links_crawled
-      @total_links_crawled = @all_broken_links.size + @all_intact_links.size
+      doc.xpath("//*[@id='#{fragment}']").empty?
     end

-    alias crawl_page
-    alias crawl_r
-    alias pretty_print_link_summary pretty_print_link_report
+    alias crawl_page crawl_url
+    alias crawl_r crawl_site
   end
 end

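From calling code, the reworked Finder comes together like this. A minimal sketch (the URL, path filter and filename are hypothetical; allow_paths:/disallow_paths: are simply forwarded to Wgit::Crawler#crawl_site):

  require 'broken_link_finder'

  finder = BrokenLinkFinder::Finder.new(sort: :page, max_threads: 50)
  finder.crawl_site('http://example.com', disallow_paths: 'admin')

  # Stream the new HTML report to a file; anything responding to :puts works.
  File.open('report.html', 'w') do |file|
    finder.report(file, type: :html, broken_verbose: true, ignored_verbose: false)
  end

  puts finder.crawl_stats[:duration] # Seconds taken, set by LinkManager#tally.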
data/lib/broken_link_finder/link_manager.rb
ADDED
@@ -0,0 +1,137 @@
+# frozen_string_literal: true
+
+module BrokenLinkFinder
+  # Class responsible for handling the link collection logic.
+  class LinkManager
+    # Used for mapping pages to broken links.
+    attr_reader :broken_links
+
+    # Used for mapping pages to ignored links.
+    attr_reader :ignored_links
+
+    # Used to record crawl statistics e.g. duration etc.
+    attr_reader :crawl_stats
+
+    # Used to map a link (as is) to its absolute (crawlable) form.
+    attr_reader :broken_link_map
+
+    # Used to prevent crawling a broken link twice.
+    attr_reader :all_broken_links
+
+    # Used to prevent crawling an intact link twice.
+    attr_reader :all_intact_links
+
+    # Used for building crawl statistics.
+    attr_reader :all_ignored_links
+
+    # Returns a new LinkManager instance with empty link collections.
+    def initialize(sort)
+      raise "Sort by either :page or :link, not #{sort}" \
+      unless %i[page link].include?(sort)
+
+      @sort = sort
+      @lock = Mutex.new
+
+      empty # Initialises the link collections.
+    end
+
+    # Initialise/empty the link collection objects.
+    def empty
+      @broken_links = {}
+      @ignored_links = {}
+      @crawl_stats = {}
+      @broken_link_map = {}
+      @all_broken_links = Set.new
+      @all_intact_links = Set.new
+      @all_ignored_links = Set.new
+    end
+
+    # Append key => [value] to the broken link collections.
+    # If map: true, then the link will also be recorded in @broken_link_map.
+    def append_broken_link(doc, link, map: true)
+      key, value = get_key_value(doc.url, link)
+
+      @lock.synchronize do
+        @broken_links[key] = [] unless @broken_links[key]
+        @broken_links[key] << value
+
+        @all_broken_links << link
+
+        @broken_link_map[link] = link.make_absolute(doc) if map
+      end
+    end
+
+    # Remove the broken link from the necessary collections.
+    def remove_broken_link(link)
+      @lock.synchronize do
+        if @sort == :page
+          @broken_links.each { |_k, links| links.delete(link) }
+          @broken_links.delete_if { |_k, links| links.empty? }
+        else
+          @broken_links.delete(link)
+        end
+
+        @all_broken_links.delete(link)
+        @all_intact_links << link
+      end
+    end
+
+    # Append key => [value] to the ignored link collections.
+    def append_ignored_link(url, link)
+      key, value = get_key_value(url, link)
+
+      @lock.synchronize do
+        @ignored_links[key] = [] unless @ignored_links[key]
+        @ignored_links[key] << value
+
+        @all_ignored_links << link
+      end
+    end
+
+    # Append link to @all_intact_links.
+    def append_intact_link(link)
+      @lock.synchronize { @all_intact_links << link }
+    end
+
+    # Sorts the link collection's keys and values alphabetically.
+    def sort
+      @broken_links.values.map(&:uniq!)
+      @ignored_links.values.map(&:uniq!)
+
+      @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
+      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+
+      @broken_links.each { |_k, v| v.sort! }
+      @ignored_links.each { |_k, v| v.sort! }
+    end
+
+    # Tally's up various statistics about the crawl and its links.
+    def tally(url:, pages_crawled:, start:)
+      @crawl_stats[:url] = url
+      @crawl_stats[:pages_crawled] = pages_crawled
+      @crawl_stats[:num_pages] = pages_crawled.size
+      @crawl_stats[:num_links] = (
+        @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+      )
+      @crawl_stats[:num_broken_links] = @all_broken_links.size
+      @crawl_stats[:num_intact_links] = @all_intact_links.size
+      @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+      @crawl_stats[:duration] = Time.now - start
+    end
+
+    private
+
+    # Returns the correct key value depending on the @sort type.
+    # @sort == :page ? [url, link] : [link, url]
+    def get_key_value(url, link)
+      case @sort
+      when :page
+        [url, link]
+      when :link
+        [link, url]
+      else
+        raise "Unsupported sort type: #{sort}"
+      end
+    end
+  end
+end
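The private get_key_value method at the bottom is what the gem-wide sort: option ultimately controls: it decides whether collections are keyed by page or by link. A runnable sketch of the two modes (the URLs are hypothetical):

  require 'broken_link_finder'

  by_page = BrokenLinkFinder::LinkManager.new(:page)
  by_page.append_ignored_link('http://example.com', 'mailto:me@example.com')
  by_page.ignored_links # => { "http://example.com" => ["mailto:me@example.com"] }

  by_link = BrokenLinkFinder::LinkManager.new(:link)
  by_link.append_ignored_link('http://example.com', 'mailto:me@example.com')
  by_link.ignored_links # => { "mailto:me@example.com" => ["http://example.com"] }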