cobweb 1.0.11 → 1.0.12
- data/README.textile +4 -3
- data/lib/cobweb.rb +31 -8
- data/lib/cobweb_crawler.rb +7 -8
- data/lib/cobweb_process_job.rb +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +11 -4
- data/lib/crawl_finished_worker.rb +27 -0
- data/lib/crawl_helper.rb +250 -0
- data/lib/crawl_job.rb +2 -2
- data/lib/crawl_process_worker.rb +31 -0
- data/lib/crawl_worker.rb +118 -0
- data/lib/sidekiq/cobweb_helper.rb +16 -0
- data/lib/stats.rb +12 -11
- data/lib/uri_helper.rb +8 -0
- data/spec/cobweb/cobweb_crawl_helper_spec.rb +4 -1
- data/spec/cobweb/cobweb_crawl_spec.rb +29 -13
- data/spec/cobweb/cobweb_crawler_spec.rb +33 -14
- data/spec/cobweb/cobweb_links_spec.rb +2 -1
- data/spec/cobweb/cobweb_spec.rb +3 -0
- data/spec/cobweb/content_link_parser_spec.rb +4 -0
- data/spec/cobweb/{cobweb_job_spec.rb → crawl_job_spec.rb} +52 -9
- data/spec/cobweb/crawl_worker_spec.rb +250 -0
- data/spec/cobweb/robots_spec.rb +2 -1
- data/spec/http_stubs.rb +95 -0
- data/spec/samples/sample_site/{boxgrid.html → boxgrid>withsillyname.html} +1 -1
- data/spec/samples/sample_site/dashboard.html +1 -1
- data/spec/samples/sample_site/forms.html +1 -1
- data/spec/samples/sample_site/gallery.html +1 -1
- data/spec/samples/sample_site/more.html +1 -1
- data/spec/samples/sample_site/tables.html +1 -1
- data/spec/samples/sample_site/typography.html +1 -1
- data/spec/spec_helper.rb +6 -88
- metadata +85 -35
- data/spec/cobweb/site_test_spec.rb.tmp +0 -101
data/README.textile
CHANGED
@@ -1,10 +1,11 @@
 
-h1. Cobweb v1.0.11
+h1. Cobweb v1.0.12
 
 "@cobweb_gem":https://twitter.com/cobweb_gem
-
-!https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
+!https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
 !https://gemnasium.com/stewartmckee/cobweb.png!
+!https://coveralls.io/repos/stewartmckee/cobweb/badge.png?branch=master(Coverage Status)!:https://coveralls.io/r/stewartmckee/cobweb
+
 
 h2. Intro
 
data/lib/cobweb.rb
CHANGED
@@ -33,8 +33,14 @@ class Cobweb
     default_use_encoding_safe_process_job_to false
     default_follow_redirects_to true
     default_redirect_limit_to 10
-
-
+    default_queue_system_to :resque
+    if @options[:queue_system] == :resque
+      default_processing_queue_to "CobwebProcessJob"
+      default_crawl_finished_queue_to "CobwebFinishedJob"
+    else
+      default_processing_queue_to "CrawlProcessWorker"
+      default_crawl_finished_queue_to "CrawlFinishedWorker"
+    end
     default_quiet_to true
     default_debug_to false
     default_cache_to 300
@@ -49,7 +55,7 @@ class Cobweb
     default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
     default_valid_mime_types_to ["*/*"]
     default_raise_exceptions_to false
-
+    default_store_inbound_links_to false
 
   end
 
@@ -80,7 +86,14 @@ class Cobweb
 
     # add internal_urls into redis
     @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
-
+    if @options[:queue_system] == :resque
+      Resque.enqueue(CrawlJob, request)
+    elsif @options[:queue_system] == :sidekiq
+      CrawlWorker.perform_async(request)
+    else
+      raise "Unknown queue system: #{content_request[:queue_system]}"
+    end
+
     request
   end
 
@@ -124,8 +137,13 @@ class Cobweb
 
     # check if it has already been cached
     if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
-
-
+      if @options[:cache_type] == :crawl_based
+        puts "Cache hit for #{url}" unless @options[:quiet]
+        content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+      else
+        puts "Cache hit for #{url}" unless @options[:quiet]
+        content = HashUtil.deep_symbolize_keys(Marshal.load(full_redis.get(unique_id)))
+      end
     else
       # retrieve data
       #unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -204,8 +222,13 @@ class Cobweb
       end
       # add content to cache if required
      if @options[:cache]
-
-
+        if @options[:cache_type] == :crawl_based
+          redis.set(unique_id, Marshal.dump(content))
+          redis.expire unique_id, @options[:cache].to_i
+        else
+          full_redis.set(unique_id, Marshal.dump(content))
+          full_redis.expire unique_id, @options[:cache].to_i
+        end
      end
    rescue RedirectError => e
      raise e if @options[:raise_exceptions]
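Taken together, the cobweb.rb changes let a crawl be dispatched to either Resque or Sidekiq. As a rough illustration only (the option names come from the diff above; the URL and the Cobweb#start call are assumptions about the gem's existing API), a crawl using the new Sidekiq path might look like this:

require 'cobweb'

# Assumed usage sketch, not from the gem's docs: queue the crawl via Sidekiq
# instead of the default Resque. With :queue_system => :sidekiq the defaults
# above resolve to CrawlProcessWorker / CrawlFinishedWorker.
crawler = Cobweb.new(
  :queue_system        => :sidekiq,
  :store_inbound_links => true        # new option, defaults to false
)
crawler.start("http://example.com/")  # placeholder URL; enqueues a CrawlWorker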
data/lib/cobweb_crawler.rb
CHANGED
@@ -96,14 +96,12 @@ class CobwebCrawler
         @redis.sadd "crawled", url.to_s
         @redis.incr "crawl-counter"
 
-
+        document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
 
         # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
         cobweb_links = CobwebLinks.new(@options)
 
-        internal_links =
-
-        all_internal_links = internal_links
+        internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
 
         # reject the link if we've crawled it or queued it
         internal_links.reject!{|link| @redis.sismember("crawled", link)}
@@ -120,12 +118,13 @@ class CobwebCrawler
           @queue_counter += 1
         end
 
-        if @options[:
-
-
+        if @options[:store_inbound_links]
+          document_links.each do |target_link|
+            target_uri = UriHelper.parse(target_link)
+            @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
          end
        end
-
+
        @crawl_counter = @redis.scard("crawled").to_i
        @queue_counter = @redis.scard("queued").to_i
 
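With :store_inbound_links enabled, the crawler writes one Redis set per target URL, keyed by the MD5 of the parsed target, holding the pages that linked to it. A sketch of reading that data back, under the assumption that you address the keys directly (the namespace prefix added by NamespacedRedis for the crawl is omitted here and would need to match):

require 'redis'
require 'digest'

redis  = Redis.new
target = "http://example.com/contact.html"                # placeholder URL
key    = "inbound_links_#{Digest::MD5.hexdigest(target)}"  # key layout from the diff above
puts redis.smembers(key)                                   # pages that linked to the target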
data/lib/cobweb_process_job.rb
CHANGED
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -88,11 +88,12 @@ module CobwebModule
 
      @cobweb_links = CobwebLinks.new(@options)
      if within_queue_limits?
-
+        document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
        #get rid of duplicate links in the same page.
-
+        document_links.uniq!
+
        # select the link if its internal
-        internal_links.select
+        internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
 
        # reject the link if we've crawled it or queued it
        internal_links.reject! { |link| @redis.sismember("crawled", link) }
@@ -111,6 +112,13 @@ module CobwebModule
          end
        end
      end
+
+      if @options[:store_inbound_links]
+        document_links.each do |link|
+          uri = URI.parse(link)
+          @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", url)
+        end
+      end
    end
  end
 
@@ -180,7 +188,6 @@ module CobwebModule
 
    def finished
      set_first_to_finish
-      debug_ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}"
      @stats.end_crawl(@options)
    end
 
data/lib/crawl_finished_worker.rb
ADDED
@@ -0,0 +1,27 @@
+require 'sidekiq'
+require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+# If your client is single-threaded, we just need a single connection in our Redis connection pool
+#Sidekiq.configure_client do |config|
+#  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+#end
+
+# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+#Sidekiq.configure_server do |config|
+#  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+#end
+
+class CrawlFinishedWorker
+
+  include Sidekiq::Worker
+
+  sidekiq_options queue: "crawl_finished_worker"
+
+
+  def perform(statistics)
+    puts "Dummy Finished Job"
+
+    ap statistics
+
+  end
+end
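The shipped CrawlFinishedWorker is only a dummy that prints the statistics hash. In practice you would point :crawl_finished_queue at your own worker; CrawlHelper#finished (below) resolves that class name with const_get and enqueues it via perform_async when :queue_system is :sidekiq. A sketch under those assumptions (the queue name and statistics keys shown are guesses, not documented API):

require 'sidekiq'

class MyCrawlFinishedWorker
  include Sidekiq::Worker
  sidekiq_options queue: "crawl_finished_worker"

  def perform(statistics)
    # statistics is the crawl stats hash merged with :crawl_id and :crawled_base_url;
    # Sidekiq round-trips it through JSON, so keys arrive as strings.
    puts "Crawl #{statistics['crawl_id']} finished for #{statistics['crawled_base_url']}"
  end
end

# Passed by name when starting the crawl, e.g.:
#   Cobweb.new(:queue_system => :sidekiq,
#              :crawl_finished_queue => "MyCrawlFinishedWorker").start(url)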
data/lib/crawl_helper.rb
ADDED
@@ -0,0 +1,250 @@
+class CrawlHelper
+
+  require "net/https"
+  require "uri"
+  require "redis"
+  require 'namespaced_redis'
+
+  def self.crawl_page(content_request)
+    # change all hash keys to symbols
+    content_request = HashUtil.deep_symbolize_keys(content_request)
+    @content_request = content_request
+
+    content_request[:redis_options] = {} unless content_request.has_key? :redis_options
+    content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
+    content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
+    content_request[:queue_system] = content_request[:queue_system].to_sym
+
+    @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
+    @stats = Stats.new(content_request)
+
+    @debug = content_request[:debug]
+
+    decrement_queue_counter
+
+    # check we haven't crawled this url before
+    unless @redis.sismember "crawled", content_request[:url]
+      # if there is no limit or we're still under it lets get the url
+      if within_crawl_limits?(content_request[:crawl_limit])
+        content = Cobweb.new(content_request).get(content_request[:url], content_request)
+        if content_request[:url] == @redis.get("original_base_url")
+          @redis.set("crawled_base_url", content[:base_url])
+        end
+        if is_permitted_type(content)
+          begin
+            # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
+            @redis.srem "queued", content_request[:url]
+            @redis.sadd "crawled", content_request[:url]
+            @redis.srem "queued", content[:url]
+            @redis.sadd "crawled", content[:url]
+            # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+            if content_request[:crawl_limit_by_page]
+              if content[:mime_type].match("text/html")
+                increment_crawl_started_counter
+              end
+            else
+              increment_crawl_started_counter
+            end
+
+            ## update statistics
+            @stats.update_status("Crawling #{content_request[:url]}...")
+            @stats.update_statistics(content)
+
+            # set the base url if this is the first page
+            set_base_url @redis, content, content_request
+
+            @cobweb_links = CobwebLinks.new(content_request)
+            if within_queue_limits?(content_request[:crawl_limit])
+              internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
+
+              # select the link if its internal
+              internal_links.select! { |link| @cobweb_links.internal?(link) }
+
+              # reject the link if we've crawled it or queued it
+              internal_links.reject! { |link| @redis.sismember("crawled", link) }
+              internal_links.reject! { |link| @redis.sismember("queued", link) }
+
+              internal_links.each do |link|
+                enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
+              end
+            end
+
+            # enqueue to processing queue
+            send_to_processing_queue(content, content_request)
+
+            #if the enqueue counter has been requested update that
+            if content_request.has_key? :enqueue_counter_key
+              enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
+              current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+              enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
+            end
+
+          ensure
+            #update the queued and crawled lists if we are within the crawl limits.
+
+            # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
+            # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
+            if content_request[:crawl_limit_by_page]
+              if content[:mime_type].match("text/html")
+                increment_crawl_counter
+              end
+            else
+              increment_crawl_counter
+            end
+            puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
+          end
+        else
+          puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
+        end
+      else
+        puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
+      end
+
+    else
+      @redis.srem "queued", content_request[:url]
+      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
+    end
+
+    # if there's nothing left queued or the crawled limit has been reached
+    refresh_counters
+    if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
+      if @queue_counter+@crawl_started_counter-@crawl_counter == 0
+        finished(content_request)
+      end
+    elsif (@queue_counter +@crawl_started_counter-@crawl_counter)== 0 || @crawl_counter >= content_request[:crawl_limit].to_i
+      finished(content_request)
+    end
+
+  end
+
+  # Sets the crawl status to 'Crawl Finished' and enqueues the crawl finished job
+  def self.finished(content_request)
+    # finished
+    if @redis.hget("statistics", "current_status")!= "Crawl Finished"
+      ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
+      @stats.end_crawl(content_request)
+
+      additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @redis.get("crawled_base_url")}
+      additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
+      additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
+
+      if content_request[:queue_system] == :resque
+        Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
+      elsif content_request[:queue_system] == :sidekiq
+        puts "Queueing Finished on Sidekiq"
+        const_get(content_request[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
+      else
+        raise "Unknown queue system: #{content_request[:queue_system]}"
+      end
+    else
+      # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
+    end
+  end
+
+  # Enqueues the content to the processing queue setup in options
+  def self.send_to_processing_queue(content, content_request)
+    content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+    if content_request[:direct_call_process_job]
+      clazz = const_get(content_request[:processing_queue])
+      clazz.perform(content_to_send)
+    elsif content_request[:use_encoding_safe_process_job]
+      content_to_send[:body] = Base64.encode64(content[:body])
+      content_to_send[:processing_queue] = content_request[:processing_queue]
+      Resque.enqueue(EncodingSafeProcessJob, content_to_send)
+    else
+      if content_request[:queue_system] == :resque
+        Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
+      elsif content_request[:queue_system] == :sidekiq
+        puts "Queueing on Sidekiq"
+        const_get(content_request[:processing_queue]).perform_async(content_to_send)
+      else
+        raise "Unknown queue system: #{content_request[:queue_system]}"
+      end
+    end
+    puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}" if content_request[:debug]
+  end
+
+  private
+
+  # Helper method to determine if this content is to be processed or not
+  def self.is_permitted_type(content)
+    @content_request[:valid_mime_types].each do |mime_type|
+      return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
+    end
+    false
+  end
+
+  # Returns true if the crawl count is within limits
+  def self.within_crawl_limits?(crawl_limit)
+    refresh_counters
+    crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
+  end
+
+  # Returns true if the queue count is calculated to be still within limits when complete
+  def self.within_queue_limits?(crawl_limit)
+    refresh_counters
+    (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or @crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
+  end
+
+  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
+  def self.set_base_url(redis, content, content_request)
+    if redis.get("base_url").nil?
+      unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
+        uri = Addressable::URI.parse(content[:redirect_through].last)
+        redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+      end
+      redis.set("base_url", content[:url])
+    end
+  end
+
+  # Enqueues content to the crawl_job queue
+  def self.enqueue_content(content_request, link)
+    new_request = content_request.clone
+    new_request[:url] = link
+    new_request[:parent] = content_request[:url]
+    if content_request[:queue_system] == :resque
+      Resque.enqueue(CrawlJob, new_request)
+    elsif content_request[:queue_system] == :sidekiq
+      puts "Queueing content on Sidekiq"
+      CrawlWorker.perform_async(new_request)
+    else
+      raise "Unknown queue system: #{content_request[:queue_system]}"
+    end
+    @redis.sadd "queued", link
+    increment_queue_counter
+  end
+
+  # Increments the queue counter and refreshes crawl counters
+  def self.increment_queue_counter
+    @redis.incr "queue-counter"
+    refresh_counters
+  end
+  # Increments the crawl counter and refreshes crawl counters
+  def self.increment_crawl_counter
+    @redis.incr "crawl-counter"
+    refresh_counters
+  end
+  def self.increment_crawl_started_counter
+    @redis.incr "crawl-started-counter"
+    refresh_counters
+  end
+  # Decrements the queue counter and refreshes crawl counters
+  def self.decrement_queue_counter
+    @redis.decr "queue-counter"
+    refresh_counters
+  end
+  # Refreshes the crawl counters
+  def self.refresh_counters
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @crawl_started_counter = @redis.get("crawl-started-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
+
+  def self.print_counters
+    puts counters
+  end
+
+  def self.counters
+    "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
+  end
+end
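CrawlHelper also honours an optional enqueue counter: when :enqueue_counter_key is present it increments a field in a Redis hash (under the namespace given by :enqueue_counter_namespace) each time a page is sent to the processing queue. A sketch under the assumption that these options are passed straight through from the Cobweb constructor into the content request; the namespace, key and field names here are placeholders:

crawler = Cobweb.new(
  :queue_system              => :sidekiq,
  :enqueue_counter_namespace => "my_app",        # placeholder namespace
  :enqueue_counter_key       => "crawl_counts",  # Redis hash key
  :enqueue_counter_field     => "pages_enqueued" # field incremented per processed page
)
crawler.start("http://example.com/")             # placeholder URL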