cobweb 1.0.11 → 1.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +4 -3
- data/lib/cobweb.rb +31 -8
- data/lib/cobweb_crawler.rb +7 -8
- data/lib/cobweb_process_job.rb +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +11 -4
- data/lib/crawl_finished_worker.rb +27 -0
- data/lib/crawl_helper.rb +250 -0
- data/lib/crawl_job.rb +2 -2
- data/lib/crawl_process_worker.rb +31 -0
- data/lib/crawl_worker.rb +118 -0
- data/lib/sidekiq/cobweb_helper.rb +16 -0
- data/lib/stats.rb +12 -11
- data/lib/uri_helper.rb +8 -0
- data/spec/cobweb/cobweb_crawl_helper_spec.rb +4 -1
- data/spec/cobweb/cobweb_crawl_spec.rb +29 -13
- data/spec/cobweb/cobweb_crawler_spec.rb +33 -14
- data/spec/cobweb/cobweb_links_spec.rb +2 -1
- data/spec/cobweb/cobweb_spec.rb +3 -0
- data/spec/cobweb/content_link_parser_spec.rb +4 -0
- data/spec/cobweb/{cobweb_job_spec.rb → crawl_job_spec.rb} +52 -9
- data/spec/cobweb/crawl_worker_spec.rb +250 -0
- data/spec/cobweb/robots_spec.rb +2 -1
- data/spec/http_stubs.rb +95 -0
- data/spec/samples/sample_site/{boxgrid.html → boxgrid>withsillyname.html} +1 -1
- data/spec/samples/sample_site/dashboard.html +1 -1
- data/spec/samples/sample_site/forms.html +1 -1
- data/spec/samples/sample_site/gallery.html +1 -1
- data/spec/samples/sample_site/more.html +1 -1
- data/spec/samples/sample_site/tables.html +1 -1
- data/spec/samples/sample_site/typography.html +1 -1
- data/spec/spec_helper.rb +6 -88
- metadata +85 -35
- data/spec/cobweb/site_test_spec.rb.tmp +0 -101
data/README.textile
CHANGED
@@ -1,10 +1,11 @@
 
-h1. Cobweb v1.0.11
+h1. Cobweb v1.0.12
 
 "@cobweb_gem":https://twitter.com/cobweb_gem
-
-!https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
+!https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
 !https://gemnasium.com/stewartmckee/cobweb.png!
+!https://coveralls.io/repos/stewartmckee/cobweb/badge.png?branch=master(Coverage Status)!:https://coveralls.io/r/stewartmckee/cobweb
+
 
 h2. Intro
 
data/lib/cobweb.rb
CHANGED
@@ -33,8 +33,14 @@ class Cobweb
     default_use_encoding_safe_process_job_to false
     default_follow_redirects_to true
     default_redirect_limit_to 10
-    default_processing_queue_to "CobwebProcessJob"
-    default_crawl_finished_queue_to "CobwebFinishedJob"
+    default_queue_system_to :resque
+    if @options[:queue_system] == :resque
+      default_processing_queue_to "CobwebProcessJob"
+      default_crawl_finished_queue_to "CobwebFinishedJob"
+    else
+      default_processing_queue_to "CrawlProcessWorker"
+      default_crawl_finished_queue_to "CrawlFinishedWorker"
+    end
     default_quiet_to true
     default_debug_to false
     default_cache_to 300
@@ -49,7 +55,7 @@ class Cobweb
     default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
     default_valid_mime_types_to ["*/*"]
     default_raise_exceptions_to false
-
+    default_store_inbound_links_to false
 
   end
 
@@ -80,7 +86,14 @@ class Cobweb
 
     # add internal_urls into redis
     @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
-    Resque.enqueue(CrawlJob, request)
+    if @options[:queue_system] == :resque
+      Resque.enqueue(CrawlJob, request)
+    elsif @options[:queue_system] == :sidekiq
+      CrawlWorker.perform_async(request)
+    else
+      raise "Unknown queue system: #{content_request[:queue_system]}"
+    end
+
     request
   end
 
@@ -124,8 +137,13 @@ class Cobweb
 
     # check if it has already been cached
     if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
-      puts "Cache hit for #{url}" unless @options[:quiet]
-      content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+      if @options[:cache_type] == :crawl_based
+        puts "Cache hit for #{url}" unless @options[:quiet]
+        content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+      else
+        puts "Cache hit for #{url}" unless @options[:quiet]
+        content = HashUtil.deep_symbolize_keys(Marshal.load(full_redis.get(unique_id)))
+      end
     else
       # retrieve data
       #unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -204,8 +222,13 @@ class Cobweb
         end
         # add content to cache if required
         if @options[:cache]
-          redis.set(unique_id, Marshal.dump(content))
-          redis.expire unique_id, @options[:cache].to_i
+          if @options[:cache_type] == :crawl_based
+            redis.set(unique_id, Marshal.dump(content))
+            redis.expire unique_id, @options[:cache].to_i
+          else
+            full_redis.set(unique_id, Marshal.dump(content))
+            full_redis.expire unique_id, @options[:cache].to_i
+          end
         end
       rescue RedirectError => e
         raise e if @options[:raise_exceptions]
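The net effect of the cobweb.rb changes is a new :queue_system option: :resque keeps the existing CrawlJob / CobwebProcessJob / CobwebFinishedJob path, while :sidekiq routes requests through the new CrawlWorker, CrawlProcessWorker and CrawlFinishedWorker classes. A minimal usage sketch, not taken from the package (the URL and the :redis_options value are illustrative assumptions):

require 'cobweb'

# Resque remains the default queue system
Cobweb.new(:queue_system => :resque).start("http://example.com")

# Opting in to Sidekiq: start() now calls CrawlWorker.perform_async instead of Resque.enqueue(CrawlJob)
Cobweb.new(:queue_system => :sidekiq, :redis_options => {:host => "localhost"}).start("http://example.com")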
data/lib/cobweb_crawler.rb
CHANGED
@@ -96,14 +96,12 @@ class CobwebCrawler
         @redis.sadd "crawled", url.to_s
         @redis.incr "crawl-counter"
 
-
+        document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
 
         # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
         cobweb_links = CobwebLinks.new(@options)
 
-        internal_links =
-
-        all_internal_links = internal_links
+        internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
 
         # reject the link if we've crawled it or queued it
         internal_links.reject!{|link| @redis.sismember("crawled", link)}
@@ -120,12 +118,13 @@ class CobwebCrawler
           @queue_counter += 1
         end
 
-        if @options[:
-
-
+        if @options[:store_inbound_links]
+          document_links.each do |target_link|
+            target_uri = UriHelper.parse(target_link)
+            @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
           end
         end
-
+
         @crawl_counter = @redis.scard("crawled").to_i
         @queue_counter = @redis.scard("queued").to_i
 
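Together with the new default_store_inbound_links_to option, the crawler can now record which pages link to each URL: every parsed link gets a Redis set named inbound_links_<MD5 of the target URL> inside the crawl's namespace, holding the URLs that point at it. A hedged sketch of using it (only :store_inbound_links comes from this diff; the URL, cache value and the exact key prefix applied by NamespacedRedis are assumptions):

require 'cobweb'
require 'digest'
require 'redis'

crawler = CobwebCrawler.new(:cache => 600, :store_inbound_links => true)
statistics = crawler.crawl("http://example.com")

# read back the pages linking to one URL; prepend the crawl's Redis namespace
# according to your own setup
redis = Redis.new
linking_pages = redis.smembers("inbound_links_#{Digest::MD5.hexdigest("http://example.com/page.html")}")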
data/lib/cobweb_process_job.rb
CHANGED
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -88,11 +88,12 @@ module CobwebModule
 
       @cobweb_links = CobwebLinks.new(@options)
       if within_queue_limits?
-
+        document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
         #get rid of duplicate links in the same page.
-
+        document_links.uniq!
+
         # select the link if its internal
-        internal_links.select
+        internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
 
         # reject the link if we've crawled it or queued it
         internal_links.reject! { |link| @redis.sismember("crawled", link) }
@@ -111,6 +112,13 @@ module CobwebModule
             end
           end
         end
+
+        if @options[:store_inbound_links]
+          document_links.each do |link|
+            uri = URI.parse(link)
+            @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", url)
+          end
+        end
       end
     end
 
@@ -180,7 +188,6 @@ module CobwebModule
 
     def finished
       set_first_to_finish
-      debug_ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}"
       @stats.end_crawl(@options)
     end
 
data/lib/crawl_finished_worker.rb
ADDED
@@ -0,0 +1,27 @@
+require 'sidekiq'
+require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+# If your client is single-threaded, we just need a single connection in our Redis connection pool
+#Sidekiq.configure_client do |config|
+#  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+#end
+
+# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+#Sidekiq.configure_server do |config|
+#  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+#end
+
+class CrawlFinishedWorker
+
+  include Sidekiq::Worker
+
+  sidekiq_options queue: "crawl_finished_worker"
+
+
+  def perform(statistics)
+    puts "Dummy Finished Job"
+
+    ap statistics
+
+  end
+end
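CrawlFinishedWorker itself is only a placeholder that dumps the statistics hash. The class named in the :crawl_finished_queue option is resolved with const_get and invoked via perform_async when :queue_system is :sidekiq (see CrawlHelper below), so any Sidekiq worker can take its place. A hedged sketch (the class name, queue name and the statistics keys printed here are examples, not part of the gem; :crawl_id and :crawled_base_url are merged into the stats by CrawlHelper.finished):

class MyCrawlFinishedWorker
  include Sidekiq::Worker
  sidekiq_options queue: "crawl_finished_worker"

  def perform(statistics)
    # Sidekiq hands the stats over as JSON, so keys arrive as strings
    puts "Crawl #{statistics['crawl_id']} finished for #{statistics['crawled_base_url']}"
  end
end

Cobweb.new(:queue_system => :sidekiq, :crawl_finished_queue => "MyCrawlFinishedWorker").start("http://example.com")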
data/lib/crawl_helper.rb
ADDED
@@ -0,0 +1,250 @@
+class CrawlHelper
+
+  require "net/https"
+  require "uri"
+  require "redis"
+  require 'namespaced_redis'
+
+  def self.crawl_page(content_request)
+    # change all hash keys to symbols
+    content_request = HashUtil.deep_symbolize_keys(content_request)
+    @content_request = content_request
+
+    content_request[:redis_options] = {} unless content_request.has_key? :redis_options
+    content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
+    content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
+    content_request[:queue_system] = content_request[:queue_system].to_sym
+
+    @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
+    @stats = Stats.new(content_request)
+
+    @debug = content_request[:debug]
+
+    decrement_queue_counter
+
+    # check we haven't crawled this url before
+    unless @redis.sismember "crawled", content_request[:url]
+      # if there is no limit or we're still under it lets get the url
+      if within_crawl_limits?(content_request[:crawl_limit])
+        content = Cobweb.new(content_request).get(content_request[:url], content_request)
+        if content_request[:url] == @redis.get("original_base_url")
+          @redis.set("crawled_base_url", content[:base_url])
+        end
+        if is_permitted_type(content)
+          begin
+            # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
+            @redis.srem "queued", content_request[:url]
+            @redis.sadd "crawled", content_request[:url]
+            @redis.srem "queued", content[:url]
+            @redis.sadd "crawled", content[:url]
+            # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+            if content_request[:crawl_limit_by_page]
+              if content[:mime_type].match("text/html")
+                increment_crawl_started_counter
+              end
+            else
+              increment_crawl_started_counter
+            end
+
+            ## update statistics
+            @stats.update_status("Crawling #{content_request[:url]}...")
+            @stats.update_statistics(content)
+
+            # set the base url if this is the first page
+            set_base_url @redis, content, content_request
+
+            @cobweb_links = CobwebLinks.new(content_request)
+            if within_queue_limits?(content_request[:crawl_limit])
+              internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
+
+              # select the link if its internal
+              internal_links.select! { |link| @cobweb_links.internal?(link) }
+
+              # reject the link if we've crawled it or queued it
+              internal_links.reject! { |link| @redis.sismember("crawled", link) }
+              internal_links.reject! { |link| @redis.sismember("queued", link) }
+
+              internal_links.each do |link|
+                enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
+              end
+            end
+
+            # enqueue to processing queue
+            send_to_processing_queue(content, content_request)
+
+            #if the enqueue counter has been requested update that
+            if content_request.has_key? :enqueue_counter_key
+              enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
+              current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+              enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
+            end
+
+          ensure
+            #update the queued and crawled lists if we are within the crawl limits.
+
+            # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
+            # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
+            if content_request[:crawl_limit_by_page]
+              if content[:mime_type].match("text/html")
+                increment_crawl_counter
+              end
+            else
+              increment_crawl_counter
+            end
+            puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
+          end
+        else
+          puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
+        end
+      else
+        puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
+      end
+
+    else
+      @redis.srem "queued", content_request[:url]
+      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
+    end
+
+    # if there's nothing left queued or the crawled limit has been reached
+    refresh_counters
+    if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
+      if @queue_counter+@crawl_started_counter-@crawl_counter == 0
+        finished(content_request)
+      end
+    elsif (@queue_counter +@crawl_started_counter-@crawl_counter)== 0 || @crawl_counter >= content_request[:crawl_limit].to_i
+      finished(content_request)
+    end
+
+  end
+
+  # Sets the crawl status to 'Crawl Finished' and enqueues the crawl finished job
+  def self.finished(content_request)
+    # finished
+    if @redis.hget("statistics", "current_status")!= "Crawl Finished"
+      ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
+      @stats.end_crawl(content_request)
+
+      additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @redis.get("crawled_base_url")}
+      additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
+      additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
+
+      if content_request[:queue_system] == :resque
+        Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
+      elsif content_request[:queue_system] == :sidekiq
+        puts "Queueing Finished on Sidekiq"
+        const_get(content_request[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
+      else
+        raise "Unknown queue system: #{content_request[:queue_system]}"
+      end
+    else
+      # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
+    end
+  end
+
+  # Enqueues the content to the processing queue setup in options
+  def self.send_to_processing_queue(content, content_request)
+    content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+    if content_request[:direct_call_process_job]
+      clazz = const_get(content_request[:processing_queue])
+      clazz.perform(content_to_send)
+    elsif content_request[:use_encoding_safe_process_job]
+      content_to_send[:body] = Base64.encode64(content[:body])
+      content_to_send[:processing_queue] = content_request[:processing_queue]
+      Resque.enqueue(EncodingSafeProcessJob, content_to_send)
+    else
+      if content_request[:queue_system] == :resque
+        Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
+      elsif content_request[:queue_system] == :sidekiq
+        puts "Queueing on Sidekiq"
+        const_get(content_request[:processing_queue]).perform_async(content_to_send)
+      else
+        raise "Unknown queue system: #{content_request[:queue_system]}"
+      end
+    end
+    puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}" if content_request[:debug]
+  end
+
+  private
+
+  # Helper method to determine if this content is to be processed or not
+  def self.is_permitted_type(content)
+    @content_request[:valid_mime_types].each do |mime_type|
+      return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
+    end
+    false
+  end
+
+  # Returns true if the crawl count is within limits
+  def self.within_crawl_limits?(crawl_limit)
+    refresh_counters
+    crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
+  end
+
+  # Returns true if the queue count is calculated to be still within limits when complete
+  def self.within_queue_limits?(crawl_limit)
+    refresh_counters
+    (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or @crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
+  end
+
+  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
+  def self.set_base_url(redis, content, content_request)
+    if redis.get("base_url").nil?
+      unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
+        uri = Addressable::URI.parse(content[:redirect_through].last)
+        redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+      end
+      redis.set("base_url", content[:url])
+    end
+  end
+
+  # Enqueues content to the crawl_job queue
+  def self.enqueue_content(content_request, link)
+    new_request = content_request.clone
+    new_request[:url] = link
+    new_request[:parent] = content_request[:url]
+    if content_request[:queue_system] == :resque
+      Resque.enqueue(CrawlJob, new_request)
+    elsif content_request[:queue_system] == :sidekiq
+      puts "Queueing content on Sidekiq"
+      CrawlWorker.perform_async(new_request)
+    else
+      raise "Unknown queue system: #{content_request[:queue_system]}"
+    end
+    @redis.sadd "queued", link
+    increment_queue_counter
+  end
+
+  # Increments the queue counter and refreshes crawl counters
+  def self.increment_queue_counter
+    @redis.incr "queue-counter"
+    refresh_counters
+  end
+  # Increments the crawl counter and refreshes crawl counters
+  def self.increment_crawl_counter
+    @redis.incr "crawl-counter"
+    refresh_counters
+  end
+  def self.increment_crawl_started_counter
+    @redis.incr "crawl-started-counter"
+    refresh_counters
+  end
+  # Decrements the queue counter and refreshes crawl counters
+  def self.decrement_queue_counter
+    @redis.decr "queue-counter"
+    refresh_counters
+  end
+  # Refreshes the crawl counters
+  def self.refresh_counters
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @crawl_started_counter = @redis.get("crawl-started-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
+
+  def self.print_counters
+    puts counters
+  end
+
+  def self.counters
+    "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
+  end
+end
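send_to_processing_queue also allows skipping the queue entirely: with :direct_call_process_job set, the class named in :processing_queue is resolved with const_get and its perform class method is called in-process with the content hash. A hedged sketch (the class name and printed fields are illustrative; the merged keys :internal_urls, :redis_options, :source_id and :crawl_id come from content_to_send above):

class InlineContentProcessor
  # a plain class is enough here: direct_call_process_job only needs a .perform class method
  def self.perform(content)
    puts "processed #{content[:url]} (#{content[:mime_type]})"
  end
end

Cobweb.new(:direct_call_process_job => true, :processing_queue => "InlineContentProcessor").start("http://example.com")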