cobweb 1.0.11 → 1.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +4 -3
- data/lib/cobweb.rb +31 -8
- data/lib/cobweb_crawler.rb +7 -8
- data/lib/cobweb_process_job.rb +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +11 -4
- data/lib/crawl_finished_worker.rb +27 -0
- data/lib/crawl_helper.rb +250 -0
- data/lib/crawl_job.rb +2 -2
- data/lib/crawl_process_worker.rb +31 -0
- data/lib/crawl_worker.rb +118 -0
- data/lib/sidekiq/cobweb_helper.rb +16 -0
- data/lib/stats.rb +12 -11
- data/lib/uri_helper.rb +8 -0
- data/spec/cobweb/cobweb_crawl_helper_spec.rb +4 -1
- data/spec/cobweb/cobweb_crawl_spec.rb +29 -13
- data/spec/cobweb/cobweb_crawler_spec.rb +33 -14
- data/spec/cobweb/cobweb_links_spec.rb +2 -1
- data/spec/cobweb/cobweb_spec.rb +3 -0
- data/spec/cobweb/content_link_parser_spec.rb +4 -0
- data/spec/cobweb/{cobweb_job_spec.rb → crawl_job_spec.rb} +52 -9
- data/spec/cobweb/crawl_worker_spec.rb +250 -0
- data/spec/cobweb/robots_spec.rb +2 -1
- data/spec/http_stubs.rb +95 -0
- data/spec/samples/sample_site/{boxgrid.html → boxgrid>withsillyname.html} +1 -1
- data/spec/samples/sample_site/dashboard.html +1 -1
- data/spec/samples/sample_site/forms.html +1 -1
- data/spec/samples/sample_site/gallery.html +1 -1
- data/spec/samples/sample_site/more.html +1 -1
- data/spec/samples/sample_site/tables.html +1 -1
- data/spec/samples/sample_site/typography.html +1 -1
- data/spec/spec_helper.rb +6 -88
- metadata +85 -35
- data/spec/cobweb/site_test_spec.rb.tmp +0 -101
data/lib/crawl_job.rb
CHANGED
@@ -84,6 +84,7 @@ class CrawlJob
   def self.send_to_processing_queue(content, content_request)
     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
     if content_request[:direct_call_process_job]
+      #clazz = content_request[:processing_queue].to_s.constantize
       clazz = const_get(content_request[:processing_queue])
       clazz.perform(content_to_send)
     elsif content_request[:use_encoding_safe_process_job]
@@ -107,6 +108,5 @@ class CrawlJob
     #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
     Resque.enqueue(CrawlJob, new_request)
   end
-

-  end
+end
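The hunk above only adds a commented-out constantize variant next to the existing const_get lookup; either way, when :direct_call_process_job is set the processing queue is configured as a class name that is resolved to a constant and invoked directly. A rough illustration of that lookup (not from the gem; MyProcessJob is a hypothetical stand-in for the configured class):

# Illustrative sketch only, not from the gem: resolving a processing-queue
# class from its configured name and calling it directly, as the hunk above
# does with const_get. MyProcessJob is a hypothetical stand-in class.
class MyProcessJob
  def self.perform(content)
    puts "processing #{content[:url]}"
  end
end

content_request = { :processing_queue => "MyProcessJob", :direct_call_process_job => true }
clazz = Object.const_get(content_request[:processing_queue])  # String -> class constant
clazz.perform(:url => "http://example.com/")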
data/lib/crawl_process_worker.rb
ADDED
@@ -0,0 +1,31 @@
+
+require 'sidekiq'
+require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+# If your client is single-threaded, we just need a single connection in our Redis connection pool
+#Sidekiq.configure_client do |config|
+#  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+#end
+
+# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+#Sidekiq.configure_server do |config|
+#  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+#end
+
+class CrawlProcessWorker
+
+  include Sidekiq::Worker
+
+  sidekiq_options queue: "crawl_process_worker"
+
+  def perform(content)
+    content = HashUtil.deep_symbolize_keys(content)
+    puts "Dummy Processing for #{content[:url]}"
+  end
+  def self.queue_size
+    Sidekiq.redis do |conn|
+      conn.llen("queue:#{get_sidekiq_options["queue"]}")
+    end
+  end
+
+end
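CrawlProcessWorker is a stub processing target for the Sidekiq path (its perform just prints a line). Note the deep_symbolize_keys call: Sidekiq round-trips job arguments through JSON, so the hash arrives with string keys. A small illustration of that effect in plain Ruby, independent of the gem's HashUtil:

# Plain-Ruby illustration (not the gem's HashUtil): why perform re-symbolizes keys.
require 'json'

original = { :url => "http://example.com/", :crawl_id => "abc" }
as_delivered = JSON.parse(original.to_json)   # what a Sidekiq worker actually receives
puts as_delivered["url"]                      # keys are now strings...
symbolized = as_delivered.map { |k, v| [k.to_sym, v] }.to_h
puts symbolized[:url]                         # ...so they are converted back before use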
data/lib/crawl_worker.rb
ADDED
@@ -0,0 +1,118 @@
+require 'sidekiq'
+require File.expand_path(File.dirname(__FILE__) + '/cobweb')
+require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+# If your client is single-threaded, we just need a single connection in our Redis connection pool
+#Sidekiq.configure_client do |config|
+#  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+#end
+
+# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+#Sidekiq.configure_server do |config|
+#  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+#end
+
+class CrawlWorker
+  include Sidekiq::Worker
+  sidekiq_options queue: "crawl_worker"
+  sidekiq_options retry: false
+
+  def perform(content_request)
+    # setup the crawl class to manage the crawl of this object
+    @crawl = CobwebModule::Crawl.new(content_request)
+
+    # update the counters and then perform the get, returns false if we are outwith limits
+    if @crawl.retrieve
+
+      # if the crawled object is an object type we are interested
+      if @crawl.content.permitted_type?
+
+        # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+        @crawl.process_links do |link|
+
+          @crawl.debug_puts "ENQUEUED LINK: #{link}"
+          enqueue_content(content_request, link)
+
+        end
+
+        if @crawl.to_be_processed?
+
+          @crawl.process do
+
+            # enqueue to processing queue
+            @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
+            send_to_processing_queue(@crawl.content.to_hash, content_request)
+
+            #if the enqueue counter has been requested update that
+            if content_request.has_key?(:enqueue_counter_key)
+              enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
+              current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+              enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
+            end
+
+          end
+        else
+          @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
+          @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
+          @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
+        end
+
+      end
+    end
+
+    @crawl.lock("finished") do
+      # let the crawl know we're finished with this object
+      @crawl.finished_processing
+
+      # test queue and crawl sizes to see if we have completed the crawl
+      @crawl.debug_puts "finished? #{@crawl.finished?}"
+      @crawl.debug_puts "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
+      if @crawl.finished? && @crawl.first_to_finish?
+        @crawl.debug_puts "Calling crawl_job finished"
+        finished(content_request)
+      end
+    end
+  end
+  def self.jobs
+    Sidekiq.redis do |conn|
+      conn.smembers(get_sidekiq_options[:queue]).count
+    end
+  end
+
+
+  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
+  def finished(content_request)
+    additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @crawl.crawled_base_url}
+    additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
+    additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
+
+    @crawl.finished
+
+    @crawl.debug_puts "increment crawl_finished_enqueued_count"
+    @crawl.redis.incr("crawl_finished_enqueued_count")
+    content_request[:crawl_finished_queue].constantize.perform_async(@crawl.statistics.merge(additional_stats))
+  end
+
+  # Enqueues the content to the processing queue setup in options
+  def send_to_processing_queue(content, content_request)
+    content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+    if content_request[:direct_call_process_job]
+      clazz = content_request[:processing_queue].constantize
+      clazz.perform(content_to_send)
+    else
+      content_request[:processing_queue].constantize.perform_async(content_to_send)
+    end
+    @crawl.debug_puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}"
+  end
+
+  private
+
+  # Enqueues content to the crawl_job queue
+  def enqueue_content(content_request, link)
+    new_request = content_request.clone
+    new_request[:url] = link
+    new_request[:parent] = content_request[:url]
+    CrawlWorker.perform_async(new_request)
+  end
+
+end
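CrawlWorker appears to be the Sidekiq counterpart of the Resque-based CrawlJob: each job fetches one URL, re-enqueues any discovered links onto the crawl_worker queue, hands permitted content to the configured processing queue, and fires the finished queue once the crawl completes. A rough usage sketch follows; it is not from the gem's documentation, the URL, queue class names and Redis options are placeholders, and in normal use the gem assembles this hash itself:

# Rough, hypothetical sketch of enqueuing an initial content_request for
# CrawlWorker; keys mirror the ones the worker reads above, values are placeholders.
require 'cobweb'          # gem entry point
require 'crawl_worker'    # assumption: the gem's lib directory is on the load path

content_request = {
  :url                  => "http://example.com/",
  :crawl_id             => "example-crawl",
  :processing_queue     => "CrawlProcessWorker",   # class name resolved via constantize above
  :crawl_finished_queue => "CrawlFinishedWorker",  # class name resolved via constantize in #finished
  :redis_options        => {:host => "localhost", :port => 6379}
}

CrawlWorker.perform_async(content_request)  # Sidekiq serialises the hash to JSON
puts CrawlWorker.queue_size                 # class method added by sidekiq/cobweb_helper.rb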
data/lib/sidekiq/cobweb_helper.rb
ADDED
@@ -0,0 +1,16 @@
+module Sidekiq
+  module Worker
+    module ClassMethods
+      def queue_size
+        Sidekiq.redis do |conn|
+          conn.llen("queue:#{get_sidekiq_options["queue"]}")
+        end
+      end
+      def queue_items(start=0, finish=-1)
+        Sidekiq.redis do |conn|
+          conn.lrange("queue:#{get_sidekiq_options["queue"]}", start, finish)
+        end
+      end
+    end
+  end
+end
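Because this reopens Sidekiq::Worker::ClassMethods, every class that includes Sidekiq::Worker picks up queue_size and queue_items once the helper is loaded. A small illustrative check (the worker class here is hypothetical, not part of cobweb):

# Illustrative only: any Sidekiq worker gains the two class methods above.
require 'sidekiq'
require 'sidekiq/cobweb_helper'  # assumption: the gem's lib directory is on the load path

class ExampleWorker              # hypothetical worker, not part of cobweb
  include Sidekiq::Worker
  sidekiq_options queue: "example"
  def perform(args); end
end

puts ExampleWorker.queue_size          # LLEN queue:example
puts ExampleWorker.queue_items(0, 4)   # first five raw job payloads (JSON strings)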
data/lib/stats.rb
CHANGED
@@ -38,13 +38,13 @@ class Stats
     @redis.smembers "crawled"
   end

-  def inbound_links_for(url
-
+  def inbound_links_for(url)
+    uri = UriHelper.parse(url)
+    @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}")
   end

   # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
   def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
-
     @lock.synchronize {
       @statistics = get_statistics

@@ -94,6 +94,7 @@ class Stats
       else
         mime_counts = {content[:mime_type] => 1}
       end
+
       @statistics[:mime_counts] = mime_counts.to_json

       # record mime categories stats
@@ -151,18 +152,18 @@ class Stats
   # Returns the statistics hash
   def get_statistics

-
-    if
-
+    statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
+    if statistics[:status_counts].nil?
+      statistics[:status_counts]
     else
-
+      statistics[:status_counts] = JSON.parse(statistics[:status_counts])
     end
-    if
-
+    if statistics[:mime_counts].nil?
+      statistics[:mime_counts]
     else
-
+      statistics[:mime_counts] = JSON.parse(statistics[:mime_counts])
     end
-
+    statistics
   end

   # Sets the current status of the crawl
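The reworked inbound_links_for runs the URL through UriHelper.parse before hashing, so lookups hit the same Redis set regardless of how the URL was written when the link was recorded. A standalone sketch of the key scheme, outside the gem and skipping the UriHelper normalisation step:

# Standalone sketch of the key derivation used above: the Redis set holding
# inbound links for a URL is named after the MD5 of the (normalised) URL.
require 'digest'

url = "http://localhost:3532/typography.html"
key = "inbound_links_#{Digest::MD5.hexdigest(url)}"
puts key  # => inbound_links_<32 hex characters>

# Stats#inbound_links_for then simply reads the members of that set:
#   @redis.smembers(key)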
data/lib/uri_helper.rb
CHANGED
data/spec/cobweb/cobweb_crawl_spec.rb
CHANGED
@@ -1,8 +1,11 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'resolv'

 describe CobwebModule::Crawl, :local_only => true do
-
+  include HttpStubs
   before(:each) do
+    setup_stubs
+
     @local_redis = {:host => "localhost", :port => 6379}
     @remote_redis = {:host => "remote-redis", :port => 6379}

@@ -12,25 +15,38 @@ describe CobwebModule::Crawl, :local_only => true do
   describe "remote redis" do
     before(:each) do
       @local = CobwebModule::Crawl.new(:redis_options => @local_redis)
-      @remote = CobwebModule::Crawl.new(:redis_options => @remote_redis)
-
       @local.redis.del("test_redis")
-
+
+      begin
+        Resolv.getaddress @remote_redis[:host]
+        @remote = CobwebModule::Crawl.new(:redis_options => @remote_redis)
+        @remote.redis.del("test_redis")
+      rescue
+        @remote = nil
+      end

     end
     it "should connect to the local redis" do
-      @
-
-
+      if @remote
+        @local.redis.exists("test_redis").should be_false
+        @local.redis.set("test_redis", 1)
+        @local.redis.exists("test_redis").should be_true

-
+        @remote.redis.exists("test_redis").should be_false
+      else
+        puts "WARNING: can't connect to remote redis"
+      end
     end
     it "should connect to the remote redis" do
-      @remote
-
-
-
-
+      if @remote
+        @remote.redis.exists("test_redis").should be_false
+        @remote.redis.set("test_redis", 1)
+        @remote.redis.exists("test_redis").should be_true
+
+        @local.redis.exists("test_redis").should be_false
+      else
+        puts "WARNING: can't connect to remote redis"
+      end
     end
   end
 end
data/spec/cobweb/cobweb_crawler_spec.rb
CHANGED
@@ -23,25 +23,23 @@ describe CobwebCrawler do
   end

   describe "crawl" do
+
     it "should crawl a site" do
-
-
-
-
-
-      statistics
-
-      statistics.
-      statistics.get_statistics.should
+
+      @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false})
+      @statistics = @crawler.crawl(@base_url)
+
+      @statistics.should_not be_nil
+      @statistics.get_statistics.should be_an_instance_of Hash
+
+      @statistics.get_statistics[:mime_counts]["text/html"].should == 8
+      @statistics.get_statistics[:mime_counts]["text/css"].should == 18
+      @statistics.get_statistics[:mime_counts]["image/jpeg"].should == 25

     end

     it "should take a block" do
-
-      # temporary tests to run crawler - proper specs to follow.. honest
-
-      crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
-
+      crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :crawl_limit => 1})
       statistics = crawler.crawl(@base_url) do |content, statistics|
         content[:url].should_not be_nil
         statistics[:average_length].should_not be_nil
@@ -50,6 +48,27 @@ describe CobwebCrawler do
       statistics.should_not be_nil
       statistics.get_statistics.should be_an_instance_of Hash

+      statistics.get_statistics[:mime_counts]["text/html"].should == 1
+
+    end
+
+    context "storing inbound links" do
+
+      before(:each) do
+        @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
+        @statistics = @crawler.crawl(@base_url)
+      end
+
+      it "should store inbound links" do
+
+        @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
+        @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+      end
+
+      it "should handle url encoding" do
+        @statistics.inbound_links_for("http://localhost:3532/boxgrid%3Ewithsillyname.html").sort.should == ["http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+      end
+
     end
   end

data/spec/cobweb/cobweb_links_spec.rb
CHANGED
@@ -2,8 +2,9 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 require File.expand_path(File.dirname(__FILE__) + '/../../lib/cobweb_links')

 describe CobwebLinks do
-
+  include HttpStubs
   before(:each) do
+    setup_stubs

     @base_url = "http://www.baseurl.com/"

data/spec/cobweb/cobweb_spec.rb
CHANGED
data/spec/cobweb/content_link_parser_spec.rb
CHANGED
@@ -2,6 +2,10 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 require File.expand_path(File.dirname(__FILE__) + '/../../lib/content_link_parser.rb')

 describe ContentLinkParser do
+  include HttpStubs
+  before(:each) do
+    setup_stubs
+  end

   describe "Sample Links Document" do
     before(:each) do
data/spec/cobweb/{cobweb_job_spec.rb → crawl_job_spec.rb}
CHANGED
@@ -1,18 +1,51 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

-
+RESQUE_WORKER_COUNT = 10
+
+describe CrawlJob, :local_only => true, :disabled => true do

   before(:all) do
     #store all existing resque process ids so we don't kill them afterwards
+
     @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+    if Resque.workers.count > 0 && @existing_processes.empty?
+      raise "Ghost workers present in resque, please clear before running specs"
+    elsif Resque.workers.count == 0 && !@existing_processes.empty?
+      raise "Ghost worker processes present (#{@existing_processes.join(',')})"
+    elsif Resque.workers.count > 0 && !@existing_processes.empty?
+      raise "Resque workers present, please end other resque processes before running this spec"
+    end

     # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
-
-    `mkdir
-    `mkdir tmp`
-
-
-
+    `mkdir log` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../log'))
+    `mkdir tmp` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp'))
+    `mkdir tmp/pids` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp/pids'))
+    io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
+
+    counter = 0
+    print "Starting Resque Processes"
+    until counter > 10 || workers_processes_started?
+      print "."
+      counter += 1
+      sleep 0.5
+    end
+    puts ""
+
+
+    counter = 0
+    print "Waiting for Resque Workers"
+    until counter > 50 || workers_running?
+      print "."
+      counter += 1
+      sleep 0.5
+    end
+    puts ""
+
+    if Resque.workers.count == RESQUE_WORKER_COUNT
+      puts "Workers Running."
+    else
+      raise "Workers didn't appear, please check environment"
+    end

   end

@@ -237,7 +270,7 @@ describe Cobweb, :local_only => true, :disabled => true do
   after(:all) do

     @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
-    command = "kill -
+    command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
     IO.popen(command)

     clear_queues
@@ -249,13 +282,23 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
   @counter = 0
   start_time = Time.now
   while(running?(crawl_id) && Time.now < start_time + timeout) do
-    sleep
+    sleep 1
   end
   if Time.now > start_time + timeout
     raise "End of crawl not detected"
   end
 end

+def workers_processes_started?
+  @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+  @new_processes = @all_processes - @existing_processes
+  @new_processes.count == RESQUE_WORKER_COUNT
+end
+
+def workers_running?
+  Resque.workers.count > 0
+end
+
 def running?(crawl_id)
   status = @stat.get_status
   result = true