cobweb 1.0.28 → 1.0.29
- checksums.yaml +4 -4
- data/README.textile +71 -67
- data/lib/cobweb.rb +41 -41
- data/lib/cobweb_crawl_helper.rb +1 -5
- data/lib/cobweb_version.rb +2 -2
- data/lib/crawl_worker.rb +14 -14
- data/lib/export_command.rb +3 -3
- data/lib/report_command.rb +1 -1
- data/lib/string.rb +4 -9
- data/spec/cobweb/cobweb_crawler_spec.rb +15 -15
- data/spec/cobweb/crawl_job_spec.rb +8 -6
- data/spec/cobweb/crawl_worker_spec.rb +32 -32
- data/spec/samples/sample_site/{boxgrid>withsillyname.html → boxgridwithsillyname.html} +37 -37
- data/spec/samples/sample_site/dashboard.html +1 -1
- data/spec/samples/sample_site/forms.html +1 -1
- data/spec/samples/sample_site/gallery.html +1 -1
- data/spec/samples/sample_site/more.html +1 -1
- data/spec/samples/sample_site/tables.html +1 -1
- data/spec/samples/sample_site/typography.html +1 -1
- metadata +4 -4
data/lib/cobweb_crawl_helper.rb
CHANGED
@@ -52,19 +52,15 @@ class CobwebCrawlHelper
       end
       if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED

-        puts "deleteing from crawl_worker"
         queue = Sidekiq::Queue.new("crawl_worker")
         queue.each do |job|
-          ap job.args # => [1, 2, 3]
           job.delete if job.args[0]["crawl_id"] == id
         end


         process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
-        puts "deleting from #{process_queue_name}"
         queue = Sidekiq::Queue.new(process_queue_name)
         queue.each do |job|
-          ap job.args # => [1, 2, 3]
           job.delete if job.args[0]["crawl_id"] == id
         end
       end
@@ -102,4 +98,4 @@ class CobwebCrawlHelper
     @data[:crawl_id]
   end

-end
+end
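The logic this change leaves in place purges a crawl's pending jobs from Sidekiq's queues. A minimal sketch of that pattern, assuming Sidekiq is available; the queue name comes from the diff, while the method name and crawl id here are illustrative. `Sidekiq::Queue#each` yields job records whose `args` are the JSON-decoded arguments passed to `perform_async`:

    # Walk a named queue and delete any job whose first argument carries
    # the matching crawl_id, as the surviving lines above do.
    require 'sidekiq/api'

    def purge_crawl_jobs(queue_name, crawl_id)
      Sidekiq::Queue.new(queue_name).each do |job|
        # job.args is the argument array the job was enqueued with
        job.delete if job.args[0]["crawl_id"] == crawl_id
      end
    end

    purge_crawl_jobs("crawl_worker", "my-crawl-id")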
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl_worker.rb
CHANGED
@@ -14,31 +14,31 @@ require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
 class CrawlWorker
   include Sidekiq::Worker
   sidekiq_options :queue => "crawl_worker", :retry => false if SIDEKIQ_INSTALLED
-
+
   def perform(content_request)
     puts "Performing for #{content_request["url"]}"
     # setup the crawl class to manage the crawl of this object
     @crawl = CobwebModule::Crawl.new(content_request)
-
+
     # update the counters and then perform the get, returns false if we are outwith limits
     if @crawl.retrieve
-
+
       # if the crawled object is an object type we are interested
       if @crawl.content.permitted_type?
-
+
         # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
         @crawl.process_links do |link|
           @crawl.lock("queue_links") do
             if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
               # enqueue the links to sidekiq
-              @crawl.debug_puts "QUEUED LINK: #{link}"
+              @crawl.debug_puts "QUEUED LINK: #{link}"
               enqueue_content(content_request, link)
             end
           end
         end
-
+
         if @crawl.to_be_processed?
-
+
           @crawl.process do

            # enqueue to processing queue
@@ -51,17 +51,17 @@ class CrawlWorker
             current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
             enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
           end
-
+
         end
       else
         @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
         @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
         @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
       end
-
+
     end
   end
-
+
     #@crawl.lock("finished") do
       # let the crawl know we're finished with this object
       @crawl.finished_processing
@@ -79,7 +79,7 @@ class CrawlWorker
       conn.smembers(get_sidekiq_options[:queue]).count
     end
   end
-
+

   # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
   def finished(content_request)
@@ -93,7 +93,7 @@ class CrawlWorker
       @crawl.redis.incr("crawl_finished_enqueued_count")
       content_request[:crawl_finished_queue].constantize.perform_async(@crawl.statistics.merge(additional_stats))
     end
-
+
   # Enqueues the content to the processing queue setup in options
   def send_to_processing_queue(content, content_request)
     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
@@ -110,7 +110,7 @@ class CrawlWorker
   end

   private
-
+
   # Enqueues content to the crawl_job queue
   def enqueue_content(content_request, link)
     new_request = content_request.clone
@@ -119,4 +119,4 @@ class CrawlWorker
     CrawlWorker.perform_async(new_request)
   end

-end
+end
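Nearly all of the churn above is trailing-whitespace cleanup; the worker's shape is unchanged. For reference, a stripped-down worker in the same shape, as a sketch: the queue name and `:retry => false` come from the context lines, everything else is illustrative. Note that Sidekiq round-trips arguments through JSON, which is why `perform` reads string keys (`content_request["url"]`):

    # Minimal worker mirroring CrawlWorker's setup: a dedicated queue
    # and retries disabled.
    require 'sidekiq'

    class MiniCrawlWorker
      include Sidekiq::Worker
      sidekiq_options :queue => "crawl_worker", :retry => false

      def perform(content_request)
        # hash keys arrive as strings after the JSON round-trip
        puts "Performing for #{content_request["url"]}"
        # fetch the URL, extract links, enqueue follow-up jobs...
      end
    end

    MiniCrawlWorker.perform_async("url" => "http://example.com/")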
data/lib/export_command.rb
CHANGED
@@ -35,7 +35,7 @@ class ExportCommand
     Dir.mkdir(options[:root_path]) unless File.exist?(options[:root_path])

     uri.path.split("/")[0..-2].each do |dir|
-      path+="/" unless path.
+      path+="/" unless path.cobweb_ends_with?("/")
       path+=dir
       if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
         FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
@@ -45,7 +45,7 @@ class ExportCommand
         Dir.mkdir(options[:root_path] + path) unless Dir.exist?(options[:root_path] + path)
       end
     end
-    path += "/" unless path.
+    path += "/" unless path.cobweb_ends_with?("/")
     filename = uri.path.split("/")[-1]
     if filename.nil? || filename.empty?
       filename = "index.html"
@@ -82,4 +82,4 @@ class ExportCommand
     end

   end
-end
+end
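The two changed lines now call the gem's own `cobweb_ends_with?` helper (defined in string.rb below); the removed lines are cut off in the diff viewer. A runnable sketch of the same directory walk, using Ruby's built-in `String#end_with?` in place of the helper, with an illustrative root and URL:

    # Create one directory per path segment of a URI so an exported
    # file has somewhere to land, mirroring the loop above.
    require 'uri'

    uri  = URI.parse("http://example.com/a/b/page.html")
    root = "./export"
    path = ""

    Dir.mkdir(root) unless File.exist?(root)
    uri.path.split("/")[0..-2].each do |dir|
      path += "/" unless path.end_with?("/")
      path += dir
      Dir.mkdir(root + path) unless Dir.exist?(root + path)
    end
    path += "/" unless path.end_with?("/")
    filename = uri.path.split("/")[-1]
    filename = "index.html" if filename.nil? || filename.empty?
    puts File.join(root + path, filename)   # => ./export/a/b/page.html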
data/lib/report_command.rb
CHANGED
@@ -22,7 +22,7 @@ class ReportCommand
     CSV.open(options[:output], "wb", :force_quotes => true) do |csv|

       statistics = @crawler.crawl(options[:url]) do |page|
-        puts "Reporting on #{page[:url]}"
+        puts "Reporting on #{page[:url]} [#{page[:status_code]}]"
         @doc = page[:body]
         page["link_rel"] = scope.link_tag_with_rel("canonical")["href"]
         page["title"] = scope.head_tag.title_tag.contents
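The change here just appends the HTTP status to the progress line. For context, the crawl-with-a-block shape that line lives in, as a minimal sketch; the options are illustrative, while `page[:url]` and `page[:status_code]` come from the diff's own lines:

    # Each crawled page is yielded to the block as a hash; print the
    # URL and status code the way the updated line does.
    crawler = CobwebCrawler.new(:cache => false, :quiet => true)
    crawler.crawl("http://localhost:3532/") do |page|
      puts "Reporting on #{page[:url]} [#{page[:status_code]}]"
    end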
data/lib/string.rb
CHANGED
@@ -1,12 +1,7 @@
 class String

-
-
-
-      suffix = args[0]
-      suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
-    else
-      super
-    end
+  def cobweb_ends_with?(val)
+    suffix = val
+    suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
   end
-end
+end
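The monkey patch drops the old `*args` form with its `super` fallback in favour of a single-argument `cobweb_ends_with?`; the prefixed name presumably avoids clobbering Ruby's built-in `String#end_with?` (or ActiveSupport's `ends_with?`). The new method as a self-contained snippet with usage; the body matches the diff, the example strings are illustrative:

    class String
      # True when the string ends with val (val must be string-like).
      def cobweb_ends_with?(val)
        suffix = val
        suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
      end
    end

    "path/to/dir/".cobweb_ends_with?("/")   # => true
    "path/to/dir".cobweb_ends_with?("/")    # => false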
data/spec/cobweb/cobweb_crawler_spec.rb
CHANGED
@@ -4,9 +4,9 @@ describe CobwebCrawler do

   before(:each) do
     pending("thin not installed") unless THIN_INSTALLED
-
+
     @base_url = "http://localhost:3532/"
-
+
     @default_headers = {"Cache-Control" => "private, max-age=0",
                         "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
                         "Expires" => "-1",
@@ -16,13 +16,13 @@ describe CobwebCrawler do
                         "Server" => "gws",
                         "X-XSS-Protection" => "1; mode=block"}

-  end
+  end
+

-
   it "should generate a cobweb_crawler object" do
     CobwebCrawler.new.should be_an_instance_of CobwebCrawler
   end
-
+
   describe "crawl" do

     it "should crawl a site" do
@@ -36,21 +36,21 @@ describe CobwebCrawler do
       @statistics.get_statistics[:mime_counts]["text/html"].should == 8
       @statistics.get_statistics[:mime_counts]["text/css"].should == 18
       @statistics.get_statistics[:mime_counts]["image/jpeg"].should == 25
-
+
     end
-
+
     it "should take a block" do
       crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :crawl_limit => 1})
       statistics = crawler.crawl(@base_url) do |content, statistics|
         content[:url].should_not be_nil
         statistics[:average_length].should_not be_nil
       end
-
+
       statistics.should_not be_nil
       statistics.get_statistics.should be_an_instance_of Hash
-
+
       statistics.get_statistics[:mime_counts]["text/html"].should == 1
-
+
     end

     context "internal_links" do
@@ -91,19 +91,19 @@ describe CobwebCrawler do
       before(:each) do
         pending("thin not installed") unless THIN_INSTALLED
         @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
-        @statistics = @crawler.crawl(@base_url)
+        @statistics = @crawler.crawl(@base_url)
       end

       it "should store inbound links" do
         @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
-        @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/
+        @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
       end

       it "should handle url encoding" do
-        @statistics.inbound_links_for("http://localhost:3532/
+        @statistics.inbound_links_for("http://localhost:3532/boxgridwithsillyname.html").sort.should == ["http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
       end

     end
-  end
+  end

-end
+end
data/spec/cobweb/crawl_job_spec.rb
CHANGED
@@ -6,7 +6,7 @@ describe CrawlJob, :local_only => true, :disabled => true do

   before(:all) do
     #store all existing resque process ids so we don't kill them afterwards
-    if RESQUE_INSTALLED && THIN_INSTALLED
+    if RESQUE_INSTALLED && THIN_INSTALLED

       @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
       if Resque.workers.count > 0 && @existing_processes.empty?
@@ -168,7 +168,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
   #     wait_for_crawl_finished crawl[:crawl_id]
   #     @redis.get("crawl_job_enqueued_count").to_i.should == 20
   #   end
-  #
+  #
   # end
   describe "limit to 1" do
     before(:each) do
@@ -271,11 +271,13 @@ describe CrawlJob, :local_only => true, :disabled => true do

   after(:all) do

-    @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
-    command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
-    IO.popen(command)
+    if RESQUE_INSTALLED
+      @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+      command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
+      IO.popen(command)

-    clear_queues
+      clear_queues
+    end
   end

 end
data/spec/cobweb/crawl_worker_spec.rb
CHANGED
@@ -9,14 +9,15 @@ describe CrawlWorker, :local_only => true do
     @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")

     raise "Sidekiq is already running, please stop before running specs." if @existing_processes.count > 0
-
+
     # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
     puts "Starting Workers... Please Wait..."
     `mkdir log`
     `rm -rf output.log`
-
+    puts "calling: nohup sidekiq -v -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1"
+    io = IO.popen("nohup sidekiq -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1")
     puts "Workers Started."
-    end
+    end
   end

   before(:each) do
@@ -24,7 +25,7 @@ describe CrawlWorker, :local_only => true do
     pending("thin not installed") unless THIN_INSTALLED
     @base_url = "http://localhost:3532/"
     @base_page_count = 77
-
+
     clear_sidekiq_queues
   end

@@ -40,7 +41,7 @@ describe CrawlWorker, :local_only => true do
       }
       @cobweb = Cobweb.new @request
     end
-
+
     it "should crawl entire site" do
       crawl = @cobweb.start(@base_url)
       @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -66,7 +67,7 @@ describe CrawlWorker, :local_only => true do
       }
       @cobweb = Cobweb.new @request
     end
-
+
     it "should only crawl html pages" do
       crawl = @cobweb.start(@base_url)
       @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -78,10 +79,10 @@ describe CrawlWorker, :local_only => true do
       mime_types.count.should == 8
       mime_types.map{|m| m.should == "text/html"}
       mime_types.select{|m| m=="text/html"}.count.should == 8
-
-
+
+
     end
-
+
   end
   describe "with a crawl limit" do
     before(:each) do
@@ -93,34 +94,34 @@ describe CrawlWorker, :local_only => true do
        :cache => nil
      }
    end
-
+
    describe "of 1" do
      before(:each) do
        @request[:crawl_limit] = 1
        @cobweb = Cobweb.new @request
      end
-
+
      it "should not crawl the entire site" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlProcessWorker.queue_size.should_not == @base_page_count
-      end
+      end
      it "should only crawl 1 page" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlProcessWorker.queue_size.should == 1
-      end
+      end
      it "should notify of crawl finished" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlFinishedWorker.queue_size.should == 1
-      end
-
+      end
+
    end
-
+
    describe "of 5" do
      before(:each) do
        @request[:crawl_limit] = 5
@@ -131,36 +132,36 @@ describe CrawlWorker, :local_only => true do
        @request[:crawl_limit_by_page] = true
        @cobweb = Cobweb.new @request
      end
-
+
      it "should only use html pages towards the crawl limit" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+
        mime_types = CrawlProcessWorker.queue_items(0, 200).map{|job| JSON.parse(job)["args"][0]["mime_type"]}
        mime_types.select{|m| m=="text/html"}.count.should == 5
      end
    end
  end
-
+
    describe "of 10" do
      before(:each) do
        @request[:crawl_limit] = 10
        @cobweb = Cobweb.new @request
      end
-
+
      it "should not crawl the entire site" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlProcessWorker.queue_size.should_not == @base_page_count
-      end
+      end
      it "should notify of crawl finished" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlFinishedWorker.queue_size.should == 1
-      end
+      end
      it "should only crawl 10 objects" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -168,38 +169,38 @@ describe CrawlWorker, :local_only => true do
        CrawlProcessWorker.queue_size.should == 10
      end
    end
-
+
    describe "of 100" do
      before(:each) do
        @request[:crawl_limit] = 100
        @cobweb = Cobweb.new @request
      end
-
+
      it "should crawl the entire sample site" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlProcessWorker.queue_size.should == @base_page_count
-      end
+      end
      it "should notify of crawl finished" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlFinishedWorker.queue_size.should == 1
-      end
+      end
      it "should not crawl more than 100 pages" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlProcessWorker.queue_size.should_not > 100
-      end
-    end
+      end
+    end
  end

  after(:all) do
    @all_processes = `ps aux | grep sidekiq | grep -v grep | grep -v sidekiq-web | awk '{print $2}'`.split("\n")
    unless (@all_processes - @existing_processes).empty?
-      command = "kill #{(@all_processes - @existing_processes).join(" ")}"
+      command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
      IO.popen(command)
    end
    clear_sidekiq_queues
@@ -211,6 +212,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
   @counter = 0
   start_time = Time.now
   while(running?(crawl_id) && Time.now < start_time + timeout) do
+    puts Sidekiq::Stats.new.queues
     sleep 1
   end
   if Time.now > start_time + timeout
@@ -246,9 +248,7 @@ def clear_sidekiq_queues
     end
   end
   sleep 5
-
+
   CrawlProcessWorker.queue_size.should == 0
   CrawlFinishedWorker.queue_size.should == 0
 end
-
-
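The substantive spec changes here log queue sizes while polling (`puts Sidekiq::Stats.new.queues`) and harden the cleanup kill to `-9`. A hedged sketch of the polling shape `wait_for_crawl_finished` uses, generalised to a caller-supplied predicate; `wait_until` is an illustrative name, and `Sidekiq::Stats.new.queues` returns a hash of queue name to size:

    # Poll until the block returns false or the timeout elapses, logging
    # Sidekiq queue sizes once per second, as the updated helper does.
    require 'sidekiq/api'

    def wait_until(timeout = 20)
      start_time = Time.now
      while yield && Time.now < start_time + timeout
        puts Sidekiq::Stats.new.queues   # e.g. {"crawl_worker" => 3}
        sleep 1
      end
      raise "timed out after #{timeout}s" if Time.now > start_time + timeout
    end

    # wait_until(20) { running?(crawl_id) }   # running? is defined in the spec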