cobweb 1.0.28 → 1.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.textile +71 -67
- data/lib/cobweb.rb +41 -41
- data/lib/cobweb_crawl_helper.rb +1 -5
- data/lib/cobweb_version.rb +2 -2
- data/lib/crawl_worker.rb +14 -14
- data/lib/export_command.rb +3 -3
- data/lib/report_command.rb +1 -1
- data/lib/string.rb +4 -9
- data/spec/cobweb/cobweb_crawler_spec.rb +15 -15
- data/spec/cobweb/crawl_job_spec.rb +8 -6
- data/spec/cobweb/crawl_worker_spec.rb +32 -32
- data/spec/samples/sample_site/{boxgrid>withsillyname.html → boxgridwithsillyname.html} +37 -37
- data/spec/samples/sample_site/dashboard.html +1 -1
- data/spec/samples/sample_site/forms.html +1 -1
- data/spec/samples/sample_site/gallery.html +1 -1
- data/spec/samples/sample_site/more.html +1 -1
- data/spec/samples/sample_site/tables.html +1 -1
- data/spec/samples/sample_site/typography.html +1 -1
- metadata +4 -4
data/lib/cobweb_crawl_helper.rb
CHANGED
@@ -52,19 +52,15 @@ class CobwebCrawlHelper
     end
     if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
 
-      puts "deleteing from crawl_worker"
       queue = Sidekiq::Queue.new("crawl_worker")
       queue.each do |job|
-        ap job.args # => [1, 2, 3]
         job.delete if job.args[0]["crawl_id"] == id
       end
 
 
       process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
-      puts "deleting from #{process_queue_name}"
       queue = Sidekiq::Queue.new(process_queue_name)
       queue.each do |job|
-        ap job.args # => [1, 2, 3]
        job.delete if job.args[0]["crawl_id"] == id
       end
     end
@@ -102,4 +98,4 @@ class CobwebCrawlHelper
     @data[:crawl_id]
   end
 
-end
+end
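For reference, the queue purge this hunk tidies up uses the standard Sidekiq API: iterate a named queue and delete any job whose first argument carries the matching crawl_id. A minimal sketch (only Sidekiq::Queue and the crawl_id check come from the hunk above; the wrapping method name is illustrative):

    require 'sidekiq/api'

    # Remove every queued crawl_worker job belonging to one crawl.
    def purge_crawl(crawl_id)
      queue = Sidekiq::Queue.new("crawl_worker")
      queue.each do |job|
        # job.args[0] is the content_request hash enqueued by CrawlWorker
        job.delete if job.args[0]["crawl_id"] == crawl_id
      end
    end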
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl_worker.rb
CHANGED
@@ -14,31 +14,31 @@ require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
 class CrawlWorker
   include Sidekiq::Worker
   sidekiq_options :queue => "crawl_worker", :retry => false if SIDEKIQ_INSTALLED
-
+
   def perform(content_request)
     puts "Performing for #{content_request["url"]}"
     # setup the crawl class to manage the crawl of this object
     @crawl = CobwebModule::Crawl.new(content_request)
-
+
     # update the counters and then perform the get, returns false if we are outwith limits
     if @crawl.retrieve
-
+
       # if the crawled object is an object type we are interested
       if @crawl.content.permitted_type?
-
+
         # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
         @crawl.process_links do |link|
           @crawl.lock("queue_links") do
             if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
               # enqueue the links to sidekiq
-              @crawl.debug_puts "QUEUED LINK: #{link}"
+              @crawl.debug_puts "QUEUED LINK: #{link}"
               enqueue_content(content_request, link)
             end
           end
         end
-
+
         if @crawl.to_be_processed?
-
+
           @crawl.process do
 
             # enqueue to processing queue
@@ -51,17 +51,17 @@ class CrawlWorker
             current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
             enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
           end
-
+
         end
       else
         @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
         @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
         @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
       end
-
+
     end
   end
-
+
   #@crawl.lock("finished") do
   # let the crawl know we're finished with this object
   @crawl.finished_processing
@@ -79,7 +79,7 @@ class CrawlWorker
       conn.smembers(get_sidekiq_options[:queue]).count
     end
   end
-
+
 
   # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
   def finished(content_request)
@@ -93,7 +93,7 @@ class CrawlWorker
       @crawl.redis.incr("crawl_finished_enqueued_count")
      content_request[:crawl_finished_queue].constantize.perform_async(@crawl.statistics.merge(additional_stats))
    end
-
+
   # Enqueues the content to the processing queue setup in options
   def send_to_processing_queue(content, content_request)
     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
@@ -110,7 +110,7 @@ class CrawlWorker
   end
 
   private
-
+
   # Enqueues content to the crawl_job queue
   def enqueue_content(content_request, link)
     new_request = content_request.clone
@@ -119,4 +119,4 @@ class CrawlWorker
     CrawlWorker.perform_async(new_request)
   end
 
-end
+end
data/lib/export_command.rb
CHANGED
@@ -35,7 +35,7 @@ class ExportCommand
     Dir.mkdir(options[:root_path]) unless File.exist?(options[:root_path])
 
     uri.path.split("/")[0..-2].each do |dir|
-      path+="/" unless path.
+      path+="/" unless path.cobweb_ends_with?("/")
       path+=dir
       if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
         FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
@@ -45,7 +45,7 @@ class ExportCommand
        Dir.mkdir(options[:root_path] + path) unless Dir.exist?(options[:root_path] + path)
       end
     end
-    path += "/" unless path.
+    path += "/" unless path.cobweb_ends_with?("/")
     filename = uri.path.split("/")[-1]
     if filename.nil? || filename.empty?
       filename = "index.html"
@@ -82,4 +82,4 @@ class ExportCommand
     end
 
   end
-end
+end
data/lib/report_command.rb
CHANGED
@@ -22,7 +22,7 @@ class ReportCommand
     CSV.open(options[:output], "wb", :force_quotes => true) do |csv|
 
       statistics = @crawler.crawl(options[:url]) do |page|
-        puts "Reporting on #{page[:url]}"
+        puts "Reporting on #{page[:url]} [#{page[:status_code]}]"
         @doc = page[:body]
         page["link_rel"] = scope.link_tag_with_rel("canonical")["href"]
         page["title"] = scope.head_tag.title_tag.contents
data/lib/string.rb
CHANGED
@@ -1,12 +1,7 @@
 class String
 
-
-
-
-      suffix = args[0]
-      suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
-    else
-      super
-    end
+  def cobweb_ends_with?(val)
+    suffix = val
+    suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
   end
-end
+end
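The string.rb change above replaces the old suffix check with an explicitly namespaced String#cobweb_ends_with?, which export_command.rb now calls when building directory paths. A minimal usage sketch (the path value is illustrative; the method body is as defined in the hunk above):

    class String
      def cobweb_ends_with?(val)
        suffix = val
        suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
      end
    end

    path = "output/site"
    path += "/" unless path.cobweb_ends_with?("/")
    # path => "output/site/"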
data/spec/cobweb/cobweb_crawler_spec.rb
CHANGED
@@ -4,9 +4,9 @@ describe CobwebCrawler do
 
   before(:each) do
     pending("thin not installed") unless THIN_INSTALLED
-
+
     @base_url = "http://localhost:3532/"
-
+
     @default_headers = {"Cache-Control" => "private, max-age=0",
                         "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
                         "Expires" => "-1",
@@ -16,13 +16,13 @@ describe CobwebCrawler do
                         "Server" => "gws",
                         "X-XSS-Protection" => "1; mode=block"}
 
-  end
+  end
+
 
-
   it "should generate a cobweb_crawler object" do
     CobwebCrawler.new.should be_an_instance_of CobwebCrawler
   end
-
+
   describe "crawl" do
 
     it "should crawl a site" do
@@ -36,21 +36,21 @@ describe CobwebCrawler do
       @statistics.get_statistics[:mime_counts]["text/html"].should == 8
       @statistics.get_statistics[:mime_counts]["text/css"].should == 18
       @statistics.get_statistics[:mime_counts]["image/jpeg"].should == 25
-
+
     end
-
+
     it "should take a block" do
       crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :crawl_limit => 1})
       statistics = crawler.crawl(@base_url) do |content, statistics|
         content[:url].should_not be_nil
         statistics[:average_length].should_not be_nil
       end
-
+
       statistics.should_not be_nil
       statistics.get_statistics.should be_an_instance_of Hash
-
+
       statistics.get_statistics[:mime_counts]["text/html"].should == 1
-
+
     end
 
     context "internal_links" do
@@ -91,19 +91,19 @@ describe CobwebCrawler do
       before(:each) do
        pending("thin not installed") unless THIN_INSTALLED
        @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
-        @statistics = @crawler.crawl(@base_url)
+        @statistics = @crawler.crawl(@base_url)
      end
 
      it "should store inbound links" do
        @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
-        @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/
+        @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
      end
 
      it "should handle url encoding" do
-        @statistics.inbound_links_for("http://localhost:3532/
+        @statistics.inbound_links_for("http://localhost:3532/boxgridwithsillyname.html").sort.should == ["http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
      end
 
     end
-  end
+  end
 
-end
+end
data/spec/cobweb/crawl_job_spec.rb
CHANGED
@@ -6,7 +6,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
 
   before(:all) do
     #store all existing resque process ids so we don't kill them afterwards
-    if RESQUE_INSTALLED && THIN_INSTALLED
+    if RESQUE_INSTALLED && THIN_INSTALLED
 
       @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
       if Resque.workers.count > 0 && @existing_processes.empty?
@@ -168,7 +168,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
     # wait_for_crawl_finished crawl[:crawl_id]
     # @redis.get("crawl_job_enqueued_count").to_i.should == 20
     # end
-    #
+    #
     # end
   describe "limit to 1" do
     before(:each) do
@@ -271,11 +271,13 @@ describe CrawlJob, :local_only => true, :disabled => true do
 
   after(:all) do
 
-
-
-
+    if RESQUE_INSTALLED
+      @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+      command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
+      IO.popen(command)
 
-
+      clear_queues
+    end
   end
 
 end
data/spec/cobweb/crawl_worker_spec.rb
CHANGED
@@ -9,14 +9,15 @@ describe CrawlWorker, :local_only => true do
       @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
 
       raise "Sidekiq is already running, please stop before running specs." if @existing_processes.count > 0
-
+
       # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
       puts "Starting Workers... Please Wait..."
       `mkdir log`
       `rm -rf output.log`
-
+      puts "calling: nohup sidekiq -v -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1"
+      io = IO.popen("nohup sidekiq -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1")
       puts "Workers Started."
-    end
+    end
   end
 
   before(:each) do
@@ -24,7 +25,7 @@ describe CrawlWorker, :local_only => true do
     pending("thin not installed") unless THIN_INSTALLED
     @base_url = "http://localhost:3532/"
     @base_page_count = 77
-
+
     clear_sidekiq_queues
   end
 
@@ -40,7 +41,7 @@ describe CrawlWorker, :local_only => true do
       }
       @cobweb = Cobweb.new @request
     end
-
+
     it "should crawl entire site" do
       crawl = @cobweb.start(@base_url)
       @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -66,7 +67,7 @@ describe CrawlWorker, :local_only => true do
      }
      @cobweb = Cobweb.new @request
    end
-
+
    it "should only crawl html pages" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -78,10 +79,10 @@ describe CrawlWorker, :local_only => true do
       mime_types.count.should == 8
       mime_types.map{|m| m.should == "text/html"}
       mime_types.select{|m| m=="text/html"}.count.should == 8
-
-
+
+
     end
-
+
   end
   describe "with a crawl limit" do
     before(:each) do
@@ -93,34 +94,34 @@ describe CrawlWorker, :local_only => true do
        :cache => nil
      }
    end
-
+
    describe "of 1" do
      before(:each) do
        @request[:crawl_limit] = 1
        @cobweb = Cobweb.new @request
      end
-
+
      it "should not crawl the entire site" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlProcessWorker.queue_size.should_not == @base_page_count
-      end
+      end
      it "should only crawl 1 page" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlProcessWorker.queue_size.should == 1
-      end
+      end
      it "should notify of crawl finished" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        CrawlFinishedWorker.queue_size.should == 1
-      end
-
+      end
+
    end
-
+
    describe "of 5" do
      before(:each) do
        @request[:crawl_limit] = 5
@@ -131,36 +132,36 @@ describe CrawlWorker, :local_only => true do
        @request[:crawl_limit_by_page] = true
        @cobweb = Cobweb.new @request
      end
-
+
      it "should only use html pages towards the crawl limit" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+
        mime_types = CrawlProcessWorker.queue_items(0, 200).map{|job| JSON.parse(job)["args"][0]["mime_type"]}
        mime_types.select{|m| m=="text/html"}.count.should == 5
      end
    end
  end
-
+
  describe "of 10" do
    before(:each) do
      @request[:crawl_limit] = 10
      @cobweb = Cobweb.new @request
    end
-
+
    it "should not crawl the entire site" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
      CrawlProcessWorker.queue_size.should_not == @base_page_count
-    end
+    end
    it "should notify of crawl finished" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
      CrawlFinishedWorker.queue_size.should == 1
-    end
+    end
    it "should only crawl 10 objects" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -168,38 +169,38 @@ describe CrawlWorker, :local_only => true do
      CrawlProcessWorker.queue_size.should == 10
    end
  end
-
+
  describe "of 100" do
    before(:each) do
      @request[:crawl_limit] = 100
      @cobweb = Cobweb.new @request
    end
-
+
    it "should crawl the entire sample site" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
      CrawlProcessWorker.queue_size.should == @base_page_count
-    end
+    end
    it "should notify of crawl finished" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
      CrawlFinishedWorker.queue_size.should == 1
-    end
+    end
    it "should not crawl more than 100 pages" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
      CrawlProcessWorker.queue_size.should_not > 100
-    end
-  end
+    end
+  end
  end
 
  after(:all) do
    @all_processes = `ps aux | grep sidekiq | grep -v grep | grep -v sidekiq-web | awk '{print $2}'`.split("\n")
    unless (@all_processes - @existing_processes).empty?
-      command = "kill #{(@all_processes - @existing_processes).join(" ")}"
+      command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
      IO.popen(command)
    end
    clear_sidekiq_queues
@@ -211,6 +212,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
  @counter = 0
  start_time = Time.now
  while(running?(crawl_id) && Time.now < start_time + timeout) do
+    puts Sidekiq::Stats.new.queues
    sleep 1
  end
  if Time.now > start_time + timeout
@@ -246,9 +248,7 @@ def clear_sidekiq_queues
    end
  end
  sleep 5
-
+
  CrawlProcessWorker.queue_size.should == 0
  CrawlFinishedWorker.queue_size.should == 0
 end
-
-