documentcloud-cloud-crowd 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +8 -8
- data/cloud-crowd.gemspec +8 -8
- data/config/config.example.ru +8 -2
- data/config/config.example.yml +6 -15
- data/examples/process_pdfs_example.rb +1 -1
- data/examples/word_count_example.rb +1 -0
- data/lib/cloud-crowd.rb +6 -5
- data/lib/cloud_crowd/action.rb +11 -7
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +5 -0
- data/lib/cloud_crowd/asset_store/s3_store.rb +7 -3
- data/lib/cloud_crowd/asset_store.rb +1 -1
- data/lib/cloud_crowd/command_line.rb +14 -53
- data/lib/cloud_crowd/exceptions.rb +4 -0
- data/lib/cloud_crowd/helpers/authorization.rb +2 -2
- data/lib/cloud_crowd/helpers/resources.rb +0 -20
- data/lib/cloud_crowd/models/job.rb +25 -26
- data/lib/cloud_crowd/models/node_record.rb +81 -0
- data/lib/cloud_crowd/models/work_unit.rb +70 -30
- data/lib/cloud_crowd/models.rb +1 -1
- data/lib/cloud_crowd/node.rb +87 -0
- data/lib/cloud_crowd/schema.rb +19 -16
- data/lib/cloud_crowd/{app.rb → server.rb} +25 -30
- data/lib/cloud_crowd/worker.rb +50 -74
- data/public/css/admin_console.css +26 -14
- data/public/images/server.png +0 -0
- data/public/js/admin_console.js +45 -18
- data/test/acceptance/test_failing_work_units.rb +1 -1
- data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
- data/test/acceptance/test_word_count.rb +3 -9
- data/test/blueprints.rb +0 -1
- data/test/config/config.ru +1 -1
- data/test/config/config.yml +1 -3
- data/test/unit/test_configuration.rb +1 -1
- data/test/unit/test_job.rb +1 -0
- data/test/unit/test_work_unit.rb +2 -4
- data/views/index.erb +13 -8
- metadata +9 -9
- data/lib/cloud_crowd/daemon.rb +0 -95
- data/lib/cloud_crowd/models/worker_record.rb +0 -61
- data/lib/cloud_crowd/runner.rb +0 -15
@@ -0,0 +1,81 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
|
3
|
+
# A NodeRecord is the record of a Node running remotely. We can use it to
|
4
|
+
# assign work units to the node, and keep track of its status.
|
5
|
+
class NodeRecord < ActiveRecord::Base
|
6
|
+
|
7
|
+
has_many :work_units
|
8
|
+
|
9
|
+
validates_presence_of :host, :ip_address, :port
|
10
|
+
|
11
|
+
before_destroy :clear_work_units
|
12
|
+
|
13
|
+
# Available Nodes haven't used up their maximum number of workers yet.
|
14
|
+
named_scope :available, {
|
15
|
+
:conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
|
16
|
+
:order => 'updated_at asc'
|
17
|
+
}
|
18
|
+
|
19
|
+
# Save a Node's current status to the database.
|
20
|
+
def self.check_in(params, request)
|
21
|
+
attrs = {
|
22
|
+
:ip_address => request.ip,
|
23
|
+
:port => params[:port],
|
24
|
+
:max_workers => params[:max_workers],
|
25
|
+
:enabled_actions => params[:enabled_actions],
|
26
|
+
:updated_at => Time.now
|
27
|
+
}
|
28
|
+
self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
|
29
|
+
end
|
30
|
+
|
31
|
+
def send_work_unit(unit)
|
32
|
+
result = node['/work'].post(:work_unit => unit.to_json)
|
33
|
+
unit.assign_to(self, JSON.parse(result)['pid'])
|
34
|
+
touch
|
35
|
+
rescue Errno::ECONNREFUSED
|
36
|
+
self.destroy # Couldn't post to node, assume it's gone away.
|
37
|
+
end
|
38
|
+
|
39
|
+
def actions
|
40
|
+
enabled_actions.split(',')
|
41
|
+
end
|
42
|
+
|
43
|
+
def busy?
|
44
|
+
max_workers && work_units.count >= max_workers
|
45
|
+
end
|
46
|
+
|
47
|
+
def url
|
48
|
+
@url ||= "http://#{host}:#{port}"
|
49
|
+
end
|
50
|
+
|
51
|
+
def node
|
52
|
+
return @node if @node
|
53
|
+
params = [url]
|
54
|
+
params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
|
55
|
+
@node = RestClient::Resource.new(*params)
|
56
|
+
end
|
57
|
+
|
58
|
+
def display_status
|
59
|
+
busy? ? 'busy' : 'available'
|
60
|
+
end
|
61
|
+
|
62
|
+
def worker_pids
|
63
|
+
work_units.all(:select => 'worker_pid').map(&:worker_pid)
|
64
|
+
end
|
65
|
+
|
66
|
+
def to_json(opts={})
|
67
|
+
{ 'host' => host,
|
68
|
+
'workers' => worker_pids,
|
69
|
+
'status' => display_status,
|
70
|
+
}.to_json
|
71
|
+
end
|
72
|
+
|
73
|
+
|
74
|
+
private
|
75
|
+
|
76
|
+
def clear_work_units
|
77
|
+
WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
81
|
+
end
|
@@ -8,39 +8,77 @@ module CloudCrowd
|
|
8
8
|
include ModelStatus
|
9
9
|
|
10
10
|
belongs_to :job
|
11
|
-
belongs_to :
|
11
|
+
belongs_to :node_record
|
12
12
|
|
13
13
|
validates_presence_of :job_id, :status, :input, :action
|
14
|
+
|
15
|
+
named_scope :taken, {:conditions => ["worker_pid is not null"]}
|
16
|
+
named_scope :available, {:conditions => {:worker_pid => nil, :status => INCOMPLETE}}
|
17
|
+
named_scope :reserved, {:conditions => {:worker_pid => 0}}
|
14
18
|
|
15
|
-
|
19
|
+
# Attempt to send a list of work_units to nodes with available capacity.
|
20
|
+
# Do this in a separate thread so that the request can return, satisfied.
|
21
|
+
# A single application server process stops the same WorkUnit from being
|
22
|
+
# distributed to multiple nodes by reserving all the available ones.
|
23
|
+
def self.distribute_to_nodes
|
24
|
+
return unless WorkUnit.reserve_available
|
25
|
+
work_units = WorkUnit.reserved
|
26
|
+
available_nodes = NodeRecord.available
|
27
|
+
until work_units.empty? do
|
28
|
+
node = available_nodes.shift
|
29
|
+
unit = work_units.first
|
30
|
+
break unless node
|
31
|
+
next unless node.actions.include? unit.action
|
32
|
+
sent = node.send_work_unit(unit)
|
33
|
+
if sent
|
34
|
+
work_units.shift
|
35
|
+
available_nodes.push(node) unless node.busy?
|
36
|
+
end
|
37
|
+
end
|
38
|
+
WorkUnit.cancel_reservations
|
39
|
+
end
|
40
|
+
|
41
|
+
# Reserves all available WorkUnits. Returns false if there were none
|
42
|
+
# available.
|
43
|
+
def self.reserve_available
|
44
|
+
WorkUnit.available.update_all('worker_pid = 0') > 0
|
45
|
+
end
|
16
46
|
|
17
|
-
|
18
|
-
|
19
|
-
# can be retrieved for processing. Optionally, specify the +offset+ to peek
|
20
|
-
# further on in line.
|
21
|
-
def self.dequeue(worker_name, enabled_actions=[], offset=0)
|
22
|
-
unit = self.first(
|
23
|
-
:conditions => {:status => INCOMPLETE, :worker_record_id => nil, :action => enabled_actions},
|
24
|
-
:order => "created_at asc",
|
25
|
-
:offset => offset
|
26
|
-
)
|
27
|
-
unit ? unit.assign_to(worker_name) : nil
|
47
|
+
def self.cancel_reservations
|
48
|
+
WorkUnit.reserved.update_all('worker_pid = null')
|
28
49
|
end
|
29
50
|
|
30
|
-
|
31
|
-
|
32
|
-
|
51
|
+
def self.find_by_worker_name(name)
|
52
|
+
pid, host = name.split('@')
|
53
|
+
node = NodeRecord.find_by_host(host)
|
54
|
+
node && node.work_units.find_by_worker_pid(pid)
|
33
55
|
end
|
34
56
|
|
35
57
|
# Mark this unit as having finished successfully.
|
58
|
+
# TODO: Refactor alongside check_for_completion ... look into doubleparse.
|
36
59
|
def finish(output, time_taken)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
60
|
+
if splitting?
|
61
|
+
[JSON.parse(JSON.parse(output)['output'])].flatten.each do |wu_input|
|
62
|
+
WorkUnit.create(
|
63
|
+
:job => job,
|
64
|
+
:action => action,
|
65
|
+
:input => wu_input,
|
66
|
+
:status => PROCESSING
|
67
|
+
)
|
68
|
+
end
|
69
|
+
self.destroy
|
70
|
+
job.set_next_status if job.work_units.splitting.count <= 0
|
71
|
+
else
|
72
|
+
update_attributes({
|
73
|
+
:status => SUCCEEDED,
|
74
|
+
:node_record => nil,
|
75
|
+
:worker_pid => nil,
|
76
|
+
:attempts => attempts + 1,
|
77
|
+
:output => output,
|
78
|
+
:time => time_taken
|
79
|
+
})
|
80
|
+
job.check_for_completion
|
81
|
+
end
|
44
82
|
end
|
45
83
|
|
46
84
|
# Mark this unit as having failed. May attempt a retry.
|
@@ -49,26 +87,28 @@ module CloudCrowd
|
|
49
87
|
return try_again if tries < CloudCrowd.config[:work_unit_retries]
|
50
88
|
update_attributes({
|
51
89
|
:status => FAILED,
|
52
|
-
:
|
90
|
+
:node_record => nil,
|
91
|
+
:worker_pid => nil,
|
53
92
|
:attempts => tries,
|
54
93
|
:output => output,
|
55
94
|
:time => time_taken
|
56
95
|
})
|
96
|
+
self.job.check_for_completion
|
57
97
|
end
|
58
98
|
|
59
99
|
# Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
|
60
100
|
def try_again
|
61
101
|
update_attributes({
|
62
|
-
:
|
63
|
-
:
|
102
|
+
:node_record => nil,
|
103
|
+
:worker_pid => nil,
|
104
|
+
:attempts => self.attempts + 1
|
64
105
|
})
|
65
106
|
end
|
66
107
|
|
67
108
|
# When a Worker checks out a WorkUnit, establish the connection between
|
68
|
-
# WorkUnit and
|
69
|
-
def assign_to(
|
70
|
-
|
71
|
-
self.save ? self : nil
|
109
|
+
# WorkUnit and NodeRecord.
|
110
|
+
def assign_to(node_record, worker_pid)
|
111
|
+
update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
|
72
112
|
end
|
73
113
|
|
74
114
|
# The JSON representation of a WorkUnit shares the Job's options with all
|
data/lib/cloud_crowd/models.rb
CHANGED
@@ -0,0 +1,87 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
|
3
|
+
class Node < Sinatra::Default
|
4
|
+
|
5
|
+
# A Node's default port. You only run a single node per machine, so they
|
6
|
+
# can all use the same port without problems.
|
7
|
+
DEFAULT_PORT = 9063
|
8
|
+
|
9
|
+
attr_reader :server, :asset_store
|
10
|
+
|
11
|
+
set :root, ROOT
|
12
|
+
set :authorization_realm, "CloudCrowd"
|
13
|
+
|
14
|
+
helpers Helpers
|
15
|
+
|
16
|
+
# methodoverride allows the _method param.
|
17
|
+
enable :methodoverride
|
18
|
+
|
19
|
+
# Enabling HTTP Authentication turns it on for all requests.
|
20
|
+
before do
|
21
|
+
login_required if CloudCrowd.config[:use_http_authentication]
|
22
|
+
end
|
23
|
+
|
24
|
+
# To monitor a Node with Monit, God, Nagios, or another tool, you can hit
|
25
|
+
# /heartbeat to make sure it's still up.
|
26
|
+
get '/heartbeat' do
|
27
|
+
"buh-bump"
|
28
|
+
end
|
29
|
+
|
30
|
+
post '/work' do
|
31
|
+
pid = fork { Worker.new(self, JSON.parse(params[:work_unit])) }
|
32
|
+
Process.detach(pid)
|
33
|
+
json :pid => pid
|
34
|
+
end
|
35
|
+
|
36
|
+
def initialize(port=DEFAULT_PORT)
|
37
|
+
require 'json'
|
38
|
+
@server = CloudCrowd.central_server
|
39
|
+
@host = Socket.gethostname
|
40
|
+
@enabled_actions = CloudCrowd.actions.keys
|
41
|
+
@asset_store = AssetStore.new
|
42
|
+
@port = port || DEFAULT_PORT
|
43
|
+
|
44
|
+
trap_signals
|
45
|
+
start_server
|
46
|
+
check_in
|
47
|
+
@server_thread.join
|
48
|
+
end
|
49
|
+
|
50
|
+
def check_in
|
51
|
+
@server["/node/#{@host}"].put(
|
52
|
+
:port => @port,
|
53
|
+
:max_workers => CloudCrowd.config[:max_workers],
|
54
|
+
:enabled_actions => @enabled_actions.join(',')
|
55
|
+
)
|
56
|
+
rescue Errno::ECONNREFUSED
|
57
|
+
puts "Failed to connect to the central server (#{@server.to_s}), exiting..."
|
58
|
+
raise SystemExit
|
59
|
+
end
|
60
|
+
|
61
|
+
def check_out
|
62
|
+
@server["/node/#{@host}"].delete
|
63
|
+
end
|
64
|
+
|
65
|
+
def start_server
|
66
|
+
@server_thread = Thread.new do
|
67
|
+
Thin::Server.start('0.0.0.0', @port, self, :signals => false)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
private
|
73
|
+
|
74
|
+
def trap_signals
|
75
|
+
Signal.trap('INT') { shut_down }
|
76
|
+
Signal.trap('KILL') { shut_down }
|
77
|
+
Signal.trap('TERM') { shut_down }
|
78
|
+
end
|
79
|
+
|
80
|
+
def shut_down
|
81
|
+
check_out
|
82
|
+
Process.exit
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
|
+
|
87
|
+
end
|
data/lib/cloud_crowd/schema.rb
CHANGED
@@ -10,7 +10,16 @@ ActiveRecord::Schema.define(:version => 1) do
|
|
10
10
|
t.float "time"
|
11
11
|
t.string "callback_url"
|
12
12
|
t.string "email"
|
13
|
-
t.
|
13
|
+
t.datetime "created_at"
|
14
|
+
t.datetime "updated_at"
|
15
|
+
end
|
16
|
+
|
17
|
+
create_table "node_records", :force => true do |t|
|
18
|
+
t.string "host", :null => false
|
19
|
+
t.string "ip_address", :null => false
|
20
|
+
t.integer "port", :null => false
|
21
|
+
t.string "enabled_actions", :default => '', :null => false
|
22
|
+
t.integer "max_workers"
|
14
23
|
t.datetime "created_at"
|
15
24
|
t.datetime "updated_at"
|
16
25
|
end
|
@@ -21,25 +30,19 @@ ActiveRecord::Schema.define(:version => 1) do
|
|
21
30
|
t.text "input", :null => false
|
22
31
|
t.string "action", :null => false
|
23
32
|
t.integer "attempts", :default => 0, :null => false
|
24
|
-
t.integer "
|
25
|
-
t.integer "
|
33
|
+
t.integer "node_record_id"
|
34
|
+
t.integer "worker_pid"
|
26
35
|
t.float "time"
|
27
36
|
t.text "output"
|
28
37
|
t.datetime "created_at"
|
29
38
|
t.datetime "updated_at"
|
30
39
|
end
|
31
|
-
|
32
|
-
create_table "worker_records", :force => true do |t|
|
33
|
-
t.string "name", :null => false
|
34
|
-
t.string "thread_status", :null => false
|
35
|
-
t.datetime "created_at"
|
36
|
-
t.datetime "updated_at"
|
37
|
-
end
|
38
|
-
|
39
|
-
add_index "jobs", ["status"], :name => "index_jobs_on_status"
|
40
|
-
add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
|
41
|
-
add_index "work_units", ["status", "worker_record_id", "action"], :name => "index_work_units_on_status_and_worker_record_id_and_action"
|
42
|
-
add_index "worker_records", ["name"], :name => "index_worker_records_on_name"
|
43
|
-
add_index "worker_records", ["updated_at"], :name => "index_worker_records_on_updated_at"
|
44
40
|
|
41
|
+
# Here be indices. After looking, it seems faster not to have them at all.
|
42
|
+
#
|
43
|
+
# add_index "jobs", ["status"], :name => "index_jobs_on_status"
|
44
|
+
# add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
|
45
|
+
# add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
|
46
|
+
# add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
|
47
|
+
# add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
|
45
48
|
end
|
@@ -16,7 +16,7 @@ module CloudCrowd
|
|
16
16
|
# [post /work] Dequeue the next WorkUnit, and hand it off to the worker.
|
17
17
|
# [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
|
18
18
|
# [put /worker] Keep a record of an actively running worker.
|
19
|
-
class
|
19
|
+
class Server < Sinatra::Default
|
20
20
|
|
21
21
|
set :root, ROOT
|
22
22
|
set :authorization_realm, "CloudCrowd"
|
@@ -42,15 +42,14 @@ module CloudCrowd
|
|
42
42
|
get '/status' do
|
43
43
|
json(
|
44
44
|
'jobs' => Job.incomplete,
|
45
|
-
'
|
45
|
+
'nodes' => NodeRecord.all(:order => 'host desc'),
|
46
46
|
'work_unit_count' => WorkUnit.incomplete.count
|
47
47
|
)
|
48
48
|
end
|
49
49
|
|
50
|
-
# Get the JSON for a worker
|
50
|
+
# Get the JSON for what a worker is up to.
|
51
51
|
get '/worker/:name' do
|
52
|
-
|
53
|
-
json((record && record.work_unit) || {})
|
52
|
+
json WorkUnit.find_by_worker_name(params[:name]) || {}
|
54
53
|
end
|
55
54
|
|
56
55
|
# To monitor the central server with Monit, God, Nagios, or another
|
@@ -62,8 +61,11 @@ module CloudCrowd
|
|
62
61
|
# PUBLIC API:
|
63
62
|
|
64
63
|
# Start a new job. Accepts a JSON representation of the job-to-be.
|
64
|
+
# Distributes all work units to available nodes.
|
65
65
|
post '/jobs' do
|
66
|
-
|
66
|
+
job = Job.create_from_request(JSON.parse(params[:job]))
|
67
|
+
WorkUnit.distribute_to_nodes
|
68
|
+
json job
|
67
69
|
end
|
68
70
|
|
69
71
|
# Check the status of a job, returning the output if finished, and the
|
@@ -79,36 +81,29 @@ module CloudCrowd
|
|
79
81
|
json nil
|
80
82
|
end
|
81
83
|
|
82
|
-
# INTERNAL
|
84
|
+
# INTERNAL NODE API:
|
83
85
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
json
|
86
|
+
put '/node/:host' do
|
87
|
+
NodeRecord.check_in(params, request)
|
88
|
+
WorkUnit.distribute_to_nodes
|
89
|
+
json nil
|
90
|
+
end
|
91
|
+
|
92
|
+
delete '/node/:host' do
|
93
|
+
NodeRecord.destroy_all(:host => params[:host])
|
94
|
+
json nil
|
88
95
|
end
|
89
96
|
|
90
97
|
# When workers are done with their unit, either successfully or in failure,
|
91
|
-
# they mark it back on the central server and
|
92
|
-
#
|
98
|
+
# they mark it back on the central server and exit. Triggers distribution
|
99
|
+
# of pending work units.
|
93
100
|
put '/work/:work_unit_id' do
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
json dequeue_work_unit
|
99
|
-
when 'failed'
|
100
|
-
current_work_unit.fail(params[:output], params[:time])
|
101
|
-
json dequeue_work_unit(1)
|
102
|
-
else
|
103
|
-
error(500, "Completing a work unit must specify status.")
|
104
|
-
end
|
101
|
+
case params[:status]
|
102
|
+
when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
|
103
|
+
when 'failed' then current_work_unit.fail(params[:output], params[:time])
|
104
|
+
else error(500, "Completing a work unit must specify status.")
|
105
105
|
end
|
106
|
-
|
107
|
-
|
108
|
-
# Every so often workers check in to let the central server know that
|
109
|
-
# they're still alive. Keep up-to-date records
|
110
|
-
put '/worker' do
|
111
|
-
params[:terminated] ? WorkerRecord.check_out(params) : WorkerRecord.check_in(params)
|
106
|
+
WorkUnit.distribute_to_nodes
|
112
107
|
json nil
|
113
108
|
end
|
114
109
|
|
data/lib/cloud_crowd/worker.rb
CHANGED
@@ -10,10 +10,6 @@ module CloudCrowd
|
|
10
10
|
# having failed.
|
11
11
|
class Worker
|
12
12
|
|
13
|
-
# The time between worker check-ins with the central server, informing
|
14
|
-
# it of the current status, and simply that it's still alive.
|
15
|
-
CHECK_IN_INTERVAL = 60
|
16
|
-
|
17
13
|
# Wait five seconds to retry, after internal communication errors.
|
18
14
|
RETRY_WAIT = 5
|
19
15
|
|
@@ -22,32 +18,30 @@ module CloudCrowd
|
|
22
18
|
# Spinning up a worker will create a new AssetStore with a persistent
|
23
19
|
# connection to S3. This AssetStore gets passed into each action, for use
|
24
20
|
# as it is run.
|
25
|
-
def initialize
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
@
|
30
|
-
@
|
31
|
-
|
32
|
-
|
33
|
-
end
|
34
|
-
|
35
|
-
# Ask the central server for the first WorkUnit in line.
|
36
|
-
def fetch_work_unit
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
end
|
21
|
+
def initialize(node, work_unit)
|
22
|
+
Signal.trap('INT') { shut_down }
|
23
|
+
Signal.trap('KILL') { shut_down }
|
24
|
+
Signal.trap('TERM') { shut_down }
|
25
|
+
@pid = $$
|
26
|
+
@node = node
|
27
|
+
setup_work_unit(work_unit)
|
28
|
+
run
|
29
|
+
end
|
30
|
+
|
31
|
+
# # Ask the central server for the first WorkUnit in line.
|
32
|
+
# def fetch_work_unit
|
33
|
+
# keep_trying_to "fetch a new work unit" do
|
34
|
+
# unit_json = @server['/work'].post(base_params)
|
35
|
+
# setup_work_unit(unit_json)
|
36
|
+
# end
|
37
|
+
# end
|
42
38
|
|
43
39
|
# Return output to the central server, marking the current work unit as done.
|
44
40
|
def complete_work_unit(result)
|
45
41
|
keep_trying_to "complete work unit" do
|
46
42
|
data = completion_params.merge({:status => 'succeeded', :output => result})
|
47
|
-
|
43
|
+
@node.server["/work/#{data[:id]}"].put(data)
|
48
44
|
log "finished #{display_work_unit} in #{data[:time]} seconds"
|
49
|
-
clear_work_unit
|
50
|
-
setup_work_unit(unit_json)
|
51
45
|
end
|
52
46
|
end
|
53
47
|
|
@@ -55,36 +49,11 @@ module CloudCrowd
|
|
55
49
|
def fail_work_unit(exception)
|
56
50
|
keep_trying_to "mark work unit as failed" do
|
57
51
|
data = completion_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
|
58
|
-
|
52
|
+
@node.server["/work/#{data[:id]}"].put(data)
|
59
53
|
log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
|
60
|
-
clear_work_unit
|
61
|
-
setup_work_unit(unit_json)
|
62
54
|
end
|
63
55
|
end
|
64
56
|
|
65
|
-
# Check in with the central server. Let it know the condition of the work
|
66
|
-
# thread, the action and status we're processing, and our hostname and PID.
|
67
|
-
def check_in(thread_status)
|
68
|
-
keep_trying_to "check in with central" do
|
69
|
-
@server["/worker"].put({
|
70
|
-
:name => @name,
|
71
|
-
:thread_status => thread_status
|
72
|
-
})
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
# Inform the central server that this worker is finished. This is the only
|
77
|
-
# remote method that doesn't retry on connection errors -- if the worker
|
78
|
-
# can't connect to the central server while it's trying to shutdown, it
|
79
|
-
# should close, regardless.
|
80
|
-
def check_out
|
81
|
-
@server["/worker"].put({
|
82
|
-
:name => @name,
|
83
|
-
:terminated => true
|
84
|
-
})
|
85
|
-
log 'exiting'
|
86
|
-
end
|
87
|
-
|
88
57
|
# We expect and require internal communication between the central server
|
89
58
|
# and the workers to succeed. If it fails for any reason, log it, and then
|
90
59
|
# keep trying the same request.
|
@@ -100,33 +69,31 @@ module CloudCrowd
|
|
100
69
|
end
|
101
70
|
end
|
102
71
|
|
103
|
-
# Does this Worker have a job to do?
|
104
|
-
def has_work?
|
105
|
-
@action_name && @input && @options
|
106
|
-
end
|
107
|
-
|
108
72
|
# Loggable string of the current work unit.
|
109
73
|
def display_work_unit
|
110
|
-
"unit ##{@options['work_unit_id']} (#{@action_name})"
|
74
|
+
"unit ##{@options['work_unit_id']} (#{@action_name}/#{CloudCrowd.display_status(@status)})"
|
111
75
|
end
|
112
76
|
|
113
77
|
# Executes the current work unit, catching all exceptions as failures.
|
114
78
|
def run_work_unit
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
79
|
+
@worker_thread = Thread.new do
|
80
|
+
begin
|
81
|
+
result = nil
|
82
|
+
@action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @node.asset_store)
|
83
|
+
Dir.chdir(@action.work_directory) do
|
84
|
+
result = case @status
|
85
|
+
when PROCESSING then @action.process
|
86
|
+
when SPLITTING then @action.split
|
87
|
+
when MERGING then @action.merge
|
88
|
+
else raise Error::StatusUnspecified, "work units must specify their status"
|
89
|
+
end
|
124
90
|
end
|
91
|
+
complete_work_unit({'output' => result}.to_json)
|
92
|
+
rescue Exception => e
|
93
|
+
fail_work_unit(e)
|
125
94
|
end
|
126
|
-
complete_work_unit({'output' => result}.to_json)
|
127
|
-
rescue Exception => e
|
128
|
-
fail_work_unit(e)
|
129
95
|
end
|
96
|
+
@worker_thread.join
|
130
97
|
end
|
131
98
|
|
132
99
|
# Wraps <tt>run_work_unit</tt> to benchmark the execution time, if requested.
|
@@ -142,8 +109,7 @@ module CloudCrowd
|
|
142
109
|
# Common parameters to send back to central.
|
143
110
|
def base_params
|
144
111
|
@base_params ||= {
|
145
|
-
:
|
146
|
-
:worker_actions => @enabled_actions.join(',')
|
112
|
+
:pid => @pid
|
147
113
|
}
|
148
114
|
end
|
149
115
|
|
@@ -157,9 +123,8 @@ module CloudCrowd
|
|
157
123
|
end
|
158
124
|
|
159
125
|
# Extract our instance variables from a WorkUnit's JSON.
|
160
|
-
def setup_work_unit(
|
161
|
-
return false unless
|
162
|
-
unit = JSON.parse(unit_json)
|
126
|
+
def setup_work_unit(unit)
|
127
|
+
return false unless unit
|
163
128
|
@start_time = Time.now
|
164
129
|
@action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
|
165
130
|
@options['job_id'] = unit['job_id']
|
@@ -171,7 +136,7 @@ module CloudCrowd
|
|
171
136
|
|
172
137
|
# Log a message to the daemon log. Includes PID for identification.
|
173
138
|
def log(message)
|
174
|
-
puts "Worker ##{@
|
139
|
+
puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
|
175
140
|
end
|
176
141
|
|
177
142
|
# When we're done with a unit, clear out our instance variables to make way
|
@@ -181,6 +146,17 @@ module CloudCrowd
|
|
181
146
|
@action, @action_name, @input, @options, @start_time = nil, nil, nil, nil, nil
|
182
147
|
end
|
183
148
|
|
149
|
+
# Force the worker to quit, even if it's in the middle of processing.
|
150
|
+
# If it had checked out a work unit, the node should have released it on
|
151
|
+
# the central server already.
|
152
|
+
def shut_down
|
153
|
+
if @worker_thread
|
154
|
+
@worker_thread.kill
|
155
|
+
@worker_thread.kill! if @worker_thread.alive?
|
156
|
+
end
|
157
|
+
Process.exit
|
158
|
+
end
|
159
|
+
|
184
160
|
end
|
185
161
|
|
186
162
|
end
|