documentcloud-cloud-crowd 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +8 -8
- data/cloud-crowd.gemspec +8 -8
- data/config/config.example.ru +8 -2
- data/config/config.example.yml +6 -15
- data/examples/process_pdfs_example.rb +1 -1
- data/examples/word_count_example.rb +1 -0
- data/lib/cloud-crowd.rb +6 -5
- data/lib/cloud_crowd/action.rb +11 -7
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +5 -0
- data/lib/cloud_crowd/asset_store/s3_store.rb +7 -3
- data/lib/cloud_crowd/asset_store.rb +1 -1
- data/lib/cloud_crowd/command_line.rb +14 -53
- data/lib/cloud_crowd/exceptions.rb +4 -0
- data/lib/cloud_crowd/helpers/authorization.rb +2 -2
- data/lib/cloud_crowd/helpers/resources.rb +0 -20
- data/lib/cloud_crowd/models/job.rb +25 -26
- data/lib/cloud_crowd/models/node_record.rb +81 -0
- data/lib/cloud_crowd/models/work_unit.rb +70 -30
- data/lib/cloud_crowd/models.rb +1 -1
- data/lib/cloud_crowd/node.rb +87 -0
- data/lib/cloud_crowd/schema.rb +19 -16
- data/lib/cloud_crowd/{app.rb → server.rb} +25 -30
- data/lib/cloud_crowd/worker.rb +50 -74
- data/public/css/admin_console.css +26 -14
- data/public/images/server.png +0 -0
- data/public/js/admin_console.js +45 -18
- data/test/acceptance/test_failing_work_units.rb +1 -1
- data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
- data/test/acceptance/test_word_count.rb +3 -9
- data/test/blueprints.rb +0 -1
- data/test/config/config.ru +1 -1
- data/test/config/config.yml +1 -3
- data/test/unit/test_configuration.rb +1 -1
- data/test/unit/test_job.rb +1 -0
- data/test/unit/test_work_unit.rb +2 -4
- data/views/index.erb +13 -8
- metadata +9 -9
- data/lib/cloud_crowd/daemon.rb +0 -95
- data/lib/cloud_crowd/models/worker_record.rb +0 -61
- data/lib/cloud_crowd/runner.rb +0 -15
@@ -0,0 +1,81 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
|
3
|
+
# A NodeRecord is the record of a Node running remotely. We can use it to
|
4
|
+
# assign work units to the node, and keep track of its status.
|
5
|
+
class NodeRecord < ActiveRecord::Base
|
6
|
+
|
7
|
+
has_many :work_units
|
8
|
+
|
9
|
+
validates_presence_of :host, :ip_address, :port
|
10
|
+
|
11
|
+
before_destroy :clear_work_units
|
12
|
+
|
13
|
+
# Available Nodes haven't used up their maximum number of workers yet.
|
14
|
+
named_scope :available, {
|
15
|
+
:conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
|
16
|
+
:order => 'updated_at asc'
|
17
|
+
}
|
18
|
+
|
19
|
+
# Save a Node's current status to the database.
|
20
|
+
def self.check_in(params, request)
|
21
|
+
attrs = {
|
22
|
+
:ip_address => request.ip,
|
23
|
+
:port => params[:port],
|
24
|
+
:max_workers => params[:max_workers],
|
25
|
+
:enabled_actions => params[:enabled_actions],
|
26
|
+
:updated_at => Time.now
|
27
|
+
}
|
28
|
+
self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
|
29
|
+
end
|
30
|
+
|
31
|
+
def send_work_unit(unit)
|
32
|
+
result = node['/work'].post(:work_unit => unit.to_json)
|
33
|
+
unit.assign_to(self, JSON.parse(result)['pid'])
|
34
|
+
touch
|
35
|
+
rescue Errno::ECONNREFUSED
|
36
|
+
self.destroy # Couldn't post to node, assume it's gone away.
|
37
|
+
end
|
38
|
+
|
39
|
+
def actions
|
40
|
+
enabled_actions.split(',')
|
41
|
+
end
|
42
|
+
|
43
|
+
def busy?
|
44
|
+
max_workers && work_units.count >= max_workers
|
45
|
+
end
|
46
|
+
|
47
|
+
def url
|
48
|
+
@url ||= "http://#{host}:#{port}"
|
49
|
+
end
|
50
|
+
|
51
|
+
def node
|
52
|
+
return @node if @node
|
53
|
+
params = [url]
|
54
|
+
params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
|
55
|
+
@node = RestClient::Resource.new(*params)
|
56
|
+
end
|
57
|
+
|
58
|
+
def display_status
|
59
|
+
busy? ? 'busy' : 'available'
|
60
|
+
end
|
61
|
+
|
62
|
+
def worker_pids
|
63
|
+
work_units.all(:select => 'worker_pid').map(&:worker_pid)
|
64
|
+
end
|
65
|
+
|
66
|
+
def to_json(opts={})
|
67
|
+
{ 'host' => host,
|
68
|
+
'workers' => worker_pids,
|
69
|
+
'status' => display_status,
|
70
|
+
}.to_json
|
71
|
+
end
|
72
|
+
|
73
|
+
|
74
|
+
private
|
75
|
+
|
76
|
+
def clear_work_units
|
77
|
+
WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
81
|
+
end
|
@@ -8,39 +8,77 @@ module CloudCrowd
|
|
8
8
|
include ModelStatus
|
9
9
|
|
10
10
|
belongs_to :job
|
11
|
-
belongs_to :
|
11
|
+
belongs_to :node_record
|
12
12
|
|
13
13
|
validates_presence_of :job_id, :status, :input, :action
|
14
|
+
|
15
|
+
named_scope :taken, {:conditions => ["worker_pid is not null"]}
|
16
|
+
named_scope :available, {:conditions => {:worker_pid => nil, :status => INCOMPLETE}}
|
17
|
+
named_scope :reserved, {:conditions => {:worker_pid => 0}}
|
14
18
|
|
15
|
-
|
19
|
+
# Attempt to send a list of work_units to nodes with available capacity.
|
20
|
+
# Do this in a separate thread so that the request can return, satisfied.
|
21
|
+
# A single application server process stops the same WorkUnit from being
|
22
|
+
# distributed to multiple nodes by reserving all the available ones.
|
23
|
+
def self.distribute_to_nodes
|
24
|
+
return unless WorkUnit.reserve_available
|
25
|
+
work_units = WorkUnit.reserved
|
26
|
+
available_nodes = NodeRecord.available
|
27
|
+
until work_units.empty? do
|
28
|
+
node = available_nodes.shift
|
29
|
+
unit = work_units.first
|
30
|
+
break unless node
|
31
|
+
next unless node.actions.include? unit.action
|
32
|
+
sent = node.send_work_unit(unit)
|
33
|
+
if sent
|
34
|
+
work_units.shift
|
35
|
+
available_nodes.push(node) unless node.busy?
|
36
|
+
end
|
37
|
+
end
|
38
|
+
WorkUnit.cancel_reservations
|
39
|
+
end
|
40
|
+
|
41
|
+
# Reserves all available WorkUnits. Returns false if there were none
|
42
|
+
# available.
|
43
|
+
def self.reserve_available
|
44
|
+
WorkUnit.available.update_all('worker_pid = 0') > 0
|
45
|
+
end
|
16
46
|
|
17
|
-
|
18
|
-
|
19
|
-
# can be retrieved for processing. Optionally, specify the +offset+ to peek
|
20
|
-
# further on in line.
|
21
|
-
def self.dequeue(worker_name, enabled_actions=[], offset=0)
|
22
|
-
unit = self.first(
|
23
|
-
:conditions => {:status => INCOMPLETE, :worker_record_id => nil, :action => enabled_actions},
|
24
|
-
:order => "created_at asc",
|
25
|
-
:offset => offset
|
26
|
-
)
|
27
|
-
unit ? unit.assign_to(worker_name) : nil
|
47
|
+
def self.cancel_reservations
|
48
|
+
WorkUnit.reserved.update_all('worker_pid = null')
|
28
49
|
end
|
29
50
|
|
30
|
-
|
31
|
-
|
32
|
-
|
51
|
+
def self.find_by_worker_name(name)
|
52
|
+
pid, host = name.split('@')
|
53
|
+
node = NodeRecord.find_by_host(host)
|
54
|
+
node && node.work_units.find_by_worker_pid(pid)
|
33
55
|
end
|
34
56
|
|
35
57
|
# Mark this unit as having finished successfully.
|
58
|
+
# TODO: Refactor alongside check_for_completion ... look into doubleparse.
|
36
59
|
def finish(output, time_taken)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
60
|
+
if splitting?
|
61
|
+
[JSON.parse(JSON.parse(output)['output'])].flatten.each do |wu_input|
|
62
|
+
WorkUnit.create(
|
63
|
+
:job => job,
|
64
|
+
:action => action,
|
65
|
+
:input => wu_input,
|
66
|
+
:status => PROCESSING
|
67
|
+
)
|
68
|
+
end
|
69
|
+
self.destroy
|
70
|
+
job.set_next_status if job.work_units.splitting.count <= 0
|
71
|
+
else
|
72
|
+
update_attributes({
|
73
|
+
:status => SUCCEEDED,
|
74
|
+
:node_record => nil,
|
75
|
+
:worker_pid => nil,
|
76
|
+
:attempts => attempts + 1,
|
77
|
+
:output => output,
|
78
|
+
:time => time_taken
|
79
|
+
})
|
80
|
+
job.check_for_completion
|
81
|
+
end
|
44
82
|
end
|
45
83
|
|
46
84
|
# Mark this unit as having failed. May attempt a retry.
|
@@ -49,26 +87,28 @@ module CloudCrowd
|
|
49
87
|
return try_again if tries < CloudCrowd.config[:work_unit_retries]
|
50
88
|
update_attributes({
|
51
89
|
:status => FAILED,
|
52
|
-
:
|
90
|
+
:node_record => nil,
|
91
|
+
:worker_pid => nil,
|
53
92
|
:attempts => tries,
|
54
93
|
:output => output,
|
55
94
|
:time => time_taken
|
56
95
|
})
|
96
|
+
self.job.check_for_completion
|
57
97
|
end
|
58
98
|
|
59
99
|
# Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
|
60
100
|
def try_again
|
61
101
|
update_attributes({
|
62
|
-
:
|
63
|
-
:
|
102
|
+
:node_record => nil,
|
103
|
+
:worker_pid => nil,
|
104
|
+
:attempts => self.attempts + 1
|
64
105
|
})
|
65
106
|
end
|
66
107
|
|
67
108
|
# When a Worker checks out a WorkUnit, establish the connection between
|
68
|
-
# WorkUnit and
|
69
|
-
def assign_to(
|
70
|
-
|
71
|
-
self.save ? self : nil
|
109
|
+
# WorkUnit and NodeRecord.
|
110
|
+
def assign_to(node_record, worker_pid)
|
111
|
+
update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
|
72
112
|
end
|
73
113
|
|
74
114
|
# The JSON representation of a WorkUnit shares the Job's options with all
|
data/lib/cloud_crowd/models.rb
CHANGED
@@ -0,0 +1,87 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
|
3
|
+
class Node < Sinatra::Default
|
4
|
+
|
5
|
+
# A Node's default port. You only run a single node per machine, so they
|
6
|
+
# can all use the same port without problems.
|
7
|
+
DEFAULT_PORT = 9063
|
8
|
+
|
9
|
+
attr_reader :server, :asset_store
|
10
|
+
|
11
|
+
set :root, ROOT
|
12
|
+
set :authorization_realm, "CloudCrowd"
|
13
|
+
|
14
|
+
helpers Helpers
|
15
|
+
|
16
|
+
# methodoverride allows the _method param.
|
17
|
+
enable :methodoverride
|
18
|
+
|
19
|
+
# Enabling HTTP Authentication turns it on for all requests.
|
20
|
+
before do
|
21
|
+
login_required if CloudCrowd.config[:use_http_authentication]
|
22
|
+
end
|
23
|
+
|
24
|
+
# To monitor a Node with Monit, God, Nagios, or another tool, you can hit
|
25
|
+
# /heartbeat to make sure it's still up.
|
26
|
+
get '/heartbeat' do
|
27
|
+
"buh-bump"
|
28
|
+
end
|
29
|
+
|
30
|
+
post '/work' do
|
31
|
+
pid = fork { Worker.new(self, JSON.parse(params[:work_unit])) }
|
32
|
+
Process.detach(pid)
|
33
|
+
json :pid => pid
|
34
|
+
end
|
35
|
+
|
36
|
+
def initialize(port=DEFAULT_PORT)
|
37
|
+
require 'json'
|
38
|
+
@server = CloudCrowd.central_server
|
39
|
+
@host = Socket.gethostname
|
40
|
+
@enabled_actions = CloudCrowd.actions.keys
|
41
|
+
@asset_store = AssetStore.new
|
42
|
+
@port = port || DEFAULT_PORT
|
43
|
+
|
44
|
+
trap_signals
|
45
|
+
start_server
|
46
|
+
check_in
|
47
|
+
@server_thread.join
|
48
|
+
end
|
49
|
+
|
50
|
+
def check_in
|
51
|
+
@server["/node/#{@host}"].put(
|
52
|
+
:port => @port,
|
53
|
+
:max_workers => CloudCrowd.config[:max_workers],
|
54
|
+
:enabled_actions => @enabled_actions.join(',')
|
55
|
+
)
|
56
|
+
rescue Errno::ECONNREFUSED
|
57
|
+
puts "Failed to connect to the central server (#{@server.to_s}), exiting..."
|
58
|
+
raise SystemExit
|
59
|
+
end
|
60
|
+
|
61
|
+
def check_out
|
62
|
+
@server["/node/#{@host}"].delete
|
63
|
+
end
|
64
|
+
|
65
|
+
def start_server
|
66
|
+
@server_thread = Thread.new do
|
67
|
+
Thin::Server.start('0.0.0.0', @port, self, :signals => false)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
private
|
73
|
+
|
74
|
+
def trap_signals
|
75
|
+
Signal.trap('INT') { shut_down }
|
76
|
+
Signal.trap('KILL') { shut_down }
|
77
|
+
Signal.trap('TERM') { shut_down }
|
78
|
+
end
|
79
|
+
|
80
|
+
def shut_down
|
81
|
+
check_out
|
82
|
+
Process.exit
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
|
+
|
87
|
+
end
|
data/lib/cloud_crowd/schema.rb
CHANGED
@@ -10,7 +10,16 @@ ActiveRecord::Schema.define(:version => 1) do
|
|
10
10
|
t.float "time"
|
11
11
|
t.string "callback_url"
|
12
12
|
t.string "email"
|
13
|
-
t.
|
13
|
+
t.datetime "created_at"
|
14
|
+
t.datetime "updated_at"
|
15
|
+
end
|
16
|
+
|
17
|
+
create_table "node_records", :force => true do |t|
|
18
|
+
t.string "host", :null => false
|
19
|
+
t.string "ip_address", :null => false
|
20
|
+
t.integer "port", :null => false
|
21
|
+
t.string "enabled_actions", :default => '', :null => false
|
22
|
+
t.integer "max_workers"
|
14
23
|
t.datetime "created_at"
|
15
24
|
t.datetime "updated_at"
|
16
25
|
end
|
@@ -21,25 +30,19 @@ ActiveRecord::Schema.define(:version => 1) do
|
|
21
30
|
t.text "input", :null => false
|
22
31
|
t.string "action", :null => false
|
23
32
|
t.integer "attempts", :default => 0, :null => false
|
24
|
-
t.integer "
|
25
|
-
t.integer "
|
33
|
+
t.integer "node_record_id"
|
34
|
+
t.integer "worker_pid"
|
26
35
|
t.float "time"
|
27
36
|
t.text "output"
|
28
37
|
t.datetime "created_at"
|
29
38
|
t.datetime "updated_at"
|
30
39
|
end
|
31
|
-
|
32
|
-
create_table "worker_records", :force => true do |t|
|
33
|
-
t.string "name", :null => false
|
34
|
-
t.string "thread_status", :null => false
|
35
|
-
t.datetime "created_at"
|
36
|
-
t.datetime "updated_at"
|
37
|
-
end
|
38
|
-
|
39
|
-
add_index "jobs", ["status"], :name => "index_jobs_on_status"
|
40
|
-
add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
|
41
|
-
add_index "work_units", ["status", "worker_record_id", "action"], :name => "index_work_units_on_status_and_worker_record_id_and_action"
|
42
|
-
add_index "worker_records", ["name"], :name => "index_worker_records_on_name"
|
43
|
-
add_index "worker_records", ["updated_at"], :name => "index_worker_records_on_updated_at"
|
44
40
|
|
41
|
+
# Here be indices. After looking, it seems faster not to have them at all.
|
42
|
+
#
|
43
|
+
# add_index "jobs", ["status"], :name => "index_jobs_on_status"
|
44
|
+
# add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
|
45
|
+
# add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
|
46
|
+
# add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
|
47
|
+
# add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
|
45
48
|
end
|
@@ -16,7 +16,7 @@ module CloudCrowd
|
|
16
16
|
# [post /work] Dequeue the next WorkUnit, and hand it off to the worker.
|
17
17
|
# [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
|
18
18
|
# [put /worker] Keep a record of an actively running worker.
|
19
|
-
class
|
19
|
+
class Server < Sinatra::Default
|
20
20
|
|
21
21
|
set :root, ROOT
|
22
22
|
set :authorization_realm, "CloudCrowd"
|
@@ -42,15 +42,14 @@ module CloudCrowd
|
|
42
42
|
get '/status' do
|
43
43
|
json(
|
44
44
|
'jobs' => Job.incomplete,
|
45
|
-
'
|
45
|
+
'nodes' => NodeRecord.all(:order => 'host desc'),
|
46
46
|
'work_unit_count' => WorkUnit.incomplete.count
|
47
47
|
)
|
48
48
|
end
|
49
49
|
|
50
|
-
# Get the JSON for a worker
|
50
|
+
# Get the JSON for what a worker is up to.
|
51
51
|
get '/worker/:name' do
|
52
|
-
|
53
|
-
json((record && record.work_unit) || {})
|
52
|
+
json WorkUnit.find_by_worker_name(params[:name]) || {}
|
54
53
|
end
|
55
54
|
|
56
55
|
# To monitor the central server with Monit, God, Nagios, or another
|
@@ -62,8 +61,11 @@ module CloudCrowd
|
|
62
61
|
# PUBLIC API:
|
63
62
|
|
64
63
|
# Start a new job. Accepts a JSON representation of the job-to-be.
|
64
|
+
# Distributes all work units to available nodes.
|
65
65
|
post '/jobs' do
|
66
|
-
|
66
|
+
job = Job.create_from_request(JSON.parse(params[:job]))
|
67
|
+
WorkUnit.distribute_to_nodes
|
68
|
+
json job
|
67
69
|
end
|
68
70
|
|
69
71
|
# Check the status of a job, returning the output if finished, and the
|
@@ -79,36 +81,29 @@ module CloudCrowd
|
|
79
81
|
json nil
|
80
82
|
end
|
81
83
|
|
82
|
-
# INTERNAL
|
84
|
+
# INTERNAL NODE API:
|
83
85
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
json
|
86
|
+
put '/node/:host' do
|
87
|
+
NodeRecord.check_in(params, request)
|
88
|
+
WorkUnit.distribute_to_nodes
|
89
|
+
json nil
|
90
|
+
end
|
91
|
+
|
92
|
+
delete '/node/:host' do
|
93
|
+
NodeRecord.destroy_all(:host => params[:host])
|
94
|
+
json nil
|
88
95
|
end
|
89
96
|
|
90
97
|
# When workers are done with their unit, either successfully or in failure,
|
91
|
-
# they mark it back on the central server and
|
92
|
-
#
|
98
|
+
# they mark it back on the central server and exit. Triggers distribution
|
99
|
+
# of pending work units.
|
93
100
|
put '/work/:work_unit_id' do
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
json dequeue_work_unit
|
99
|
-
when 'failed'
|
100
|
-
current_work_unit.fail(params[:output], params[:time])
|
101
|
-
json dequeue_work_unit(1)
|
102
|
-
else
|
103
|
-
error(500, "Completing a work unit must specify status.")
|
104
|
-
end
|
101
|
+
case params[:status]
|
102
|
+
when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
|
103
|
+
when 'failed' then current_work_unit.fail(params[:output], params[:time])
|
104
|
+
else error(500, "Completing a work unit must specify status.")
|
105
105
|
end
|
106
|
-
|
107
|
-
|
108
|
-
# Every so often workers check in to let the central server know that
|
109
|
-
# they're still alive. Keep up-to-date records
|
110
|
-
put '/worker' do
|
111
|
-
params[:terminated] ? WorkerRecord.check_out(params) : WorkerRecord.check_in(params)
|
106
|
+
WorkUnit.distribute_to_nodes
|
112
107
|
json nil
|
113
108
|
end
|
114
109
|
|
data/lib/cloud_crowd/worker.rb
CHANGED
@@ -10,10 +10,6 @@ module CloudCrowd
|
|
10
10
|
# having failed.
|
11
11
|
class Worker
|
12
12
|
|
13
|
-
# The time between worker check-ins with the central server, informing
|
14
|
-
# it of the current status, and simply that it's still alive.
|
15
|
-
CHECK_IN_INTERVAL = 60
|
16
|
-
|
17
13
|
# Wait five seconds to retry, after internal communication errors.
|
18
14
|
RETRY_WAIT = 5
|
19
15
|
|
@@ -22,32 +18,30 @@ module CloudCrowd
|
|
22
18
|
# Spinning up a worker will create a new AssetStore with a persistent
|
23
19
|
# connection to S3. This AssetStore gets passed into each action, for use
|
24
20
|
# as it is run.
|
25
|
-
def initialize
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
@
|
30
|
-
@
|
31
|
-
|
32
|
-
|
33
|
-
end
|
34
|
-
|
35
|
-
# Ask the central server for the first WorkUnit in line.
|
36
|
-
def fetch_work_unit
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
end
|
21
|
+
def initialize(node, work_unit)
|
22
|
+
Signal.trap('INT') { shut_down }
|
23
|
+
Signal.trap('KILL') { shut_down }
|
24
|
+
Signal.trap('TERM') { shut_down }
|
25
|
+
@pid = $$
|
26
|
+
@node = node
|
27
|
+
setup_work_unit(work_unit)
|
28
|
+
run
|
29
|
+
end
|
30
|
+
|
31
|
+
# # Ask the central server for the first WorkUnit in line.
|
32
|
+
# def fetch_work_unit
|
33
|
+
# keep_trying_to "fetch a new work unit" do
|
34
|
+
# unit_json = @server['/work'].post(base_params)
|
35
|
+
# setup_work_unit(unit_json)
|
36
|
+
# end
|
37
|
+
# end
|
42
38
|
|
43
39
|
# Return output to the central server, marking the current work unit as done.
|
44
40
|
def complete_work_unit(result)
|
45
41
|
keep_trying_to "complete work unit" do
|
46
42
|
data = completion_params.merge({:status => 'succeeded', :output => result})
|
47
|
-
|
43
|
+
@node.server["/work/#{data[:id]}"].put(data)
|
48
44
|
log "finished #{display_work_unit} in #{data[:time]} seconds"
|
49
|
-
clear_work_unit
|
50
|
-
setup_work_unit(unit_json)
|
51
45
|
end
|
52
46
|
end
|
53
47
|
|
@@ -55,36 +49,11 @@ module CloudCrowd
|
|
55
49
|
def fail_work_unit(exception)
|
56
50
|
keep_trying_to "mark work unit as failed" do
|
57
51
|
data = completion_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
|
58
|
-
|
52
|
+
@node.server["/work/#{data[:id]}"].put(data)
|
59
53
|
log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
|
60
|
-
clear_work_unit
|
61
|
-
setup_work_unit(unit_json)
|
62
54
|
end
|
63
55
|
end
|
64
56
|
|
65
|
-
# Check in with the central server. Let it know the condition of the work
|
66
|
-
# thread, the action and status we're processing, and our hostname and PID.
|
67
|
-
def check_in(thread_status)
|
68
|
-
keep_trying_to "check in with central" do
|
69
|
-
@server["/worker"].put({
|
70
|
-
:name => @name,
|
71
|
-
:thread_status => thread_status
|
72
|
-
})
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
# Inform the central server that this worker is finished. This is the only
|
77
|
-
# remote method that doesn't retry on connection errors -- if the worker
|
78
|
-
# can't connect to the central server while it's trying to shutdown, it
|
79
|
-
# should close, regardless.
|
80
|
-
def check_out
|
81
|
-
@server["/worker"].put({
|
82
|
-
:name => @name,
|
83
|
-
:terminated => true
|
84
|
-
})
|
85
|
-
log 'exiting'
|
86
|
-
end
|
87
|
-
|
88
57
|
# We expect and require internal communication between the central server
|
89
58
|
# and the workers to succeed. If it fails for any reason, log it, and then
|
90
59
|
# keep trying the same request.
|
@@ -100,33 +69,31 @@ module CloudCrowd
|
|
100
69
|
end
|
101
70
|
end
|
102
71
|
|
103
|
-
# Does this Worker have a job to do?
|
104
|
-
def has_work?
|
105
|
-
@action_name && @input && @options
|
106
|
-
end
|
107
|
-
|
108
72
|
# Loggable string of the current work unit.
|
109
73
|
def display_work_unit
|
110
|
-
"unit ##{@options['work_unit_id']} (#{@action_name})"
|
74
|
+
"unit ##{@options['work_unit_id']} (#{@action_name}/#{CloudCrowd.display_status(@status)})"
|
111
75
|
end
|
112
76
|
|
113
77
|
# Executes the current work unit, catching all exceptions as failures.
|
114
78
|
def run_work_unit
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
79
|
+
@worker_thread = Thread.new do
|
80
|
+
begin
|
81
|
+
result = nil
|
82
|
+
@action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @node.asset_store)
|
83
|
+
Dir.chdir(@action.work_directory) do
|
84
|
+
result = case @status
|
85
|
+
when PROCESSING then @action.process
|
86
|
+
when SPLITTING then @action.split
|
87
|
+
when MERGING then @action.merge
|
88
|
+
else raise Error::StatusUnspecified, "work units must specify their status"
|
89
|
+
end
|
124
90
|
end
|
91
|
+
complete_work_unit({'output' => result}.to_json)
|
92
|
+
rescue Exception => e
|
93
|
+
fail_work_unit(e)
|
125
94
|
end
|
126
|
-
complete_work_unit({'output' => result}.to_json)
|
127
|
-
rescue Exception => e
|
128
|
-
fail_work_unit(e)
|
129
95
|
end
|
96
|
+
@worker_thread.join
|
130
97
|
end
|
131
98
|
|
132
99
|
# Wraps <tt>run_work_unit</tt> to benchmark the execution time, if requested.
|
@@ -142,8 +109,7 @@ module CloudCrowd
|
|
142
109
|
# Common parameters to send back to central.
|
143
110
|
def base_params
|
144
111
|
@base_params ||= {
|
145
|
-
:
|
146
|
-
:worker_actions => @enabled_actions.join(',')
|
112
|
+
:pid => @pid
|
147
113
|
}
|
148
114
|
end
|
149
115
|
|
@@ -157,9 +123,8 @@ module CloudCrowd
|
|
157
123
|
end
|
158
124
|
|
159
125
|
# Extract our instance variables from a WorkUnit's JSON.
|
160
|
-
def setup_work_unit(
|
161
|
-
return false unless
|
162
|
-
unit = JSON.parse(unit_json)
|
126
|
+
def setup_work_unit(unit)
|
127
|
+
return false unless unit
|
163
128
|
@start_time = Time.now
|
164
129
|
@action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
|
165
130
|
@options['job_id'] = unit['job_id']
|
@@ -171,7 +136,7 @@ module CloudCrowd
|
|
171
136
|
|
172
137
|
# Log a message to the daemon log. Includes PID for identification.
|
173
138
|
def log(message)
|
174
|
-
puts "Worker ##{@
|
139
|
+
puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
|
175
140
|
end
|
176
141
|
|
177
142
|
# When we're done with a unit, clear out our instance variables to make way
|
@@ -181,6 +146,17 @@ module CloudCrowd
|
|
181
146
|
@action, @action_name, @input, @options, @start_time = nil, nil, nil, nil, nil
|
182
147
|
end
|
183
148
|
|
149
|
+
# Force the worker to quit, even if it's in the middle of processing.
|
150
|
+
# If it had checked out a work unit, the node should have released it on
|
151
|
+
# the central server already.
|
152
|
+
def shut_down
|
153
|
+
if @worker_thread
|
154
|
+
@worker_thread.kill
|
155
|
+
@worker_thread.kill! if @worker_thread.alive?
|
156
|
+
end
|
157
|
+
Process.exit
|
158
|
+
end
|
159
|
+
|
184
160
|
end
|
185
161
|
|
186
162
|
end
|