cloud-crowd 0.1.0 → 0.2.0
This diff shows the changes between publicly released versions of the package, as published to the supported registries. It is provided for informational purposes only.
- data/README +16 -16
- data/cloud-crowd.gemspec +10 -9
- data/config/config.example.ru +8 -2
- data/config/config.example.yml +21 -25
- data/examples/process_pdfs_example.rb +1 -1
- data/examples/word_count_example.rb +1 -0
- data/lib/cloud-crowd.rb +47 -28
- data/lib/cloud_crowd/action.rb +14 -8
- data/lib/cloud_crowd/asset_store.rb +8 -8
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
- data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
- data/lib/cloud_crowd/command_line.rb +24 -58
- data/lib/cloud_crowd/exceptions.rb +7 -0
- data/lib/cloud_crowd/helpers/authorization.rb +5 -3
- data/lib/cloud_crowd/helpers/resources.rb +0 -20
- data/lib/cloud_crowd/models.rb +1 -1
- data/lib/cloud_crowd/models/job.rb +37 -40
- data/lib/cloud_crowd/models/node_record.rb +95 -0
- data/lib/cloud_crowd/models/work_unit.rb +87 -33
- data/lib/cloud_crowd/node.rb +105 -0
- data/lib/cloud_crowd/schema.rb +22 -18
- data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
- data/lib/cloud_crowd/worker.rb +68 -107
- data/public/css/admin_console.css +40 -18
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/js/admin_console.js +47 -18
- data/test/acceptance/test_failing_work_units.rb +1 -1
- data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
- data/test/acceptance/test_word_count.rb +3 -9
- data/test/blueprints.rb +0 -1
- data/test/config/config.ru +1 -1
- data/test/config/config.yml +2 -4
- data/test/unit/test_action.rb +1 -1
- data/test/unit/test_configuration.rb +1 -1
- data/test/unit/test_job.rb +3 -0
- data/test/unit/test_work_unit.rb +2 -4
- data/views/{index.erb → operations_center.erb} +13 -8
- metadata +11 -10
- data/lib/cloud_crowd/daemon.rb +0 -95
- data/lib/cloud_crowd/models/worker_record.rb +0 -61
- data/lib/cloud_crowd/runner.rb +0 -15
data/lib/cloud_crowd/models/node_record.rb
ADDED
@@ -0,0 +1,95 @@
+module CloudCrowd
+
+  # A NodeRecord is the central server's record of a Node running remotely. We
+  # can use it to assign WorkUnits to the Node, and keep track of its status.
+  # When a Node exits, it destroys this record.
+  class NodeRecord < ActiveRecord::Base
+
+    has_many :work_units
+
+    validates_presence_of :host, :ip_address, :port
+
+    before_destroy :clear_work_units
+
+    # Available Nodes haven't used up their maximum number of workers yet.
+    named_scope :available, {
+      :conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
+      :order => 'updated_at asc'
+    }
+
+    # Register a Node with the central server. Currently this only happens at
+    # Node startup.
+    def self.check_in(params, request)
+      attrs = {
+        :ip_address => request.ip,
+        :port => params[:port],
+        :max_workers => params[:max_workers],
+        :enabled_actions => params[:enabled_actions]
+      }
+      self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
+    end
+
+    # Dispatch a WorkUnit to this node. Places the node back at the end of
+    # the rotation. If we fail to send the WorkUnit, we consider the node to be
+    # down, and remove this record, freeing up all of its checked-out work units.
+    def send_work_unit(unit)
+      result = node['/work'].post(:work_unit => unit.to_json)
+      unit.assign_to(self, JSON.parse(result)['pid'])
+      touch
+    rescue Errno::ECONNREFUSED
+      self.destroy # Couldn't post to node, assume it's gone away.
+    end
+
+    # What Actions is this Node able to run?
+    def actions
+      enabled_actions.split(',')
+    end
+
+    # Is this Node too busy for more work? (Determined by number of workers.)
+    def busy?
+      max_workers && work_units.count >= max_workers
+    end
+
+    # The URL at which this Node may be reached.
+    # TODO: Make sure that the host actually has externally accessible DNS.
+    def url
+      @url ||= "http://#{host}:#{port}"
+    end
+
+    # Keep a RestClient::Resource handy for contacting the Node, including
+    # HTTP authentication, if configured.
+    def node
+      @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
+    end
+
+    # The printable status of the Node.
+    def display_status
+      busy? ? 'busy' : 'available'
+    end
+
+    # A list of the process ids of the workers currently being run by the Node.
+    def worker_pids
+      work_units.all(:select => 'worker_pid').map(&:worker_pid)
+    end
+
+    # The JSON representation of a NodeRecord includes its worker_pids.
+    def to_json(opts={})
+      { 'host' => host,
+        'workers' => worker_pids,
+        'status' => display_status
+      }.to_json
+    end
+
+
+    private
+
+    # When a Node shuts down, we free up all of the WorkUnits that it had
+    # reserved, and they become available for others to pick up. Redistribute
+    # the WorkUnits in a separate thread to avoid delaying Node shutdown.
+    def clear_work_units
+      WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
+      Thread.new { WorkUnit.distribute_to_nodes }
+    end
+
+  end
+end
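The :available scope above pushes the capacity check into SQL: a node with no max_workers cap is always available; otherwise it must have fewer assigned work_units than max_workers. Here is a plain-Ruby sketch of the same predicate, using a made-up FakeNode stand-in rather than anything from the gem:

  # Illustration only: the availability rule from the named_scope's SQL,
  # restated against a hypothetical in-memory node.
  FakeNode = Struct.new(:max_workers, :assigned_units) do
    # Available when uncapped (max_workers is nil) or under the cap.
    def available?
      max_workers.nil? || assigned_units < max_workers
    end
  end

  nodes = [FakeNode.new(nil, 12), FakeNode.new(4, 4), FakeNode.new(4, 2)]
  puts nodes.map { |n| n.available? }.inspect  #=> [true, false, true]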
data/lib/cloud_crowd/models/work_unit.rb
CHANGED
@@ -8,39 +8,84 @@ module CloudCrowd
     include ModelStatus
 
     belongs_to :job
-    belongs_to :worker_record
+    belongs_to :node_record
 
     validates_presence_of :job_id, :status, :input, :action
 
-
+    # Available WorkUnits are waiting to be distributed to Nodes for processing.
+    named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
+    # Reserved WorkUnits have been marked for distribution by a central server process.
+    named_scope :reserved, {:conditions => {:reservation => $$}, :order => 'updated_at asc'}
 
-    #
-    #
-    #
-    #
-    def self.
-
-
-
-
-
-
+    # Attempt to send a list of work_units to nodes with available capacity.
+    # A single central server process stops the same WorkUnit from being
+    # distributed to multiple nodes by reserving it first. The algorithm used
+    # should be lock-free.
+    def self.distribute_to_nodes
+      return unless WorkUnit.reserve_available
+      work_units = WorkUnit.reserved
+      available_nodes = NodeRecord.available
+      until work_units.empty? do
+        node = available_nodes.shift
+        unit = work_units.first
+        break unless node
+        next unless node.actions.include? unit.action
+        sent = node.send_work_unit(unit)
+        if sent
+          work_units.shift
+          available_nodes.push(node) unless node.busy?
+        end
+      end
+    ensure
+      WorkUnit.cancel_reservations
     end
 
-    #
-
-
+    # Reserves all available WorkUnits for this process. Returns false if there
+    # were none available.
+    def self.reserve_available
+      WorkUnit.available.update_all("reservation = #{$$}") > 0
+    end
+
+    # Cancels all outstanding WorkUnit reservations for this process.
+    def self.cancel_reservations
+      WorkUnit.reserved.update_all('reservation = null')
+    end
+
+    # Look up a WorkUnit by the worker that's currently processing it. Specified
+    # by <tt>pid@host</tt>.
+    def self.find_by_worker_name(name)
+      pid, host = name.split('@')
+      node = NodeRecord.find_by_host(host)
+      node && node.work_units.find_by_worker_pid(pid)
+    end
+
+    # Convenience method for starting a new WorkUnit.
+    def self.start(job, action, input, status)
+      self.create(:job => job, :action => action, :input => input, :status => status)
     end
 
     # Mark this unit as having finished successfully.
-
-
-
-
-
-
-
-
+    # Splitting work units are handled differently (an optimization) -- they
+    # immediately fire off all of their resulting WorkUnits for processing,
+    # without waiting for the rest of their splitting cousins to complete.
+    def finish(result, time_taken)
+      if splitting?
+        [JSON.parse(parsed_output(result))].flatten.each do |new_input|
+          WorkUnit.start(job, action, new_input, PROCESSING)
+        end
+        self.destroy
+        job.set_next_status if job.done_splitting?
+      else
+        update_attributes({
+          :status => SUCCEEDED,
+          :node_record => nil,
+          :worker_pid => nil,
+          :attempts => attempts + 1,
+          :output => result,
+          :time => time_taken
+        })
+        job.check_for_completion
+      end
     end
 
     # Mark this unit as having failed. May attempt a retry.
@@ -49,30 +94,39 @@ module CloudCrowd
       return try_again if tries < CloudCrowd.config[:work_unit_retries]
       update_attributes({
         :status => FAILED,
-        :
+        :node_record => nil,
+        :worker_pid => nil,
         :attempts => tries,
         :output => output,
         :time => time_taken
       })
+      self.job.check_for_completion
     end
 
     # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
     def try_again
       update_attributes({
-        :
-        :
+        :node_record => nil,
+        :worker_pid => nil,
+        :attempts => self.attempts + 1
       })
     end
 
-    # When a
-    # WorkUnit and
-    def assign_to(
-
-
+    # When a Node checks out a WorkUnit, establish the connection between
+    # WorkUnit and NodeRecord and record the worker_pid.
+    def assign_to(node_record, worker_pid)
+      update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
+    end
+
+    # All output needs to be wrapped in a JSON object for consistency
+    # (unfortunately, JSON.parse needs the top-level to be an object or array).
+    # Convenience method to provide the parsed version.
+    def parsed_output(out = self.output)
+      JSON.parse(out)['output']
     end
 
     # The JSON representation of a WorkUnit shares the Job's options with all
-    # its
+    # its cousin WorkUnits.
     def to_json
       {
         'id' => self.id,
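The reserve_available / cancel_reservations pair above is what makes distribute_to_nodes safe to run without explicit locks: one atomic UPDATE stamps every unclaimed row with the server process's pid ($$), and only rows carrying that stamp are distributed. Below is a self-contained toy version of the idea; none of these names exist in the gem, and a Mutex stands in for the database's atomic UPDATE:

  # Toy illustration of pid-stamped reservation.
  require 'thread'

  units = Array.new(10) { { :reservation => nil } }
  write_lock = Mutex.new

  # Stamp every unclaimed unit with claimant_id in one step, then work only
  # on the units that ended up carrying our stamp, never someone else's.
  claim = lambda do |claimant_id|
    write_lock.synchronize { units.each { |u| u[:reservation] ||= claimant_id } }
    units.select { |u| u[:reservation] == claimant_id }
  end

  threads = [:server_a, :server_b].map { |id| Thread.new { [id, claim.call(id).size] } }
  threads.each { |t| puts t.value.inspect }  # the two counts always sum to 10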
data/lib/cloud_crowd/node.rb
ADDED
@@ -0,0 +1,105 @@
+module CloudCrowd
+
+  # A Node is a Sinatra/Thin application that runs a single instance per machine.
+  # It registers with the central server, receives WorkUnits, and forks off
+  # Workers to process them. The actions are:
+  #
+  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
+  # [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
+  class Node < Sinatra::Default
+
+    # A Node's default port. You only run a single node per machine, so they
+    # can all use the same port without any problems.
+    DEFAULT_PORT = 9063
+
+    attr_reader :server, :asset_store
+
+    set :root, ROOT
+    set :authorization_realm, "CloudCrowd"
+
+    helpers Helpers
+
+    # methodoverride allows the _method param.
+    enable :methodoverride
+
+    # Enabling HTTP Authentication turns it on for all requests.
+    # This works the same way as in the central CloudCrowd::Server.
+    before do
+      login_required if CloudCrowd.config[:http_authentication]
+    end
+
+    # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
+    # /heartbeat to make sure it's still online.
+    get '/heartbeat' do
+      "buh-bump"
+    end
+
+    # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
+    post '/work' do
+      pid = fork { Worker.new(self, JSON.parse(params[:work_unit])) }
+      Process.detach(pid)
+      json :pid => pid
+    end
+
+    # Creating a Node registers with the central server and starts listening for
+    # incoming WorkUnits.
+    def initialize(port=DEFAULT_PORT)
+      require 'json'
+      @server = CloudCrowd.central_server
+      @host = Socket.gethostname
+      @enabled_actions = CloudCrowd.actions.keys
+      @asset_store = AssetStore.new
+      @port = port || DEFAULT_PORT
+
+      trap_signals
+      start_server
+      check_in
+      @server_thread.join
+    end
+
+    # Checking in with the central server informs it of the location and
+    # configuration of this Node. If it can't check in, there's no point in
+    # starting.
+    def check_in
+      @server["/node/#{@host}"].put(
+        :port => @port,
+        :max_workers => CloudCrowd.config[:max_workers],
+        :enabled_actions => @enabled_actions.join(',')
+      )
+    rescue Errno::ECONNREFUSED
+      puts "Failed to connect to the central server (#{@server.to_s}), exiting..."
+      raise SystemExit
+    end
+
+    # Before exiting, the Node checks out with the central server, releasing all
+    # of its WorkUnits for other Nodes to handle.
+    def check_out
+      @server["/node/#{@host}"].delete
+    end
+
+
+    private
+
+    # Launch the Node's Thin server in a separate thread because it blocks.
+    def start_server
+      @server_thread = Thread.new do
+        Thin::Server.start('0.0.0.0', @port, self, :signals => false)
+      end
+    end
+
+    # Trap exit signals in order to shut down cleanly.
+    def trap_signals
+      Signal.trap('INT') { shut_down }
+      Signal.trap('KILL') { shut_down }
+      Signal.trap('TERM') { shut_down }
+    end
+
+    # At shut down, de-register with the central server before exiting.
+    def shut_down
+      check_out
+      Process.exit
+    end
+
+  end
+
+end
data/lib/cloud_crowd/schema.rb
CHANGED
@@ -1,5 +1,5 @@
 # Complete schema for CloudCrowd.
-ActiveRecord::Schema.define(:version => 1) do
+ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
 
   create_table "jobs", :force => true do |t|
     t.integer "status", :null => false
@@ -10,7 +10,16 @@ ActiveRecord::Schema.define(:version => 1) do
     t.float "time"
     t.string "callback_url"
     t.string "email"
-    t.
+    t.datetime "created_at"
+    t.datetime "updated_at"
+  end
+
+  create_table "node_records", :force => true do |t|
+    t.string "host", :null => false
+    t.string "ip_address", :null => false
+    t.integer "port", :null => false
+    t.string "enabled_actions", :default => '', :null => false
+    t.integer "max_workers"
     t.datetime "created_at"
     t.datetime "updated_at"
   end
@@ -20,26 +29,21 @@ ActiveRecord::Schema.define(:version => 1) do
     t.integer "job_id", :null => false
     t.text "input", :null => false
     t.string "action", :null => false
-    t.integer "attempts",
-    t.integer "
-    t.integer "
+    t.integer "attempts", :default => 0, :null => false
+    t.integer "node_record_id"
+    t.integer "worker_pid"
+    t.integer "reservation"
     t.float "time"
     t.text "output"
     t.datetime "created_at"
     t.datetime "updated_at"
   end
-
-  create_table "worker_records", :force => true do |t|
-    t.string "name", :null => false
-    t.string "thread_status", :null => false
-    t.datetime "created_at"
-    t.datetime "updated_at"
-  end
-
-  add_index "jobs", ["status"], :name => "index_jobs_on_status"
-  add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
-  add_index "work_units", ["status", "worker_record_id", "action"], :name => "index_work_units_on_status_and_worker_record_id_and_action"
-  add_index "worker_records", ["name"], :name => "index_worker_records_on_name"
-  add_index "worker_records", ["updated_at"], :name => "index_worker_records_on_updated_at"
 
+  # Here be indices. After looking, it seems faster not to have them at all.
+  #
+  # add_index "jobs", ["status"], :name => "index_jobs_on_status"
+  # add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
+  # add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
+  # add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
+  # add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
 end
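Since the add_index calls above ship commented out, re-enabling one is left to deployments that measure a benefit. A hypothetical migration restoring the job_id index, sketched with the exact table and index names from the schema:

  # Hypothetical migration; the gem itself ships with the indices off.
  class AddWorkUnitJobIdIndex < ActiveRecord::Migration
    def self.up
      add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
    end

    def self.down
      remove_index "work_units", :name => "index_work_units_on_job_id"
    end
  end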
data/lib/cloud_crowd/{app.rb → server.rb}
CHANGED
@@ -5,6 +5,7 @@ module CloudCrowd
   # == Admin
   # [get /] Render the admin console, with a progress meter for running jobs.
   # [get /status] Get the combined JSON of every active job and worker.
+  # [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
   # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
   #
   # == Public API
@@ -13,10 +14,10 @@ module CloudCrowd
   # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
   #
   # == Internal Workers API
-  # [
+  # [put /node/:host] Registers a new Node, making it available for processing.
+  # [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
   # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
-
-  class App < Sinatra::Default
+  class Server < Sinatra::Default
 
     set :root, ROOT
     set :authorization_realm, "CloudCrowd"
@@ -28,12 +29,12 @@ module CloudCrowd
 
     # Enabling HTTP Authentication turns it on for all requests.
     before do
-      login_required if CloudCrowd.config[:
+      login_required if CloudCrowd.config[:http_authentication]
    end
 
     # Render the admin console.
     get '/' do
-      erb :index
+      erb :operations_center
    end
 
     # Get the JSON for every active job in the queue and every active worker
@@ -42,15 +43,14 @@ module CloudCrowd
     get '/status' do
       json(
         'jobs' => Job.incomplete,
-        '
+        'nodes' => NodeRecord.all(:order => 'host desc'),
         'work_unit_count' => WorkUnit.incomplete.count
       )
     end
 
-    # Get the JSON for a worker
+    # Get the JSON for what a worker is up to.
     get '/worker/:name' do
-
-      json((record && record.work_unit) || {})
+      json WorkUnit.find_by_worker_name(params[:name]) || {}
     end
 
     # To monitor the central server with Monit, God, Nagios, or another
@@ -62,8 +62,11 @@ module CloudCrowd
     # PUBLIC API:
 
     # Start a new job. Accepts a JSON representation of the job-to-be.
+    # Distributes all work units to available nodes.
     post '/jobs' do
-
+      job = Job.create_from_request(JSON.parse(params[:job]))
+      WorkUnit.distribute_to_nodes
+      json job
     end
 
     # Check the status of a job, returning the output if finished, and the
@@ -79,36 +82,33 @@ module CloudCrowd
       json nil
     end
 
-    # INTERNAL
+    # INTERNAL NODE API:
 
-    #
-    #
-
-
+    # A new Node will hit this action to register its location and
+    # configuration with the central server. Triggers distribution of WorkUnits.
+    put '/node/:host' do
+      NodeRecord.check_in(params, request)
+      WorkUnit.distribute_to_nodes
+      json nil
+    end
+
+    # Deregisters a Node from the central server. Releases and redistributes any
+    # WorkUnits it may have had checked out.
+    delete '/node/:host' do
+      NodeRecord.destroy_all(:host => params[:host])
+      json nil
     end
 
     # When workers are done with their unit, either successfully or in failure,
-    # they mark it back on the central server and
-    #
+    # they mark it back on the central server and exit. Triggers distribution
+    # of pending work units.
     put '/work/:work_unit_id' do
-
-
-
-
-      json dequeue_work_unit
-      when 'failed'
-        current_work_unit.fail(params[:output], params[:time])
-        json dequeue_work_unit(1)
-      else
-        error(500, "Completing a work unit must specify status.")
-      end
+      case params[:status]
+      when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
+      when 'failed' then current_work_unit.fail(params[:output], params[:time])
+      else error(500, "Completing a work unit must specify status.")
       end
-
-
-    # Every so often workers check in to let the central server know that
-    # they're still alive. Keep up-to-date records
-    put '/worker' do
-      params[:terminated] ? WorkerRecord.check_out(params) : WorkerRecord.check_in(params)
+      WorkUnit.distribute_to_nodes
       json nil
     end
 
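Taken together, post /jobs and get /jobs/:job_id above form the whole submit-and-poll loop of the Public API. A hedged client sketch using RestClient (the HTTP library the diff shows the gem itself using elsewhere); the server address and port, the action name, and the input URL are placeholders, not values taken from this diff:

  # Sketch of a client driving the Public API: post a job, then poll it.
  require 'rubygems'
  require 'restclient'
  require 'json'

  server = RestClient::Resource.new('http://localhost:9173')  # assumed address and port

  job_json = { 'action' => 'word_count',
               'inputs' => ['http://www.example.com/moby_dick.txt'] }.to_json
  job = JSON.parse(server['/jobs'].post(:job => job_json))

  # Check the status of the job (get /jobs/:job_id) until it completes.
  status = JSON.parse(server["/jobs/#{job['id']}"].get)
  puts status.inspect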