cloud-crowd 0.1.0 → 0.2.0
- data/README +16 -16
- data/cloud-crowd.gemspec +10 -9
- data/config/config.example.ru +8 -2
- data/config/config.example.yml +21 -25
- data/examples/process_pdfs_example.rb +1 -1
- data/examples/word_count_example.rb +1 -0
- data/lib/cloud-crowd.rb +47 -28
- data/lib/cloud_crowd/action.rb +14 -8
- data/lib/cloud_crowd/asset_store.rb +8 -8
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
- data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
- data/lib/cloud_crowd/command_line.rb +24 -58
- data/lib/cloud_crowd/exceptions.rb +7 -0
- data/lib/cloud_crowd/helpers/authorization.rb +5 -3
- data/lib/cloud_crowd/helpers/resources.rb +0 -20
- data/lib/cloud_crowd/models.rb +1 -1
- data/lib/cloud_crowd/models/job.rb +37 -40
- data/lib/cloud_crowd/models/node_record.rb +95 -0
- data/lib/cloud_crowd/models/work_unit.rb +87 -33
- data/lib/cloud_crowd/node.rb +105 -0
- data/lib/cloud_crowd/schema.rb +22 -18
- data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
- data/lib/cloud_crowd/worker.rb +68 -107
- data/public/css/admin_console.css +40 -18
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/js/admin_console.js +47 -18
- data/test/acceptance/test_failing_work_units.rb +1 -1
- data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
- data/test/acceptance/test_word_count.rb +3 -9
- data/test/blueprints.rb +0 -1
- data/test/config/config.ru +1 -1
- data/test/config/config.yml +2 -4
- data/test/unit/test_action.rb +1 -1
- data/test/unit/test_configuration.rb +1 -1
- data/test/unit/test_job.rb +3 -0
- data/test/unit/test_work_unit.rb +2 -4
- data/views/{index.erb → operations_center.erb} +13 -8
- metadata +11 -10
- data/lib/cloud_crowd/daemon.rb +0 -95
- data/lib/cloud_crowd/models/worker_record.rb +0 -61
- data/lib/cloud_crowd/runner.rb +0 -15
data/lib/cloud_crowd/models/node_record.rb
ADDED
@@ -0,0 +1,95 @@
+module CloudCrowd
+
+  # A NodeRecord is the central server's record of a Node running remotely. We
+  # can use it to assign WorkUnits to the Node, and keep track of its status.
+  # When a Node exits, it destroys this record.
+  class NodeRecord < ActiveRecord::Base
+
+    has_many :work_units
+
+    validates_presence_of :host, :ip_address, :port
+
+    before_destroy :clear_work_units
+
+    # Available Nodes haven't used up their maximum number of workers yet.
+    named_scope :available, {
+      :conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
+      :order      => 'updated_at asc'
+    }
+
+    # Register a Node with the central server. Currently this only happens at
+    # Node startup.
+    def self.check_in(params, request)
+      attrs = {
+        :ip_address      => request.ip,
+        :port            => params[:port],
+        :max_workers     => params[:max_workers],
+        :enabled_actions => params[:enabled_actions]
+      }
+      self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
+    end
+
+    # Dispatch a WorkUnit to this node. Places the node back at the end of
+    # the rotation. If we fail to send the WorkUnit, we consider the node to be
+    # down, and remove this record, freeing up all of its checked-out work units.
+    def send_work_unit(unit)
+      result = node['/work'].post(:work_unit => unit.to_json)
+      unit.assign_to(self, JSON.parse(result)['pid'])
+      touch
+    rescue Errno::ECONNREFUSED
+      self.destroy # Couldn't post to node, assume it's gone away.
+    end
+
+    # What Actions is this Node able to run?
+    def actions
+      enabled_actions.split(',')
+    end
+
+    # Is this Node too busy for more work? (Determined by number of workers.)
+    def busy?
+      max_workers && work_units.count >= max_workers
+    end
+
+    # The URL at which this Node may be reached.
+    # TODO: Make sure that the host actually has externally accessible DNS.
+    def url
+      @url ||= "http://#{host}:#{port}"
+    end
+
+    # Keep a RestClient::Resource handy for contacting the Node, including
+    # HTTP authentication, if configured.
+    def node
+      @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
+    end
+
+    # The printable status of the Node.
+    def display_status
+      busy? ? 'busy' : 'available'
+    end
+
+    # A list of the process ids of the workers currently being run by the Node.
+    def worker_pids
+      work_units.all(:select => 'worker_pid').map(&:worker_pid)
+    end
+
+    # The JSON representation of a NodeRecord includes its worker_pids.
+    def to_json(opts={})
+      { 'host'    => host,
+        'workers' => worker_pids,
+        'status'  => display_status
+      }.to_json
+    end
+
+
+    private
+
+    # When a Node shuts down, we free up all of the WorkUnits that it had
+    # reserved, and they become available for others to pick up. Redistribute
+    # the WorkUnits in a separate thread to avoid delaying Node shutdown.
+    def clear_work_units
+      WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
+      Thread.new { WorkUnit.distribute_to_nodes }
+    end
+
+  end
+end
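
The dispatch logic in send_work_unit above treats a refused connection as a dead Node. For orientation, here is a minimal standalone sketch of that pattern outside of ActiveRecord; the dispatch method and the registry hash are hypothetical, for illustration only:

    require 'rest_client'
    require 'json'

    # Hypothetical sketch of the send_work_unit pattern: POST the unit to the
    # node, return the worker pid on success, and evict the node from the
    # rotation if the connection is refused.
    def dispatch(node_url, unit_json, registry)
      result = RestClient::Resource.new(node_url)['/work'].post(:work_unit => unit_json)
      JSON.parse(result)['pid']   # the Node responds with the forked worker's pid
    rescue Errno::ECONNREFUSED
      registry.delete(node_url)   # couldn't post to node, assume it's gone away
      nil
    end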
data/lib/cloud_crowd/models/work_unit.rb
CHANGED
@@ -8,39 +8,84 @@ module CloudCrowd
     include ModelStatus
 
     belongs_to :job
-    belongs_to :
+    belongs_to :node_record
 
     validates_presence_of :job_id, :status, :input, :action
 
-
+    # Available WorkUnits are waiting to be distributed to Nodes for processing.
+    named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
+    # Reserved WorkUnits have been marked for distribution by a central server process.
+    named_scope :reserved,  {:conditions => {:reservation => $$}, :order => 'updated_at asc'}
 
-    #
-    #
-    #
-    #
-    def self.
-
-
-
-
-
-
+    # Attempt to send a list of work_units to nodes with available capacity.
+    # A single central server process stops the same WorkUnit from being
+    # distributed to multiple nodes by reserving it first. The algorithm used
+    # should be lock-free.
+    def self.distribute_to_nodes
+      return unless WorkUnit.reserve_available
+      work_units = WorkUnit.reserved
+      available_nodes = NodeRecord.available
+      until work_units.empty? do
+        node = available_nodes.shift
+        unit = work_units.first
+        break unless node
+        next unless node.actions.include? unit.action
+        sent = node.send_work_unit(unit)
+        if sent
+          work_units.shift
+          available_nodes.push(node) unless node.busy?
+        end
+      end
+    ensure
+      WorkUnit.cancel_reservations
     end
 
-    #
-
-
+    # Reserves all available WorkUnits for this process. Returns false if there
+    # were none available.
+    def self.reserve_available
+      WorkUnit.available.update_all("reservation = #{$$}") > 0
+    end
+
+    # Cancels all outstanding WorkUnit reservations for this process.
+    def self.cancel_reservations
+      WorkUnit.reserved.update_all('reservation = null')
+    end
+
+    # Look up a WorkUnit by the worker that's currently processing it. Specified
+    # by <tt>pid@host</tt>.
+    def self.find_by_worker_name(name)
+      pid, host = name.split('@')
+      node = NodeRecord.find_by_host(host)
+      node && node.work_units.find_by_worker_pid(pid)
+    end
+
+    # Convenience method for starting a new WorkUnit.
+    def self.start(job, action, input, status)
+      self.create(:job => job, :action => action, :input => input, :status => status)
     end
 
     # Mark this unit as having finished successfully.
-
-
-
-
-
-
-
-
+    # Splitting work units are handled differently (an optimization) -- they
+    # immediately fire off all of their resulting WorkUnits for processing,
+    # without waiting for the rest of their splitting cousins to complete.
+    def finish(result, time_taken)
+      if splitting?
+        [JSON.parse(parsed_output(result))].flatten.each do |new_input|
+          WorkUnit.start(job, action, new_input, PROCESSING)
+        end
+        self.destroy
+        job.set_next_status if job.done_splitting?
+      else
+        update_attributes({
+          :status      => SUCCEEDED,
+          :node_record => nil,
+          :worker_pid  => nil,
+          :attempts    => attempts + 1,
+          :output      => result,
+          :time        => time_taken
+        })
+        job.check_for_completion
+      end
     end
 
     # Mark this unit as having failed. May attempt a retry.
@@ -49,30 +94,39 @@ module CloudCrowd
       return try_again if tries < CloudCrowd.config[:work_unit_retries]
       update_attributes({
         :status      => FAILED,
-        :
+        :node_record => nil,
+        :worker_pid  => nil,
         :attempts    => tries,
         :output      => output,
         :time        => time_taken
       })
+      self.job.check_for_completion
     end
 
    # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
     def try_again
       update_attributes({
-        :
-        :
+        :node_record => nil,
+        :worker_pid  => nil,
+        :attempts    => self.attempts + 1
       })
     end
 
-    # When a
-    # WorkUnit and
-    def assign_to(
-
-
+    # When a Node checks out a WorkUnit, establish the connection between
+    # WorkUnit and NodeRecord and record the worker_pid.
+    def assign_to(node_record, worker_pid)
+      update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
+    end
+
+    # All output needs to be wrapped in a JSON object for consistency
+    # (unfortunately, JSON.parse needs the top-level to be an object or array).
+    # Convenience method to provide the parsed version.
+    def parsed_output(out = self.output)
+      JSON.parse(out)['output']
     end
 
     # The JSON representation of a WorkUnit shares the Job's options with all
-    # its
+    # its cousin WorkUnits.
     def to_json
       {
         'id' => self.id,
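
The reservation scheme above is the heart of the new distribution model: a single atomic UPDATE tags available rows with the server's pid, so two distribution passes can never claim the same unit. A hedged sketch of the same pattern against a bare SQLite table (table and column names follow the schema later in this diff; everything else is illustrative):

    require 'sqlite3'

    db = SQLite3::Database.new('cloud_crowd.db')

    # Atomically reserve every available unit for this process by tagging it
    # with our pid. Rows already tagged by another process are left alone.
    db.execute("UPDATE work_units SET reservation = ? " +
               "WHERE reservation IS NULL AND worker_pid IS NULL", [Process.pid])

    begin
      # Work through only the rows we tagged.
      db.execute("SELECT id FROM work_units WHERE reservation = ?", [Process.pid]) do |(id)|
        puts "would dispatch work_unit #{id}"
      end
    ensure
      # Mirror WorkUnit.cancel_reservations: release whatever we still hold.
      db.execute("UPDATE work_units SET reservation = NULL WHERE reservation = ?", [Process.pid])
    end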
data/lib/cloud_crowd/node.rb
ADDED
@@ -0,0 +1,105 @@
+module CloudCrowd
+
+  # A Node is a Sinatra/Thin application that runs a single instance per machine.
+  # It registers with the central server, receives WorkUnits, and forks off
+  # Workers to process them. The actions are:
+  #
+  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
+  # [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
+  class Node < Sinatra::Default
+
+    # A Node's default port. You only run a single node per machine, so they
+    # can all use the same port without any problems.
+    DEFAULT_PORT = 9063
+
+    attr_reader :server, :asset_store
+
+    set :root, ROOT
+    set :authorization_realm, "CloudCrowd"
+
+    helpers Helpers
+
+    # methodoverride allows the _method param.
+    enable :methodoverride
+
+    # Enabling HTTP Authentication turns it on for all requests.
+    # This works the same way as in the central CloudCrowd::Server.
+    before do
+      login_required if CloudCrowd.config[:http_authentication]
+    end
+
+    # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
+    # /heartbeat to make sure it's still online.
+    get '/heartbeat' do
+      "buh-bump"
+    end
+
+    # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
+    post '/work' do
+      pid = fork { Worker.new(self, JSON.parse(params[:work_unit])) }
+      Process.detach(pid)
+      json :pid => pid
+    end
+
+    # Creating a Node registers with the central server and starts listening for
+    # incoming WorkUnits.
+    def initialize(port=DEFAULT_PORT)
+      require 'json'
+      @server          = CloudCrowd.central_server
+      @host            = Socket.gethostname
+      @enabled_actions = CloudCrowd.actions.keys
+      @asset_store     = AssetStore.new
+      @port            = port || DEFAULT_PORT
+
+      trap_signals
+      start_server
+      check_in
+      @server_thread.join
+    end
+
+    # Checking in with the central server informs it of the location and
+    # configuration of this Node. If it can't check in, there's no point in
+    # starting.
+    def check_in
+      @server["/node/#{@host}"].put(
+        :port            => @port,
+        :max_workers     => CloudCrowd.config[:max_workers],
+        :enabled_actions => @enabled_actions.join(',')
+      )
+    rescue Errno::ECONNREFUSED
+      puts "Failed to connect to the central server (#{@server.to_s}), exiting..."
+      raise SystemExit
+    end
+
+    # Before exiting, the Node checks out with the central server, releasing all
+    # of its WorkUnits for other Nodes to handle.
+    def check_out
+      @server["/node/#{@host}"].delete
+    end
+
+
+    private
+
+    # Launch the Node's Thin server in a separate thread because it blocks.
+    def start_server
+      @server_thread = Thread.new do
+        Thin::Server.start('0.0.0.0', @port, self, :signals => false)
+      end
+    end
+
+    # Trap exit signals in order to shut down cleanly.
+    def trap_signals
+      Signal.trap('INT')  { shut_down }
+      Signal.trap('KILL') { shut_down }
+      Signal.trap('TERM') { shut_down }
+    end
+
+    # At shut down, de-register with the central server before exiting.
+    def shut_down
+      check_out
+      Process.exit
+    end
+
+  end
+
+end
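
From the Node's side, the whole registration handshake in check_in is one HTTP PUT. A standalone sketch of that request, assuming HTTP authentication is off; the central server URL and the parameter values are illustrative (9063 is the DEFAULT_PORT from the diff above):

    require 'rest_client'
    require 'socket'

    # Hypothetical stand-in for Node#check_in: announce this machine's
    # location and capabilities to the central server.
    central = RestClient::Resource.new('http://localhost:9173')  # assumed server URL

    central["/node/#{Socket.gethostname}"].put(
      :port            => 9063,
      :max_workers     => 5,
      :enabled_actions => 'word_count,process_pdfs'
    )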
data/lib/cloud_crowd/schema.rb
CHANGED
@@ -1,5 +1,5 @@
 # Complete schema for CloudCrowd.
-ActiveRecord::Schema.define(:version =>
+ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
 
   create_table "jobs", :force => true do |t|
     t.integer  "status", :null => false
@@ -10,7 +10,16 @@ ActiveRecord::Schema.define(:version => 1) do
     t.float    "time"
     t.string   "callback_url"
     t.string   "email"
-    t.
+    t.datetime "created_at"
+    t.datetime "updated_at"
+  end
+
+  create_table "node_records", :force => true do |t|
+    t.string   "host",            :null => false
+    t.string   "ip_address",      :null => false
+    t.integer  "port",            :null => false
+    t.string   "enabled_actions", :default => '', :null => false
+    t.integer  "max_workers"
     t.datetime "created_at"
     t.datetime "updated_at"
   end
@@ -20,26 +29,21 @@ ActiveRecord::Schema.define(:version => 1) do
     t.integer  "job_id", :null => false
     t.text     "input",  :null => false
     t.string   "action", :null => false
-    t.integer  "attempts",
-    t.integer  "
-    t.integer  "
+    t.integer  "attempts",       :default => 0, :null => false
+    t.integer  "node_record_id"
+    t.integer  "worker_pid"
+    t.integer  "reservation"
     t.float    "time"
     t.text     "output"
     t.datetime "created_at"
     t.datetime "updated_at"
   end
-
-  create_table "worker_records", :force => true do |t|
-    t.string   "name",          :null => false
-    t.string   "thread_status", :null => false
-    t.datetime "created_at"
-    t.datetime "updated_at"
-  end
-
-  add_index "jobs", ["status"], :name => "index_jobs_on_status"
-  add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
-  add_index "work_units", ["status", "worker_record_id", "action"], :name => "index_work_units_on_status_and_worker_record_id_and_action"
-  add_index "worker_records", ["name"], :name => "index_worker_records_on_name"
-  add_index "worker_records", ["updated_at"], :name => "index_worker_records_on_updated_at"
 
+  # Here be indices. After looking, it seems faster not to have them at all.
+  #
+  # add_index "jobs", ["status"], :name => "index_jobs_on_status"
+  # add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
+  # add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
+  # add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
+  # add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
 end
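
Since the schema version now comes from CloudCrowd::SCHEMA_VERSION, bootstrapping a database amounts to evaluating this file inside an established ActiveRecord connection. A minimal sketch, assuming SQLite and a local checkout; the adapter, database name, constant value, and load path are assumptions for illustration:

    require 'active_record'

    module CloudCrowd
      SCHEMA_VERSION = 2 unless defined?(SCHEMA_VERSION)  # illustrative; the gem defines the real value
    end

    ActiveRecord::Base.establish_connection(
      :adapter  => 'sqlite3',
      :database => 'cloud_crowd.db'
    )

    # Evaluates the ActiveRecord::Schema.define block above, creating the
    # jobs, node_records, and work_units tables.
    load 'lib/cloud_crowd/schema.rb'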
data/lib/cloud_crowd/{app.rb → server.rb}
CHANGED
@@ -5,6 +5,7 @@ module CloudCrowd
   # == Admin
   # [get /] Render the admin console, with a progress meter for running jobs.
   # [get /status] Get the combined JSON of every active job and worker.
+  # [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
   # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
   #
   # == Public API
@@ -13,10 +14,10 @@ module CloudCrowd
   # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
   #
   # == Internal Workers API
-  # [
+  # [put /node/:host] Registers a new Node, making it available for processing.
+  # [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
   # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
-
-  class App < Sinatra::Default
+  class Server < Sinatra::Default
 
     set :root, ROOT
     set :authorization_realm, "CloudCrowd"
@@ -28,12 +29,12 @@ module CloudCrowd
 
     # Enabling HTTP Authentication turns it on for all requests.
     before do
-      login_required if CloudCrowd.config[:
+      login_required if CloudCrowd.config[:http_authentication]
     end
 
     # Render the admin console.
     get '/' do
-      erb :
+      erb :operations_center
     end
 
     # Get the JSON for every active job in the queue and every active worker
@@ -42,15 +43,14 @@ module CloudCrowd
     get '/status' do
       json(
         'jobs'            => Job.incomplete,
-        '
+        'nodes'           => NodeRecord.all(:order => 'host desc'),
         'work_unit_count' => WorkUnit.incomplete.count
       )
     end
 
-    # Get the JSON for a worker
+    # Get the JSON for what a worker is up to.
     get '/worker/:name' do
-
-      json((record && record.work_unit) || {})
+      json WorkUnit.find_by_worker_name(params[:name]) || {}
     end
 
     # To monitor the central server with Monit, God, Nagios, or another
@@ -62,8 +62,11 @@ module CloudCrowd
     # PUBLIC API:
 
     # Start a new job. Accepts a JSON representation of the job-to-be.
+    # Distributes all work units to available nodes.
     post '/jobs' do
-
+      job = Job.create_from_request(JSON.parse(params[:job]))
+      WorkUnit.distribute_to_nodes
+      json job
     end
 
     # Check the status of a job, returning the output if finished, and the
@@ -79,36 +82,33 @@ module CloudCrowd
       json nil
     end
 
-    # INTERNAL
+    # INTERNAL NODE API:
 
-    #
-    #
-
-
+    # A new Node will hit this action to register its location and
+    # configuration with the central server. Triggers distribution of WorkUnits.
+    put '/node/:host' do
+      NodeRecord.check_in(params, request)
+      WorkUnit.distribute_to_nodes
+      json nil
+    end
+
+    # Deregisters a Node from the central server. Releases and redistributes any
+    # WorkUnits it may have had checked out.
+    delete '/node/:host' do
+      NodeRecord.destroy_all(:host => params[:host])
+      json nil
     end
 
     # When workers are done with their unit, either successfully or in failure,
-    # they mark it back on the central server and
-    #
+    # they mark it back on the central server and exit. Triggers distribution
+    # of pending work units.
     put '/work/:work_unit_id' do
-
-
-
-      json dequeue_work_unit
-    when 'failed'
-      current_work_unit.fail(params[:output], params[:time])
-      json dequeue_work_unit(1)
-    else
-      error(500, "Completing a work unit must specify status.")
-    end
+      case params[:status]
+      when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
+      when 'failed'    then current_work_unit.fail(params[:output], params[:time])
+      else                  error(500, "Completing a work unit must specify status.")
       end
-
-
-    # Every so often workers check in to let the central server know that
-    # they're still alive. Keep up-to-date records
-    put '/worker' do
-      params[:terminated] ? WorkerRecord.check_out(params) : WorkerRecord.check_in(params)
+      WorkUnit.distribute_to_nodes
       json nil
     end
 
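
Putting the new routes together, a client round-trip against the central server looks like the sketch below. The server URL and job payload are illustrative (word_count is one of the gem's bundled example actions), and the id field is assumed to be part of the returned Job JSON:

    require 'rest_client'
    require 'json'

    server = RestClient::Resource.new('http://localhost:9173')  # assumed server address

    # POST /jobs creates the Job and immediately tries to distribute its
    # WorkUnits to any registered Nodes.
    job = JSON.parse(server['/jobs'].post(:job => {
      'action' => 'word_count',
      'inputs' => ['http://www.gutenberg.org/files/1342/1342-0.txt']
    }.to_json))

    # GET /jobs/:job_id reports status, and includes the output once finished.
    puts server["/jobs/#{job['id']}"].get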