cloud-crowd 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +16 -16
- data/cloud-crowd.gemspec +10 -9
- data/config/config.example.ru +8 -2
- data/config/config.example.yml +21 -25
- data/examples/process_pdfs_example.rb +1 -1
- data/examples/word_count_example.rb +1 -0
- data/lib/cloud-crowd.rb +47 -28
- data/lib/cloud_crowd/action.rb +14 -8
- data/lib/cloud_crowd/asset_store.rb +8 -8
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
- data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
- data/lib/cloud_crowd/command_line.rb +24 -58
- data/lib/cloud_crowd/exceptions.rb +7 -0
- data/lib/cloud_crowd/helpers/authorization.rb +5 -3
- data/lib/cloud_crowd/helpers/resources.rb +0 -20
- data/lib/cloud_crowd/models.rb +1 -1
- data/lib/cloud_crowd/models/job.rb +37 -40
- data/lib/cloud_crowd/models/node_record.rb +95 -0
- data/lib/cloud_crowd/models/work_unit.rb +87 -33
- data/lib/cloud_crowd/node.rb +105 -0
- data/lib/cloud_crowd/schema.rb +22 -18
- data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
- data/lib/cloud_crowd/worker.rb +68 -107
- data/public/css/admin_console.css +40 -18
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/js/admin_console.js +47 -18
- data/test/acceptance/test_failing_work_units.rb +1 -1
- data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
- data/test/acceptance/test_word_count.rb +3 -9
- data/test/blueprints.rb +0 -1
- data/test/config/config.ru +1 -1
- data/test/config/config.yml +2 -4
- data/test/unit/test_action.rb +1 -1
- data/test/unit/test_configuration.rb +1 -1
- data/test/unit/test_job.rb +3 -0
- data/test/unit/test_work_unit.rb +2 -4
- data/views/{index.erb → operations_center.erb} +13 -8
- metadata +11 -10
- data/lib/cloud_crowd/daemon.rb +0 -95
- data/lib/cloud_crowd/models/worker_record.rb +0 -61
- data/lib/cloud_crowd/runner.rb +0 -15
data/lib/cloud_crowd/worker.rb
CHANGED
@@ -1,90 +1,48 @@
|
|
1
1
|
module CloudCrowd
|
2
2
|
|
3
|
-
# The Worker,
|
4
|
-
#
|
5
|
-
#
|
6
|
-
# actions directory). If communication with the central server is
|
7
|
-
# the
|
8
|
-
# Worker::RETRY_WAIT seconds. Any exceptions that take place during
|
3
|
+
# The Worker, forked off from the Node when a new WorkUnit is received,
|
4
|
+
# launches an Action for processing. Workers will only ever receive WorkUnits
|
5
|
+
# that they are able to handle (for which they have a corresponding action in
|
6
|
+
# their actions directory). If communication with the central server is
|
7
|
+
# interrupted, the Worker will repeatedly attempt to complete its unit --
|
8
|
+
# every Worker::RETRY_WAIT seconds. Any exceptions that take place during
|
9
9
|
# the course of the Action will cause the Worker to mark the WorkUnit as
|
10
|
-
# having failed.
|
10
|
+
# having failed. When finished, the Worker's process exits, minimizing the
|
11
|
+
# potential for memory leaks.
|
11
12
|
class Worker
|
12
13
|
|
13
|
-
# The time between worker check-ins with the central server, informing
|
14
|
-
# it of the current status, and simply that it's still alive.
|
15
|
-
CHECK_IN_INTERVAL = 60
|
16
|
-
|
17
14
|
# Wait five seconds to retry, after internal communication errors.
|
18
15
|
RETRY_WAIT = 5
|
19
16
|
|
20
17
|
attr_reader :action
|
21
18
|
|
22
|
-
#
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
@store = AssetStore.new
|
30
|
-
@server = CloudCrowd.central_server
|
31
|
-
@enabled_actions = CloudCrowd.actions.keys
|
32
|
-
log 'started'
|
19
|
+
# A new Worker begins processing its WorkUnit straight off.
|
20
|
+
def initialize(node, work_unit)
|
21
|
+
@pid = $$
|
22
|
+
@node = node
|
23
|
+
trap_signals
|
24
|
+
setup_work_unit(work_unit)
|
25
|
+
run
|
33
26
|
end
|
34
27
|
|
35
|
-
#
|
36
|
-
def fetch_work_unit
|
37
|
-
keep_trying_to "fetch a new work unit" do
|
38
|
-
unit_json = @server['/work'].post(base_params)
|
39
|
-
setup_work_unit(unit_json)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
# Return output to the central server, marking the current work unit as done.
|
28
|
+
# Return output to the central server, marking the WorkUnit done.
|
44
29
|
def complete_work_unit(result)
|
45
30
|
keep_trying_to "complete work unit" do
|
46
|
-
data =
|
47
|
-
|
31
|
+
data = base_params.merge({:status => 'succeeded', :output => result})
|
32
|
+
@node.server["/work/#{data[:id]}"].put(data)
|
48
33
|
log "finished #{display_work_unit} in #{data[:time]} seconds"
|
49
|
-
clear_work_unit
|
50
|
-
setup_work_unit(unit_json)
|
51
34
|
end
|
52
35
|
end
|
53
36
|
|
54
|
-
# Mark the
|
37
|
+
# Mark the WorkUnit failed, returning the exception to central.
|
55
38
|
def fail_work_unit(exception)
|
56
39
|
keep_trying_to "mark work unit as failed" do
|
57
|
-
data =
|
58
|
-
|
40
|
+
data = base_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
|
41
|
+
@node.server["/work/#{data[:id]}"].put(data)
|
59
42
|
log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
|
60
|
-
clear_work_unit
|
61
|
-
setup_work_unit(unit_json)
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
# Check in with the central server. Let it know the condition of the work
|
66
|
-
# thread, the action and status we're processing, and our hostname and PID.
|
67
|
-
def check_in(thread_status)
|
68
|
-
keep_trying_to "check in with central" do
|
69
|
-
@server["/worker"].put({
|
70
|
-
:name => @name,
|
71
|
-
:thread_status => thread_status
|
72
|
-
})
|
73
43
|
end
|
74
44
|
end
|
75
45
|
|
76
|
-
# Inform the central server that this worker is finished. This is the only
|
77
|
-
# remote method that doesn't retry on connection errors -- if the worker
|
78
|
-
# can't connect to the central server while it's trying to shutdown, it
|
79
|
-
# should close, regardless.
|
80
|
-
def check_out
|
81
|
-
@server["/worker"].put({
|
82
|
-
:name => @name,
|
83
|
-
:terminated => true
|
84
|
-
})
|
85
|
-
log 'exiting'
|
86
|
-
end
|
87
|
-
|
88
46
|
# We expect and require internal communication between the central server
|
89
47
|
# and the workers to succeed. If it fails for any reason, log it, and then
|
90
48
|
# keep trying the same request.
|
@@ -100,36 +58,38 @@ module CloudCrowd
|
|
100
58
|
end
|
101
59
|
end
|
102
60
|
|
103
|
-
#
|
104
|
-
def has_work?
|
105
|
-
@action_name && @input && @options
|
106
|
-
end
|
107
|
-
|
108
|
-
# Loggable string of the current work unit.
|
61
|
+
# Loggable details describing what the Worker is up to.
|
109
62
|
def display_work_unit
|
110
|
-
"unit ##{@options['work_unit_id']} (#{@action_name})"
|
63
|
+
"unit ##{@options['work_unit_id']} (#{@action_name}/#{CloudCrowd.display_status(@status)})"
|
111
64
|
end
|
112
65
|
|
113
|
-
# Executes the
|
66
|
+
# Executes the WorkUnit by running the Action, catching all exceptions as
|
67
|
+
# failures. We capture the thread so that we can kill it from the outside,
|
68
|
+
# when exiting.
|
114
69
|
def run_work_unit
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
70
|
+
@worker_thread = Thread.new do
|
71
|
+
begin
|
72
|
+
result = nil
|
73
|
+
@action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @node.asset_store)
|
74
|
+
Dir.chdir(@action.work_directory) do
|
75
|
+
result = case @status
|
76
|
+
when PROCESSING then @action.process
|
77
|
+
when SPLITTING then @action.split
|
78
|
+
when MERGING then @action.merge
|
79
|
+
else raise Error::StatusUnspecified, "work units must specify their status"
|
80
|
+
end
|
124
81
|
end
|
82
|
+
complete_work_unit({'output' => result}.to_json)
|
83
|
+
rescue Exception => e
|
84
|
+
fail_work_unit(e)
|
85
|
+
ensure
|
86
|
+
@action.cleanup_work_directory
|
125
87
|
end
|
126
|
-
complete_work_unit({'output' => result}.to_json)
|
127
|
-
rescue Exception => e
|
128
|
-
fail_work_unit(e)
|
129
88
|
end
|
89
|
+
@worker_thread.join
|
130
90
|
end
|
131
91
|
|
132
|
-
# Wraps
|
92
|
+
# Wraps run_work_unit to benchmark the execution time, if requested.
|
133
93
|
def run
|
134
94
|
return run_work_unit unless @options['benchmark']
|
135
95
|
status = CloudCrowd.display_status(@status)
|
@@ -139,27 +99,17 @@ module CloudCrowd
|
|
139
99
|
|
140
100
|
private
|
141
101
|
|
142
|
-
# Common parameters to send back to central.
|
143
|
-
def base_params
|
144
|
-
@base_params ||= {
|
145
|
-
:worker_name => @name,
|
146
|
-
:worker_actions => @enabled_actions.join(',')
|
147
|
-
}
|
148
|
-
end
|
149
|
-
|
150
102
|
# Common parameters to send back to central upon unit completion,
|
151
103
|
# regardless of success or failure.
|
152
|
-
def
|
153
|
-
|
154
|
-
:id
|
155
|
-
:time
|
156
|
-
})
|
104
|
+
def base_params
|
105
|
+
{ :pid => @pid,
|
106
|
+
:id => @options['work_unit_id'],
|
107
|
+
:time => Time.now - @start_time }
|
157
108
|
end
|
158
109
|
|
159
|
-
# Extract
|
160
|
-
def setup_work_unit(
|
161
|
-
return false unless
|
162
|
-
unit = JSON.parse(unit_json)
|
110
|
+
# Extract the Worker's instance variables from a WorkUnit's JSON.
|
111
|
+
def setup_work_unit(unit)
|
112
|
+
return false unless unit
|
163
113
|
@start_time = Time.now
|
164
114
|
@action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
|
165
115
|
@options['job_id'] = unit['job_id']
|
@@ -171,14 +121,25 @@ module CloudCrowd
|
|
171
121
|
|
172
122
|
# Log a message to the daemon log. Includes PID for identification.
|
173
123
|
def log(message)
|
174
|
-
puts "Worker ##{@
|
124
|
+
puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
|
175
125
|
end
|
176
126
|
|
177
|
-
# When
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
127
|
+
# When signaled to exit, make sure that the Worker shuts down cleanly.
|
128
|
+
def trap_signals
|
129
|
+
Signal.trap('INT') { shut_down }
|
130
|
+
Signal.trap('KILL') { shut_down }
|
131
|
+
Signal.trap('TERM') { shut_down }
|
132
|
+
end
|
133
|
+
|
134
|
+
# Force the Worker to quit, even if it's in the middle of processing.
|
135
|
+
# If it had a checked-out WorkUnit, the Node should have released it on
|
136
|
+
# the central server already.
|
137
|
+
def shut_down
|
138
|
+
if @worker_thread
|
139
|
+
@worker_thread.kill
|
140
|
+
@worker_thread.kill! if @worker_thread.alive?
|
141
|
+
end
|
142
|
+
Process.exit
|
182
143
|
end
|
183
144
|
|
184
145
|
end
|
@@ -134,44 +134,57 @@ body {
|
|
134
134
|
}
|
135
135
|
#sidebar_header {
|
136
136
|
position: absolute;
|
137
|
+
width: 250px;
|
137
138
|
top: 5px; left: 8px;
|
138
139
|
color: #404040;
|
139
140
|
text-shadow: 0px 1px 1px #eee;
|
140
141
|
}
|
141
|
-
#sidebar_header.
|
142
|
-
#sidebar_header .
|
142
|
+
#sidebar_header.no_nodes .no_nodes,
|
143
|
+
#sidebar_header .has_nodes {
|
143
144
|
display: block;
|
144
145
|
}
|
145
|
-
#sidebar_header .
|
146
|
-
#sidebar_header.
|
146
|
+
#sidebar_header .no_nodes,
|
147
|
+
#sidebar_header.no_nodes .has_nodes {
|
147
148
|
display: none;
|
148
149
|
}
|
149
|
-
#
|
150
|
+
#nodes {
|
150
151
|
position: absolute;
|
151
152
|
padding: 2px 0;
|
152
153
|
top: 21px; left: 0; bottom: 21px;
|
153
154
|
width: 298px;
|
154
155
|
overflow-y: auto; overflow-x: hidden;
|
155
156
|
}
|
156
|
-
#
|
157
|
+
#nodes .node, #nodes .worker {
|
157
158
|
border: 1px solid transparent;
|
158
159
|
margin: 1px 7px;
|
159
160
|
padding-left: 18px;
|
160
|
-
|
161
|
-
|
162
|
-
background: url(/images/bullet_white.png) no-repeat left center;
|
163
|
-
cursor: pointer;
|
161
|
+
background-position: left center;
|
162
|
+
background-repeat: no-repeat;
|
164
163
|
}
|
165
|
-
#
|
166
|
-
|
167
|
-
|
168
|
-
background: url(/images/
|
164
|
+
#nodes .node {
|
165
|
+
font-size: 11px;
|
166
|
+
line-height: 22px;
|
167
|
+
background-image: url(/images/server.png);
|
169
168
|
}
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
169
|
+
#nodes .node.busy {
|
170
|
+
background-image: url(/images/server_busy.png);
|
171
|
+
}
|
172
|
+
#nodes .node.busy span.busy {
|
173
|
+
font-size: 9px;
|
174
|
+
color: #7f7f7f;
|
175
|
+
text-transform: uppercase;
|
176
|
+
}
|
177
|
+
#nodes .worker {
|
178
|
+
font-size: 10px;
|
179
|
+
line-height: 18px;
|
180
|
+
cursor: pointer;
|
181
|
+
background-image: url(/images/bullet_green.png);
|
174
182
|
}
|
183
|
+
#nodes .worker:hover {
|
184
|
+
border: 1px solid #aaa;
|
185
|
+
border-radius: 4px; -moz-border-radius: 4px; -webkit-border-radius: 4px;
|
186
|
+
background-color: #ccc;
|
187
|
+
}
|
175
188
|
|
176
189
|
#worker_info {
|
177
190
|
position: absolute;
|
@@ -216,6 +229,15 @@ body {
|
|
216
229
|
text-shadow: 0px 1px 1px #eee;
|
217
230
|
margin-bottom: 10px;
|
218
231
|
}
|
232
|
+
.legend_box {
|
233
|
+
display: inline-block;
|
234
|
+
width: 10px; height: 10px;
|
235
|
+
border: 1px solid #bbb;
|
236
|
+
position: relative;
|
237
|
+
top: 1px;
|
238
|
+
margin: 0 1px;
|
239
|
+
background-color: #a1003d;
|
240
|
+
}
|
219
241
|
.graph {
|
220
242
|
height: 150px;
|
221
243
|
}
|
Binary file
|
Binary file
|
data/public/js/admin_console.js
CHANGED
@@ -22,26 +22,30 @@ window.Console = {
|
|
22
22
|
|
23
23
|
// All options for drawing the system graphs.
|
24
24
|
GRAPH_OPTIONS : {
|
25
|
-
xaxis
|
26
|
-
yaxis
|
27
|
-
legend
|
28
|
-
grid
|
25
|
+
xaxis : {mode : 'time', timeformat : '%M:%S'},
|
26
|
+
yaxis : {tickDecimals : 0},
|
27
|
+
legend : {show : false},
|
28
|
+
grid : {backgroundColor : '#7f7f7f', color : '#555', tickColor : '#666', borderWidth : 2}
|
29
29
|
},
|
30
30
|
JOBS_COLOR : '#db3a0f',
|
31
|
-
|
31
|
+
NODES_COLOR : '#1870ab',
|
32
|
+
WORKERS_COLOR : '#45a4e5',
|
32
33
|
WORK_UNITS_COLOR : '#ffba14',
|
33
34
|
|
34
35
|
// Starting the console begins polling the server.
|
35
36
|
initialize : function() {
|
36
37
|
this._jobsHistory = [];
|
38
|
+
this._nodesHistory = [];
|
37
39
|
this._workersHistory = [];
|
38
40
|
this._workUnitsHistory = [];
|
39
|
-
this._histories = [this._jobsHistory, this._workersHistory, this._workUnitsHistory];
|
41
|
+
this._histories = [this._jobsHistory, this._nodesHistory, this._workersHistory, this._workUnitsHistory];
|
40
42
|
this._queue = $('#jobs');
|
41
43
|
this._workerInfo = $('#worker_info');
|
42
44
|
this._disconnected = $('#disconnected');
|
43
45
|
$(window).bind('resize', Console.renderGraphs);
|
44
|
-
$('#
|
46
|
+
$('#nodes .worker').live('click', Console.getWorkerInfo);
|
47
|
+
$('#workers_legend').css({background : this.WORKERS_COLOR});
|
48
|
+
$('#nodes_legend').css({background : this.NODES_COLOR});
|
45
49
|
this.getStatus();
|
46
50
|
$.each(this.PRELOAD_IMAGES, function(){ var i = new Image(); i.src = this; });
|
47
51
|
},
|
@@ -51,13 +55,14 @@ window.Console = {
|
|
51
55
|
getStatus : function() {
|
52
56
|
$.ajax({url : '/status', dataType : 'json', success : function(resp) {
|
53
57
|
Console._jobs = resp.jobs;
|
54
|
-
Console.
|
58
|
+
Console._nodes = resp.nodes;
|
55
59
|
Console._workUnitCount = resp.work_unit_count;
|
60
|
+
Console._workerCount = Console.countWorkers();
|
56
61
|
Console.recordDataPoint();
|
57
62
|
if (Console._disconnected.is(':visible')) Console._disconnected.fadeOut(Console.ANIMATION_SPEED);
|
58
63
|
$('#queue').toggleClass('no_jobs', Console._jobs.length <= 0);
|
59
64
|
Console.renderJobs();
|
60
|
-
Console.
|
65
|
+
Console.renderNodes();
|
61
66
|
Console.renderGraphs();
|
62
67
|
setTimeout(Console.getStatus, Console.POLL_INTERVAL);
|
63
68
|
}, error : function(request, status, errorThrown) {
|
@@ -66,6 +71,13 @@ window.Console = {
|
|
66
71
|
}});
|
67
72
|
},
|
68
73
|
|
74
|
+
// Count the total number of workers in the current list of nodes.
|
75
|
+
countWorkers : function() {
|
76
|
+
var sum = 0;
|
77
|
+
for (var i=0; i < this._nodes.length; i++) sum += this._nodes[i].workers.length;
|
78
|
+
return sum;
|
79
|
+
},
|
80
|
+
|
69
81
|
// Render an individual job afresh.
|
70
82
|
renderJob : function(job) {
|
71
83
|
this._queue.append('<div class="job" id="job_' + job.id + '" style="width:' + job.width + '%; background: #' + job.color + ';"><div class="completion ' + (job.percent_complete <= 0 ? 'zero' : '') + '" style="width:' + job.percent_complete + '%;"></div><div class="percent_complete">' + job.percent_complete + '%</div><div class="job_id">#' + job.id + '</div></div>');
|
@@ -105,12 +117,21 @@ window.Console = {
|
|
105
117
|
},
|
106
118
|
|
107
119
|
// Re-render all workers from scratch each time.
|
108
|
-
|
120
|
+
// This method is desperately in need of Javascript templates...
|
121
|
+
renderNodes : function() {
|
109
122
|
var header = $('#sidebar_header');
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
123
|
+
var nc = this._nodes.length, wc = this._workerCount;
|
124
|
+
$('.has_nodes', header).html(nc + " Node" + (nc != 1 ? 's' : '') + " / " + wc + " Worker" + (wc != 1 ? 's' : ''));
|
125
|
+
header.toggleClass('no_nodes', this._nodes.length <= 0);
|
126
|
+
$('#nodes').html($.map(this._nodes, function(node) {
|
127
|
+
var html = "";
|
128
|
+
var extra = node.status == 'busy' ? ' <span class="busy">[busy]</span>' : '';
|
129
|
+
html += '<div class="node ' + node.status + '">' + node.host + extra + '</div>';
|
130
|
+
html += $.map(node.workers, function(pid) {
|
131
|
+
var name = pid + '@' + node.host;
|
132
|
+
return '<div class="worker" rel="' + name + '">' + name + '</div>';
|
133
|
+
}).join('');
|
134
|
+
return html;
|
114
135
|
}).join(''));
|
115
136
|
},
|
116
137
|
|
@@ -118,7 +139,8 @@ window.Console = {
|
|
118
139
|
recordDataPoint : function() {
|
119
140
|
var timestamp = (new Date()).getTime();
|
120
141
|
this._jobsHistory.push([timestamp, this._jobs.length]);
|
121
|
-
this.
|
142
|
+
this._nodesHistory.push([timestamp, this._nodes.length]);
|
143
|
+
this._workersHistory.push([timestamp, this._workerCount]);
|
122
144
|
this._workUnitsHistory.push([timestamp, this._workUnitCount]);
|
123
145
|
$.each(this._histories, function() {
|
124
146
|
if (this.length > Console.MAX_DATA_POINTS) this.shift();
|
@@ -127,9 +149,16 @@ window.Console = {
|
|
127
149
|
|
128
150
|
// Convert our recorded data points into a format Flot can understand.
|
129
151
|
renderGraphs : function() {
|
130
|
-
$.plot($('#
|
131
|
-
|
132
|
-
|
152
|
+
$.plot($('#work_units_graph'), [
|
153
|
+
{label : 'Work Units in Queue', color : Console.WORK_UNITS_COLOR, data : Console._workUnitsHistory}
|
154
|
+
], Console.GRAPH_OPTIONS);
|
155
|
+
$.plot($('#jobs_graph'), [
|
156
|
+
{label : 'Jobs in Queue', color : Console.JOBS_COLOR, data : Console._jobsHistory}
|
157
|
+
], Console.GRAPH_OPTIONS);
|
158
|
+
$.plot($('#workers_graph'), [
|
159
|
+
{label : 'Nodes', color : Console.NODES_COLOR, data : Console._nodesHistory},
|
160
|
+
{label : 'Workers', color : Console.WORKERS_COLOR, data : Console._workersHistory}
|
161
|
+
], Console.GRAPH_OPTIONS);
|
133
162
|
},
|
134
163
|
|
135
164
|
// Request the Worker info from the central server.
|