cloud-crowd 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +16 -16
- data/cloud-crowd.gemspec +10 -9
- data/config/config.example.ru +8 -2
- data/config/config.example.yml +21 -25
- data/examples/process_pdfs_example.rb +1 -1
- data/examples/word_count_example.rb +1 -0
- data/lib/cloud-crowd.rb +47 -28
- data/lib/cloud_crowd/action.rb +14 -8
- data/lib/cloud_crowd/asset_store.rb +8 -8
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
- data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
- data/lib/cloud_crowd/command_line.rb +24 -58
- data/lib/cloud_crowd/exceptions.rb +7 -0
- data/lib/cloud_crowd/helpers/authorization.rb +5 -3
- data/lib/cloud_crowd/helpers/resources.rb +0 -20
- data/lib/cloud_crowd/models.rb +1 -1
- data/lib/cloud_crowd/models/job.rb +37 -40
- data/lib/cloud_crowd/models/node_record.rb +95 -0
- data/lib/cloud_crowd/models/work_unit.rb +87 -33
- data/lib/cloud_crowd/node.rb +105 -0
- data/lib/cloud_crowd/schema.rb +22 -18
- data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
- data/lib/cloud_crowd/worker.rb +68 -107
- data/public/css/admin_console.css +40 -18
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/js/admin_console.js +47 -18
- data/test/acceptance/test_failing_work_units.rb +1 -1
- data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
- data/test/acceptance/test_word_count.rb +3 -9
- data/test/blueprints.rb +0 -1
- data/test/config/config.ru +1 -1
- data/test/config/config.yml +2 -4
- data/test/unit/test_action.rb +1 -1
- data/test/unit/test_configuration.rb +1 -1
- data/test/unit/test_job.rb +3 -0
- data/test/unit/test_work_unit.rb +2 -4
- data/views/{index.erb → operations_center.erb} +13 -8
- metadata +11 -10
- data/lib/cloud_crowd/daemon.rb +0 -95
- data/lib/cloud_crowd/models/worker_record.rb +0 -61
- data/lib/cloud_crowd/runner.rb +0 -15
data/lib/cloud_crowd/worker.rb
CHANGED
@@ -1,90 +1,48 @@
|
|
1
1
|
module CloudCrowd
|
2
2
|
|
3
|
-
# The Worker,
|
4
|
-
#
|
5
|
-
#
|
6
|
-
# actions directory). If communication with the central server is
|
7
|
-
# the
|
8
|
-
# Worker::RETRY_WAIT seconds. Any exceptions that take place during
|
3
|
+
# The Worker, forked off from the Node when a new WorkUnit is received,
|
4
|
+
# launches an Action for processing. Workers will only ever receive WorkUnits
|
5
|
+
# that they are able to handle (for which they have a corresponding action in
|
6
|
+
# their actions directory). If communication with the central server is
|
7
|
+
# interrupted, the Worker will repeatedly attempt to complete its unit --
|
8
|
+
# every Worker::RETRY_WAIT seconds. Any exceptions that take place during
|
9
9
|
# the course of the Action will cause the Worker to mark the WorkUnit as
|
10
|
-
# having failed.
|
10
|
+
# having failed. When finished, the Worker's process exits, minimizing the
|
11
|
+
# potential for memory leaks.
|
11
12
|
class Worker
|
12
13
|
|
13
|
-
# The time between worker check-ins with the central server, informing
|
14
|
-
# it of the current status, and simply that it's still alive.
|
15
|
-
CHECK_IN_INTERVAL = 60
|
16
|
-
|
17
14
|
# Wait five seconds to retry, after internal communication errors.
|
18
15
|
RETRY_WAIT = 5
|
19
16
|
|
20
17
|
attr_reader :action
|
21
18
|
|
22
|
-
#
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
@store = AssetStore.new
|
30
|
-
@server = CloudCrowd.central_server
|
31
|
-
@enabled_actions = CloudCrowd.actions.keys
|
32
|
-
log 'started'
|
19
|
+
# A new Worker begins processing its WorkUnit straight off.
|
20
|
+
def initialize(node, work_unit)
|
21
|
+
@pid = $$
|
22
|
+
@node = node
|
23
|
+
trap_signals
|
24
|
+
setup_work_unit(work_unit)
|
25
|
+
run
|
33
26
|
end
|
34
27
|
|
35
|
-
#
|
36
|
-
def fetch_work_unit
|
37
|
-
keep_trying_to "fetch a new work unit" do
|
38
|
-
unit_json = @server['/work'].post(base_params)
|
39
|
-
setup_work_unit(unit_json)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
# Return output to the central server, marking the current work unit as done.
|
28
|
+
# Return output to the central server, marking the WorkUnit done.
|
44
29
|
def complete_work_unit(result)
|
45
30
|
keep_trying_to "complete work unit" do
|
46
|
-
data =
|
47
|
-
|
31
|
+
data = base_params.merge({:status => 'succeeded', :output => result})
|
32
|
+
@node.server["/work/#{data[:id]}"].put(data)
|
48
33
|
log "finished #{display_work_unit} in #{data[:time]} seconds"
|
49
|
-
clear_work_unit
|
50
|
-
setup_work_unit(unit_json)
|
51
34
|
end
|
52
35
|
end
|
53
36
|
|
54
|
-
# Mark the
|
37
|
+
# Mark the WorkUnit failed, returning the exception to central.
|
55
38
|
def fail_work_unit(exception)
|
56
39
|
keep_trying_to "mark work unit as failed" do
|
57
|
-
data =
|
58
|
-
|
40
|
+
data = base_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
|
41
|
+
@node.server["/work/#{data[:id]}"].put(data)
|
59
42
|
log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
|
60
|
-
clear_work_unit
|
61
|
-
setup_work_unit(unit_json)
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
# Check in with the central server. Let it know the condition of the work
|
66
|
-
# thread, the action and status we're processing, and our hostname and PID.
|
67
|
-
def check_in(thread_status)
|
68
|
-
keep_trying_to "check in with central" do
|
69
|
-
@server["/worker"].put({
|
70
|
-
:name => @name,
|
71
|
-
:thread_status => thread_status
|
72
|
-
})
|
73
43
|
end
|
74
44
|
end
|
75
45
|
|
76
|
-
# Inform the central server that this worker is finished. This is the only
|
77
|
-
# remote method that doesn't retry on connection errors -- if the worker
|
78
|
-
# can't connect to the central server while it's trying to shutdown, it
|
79
|
-
# should close, regardless.
|
80
|
-
def check_out
|
81
|
-
@server["/worker"].put({
|
82
|
-
:name => @name,
|
83
|
-
:terminated => true
|
84
|
-
})
|
85
|
-
log 'exiting'
|
86
|
-
end
|
87
|
-
|
88
46
|
# We expect and require internal communication between the central server
|
89
47
|
# and the workers to succeed. If it fails for any reason, log it, and then
|
90
48
|
# keep trying the same request.
|
@@ -100,36 +58,38 @@ module CloudCrowd
|
|
100
58
|
end
|
101
59
|
end
|
102
60
|
|
103
|
-
#
|
104
|
-
def has_work?
|
105
|
-
@action_name && @input && @options
|
106
|
-
end
|
107
|
-
|
108
|
-
# Loggable string of the current work unit.
|
61
|
+
# Loggable details describing what the Worker is up to.
|
109
62
|
def display_work_unit
|
110
|
-
"unit ##{@options['work_unit_id']} (#{@action_name})"
|
63
|
+
"unit ##{@options['work_unit_id']} (#{@action_name}/#{CloudCrowd.display_status(@status)})"
|
111
64
|
end
|
112
65
|
|
113
|
-
# Executes the
|
66
|
+
# Executes the WorkUnit by running the Action, catching all exceptions as
|
67
|
+
# failures. We capture the thread so that we can kill it from the outside,
|
68
|
+
# when exiting.
|
114
69
|
def run_work_unit
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
70
|
+
@worker_thread = Thread.new do
|
71
|
+
begin
|
72
|
+
result = nil
|
73
|
+
@action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @node.asset_store)
|
74
|
+
Dir.chdir(@action.work_directory) do
|
75
|
+
result = case @status
|
76
|
+
when PROCESSING then @action.process
|
77
|
+
when SPLITTING then @action.split
|
78
|
+
when MERGING then @action.merge
|
79
|
+
else raise Error::StatusUnspecified, "work units must specify their status"
|
80
|
+
end
|
124
81
|
end
|
82
|
+
complete_work_unit({'output' => result}.to_json)
|
83
|
+
rescue Exception => e
|
84
|
+
fail_work_unit(e)
|
85
|
+
ensure
|
86
|
+
@action.cleanup_work_directory
|
125
87
|
end
|
126
|
-
complete_work_unit({'output' => result}.to_json)
|
127
|
-
rescue Exception => e
|
128
|
-
fail_work_unit(e)
|
129
88
|
end
|
89
|
+
@worker_thread.join
|
130
90
|
end
|
131
91
|
|
132
|
-
# Wraps
|
92
|
+
# Wraps run_work_unit to benchmark the execution time, if requested.
|
133
93
|
def run
|
134
94
|
return run_work_unit unless @options['benchmark']
|
135
95
|
status = CloudCrowd.display_status(@status)
|
@@ -139,27 +99,17 @@ module CloudCrowd
|
|
139
99
|
|
140
100
|
private
|
141
101
|
|
142
|
-
# Common parameters to send back to central.
|
143
|
-
def base_params
|
144
|
-
@base_params ||= {
|
145
|
-
:worker_name => @name,
|
146
|
-
:worker_actions => @enabled_actions.join(',')
|
147
|
-
}
|
148
|
-
end
|
149
|
-
|
150
102
|
# Common parameters to send back to central upon unit completion,
|
151
103
|
# regardless of success or failure.
|
152
|
-
def
|
153
|
-
|
154
|
-
:id
|
155
|
-
:time
|
156
|
-
})
|
104
|
+
def base_params
|
105
|
+
{ :pid => @pid,
|
106
|
+
:id => @options['work_unit_id'],
|
107
|
+
:time => Time.now - @start_time }
|
157
108
|
end
|
158
109
|
|
159
|
-
# Extract
|
160
|
-
def setup_work_unit(
|
161
|
-
return false unless
|
162
|
-
unit = JSON.parse(unit_json)
|
110
|
+
# Extract the Worker's instance variables from a WorkUnit's JSON.
|
111
|
+
def setup_work_unit(unit)
|
112
|
+
return false unless unit
|
163
113
|
@start_time = Time.now
|
164
114
|
@action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
|
165
115
|
@options['job_id'] = unit['job_id']
|
@@ -171,14 +121,25 @@ module CloudCrowd
|
|
171
121
|
|
172
122
|
# Log a message to the daemon log. Includes PID for identification.
|
173
123
|
def log(message)
|
174
|
-
puts "Worker ##{@
|
124
|
+
puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
|
175
125
|
end
|
176
126
|
|
177
|
-
# When
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
127
|
+
# When signaled to exit, make sure that the Worker shuts down cleanly.
|
128
|
+
def trap_signals
|
129
|
+
Signal.trap('INT') { shut_down }
|
130
|
+
Signal.trap('KILL') { shut_down }
|
131
|
+
Signal.trap('TERM') { shut_down }
|
132
|
+
end
|
133
|
+
|
134
|
+
# Force the Worker to quit, even if it's in the middle of processing.
|
135
|
+
# If it had a checked-out WorkUnit, the Node should have released it on
|
136
|
+
# the central server already.
|
137
|
+
def shut_down
|
138
|
+
if @worker_thread
|
139
|
+
@worker_thread.kill
|
140
|
+
@worker_thread.kill! if @worker_thread.alive?
|
141
|
+
end
|
142
|
+
Process.exit
|
182
143
|
end
|
183
144
|
|
184
145
|
end
|
@@ -134,44 +134,57 @@ body {
|
|
134
134
|
}
|
135
135
|
#sidebar_header {
|
136
136
|
position: absolute;
|
137
|
+
width: 250px;
|
137
138
|
top: 5px; left: 8px;
|
138
139
|
color: #404040;
|
139
140
|
text-shadow: 0px 1px 1px #eee;
|
140
141
|
}
|
141
|
-
#sidebar_header.
|
142
|
-
#sidebar_header .
|
142
|
+
#sidebar_header.no_nodes .no_nodes,
|
143
|
+
#sidebar_header .has_nodes {
|
143
144
|
display: block;
|
144
145
|
}
|
145
|
-
#sidebar_header .
|
146
|
-
#sidebar_header.
|
146
|
+
#sidebar_header .no_nodes,
|
147
|
+
#sidebar_header.no_nodes .has_nodes {
|
147
148
|
display: none;
|
148
149
|
}
|
149
|
-
#
|
150
|
+
#nodes {
|
150
151
|
position: absolute;
|
151
152
|
padding: 2px 0;
|
152
153
|
top: 21px; left: 0; bottom: 21px;
|
153
154
|
width: 298px;
|
154
155
|
overflow-y: auto; overflow-x: hidden;
|
155
156
|
}
|
156
|
-
#
|
157
|
+
#nodes .node, #nodes .worker {
|
157
158
|
border: 1px solid transparent;
|
158
159
|
margin: 1px 7px;
|
159
160
|
padding-left: 18px;
|
160
|
-
|
161
|
-
|
162
|
-
background: url(/images/bullet_white.png) no-repeat left center;
|
163
|
-
cursor: pointer;
|
161
|
+
background-position: left center;
|
162
|
+
background-repeat: no-repeat;
|
164
163
|
}
|
165
|
-
#
|
166
|
-
|
167
|
-
|
168
|
-
background: url(/images/
|
164
|
+
#nodes .node {
|
165
|
+
font-size: 11px;
|
166
|
+
line-height: 22px;
|
167
|
+
background-image: url(/images/server.png);
|
169
168
|
}
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
169
|
+
#nodes .node.busy {
|
170
|
+
background-image: url(/images/server_busy.png);
|
171
|
+
}
|
172
|
+
#nodes .node.busy span.busy {
|
173
|
+
font-size: 9px;
|
174
|
+
color: #7f7f7f;
|
175
|
+
text-transform: uppercase;
|
176
|
+
}
|
177
|
+
#nodes .worker {
|
178
|
+
font-size: 10px;
|
179
|
+
line-height: 18px;
|
180
|
+
cursor: pointer;
|
181
|
+
background-image: url(/images/bullet_green.png);
|
174
182
|
}
|
183
|
+
#nodes .worker:hover {
|
184
|
+
border: 1px solid #aaa;
|
185
|
+
border-radius: 4px; -moz-border-radius: 4px; -webkit-border-radius: 4px;
|
186
|
+
background-color: #ccc;
|
187
|
+
}
|
175
188
|
|
176
189
|
#worker_info {
|
177
190
|
position: absolute;
|
@@ -216,6 +229,15 @@ body {
|
|
216
229
|
text-shadow: 0px 1px 1px #eee;
|
217
230
|
margin-bottom: 10px;
|
218
231
|
}
|
232
|
+
.legend_box {
|
233
|
+
display: inline-block;
|
234
|
+
width: 10px; height: 10px;
|
235
|
+
border: 1px solid #bbb;
|
236
|
+
position: relative;
|
237
|
+
top: 1px;
|
238
|
+
margin: 0 1px;
|
239
|
+
background-color: #a1003d;
|
240
|
+
}
|
219
241
|
.graph {
|
220
242
|
height: 150px;
|
221
243
|
}
|
Binary file
|
Binary file
|
data/public/js/admin_console.js
CHANGED
@@ -22,26 +22,30 @@ window.Console = {
|
|
22
22
|
|
23
23
|
// All options for drawing the system graphs.
|
24
24
|
GRAPH_OPTIONS : {
|
25
|
-
xaxis
|
26
|
-
yaxis
|
27
|
-
legend
|
28
|
-
grid
|
25
|
+
xaxis : {mode : 'time', timeformat : '%M:%S'},
|
26
|
+
yaxis : {tickDecimals : 0},
|
27
|
+
legend : {show : false},
|
28
|
+
grid : {backgroundColor : '#7f7f7f', color : '#555', tickColor : '#666', borderWidth : 2}
|
29
29
|
},
|
30
30
|
JOBS_COLOR : '#db3a0f',
|
31
|
-
|
31
|
+
NODES_COLOR : '#1870ab',
|
32
|
+
WORKERS_COLOR : '#45a4e5',
|
32
33
|
WORK_UNITS_COLOR : '#ffba14',
|
33
34
|
|
34
35
|
// Starting the console begins polling the server.
|
35
36
|
initialize : function() {
|
36
37
|
this._jobsHistory = [];
|
38
|
+
this._nodesHistory = [];
|
37
39
|
this._workersHistory = [];
|
38
40
|
this._workUnitsHistory = [];
|
39
|
-
this._histories = [this._jobsHistory, this._workersHistory, this._workUnitsHistory];
|
41
|
+
this._histories = [this._jobsHistory, this._nodesHistory, this._workersHistory, this._workUnitsHistory];
|
40
42
|
this._queue = $('#jobs');
|
41
43
|
this._workerInfo = $('#worker_info');
|
42
44
|
this._disconnected = $('#disconnected');
|
43
45
|
$(window).bind('resize', Console.renderGraphs);
|
44
|
-
$('#
|
46
|
+
$('#nodes .worker').live('click', Console.getWorkerInfo);
|
47
|
+
$('#workers_legend').css({background : this.WORKERS_COLOR});
|
48
|
+
$('#nodes_legend').css({background : this.NODES_COLOR});
|
45
49
|
this.getStatus();
|
46
50
|
$.each(this.PRELOAD_IMAGES, function(){ var i = new Image(); i.src = this; });
|
47
51
|
},
|
@@ -51,13 +55,14 @@ window.Console = {
|
|
51
55
|
getStatus : function() {
|
52
56
|
$.ajax({url : '/status', dataType : 'json', success : function(resp) {
|
53
57
|
Console._jobs = resp.jobs;
|
54
|
-
Console.
|
58
|
+
Console._nodes = resp.nodes;
|
55
59
|
Console._workUnitCount = resp.work_unit_count;
|
60
|
+
Console._workerCount = Console.countWorkers();
|
56
61
|
Console.recordDataPoint();
|
57
62
|
if (Console._disconnected.is(':visible')) Console._disconnected.fadeOut(Console.ANIMATION_SPEED);
|
58
63
|
$('#queue').toggleClass('no_jobs', Console._jobs.length <= 0);
|
59
64
|
Console.renderJobs();
|
60
|
-
Console.
|
65
|
+
Console.renderNodes();
|
61
66
|
Console.renderGraphs();
|
62
67
|
setTimeout(Console.getStatus, Console.POLL_INTERVAL);
|
63
68
|
}, error : function(request, status, errorThrown) {
|
@@ -66,6 +71,13 @@ window.Console = {
|
|
66
71
|
}});
|
67
72
|
},
|
68
73
|
|
74
|
+
// Count the total number of workers in the current list of nodes.
|
75
|
+
countWorkers : function() {
|
76
|
+
var sum = 0;
|
77
|
+
for (var i=0; i < this._nodes.length; i++) sum += this._nodes[i].workers.length;
|
78
|
+
return sum;
|
79
|
+
},
|
80
|
+
|
69
81
|
// Render an individual job afresh.
|
70
82
|
renderJob : function(job) {
|
71
83
|
this._queue.append('<div class="job" id="job_' + job.id + '" style="width:' + job.width + '%; background: #' + job.color + ';"><div class="completion ' + (job.percent_complete <= 0 ? 'zero' : '') + '" style="width:' + job.percent_complete + '%;"></div><div class="percent_complete">' + job.percent_complete + '%</div><div class="job_id">#' + job.id + '</div></div>');
|
@@ -105,12 +117,21 @@ window.Console = {
|
|
105
117
|
},
|
106
118
|
|
107
119
|
// Re-render all workers from scratch each time.
|
108
|
-
|
120
|
+
// This method is desperately in need of Javascript templates...
|
121
|
+
renderNodes : function() {
|
109
122
|
var header = $('#sidebar_header');
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
123
|
+
var nc = this._nodes.length, wc = this._workerCount;
|
124
|
+
$('.has_nodes', header).html(nc + " Node" + (nc != 1 ? 's' : '') + " / " + wc + " Worker" + (wc != 1 ? 's' : ''));
|
125
|
+
header.toggleClass('no_nodes', this._nodes.length <= 0);
|
126
|
+
$('#nodes').html($.map(this._nodes, function(node) {
|
127
|
+
var html = "";
|
128
|
+
var extra = node.status == 'busy' ? ' <span class="busy">[busy]</span>' : '';
|
129
|
+
html += '<div class="node ' + node.status + '">' + node.host + extra + '</div>';
|
130
|
+
html += $.map(node.workers, function(pid) {
|
131
|
+
var name = pid + '@' + node.host;
|
132
|
+
return '<div class="worker" rel="' + name + '">' + name + '</div>';
|
133
|
+
}).join('');
|
134
|
+
return html;
|
114
135
|
}).join(''));
|
115
136
|
},
|
116
137
|
|
@@ -118,7 +139,8 @@ window.Console = {
|
|
118
139
|
recordDataPoint : function() {
|
119
140
|
var timestamp = (new Date()).getTime();
|
120
141
|
this._jobsHistory.push([timestamp, this._jobs.length]);
|
121
|
-
this.
|
142
|
+
this._nodesHistory.push([timestamp, this._nodes.length]);
|
143
|
+
this._workersHistory.push([timestamp, this._workerCount]);
|
122
144
|
this._workUnitsHistory.push([timestamp, this._workUnitCount]);
|
123
145
|
$.each(this._histories, function() {
|
124
146
|
if (this.length > Console.MAX_DATA_POINTS) this.shift();
|
@@ -127,9 +149,16 @@ window.Console = {
|
|
127
149
|
|
128
150
|
// Convert our recorded data points into a format Flot can understand.
|
129
151
|
renderGraphs : function() {
|
130
|
-
$.plot($('#
|
131
|
-
|
132
|
-
|
152
|
+
$.plot($('#work_units_graph'), [
|
153
|
+
{label : 'Work Units in Queue', color : Console.WORK_UNITS_COLOR, data : Console._workUnitsHistory}
|
154
|
+
], Console.GRAPH_OPTIONS);
|
155
|
+
$.plot($('#jobs_graph'), [
|
156
|
+
{label : 'Jobs in Queue', color : Console.JOBS_COLOR, data : Console._jobsHistory}
|
157
|
+
], Console.GRAPH_OPTIONS);
|
158
|
+
$.plot($('#workers_graph'), [
|
159
|
+
{label : 'Nodes', color : Console.NODES_COLOR, data : Console._nodesHistory},
|
160
|
+
{label : 'Workers', color : Console.WORKERS_COLOR, data : Console._workersHistory}
|
161
|
+
], Console.GRAPH_OPTIONS);
|
133
162
|
},
|
134
163
|
|
135
164
|
// Request the Worker info from the central server.
|