cloud-crowd 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. data/README +16 -16
  2. data/cloud-crowd.gemspec +10 -9
  3. data/config/config.example.ru +8 -2
  4. data/config/config.example.yml +21 -25
  5. data/examples/process_pdfs_example.rb +1 -1
  6. data/examples/word_count_example.rb +1 -0
  7. data/lib/cloud-crowd.rb +47 -28
  8. data/lib/cloud_crowd/action.rb +14 -8
  9. data/lib/cloud_crowd/asset_store.rb +8 -8
  10. data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
  11. data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
  12. data/lib/cloud_crowd/command_line.rb +24 -58
  13. data/lib/cloud_crowd/exceptions.rb +7 -0
  14. data/lib/cloud_crowd/helpers/authorization.rb +5 -3
  15. data/lib/cloud_crowd/helpers/resources.rb +0 -20
  16. data/lib/cloud_crowd/models.rb +1 -1
  17. data/lib/cloud_crowd/models/job.rb +37 -40
  18. data/lib/cloud_crowd/models/node_record.rb +95 -0
  19. data/lib/cloud_crowd/models/work_unit.rb +87 -33
  20. data/lib/cloud_crowd/node.rb +105 -0
  21. data/lib/cloud_crowd/schema.rb +22 -18
  22. data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
  23. data/lib/cloud_crowd/worker.rb +68 -107
  24. data/public/css/admin_console.css +40 -18
  25. data/public/images/server.png +0 -0
  26. data/public/images/server_busy.png +0 -0
  27. data/public/js/admin_console.js +47 -18
  28. data/test/acceptance/test_failing_work_units.rb +1 -1
  29. data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
  30. data/test/acceptance/test_word_count.rb +3 -9
  31. data/test/blueprints.rb +0 -1
  32. data/test/config/config.ru +1 -1
  33. data/test/config/config.yml +2 -4
  34. data/test/unit/test_action.rb +1 -1
  35. data/test/unit/test_configuration.rb +1 -1
  36. data/test/unit/test_job.rb +3 -0
  37. data/test/unit/test_work_unit.rb +2 -4
  38. data/views/{index.erb → operations_center.erb} +13 -8
  39. metadata +11 -10
  40. data/lib/cloud_crowd/daemon.rb +0 -95
  41. data/lib/cloud_crowd/models/worker_record.rb +0 -61
  42. data/lib/cloud_crowd/runner.rb +0 -15
@@ -1,90 +1,48 @@
1
1
  module CloudCrowd
2
2
 
3
- # The Worker, run at intervals by the Daemon, fetches WorkUnits from the
4
- # central server and dispatches Actions to process them. Workers only fetch
5
- # units that they are able to handle (for which they have an action in their
6
- # actions directory). If communication with the central server is interrupted,
7
- # the WorkUnit will repeatedly attempt to complete its unit -- every
8
- # Worker::RETRY_WAIT seconds. Any exceptions that take place during
3
+ # The Worker, forked off from the Node when a new WorkUnit is received,
4
+ # launches an Action for processing. Workers will only ever receive WorkUnits
5
+ # that they are able to handle (for which they have a corresponding action in
6
+ # their actions directory). If communication with the central server is
7
+ # interrupted, the Worker will repeatedly attempt to complete its unit --
8
+ # every Worker::RETRY_WAIT seconds. Any exceptions that take place during
9
9
  # the course of the Action will cause the Worker to mark the WorkUnit as
10
- # having failed.
10
+ # having failed. When finished, the Worker's process exits, minimizing the
11
+ # potential for memory leaks.
11
12
  class Worker
12
13
 
13
- # The time between worker check-ins with the central server, informing
14
- # it of the current status, and simply that it's still alive.
15
- CHECK_IN_INTERVAL = 60
16
-
17
14
  # Wait five seconds to retry, after internal communication errors.
18
15
  RETRY_WAIT = 5
19
16
 
20
17
  attr_reader :action
21
18
 
22
- # Spinning up a worker will create a new AssetStore with a persistent
23
- # connection to S3. This AssetStore gets passed into each action, for use
24
- # as it is run.
25
- def initialize
26
- @id = $$
27
- @hostname = Socket.gethostname
28
- @name = "#{@id}@#{@hostname}"
29
- @store = AssetStore.new
30
- @server = CloudCrowd.central_server
31
- @enabled_actions = CloudCrowd.actions.keys
32
- log 'started'
19
+ # A new Worker begins processing its WorkUnit straight off.
20
+ def initialize(node, work_unit)
21
+ @pid = $$
22
+ @node = node
23
+ trap_signals
24
+ setup_work_unit(work_unit)
25
+ run
33
26
  end
34
27
 
35
- # Ask the central server for the first WorkUnit in line.
36
- def fetch_work_unit
37
- keep_trying_to "fetch a new work unit" do
38
- unit_json = @server['/work'].post(base_params)
39
- setup_work_unit(unit_json)
40
- end
41
- end
42
-
43
- # Return output to the central server, marking the current work unit as done.
28
+ # Return output to the central server, marking the WorkUnit done.
44
29
  def complete_work_unit(result)
45
30
  keep_trying_to "complete work unit" do
46
- data = completion_params.merge({:status => 'succeeded', :output => result})
47
- unit_json = @server["/work/#{data[:id]}"].put(data)
31
+ data = base_params.merge({:status => 'succeeded', :output => result})
32
+ @node.server["/work/#{data[:id]}"].put(data)
48
33
  log "finished #{display_work_unit} in #{data[:time]} seconds"
49
- clear_work_unit
50
- setup_work_unit(unit_json)
51
34
  end
52
35
  end
53
36
 
54
- # Mark the current work unit as failed, returning the exception to central.
37
+ # Mark the WorkUnit failed, returning the exception to central.
55
38
  def fail_work_unit(exception)
56
39
  keep_trying_to "mark work unit as failed" do
57
- data = completion_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
58
- unit_json = @server["/work/#{data[:id]}"].put(data)
40
+ data = base_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
41
+ @node.server["/work/#{data[:id]}"].put(data)
59
42
  log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
60
- clear_work_unit
61
- setup_work_unit(unit_json)
62
- end
63
- end
64
-
65
- # Check in with the central server. Let it know the condition of the work
66
- # thread, the action and status we're processing, and our hostname and PID.
67
- def check_in(thread_status)
68
- keep_trying_to "check in with central" do
69
- @server["/worker"].put({
70
- :name => @name,
71
- :thread_status => thread_status
72
- })
73
43
  end
74
44
  end
75
45
 
76
- # Inform the central server that this worker is finished. This is the only
77
- # remote method that doesn't retry on connection errors -- if the worker
78
- # can't connect to the central server while it's trying to shutdown, it
79
- # should close, regardless.
80
- def check_out
81
- @server["/worker"].put({
82
- :name => @name,
83
- :terminated => true
84
- })
85
- log 'exiting'
86
- end
87
-
88
46
  # We expect and require internal communication between the central server
89
47
  # and the workers to succeed. If it fails for any reason, log it, and then
90
48
  # keep trying the same request.
@@ -100,36 +58,38 @@ module CloudCrowd
100
58
  end
101
59
  end
102
60
 
103
- # Does this Worker have a job to do?
104
- def has_work?
105
- @action_name && @input && @options
106
- end
107
-
108
- # Loggable string of the current work unit.
61
+ # Loggable details describing what the Worker is up to.
109
62
  def display_work_unit
110
- "unit ##{@options['work_unit_id']} (#{@action_name})"
63
+ "unit ##{@options['work_unit_id']} (#{@action_name}/#{CloudCrowd.display_status(@status)})"
111
64
  end
112
65
 
113
- # Executes the current work unit, catching all exceptions as failures.
66
+ # Executes the WorkUnit by running the Action, catching all exceptions as
67
+ # failures. We capture the thread so that we can kill it from the outside,
68
+ # when exiting.
114
69
  def run_work_unit
115
- begin
116
- result = nil
117
- @action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @store)
118
- Dir.chdir(@action.work_directory) do
119
- result = case @status
120
- when PROCESSING then @action.process
121
- when SPLITTING then @action.split
122
- when MERGING then @action.merge
123
- else raise Error::StatusUnspecified, "work units must specify their status"
70
+ @worker_thread = Thread.new do
71
+ begin
72
+ result = nil
73
+ @action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @node.asset_store)
74
+ Dir.chdir(@action.work_directory) do
75
+ result = case @status
76
+ when PROCESSING then @action.process
77
+ when SPLITTING then @action.split
78
+ when MERGING then @action.merge
79
+ else raise Error::StatusUnspecified, "work units must specify their status"
80
+ end
124
81
  end
82
+ complete_work_unit({'output' => result}.to_json)
83
+ rescue Exception => e
84
+ fail_work_unit(e)
85
+ ensure
86
+ @action.cleanup_work_directory
125
87
  end
126
- complete_work_unit({'output' => result}.to_json)
127
- rescue Exception => e
128
- fail_work_unit(e)
129
88
  end
89
+ @worker_thread.join
130
90
  end
131
91
 
132
- # Wraps <tt>run_work_unit</tt> to benchmark the execution time, if requested.
92
+ # Wraps run_work_unit to benchmark the execution time, if requested.
133
93
  def run
134
94
  return run_work_unit unless @options['benchmark']
135
95
  status = CloudCrowd.display_status(@status)
@@ -139,27 +99,17 @@ module CloudCrowd
139
99
 
140
100
  private
141
101
 
142
- # Common parameters to send back to central.
143
- def base_params
144
- @base_params ||= {
145
- :worker_name => @name,
146
- :worker_actions => @enabled_actions.join(',')
147
- }
148
- end
149
-
150
102
  # Common parameters to send back to central upon unit completion,
151
103
  # regardless of success or failure.
152
- def completion_params
153
- base_params.merge({
154
- :id => @options['work_unit_id'],
155
- :time => Time.now - @start_time
156
- })
104
+ def base_params
105
+ { :pid => @pid,
106
+ :id => @options['work_unit_id'],
107
+ :time => Time.now - @start_time }
157
108
  end
158
109
 
159
- # Extract our instance variables from a WorkUnit's JSON.
160
- def setup_work_unit(unit_json)
161
- return false unless unit_json
162
- unit = JSON.parse(unit_json)
110
+ # Extract the Worker's instance variables from a WorkUnit's JSON.
111
+ def setup_work_unit(unit)
112
+ return false unless unit
163
113
  @start_time = Time.now
164
114
  @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
165
115
  @options['job_id'] = unit['job_id']
@@ -171,14 +121,25 @@ module CloudCrowd
171
121
 
172
122
  # Log a message to the daemon log. Includes PID for identification.
173
123
  def log(message)
174
- puts "Worker ##{@id}: #{message}" unless ENV['RACK_ENV'] == 'test'
124
+ puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
175
125
  end
176
126
 
177
- # When we're done with a unit, clear out our instance variables to make way
178
- # for the next one. Also, remove all of the unit's temporary storage.
179
- def clear_work_unit
180
- @action.cleanup_work_directory
181
- @action, @action_name, @input, @options, @start_time = nil, nil, nil, nil, nil
127
+ # When signaled to exit, make sure that the Worker shuts down cleanly.
128
+ def trap_signals
129
+ Signal.trap('INT') { shut_down }
130
+ Signal.trap('KILL') { shut_down }
131
+ Signal.trap('TERM') { shut_down }
132
+ end
133
+
134
+ # Force the Worker to quit, even if it's in the middle of processing.
135
+ # If it had a checked-out WorkUnit, the Node should have released it on
136
+ # the central server already.
137
+ def shut_down
138
+ if @worker_thread
139
+ @worker_thread.kill
140
+ @worker_thread.kill! if @worker_thread.alive?
141
+ end
142
+ Process.exit
182
143
  end
183
144
 
184
145
  end
@@ -134,44 +134,57 @@ body {
134
134
  }
135
135
  #sidebar_header {
136
136
  position: absolute;
137
+ width: 250px;
137
138
  top: 5px; left: 8px;
138
139
  color: #404040;
139
140
  text-shadow: 0px 1px 1px #eee;
140
141
  }
141
- #sidebar_header.no_workers .no_workers,
142
- #sidebar_header .has_workers {
142
+ #sidebar_header.no_nodes .no_nodes,
143
+ #sidebar_header .has_nodes {
143
144
  display: block;
144
145
  }
145
- #sidebar_header .no_workers,
146
- #sidebar_header.no_workers .has_workers {
146
+ #sidebar_header .no_nodes,
147
+ #sidebar_header.no_nodes .has_nodes {
147
148
  display: none;
148
149
  }
149
- #workers {
150
+ #nodes {
150
151
  position: absolute;
151
152
  padding: 2px 0;
152
153
  top: 21px; left: 0; bottom: 21px;
153
154
  width: 298px;
154
155
  overflow-y: auto; overflow-x: hidden;
155
156
  }
156
- #workers .worker {
157
+ #nodes .node, #nodes .worker {
157
158
  border: 1px solid transparent;
158
159
  margin: 1px 7px;
159
160
  padding-left: 18px;
160
- font-size: 11px;
161
- line-height: 22px;
162
- background: url(/images/bullet_white.png) no-repeat left center;
163
- cursor: pointer;
161
+ background-position: left center;
162
+ background-repeat: no-repeat;
164
163
  }
165
- #workers .worker.processing,
166
- #workers .worker.splitting,
167
- #workers .worker.merging {
168
- background: url(/images/bullet_green.png) no-repeat left center;
164
+ #nodes .node {
165
+ font-size: 11px;
166
+ line-height: 22px;
167
+ background-image: url(/images/server.png);
169
168
  }
170
- #workers .worker:hover {
171
- border: 1px solid #aaa;
172
- border-radius: 4px; -moz-border-radius: 4px; -webkit-border-radius: 4px;
173
- background-color: #ccc;
169
+ #nodes .node.busy {
170
+ background-image: url(/images/server_busy.png);
171
+ }
172
+ #nodes .node.busy span.busy {
173
+ font-size: 9px;
174
+ color: #7f7f7f;
175
+ text-transform: uppercase;
176
+ }
177
+ #nodes .worker {
178
+ font-size: 10px;
179
+ line-height: 18px;
180
+ cursor: pointer;
181
+ background-image: url(/images/bullet_green.png);
174
182
  }
183
+ #nodes .worker:hover {
184
+ border: 1px solid #aaa;
185
+ border-radius: 4px; -moz-border-radius: 4px; -webkit-border-radius: 4px;
186
+ background-color: #ccc;
187
+ }
175
188
 
176
189
  #worker_info {
177
190
  position: absolute;
@@ -216,6 +229,15 @@ body {
216
229
  text-shadow: 0px 1px 1px #eee;
217
230
  margin-bottom: 10px;
218
231
  }
232
+ .legend_box {
233
+ display: inline-block;
234
+ width: 10px; height: 10px;
235
+ border: 1px solid #bbb;
236
+ position: relative;
237
+ top: 1px;
238
+ margin: 0 1px;
239
+ background-color: #a1003d;
240
+ }
219
241
  .graph {
220
242
  height: 150px;
221
243
  }
Binary file
@@ -22,26 +22,30 @@ window.Console = {
22
22
 
23
23
  // All options for drawing the system graphs.
24
24
  GRAPH_OPTIONS : {
25
- xaxis : {mode : 'time', timeformat : '%M:%S'},
26
- yaxis : {tickDecimals : 0},
27
- legend : {show : false},
28
- grid : {backgroundColor : '#7f7f7f', color : '#555', tickColor : '#666', borderWidth : 2}
25
+ xaxis : {mode : 'time', timeformat : '%M:%S'},
26
+ yaxis : {tickDecimals : 0},
27
+ legend : {show : false},
28
+ grid : {backgroundColor : '#7f7f7f', color : '#555', tickColor : '#666', borderWidth : 2}
29
29
  },
30
30
  JOBS_COLOR : '#db3a0f',
31
- WORKERS_COLOR : '#a1003d',
31
+ NODES_COLOR : '#1870ab',
32
+ WORKERS_COLOR : '#45a4e5',
32
33
  WORK_UNITS_COLOR : '#ffba14',
33
34
 
34
35
  // Starting the console begins polling the server.
35
36
  initialize : function() {
36
37
  this._jobsHistory = [];
38
+ this._nodesHistory = [];
37
39
  this._workersHistory = [];
38
40
  this._workUnitsHistory = [];
39
- this._histories = [this._jobsHistory, this._workersHistory, this._workUnitsHistory];
41
+ this._histories = [this._jobsHistory, this._nodesHistory, this._workersHistory, this._workUnitsHistory];
40
42
  this._queue = $('#jobs');
41
43
  this._workerInfo = $('#worker_info');
42
44
  this._disconnected = $('#disconnected');
43
45
  $(window).bind('resize', Console.renderGraphs);
44
- $('#workers .worker').live('click', Console.getWorkerInfo);
46
+ $('#nodes .worker').live('click', Console.getWorkerInfo);
47
+ $('#workers_legend').css({background : this.WORKERS_COLOR});
48
+ $('#nodes_legend').css({background : this.NODES_COLOR});
45
49
  this.getStatus();
46
50
  $.each(this.PRELOAD_IMAGES, function(){ var i = new Image(); i.src = this; });
47
51
  },
@@ -51,13 +55,14 @@ window.Console = {
51
55
  getStatus : function() {
52
56
  $.ajax({url : '/status', dataType : 'json', success : function(resp) {
53
57
  Console._jobs = resp.jobs;
54
- Console._workers = resp.workers;
58
+ Console._nodes = resp.nodes;
55
59
  Console._workUnitCount = resp.work_unit_count;
60
+ Console._workerCount = Console.countWorkers();
56
61
  Console.recordDataPoint();
57
62
  if (Console._disconnected.is(':visible')) Console._disconnected.fadeOut(Console.ANIMATION_SPEED);
58
63
  $('#queue').toggleClass('no_jobs', Console._jobs.length <= 0);
59
64
  Console.renderJobs();
60
- Console.renderWorkers();
65
+ Console.renderNodes();
61
66
  Console.renderGraphs();
62
67
  setTimeout(Console.getStatus, Console.POLL_INTERVAL);
63
68
  }, error : function(request, status, errorThrown) {
@@ -66,6 +71,13 @@ window.Console = {
66
71
  }});
67
72
  },
68
73
 
74
+ // Count the total number of workers in the current list of nodes.
75
+ countWorkers : function() {
76
+ var sum = 0;
77
+ for (var i=0; i < this._nodes.length; i++) sum += this._nodes[i].workers.length;
78
+ return sum;
79
+ },
80
+
69
81
  // Render an individual job afresh.
70
82
  renderJob : function(job) {
71
83
  this._queue.append('<div class="job" id="job_' + job.id + '" style="width:' + job.width + '%; background: #' + job.color + ';"><div class="completion ' + (job.percent_complete <= 0 ? 'zero' : '') + '" style="width:' + job.percent_complete + '%;"></div><div class="percent_complete">' + job.percent_complete + '%</div><div class="job_id">#' + job.id + '</div></div>');
@@ -105,12 +117,21 @@ window.Console = {
105
117
  },
106
118
 
107
119
  // Re-render all nodes (and their workers) from scratch each time.
108
- renderWorkers : function() {
120
+ // This method is desperately in need of Javascript templates...
121
+ renderNodes : function() {
109
122
  var header = $('#sidebar_header');
110
- $('.has_workers', header).html(this._workers.length + " Active Worker Daemons");
111
- header.toggleClass('no_workers', this._workers.length <= 0);
112
- $('#workers').html($.map(this._workers, function(w) {
113
- return '<div class="worker ' + w.status + '" rel="' + w.name + '">' + w.name + '</div>';
123
+ var nc = this._nodes.length, wc = this._workerCount;
124
+ $('.has_nodes', header).html(nc + " Node" + (nc != 1 ? 's' : '') + " / " + wc + " Worker" + (wc != 1 ? 's' : ''));
125
+ header.toggleClass('no_nodes', this._nodes.length <= 0);
126
+ $('#nodes').html($.map(this._nodes, function(node) {
127
+ var html = "";
128
+ var extra = node.status == 'busy' ? ' <span class="busy">[busy]</span>' : '';
129
+ html += '<div class="node ' + node.status + '">' + node.host + extra + '</div>';
130
+ html += $.map(node.workers, function(pid) {
131
+ var name = pid + '@' + node.host;
132
+ return '<div class="worker" rel="' + name + '">' + name + '</div>';
133
+ }).join('');
134
+ return html;
114
135
  }).join(''));
115
136
  },
116
137
 
@@ -118,7 +139,8 @@ window.Console = {
118
139
  recordDataPoint : function() {
119
140
  var timestamp = (new Date()).getTime();
120
141
  this._jobsHistory.push([timestamp, this._jobs.length]);
121
- this._workersHistory.push([timestamp, this._workers.length]);
142
+ this._nodesHistory.push([timestamp, this._nodes.length]);
143
+ this._workersHistory.push([timestamp, this._workerCount]);
122
144
  this._workUnitsHistory.push([timestamp, this._workUnitCount]);
123
145
  $.each(this._histories, function() {
124
146
  if (this.length > Console.MAX_DATA_POINTS) this.shift();
@@ -127,9 +149,16 @@ window.Console = {
127
149
 
128
150
  // Convert our recorded data points into a format Flot can understand.
129
151
  renderGraphs : function() {
130
- $.plot($('#jobs_graph'), [{label : 'Jobs in Queue', color : Console.JOBS_COLOR, data : Console._jobsHistory}], Console.GRAPH_OPTIONS);
131
- $.plot($('#workers_graph'), [{label : 'Active Workers', color : Console.WORKERS_COLOR, data : Console._workersHistory}], Console.GRAPH_OPTIONS);
132
- $.plot($('#work_units_graph'), [{label : 'Work Units in Queue', color : Console.WORK_UNITS_COLOR, data : Console._workUnitsHistory}], Console.GRAPH_OPTIONS);
152
+ $.plot($('#work_units_graph'), [
153
+ {label : 'Work Units in Queue', color : Console.WORK_UNITS_COLOR, data : Console._workUnitsHistory}
154
+ ], Console.GRAPH_OPTIONS);
155
+ $.plot($('#jobs_graph'), [
156
+ {label : 'Jobs in Queue', color : Console.JOBS_COLOR, data : Console._jobsHistory}
157
+ ], Console.GRAPH_OPTIONS);
158
+ $.plot($('#workers_graph'), [
159
+ {label : 'Nodes', color : Console.NODES_COLOR, data : Console._nodesHistory},
160
+ {label : 'Workers', color : Console.WORKERS_COLOR, data : Console._workersHistory}
161
+ ], Console.GRAPH_OPTIONS);
133
162
  },
134
163
 
135
164
  // Request the Worker info from the central server.