cloud-crowd 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. data/README +16 -16
  2. data/cloud-crowd.gemspec +10 -9
  3. data/config/config.example.ru +8 -2
  4. data/config/config.example.yml +21 -25
  5. data/examples/process_pdfs_example.rb +1 -1
  6. data/examples/word_count_example.rb +1 -0
  7. data/lib/cloud-crowd.rb +47 -28
  8. data/lib/cloud_crowd/action.rb +14 -8
  9. data/lib/cloud_crowd/asset_store.rb +8 -8
  10. data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
  11. data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
  12. data/lib/cloud_crowd/command_line.rb +24 -58
  13. data/lib/cloud_crowd/exceptions.rb +7 -0
  14. data/lib/cloud_crowd/helpers/authorization.rb +5 -3
  15. data/lib/cloud_crowd/helpers/resources.rb +0 -20
  16. data/lib/cloud_crowd/models.rb +1 -1
  17. data/lib/cloud_crowd/models/job.rb +37 -40
  18. data/lib/cloud_crowd/models/node_record.rb +95 -0
  19. data/lib/cloud_crowd/models/work_unit.rb +87 -33
  20. data/lib/cloud_crowd/node.rb +105 -0
  21. data/lib/cloud_crowd/schema.rb +22 -18
  22. data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
  23. data/lib/cloud_crowd/worker.rb +68 -107
  24. data/public/css/admin_console.css +40 -18
  25. data/public/images/server.png +0 -0
  26. data/public/images/server_busy.png +0 -0
  27. data/public/js/admin_console.js +47 -18
  28. data/test/acceptance/test_failing_work_units.rb +1 -1
  29. data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
  30. data/test/acceptance/test_word_count.rb +3 -9
  31. data/test/blueprints.rb +0 -1
  32. data/test/config/config.ru +1 -1
  33. data/test/config/config.yml +2 -4
  34. data/test/unit/test_action.rb +1 -1
  35. data/test/unit/test_configuration.rb +1 -1
  36. data/test/unit/test_job.rb +3 -0
  37. data/test/unit/test_work_unit.rb +2 -4
  38. data/views/{index.erb → operations_center.erb} +13 -8
  39. metadata +11 -10
  40. data/lib/cloud_crowd/daemon.rb +0 -95
  41. data/lib/cloud_crowd/models/worker_record.rb +0 -61
  42. data/lib/cloud_crowd/runner.rb +0 -15
@@ -1,90 +1,48 @@
1
1
  module CloudCrowd
2
2
 
3
- # The Worker, run at intervals by the Daemon, fetches WorkUnits from the
4
- # central server and dispatches Actions to process them. Workers only fetch
5
- # units that they are able to handle (for which they have an action in their
6
- # actions directory). If communication with the central server is interrupted,
7
- # the WorkUnit will repeatedly attempt to complete its unit -- every
8
- # Worker::RETRY_WAIT seconds. Any exceptions that take place during
3
+ # The Worker, forked off from the Node when a new WorkUnit is received,
4
+ # launches an Action for processing. Workers will only ever receive WorkUnits
5
+ # that they are able to handle (for which they have a corresponding action in
6
+ # their actions directory). If communication with the central server is
7
+ # interrupted, the Worker will repeatedly attempt to complete its unit --
8
+ # every Worker::RETRY_WAIT seconds. Any exceptions that take place during
9
9
  # the course of the Action will cause the Worker to mark the WorkUnit as
10
- # having failed.
10
+ # having failed. When finished, the Worker's process exits, minimizing the
11
+ # potential for memory leaks.
11
12
  class Worker
12
13
 
13
- # The time between worker check-ins with the central server, informing
14
- # it of the current status, and simply that it's still alive.
15
- CHECK_IN_INTERVAL = 60
16
-
17
14
  # Wait five seconds to retry, after internal communication errors.
18
15
  RETRY_WAIT = 5
19
16
 
20
17
  attr_reader :action
21
18
 
22
- # Spinning up a worker will create a new AssetStore with a persistent
23
- # connection to S3. This AssetStore gets passed into each action, for use
24
- # as it is run.
25
- def initialize
26
- @id = $$
27
- @hostname = Socket.gethostname
28
- @name = "#{@id}@#{@hostname}"
29
- @store = AssetStore.new
30
- @server = CloudCrowd.central_server
31
- @enabled_actions = CloudCrowd.actions.keys
32
- log 'started'
19
+ # A new Worker begins processing its WorkUnit straight off.
20
+ def initialize(node, work_unit)
21
+ @pid = $$
22
+ @node = node
23
+ trap_signals
24
+ setup_work_unit(work_unit)
25
+ run
33
26
  end
34
27
 
35
- # Ask the central server for the first WorkUnit in line.
36
- def fetch_work_unit
37
- keep_trying_to "fetch a new work unit" do
38
- unit_json = @server['/work'].post(base_params)
39
- setup_work_unit(unit_json)
40
- end
41
- end
42
-
43
- # Return output to the central server, marking the current work unit as done.
28
+ # Return output to the central server, marking the WorkUnit done.
44
29
  def complete_work_unit(result)
45
30
  keep_trying_to "complete work unit" do
46
- data = completion_params.merge({:status => 'succeeded', :output => result})
47
- unit_json = @server["/work/#{data[:id]}"].put(data)
31
+ data = base_params.merge({:status => 'succeeded', :output => result})
32
+ @node.server["/work/#{data[:id]}"].put(data)
48
33
  log "finished #{display_work_unit} in #{data[:time]} seconds"
49
- clear_work_unit
50
- setup_work_unit(unit_json)
51
34
  end
52
35
  end
53
36
 
54
- # Mark the current work unit as failed, returning the exception to central.
37
+ # Mark the WorkUnit failed, returning the exception to central.
55
38
  def fail_work_unit(exception)
56
39
  keep_trying_to "mark work unit as failed" do
57
- data = completion_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
58
- unit_json = @server["/work/#{data[:id]}"].put(data)
40
+ data = base_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
41
+ @node.server["/work/#{data[:id]}"].put(data)
59
42
  log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
60
- clear_work_unit
61
- setup_work_unit(unit_json)
62
- end
63
- end
64
-
65
- # Check in with the central server. Let it know the condition of the work
66
- # thread, the action and status we're processing, and our hostname and PID.
67
- def check_in(thread_status)
68
- keep_trying_to "check in with central" do
69
- @server["/worker"].put({
70
- :name => @name,
71
- :thread_status => thread_status
72
- })
73
43
  end
74
44
  end
75
45
 
76
- # Inform the central server that this worker is finished. This is the only
77
- # remote method that doesn't retry on connection errors -- if the worker
78
- # can't connect to the central server while it's trying to shutdown, it
79
- # should close, regardless.
80
- def check_out
81
- @server["/worker"].put({
82
- :name => @name,
83
- :terminated => true
84
- })
85
- log 'exiting'
86
- end
87
-
88
46
  # We expect and require internal communication between the central server
89
47
  # and the workers to succeed. If it fails for any reason, log it, and then
90
48
  # keep trying the same request.
@@ -100,36 +58,38 @@ module CloudCrowd
100
58
  end
101
59
  end
102
60
 
103
- # Does this Worker have a job to do?
104
- def has_work?
105
- @action_name && @input && @options
106
- end
107
-
108
- # Loggable string of the current work unit.
61
+ # Loggable details describing what the Worker is up to.
109
62
  def display_work_unit
110
- "unit ##{@options['work_unit_id']} (#{@action_name})"
63
+ "unit ##{@options['work_unit_id']} (#{@action_name}/#{CloudCrowd.display_status(@status)})"
111
64
  end
112
65
 
113
- # Executes the current work unit, catching all exceptions as failures.
66
+ # Executes the WorkUnit by running the Action, catching all exceptions as
67
+ # failures. We capture the thread so that we can kill it from the outside,
68
+ # when exiting.
114
69
  def run_work_unit
115
- begin
116
- result = nil
117
- @action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @store)
118
- Dir.chdir(@action.work_directory) do
119
- result = case @status
120
- when PROCESSING then @action.process
121
- when SPLITTING then @action.split
122
- when MERGING then @action.merge
123
- else raise Error::StatusUnspecified, "work units must specify their status"
70
+ @worker_thread = Thread.new do
71
+ begin
72
+ result = nil
73
+ @action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @node.asset_store)
74
+ Dir.chdir(@action.work_directory) do
75
+ result = case @status
76
+ when PROCESSING then @action.process
77
+ when SPLITTING then @action.split
78
+ when MERGING then @action.merge
79
+ else raise Error::StatusUnspecified, "work units must specify their status"
80
+ end
124
81
  end
82
+ complete_work_unit({'output' => result}.to_json)
83
+ rescue Exception => e
84
+ fail_work_unit(e)
85
+ ensure
86
+ @action.cleanup_work_directory
125
87
  end
126
- complete_work_unit({'output' => result}.to_json)
127
- rescue Exception => e
128
- fail_work_unit(e)
129
88
  end
89
+ @worker_thread.join
130
90
  end
131
91
 
132
- # Wraps <tt>run_work_unit</tt> to benchmark the execution time, if requested.
92
+ # Wraps run_work_unit to benchmark the execution time, if requested.
133
93
  def run
134
94
  return run_work_unit unless @options['benchmark']
135
95
  status = CloudCrowd.display_status(@status)
@@ -139,27 +99,17 @@ module CloudCrowd
139
99
 
140
100
  private
141
101
 
142
- # Common parameters to send back to central.
143
- def base_params
144
- @base_params ||= {
145
- :worker_name => @name,
146
- :worker_actions => @enabled_actions.join(',')
147
- }
148
- end
149
-
150
102
  # Common parameters to send back to central upon unit completion,
151
103
  # regardless of success or failure.
152
- def completion_params
153
- base_params.merge({
154
- :id => @options['work_unit_id'],
155
- :time => Time.now - @start_time
156
- })
104
+ def base_params
105
+ { :pid => @pid,
106
+ :id => @options['work_unit_id'],
107
+ :time => Time.now - @start_time }
157
108
  end
158
109
 
159
- # Extract our instance variables from a WorkUnit's JSON.
160
- def setup_work_unit(unit_json)
161
- return false unless unit_json
162
- unit = JSON.parse(unit_json)
110
+ # Extract the Worker's instance variables from a WorkUnit's JSON.
111
+ def setup_work_unit(unit)
112
+ return false unless unit
163
113
  @start_time = Time.now
164
114
  @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
165
115
  @options['job_id'] = unit['job_id']
@@ -171,14 +121,25 @@ module CloudCrowd
171
121
 
172
122
  # Log a message to the daemon log. Includes PID for identification.
173
123
  def log(message)
174
- puts "Worker ##{@id}: #{message}" unless ENV['RACK_ENV'] == 'test'
124
+ puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
175
125
  end
176
126
 
177
- # When we're done with a unit, clear out our instance variables to make way
178
- # for the next one. Also, remove all of the unit's temporary storage.
179
- def clear_work_unit
180
- @action.cleanup_work_directory
181
- @action, @action_name, @input, @options, @start_time = nil, nil, nil, nil, nil
127
+ # When signaled to exit, make sure that the Worker shuts down cleanly.
128
+ def trap_signals
129
+ Signal.trap('INT') { shut_down }
130
+ Signal.trap('KILL') { shut_down }
131
+ Signal.trap('TERM') { shut_down }
132
+ end
133
+
134
+ # Force the Worker to quit, even if it's in the middle of processing.
135
+ # If it had a checked-out WorkUnit, the Node should have released it on
136
+ # the central server already.
137
+ def shut_down
138
+ if @worker_thread
139
+ @worker_thread.kill
140
+ @worker_thread.kill! if @worker_thread.alive?
141
+ end
142
+ Process.exit
182
143
  end
183
144
 
184
145
  end
@@ -134,44 +134,57 @@ body {
134
134
  }
135
135
  #sidebar_header {
136
136
  position: absolute;
137
+ width: 250px;
137
138
  top: 5px; left: 8px;
138
139
  color: #404040;
139
140
  text-shadow: 0px 1px 1px #eee;
140
141
  }
141
- #sidebar_header.no_workers .no_workers,
142
- #sidebar_header .has_workers {
142
+ #sidebar_header.no_nodes .no_nodes,
143
+ #sidebar_header .has_nodes {
143
144
  display: block;
144
145
  }
145
- #sidebar_header .no_workers,
146
- #sidebar_header.no_workers .has_workers {
146
+ #sidebar_header .no_nodes,
147
+ #sidebar_header.no_nodes .has_nodes {
147
148
  display: none;
148
149
  }
149
- #workers {
150
+ #nodes {
150
151
  position: absolute;
151
152
  padding: 2px 0;
152
153
  top: 21px; left: 0; bottom: 21px;
153
154
  width: 298px;
154
155
  overflow-y: auto; overflow-x: hidden;
155
156
  }
156
- #workers .worker {
157
+ #nodes .node, #nodes .worker {
157
158
  border: 1px solid transparent;
158
159
  margin: 1px 7px;
159
160
  padding-left: 18px;
160
- font-size: 11px;
161
- line-height: 22px;
162
- background: url(/images/bullet_white.png) no-repeat left center;
163
- cursor: pointer;
161
+ background-position: left center;
162
+ background-repeat: no-repeat;
164
163
  }
165
- #workers .worker.processing,
166
- #workers .worker.splitting,
167
- #workers .worker.merging {
168
- background: url(/images/bullet_green.png) no-repeat left center;
164
+ #nodes .node {
165
+ font-size: 11px;
166
+ line-height: 22px;
167
+ background-image: url(/images/server.png);
169
168
  }
170
- #workers .worker:hover {
171
- border: 1px solid #aaa;
172
- border-radius: 4px; -moz-border-radius: 4px; -webkit-border-radius: 4px;
173
- background-color: #ccc;
169
+ #nodes .node.busy {
170
+ background-image: url(/images/server_busy.png);
171
+ }
172
+ #nodes .node.busy span.busy {
173
+ font-size: 9px;
174
+ color: #7f7f7f;
175
+ text-transform: uppercase;
176
+ }
177
+ #nodes .worker {
178
+ font-size: 10px;
179
+ line-height: 18px;
180
+ cursor: pointer;
181
+ background-image: url(/images/bullet_green.png);
174
182
  }
183
+ #nodes .worker:hover {
184
+ border: 1px solid #aaa;
185
+ border-radius: 4px; -moz-border-radius: 4px; -webkit-border-radius: 4px;
186
+ background-color: #ccc;
187
+ }
175
188
 
176
189
  #worker_info {
177
190
  position: absolute;
@@ -216,6 +229,15 @@ body {
216
229
  text-shadow: 0px 1px 1px #eee;
217
230
  margin-bottom: 10px;
218
231
  }
232
+ .legend_box {
233
+ display: inline-block;
234
+ width: 10px; height: 10px;
235
+ border: 1px solid #bbb;
236
+ position: relative;
237
+ top: 1px;
238
+ margin: 0 1px;
239
+ background-color: #a1003d;
240
+ }
219
241
  .graph {
220
242
  height: 150px;
221
243
  }
Binary file
@@ -22,26 +22,30 @@ window.Console = {
22
22
 
23
23
  // All options for drawing the system graphs.
24
24
  GRAPH_OPTIONS : {
25
- xaxis : {mode : 'time', timeformat : '%M:%S'},
26
- yaxis : {tickDecimals : 0},
27
- legend : {show : false},
28
- grid : {backgroundColor : '#7f7f7f', color : '#555', tickColor : '#666', borderWidth : 2}
25
+ xaxis : {mode : 'time', timeformat : '%M:%S'},
26
+ yaxis : {tickDecimals : 0},
27
+ legend : {show : false},
28
+ grid : {backgroundColor : '#7f7f7f', color : '#555', tickColor : '#666', borderWidth : 2}
29
29
  },
30
30
  JOBS_COLOR : '#db3a0f',
31
- WORKERS_COLOR : '#a1003d',
31
+ NODES_COLOR : '#1870ab',
32
+ WORKERS_COLOR : '#45a4e5',
32
33
  WORK_UNITS_COLOR : '#ffba14',
33
34
 
34
35
  // Starting the console begins polling the server.
35
36
  initialize : function() {
36
37
  this._jobsHistory = [];
38
+ this._nodesHistory = [];
37
39
  this._workersHistory = [];
38
40
  this._workUnitsHistory = [];
39
- this._histories = [this._jobsHistory, this._workersHistory, this._workUnitsHistory];
41
+ this._histories = [this._jobsHistory, this._nodesHistory, this._workersHistory, this._workUnitsHistory];
40
42
  this._queue = $('#jobs');
41
43
  this._workerInfo = $('#worker_info');
42
44
  this._disconnected = $('#disconnected');
43
45
  $(window).bind('resize', Console.renderGraphs);
44
- $('#workers .worker').live('click', Console.getWorkerInfo);
46
+ $('#nodes .worker').live('click', Console.getWorkerInfo);
47
+ $('#workers_legend').css({background : this.WORKERS_COLOR});
48
+ $('#nodes_legend').css({background : this.NODES_COLOR});
45
49
  this.getStatus();
46
50
  $.each(this.PRELOAD_IMAGES, function(){ var i = new Image(); i.src = this; });
47
51
  },
@@ -51,13 +55,14 @@ window.Console = {
51
55
  getStatus : function() {
52
56
  $.ajax({url : '/status', dataType : 'json', success : function(resp) {
53
57
  Console._jobs = resp.jobs;
54
- Console._workers = resp.workers;
58
+ Console._nodes = resp.nodes;
55
59
  Console._workUnitCount = resp.work_unit_count;
60
+ Console._workerCount = Console.countWorkers();
56
61
  Console.recordDataPoint();
57
62
  if (Console._disconnected.is(':visible')) Console._disconnected.fadeOut(Console.ANIMATION_SPEED);
58
63
  $('#queue').toggleClass('no_jobs', Console._jobs.length <= 0);
59
64
  Console.renderJobs();
60
- Console.renderWorkers();
65
+ Console.renderNodes();
61
66
  Console.renderGraphs();
62
67
  setTimeout(Console.getStatus, Console.POLL_INTERVAL);
63
68
  }, error : function(request, status, errorThrown) {
@@ -66,6 +71,13 @@ window.Console = {
66
71
  }});
67
72
  },
68
73
 
74
+ // Count the total number of workers in the current list of nodes.
75
+ countWorkers : function() {
76
+ var sum = 0;
77
+ for (var i=0; i < this._nodes.length; i++) sum += this._nodes[i].workers.length;
78
+ return sum;
79
+ },
80
+
69
81
  // Render an individual job afresh.
70
82
  renderJob : function(job) {
71
83
  this._queue.append('<div class="job" id="job_' + job.id + '" style="width:' + job.width + '%; background: #' + job.color + ';"><div class="completion ' + (job.percent_complete <= 0 ? 'zero' : '') + '" style="width:' + job.percent_complete + '%;"></div><div class="percent_complete">' + job.percent_complete + '%</div><div class="job_id">#' + job.id + '</div></div>');
@@ -105,12 +117,21 @@ window.Console = {
105
117
  },
106
118
 
107
119
  // Re-render all nodes, and the workers under each node, from scratch each time.
108
- renderWorkers : function() {
120
+ // This method is desperately in need of Javascript templates...
121
+ renderNodes : function() {
109
122
  var header = $('#sidebar_header');
110
- $('.has_workers', header).html(this._workers.length + " Active Worker Daemons");
111
- header.toggleClass('no_workers', this._workers.length <= 0);
112
- $('#workers').html($.map(this._workers, function(w) {
113
- return '<div class="worker ' + w.status + '" rel="' + w.name + '">' + w.name + '</div>';
123
+ var nc = this._nodes.length, wc = this._workerCount;
124
+ $('.has_nodes', header).html(nc + " Node" + (nc != 1 ? 's' : '') + " / " + wc + " Worker" + (wc != 1 ? 's' : ''));
125
+ header.toggleClass('no_nodes', this._nodes.length <= 0);
126
+ $('#nodes').html($.map(this._nodes, function(node) {
127
+ var html = "";
128
+ var extra = node.status == 'busy' ? ' <span class="busy">[busy]</span>' : '';
129
+ html += '<div class="node ' + node.status + '">' + node.host + extra + '</div>';
130
+ html += $.map(node.workers, function(pid) {
131
+ var name = pid + '@' + node.host;
132
+ return '<div class="worker" rel="' + name + '">' + name + '</div>';
133
+ }).join('');
134
+ return html;
114
135
  }).join(''));
115
136
  },
116
137
 
@@ -118,7 +139,8 @@ window.Console = {
118
139
  recordDataPoint : function() {
119
140
  var timestamp = (new Date()).getTime();
120
141
  this._jobsHistory.push([timestamp, this._jobs.length]);
121
- this._workersHistory.push([timestamp, this._workers.length]);
142
+ this._nodesHistory.push([timestamp, this._nodes.length]);
143
+ this._workersHistory.push([timestamp, this._workerCount]);
122
144
  this._workUnitsHistory.push([timestamp, this._workUnitCount]);
123
145
  $.each(this._histories, function() {
124
146
  if (this.length > Console.MAX_DATA_POINTS) this.shift();
@@ -127,9 +149,16 @@ window.Console = {
127
149
 
128
150
  // Convert our recorded data points into a format Flot can understand.
129
151
  renderGraphs : function() {
130
- $.plot($('#jobs_graph'), [{label : 'Jobs in Queue', color : Console.JOBS_COLOR, data : Console._jobsHistory}], Console.GRAPH_OPTIONS);
131
- $.plot($('#workers_graph'), [{label : 'Active Workers', color : Console.WORKERS_COLOR, data : Console._workersHistory}], Console.GRAPH_OPTIONS);
132
- $.plot($('#work_units_graph'), [{label : 'Work Units in Queue', color : Console.WORK_UNITS_COLOR, data : Console._workUnitsHistory}], Console.GRAPH_OPTIONS);
152
+ $.plot($('#work_units_graph'), [
153
+ {label : 'Work Units in Queue', color : Console.WORK_UNITS_COLOR, data : Console._workUnitsHistory}
154
+ ], Console.GRAPH_OPTIONS);
155
+ $.plot($('#jobs_graph'), [
156
+ {label : 'Jobs in Queue', color : Console.JOBS_COLOR, data : Console._jobsHistory}
157
+ ], Console.GRAPH_OPTIONS);
158
+ $.plot($('#workers_graph'), [
159
+ {label : 'Nodes', color : Console.NODES_COLOR, data : Console._nodesHistory},
160
+ {label : 'Workers', color : Console.WORKERS_COLOR, data : Console._workersHistory}
161
+ ], Console.GRAPH_OPTIONS);
133
162
  },
134
163
 
135
164
  // Request the Worker info from the central server.