cloud-crowd 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,36 +6,36 @@ module CloudCrowd
6
6
  # are each run as a single WorkUnit.
7
7
  class WorkUnit < ActiveRecord::Base
8
8
  include ModelStatus
9
-
9
+
10
10
  # We use a random number in (0...MAX_RESERVATION) to reserve work units.
11
11
  # The size of the maximum signed integer in MySQL -- SQLite has no limit.
12
12
  MAX_RESERVATION = 2147483647
13
-
13
+
14
14
  # We only reserve a certain number of WorkUnits in a single go, to avoid
15
15
  # reserving the entire table.
16
16
  RESERVATION_LIMIT = 25
17
-
17
+
18
18
  belongs_to :job
19
19
  belongs_to :node_record
20
-
20
+
21
21
  validates_presence_of :job_id, :status, :input, :action
22
-
22
+
23
23
  # Available WorkUnits are waiting to be distributed to Nodes for processing.
24
24
  named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
25
25
  # Reserved WorkUnits have been marked for distribution by a central server process.
26
- named_scope :reserved, lambda {|reservation|
26
+ named_scope :reserved, lambda {|reservation|
27
27
  {:conditions => {:reservation => reservation}, :order => 'updated_at asc'}
28
28
  }
29
-
29
+
30
30
  # Attempt to send a list of WorkUnits to nodes with available capacity.
31
31
  # A single central server process stops the same WorkUnit from being
32
32
  # distributed to multiple nodes by reserving it first. The algorithm used
33
33
  # should be lock-free.
34
34
  #
35
35
  # We reserve WorkUnits for this process in chunks of RESERVATION_LIMIT size,
36
- # and try to match them to Nodes that are capable of handling the Action.
37
- # WorkUnits get removed from the availability list when they are
38
- # successfully sent, and Nodes get removed when they are busy or have the
36
+ # and try to match them to Nodes that are capable of handling the Action.
37
+ # WorkUnits get removed from the availability list when they are
38
+ # successfully sent, and Nodes get removed when they are busy or have the
39
39
  # action in question disabled.
40
40
  def self.distribute_to_nodes
41
41
  reservation = nil
@@ -44,11 +44,13 @@ module CloudCrowd
44
44
  work_units = WorkUnit.reserved(reservation)
45
45
  available_nodes = NodeRecord.available
46
46
  while node = available_nodes.shift and unit = work_units.shift do
47
- if node.actions.include? unit.action
47
+ if node.actions.include?(unit.action)
48
48
  if node.send_work_unit(unit)
49
49
  available_nodes.push(node) unless node.busy?
50
50
  next
51
51
  end
52
+ else
53
+ unit.cancel_reservation
52
54
  end
53
55
  work_units.push(unit)
54
56
  end
@@ -57,26 +59,26 @@ module CloudCrowd
57
59
  ensure
58
60
  WorkUnit.cancel_reservations(reservation) if reservation
59
61
  end
60
-
61
- # Reserves all available WorkUnits for this process. Returns false if there
62
+
63
+ # Reserves all available WorkUnits for this process. Returns false if there
62
64
  # were none available.
63
65
  def self.reserve_available(options={})
64
66
  reservation = ActiveSupport::SecureRandom.random_number(MAX_RESERVATION)
65
67
  any = WorkUnit.available.update_all("reservation = #{reservation}", nil, options) > 0
66
68
  any && reservation
67
69
  end
68
-
70
+
69
71
  # Cancels all outstanding WorkUnit reservations for this process.
70
72
  def self.cancel_reservations(reservation)
71
73
  WorkUnit.reserved(reservation).update_all('reservation = null')
72
74
  end
73
-
75
+
74
76
  # Cancels all outstanding WorkUnit reservations for all processes. (Useful
75
77
  # in the console for debugging.)
76
78
  def self.cancel_all_reservations
77
79
  WorkUnit.update_all('reservation = null')
78
80
  end
79
-
81
+
80
82
  # Look up a WorkUnit by the worker that's currently processing it. Specified
81
83
  # by <tt>pid@host</tt>.
82
84
  def self.find_by_worker_name(name)
@@ -84,16 +86,16 @@ module CloudCrowd
84
86
  node = NodeRecord.find_by_host(host)
85
87
  node && node.work_units.find_by_worker_pid(pid)
86
88
  end
87
-
89
+
88
90
  # Convenience method for starting a new WorkUnit.
89
91
  def self.start(job, action, input, status)
90
92
  input = input.to_json unless input.is_a? String
91
93
  self.create(:job => job, :action => action, :input => input, :status => status)
92
94
  end
93
-
95
+
94
96
  # Mark this unit as having finished successfully.
95
- # Splitting work units are handled differently (an optimization) -- they
96
- # immediately fire off all of their resulting WorkUnits for processing,
97
+ # Splitting work units are handled differently (an optimization) -- they
98
+ # immediately fire off all of their resulting WorkUnits for processing,
97
99
  # without waiting for the rest of their splitting cousins to complete.
98
100
  def finish(result, time_taken)
99
101
  if splitting?
@@ -114,7 +116,7 @@ module CloudCrowd
114
116
  job && job.check_for_completion
115
117
  end
116
118
  end
117
-
119
+
118
120
  # Mark this unit as having failed. May attempt a retry.
119
121
  def fail(output, time_taken)
120
122
  tries = self.attempts + 1
@@ -129,7 +131,7 @@ module CloudCrowd
129
131
  })
130
132
  job && job.check_for_completion
131
133
  end
132
-
134
+
133
135
  # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
134
136
  def try_again
135
137
  update_attributes({
@@ -138,20 +140,25 @@ module CloudCrowd
138
140
  :attempts => self.attempts + 1
139
141
  })
140
142
  end
141
-
143
+
144
+ # If the node can't process the unit, cancel its reservation.
145
+ def cancel_reservation
146
+ update_attributes!(:reservation => nil)
147
+ end
148
+
142
149
  # When a Node checks out a WorkUnit, establish the connection between
143
150
  # WorkUnit and NodeRecord and record the worker_pid.
144
151
  def assign_to(node_record, worker_pid)
145
152
  update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
146
153
  end
147
-
148
- # All output needs to be wrapped in a JSON object for consistency
149
- # (unfortunately, JSON.parse needs the top-level to be an object or array).
154
+
155
+ # All output needs to be wrapped in a JSON object for consistency
156
+ # (unfortunately, JSON.parse needs the top-level to be an object or array).
150
157
  # Convenience method to provide the parsed version.
151
158
  def parsed_output(out = self.output)
152
159
  JSON.parse(out)['output']
153
160
  end
154
-
161
+
155
162
  # The JSON representation of a WorkUnit shares the Job's options with all
156
163
  # its cousin WorkUnits.
157
164
  def to_json
@@ -165,6 +172,6 @@ module CloudCrowd
165
172
  'status' => self.status
166
173
  }.to_json
167
174
  end
168
-
175
+
169
176
  end
170
177
  end
@@ -1,57 +1,57 @@
1
1
  module CloudCrowd
2
-
2
+
3
3
  # A Node is a Sinatra/Thin application that runs a single instance per-machine
4
- # It registers with the central server, receives WorkUnits, and forks off
4
+ # It registers with the central server, receives WorkUnits, and forks off
5
5
  # Workers to process them. The actions are:
6
6
  #
7
7
  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
8
8
  # [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
9
- class Node < Sinatra::Default
10
-
9
+ class Node < Sinatra::Base
10
+
11
11
  # A Node's default port. You only run a single node per machine, so they
12
12
  # can all use the same port without any problems.
13
13
  DEFAULT_PORT = 9063
14
-
15
- # A list of regex scrapers, which let us extract the one-minute load
14
+
15
+ # A list of regex scrapers, which let us extract the one-minute load
16
16
  # average and the amount of free memory on different flavors of UNIX.
17
-
17
+
18
18
  SCRAPE_UPTIME = /\d+\.\d+/
19
19
  SCRAPE_LINUX_MEMORY = /MemFree:\s+(\d+) kB/
20
- SCRAPE_MAC_MEMORY = /Pages free:\s+(\d+)./
20
+ SCRAPE_MAC_MEMORY = /Pages free:\s+(\d+)./
21
21
  SCRAPE_MAC_PAGE = /page size of (\d+) bytes/
22
-
22
+
23
23
  # The interval at which the node monitors the machine's load and memory use
24
24
  # (if configured to do so in config.yml).
25
25
  MONITOR_INTERVAL = 3
26
-
26
+
27
27
  # The interval at which the node regularly checks in with central (5 min).
28
28
  CHECK_IN_INTERVAL = 300
29
-
29
+
30
30
  # The response sent back when this node is overloaded.
31
31
  OVERLOADED_MESSAGE = 'Node Overloaded'
32
-
32
+
33
33
  attr_reader :enabled_actions, :host, :port, :central
34
-
34
+
35
35
  set :root, ROOT
36
36
  set :authorization_realm, "CloudCrowd"
37
-
37
+
38
38
  helpers Helpers
39
-
39
+
40
40
  # methodoverride allows the _method param.
41
41
  enable :methodoverride
42
-
42
+
43
43
  # Enabling HTTP Authentication turns it on for all requests.
44
44
  # This works the same way as in the central CloudCrowd::Server.
45
45
  before do
46
46
  login_required if CloudCrowd.config[:http_authentication]
47
47
  end
48
-
49
- # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
48
+
49
+ # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
50
50
  # /heartbeat to make sure it's still online.
51
51
  get '/heartbeat' do
52
52
  "buh-bump"
53
53
  end
54
-
54
+
55
55
  # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
56
56
  # Returns a 503 if this Node is overloaded.
57
57
  post '/work' do
@@ -61,14 +61,14 @@ module CloudCrowd
61
61
  Process.detach(pid)
62
62
  json :pid => pid
63
63
  end
64
-
64
+
65
65
  # When creating a node, specify the port it should run on.
66
66
  def initialize(port=nil, daemon=false)
67
67
  require 'json'
68
68
  CloudCrowd.identity = :node
69
69
  @central = CloudCrowd.central_server
70
70
  @host = Socket.gethostname
71
- @enabled_actions = CloudCrowd.actions.keys
71
+ @enabled_actions = CloudCrowd.actions.keys - (CloudCrowd.config[:disabled_actions] || [])
72
72
  @port = port || DEFAULT_PORT
73
73
  @daemon = daemon
74
74
  @overloaded = false
@@ -76,7 +76,7 @@ module CloudCrowd
76
76
  @min_memory = CloudCrowd.config[:min_free_memory]
77
77
  start unless test?
78
78
  end
79
-
79
+
80
80
  # Starting up a Node registers with the central server and begins to listen
81
81
  # for incoming WorkUnits.
82
82
  def start
@@ -94,9 +94,9 @@ module CloudCrowd
94
94
  monitor_system if @max_load || @min_memory
95
95
  @server_thread.join
96
96
  end
97
-
98
- # Checking in with the central server informs it of the location and
99
- # configuration of this Node. If it can't check-in, there's no point in
97
+
98
+ # Checking in with the central server informs it of the location and
99
+ # configuration of this Node. If it can't check-in, there's no point in
100
100
  # starting.
101
101
  def check_in(critical=false)
102
102
  @central["/node/#{@host}"].put(
@@ -109,31 +109,31 @@ module CloudCrowd
109
109
  puts "Failed to connect to the central server (#{@central.to_s})."
110
110
  raise SystemExit if critical
111
111
  end
112
-
112
+
113
113
  # Before exiting, the Node checks out with the central server, releasing all
114
114
  # of its WorkUnits for other Nodes to handle
115
115
  def check_out
116
116
  @central["/node/#{@host}"].delete
117
117
  end
118
-
118
+
119
119
  # Lazy-initialize the asset_store, preferably after the Node has launched.
120
120
  def asset_store
121
121
  @asset_store ||= AssetStore.new
122
122
  end
123
-
124
- # Is the node overloaded? If configured, checks if the load average is
123
+
124
+ # Is the node overloaded? If configured, checks if the load average is
125
125
  # greater than 'max_load', or if the available RAM is less than
126
126
  # 'min_free_memory'.
127
127
  def overloaded?
128
128
  (@max_load && load_average > @max_load) ||
129
129
  (@min_memory && free_memory < @min_memory)
130
130
  end
131
-
131
+
132
132
  # The current one-minute load average.
133
133
  def load_average
134
134
  `uptime`.match(SCRAPE_UPTIME).to_s.to_f
135
135
  end
136
-
136
+
137
137
  # The current amount of free memory in megabytes.
138
138
  def free_memory
139
139
  case RUBY_PLATFORM
@@ -147,12 +147,12 @@ module CloudCrowd
147
147
  raise NotImplementedError, "'min_free_memory' is not yet implemented on your platform"
148
148
  end
149
149
  end
150
-
151
-
150
+
151
+
152
152
  private
153
-
154
- # Launch a monitoring thread that periodically checks the node's load
155
- # average and the amount of free memory remaining. If we transition out of
153
+
154
+ # Launch a monitoring thread that periodically checks the node's load
155
+ # average and the amount of free memory remaining. If we transition out of
156
156
  # the overloaded state, let central know.
157
157
  def monitor_system
158
158
  @monitor_thread = Thread.new do
@@ -164,9 +164,9 @@ module CloudCrowd
164
164
  end
165
165
  end
166
166
  end
167
-
168
- # If communication is interrupted for external reasons, the central server
169
- # will assume that the node has gone down. Checking in will let central know
167
+
168
+ # If communication is interrupted for external reasons, the central server
169
+ # will assume that the node has gone down. Checking in will let central know
170
170
  # it's still online.
171
171
  def check_in_periodically
172
172
  @check_in_thread = Thread.new do
@@ -176,7 +176,7 @@ module CloudCrowd
176
176
  end
177
177
  end
178
178
  end
179
-
179
+
180
180
  # Trap exit signals in order to shut down cleanly.
181
181
  def trap_signals
182
182
  Signal.trap('QUIT') { shut_down }
@@ -184,7 +184,7 @@ module CloudCrowd
184
184
  Signal.trap('KILL') { shut_down }
185
185
  Signal.trap('TERM') { shut_down }
186
186
  end
187
-
187
+
188
188
  # At shut down, de-register with the central server before exiting.
189
189
  def shut_down
190
190
  @check_in_thread.kill if @check_in_thread
@@ -193,7 +193,7 @@ module CloudCrowd
193
193
  @server_thread.kill if @server_thread
194
194
  Process.exit
195
195
  end
196
-
196
+
197
197
  end
198
-
198
+
199
199
  end
@@ -13,7 +13,7 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
13
13
  t.datetime "created_at"
14
14
  t.datetime "updated_at"
15
15
  end
16
-
16
+
17
17
  create_table "node_records", :force => true do |t|
18
18
  t.string "host", :null => false
19
19
  t.string "ip_address", :null => false
@@ -41,10 +41,10 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
41
41
  end
42
42
 
43
43
  # Here be indices. After looking, it seems faster not to have them at all.
44
- #
45
- # add_index "jobs", ["status"], :name => "index_jobs_on_status"
46
- # add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
47
- # add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
48
- # add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
49
- # add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
44
+ #
45
+ add_index "jobs", ["status"], :name => "index_jobs_on_status"
46
+ add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
47
+ add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
48
+ add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
49
+ add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
50
50
  end
@@ -1,5 +1,5 @@
1
1
  module CloudCrowd
2
-
2
+
3
3
  # The main CloudCrowd (Sinatra) application. The actions are:
4
4
  #
5
5
  # == Admin
@@ -7,60 +7,65 @@ module CloudCrowd
7
7
  # [get /status] Get the combined JSON of every active job and worker.
8
8
  # [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
9
9
  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
10
- #
10
+ #
11
11
  # == Public API
12
12
  # [post /jobs] Begin a new Job. Post with a JSON representation of the job-to-be. (see examples).
13
13
  # [get /jobs/:job_id] Check the status of a Job. Response includes output, if the Job has finished.
14
14
  # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
15
15
  #
16
16
  # == Internal Workers API
17
- # [puts /node/:host] Registers a new Node, making it available for processing.
17
+ # [put /node/:host] Registers a new Node, making it available for processing.
18
18
  # [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
19
19
  # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
20
- class Server < Sinatra::Default
21
-
20
+ class Server < Sinatra::Base
21
+
22
22
  set :root, ROOT
23
23
  set :authorization_realm, "CloudCrowd"
24
-
24
+
25
25
  helpers Helpers
26
-
26
+
27
27
  # static serves files from /public, methodoverride allows the _method param.
28
28
  enable :static, :methodoverride
29
-
29
+
30
30
  # Enabling HTTP Authentication turns it on for all requests.
31
31
  before do
32
32
  login_required if CloudCrowd.config[:http_authentication]
33
33
  end
34
-
34
+
35
35
  # Render the admin console.
36
36
  get '/' do
37
37
  erb :operations_center
38
38
  end
39
-
39
+
40
40
  # Get the JSON for every active job in the queue and every active worker
41
41
  # in the system. This action may get a little worrisome as the system grows
42
42
  # larger -- keep it in mind.
43
43
  get '/status' do
44
44
  json(
45
- 'jobs' => Job.incomplete,
46
45
  'nodes' => NodeRecord.all(:order => 'host desc'),
46
+ 'job_count' => Job.incomplete.count,
47
47
  'work_unit_count' => WorkUnit.incomplete.count
48
48
  )
49
49
  end
50
-
50
+
51
+ # Get the last 100 lines of log messages.
52
+ get '/log' do
53
+ `tail -n 100 #{CloudCrowd.log_path('server.log')}`
54
+ end
55
+
51
56
  # Get the JSON for what a worker is up to.
52
57
  get '/worker/:name' do
53
58
  json WorkUnit.find_by_worker_name(params[:name]) || {}
54
59
  end
55
-
56
- # To monitor the central server with Monit, God, Nagios, or another
60
+
61
+ # To monitor the central server with Monit, God, Nagios, or another
57
62
  # monitoring tool, you can hit /heartbeat to make sure.
58
63
  get '/heartbeat' do
59
64
  "buh-bump"
60
65
  end
61
-
66
+
62
67
  # PUBLIC API:
63
-
68
+
64
69
  # Start a new job. Accepts a JSON representation of the job-to-be.
65
70
  # Distributes all work units to available nodes.
66
71
  post '/jobs' do
@@ -68,37 +73,37 @@ module CloudCrowd
68
73
  WorkUnit.distribute_to_nodes
69
74
  json job
70
75
  end
71
-
76
+
72
77
  # Check the status of a job, returning the output if finished, and the
73
- # number of work units remaining otherwise.
78
+ # number of work units remaining otherwise.
74
79
  get '/jobs/:job_id' do
75
80
  json current_job
76
81
  end
77
-
78
- # Cleans up a Job's saved S3 files. Delete a Job after you're done
82
+
83
+ # Cleans up a Job's saved S3 files. Delete a Job after you're done
79
84
  # downloading the results.
80
85
  delete '/jobs/:job_id' do
81
86
  current_job.destroy
82
87
  json nil
83
88
  end
84
-
89
+
85
90
  # INTERNAL NODE API:
86
-
87
- # A new Node will this this action to register its location and
88
- # configuration with the central server. Triggers distribution of WorkUnits.
91
+
92
+ # A new Node will hit this action to register its location and
93
+ # configuration with the central server. Triggers distribution of WorkUnits.
89
94
  put '/node/:host' do
90
95
  NodeRecord.check_in(params, request)
91
96
  WorkUnit.distribute_to_nodes
92
97
  json nil
93
98
  end
94
-
95
- # Deregisters a Node from the central server. Releases and redistributes any
99
+
100
+ # Deregisters a Node from the central server. Releases and redistributes any
96
101
  # WorkUnits it may have had checked out.
97
102
  delete '/node/:host' do
98
103
  NodeRecord.destroy_all(:host => params[:host])
99
104
  json nil
100
105
  end
101
-
106
+
102
107
  # When workers are done with their unit, either successfully on in failure,
103
108
  # they mark it back on the central server and exit. Triggers distribution
104
109
  # of pending work units.
@@ -111,13 +116,13 @@ module CloudCrowd
111
116
  WorkUnit.distribute_to_nodes
112
117
  json nil
113
118
  end
114
-
119
+
115
120
  # At initialization record the identity of this Ruby instance as a server.
116
121
  def initialize(*args)
117
122
  super(*args)
118
123
  CloudCrowd.identity = :server
119
124
  end
120
-
125
+
121
126
  end
122
-
127
+
123
128
  end