cloud-crowd 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,36 +6,36 @@ module CloudCrowd
6
6
  # are each run as a single WorkUnit.
7
7
  class WorkUnit < ActiveRecord::Base
8
8
  include ModelStatus
9
-
9
+
10
10
  # We use a random number in (0...MAX_RESERVATION) to reserve work units.
11
11
  # The size of the maximum signed integer in MySQL -- SQLite has no limit.
12
12
  MAX_RESERVATION = 2147483647
13
-
13
+
14
14
  # We only reserve a certain number of WorkUnits in a single go, to avoid
15
15
  # reserving the entire table.
16
16
  RESERVATION_LIMIT = 25
17
-
17
+
18
18
  belongs_to :job
19
19
  belongs_to :node_record
20
-
20
+
21
21
  validates_presence_of :job_id, :status, :input, :action
22
-
22
+
23
23
  # Available WorkUnits are waiting to be distributed to Nodes for processing.
24
24
  named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
25
25
  # Reserved WorkUnits have been marked for distribution by a central server process.
26
- named_scope :reserved, lambda {|reservation|
26
+ named_scope :reserved, lambda {|reservation|
27
27
  {:conditions => {:reservation => reservation}, :order => 'updated_at asc'}
28
28
  }
29
-
29
+
30
30
  # Attempt to send a list of WorkUnits to nodes with available capacity.
31
31
  # A single central server process stops the same WorkUnit from being
32
32
  # distributed to multiple nodes by reserving it first. The algorithm used
33
33
  # should be lock-free.
34
34
  #
35
35
  # We reserve WorkUnits for this process in chunks of RESERVATION_LIMIT size,
36
- # and try to match them to Nodes that are capable of handling the Action.
37
- # WorkUnits get removed from the availability list when they are
38
- # successfully sent, and Nodes get removed when they are busy or have the
36
+ # and try to match them to Nodes that are capable of handling the Action.
37
+ # WorkUnits get removed from the availability list when they are
38
+ # successfully sent, and Nodes get removed when they are busy or have the
39
39
  # action in question disabled.
40
40
  def self.distribute_to_nodes
41
41
  reservation = nil
@@ -44,11 +44,13 @@ module CloudCrowd
44
44
  work_units = WorkUnit.reserved(reservation)
45
45
  available_nodes = NodeRecord.available
46
46
  while node = available_nodes.shift and unit = work_units.shift do
47
- if node.actions.include? unit.action
47
+ if node.actions.include?(unit.action)
48
48
  if node.send_work_unit(unit)
49
49
  available_nodes.push(node) unless node.busy?
50
50
  next
51
51
  end
52
+ else
53
+ unit.cancel_reservation
52
54
  end
53
55
  work_units.push(unit)
54
56
  end
@@ -57,26 +59,26 @@ module CloudCrowd
57
59
  ensure
58
60
  WorkUnit.cancel_reservations(reservation) if reservation
59
61
  end
60
-
61
- # Reserves all available WorkUnits for this process. Returns false if there
62
+
63
+ # Reserves all available WorkUnits for this process. Returns false if there
62
64
  # were none available.
63
65
  def self.reserve_available(options={})
64
66
  reservation = ActiveSupport::SecureRandom.random_number(MAX_RESERVATION)
65
67
  any = WorkUnit.available.update_all("reservation = #{reservation}", nil, options) > 0
66
68
  any && reservation
67
69
  end
68
-
70
+
69
71
  # Cancels all outstanding WorkUnit reservations for this process.
70
72
  def self.cancel_reservations(reservation)
71
73
  WorkUnit.reserved(reservation).update_all('reservation = null')
72
74
  end
73
-
75
+
74
76
  # Cancels all outstanding WorkUnit reservations for all processes. (Useful
75
77
  # in the console for debugging.)
76
78
  def self.cancel_all_reservations
77
79
  WorkUnit.update_all('reservation = null')
78
80
  end
79
-
81
+
80
82
  # Look up a WorkUnit by the worker that's currently processing it. Specified
81
83
  # by <tt>pid@host</tt>.
82
84
  def self.find_by_worker_name(name)
@@ -84,16 +86,16 @@ module CloudCrowd
84
86
  node = NodeRecord.find_by_host(host)
85
87
  node && node.work_units.find_by_worker_pid(pid)
86
88
  end
87
-
89
+
88
90
  # Convenience method for starting a new WorkUnit.
89
91
  def self.start(job, action, input, status)
90
92
  input = input.to_json unless input.is_a? String
91
93
  self.create(:job => job, :action => action, :input => input, :status => status)
92
94
  end
93
-
95
+
94
96
  # Mark this unit as having finished successfully.
95
- # Splitting work units are handled differently (an optimization) -- they
96
- # immediately fire off all of their resulting WorkUnits for processing,
97
+ # Splitting work units are handled differently (an optimization) -- they
98
+ # immediately fire off all of their resulting WorkUnits for processing,
97
99
  # without waiting for the rest of their splitting cousins to complete.
98
100
  def finish(result, time_taken)
99
101
  if splitting?
@@ -114,7 +116,7 @@ module CloudCrowd
114
116
  job && job.check_for_completion
115
117
  end
116
118
  end
117
-
119
+
118
120
  # Mark this unit as having failed. May attempt a retry.
119
121
  def fail(output, time_taken)
120
122
  tries = self.attempts + 1
@@ -129,7 +131,7 @@ module CloudCrowd
129
131
  })
130
132
  job && job.check_for_completion
131
133
  end
132
-
134
+
133
135
  # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
134
136
  def try_again
135
137
  update_attributes({
@@ -138,20 +140,25 @@ module CloudCrowd
138
140
  :attempts => self.attempts + 1
139
141
  })
140
142
  end
141
-
143
+
144
+ # If the node can't process the unit, cancel its reservation.
145
+ def cancel_reservation
146
+ update_attributes!(:reservation => nil)
147
+ end
148
+
142
149
  # When a Node checks out a WorkUnit, establish the connection between
143
150
  # WorkUnit and NodeRecord and record the worker_pid.
144
151
  def assign_to(node_record, worker_pid)
145
152
  update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
146
153
  end
147
-
148
- # All output needs to be wrapped in a JSON object for consistency
149
- # (unfortunately, JSON.parse needs the top-level to be an object or array).
154
+
155
+ # All output needs to be wrapped in a JSON object for consistency
156
+ # (unfortunately, JSON.parse needs the top-level to be an object or array).
150
157
  # Convenience method to provide the parsed version.
151
158
  def parsed_output(out = self.output)
152
159
  JSON.parse(out)['output']
153
160
  end
154
-
161
+
155
162
  # The JSON representation of a WorkUnit shares the Job's options with all
156
163
  # its cousin WorkUnits.
157
164
  def to_json
@@ -165,6 +172,6 @@ module CloudCrowd
165
172
  'status' => self.status
166
173
  }.to_json
167
174
  end
168
-
175
+
169
176
  end
170
177
  end
@@ -1,57 +1,57 @@
1
1
  module CloudCrowd
2
-
2
+
3
3
  # A Node is a Sinatra/Thin application that runs a single instance per-machine
4
- # It registers with the central server, receives WorkUnits, and forks off
4
+ # It registers with the central server, receives WorkUnits, and forks off
5
5
  # Workers to process them. The actions are:
6
6
  #
7
7
  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
8
8
  # [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
9
- class Node < Sinatra::Default
10
-
9
+ class Node < Sinatra::Base
10
+
11
11
  # A Node's default port. You only run a single node per machine, so they
12
12
  # can all use the same port without any problems.
13
13
  DEFAULT_PORT = 9063
14
-
15
- # A list of regex scrapers, which let us extract the one-minute load
14
+
15
+ # A list of regex scrapers, which let us extract the one-minute load
16
16
  # average and the amount of free memory on different flavors of UNIX.
17
-
17
+
18
18
  SCRAPE_UPTIME = /\d+\.\d+/
19
19
  SCRAPE_LINUX_MEMORY = /MemFree:\s+(\d+) kB/
20
- SCRAPE_MAC_MEMORY = /Pages free:\s+(\d+)./
20
+ SCRAPE_MAC_MEMORY = /Pages free:\s+(\d+)./
21
21
  SCRAPE_MAC_PAGE = /page size of (\d+) bytes/
22
-
22
+
23
23
  # The interval at which the node monitors the machine's load and memory use
24
24
  # (if configured to do so in config.yml).
25
25
  MONITOR_INTERVAL = 3
26
-
26
+
27
27
  # The interval at which the node regularly checks in with central (5 min).
28
28
  CHECK_IN_INTERVAL = 300
29
-
29
+
30
30
  # The response sent back when this node is overloaded.
31
31
  OVERLOADED_MESSAGE = 'Node Overloaded'
32
-
32
+
33
33
  attr_reader :enabled_actions, :host, :port, :central
34
-
34
+
35
35
  set :root, ROOT
36
36
  set :authorization_realm, "CloudCrowd"
37
-
37
+
38
38
  helpers Helpers
39
-
39
+
40
40
  # methodoverride allows the _method param.
41
41
  enable :methodoverride
42
-
42
+
43
43
  # Enabling HTTP Authentication turns it on for all requests.
44
44
  # This works the same way as in the central CloudCrowd::Server.
45
45
  before do
46
46
  login_required if CloudCrowd.config[:http_authentication]
47
47
  end
48
-
49
- # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
48
+
49
+ # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
50
50
  # /heartbeat to make sure it's still online.
51
51
  get '/heartbeat' do
52
52
  "buh-bump"
53
53
  end
54
-
54
+
55
55
  # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
56
56
  # Returns a 503 if this Node is overloaded.
57
57
  post '/work' do
@@ -61,14 +61,14 @@ module CloudCrowd
61
61
  Process.detach(pid)
62
62
  json :pid => pid
63
63
  end
64
-
64
+
65
65
  # When creating a node, specify the port it should run on.
66
66
  def initialize(port=nil, daemon=false)
67
67
  require 'json'
68
68
  CloudCrowd.identity = :node
69
69
  @central = CloudCrowd.central_server
70
70
  @host = Socket.gethostname
71
- @enabled_actions = CloudCrowd.actions.keys
71
+ @enabled_actions = CloudCrowd.actions.keys - (CloudCrowd.config[:disabled_actions] || [])
72
72
  @port = port || DEFAULT_PORT
73
73
  @daemon = daemon
74
74
  @overloaded = false
@@ -76,7 +76,7 @@ module CloudCrowd
76
76
  @min_memory = CloudCrowd.config[:min_free_memory]
77
77
  start unless test?
78
78
  end
79
-
79
+
80
80
  # Starting up a Node registers with the central server and begins to listen
81
81
  # for incoming WorkUnits.
82
82
  def start
@@ -94,9 +94,9 @@ module CloudCrowd
94
94
  monitor_system if @max_load || @min_memory
95
95
  @server_thread.join
96
96
  end
97
-
98
- # Checking in with the central server informs it of the location and
99
- # configuration of this Node. If it can't check-in, there's no point in
97
+
98
+ # Checking in with the central server informs it of the location and
99
+ # configuration of this Node. If it can't check-in, there's no point in
100
100
  # starting.
101
101
  def check_in(critical=false)
102
102
  @central["/node/#{@host}"].put(
@@ -109,31 +109,31 @@ module CloudCrowd
109
109
  puts "Failed to connect to the central server (#{@central.to_s})."
110
110
  raise SystemExit if critical
111
111
  end
112
-
112
+
113
113
  # Before exiting, the Node checks out with the central server, releasing all
114
114
  # of its WorkUnits for other Nodes to handle
115
115
  def check_out
116
116
  @central["/node/#{@host}"].delete
117
117
  end
118
-
118
+
119
119
  # Lazy-initialize the asset_store, preferably after the Node has launched.
120
120
  def asset_store
121
121
  @asset_store ||= AssetStore.new
122
122
  end
123
-
124
- # Is the node overloaded? If configured, checks if the load average is
123
+
124
+ # Is the node overloaded? If configured, checks if the load average is
125
125
  # greater than 'max_load', or if the available RAM is less than
126
126
  # 'min_free_memory'.
127
127
  def overloaded?
128
128
  (@max_load && load_average > @max_load) ||
129
129
  (@min_memory && free_memory < @min_memory)
130
130
  end
131
-
131
+
132
132
  # The current one-minute load average.
133
133
  def load_average
134
134
  `uptime`.match(SCRAPE_UPTIME).to_s.to_f
135
135
  end
136
-
136
+
137
137
  # The current amount of free memory in megabytes.
138
138
  def free_memory
139
139
  case RUBY_PLATFORM
@@ -147,12 +147,12 @@ module CloudCrowd
147
147
  raise NotImplementedError, "'min_free_memory' is not yet implemented on your platform"
148
148
  end
149
149
  end
150
-
151
-
150
+
151
+
152
152
  private
153
-
154
- # Launch a monitoring thread that periodically checks the node's load
155
- # average and the amount of free memory remaining. If we transition out of
153
+
154
+ # Launch a monitoring thread that periodically checks the node's load
155
+ # average and the amount of free memory remaining. If we transition out of
156
156
  # the overloaded state, let central know.
157
157
  def monitor_system
158
158
  @monitor_thread = Thread.new do
@@ -164,9 +164,9 @@ module CloudCrowd
164
164
  end
165
165
  end
166
166
  end
167
-
168
- # If communication is interrupted for external reasons, the central server
169
- # will assume that the node has gone down. Checking in will let central know
167
+
168
+ # If communication is interrupted for external reasons, the central server
169
+ # will assume that the node has gone down. Checking in will let central know
170
170
  # it's still online.
171
171
  def check_in_periodically
172
172
  @check_in_thread = Thread.new do
@@ -176,7 +176,7 @@ module CloudCrowd
176
176
  end
177
177
  end
178
178
  end
179
-
179
+
180
180
  # Trap exit signals in order to shut down cleanly.
181
181
  def trap_signals
182
182
  Signal.trap('QUIT') { shut_down }
@@ -184,7 +184,7 @@ module CloudCrowd
184
184
  Signal.trap('KILL') { shut_down }
185
185
  Signal.trap('TERM') { shut_down }
186
186
  end
187
-
187
+
188
188
  # At shut down, de-register with the central server before exiting.
189
189
  def shut_down
190
190
  @check_in_thread.kill if @check_in_thread
@@ -193,7 +193,7 @@ module CloudCrowd
193
193
  @server_thread.kill if @server_thread
194
194
  Process.exit
195
195
  end
196
-
196
+
197
197
  end
198
-
198
+
199
199
  end
@@ -13,7 +13,7 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
13
13
  t.datetime "created_at"
14
14
  t.datetime "updated_at"
15
15
  end
16
-
16
+
17
17
  create_table "node_records", :force => true do |t|
18
18
  t.string "host", :null => false
19
19
  t.string "ip_address", :null => false
@@ -41,10 +41,10 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
41
41
  end
42
42
 
43
43
  # Here be indices. After looking, it seems faster not to have them at all.
44
- #
45
- # add_index "jobs", ["status"], :name => "index_jobs_on_status"
46
- # add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
47
- # add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
48
- # add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
49
- # add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
44
+ #
45
+ add_index "jobs", ["status"], :name => "index_jobs_on_status"
46
+ add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
47
+ add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
48
+ add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
49
+ add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
50
50
  end
@@ -1,5 +1,5 @@
1
1
  module CloudCrowd
2
-
2
+
3
3
  # The main CloudCrowd (Sinatra) application. The actions are:
4
4
  #
5
5
  # == Admin
@@ -7,60 +7,65 @@ module CloudCrowd
7
7
  # [get /status] Get the combined JSON of every active job and worker.
8
8
  # [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
9
9
  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
10
- #
10
+ #
11
11
  # == Public API
12
12
  # [post /jobs] Begin a new Job. Post with a JSON representation of the job-to-be. (see examples).
13
13
  # [get /jobs/:job_id] Check the status of a Job. Response includes output, if the Job has finished.
14
14
  # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
15
15
  #
16
16
  # == Internal Workers API
17
- # [puts /node/:host] Registers a new Node, making it available for processing.
17
+ # [put /node/:host] Registers a new Node, making it available for processing.
18
18
  # [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
19
19
  # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
20
- class Server < Sinatra::Default
21
-
20
+ class Server < Sinatra::Base
21
+
22
22
  set :root, ROOT
23
23
  set :authorization_realm, "CloudCrowd"
24
-
24
+
25
25
  helpers Helpers
26
-
26
+
27
27
  # static serves files from /public, methodoverride allows the _method param.
28
28
  enable :static, :methodoverride
29
-
29
+
30
30
  # Enabling HTTP Authentication turns it on for all requests.
31
31
  before do
32
32
  login_required if CloudCrowd.config[:http_authentication]
33
33
  end
34
-
34
+
35
35
  # Render the admin console.
36
36
  get '/' do
37
37
  erb :operations_center
38
38
  end
39
-
39
+
40
40
  # Get the JSON for every active job in the queue and every active worker
41
41
  # in the system. This action may get a little worrisome as the system grows
42
42
  # larger -- keep it in mind.
43
43
  get '/status' do
44
44
  json(
45
- 'jobs' => Job.incomplete,
46
45
  'nodes' => NodeRecord.all(:order => 'host desc'),
46
+ 'job_count' => Job.incomplete.count,
47
47
  'work_unit_count' => WorkUnit.incomplete.count
48
48
  )
49
49
  end
50
-
50
+
51
+ # Get the last 100 lines of log messages.
52
+ get '/log' do
53
+ `tail -n 100 #{CloudCrowd.log_path('server.log')}`
54
+ end
55
+
51
56
  # Get the JSON for what a worker is up to.
52
57
  get '/worker/:name' do
53
58
  json WorkUnit.find_by_worker_name(params[:name]) || {}
54
59
  end
55
-
56
- # To monitor the central server with Monit, God, Nagios, or another
60
+
61
+ # To monitor the central server with Monit, God, Nagios, or another
57
62
  # monitoring tool, you can hit /heartbeat to make sure it's up.
58
63
  get '/heartbeat' do
59
64
  "buh-bump"
60
65
  end
61
-
66
+
62
67
  # PUBLIC API:
63
-
68
+
64
69
  # Start a new job. Accepts a JSON representation of the job-to-be.
65
70
  # Distributes all work units to available nodes.
66
71
  post '/jobs' do
@@ -68,37 +73,37 @@ module CloudCrowd
68
73
  WorkUnit.distribute_to_nodes
69
74
  json job
70
75
  end
71
-
76
+
72
77
  # Check the status of a job, returning the output if finished, and the
73
- # number of work units remaining otherwise.
78
+ # number of work units remaining otherwise.
74
79
  get '/jobs/:job_id' do
75
80
  json current_job
76
81
  end
77
-
78
- # Cleans up a Job's saved S3 files. Delete a Job after you're done
82
+
83
+ # Cleans up a Job's saved S3 files. Delete a Job after you're done
79
84
  # downloading the results.
80
85
  delete '/jobs/:job_id' do
81
86
  current_job.destroy
82
87
  json nil
83
88
  end
84
-
89
+
85
90
  # INTERNAL NODE API:
86
-
87
- # A new Node will hit this action to register its location and
88
- # configuration with the central server. Triggers distribution of WorkUnits.
91
+
92
+ # A new Node will hit this action to register its location and
93
+ # configuration with the central server. Triggers distribution of WorkUnits.
89
94
  put '/node/:host' do
90
95
  NodeRecord.check_in(params, request)
91
96
  WorkUnit.distribute_to_nodes
92
97
  json nil
93
98
  end
94
-
95
- # Deregisters a Node from the central server. Releases and redistributes any
99
+
100
+ # Deregisters a Node from the central server. Releases and redistributes any
96
101
  # WorkUnits it may have had checked out.
97
102
  delete '/node/:host' do
98
103
  NodeRecord.destroy_all(:host => params[:host])
99
104
  json nil
100
105
  end
101
-
106
+
102
107
  # When workers are done with their unit, either successfully or in failure,
103
108
  # they mark it back on the central server and exit. Triggers distribution
104
109
  # of pending work units.
@@ -111,13 +116,13 @@ module CloudCrowd
111
116
  WorkUnit.distribute_to_nodes
112
117
  json nil
113
118
  end
114
-
119
+
115
120
  # At initialization record the identity of this Ruby instance as a server.
116
121
  def initialize(*args)
117
122
  super(*args)
118
123
  CloudCrowd.identity = :server
119
124
  end
120
-
125
+
121
126
  end
122
-
127
+
123
128
  end