documentcloud-cloud-crowd 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@ module CloudCrowd
25
25
  # turned on, then every request is authenticated, including between
26
26
  # the nodes and the central server.
27
27
  def authorize(login, password)
28
- return true unless CloudCrowd.config[:use_http_authentication]
28
+ return true unless CloudCrowd.config[:http_authentication]
29
29
  return CloudCrowd.config[:login] == login &&
30
30
  CloudCrowd.config[:password] == password
31
31
  end
@@ -33,10 +33,12 @@ module CloudCrowd
33
33
 
34
34
  private
35
35
 
36
+ # Provide a Rack Authorization object.
36
37
  def auth
37
38
  @auth ||= Rack::Auth::Basic::Request.new(request.env)
38
39
  end
39
40
 
41
+ # Unauthorized requests will prompt the browser to provide credentials.
40
42
  def unauthorized!(realm = Server.authorization_realm)
41
43
  response['WWW-Authenticate'] = "Basic realm=\"#{realm}\""
42
44
  halt 401, 'Authorization Required'
@@ -33,12 +33,10 @@ module CloudCrowd
33
33
  return unless all_work_units_complete?
34
34
  set_next_status
35
35
  outs = gather_outputs_from_work_units
36
- update_attributes(:outputs => outs.to_json, :time => time_taken) if complete?
37
-
38
- case self.status
39
- when PROCESSING then queue_for_workers(outs.map {|o| JSON.parse(o) }.flatten)
40
- when MERGING then queue_for_workers(outs.to_json)
41
- else fire_callback
36
+ return queue_for_workers(outs) if merging?
37
+ if complete?
38
+ update_attributes(:outputs => outs, :time => time_taken)
39
+ fire_callback if callback_url
42
40
  end
43
41
  self
44
42
  end
@@ -60,7 +58,6 @@ module CloudCrowd
60
58
  # If the callback_url is successfully pinged, we proceed to cleanup the job.
61
59
  # TODO: This should be moved into a Work Unit...
62
60
  def fire_callback
63
- return unless callback_url
64
61
  begin
65
62
  RestClient.post(callback_url, {:job => self.to_json})
66
63
  self.destroy
@@ -91,6 +88,11 @@ module CloudCrowd
91
88
  self.action_class.public_instance_methods.include? 'split'
92
89
  end
93
90
 
91
+ # This job is done splitting if it's finished with its splitting work units.
92
+ def done_splitting?
93
+ splittable? && work_units.splitting.count <= 0
94
+ end
95
+
94
96
  # This job is mergeable if its Action has a +merge+ method.
95
97
  def mergeable?
96
98
  self.processing? && self.action_class.public_instance_methods.include?('merge')
@@ -98,14 +100,16 @@ module CloudCrowd
98
100
 
99
101
  # Retrieve the class for this Job's Action.
100
102
  def action_class
101
- klass = CloudCrowd.actions[self.action]
102
- return klass if klass
103
+ @action_class ||= CloudCrowd.actions[self.action]
104
+ return @action_class if @action_class
103
105
  raise Error::ActionNotFound, "no action named: '#{self.action}' could be found"
104
106
  end
105
107
 
106
108
  # How complete is this Job?
107
109
  # Unfortunately, with the current processing sequence, the percent_complete
108
- # can pull a fast one and go backwards.
110
+ # can pull a fast one and go backwards. This happens when there's a single
111
+ # large input that takes a long time to split, and when it finally does it
112
+ # creates a whole swarm of work units. This seems unavoidable.
109
113
  def percent_complete
110
114
  return 99 if merging?
111
115
  return 100 if complete?
@@ -143,12 +147,12 @@ module CloudCrowd
143
147
  private
144
148
 
145
149
  # When the WorkUnits are all finished, gather all their outputs together
146
- # before removing them from the database entirely.
150
+ # before removing them from the database entirely. Returns their merged JSON.
147
151
  def gather_outputs_from_work_units
148
152
  units = self.work_units.complete
149
- outs = self.work_units.complete.map {|u| JSON.parse(u.output)['output'] }
153
+ outs = self.work_units.complete.map {|u| u.parsed_output }
150
154
  self.work_units.complete.destroy_all
151
- outs
155
+ outs.to_json
152
156
  end
153
157
 
154
158
  # When starting a new job, or moving to a new stage, split up the inputs
@@ -156,14 +160,8 @@ module CloudCrowd
156
160
  # away.
157
161
  def queue_for_workers(input=nil)
158
162
  input ||= JSON.parse(self.inputs)
159
- [input].flatten.map do |wu_input|
160
- WorkUnit.create(
161
- :job => self,
162
- :action => self.action,
163
- :input => wu_input,
164
- :status => self.status
165
- )
166
- end
163
+ [input].flatten.each {|i| WorkUnit.start(self, action, i, status) }
164
+ self
167
165
  end
168
166
 
169
167
  # A Job starts out either splitting or processing, depending on its action.
@@ -1,7 +1,8 @@
1
1
  module CloudCrowd
2
2
 
3
- # A NodeRecord is the record of a Node running remotely. We can use it to
4
- # assign work units to the node, and keep track of its status.
3
+ # A NodeRecord is the central server's record of a Node running remotely. We
4
+ # can use it to assign WorkUnits to the Node, and keep track of its status.
5
+ # When a Node exits, it destroys this record.
5
6
  class NodeRecord < ActiveRecord::Base
6
7
 
7
8
  has_many :work_units
@@ -16,18 +17,21 @@ module CloudCrowd
16
17
  :order => 'updated_at asc'
17
18
  }
18
19
 
19
- # Save a Node's current status to the database.
20
+ # Register a Node with the central server. Currently this only happens at
21
+ # Node startup.
20
22
  def self.check_in(params, request)
21
23
  attrs = {
22
24
  :ip_address => request.ip,
23
25
  :port => params[:port],
24
26
  :max_workers => params[:max_workers],
25
- :enabled_actions => params[:enabled_actions],
26
- :updated_at => Time.now
27
+ :enabled_actions => params[:enabled_actions]
27
28
  }
28
29
  self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
29
30
  end
30
31
 
32
+ # Dispatch a WorkUnit to this node. Places the node back at the end of
33
+ # the rotation. If we fail to send the WorkUnit, we consider the node to be
34
+ # down, and remove this record, freeing up all of its checked-out work units.
31
35
  def send_work_unit(unit)
32
36
  result = node['/work'].post(:work_unit => unit.to_json)
33
37
  unit.assign_to(self, JSON.parse(result)['pid'])
@@ -36,45 +40,55 @@ module CloudCrowd
36
40
  self.destroy # Couldn't post to node, assume it's gone away.
37
41
  end
38
42
 
43
+ # What Actions is this Node able to run?
39
44
  def actions
40
45
  enabled_actions.split(',')
41
46
  end
42
47
 
48
+ # Is this Node too busy for more work? (Determined by number of workers.)
43
49
  def busy?
44
50
  max_workers && work_units.count >= max_workers
45
51
  end
46
52
 
53
+ # The URL at which this Node may be reached.
54
+ # TODO: Make sure that the host actually has externally accessible DNS.
47
55
  def url
48
56
  @url ||= "http://#{host}:#{port}"
49
57
  end
50
58
 
59
+ # Keep a RestClient::Resource handy for contacting the Node, including
60
+ # HTTP authentication, if configured.
51
61
  def node
52
- return @node if @node
53
- params = [url]
54
- params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
55
- @node = RestClient::Resource.new(*params)
62
+ @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
56
63
  end
57
64
 
65
+ # The printable status of the Node.
58
66
  def display_status
59
67
  busy? ? 'busy' : 'available'
60
68
  end
61
69
 
70
+ # A list of the process ids of the workers currently being run by the Node.
62
71
  def worker_pids
63
72
  work_units.all(:select => 'worker_pid').map(&:worker_pid)
64
73
  end
65
74
 
75
+ # The JSON representation of a NodeRecord includes its worker_pids.
66
76
  def to_json(opts={})
67
77
  { 'host' => host,
68
78
  'workers' => worker_pids,
69
- 'status' => display_status,
79
+ 'status' => display_status
70
80
  }.to_json
71
81
  end
72
82
 
73
83
 
74
84
  private
75
85
 
86
+ # When a Node shuts down, we free up all of the WorkUnits that it had
87
+ # reserved, and they become available for others to pick up. Redistribute
88
+ # the WorkUnits in a separate thread to avoid delaying Node shutdown.
76
89
  def clear_work_units
77
90
  WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
91
+ Thread.new { WorkUnit.distribute_to_nodes }
78
92
  end
79
93
 
80
94
  end
@@ -11,15 +11,16 @@ module CloudCrowd
11
11
  belongs_to :node_record
12
12
 
13
13
  validates_presence_of :job_id, :status, :input, :action
14
-
15
- named_scope :taken, {:conditions => ["worker_pid is not null"]}
16
- named_scope :available, {:conditions => {:worker_pid => nil, :status => INCOMPLETE}}
17
- named_scope :reserved, {:conditions => {:worker_pid => 0}}
14
+
15
+ # Available WorkUnits are waiting to be distributed to Nodes for processing.
16
+ named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
17
+ # Reserved WorkUnits have been marked for distribution by a central server process.
18
+ named_scope :reserved, {:conditions => {:reservation => $$}, :order => 'updated_at asc'}
18
19
 
19
20
  # Attempt to send a list of work_units to nodes with available capacity.
20
- # Do this in a separate thread so that the request can return, satisfied.
21
- # A single application server process stops the same WorkUnit from being
22
- # distributed to multiple nodes by reserving all the available ones.
21
+ # A single central server process stops the same WorkUnit from being
22
+ # distributed to multiple nodes by reserving it first. The algorithm used
23
+ # should be lock-free.
23
24
  def self.distribute_to_nodes
24
25
  return unless WorkUnit.reserve_available
25
26
  work_units = WorkUnit.reserved
@@ -35,46 +36,52 @@ module CloudCrowd
35
36
  available_nodes.push(node) unless node.busy?
36
37
  end
37
38
  end
39
+ ensure
38
40
  WorkUnit.cancel_reservations
39
41
  end
40
42
 
41
- # Reserves all available WorkUnits. Returns false if there were none
42
- # available.
43
+ # Reserves all available WorkUnits for this process. Returns false if there
44
+ # were none available.
43
45
  def self.reserve_available
44
- WorkUnit.available.update_all('worker_pid = 0') > 0
46
+ WorkUnit.available.update_all("reservation = #{$$}") > 0
45
47
  end
46
48
 
49
+ # Cancels all outstanding WorkUnit reservations for this process.
47
50
  def self.cancel_reservations
48
- WorkUnit.reserved.update_all('worker_pid = null')
51
+ WorkUnit.reserved.update_all('reservation = null')
49
52
  end
50
53
 
54
+ # Look up a WorkUnit by the worker that's currently processing it. Specified
55
+ # by <tt>pid@host</tt>.
51
56
  def self.find_by_worker_name(name)
52
57
  pid, host = name.split('@')
53
58
  node = NodeRecord.find_by_host(host)
54
59
  node && node.work_units.find_by_worker_pid(pid)
55
60
  end
56
61
 
62
+ # Convenience method for starting a new WorkUnit.
63
+ def self.start(job, action, input, status)
64
+ self.create(:job => job, :action => action, :input => input, :status => status)
65
+ end
66
+
57
67
  # Mark this unit as having finished successfully.
58
- # TODO: Refactor alongside check_for_completion ... look into doubleparse.
59
- def finish(output, time_taken)
68
+ # Splitting work units are handled differently (an optimization) -- they
69
+ # immediately fire off all of their resulting WorkUnits for processing,
70
+ # without waiting for the rest of their splitting cousins to complete.
71
+ def finish(result, time_taken)
60
72
  if splitting?
61
- [JSON.parse(JSON.parse(output)['output'])].flatten.each do |wu_input|
62
- WorkUnit.create(
63
- :job => job,
64
- :action => action,
65
- :input => wu_input,
66
- :status => PROCESSING
67
- )
73
+ [JSON.parse(parsed_output(result))].flatten.each do |new_input|
74
+ WorkUnit.start(job, action, new_input, PROCESSING)
68
75
  end
69
76
  self.destroy
70
- job.set_next_status if job.work_units.splitting.count <= 0
77
+ job.set_next_status if job.done_splitting?
71
78
  else
72
79
  update_attributes({
73
80
  :status => SUCCEEDED,
74
81
  :node_record => nil,
75
82
  :worker_pid => nil,
76
83
  :attempts => attempts + 1,
77
- :output => output,
84
+ :output => result,
78
85
  :time => time_taken
79
86
  })
80
87
  job.check_for_completion
@@ -105,14 +112,21 @@ module CloudCrowd
105
112
  })
106
113
  end
107
114
 
108
- # When a Worker checks out a WorkUnit, establish the connection between
109
- # WorkUnit and NodeRecord.
115
+ # When a Node checks out a WorkUnit, establish the connection between
116
+ # WorkUnit and NodeRecord and record the worker_pid.
110
117
  def assign_to(node_record, worker_pid)
111
118
  update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
112
119
  end
113
120
 
121
+ # All output needs to be wrapped in a JSON object for consistency
122
+ # (unfortunately, JSON.parse needs the top-level to be an object or array).
123
+ # Convenience method to provide the parsed version.
124
+ def parsed_output(out = self.output)
125
+ JSON.parse(out)['output']
126
+ end
127
+
114
128
  # The JSON representation of a WorkUnit shares the Job's options with all
115
- # its sister WorkUnits.
129
+ # its cousin WorkUnits.
116
130
  def to_json
117
131
  {
118
132
  'id' => self.id,
@@ -1,9 +1,15 @@
1
1
  module CloudCrowd
2
2
 
3
+ # A Node is a Sinatra/Thin application that runs a single instance per-machine.
4
+ # It registers with the central server, receives WorkUnits, and forks off
5
+ # Workers to process them. The actions are:
6
+ #
7
+ # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
8
+ # [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
3
9
  class Node < Sinatra::Default
4
10
 
5
11
  # A Node's default port. You only run a single node per machine, so they
6
- # can all use the same port without problems.
12
+ # can all use the same port without any problems.
7
13
  DEFAULT_PORT = 9063
8
14
 
9
15
  attr_reader :server, :asset_store
@@ -17,22 +23,26 @@ module CloudCrowd
17
23
  enable :methodoverride
18
24
 
19
25
  # Enabling HTTP Authentication turns it on for all requests.
26
+ # This works the same way as in the central CloudCrowd::Server.
20
27
  before do
21
- login_required if CloudCrowd.config[:use_http_authentication]
28
+ login_required if CloudCrowd.config[:http_authentication]
22
29
  end
23
30
 
24
31
  # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
25
- # /heartbeat to make sure its still up.
32
+ # /heartbeat to make sure it's still online.
26
33
  get '/heartbeat' do
27
34
  "buh-bump"
28
35
  end
29
36
 
37
+ # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
30
38
  post '/work' do
31
39
  pid = fork { Worker.new(self, JSON.parse(params[:work_unit])) }
32
40
  Process.detach(pid)
33
41
  json :pid => pid
34
42
  end
35
43
 
44
+ # Creating a Node registers with the central server and starts listening for
45
+ # incoming WorkUnits.
36
46
  def initialize(port=DEFAULT_PORT)
37
47
  require 'json'
38
48
  @server = CloudCrowd.central_server
@@ -47,6 +57,9 @@ module CloudCrowd
47
57
  @server_thread.join
48
58
  end
49
59
 
60
+ # Checking in with the central server informs it of the location and
61
+ # configuration of this Node. If it can't check-in, there's no point in
62
+ # starting.
50
63
  def check_in
51
64
  @server["/node/#{@host}"].put(
52
65
  :port => @port,
@@ -58,25 +71,30 @@ module CloudCrowd
58
71
  raise SystemExit
59
72
  end
60
73
 
74
+ # Before exiting, the Node checks out with the central server, releasing all
75
+ # of its WorkUnits for other Nodes to handle.
61
76
  def check_out
62
77
  @server["/node/#{@host}"].delete
63
78
  end
64
79
 
80
+
81
+ private
82
+
83
+ # Launch the Node's Thin server in a separate thread because it blocks.
65
84
  def start_server
66
85
  @server_thread = Thread.new do
67
86
  Thin::Server.start('0.0.0.0', @port, self, :signals => false)
68
87
  end
69
88
  end
70
89
 
71
-
72
- private
73
-
90
+ # Trap exit signals in order to shut down cleanly.
74
91
  def trap_signals
75
92
  Signal.trap('INT') { shut_down }
76
93
  Signal.trap('KILL') { shut_down }
77
94
  Signal.trap('TERM') { shut_down }
78
95
  end
79
96
 
97
+ # At shut down, de-register with the central server before exiting.
80
98
  def shut_down
81
99
  check_out
82
100
  Process.exit
@@ -1,5 +1,5 @@
1
1
  # Complete schema for CloudCrowd.
2
- ActiveRecord::Schema.define(:version => 1) do
2
+ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
3
3
 
4
4
  create_table "jobs", :force => true do |t|
5
5
  t.integer "status", :null => false
@@ -29,9 +29,10 @@ ActiveRecord::Schema.define(:version => 1) do
29
29
  t.integer "job_id", :null => false
30
30
  t.text "input", :null => false
31
31
  t.string "action", :null => false
32
- t.integer "attempts", :default => 0, :null => false
32
+ t.integer "attempts", :default => 0, :null => false
33
33
  t.integer "node_record_id"
34
34
  t.integer "worker_pid"
35
+ t.integer "reservation"
35
36
  t.float "time"
36
37
  t.text "output"
37
38
  t.datetime "created_at"
@@ -5,6 +5,7 @@ module CloudCrowd
5
5
  # == Admin
6
6
  # [get /] Render the admin console, with a progress meter for running jobs.
7
7
  # [get /status] Get the combined JSON of every active job and worker.
8
+ # [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
8
9
  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
9
10
  #
10
11
  # == Public API
@@ -13,9 +14,9 @@ module CloudCrowd
13
14
  # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
14
15
  #
15
16
  # == Internal Workers API
16
- # [post /work] Dequeue the next WorkUnit, and hand it off to the worker.
17
+ # [put /node/:host] Registers a new Node, making it available for processing.
18
+ # [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
17
19
  # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
18
- # [put /worker] Keep a record of an actively running worker.
19
20
  class Server < Sinatra::Default
20
21
 
21
22
  set :root, ROOT
@@ -28,12 +29,12 @@ module CloudCrowd
28
29
 
29
30
  # Enabling HTTP Authentication turns it on for all requests.
30
31
  before do
31
- login_required if CloudCrowd.config[:use_http_authentication]
32
+ login_required if CloudCrowd.config[:http_authentication]
32
33
  end
33
34
 
34
35
  # Render the admin console.
35
36
  get '/' do
36
- erb :index
37
+ erb :operations_center
37
38
  end
38
39
 
39
40
  # Get the JSON for every active job in the queue and every active worker
@@ -83,12 +84,16 @@ module CloudCrowd
83
84
 
84
85
  # INTERNAL NODE API:
85
86
 
87
+ # A new Node will hit this action to register its location and
88
+ # configuration with the central server. Triggers distribution of WorkUnits.
86
89
  put '/node/:host' do
87
90
  NodeRecord.check_in(params, request)
88
91
  WorkUnit.distribute_to_nodes
89
92
  json nil
90
93
  end
91
94
 
95
+ # Deregisters a Node from the central server. Releases and redistributes any
96
+ # WorkUnits it may have had checked out.
92
97
  delete '/node/:host' do
93
98
  NodeRecord.destroy_all(:host => params[:host])
94
99
  json nil