RubyGems - documentcloud-cloud-crowd - Versions diffs - 0.1.1 → 0.2.0 - Mend

documentcloud-cloud-crowd 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

data/README +16 -16
data/cloud-crowd.gemspec +4 -3
data/config/config.example.yml +17 -12
data/lib/cloud-crowd.rb +42 -24
data/lib/cloud_crowd/action.rb +6 -4
data/lib/cloud_crowd/asset_store.rb +7 -7
data/lib/cloud_crowd/asset_store/filesystem_store.rb +15 -9
data/lib/cloud_crowd/asset_store/s3_store.rb +10 -11
data/lib/cloud_crowd/command_line.rb +12 -7
data/lib/cloud_crowd/exceptions.rb +7 -4
data/lib/cloud_crowd/helpers/authorization.rb +3 -1
data/lib/cloud_crowd/models/job.rb +19 -21
data/lib/cloud_crowd/models/node_record.rb +24 -10
data/lib/cloud_crowd/models/work_unit.rb +39 -25
data/lib/cloud_crowd/node.rb +24 -6
data/lib/cloud_crowd/schema.rb +3 -2
data/lib/cloud_crowd/server.rb +9 -4
data/lib/cloud_crowd/worker.rb +33 -48
data/public/css/admin_console.css +17 -7
data/public/images/server_busy.png +0 -0
data/public/js/admin_console.js +3 -1
data/test/config/config.yml +1 -1
data/test/unit/test_action.rb +1 -1
data/test/unit/test_job.rb +2 -0
data/views/{index.erb → operations_center.erb} +5 -5
metadata +4 -3

data/lib/cloud_crowd/helpers/authorization.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module CloudCrowd
       # turned on, then every request is authenticated, including between
       # the nodes and the central server.
       def authorize(login, password)
-        return true unless CloudCrowd.config[:use_http_authentication]
+        return true unless CloudCrowd.config[:http_authentication]
         return CloudCrowd.config[:login] == login &&
                CloudCrowd.config[:password] == password
       end
@@ -33,10 +33,12 @@ module CloudCrowd
       private
+      # Provide a Rack Authorization object.
       def auth
         @auth ||= Rack::Auth::Basic::Request.new(request.env)
       end
+      # Unauthorized requests will prompt the browser to provide credentials.
       def unauthorized!(realm = Server.authorization_realm)
         response['WWW-Authenticate'] = "Basic realm=\"#{realm}\""
         halt 401, 'Authorization Required'

data/lib/cloud_crowd/models/job.rb CHANGED Viewed

@@ -33,12 +33,10 @@ module CloudCrowd
       return unless all_work_units_complete?
       set_next_status
       outs = gather_outputs_from_work_units
-      update_attributes(:outputs => outs.to_json, :time => time_taken) if complete?
-      case self.status
-      when PROCESSING then queue_for_workers(outs.map {|o| JSON.parse(o) }.flatten)
-      when MERGING    then queue_for_workers(outs.to_json)
-      else                 fire_callback
+      return queue_for_workers(outs) if merging?
+      if complete?
+        update_attributes(:outputs => outs, :time => time_taken)
+        fire_callback if callback_url
       end
       self
     end
@@ -60,7 +58,6 @@ module CloudCrowd
     # If the callback_url is successfully pinged, we proceed to cleanup the job.
     # TODO: This should be moved into a Work Unit...
     def fire_callback
-      return unless callback_url
       begin
         RestClient.post(callback_url, {:job => self.to_json})
         self.destroy
@@ -91,6 +88,11 @@ module CloudCrowd
       self.action_class.public_instance_methods.include? 'split'
     end
+    # This job is done splitting if it's finished with its splitting work units.
+    def done_splitting?
+      splittable? && work_units.splitting.count <= 0
+    end
     # This job is mergeable if its Action has a +merge+ method.
     def mergeable?
       self.processing? && self.action_class.public_instance_methods.include?('merge')
@@ -98,14 +100,16 @@ module CloudCrowd
     # Retrieve the class for this Job's Action.
     def action_class
-      klass = CloudCrowd.actions[self.action]
-      return klass if klass
+      @action_class ||= CloudCrowd.actions[self.action]
+      return @action_class if @action_class
       raise Error::ActionNotFound, "no action named: '#{self.action}' could be found"
     end
     # How complete is this Job?
     # Unfortunately, with the current processing sequence, the percent_complete
-    # can pull a fast one and go backwards.
+    # can pull a fast one and go backwards. This happens when there's a single
+    # large input that takes a long time to split, and when it finally does it
+    # creates a whole swarm of work units. This seems unavoidable.
     def percent_complete
       return 99  if merging?
       return 100 if complete?
@@ -143,12 +147,12 @@ module CloudCrowd
     private
     # When the WorkUnits are all finished, gather all their outputs together
-    # before removing them from the database entirely.
+    # before removing them from the database entirely. Returns their merged JSON.
     def gather_outputs_from_work_units
       units = self.work_units.complete
-      outs = self.work_units.complete.map {|u| JSON.parse(u.output)['output'] }
+      outs = self.work_units.complete.map {|u| u.parsed_output }
       self.work_units.complete.destroy_all
-      outs
+      outs.to_json
     end
     # When starting a new job, or moving to a new stage, split up the inputs
@@ -156,14 +160,8 @@ module CloudCrowd
     # away.
     def queue_for_workers(input=nil)
       input ||= JSON.parse(self.inputs)
-      [input].flatten.map do |wu_input|
-        WorkUnit.create(
-          :job    => self,
-          :action => self.action,
-          :input  => wu_input,
-          :status => self.status
-        )
-      end
+      [input].flatten.each {|i| WorkUnit.start(self, action, i, status) }
+      self
     end
     # A Job starts out either splitting or processing, depending on its action.

data/lib/cloud_crowd/models/node_record.rb CHANGED Viewed

@@ -1,7 +1,8 @@
 module CloudCrowd
-  # A NodeRecord is the record of a Node running remotely. We can use it to
-  # assign work units to the node, and keep track of its status.
+  # A NodeRecord is the central server's record of a Node running remotely. We
+  # can use it to assign WorkUnits to the Node, and keep track of its status.
+  # When a Node exits, it destroys this record.
   class NodeRecord < ActiveRecord::Base
     has_many :work_units
@@ -16,18 +17,21 @@ module CloudCrowd
       :order      => 'updated_at asc'
     }
-    # Save a Node's current status to the database.
+    # Register a Node with the central server. Currently this only happens at
+    # Node startup.
     def self.check_in(params, request)
       attrs = {
         :ip_address       => request.ip,
         :port             => params[:port],
         :max_workers      => params[:max_workers],
-        :enabled_actions  => params[:enabled_actions],
-        :updated_at       => Time.now
+        :enabled_actions  => params[:enabled_actions]
       }
       self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
     end
+    # Dispatch a WorkUnit to this node. Places the node back at the end of
+    # the rotation. If we fail to send the WorkUnit, we consider the node to be
+    # down, and remove this record, freeing up all of its checked-out work units.
     def send_work_unit(unit)
       result = node['/work'].post(:work_unit => unit.to_json)
       unit.assign_to(self, JSON.parse(result)['pid'])
@@ -36,45 +40,55 @@ module CloudCrowd
       self.destroy # Couldn't post to node, assume it's gone away.
     end
+    # What Actions is this Node able to run?
     def actions
       enabled_actions.split(',')
     end
+    # Is this Node too busy for more work? (Determined by number of workers.)
     def busy?
       max_workers && work_units.count >= max_workers
     end
+    # The URL at which this Node may be reached.
+    # TODO: Make sure that the host actually has externally accessible DNS.
     def url
       @url ||= "http://#{host}:#{port}"
     end
+    # Keep a RestClient::Resource handy for contacting the Node, including
+    # HTTP authentication, if configured.
     def node
-      return @node if @node
-      params = [url]
-      params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
-      @node = RestClient::Resource.new(*params)
+      @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
     end
+    # The printable status of the Node.
     def display_status
       busy? ? 'busy' : 'available'
     end
+    # A list of the process ids of the workers currently being run by the Node.
     def worker_pids
       work_units.all(:select => 'worker_pid').map(&:worker_pid)
     end
+    # The JSON representation of a NodeRecord includes its worker_pids.
     def to_json(opts={})
       { 'host'    => host,
         'workers' => worker_pids,
-        'status'  => display_status,
+        'status'  => display_status
       }.to_json
     end
     private
+    # When a Node shuts down, we free up all of the WorkUnits that it had
+    # reserved, and they become available for others to pick up. Redistribute
+    # the WorkUnits in a separate thread to avoid delaying Node shutdown.
     def clear_work_units
       WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
+      Thread.new { WorkUnit.distribute_to_nodes }
     end
   end

data/lib/cloud_crowd/models/work_unit.rb CHANGED Viewed

@@ -11,15 +11,16 @@ module CloudCrowd
     belongs_to :node_record
     validates_presence_of :job_id, :status, :input, :action
-    named_scope :taken,     {:conditions => ["worker_pid is not null"]}
-    named_scope :available, {:conditions => {:worker_pid => nil, :status => INCOMPLETE}}
-    named_scope :reserved,  {:conditions => {:worker_pid => 0}}
+    # Available WorkUnits are waiting to be distributed to Nodes for processing.
+    named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
+    # Reserved WorkUnits have been marked for distribution by a central server process.
+    named_scope :reserved,  {:conditions => {:reservation => $$}, :order => 'updated_at asc'}
     # Attempt to send a list of work_units to nodes with available capacity.
-    # Do this in a separate thread so that the request can return, satisfied.
-    # A single application server process stops the same WorkUnit from being
-    # distributed to multiple nodes by reserving all the available ones.
+    # A single central server process stops the same WorkUnit from being
+    # distributed to multiple nodes by reserving it first. The algorithm used
+    # should be lock-free.
     def self.distribute_to_nodes
       return unless WorkUnit.reserve_available
       work_units = WorkUnit.reserved
@@ -35,46 +36,52 @@ module CloudCrowd
           available_nodes.push(node) unless node.busy?
         end
       end
+    ensure
       WorkUnit.cancel_reservations
     end
-    # Reserves all available WorkUnits. Returns false if there were none
-    # available.
+    # Reserves all available WorkUnits for this process. Returns false if there
+    # were none available.
     def self.reserve_available
-      WorkUnit.available.update_all('worker_pid = 0') > 0
+      WorkUnit.available.update_all("reservation = #{$$}") > 0
     end
+    # Cancels all outstanding WorkUnit reservations for this process.
     def self.cancel_reservations
-      WorkUnit.reserved.update_all('worker_pid = null')
+      WorkUnit.reserved.update_all('reservation = null')
     end
+    # Look up a WorkUnit by the worker that's currently processing it. Specified
+    # by <tt>pid@host</tt>.
     def self.find_by_worker_name(name)
       pid, host = name.split('@')
       node = NodeRecord.find_by_host(host)
       node && node.work_units.find_by_worker_pid(pid)
     end
+    # Convenience method for starting a new WorkUnit.
+    def self.start(job, action, input, status)
+      self.create(:job => job, :action => action, :input => input, :status => status)
+    end
     # Mark this unit as having finished successfully.
-    # TODO: Refactor alongside check_for_completion ... look into doubleparse.
-    def finish(output, time_taken)
+    # Splitting work units are handled differently (an optimization) -- they
+    # immediately fire off all of their resulting WorkUnits for processing,
+    # without waiting for the rest of their splitting cousins to complete.
+    def finish(result, time_taken)
       if splitting?
-        [JSON.parse(JSON.parse(output)['output'])].flatten.each do |wu_input|
-          WorkUnit.create(
-            :job    => job,
-            :action => action,
-            :input  => wu_input,
-            :status => PROCESSING
-          )
+        [JSON.parse(parsed_output(result))].flatten.each do |new_input|
+          WorkUnit.start(job, action, new_input, PROCESSING)
         end
         self.destroy
-        job.set_next_status if job.work_units.splitting.count <= 0
+        job.set_next_status if job.done_splitting?
       else
         update_attributes({
           :status         => SUCCEEDED,
           :node_record    => nil,
           :worker_pid     => nil,
           :attempts       => attempts + 1,
-          :output         => output,
+          :output         => result,
           :time           => time_taken
         })
         job.check_for_completion
@@ -105,14 +112,21 @@ module CloudCrowd
       })
     end
-    # When a Worker checks out a WorkUnit, establish the connection between
-    # WorkUnit and NodeRecord.
+    # When a Node checks out a WorkUnit, establish the connection between
+    # WorkUnit and NodeRecord and record the worker_pid.
     def assign_to(node_record, worker_pid)
       update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
     end
+    # All output needs to be wrapped in a JSON object for consistency
+    # (unfortunately, JSON.parse needs the top-level to be an object or array).
+    # Convenience method to provide the parsed version.
+    def parsed_output(out = self.output)
+      JSON.parse(out)['output']
+    end
     # The JSON representation of a WorkUnit shares the Job's options with all
-    # its sister WorkUnits.
+    # its cousin WorkUnits.
     def to_json
       {
         'id'        => self.id,

data/lib/cloud_crowd/node.rb CHANGED Viewed

@@ -1,9 +1,15 @@
 module CloudCrowd
+  # A Node is a Sinatra/Thin application that runs a single instance per-machine.
+  # It registers with the central server, receives WorkUnits, and forks off
+  # Workers to process them. The actions are:
+  #
+  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
+  # [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
   class Node < Sinatra::Default
     # A Node's default port. You only run a single node per machine, so they
-    # can all use the same port without problems.
+    # can all use the same port without any problems.
     DEFAULT_PORT = 9063
     attr_reader :server, :asset_store
@@ -17,22 +23,26 @@ module CloudCrowd
     enable :methodoverride
     # Enabling HTTP Authentication turns it on for all requests.
+    # This works the same way as in the central CloudCrowd::Server.
     before do
-      login_required if CloudCrowd.config[:use_http_authentication]
+      login_required if CloudCrowd.config[:http_authentication]
     end
     # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
-    # /heartbeat to make sure its still up.
+    # /heartbeat to make sure it's still online.
     get '/heartbeat' do
       "buh-bump"
     end
+    # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
     post '/work' do
       pid = fork { Worker.new(self, JSON.parse(params[:work_unit])) }
       Process.detach(pid)
       json :pid => pid
     end
+    # Creating a Node registers with the central server and starts listening for
+    # incoming WorkUnits.
     def initialize(port=DEFAULT_PORT)
       require 'json'
       @server           = CloudCrowd.central_server
@@ -47,6 +57,9 @@ module CloudCrowd
       @server_thread.join
     end
+    # Checking in with the central server informs it of the location and
+    # configuration of this Node. If it can't check-in, there's no point in
+    # starting.
     def check_in
       @server["/node/#{@host}"].put(
         :port             => @port,
@@ -58,25 +71,30 @@ module CloudCrowd
       raise SystemExit
     end
+    # Before exiting, the Node checks out with the central server, releasing all
+    # of its WorkUnits for other Nodes to handle.
     def check_out
       @server["/node/#{@host}"].delete
     end
+    private
+    # Launch the Node's Thin server in a separate thread because it blocks.
     def start_server
       @server_thread = Thread.new do
         Thin::Server.start('0.0.0.0', @port, self, :signals => false)
       end
     end
-    private
+    # Trap exit signals in order to shut down cleanly.
     def trap_signals
       Signal.trap('INT')  { shut_down }
       Signal.trap('KILL') { shut_down }
       Signal.trap('TERM') { shut_down }
     end
+    # At shut down, de-register with the central server before exiting.
     def shut_down
       check_out
       Process.exit

data/lib/cloud_crowd/schema.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # Complete schema for CloudCrowd.
-ActiveRecord::Schema.define(:version => 1) do
+ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
   create_table "jobs", :force => true do |t|
     t.integer  "status",                      :null => false
@@ -29,9 +29,10 @@ ActiveRecord::Schema.define(:version => 1) do
     t.integer  "job_id",                          :null => false
     t.text     "input",                           :null => false
     t.string   "action",                          :null => false
-    t.integer  "attempts",     :default => 0,     :null => false
+    t.integer  "attempts",      :default => 0,    :null => false
     t.integer  "node_record_id"
     t.integer  "worker_pid"
+    t.integer  "reservation"
     t.float    "time"
     t.text     "output"
     t.datetime "created_at"

data/lib/cloud_crowd/server.rb CHANGED Viewed

@@ -5,6 +5,7 @@ module CloudCrowd
   # == Admin
   # [get /] Render the admin console, with a progress meter for running jobs.
   # [get /status] Get the combined JSON of every active job and worker.
+  # [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
   # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
   #
   # == Public API
@@ -13,9 +14,9 @@ module CloudCrowd
   # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
   #
   # == Internal Workers API
-  # [post /work] Dequeue the next WorkUnit, and hand it off to the worker.
+  # [put /node/:host] Registers a new Node, making it available for processing.
+  # [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
   # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
-  # [put /worker] Keep a record of an actively running worker.
   class Server < Sinatra::Default
     set :root, ROOT
@@ -28,12 +29,12 @@ module CloudCrowd
     # Enabling HTTP Authentication turns it on for all requests.
     before do
-      login_required if CloudCrowd.config[:use_http_authentication]
+      login_required if CloudCrowd.config[:http_authentication]
     end
     # Render the admin console.
     get '/' do
-      erb :index
+      erb :operations_center
     end
     # Get the JSON for every active job in the queue and every active worker
@@ -83,12 +84,16 @@ module CloudCrowd
     # INTERNAL NODE API:
+    # A new Node will hit this action to register its location and
+    # configuration with the central server. Triggers distribution of WorkUnits.
     put '/node/:host' do
       NodeRecord.check_in(params, request)
       WorkUnit.distribute_to_nodes
       json nil
     end
+    # Deregisters a Node from the central server. Releases and redistributes any
+    # WorkUnits it may have had checked out.
     delete '/node/:host' do
       NodeRecord.destroy_all(:host => params[:host])
       json nil