RubyGems - documentcloud-cloud-crowd - Versions diffs - 0.1.0 → 0.1.1 - Mend

documentcloud-cloud-crowd 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

data/README +8 -8
data/cloud-crowd.gemspec +8 -8
data/config/config.example.ru +8 -2
data/config/config.example.yml +6 -15
data/examples/process_pdfs_example.rb +1 -1
data/examples/word_count_example.rb +1 -0
data/lib/cloud-crowd.rb +6 -5
data/lib/cloud_crowd/action.rb +11 -7
data/lib/cloud_crowd/asset_store/filesystem_store.rb +5 -0
data/lib/cloud_crowd/asset_store/s3_store.rb +7 -3
data/lib/cloud_crowd/asset_store.rb +1 -1
data/lib/cloud_crowd/command_line.rb +14 -53
data/lib/cloud_crowd/exceptions.rb +4 -0
data/lib/cloud_crowd/helpers/authorization.rb +2 -2
data/lib/cloud_crowd/helpers/resources.rb +0 -20
data/lib/cloud_crowd/models/job.rb +25 -26
data/lib/cloud_crowd/models/node_record.rb +81 -0
data/lib/cloud_crowd/models/work_unit.rb +70 -30
data/lib/cloud_crowd/models.rb +1 -1
data/lib/cloud_crowd/node.rb +87 -0
data/lib/cloud_crowd/schema.rb +19 -16
data/lib/cloud_crowd/{app.rb → server.rb} +25 -30
data/lib/cloud_crowd/worker.rb +50 -74
data/public/css/admin_console.css +26 -14
data/public/images/server.png +0 -0
data/public/js/admin_console.js +45 -18
data/test/acceptance/test_failing_work_units.rb +1 -1
data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
data/test/acceptance/test_word_count.rb +3 -9
data/test/blueprints.rb +0 -1
data/test/config/config.ru +1 -1
data/test/config/config.yml +1 -3
data/test/unit/test_configuration.rb +1 -1
data/test/unit/test_job.rb +1 -0
data/test/unit/test_work_unit.rb +2 -4
data/views/index.erb +13 -8
metadata +9 -9
data/lib/cloud_crowd/daemon.rb +0 -95
data/lib/cloud_crowd/models/worker_record.rb +0 -61
data/lib/cloud_crowd/runner.rb +0 -15

data/lib/cloud_crowd/models/node_record.rb ADDED Viewed

@@ -0,0 +1,81 @@
+module CloudCrowd
+  # A NodeRecord is the record of a Node running remotely. We can use it to
+  # assign work units to the node, and keep track of its status.
+  class NodeRecord < ActiveRecord::Base
+    has_many :work_units
+    validates_presence_of :host, :ip_address, :port
+    before_destroy :clear_work_units # release this node's work units before the record is destroyed
+    # Available Nodes haven't used up their maximum number of workers yet (a null max_workers means no limit).
+    named_scope :available, {
+      :conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
+      :order      => 'updated_at asc' # least recently updated nodes come first
+    }
+    # Save a Node's current status to the database: find or create the record for params[:host], then update it.
+    def self.check_in(params, request)
+      attrs = {
+        :ip_address       => request.ip, # taken from the HTTP request, not from params
+        :port             => params[:port],
+        :max_workers      => params[:max_workers],
+        :enabled_actions  => params[:enabled_actions],
+        :updated_at       => Time.now
+      }
+      self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
+    end
+    def send_work_unit(unit) # POSTs the unit as JSON to the node's /work and assigns it using the returned pid
+      result = node['/work'].post(:work_unit => unit.to_json)
+      unit.assign_to(self, JSON.parse(result)['pid'])
+      touch
+    rescue Errno::ECONNREFUSED # NOTE(review): WorkUnit.distribute_to_nodes treats a truthy return as "sent" -- confirm what destroy returns here
+      self.destroy # Couldn't post to node, assume it's gone away.
+    end
+    def actions # enabled_actions as an Array, split on commas
+      enabled_actions.split(',')
+    end
+    def busy? # true once work_units.count reaches max_workers; falsy when max_workers is unset
+      max_workers && work_units.count >= max_workers
+    end
+    def url # memoized base URL built from host and port
+      @url ||= "http://#{host}:#{port}"
+    end
+    def node # memoized RestClient::Resource for url
+      return @node if @node
+      params = [url]
+      params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
+      @node = RestClient::Resource.new(*params)
+    end
+    def display_status # 'busy' or 'available', according to busy?
+      busy? ? 'busy' : 'available'
+    end
+    def worker_pids # the worker_pid of every work unit that belongs to this node
+      work_units.all(:select => 'worker_pid').map(&:worker_pid)
+    end
+    def to_json(opts={}) # JSON object with host, workers (pids) and status; opts is accepted but not used
+      { 'host'    => host,
+        'workers' => worker_pids,
+        'status'  => display_status,
+      }.to_json
+    end
+    private
+    def clear_work_units # before_destroy hook: nulls node_record_id and worker_pid on this node's work units
+      WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
+    end
+  end
+end

data/lib/cloud_crowd/models/work_unit.rb CHANGED Viewed

@@ -8,39 +8,77 @@ module CloudCrowd
     include ModelStatus
     belongs_to :job
-    belongs_to :worker_record
+    belongs_to :node_record
     validates_presence_of :job_id, :status, :input, :action
+    named_scope :taken,     {:conditions => ["worker_pid is not null"]}
+    named_scope :available, {:conditions => {:worker_pid => nil, :status => INCOMPLETE}}
+    named_scope :reserved,  {:conditions => {:worker_pid => 0}}
-    after_save :check_for_job_completion
+    # Attempt to send a list of work_units to nodes with available capacity.
+    # Do this in a separate thread so that the request can return, satisfied.
+    # A single application server process stops the same WorkUnit from being
+    # distributed to multiple nodes by reserving all the available ones.
+    def self.distribute_to_nodes
+      return unless WorkUnit.reserve_available
+      work_units = WorkUnit.reserved
+      available_nodes = NodeRecord.available
+      until work_units.empty? do
+        node = available_nodes.shift
+        unit = work_units.first
+        break unless node
+        next unless node.actions.include? unit.action
+        sent = node.send_work_unit(unit)
+        if sent
+          work_units.shift
+          available_nodes.push(node) unless node.busy?
+        end
+      end
+      WorkUnit.cancel_reservations
+    end
+    # Reserves all available WorkUnits. Returns false if there were none
+    # available.
+    def self.reserve_available
+      WorkUnit.available.update_all('worker_pid = 0') > 0
+    end
-    # Find the first available WorkUnit in the queue, and take it out.
-    # +enabled_actions+ must be passed to whitelist the types of WorkUnits than
-    # can be retrieved for processing. Optionally, specify the +offset+ to peek
-    # further on in line.
-    def self.dequeue(worker_name, enabled_actions=[], offset=0)
-      unit = self.first(
-        :conditions => {:status => INCOMPLETE, :worker_record_id => nil, :action => enabled_actions},
-        :order      => "created_at asc",
-        :offset     => offset
-      )
-      unit ? unit.assign_to(worker_name) : nil
+    def self.cancel_reservations
+      WorkUnit.reserved.update_all('worker_pid = null')
     end
-    # After saving a WorkUnit, its Job should check if it just became complete.
-    def check_for_job_completion
-      self.job.check_for_completion if complete?
+    def self.find_by_worker_name(name)
+      pid, host = name.split('@')
+      node = NodeRecord.find_by_host(host)
+      node && node.work_units.find_by_worker_pid(pid)
     end
     # Mark this unit as having finished successfully.
+    # TODO: Refactor alongside check_for_completion ... look into doubleparse.
     def finish(output, time_taken)
-      update_attributes({
-        :status         => SUCCEEDED,
-        :worker_record  => nil,
-        :attempts       => self.attempts + 1,
-        :output         => output,
-        :time           => time_taken
-      })
+      if splitting?
+        [JSON.parse(JSON.parse(output)['output'])].flatten.each do |wu_input|
+          WorkUnit.create(
+            :job    => job,
+            :action => action,
+            :input  => wu_input,
+            :status => PROCESSING
+          )
+        end
+        self.destroy
+        job.set_next_status if job.work_units.splitting.count <= 0
+      else
+        update_attributes({
+          :status         => SUCCEEDED,
+          :node_record    => nil,
+          :worker_pid     => nil,
+          :attempts       => attempts + 1,
+          :output         => output,
+          :time           => time_taken
+        })
+        job.check_for_completion
+      end
     end
     # Mark this unit as having failed. May attempt a retry.
@@ -49,26 +87,28 @@ module CloudCrowd
       return try_again if tries < CloudCrowd.config[:work_unit_retries]
       update_attributes({
         :status         => FAILED,
-        :worker_record  => nil,
+        :node_record    => nil,
+        :worker_pid     => nil,
         :attempts       => tries,
         :output         => output,
         :time           => time_taken
       })
+      self.job.check_for_completion
     end
     # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
     def try_again
       update_attributes({
-        :worker_record  => nil,
-        :attempts       => self.attempts + 1
+        :node_record  => nil,
+        :worker_pid   => nil,
+        :attempts     => self.attempts + 1
       })
     end
     # When a Worker checks out a WorkUnit, establish the connection between
-    # WorkUnit and WorkerRecord.
-    def assign_to(worker_name)
-      self.worker_record = WorkerRecord.find_by_name!(worker_name)
-      self.save ? self : nil
+    # WorkUnit and NodeRecord.
+    def assign_to(node_record, worker_pid)
+      update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
     end
     # The JSON representation of a WorkUnit shares the Job's options with all

data/lib/cloud_crowd/models.rb CHANGED Viewed

@@ -36,5 +36,5 @@ module CloudCrowd
 end
 require 'cloud_crowd/models/job'
+require 'cloud_crowd/models/node_record'
 require 'cloud_crowd/models/work_unit'
-require 'cloud_crowd/models/worker_record'

data/lib/cloud_crowd/node.rb ADDED Viewed

@@ -0,0 +1,87 @@
+module CloudCrowd
+  class Node < Sinatra::Default # Sinatra app that accepts work units over HTTP and forks a Worker for each one
+    # A Node's default port. You only run a single node per machine, so they
+    # can all use the same port without problems.
+    DEFAULT_PORT = 9063
+    attr_reader :server, :asset_store
+    set :root, ROOT
+    set :authorization_realm, "CloudCrowd"
+    helpers Helpers
+    # methodoverride allows the _method param.
+    enable :methodoverride
+    # Enabling HTTP Authentication turns it on for all requests.
+    before do
+      login_required if CloudCrowd.config[:use_http_authentication]
+    end
+    # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
+    # /heartbeat to make sure it's still up.
+    get '/heartbeat' do
+      "buh-bump"
+    end
+    post '/work' do # forks a detached Worker for the posted work unit and replies with the child's pid
+      pid = fork { Worker.new(self, JSON.parse(params[:work_unit])) }
+      Process.detach(pid)
+      json :pid => pid
+    end
+    def initialize(port=DEFAULT_PORT) # traps signals, starts the server thread, checks in, then blocks on that thread
+      require 'json'
+      @server           = CloudCrowd.central_server
+      @host             = Socket.gethostname
+      @enabled_actions  = CloudCrowd.actions.keys
+      @asset_store      = AssetStore.new
+      @port             = port || DEFAULT_PORT
+      trap_signals
+      start_server
+      check_in
+      @server_thread.join
+    end
+    def check_in # PUTs this node's port, max_workers and enabled actions to the central server
+      @server["/node/#{@host}"].put(
+        :port             => @port,
+        :max_workers      => CloudCrowd.config[:max_workers],
+        :enabled_actions  => @enabled_actions.join(',')
+      )
+    rescue Errno::ECONNREFUSED # central server refused the connection: report it and exit
+      puts "Failed to connect to the central server (#{@server.to_s}), exiting..."
+      raise SystemExit
+    end
+    def check_out # sends DELETE for this node's entry on the central server
+      @server["/node/#{@host}"].delete
+    end
+    def start_server # runs Thin on 0.0.0.0 in a background thread kept in @server_thread
+      @server_thread = Thread.new do
+        Thin::Server.start('0.0.0.0', @port, self, :signals => false)
+      end
+    end
+    private
+    def trap_signals # route termination signals to shut_down
+      Signal.trap('INT')  { shut_down }
+      Signal.trap('KILL') { shut_down } # NOTE(review): SIGKILL cannot be caught by a process -- confirm this call is harmless on the target Ruby
+      Signal.trap('TERM') { shut_down }
+    end
+    def shut_down # checks out from the central server, then exits the process
+      check_out
+      Process.exit
+    end
+  end
+end

data/lib/cloud_crowd/schema.rb CHANGED Viewed

@@ -10,7 +10,16 @@ ActiveRecord::Schema.define(:version => 1) do
     t.float    "time"
     t.string   "callback_url"
     t.string   "email"
-    t.integer  "lock_version", :default => 0, :null => false
+    t.datetime "created_at"
+    t.datetime "updated_at"
+  end
+  create_table "node_records", :force => true do |t|
+    t.string   "host",                            :null => false
+    t.string   "ip_address",                      :null => false
+    t.integer  "port",                            :null => false
+    t.string   "enabled_actions", :default => '', :null => false
+    t.integer  "max_workers"
     t.datetime "created_at"
     t.datetime "updated_at"
   end
@@ -21,25 +30,19 @@ ActiveRecord::Schema.define(:version => 1) do
     t.text     "input",                           :null => false
     t.string   "action",                          :null => false
     t.integer  "attempts",     :default => 0,     :null => false
-    t.integer  "lock_version", :default => 0,     :null => false
-    t.integer  "worker_record_id"
+    t.integer  "node_record_id"
+    t.integer  "worker_pid"
     t.float    "time"
     t.text     "output"
     t.datetime "created_at"
     t.datetime "updated_at"
   end
-  create_table "worker_records", :force => true do |t|
-    t.string   "name",          :null => false
-    t.string   "thread_status", :null => false
-    t.datetime "created_at"
-    t.datetime "updated_at"
-  end
-  add_index "jobs", ["status"], :name => "index_jobs_on_status"
-  add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
-  add_index "work_units", ["status", "worker_record_id", "action"], :name => "index_work_units_on_status_and_worker_record_id_and_action"
-  add_index "worker_records", ["name"], :name => "index_worker_records_on_name"
-  add_index "worker_records", ["updated_at"], :name => "index_worker_records_on_updated_at"
+  # Here be indices. After looking, it seems faster not to have them at all.
+  #
+  # add_index "jobs", ["status"], :name => "index_jobs_on_status"
+  # add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
+  # add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
+  # add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
+  # add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
 end

data/lib/cloud_crowd/{app.rb → server.rb} RENAMED Viewed

@@ -16,7 +16,7 @@ module CloudCrowd
   # [post /work] Dequeue the next WorkUnit, and hand it off to the worker.
   # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
   # [put /worker] Keep a record of an actively running worker.
-  class App < Sinatra::Default
+  class Server < Sinatra::Default
     set :root, ROOT
     set :authorization_realm, "CloudCrowd"
@@ -42,15 +42,14 @@ module CloudCrowd
     get '/status' do
       json(
         'jobs'            => Job.incomplete,
-        'workers'         => WorkerRecord.alive(:order => 'name desc'),
+        'nodes'           => NodeRecord.all(:order => 'host desc'),
         'work_unit_count' => WorkUnit.incomplete.count
       )
     end
-    # Get the JSON for a worker record's work unit, if one exists.
+    # Get the JSON for what a worker is up to.
     get '/worker/:name' do
-      record = WorkerRecord.find_by_name params[:name]
-      json((record && record.work_unit) || {})
+      json WorkUnit.find_by_worker_name(params[:name]) || {}
     end
     # To monitor the central server with Monit, God, Nagios, or another
@@ -62,8 +61,11 @@ module CloudCrowd
     # PUBLIC API:
     # Start a new job. Accepts a JSON representation of the job-to-be.
+    # Distributes all work units to available nodes.
     post '/jobs' do
-      json Job.create_from_request(JSON.parse(params[:job]))
+      job = Job.create_from_request(JSON.parse(params[:job]))
+      WorkUnit.distribute_to_nodes
+      json job
     end
     # Check the status of a job, returning the output if finished, and the
@@ -79,36 +81,29 @@ module CloudCrowd
       json nil
     end
-    # INTERNAL WORKER DAEMON API:
+    # INTERNAL NODE API:
-    # Internal method for worker daemons to fetch the work unit at the front
-    # of the queue. Work unit is marked as taken and handed off to the worker.
-    post '/work' do
-      json dequeue_work_unit
+    put '/node/:host' do
+      NodeRecord.check_in(params, request)
+      WorkUnit.distribute_to_nodes
+      json nil
+    end
+    delete '/node/:host' do
+      NodeRecord.destroy_all(:host => params[:host])
+      json nil
     end
     # When workers are done with their unit, either successfully or in failure,
-    # they mark it back on the central server and retrieve another. Failures
-    # pull from one down in the queue, so as to not repeat the same unit.
+    # they mark it back on the central server and exit. Triggers distribution
+    # of pending work units.
     put '/work/:work_unit_id' do
-      handle_conflicts(409) do
-        case params[:status]
-        when 'succeeded'
-          current_work_unit.finish(params[:output], params[:time])
-          json dequeue_work_unit
-        when 'failed'
-          current_work_unit.fail(params[:output], params[:time])
-          json dequeue_work_unit(1)
-        else
-          error(500, "Completing a work unit must specify status.")
-        end
+      case params[:status]
+      when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
+      when 'failed'    then current_work_unit.fail(params[:output], params[:time])
+      else             error(500, "Completing a work unit must specify status.")
       end
-    end
-    # Every so often workers check in to let the central server know that
-    # they're still alive. Keep up-to-date records
-    put '/worker' do
-      params[:terminated] ? WorkerRecord.check_out(params) : WorkerRecord.check_in(params)
+      WorkUnit.distribute_to_nodes
       json nil
     end

data/lib/cloud_crowd/worker.rb CHANGED Viewed

@@ -10,10 +10,6 @@ module CloudCrowd
   # having failed.
   class Worker
-    # The time between worker check-ins with the central server, informing
-    # it of the current status, and simply that it's still alive.
-    CHECK_IN_INTERVAL = 60
     # Wait five seconds to retry, after internal communication errors.
     RETRY_WAIT = 5
@@ -22,32 +18,30 @@ module CloudCrowd
     # Spinning up a worker will create a new AssetStore with a persistent
     # connection to S3. This AssetStore gets passed into each action, for use
     # as it is run.
-    def initialize
-      @id               = $$
-      @hostname         = Socket.gethostname
-      @name             = "#{@id}@#{@hostname}"
-      @store            = AssetStore.new
-      @server           = CloudCrowd.central_server
-      @enabled_actions  = CloudCrowd.actions.keys
-      log 'started'
-    end
-    # Ask the central server for the first WorkUnit in line.
-    def fetch_work_unit
-      keep_trying_to "fetch a new work unit" do
-        unit_json = @server['/work'].post(base_params)
-        setup_work_unit(unit_json)
-      end
-    end
+    def initialize(node, work_unit)
+      Signal.trap('INT') { shut_down }
+      Signal.trap('KILL') { shut_down }
+      Signal.trap('TERM') { shut_down }
+      @pid  = $$
+      @node = node
+      setup_work_unit(work_unit)
+      run
+    end
+    # # Ask the central server for the first WorkUnit in line.
+    # def fetch_work_unit
+    #   keep_trying_to "fetch a new work unit" do
+    #     unit_json = @server['/work'].post(base_params)
+    #     setup_work_unit(unit_json)
+    #   end
+    # end
     # Return output to the central server, marking the current work unit as done.
     def complete_work_unit(result)
       keep_trying_to "complete work unit" do
         data = completion_params.merge({:status => 'succeeded', :output => result})
-        unit_json = @server["/work/#{data[:id]}"].put(data)
+        @node.server["/work/#{data[:id]}"].put(data)
         log "finished #{display_work_unit} in #{data[:time]} seconds"
-        clear_work_unit
-        setup_work_unit(unit_json)
       end
     end
@@ -55,36 +49,11 @@ module CloudCrowd
     def fail_work_unit(exception)
       keep_trying_to "mark work unit as failed" do
         data = completion_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
-        unit_json = @server["/work/#{data[:id]}"].put(data)
+        @node.server["/work/#{data[:id]}"].put(data)
         log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
-        clear_work_unit
-        setup_work_unit(unit_json)
       end
     end
-    # Check in with the central server. Let it know the condition of the work
-    # thread, the action and status we're processing, and our hostname and PID.
-    def check_in(thread_status)
-      keep_trying_to "check in with central" do
-        @server["/worker"].put({
-          :name          => @name,
-          :thread_status => thread_status
-        })
-      end
-    end
-    # Inform the central server that this worker is finished. This is the only
-    # remote method that doesn't retry on connection errors -- if the worker
-    # can't connect to the central server while it's trying to shutdown, it
-    # should close, regardless.
-    def check_out
-      @server["/worker"].put({
-        :name       => @name,
-        :terminated => true
-      })
-      log 'exiting'
-    end
     # We expect and require internal communication between the central server
     # and the workers to succeed. If it fails for any reason, log it, and then
     # keep trying the same request.
@@ -100,33 +69,31 @@ module CloudCrowd
       end
     end
-    # Does this Worker have a job to do?
-    def has_work?
-      @action_name && @input && @options
-    end
     # Loggable string of the current work unit.
     def display_work_unit
-      "unit ##{@options['work_unit_id']} (#{@action_name})"
+      "unit ##{@options['work_unit_id']} (#{@action_name}/#{CloudCrowd.display_status(@status)})"
     end
     # Executes the current work unit, catching all exceptions as failures.
     def run_work_unit
-      begin
-        result = nil
-        @action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @store)
-        Dir.chdir(@action.work_directory) do
-          result = case @status
-          when PROCESSING then @action.process
-          when SPLITTING  then @action.split
-          when MERGING    then @action.merge
-          else raise Error::StatusUnspecified, "work units must specify their status"
+      @worker_thread = Thread.new do
+        begin
+          result = nil
+          @action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @node.asset_store)
+          Dir.chdir(@action.work_directory) do
+            result = case @status
+            when PROCESSING then @action.process
+            when SPLITTING  then @action.split
+            when MERGING    then @action.merge
+            else raise Error::StatusUnspecified, "work units must specify their status"
+            end
           end
+          complete_work_unit({'output' => result}.to_json)
+        rescue Exception => e
+          fail_work_unit(e)
         end
-        complete_work_unit({'output' => result}.to_json)
-      rescue Exception => e
-        fail_work_unit(e)
       end
+      @worker_thread.join
     end
     # Wraps <tt>run_work_unit</tt> to benchmark the execution time, if requested.
@@ -142,8 +109,7 @@ module CloudCrowd
     # Common parameters to send back to central.
     def base_params
       @base_params ||= {
-        :worker_name    => @name,
-        :worker_actions => @enabled_actions.join(',')
+        :pid => @pid
       }
     end
@@ -157,9 +123,8 @@ module CloudCrowd
     end
     # Extract our instance variables from a WorkUnit's JSON.
-    def setup_work_unit(unit_json)
-      return false unless unit_json
-      unit = JSON.parse(unit_json)
+    def setup_work_unit(unit)
+      return false unless unit
       @start_time = Time.now
       @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
       @options['job_id'] = unit['job_id']
@@ -171,7 +136,7 @@ module CloudCrowd
     # Log a message to the daemon log. Includes PID for identification.
     def log(message)
-      puts "Worker ##{@id}: #{message}" unless ENV['RACK_ENV'] == 'test'
+      puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
     end
     # When we're done with a unit, clear out our instance variables to make way
@@ -181,6 +146,17 @@ module CloudCrowd
       @action, @action_name, @input, @options, @start_time = nil, nil, nil, nil, nil
     end
+    # Force the worker to quit, even if it's in the middle of processing.
+    # If it had checked out a work unit, the node should have released it on
+    # the central server already.
+    def shut_down
+      if @worker_thread
+        @worker_thread.kill
+        @worker_thread.kill! if @worker_thread.alive?
+      end
+      Process.exit
+    end
   end
 end