cloud-crowd 0.1.0 → 0.2.0
- data/README +16 -16
- data/cloud-crowd.gemspec +10 -9
- data/config/config.example.ru +8 -2
- data/config/config.example.yml +21 -25
- data/examples/process_pdfs_example.rb +1 -1
- data/examples/word_count_example.rb +1 -0
- data/lib/cloud-crowd.rb +47 -28
- data/lib/cloud_crowd/action.rb +14 -8
- data/lib/cloud_crowd/asset_store.rb +8 -8
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
- data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
- data/lib/cloud_crowd/command_line.rb +24 -58
- data/lib/cloud_crowd/exceptions.rb +7 -0
- data/lib/cloud_crowd/helpers/authorization.rb +5 -3
- data/lib/cloud_crowd/helpers/resources.rb +0 -20
- data/lib/cloud_crowd/models.rb +1 -1
- data/lib/cloud_crowd/models/job.rb +37 -40
- data/lib/cloud_crowd/models/node_record.rb +95 -0
- data/lib/cloud_crowd/models/work_unit.rb +87 -33
- data/lib/cloud_crowd/node.rb +105 -0
- data/lib/cloud_crowd/schema.rb +22 -18
- data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
- data/lib/cloud_crowd/worker.rb +68 -107
- data/public/css/admin_console.css +40 -18
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/js/admin_console.js +47 -18
- data/test/acceptance/test_failing_work_units.rb +1 -1
- data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
- data/test/acceptance/test_word_count.rb +3 -9
- data/test/blueprints.rb +0 -1
- data/test/config/config.ru +1 -1
- data/test/config/config.yml +2 -4
- data/test/unit/test_action.rb +1 -1
- data/test/unit/test_configuration.rb +1 -1
- data/test/unit/test_job.rb +3 -0
- data/test/unit/test_work_unit.rb +2 -4
- data/views/{index.erb → operations_center.erb} +13 -8
- metadata +11 -10
- data/lib/cloud_crowd/daemon.rb +0 -95
- data/lib/cloud_crowd/models/worker_record.rb +0 -61
- data/lib/cloud_crowd/runner.rb +0 -15
data/lib/cloud_crowd/models/node_record.rb
ADDED
@@ -0,0 +1,95 @@
+module CloudCrowd
+
+  # A NodeRecord is the central server's record of a Node running remotely. We
+  # can use it to assign WorkUnits to the Node, and keep track of its status.
+  # When a Node exits, it destroys this record.
+  class NodeRecord < ActiveRecord::Base
+
+    has_many :work_units
+
+    validates_presence_of :host, :ip_address, :port
+
+    before_destroy :clear_work_units
+
+    # Available Nodes haven't used up their maximum number of workers yet.
+    named_scope :available, {
+      :conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
+      :order      => 'updated_at asc'
+    }
+
+    # Register a Node with the central server. Currently this only happens at
+    # Node startup.
+    def self.check_in(params, request)
+      attrs = {
+        :ip_address      => request.ip,
+        :port            => params[:port],
+        :max_workers     => params[:max_workers],
+        :enabled_actions => params[:enabled_actions]
+      }
+      self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
+    end
+
+    # Dispatch a WorkUnit to this node. Places the node back at the end of
+    # the rotation. If we fail to send the WorkUnit, we consider the node to be
+    # down, and remove this record, freeing up all of its checked-out work units.
+    def send_work_unit(unit)
+      result = node['/work'].post(:work_unit => unit.to_json)
+      unit.assign_to(self, JSON.parse(result)['pid'])
+      touch
+    rescue Errno::ECONNREFUSED
+      self.destroy # Couldn't post to node, assume it's gone away.
+    end
+
+    # What Actions is this Node able to run?
+    def actions
+      enabled_actions.split(',')
+    end
+
+    # Is this Node too busy for more work? (Determined by number of workers.)
+    def busy?
+      max_workers && work_units.count >= max_workers
+    end
+
+    # The URL at which this Node may be reached.
+    # TODO: Make sure that the host actually has externally accessible DNS.
+    def url
+      @url ||= "http://#{host}:#{port}"
+    end
+
+    # Keep a RestClient::Resource handy for contacting the Node, including
+    # HTTP authentication, if configured.
+    def node
+      @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
+    end
+
+    # The printable status of the Node.
+    def display_status
+      busy? ? 'busy' : 'available'
+    end
+
+    # A list of the process ids of the workers currently being run by the Node.
+    def worker_pids
+      work_units.all(:select => 'worker_pid').map(&:worker_pid)
+    end
+
+    # The JSON representation of a NodeRecord includes its worker_pids.
+    def to_json(opts={})
+      { 'host'    => host,
+        'workers' => worker_pids,
+        'status'  => display_status
+      }.to_json
+    end
+
+
+    private
+
+    # When a Node shuts down, we free up all of the WorkUnits that it had
+    # reserved, and they become available for others to pick up. Redistribute
+    # the WorkUnits in a separate thread to avoid delaying Node shutdown.
+    def clear_work_units
+      WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
+      Thread.new { WorkUnit.distribute_to_nodes }
+    end
+
+  end
+end
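
The dispatch logic in send_work_unit above treats a refused connection as a dead Node. For orientation, here is a minimal standalone sketch of that pattern outside of ActiveRecord; the dispatch method and the registry hash are hypothetical, for illustration only:

    require 'rest_client'
    require 'json'

    # Hypothetical sketch of the send_work_unit pattern: POST the unit to the
    # node, return the worker pid on success, and evict the node from the
    # rotation if the connection is refused.
    def dispatch(node_url, unit_json, registry)
      result = RestClient::Resource.new(node_url)['/work'].post(:work_unit => unit_json)
      JSON.parse(result)['pid']   # the Node responds with the forked worker's pid
    rescue Errno::ECONNREFUSED
      registry.delete(node_url)   # couldn't post to node, assume it's gone away
      nil
    end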
data/lib/cloud_crowd/models/work_unit.rb
CHANGED
@@ -8,39 +8,84 @@ module CloudCrowd
     include ModelStatus
 
     belongs_to :job
-    belongs_to :
+    belongs_to :node_record
 
     validates_presence_of :job_id, :status, :input, :action
 
-
+    # Available WorkUnits are waiting to be distributed to Nodes for processing.
+    named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
+    # Reserved WorkUnits have been marked for distribution by a central server process.
+    named_scope :reserved,  {:conditions => {:reservation => $$}, :order => 'updated_at asc'}
 
-    #
-    #
-    #
-    #
-    def self.
-
-
-
-
-
-
+    # Attempt to send a list of work_units to nodes with available capacity.
+    # A single central server process stops the same WorkUnit from being
+    # distributed to multiple nodes by reserving it first. The algorithm used
+    # should be lock-free.
+    def self.distribute_to_nodes
+      return unless WorkUnit.reserve_available
+      work_units = WorkUnit.reserved
+      available_nodes = NodeRecord.available
+      until work_units.empty? do
+        node = available_nodes.shift
+        unit = work_units.first
+        break unless node
+        next unless node.actions.include? unit.action
+        sent = node.send_work_unit(unit)
+        if sent
+          work_units.shift
+          available_nodes.push(node) unless node.busy?
+        end
+      end
+    ensure
+      WorkUnit.cancel_reservations
     end
 
-    #
-
-
+    # Reserves all available WorkUnits for this process. Returns false if there
+    # were none available.
+    def self.reserve_available
+      WorkUnit.available.update_all("reservation = #{$$}") > 0
+    end
+
+    # Cancels all outstanding WorkUnit reservations for this process.
+    def self.cancel_reservations
+      WorkUnit.reserved.update_all('reservation = null')
+    end
+
+    # Look up a WorkUnit by the worker that's currently processing it. Specified
+    # by <tt>pid@host</tt>.
+    def self.find_by_worker_name(name)
+      pid, host = name.split('@')
+      node = NodeRecord.find_by_host(host)
+      node && node.work_units.find_by_worker_pid(pid)
+    end
+
+    # Convenience method for starting a new WorkUnit.
+    def self.start(job, action, input, status)
+      self.create(:job => job, :action => action, :input => input, :status => status)
     end
 
     # Mark this unit as having finished successfully.
-
-
-
-
-
-
-
-
+    # Splitting work units are handled differently (an optimization) -- they
+    # immediately fire off all of their resulting WorkUnits for processing,
+    # without waiting for the rest of their splitting cousins to complete.
+    def finish(result, time_taken)
+      if splitting?
+        [JSON.parse(parsed_output(result))].flatten.each do |new_input|
+          WorkUnit.start(job, action, new_input, PROCESSING)
+        end
+        self.destroy
+        job.set_next_status if job.done_splitting?
+      else
+        update_attributes({
+          :status      => SUCCEEDED,
+          :node_record => nil,
+          :worker_pid  => nil,
+          :attempts    => attempts + 1,
+          :output      => result,
+          :time        => time_taken
+        })
+        job.check_for_completion
+      end
     end
 
     # Mark this unit as having failed. May attempt a retry.
@@ -49,30 +94,39 @@ module CloudCrowd
       return try_again if tries < CloudCrowd.config[:work_unit_retries]
       update_attributes({
         :status      => FAILED,
-        :
+        :node_record => nil,
+        :worker_pid  => nil,
         :attempts    => tries,
         :output      => output,
         :time        => time_taken
       })
+      self.job.check_for_completion
     end
 
    # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
     def try_again
       update_attributes({
-        :
-        :
+        :node_record => nil,
+        :worker_pid  => nil,
+        :attempts    => self.attempts + 1
       })
     end
 
-    # When a
-    # WorkUnit and
-    def assign_to(
-
-
+    # When a Node checks out a WorkUnit, establish the connection between
+    # WorkUnit and NodeRecord and record the worker_pid.
+    def assign_to(node_record, worker_pid)
+      update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
+    end
+
+    # All output needs to be wrapped in a JSON object for consistency
+    # (unfortunately, JSON.parse needs the top-level to be an object or array).
+    # Convenience method to provide the parsed version.
+    def parsed_output(out = self.output)
+      JSON.parse(out)['output']
     end
 
     # The JSON representation of a WorkUnit shares the Job's options with all
-    # its
+    # its cousin WorkUnits.
     def to_json
       {
         'id' => self.id,
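
The reservation scheme above is the heart of the new distribution model: a single atomic UPDATE tags available rows with the server's pid, so two distribution passes can never claim the same unit. A hedged sketch of the same pattern against a bare SQLite table (table and column names follow the schema later in this diff; everything else is illustrative):

    require 'sqlite3'

    db = SQLite3::Database.new('cloud_crowd.db')

    # Atomically reserve every available unit for this process by tagging it
    # with our pid. Rows already tagged by another process are left alone.
    db.execute("UPDATE work_units SET reservation = ? " +
               "WHERE reservation IS NULL AND worker_pid IS NULL", [Process.pid])

    begin
      # Work through only the rows we tagged.
      db.execute("SELECT id FROM work_units WHERE reservation = ?", [Process.pid]) do |(id)|
        puts "would dispatch work_unit #{id}"
      end
    ensure
      # Mirror WorkUnit.cancel_reservations: release whatever we still hold.
      db.execute("UPDATE work_units SET reservation = NULL WHERE reservation = ?", [Process.pid])
    end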
data/lib/cloud_crowd/node.rb
ADDED
@@ -0,0 +1,105 @@
+module CloudCrowd
+
+  # A Node is a Sinatra/Thin application that runs a single instance per machine.
+  # It registers with the central server, receives WorkUnits, and forks off
+  # Workers to process them. The actions are:
+  #
+  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
+  # [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
+  class Node < Sinatra::Default
+
+    # A Node's default port. You only run a single node per machine, so they
+    # can all use the same port without any problems.
+    DEFAULT_PORT = 9063
+
+    attr_reader :server, :asset_store
+
+    set :root, ROOT
+    set :authorization_realm, "CloudCrowd"
+
+    helpers Helpers
+
+    # methodoverride allows the _method param.
+    enable :methodoverride
+
+    # Enabling HTTP Authentication turns it on for all requests.
+    # This works the same way as in the central CloudCrowd::Server.
+    before do
+      login_required if CloudCrowd.config[:http_authentication]
+    end
+
+    # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
+    # /heartbeat to make sure it's still online.
+    get '/heartbeat' do
+      "buh-bump"
+    end
+
+    # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
+    post '/work' do
+      pid = fork { Worker.new(self, JSON.parse(params[:work_unit])) }
+      Process.detach(pid)
+      json :pid => pid
+    end
+
+    # Creating a Node registers with the central server and starts listening for
+    # incoming WorkUnits.
+    def initialize(port=DEFAULT_PORT)
+      require 'json'
+      @server          = CloudCrowd.central_server
+      @host            = Socket.gethostname
+      @enabled_actions = CloudCrowd.actions.keys
+      @asset_store     = AssetStore.new
+      @port            = port || DEFAULT_PORT
+
+      trap_signals
+      start_server
+      check_in
+      @server_thread.join
+    end
+
+    # Checking in with the central server informs it of the location and
+    # configuration of this Node. If it can't check in, there's no point in
+    # starting.
+    def check_in
+      @server["/node/#{@host}"].put(
+        :port            => @port,
+        :max_workers     => CloudCrowd.config[:max_workers],
+        :enabled_actions => @enabled_actions.join(',')
+      )
+    rescue Errno::ECONNREFUSED
+      puts "Failed to connect to the central server (#{@server.to_s}), exiting..."
+      raise SystemExit
+    end
+
+    # Before exiting, the Node checks out with the central server, releasing all
+    # of its WorkUnits for other Nodes to handle.
+    def check_out
+      @server["/node/#{@host}"].delete
+    end
+
+
+    private
+
+    # Launch the Node's Thin server in a separate thread because it blocks.
+    def start_server
+      @server_thread = Thread.new do
+        Thin::Server.start('0.0.0.0', @port, self, :signals => false)
+      end
+    end
+
+    # Trap exit signals in order to shut down cleanly.
+    def trap_signals
+      Signal.trap('INT')  { shut_down }
+      Signal.trap('KILL') { shut_down }
+      Signal.trap('TERM') { shut_down }
+    end
+
+    # At shut down, de-register with the central server before exiting.
+    def shut_down
+      check_out
+      Process.exit
+    end
+
+  end
+
+end
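
From the Node's side, the whole registration handshake in check_in is one HTTP PUT. A standalone sketch of that request, assuming HTTP authentication is off; the central server URL and the parameter values are illustrative (9063 is the DEFAULT_PORT from the diff above):

    require 'rest_client'
    require 'socket'

    # Hypothetical stand-in for Node#check_in: announce this machine's
    # location and capabilities to the central server.
    central = RestClient::Resource.new('http://localhost:9173')  # assumed server URL

    central["/node/#{Socket.gethostname}"].put(
      :port            => 9063,
      :max_workers     => 5,
      :enabled_actions => 'word_count,process_pdfs'
    )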
data/lib/cloud_crowd/schema.rb
CHANGED
@@ -1,5 +1,5 @@
 # Complete schema for CloudCrowd.
-ActiveRecord::Schema.define(:version =>
+ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
 
   create_table "jobs", :force => true do |t|
     t.integer  "status", :null => false
@@ -10,7 +10,16 @@ ActiveRecord::Schema.define(:version => 1) do
     t.float    "time"
     t.string   "callback_url"
     t.string   "email"
-    t.
+    t.datetime "created_at"
+    t.datetime "updated_at"
+  end
+
+  create_table "node_records", :force => true do |t|
+    t.string   "host",            :null => false
+    t.string   "ip_address",      :null => false
+    t.integer  "port",            :null => false
+    t.string   "enabled_actions", :default => '', :null => false
+    t.integer  "max_workers"
     t.datetime "created_at"
     t.datetime "updated_at"
   end
@@ -20,26 +29,21 @@ ActiveRecord::Schema.define(:version => 1) do
     t.integer  "job_id", :null => false
     t.text     "input",  :null => false
     t.string   "action", :null => false
-    t.integer  "attempts",
-    t.integer  "
-    t.integer  "
+    t.integer  "attempts",       :default => 0, :null => false
+    t.integer  "node_record_id"
+    t.integer  "worker_pid"
+    t.integer  "reservation"
     t.float    "time"
     t.text     "output"
     t.datetime "created_at"
     t.datetime "updated_at"
   end
-
-  create_table "worker_records", :force => true do |t|
-    t.string   "name",          :null => false
-    t.string   "thread_status", :null => false
-    t.datetime "created_at"
-    t.datetime "updated_at"
-  end
-
-  add_index "jobs", ["status"], :name => "index_jobs_on_status"
-  add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
-  add_index "work_units", ["status", "worker_record_id", "action"], :name => "index_work_units_on_status_and_worker_record_id_and_action"
-  add_index "worker_records", ["name"], :name => "index_worker_records_on_name"
-  add_index "worker_records", ["updated_at"], :name => "index_worker_records_on_updated_at"
 
+  # Here be indices. After looking, it seems faster not to have them at all.
+  #
+  # add_index "jobs", ["status"], :name => "index_jobs_on_status"
+  # add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
+  # add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
+  # add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
+  # add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
 end
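
Since the schema version now comes from CloudCrowd::SCHEMA_VERSION, bootstrapping a database amounts to evaluating this file inside an established ActiveRecord connection. A minimal sketch, assuming SQLite and a local checkout; the adapter, database name, constant value, and load path are assumptions for illustration:

    require 'active_record'

    module CloudCrowd
      SCHEMA_VERSION = 2 unless defined?(SCHEMA_VERSION)  # illustrative; the gem defines the real value
    end

    ActiveRecord::Base.establish_connection(
      :adapter  => 'sqlite3',
      :database => 'cloud_crowd.db'
    )

    # Evaluates the ActiveRecord::Schema.define block above, creating the
    # jobs, node_records, and work_units tables.
    load 'lib/cloud_crowd/schema.rb'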
data/lib/cloud_crowd/{app.rb → server.rb}
CHANGED
@@ -5,6 +5,7 @@ module CloudCrowd
   # == Admin
   # [get /] Render the admin console, with a progress meter for running jobs.
   # [get /status] Get the combined JSON of every active job and worker.
+  # [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
   # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
   #
   # == Public API
@@ -13,10 +14,10 @@ module CloudCrowd
   # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
   #
   # == Internal Workers API
-  # [
+  # [put /node/:host] Registers a new Node, making it available for processing.
+  # [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
   # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
-
-  class App < Sinatra::Default
+  class Server < Sinatra::Default
 
     set :root, ROOT
     set :authorization_realm, "CloudCrowd"
@@ -28,12 +29,12 @@ module CloudCrowd
 
     # Enabling HTTP Authentication turns it on for all requests.
     before do
-      login_required if CloudCrowd.config[:
+      login_required if CloudCrowd.config[:http_authentication]
     end
 
     # Render the admin console.
     get '/' do
-      erb :
+      erb :operations_center
     end
 
     # Get the JSON for every active job in the queue and every active worker
@@ -42,15 +43,14 @@ module CloudCrowd
     get '/status' do
       json(
         'jobs'            => Job.incomplete,
-        '
+        'nodes'           => NodeRecord.all(:order => 'host desc'),
         'work_unit_count' => WorkUnit.incomplete.count
       )
     end
 
-    # Get the JSON for a worker
+    # Get the JSON for what a worker is up to.
     get '/worker/:name' do
-
-      json((record && record.work_unit) || {})
+      json WorkUnit.find_by_worker_name(params[:name]) || {}
     end
 
     # To monitor the central server with Monit, God, Nagios, or another
@@ -62,8 +62,11 @@ module CloudCrowd
     # PUBLIC API:
 
     # Start a new job. Accepts a JSON representation of the job-to-be.
+    # Distributes all work units to available nodes.
     post '/jobs' do
-
+      job = Job.create_from_request(JSON.parse(params[:job]))
+      WorkUnit.distribute_to_nodes
+      json job
     end
 
     # Check the status of a job, returning the output if finished, and the
@@ -79,36 +82,33 @@ module CloudCrowd
       json nil
     end
 
-    # INTERNAL
+    # INTERNAL NODE API:
 
-    #
-    #
-
-
+    # A new Node will hit this action to register its location and
+    # configuration with the central server. Triggers distribution of WorkUnits.
+    put '/node/:host' do
+      NodeRecord.check_in(params, request)
+      WorkUnit.distribute_to_nodes
+      json nil
+    end
+
+    # Deregisters a Node from the central server. Releases and redistributes any
+    # WorkUnits it may have had checked out.
+    delete '/node/:host' do
+      NodeRecord.destroy_all(:host => params[:host])
+      json nil
     end
 
     # When workers are done with their unit, either successfully or in failure,
-    # they mark it back on the central server and
-    #
+    # they mark it back on the central server and exit. Triggers distribution
+    # of pending work units.
     put '/work/:work_unit_id' do
-
-
-
-      json dequeue_work_unit
-    when 'failed'
-      current_work_unit.fail(params[:output], params[:time])
-      json dequeue_work_unit(1)
-    else
-      error(500, "Completing a work unit must specify status.")
-    end
+      case params[:status]
+      when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
+      when 'failed'    then current_work_unit.fail(params[:output], params[:time])
+      else                  error(500, "Completing a work unit must specify status.")
       end
-
-
-    # Every so often workers check in to let the central server know that
-    # they're still alive. Keep up-to-date records
-    put '/worker' do
-      params[:terminated] ? WorkerRecord.check_out(params) : WorkerRecord.check_in(params)
+      WorkUnit.distribute_to_nodes
       json nil
     end
 
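
Putting the new routes together, a client round-trip against the central server looks like the sketch below. The server URL and job payload are illustrative (word_count is one of the gem's bundled example actions), and the id field is assumed to be part of the returned Job JSON:

    require 'rest_client'
    require 'json'

    server = RestClient::Resource.new('http://localhost:9173')  # assumed server address

    # POST /jobs creates the Job and immediately tries to distribute its
    # WorkUnits to any registered Nodes.
    job = JSON.parse(server['/jobs'].post(:job => {
      'action' => 'word_count',
      'inputs' => ['http://www.gutenberg.org/files/1342/1342-0.txt']
    }.to_json))

    # GET /jobs/:job_id reports status, and includes the output once finished.
    puts server["/jobs/#{job['id']}"].get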