RubyGems - documentcloud-cloud-crowd - Versions diffs - 0.0.2 → 0.0.3 - Mend

documentcloud-cloud-crowd 0.0.2 → 0.0.3

Files changed (20) hide show

data/cloud-crowd.gemspec +1 -1
data/config/config.example.ru +0 -1
data/config/config.example.yml +41 -11
data/config/database.example.yml +3 -0
data/lib/cloud-crowd.rb +31 -5
data/lib/cloud_crowd/app.rb +16 -21
data/lib/cloud_crowd/daemon.rb +7 -5
data/lib/cloud_crowd/helpers/resources.rb +21 -0
data/lib/cloud_crowd/models/job.rb +123 -120
data/lib/cloud_crowd/models/work_unit.rb +74 -61
data/lib/cloud_crowd/models.rb +0 -2
data/lib/cloud_crowd/runner.rb +4 -16
data/lib/cloud_crowd/worker.rb +12 -9
data/test/acceptance/test_failing_work_units.rb +1 -1
data/test/blueprints.rb +3 -3
data/test/config/config.yml +1 -1
data/test/test_helper.rb +0 -2
data/test/unit/test_job.rb +4 -4
data/test/unit/test_work_unit.rb +2 -2
metadata +1 -1

data/cloud-crowd.gemspec CHANGED Viewed

@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name      = 'cloud-crowd'
-  s.version   = '0.0.2'         # Keep version in sync with cloud-crowd.rb
+  s.version   = '0.0.3'         # Keep version in sync with cloud-crowd.rb
   s.date      = '2009-08-23'
   s.homepage    = "http://documentcloud.org" # wiki page on github?

data/config/config.example.ru CHANGED Viewed

@@ -8,7 +8,6 @@
 require 'rubygems'
 require 'cloud-crowd'
-require 'cloud_crowd/app'
 CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
 CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')

data/config/config.example.yml CHANGED Viewed

@@ -1,16 +1,46 @@
-:num_workers:             4
-:default_worker_wait:     1
-:max_worker_wait:         20
-:worker_wait_multiplier:  1.3
-:worker_retry_wait:       5
-:work_unit_retries:       3
+# The URL where you're planning on running the server/queue/database.
 :central_server:          http://localhost:9173
+# Please provide your AWS credentials for S3 storage of job output.
+:aws_access_key:          [your AWS access key]
+:aws_secret_key:          [your AWS secret access key]
+# Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
+# to keep all resulting files on S3 private. If so, you'll receive authenticated
+# S3 URLs as job output, good for 24 hours. If left public, you'll get the
+# straight URLs to the files on S3.
+:s3_bucket:               [your CloudCrowd bucket]
+:use_s3_authentication:   no
+# Use HTTP Basic Auth for all requests? (Includes all internal worker requests
+# to the central server). If yes, specify the login and password that all
+# requests must provide for authentication.
 :use_http_authentication: no
 :login:                   [your login name]
 :password:                [your password]
-:use_s3_authentication:   no
-:s3_bucket:               [your CloudCrowd bucket]
-:aws_access_key:          [your AWS access key]
-:aws_secret_key:          [your AWS secret access key]
+# Set the following numbers to tweak the configuration of your worker daemons.
+# Optimum results will depend on proportion of the Memory/CPU/IO bottlenecks
+# in your actions, the number of central servers you have running, and your
+# desired balance between latency and traffic.
+# The number of workers that `crowd workers start` spins up.
+:num_workers:             4
+# The minimum number of seconds a worker waits between checking the job queue.
+:min_worker_wait:         1
+# The maximum number of seconds a worker waits between checking the job queue.
+:max_worker_wait:         20
+# The backoff multiplier the worker uses to slow down the check interval when
+# there's no work in the queue.
+:worker_wait_multiplier:  1.3
+# The number of seconds a worker waits to retry when there's some kind of
+# internal error (ie. the central server fails to respond)
+:worker_retry_wait:       5
+# The number of separate attempts that will be made to process an individual
+# work unit, before marking it as having failed.
+:work_unit_retries:       3

data/config/database.example.yml CHANGED Viewed

@@ -1,3 +1,6 @@
+# This is a standard ActiveRecord database.yml file. You can configure it
+# to use any database that ActiveRecord supports.
 :adapter:  mysql
 :encoding: utf8
 :username: root

data/lib/cloud-crowd.rb CHANGED Viewed

@@ -1,21 +1,47 @@
+# The Grand Central of code loading...
 $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
 # Common Gems:
-require 'json'
-require 'rest_client'
-require 'right_aws'
+require 'rubygems'
+gem 'activerecord'
+gem 'daemons'
+gem 'json'
+gem 'rest-client'
+gem 'right_aws'
+gem 'sinatra'
 # Common CloudCrowd libs:
 require 'cloud_crowd/core_ext'
-require 'cloud_crowd/action'
+# Autoloading for all the pieces which may or may not be needed:
+autoload :ActiveRecord, 'activerecord'
+autoload :Benchmark,    'benchmark'
+autoload :Daemons,      'daemons'
+autoload :ERB,          'erb'
+autoload :FileUtils,    'fileutils'
+autoload :JSON,         'json'
+autoload :RestClient,   'rest_client'
+autoload :RightAws,     'right_aws'
+autoload :Sinatra,      'sinatra'
+autoload :Socket,       'socket'
+autoload :YAML,         'yaml'
 module CloudCrowd
+  # Autoload all the CloudCrowd classes which may not be required.
+  autoload :App,        'cloud_crowd/app'
+  autoload :Action,     'cloud_crowd/action'
+  autoload :AssetStore, 'cloud_crowd/asset_store'
+  autoload :Helpers,    'cloud_crowd/helpers'
+  autoload :Job,        'cloud_crowd/models'
+  autoload :WorkUnit,   'cloud_crowd/models'
   # Root directory of the CloudCrowd gem.
   ROOT        = File.expand_path(File.dirname(__FILE__) + '/..')
   # Keep the version in sync with the gemspec.
-  VERSION     = '0.0.2'
+  VERSION     = '0.0.3'
   # A Job is processing if its WorkUnits are in the queue to be handled by workers.
   PROCESSING  = 1

data/lib/cloud_crowd/app.rb CHANGED Viewed

@@ -1,8 +1,3 @@
-require 'erb'
-require 'sinatra'
-require 'cloud_crowd/models'
-require 'cloud_crowd/helpers'
 module CloudCrowd
   class App < Sinatra::Default
@@ -10,7 +5,7 @@ module CloudCrowd
     # static serves files from /public, methodoverride allows the _method param.
     enable :static, :methodoverride
-    set :root,                CloudCrowd::ROOT
+    set :root, CloudCrowd::ROOT
     set :authorization_realm, "CloudCrowd"
     helpers CloudCrowd::Helpers
@@ -40,29 +35,29 @@ module CloudCrowd
     # Internal method for worker daemons to fetch the work unit at the front
     # of the queue. Work unit is marked as taken and handed off to the worker.
     get '/work' do
-      begin
-        unit = WorkUnit.first(:conditions => {:status => CloudCrowd::INCOMPLETE, :taken => false}, :order => "created_at desc")
-        return status(204) && '' unless unit
-        unit.update_attributes(:taken => true)
-        unit.to_json
-      rescue ActiveRecord::StaleObjectError => e
-        return status(204) && ''
-      end
+      dequeue_work_unit
     end
     # When workers are done with their unit, either successfully or in failure,
-    # they mark it back on the central server.
+    # they mark it back on the central server and retrieve another. Failures
+    # pull from one down in the queue, so as to not repeat the same unit.
     put '/work/:work_unit_id' do
-      case params[:status]
-      when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
-      when 'failed'    then current_work_unit.fail(params[:output], params[:time])
-      else             return error(500, "Completing a work unit must specify status.")
+      handle_conflicts(409) do
+        case params[:status]
+        when 'succeeded'
+          current_work_unit.finish(params[:output], params[:time])
+          dequeue_work_unit
+        when 'failed'
+          current_work_unit.fail(params[:output], params[:time])
+          dequeue_work_unit(1)
+        else
+          return error(500, "Completing a work unit must specify status.")
+        end
       end
-      return status(204) && ''
     end
     # To monitor the central server with Monit, God, Nagios, or another
-    # monitoring tool, you can hit /heartbeat to check.
+    # monitoring tool, you can hit /heartbeat to make sure.
     get '/heartbeat' do
       "buh-bump"
     end

data/lib/cloud_crowd/daemon.rb CHANGED Viewed

@@ -10,12 +10,12 @@ module CloudCrowd
   # isn't any work to be done, and speeds back up when there is.
   class Daemon
-    DEFAULT_WAIT    = CloudCrowd.config[:default_worker_wait]
+    MIN_WAIT        = CloudCrowd.config[:min_worker_wait]
     MAX_WAIT        = CloudCrowd.config[:max_worker_wait]
     WAIT_MULTIPLIER = CloudCrowd.config[:worker_wait_multiplier]
     def initialize
-      @wait_time = DEFAULT_WAIT
+      @wait_time = MIN_WAIT
       @worker = CloudCrowd::Worker.new
       Signal.trap('INT',  'EXIT')
       Signal.trap('KILL', 'EXIT')
@@ -31,9 +31,11 @@ module CloudCrowd
       loop do
         @worker.fetch_work_unit
         if @worker.has_work?
-          @worker.run
-          @wait_time = DEFAULT_WAIT
-          sleep 0.01 # So as to listen for incoming signals.
+          @wait_time = MIN_WAIT
+          while @worker.has_work?
+            @worker.run
+            sleep 0.01 # So as to listen for incoming signals.
+          end
         else
           @wait_time = [@wait_time * WAIT_MULTIPLIER, MAX_WAIT].min
           sleep @wait_time

data/lib/cloud_crowd/helpers/resources.rb CHANGED Viewed

@@ -10,6 +10,27 @@ module CloudCrowd
         @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id]) or raise Sinatra::NotFound
       end
+      # Try to fetch a work unit from the queue. If none are pending, respond
+      # with no content.
+      def dequeue_work_unit(offset=0)
+        handle_conflicts do
+          unit = WorkUnit.dequeue(offset)
+          return status(204) && '' unless unit
+          unit.to_json
+        end
+      end
+      # We're using ActiveRecord's optimistic locking, so stale work units
+      # may sometimes arise. handle_conflicts responds with the HTTP status
+      # code of your choosing if the update failed to be applied.
+      def handle_conflicts(code=204)
+        begin
+          yield
+        rescue ActiveRecord::StaleObjectError => e
+          return status(code) && ''
+        end
+      end
     end
   end
 end

data/lib/cloud_crowd/models/job.rb CHANGED Viewed

@@ -1,129 +1,132 @@
-# A chunk of work that will be farmed out into many WorkUnits to be processed
-# in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
-# of inputs (usually public urls to files), an action (the name of a script that
-# CloudCrowd knows how to run), and, eventually a corresponding list of output.
-class Job < ActiveRecord::Base
-  include CloudCrowd::ModelStatus
-  has_many :work_units, :dependent => :destroy
-  validates_presence_of :status, :inputs, :action, :options
-  # Create a Job from an incoming JSON or XML request, and add it to the queue.
-  # TODO: Add XML support.
-  def self.create_from_request(h)
-    self.create(
-      :inputs       => h['inputs'].to_json,
-      :action       => h['action'],
-      :options      => (h['options'] || {}).to_json,
-      :owner_email  => h['owner_email'],
-      :callback_url => h['callback_url']
-    )
-  end
-  def after_create
-    self.queue_for_workers(JSON.parse(self.inputs))
-  end
-  def before_validation_on_create
-    self.status = self.splittable? ? CloudCrowd::SPLITTING : CloudCrowd::PROCESSING
-  end
-  # After work units are marked successful, we check to see if all of them have
-  # finished, if so, this job is complete.
-  def check_for_completion
-    return unless all_work_units_complete?
-    transition_to_next_phase
-    output_list = gather_outputs_from_work_units
+module CloudCrowd
+  # A chunk of work that will be farmed out into many WorkUnits to be processed
+  # in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
+  # of inputs (usually public urls to files), an action (the name of a script that
+  # CloudCrowd knows how to run), and, eventually a corresponding list of output.
+  class Job < ActiveRecord::Base
+    include CloudCrowd::ModelStatus
+    has_many :work_units, :dependent => :destroy
-    if complete?
-      self.outputs = output_list.to_json
-      self.time = Time.now - self.created_at
+    validates_presence_of :status, :inputs, :action, :options
+    # Create a Job from an incoming JSON or XML request, and add it to the queue.
+    # TODO: Add XML support.
+    def self.create_from_request(h)
+      self.create(
+        :inputs       => h['inputs'].to_json,
+        :action       => h['action'],
+        :options      => (h['options'] || {}).to_json,
+        :owner_email  => h['owner_email'],
+        :callback_url => h['callback_url']
+      )
     end
-    self.save
-    case self.status
-    when CloudCrowd::PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
-    when CloudCrowd::MERGING    then queue_for_workers(output_list.to_json)
-    else                             fire_callback
+    def after_create
+      self.queue_for_workers(JSON.parse(self.inputs))
     end
-    self
-  end
-  # Transition from the current phase to the next one.
-  def transition_to_next_phase
-    self.status = any_work_units_failed? ? CloudCrowd::FAILED     :
-                  self.splitting?        ? CloudCrowd::PROCESSING :
-                  self.should_merge?     ? CloudCrowd::MERGING    :
-                                           CloudCrowd::SUCCEEDED
-  end
-  # If a callback_url is defined, post the Job's JSON to it upon completion.
-  def fire_callback
-    begin
-      RestClient.post(callback_url, {:job => self.to_json}) if callback_url
-    rescue RestClient::Exception => e
-      puts "Failed to fire job callback. Hmmm, what should happen here?"
+    def before_validation_on_create
+      self.status = self.splittable? ? CloudCrowd::SPLITTING : CloudCrowd::PROCESSING
     end
-  end
-  # Cleaning up after a job will remove all of its files from S3.
-  def cleanup
-    CloudCrowd::AssetStore.new.cleanup_job(self)
-  end
-  # Have all of the WorkUnits finished? We could trade reads for writes here
-  # by keeping a completed_count on the Job itself.
-  def all_work_units_complete?
-    self.work_units.incomplete.count <= 0
-  end
-  # Have any of the WorkUnits failed?
-  def any_work_units_failed?
-    self.work_units.failed.count > 0
-  end
-  def splittable?
-    self.action_class.new.respond_to? :split
-  end
-  def should_merge?
-    self.processing? && self.action_class.new.respond_to?(:merge)
-  end
-  def action_class
-    CloudCrowd.actions(self.action)
-  end
-  def gather_outputs_from_work_units
-    outs = self.work_units.complete.map {|wu| wu.output }
-    self.work_units.complete.destroy_all
-    outs
-  end
-  def display_status
-    CloudCrowd.display_status(self.status)
-  end
-  def work_units_remaining
-    self.work_units.incomplete.count
-  end
-  # A JSON representation of this job includes the statuses of its component
-  # WorkUnits, as well as any completed outputs.
-  def to_json(opts={})
-    atts = {'id' => self.id, 'status' => self.display_status, 'work_units_remaining' => self.work_units_remaining}
-    atts.merge!({'outputs' => JSON.parse(self.outputs)}) if self.outputs
-    atts.merge!({'time' => self.time}) if self.time
-    atts.to_json
-  end
+    # After work units are marked successful, we check to see if all of them have
+    # finished, if so, this job is complete.
+    def check_for_completion
+      return unless all_work_units_complete?
+      transition_to_next_phase
+      output_list = gather_outputs_from_work_units
-  # When starting a new job, or moving to a new stage, split up the inputs
-  # into WorkUnits, and queue them.
-  def queue_for_workers(input)
-    [input].flatten.each do |wu_input|
-      WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
+      if complete?
+        self.outputs = output_list.to_json
+        self.time = Time.now - self.created_at
+      end
+      self.save
+      case self.status
+      when CloudCrowd::PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
+      when CloudCrowd::MERGING    then queue_for_workers(output_list.to_json)
+      else                             fire_callback
+      end
+      self
+    end
+    # Transition from the current phase to the next one.
+    def transition_to_next_phase
+      self.status = any_work_units_failed? ? CloudCrowd::FAILED     :
+                    self.splitting?        ? CloudCrowd::PROCESSING :
+                    self.should_merge?     ? CloudCrowd::MERGING    :
+                                             CloudCrowd::SUCCEEDED
+    end
+    # If a callback_url is defined, post the Job's JSON to it upon completion.
+    def fire_callback
+      begin
+        RestClient.post(callback_url, {:job => self.to_json}) if callback_url
+      rescue RestClient::Exception => e
+        puts "Failed to fire job callback. Hmmm, what should happen here?"
+      end
+    end
+    # Cleaning up after a job will remove all of its files from S3.
+    def cleanup
+      CloudCrowd::AssetStore.new.cleanup_job(self)
+    end
+    # Have all of the WorkUnits finished? We could trade reads for writes here
+    # by keeping a completed_count on the Job itself.
+    def all_work_units_complete?
+      self.work_units.incomplete.count <= 0
+    end
+    # Have any of the WorkUnits failed?
+    def any_work_units_failed?
+      self.work_units.failed.count > 0
+    end
+    def splittable?
+      self.action_class.new.respond_to? :split
+    end
+    def should_merge?
+      self.processing? && self.action_class.new.respond_to?(:merge)
+    end
+    def action_class
+      CloudCrowd.actions(self.action)
+    end
+    def gather_outputs_from_work_units
+      outs = self.work_units.complete.map {|wu| wu.output }
+      self.work_units.complete.destroy_all
+      outs
+    end
+    def display_status
+      CloudCrowd.display_status(self.status)
+    end
+    def work_units_remaining
+      self.work_units.incomplete.count
+    end
+    # A JSON representation of this job includes the statuses of its component
+    # WorkUnits, as well as any completed outputs.
+    def to_json(opts={})
+      atts = {'id' => self.id, 'status' => self.display_status, 'work_units_remaining' => self.work_units_remaining}
+      atts.merge!({'outputs' => JSON.parse(self.outputs)}) if self.outputs
+      atts.merge!({'time' => self.time}) if self.time
+      atts.to_json
+    end
+    # When starting a new job, or moving to a new stage, split up the inputs
+    # into WorkUnits, and queue them.
+    def queue_for_workers(input)
+      [input].flatten.each do |wu_input|
+        WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
+      end
     end
   end
 end

data/lib/cloud_crowd/models/work_unit.rb CHANGED Viewed

@@ -1,62 +1,75 @@
-# A WorkUnit is an atomic chunk of work from a job, processing a single input
-# through a single action. All WorkUnits receive the same options.
-class WorkUnit < ActiveRecord::Base
-  include CloudCrowd::ModelStatus
-  belongs_to :job
-  validates_presence_of :job_id, :status, :input
-  after_save :check_for_job_completion
-  # After saving a WorkUnit, its Job should check if it has just become complete.
-  def check_for_job_completion
-    self.job.check_for_completion if complete?
+module CloudCrowd
+  # A WorkUnit is an atomic chunk of work from a job, processing a single input
+  # through a single action. All WorkUnits receive the same options.
+  class WorkUnit < ActiveRecord::Base
+    include CloudCrowd::ModelStatus
+    belongs_to :job
+    validates_presence_of :job_id, :status, :input
+    after_save :check_for_job_completion
+    # Find the Nth available WorkUnit in the queue, and take it out.
+    def self.dequeue(offset=0)
+      unit = self.first(
+        :conditions => {:status => CloudCrowd::INCOMPLETE, :taken => false},
+        :order      => "created_at asc",
+        :offset     => offset
+      )
+      unit ? unit.update_attributes(:taken => true) && unit : nil
+    end
+    # After saving a WorkUnit, its Job should check if it has just become complete.
+    def check_for_job_completion
+      self.job.check_for_completion if complete?
+    end
+    # Mark this unit as having finished successfully.
+    def finish(output, time_taken)
+      update_attributes({
+        :status   => CloudCrowd::SUCCEEDED,
+        :taken    => false,
+        :attempts => self.attempts + 1,
+        :output   => output,
+        :time     => time_taken
+      })
+    end
+    # Mark this unit as having failed. May attempt a retry.
+    def fail(output, time_taken)
+      tries = self.attempts + 1
+      return try_again if tries < CloudCrowd.config[:work_unit_retries]
+      update_attributes({
+        :status   => CloudCrowd::FAILED,
+        :taken    => false,
+        :attempts => tries,
+        :output   => output,
+        :time     => time_taken
+      })
+    end
+    # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
+    def try_again
+      update_attributes({
+        :taken    => false,
+        :attempts => self.attempts + 1
+      })
+    end
+    # The JSON representation of a WorkUnit contains common elements of its job.
+    def to_json
+      {
+        'id'        => self.id,
+        'job_id'    => self.job_id,
+        'input'     => self.input,
+        'attempts'  => self.attempts,
+        'action'    => self.job.action,
+        'options'   => JSON.parse(self.job.options),
+        'status'    => self.status
+      }.to_json
+    end
   end
-  # Mark this unit as having finished successfully.
-  def finish(output, time_taken)
-    update_attributes({
-      :status   => CloudCrowd::SUCCEEDED,
-      :taken    => false,
-      :attempts => self.attempts + 1,
-      :output   => output,
-      :time     => time_taken
-    })
-  end
-  # Mark this unit as having failed. May attempt a retry.
-  def fail(output, time_taken)
-    tries = self.attempts + 1
-    return try_again if tries < CloudCrowd.config[:work_unit_retries]
-    update_attributes({
-      :status   => CloudCrowd::FAILED,
-      :taken    => false,
-      :attempts => tries,
-      :output   => output,
-      :time     => time_taken
-    })
-  end
-  # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
-  def try_again
-    update_attributes({
-      :taken    => false,
-      :attempts => self.attempts + 1
-    })
-  end
-  # The JSON representation of a WorkUnit contains common elements of its job.
-  def to_json
-    {
-      'id'        => self.id,
-      'job_id'    => self.job_id,
-      'input'     => self.input,
-      'attempts'  => self.attempts,
-      'action'    => self.job.action,
-      'options'   => JSON.parse(self.job.options),
-      'status'    => self.status
-    }.to_json
-  end
-end
+end

data/lib/cloud_crowd/models.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-require 'activerecord'
 module CloudCrowd
   module ModelStatus

data/lib/cloud_crowd/runner.rb CHANGED Viewed

@@ -1,22 +1,10 @@
-# This is the script that kicks off a single CloudCrowd::Daemon. Because the
-# daemons don't load the entire rails stack, this file functions like a mini
-# environment.rb, loading all the common gems that we need.
-# Standard Libs
-require 'fileutils'
-require 'benchmark'
-require 'socket'
-# Gems
-require 'rubygems'
-require 'daemons'
-require 'yaml'
-FileUtils.mkdir('log') unless File.exists?('log')
+# This is the script that kicks off a single CloudCrowd::Daemon. Rely on
+# cloud-crowd.rb for autoloading of all the code we need.
 # Daemon/Worker Dependencies.
 require "#{File.dirname(__FILE__)}/../cloud-crowd"
-require 'cloud_crowd/asset_store'
+FileUtils.mkdir('log') unless File.exists?('log')
 Daemons.run("#{CloudCrowd::ROOT}/lib/cloud_crowd/daemon.rb", {
   :app_name   => "cloud_crowd_worker",

data/lib/cloud_crowd/worker.rb CHANGED Viewed

@@ -22,10 +22,7 @@ module CloudCrowd
     def fetch_work_unit
       keep_trying_to "fetch a new work unit" do
         unit_json = @server['/work'].get
-        return unless unit_json # No content means no work for us.
-        @start_time = Time.now
-        parse_work_unit unit_json
-        log "fetched work unit for #{@action_name}"
+        setup_work_unit(unit_json)
       end
     end
@@ -33,8 +30,10 @@ module CloudCrowd
     def complete_work_unit(result)
       keep_trying_to "complete work unit" do
         data = completion_params.merge({:status => 'succeeded', :output => result})
-        @server["/work/#{data[:id]}"].put(data)
+        unit_json = @server["/work/#{data[:id]}"].put(data)
         log "finished #{@action_name} in #{data[:time]} seconds"
+        clear_work_unit
+        setup_work_unit(unit_json)
       end
     end
@@ -42,8 +41,10 @@ module CloudCrowd
     def fail_work_unit(exception)
       keep_trying_to "mark work unit as failed" do
         data = completion_params.merge({:status => 'failed', :output => exception.message})
-        @server["/work/#{data[:id]}"].put(data)
+        unit_json = @server["/work/#{data[:id]}"].put(data)
         log "failed #{@action_name} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
+        clear_work_unit
+        setup_work_unit(unit_json)
       end
     end
@@ -78,8 +79,6 @@ module CloudCrowd
         complete_work_unit(result)
       rescue Exception => e
         fail_work_unit(e)
-      ensure
-        clear_work_unit
       end
     end
@@ -107,12 +106,16 @@ module CloudCrowd
     end
     # Extract our instance variables from a WorkUnit's JSON.
-    def parse_work_unit(unit_json)
+    def setup_work_unit(unit_json)
+      return false unless unit_json
       unit = JSON.parse(unit_json)
+      @start_time = Time.now
       @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
       @options['job_id'] = unit['job_id']
       @options['work_unit_id'] = unit['id']
       @options['attempts'] ||= unit['attempts']
+      log "fetched work unit for #{@action_name}"
+      return true
     end
     # Log a message to the daemon log. Includes PID for identification.

data/test/acceptance/test_failing_work_units.rb CHANGED Viewed

@@ -13,7 +13,7 @@ class FailingWorkUnitsTest < Test::Unit::TestCase
     }.to_json
     assert browser.last_response.ok?
-    job = Job.last
+    job = CloudCrowd::Job.last
     (CloudCrowd.config[:work_unit_retries] - 1).times do
       job.work_units.each {|unit| unit.fail('failed', 10) }
     end

data/test/blueprints.rb CHANGED Viewed

@@ -1,14 +1,14 @@
 Sham.url        { Faker::Internet.domain_name + "/" + Faker::Internet.domain_word + ".jpg" }
-Job.blueprint do
+CloudCrowd::Job.blueprint do
   status  { CloudCrowd::PROCESSING }
   inputs  { ['http://www.google.com/intl/en_ALL/images/logo.gif'].to_json }
   action  { 'graphics_magick' }
   options { {}.to_json }
 end
-WorkUnit.blueprint do
-  job    { Job.make }
+CloudCrowd::WorkUnit.blueprint do
+  job    { CloudCrowd::Job.make }
   status { CloudCrowd::PROCESSING }
   taken  { false }
   input  { Sham.url }

data/test/config/config.yml CHANGED Viewed

@@ -1,5 +1,5 @@
 :num_workers:             4
-:default_worker_wait:     1
+:min_worker_wait:         1
 :max_worker_wait:         20
 :worker_wait_multiplier:  1.3
 :worker_retry_wait:       5

data/test/test_helper.rb CHANGED Viewed

@@ -2,8 +2,6 @@ require 'rubygems'
 here = File.dirname(__FILE__)
 require File.expand_path(here + "/../lib/cloud-crowd")
-require 'cloud_crowd/app'
 CloudCrowd.configure(here + '/config/config.yml')
 CloudCrowd.configure_database(here + '/config/database.yml')

data/test/unit/test_job.rb CHANGED Viewed

@@ -5,7 +5,7 @@ class JobTest < Test::Unit::TestCase
   context "A CloudCrowd Job" do
     setup do
-      @job = Job.make
+      @job = CloudCrowd::Job.make
       @unit = @job.work_units.first
     end
@@ -32,7 +32,7 @@ class JobTest < Test::Unit::TestCase
     end
     should "be able to create a job from a JSON request" do
-      job = Job.create_from_request(JSON.parse(<<-EOS
+      job = CloudCrowd::Job.create_from_request(JSON.parse(<<-EOS
       { "inputs"       : ["one", "two", "three"],
         "action"       : "graphics_magick",
         "owner_email"  : "bob@example.com",
@@ -46,13 +46,13 @@ class JobTest < Test::Unit::TestCase
     end
     should "create jobs with a SPLITTING status for actions that have a split method defined" do
-      job = Job.create_from_request({'inputs' => ['1'], 'action' => 'pdf_to_images'})
+      job = CloudCrowd::Job.create_from_request({'inputs' => ['1'], 'action' => 'pdf_to_images'})
       assert job.splittable?
       assert job.splitting?
     end
     should "fire a callback when a job has finished, successfully or not" do
-      Job.any_instance.expects(:fire_callback)
+      CloudCrowd::Job.any_instance.expects(:fire_callback)
       @job.work_units.first.finish('output', 10)
       assert @job.all_work_units_complete?
     end

data/test/unit/test_work_unit.rb CHANGED Viewed

@@ -5,7 +5,7 @@ class WorkUnitTest < Test::Unit::TestCase
   context "A WorkUnit" do
     setup do
-      @unit = WorkUnit.make
+      @unit = CloudCrowd::WorkUnit.make
       @job = @unit.job
     end
@@ -26,7 +26,7 @@ class WorkUnitTest < Test::Unit::TestCase
     end
     should "have JSON that includes job attributes" do
-      job = Job.make
+      job = CloudCrowd::Job.make
       unit_data = JSON.parse(job.work_units.first.to_json)
       assert unit_data['job_id'] == job.id
       assert unit_data['action'] == job.action

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: documentcloud-cloud-crowd
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Jeremy Ashkenas