documentcloud-cloud-crowd 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,48 @@
1
# Load the CloudCrowd configuration first, so the wait-time constants
# below can read the configured values.
CloudCrowd.configure(ENV['CLOUD_CROWD_CONFIG'])

require 'cloud_crowd/worker'

module CloudCrowd

  # A CloudCrowd::Daemon, started by the Daemons gem, runs a CloudCrowd::Worker in
  # a loop, continually fetching and processing WorkUnits from the central
  # server. The Daemon backs off and pings central less frequently when there
  # isn't any work to be done, and speeds back up when there is.
  class Daemon

    # Polling intervals (in seconds), drawn from the loaded configuration.
    DEFAULT_WAIT    = CloudCrowd.config[:default_worker_wait]
    MAX_WAIT        = CloudCrowd.config[:max_worker_wait]
    WAIT_MULTIPLIER = CloudCrowd.config[:worker_wait_multiplier]

    # Create the wrapped Worker and trap the terminating signals so the
    # daemon exits cleanly. NB: SIGKILL is reserved by the operating system
    # and can never be caught, so only INT and TERM are trapped.
    def initialize
      @wait_time = DEFAULT_WAIT
      @worker = CloudCrowd::Worker.new
      Signal.trap('INT',  'EXIT')
      Signal.trap('TERM', 'EXIT')
    end

    # Loop forever, fetching WorkUnits.
    # TODO: Workers busy with their work units won't die until the unit has
    # been finished. This should probably be wrapped in an appropriately lengthy
    # timeout, or should be killable from the outside by terminating the thread.
    # In either case, nasty un-cleaned-up bits might be left behind.
    def run
      loop do
        @worker.fetch_work_unit
        if @worker.has_work?
          @worker.run
          @wait_time = DEFAULT_WAIT # Reset the back-off once work appears.
          sleep 0.01 # So as to listen for incoming signals.
        else
          # No work available: back off geometrically, capped at MAX_WAIT.
          @wait_time = [@wait_time * WAIT_MULTIPLIER, MAX_WAIT].min
          sleep @wait_time
        end
      end
    end

  end

end

CloudCrowd::Daemon.new.run
@@ -0,0 +1,15 @@
1
module CloudCrowd
  module Helpers
    module Resources

      # Find and memoize the Job named by params[:job_id].
      # Raises Sinatra::NotFound when no such record exists.
      def current_job
        @job ||= Job.find_by_id(params[:job_id])
        raise Sinatra::NotFound unless @job
        @job
      end

      # Find and memoize the WorkUnit named by params[:work_unit_id].
      # Raises Sinatra::NotFound when no such record exists.
      def current_work_unit
        @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id])
        raise Sinatra::NotFound unless @work_unit
        @work_unit
      end

    end
  end
end
@@ -0,0 +1,7 @@
1
module CloudCrowd
  module Helpers
    # Placeholder for URL-generation helpers; currently empty.
    module Urls

    end
  end
end
@@ -0,0 +1,8 @@
1
require 'cloud_crowd/helpers/resources'
require 'cloud_crowd/helpers/urls'

module CloudCrowd
  # Aggregates the individual helper modules into a single mixin.
  module Helpers
    include Resources, Urls #, Rack::Utils
  end
end
@@ -0,0 +1,129 @@
1
# A chunk of work that will be farmed out into many WorkUnits to be processed
# in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
# of inputs (usually public urls to files), an action (the name of a script that
# CloudCrowd knows how to run), and, eventually a corresponding list of output.
class Job < ActiveRecord::Base
  include CloudCrowd::ModelStatus

  has_many :work_units, :dependent => :destroy

  validates_presence_of :status, :inputs, :action, :options

  # Create a Job from an incoming JSON or XML request, and add it to the queue.
  # 'inputs' and 'options' are serialized to JSON strings for storage.
  # TODO: Add XML support.
  def self.create_from_request(h)
    self.create(
      :inputs => h['inputs'].to_json,
      :action => h['action'],
      :options => (h['options'] || {}).to_json,
      :owner_email => h['owner_email'],
      :callback_url => h['callback_url']
    )
  end

  # Rails 2.x callback: as soon as the Job is saved, split its inputs
  # into queued WorkUnits.
  def after_create
    self.queue_for_workers(JSON.parse(self.inputs))
  end

  # Jobs whose action defines a split method begin in the SPLITTING phase;
  # everything else goes straight to PROCESSING.
  def before_validation_on_create
    self.status = self.splittable? ? CloudCrowd::SPLITTING : CloudCrowd::PROCESSING
  end

  # After work units are marked successful, we check to see if all of them have
  # finished, if so, this job is complete. Advances the phase, records outputs
  # and total time when done, and either queues the next round of WorkUnits or
  # fires the completion callback.
  def check_for_completion
    return unless all_work_units_complete?
    transition_to_next_phase
    output_list = gather_outputs_from_work_units

    if complete?
      self.outputs = output_list.to_json
      self.time = Time.now - self.created_at
    end
    self.save

    case self.status
    # PROCESSING after a split: each parsed output becomes a new unit.
    when CloudCrowd::PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
    # MERGING: the entire output list is passed as one JSON string, so the
    # merge runs as a single WorkUnit.
    when CloudCrowd::MERGING then queue_for_workers(output_list.to_json)
    else fire_callback
    end
    self
  end

  # Transition from the current phase to the next one. Any failure trumps
  # everything; SPLITTING advances to PROCESSING; PROCESSING advances to
  # MERGING when the action defines a merge method, otherwise SUCCEEDED.
  def transition_to_next_phase
    self.status = any_work_units_failed? ? CloudCrowd::FAILED :
                  self.splitting? ? CloudCrowd::PROCESSING :
                  self.should_merge? ? CloudCrowd::MERGING :
                  CloudCrowd::SUCCEEDED
  end

  # If a callback_url is defined, post the Job's JSON to it upon completion.
  # A failed callback is only logged to stdout; the job is not retried.
  def fire_callback
    begin
      RestClient.post(callback_url, {:job => self.to_json}) if callback_url
    rescue RestClient::Exception => e
      puts "Failed to fire job callback. Hmmm, what should happen here?"
    end
  end

  # Cleaning up after a job will remove all of its files from S3.
  def cleanup
    CloudCrowd::AssetStore.new.cleanup_job(self)
  end

  # Have all of the WorkUnits finished? We could trade reads for writes here
  # by keeping a completed_count on the Job itself.
  def all_work_units_complete?
    self.work_units.incomplete.count <= 0
  end

  # Have any of the WorkUnits failed?
  def any_work_units_failed?
    self.work_units.failed.count > 0
  end

  # An action is splittable when its class defines a split method.
  def splittable?
    self.action_class.new.respond_to? :split
  end

  # Merge only after the PROCESSING phase, and only when the action class
  # defines a merge method.
  def should_merge?
    self.processing? && self.action_class.new.respond_to?(:merge)
  end

  # Look up the Action class that performs this job's work.
  def action_class
    CloudCrowd.actions(self.action)
  end

  # Collect the outputs of all completed WorkUnits, destroying those units
  # afterwards (their results now live on the Job).
  def gather_outputs_from_work_units
    outs = self.work_units.complete.map {|wu| wu.output }
    self.work_units.complete.destroy_all
    outs
  end

  # Human-readable version of the numeric status code.
  def display_status
    CloudCrowd.display_status(self.status)
  end

  # Number of WorkUnits that have not yet completed.
  def work_units_remaining
    self.work_units.incomplete.count
  end

  # A JSON representation of this job includes the statuses of its component
  # WorkUnits, as well as any completed outputs.
  # NOTE(review): the opts argument is accepted but currently ignored.
  def to_json(opts={})
    atts = {'id' => self.id, 'status' => self.display_status, 'work_units_remaining' => self.work_units_remaining}
    atts.merge!({'output' => JSON.parse(self.outputs)}) if self.outputs
    atts.merge!({'time' => self.time}) if self.time
    atts.to_json
  end

  # When starting a new job, or moving to a new stage, split up the inputs
  # into WorkUnits, and queue them. Accepts either a single input or an array.
  def queue_for_workers(input)
    [input].flatten.each do |wu_input|
      WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
    end
  end

end
@@ -0,0 +1,62 @@
1
# A WorkUnit is an atomic chunk of work from a job, processing a single input
# through a single action. All WorkUnits receive the same options.
class WorkUnit < ActiveRecord::Base
  include CloudCrowd::ModelStatus

  belongs_to :job

  validates_presence_of :job_id, :status, :input

  after_save :check_for_job_completion

  # After saving a WorkUnit, its Job should check if it just became complete.
  def check_for_job_completion
    self.job.check_for_completion if complete?
  end

  # Mark this unit as having finished successfully, recording its output and
  # the time taken, and releasing the 'taken' lock.
  def finish(output, time_taken)
    update_attributes({
      :status => CloudCrowd::SUCCEEDED,
      :taken => false,
      :attempts => self.attempts + 1,
      :output => output,
      :time => time_taken
    })
  end

  # Mark this unit as having failed. May attempt a retry.
  # NOTE(review): this method shadows Kernel#fail within this class.
  def fail(output, time_taken)
    tries = self.attempts + 1
    return try_again if tries < CloudCrowd.config[:work_unit_retries]
    update_attributes({
      :status => CloudCrowd::FAILED,
      :taken => false,
      :attempts => tries,
      :output => output,
      :time => time_taken
    })
  end

  # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
  # Releases the unit back to the queue with an incremented attempt count.
  def try_again
    update_attributes({
      :taken => false,
      :attempts => self.attempts + 1
    })
  end

  # The JSON representation of a WorkUnit contains common elements of its job.
  def to_json
    {
      'id' => self.id,
      'job_id' => self.job_id,
      'input' => self.input,
      'attempts' => self.attempts,
      'action' => self.job.action,
      'options' => JSON.parse(self.job.options),
      'status' => self.status
    }.to_json
  end

end
@@ -0,0 +1,31 @@
1
module CloudCrowd
  # Shared status machinery mixed into Job and WorkUnit: a named scope and a
  # boolean predicate for each CloudCrowd status constant.
  module ModelStatus

    def self.included(klass)

      klass.class_eval do
        # Note that COMPLETE and INCOMPLETE are unions of other states.
        named_scope 'processing', :conditions => {:status => CloudCrowd::PROCESSING}
        named_scope 'succeeded', :conditions => {:status => CloudCrowd::SUCCEEDED}
        named_scope 'failed', :conditions => {:status => CloudCrowd::FAILED}
        named_scope 'splitting', :conditions => {:status => CloudCrowd::SPLITTING}
        named_scope 'merging', :conditions => {:status => CloudCrowd::MERGING}
        named_scope 'complete', :conditions => {:status => CloudCrowd::COMPLETE}
        named_scope 'incomplete', :conditions => {:status => CloudCrowd::INCOMPLETE}
      end

    end

    # Predicates for each single status; COMPLETE and INCOMPLETE are
    # collections of statuses, hence the include? checks.
    def processing?; self.status == CloudCrowd::PROCESSING; end
    def succeeded?; self.status == CloudCrowd::SUCCEEDED; end
    def failed?; self.status == CloudCrowd::FAILED; end
    def splitting?; self.status == CloudCrowd::SPLITTING; end
    def merging?; self.status == CloudCrowd::MERGING; end
    def complete?; CloudCrowd::COMPLETE.include?(self.status); end
    def incomplete?; CloudCrowd::INCOMPLETE.include?(self.status); end

  end
end

# Load the models that mix in ModelStatus.
require 'cloud_crowd/models/job'
require 'cloud_crowd/models/work_unit'
@@ -0,0 +1,29 @@
1
# This is the script that kicks off a single CloudCrowd::Daemon. Because the
# daemons don't load the entire rails stack, this file functions like a mini
# environment.rb, loading all the common gems that we need.

# CloudCrowd::App.root = File.expand_path(File.dirname(__FILE__) + '/../..') unless defined?(CloudCrowd::App.root)

# Standard Lib and Gems
require 'fileutils'
require 'rubygems'
require 'daemons'
require 'socket'
require 'yaml'
require 'json'
require 'rest_client'
require 'right_aws'

# Make sure the log directory exists before Daemons writes into it.
# (File.exist? -- the File.exists? alias is deprecated and was removed
# in Ruby 3.2.)
FileUtils.mkdir('log') unless File.exist?('log')

# Daemon/Worker Dependencies.
require "#{File.dirname(__FILE__)}/../cloud-crowd"

# Spawn the daemonized worker, keeping pidfile and logs under log/.
Daemons.run("#{CloudCrowd::App.root}/lib/cloud_crowd/daemon.rb", {
  :app_name => "cloud_crowd_worker",
  :dir_mode => :normal,
  :dir => 'log',
  :multiple => true,
  :backtrace => true,
  :log_output => true
})
@@ -0,0 +1,34 @@
1
# Complete schema for CloudCrowd.
ActiveRecord::Schema.define(:version => 1) do

  create_table "jobs", :force => true do |t|
    t.integer "status", :null => false
    t.text "inputs", :null => false       # JSON-encoded list of inputs.
    t.string "action", :null => false     # Name of the action to run.
    t.text "options", :null => false      # JSON-encoded options hash.
    t.text "outputs"                      # JSON-encoded outputs, set on completion.
    t.float "time"                        # Total seconds from creation to completion.
    t.string "callback_url"
    t.string "owner_email"
    t.integer "lock_version", :default => 0, :null => false # Optimistic locking.
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  create_table "work_units", :force => true do |t|
    t.integer "status", :null => false
    t.integer "job_id", :null => false
    t.text "input", :null => false
    t.integer "attempts", :default => 0, :null => false
    t.integer "lock_version", :default => 0, :null => false # Optimistic locking.
    t.boolean "taken", :default => false, :null => false    # Checked out by a worker?
    t.float "time"                        # Seconds the unit took to process.
    t.text "output"
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  # Indexes backing the job lookup and the status/taken work queue queries.
  add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
  add_index "work_units", ["status", "taken"], :name => "index_work_units_on_status_and_taken"

end
@@ -0,0 +1,115 @@
1
module CloudCrowd

  # A Worker fetches WorkUnits from the central server, runs the requested
  # action on them, and reports the result (success or failure) back.
  class Worker

    CENTRAL_URL = CloudCrowd.config[:central_server]
    RETRY_WAIT  = CloudCrowd.config[:worker_retry_wait]

    attr_reader :action

    # Spinning up a worker will create a new AssetStore with a persistent
    # connection to S3. This AssetStore gets passed into each action, for use
    # as it is run.
    def initialize
      @id = $$
      @hostname = Socket.gethostname
      @store = CloudCrowd::AssetStore.new
    end

    # Ask the central server for a new WorkUnit.
    def fetch_work_unit
      keep_trying_to "fetch a new work unit" do
        unit_json = RestClient.get("#{CENTRAL_URL}/work")
        return unless unit_json # No content means no work for us.
        @start_time = Time.now
        parse_work_unit unit_json
        log "fetched work unit for #{@action_name}"
      end
    end

    # Return output to the central server, marking the current work unit as done.
    def complete_work_unit(result)
      keep_trying_to "complete work unit" do
        data = completion_params.merge({:status => 'succeeded', :output => result})
        RestClient.put("#{CENTRAL_URL}/work/#{data[:id]}", data)
        log "finished #{@action_name} in #{data[:time]} seconds"
      end
    end

    # Mark the current work unit as failed, returning the exception to central.
    def fail_work_unit(exception)
      keep_trying_to "mark work unit as failed" do
        data = completion_params.merge({:status => 'failed', :output => exception.message})
        RestClient.put("#{CENTRAL_URL}/work/#{data[:id]}", data)
        log "failed #{@action_name} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
      end
    end

    # Keep retrying the given communication with central, sleeping RETRY_WAIT
    # seconds between attempts. Rescues StandardError rather than Exception so
    # that SystemExit/SignalException raised by the daemon's signal traps can
    # still terminate the worker (rescuing Exception made workers unkillable).
    def keep_trying_to(title)
      begin
        yield
      rescue StandardError => e
        log "failed to #{title} -- retry in #{RETRY_WAIT} seconds"
        log e.message
        log e.backtrace
        sleep RETRY_WAIT
        retry
      end
    end

    # Does this Worker have a job to do?
    def has_work?
      @action_name && @input && @options
    end

    # Executes the current work unit, catching standard errors as failures.
    # The unit's status selects which phase of the action to run.
    def run
      begin
        @action = CloudCrowd.actions(@action_name).new
        @action.configure(@status, @input, @options, @store)
        result = case @status
        when CloudCrowd::PROCESSING then @action.process
        when CloudCrowd::SPLITTING then @action.split
        when CloudCrowd::MERGING then @action.merge
        else raise "Work units must specify their status."
        end
        complete_work_unit(result)
      rescue StandardError => e
        fail_work_unit(e)
      ensure
        clear_work_unit
      end
    end


    private

    # Common parameters to send back to central, regardless of success or failure.
    def completion_params
      {:id => @options['work_unit_id'], :time => Time.now - @start_time}
    end

    # Extract our instance variables from a WorkUnit's JSON.
    def parse_work_unit(unit_json)
      unit = JSON.parse(unit_json)
      @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
      @options['job_id'] = unit['job_id']
      @options['work_unit_id'] = unit['id']
      @options['attempts'] ||= unit['attempts']
    end

    # Log a message to the daemon log. Includes PID for identification.
    def log(message)
      puts "Worker ##{@id}: #{message}"
    end

    # When we're done with a unit, clear out our ivars to make way for the next.
    # Also, remove all of the previous unit's temporary storage. Guard against
    # @action being nil (e.g. the action lookup itself raised), so the ensure
    # block doesn't mask the original error with a NoMethodError.
    def clear_work_unit
      @action.cleanup_work_directory if @action
      @action, @action_name, @input, @options, @start_time = nil, nil, nil, nil, nil
    end

  end

end
@@ -0,0 +1,32 @@
1
require 'test_helper'

# A Worker Daemon needs to be running to perform this integration test.
class FailingWorkUnitsTest < Test::Unit::TestCase

  should "retry work units when they fail" do
    browser = Rack::Test::Session.new(Rack::MockSession.new(CloudCrowd::App))

    # Enqueue a job with three inputs for the 'failure_testing' action.
    browser.post '/jobs', :json => {
      'action' => 'failure_testing',
      'inputs' => ['one', 'two', 'three'],
      'options' => {}
    }.to_json
    assert browser.last_response.ok?

    # Fail every unit one time short of the configured retry limit; they
    # should all be re-queued rather than marked FAILED.
    job = Job.last
    (CloudCrowd.config[:work_unit_retries] - 1).times do
      job.work_units.each {|unit| unit.fail('failed', 10) }
    end
    assert job.reload.work_units_remaining == 3
    # One final failure per unit should exhaust the retries and fail the job.
    job.work_units.reload.each_with_index do |unit, i|
      assert unit.processing?
      assert unit.attempts == CloudCrowd.config[:work_unit_retries] - 1
      unit.fail('failed', 10)
      assert unit.job.any_work_units_failed? if i == 0
    end
    assert job.reload.failed?
    assert job.work_units.count == 0
  end

end
@@ -0,0 +1,15 @@
1
# Machinist blueprints and Sham generators for the test fixtures.
Sham.url { Faker::Internet.domain_name + "/" + Faker::Internet.domain_word + ".jpg" }

Job.blueprint do
  status { CloudCrowd::PROCESSING }
  inputs { ['http://www.google.com/intl/en_ALL/images/logo.gif'].to_json }
  action { 'graphics_magick' }
  options { {}.to_json }
end

WorkUnit.blueprint do
  job { Job.make }
  status { CloudCrowd::PROCESSING }
  taken { false }
  input { Sham.url }
end
@@ -0,0 +1,10 @@
1
# Number of worker daemons to spawn.
:num_workers: 4
# Initial seconds to wait between polls when no work is available.
:default_worker_wait: 1
# Ceiling for the poll back-off, in seconds.
:max_worker_wait: 20
# Multiplier applied to the wait after each empty poll.
:worker_wait_multiplier: 1.3
# Seconds to wait before retrying failed communication with central.
:worker_retry_wait: 5
# Attempts before a WorkUnit is marked as failed.
:work_unit_retries: 3

# Location of the central CloudCrowd server.
:central_server: http://localhost:9173

# TODO: Add a somewhat oversized work unit timeout.
@@ -0,0 +1,6 @@
1
# Test database connection settings: local MySQL over a Unix socket,
# as the root user with a blank password (test environment only).
:adapter: mysql
:encoding: utf8
:username: root
:password:
:socket: /tmp/mysql.sock
:database: cloud_crowd_test
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+
3
+ here = File.dirname(__FILE__)
4
+ require File.expand_path(here + "/../lib/cloud-crowd")
5
+ CloudCrowd.configure(here + '/config/test_config.yml')
6
+ CloudCrowd.configure_database(here + '/config/test_database.yml')
7
+
8
+ require 'faker'
9
+ require 'sham'
10
+ require 'rack/test'
11
+ require 'shoulda/active_record'
12
+ require 'machinist/active_record'
13
+ require 'mocha'
14
+ require "#{CloudCrowd::App.root}/test/blueprints.rb"
15
+
16
+ class Test::Unit::TestCase
17
+ include CloudCrowd
18
+ end
@@ -0,0 +1,70 @@
1
require 'test_helper'

# Unit tests for the Job model's lifecycle: creation from requests,
# work-unit queueing, completion tracking, and status display.
class JobTest < Test::Unit::TestCase

  context "A CloudCrowd Job" do

    setup do
      @job = Job.make
      @unit = @job.work_units.first
    end

    subject { @job }

    should_have_many :work_units

    should_validate_presence_of :status, :inputs, :action, :options

    should "create all of its work units as soon as the job is created" do
      assert @job.work_units.count >= 1
      assert @job.work_units_remaining == 1
      assert @job.processing?
      assert @unit.processing?
      assert !@job.all_work_units_complete?
    end

    should "know its completion status" do
      assert !@job.all_work_units_complete?
      @unit.update_attributes(:status => CloudCrowd::SUCCEEDED, :output => 'hello')
      assert @job.reload.all_work_units_complete?
      assert @job.work_units_remaining == 0
      assert @job.outputs == "[\"hello\"]"
    end

    should "be able to create a job from a JSON request" do
      job = Job.create_from_request(JSON.parse(<<-EOS
        { "inputs" : ["one", "two", "three"],
          "action" : "graphics_magick",
          "owner_email" : "bob@example.com",
          "callback_url" : "http://example.com/callback" }
      EOS
      ))
      assert job.work_units.count == 3
      assert job.action == 'graphics_magick'
      assert job.action_class == GraphicsMagick
      assert job.callback_url == "http://example.com/callback"
    end

    should "create jobs with a SPLITTING status for actions that have a split method defined" do
      job = Job.create_from_request({'inputs' => ['1'], 'action' => 'pdf_to_images'})
      assert job.splittable?
      assert job.splitting?
    end

    should "fire a callback when a job has finished, successfully or not" do
      Job.any_instance.expects(:fire_callback)
      @job.work_units.first.finish('output', 10)
      assert @job.all_work_units_complete?
    end

    should "have a 'pretty' display of the Job's status" do
      assert @job.display_status == 'processing'
      @job.update_attribute(:status, CloudCrowd::FAILED)
      assert @job.display_status == 'failed'
      @job.update_attribute(:status, CloudCrowd::MERGING)
      assert @job.display_status == 'merging'
    end

  end

end