RubyGems - cloud-crowd - Versions diffs - 0.1.0 - Mend

cloud-crowd 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

data/EPIGRAPHS +17 -0
data/LICENSE +22 -0
data/README +93 -0
data/actions/graphics_magick.rb +43 -0
data/actions/process_pdfs.rb +92 -0
data/actions/word_count.rb +14 -0
data/bin/crowd +5 -0
data/cloud-crowd.gemspec +111 -0
data/config/config.example.ru +17 -0
data/config/config.example.yml +48 -0
data/config/database.example.yml +9 -0
data/examples/graphics_magick_example.rb +44 -0
data/examples/process_pdfs_example.rb +40 -0
data/examples/word_count_example.rb +41 -0
data/lib/cloud-crowd.rb +130 -0
data/lib/cloud_crowd/action.rb +101 -0
data/lib/cloud_crowd/app.rb +117 -0
data/lib/cloud_crowd/asset_store.rb +41 -0
data/lib/cloud_crowd/asset_store/filesystem_store.rb +28 -0
data/lib/cloud_crowd/asset_store/s3_store.rb +40 -0
data/lib/cloud_crowd/command_line.rb +209 -0
data/lib/cloud_crowd/daemon.rb +95 -0
data/lib/cloud_crowd/exceptions.rb +28 -0
data/lib/cloud_crowd/helpers.rb +8 -0
data/lib/cloud_crowd/helpers/authorization.rb +50 -0
data/lib/cloud_crowd/helpers/resources.rb +45 -0
data/lib/cloud_crowd/inflector.rb +19 -0
data/lib/cloud_crowd/models.rb +40 -0
data/lib/cloud_crowd/models/job.rb +176 -0
data/lib/cloud_crowd/models/work_unit.rb +89 -0
data/lib/cloud_crowd/models/worker_record.rb +61 -0
data/lib/cloud_crowd/runner.rb +15 -0
data/lib/cloud_crowd/schema.rb +45 -0
data/lib/cloud_crowd/worker.rb +186 -0
data/public/css/admin_console.css +221 -0
data/public/css/reset.css +42 -0
data/public/images/bullet_green.png +0 -0
data/public/images/bullet_white.png +0 -0
data/public/images/cloud_hand.png +0 -0
data/public/images/header_back.png +0 -0
data/public/images/logo.png +0 -0
data/public/images/queue_fill.png +0 -0
data/public/images/server_error.png +0 -0
data/public/images/sidebar_bottom.png +0 -0
data/public/images/sidebar_top.png +0 -0
data/public/images/worker_info.png +0 -0
data/public/images/worker_info_loading.gif +0 -0
data/public/js/admin_console.js +168 -0
data/public/js/excanvas.js +1 -0
data/public/js/flot.js +1 -0
data/public/js/jquery.js +19 -0
data/test/acceptance/test_app.rb +72 -0
data/test/acceptance/test_failing_work_units.rb +32 -0
data/test/acceptance/test_word_count.rb +49 -0
data/test/blueprints.rb +17 -0
data/test/config/actions/failure_testing.rb +13 -0
data/test/config/config.ru +17 -0
data/test/config/config.yml +7 -0
data/test/config/database.yml +6 -0
data/test/test_helper.rb +19 -0
data/test/unit/test_action.rb +49 -0
data/test/unit/test_configuration.rb +28 -0
data/test/unit/test_job.rb +78 -0
data/test/unit/test_work_unit.rb +55 -0
data/views/index.erb +77 -0
metadata +233 -0

data/lib/cloud_crowd/helpers/authorization.rb ADDED

@@ -0,0 +1,50 @@
+module CloudCrowd
+  module Helpers
+    # Authorization takes after sinatra-authorization... See
+    # http://github.com/integrity/sinatra-authorization
+    # for the original.
+    module Authorization
+      # Ensure that the request includes the correct credentials.
+      def login_required
+        return if authorized?
+        unauthorized! unless auth.provided?
+        bad_request!  unless auth.basic?
+        unauthorized! unless authorize(*auth.credentials)
+        request.env['REMOTE_USER'] = auth.username
+      end
+      # Has the request been authenticated?
+      def authorized?
+        !!request.env['REMOTE_USER']
+      end
+      # A request is authorized if its login and password match those stored
+      # in config.yml, or if authentication is disabled. If authentication is
+      # turned on, then every request is authenticated, including between
+      # the worker daemons and the central server.
+      def authorize(login, password)
+        return true unless CloudCrowd.config[:use_http_authentication]
+        return CloudCrowd.config[:login] == login &&
+               CloudCrowd.config[:password] == password
+      end
+      private
+      def auth
+        @auth ||= Rack::Auth::Basic::Request.new(request.env)
+      end
+      def unauthorized!(realm = App.authorization_realm)
+        response['WWW-Authenticate'] = "Basic realm=\"#{realm}\""
+        halt 401, 'Authorization Required'
+      end
+      def bad_request!
+        halt 400, 'Bad Request'
+      end
+    end
+  end
+end

data/lib/cloud_crowd/helpers/resources.rb ADDED

@@ -0,0 +1,45 @@
+module CloudCrowd
+  module Helpers
+    module Resources
+      # Convenience method for responding with JSON. Sets the content-type,
+      # serializes, and allows empty responses.
+      def json(obj)
+        content_type :json
+        return status(204) && '' if obj.nil?
+        obj.to_json
+      end
+      # Lazy-fetch the job specified by <tt>job_id</tt>.
+      def current_job
+        @job ||= Job.find_by_id(params[:job_id]) or raise Sinatra::NotFound
+      end
+      # Lazy-fetch the WorkUnit specified by <tt>work_unit_id</tt>.
+      def current_work_unit
+        @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id]) or raise Sinatra::NotFound
+      end
+      # Try to fetch a work unit from the queue. If none are pending, respond
+      # with no content.
+      def dequeue_work_unit(offset=0)
+        handle_conflicts do
+          worker, actions = params[:worker_name], params[:worker_actions].split(',')
+          WorkUnit.dequeue(worker, actions, offset)
+        end
+      end
+      # We're using ActiveRecords optimistic locking, so stale work units
+      # may sometimes arise. handle_conflicts responds with a the HTTP status
+      # code of your choosing if the update failed to be applied.
+      def handle_conflicts(code=204)
+        begin
+          yield
+        rescue ActiveRecord::StaleObjectError => e
+          return status(code) && ''
+        end
+      end
+    end
+  end
+end

data/lib/cloud_crowd/inflector.rb ADDED

@@ -0,0 +1,19 @@
+module CloudCrowd
+  # Pilfered in parts from the ActiveSupport::Inflector.
+  module Inflector
+    def self.camelize(word)
+      word.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
+    end
+    def self.underscore(word)
+      word.to_s.gsub(/::/, '/').
+        gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
+        gsub(/([a-z\d])([A-Z])/,'\1_\2').
+        tr("-", "_").
+        downcase
+    end
+  end
+end

data/lib/cloud_crowd/models.rb ADDED

@@ -0,0 +1,40 @@
+module CloudCrowd
+  # Adds named scopes and query methods for every CloudCrowd status to
+  # both Jobs and WorkUnits.
+  module ModelStatus
+    def self.included(klass)
+      klass.class_eval do
+        # Note that COMPLETE and INCOMPLETE are unions of other states.
+        named_scope 'processing', :conditions => {:status => PROCESSING}
+        named_scope 'succeeded',  :conditions => {:status => SUCCEEDED}
+        named_scope 'failed',     :conditions => {:status => FAILED}
+        named_scope 'splitting',  :conditions => {:status => SPLITTING}
+        named_scope 'merging',    :conditions => {:status => MERGING}
+        named_scope 'complete',   :conditions => {:status => COMPLETE}
+        named_scope 'incomplete', :conditions => {:status => INCOMPLETE}
+      end
+    end
+    def processing?;  self.status == PROCESSING;          end
+    def succeeded?;   self.status == SUCCEEDED;           end
+    def failed?;      self.status == FAILED;              end
+    def splitting?;   self.status == SPLITTING;           end
+    def merging?;     self.status == MERGING;             end
+    def complete?;    COMPLETE.include?(self.status);     end
+    def incomplete?;  INCOMPLETE.include?(self.status);   end
+    # Get the displayable status name of the model's status code.
+    def display_status
+      CloudCrowd.display_status(self.status)
+    end
+  end
+end
+require 'cloud_crowd/models/job'
+require 'cloud_crowd/models/work_unit'
+require 'cloud_crowd/models/worker_record'

data/lib/cloud_crowd/models/job.rb ADDED

@@ -0,0 +1,176 @@
+module CloudCrowd
+  # A chunk of work that will be farmed out into many WorkUnits to be processed
+  # in parallel by each active CloudCrowd::Worker. Jobs are defined by a list
+  # of inputs (usually public urls to files), an action (the name of a script that
+  # CloudCrowd knows how to run), and, eventually a corresponding list of output.
+  class Job < ActiveRecord::Base
+    include ModelStatus
+    has_many :work_units, :dependent => :destroy
+    validates_presence_of :status, :inputs, :action, :options
+    before_validation_on_create :set_initial_status
+    after_create                :queue_for_workers
+    before_destroy              :cleanup_assets
+    # Create a Job from an incoming JSON or XML request, and add it to the queue.
+    # TODO: Think about XML support.
+    def self.create_from_request(h)
+      self.create(
+        :inputs       => h['inputs'].to_json,
+        :action       => h['action'],
+        :options      => (h['options'] || {}).to_json,
+        :email        => h['email'],
+        :callback_url => h['callback_url']
+      )
+    end
+    # After work units are marked successful, we check to see if all of them have
+    # finished, if so, continue on to the next phase of the job.
+    def check_for_completion
+      return unless all_work_units_complete?
+      transition_to_next_phase
+      output_list = gather_outputs_from_work_units
+      if complete?
+        self.outputs = output_list.to_json
+        self.time = Time.now - self.created_at
+      end
+      self.save
+      case self.status
+      when PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
+      when MERGING    then queue_for_workers(output_list.to_json)
+      else                 fire_callback
+      end
+      self
+    end
+    # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
+    # completion. The <tt>callback_url</tt> may include HTTP basic authentication,
+    # if you like:
+    #   http://user:password@example.com/job_complete
+    def fire_callback
+      begin
+        RestClient.post(callback_url, {:job => self.to_json}) if callback_url
+      rescue RestClient::Exception => e
+        puts "Failed to fire job callback. Hmmm, what should happen here?"
+      end
+    end
+    # Cleaning up after a job will remove all of its files from S3. Destroying
+    # a Job calls cleanup_assets first.
+    def cleanup_assets
+      AssetStore.new.cleanup(self)
+    end
+    # Have all of the WorkUnits finished?
+    #--
+    # We could trade reads for writes here
+    # by keeping a completed_count on the Job itself.
+    #++
+    def all_work_units_complete?
+      self.work_units.incomplete.count <= 0
+    end
+    # Have any of the WorkUnits failed?
+    def any_work_units_failed?
+      self.work_units.failed.count > 0
+    end
+    # This job is splittable if its Action has a +split+ method.
+    def splittable?
+      self.action_class.public_instance_methods.include? 'split'
+    end
+    # This job is mergeable if its Action has a +merge+ method.
+    def mergeable?
+      self.processing? && self.action_class.public_instance_methods.include?('merge')
+    end
+    # Retrieve the class for this Job's Action.
+    def action_class
+      klass = CloudCrowd.actions[self.action]
+      return klass if klass
+      raise Error::ActionNotFound, "no action named: '#{self.action}' could be found"
+    end
+    # How complete is this Job?
+    def percent_complete
+      return 0   if splitting?
+      return 100 if complete?
+      return 99  if merging?
+      (work_units.complete.count / work_units.count.to_f * 100).round
+    end
+    # How long has this Job taken?
+    def time_taken
+      return self.time if self.time
+      Time.now - self.created_at
+    end
+    # Generate a stable 8-bit Hex color code, based on the Job's id.
+    def color
+      @color ||= Digest::MD5.hexdigest(self.id.to_s)[-7...-1]
+    end
+    # A JSON representation of this job includes the statuses of its component
+    # WorkUnits, as well as any completed outputs.
+    def to_json(opts={})
+      atts = {
+        'id'                => id,
+        'color'             => color,
+        'status'            => display_status,
+        'percent_complete'  => percent_complete,
+        'work_units'        => work_units.count,
+        'time_taken'        => time_taken
+      }
+      atts['outputs'] = JSON.parse(outputs) if outputs
+      atts['email']   = email               if email
+      atts.to_json
+    end
+    private
+    # When the WorkUnits are all finished, gather all their outputs together
+    # before removing them from the database entirely.
+    def gather_outputs_from_work_units
+      units = self.work_units.complete
+      outs = self.work_units.complete.map {|u| JSON.parse(u.output)['output'] }
+      self.work_units.complete.destroy_all
+      outs
+    end
+    # Transition this Job's status to the appropriate next status.
+    def transition_to_next_phase
+      self.status = any_work_units_failed? ? FAILED     :
+                    self.splitting?        ? PROCESSING :
+                    self.mergeable?        ? MERGING    :
+                                             SUCCEEDED
+    end
+    # When starting a new job, or moving to a new stage, split up the inputs
+    # into WorkUnits, and queue them. Workers will start picking them up right
+    # away.
+    def queue_for_workers(input=nil)
+      input ||= JSON.parse(self.inputs)
+      [input].flatten.each do |wu_input|
+        WorkUnit.create(
+          :job    => self,
+          :action => self.action,
+          :input  => wu_input,
+          :status => self.status
+        )
+      end
+    end
+    # A Job starts out either splitting or processing, depending on its action.
+    def set_initial_status
+      self.status = self.splittable? ? SPLITTING : PROCESSING
+    end
+  end
+end

data/lib/cloud_crowd/models/work_unit.rb ADDED

@@ -0,0 +1,89 @@
+module CloudCrowd
+  # A WorkUnit is an atomic chunk of work from a job, processing a single input
+  # through a single action. The WorkUnits are run in parallel, with each worker
+  # daemon processing one at a time. The splitting and merging stages of a job
+  # are each run as a single WorkUnit.
+  class WorkUnit < ActiveRecord::Base
+    include ModelStatus
+    belongs_to :job
+    belongs_to :worker_record
+    validates_presence_of :job_id, :status, :input, :action
+    after_save :check_for_job_completion
+    # Find the first available WorkUnit in the queue, and take it out.
+    # +enabled_actions+ must be passed to whitelist the types of WorkUnits than
+    # can be retrieved for processing. Optionally, specify the +offset+ to peek
+    # further on in line.
+    def self.dequeue(worker_name, enabled_actions=[], offset=0)
+      unit = self.first(
+        :conditions => {:status => INCOMPLETE, :worker_record_id => nil, :action => enabled_actions},
+        :order      => "created_at asc",
+        :offset     => offset
+      )
+      unit ? unit.assign_to(worker_name) : nil
+    end
+    # After saving a WorkUnit, its Job should check if it just became complete.
+    def check_for_job_completion
+      self.job.check_for_completion if complete?
+    end
+    # Mark this unit as having finished successfully.
+    def finish(output, time_taken)
+      update_attributes({
+        :status         => SUCCEEDED,
+        :worker_record  => nil,
+        :attempts       => self.attempts + 1,
+        :output         => output,
+        :time           => time_taken
+      })
+    end
+    # Mark this unit as having failed. May attempt a retry.
+    def fail(output, time_taken)
+      tries = self.attempts + 1
+      return try_again if tries < CloudCrowd.config[:work_unit_retries]
+      update_attributes({
+        :status         => FAILED,
+        :worker_record  => nil,
+        :attempts       => tries,
+        :output         => output,
+        :time           => time_taken
+      })
+    end
+    # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
+    def try_again
+      update_attributes({
+        :worker_record  => nil,
+        :attempts       => self.attempts + 1
+      })
+    end
+    # When a Worker checks out a WorkUnit, establish the connection between
+    # WorkUnit and WorkerRecord.
+    def assign_to(worker_name)
+      self.worker_record = WorkerRecord.find_by_name!(worker_name)
+      self.save ? self : nil
+    end
+    # The JSON representation of a WorkUnit shares the Job's options with all
+    # its sister WorkUnits.
+    def to_json
+      {
+        'id'        => self.id,
+        'job_id'    => self.job_id,
+        'input'     => self.input,
+        'attempts'  => self.attempts,
+        'action'    => self.action,
+        'options'   => JSON.parse(self.job.options),
+        'status'    => self.status
+      }.to_json
+    end
+  end
+end

data/lib/cloud_crowd/models/worker_record.rb ADDED

@@ -0,0 +1,61 @@
+module CloudCrowd
+  # A WorkerRecord is a recording of an active worker daemon running remotely.
+  # Every time it checks in, we keep track of its status. The attributes shown
+  # here may lag their actual values by up to Worker::CHECK_IN_INTERVAL seconds.
+  class WorkerRecord < ActiveRecord::Base
+    EXPIRES_AFTER = 2 * Worker::CHECK_IN_INTERVAL
+    has_one :work_unit
+    validates_presence_of :name, :thread_status
+    before_destroy :clear_work_units
+    named_scope :alive, lambda { {:conditions => ['updated_at > ?', Time.now - EXPIRES_AFTER]} }
+    named_scope :dead,  lambda { {:conditions => ['updated_at <= ?', Time.now - EXPIRES_AFTER]} }
+    # Save a Worker's current status to the database.
+    def self.check_in(params)
+      attrs = {:thread_status => params[:thread_status], :updated_at => Time.now}
+      self.find_or_create_by_name(params[:name]).update_attributes!(attrs)
+    end
+    # Remove a terminated Worker's record from the database.
+    def self.check_out(params)
+      self.find_by_name(params[:name]).destroy
+    end
+    # We consider the worker to be alive if it's checked in more recently
+    # than twice the expected interval ago.
+    def alive?
+      updated_at > Time.now - EXPIRES_AFTER
+    end
+    # Derive the Worker's PID on the remote machine from the name.
+    def pid
+      @pid ||= self.name.split('@').first
+    end
+    # Derive the hostname from the Worker's name.
+    def hostname
+      @hostname ||= self.name.split('@').last
+    end
+    def to_json(opts={})
+      {
+        'name'    => name,
+        'status'  => work_unit && work_unit.display_status,
+      }.to_json
+    end
+    private
+    def clear_work_units
+      WorkUnit.update_all('worker_record_id = null', "worker_record_id = #{id}")
+    end
+  end
+end