documentcloud-cloud-crowd 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,52 +1,54 @@
1
1
  module CloudCrowd
2
2
 
3
- # Base CloudCrowd::Action class. Override this with your custom action steps.
4
- #
5
- # Public API to CloudCrowd::Action subclasses:
6
- # +input+, +input_path+, +file_name+, +work_directory+, +options+, +save+
7
- #
8
- # CloudCrowd::Actions must implement a +process+ method, which must return a
3
+ # As you write your custom actions, have them inherit from CloudCrowd::Action.
4
+ # All actions must implement a +process+ method, which should return a
9
5
  # JSON-serializeable object that will be used as the output for the work unit.
6
+ # See the default actions for examples.
7
+ #
10
8
  # Optionally, actions may define +split+ and +merge+ methods to do mapping
11
- # and reducing around the input.
12
- # +split+ must return an array of inputs.
13
- # +merge+ must return the output for the job.
14
- # All actions run inside of their individual +work_directory+.
9
+ # and reducing around the +input+. +split+ should return an array of URLs --
10
+ # to be mapped into WorkUnits and processed in parallel. In the +merge+ step,
11
+ # +input+ will be an array of all the resulting outputs from calling process.
12
+ #
13
+ # All actions have use of an individual +work_directory+, for scratch files,
14
+ # and spend their duration inside of it, so relative paths work well.
15
15
  class Action
16
16
 
17
17
  attr_reader :input, :input_path, :file_name, :options, :work_directory
18
18
 
19
- # Configuring a new Action sets up all of the read-only variables that
19
+ # Initializing an Action sets up all of the read-only variables that
20
20
  # form the bulk of the API for action subclasses. (Paths to read from and
21
- # write to). It creates the work_directory and moves into it.
22
- def configure(status, input, options, store)
21
+ # write to). It creates the +work_directory+ and moves into it.
22
+ # If we're not merging multiple results, it downloads the input file into
23
+ # the +work_directory+ before starting.
24
+ def initialize(status, input, options, store)
23
25
  @input, @options, @store = input, options, store
24
26
  @job_id, @work_unit_id = options['job_id'], options['work_unit_id']
25
27
  @work_directory = File.expand_path(File.join(@store.temp_storage_path, storage_prefix))
26
28
  FileUtils.mkdir_p(@work_directory) unless File.exists?(@work_directory)
27
29
  Dir.chdir @work_directory
28
30
  unless status == MERGING
29
- @input_path = File.join(@work_directory, File.basename(@input))
31
+ @input_path = File.join(@work_directory, safe_filename(@input))
30
32
  @file_name = File.basename(@input_path, File.extname(@input_path))
31
33
  download(@input, @input_path)
32
34
  end
33
35
  end
34
36
 
35
- # Each CloudCrowd::Action must implement a +process+ method.
37
+ # Each Action subclass must implement a +process+ method, overriding this.
36
38
  def process
37
39
  raise NotImplementedError.new("CloudCrowd::Actions must override 'process' with their own processing code.")
38
40
  end
39
41
 
40
- # Download a file to the specified path using curl.
42
+ # Download a file to the specified path with *curl*.
41
43
  def download(url, path)
42
- `curl -s "#{url}" > #{path}`
44
+ `curl -s "#{url}" > "#{path}"`
43
45
  path
44
46
  end
45
47
 
46
- # Takes a local filesystem path, and returns the public (or authenticated)
47
- # url on S3 where the file was saved.
48
+ # Takes a local filesystem path, saves the file to S3, and returns the
49
+ # public (or authenticated) url on S3 where the file can be accessed.
48
50
  def save(file_path)
49
- save_path = File.join(s3_storage_path, File.basename(file_path))
51
+ save_path = File.join(storage_prefix, File.basename(file_path))
50
52
  @store.save(file_path, save_path)
51
53
  return @store.url(save_path)
52
54
  end
@@ -61,6 +63,13 @@ module CloudCrowd
61
63
 
62
64
  private
63
65
 
66
+ # Convert an unsafe URL into a filesystem-friendly filename.
67
+ def safe_filename(url)
68
+ ext = File.extname(url)
69
+ name = File.basename(url).gsub(/%\d+/, '-').gsub(/[^a-zA-Z0-9_\-.]/, '')
70
+ File.basename(name, ext).gsub('.', '-') + ext
71
+ end
72
+
64
73
  # The directory prefix to use for both local and S3 storage.
65
74
  # [action_name]/job_[job_id]/unit_[work_unit_id]
66
75
  def storage_prefix
@@ -71,10 +80,6 @@ module CloudCrowd
71
80
  @storage_prefix ||= File.join(path_parts)
72
81
  end
73
82
 
74
- def s3_storage_path
75
- @s3_storage_path ||= storage_prefix
76
- end
77
-
78
83
  end
79
84
 
80
85
  end
@@ -1,5 +1,20 @@
1
1
  module CloudCrowd
2
2
 
3
+ # The main CloudCrowd (Sinatra) application. The actions are:
4
+ #
5
+ # == Admin
6
+ # [get /] Render the admin console, with a progress meter for running jobs.
7
+ # [get /jobs] Get the combined JSON of every active job in the queue.
8
+ # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
9
+ #
10
+ # == Public API
11
+ # [post /jobs] Begin a new Job. Post with a JSON representation of the job-to-be. (see examples).
12
+ # [get /jobs/:job_id] Check the status of a Job. Response includes output, if the Job has finished.
13
+ # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
14
+ #
15
+ # == Internal Workers API
16
+ # [post /work] Dequeue the next WorkUnit, and hand it off to the worker.
17
+ # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
3
18
  class App < Sinatra::Default
4
19
 
5
20
  set :root, ROOT
@@ -15,28 +30,46 @@ module CloudCrowd
15
30
  login_required if CloudCrowd.config[:use_http_authentication]
16
31
  end
17
32
 
33
+ # Render the admin console.
34
+ get '/' do
35
+ erb :index
36
+ end
37
+
38
+ # Get the JSON for every active job in the queue.
39
+ get '/jobs' do
40
+ json Job.incomplete
41
+ end
42
+
43
+ # To monitor the central server with Monit, God, Nagios, or another
44
 + # monitoring tool, you can hit /heartbeat to make sure the server is up.
45
+ get '/heartbeat' do
46
+ "buh-bump"
47
+ end
48
+
49
+ # PUBLIC API:
50
+
18
51
  # Start a new job. Accepts a JSON representation of the job-to-be.
19
52
  post '/jobs' do
20
- Job.create_from_request(JSON.parse(params[:json])).to_json
53
+ json Job.create_from_request(JSON.parse(params[:job]))
21
54
  end
22
55
 
23
56
  # Check the status of a job, returning the output if finished, and the
24
57
  # number of work units remaining otherwise.
25
58
  get '/jobs/:job_id' do
26
- current_job.to_json
59
+ json current_job
27
60
  end
28
61
 
29
62
  # Cleans up a Job's saved S3 files. Delete a Job after you're done
30
63
  # downloading the results.
31
64
  delete '/jobs/:job_id' do
32
65
  current_job.cleanup
33
- ''
66
+ json nil
34
67
  end
35
68
 
36
69
  # Internal method for worker daemons to fetch the work unit at the front
37
70
  # of the queue. Work unit is marked as taken and handed off to the worker.
38
- get '/work' do
39
- dequeue_work_unit
71
+ post '/work' do
72
+ json dequeue_work_unit
40
73
  end
41
74
 
42
75
  # When workers are done with their unit, either successfully or in failure,
@@ -47,22 +80,16 @@ module CloudCrowd
47
80
  case params[:status]
48
81
  when 'succeeded'
49
82
  current_work_unit.finish(params[:output], params[:time])
50
- dequeue_work_unit
83
+ json dequeue_work_unit
51
84
  when 'failed'
52
85
  current_work_unit.fail(params[:output], params[:time])
53
- dequeue_work_unit(1)
86
+ json dequeue_work_unit(1)
54
87
  else
55
88
  error(500, "Completing a work unit must specify status.")
56
89
  end
57
90
  end
58
91
  end
59
92
 
60
- # To monitor the central server with Monit, God, Nagios, or another
61
- # monitoring tool, you can hit /heartbeat to make sure.
62
- get '/heartbeat' do
63
- "buh-bump"
64
- end
65
-
66
93
  end
67
94
 
68
95
  end
@@ -2,24 +2,31 @@ require 'tmpdir'
2
2
 
3
3
  module CloudCrowd
4
4
 
5
- # The CloudCrowd::AssetStore should provide a common API for stashing and retrieving
6
- # assets via URLs, in production this will be S3 but in development it may
7
- # be the filesystem or /tmp.
5
+ # The AssetStore provides a common API for storing files and returning URLs
6
+ # that can access them. In production this will be S3 but in development
7
+ # it may be the filesystem.
8
+ #
9
+ # You shouldn't need to use the AssetStore directly -- Action's +download+
10
+ # and +save+ methods use it behind the scenes.
8
11
  class AssetStore
9
12
  include FileUtils
10
13
 
14
 + # Creating an AssetStore will determine whether to save private or public
15
+ # files on S3, depending on the value of <tt>use_s3_authentication</tt> in
16
+ # <tt>config.yml</tt>.
11
17
  def initialize
12
18
  @use_auth = CloudCrowd.config[:use_s3_authentication]
13
19
  mkdir_p temp_storage_path unless File.exists? temp_storage_path
14
20
  end
15
21
 
16
- # Path to CloudCrowd's temporary local storage.
22
+ # Get the path to CloudCrowd's temporary local storage. All actions run
23
+ # in subdirectories of this.
17
24
  def temp_storage_path
18
25
  "#{Dir.tmpdir}/cloud_crowd_tmp"
19
26
  end
20
27
 
21
- # Copy a finished file from our local storage to S3. Save it publicly if
22
- # we're not configured to use S3 authentication.
28
+ # Copy a finished file from our local storage to S3. Save it publicly unless
29
+ # we're configured to use S3 authentication.
23
30
  def save(local_path, save_path)
24
31
  ensure_s3_connection
25
32
  permission = @use_auth ? 'private' : 'public-read'
@@ -14,6 +14,8 @@ module CloudCrowd
14
14
 
15
15
  # Command-line banner for the usage message.
16
16
  BANNER = <<-EOS
17
+ CloudCrowd is a Ruby & AWS batch processing system, MapReduce style.
18
+
17
19
  Usage: crowd COMMAND OPTIONS
18
20
 
19
21
  COMMANDS:
@@ -45,6 +47,7 @@ OPTIONS:
45
47
  def run_console
46
48
  require 'irb'
47
49
  require 'irb/completion'
50
+ require 'pp'
48
51
  load_code
49
52
  connect_to_database
50
53
  IRB.start
@@ -60,9 +63,9 @@ OPTIONS:
60
63
  require 'rubygems'
61
64
  rackup_path = File.expand_path("#{@options[:config_path]}/config.ru")
62
65
  if Gem.available? 'thin'
63
- exec "thin -e production -p #{@options[:port]} -R #{rackup_path} start"
66
+ exec "thin -e #{@options[:environment]} -p #{@options[:port]} -R #{rackup_path} start"
64
67
  else
65
- exec "rackup -E production -p #{@options[:port]} #{rackup_path}"
68
+ exec "rackup -E #{@options[:environment]} -p #{@options[:port]} #{rackup_path}"
66
69
  end
67
70
  end
68
71
 
@@ -127,7 +130,7 @@ OPTIONS:
127
130
 
128
131
  # Print `crowd` usage.
129
132
  def usage
130
- puts @option_parser
133
+ puts "\n#{@option_parser}\n"
131
134
  end
132
135
 
133
136
 
@@ -141,11 +144,11 @@ OPTIONS:
141
144
  found ? @config_dir = true : config_not_found
142
145
  end
143
146
 
144
- # Parse all options for all actions.
145
- # TODO: Think about parsing options per sub-command separately.
147
+ # Parse all options for all commands.
146
148
  def parse_options
147
149
  @options = {
148
150
  :port => 9173,
151
+ :environment => 'production',
149
152
  :config_path => ENV['CLOUD_CROWD_CONFIG'] || '.'
150
153
  }
151
154
  @option_parser = OptionParser.new do |opts|
@@ -158,6 +161,9 @@ OPTIONS:
158
161
  opts.on('-p', '--port PORT', 'central server port number') do |port_num|
159
162
  @options[:port] = port_num
160
163
  end
164
+ opts.on('-e', '--environment ENV', 'Sinatra environment (code reloading)') do |env|
165
+ @options[:environment] = env
166
+ end
161
167
  opts.on_tail('-v', '--version', 'show version') do
162
168
  load_code
163
169
  puts "CloudCrowd version #{VERSION}"
@@ -6,8 +6,13 @@ module CloudCrowd
6
6
 
7
7
  # A CloudCrowd::Daemon, started by the Daemons gem, runs a CloudCrowd::Worker in
8
8
  # a loop, continually fetching and processing WorkUnits from the central
9
- # server. The Daemon backs off and pings central less frequently when there
10
- # isn't any work to be done, and speeds back up when there is.
9
+ # server.
10
+ #
11
+ # The Daemon backs off and pings the central server less frequently
12
+ # when there isn't any work to be done, and speeds back up when there is.
13
+ #
14
+ # The `crowd` command responds to all the usual methods that the Daemons gem
15
+ # supports.
11
16
  class Daemon
12
17
 
13
18
  MIN_WAIT = CloudCrowd.config[:min_worker_wait]
@@ -0,0 +1,17 @@
1
+ module CloudCrowd
2
+
3
+ # Base Error class which all custom CloudCrowd exceptions inherit from.
4
+ class Error < RuntimeError #:nodoc:
5
+ end
6
+
7
+ # ActionNotFound is raised when a job is created for an action that doesn't
8
+ # exist.
9
+ class ActionNotFound < Error #:nodoc:
10
+ end
11
+
12
+ # StatusUnspecified is raised when a WorkUnit returns without a valid
13
+ # status code.
14
+ class StatusUnspecified < Error #:nodoc:
15
+ end
16
+
17
+ end
@@ -2,7 +2,7 @@ require 'cloud_crowd/helpers/authorization'
2
2
  require 'cloud_crowd/helpers/resources'
3
3
 
4
4
  module CloudCrowd
5
- module Helpers
5
+ module Helpers #:nodoc:
6
6
  include Authorization, Resources #, Rack::Utils
7
7
  end
8
8
  end
@@ -1,7 +1,9 @@
1
- # After sinatra-authorization...
2
-
3
1
  module CloudCrowd
4
2
  module Helpers
3
+
4
+ # Authorization takes after sinatra-authorization... See
5
+ # http://github.com/integrity/sinatra-authorization
6
+ # for the original.
5
7
  module Authorization
6
8
 
7
9
  # Ensure that the request includes the correct credentials.
@@ -19,7 +21,9 @@ module CloudCrowd
19
21
  end
20
22
 
21
23
  # A request is authorized if its login and password match those stored
22
- # in config.yml, or if authentication is disabled.
24
+ # in config.yml, or if authentication is disabled. If authentication is
25
+ # turned on, then every request is authenticated, including between
26
+ # the worker daemons and the central server.
23
27
  def authorize(login, password)
24
28
  return true unless CloudCrowd.config[:use_http_authentication]
25
29
  return CloudCrowd.config[:login] == login &&
@@ -2,10 +2,20 @@ module CloudCrowd
2
2
  module Helpers
3
3
  module Resources
4
4
 
5
+ # Convenience method for responding with JSON. Sets the content-type,
6
+ # serializes, and allows empty responses.
7
+ def json(obj)
8
+ content_type :json
9
+ return status(204) && '' if obj.nil?
10
+ obj.to_json
11
+ end
12
+
13
+ # Lazy-fetch the job specified by <tt>job_id</tt>.
5
14
  def current_job
6
15
  @job ||= Job.find_by_id(params[:job_id]) or raise Sinatra::NotFound
7
16
  end
8
17
 
18
+ # Lazy-fetch the WorkUnit specified by <tt>work_unit_id</tt>.
9
19
  def current_work_unit
10
20
  @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id]) or raise Sinatra::NotFound
11
21
  end
@@ -14,9 +24,8 @@ module CloudCrowd
14
24
  # with no content.
15
25
  def dequeue_work_unit(offset=0)
16
26
  handle_conflicts do
17
- unit = WorkUnit.dequeue(offset)
18
- return status(204) && '' unless unit
19
- unit.to_json
27
+ actions = params[:enabled_actions].split(',')
28
+ WorkUnit.dequeue(actions, offset)
20
29
  end
21
30
  end
22
31
 
@@ -1,7 +1,7 @@
1
1
  module CloudCrowd
2
2
 
3
3
  # Pilfered in parts from the ActiveSupport::Inflector.
4
- module Inflector
4
+ module Inflector #:nodoc:
5
5
 
6
6
  def self.camelize(word)
7
7
  word.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
@@ -1,7 +1,7 @@
1
1
  module CloudCrowd
2
2
 
3
3
  # A chunk of work that will be farmed out into many WorkUnits to be processed
4
- # in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
4
+ # in parallel by each active CloudCrowd::Worker. Jobs are defined by a list
5
5
  # of inputs (usually public urls to files), an action (the name of a script that
6
6
  # CloudCrowd knows how to run), and, eventually a corresponding list of output.
7
7
  class Job < ActiveRecord::Base
@@ -10,9 +10,13 @@ module CloudCrowd
10
10
  has_many :work_units, :dependent => :destroy
11
11
 
12
12
  validates_presence_of :status, :inputs, :action, :options
13
+
14
+ before_validation_on_create :set_initial_status
15
+ after_create :queue_for_workers
16
+ before_destroy :cleanup
13
17
 
14
18
  # Create a Job from an incoming JSON or XML request, and add it to the queue.
15
- # TODO: Add XML support.
19
+ # TODO: Think about XML support.
16
20
  def self.create_from_request(h)
17
21
  self.create(
18
22
  :inputs => h['inputs'].to_json,
@@ -23,16 +27,6 @@ module CloudCrowd
23
27
  )
24
28
  end
25
29
 
26
- # Creating a job creates its corresponding work units, adding them
27
- # to the queue.
28
- def after_create
29
- self.queue_for_workers(JSON.parse(self.inputs))
30
- end
31
-
32
- def before_validation_on_create
33
- self.status = self.splittable? ? SPLITTING : PROCESSING
34
- end
35
-
36
30
  # After work units are marked successful, we check to see if all of them have
37
31
  # finished, if so, continue on to the next phase of the job.
38
32
  def check_for_completion
@@ -54,15 +48,10 @@ module CloudCrowd
54
48
  self
55
49
  end
56
50
 
57
- # Transition this Job's status to the following one.
58
- def transition_to_next_phase
59
- self.status = any_work_units_failed? ? FAILED :
60
- self.splitting? ? PROCESSING :
61
- self.mergeable? ? MERGING :
62
- SUCCEEDED
63
- end
64
-
65
- # If a callback_url is defined, post the Job's JSON to it upon completion.
51
+ # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
52
+ # completion. The <tt>callback_url</tt> may include HTTP basic authentication,
53
+ # if you like:
54
+ # http://user:password@example.com/job_complete
66
55
  def fire_callback
67
56
  begin
68
57
  RestClient.post(callback_url, {:job => self.to_json}) if callback_url
@@ -71,13 +60,17 @@ module CloudCrowd
71
60
  end
72
61
  end
73
62
 
74
- # Cleaning up after a job will remove all of its files from S3.
63
+ # Cleaning up after a job will remove all of its files from S3. Destroying
64
+ # a Job calls cleanup first.
75
65
  def cleanup
76
66
  AssetStore.new.cleanup_job(self)
77
67
  end
78
68
 
79
- # Have all of the WorkUnits finished? We could trade reads for writes here
69
+ # Have all of the WorkUnits finished?
70
+ #--
71
+ # We could trade reads for writes here
80
72
  # by keeping a completed_count on the Job itself.
73
+ #++
81
74
  def all_work_units_complete?
82
75
  self.work_units.incomplete.count <= 0
83
76
  end
@@ -97,19 +90,14 @@ module CloudCrowd
97
90
  self.processing? && self.action_class.public_instance_methods.include?('merge')
98
91
  end
99
92
 
100
- # Retrieve the class for this Job's Action, loading it if necessary.
93
+ # Retrieve the class for this Job's Action.
101
94
  def action_class
102
- CloudCrowd.actions(self.action)
103
- end
104
-
105
- # When the WorkUnits are all finished, gather all their outputs together
106
- # before removing them from the database entirely.
107
- def gather_outputs_from_work_units
108
- outs = self.work_units.complete.map {|wu| wu.output }
109
- self.work_units.complete.destroy_all
110
- outs
95
+ klass = CloudCrowd.actions[self.action]
96
+ return klass if klass
97
+ raise ActionNotFound, "no action named: '#{self.action}' could be found"
111
98
  end
112
99
 
100
+ # Get the displayable status name of the Job's status code.
113
101
  def display_status
114
102
  CloudCrowd.display_status(self.status)
115
103
  end
@@ -122,22 +110,71 @@ module CloudCrowd
122
110
  (work_units.complete.count / work_units.count.to_f * 100).round
123
111
  end
124
112
 
113
+ # How long has this Job taken?
114
+ def time_taken
115
+ return self.time if self.time
116
+ Time.now - self.created_at
117
+ end
118
+
119
+ # Generate a stable 8-bit Hex color code, based on the Job's id.
120
+ def color
121
+ @color ||= Digest::MD5.hexdigest(self.id.to_s)[-7...-1]
122
+ end
123
+
125
124
  # A JSON representation of this job includes the statuses of its component
126
125
  # WorkUnits, as well as any completed outputs.
127
126
  def to_json(opts={})
128
- atts = {'id' => self.id, 'status' => self.display_status, 'percent_complete' => self.percent_complete}
127
+ atts = {
128
+ 'id' => self.id,
129
+ 'color' => self.color,
130
+ 'status' => self.display_status,
131
+ 'percent_complete' => self.percent_complete,
132
+ 'work_units' => self.work_units.count,
133
+ 'time_taken' => self.time_taken
134
+ }
129
135
  atts.merge!({'outputs' => JSON.parse(self.outputs)}) if self.outputs
130
- atts.merge!({'time' => self.time}) if self.time
131
136
  atts.to_json
132
137
  end
138
+
139
+
140
+ private
141
+
142
+ # When the WorkUnits are all finished, gather all their outputs together
143
+ # before removing them from the database entirely.
144
+ def gather_outputs_from_work_units
145
+ units = self.work_units.complete
146
+ outs = self.work_units.complete.map {|u| JSON.parse(u.output)['output'] }
147
+ self.work_units.complete.destroy_all
148
+ outs
149
+ end
150
+
151
+ # Transition this Job's status to the appropriate next status.
152
+ def transition_to_next_phase
153
+ self.status = any_work_units_failed? ? FAILED :
154
+ self.splitting? ? PROCESSING :
155
+ self.mergeable? ? MERGING :
156
+ SUCCEEDED
157
+ end
133
158
 
134
159
  # When starting a new job, or moving to a new stage, split up the inputs
135
- # into WorkUnits, and queue them.
136
- def queue_for_workers(input)
160
+ # into WorkUnits, and queue them. Workers will start picking them up right
161
+ # away.
162
+ def queue_for_workers(input=nil)
163
+ input ||= JSON.parse(self.inputs)
137
164
  [input].flatten.each do |wu_input|
138
- WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
165
+ WorkUnit.create(
166
+ :job => self,
167
+ :action => self.action,
168
+ :input => wu_input,
169
+ :status => self.status
170
+ )
139
171
  end
140
172
  end
141
173
 
174
+ # A Job starts out either splitting or processing, depending on its action.
175
+ def set_initial_status
176
+ self.status = self.splittable? ? SPLITTING : PROCESSING
177
+ end
178
+
142
179
  end
143
180
  end