RubyGems - cloud-crowd - Versions diffs - 0.1.0 - Mend

cloud-crowd 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

data/EPIGRAPHS +17 -0
data/LICENSE +22 -0
data/README +93 -0
data/actions/graphics_magick.rb +43 -0
data/actions/process_pdfs.rb +92 -0
data/actions/word_count.rb +14 -0
data/bin/crowd +5 -0
data/cloud-crowd.gemspec +111 -0
data/config/config.example.ru +17 -0
data/config/config.example.yml +48 -0
data/config/database.example.yml +9 -0
data/examples/graphics_magick_example.rb +44 -0
data/examples/process_pdfs_example.rb +40 -0
data/examples/word_count_example.rb +41 -0
data/lib/cloud-crowd.rb +130 -0
data/lib/cloud_crowd/action.rb +101 -0
data/lib/cloud_crowd/app.rb +117 -0
data/lib/cloud_crowd/asset_store.rb +41 -0
data/lib/cloud_crowd/asset_store/filesystem_store.rb +28 -0
data/lib/cloud_crowd/asset_store/s3_store.rb +40 -0
data/lib/cloud_crowd/command_line.rb +209 -0
data/lib/cloud_crowd/daemon.rb +95 -0
data/lib/cloud_crowd/exceptions.rb +28 -0
data/lib/cloud_crowd/helpers.rb +8 -0
data/lib/cloud_crowd/helpers/authorization.rb +50 -0
data/lib/cloud_crowd/helpers/resources.rb +45 -0
data/lib/cloud_crowd/inflector.rb +19 -0
data/lib/cloud_crowd/models.rb +40 -0
data/lib/cloud_crowd/models/job.rb +176 -0
data/lib/cloud_crowd/models/work_unit.rb +89 -0
data/lib/cloud_crowd/models/worker_record.rb +61 -0
data/lib/cloud_crowd/runner.rb +15 -0
data/lib/cloud_crowd/schema.rb +45 -0
data/lib/cloud_crowd/worker.rb +186 -0
data/public/css/admin_console.css +221 -0
data/public/css/reset.css +42 -0
data/public/images/bullet_green.png +0 -0
data/public/images/bullet_white.png +0 -0
data/public/images/cloud_hand.png +0 -0
data/public/images/header_back.png +0 -0
data/public/images/logo.png +0 -0
data/public/images/queue_fill.png +0 -0
data/public/images/server_error.png +0 -0
data/public/images/sidebar_bottom.png +0 -0
data/public/images/sidebar_top.png +0 -0
data/public/images/worker_info.png +0 -0
data/public/images/worker_info_loading.gif +0 -0
data/public/js/admin_console.js +168 -0
data/public/js/excanvas.js +1 -0
data/public/js/flot.js +1 -0
data/public/js/jquery.js +19 -0
data/test/acceptance/test_app.rb +72 -0
data/test/acceptance/test_failing_work_units.rb +32 -0
data/test/acceptance/test_word_count.rb +49 -0
data/test/blueprints.rb +17 -0
data/test/config/actions/failure_testing.rb +13 -0
data/test/config/config.ru +17 -0
data/test/config/config.yml +7 -0
data/test/config/database.yml +6 -0
data/test/test_helper.rb +19 -0
data/test/unit/test_action.rb +49 -0
data/test/unit/test_configuration.rb +28 -0
data/test/unit/test_job.rb +78 -0
data/test/unit/test_work_unit.rb +55 -0
data/views/index.erb +77 -0
metadata +233 -0

data/config/database.example.yml ADDED

@@ -0,0 +1,9 @@
+# This is a standard ActiveRecord database.yml file. You can configure it
+# to use any database that ActiveRecord supports.
+:adapter:  mysql
+:encoding: utf8
+:username: root
+:password:
+:socket:   /tmp/mysql.sock
+:database: cloud_crowd

data/examples/graphics_magick_example.rb ADDED

@@ -0,0 +1,44 @@
+#!/usr/bin/env ruby -rubygems
+require 'restclient'
+require 'json'
+# This example demonstrates the GraphicsMagick action by taking in a list of
+# five images, and producing annotated, blurred, and black and white versions
+# of each image. See actions/graphics_magick.rb
+RestClient.post('http://localhost:9173/jobs',
+  {:job => {
+    'action' => 'graphics_magick',
+    'inputs' => [
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2008/10/dan_mcpharlin_the_land_of_sleeping_things.jpg',
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread01.jpg',
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread03.jpg',
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread02.jpg',
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/02/dan_mcpharlin_untitled.jpg'
+    ],
+    'options' => {
+      'steps' => [{
+        'name'      => 'annotated',
+        'command'   => 'convert',
+        'options'   => '-font helvetica -fill red -draw "font-size 35; text 75,75 CloudCrowd!"',
+        'extension' => 'jpg'
+      },{
+        'name'      => 'blurred',
+        'command'   => 'convert',
+        'options'   => '-blur 10x5',
+        'extension' => 'png'
+      },{
+        'name'      => 'bw',
+        'input'     => 'blurred',
+        'command'   => 'convert',
+        'options'   => '-monochrome',
+        'extension' => 'jpg'
+      }]
+    }
+  }.to_json}
+)

data/examples/process_pdfs_example.rb ADDED

@@ -0,0 +1,40 @@
+#!/usr/bin/env ruby -rubygems
+require 'restclient'
+require 'json'
+# This example demonstrates a fairly complicated PDF-processing action, designed
+# to extract the PDF's text, and produce GIF versions of each page. The action
+# (actions/process_pdfs.rb) shows an example of using all three steps,
+# split, process, and merge.
+RestClient.post('http://localhost:9173/jobs',
+  {:job => {
+    'action' => 'process_pdfs',
+    'inputs' => [
+      'http://tigger.uic.edu/~victor/personal/futurism.pdf',
+      'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
+      'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
+      'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
+    ],
+    'options' => {
+      'batch_size' => 7,
+      'images' => [{
+        'name'      => '700',
+        'options'   => '-resize 700x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
+        'extension' => 'gif'
+      },{
+        'name'      => '1000',
+        'options'   => '-resize 1000x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
+        'extension' => 'gif'
+      }]
+    }
+  }.to_json}
+)

data/examples/word_count_example.rb ADDED

@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby -rubygems
+require 'restclient'
+require 'json'
+# Let's count all the words in Shakespeare.
+RestClient.post('http://localhost:9173/jobs',
+  {:job => {
+    'action' => 'word_count',
+    'inputs' => [
+      'http://www.gutenberg.org/dirs/etext97/1ws3010.txt',  # All's Well That Ends Well
+      'http://www.gutenberg.org/dirs/etext99/1ws3511.txt',  # Anthony and Cleopatra
+      'http://www.gutenberg.org/dirs/etext97/1ws2510.txt',  # As You Like It
+      'http://www.gutenberg.org/dirs/etext97/1ws0610.txt',  # The Comedy of Errors
+      'http://www.gutenberg.org/dirs/etext99/1ws3911.txt',  # Cymbeline
+      'http://www.gutenberg.org/dirs/etext00/0ws2610.txt',  # Hamlet
+      'http://www.gutenberg.org/dirs/etext00/0ws1910.txt',  # Henry IV
+      'http://www.gutenberg.org/dirs/etext99/1ws2411.txt',  # Julius Caesar
+      'http://www.gutenberg.org/dirs/etext98/2ws3310.txt',  # King Lear
+      'http://www.gutenberg.org/dirs/etext99/1ws1211j.txt', # Love's Labour's Lost
+      'http://www.gutenberg.org/dirs/etext98/2ws3410.txt',  # Macbeth
+      'http://www.gutenberg.org/dirs/etext98/2ws1810.txt',  # The Merchant of Venice
+      'http://www.gutenberg.org/dirs/etext99/1ws1711.txt',  # Midsummer Night's Dream
+      'http://www.gutenberg.org/dirs/etext98/3ws2210.txt',  # Much Ado About Nothing
+      'http://www.gutenberg.org/dirs/etext00/0ws3210.txt',  # Othello
+      'http://www.gutenberg.org/dirs/etext98/2ws1610.txt',  # Romeo and Juliet
+      'http://www.gutenberg.org/dirs/etext98/2ws1010.txt',  # The Taming of the Shrew
+      'http://www.gutenberg.org/dirs/etext99/1ws4111.txt',  # The Tempest
+      'http://www.gutenberg.org/dirs/etext00/0ws0910.txt',  # Titus Andronicus
+      'http://www.gutenberg.org/dirs/etext99/1ws2911.txt',  # Troilus and Cressida
+      'http://www.gutenberg.org/dirs/etext98/3ws2810.txt',  # Twelfth Night
+      'http://www.gutenberg.org/files/1539/1539.txt'        # The Winter's Tale
+    ]
+  }.to_json}
+)
+# With 23 Workers running, and over Wifi, it counted all the words in 5.5 secs.

data/lib/cloud-crowd.rb ADDED

@@ -0,0 +1,130 @@
+# The Grand Central of code loading...
+$LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
+# Common Gems:
+require 'rubygems'
+gem 'activerecord'
+gem 'daemons'
+gem 'json'
+gem 'rest-client'
+gem 'right_aws'
+gem 'sinatra'
+# Autoloading for all the pieces which may or may not be needed:
+autoload :ActiveRecord, 'activerecord'
+autoload :Benchmark,    'benchmark'
+autoload :Daemons,      'daemons'
+autoload :Digest,       'digest'
+autoload :ERB,          'erb'
+autoload :FileUtils,    'fileutils'
+autoload :JSON,         'json'
+autoload :RestClient,   'restclient'
+autoload :RightAws,     'right_aws'
+autoload :Sinatra,      'sinatra'
+autoload :Socket,       'socket'
+autoload :YAML,         'yaml'
+# Common code which should really be required in every circumstance.
+require 'cloud_crowd/exceptions'
+module CloudCrowd
+  # Autoload all the CloudCrowd classes which may not be required.
+  autoload :App,          'cloud_crowd/app'
+  autoload :Action,       'cloud_crowd/action'
+  autoload :AssetStore,   'cloud_crowd/asset_store'
+  autoload :Helpers,      'cloud_crowd/helpers'
+  autoload :Inflector,    'cloud_crowd/inflector'
+  autoload :Job,          'cloud_crowd/models'
+  autoload :Worker,       'cloud_crowd/worker'
+  autoload :WorkUnit,     'cloud_crowd/models'
+  autoload :WorkerRecord, 'cloud_crowd/models'
+  # Root directory of the CloudCrowd gem.
+  ROOT        = File.expand_path(File.dirname(__FILE__) + '/..')
+  # Keep the version in sync with the gemspec.
+  VERSION     = '0.1.0'
+  # A Job is processing if its WorkUnits in the queue to be handled by workers.
+  PROCESSING  = 1
+  # A Job has succeeded if all of its WorkUnits have finished successfully.
+  SUCCEEDED   = 2
+  # A Job has failed if even a single one of its WorkUnits has failed (they may
+  # be attempted multiple times on failure, however).
+  FAILED      = 3
+  # A Job is splitting if it's in the process of dividing its inputs up into
+  # multiple WorkUnits.
+  SPLITTING   = 4
+  # A Job is merging if it's busy collecting all of its successful WorkUnits
+  # back together into the final result.
+  MERGING     = 5
+  # A work unit is considered to be complete if it succeeded or if it failed.
+  COMPLETE    = [SUCCEEDED, FAILED]
+  # A work unit is considered incomplete if it's being processed, split up or
+  # merged together.
+  INCOMPLETE  = [PROCESSING, SPLITTING, MERGING]
+  # Mapping of statuses to their display strings.
+  DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
+  class << self
+    attr_reader :config
+    # Configure CloudCrowd by passing in the path to <tt>config.yml</tt>.
+    def configure(config_path)
+      @config_path = File.expand_path(File.dirname(config_path))
+      @config = YAML.load_file(config_path)
+    end
+    # Configure the CloudCrowd central database (and connect to it), by passing
+    # in a path to <tt>database.yml</tt>. The file should use the standard
+    # ActiveRecord connection format.
+    def configure_database(config_path)
+      configuration = YAML.load_file(config_path)
+      ActiveRecord::Base.establish_connection(configuration)
+    end
+    # Get a reference to the central server, including authentication,
+    # if configured.
+    def central_server
+      return @central_server if @central_server
+      params = [CloudCrowd.config[:central_server]]
+      params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
+      @central_server = RestClient::Resource.new(*params)
+    end
+    # Return the displayable status name of an internal CloudCrowd status number.
+    # (See the above constants).
+    def display_status(status)
+      DISPLAY_STATUS_MAP[status] || 'unknown'
+    end
+    # CloudCrowd::Actions are requested dynamically by name. Access them through
+    # this actions property, which behaves like a hash. At load time, we
+    # load all installed Actions and CloudCrowd's default Actions into it.
+    # If you wish to have certain workers be specialized to only handle certain
+    # Actions, then install only those into the actions directory.
+    def actions
+      return @actions if @actions
+      @actions = {}
+      default_actions   = Dir["#{ROOT}/actions/*.rb"]
+      installed_actions = Dir["#{@config_path}/actions/*.rb"]
+      custom_actions    = Dir["#{CloudCrowd.config[:actions_path]}/*.rb"]
+      (default_actions + installed_actions + custom_actions).each do |path|
+        name = File.basename(path, File.extname(path))
+        require path
+        @actions[name] = Module.const_get(Inflector.camelize(name))
+      end
+      @actions
+    end
+  end
+end

data/lib/cloud_crowd/action.rb ADDED

@@ -0,0 +1,101 @@
+module CloudCrowd
+  # As you write your custom actions, have them inherit from CloudCrowd::Action.
+  # All actions must implement a +process+ method, which should return a
+  # JSON-serializable object that will be used as the output for the work unit.
+  # See the default actions for examples.
+  #
+  # Optionally, actions may define +split+ and +merge+ methods to do mapping
+  # and reducing around the +input+. +split+ should return an array of URLs --
+  # to be mapped into WorkUnits and processed in parallel. In the +merge+ step,
+  # +input+ will be an array of all the resulting outputs from calling process.
+  #
+  # All actions have use of an individual +work_directory+, for scratch files,
+  # and spend their duration inside of it, so relative paths work well.
+  class Action
+    FILE_URL = /\Afile:\/\//
+    attr_reader :input, :input_path, :file_name, :options, :work_directory
+    # Initializing an Action sets up all of the read-only variables that
+    # form the bulk of the API for action subclasses. (Paths to read from and
+    # write to). It creates the +work_directory+ and moves into it.
+    # If we're not merging multiple results, it downloads the input file into
+    # the +work_directory+ before starting.
+    def initialize(status, input, options, store)
+      @input, @options, @store = input, options, store
+      @job_id, @work_unit_id = options['job_id'], options['work_unit_id']
+      @work_directory = File.expand_path(File.join(@store.temp_storage_path, storage_prefix))
+      FileUtils.mkdir_p(@work_directory) unless File.exists?(@work_directory)
+      status == MERGING ? parse_input : download_input
+    end
+    # Each Action subclass must implement a +process+ method, overriding this.
+    def process
+      raise NotImplementedError.new("CloudCrowd::Actions must override 'process' with their own processing code.")
+    end
+    # Download a file to the specified path.
+    def download(url, path)
+      if url.match(FILE_URL)
+        FileUtils.cp(url.sub(FILE_URL, ''), path)
+      else
+        resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
+        FileUtils.mv resp.file.path, path
+      end
+      path
+    end
+    # Takes a local filesystem path, saves the file to S3, and returns the
+    # public (or authenticated) url on S3 where the file can be accessed.
+    def save(file_path)
+      save_path = File.join(storage_prefix, File.basename(file_path))
+      @store.save(file_path, save_path)
+    end
+    # After the Action has finished, we remove the work directory and return
+    # to the root directory (where daemons run by default).
+    def cleanup_work_directory
+      FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
+    end
+    private
+    # Convert an unsafe URL into a filesystem-friendly filename.
+    def safe_filename(url)
+      ext  = File.extname(url)
+      name = URI.unescape(File.basename(url)).gsub(/[^a-zA-Z0-9_\-.]/, '-').gsub(/-+/, '-')
+      File.basename(name, ext).gsub('.', '-') + ext
+    end
+    # The directory prefix to use for both local and S3 storage.
+    # [action_name]/job_[job_id]/unit_[work_unit_it]
+    def storage_prefix
+      path_parts = []
+      path_parts << Inflector.underscore(self.class)
+      path_parts << "job_#{@job_id}"
+      path_parts << "unit_#{@work_unit_id}" if @work_unit_id
+      @storage_prefix ||= File.join(path_parts)
+    end
+    # If we know that the input is JSON, replace it with the parsed form.
+    def parse_input
+      @input = JSON.parse(@input)
+    end
+    # If the input is a URL, download the file before beginning processing.
+    def download_input
+      Dir.chdir(@work_directory) do
+        input_is_url = !!URI.parse(@input) rescue false
+        return unless input_is_url
+        @input_path = File.join(@work_directory, safe_filename(@input))
+        @file_name = File.basename(@input_path, File.extname(@input_path))
+        download(@input, @input_path)
+      end
+    end
+  end
+end

data/lib/cloud_crowd/app.rb ADDED

@@ -0,0 +1,117 @@
+module CloudCrowd
+  # The main CloudCrowd (Sinatra) application. The actions are:
+  #
+  # == Admin
+  # [get /] Render the admin console, with a progress meter for running jobs.
+  # [get /status] Get the combined JSON of every active job and worker.
+  # [get /worker/:name] Get the JSON of the work unit a named worker holds, if any.
+  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
+  #
+  # == Public API
+  # [post /jobs] Begin a new Job. Post with a JSON representation of the job-to-be. (see examples).
+  # [get /jobs/:job_id] Check the status of a Job. Response includes output, if the Job has finished.
+  # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
+  #
+  # == Internal Workers API
+  # [post /work] Dequeue the next WorkUnit, and hand it off to the worker.
+  # [put /work/:work_unit_id] Mark a finished WorkUnit as completed or failed, with results.
+  # [put /worker] Keep a record of an actively running worker.
+  class App < Sinatra::Default
+    set :root, ROOT
+    set :authorization_realm, "CloudCrowd"
+    helpers Helpers
+    # static serves files from /public, methodoverride allows the _method param.
+    enable :static, :methodoverride
+    # Enabling HTTP Authentication turns it on for all requests.
+    before do
+      login_required if CloudCrowd.config[:use_http_authentication]
+    end
+    # Render the admin console.
+    get '/' do
+      erb :index
+    end
+    # Get the JSON for every active job in the queue and every active worker
+    # in the system. This action may get a little worrisome as the system grows
+    # larger -- keep it in mind.
+    get '/status' do
+      json(
+        'jobs'            => Job.incomplete,
+        'workers'         => WorkerRecord.alive(:order => 'name desc'),
+        'work_unit_count' => WorkUnit.incomplete.count
+      )
+    end
+    # Get the JSON for a worker record's work unit, if one exists. An empty
+    # JSON object is returned when the worker is unknown or has no work unit.
+    get '/worker/:name' do
+      record = WorkerRecord.find_by_name params[:name]
+      json((record && record.work_unit) || {})
+    end
+    # To monitor the central server with Monit, God, Nagios, or another
+    # monitoring tool, you can hit /heartbeat to make sure.
+    get '/heartbeat' do
+      "buh-bump"
+    end
+    # PUBLIC API:
+    # Start a new job. Accepts a JSON representation of the job-to-be, passed
+    # in the +job+ request parameter.
+    post '/jobs' do
+      json Job.create_from_request(JSON.parse(params[:job]))
+    end
+    # Check the status of a job, returning the output if finished, and the
+    # number of work units remaining otherwise.
+    get '/jobs/:job_id' do
+      json current_job
+    end
+    # Cleans up a Job's saved S3 files. Delete a Job after you're done
+    # downloading the results.
+    delete '/jobs/:job_id' do
+      current_job.destroy
+      json nil
+    end
+    # INTERNAL WORKER DAEMON API:
+    # Internal method for worker daemons to fetch the work unit at the front
+    # of the queue. Work unit is marked as taken and handed off to the worker.
+    post '/work' do
+      json dequeue_work_unit
+    end
+    # When workers are done with their unit, either successfully or in failure,
+    # they mark it back on the central server and retrieve another. Failures
+    # pull from one down in the queue, so as to not repeat the same unit.
+    # The +status+ param must be 'succeeded' or 'failed'; anything else is
+    # answered with a 500 error.
+    put '/work/:work_unit_id' do
+      # NOTE(review): handle_conflicts is defined in the helpers, outside this
+      # file; presumably it answers with 409 when the update conflicts -- confirm.
+      handle_conflicts(409) do
+        case params[:status]
+        when 'succeeded'
+          current_work_unit.finish(params[:output], params[:time])
+          json dequeue_work_unit
+        when 'failed'
+          current_work_unit.fail(params[:output], params[:time])
+          # The argument of 1 is what makes failures "pull from one down in
+          # the queue", per the note above.
+          json dequeue_work_unit(1)
+        else
+          error(500, "Completing a work unit must specify status.")
+        end
+      end
+    end
+    # Every so often workers check in to let the central server know that
+    # they're still alive, so that its records of them stay up to date. A
+    # request carrying a +terminated+ param checks the worker out instead.
+    put '/worker' do
+      params[:terminated] ? WorkerRecord.check_out(params) : WorkerRecord.check_in(params)
+      json nil
+    end
+  end
+end