RubyGems - cloud-crowd - Versions diffs - 0.1.0 - Mend

cloud-crowd 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

data/EPIGRAPHS +17 -0
data/LICENSE +22 -0
data/README +93 -0
data/actions/graphics_magick.rb +43 -0
data/actions/process_pdfs.rb +92 -0
data/actions/word_count.rb +14 -0
data/bin/crowd +5 -0
data/cloud-crowd.gemspec +111 -0
data/config/config.example.ru +17 -0
data/config/config.example.yml +48 -0
data/config/database.example.yml +9 -0
data/examples/graphics_magick_example.rb +44 -0
data/examples/process_pdfs_example.rb +40 -0
data/examples/word_count_example.rb +41 -0
data/lib/cloud-crowd.rb +130 -0
data/lib/cloud_crowd/action.rb +101 -0
data/lib/cloud_crowd/app.rb +117 -0
data/lib/cloud_crowd/asset_store.rb +41 -0
data/lib/cloud_crowd/asset_store/filesystem_store.rb +28 -0
data/lib/cloud_crowd/asset_store/s3_store.rb +40 -0
data/lib/cloud_crowd/command_line.rb +209 -0
data/lib/cloud_crowd/daemon.rb +95 -0
data/lib/cloud_crowd/exceptions.rb +28 -0
data/lib/cloud_crowd/helpers.rb +8 -0
data/lib/cloud_crowd/helpers/authorization.rb +50 -0
data/lib/cloud_crowd/helpers/resources.rb +45 -0
data/lib/cloud_crowd/inflector.rb +19 -0
data/lib/cloud_crowd/models.rb +40 -0
data/lib/cloud_crowd/models/job.rb +176 -0
data/lib/cloud_crowd/models/work_unit.rb +89 -0
data/lib/cloud_crowd/models/worker_record.rb +61 -0
data/lib/cloud_crowd/runner.rb +15 -0
data/lib/cloud_crowd/schema.rb +45 -0
data/lib/cloud_crowd/worker.rb +186 -0
data/public/css/admin_console.css +221 -0
data/public/css/reset.css +42 -0
data/public/images/bullet_green.png +0 -0
data/public/images/bullet_white.png +0 -0
data/public/images/cloud_hand.png +0 -0
data/public/images/header_back.png +0 -0
data/public/images/logo.png +0 -0
data/public/images/queue_fill.png +0 -0
data/public/images/server_error.png +0 -0
data/public/images/sidebar_bottom.png +0 -0
data/public/images/sidebar_top.png +0 -0
data/public/images/worker_info.png +0 -0
data/public/images/worker_info_loading.gif +0 -0
data/public/js/admin_console.js +168 -0
data/public/js/excanvas.js +1 -0
data/public/js/flot.js +1 -0
data/public/js/jquery.js +19 -0
data/test/acceptance/test_app.rb +72 -0
data/test/acceptance/test_failing_work_units.rb +32 -0
data/test/acceptance/test_word_count.rb +49 -0
data/test/blueprints.rb +17 -0
data/test/config/actions/failure_testing.rb +13 -0
data/test/config/config.ru +17 -0
data/test/config/config.yml +7 -0
data/test/config/database.yml +6 -0
data/test/test_helper.rb +19 -0
data/test/unit/test_action.rb +49 -0
data/test/unit/test_configuration.rb +28 -0
data/test/unit/test_job.rb +78 -0
data/test/unit/test_work_unit.rb +55 -0
data/views/index.erb +77 -0
metadata +233 -0

data/lib/cloud_crowd/asset_store.rb ADDED

@@ -0,0 +1,41 @@
+require 'tmpdir'
+module CloudCrowd
+  # The AssetStore provides a common API for storing files and returning URLs
+  # that can access them. In production this will be S3 but in development
+  # it may be the filesystem.
+  #
+  # You shouldn't need to use the AssetStore directly -- Action's +download+
+  # and +save+ methods use it behind the scenes.
+  class AssetStore
+    autoload :S3Store,         'cloud_crowd/asset_store/s3_store'
+    autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
+    LOCAL_STORAGE_PATH = '/tmp/cloud_crowd_storage'
+    # Configure the AssetStore with the specific storage implementation
+    # specified by 'storage' in <tt>config.yml</tt>.
+    case CloudCrowd.config[:storage]
+    when 's3'         then include S3Store
+    when 'filesystem' then include FilesystemStore
+    else raise Error::StorageNotFound, "#{CloudCrowd.config[:storage]} is not a valid storage back end"
+    end
+    # Creating the AssetStore ensures that its scratch directory exists.
+    def initialize
+      @use_auth = CloudCrowd.config[:use_s3_authentication]
+      FileUtils.mkdir_p temp_storage_path unless File.exists? temp_storage_path
+      raise Error::StorageNotWritable, "#{temp_storage_path} is not writable" unless File.writable?(temp_storage_path)
+    end
+    # Get the path to CloudCrowd's temporary local storage. All actions run
+    # in subdirectories of this.
+    def temp_storage_path
+      "#{Dir.tmpdir}/cloud_crowd_tmp"
+    end
+  end
+end

data/lib/cloud_crowd/asset_store/filesystem_store.rb ADDED

@@ -0,0 +1,28 @@
+module CloudCrowd
+  class AssetStore
+    # The FilesystemStore is an implementation of the AssetStore, good only for
+    # use in development, testing, or if you're only running a single-machine
+    # installation.
+    module FilesystemStore
+      # Save a file to somewhere semi-persistent on the filesystem. Can be used
+      # in development, when offline, or if you happen to have a single-machine
+      # CloudCrowd installation. To use, configure <tt>:storage => 'filesystem'</tt>.
+      def save(local_path, save_path)
+        save_path = File.join(LOCAL_STORAGE_PATH, save_path)
+        save_dir = File.dirname(save_path)
+        FileUtils.mkdir_p save_dir unless File.exists? save_dir
+        FileUtils.cp(local_path, save_path)
+        "file://#{File.expand_path(save_path)}"
+      end
+      # Remove all of a Job's result files from the filesystem.
+      def cleanup(job)
+        path = "#{LOCAL_STORAGE_PATH}/#{job.action}/job_#{job.id}"
+        FileUtils.rm_r(path) if File.exists?(path)
+      end
+    end
+  end
+end

data/lib/cloud_crowd/asset_store/s3_store.rb ADDED

@@ -0,0 +1,40 @@
+module CloudCrowd
+  class AssetStore
+    # The S3Store is an implementation of an AssetStore that uses a bucket
+    # on S3 for all resulting files.
+    module S3Store
+      # Save a finished file from local storage to S3. Save it publicly unless
+      # we're configured to use S3 authentication. Authenticated links expire
+      # after one day by default.
+      def save(local_path, save_path)
+        ensure_s3_connection
+        if @use_auth
+          @bucket.put(save_path, File.open(local_path), {}, 'private')
+          @s3.interface.get_link(@bucket, save_path)
+        else
+          @bucket.put(save_path, File.open(local_path), {}, 'public-read')
+          @bucket.key(save_path).public_link
+        end
+      end
+      # Remove all of a Job's resulting files from S3, both intermediate and finished.
+      def cleanup(job)
+        ensure_s3_connection
+        @bucket.delete_folder("#{job.action}/job_#{job.id}")
+      end
+      # Workers, through the course of many WorkUnits, keep around an AssetStore.
+      # Ensure we have a persistent S3 connection after first use.
+      def ensure_s3_connection
+        unless @s3 && @bucket
+          params = {:port => 80, :protocol => 'http'}
+          @s3 = RightAws::S3.new(CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key], params)
+          @bucket = @s3.bucket(CloudCrowd.config[:s3_bucket], true)
+        end
+      end
+    end
+  end
+end

data/lib/cloud_crowd/command_line.rb ADDED

@@ -0,0 +1,209 @@
+require 'optparse'
+module CloudCrowd
+  class CommandLine
+    # Configuration files required for the `crowd` command to function.
+    CONFIG_FILES = ['config.yml', 'config.ru', 'database.yml']
+    # Reference the absolute path to the root.
+    CC_ROOT = File.expand_path(File.dirname(__FILE__) + '/../..')
+    # Path to the Daemons gem script which launches workers.
+    WORKER_RUNNER = File.expand_path("#{CC_ROOT}/lib/cloud_crowd/runner.rb")
+    # Command-line banner for the usage message.
+    BANNER = <<-EOS
+CloudCrowd is a MapReduce-inspired Parallel Processing System for Ruby.
+Wiki: http://wiki.github.com/documentcloud/cloud-crowd
+Rdoc: http://rdoc.info/projects/documentcloud/cloud-crowd
+Usage: crowd COMMAND OPTIONS
+Commands:
+  install       Install the CloudCrowd configuration files to the specified directory
+  server        Start up the central server (requires a database)
+  workers       Control worker daemons, use: (start | stop | restart | status | run)
+  console       Launch a CloudCrowd console, connected to the central database
+  load_schema   Load the schema into the database specified by database.yml
+Options:
+    EOS
+    # Creating a CloudCrowd::CommandLine runs from the contents of ARGV.
+    def initialize
+      parse_options
+      command = ARGV.shift
+      case command
+      when 'console'      then run_console
+      when 'server'       then run_server
+      when 'workers'      then run_workers_command
+      when 'load_schema'  then run_load_schema
+      when 'install'      then run_install
+      else                     usage
+      end
+    end
+    # Spin up an IRB session with the CloudCrowd code loaded in, and a database
+    # connection established. The equivalent of Rails' `script/console`.
+    def run_console
+      require 'irb'
+      require 'irb/completion'
+      require 'pp'
+      load_code
+      connect_to_database
+      IRB.start
+    end
+    # Convenience command for quickly spinning up the central server. More
+    # sophisticated deployments, load-balancing across multiple app servers,
+    # should use the config.ru rackup file directly. This method will start
+    # a single Thin server, if Thin is installed, otherwise the rackup defaults
+    # (Mongrel, falling back to WEBrick). The equivalent of Rails' script/server.
+    def run_server
+      ensure_config
+      require 'rubygems'
+      rackup_path = File.expand_path("#{@options[:config_path]}/config.ru")
+      if Gem.available? 'thin'
+        exec "thin -e #{@options[:environment]} -p #{@options[:port]} -R #{rackup_path} start"
+      else
+        exec "rackup -E #{@options[:environment]} -p #{@options[:port]} #{rackup_path}"
+      end
+    end
+    # Load in the database schema to the database specified in 'database.yml'.
+    def run_load_schema
+      load_code
+      connect_to_database
+      require 'cloud_crowd/schema.rb'
+    end
+    # Install the required CloudCrowd configuration files into the specified
+    # directory, or the current one.
+    def run_install
+      require 'fileutils'
+      install_path = ARGV.shift || '.'
+      FileUtils.mkdir_p install_path unless File.exists?(install_path)
+      install_file "#{CC_ROOT}/config/config.example.yml", "#{install_path}/config.yml"
+      install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
+      install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
+      install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
+    end
+    # Manipulate worker daemons -- handles all commands that the Daemons gem
+    # provides: start, stop, restart, run, and status.
+    def run_workers_command
+      ensure_config
+      command = ARGV.shift
+      case command
+      when 'start'    then start_workers
+      when 'stop'     then stop_workers
+      when 'restart'  then stop_workers && start_workers
+      when 'run'      then run_worker
+      when 'status'   then show_worker_status
+      else                 usage
+      end
+    end
+    # Start up N workers, specified by argument or the number of workers in
+    # config.yml.
+    def start_workers
+      load_code
+      num_workers = @options[:num_workers] || CloudCrowd.config[:num_workers]
+      num_workers.times do
+        `CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} start`
+      end
+    end
+    # For debugging, run a single worker in the current process, showing output.
+    def run_worker
+      exec "CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} run"
+    end
+    # Stop all active workers.
+    def stop_workers
+      `ruby #{WORKER_RUNNER} stop`
+    end
+    # Display the status of all active workers.
+    def show_worker_status
+      puts `ruby #{WORKER_RUNNER} status`
+    end
+    # Print `crowd` usage.
+    def usage
+      puts "\n#{@option_parser}\n"
+    end
+    private
+    # Check for configuration files, either in the current directory, or in
+    # the CLOUD_CROWD_CONFIG environment variable. Exit if they're not found.
+    def ensure_config
+      return if @config_found
+      found = CONFIG_FILES.all? {|f| File.exists? "#{@options[:config_path]}/#{f}" }
+      found ? @config_dir = true : config_not_found
+    end
+    # Parse all options for all commands.
+    def parse_options
+      @options = {
+        :port         => 9173,
+        :environment  => 'production',
+        :config_path  => ENV['CLOUD_CROWD_CONFIG'] || '.'
+      }
+      @option_parser = OptionParser.new do |opts|
+        opts.on('-c', '--config PATH', 'path to configuration directory') do |conf_path|
+          @options[:config_path] = conf_path
+        end
+        opts.on('-n', '--num-workers NUM', OptionParser::DecimalInteger, 'number of worker processes') do |num|
+          @options[:num_workers] = num
+        end
+        opts.on('-p', '--port PORT', 'central server port number') do |port_num|
+          @options[:port] = port_num
+        end
+        opts.on('-e', '--environment ENV', 'server environment (sinatra)') do |env|
+          @options[:environment] = env
+        end
+        opts.on_tail('-v', '--version', 'show version') do
+          load_code
+          puts "CloudCrowd version #{VERSION}"
+          exit
+        end
+      end
+      @option_parser.banner = BANNER
+      @option_parser.parse!(ARGV)
+    end
+    # Load in the CloudCrowd module code, dependencies, lib files and models.
+    # Not all commands require this.
+    def load_code
+      ensure_config
+      require 'rubygems'
+      require "#{CC_ROOT}/lib/cloud-crowd"
+      CloudCrowd.configure("#{@options[:config_path]}/config.yml")
+    end
+    # Establish a connection to the central server's database. Not all commands
+    # require this.
+    def connect_to_database
+      require 'cloud_crowd/models'
+      CloudCrowd.configure_database("#{@options[:config_path]}/database.yml")
+    end
+    # Exit with an explanation if the configuration files couldn't be found.
+    def config_not_found
+      puts "`crowd` can't find the CloudCrowd configuration directory. Please either run `crowd` from inside of the configuration directory, or use `crowd -c path/to/config`"
+      exit(1)
+    end
+    # Install a file and log the installation.
+    def install_file(source, dest, is_dir=false)
+      is_dir ? FileUtils.cp_r(source, dest) : FileUtils.cp(source, dest)
+      puts "installed #{dest}"
+    end
+  end
+end

data/lib/cloud_crowd/daemon.rb ADDED

@@ -0,0 +1,95 @@
+CloudCrowd.configure(ENV['CLOUD_CROWD_CONFIG'])
+module CloudCrowd
+  # A CloudCrowd::Daemon, started by the Daemons gem, runs a CloudCrowd::Worker in
+  # a loop, continually fetching and processing WorkUnits from the central
+  # server.
+  #
+  # The Daemon backs off and pings the central server less frequently
+  # when there isn't any work to be done, and speeds back up when there is.
+  #
+  # The `crowd` command responds to all the usual methods that the Daemons gem
+  # supports.
+  class Daemon
+    # The back-off factor used to slow down requests for new work units
+    # when the queue is empty.
+    WAIT_MULTIPLIER   = 1.5
+    MIN_WAIT = CloudCrowd.config[:min_worker_wait]
+    MAX_WAIT = CloudCrowd.config[:max_worker_wait]
+    def initialize
+      @wait_time  = MIN_WAIT
+      @worker     = Worker.new
+      Signal.trap('INT')  { kill_worker_and_exit }
+      Signal.trap('KILL') { kill_worker_and_exit }
+      Signal.trap('TERM') { kill_worker_and_exit }
+    end
+    # Spin up our worker and monitoring threads. The monitor's the boss, and
+    # will feel no compunction in killing the worker thread if necessary.
+    # Check in before starting up. If check in fails, there's no sense in going.
+    def run
+      @worker.check_in('starting')
+      @work_thread = run_worker
+      @monitor_thread = run_monitor
+      @monitor_thread.join
+    end
+    private
+    # Loop forever, fetching WorkUnits and processing them.
+    def run_worker
+      Thread.new do
+        loop do
+          @worker.fetch_work_unit
+          if @worker.has_work?
+            @wait_time = MIN_WAIT
+            while @worker.has_work?
+              @worker.run
+              sleep 0.01 # So as to listen for incoming signals.
+            end
+          else
+            @wait_time = [@wait_time * WAIT_MULTIPLIER, MAX_WAIT].min
+            sleep @wait_time
+          end
+        end
+      end
+    end
+    # Checks in to let the central server know it's still alive every
+    # CHECK_IN_INTERVAL seconds. Restarts the work_thread if it has died.
+    def run_monitor
+      Thread.new do
+        sleep Worker::CHECK_IN_INTERVAL
+        loop do
+          @work_thread = run_monitor unless @work_thread.alive? || @exit_started
+          @worker.check_in(@work_thread.status)
+          sleep Worker::CHECK_IN_INTERVAL
+        end
+      end
+    end
+    def running?
+      @work_thread.alive? || @monitor_thread.alive?
+    end
+    # At exit, kill the worker thread, gently at first, then forcefully.
+    def kill_worker_and_exit
+      @worker.check_out
+      @exit_started = Time.now
+      @work_thread.kill && @monitor_thread.kill
+      sleep 0.3 while running? && Time.now - @exit_started < WORKER_EXIT_WAIT
+      return Process.exit unless running?
+      @work_thread.kill! && @monitor_thread.kill!
+      Process.exit
+    end
+  end
+end
+CloudCrowd::Daemon.new.run

data/lib/cloud_crowd/exceptions.rb ADDED

@@ -0,0 +1,28 @@
+module CloudCrowd
+  # Base Error class which all custom CloudCrowd exceptions inherit from.
+  # Rescuing CloudCrowd::Error (or RuntimeError) will get all custom exceptions.
+  class Error < RuntimeError
+    # ActionNotFound is raised when a job is created for an action that doesn't
+    # exist.
+    class ActionNotFound < Error
+    end
+    # StorageNotFound is raised when config.yml specifies a storage back end that
+    # doesn't exist.
+    class StorageNotFound < Error
+    end
+    # If the AssetStore can't write to its scratch directory.
+    class StorageNotWritable < Error
+    end
+    # StatusUnspecified is raised when a WorkUnit returns without a valid
+    # status code.
+    class StatusUnspecified < Error
+    end
+  end
+end

data/lib/cloud_crowd/helpers.rb ADDED

@@ -0,0 +1,8 @@
+require 'cloud_crowd/helpers/authorization'
+require 'cloud_crowd/helpers/resources'
+module CloudCrowd
+  module Helpers
+    # Helpers bundles the Authorization and Resources helper modules so that
+    # both can be pulled in with a single include. Rack::Utils is named only
+    # in the trailing comment and is not currently included.
+    include Authorization, Resources #, Rack::Utils
+  end
+end