RubyGems - documentcloud-cloud-crowd - Versions diffs - 0.1.1 → 0.2.0 - Mend

documentcloud-cloud-crowd 0.1.1 → 0.2.0

Files changed (26) hide show

data/README +16 -16
data/cloud-crowd.gemspec +4 -3
data/config/config.example.yml +17 -12
data/lib/cloud-crowd.rb +42 -24
data/lib/cloud_crowd/action.rb +6 -4
data/lib/cloud_crowd/asset_store.rb +7 -7
data/lib/cloud_crowd/asset_store/filesystem_store.rb +15 -9
data/lib/cloud_crowd/asset_store/s3_store.rb +10 -11
data/lib/cloud_crowd/command_line.rb +12 -7
data/lib/cloud_crowd/exceptions.rb +7 -4
data/lib/cloud_crowd/helpers/authorization.rb +3 -1
data/lib/cloud_crowd/models/job.rb +19 -21
data/lib/cloud_crowd/models/node_record.rb +24 -10
data/lib/cloud_crowd/models/work_unit.rb +39 -25
data/lib/cloud_crowd/node.rb +24 -6
data/lib/cloud_crowd/schema.rb +3 -2
data/lib/cloud_crowd/server.rb +9 -4
data/lib/cloud_crowd/worker.rb +33 -48
data/public/css/admin_console.css +17 -7
data/public/images/server_busy.png +0 -0
data/public/js/admin_console.js +3 -1
data/test/config/config.yml +1 -1
data/test/unit/test_action.rb +1 -1
data/test/unit/test_job.rb +2 -0
data/views/{index.erb → operations_center.erb} +5 -5
metadata +4 -3

data/README CHANGED Viewed

@@ -26,7 +26,7 @@
     * Parallel processing for the rest of us
     * Write your scripts in Ruby
-    * Built for Amazon EC2 and S3
+    * Works with Amazon EC2 and S3
     * split -> process -> merge
     * As easy as `gem install cloud-crowd`
@@ -63,31 +63,31 @@
     # Edit the configuration files to your satisfaction, add AWS credentials,
     # and then load the CloudCrowd schema into your configured database.
-      >> mate ~/config/cloud-crowd/config.yml
-      >> mate ~/config/cloud-crowd/database.yml
+      >> cd ~/config/cloud-crowd
+      >> mate config.yml
+      >> mate database.yml
+      >> [create the database you just configured...]
       >> crowd load_schema
     # Write your actions, and install them into the 'actions' subdirectory.
-    # CloudCrowd comes with some default actions as an example.
+    # CloudCrowd comes with a few default actions as examples.
     # To launch the central server (make sure that you include its location
-    # in config.yml), either:
+    # in config.yml):
       >> crowd server
-    # or:
+    # The configuration folder also includes 'config.ru', which can be used by
+    # any Rack-compliant webserver to run your central server.
-      >> thin -R config.ru --servers 3 -e production start
+    # Then, to launch a node of workers:
-    # Any server that supports Rack should work with the rackup file.
+      >> crowd node
-    # Then, to spin up 10 workers:
+    # To spin up remote nodes, install the 'cloud-crowd' gem and copy over
+    # your configuration directory. Run `crowd node`, and the remote machines
+    # will register with the central server, becoming available for processing.
-      >> crowd workers start -n 10
-    # To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
-    # your configuration directory.
-    # At this point you can visit your server console at localhost:9173 to
-    # view all of your workers, ready for action.
+    # At this point you can visit your Operations Center at localhost:9173 to
+    # view all of your nodes, ready for action.

data/cloud-crowd.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'cloud-crowd'
-  s.version   = '0.1.1'         # Keep version in sync with cloud-cloud.rb
-  s.date      = '2009-09-15'
+  s.version   = '0.2.0'         # Keep version in sync with cloud-crowd.rb
+  s.date      = '2009-09-17'
   s.homepage    = "http://wiki.github.com/documentcloud/cloud-crowd"
   s.summary     = "Parallel Processing for the Rest of Us"
@@ -83,6 +83,7 @@ public/images/header_back.png
 public/images/logo.png
 public/images/queue_fill.png
 public/images/server.png
+public/images/server_busy.png
 public/images/server_error.png
 public/images/sidebar_bottom.png
 public/images/sidebar_top.png
@@ -106,6 +107,6 @@ test/unit/test_action.rb
 test/unit/test_configuration.rb
 test/unit/test_job.rb
 test/unit/test_work_unit.rb
-views/index.erb
+views/operations_center.erb
 )
 end

data/config/config.example.yml CHANGED Viewed

@@ -1,33 +1,38 @@
 # The URL where you're planning on running the central server/queue/database.
-:central_server:          http://localhost:9173
+:central_server:      http://localhost:9173
 # Set the maximum number of workers allowed per-node. Workers only run while
 # there's work to be done. It's best to set 'max_workers' below the point where
 # you'd start to swap or peg your CPU (as determined by experiment).
-:max_workers:             5
+:max_workers:         5
 # The storage back-end that you'd like to use for intermediate and final results
 # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
-# be used in development, or on single-machine installations.
-:storage:                 s3
+# be used in development, on single-machine installations, or with networked drives.
+:storage:             s3
 # Please provide your AWS credentials for S3 storage of job output.
-:aws_access_key:          [your AWS access key]
-:aws_secret_key:          [your AWS secret access key]
+:aws_access_key:      [your AWS access key]
+:aws_secret_key:      [your AWS secret access key]
 # Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
 # to keep all resulting files on S3 private. If so, you'll receive authenticated
 # S3 URLs as job output, good for 24 hours. If left public, you'll get the
 # straight URLs to the files on S3.
-:s3_bucket:               [your CloudCrowd bucket]
-:use_s3_authentication:   no
+:s3_bucket:           [your CloudCrowd bucket]
+:s3_authentication:   no
+# If you're using the 'filesystem' storage, perhaps with an NFS share or
+# something similar, all files will be saved inside of the 'local_storage_path'.
+# The default value if left unspecified is '/tmp/cloud_crowd_storage'.
+:local_storage_path:  /tmp/cloud_crowd_storage
 # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
 # to the central server). If yes, specify the login and password that all
 # requests must provide for authentication.
-:use_http_authentication: no
-:login:                   [your login name]
-:password:                [your password]
+:http_authentication: no
+:login:               [your login name]
+:password:            [your password]
 # By default, CloudCrowd looks for installed actions inside the 'actions'
 # subdirectory of this configuration folder. 'actions_path' allows you to load
@@ -36,4 +41,4 @@
 # The number of separate attempts that will be made to process an individual
 # work unit, before marking it as having failed.
-:work_unit_retries:       3
+:work_unit_retries:   3

data/lib/cloud-crowd.rb CHANGED Viewed

@@ -30,7 +30,7 @@ require 'cloud_crowd/exceptions'
 module CloudCrowd
-  # Autoload all the CloudCrowd classes which may not be required.
+  # Autoload all the CloudCrowd internals.
   autoload :Action,       'cloud_crowd/action'
   autoload :AssetStore,   'cloud_crowd/asset_store'
   autoload :Helpers,      'cloud_crowd/helpers'
@@ -42,36 +42,38 @@ module CloudCrowd
   autoload :Worker,       'cloud_crowd/worker'
   autoload :WorkUnit,     'cloud_crowd/models'
-  # Root directory of the CloudCrowd gem.
-  ROOT        = File.expand_path(File.dirname(__FILE__) + '/..')
+  # Keep this version in sync with the gemspec.
+  VERSION        = '0.2.0'
+  # Increment the schema version when there's a backwards incompatible change.
+  SCHEMA_VERSION = 2
-  # Keep the version in sync with the gemspec.
-  VERSION     = '0.1.1'
+  # Root directory of the CloudCrowd gem.
+  ROOT           = File.expand_path(File.dirname(__FILE__) + '/..')
-  # A Job is processing if its WorkUnits in the queue to be handled by workers.
-  PROCESSING  = 1
+  # A Job is processing if its WorkUnits are in the queue to be handled by nodes.
+  PROCESSING     = 1
   # A Job has succeeded if all of its WorkUnits have finished successfully.
-  SUCCEEDED   = 2
+  SUCCEEDED      = 2
   # A Job has failed if even a single one of its WorkUnits has failed (they may
   # be attempted multiple times on failure, however).
-  FAILED      = 3
+  FAILED         = 3
   # A Job is splitting if it's in the process of dividing its inputs up into
   # multiple WorkUnits.
-  SPLITTING   = 4
+  SPLITTING      = 4
   # A Job is merging if it's busy collecting all of its successful WorkUnits
   # back together into the final result.
-  MERGING     = 5
+  MERGING        = 5
-  # A work unit is considered to be complete if it succeeded or if it failed.
-  COMPLETE    = [SUCCEEDED, FAILED]
+  # A Job is considered to be complete if it succeeded or if it failed.
+  COMPLETE       = [SUCCEEDED, FAILED]
-  # A work unit is considered incomplete if it's being processed, split up or
-  # merged together.
-  INCOMPLETE  = [PROCESSING, SPLITTING, MERGING]
+  # A Job is considered incomplete if it's being processed, split up or merged.
+  INCOMPLETE     = [PROCESSING, SPLITTING, MERGING]
   # Mapping of statuses to their display strings.
   DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
@@ -88,18 +90,34 @@ module CloudCrowd
     # Configure the CloudCrowd central database (and connect to it), by passing
     # in a path to <tt>database.yml</tt>. The file should use the standard
     # ActiveRecord connection format.
-    def configure_database(config_path)
+    def configure_database(config_path, validate_schema=true)
       configuration = YAML.load_file(config_path)
       ActiveRecord::Base.establish_connection(configuration)
+      if validate_schema
+        version = ActiveRecord::Base.connection.select_values('select max(version) from schema_migrations').first.to_i
+        return true if version == SCHEMA_VERSION
+        puts "Your database schema is out of date. Please use `crowd load_schema` to update it. This will wipe all the tables, so make sure that your jobs have a chance to finish first.\nexiting..."
+        exit
+      end
     end
-    # Get a reference to the central server, including authentication,
-    # if configured.
+    # Get a reference to the central server, including authentication if
+    # configured.
     def central_server
-      return @central_server if @central_server
-      params = [CloudCrowd.config[:central_server]]
-      params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
-      @central_server = RestClient::Resource.new(*params)
+      @central_server ||= RestClient::Resource.new(CloudCrowd.config[:central_server], CloudCrowd.client_options)
+    end
+    # The standard RestClient options for the central server talking to nodes,
+    # as well as the other way around. There's a timeout of 5 seconds to open
+    # a connection, and a timeout of 30 to finish reading it.
+    def client_options
+      return @client_options if @client_options
+      @client_options = {:timeout => 30, :open_timeout => 5}
+      if CloudCrowd.config[:http_authentication]
+        @client_options[:user]      = CloudCrowd.config[:login]
+        @client_options[:password]  = CloudCrowd.config[:password]
+      end
+      @client_options
     end
     # Return the displayable status name of an internal CloudCrowd status number.
@@ -111,7 +129,7 @@ module CloudCrowd
     # CloudCrowd::Actions are requested dynamically by name. Access them through
     # this actions property, which behaves like a hash. At load time, we
     # load all installed Actions and CloudCrowd's default Actions into it.
-    # If you wish to have certain workers be specialized to only handle certain
+    # If you wish to have certain nodes be specialized to only handle certain
     # Actions, then install only those into the actions directory.
     def actions
       return @actions if @actions

data/lib/cloud_crowd/action.rb CHANGED Viewed

@@ -38,17 +38,19 @@ module CloudCrowd
     # Download a file to the specified path.
     def download(url, path)
-      URI.parse(url) # Sanity check.
       `curl -s "#{url}" > "#{path}"`
+      return path
+      # The previous implementation is below, and, although it would be
+      # wonderful not to shell out, RestClient wasn't handling URLs with encoded
+      # entities (%20, for example), and doesn't let you download to a given
+      # location. Getting a RestClient patch in would be ideal.
+      #
       # if url.match(FILE_URL)
       #   FileUtils.cp(url.sub(FILE_URL, ''), path)
       # else
-      #   # An alternative would be shelling out: `curl -s "#{url}" > "#{path}"`
-      #   puts url
       #   resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
       #   FileUtils.mv resp.file.path, path
       # end
-      path
     end
     # Takes a local filesystem path, saves the file to S3, and returns the

data/lib/cloud_crowd/asset_store.rb CHANGED Viewed

@@ -3,18 +3,18 @@ require 'tmpdir'
 module CloudCrowd
   # The AssetStore provides a common API for storing files and returning URLs
-  # that can access them. In production this will be S3 but in development
-  # it may be the filesystem.
+  # that can access them. At the moment, the files can be saved to either S3, or
+  # the local filesystem. You shouldn't need to use the AssetStore directly --
+  # Action's +download+ and +save+ methods use it behind the scenes.
   #
-  # You shouldn't need to use the AssetStore directly -- Action's +download+
-  # and +save+ methods use it behind the scenes.
+  # To implement a new back-end for the AssetStore, you must provide
+  # <tt>save(local_path, save_path)</tt>, <tt>cleanup(job)</tt>, and optionally,
+  # a <tt>setup</tt> method that will be called once at initialization.
   class AssetStore
     autoload :S3Store,         'cloud_crowd/asset_store/s3_store'
     autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
-    LOCAL_STORAGE_PATH = '/tmp/cloud_crowd_storage'
     # Configure the AssetStore with the specific storage implementation
     # specified by 'storage' in <tt>config.yml</tt>.
     case CloudCrowd.config[:storage]

data/lib/cloud_crowd/asset_store/filesystem_store.rb CHANGED Viewed

@@ -2,20 +2,26 @@ module CloudCrowd
   class AssetStore
     # The FilesystemStore is an implementation of the AssetStore, good only for
-    # use in development, testing, or if you're only running a single-machine
-    # installation.
+    # use in development or testing, on a single-machine installation, or with
+    # a networked drive.
     module FilesystemStore
-      # Make sure that local storage is writeable before starting.
+      DEFAULT_STORAGE_PATH = '/tmp/cloud_crowd_storage'
+      attr_reader :local_storage_path
+      # Make sure that local storage exists and is writeable before starting.
       def setup
-        raise Error::StorageNotWritable, "#{LOCAL_STORAGE_PATH} is not writable" unless File.writable?(LOCAL_STORAGE_PATH)
+        lsp = @local_storage_path = CloudCrowd.config[:local_storage_path] || DEFAULT_STORAGE_PATH
+        FileUtils.mkdir_p(lsp) unless File.exists?(lsp)
+        raise Error::StorageNotWritable, "#{lsp} is not writable" unless File.writable?(lsp)
       end
-      # Save a file to somewhere semi-persistent on the filesystem. Can be used
-      # in development, when offline, or if you happen to have a single-machine
-      # CloudCrowd installation. To use, configure <tt>:storage => 'filesystem'</tt>.
+      # Save a file to somewhere semi-persistent on the filesystem. To use,
+      # configure <tt>:storage: 'filesystem'</tt> in *config.yml*, as well as
+      # <tt>:local_storage_path:</tt>.
       def save(local_path, save_path)
-        save_path = File.join(LOCAL_STORAGE_PATH, save_path)
+        save_path = File.join(@local_storage_path, save_path)
         save_dir = File.dirname(save_path)
         FileUtils.mkdir_p save_dir unless File.exists? save_dir
         FileUtils.cp(local_path, save_path)
@@ -24,7 +30,7 @@ module CloudCrowd
       # Remove all of a Job's result files from the filesystem.
       def cleanup(job)
-        path = "#{LOCAL_STORAGE_PATH}/#{job.action}/job_#{job.id}"
+        path = "#{@local_storage_path}/#{job.action}/job_#{job.id}"
         FileUtils.rm_r(path) if File.exists?(path)
       end
     end

data/lib/cloud_crowd/asset_store/s3_store.rb CHANGED Viewed

@@ -7,8 +7,16 @@ module CloudCrowd
       # Configure authentication and establish a connection to S3, first thing.
       def setup
-        @use_auth = CloudCrowd.config[:use_s3_authentication]
-        establish_s3_connection
+        @use_auth   = CloudCrowd.config[:s3_authentication]
+        bucket_name = CloudCrowd.config[:s3_bucket]
+        key, secret = CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key]
+        valid_conf  = [bucket_name, key, secret].all? {|s| s.is_a? String }
+        raise Error::MissingConfiguration, "An S3 account must be configured in 'config.yml' before 's3' storage can be used" unless valid_conf
+        protocol    = @use_auth ? 'https' : 'http'
+        port        = @use_auth ? 443 : 80
+        @s3         = RightAws::S3.new(key, secret, :protocol => protocol, :port => port)
+        @bucket     = @s3.bucket(bucket_name)
+        @bucket     = @s3.bucket(bucket_name, true) unless @bucket
       end
       # Save a finished file from local storage to S3. Save it publicly unless
@@ -29,15 +37,6 @@ module CloudCrowd
         @bucket.delete_folder("#{job.action}/job_#{job.id}")
       end
-      # Workers, through the course of many WorkUnits, keep around an AssetStore.
-      # Ensure we have a persistent S3 connection after first use.
-      def establish_s3_connection
-        unless @s3 && @bucket
-          params = {:port => 80, :protocol => 'http'}
-          @s3 = RightAws::S3.new(CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key], params)
-          @bucket = @s3.bucket(CloudCrowd.config[:s3_bucket], true)
-        end
-      end
     end
   end

data/lib/cloud_crowd/command_line.rb CHANGED Viewed

@@ -49,7 +49,7 @@ Options:
       require 'irb/completion'
       require 'pp'
       load_code
-      connect_to_database
+      connect_to_database(true)
       IRB.start
     end
@@ -81,7 +81,7 @@ Options:
     # Load in the database schema to the database specified in 'database.yml'.
     def run_load_schema
       load_code
-      connect_to_database
+      connect_to_database(false)
       require 'cloud_crowd/schema.rb'
     end
@@ -92,8 +92,8 @@ Options:
       install_path = ARGV.shift || '.'
       FileUtils.mkdir_p install_path unless File.exists?(install_path)
       install_file "#{CC_ROOT}/config/config.example.yml", "#{install_path}/config.yml"
-      install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
       install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
+      install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
       install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
     end
@@ -149,19 +149,24 @@ Options:
     # Establish a connection to the central server's database. Not all commands
     # require this.
-    def connect_to_database
+    def connect_to_database(validate_schema)
       require 'cloud_crowd/models'
-      CloudCrowd.configure_database("#{@options[:config_path]}/database.yml")
+      CloudCrowd.configure_database("#{@options[:config_path]}/database.yml", validate_schema)
     end
     # Exit with an explanation if the configuration files couldn't be found.
     def config_not_found
-      puts "`crowd` can't find the CloudCrowd configuration directory. Please either run `crowd` from inside of the configuration directory, or use `crowd -c path/to/config`"
+      puts "`crowd` can't find the CloudCrowd configuration directory. Please use `crowd -c path/to/config`, or run `crowd` from inside of the configuration directory itself."
       exit(1)
     end
-    # Install a file and log the installation.
+    # Install a file and log the installation. If we're overwriting a file,
+    # offer a chance to back out.
     def install_file(source, dest, is_dir=false)
+      if File.exists?(dest)
+        print "#{dest} already exists. Overwrite it? (yes/no) "
+        return unless ['y', 'yes', 'ok'].include? gets.chomp.downcase
+      end
       is_dir ? FileUtils.cp_r(source, dest) : FileUtils.cp(source, dest)
       puts "installed #{dest}"
     end

data/lib/cloud_crowd/exceptions.rb CHANGED Viewed

@@ -2,16 +2,14 @@ module CloudCrowd
   # Base Error class which all custom CloudCrowd exceptions inherit from.
   # Rescuing CloudCrowd::Error (or RuntimeError) will get all custom exceptions.
+  # If your cluster is correctly configured, you should never expect to see any
+  # of these.
   class Error < RuntimeError
     # ActionNotFound is raised when a job is created for an action that doesn't
     # exist.
     class ActionNotFound < Error
     end
-    # CentralServerUnavailable is used then the central server can't be reached.
-    class CentralServerUnavailable < Error
-    end
     # StorageNotFound is raised when config.yml specifies a storage back end that
     # doesn't exist.
@@ -27,6 +25,11 @@ module CloudCrowd
     class StatusUnspecified < Error
     end
+    # MissingConfiguration is raised when we're trying to run a method that
+    # needs configuration not present in config.yml.
+    class MissingConfiguration < Error
+    end
   end
 end