RubyGems - cloud-crowd - Versions diffs - 0.1.0 → 0.2.0 - Mend

cloud-crowd 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

data/README +16 -16
data/cloud-crowd.gemspec +10 -9
data/config/config.example.ru +8 -2
data/config/config.example.yml +21 -25
data/examples/process_pdfs_example.rb +1 -1
data/examples/word_count_example.rb +1 -0
data/lib/cloud-crowd.rb +47 -28
data/lib/cloud_crowd/action.rb +14 -8
data/lib/cloud_crowd/asset_store.rb +8 -8
data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
data/lib/cloud_crowd/command_line.rb +24 -58
data/lib/cloud_crowd/exceptions.rb +7 -0
data/lib/cloud_crowd/helpers/authorization.rb +5 -3
data/lib/cloud_crowd/helpers/resources.rb +0 -20
data/lib/cloud_crowd/models.rb +1 -1
data/lib/cloud_crowd/models/job.rb +37 -40
data/lib/cloud_crowd/models/node_record.rb +95 -0
data/lib/cloud_crowd/models/work_unit.rb +87 -33
data/lib/cloud_crowd/node.rb +105 -0
data/lib/cloud_crowd/schema.rb +22 -18
data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
data/lib/cloud_crowd/worker.rb +68 -107
data/public/css/admin_console.css +40 -18
data/public/images/server.png +0 -0
data/public/images/server_busy.png +0 -0
data/public/js/admin_console.js +47 -18
data/test/acceptance/test_failing_work_units.rb +1 -1
data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
data/test/acceptance/test_word_count.rb +3 -9
data/test/blueprints.rb +0 -1
data/test/config/config.ru +1 -1
data/test/config/config.yml +2 -4
data/test/unit/test_action.rb +1 -1
data/test/unit/test_configuration.rb +1 -1
data/test/unit/test_job.rb +3 -0
data/test/unit/test_work_unit.rb +2 -4
data/views/{index.erb → operations_center.erb} +13 -8
metadata +11 -10
data/lib/cloud_crowd/daemon.rb +0 -95
data/lib/cloud_crowd/models/worker_record.rb +0 -61
data/lib/cloud_crowd/runner.rb +0 -15

data/README CHANGED

@@ -26,7 +26,7 @@
     * Parallel processing for the rest of us
     * Write your scripts in Ruby
-    * Built for Amazon EC2 and S3
+    * Works with Amazon EC2 and S3
     * split -> process -> merge
     * As easy as `gem install cloud-crowd`
@@ -63,31 +63,31 @@
     # Edit the configuration files to your satisfaction, add AWS credentials,
     # and then load the CloudCrowd schema into your configured database.
-      >> mate ~/config/cloud-crowd/config.yml
-      >> mate ~/config/cloud-crowd/database.yml
+      >> cd ~/config/cloud-crowd
+      >> mate config.yml
+      >> mate database.yml
+      >> [create the database you just configured...]
       >> crowd load_schema
     # Write your actions, and install them into the 'actions' subdirectory.
-    # CloudCrowd comes with some default actions as an example.
+    # CloudCrowd comes with a few default actions as an example.
     # To launch the central server (make sure that you include its location
-    # in config.yml), either:
+    # in config.yml):
       >> crowd server
-    # or:
+    # The configuration folder also includes 'config.ru', which can be used by
+     # any Rack-compliant webserver to run your central server.
-      >> thin -R config.ru --servers 3 -e production start
+    # Then, to launch a node of workers:
-    # Any server that supports Rack should work with the rackup file.
+      >> crowd node
-    # Then, to spin up 10 workers:
+    # To spin up remote nodes, install the 'cloud-crowd' gem and copy over
+    # your configuration directory. Run `crowd node`, and the remote machines
+    # will register with the central server, becoming available for processing.
-      >> crowd workers start -n 10
-    # To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
-    # your configuration directory.
-    # At this point you can visit your server console at localhost:9173 to
-    # view all of your workers, ready for action.
+    # At this point you can visit your Operations Center at localhost:9173 to
+    # view all of your nodes, ready for action.

data/cloud-crowd.gemspec CHANGED

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'cloud-crowd'
-  s.version   = '0.1.0'         # Keep version in sync with cloud-cloud.rb
-  s.date      = '2009-09-14'
+  s.version   = '0.2.0'         # Keep version in sync with cloud-crowd.rb
+  s.date      = '2009-09-17'
   s.homepage    = "http://wiki.github.com/documentcloud/cloud-crowd"
   s.summary     = "Parallel Processing for the Rest of Us"
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
   s.add_dependency 'json',          ['>= 1.1.7']
   s.add_dependency 'rest-client',   ['>= 1.0.3']
   s.add_dependency 'right_aws',     ['>= 1.10.0']
-  s.add_dependency 'daemons',       ['>= 1.0.10']
+  s.add_dependency 'thin',          ['>= 1.2.4']
   if s.respond_to?(:add_development_dependency)
     s.add_development_dependency 'faker',               ['>= 0.3.1']
@@ -56,23 +56,22 @@ examples/process_pdfs_example.rb
 examples/word_count_example.rb
 lib/cloud-crowd.rb
 lib/cloud_crowd/action.rb
-lib/cloud_crowd/app.rb
 lib/cloud_crowd/asset_store/filesystem_store.rb
 lib/cloud_crowd/asset_store/s3_store.rb
 lib/cloud_crowd/asset_store.rb
 lib/cloud_crowd/command_line.rb
-lib/cloud_crowd/daemon.rb
 lib/cloud_crowd/exceptions.rb
 lib/cloud_crowd/helpers/authorization.rb
 lib/cloud_crowd/helpers/resources.rb
 lib/cloud_crowd/helpers.rb
 lib/cloud_crowd/inflector.rb
 lib/cloud_crowd/models/job.rb
+lib/cloud_crowd/models/node_record.rb
 lib/cloud_crowd/models/work_unit.rb
-lib/cloud_crowd/models/worker_record.rb
 lib/cloud_crowd/models.rb
-lib/cloud_crowd/runner.rb
+lib/cloud_crowd/node.rb
 lib/cloud_crowd/schema.rb
+lib/cloud_crowd/server.rb
 lib/cloud_crowd/worker.rb
 LICENSE
 public/css/admin_console.css
@@ -83,6 +82,8 @@ public/images/cloud_hand.png
 public/images/header_back.png
 public/images/logo.png
 public/images/queue_fill.png
+public/images/server.png
+public/images/server_busy.png
 public/images/server_error.png
 public/images/sidebar_bottom.png
 public/images/sidebar_top.png
@@ -93,7 +94,7 @@ public/js/excanvas.js
 public/js/flot.js
 public/js/jquery.js
 README
-test/acceptance/test_app.rb
+test/acceptance/test_server.rb
 test/acceptance/test_failing_work_units.rb
 test/acceptance/test_word_count.rb
 test/blueprints.rb
@@ -106,6 +107,6 @@ test/unit/test_action.rb
 test/unit/test_configuration.rb
 test/unit/test_job.rb
 test/unit/test_work_unit.rb
-views/index.erb
+views/operations_center.erb
 )
 end

data/config/config.example.ru CHANGED

@@ -4,7 +4,13 @@
 # using any Rack-compliant server handler. For example, start up three servers
 # with a specified port number, using Thin:
 #
-# thin start -R config.ru -p 9173 --servers 3
+# thin start -R config.ru --servers 3
+#
+# Or a single server with Unicorn:
+#
+# unicorn config.ru
+#
 require 'rubygems'
 require 'cloud-crowd'
@@ -13,5 +19,5 @@ CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
 CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
 map '/' do
-  run CloudCrowd::App
+  run CloudCrowd::Server
 end

data/config/config.example.yml CHANGED

@@ -1,48 +1,44 @@
 # The URL where you're planning on running the central server/queue/database.
-:central_server:          http://localhost:9173
+:central_server:      http://localhost:9173
+# Set the maximum number of workers allowed per-node. Workers only run while
+# there's work to be done. It's best to set 'max_workers' below the point where
+# you'd start to swap or peg your CPU (as determined by experiment).
+:max_workers:         5
 # The storage back-end that you'd like to use for intermediate and final results
 # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
-# be used in development, or on single-machine installations.
-:storage:                 s3
+# be used in development, on single-machine installations, or on networked drives.
+:storage:             s3
 # Please provide your AWS credentials for S3 storage of job output.
-:aws_access_key:          [your AWS access key]
-:aws_secret_key:          [your AWS secret access key]
+:aws_access_key:      [your AWS access key]
+:aws_secret_key:      [your AWS secret access key]
 # Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
 # to keep all resulting files on S3 private. If so, you'll receive authenticated
 # S3 URLs as job output, good for 24 hours. If left public, you'll get the
 # straight URLs to the files on S3.
-:s3_bucket:               [your CloudCrowd bucket]
-:use_s3_authentication:   no
+:s3_bucket:           [your CloudCrowd bucket]
+:s3_authentication:   no
+# If you're using the 'filesystem' storage, perhaps with an NFS share or
+# something similar, all files will be saved inside of the 'local_storage_path'.
+# The default value if left unspecified is '/tmp/cloud_crowd_storage'.
+:local_storage_path:  /tmp/cloud_crowd_storage
 # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
 # to the central server). If yes, specify the login and password that all
 # requests must provide for authentication.
-:use_http_authentication: no
-:login:                   [your login name]
-:password:                [your password]
+:http_authentication: no
+:login:               [your login name]
+:password:            [your password]
 # By default, CloudCrowd looks for installed actions inside the 'actions'
 # subdirectory of this configuration folder. 'actions_path' allows you to load
 # additional actions from a location of your choice.
 # :actions_path: /path/to/actions
-# Set the following numbers to tweak the configuration of your worker daemons.
-# Optimum results will depend on proportion of the Memory/CPU/IO bottlenecks
-# in your actions, the number of central servers you have running, and your
-# desired balance between latency and traffic.
-# The number of workers that `crowd workers start` spins up.
-:num_workers:             3
-# The minimum number of seconds a worker waits between checking the job queue.
-:min_worker_wait:         1
-# The maximum number of seconds a worker waits between checking the job queue.
-:max_worker_wait:         5
 # The number of separate attempts that will be made to process an individual
 # work unit, before marking it as having failed.
-:work_unit_retries:       3
+:work_unit_retries:   3

data/examples/process_pdfs_example.rb CHANGED

@@ -17,7 +17,7 @@ RestClient.post('http://localhost:9173/jobs',
       'http://tigger.uic.edu/~victor/personal/futurism.pdf',
       'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
       'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
-      'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
+      'http://www.pitt.edu/~slavic/sisc/SISC4/dadswell.pdf'
     ],
     'options' => {

data/examples/word_count_example.rb CHANGED

@@ -39,3 +39,4 @@ RestClient.post('http://localhost:9173/jobs',
 )
 # With 23 Workers running, and over Wifi, it counted all the words in 5.5 secs.
+# On a fast internet connection, you may not even see this job show up.

data/lib/cloud-crowd.rb CHANGED

@@ -5,16 +5,15 @@ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
 # Common Gems:
 require 'rubygems'
 gem 'activerecord'
-gem 'daemons'
 gem 'json'
 gem 'rest-client'
 gem 'right_aws'
 gem 'sinatra'
+gem 'thin'
 # Autoloading for all the pieces which may or may not be needed:
 autoload :ActiveRecord, 'activerecord'
 autoload :Benchmark,    'benchmark'
-autoload :Daemons,      'daemons'
 autoload :Digest,       'digest'
 autoload :ERB,          'erb'
 autoload :FileUtils,    'fileutils'
@@ -23,6 +22,7 @@ autoload :RestClient,   'restclient'
 autoload :RightAws,     'right_aws'
 autoload :Sinatra,      'sinatra'
 autoload :Socket,       'socket'
+autoload :Thin,         'thin'
 autoload :YAML,         'yaml'
 # Common code which should really be required in every circumstance.
@@ -30,47 +30,50 @@ require 'cloud_crowd/exceptions'
 module CloudCrowd
-  # Autoload all the CloudCrowd classes which may not be required.
-  autoload :App,          'cloud_crowd/app'
+  # Autoload all the CloudCrowd internals.
   autoload :Action,       'cloud_crowd/action'
   autoload :AssetStore,   'cloud_crowd/asset_store'
   autoload :Helpers,      'cloud_crowd/helpers'
   autoload :Inflector,    'cloud_crowd/inflector'
   autoload :Job,          'cloud_crowd/models'
+  autoload :Node,         'cloud_crowd/node'
+  autoload :NodeRecord,   'cloud_crowd/models'
+  autoload :Server,       'cloud_crowd/server'
   autoload :Worker,       'cloud_crowd/worker'
   autoload :WorkUnit,     'cloud_crowd/models'
-  autoload :WorkerRecord, 'cloud_crowd/models'
-  # Root directory of the CloudCrowd gem.
-  ROOT        = File.expand_path(File.dirname(__FILE__) + '/..')
+  # Keep this version in sync with the gemspec.
+  VERSION        = '0.2.0'
+  # Increment the schema version when there's a backwards incompatible change.
+  SCHEMA_VERSION = 2
-  # Keep the version in sync with the gemspec.
-  VERSION     = '0.1.0'
+  # Root directory of the CloudCrowd gem.
+  ROOT           = File.expand_path(File.dirname(__FILE__) + '/..')
-  # A Job is processing if its WorkUnits in the queue to be handled by workers.
-  PROCESSING  = 1
+  # A Job is processing if its WorkUnits are in the queue to be handled by nodes.
+  PROCESSING     = 1
   # A Job has succeeded if all of its WorkUnits have finished successfully.
-  SUCCEEDED   = 2
+  SUCCEEDED      = 2
   # A Job has failed if even a single one of its WorkUnits has failed (they may
   # be attempted multiple times on failure, however).
-  FAILED      = 3
+  FAILED         = 3
   # A Job is splitting if it's in the process of dividing its inputs up into
   # multiple WorkUnits.
-  SPLITTING   = 4
+  SPLITTING      = 4
   # A Job is merging if it's busy collecting all of its successful WorkUnits
   # back together into the final result.
-  MERGING     = 5
+  MERGING        = 5
-  # A work unit is considered to be complete if it succeeded or if it failed.
-  COMPLETE    = [SUCCEEDED, FAILED]
+  # A Job is considered to be complete if it succeeded or if it failed.
+  COMPLETE       = [SUCCEEDED, FAILED]
-  # A work unit is considered incomplete if it's being processed, split up or
-  # merged together.
-  INCOMPLETE  = [PROCESSING, SPLITTING, MERGING]
+  # A Job is considered incomplete if it's being processed, split up or merged.
+  INCOMPLETE     = [PROCESSING, SPLITTING, MERGING]
   # Mapping of statuses to their display strings.
   DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
@@ -87,18 +90,34 @@ module CloudCrowd
     # Configure the CloudCrowd central database (and connect to it), by passing
     # in a path to <tt>database.yml</tt>. The file should use the standard
     # ActiveRecord connection format.
-    def configure_database(config_path)
+    def configure_database(config_path, validate_schema=true)
       configuration = YAML.load_file(config_path)
       ActiveRecord::Base.establish_connection(configuration)
+      if validate_schema
+        version = ActiveRecord::Base.connection.select_values('select max(version) from schema_migrations').first.to_i
+        return true if version == SCHEMA_VERSION
+        puts "Your database schema is out of date. Please use `crowd load_schema` to update it. This will wipe all the tables, so make sure that your jobs have a chance to finish first.\nexiting..."
+        exit
+      end
     end
-    # Get a reference to the central server, including authentication,
-    # if configured.
+    # Get a reference to the central server, including authentication if
+    # configured.
     def central_server
-      return @central_server if @central_server
-      params = [CloudCrowd.config[:central_server]]
-      params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
-      @central_server = RestClient::Resource.new(*params)
+      @central_server ||= RestClient::Resource.new(CloudCrowd.config[:central_server], CloudCrowd.client_options)
+    end
+    # The standard RestClient options for the central server talking to nodes,
+    # as well as the other way around. There's a timeout of 5 seconds to open
+    # a connection, and a timeout of 30 to finish reading it.
+    def client_options
+      return @client_options if @client_options
+      @client_options = {:timeout => 30, :open_timeout => 5}
+      if CloudCrowd.config[:http_authentication]
+        @client_options[:user]      = CloudCrowd.config[:login]
+        @client_options[:password]  = CloudCrowd.config[:password]
+      end
+      @client_options
     end
     # Return the displayable status name of an internal CloudCrowd status number.
@@ -110,7 +129,7 @@ module CloudCrowd
     # CloudCrowd::Actions are requested dynamically by name. Access them through
     # this actions property, which behaves like a hash. At load time, we
     # load all installed Actions and CloudCrowd's default Actions into it.
-    # If you wish to have certain workers be specialized to only handle certain
+    # If you wish to have certain nodes be specialized to only handle certain
     # Actions, then install only those into the actions directory.
     def actions
       return @actions if @actions

data/lib/cloud_crowd/action.rb CHANGED

@@ -38,13 +38,19 @@ module CloudCrowd
     # Download a file to the specified path.
     def download(url, path)
-      if url.match(FILE_URL)
-        FileUtils.cp(url.sub(FILE_URL, ''), path)
-      else
-        resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
-        FileUtils.mv resp.file.path, path
-      end
-      path
+      `curl -s "#{url}" > "#{path}"`
+      return path
+      # The previous implementation is below, and, although it would be
+      # wonderful not to shell out, RestClient wasn't handling URLs with encoded
+      # entities (%20, for example), and doesn't let you download to a given
+      # location. Getting a RestClient patch in would be ideal.
+      #
+      # if url.match(FILE_URL)
+      #   FileUtils.cp(url.sub(FILE_URL, ''), path)
+      # else
+      #   resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
+      #   FileUtils.mv resp.file.path, path
+      # end
     end
     # Takes a local filesystem path, saves the file to S3, and returns the
@@ -55,7 +61,7 @@ module CloudCrowd
     end
     # After the Action has finished, we remove the work directory and return
-    # to the root directory (where daemons run by default).
+    # to the root directory (where workers run by default).
     def cleanup_work_directory
       FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
     end

data/lib/cloud_crowd/asset_store.rb CHANGED

@@ -3,18 +3,18 @@ require 'tmpdir'
 module CloudCrowd
   # The AssetStore provides a common API for storing files and returning URLs
-  # that can access them. In production this will be S3 but in development
-  # it may be the filesystem.
+  # that can access them. At the moment, the files can be saved to either S3 or
+  # the local filesystem. You shouldn't need to use the AssetStore directly --
+  # Action's +download+ and +save+ methods use it behind the scenes.
   #
-  # You shouldn't need to use the AssetStore directly -- Action's +download+
-  # and +save+ methods use it behind the scenes.
+  # To implement a new back-end for the AssetStore, you must provide
+  # <tt>save(local_path, save_path)</tt>, <tt>cleanup(job)</tt>, and optionally,
+  # a <tt>setup</tt> method that will be called once at initialization.
   class AssetStore
     autoload :S3Store,         'cloud_crowd/asset_store/s3_store'
     autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
-    LOCAL_STORAGE_PATH = '/tmp/cloud_crowd_storage'
     # Configure the AssetStore with the specific storage implementation
     # specified by 'storage' in <tt>config.yml</tt>.
     case CloudCrowd.config[:storage]
@@ -25,9 +25,9 @@ module CloudCrowd
     # Creating the AssetStore ensures that its scratch directory exists.
     def initialize
-      @use_auth = CloudCrowd.config[:use_s3_authentication]
       FileUtils.mkdir_p temp_storage_path unless File.exists? temp_storage_path
       raise Error::StorageNotWritable, "#{temp_storage_path} is not writable" unless File.writable?(temp_storage_path)
+      setup if respond_to? :setup
     end
     # Get the path to CloudCrowd's temporary local storage. All actions run