RubyGems - cloud-crowd - Versions diffs - 0.1.0 → 0.2.0 - Mend

cloud-crowd 0.1.0 → 0.2.0

Files changed (42) hide show

data/README +16 -16
data/cloud-crowd.gemspec +10 -9
data/config/config.example.ru +8 -2
data/config/config.example.yml +21 -25
data/examples/process_pdfs_example.rb +1 -1
data/examples/word_count_example.rb +1 -0
data/lib/cloud-crowd.rb +47 -28
data/lib/cloud_crowd/action.rb +14 -8
data/lib/cloud_crowd/asset_store.rb +8 -8
data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
data/lib/cloud_crowd/command_line.rb +24 -58
data/lib/cloud_crowd/exceptions.rb +7 -0
data/lib/cloud_crowd/helpers/authorization.rb +5 -3
data/lib/cloud_crowd/helpers/resources.rb +0 -20
data/lib/cloud_crowd/models.rb +1 -1
data/lib/cloud_crowd/models/job.rb +37 -40
data/lib/cloud_crowd/models/node_record.rb +95 -0
data/lib/cloud_crowd/models/work_unit.rb +87 -33
data/lib/cloud_crowd/node.rb +105 -0
data/lib/cloud_crowd/schema.rb +22 -18
data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
data/lib/cloud_crowd/worker.rb +68 -107
data/public/css/admin_console.css +40 -18
data/public/images/server.png +0 -0
data/public/images/server_busy.png +0 -0
data/public/js/admin_console.js +47 -18
data/test/acceptance/test_failing_work_units.rb +1 -1
data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
data/test/acceptance/test_word_count.rb +3 -9
data/test/blueprints.rb +0 -1
data/test/config/config.ru +1 -1
data/test/config/config.yml +2 -4
data/test/unit/test_action.rb +1 -1
data/test/unit/test_configuration.rb +1 -1
data/test/unit/test_job.rb +3 -0
data/test/unit/test_work_unit.rb +2 -4
data/views/{index.erb → operations_center.erb} +13 -8
metadata +11 -10
data/lib/cloud_crowd/daemon.rb +0 -95
data/lib/cloud_crowd/models/worker_record.rb +0 -61
data/lib/cloud_crowd/runner.rb +0 -15

data/README CHANGED

@@ -26,7 +26,7 @@
     * Parallel processing for the rest of us
     * Write your scripts in Ruby
-    * Built for Amazon EC2 and S3
+    * Works with Amazon EC2 and S3
     * split -> process -> merge
     * As easy as `gem install cloud-crowd`
@@ -63,31 +63,31 @@
     # Edit the configuration files to your satisfaction, add AWS credentials,
     # and then load the CloudCrowd schema into your configured database.
-      >> mate ~/config/cloud-crowd/config.yml
-      >> mate ~/config/cloud-crowd/database.yml
+      >> cd ~/config/cloud-crowd
+      >> mate config.yml
+      >> mate database.yml
+      >> [create the database you just configured...]
       >> crowd load_schema
     # Write your actions, and install them into the 'actions' subdirectory.
-    # CloudCrowd comes with some default actions as an example.
+    # CloudCrowd comes with a few default actions as examples.
     # To launch the central server (make sure that you include its location
-    # in config.yml), either:
+    # in config.yml):
       >> crowd server
-    # or:
+    # The configuration folder also includes 'config.ru', which can be used by
+    # any Rack-compliant webserver to run your central server.
-      >> thin -R config.ru --servers 3 -e production start
+    # Then, to launch a node of workers:
-    # Any server that supports Rack should work with the rackup file.
+      >> crowd node
-    # Then, to spin up 10 workers:
+    # To spin up remote nodes, install the 'cloud-crowd' gem and copy over
+    # your configuration directory. Run `crowd node`, and the remote machines
+    # will register with the central server, becoming available for processing.
-      >> crowd workers start -n 10
-    # To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
-    # your configuration directory.
-    # At this point you can visit your server console at localhost:9173 to
-    # view all of your workers, ready for action.
+    # At this point you can visit your Operations Center at localhost:9173 to
+    # view all of your nodes, ready for action.

data/cloud-crowd.gemspec CHANGED

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'cloud-crowd'
-  s.version   = '0.1.0'         # Keep version in sync with cloud-cloud.rb
-  s.date      = '2009-09-14'
+  s.version   = '0.2.0'         # Keep version in sync with cloud-crowd.rb
+  s.date      = '2009-09-17'
   s.homepage    = "http://wiki.github.com/documentcloud/cloud-crowd"
   s.summary     = "Parallel Processing for the Rest of Us"
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
   s.add_dependency 'json',          ['>= 1.1.7']
   s.add_dependency 'rest-client',   ['>= 1.0.3']
   s.add_dependency 'right_aws',     ['>= 1.10.0']
-  s.add_dependency 'daemons',       ['>= 1.0.10']
+  s.add_dependency 'thin',          ['>= 1.2.4']
   if s.respond_to?(:add_development_dependency)
     s.add_development_dependency 'faker',               ['>= 0.3.1']
@@ -56,23 +56,22 @@ examples/process_pdfs_example.rb
 examples/word_count_example.rb
 lib/cloud-crowd.rb
 lib/cloud_crowd/action.rb
-lib/cloud_crowd/app.rb
 lib/cloud_crowd/asset_store/filesystem_store.rb
 lib/cloud_crowd/asset_store/s3_store.rb
 lib/cloud_crowd/asset_store.rb
 lib/cloud_crowd/command_line.rb
-lib/cloud_crowd/daemon.rb
 lib/cloud_crowd/exceptions.rb
 lib/cloud_crowd/helpers/authorization.rb
 lib/cloud_crowd/helpers/resources.rb
 lib/cloud_crowd/helpers.rb
 lib/cloud_crowd/inflector.rb
 lib/cloud_crowd/models/job.rb
+lib/cloud_crowd/models/node_record.rb
 lib/cloud_crowd/models/work_unit.rb
-lib/cloud_crowd/models/worker_record.rb
 lib/cloud_crowd/models.rb
-lib/cloud_crowd/runner.rb
+lib/cloud_crowd/node.rb
 lib/cloud_crowd/schema.rb
+lib/cloud_crowd/server.rb
 lib/cloud_crowd/worker.rb
 LICENSE
 public/css/admin_console.css
@@ -83,6 +82,8 @@ public/images/cloud_hand.png
 public/images/header_back.png
 public/images/logo.png
 public/images/queue_fill.png
+public/images/server.png
+public/images/server_busy.png
 public/images/server_error.png
 public/images/sidebar_bottom.png
 public/images/sidebar_top.png
@@ -93,7 +94,7 @@ public/js/excanvas.js
 public/js/flot.js
 public/js/jquery.js
 README
-test/acceptance/test_app.rb
+test/acceptance/test_server.rb
 test/acceptance/test_failing_work_units.rb
 test/acceptance/test_word_count.rb
 test/blueprints.rb
@@ -106,6 +107,6 @@ test/unit/test_action.rb
 test/unit/test_configuration.rb
 test/unit/test_job.rb
 test/unit/test_work_unit.rb
-views/index.erb
+views/operations_center.erb
 )
 end

data/config/config.example.ru CHANGED

@@ -4,7 +4,13 @@
 # using any Rack-compliant server handler. For example, start up three servers
 # with a specified port number, using Thin:
 #
-# thin start -R config.ru -p 9173 --servers 3
+# thin start -R config.ru --servers 3
+#
+# Or a single server with Unicorn:
+#
+# unicorn config.ru
+#
 require 'rubygems'
 require 'cloud-crowd'
@@ -13,5 +19,5 @@ CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
 CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
 map '/' do
-  run CloudCrowd::App
+  run CloudCrowd::Server
 end

data/config/config.example.yml CHANGED

@@ -1,48 +1,44 @@
 # The URL where you're planning on running the central server/queue/database.
-:central_server:          http://localhost:9173
+:central_server:      http://localhost:9173
+# Set the maximum number of workers allowed per-node. Workers only run while
+# there's work to be done. It's best to set 'max_workers' below the point where
+# you'd start to swap or peg your CPU (as determined by experiment).
+:max_workers:         5
 # The storage back-end that you'd like to use for intermediate and final results
 # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
-# be used in development, or on single-machine installations.
-:storage:                 s3
+# be used in development, on single-machine installations, or on networked drives.
+:storage:             s3
 # Please provide your AWS credentials for S3 storage of job output.
-:aws_access_key:          [your AWS access key]
-:aws_secret_key:          [your AWS secret access key]
+:aws_access_key:      [your AWS access key]
+:aws_secret_key:      [your AWS secret access key]
 # Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
 # to keep all resulting files on S3 private. If so, you'll receive authenticated
 # S3 URLs as job output, good for 24 hours. If left public, you'll get the
 # straight URLs to the files on S3.
-:s3_bucket:               [your CloudCrowd bucket]
-:use_s3_authentication:   no
+:s3_bucket:           [your CloudCrowd bucket]
+:s3_authentication:   no
+# If you're using the 'filesystem' storage, perhaps with an NFS share or
+# something similar, all files will be saved inside of the 'local_storage_path'.
+# The default value if left unspecified is '/tmp/cloud_crowd_storage'.
+:local_storage_path:  /tmp/cloud_crowd_storage
 # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
 # to the central server). If yes, specify the login and password that all
 # requests must provide for authentication.
-:use_http_authentication: no
-:login:                   [your login name]
-:password:                [your password]
+:http_authentication: no
+:login:               [your login name]
+:password:            [your password]
 # By default, CloudCrowd looks for installed actions inside the 'actions'
 # subdirectory of this configuration folder. 'actions_path' allows you to load
 # additional actions from a location of your choice.
 # :actions_path: /path/to/actions
-# Set the following numbers to tweak the configuration of your worker daemons.
-# Optimum results will depend on proportion of the Memory/CPU/IO bottlenecks
-# in your actions, the number of central servers you have running, and your
-# desired balance between latency and traffic.
-# The number of workers that `crowd workers start` spins up.
-:num_workers:             3
-# The minimum number of seconds a worker waits between checking the job queue.
-:min_worker_wait:         1
-# The maximum number of seconds a worker waits between checking the job queue.
-:max_worker_wait:         5
 # The number of separate attempts that will be made to process an individual
 # work unit, before marking it as having failed.
-:work_unit_retries:       3
+:work_unit_retries:   3

data/examples/process_pdfs_example.rb CHANGED

@@ -17,7 +17,7 @@ RestClient.post('http://localhost:9173/jobs',
       'http://tigger.uic.edu/~victor/personal/futurism.pdf',
       'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
       'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
-      'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
+      'http://www.pitt.edu/~slavic/sisc/SISC4/dadswell.pdf'
     ],
     'options' => {

data/examples/word_count_example.rb CHANGED

@@ -39,3 +39,4 @@ RestClient.post('http://localhost:9173/jobs',
 )
 # With 23 Workers running, and over Wifi, it counted all the words in 5.5 secs.
+# On a fast internet connection, you may not even see this job show up.

data/lib/cloud-crowd.rb CHANGED

@@ -5,16 +5,15 @@ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
 # Common Gems:
 require 'rubygems'
 gem 'activerecord'
-gem 'daemons'
 gem 'json'
 gem 'rest-client'
 gem 'right_aws'
 gem 'sinatra'
+gem 'thin'
 # Autoloading for all the pieces which may or may not be needed:
 autoload :ActiveRecord, 'activerecord'
 autoload :Benchmark,    'benchmark'
-autoload :Daemons,      'daemons'
 autoload :Digest,       'digest'
 autoload :ERB,          'erb'
 autoload :FileUtils,    'fileutils'
@@ -23,6 +22,7 @@ autoload :RestClient,   'restclient'
 autoload :RightAws,     'right_aws'
 autoload :Sinatra,      'sinatra'
 autoload :Socket,       'socket'
+autoload :Thin,         'thin'
 autoload :YAML,         'yaml'
 # Common code which should really be required in every circumstance.
@@ -30,47 +30,50 @@ require 'cloud_crowd/exceptions'
 module CloudCrowd
-  # Autoload all the CloudCrowd classes which may not be required.
-  autoload :App,          'cloud_crowd/app'
+  # Autoload all the CloudCrowd internals.
   autoload :Action,       'cloud_crowd/action'
   autoload :AssetStore,   'cloud_crowd/asset_store'
   autoload :Helpers,      'cloud_crowd/helpers'
   autoload :Inflector,    'cloud_crowd/inflector'
   autoload :Job,          'cloud_crowd/models'
+  autoload :Node,         'cloud_crowd/node'
+  autoload :NodeRecord,   'cloud_crowd/models'
+  autoload :Server,       'cloud_crowd/server'
   autoload :Worker,       'cloud_crowd/worker'
   autoload :WorkUnit,     'cloud_crowd/models'
-  autoload :WorkerRecord, 'cloud_crowd/models'
-  # Root directory of the CloudCrowd gem.
-  ROOT        = File.expand_path(File.dirname(__FILE__) + '/..')
+  # Keep this version in sync with the gemspec.
+  VERSION        = '0.2.0'
+  # Increment the schema version when there's a backwards incompatible change.
+  SCHEMA_VERSION = 2
-  # Keep the version in sync with the gemspec.
-  VERSION     = '0.1.0'
+  # Root directory of the CloudCrowd gem.
+  ROOT           = File.expand_path(File.dirname(__FILE__) + '/..')
-  # A Job is processing if its WorkUnits in the queue to be handled by workers.
-  PROCESSING  = 1
+  # A Job is processing if its WorkUnits are in the queue to be handled by nodes.
+  PROCESSING     = 1
   # A Job has succeeded if all of its WorkUnits have finished successfully.
-  SUCCEEDED   = 2
+  SUCCEEDED      = 2
   # A Job has failed if even a single one of its WorkUnits has failed (they may
   # be attempted multiple times on failure, however).
-  FAILED      = 3
+  FAILED         = 3
   # A Job is splitting if it's in the process of dividing its inputs up into
   # multiple WorkUnits.
-  SPLITTING   = 4
+  SPLITTING      = 4
   # A Job is merging if it's busy collecting all of its successful WorkUnits
   # back together into the final result.
-  MERGING     = 5
+  MERGING        = 5
-  # A work unit is considered to be complete if it succeeded or if it failed.
-  COMPLETE    = [SUCCEEDED, FAILED]
+  # A Job is considered to be complete if it succeeded or if it failed.
+  COMPLETE       = [SUCCEEDED, FAILED]
-  # A work unit is considered incomplete if it's being processed, split up or
-  # merged together.
-  INCOMPLETE  = [PROCESSING, SPLITTING, MERGING]
+  # A Job is considered incomplete if it's being processed, split up or merged.
+  INCOMPLETE     = [PROCESSING, SPLITTING, MERGING]
   # Mapping of statuses to their display strings.
   DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
@@ -87,18 +90,34 @@ module CloudCrowd
     # Configure the CloudCrowd central database (and connect to it), by passing
     # in a path to <tt>database.yml</tt>. The file should use the standard
     # ActiveRecord connection format.
-    def configure_database(config_path)
+    def configure_database(config_path, validate_schema=true)
       configuration = YAML.load_file(config_path)
       ActiveRecord::Base.establish_connection(configuration)
+      if validate_schema
+        version = ActiveRecord::Base.connection.select_values('select max(version) from schema_migrations').first.to_i
+        return true if version == SCHEMA_VERSION
+        puts "Your database schema is out of date. Please use `crowd load_schema` to update it. This will wipe all the tables, so make sure that your jobs have a chance to finish first.\nexiting..."
+        exit
+      end
     end
-    # Get a reference to the central server, including authentication,
-    # if configured.
+    # Get a reference to the central server, including authentication if
+    # configured.
     def central_server
-      return @central_server if @central_server
-      params = [CloudCrowd.config[:central_server]]
-      params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
-      @central_server = RestClient::Resource.new(*params)
+      @central_server ||= RestClient::Resource.new(CloudCrowd.config[:central_server], CloudCrowd.client_options)
+    end
+    # The standard RestClient options for the central server talking to nodes,
+    # as well as the other way around. There's a timeout of 5 seconds to open
+    # a connection, and a timeout of 30 to finish reading it.
+    def client_options
+      return @client_options if @client_options
+      @client_options = {:timeout => 30, :open_timeout => 5}
+      if CloudCrowd.config[:http_authentication]
+        @client_options[:user]      = CloudCrowd.config[:login]
+        @client_options[:password]  = CloudCrowd.config[:password]
+      end
+      @client_options
     end
     # Return the displayable status name of an internal CloudCrowd status number.
@@ -110,7 +129,7 @@ module CloudCrowd
     # CloudCrowd::Actions are requested dynamically by name. Access them through
     # this actions property, which behaves like a hash. At load time, we
     # load all installed Actions and CloudCrowd's default Actions into it.
-    # If you wish to have certain workers be specialized to only handle certain
+    # If you wish to have certain nodes be specialized to only handle certain
     # Actions, then install only those into the actions directory.
     def actions
       return @actions if @actions

data/lib/cloud_crowd/action.rb CHANGED

@@ -38,13 +38,19 @@ module CloudCrowd
     # Download a file to the specified path.
     def download(url, path)
-      if url.match(FILE_URL)
-        FileUtils.cp(url.sub(FILE_URL, ''), path)
-      else
-        resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
-        FileUtils.mv resp.file.path, path
-      end
-      path
+      `curl -s "#{url}" > "#{path}"`
+      return path
+      # The previous implementation is below, and, although it would be
+      # wonderful not to shell out, RestClient wasn't handling URLs with encoded
+      # entities (%20, for example), and doesn't let you download to a given
+      # location. Getting a RestClient patch in would be ideal.
+      #
+      # if url.match(FILE_URL)
+      #   FileUtils.cp(url.sub(FILE_URL, ''), path)
+      # else
+      #   resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
+      #   FileUtils.mv resp.file.path, path
+      # end
     end
     # Takes a local filesystem path, saves the file to S3, and returns the
@@ -55,7 +61,7 @@ module CloudCrowd
     end
     # After the Action has finished, we remove the work directory and return
-    # to the root directory (where daemons run by default).
+    # to the root directory (where workers run by default).
     def cleanup_work_directory
       FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
     end

data/lib/cloud_crowd/asset_store.rb CHANGED

@@ -3,18 +3,18 @@ require 'tmpdir'
 module CloudCrowd
   # The AssetStore provides a common API for storing files and returning URLs
-  # that can access them. In production this will be S3 but in development
-  # it may be the filesystem.
+  # that can access them. At the moment, the files can be saved to either S3 or
+  # the local filesystem. You shouldn't need to use the AssetStore directly --
+  # Action's +download+ and +save+ methods use it behind the scenes.
   #
-  # You shouldn't need to use the AssetStore directly -- Action's +download+
-  # and +save+ methods use it behind the scenes.
+  # To implement a new back-end for the AssetStore, you must provide
+  # <tt>save(local_path, save_path)</tt>, <tt>cleanup(job)</tt>, and optionally,
+  # a <tt>setup</tt> method that will be called once at initialization.
   class AssetStore
     autoload :S3Store,         'cloud_crowd/asset_store/s3_store'
     autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
-    LOCAL_STORAGE_PATH = '/tmp/cloud_crowd_storage'
     # Configure the AssetStore with the specific storage implementation
     # specified by 'storage' in <tt>config.yml</tt>.
     case CloudCrowd.config[:storage]
@@ -25,9 +25,9 @@ module CloudCrowd
     # Creating the AssetStore ensures that its scratch directory exists.
     def initialize
-      @use_auth = CloudCrowd.config[:use_s3_authentication]
       FileUtils.mkdir_p temp_storage_path unless File.exists? temp_storage_path
       raise Error::StorageNotWritable, "#{temp_storage_path} is not writable" unless File.writable?(temp_storage_path)
+      setup if respond_to? :setup
     end
     # Get the path to CloudCrowd's temporary local storage. All actions run