RubyGems - cloud-crowd - Versions diffs - 0.3.3 → 0.4.0 - Mend

cloud-crowd 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

data/cloud-crowd.gemspec +6 -6
data/config/config.example.yml +23 -10
data/lib/cloud-crowd.rb +4 -4
data/lib/cloud_crowd/action.rb +24 -23
data/lib/cloud_crowd/asset_store.rb +3 -1
data/lib/cloud_crowd/asset_store/cloudfiles_store.rb +41 -0
data/lib/cloud_crowd/asset_store/s3_store.rb +9 -7
data/lib/cloud_crowd/models/node_record.rb +27 -26
data/lib/cloud_crowd/models/work_unit.rb +35 -28
data/lib/cloud_crowd/node.rb +43 -43
data/lib/cloud_crowd/schema.rb +7 -7
data/lib/cloud_crowd/server.rb +35 -30
data/public/css/admin_console.css +25 -62
data/public/js/admin_console.js +53 -70
data/test/acceptance/test_server.rb +14 -16
data/test/unit/test_action.rb +17 -15
data/views/operations_center.erb +26 -13
metadata +94 -59

data/cloud-crowd.gemspec CHANGED

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'cloud-crowd'
-  s.version   = '0.3.3'         # Keep version in sync with cloud-cloud.rb
-  s.date      = '2010-01-27'
+  s.version   = '0.4.0'         # Keep version in sync with cloud-crowd.rb
+  s.date      = '2010-03-31'
   s.homepage    = "http://wiki.github.com/documentcloud/cloud-crowd"
   s.summary     = "Parallel Processing for the Rest of Us"
@@ -27,11 +27,10 @@ Gem::Specification.new do |s|
                          '--main'     << 'README' <<
                          '--all'
-  s.add_dependency 'sinatra',       ['>= 0.9.4']
-  s.add_dependency 'activerecord',  ['>= 2.3.3']
+  s.add_dependency 'sinatra',       ['~> 0.9']
+  s.add_dependency 'activerecord',  ['~> 2.3']
   s.add_dependency 'json',          ['>= 1.1.7']
-  s.add_dependency 'rest-client',   ['>= 1.0.3']
-  s.add_dependency 'right_aws',     ['>= 1.10.0']
+  s.add_dependency 'rest-client',   ['>= 1.4']
   s.add_dependency 'thin',          ['>= 1.2.4']
   if s.respond_to?(:add_development_dependency)
@@ -58,6 +57,7 @@ lib/cloud-crowd.rb
 lib/cloud_crowd/action.rb
 lib/cloud_crowd/asset_store/filesystem_store.rb
 lib/cloud_crowd/asset_store/s3_store.rb
+lib/cloud_crowd/asset_store/cloudfiles_store.rb
 lib/cloud_crowd/asset_store.rb
 lib/cloud_crowd/command_line.rb
 lib/cloud_crowd/exceptions.rb

data/config/config.example.yml CHANGED

@@ -2,20 +2,21 @@
 :central_server:      http://localhost:9173
 # The following settings allow you to control the number of workers that can run
-# on a given node, to prevent the node from becoming overloaded. 'max_workers'
+# on a given node, to prevent the node from becoming overloaded. 'max_workers'
 # is a simple cap on the maximum number of workers a node is allowed to run
 # concurrently. 'max_load' is the maximum (one-minute) load average, above which
 # a node will refuse to take new work. 'min_free_memory' is the minimum amount
-# of free RAM (in megabytes) a node is allowed to have, below which no new
+# of free RAM (in megabytes) a node is allowed to have, below which no new
 # workers are run. These settings may be used in any combination.
 :max_workers:         5
 # :max_load:            5.0
 # :min_free_memory:     150
 # The storage back-end that you'd like to use for intermediate and final results
-# of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
-# be used in development, on single-machine installations, or networked drives.
-# If you *are* developing an action, filesystem is certainly faster and easier.
+# of processing. 's3', 'filesystem', and 'cloudfiles' are supported.
+# 'filesystem' should only be used in development, on single-machine installations,
+# or networked drives. If you *are* developing an action, filesystem is certainly
+# faster and easier.
 :storage:             s3
 # Please provide your AWS credentials for S3 storage of job output.
@@ -29,22 +30,34 @@
 :s3_bucket:           [your CloudCrowd bucket]
 :s3_authentication:   no
-# The following settings configure local paths. 'local_storage_path' is the
+# Cloudfiles
+:cloudfiles_username:  [your Rackspace Cloud Files username]
+:cloudfiles_api_key:   [your Rackspace Cloud Files API key]
+:cloudfiles_container: [your Rackspace Cloud Files container]
+# The following settings configure local paths. 'local_storage_path' is the
 # directory in which all files will be saved if you're using the 'filesystem'
-# storage. 'log_path' and 'pid_path' are the directories in which daemonized
-# servers and nodes will store their process ids and log files. The default
+# storage. 'log_path' and 'pid_path' are the directories in which daemonized
+# servers and nodes will store their process ids and log files. The default
 # values are listed.
 # :local_storage_path:  /tmp/cloud_crowd_storage
 # :log_path:            log
 # :pid_path:            tmp/pids
-# Use HTTP Basic Auth for all requests? (Includes all internal worker requests
-# to the central server). If yes, specify the login and password that all
+# Use HTTP Basic Auth for all requests? (Includes all internal worker requests
+# to the central server). If yes, specify the login and password that all
 # requests must provide for authentication.
 :http_authentication: no
 :login:               [your login name]
 :password:            [your password]
+# Disable all the default built-in actions
+# :disable_default_actions: true
+# Disable specific actions for the node
+# Use this if you want to disable a limited number of actions
+# :disabled_actions: ['word_count']
 # By default, CloudCrowd looks for installed actions inside the 'actions'
 # subdirectory of this configuration folder. 'actions_path' allows you to load
 # additional actions from a location of your choice.

data/lib/cloud-crowd.rb CHANGED

@@ -4,10 +4,9 @@ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
 # Common Gems:
 require 'rubygems'
-gem 'activerecord'
+gem 'activerecord', '~> 2.0'
 gem 'json'
 gem 'rest-client'
-gem 'right_aws'
 gem 'sinatra'
 gem 'thin'
@@ -20,6 +19,7 @@ autoload :FileUtils,    'fileutils'
 autoload :JSON,         'json'
 autoload :RestClient,   'rest_client'
 autoload :RightAws,     'right_aws'
+autoload :CloudFiles,   'cloudfiles'
 autoload :Sinatra,      'sinatra'
 autoload :Thin,         'thin'
 autoload :YAML,         'yaml'
@@ -44,7 +44,7 @@ module CloudCrowd
   autoload :WorkUnit,     'cloud_crowd/models'
   # Keep this version in sync with the gemspec.
-  VERSION        = '0.3.3'
+  VERSION        = '0.4.0'
   # Increment the schema version when there's a backwards incompatible change.
   SCHEMA_VERSION = 3
@@ -166,7 +166,7 @@ module CloudCrowd
     # Retrieve the list of every installed Action for this node or server.
     def action_paths
-      default_actions   = Dir["#{ROOT}/actions/*.rb"]
+      default_actions   = config[:disable_default_actions] ? [] : Dir["#{ROOT}/actions/*.rb"]
       installed_actions = Dir["#{@config_path}/actions/*.rb"]
       custom_actions    = CloudCrowd.config[:actions_path] ? Dir["#{CloudCrowd.config[:actions_path]}/*.rb"] : []
       default_actions + installed_actions + custom_actions

data/lib/cloud_crowd/action.rb CHANGED

@@ -1,7 +1,7 @@
 module CloudCrowd
   # As you write your custom actions, have them inherit from CloudCrowd::Action.
-  # All actions must implement a +process+ method, which should return a
+  # All actions must implement a +process+ method, which should return a
   # JSON-serializable object that will be used as the output for the work unit.
   # See the default actions for examples.
   #
@@ -16,11 +16,11 @@ module CloudCrowd
   # Note that Actions inherit a backticks (`) method that raises an Exception
   # if the external command fails.
   class Action
     FILE_URL = /\Afile:\/\//
     attr_reader :input, :input_path, :file_name, :options, :work_directory
     # Initializing an Action sets up all of the read-only variables that
     # form the bulk of the API for action subclasses. (Paths to read from and
     # write to). It creates the +work_directory+ and moves into it.
@@ -34,17 +34,17 @@ module CloudCrowd
       parse_input
       download_input
     end
     # Each Action subclass must implement a +process+ method, overriding this.
     def process
       raise NotImplementedError, "CloudCrowd::Actions must override 'process' with their own processing code."
     end
     # Download a file to the specified path.
     def download(url, path)
       `curl -s "#{url}" > "#{path}"`
       return path
-      # The previous implementation is below, and, although it would be
+      # The previous implementation is below, and, although it would be
       # wonderful not to shell out, RestClient wasn't handling URLs with encoded
       # entities (%20, for example), and doesn't let you download to a given
       # location. Getting a RestClient patch in would be ideal.
@@ -56,21 +56,21 @@ module CloudCrowd
       #   FileUtils.mv resp.file.path, path
       # end
     end
-    # Takes a local filesystem path, saves the file to S3, and returns the
-    # public (or authenticated) url on S3 where the file can be accessed.
+    # Takes a local filesystem path, saves the file to S3, and returns the
+    # public (or authenticated) url on S3 where the file can be accessed.
     def save(file_path)
       save_path = File.join(storage_prefix, File.basename(file_path))
       @store.save(file_path, save_path)
     end
     # After the Action has finished, we remove the work directory and return
     # to the root directory (where workers run by default).
     def cleanup_work_directory
       FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
     end
-    # Actions have a backticks command that raises a CommandFailed exception
+    # Actions have a backticks command that raises a CommandFailed exception
     # on failure, so that processing doesn't just blithely continue.
     def `(command)
       result    = super(command)
@@ -78,17 +78,18 @@ module CloudCrowd
       raise Error::CommandFailed.new(result, exit_code) unless exit_code == 0
       result
     end
     private
     # Convert an unsafe URL into a filesystem-friendly filename.
     def safe_filename(url)
+      url.sub!(/\?.*\Z/, '')
       ext  = File.extname(url)
       name = URI.unescape(File.basename(url)).gsub(/[^a-zA-Z0-9_\-.]/, '-').gsub(/-+/, '-')
       File.basename(name, ext).gsub('.', '-') + ext
     end
     # The directory prefix to use for both local and S3 storage.
     # [action]/job_[job_id]/unit_[work_unit_id]
     def storage_prefix
@@ -98,18 +99,18 @@ module CloudCrowd
       path_parts << "unit_#{@work_unit_id}" if @work_unit_id
       @storage_prefix ||= File.join(path_parts)
     end
     # If we think that the input is JSON, replace it with the parsed form.
     # It would be great if the JSON module had an is_json? method.
     def parse_input
       return unless ['[', '{'].include? @input[0..0]
       @input = JSON.parse(@input) rescue @input
     end
     def input_is_url?
       !URI.parse(@input).scheme.nil? rescue false
     end
     # If the input is a URL, download the file before beginning processing.
     def download_input
       return unless input_is_url?
@@ -119,7 +120,7 @@ module CloudCrowd
         download(@input, @input_path)
       end
     end
   end
 end

data/lib/cloud_crowd/asset_store.rb CHANGED

@@ -14,12 +14,14 @@ module CloudCrowd
     autoload :S3Store,         'cloud_crowd/asset_store/s3_store'
     autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
+    autoload :CloudfilesStore, 'cloud_crowd/asset_store/cloudfiles_store'
     # Configure the AssetStore with the specific storage implementation
     # specified by 'storage' in <tt>config.yml</tt>.
     case CloudCrowd.config[:storage]
-    when 's3'         then include S3Store
     when 'filesystem' then include FilesystemStore
+    when 's3'         then include S3Store
+    when 'cloudfiles' then include CloudfilesStore
     else raise Error::StorageNotFound, "#{CloudCrowd.config[:storage]} is not a valid storage back end"
     end

data/lib/cloud_crowd/asset_store/cloudfiles_store.rb ADDED

@@ -0,0 +1,41 @@
+gem 'cloudfiles'
+module CloudCrowd
+  class AssetStore
+    # The CloudfilesStore is an implementation of an AssetStore that uses a
+    # Rackspace Cloud Files container for all resulting files.
+    module CloudfilesStore
+      # Configure Rackspace Cloud and connect
+      def setup
+        username  = CloudCrowd.config[:cloudfiles_username]
+        api_key   = CloudCrowd.config[:cloudfiles_api_key]
+        container = CloudCrowd.config[:cloudfiles_container]
+        valid_conf  = [username, api_key, container].all? {|s| s.is_a? String }
+        raise Error::MissingConfiguration, "A Rackspace Cloud Files account must be configured in 'config.yml' before 'cloudfiles' storage can be used" unless valid_conf
+        @cloud = CloudFiles::Connection.new(username, api_key)
+        @container = @cloud.container container
+      end
+      # Save a finished file from local storage to Cloud Files.
+      def save(local_path, save_path)
+        object = @container.create_object save_path, true
+        object.load_from_filename local_path
+        object.public_url
+      end
+      # Remove all of a Job's resulting files from Cloud Files, both intermediate and finished.
+      def cleanup(job)
+          @container.objects(:prefix => "#{job.action}/job_#{job.id}").each do |object|
+            begin
+              @container.delete_object object
+            rescue
+              log "failed to delete #{job.action}/job_#{job.id}"
+            end
+          end
+      end
+    end
+  end
+end

data/lib/cloud_crowd/asset_store/s3_store.rb CHANGED

@@ -1,10 +1,12 @@
+gem 'right_aws'
 module CloudCrowd
   class AssetStore
     # The S3Store is an implementation of an AssetStore that uses a bucket
     # on S3 for all resulting files.
     module S3Store
       # Configure authentication and establish a connection to S3, first thing.
       def setup
         @use_auth   = CloudCrowd.config[:s3_authentication]
@@ -18,8 +20,8 @@ module CloudCrowd
         @bucket     = @s3.bucket(bucket_name)
         @bucket     = @s3.bucket(bucket_name, true) unless @bucket
       end
-      # Save a finished file from local storage to S3. Save it publicly unless
+      # Save a finished file from local storage to S3. Save it publicly unless
       # we're configured to use S3 authentication. Authenticated links expire
       # after one day by default.
       def save(local_path, save_path)
@@ -31,13 +33,13 @@ module CloudCrowd
           @bucket.key(save_path).public_link
         end
       end
       # Remove all of a Job's resulting files from S3, both intermediate and finished.
       def cleanup(job)
         @bucket.delete_folder("#{job.action}/job_#{job.id}")
       end
     end
   end
 end

data/lib/cloud_crowd/models/node_record.rb CHANGED

@@ -1,24 +1,25 @@
 module CloudCrowd
-  # A NodeRecord is the central server's record of a Node running remotely. We
+  # A NodeRecord is the central server's record of a Node running remotely. We
   # can use it to assign WorkUnits to the Node, and keep track of its status.
   # When a Node exits, it destroys this record.
   class NodeRecord < ActiveRecord::Base
     has_many :work_units
     validates_presence_of :host, :ip_address, :port, :enabled_actions
     after_destroy :redistribute_work_units
     # Available Nodes haven't used up their maximum number of workers yet.
     named_scope :available, {
       :conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
       :order      => 'updated_at asc'
     }
-    # Register a Node with the central server. Currently this only happens at
-    # Node startup.
+    # Register a Node with the central server. This happens periodically
+    # (once every `Node::CHECK_IN_INTERVAL` seconds). Nodes will be de-registered
+    # if they have not checked in within a reasonable interval.
     def self.check_in(params, request)
       attrs = {
         :ip_address       => request.ip,
@@ -29,15 +30,15 @@ module CloudCrowd
       }
       self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
     end
     # Dispatch a WorkUnit to this node. Places the node back at the end of
     # the rotation. If we fail to send the WorkUnit, we consider the node to be
     # down, and remove this record, freeing up all of its checked-out work units.
-    # If the Node responds that it's overloaded, we mark it as busy. Returns
+    # If the Node responds that it's overloaded, we mark it as busy. Returns
     # true if the WorkUnit was dispatched successfully.
     def send_work_unit(unit)
       result = node['/work'].post(:work_unit => unit.to_json)
-      unit.assign_to(self, JSON.parse(result)['pid'])
+      unit.assign_to(self, JSON.parse(result.body)['pid'])
       touch && true
     rescue RestClient::RequestFailed => e
       raise e unless e.http_code == 503 && e.http_body == Node::OVERLOADED_MESSAGE
@@ -46,45 +47,45 @@ module CloudCrowd
       # Couldn't post to node, assume it's gone away.
       destroy && false
     end
     # What Actions is this Node able to run?
     def actions
       @actions ||= enabled_actions.split(',')
     end
-    # Is this Node too busy for more work? Determined by number of workers, or
+    # Is this Node too busy for more work? Determined by number of workers, or
     # the Node's load average, as configured in config.yml.
     def busy?
       busy || (max_workers && work_units.count >= max_workers)
     end
     # The URL at which this Node may be reached.
     # TODO: Make sure that the host actually has externally accessible DNS.
     def url
       @url ||= "http://#{host}:#{port}"
     end
-    # Keep a RestClient::Resource handy for contacting the Node, including
+    # Keep a RestClient::Resource handy for contacting the Node, including
     # HTTP authentication, if configured.
     def node
       @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
     end
     # The printable status of the Node.
     def display_status
       busy? ? 'busy' : 'available'
     end
     # A list of the process ids of the workers currently being run by the Node.
     def worker_pids
       work_units.all(:select => 'worker_pid').map(&:worker_pid)
     end
     # Release all of this Node's WorkUnits for other nodes to take.
     def release_work_units
       WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
     end
     # The JSON representation of a NodeRecord includes its worker_pids.
     def to_json(opts={})
       { 'host'    => host,
@@ -92,16 +93,16 @@ module CloudCrowd
         'status'  => display_status
       }.to_json
     end
     private
-    # When a Node exits, release its WorkUnits and redistribute them to others.
+    # When a Node exits, release its WorkUnits and redistribute them to others.
     # Redistribute in a separate thread to avoid delaying shutdown.
     def redistribute_work_units
       release_work_units
       Thread.new { WorkUnit.distribute_to_nodes }
     end
   end
 end