RubyGems - cloud-crowd - Versions diffs - 0.4.1 → 0.5.0 - Mend

cloud-crowd 0.4.1 → 0.5.0

Files changed (18)

data/cloud-crowd.gemspec +2 -2
data/config/config.example.yml +6 -67
data/lib/cloud-crowd.rb +5 -4
data/lib/cloud_crowd/action.rb +13 -14
data/lib/cloud_crowd/asset_store/cloudfiles_store.rb +6 -6
data/lib/cloud_crowd/command_line.rb +35 -32
data/lib/cloud_crowd/models/job.rb +1 -1
data/lib/cloud_crowd/models/node_record.rb +3 -1
data/lib/cloud_crowd/models/work_unit.rb +6 -1
data/lib/cloud_crowd/node.rb +6 -4
data/lib/cloud_crowd/schema.rb +2 -1
data/lib/cloud_crowd/server.rb +1 -1
data/public/css/admin_console.css +2 -1
data/public/js/admin_console.js +4 -3
data/test/config/config.yml +1 -1
data/test/unit/test_action.rb +15 -1
data/test/unit/test_node.rb +10 -9
metadata +4 -4

data/cloud-crowd.gemspec CHANGED

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'cloud-crowd'
-  s.version   = '0.4.1'         # Keep version in sync with cloud-cloud.rb
-  s.date      = '2010-04-22'
+  s.version   = '0.5.0'         # Keep version in sync with cloud-cloud.rb
+  s.date      = '2010-06-22'
   s.homepage    = "http://wiki.github.com/documentcloud/cloud-crowd"
   s.summary     = "Parallel Processing for the Rest of Us"

data/config/config.example.yml CHANGED

@@ -1,68 +1,7 @@
-# The URL where you're planning on running the central server/queue/database.
-:central_server:      http://localhost:9173
+# This file configures your CloudCrowd installation, and should be consistent
+# between your server and all of your nodes. For more information, see:
+# http://wiki.github.com/documentcloud/cloud-crowd/the-configuration-folder
-# The following settings allow you to control the number of workers that can run
-# on a given node, to prevent the node from becoming overloaded. 'max_workers'
-# is a simple cap on the maximum number of workers a node is allowed to run
-# concurrently. 'max_load' is the maximum (one-minute) load average, above which
-# a node will refuse to take new work. 'min_free_memory' is the minimum amount
-# of free RAM (in megabytes) a node is allowed to have, below which no new
-# workers are run. These settings may be used in any combination.
-:max_workers:         5
-# :max_load:            5.0
-# :min_free_memory:     150
-# The storage back-end that you'd like to use for intermediate and final results
-# of processing. 's3', 'filesystem', and 'cloudfiles' are supported.
-# 'filesystem' should only be used in development, on single-machine installations,
-# or networked drives. If you *are* developing an action, filesystem is certainly
-# faster and easier.
-:storage:             s3
-# Please provide your AWS credentials for S3 storage of job output.
-:aws_access_key:      [your AWS access key]
-:aws_secret_key:      [your AWS secret access key]
-# Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
-# to keep all resulting files on S3 private. If so, you'll receive authenticated
-# S3 URLs as job output, good for 24 hours. If left public, you'll get the
-# straight URLs to the files on S3.
-:s3_bucket:           [your CloudCrowd bucket]
-:s3_authentication:   no
-# Cloudfiles
-:cloudfiles_username:  [your Rackspace Cloud Files username]
-:cloudfiles_api_key:   [your Rackspace Cloud Files API key]
-:cloudfiles_container: [your Rackspace Cloud Files container]
-# The following settings configure local paths. 'local_storage_path' is the
-# directory in which all files will be saved if you're using the 'filesystem'
-# storage. 'log_path' and 'pid_path' are the directories in which daemonized
-# servers and nodes will store their process ids and log files. The default
-# values are listed.
-# :local_storage_path:  /tmp/cloud_crowd_storage
-# :log_path:            log
-# :pid_path:            tmp/pids
-# Use HTTP Basic Auth for all requests? (Includes all internal worker requests
-# to the central server). If yes, specify the login and password that all
-# requests must provide for authentication.
-:http_authentication: no
-:login:               [your login name]
-:password:            [your password]
-# Disable all the default built-in actions
-# :disable_default_actions: true
-# Disable specific actions for the node
-# Use this if you want to disable a limited number of actions
-# :disabled_actions: ['word_count']
-# By default, CloudCrowd looks for installed actions inside the 'actions'
-# subdirectory of this configuration folder. 'actions_path' allows you to load
-# additional actions from a location of your choice.
-# :actions_path: /path/to/actions
-# The number of separate attempts that will be made to process an individual
-# work unit, before marking it as having failed.
-:work_unit_retries:   3
+:central_server:    http://localhost:9173
+:max_workers:       5
+:storage:           filesystem

data/lib/cloud-crowd.rb CHANGED

@@ -26,6 +26,7 @@ autoload :YAML,         'yaml'
 # Common code which should really be required in every circumstance.
 require 'socket'
+require 'net/http'
 require 'cloud_crowd/exceptions'
 module CloudCrowd
@@ -44,10 +45,10 @@ module CloudCrowd
   autoload :WorkUnit,     'cloud_crowd/models'
   # Keep this version in sync with the gemspec.
-  VERSION        = '0.4.1'
+  VERSION        = '0.5.0'
   # Increment the schema version when there's a backwards incompatible change.
-  SCHEMA_VERSION = 3
+  SCHEMA_VERSION = 4
   # Root directory of the CloudCrowd gem.
   ROOT           = File.expand_path(File.dirname(__FILE__) + '/..')
@@ -92,14 +93,14 @@ module CloudCrowd
     # Configure CloudCrowd by passing in the path to <tt>config.yml</tt>.
     def configure(config_path)
       @config_path = File.expand_path(File.dirname(config_path))
-      @config = YAML.load_file(config_path)
+      @config = YAML.load(ERB.new(File.read(config_path)).result)
     end
     # Configure the CloudCrowd central database (and connect to it), by passing
     # in a path to <tt>database.yml</tt>. The file should use the standard
     # ActiveRecord connection format.
     def configure_database(config_path, validate_schema=true)
-      configuration = YAML.load_file(config_path)
+      configuration = YAML.load(ERB.new(File.read(config_path)).result)
       ActiveRecord::Base.establish_connection(configuration)
       if validate_schema
         version = ActiveRecord::Base.connection.select_values('select max(version) from schema_migrations').first.to_i

data/lib/cloud_crowd/action.rb CHANGED

@@ -42,19 +42,18 @@ module CloudCrowd
     # Download a file to the specified path.
     def download(url, path)
-      `curl -s "#{url}" > "#{path}"`
-      return path
-      # The previous implementation is below, and, although it would be
-      # wonderful not to shell out, RestClient wasn't handling URLs with encoded
-      # entities (%20, for example), and doesn't let you download to a given
-      # location. Getting a RestClient patch in would be ideal.
-      #
-      # if url.match(FILE_URL)
-      #   FileUtils.cp(url.sub(FILE_URL, ''), path)
-      # else
-      #   resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
-      #   FileUtils.mv resp.file.path, path
-      # end
+      if url.match(FILE_URL)
+        FileUtils.cp(url.sub(FILE_URL, ''), path)
+      else
+        File.open(path, 'w+') do |file|
+          Net::HTTP.get_response(URI(url)) do |response|
+            response.read_body do |chunk|
+              file.write chunk
+            end
+          end
+        end
+      end
+      path
     end
     # Takes a local filesystem path, saves the file to S3, and returns the
@@ -84,7 +83,7 @@ module CloudCrowd
     # Convert an unsafe URL into a filesystem-friendly filename.
     def safe_filename(url)
-      url.sub!(/\?.*\Z/, '')
+      url  = url.sub(/\?.*\Z/, '')
       ext  = File.extname(url)
       name = URI.unescape(File.basename(url)).gsub(/[^a-zA-Z0-9_\-.]/, '-').gsub(/-+/, '-')
       File.basename(name, ext).gsub('.', '-') + ext

data/lib/cloud_crowd/asset_store/cloudfiles_store.rb CHANGED

@@ -27,13 +27,13 @@ module CloudCrowd
       # Remove all of a Job's resulting files from Cloud Files, both intermediate and finished.
       def cleanup(job)
-          @container.objects(:prefix => "#{job.action}/job_#{job.id}").each do |object|
-            begin
-              @container.delete_object object
-            rescue
-              log "failed to delete #{job.action}/job_#{job.id}"
-            end
+        @container.objects(:prefix => "#{job.action}/job_#{job.id}").each do |object|
+          begin
+            @container.delete_object object
+          rescue
+            log "failed to delete #{job.action}/job_#{job.id}"
           end
+        end
       end
     end

data/lib/cloud_crowd/command_line.rb CHANGED

@@ -2,13 +2,13 @@ require 'optparse'
 module CloudCrowd
   class CommandLine
     # Configuration files required for the `crowd` command to function.
     CONFIG_FILES = ['config.yml', 'config.ru', 'database.yml']
     # Reference the absolute path to the root.
     CC_ROOT = File.expand_path(File.dirname(__FILE__) + '/../..')
     # Command-line banner for the usage message.
     BANNER = <<-EOS
 CloudCrowd is a MapReduce-inspired Parallel Processing System for Ruby.
@@ -25,13 +25,13 @@ Commands:
   console       Launch a CloudCrowd console, connected to the central database
   load_schema   Load the schema into the database specified by database.yml
   cleanup       Removes jobs that were finished over --days (7 by default) ago
   server -d [start | stop | restart]    Servers and nodes can be launched as
   node -d [start | stop | restart]      daemons, then stopped or restarted.
 Options:
     EOS
     # Creating a CloudCrowd::CommandLine runs from the contents of ARGV.
     def initialize
       parse_options
@@ -47,7 +47,7 @@ Options:
       else                     usage
       end
     end
     # Spin up an IRB session with the CloudCrowd code loaded in, and a database
     # connection established. The equivalent of Rails' `script/console`.
     def run_console
@@ -60,7 +60,7 @@ Options:
       Object.send(:include, CloudCrowd)
       IRB.start
     end
     # `crowd server` can either 'start', 'stop', or 'restart'.
     def run_server(subcommand)
       load_code
@@ -71,7 +71,7 @@ Options:
       when 'restart'  then restart_server
       end
     end
     # Convenience command for quickly spinning up the central server. More
     # sophisticated deployments, load-balancing across multiple app servers,
     # should use the config.ru rackup file directly. This method will start
@@ -86,19 +86,19 @@ Options:
       puts "Starting CloudCrowd Central Server on port #{port}..."
       exec "thin -e #{@options[:environment]} -p #{port} #{daemonize} --tag cloud-crowd-server --log #{log_path} --pid #{pid_path} -R #{rackup_path} start"
     end
     # Stop the daemonized central server, if it exists.
     def stop_server
       Thin::Server.kill(CloudCrowd.pid_path('server.pid'), 0)
     end
     # Restart the daemonized central server.
     def restart_server
       stop_server
       sleep 1
       start_server
     end
     # `crowd node` can either 'start', 'stop', or 'restart'.
     def run_node(subcommand)
       load_code
@@ -109,34 +109,34 @@ Options:
       when 'restart'  then restart_node
       end
     end
     # Launch a Node. Please only run a single node per machine. The Node process
     # will be long-lived, although its workers will come and go.
     def start_node
-      port = @options[:port] || Node::DEFAULT_PORT
-      puts "Starting CloudCrowd Node on port #{port}..."
-      Node.new(port, @options[:daemonize])
+      @options[:port] ||= Node::DEFAULT_PORT
+      puts "Starting CloudCrowd Node on port #{@options[:port]}..."
+      Node.new(@options)
     end
     # If the daemonized Node is running, stop it.
     def stop_node
       Thin::Server.kill CloudCrowd.pid_path('node.pid')
     end
     # Restart the daemonized Node, if it exists.
     def restart_node
       stop_node
       sleep 1
       start_node
     end
     # Load in the database schema to the database specified in 'database.yml'.
     def run_load_schema
       load_code
       connect_to_database(false)
       require 'cloud_crowd/schema.rb'
     end
     # Install the required CloudCrowd configuration files into the specified
     # directory, or the current one.
     def run_install(install_path)
@@ -148,22 +148,22 @@ Options:
       install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
       install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
     end
     # Clean up all Jobs in the CloudCrowd database older than --days old.
     def run_cleanup
       load_code
       connect_to_database(true)
       Job.cleanup_all(:days => @options[:days])
     end
     # Print `crowd` usage.
     def usage
       puts "\n#{@option_parser}\n"
     end
     private
     # Check for configuration files, either in the current directory, or in
     # the CLOUD_CROWD_CONFIG environment variable. Exit if they're not found.
     def ensure_config
@@ -171,9 +171,9 @@ Options:
       found = CONFIG_FILES.all? {|f| File.exists? "#{@options[:config_path]}/#{f}" }
       found ? @config_dir = true : config_not_found
     end
     # Parse all options for all commands.
-    # Valid options are: --config --port --environment --daemonize --days.
+    # Valid options are: --config --port --environment --tag --daemonize --days.
     def parse_options
       @options = {
         :environment  => 'production',
@@ -190,6 +190,9 @@ Options:
         opts.on('-e', '--environment ENV', 'server environment (defaults to production)') do |env|
           @options[:environment] = env
         end
+        opts.on('-t', '--tag TAG', 'tag a node with a name') do |tag|
+          @options[:tag] = tag
+        end
         opts.on('-d', '--daemonize', 'run as a background daemon') do |daemonize|
           @options[:daemonize] = daemonize
         end
@@ -205,7 +208,7 @@ Options:
       @option_parser.banner = BANNER
       @option_parser.parse!(ARGV)
     end
     # Load in the CloudCrowd module code, dependencies, lib files and models.
     # Not all commands require this.
     def load_code
@@ -213,21 +216,21 @@ Options:
       require "#{CC_ROOT}/lib/cloud-crowd"
       CloudCrowd.configure("#{@options[:config_path]}/config.yml")
     end
     # Establish a connection to the central server's database. Not all commands
     # require this.
     def connect_to_database(validate_schema)
       require 'cloud_crowd/models'
       CloudCrowd.configure_database("#{@options[:config_path]}/database.yml", validate_schema)
     end
     # Exit with an explanation if the configuration files couldn't be found.
     def config_not_found
       puts "`crowd` can't find the CloudCrowd configuration directory. Please use `crowd -c path/to/config`, or run `crowd` from inside of the configuration directory itself."
       exit(1)
     end
-    # Install a file and log the installation. If we're overwriting a file,
+    # Install a file and log the installation. If we're overwriting a file,
     # offer a chance to back out.
     def install_file(source, dest, is_dir=false)
       if File.exists?(dest)
@@ -237,6 +240,6 @@ Options:
       is_dir ? FileUtils.cp_r(source, dest) : FileUtils.cp(source, dest)
       puts "installed #{dest}" unless ENV['RACK_ENV'] == 'test'
     end
   end
 end

data/lib/cloud_crowd/models/job.rb CHANGED

@@ -48,7 +48,7 @@ module CloudCrowd
       return queue_for_workers([outs]) if merging?
       if complete?
         update_attributes(:outputs => outs, :time => time_taken)
-        puts "Job ##{id} (#{action}) #{display_status}."
+        puts "Job ##{id} (#{action}) #{display_status}." unless ENV['RACK_ENV'] == 'test'
         Thread.new { fire_callback } if callback_url
       end
       self

data/lib/cloud_crowd/models/node_record.rb CHANGED

@@ -28,6 +28,7 @@ module CloudCrowd
         :ip_address       => request.ip,
         :port             => params[:host].match(PORT)[1].to_i,
         :busy             => params[:busy],
+        :tag              => params[:tag],
         :max_workers      => params[:max_workers],
         :enabled_actions  => params[:enabled_actions]
       }
@@ -93,7 +94,8 @@ module CloudCrowd
     def to_json(opts={})
       { 'host'    => host,
         'workers' => worker_pids,
-        'status'  => display_status
+        'status'  => display_status,
+        'tag'     => tag
       }.to_json
     end

data/lib/cloud_crowd/models/work_unit.rb CHANGED

@@ -39,8 +39,9 @@ module CloudCrowd
     # action in question disabled.
     def self.distribute_to_nodes
       reservation = nil
+      filter = {}
       loop do
-        return unless reservation = WorkUnit.reserve_available(:limit => RESERVATION_LIMIT)
+        return unless reservation = WorkUnit.reserve_available(:limit => RESERVATION_LIMIT, :conditions => filter)
         work_units = WorkUnit.reserved(reservation)
         available_nodes = NodeRecord.available
         while node = available_nodes.shift and unit = work_units.shift do
@@ -54,6 +55,10 @@ module CloudCrowd
           end
           work_units.push(unit)
         end
+        if work_units.any? && available_nodes.any?
+          filter = {:action => available_nodes.map {|node| node.actions }.flatten.uniq }
+          next
+        end
         return if work_units.any? || available_nodes.empty?
       end
     ensure

data/lib/cloud_crowd/node.rb CHANGED

@@ -30,7 +30,7 @@ module CloudCrowd
     # The response sent back when this node is overloaded.
     OVERLOADED_MESSAGE  = 'Node Overloaded'
-    attr_reader :enabled_actions, :host, :port, :central
+    attr_reader :enabled_actions, :host, :port, :tag, :central
     set :root, ROOT
     set :authorization_realm, "CloudCrowd"
@@ -63,15 +63,16 @@ module CloudCrowd
     end
     # When creating a node, specify the port it should run on.
-    def initialize(port=nil, daemon=false)
+    def initialize(options={})
       require 'json'
       CloudCrowd.identity = :node
       @central          = CloudCrowd.central_server
       @host             = Socket.gethostname
       @enabled_actions  = CloudCrowd.actions.keys - (CloudCrowd.config[:disabled_actions] || [])
-      @port             = port || DEFAULT_PORT
+      @port             = options[:port] || DEFAULT_PORT
       @id               = "#{@host}:#{@port}"
-      @daemon           = daemon
+      @daemon           = !!options[:daemonize]
+      @tag              = options[:tag]
       @overloaded       = false
       @max_load         = CloudCrowd.config[:max_load]
       @min_memory       = CloudCrowd.config[:min_free_memory]
@@ -102,6 +103,7 @@ module CloudCrowd
     def check_in(critical=false)
       @central["/node/#{@id}"].put(
         :busy             => @overloaded,
+        :tag              => @tag,
         :max_workers      => CloudCrowd.config[:max_workers],
         :enabled_actions  => @enabled_actions.join(',')
       )

data/lib/cloud_crowd/schema.rb CHANGED

@@ -18,8 +18,9 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
     t.string   "host",                                :null => false
     t.string   "ip_address",                          :null => false
     t.integer  "port",                                :null => false
-    t.string   "enabled_actions", :default => '',     :null => false
+    t.text     "enabled_actions", :default => '',     :null => false
     t.boolean  "busy",            :default => false,  :null => false
+    t.string   "tag"
     t.integer  "max_workers"
     t.datetime "created_at"
     t.datetime "updated_at"

data/lib/cloud_crowd/server.rb CHANGED

@@ -71,7 +71,7 @@ module CloudCrowd
     post '/jobs' do
       job = Job.create_from_request(JSON.parse(params[:job]))
       WorkUnit.distribute_to_nodes
-      puts "Job ##{job.id} (#{job.action}) started."
+      puts "Job ##{job.id} (#{job.action}) started." unless ENV['RACK_ENV'] == 'test'
       json job
     end

data/public/css/admin_console.css CHANGED

@@ -126,8 +126,9 @@ body {
     }
       #nodes .node {
         font-size: 11px;
-        line-height: 22px;
+        line-height: 22px; height: 22px;
         background-image: url(../images/server.png);
+        overflow: hidden;
       }
         #nodes .node.busy {
           background-image: url(../images/server_busy.png);

data/public/js/admin_console.js CHANGED

@@ -107,10 +107,11 @@ window.Console = {
     $('.has_nodes', header).html(nc + " Node" + (nc != 1 ? 's' : '') + " / " + wc + " Worker" + (wc != 1 ? 's' : ''));
     header.toggleClass('no_nodes', this._nodes.length <= 0);
     $('#nodes').html($.map(this._nodes, function(node) {
-      var html = "";
+      var html  = "";
       var extra = node.status == 'busy' ? ' <span class="busy">[busy]</span>' : '';
-      html += '<div class="node ' + node.status + '">' + node.host + extra + '</div>';
-      html += $.map(node.workers, function(pid) {
+      var tag   = node.tag ? '[' + node.tag + '] ' : '';
+      html      += '<div class="node ' + node.status + '">' + tag + node.host + extra + '</div>';
+      html      += $.map(node.workers, function(pid) {
         var name = pid + '@' + node.host;
         return '<div class="worker" rel="' + name + '">' + name + '</div>';
       }).join('');

data/test/config/config.yml CHANGED

@@ -1,4 +1,4 @@
-:max_workers:             10
+:max_workers:             <%= 5 * 2 %>
 :work_unit_retries:       3
 :central_server:          http://localhost:9173

data/test/unit/test_action.rb CHANGED

@@ -42,8 +42,15 @@ class ActionTest < Test::Unit::TestCase
       assert name == 'file.pdf'
     end
+    should "not change the original URL when generating a safe filename" do
+      url = "http://example.com/file.format?parameter=value"
+      path = @action.safe_filename url
+      assert url == "http://example.com/file.format?parameter=value"
+      assert path == "file.format"
+    end
     should "be able to count the number of words in this file" do
-      assert @action.process == 219
+      assert @action.process == 274
     end
     should "raise an exception when backticks fail" do
@@ -51,6 +58,13 @@ class ActionTest < Test::Unit::TestCase
       assert_raise(CloudCrowd::Error::CommandFailed) { @action.process }
     end
+    should "be able to download a remote file" do
+      path = "temp.txt"
+      @action.download('http://example.com', path)
+      assert File.read(path).match(/These domain names are reserved for use in documentation/)
+      FileUtils.rm path
+    end
   end

data/test/unit/test_node.rb CHANGED

@@ -1,31 +1,32 @@
 require 'test_helper'
 class NodeUnitTest < Test::Unit::TestCase
   context "A Node" do
     setup do
-      @node = Node.new(11011).instance_variable_get(:@app)
+      @node = Node.new(:port => 11011, :tag => "nodule").instance_variable_get(:@app)
     end
     should "set the identity of the Ruby instance" do
       assert CloudCrowd.node?
     end
     should "instantiate correctly" do
       assert @node.central.to_s == "http://localhost:9173"
       assert @node.port == 11011
       assert @node.host == Socket.gethostname
       assert @node.enabled_actions.length > 2
       assert @node.asset_store.is_a? AssetStore::FilesystemStore
+      assert @node.tag == "nodule"
     end
     should "trap signals and launch a server at start" do
       Thin::Server.any_instance.expects(:start)
       @node.expects(:check_in)
       @node.start
     end
     should "be able to determine if the node is overloaded" do
       assert !@node.overloaded?
       @node.instance_variable_set :@max_load, 0.01
@@ -35,7 +36,7 @@ class NodeUnitTest < Test::Unit::TestCase
       @node.instance_variable_set :@min_memory, 8000
       assert @node.overloaded?
     end
   end
 end

metadata CHANGED

@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
   prerelease: false
   segments:
   - 0
-  - 4
-  - 1
-  version: 0.4.1
+  - 5
+  - 0
+  version: 0.5.0
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-04-22 00:00:00 -04:00
+date: 2010-06-22 00:00:00 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency