RubyGems - cloud-crowd - Versions diffs - 0.3.3 → 0.4.0 - Mend

cloud-crowd 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

data/cloud-crowd.gemspec +6 -6
data/config/config.example.yml +23 -10
data/lib/cloud-crowd.rb +4 -4
data/lib/cloud_crowd/action.rb +24 -23
data/lib/cloud_crowd/asset_store.rb +3 -1
data/lib/cloud_crowd/asset_store/cloudfiles_store.rb +41 -0
data/lib/cloud_crowd/asset_store/s3_store.rb +9 -7
data/lib/cloud_crowd/models/node_record.rb +27 -26
data/lib/cloud_crowd/models/work_unit.rb +35 -28
data/lib/cloud_crowd/node.rb +43 -43
data/lib/cloud_crowd/schema.rb +7 -7
data/lib/cloud_crowd/server.rb +35 -30
data/public/css/admin_console.css +25 -62
data/public/js/admin_console.js +53 -70
data/test/acceptance/test_server.rb +14 -16
data/test/unit/test_action.rb +17 -15
data/views/operations_center.erb +26 -13
metadata +94 -59

data/lib/cloud_crowd/models/work_unit.rb CHANGED

@@ -6,36 +6,36 @@ module CloudCrowd
   # are each run as a single WorkUnit.
   class WorkUnit < ActiveRecord::Base
     include ModelStatus
     # We use a random number in (0...MAX_RESERVATION) to reserve work units.
     # The size of the maximum signed integer in MySQL -- SQLite has no limit.
     MAX_RESERVATION = 2147483647
     # We only reserve a certain number of WorkUnits in a single go, to avoid
     # reserving the entire table.
     RESERVATION_LIMIT = 25
     belongs_to :job
     belongs_to :node_record
     validates_presence_of :job_id, :status, :input, :action
     # Available WorkUnits are waiting to be distributed to Nodes for processing.
     named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
     # Reserved WorkUnits have been marked for distribution by a central server process.
-    named_scope :reserved,  lambda {|reservation|
+    named_scope :reserved,  lambda {|reservation|
       {:conditions => {:reservation => reservation}, :order => 'updated_at asc'}
     }
     # Attempt to send a list of WorkUnits to nodes with available capacity.
     # A single central server process stops the same WorkUnit from being
     # distributed to multiple nodes by reserving it first. The algorithm used
     # should be lock-free.
     #
     # We reserve WorkUnits for this process in chunks of RESERVATION_LIMIT size,
-    # and try to match them to Nodes that are capable of handling the Action.
-    # WorkUnits get removed from the availability list when they are
-    # successfully sent, and Nodes get removed when they are busy or have the
+    # and try to match them to Nodes that are capable of handling the Action.
+    # WorkUnits get removed from the availability list when they are
+    # successfully sent, and Nodes get removed when they are busy or have the
     # action in question disabled.
     def self.distribute_to_nodes
       reservation = nil
@@ -44,11 +44,13 @@ module CloudCrowd
         work_units = WorkUnit.reserved(reservation)
         available_nodes = NodeRecord.available
         while node = available_nodes.shift and unit = work_units.shift do
-          if node.actions.include? unit.action
+          if node.actions.include?(unit.action)
             if node.send_work_unit(unit)
               available_nodes.push(node) unless node.busy?
               next
             end
+          else
+            unit.cancel_reservation
           end
           work_units.push(unit)
         end
@@ -57,26 +59,26 @@ module CloudCrowd
     ensure
       WorkUnit.cancel_reservations(reservation) if reservation
     end
-    # Reserves all available WorkUnits for this process. Returns false if there
+    # Reserves all available WorkUnits for this process. Returns false if there
     # were none available.
     def self.reserve_available(options={})
       # Draw a random token and stamp it onto every currently-available unit;
       # the number of rows touched tells us whether anything was claimed.
       token   = ActiveSupport::SecureRandom.random_number(MAX_RESERVATION)
       claimed = WorkUnit.available.update_all("reservation = #{token}", nil, options)
       # Hand back the token on success, false when nothing was available.
       claimed > 0 && token
     end
     # Clears the reservation column on every WorkUnit currently held under
     # the given reservation token (i.e. the ones reserved by this process).
     def self.cancel_reservations(reservation)
       held = WorkUnit.reserved(reservation)
       held.update_all('reservation = null')
     end
     # Wipes the reservation column on every WorkUnit, no matter which process
     # made the reservation. (Handy from the console while debugging.)
     def self.cancel_all_reservations
       clear_sql = 'reservation = null'
       WorkUnit.update_all(clear_sql)
     end
     # Look up a WorkUnit by the worker that's currently processing it. Specified
     # by <tt>pid@host</tt>.
     def self.find_by_worker_name(name)
@@ -84,16 +86,16 @@ module CloudCrowd
       node = NodeRecord.find_by_host(host)
       node && node.work_units.find_by_worker_pid(pid)
     end
     # Convenience constructor for a fresh WorkUnit. Input that is not already
     # a String is serialized to JSON before the record is created.
     def self.start(job, action, input, status)
       payload = input.is_a?(String) ? input : input.to_json
       self.create(:job => job, :action => action, :input => payload, :status => status)
     end
     # Mark this unit as having finished successfully.
-    # Splitting work units are handled differently (an optimization) -- they
-    # immediately fire off all of their resulting WorkUnits for processing,
+    # Splitting work units are handled differently (an optimization) -- they
+    # immediately fire off all of their resulting WorkUnits for processing,
     # without waiting for the rest of their splitting cousins to complete.
     def finish(result, time_taken)
       if splitting?
@@ -114,7 +116,7 @@ module CloudCrowd
         job && job.check_for_completion
       end
     end
     # Mark this unit as having failed. May attempt a retry.
     def fail(output, time_taken)
       tries = self.attempts + 1
@@ -129,7 +131,7 @@ module CloudCrowd
       })
       job && job.check_for_completion
     end
     # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
     def try_again
       update_attributes({
@@ -138,20 +140,25 @@ module CloudCrowd
         :attempts     => self.attempts + 1
       })
     end
+    # If the node can't process the unit, cancel its reservation.
+    def cancel_reservation
+      update_attributes!(:reservation => nil)
+    end
     # Called when a Node checks out this WorkUnit: links the unit to the given
     # NodeRecord and stores the pid of the worker handling it. Uses the bang
     # variant of update_attributes for the write.
     def assign_to(node_record, worker_pid)
       checkout = {:node_record => node_record, :worker_pid => worker_pid}
       update_attributes!(checkout)
     end
-    # All output needs to be wrapped in a JSON object for consistency
-    # (unfortunately, JSON.parse needs the top-level to be an object or array).
+    # All output needs to be wrapped in a JSON object for consistency
+    # (unfortunately, JSON.parse needs the top-level to be an object or array).
     # Convenience method to provide the parsed version.
     def parsed_output(out = self.output)
       # Output is stored wrapped in a JSON object; unwrap the 'output' key.
       wrapper = JSON.parse(out)
       wrapper['output']
     end
     # The JSON representation of a WorkUnit shares the Job's options with all
     # its cousin WorkUnits.
     def to_json
@@ -165,6 +172,6 @@ module CloudCrowd
         'status'    => self.status
       }.to_json
     end
   end
 end

data/lib/cloud_crowd/node.rb CHANGED

@@ -1,57 +1,57 @@
 module CloudCrowd
   # A Node is a Sinatra/Thin application that runs a single instance per-machine.
-  # It registers with the central server, receives WorkUnits, and forks off
+  # It registers with the central server, receives WorkUnits, and forks off
   # Workers to process them. The actions are:
   #
   # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
   # [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
-  class Node < Sinatra::Default
+  class Node < Sinatra::Base
     # A Node's default port. You only run a single node per machine, so they
     # can all use the same port without any problems.
     DEFAULT_PORT        = 9063
-    # A list of regex scrapers, which let us extract the one-minute load
+    # A list of regex scrapers, which let us extract the one-minute load
     # average and the amount of free memory on different flavors of UNIX.
     SCRAPE_UPTIME       = /\d+\.\d+/
     SCRAPE_LINUX_MEMORY = /MemFree:\s+(\d+) kB/
-    SCRAPE_MAC_MEMORY   = /Pages free:\s+(\d+)./
+    SCRAPE_MAC_MEMORY   = /Pages free:\s+(\d+)./
     SCRAPE_MAC_PAGE     = /page size of (\d+) bytes/
     # The interval at which the node monitors the machine's load and memory use
     # (if configured to do so in config.yml).
     MONITOR_INTERVAL    = 3
     # The interval at which the node regularly checks in with central (5 min).
     CHECK_IN_INTERVAL   = 300
     # The response sent back when this node is overloaded.
     OVERLOADED_MESSAGE  = 'Node Overloaded'
     attr_reader :enabled_actions, :host, :port, :central
     set :root, ROOT
     set :authorization_realm, "CloudCrowd"
     helpers Helpers
     # methodoverride allows the _method param.
     enable :methodoverride
     # Enabling HTTP Authentication turns it on for all requests.
     # This works the same way as in the central CloudCrowd::Server.
     before do
       login_required if CloudCrowd.config[:http_authentication]
     end
-    # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
+    # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
     # /heartbeat to make sure it's still online.
     get '/heartbeat' do
       "buh-bump"
     end
     # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
     # Returns a 503 if this Node is overloaded.
     post '/work' do
@@ -61,14 +61,14 @@ module CloudCrowd
       Process.detach(pid)
       json :pid => pid
     end
     # When creating a node, specify the port it should run on.
     def initialize(port=nil, daemon=false)
       require 'json'
       CloudCrowd.identity = :node
       @central          = CloudCrowd.central_server
       @host             = Socket.gethostname
-      @enabled_actions  = CloudCrowd.actions.keys
+      @enabled_actions  = CloudCrowd.actions.keys - (CloudCrowd.config[:disabled_actions] || [])
       @port             = port || DEFAULT_PORT
       @daemon           = daemon
       @overloaded       = false
@@ -76,7 +76,7 @@ module CloudCrowd
       @min_memory       = CloudCrowd.config[:min_free_memory]
       start unless test?
     end
     # Starting up a Node registers with the central server and begins to listen
     # for incoming WorkUnits.
     def start
@@ -94,9 +94,9 @@ module CloudCrowd
       monitor_system if @max_load || @min_memory
       @server_thread.join
     end
-    # Checking in with the central server informs it of the location and
-    # configuration of this Node. If it can't check-in, there's no point in
+    # Checking in with the central server informs it of the location and
+    # configuration of this Node. If it can't check-in, there's no point in
     # starting.
     def check_in(critical=false)
       @central["/node/#{@host}"].put(
@@ -109,31 +109,31 @@ module CloudCrowd
       puts "Failed to connect to the central server (#{@central.to_s})."
       raise SystemExit if critical
     end
     # Before exiting, the Node checks out with the central server by sending a
     # DELETE to its own /node/<host> resource, releasing all of its WorkUnits
     # for other Nodes to handle.
     def check_out
       registration = @central["/node/#{@host}"]
       registration.delete
     end
     # Memoized accessor for the AssetStore: it is built on first use (ideally
     # once the Node is already up) and the same instance is reused afterwards.
     def asset_store
       return @asset_store if @asset_store
       @asset_store = AssetStore.new
     end
-    # Is the node overloaded? If configured, checks if the load average is
+    # Is the node overloaded? If configured, checks if the load average is
     # greater than 'max_load', or if the available RAM is less than
     # 'min_free_memory'.
     def overloaded?
       # A configured load ceiling that is currently exceeded settles it.
       return true if @max_load && load_average > @max_load
       # Otherwise the answer is the memory check (nil when not configured).
       @min_memory && free_memory < @min_memory
     end
     # The current one-minute load average, taken as the first match of
     # SCRAPE_UPTIME in the output of `uptime` (0.0 when nothing matches).
     def load_average
       uptime_output = `uptime`
       uptime_output.match(SCRAPE_UPTIME).to_s.to_f
     end
     # The current amount of free memory in megabytes.
     def free_memory
       case RUBY_PLATFORM
@@ -147,12 +147,12 @@ module CloudCrowd
         raise NotImplementedError, "'min_free_memory' is not yet implemented on your platform"
       end
     end
     private
-    # Launch a monitoring thread that periodically checks the node's load
-    # average and the amount of free memory remaining. If we transition out of
+    # Launch a monitoring thread that periodically checks the node's load
+    # average and the amount of free memory remaining. If we transition out of
     # the overloaded state, let central know.
     def monitor_system
       @monitor_thread = Thread.new do
@@ -164,9 +164,9 @@ module CloudCrowd
         end
       end
     end
-    # If communication is interrupted for external reasons, the central server
-    # will assume that the node has gone down. Checking in will let central know
+    # If communication is interrupted for external reasons, the central server
+    # will assume that the node has gone down. Checking in will let central know
     # it's still online.
     def check_in_periodically
       @check_in_thread = Thread.new do
@@ -176,7 +176,7 @@ module CloudCrowd
         end
       end
     end
     # Trap exit signals in order to shut down cleanly.
     def trap_signals
       Signal.trap('QUIT') { shut_down }
@@ -184,7 +184,7 @@ module CloudCrowd
       Signal.trap('KILL') { shut_down }
       Signal.trap('TERM') { shut_down }
     end
     # At shut down, de-register with the central server before exiting.
     def shut_down
       @check_in_thread.kill if @check_in_thread
@@ -193,7 +193,7 @@ module CloudCrowd
       @server_thread.kill if @server_thread
       Process.exit
     end
   end
 end

data/lib/cloud_crowd/schema.rb CHANGED

@@ -13,7 +13,7 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
     t.datetime "created_at"
     t.datetime "updated_at"
   end
   create_table "node_records", :force => true do |t|
     t.string   "host",                                :null => false
     t.string   "ip_address",                          :null => false
@@ -41,10 +41,10 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
   end
   # Here be indices. After looking, it seems faster not to have them at all.
-  #
-  # add_index "jobs", ["status"], :name => "index_jobs_on_status"
-  # add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
-  # add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
-  # add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
-  # add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
+  #
+  add_index "jobs", ["status"], :name => "index_jobs_on_status"
+  add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
+  add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
+  add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
+  add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
 end

data/lib/cloud_crowd/server.rb CHANGED

@@ -1,5 +1,5 @@
 module CloudCrowd
   # The main CloudCrowd (Sinatra) application. The actions are:
   #
   # == Admin
@@ -7,60 +7,65 @@ module CloudCrowd
   # [get /status] Get the combined JSON of every active job and worker.
   # [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
   # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
-  #
+  #
   # == Public API
   # [post /jobs] Begin a new Job. Post with a JSON representation of the job-to-be. (see examples).
   # [get /jobs/:job_id] Check the status of a Job. Response includes output, if the Job has finished.
   # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
   #
   # == Internal Workers API
-  # [puts /node/:host] Registers a new Node, making it available for processing.
+  # [put /node/:host] Registers a new Node, making it available for processing.
   # [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
   # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
-  class Server < Sinatra::Default
+  class Server < Sinatra::Base
     set :root, ROOT
     set :authorization_realm, "CloudCrowd"
     helpers Helpers
     # static serves files from /public, methodoverride allows the _method param.
     enable :static, :methodoverride
     # Enabling HTTP Authentication turns it on for all requests.
     before do
       login_required if CloudCrowd.config[:http_authentication]
     end
     # Render the admin console.
     get '/' do
       erb :operations_center
     end
     # Get the JSON for every active job in the queue and every active worker
     # in the system. This action may get a little worrisome as the system grows
     # larger -- keep it in mind.
     get '/status' do
       json(
-        'jobs'            => Job.incomplete,
         'nodes'           => NodeRecord.all(:order => 'host desc'),
+        'job_count'       => Job.incomplete.count,
         'work_unit_count' => WorkUnit.incomplete.count
       )
     end
+    # Get the last 100 lines of log messages.
+    get '/log' do
+      `tail -n 100 #{CloudCrowd.log_path('server.log')}`
+    end
     # Get the JSON for what a worker is up to.
     get '/worker/:name' do
       json WorkUnit.find_by_worker_name(params[:name]) || {}
     end
-    # To monitor the central server with Monit, God, Nagios, or another
+    # To monitor the central server with Monit, God, Nagios, or another
     # monitoring tool, you can hit /heartbeat to make sure.
     get '/heartbeat' do
       "buh-bump"
     end
     # PUBLIC API:
     # Start a new job. Accepts a JSON representation of the job-to-be.
     # Distributes all work units to available nodes.
     post '/jobs' do
@@ -68,37 +73,37 @@ module CloudCrowd
       WorkUnit.distribute_to_nodes
       json job
     end
     # Check the status of a job, returning the output if finished, and the
-    # number of work units remaining otherwise.
+    # number of work units remaining otherwise.
     get '/jobs/:job_id' do
       json current_job
     end
-    # Cleans up a Job's saved S3 files. Delete a Job after you're done
+    # Cleans up a Job's saved S3 files. Delete a Job after you're done
     # downloading the results.
     delete '/jobs/:job_id' do
       current_job.destroy
       json nil
     end
     # INTERNAL NODE API:
-    # A new Node will hit this action to register its location and
-    # configuration with the central server. Triggers distribution of WorkUnits.
+    # A new Node will hit this action to register its location and
+    # configuration with the central server. Triggers distribution of WorkUnits.
     put '/node/:host' do
       NodeRecord.check_in(params, request)
       WorkUnit.distribute_to_nodes
       json nil
     end
-    # Deregisters a Node from the central server. Releases and redistributes any
+    # Deregisters a Node from the central server. Releases and redistributes any
     # WorkUnits it may have had checked out.
     delete '/node/:host' do
       NodeRecord.destroy_all(:host => params[:host])
       json nil
     end
     # When workers are done with their unit, either successfully or in failure,
     # they mark it back on the central server and exit. Triggers distribution
     # of pending work units.
@@ -111,13 +116,13 @@ module CloudCrowd
       WorkUnit.distribute_to_nodes
       json nil
     end
     # At initialization record the identity of this Ruby instance as a server.
     def initialize(*args)
       super(*args)
       CloudCrowd.identity = :server
     end
   end
 end