documentcloud-cloud-crowd 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/cloud-crowd.gemspec CHANGED
@@ -1,7 +1,7 @@
  Gem::Specification.new do |s|
  s.name = 'cloud-crowd'
- s.version = '0.2.0' # Keep version in sync with cloud-cloud.rb
- s.date = '2009-09-17'
+ s.version = '0.2.1' # Keep version in sync with cloud-cloud.rb
+ s.date = '2009-09-18'

  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
  s.summary = "Parallel Processing for the Rest of Us"
@@ -94,8 +94,9 @@ public/js/excanvas.js
  public/js/flot.js
  public/js/jquery.js
  README
- test/acceptance/test_server.rb
+ test/acceptance/test_node.rb
  test/acceptance/test_failing_work_units.rb
+ test/acceptance/test_server.rb
  test/acceptance/test_word_count.rb
  test/blueprints.rb
  test/config/config.ru
@@ -105,7 +106,10 @@ test/config/actions/failure_testing.rb
  test/test_helper.rb
  test/unit/test_action.rb
  test/unit/test_configuration.rb
+ test/unit/test_node.rb
+ test/unit/test_node_record.rb
  test/unit/test_job.rb
+ test/unit/test_worker.rb
  test/unit/test_work_unit.rb
  views/operations_center.erb
  )
@@ -1,14 +1,21 @@
  # The URL where you're planning on running the central server/queue/database.
  :central_server: http://localhost:9173

- # Set the maximum number of workers allowed per-node. Workers only run while
- # there's work to be done. It's best to set 'max_workers' below the point where
- # you'd start to swap or peg your CPU (as determined by experiment).
+ # The following settings allow you to control the number of workers that can run
+ # on a given node, to prevent the node from becoming overloaded. 'max_workers'
+ # is a simple cap on the maximum number of workers a node is allowed to run
+ # concurrently. 'max_load' is the maximum (one-minute) load average, above which
+ # a node will refuse to take new work. 'min_free_memory' is the minimum amount
+ # of free RAM (in megabytes) a node is allowed to have, below which no new
+ # workers are run. These settings may be used in any combination.
  :max_workers: 5
+ # :max_load: 5.0
+ # :min_free_memory: 150

  # The storage back-end that you'd like to use for intermediate and final results
  # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
  # be used in development, on single-machine installations, or networked drives.
+ # If you *are* developing an action, filesystem is certainly faster and easier.
  :storage: s3

  # Please provide your AWS credentials for S3 storage of job output.
@@ -6,4 +6,10 @@
  :username: root
  :password:
  :socket: /tmp/mysql.sock
- :database: cloud_crowd
+ :database: cloud_crowd
+
+ # If you'd prefer to use an SQLite database instead, the following configuration
+ # will do nicely:
+ #
+ # :adapter: sqlite3
+ # :database: cloud_crowd.db
data/lib/cloud-crowd.rb CHANGED
@@ -43,10 +43,10 @@ module CloudCrowd
  autoload :WorkUnit, 'cloud_crowd/models'

  # Keep this version in sync with the gemspec.
- VERSION = '0.2.0'
+ VERSION = '0.2.1'

  # Increment the schema version when there's a backwards incompatible change.
- SCHEMA_VERSION = 2
+ SCHEMA_VERSION = 3

  # Root directory of the CloudCrowd gem.
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
@@ -12,6 +12,9 @@ module CloudCrowd
  #
  # All actions have use of an individual +work_directory+, for scratch files,
  # and spend their duration inside of it, so relative paths work well.
+ #
+ # Note that Actions inherit a backticks (`) method that raises an Exception
+ # if the external command fails.
  class Action

  FILE_URL = /\Afile:\/\//
@@ -33,7 +36,7 @@ module CloudCrowd

  # Each Action subclass must implement a +process+ method, overriding this.
  def process
- raise NotImplementedError.new("CloudCrowd::Actions must override 'process' with their own processing code.")
+ raise NotImplementedError, "CloudCrowd::Actions must override 'process' with their own processing code."
  end

  # Download a file to the specified path.
@@ -66,6 +69,15 @@ module CloudCrowd
  FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
  end

+ # Actions have a backticks command that raises a CommandFailed exception
+ # on failure, so that processing doesn't just blithely continue.
+ def `(command)
+ result = super(command)
+ exit_code = $?.to_i
+ raise Error::CommandFailed.new(result, exit_code) unless exit_code == 0
+ result
+ end
+

  private

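For illustration, an Action written against 0.2.1 can lean on the raising backticks above: a failed shell command aborts processing with Error::CommandFailed instead of silently continuing. A minimal sketch (the WordCountShell class is hypothetical, and input_path is assumed to be the Action's reader for the file fetched by download_input):

    # Hypothetical action, for illustration only. The backticks call goes
    # through the Action#` override above, so a non-zero exit from 'wc'
    # raises CloudCrowd::Error::CommandFailed rather than continuing.
    class WordCountShell < CloudCrowd::Action
      def process
        `wc -w #{input_path}`.to_i
      end
    end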
@@ -77,7 +89,7 @@ module CloudCrowd
  end

  # The directory prefix to use for both local and S3 storage.
- # [action_name]/job_[job_id]/unit_[work_unit_it]
+ # [action]/job_[job_id]/unit_[work_unit_it]
  def storage_prefix
  path_parts = []
  path_parts << Inflector.underscore(self.class)
@@ -93,9 +105,9 @@

  # If the input is a URL, download the file before beginning processing.
  def download_input
+ input_is_url = !!URI.parse(@input) rescue false
+ return unless input_is_url
  Dir.chdir(@work_directory) do
- input_is_url = !!URI.parse(@input) rescue false
- return unless input_is_url
  @input_path = File.join(@work_directory, safe_filename(@input))
  @file_name = File.basename(@input_path, File.extname(@input_path))
  download(@input, @input_path)
@@ -11,7 +11,7 @@ module CloudCrowd
  class ActionNotFound < Error
  end

- # StorageNotFound is raised when config.yml specifies a storage back end that
+ # StorageNotFound is raised when config.yml specifies a storage back-end that
  # doesn't exist.
  class StorageNotFound < Error
  end
@@ -30,6 +30,17 @@ module CloudCrowd
  class MissingConfiguration < Error
  end

+ # CommandFailed is raised when an action shells out, and the external
+ # command returns a non-zero exit code.
+ class CommandFailed < Error
+ attr_reader :exit_code
+
+ def initialize(message, exit_code)
+ super(message)
+ @exit_code = exit_code
+ end
+ end
+
  end

  end
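Because the exit code rides along on the exception, an action that shells out can distinguish a failed command from other errors and report it precisely. A short, purely illustrative sketch (not code from the gem; exit_code is populated from $?.to_i by the backticks override shown earlier):

    # Hypothetical process method of an Action: rescue CommandFailed to
    # report the failing command's status and captured output.
    def process
      `pdftotext #{input_path} output.txt`
      File.read('output.txt')
    rescue CloudCrowd::Error::CommandFailed => e
      raise "pdftotext failed (status #{e.exit_code}): #{e.message}"
    end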
@@ -15,8 +15,7 @@ module CloudCrowd
  after_create :queue_for_workers
  before_destroy :cleanup_assets

- # Create a Job from an incoming JSON or XML request, and add it to the queue.
- # TODO: Think about XML support.
+ # Create a Job from an incoming JSON request, and add it to the queue.
  def self.create_from_request(h)
  self.create(
  :inputs => h['inputs'].to_json,
@@ -41,7 +40,8 @@
  self
  end

- # Transition this Job's status to the appropriate next status.
+ # Transition this Job's current status to the appropriate next one, based
+ # on the state of the WorkUnits and the nature of the Action.
  def set_next_status
  update_attribute(:status,
  any_work_units_failed? ? FAILED :
@@ -66,8 +66,9 @@
  end
  end

- # Cleaning up after a job will remove all of its files from S3. Destroying
- # a Job calls cleanup_assets first.
+ # Cleaning up after a job will remove all of its files from S3 or the
+ # filesystem. Destroying a Job will cleanup_assets first. Run this in a
+ # separate thread to get out of the transaction's way.
  # TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
  def cleanup_assets
  AssetStore.new.cleanup(self)
@@ -7,9 +7,9 @@ module CloudCrowd

  has_many :work_units

- validates_presence_of :host, :ip_address, :port
+ validates_presence_of :host, :ip_address, :port, :enabled_actions

- before_destroy :clear_work_units
+ after_destroy :redistribute_work_units

  # Available Nodes haven't used up their maxiumum number of workers yet.
  named_scope :available, {
@@ -23,6 +23,7 @@
  attrs = {
  :ip_address => request.ip,
  :port => params[:port],
+ :busy => params[:busy],
  :max_workers => params[:max_workers],
  :enabled_actions => params[:enabled_actions]
  }
@@ -32,12 +33,17 @@
  # Dispatch a WorkUnit to this node. Places the node at back at the end of
  # the rotation. If we fail to send the WorkUnit, we consider the node to be
  # down, and remove this record, freeing up all of its checked-out work units.
+ # If the Node responds that it's overloaded, we mark it as busy. Returns
+ # true if the WorkUnit was dispatched successfully.
  def send_work_unit(unit)
  result = node['/work'].post(:work_unit => unit.to_json)
  unit.assign_to(self, JSON.parse(result)['pid'])
- touch
- rescue Errno::ECONNREFUSED
- self.destroy # Couldn't post to node, assume it's gone away.
+ touch && true
+ rescue Errno::ECONNREFUSED # Couldn't post to node, assume it's gone away.
+ destroy && false
+ rescue RestClient::RequestFailed => e
+ raise e unless e.http_code == 503 && e.http_body == Node::OVERLOADED_MESSAGE
+ update_attribute(:busy, true) && false
  end

  # What Actions is this Node able to run?
@@ -45,9 +51,10 @@
  enabled_actions.split(',')
  end

- # Is this Node too busy for more work? (Determined by number of workers.)
+ # Is this Node too busy for more work? Determined by number of workers, or
+ # the Node's load average, as configured in config.yml.
  def busy?
- max_workers && work_units.count >= max_workers
+ busy || (max_workers && work_units.count >= max_workers)
  end

  # The URL at which this Node may be reached.
@@ -72,6 +79,11 @@
  work_units.all(:select => 'worker_pid').map(&:worker_pid)
  end

+ # Release all of this Node's WorkUnits for other nodes to take.
+ def release_work_units
+ WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
+ end
+
  # The JSON representation of a NodeRecord includes its worker_pids.
  def to_json(opts={})
  { 'host' => host,
@@ -83,11 +95,10 @@

  private

- # When a Node shuts down, we free up all of the WorkUnits that it had
- # reserved, and they become available for others to pick up. Redistribute
- # the WorkUnits in a separate thread to avoid delaying Node shutdown.
- def clear_work_units
- WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
+ # When a Node exits, release its WorkUnits and redistribute them to others.
+ # Redistribute in a separate thread to avoid delaying shutdown.
+ def redistribute_work_units
+ release_work_units
  Thread.new { WorkUnit.distribute_to_nodes }
  end

@@ -28,7 +28,7 @@ module CloudCrowd
  until work_units.empty? do
  node = available_nodes.shift
  unit = work_units.first
- break unless node
+ break unless node && unit
  next unless node.actions.include? unit.action
  sent = node.send_work_unit(unit)
  if sent
@@ -51,6 +51,12 @@
  WorkUnit.reserved.update_all('reservation = null')
  end

+ # Cancels all outstanding WorkUnit reservations for all processes. (Useful
+ # in the console for debugging.)
+ def self.cancel_all_reservations
+ WorkUnit.update_all('reservation = null')
+ end
+
  # Look up a WorkUnit by the worker that's currently processing it. Specified
  # by <tt>pid@host</tt>.
  def self.find_by_worker_name(name)
@@ -74,7 +80,7 @@
  WorkUnit.start(job, action, new_input, PROCESSING)
  end
  self.destroy
- job.set_next_status if job.done_splitting?
+ job.set_next_status if job && job.done_splitting?
  else
  update_attributes({
  :status => SUCCEEDED,
@@ -84,7 +90,7 @@
  :output => result,
  :time => time_taken
  })
- job.check_for_completion
+ job && job.check_for_completion
  end
  end

@@ -10,9 +10,24 @@ module CloudCrowd

  # A Node's default port. You only run a single node per machine, so they
  # can all use the same port without any problems.
- DEFAULT_PORT = 9063
+ DEFAULT_PORT = 9063

- attr_reader :server, :asset_store
+ # A list of regex scrapers, which let us extract the one-minute load
+ # average and the amount of free memory on different flavors of UNIX.
+
+ SCRAPE_UPTIME = /\d+\.\d+/
+ SCRAPE_LINUX_MEMORY = /MemFree:\s+(\d+) kB/
+ SCRAPE_MAC_MEMORY = /Pages free:\s+(\d+)./
+ SCRAPE_MAC_PAGE = /page size of (\d+) bytes/
+
+ # The interval at which the node monitors the machine's load and memory use
+ # (if configured to do so in config.yml).
+ MONITOR_INTERVAL = 3
+
+ # The response sent back when this node is overloaded.
+ OVERLOADED_MESSAGE = 'Node Overloaded'
+
+ attr_reader :asset_store, :enabled_actions, :host, :port, :server

  set :root, ROOT
  set :authorization_realm, "CloudCrowd"
@@ -35,14 +50,15 @@
  end

  # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
+ # Returns a 503 if this Node is overloaded.
  post '/work' do
- pid = fork { Worker.new(self, JSON.parse(params[:work_unit])) }
+ throw :halt, [503, OVERLOADED_MESSAGE] if @overloaded
+ pid = fork { Worker.new(self, JSON.parse(params[:work_unit])).run }
  Process.detach(pid)
  json :pid => pid
  end

- # Creating a Node registers with the central server and starts listening for
- # incoming WorkUnits.
+ # When creating a node, specify the port it should run on.
  def initialize(port=DEFAULT_PORT)
  require 'json'
  @server = CloudCrowd.central_server
@@ -50,25 +66,35 @@
  @enabled_actions = CloudCrowd.actions.keys
  @asset_store = AssetStore.new
  @port = port || DEFAULT_PORT
-
+ @overloaded = false
+ @max_load = CloudCrowd.config[:max_load]
+ @min_memory = CloudCrowd.config[:min_free_memory]
+ start unless test?
+ end
+
+ # Starting up a Node registers with the central server and begins to listen
+ # for incoming WorkUnits.
+ def start
  trap_signals
  start_server
- check_in
+ monitor_system if @max_load || @min_memory
+ check_in(true)
  @server_thread.join
  end

  # Checking in with the central server informs it of the location and
  # configuration of this Node. If it can't check-in, there's no point in
  # starting.
- def check_in
+ def check_in(critical=false)
  @server["/node/#{@host}"].put(
  :port => @port,
+ :busy => @overloaded,
  :max_workers => CloudCrowd.config[:max_workers],
  :enabled_actions => @enabled_actions.join(',')
  )
  rescue Errno::ECONNREFUSED
- puts "Failed to connect to the central server (#{@server.to_s}), exiting..."
- raise SystemExit
+ puts "Failed to connect to the central server (#{@server.to_s})."
+ raise SystemExit if critical
  end

  # Before exiting, the Node checks out with the central server, releasing all
@@ -77,6 +103,33 @@
  @server["/node/#{@host}"].delete
  end

+ # Is the node overloaded? If configured, checks if the load average is
+ # greater than 'max_load', or if the available RAM is less than
+ # 'min_free_memory'.
+ def overloaded?
+ (@max_load && load_average > @max_load) ||
+ (@min_memory && free_memory < @min_memory)
+ end
+
+ # The current one-minute load average.
+ def load_average
+ `uptime`.match(SCRAPE_UPTIME).to_s.to_f
+ end
+
+ # The current amount of free memory in megabytes.
+ def free_memory
+ case RUBY_PLATFORM
+ when /darwin/
+ stats = `vm_stat`
+ @mac_page_size ||= stats.match(SCRAPE_MAC_PAGE)[1].to_f / 1048576.0
+ stats.match(SCRAPE_MAC_MEMORY)[1].to_f * @mac_page_size
+ when /linux/
+ `cat /proc/meminfo`.match(SCRAPE_LINUX_MEMORY)[1].to_f / 1024.0
+ else
+ raise NotImplementedError, "'min_free_memory' is not yet implemented on your platform"
+ end
+ end
+

  private

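To make the scraping above concrete, here is a small self-contained sketch of what the regexes pull out of command output. The sample strings are made up for illustration (real 'uptime' and /proc/meminfo output varies by platform); the regexes and the kB-to-MB conversion mirror the ones added in this hunk:

    # Illustrative only: fake command output, parsed the same way the Node does.
    SCRAPE_UPTIME       = /\d+\.\d+/
    SCRAPE_LINUX_MEMORY = /MemFree:\s+(\d+) kB/

    uptime_sample  = "16:20  up 3 days, 4 users, load averages: 0.61 0.58 0.55"
    meminfo_sample = "MemTotal: 2059284 kB\nMemFree:  153600 kB"

    load_average = uptime_sample.match(SCRAPE_UPTIME).to_s.to_f               # => 0.61
    free_mb      = meminfo_sample.match(SCRAPE_LINUX_MEMORY)[1].to_f / 1024.0 # => 150.0
    puts "load: #{load_average}, free memory: #{free_mb} MB"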
@@ -87,6 +140,20 @@
  end
  end

+ # Launch a monitoring thread that periodically checks the node's load
+ # average and the amount of free memory remaining. If we transition out of
+ # the overloaded state, let central know.
+ def monitor_system
+ @monitor_thread = Thread.new do
+ loop do
+ was_overloaded = @overloaded
+ @overloaded = overloaded?
+ check_in if was_overloaded && !@overloaded
+ sleep MONITOR_INTERVAL
+ end
+ end
+ end
+
  # Trap exit signals in order to shut down cleanly.
  def trap_signals
  Signal.trap('INT') { shut_down }
@@ -96,7 +163,9 @@

  # At shut down, de-register with the central server before exiting.
  def shut_down
+ @monitor_thread.kill if @monitor_thread
  check_out
+ @server_thread.kill
  Process.exit
  end

@@ -15,10 +15,11 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
  end

  create_table "node_records", :force => true do |t|
- t.string "host", :null => false
- t.string "ip_address", :null => false
- t.integer "port", :null => false
- t.string "enabled_actions", :default => '', :null => false
+ t.string "host", :null => false
+ t.string "ip_address", :null => false
+ t.integer "port", :null => false
+ t.string "enabled_actions", :default => '', :null => false
+ t.boolean "busy", :default => false, :null => false
  t.integer "max_workers"
  t.datetime "created_at"
  t.datetime "updated_at"
@@ -14,15 +14,16 @@ module CloudCrowd
  # Wait five seconds to retry, after internal communcication errors.
  RETRY_WAIT = 5

- attr_reader :action
+ attr_reader :pid, :node, :unit, :status

- # A new Worker begins processing its WorkUnit straight off.
- def initialize(node, work_unit)
- @pid = $$
- @node = node
- trap_signals
- setup_work_unit(work_unit)
- run
+ # A new Worker customizes itself to its WorkUnit at instantiation.
+ def initialize(node, unit)
+ @start_time = Time.now
+ @pid = $$
+ @node = node
+ @unit = unit
+ @status = @unit['status']
+ @retry_wait = RETRY_WAIT
  end

  # Return output to the central server, marking the WorkUnit done.
@@ -49,18 +50,20 @@
  def keep_trying_to(title)
  begin
  yield
+ rescue RestClient::ResourceNotFound => e
+ log "work unit ##{@unit['id']} doesn't exist. discarding..."
  rescue Exception => e
- log "failed to #{title} -- retry in #{RETRY_WAIT} seconds"
+ log "failed to #{title} -- retry in #{@retry_wait} seconds"
  log e.message
  log e.backtrace
- sleep RETRY_WAIT
+ sleep @retry_wait
  retry
  end
  end

  # Loggable details describing what the Worker is up to.
  def display_work_unit
- "unit ##{@options['work_unit_id']} (#{@action_name}/#{CloudCrowd.display_status(@status)})"
+ "unit ##{@unit['id']} (#{@unit['action']}/#{CloudCrowd.display_status(@status)})"
  end

  # Executes the WorkUnit by running the Action, catching all exceptions as
@@ -70,12 +73,13 @@
  @worker_thread = Thread.new do
  begin
  result = nil
- @action = CloudCrowd.actions[@action_name].new(@status, @input, @options, @node.asset_store)
- Dir.chdir(@action.work_directory) do
+ action_class = CloudCrowd.actions[@unit['action']]
+ action = action_class.new(@status, @unit['input'], enhanced_unit_options, @node.asset_store)
+ Dir.chdir(action.work_directory) do
  result = case @status
- when PROCESSING then @action.process
- when SPLITTING then @action.split
- when MERGING then @action.merge
+ when PROCESSING then action.process
+ when SPLITTING then action.split
+ when MERGING then action.merge
  else raise Error::StatusUnspecified, "work units must specify their status"
  end
  end
@@ -83,7 +87,7 @@
  rescue Exception => e
  fail_work_unit(e)
  ensure
- @action.cleanup_work_directory
+ action.cleanup_work_directory if action
  end
  end
  @worker_thread.join
@@ -91,9 +95,26 @@

  # Wraps run_work_unit to benchmark the execution time, if requested.
  def run
- return run_work_unit unless @options['benchmark']
- status = CloudCrowd.display_status(@status)
- log("ran #{@action_name}/#{status} in " + Benchmark.measure { run_work_unit }.to_s)
+ trap_signals
+ log "starting #{display_work_unit}"
+ return run_work_unit unless @unit['options']['benchmark']
+ log("ran #{display_work_unit} in " + Benchmark.measure { run_work_unit }.to_s)
+ end
+
+ # There are some potentially important attributes of the WorkUnit that we'd
+ # like to pass into the Action -- in case it needs to know them. They will
+ # always be made available in the options hash.
+ def enhanced_unit_options
+ @unit['options'].merge({
+ 'job_id' => @unit['job_id'],
+ 'work_unit_id' => @unit['id'],
+ 'attempts' => @unit['attempts']
+ })
+ end
+
+ # How long has this worker been running for?
+ def time_taken
+ Time.now - @start_time
  end


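Because of enhanced_unit_options above, every Action now finds the unit's identifiers alongside its user-supplied options. A minimal hypothetical sketch (EchoContext is not part of the gem, and it assumes Action's options reader):

    # Hypothetical action: reads the keys merged in by the Worker to report
    # which job and work unit it is processing, and which attempt this is.
    class EchoContext < CloudCrowd::Action
      def process
        "job #{options['job_id']}, unit #{options['work_unit_id']}, attempt #{options['attempts']}"
      end
    end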
@@ -103,20 +124,8 @@
  # regardless of success or failure.
  def base_params
  { :pid => @pid,
- :id => @options['work_unit_id'],
- :time => Time.now - @start_time }
- end
-
- # Extract the Worker's instance variables from a WorkUnit's JSON.
- def setup_work_unit(unit)
- return false unless unit
- @start_time = Time.now
- @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
- @options['job_id'] = unit['job_id']
- @options['work_unit_id'] = unit['id']
- @options['attempts'] ||= unit['attempts']
- log "fetched #{display_work_unit}"
- return true
+ :id => @unit['id'],
+ :time => time_taken }
  end

  # Log a message to the daemon log. Includes PID for identification.
@@ -4,6 +4,7 @@ require 'test_helper'
  class FailingWorkUnitsTest < Test::Unit::TestCase

  should "retry work units when they fail" do
+ WorkUnit.expects(:distribute_to_nodes).returns(true)
  browser = Rack::Test::Session.new(Rack::MockSession.new(CloudCrowd::Server))

  browser.post '/jobs', :job => {
@@ -0,0 +1,20 @@
+ require 'test_helper'
+
+ class NodeAcceptanceTest < Test::Unit::TestCase
+
+ include Rack::Test::Methods
+
+ def app
+ CloudCrowd::Node
+ end
+
+ context "The CloudCrowd::Node (Sinatra)" do
+
+ should "have a heartbeat" do
+ get '/heartbeat'
+ assert last_response.body == 'buh-bump'
+ end
+
+ end
+
+ end
@@ -46,6 +46,7 @@ class ServerTest < Test::Unit::TestCase
  end

  should "be able to create a job" do
+ WorkUnit.expects(:distribute_to_nodes).returns(true)
  post('/jobs', :job => '{"action":"graphics_magick","inputs":["http://www.google.com/"]}')
  assert last_response.ok?
  job_info = JSON.parse(last_response.body)
@@ -5,16 +5,13 @@ class WordCountTest < Test::Unit::TestCase
  context "the word_count action" do

  setup do
+ WorkUnit.expects(:distribute_to_nodes).returns(true)
  @asset_store = AssetStore.new
  @browser = Rack::Test::Session.new(Rack::MockSession.new(CloudCrowd::Server))
  @browser.put('/worker', :name => 'test_worker', :thread_status => 'sleeping')
  post_job_to_count_words_in_this_file
  @job_id = JSON.parse(@browser.last_response.body)['id']
  end
-
- teardown do
- CloudCrowd::Job.destroy_all
- end

  should "be able to create a word_count job" do
  assert @browser.last_response.ok?
@@ -26,7 +23,7 @@ class WordCountTest < Test::Unit::TestCase
  should "be able to perform the processing stage of a word_count" do
  action = CloudCrowd.actions['word_count'].new(1, "file://#{File.expand_path(__FILE__)}", {}, @asset_store)
  count = action.process
- assert count == 104
+ assert count == 101
  end

  end
data/test/blueprints.rb CHANGED
@@ -1,4 +1,5 @@
  Sham.url { Faker::Internet.domain_name + "/" + Faker::Internet.domain_word + ".jpg" }
+ Sham.host { Faker::Internet.domain_name + '.local' }

  CloudCrowd::Job.blueprint do
  status { CloudCrowd::PROCESSING }
@@ -8,9 +9,17 @@ CloudCrowd::Job.blueprint do
  email { 'noone@example.com' }
  end

+ CloudCrowd::NodeRecord.blueprint do
+ host
+ ip_address { '127.0.0.1' }
+ port { 6093 }
+ enabled_actions { 'graphics_magick,word_count' }
+ max_workers { 3 }
+ end
+
  CloudCrowd::WorkUnit.blueprint do
- job { CloudCrowd::Job.make }
- status { CloudCrowd::PROCESSING }
- input { Sham.url }
- action { 'graphics_magick' }
+ job { CloudCrowd::Job.make }
+ status { CloudCrowd::PROCESSING }
+ input { '{"key":"value"}' }
+ action { 'graphics_magick' }
  end
@@ -1,6 +1,3 @@
- :adapter: mysql
- :encoding: utf8
- :username: root
- :password:
- :socket: /tmp/mysql.sock
- :database: cloud_crowd_test
+ :adapter: sqlite3
+ :database: test/cloud_crowd_test.db
+ :timeout: 5000
@@ -0,0 +1,38 @@
+ require 'test_helper'
+
+ class NodeUnitTest < Test::Unit::TestCase
+
+ context "A Node" do
+
+ setup do
+ @node = Node.new(11011).instance_variable_get(:@app)
+ end
+
+ should "instantiate correctly" do
+ assert @node.server.to_s == "http://localhost:9173"
+ assert @node.port == 11011
+ assert @node.host == Socket.gethostname
+ assert @node.enabled_actions.length > 2
+ assert @node.asset_store.is_a? AssetStore::FilesystemStore
+ end
+
+ should "trap signals and launch a server at start" do
+ Signal.expects(:trap).times(3)
+ Thin::Server.expects(:start)
+ @node.expects(:check_in)
+ @node.start
+ end
+
+ should "be able to determine if the node is overloaded" do
+ assert !@node.overloaded?
+ @node.instance_variable_set :@max_load, 0.01
+ assert @node.overloaded?
+ @node.instance_variable_set :@max_load, nil
+ assert !@node.overloaded?
+ @node.instance_variable_set :@min_memory, 8000
+ assert @node.overloaded?
+ end
+
+ end
+
+ end
@@ -0,0 +1,42 @@
+ require 'test_helper'
+
+ class NodeRecordTest < Test::Unit::TestCase
+
+ context "A NodeRecord" do
+
+ setup do
+ @node = CloudCrowd::NodeRecord.make
+ end
+
+ subject { @node }
+
+ should_have_many :work_units
+
+ should_validate_presence_of :host, :ip_address, :port, :enabled_actions
+
+ should "be available" do
+ assert NodeRecord.available.map(&:id).include? @node.id
+ end
+
+ should "know its enabled actions" do
+ assert @node.actions.include? 'graphics_magick'
+ assert @node.actions.include? 'word_count'
+ end
+
+ should "know if the node is busy" do
+ assert !@node.busy?
+ assert @node.display_status == 'available'
+ (@node.max_workers + 1).times { WorkUnit.make(:node_record => @node) }
+ assert @node.busy?
+ assert @node.display_status == 'busy'
+ @node.release_work_units
+ assert !@node.busy?
+ end
+
+ should "be reachable at a URL" do
+ assert !!URI.parse(@node.url)
+ end
+
+ end
+
+ end
@@ -0,0 +1,48 @@
+ require 'test_helper'
+
+ class WorkerTest < Test::Unit::TestCase
+
+ context "A CloudCrowd::Worker" do
+
+ setup do
+ @node = Node.new.instance_variable_get(:@app)
+ @unit = WorkUnit.make
+ @worker = Worker.new(@node, JSON.parse(@unit.to_json))
+ end
+
+ should "instantiate correctly" do
+ assert @worker.pid == $$
+ assert @worker.unit['id'] == @unit.id
+ assert @worker.status == @unit.status
+ assert @worker.node == @node
+ assert @worker.time_taken > 0
+ end
+
+ should "be able to retry operations that must succeed" do
+ @worker.instance_variable_set :@retry_wait, 0.01
+ @worker.expects(:log).at_least(3)
+ tries = 0
+ @worker.keep_trying_to("do something critical") do
+ tries += 1;
+ raise 'hell' unless tries > 3
+ assert "made it through"
+ end
+ end
+
+ should "be able to run an action and try to complete it" do
+ GraphicsMagick.any_instance.expects(:process).returns('the answer')
+ GraphicsMagick.any_instance.expects(:cleanup_work_directory)
+ @worker.expects(:complete_work_unit).with({'output' => 'the answer'}.to_json)
+ @worker.run_work_unit
+ end
+
+ should "enchance the options that an action receives with extra info" do
+ opts = @worker.enhanced_unit_options
+ assert opts['work_unit_id'] == @unit.id
+ assert opts['job_id'] == @unit.job.id
+ assert opts['attempts'] == @unit.attempts
+ end
+
+ end
+
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: documentcloud-cloud-crowd
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.2.1
  platform: ruby
  authors:
  - Jeremy Ashkenas
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2009-09-17 00:00:00 -07:00
+ date: 2009-09-18 00:00:00 -07:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -182,8 +182,9 @@ files:
  - public/js/flot.js
  - public/js/jquery.js
  - README
- - test/acceptance/test_server.rb
+ - test/acceptance/test_node.rb
  - test/acceptance/test_failing_work_units.rb
+ - test/acceptance/test_server.rb
  - test/acceptance/test_word_count.rb
  - test/blueprints.rb
  - test/config/config.ru
@@ -193,7 +194,10 @@ files:
  - test/test_helper.rb
  - test/unit/test_action.rb
  - test/unit/test_configuration.rb
+ - test/unit/test_node.rb
+ - test/unit/test_node_record.rb
  - test/unit/test_job.rb
+ - test/unit/test_worker.rb
  - test/unit/test_work_unit.rb
  - views/operations_center.erb
  has_rdoc: true