documentcloud-cloud-crowd 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/cloud-crowd.gemspec +7 -3
- data/config/config.example.yml +10 -3
- data/config/database.example.yml +7 -1
- data/lib/cloud-crowd.rb +2 -2
- data/lib/cloud_crowd/action.rb +16 -4
- data/lib/cloud_crowd/exceptions.rb +12 -1
- data/lib/cloud_crowd/models/job.rb +6 -5
- data/lib/cloud_crowd/models/node_record.rb +23 -12
- data/lib/cloud_crowd/models/work_unit.rb +9 -3
- data/lib/cloud_crowd/node.rb +79 -10
- data/lib/cloud_crowd/schema.rb +5 -4
- data/lib/cloud_crowd/worker.rb +43 -34
- data/test/acceptance/test_failing_work_units.rb +1 -0
- data/test/acceptance/test_node.rb +20 -0
- data/test/acceptance/test_server.rb +1 -0
- data/test/acceptance/test_word_count.rb +2 -5
- data/test/blueprints.rb +13 -4
- data/test/config/database.yml +3 -6
- data/test/unit/test_node.rb +38 -0
- data/test/unit/test_node_record.rb +42 -0
- data/test/unit/test_worker.rb +48 -0
- metadata +7 -3
data/cloud-crowd.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'cloud-crowd'
|
3
|
-
s.version = '0.2.
|
4
|
-
s.date = '2009-09-
|
3
|
+
s.version = '0.2.1' # Keep version in sync with cloud-crowd.rb
|
4
|
+
s.date = '2009-09-18'
|
5
5
|
|
6
6
|
s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
|
7
7
|
s.summary = "Parallel Processing for the Rest of Us"
|
@@ -94,8 +94,9 @@ public/js/excanvas.js
|
|
94
94
|
public/js/flot.js
|
95
95
|
public/js/jquery.js
|
96
96
|
README
|
97
|
-
test/acceptance/
|
97
|
+
test/acceptance/test_node.rb
|
98
98
|
test/acceptance/test_failing_work_units.rb
|
99
|
+
test/acceptance/test_server.rb
|
99
100
|
test/acceptance/test_word_count.rb
|
100
101
|
test/blueprints.rb
|
101
102
|
test/config/config.ru
|
@@ -105,7 +106,10 @@ test/config/actions/failure_testing.rb
|
|
105
106
|
test/test_helper.rb
|
106
107
|
test/unit/test_action.rb
|
107
108
|
test/unit/test_configuration.rb
|
109
|
+
test/unit/test_node.rb
|
110
|
+
test/unit/test_node_record.rb
|
108
111
|
test/unit/test_job.rb
|
112
|
+
test/unit/test_worker.rb
|
109
113
|
test/unit/test_work_unit.rb
|
110
114
|
views/operations_center.erb
|
111
115
|
)
|
data/config/config.example.yml
CHANGED
@@ -1,14 +1,21 @@
|
|
1
1
|
# The URL where you're planning on running the central server/queue/database.
|
2
2
|
:central_server: http://localhost:9173
|
3
3
|
|
4
|
-
#
|
5
|
-
#
|
6
|
-
#
|
4
|
+
# The following settings allow you to control the number of workers that can run
|
5
|
+
# on a given node, to prevent the node from becoming overloaded. 'max_workers'
|
6
|
+
# is a simple cap on the maximum number of workers a node is allowed to run
|
7
|
+
# concurrently. 'max_load' is the maximum (one-minute) load average, above which
|
8
|
+
# a node will refuse to take new work. 'min_free_memory' is the minimum amount
|
9
|
+
# of free RAM (in megabytes) a node is allowed to have, below which no new
|
10
|
+
# workers are run. These settings may be used in any combination.
|
7
11
|
:max_workers: 5
|
12
|
+
# :max_load: 5.0
|
13
|
+
# :min_free_memory: 150
|
8
14
|
|
9
15
|
# The storage back-end that you'd like to use for intermediate and final results
|
10
16
|
# of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
|
11
17
|
# be used in development, on single-machine installations, or networked drives.
|
18
|
+
# If you *are* developing an action, filesystem is certainly faster and easier.
|
12
19
|
:storage: s3
|
13
20
|
|
14
21
|
# Please provide your AWS credentials for S3 storage of job output.
|
data/config/database.example.yml
CHANGED
@@ -6,4 +6,10 @@
|
|
6
6
|
:username: root
|
7
7
|
:password:
|
8
8
|
:socket: /tmp/mysql.sock
|
9
|
-
:database: cloud_crowd
|
9
|
+
:database: cloud_crowd
|
10
|
+
|
11
|
+
# If you'd prefer to use an SQLite database instead, the following configuration
|
12
|
+
# will do nicely:
|
13
|
+
#
|
14
|
+
# :adapter: sqlite3
|
15
|
+
# :database: cloud_crowd.db
|
data/lib/cloud-crowd.rb
CHANGED
@@ -43,10 +43,10 @@ module CloudCrowd
|
|
43
43
|
autoload :WorkUnit, 'cloud_crowd/models'
|
44
44
|
|
45
45
|
# Keep this version in sync with the gemspec.
|
46
|
-
VERSION = '0.2.
|
46
|
+
VERSION = '0.2.1'
|
47
47
|
|
48
48
|
# Increment the schema version when there's a backwards incompatible change.
|
49
|
-
SCHEMA_VERSION =
|
49
|
+
SCHEMA_VERSION = 3
|
50
50
|
|
51
51
|
# Root directory of the CloudCrowd gem.
|
52
52
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
data/lib/cloud_crowd/action.rb
CHANGED
@@ -12,6 +12,9 @@ module CloudCrowd
|
|
12
12
|
#
|
13
13
|
# All actions have use of an individual +work_directory+, for scratch files,
|
14
14
|
# and spend their duration inside of it, so relative paths work well.
|
15
|
+
#
|
16
|
+
# Note that Actions inherit a backticks (`) method that raises an Exception
|
17
|
+
# if the external command fails.
|
15
18
|
class Action
|
16
19
|
|
17
20
|
FILE_URL = /\Afile:\/\//
|
@@ -33,7 +36,7 @@ module CloudCrowd
|
|
33
36
|
|
34
37
|
# Each Action subclass must implement a +process+ method, overriding this.
|
35
38
|
def process
|
36
|
-
raise NotImplementedError
|
39
|
+
raise NotImplementedError, "CloudCrowd::Actions must override 'process' with their own processing code."
|
37
40
|
end
|
38
41
|
|
39
42
|
# Download a file to the specified path.
|
@@ -66,6 +69,15 @@ module CloudCrowd
|
|
66
69
|
FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
|
67
70
|
end
|
68
71
|
|
72
|
+
# Actions have a backticks command that raises a CommandFailed exception
|
73
|
+
# on failure, so that processing doesn't just blithely continue.
|
74
|
+
def `(command)
|
75
|
+
result = super(command)
|
76
|
+
exit_code = $?.to_i
|
77
|
+
raise Error::CommandFailed.new(result, exit_code) unless exit_code == 0
|
78
|
+
result
|
79
|
+
end
|
80
|
+
|
69
81
|
|
70
82
|
private
|
71
83
|
|
@@ -77,7 +89,7 @@ module CloudCrowd
|
|
77
89
|
end
|
78
90
|
|
79
91
|
# The directory prefix to use for both local and S3 storage.
|
80
|
-
# [
|
92
|
+
# [action]/job_[job_id]/unit_[work_unit_id]
|
81
93
|
def storage_prefix
|
82
94
|
path_parts = []
|
83
95
|
path_parts << Inflector.underscore(self.class)
|
@@ -93,9 +105,9 @@ module CloudCrowd
|
|
93
105
|
|
94
106
|
# If the input is a URL, download the file before beginning processing.
|
95
107
|
def download_input
|
108
|
+
input_is_url = !!URI.parse(@input) rescue false
|
109
|
+
return unless input_is_url
|
96
110
|
Dir.chdir(@work_directory) do
|
97
|
-
input_is_url = !!URI.parse(@input) rescue false
|
98
|
-
return unless input_is_url
|
99
111
|
@input_path = File.join(@work_directory, safe_filename(@input))
|
100
112
|
@file_name = File.basename(@input_path, File.extname(@input_path))
|
101
113
|
download(@input, @input_path)
|
@@ -11,7 +11,7 @@ module CloudCrowd
|
|
11
11
|
class ActionNotFound < Error
|
12
12
|
end
|
13
13
|
|
14
|
-
# StorageNotFound is raised when config.yml specifies a storage back
|
14
|
+
# StorageNotFound is raised when config.yml specifies a storage back-end that
|
15
15
|
# doesn't exist.
|
16
16
|
class StorageNotFound < Error
|
17
17
|
end
|
@@ -30,6 +30,17 @@ module CloudCrowd
|
|
30
30
|
class MissingConfiguration < Error
|
31
31
|
end
|
32
32
|
|
33
|
+
# CommandFailed is raised when an action shells out, and the external
|
34
|
+
# command returns a non-zero exit code.
|
35
|
+
class CommandFailed < Error
|
36
|
+
attr_reader :exit_code
|
37
|
+
|
38
|
+
def initialize(message, exit_code)
|
39
|
+
super(message)
|
40
|
+
@exit_code = exit_code
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
33
44
|
end
|
34
45
|
|
35
46
|
end
|
@@ -15,8 +15,7 @@ module CloudCrowd
|
|
15
15
|
after_create :queue_for_workers
|
16
16
|
before_destroy :cleanup_assets
|
17
17
|
|
18
|
-
# Create a Job from an incoming JSON
|
19
|
-
# TODO: Think about XML support.
|
18
|
+
# Create a Job from an incoming JSON request, and add it to the queue.
|
20
19
|
def self.create_from_request(h)
|
21
20
|
self.create(
|
22
21
|
:inputs => h['inputs'].to_json,
|
@@ -41,7 +40,8 @@ module CloudCrowd
|
|
41
40
|
self
|
42
41
|
end
|
43
42
|
|
44
|
-
# Transition this Job's status to the appropriate next
|
43
|
+
# Transition this Job's current status to the appropriate next one, based
|
44
|
+
# on the state of the WorkUnits and the nature of the Action.
|
45
45
|
def set_next_status
|
46
46
|
update_attribute(:status,
|
47
47
|
any_work_units_failed? ? FAILED :
|
@@ -66,8 +66,9 @@ module CloudCrowd
|
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
69
|
-
# Cleaning up after a job will remove all of its files from S3
|
70
|
-
# a Job
|
69
|
+
# Cleaning up after a job will remove all of its files from S3 or the
|
70
|
+
# filesystem. Destroying a Job will cleanup_assets first. Run this in a
|
71
|
+
# separate thread to get out of the transaction's way.
|
71
72
|
# TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
|
72
73
|
def cleanup_assets
|
73
74
|
AssetStore.new.cleanup(self)
|
@@ -7,9 +7,9 @@ module CloudCrowd
|
|
7
7
|
|
8
8
|
has_many :work_units
|
9
9
|
|
10
|
-
validates_presence_of :host, :ip_address, :port
|
10
|
+
validates_presence_of :host, :ip_address, :port, :enabled_actions
|
11
11
|
|
12
|
-
|
12
|
+
after_destroy :redistribute_work_units
|
13
13
|
|
14
14
|
# Available Nodes haven't used up their maximum number of workers yet.
|
15
15
|
named_scope :available, {
|
@@ -23,6 +23,7 @@ module CloudCrowd
|
|
23
23
|
attrs = {
|
24
24
|
:ip_address => request.ip,
|
25
25
|
:port => params[:port],
|
26
|
+
:busy => params[:busy],
|
26
27
|
:max_workers => params[:max_workers],
|
27
28
|
:enabled_actions => params[:enabled_actions]
|
28
29
|
}
|
@@ -32,12 +33,17 @@ module CloudCrowd
|
|
32
33
|
# Dispatch a WorkUnit to this node. Places the node back at the end of
|
33
34
|
# the rotation. If we fail to send the WorkUnit, we consider the node to be
|
34
35
|
# down, and remove this record, freeing up all of its checked-out work units.
|
36
|
+
# If the Node responds that it's overloaded, we mark it as busy. Returns
|
37
|
+
# true if the WorkUnit was dispatched successfully.
|
35
38
|
def send_work_unit(unit)
|
36
39
|
result = node['/work'].post(:work_unit => unit.to_json)
|
37
40
|
unit.assign_to(self, JSON.parse(result)['pid'])
|
38
|
-
touch
|
39
|
-
rescue Errno::ECONNREFUSED
|
40
|
-
|
41
|
+
touch && true
|
42
|
+
rescue Errno::ECONNREFUSED # Couldn't post to node, assume it's gone away.
|
43
|
+
destroy && false
|
44
|
+
rescue RestClient::RequestFailed => e
|
45
|
+
raise e unless e.http_code == 503 && e.http_body == Node::OVERLOADED_MESSAGE
|
46
|
+
update_attribute(:busy, true) && false
|
41
47
|
end
|
42
48
|
|
43
49
|
# What Actions is this Node able to run?
|
@@ -45,9 +51,10 @@ module CloudCrowd
|
|
45
51
|
enabled_actions.split(',')
|
46
52
|
end
|
47
53
|
|
48
|
-
# Is this Node too busy for more work?
|
54
|
+
# Is this Node too busy for more work? Determined by number of workers, or
|
55
|
+
# the Node's load average, as configured in config.yml.
|
49
56
|
def busy?
|
50
|
-
max_workers && work_units.count >= max_workers
|
57
|
+
busy || (max_workers && work_units.count >= max_workers)
|
51
58
|
end
|
52
59
|
|
53
60
|
# The URL at which this Node may be reached.
|
@@ -72,6 +79,11 @@ module CloudCrowd
|
|
72
79
|
work_units.all(:select => 'worker_pid').map(&:worker_pid)
|
73
80
|
end
|
74
81
|
|
82
|
+
# Release all of this Node's WorkUnits for other nodes to take.
|
83
|
+
def release_work_units
|
84
|
+
WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
|
85
|
+
end
|
86
|
+
|
75
87
|
# The JSON representation of a NodeRecord includes its worker_pids.
|
76
88
|
def to_json(opts={})
|
77
89
|
{ 'host' => host,
|
@@ -83,11 +95,10 @@ module CloudCrowd
|
|
83
95
|
|
84
96
|
private
|
85
97
|
|
86
|
-
# When a Node
|
87
|
-
#
|
88
|
-
|
89
|
-
|
90
|
-
WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
|
98
|
+
# When a Node exits, release its WorkUnits and redistribute them to others.
|
99
|
+
# Redistribute in a separate thread to avoid delaying shutdown.
|
100
|
+
def redistribute_work_units
|
101
|
+
release_work_units
|
91
102
|
Thread.new { WorkUnit.distribute_to_nodes }
|
92
103
|
end
|
93
104
|
|
@@ -28,7 +28,7 @@ module CloudCrowd
|
|
28
28
|
until work_units.empty? do
|
29
29
|
node = available_nodes.shift
|
30
30
|
unit = work_units.first
|
31
|
-
break unless node
|
31
|
+
break unless node && unit
|
32
32
|
next unless node.actions.include? unit.action
|
33
33
|
sent = node.send_work_unit(unit)
|
34
34
|
if sent
|
@@ -51,6 +51,12 @@ module CloudCrowd
|
|
51
51
|
WorkUnit.reserved.update_all('reservation = null')
|
52
52
|
end
|
53
53
|
|
54
|
+
# Cancels all outstanding WorkUnit reservations for all processes. (Useful
|
55
|
+
# in the console for debugging.)
|
56
|
+
def self.cancel_all_reservations
|
57
|
+
WorkUnit.update_all('reservation = null')
|
58
|
+
end
|
59
|
+
|
54
60
|
# Look up a WorkUnit by the worker that's currently processing it. Specified
|
55
61
|
# by <tt>pid@host</tt>.
|
56
62
|
def self.find_by_worker_name(name)
|
@@ -74,7 +80,7 @@ module CloudCrowd
|
|
74
80
|
WorkUnit.start(job, action, new_input, PROCESSING)
|
75
81
|
end
|
76
82
|
self.destroy
|
77
|
-
job.set_next_status if job.done_splitting?
|
83
|
+
job.set_next_status if job && job.done_splitting?
|
78
84
|
else
|
79
85
|
update_attributes({
|
80
86
|
:status => SUCCEEDED,
|
@@ -84,7 +90,7 @@ module CloudCrowd
|
|
84
90
|
:output => result,
|
85
91
|
:time => time_taken
|
86
92
|
})
|
87
|
-
job.check_for_completion
|
93
|
+
job && job.check_for_completion
|
88
94
|
end
|
89
95
|
end
|
90
96
|
|
data/lib/cloud_crowd/node.rb
CHANGED
@@ -10,9 +10,24 @@ module CloudCrowd
|
|
10
10
|
|
11
11
|
# A Node's default port. You only run a single node per machine, so they
|
12
12
|
# can all use the same port without any problems.
|
13
|
-
DEFAULT_PORT
|
13
|
+
DEFAULT_PORT = 9063
|
14
14
|
|
15
|
-
|
15
|
+
# A list of regex scrapers, which let us extract the one-minute load
|
16
|
+
# average and the amount of free memory on different flavors of UNIX.
|
17
|
+
|
18
|
+
SCRAPE_UPTIME = /\d+\.\d+/
|
19
|
+
SCRAPE_LINUX_MEMORY = /MemFree:\s+(\d+) kB/
|
20
|
+
SCRAPE_MAC_MEMORY = /Pages free:\s+(\d+)./
|
21
|
+
SCRAPE_MAC_PAGE = /page size of (\d+) bytes/
|
22
|
+
|
23
|
+
# The interval at which the node monitors the machine's load and memory use
|
24
|
+
# (if configured to do so in config.yml).
|
25
|
+
MONITOR_INTERVAL = 3
|
26
|
+
|
27
|
+
# The response sent back when this node is overloaded.
|
28
|
+
OVERLOADED_MESSAGE = 'Node Overloaded'
|
29
|
+
|
30
|
+
attr_reader :asset_store, :enabled_actions, :host, :port, :server
|
16
31
|
|
17
32
|
set :root, ROOT
|
18
33
|
set :authorization_realm, "CloudCrowd"
|
@@ -35,14 +50,15 @@ module CloudCrowd
|
|
35
50
|
end
|
36
51
|
|
37
52
|
# Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
|
53
|
+
# Returns a 503 if this Node is overloaded.
|
38
54
|
post '/work' do
|
39
|
-
|
55
|
+
throw :halt, [503, OVERLOADED_MESSAGE] if @overloaded
|
56
|
+
pid = fork { Worker.new(self, JSON.parse(params[:work_unit])).run }
|
40
57
|
Process.detach(pid)
|
41
58
|
json :pid => pid
|
42
59
|
end
|
43
60
|
|
44
|
-
#
|
45
|
-
# incoming WorkUnits.
|
61
|
+
# When creating a node, specify the port it should run on.
|
46
62
|
def initialize(port=DEFAULT_PORT)
|
47
63
|
require 'json'
|
48
64
|
@server = CloudCrowd.central_server
|
@@ -50,25 +66,35 @@ module CloudCrowd
|
|
50
66
|
@enabled_actions = CloudCrowd.actions.keys
|
51
67
|
@asset_store = AssetStore.new
|
52
68
|
@port = port || DEFAULT_PORT
|
53
|
-
|
69
|
+
@overloaded = false
|
70
|
+
@max_load = CloudCrowd.config[:max_load]
|
71
|
+
@min_memory = CloudCrowd.config[:min_free_memory]
|
72
|
+
start unless test?
|
73
|
+
end
|
74
|
+
|
75
|
+
# Starting up a Node registers with the central server and begins to listen
|
76
|
+
# for incoming WorkUnits.
|
77
|
+
def start
|
54
78
|
trap_signals
|
55
79
|
start_server
|
56
|
-
|
80
|
+
monitor_system if @max_load || @min_memory
|
81
|
+
check_in(true)
|
57
82
|
@server_thread.join
|
58
83
|
end
|
59
84
|
|
60
85
|
# Checking in with the central server informs it of the location and
|
61
86
|
# configuration of this Node. If it can't check-in, there's no point in
|
62
87
|
# starting.
|
63
|
-
def check_in
|
88
|
+
def check_in(critical=false)
|
64
89
|
@server["/node/#{@host}"].put(
|
65
90
|
:port => @port,
|
91
|
+
:busy => @overloaded,
|
66
92
|
:max_workers => CloudCrowd.config[:max_workers],
|
67
93
|
:enabled_actions => @enabled_actions.join(',')
|
68
94
|
)
|
69
95
|
rescue Errno::ECONNREFUSED
|
70
|
-
puts "Failed to connect to the central server (#{@server.to_s})
|
71
|
-
raise SystemExit
|
96
|
+
puts "Failed to connect to the central server (#{@server.to_s})."
|
97
|
+
raise SystemExit if critical
|
72
98
|
end
|
73
99
|
|
74
100
|
# Before exiting, the Node checks out with the central server, releasing all
|
@@ -77,6 +103,33 @@ module CloudCrowd
|
|
77
103
|
@server["/node/#{@host}"].delete
|
78
104
|
end
|
79
105
|
|
106
|
+
# Is the node overloaded? If configured, checks if the load average is
|
107
|
+
# greater than 'max_load', or if the available RAM is less than
|
108
|
+
# 'min_free_memory'.
|
109
|
+
def overloaded?
|
110
|
+
(@max_load && load_average > @max_load) ||
|
111
|
+
(@min_memory && free_memory < @min_memory)
|
112
|
+
end
|
113
|
+
|
114
|
+
# The current one-minute load average.
|
115
|
+
def load_average
|
116
|
+
`uptime`.match(SCRAPE_UPTIME).to_s.to_f
|
117
|
+
end
|
118
|
+
|
119
|
+
# The current amount of free memory in megabytes.
|
120
|
+
def free_memory
|
121
|
+
case RUBY_PLATFORM
|
122
|
+
when /darwin/
|
123
|
+
stats = `vm_stat`
|
124
|
+
@mac_page_size ||= stats.match(SCRAPE_MAC_PAGE)[1].to_f / 1048576.0
|
125
|
+
stats.match(SCRAPE_MAC_MEMORY)[1].to_f * @mac_page_size
|
126
|
+
when /linux/
|
127
|
+
`cat /proc/meminfo`.match(SCRAPE_LINUX_MEMORY)[1].to_f / 1024.0
|
128
|
+
else
|
129
|
+
raise NotImplementedError, "'min_free_memory' is not yet implemented on your platform"
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
80
133
|
|
81
134
|
private
|
82
135
|
|
@@ -87,6 +140,20 @@ module CloudCrowd
|
|
87
140
|
end
|
88
141
|
end
|
89
142
|
|
143
|
+
# Launch a monitoring thread that periodically checks the node's load
|
144
|
+
# average and the amount of free memory remaining. If we transition out of
|
145
|
+
# the overloaded state, let central know.
|
146
|
+
def monitor_system
|
147
|
+
@monitor_thread = Thread.new do
|
148
|
+
loop do
|
149
|
+
was_overloaded = @overloaded
|
150
|
+
@overloaded = overloaded?
|
151
|
+
check_in if was_overloaded && !@overloaded
|
152
|
+
sleep MONITOR_INTERVAL
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
90
157
|
# Trap exit signals in order to shut down cleanly.
|
91
158
|
def trap_signals
|
92
159
|
Signal.trap('INT') { shut_down }
|
@@ -96,7 +163,9 @@ module CloudCrowd
|
|
96
163
|
|
97
164
|
# At shut down, de-register with the central server before exiting.
|
98
165
|
def shut_down
|
166
|
+
@monitor_thread.kill if @monitor_thread
|
99
167
|
check_out
|
168
|
+
@server_thread.kill
|
100
169
|
Process.exit
|
101
170
|
end
|
102
171
|
|
data/lib/cloud_crowd/schema.rb
CHANGED
@@ -15,10 +15,11 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
|
|
15
15
|
end
|
16
16
|
|
17
17
|
create_table "node_records", :force => true do |t|
|
18
|
-
t.string "host",
|
19
|
-
t.string "ip_address",
|
20
|
-
t.integer "port",
|
21
|
-
t.string "enabled_actions", :default => '',
|
18
|
+
t.string "host", :null => false
|
19
|
+
t.string "ip_address", :null => false
|
20
|
+
t.integer "port", :null => false
|
21
|
+
t.string "enabled_actions", :default => '', :null => false
|
22
|
+
t.boolean "busy", :default => false, :null => false
|
22
23
|
t.integer "max_workers"
|
23
24
|
t.datetime "created_at"
|
24
25
|
t.datetime "updated_at"
|
data/lib/cloud_crowd/worker.rb
CHANGED
@@ -14,15 +14,16 @@ module CloudCrowd
|
|
14
14
|
# Wait five seconds to retry, after internal communication errors.
|
15
15
|
RETRY_WAIT = 5
|
16
16
|
|
17
|
-
attr_reader :
|
17
|
+
attr_reader :pid, :node, :unit, :status
|
18
18
|
|
19
|
-
# A new Worker
|
20
|
-
def initialize(node,
|
21
|
-
@
|
22
|
-
@
|
23
|
-
|
24
|
-
|
25
|
-
|
19
|
+
# A new Worker customizes itself to its WorkUnit at instantiation.
|
20
|
+
def initialize(node, unit)
|
21
|
+
@start_time = Time.now
|
22
|
+
@pid = $$
|
23
|
+
@node = node
|
24
|
+
@unit = unit
|
25
|
+
@status = @unit['status']
|
26
|
+
@retry_wait = RETRY_WAIT
|
26
27
|
end
|
27
28
|
|
28
29
|
# Return output to the central server, marking the WorkUnit done.
|
@@ -49,18 +50,20 @@ module CloudCrowd
|
|
49
50
|
def keep_trying_to(title)
|
50
51
|
begin
|
51
52
|
yield
|
53
|
+
rescue RestClient::ResourceNotFound => e
|
54
|
+
log "work unit ##{@unit['id']} doesn't exist. discarding..."
|
52
55
|
rescue Exception => e
|
53
|
-
log "failed to #{title} -- retry in #{
|
56
|
+
log "failed to #{title} -- retry in #{@retry_wait} seconds"
|
54
57
|
log e.message
|
55
58
|
log e.backtrace
|
56
|
-
sleep
|
59
|
+
sleep @retry_wait
|
57
60
|
retry
|
58
61
|
end
|
59
62
|
end
|
60
63
|
|
61
64
|
# Loggable details describing what the Worker is up to.
|
62
65
|
def display_work_unit
|
63
|
-
"unit ##{@
|
66
|
+
"unit ##{@unit['id']} (#{@unit['action']}/#{CloudCrowd.display_status(@status)})"
|
64
67
|
end
|
65
68
|
|
66
69
|
# Executes the WorkUnit by running the Action, catching all exceptions as
|
@@ -70,12 +73,13 @@ module CloudCrowd
|
|
70
73
|
@worker_thread = Thread.new do
|
71
74
|
begin
|
72
75
|
result = nil
|
73
|
-
|
74
|
-
|
76
|
+
action_class = CloudCrowd.actions[@unit['action']]
|
77
|
+
action = action_class.new(@status, @unit['input'], enhanced_unit_options, @node.asset_store)
|
78
|
+
Dir.chdir(action.work_directory) do
|
75
79
|
result = case @status
|
76
|
-
when PROCESSING then
|
77
|
-
when SPLITTING then
|
78
|
-
when MERGING then
|
80
|
+
when PROCESSING then action.process
|
81
|
+
when SPLITTING then action.split
|
82
|
+
when MERGING then action.merge
|
79
83
|
else raise Error::StatusUnspecified, "work units must specify their status"
|
80
84
|
end
|
81
85
|
end
|
@@ -83,7 +87,7 @@ module CloudCrowd
|
|
83
87
|
rescue Exception => e
|
84
88
|
fail_work_unit(e)
|
85
89
|
ensure
|
86
|
-
|
90
|
+
action.cleanup_work_directory if action
|
87
91
|
end
|
88
92
|
end
|
89
93
|
@worker_thread.join
|
@@ -91,9 +95,26 @@ module CloudCrowd
|
|
91
95
|
|
92
96
|
# Wraps run_work_unit to benchmark the execution time, if requested.
|
93
97
|
def run
|
94
|
-
|
95
|
-
|
96
|
-
|
98
|
+
trap_signals
|
99
|
+
log "starting #{display_work_unit}"
|
100
|
+
return run_work_unit unless @unit['options']['benchmark']
|
101
|
+
log("ran #{display_work_unit} in " + Benchmark.measure { run_work_unit }.to_s)
|
102
|
+
end
|
103
|
+
|
104
|
+
# There are some potentially important attributes of the WorkUnit that we'd
|
105
|
+
# like to pass into the Action -- in case it needs to know them. They will
|
106
|
+
# always be made available in the options hash.
|
107
|
+
def enhanced_unit_options
|
108
|
+
@unit['options'].merge({
|
109
|
+
'job_id' => @unit['job_id'],
|
110
|
+
'work_unit_id' => @unit['id'],
|
111
|
+
'attempts' => @unit['attempts']
|
112
|
+
})
|
113
|
+
end
|
114
|
+
|
115
|
+
# How long has this worker been running for?
|
116
|
+
def time_taken
|
117
|
+
Time.now - @start_time
|
97
118
|
end
|
98
119
|
|
99
120
|
|
@@ -103,20 +124,8 @@ module CloudCrowd
|
|
103
124
|
# regardless of success or failure.
|
104
125
|
def base_params
|
105
126
|
{ :pid => @pid,
|
106
|
-
:id => @
|
107
|
-
:time =>
|
108
|
-
end
|
109
|
-
|
110
|
-
# Extract the Worker's instance variables from a WorkUnit's JSON.
|
111
|
-
def setup_work_unit(unit)
|
112
|
-
return false unless unit
|
113
|
-
@start_time = Time.now
|
114
|
-
@action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
|
115
|
-
@options['job_id'] = unit['job_id']
|
116
|
-
@options['work_unit_id'] = unit['id']
|
117
|
-
@options['attempts'] ||= unit['attempts']
|
118
|
-
log "fetched #{display_work_unit}"
|
119
|
-
return true
|
127
|
+
:id => @unit['id'],
|
128
|
+
:time => time_taken }
|
120
129
|
end
|
121
130
|
|
122
131
|
# Log a message to the daemon log. Includes PID for identification.
|
@@ -4,6 +4,7 @@ require 'test_helper'
|
|
4
4
|
class FailingWorkUnitsTest < Test::Unit::TestCase
|
5
5
|
|
6
6
|
should "retry work units when they fail" do
|
7
|
+
WorkUnit.expects(:distribute_to_nodes).returns(true)
|
7
8
|
browser = Rack::Test::Session.new(Rack::MockSession.new(CloudCrowd::Server))
|
8
9
|
|
9
10
|
browser.post '/jobs', :job => {
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class NodeAcceptanceTest < Test::Unit::TestCase
|
4
|
+
|
5
|
+
include Rack::Test::Methods
|
6
|
+
|
7
|
+
def app
|
8
|
+
CloudCrowd::Node
|
9
|
+
end
|
10
|
+
|
11
|
+
context "The CloudCrowd::Node (Sinatra)" do
|
12
|
+
|
13
|
+
should "have a heartbeat" do
|
14
|
+
get '/heartbeat'
|
15
|
+
assert last_response.body == 'buh-bump'
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
@@ -46,6 +46,7 @@ class ServerTest < Test::Unit::TestCase
|
|
46
46
|
end
|
47
47
|
|
48
48
|
should "be able to create a job" do
|
49
|
+
WorkUnit.expects(:distribute_to_nodes).returns(true)
|
49
50
|
post('/jobs', :job => '{"action":"graphics_magick","inputs":["http://www.google.com/"]}')
|
50
51
|
assert last_response.ok?
|
51
52
|
job_info = JSON.parse(last_response.body)
|
@@ -5,16 +5,13 @@ class WordCountTest < Test::Unit::TestCase
|
|
5
5
|
context "the word_count action" do
|
6
6
|
|
7
7
|
setup do
|
8
|
+
WorkUnit.expects(:distribute_to_nodes).returns(true)
|
8
9
|
@asset_store = AssetStore.new
|
9
10
|
@browser = Rack::Test::Session.new(Rack::MockSession.new(CloudCrowd::Server))
|
10
11
|
@browser.put('/worker', :name => 'test_worker', :thread_status => 'sleeping')
|
11
12
|
post_job_to_count_words_in_this_file
|
12
13
|
@job_id = JSON.parse(@browser.last_response.body)['id']
|
13
14
|
end
|
14
|
-
|
15
|
-
teardown do
|
16
|
-
CloudCrowd::Job.destroy_all
|
17
|
-
end
|
18
15
|
|
19
16
|
should "be able to create a word_count job" do
|
20
17
|
assert @browser.last_response.ok?
|
@@ -26,7 +23,7 @@ class WordCountTest < Test::Unit::TestCase
|
|
26
23
|
should "be able to perform the processing stage of a word_count" do
|
27
24
|
action = CloudCrowd.actions['word_count'].new(1, "file://#{File.expand_path(__FILE__)}", {}, @asset_store)
|
28
25
|
count = action.process
|
29
|
-
assert count ==
|
26
|
+
assert count == 101
|
30
27
|
end
|
31
28
|
|
32
29
|
end
|
data/test/blueprints.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
Sham.url { Faker::Internet.domain_name + "/" + Faker::Internet.domain_word + ".jpg" }
|
2
|
+
Sham.host { Faker::Internet.domain_name + '.local' }
|
2
3
|
|
3
4
|
CloudCrowd::Job.blueprint do
|
4
5
|
status { CloudCrowd::PROCESSING }
|
@@ -8,9 +9,17 @@ CloudCrowd::Job.blueprint do
|
|
8
9
|
email { 'noone@example.com' }
|
9
10
|
end
|
10
11
|
|
12
|
+
CloudCrowd::NodeRecord.blueprint do
|
13
|
+
host
|
14
|
+
ip_address { '127.0.0.1' }
|
15
|
+
port { 6093 }
|
16
|
+
enabled_actions { 'graphics_magick,word_count' }
|
17
|
+
max_workers { 3 }
|
18
|
+
end
|
19
|
+
|
11
20
|
CloudCrowd::WorkUnit.blueprint do
|
12
|
-
job
|
13
|
-
status
|
14
|
-
input
|
15
|
-
action
|
21
|
+
job { CloudCrowd::Job.make }
|
22
|
+
status { CloudCrowd::PROCESSING }
|
23
|
+
input { '{"key":"value"}' }
|
24
|
+
action { 'graphics_magick' }
|
16
25
|
end
|
data/test/config/database.yml
CHANGED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class NodeUnitTest < Test::Unit::TestCase
|
4
|
+
|
5
|
+
context "A Node" do
|
6
|
+
|
7
|
+
setup do
|
8
|
+
@node = Node.new(11011).instance_variable_get(:@app)
|
9
|
+
end
|
10
|
+
|
11
|
+
should "instantiate correctly" do
|
12
|
+
assert @node.server.to_s == "http://localhost:9173"
|
13
|
+
assert @node.port == 11011
|
14
|
+
assert @node.host == Socket.gethostname
|
15
|
+
assert @node.enabled_actions.length > 2
|
16
|
+
assert @node.asset_store.is_a? AssetStore::FilesystemStore
|
17
|
+
end
|
18
|
+
|
19
|
+
should "trap signals and launch a server at start" do
|
20
|
+
Signal.expects(:trap).times(3)
|
21
|
+
Thin::Server.expects(:start)
|
22
|
+
@node.expects(:check_in)
|
23
|
+
@node.start
|
24
|
+
end
|
25
|
+
|
26
|
+
should "be able to determine if the node is overloaded" do
|
27
|
+
assert !@node.overloaded?
|
28
|
+
@node.instance_variable_set :@max_load, 0.01
|
29
|
+
assert @node.overloaded?
|
30
|
+
@node.instance_variable_set :@max_load, nil
|
31
|
+
assert !@node.overloaded?
|
32
|
+
@node.instance_variable_set :@min_memory, 8000
|
33
|
+
assert @node.overloaded?
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class NodeRecordTest < Test::Unit::TestCase
|
4
|
+
|
5
|
+
context "A NodeRecord" do
|
6
|
+
|
7
|
+
setup do
|
8
|
+
@node = CloudCrowd::NodeRecord.make
|
9
|
+
end
|
10
|
+
|
11
|
+
subject { @node }
|
12
|
+
|
13
|
+
should_have_many :work_units
|
14
|
+
|
15
|
+
should_validate_presence_of :host, :ip_address, :port, :enabled_actions
|
16
|
+
|
17
|
+
should "be available" do
|
18
|
+
assert NodeRecord.available.map(&:id).include? @node.id
|
19
|
+
end
|
20
|
+
|
21
|
+
should "know its enabled actions" do
|
22
|
+
assert @node.actions.include? 'graphics_magick'
|
23
|
+
assert @node.actions.include? 'word_count'
|
24
|
+
end
|
25
|
+
|
26
|
+
should "know if the node is busy" do
|
27
|
+
assert !@node.busy?
|
28
|
+
assert @node.display_status == 'available'
|
29
|
+
(@node.max_workers + 1).times { WorkUnit.make(:node_record => @node) }
|
30
|
+
assert @node.busy?
|
31
|
+
assert @node.display_status == 'busy'
|
32
|
+
@node.release_work_units
|
33
|
+
assert !@node.busy?
|
34
|
+
end
|
35
|
+
|
36
|
+
should "be reachable at a URL" do
|
37
|
+
assert !!URI.parse(@node.url)
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class WorkerTest < Test::Unit::TestCase
|
4
|
+
|
5
|
+
context "A CloudCrowd::Worker" do
|
6
|
+
|
7
|
+
setup do
|
8
|
+
@node = Node.new.instance_variable_get(:@app)
|
9
|
+
@unit = WorkUnit.make
|
10
|
+
@worker = Worker.new(@node, JSON.parse(@unit.to_json))
|
11
|
+
end
|
12
|
+
|
13
|
+
should "instantiate correctly" do
|
14
|
+
assert @worker.pid == $$
|
15
|
+
assert @worker.unit['id'] == @unit.id
|
16
|
+
assert @worker.status == @unit.status
|
17
|
+
assert @worker.node == @node
|
18
|
+
assert @worker.time_taken > 0
|
19
|
+
end
|
20
|
+
|
21
|
+
should "be able to retry operations that must succeed" do
|
22
|
+
@worker.instance_variable_set :@retry_wait, 0.01
|
23
|
+
@worker.expects(:log).at_least(3)
|
24
|
+
tries = 0
|
25
|
+
@worker.keep_trying_to("do something critical") do
|
26
|
+
tries += 1;
|
27
|
+
raise 'hell' unless tries > 3
|
28
|
+
assert "made it through"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
should "be able to run an action and try to complete it" do
|
33
|
+
GraphicsMagick.any_instance.expects(:process).returns('the answer')
|
34
|
+
GraphicsMagick.any_instance.expects(:cleanup_work_directory)
|
35
|
+
@worker.expects(:complete_work_unit).with({'output' => 'the answer'}.to_json)
|
36
|
+
@worker.run_work_unit
|
37
|
+
end
|
38
|
+
|
39
|
+
should "enchance the options that an action receives with extra info" do
|
40
|
+
opts = @worker.enhanced_unit_options
|
41
|
+
assert opts['work_unit_id'] == @unit.id
|
42
|
+
assert opts['job_id'] == @unit.job.id
|
43
|
+
assert opts['attempts'] == @unit.attempts
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: documentcloud-cloud-crowd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-09-
|
12
|
+
date: 2009-09-18 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -182,8 +182,9 @@ files:
|
|
182
182
|
- public/js/flot.js
|
183
183
|
- public/js/jquery.js
|
184
184
|
- README
|
185
|
-
- test/acceptance/
|
185
|
+
- test/acceptance/test_node.rb
|
186
186
|
- test/acceptance/test_failing_work_units.rb
|
187
|
+
- test/acceptance/test_server.rb
|
187
188
|
- test/acceptance/test_word_count.rb
|
188
189
|
- test/blueprints.rb
|
189
190
|
- test/config/config.ru
|
@@ -193,7 +194,10 @@ files:
|
|
193
194
|
- test/test_helper.rb
|
194
195
|
- test/unit/test_action.rb
|
195
196
|
- test/unit/test_configuration.rb
|
197
|
+
- test/unit/test_node.rb
|
198
|
+
- test/unit/test_node_record.rb
|
196
199
|
- test/unit/test_job.rb
|
200
|
+
- test/unit/test_worker.rb
|
197
201
|
- test/unit/test_work_unit.rb
|
198
202
|
- views/operations_center.erb
|
199
203
|
has_rdoc: true
|