documentcloud-cloud-crowd 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/cloud-crowd.gemspec +7 -3
- data/config/config.example.yml +10 -3
- data/config/database.example.yml +7 -1
- data/lib/cloud-crowd.rb +2 -2
- data/lib/cloud_crowd/action.rb +16 -4
- data/lib/cloud_crowd/exceptions.rb +12 -1
- data/lib/cloud_crowd/models/job.rb +6 -5
- data/lib/cloud_crowd/models/node_record.rb +23 -12
- data/lib/cloud_crowd/models/work_unit.rb +9 -3
- data/lib/cloud_crowd/node.rb +79 -10
- data/lib/cloud_crowd/schema.rb +5 -4
- data/lib/cloud_crowd/worker.rb +43 -34
- data/test/acceptance/test_failing_work_units.rb +1 -0
- data/test/acceptance/test_node.rb +20 -0
- data/test/acceptance/test_server.rb +1 -0
- data/test/acceptance/test_word_count.rb +2 -5
- data/test/blueprints.rb +13 -4
- data/test/config/database.yml +3 -6
- data/test/unit/test_node.rb +38 -0
- data/test/unit/test_node_record.rb +42 -0
- data/test/unit/test_worker.rb +48 -0
- metadata +7 -3
data/cloud-crowd.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'cloud-crowd'
|
3
|
-
s.version = '0.2.
|
4
|
-
s.date = '2009-09-
|
3
|
+
s.version = '0.2.1' # Keep version in sync with cloud-crowd.rb
|
4
|
+
s.date = '2009-09-18'
|
5
5
|
|
6
6
|
s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
|
7
7
|
s.summary = "Parallel Processing for the Rest of Us"
|
@@ -94,8 +94,9 @@ public/js/excanvas.js
|
|
94
94
|
public/js/flot.js
|
95
95
|
public/js/jquery.js
|
96
96
|
README
|
97
|
-
test/acceptance/
|
97
|
+
test/acceptance/test_node.rb
|
98
98
|
test/acceptance/test_failing_work_units.rb
|
99
|
+
test/acceptance/test_server.rb
|
99
100
|
test/acceptance/test_word_count.rb
|
100
101
|
test/blueprints.rb
|
101
102
|
test/config/config.ru
|
@@ -105,7 +106,10 @@ test/config/actions/failure_testing.rb
|
|
105
106
|
test/test_helper.rb
|
106
107
|
test/unit/test_action.rb
|
107
108
|
test/unit/test_configuration.rb
|
109
|
+
test/unit/test_node.rb
|
110
|
+
test/unit/test_node_record.rb
|
108
111
|
test/unit/test_job.rb
|
112
|
+
test/unit/test_worker.rb
|
109
113
|
test/unit/test_work_unit.rb
|
110
114
|
views/operations_center.erb
|
111
115
|
)
|
data/config/config.example.yml
CHANGED
@@ -1,14 +1,21 @@
|
|
1
1
|
# The URL where you're planning on running the central server/queue/database.
|
2
2
|
:central_server: http://localhost:9173
|
3
3
|
|
4
|
-
#
|
5
|
-
#
|
6
|
-
#
|
4
|
+
# The following settings allow you to control the number of workers that can run
|
5
|
+
# on a given node, to prevent the node from becoming overloaded. 'max_workers'
|
6
|
+
# is a simple cap on the maximum number of workers a node is allowed to run
|
7
|
+
# concurrently. 'max_load' is the maximum (one-minute) load average, above which
|
8
|
+
# a node will refuse to take new work. 'min_free_memory' is the minimum amount
|
9
|
+
# of free RAM (in megabytes) a node is allowed to have, below which no new
|
10
|
+
# workers are run. These settings may be used in any combination.
|
7
11
|
:max_workers: 5
|
12
|
+
# :max_load: 5.0
|
13
|
+
# :min_free_memory: 150
|
8
14
|
|
9
15
|
# The storage back-end that you'd like to use for intermediate and final results
|
10
16
|
# of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
|
11
17
|
# be used in development, on single-machine installations, or networked drives.
|
18
|
+
# If you *are* developing an action, filesystem is certainly faster and easier.
|
12
19
|
:storage: s3
|
13
20
|
|
14
21
|
# Please provide your AWS credentials for S3 storage of job output.
|
data/config/database.example.yml
CHANGED
@@ -6,4 +6,10 @@
|
|
6
6
|
:username: root
|
7
7
|
:password:
|
8
8
|
:socket: /tmp/mysql.sock
|
9
|
-
:database: cloud_crowd
|
9
|
+
:database: cloud_crowd
|
10
|
+
|
11
|
+
# If you'd prefer to use an SQLite database instead, the following configuration
|
12
|
+
# will do nicely:
|
13
|
+
#
|
14
|
+
# :adapter: sqlite3
|
15
|
+
# :database: cloud_crowd.db
|
data/lib/cloud-crowd.rb
CHANGED
@@ -43,10 +43,10 @@ module CloudCrowd
|
|
43
43
|
autoload :WorkUnit, 'cloud_crowd/models'
|
44
44
|
|
45
45
|
# Keep this version in sync with the gemspec.
|
46
|
-
VERSION = '0.2.
|
46
|
+
VERSION = '0.2.1'
|
47
47
|
|
48
48
|
# Increment the schema version when there's a backwards incompatible change.
|
49
|
-
SCHEMA_VERSION =
|
49
|
+
SCHEMA_VERSION = 3
|
50
50
|
|
51
51
|
# Root directory of the CloudCrowd gem.
|
52
52
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
data/lib/cloud_crowd/action.rb
CHANGED
@@ -12,6 +12,9 @@ module CloudCrowd
|
|
12
12
|
#
|
13
13
|
# All actions have use of an individual +work_directory+, for scratch files,
|
14
14
|
# and spend their duration inside of it, so relative paths work well.
|
15
|
+
#
|
16
|
+
# Note that Actions inherit a backticks (`) method that raises an Exception
|
17
|
+
# if the external command fails.
|
15
18
|
class Action
|
16
19
|
|
17
20
|
FILE_URL = /\Afile:\/\//
|
@@ -33,7 +36,7 @@ module CloudCrowd
|
|
33
36
|
|
34
37
|
# Each Action subclass must implement a +process+ method, overriding this.
|
35
38
|
def process
|
36
|
-
raise NotImplementedError
|
39
|
+
raise NotImplementedError, "CloudCrowd::Actions must override 'process' with their own processing code."
|
37
40
|
end
|
38
41
|
|
39
42
|
# Download a file to the specified path.
|
@@ -66,6 +69,15 @@ module CloudCrowd
|
|
66
69
|
FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
|
67
70
|
end
|
68
71
|
|
72
|
+
# Actions have a backticks command that raises a CommandFailed exception
|
73
|
+
# on failure, so that processing doesn't just blithely continue.
|
74
|
+
def `(command)
|
75
|
+
result = super(command)
|
76
|
+
exit_code = $?.to_i
|
77
|
+
raise Error::CommandFailed.new(result, exit_code) unless exit_code == 0
|
78
|
+
result
|
79
|
+
end
|
80
|
+
|
69
81
|
|
70
82
|
private
|
71
83
|
|
@@ -77,7 +89,7 @@ module CloudCrowd
|
|
77
89
|
end
|
78
90
|
|
79
91
|
# The directory prefix to use for both local and S3 storage.
|
80
|
-
# [
|
92
|
+
# [action]/job_[job_id]/unit_[work_unit_id]
|
81
93
|
def storage_prefix
|
82
94
|
path_parts = []
|
83
95
|
path_parts << Inflector.underscore(self.class)
|
@@ -93,9 +105,9 @@ module CloudCrowd
|
|
93
105
|
|
94
106
|
# If the input is a URL, download the file before beginning processing.
|
95
107
|
def download_input
|
108
|
+
input_is_url = !!URI.parse(@input) rescue false
|
109
|
+
return unless input_is_url
|
96
110
|
Dir.chdir(@work_directory) do
|
97
|
-
input_is_url = !!URI.parse(@input) rescue false
|
98
|
-
return unless input_is_url
|
99
111
|
@input_path = File.join(@work_directory, safe_filename(@input))
|
100
112
|
@file_name = File.basename(@input_path, File.extname(@input_path))
|
101
113
|
download(@input, @input_path)
|
@@ -11,7 +11,7 @@ module CloudCrowd
|
|
11
11
|
class ActionNotFound < Error
|
12
12
|
end
|
13
13
|
|
14
|
-
# StorageNotFound is raised when config.yml specifies a storage back
|
14
|
+
# StorageNotFound is raised when config.yml specifies a storage back-end that
|
15
15
|
# doesn't exist.
|
16
16
|
class StorageNotFound < Error
|
17
17
|
end
|
@@ -30,6 +30,17 @@ module CloudCrowd
|
|
30
30
|
class MissingConfiguration < Error
|
31
31
|
end
|
32
32
|
|
33
|
+
# CommandFailed is raised when an action shells out, and the external
|
34
|
+
# command returns a non-zero exit code.
|
35
|
+
class CommandFailed < Error
|
36
|
+
attr_reader :exit_code
|
37
|
+
|
38
|
+
def initialize(message, exit_code)
|
39
|
+
super(message)
|
40
|
+
@exit_code = exit_code
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
33
44
|
end
|
34
45
|
|
35
46
|
end
|
@@ -15,8 +15,7 @@ module CloudCrowd
|
|
15
15
|
after_create :queue_for_workers
|
16
16
|
before_destroy :cleanup_assets
|
17
17
|
|
18
|
-
# Create a Job from an incoming JSON
|
19
|
-
# TODO: Think about XML support.
|
18
|
+
# Create a Job from an incoming JSON request, and add it to the queue.
|
20
19
|
def self.create_from_request(h)
|
21
20
|
self.create(
|
22
21
|
:inputs => h['inputs'].to_json,
|
@@ -41,7 +40,8 @@ module CloudCrowd
|
|
41
40
|
self
|
42
41
|
end
|
43
42
|
|
44
|
-
# Transition this Job's status to the appropriate next
|
43
|
+
# Transition this Job's current status to the appropriate next one, based
|
44
|
+
# on the state of the WorkUnits and the nature of the Action.
|
45
45
|
def set_next_status
|
46
46
|
update_attribute(:status,
|
47
47
|
any_work_units_failed? ? FAILED :
|
@@ -66,8 +66,9 @@ module CloudCrowd
|
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
69
|
-
# Cleaning up after a job will remove all of its files from S3
|
70
|
-
# a Job
|
69
|
+
# Cleaning up after a job will remove all of its files from S3 or the
|
70
|
+
# filesystem. Destroying a Job will cleanup_assets first. Run this in a
|
71
|
+
# separate thread to get out of the transaction's way.
|
71
72
|
# TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
|
72
73
|
def cleanup_assets
|
73
74
|
AssetStore.new.cleanup(self)
|
@@ -7,9 +7,9 @@ module CloudCrowd
|
|
7
7
|
|
8
8
|
has_many :work_units
|
9
9
|
|
10
|
-
validates_presence_of :host, :ip_address, :port
|
10
|
+
validates_presence_of :host, :ip_address, :port, :enabled_actions
|
11
11
|
|
12
|
-
|
12
|
+
after_destroy :redistribute_work_units
|
13
13
|
|
14
14
|
# Available Nodes haven't used up their maximum number of workers yet.
|
15
15
|
named_scope :available, {
|
@@ -23,6 +23,7 @@ module CloudCrowd
|
|
23
23
|
attrs = {
|
24
24
|
:ip_address => request.ip,
|
25
25
|
:port => params[:port],
|
26
|
+
:busy => params[:busy],
|
26
27
|
:max_workers => params[:max_workers],
|
27
28
|
:enabled_actions => params[:enabled_actions]
|
28
29
|
}
|
@@ -32,12 +33,17 @@ module CloudCrowd
|
|
32
33
|
# Dispatch a WorkUnit to this node. Places the node back at the end of
|
33
34
|
# the rotation. If we fail to send the WorkUnit, we consider the node to be
|
34
35
|
# down, and remove this record, freeing up all of its checked-out work units.
|
36
|
+
# If the Node responds that it's overloaded, we mark it as busy. Returns
|
37
|
+
# true if the WorkUnit was dispatched successfully.
|
35
38
|
def send_work_unit(unit)
|
36
39
|
result = node['/work'].post(:work_unit => unit.to_json)
|
37
40
|
unit.assign_to(self, JSON.parse(result)['pid'])
|
38
|
-
touch
|
39
|
-
rescue Errno::ECONNREFUSED
|
40
|
-
|
41
|
+
touch && true
|
42
|
+
rescue Errno::ECONNREFUSED # Couldn't post to node, assume it's gone away.
|
43
|
+
destroy && false
|
44
|
+
rescue RestClient::RequestFailed => e
|
45
|
+
raise e unless e.http_code == 503 && e.http_body == Node::OVERLOADED_MESSAGE
|
46
|
+
update_attribute(:busy, true) && false
|
41
47
|
end
|
42
48
|
|
43
49
|
# What Actions is this Node able to run?
|
@@ -45,9 +51,10 @@ module CloudCrowd
|
|
45
51
|
enabled_actions.split(',')
|
46
52
|
end
|
47
53
|
|
48
|
-
# Is this Node too busy for more work?
|
54
|
+
# Is this Node too busy for more work? Determined by number of workers, or
|
55
|
+
# the Node's load average, as configured in config.yml.
|
49
56
|
def busy?
|
50
|
-
max_workers && work_units.count >= max_workers
|
57
|
+
busy || (max_workers && work_units.count >= max_workers)
|
51
58
|
end
|
52
59
|
|
53
60
|
# The URL at which this Node may be reached.
|
@@ -72,6 +79,11 @@ module CloudCrowd
|
|
72
79
|
work_units.all(:select => 'worker_pid').map(&:worker_pid)
|
73
80
|
end
|
74
81
|
|
82
|
+
# Release all of this Node's WorkUnits for other nodes to take.
|
83
|
+
def release_work_units
|
84
|
+
WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
|
85
|
+
end
|
86
|
+
|
75
87
|
# The JSON representation of a NodeRecord includes its worker_pids.
|
76
88
|
def to_json(opts={})
|
77
89
|
{ 'host' => host,
|
@@ -83,11 +95,10 @@ module CloudCrowd
|
|
83
95
|
|
84
96
|
private
|
85
97
|
|
86
|
-
# When a Node
|
87
|
-
#
|
88
|
-
|
89
|
-
|
90
|
-
WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
|
98
|
+
# When a Node exits, release its WorkUnits and redistribute them to others.
|
99
|
+
# Redistribute in a separate thread to avoid delaying shutdown.
|
100
|
+
def redistribute_work_units
|
101
|
+
release_work_units
|
91
102
|
Thread.new { WorkUnit.distribute_to_nodes }
|
92
103
|
end
|
93
104
|
|
@@ -28,7 +28,7 @@ module CloudCrowd
|
|
28
28
|
until work_units.empty? do
|
29
29
|
node = available_nodes.shift
|
30
30
|
unit = work_units.first
|
31
|
-
break unless node
|
31
|
+
break unless node && unit
|
32
32
|
next unless node.actions.include? unit.action
|
33
33
|
sent = node.send_work_unit(unit)
|
34
34
|
if sent
|
@@ -51,6 +51,12 @@ module CloudCrowd
|
|
51
51
|
WorkUnit.reserved.update_all('reservation = null')
|
52
52
|
end
|
53
53
|
|
54
|
+
# Cancels all outstanding WorkUnit reservations for all processes. (Useful
|
55
|
+
# in the console for debugging.)
|
56
|
+
def self.cancel_all_reservations
|
57
|
+
WorkUnit.update_all('reservation = null')
|
58
|
+
end
|
59
|
+
|
54
60
|
# Look up a WorkUnit by the worker that's currently processing it. Specified
|
55
61
|
# by <tt>pid@host</tt>.
|
56
62
|
def self.find_by_worker_name(name)
|
@@ -74,7 +80,7 @@ module CloudCrowd
|
|
74
80
|
WorkUnit.start(job, action, new_input, PROCESSING)
|
75
81
|
end
|
76
82
|
self.destroy
|
77
|
-
job.set_next_status if job.done_splitting?
|
83
|
+
job.set_next_status if job && job.done_splitting?
|
78
84
|
else
|
79
85
|
update_attributes({
|
80
86
|
:status => SUCCEEDED,
|
@@ -84,7 +90,7 @@ module CloudCrowd
|
|
84
90
|
:output => result,
|
85
91
|
:time => time_taken
|
86
92
|
})
|
87
|
-
job.check_for_completion
|
93
|
+
job && job.check_for_completion
|
88
94
|
end
|
89
95
|
end
|
90
96
|
|
data/lib/cloud_crowd/node.rb
CHANGED
@@ -10,9 +10,24 @@ module CloudCrowd
|
|
10
10
|
|
11
11
|
# A Node's default port. You only run a single node per machine, so they
|
12
12
|
# can all use the same port without any problems.
|
13
|
-
DEFAULT_PORT
|
13
|
+
DEFAULT_PORT = 9063
|
14
14
|
|
15
|
-
|
15
|
+
# A list of regex scrapers, which let us extract the one-minute load
|
16
|
+
# average and the amount of free memory on different flavors of UNIX.
|
17
|
+
|
18
|
+
SCRAPE_UPTIME = /\d+\.\d+/
|
19
|
+
SCRAPE_LINUX_MEMORY = /MemFree:\s+(\d+) kB/
|
20
|
+
SCRAPE_MAC_MEMORY = /Pages free:\s+(\d+)./
|
21
|
+
SCRAPE_MAC_PAGE = /page size of (\d+) bytes/
|
22
|
+
|
23
|
+
# The interval at which the node monitors the machine's load and memory use
|
24
|
+
# (if configured to do so in config.yml).
|
25
|
+
MONITOR_INTERVAL = 3
|
26
|
+
|
27
|
+
# The response sent back when this node is overloaded.
|
28
|
+
OVERLOADED_MESSAGE = 'Node Overloaded'
|
29
|
+
|
30
|
+
attr_reader :asset_store, :enabled_actions, :host, :port, :server
|
16
31
|
|
17
32
|
set :root, ROOT
|
18
33
|
set :authorization_realm, "CloudCrowd"
|
@@ -35,14 +50,15 @@ module CloudCrowd
|
|
35
50
|
end
|
36
51
|
|
37
52
|
# Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
|
53
|
+
# Returns a 503 if this Node is overloaded.
|
38
54
|
post '/work' do
|
39
|
-
|
55
|
+
throw :halt, [503, OVERLOADED_MESSAGE] if @overloaded
|
56
|
+
pid = fork { Worker.new(self, JSON.parse(params[:work_unit])).run }
|
40
57
|
Process.detach(pid)
|
41
58
|
json :pid => pid
|
42
59
|
end
|
43
60
|
|
44
|
-
#
|
45
|
-
# incoming WorkUnits.
|
61
|
+
# When creating a node, specify the port it should run on.
|
46
62
|
def initialize(port=DEFAULT_PORT)
|
47
63
|
require 'json'
|
48
64
|
@server = CloudCrowd.central_server
|
@@ -50,25 +66,35 @@ module CloudCrowd
|
|
50
66
|
@enabled_actions = CloudCrowd.actions.keys
|
51
67
|
@asset_store = AssetStore.new
|
52
68
|
@port = port || DEFAULT_PORT
|
53
|
-
|
69
|
+
@overloaded = false
|
70
|
+
@max_load = CloudCrowd.config[:max_load]
|
71
|
+
@min_memory = CloudCrowd.config[:min_free_memory]
|
72
|
+
start unless test?
|
73
|
+
end
|
74
|
+
|
75
|
+
# Starting up a Node registers with the central server and begins to listen
|
76
|
+
# for incoming WorkUnits.
|
77
|
+
def start
|
54
78
|
trap_signals
|
55
79
|
start_server
|
56
|
-
|
80
|
+
monitor_system if @max_load || @min_memory
|
81
|
+
check_in(true)
|
57
82
|
@server_thread.join
|
58
83
|
end
|
59
84
|
|
60
85
|
# Checking in with the central server informs it of the location and
|
61
86
|
# configuration of this Node. If it can't check-in, there's no point in
|
62
87
|
# starting.
|
63
|
-
def check_in
|
88
|
+
def check_in(critical=false)
|
64
89
|
@server["/node/#{@host}"].put(
|
65
90
|
:port => @port,
|
91
|
+
:busy => @overloaded,
|
66
92
|
:max_workers => CloudCrowd.config[:max_workers],
|
67
93
|
:enabled_actions => @enabled_actions.join(',')
|
68
94
|
)
|
69
95
|
rescue Errno::ECONNREFUSED
|
70
|
-
puts "Failed to connect to the central server (#{@server.to_s})
|
71
|
-
raise SystemExit
|
96
|
+
puts "Failed to connect to the central server (#{@server.to_s})."
|
97
|
+
raise SystemExit if critical
|
72
98
|
end
|
73
99
|
|
74
100
|
# Before exiting, the Node checks out with the central server, releasing all
|
@@ -77,6 +103,33 @@ module CloudCrowd
|
|
77
103
|
@server["/node/#{@host}"].delete
|
78
104
|
end
|
79
105
|
|
106
|
+
# Is the node overloaded? If configured, checks if the load average is
|
107
|
+
# greater than 'max_load', or if the available RAM is less than
|
108
|
+
# 'min_free_memory'.
|
109
|
+
def overloaded?
|
110
|
+
(@max_load && load_average > @max_load) ||
|
111
|
+
(@min_memory && free_memory < @min_memory)
|
112
|
+
end
|
113
|
+
|
114
|
+
# The current one-minute load average.
|
115
|
+
def load_average
|
116
|
+
`uptime`.match(SCRAPE_UPTIME).to_s.to_f
|
117
|
+
end
|
118
|
+
|
119
|
+
# The current amount of free memory in megabytes.
|
120
|
+
def free_memory
|
121
|
+
case RUBY_PLATFORM
|
122
|
+
when /darwin/
|
123
|
+
stats = `vm_stat`
|
124
|
+
@mac_page_size ||= stats.match(SCRAPE_MAC_PAGE)[1].to_f / 1048576.0
|
125
|
+
stats.match(SCRAPE_MAC_MEMORY)[1].to_f * @mac_page_size
|
126
|
+
when /linux/
|
127
|
+
`cat /proc/meminfo`.match(SCRAPE_LINUX_MEMORY)[1].to_f / 1024.0
|
128
|
+
else
|
129
|
+
raise NotImplementedError, "'min_free_memory' is not yet implemented on your platform"
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
80
133
|
|
81
134
|
private
|
82
135
|
|
@@ -87,6 +140,20 @@ module CloudCrowd
|
|
87
140
|
end
|
88
141
|
end
|
89
142
|
|
143
|
+
# Launch a monitoring thread that periodically checks the node's load
|
144
|
+
# average and the amount of free memory remaining. If we transition out of
|
145
|
+
# the overloaded state, let central know.
|
146
|
+
def monitor_system
|
147
|
+
@monitor_thread = Thread.new do
|
148
|
+
loop do
|
149
|
+
was_overloaded = @overloaded
|
150
|
+
@overloaded = overloaded?
|
151
|
+
check_in if was_overloaded && !@overloaded
|
152
|
+
sleep MONITOR_INTERVAL
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
90
157
|
# Trap exit signals in order to shut down cleanly.
|
91
158
|
def trap_signals
|
92
159
|
Signal.trap('INT') { shut_down }
|
@@ -96,7 +163,9 @@ module CloudCrowd
|
|
96
163
|
|
97
164
|
# At shut down, de-register with the central server before exiting.
|
98
165
|
def shut_down
|
166
|
+
@monitor_thread.kill if @monitor_thread
|
99
167
|
check_out
|
168
|
+
@server_thread.kill
|
100
169
|
Process.exit
|
101
170
|
end
|
102
171
|
|
data/lib/cloud_crowd/schema.rb
CHANGED
@@ -15,10 +15,11 @@ ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do
|
|
15
15
|
end
|
16
16
|
|
17
17
|
create_table "node_records", :force => true do |t|
|
18
|
-
t.string "host",
|
19
|
-
t.string "ip_address",
|
20
|
-
t.integer "port",
|
21
|
-
t.string "enabled_actions", :default => '',
|
18
|
+
t.string "host", :null => false
|
19
|
+
t.string "ip_address", :null => false
|
20
|
+
t.integer "port", :null => false
|
21
|
+
t.string "enabled_actions", :default => '', :null => false
|
22
|
+
t.boolean "busy", :default => false, :null => false
|
22
23
|
t.integer "max_workers"
|
23
24
|
t.datetime "created_at"
|
24
25
|
t.datetime "updated_at"
|
data/lib/cloud_crowd/worker.rb
CHANGED
@@ -14,15 +14,16 @@ module CloudCrowd
|
|
14
14
|
# Wait five seconds to retry, after internal communication errors.
|
15
15
|
RETRY_WAIT = 5
|
16
16
|
|
17
|
-
attr_reader :
|
17
|
+
attr_reader :pid, :node, :unit, :status
|
18
18
|
|
19
|
-
# A new Worker
|
20
|
-
def initialize(node,
|
21
|
-
@
|
22
|
-
@
|
23
|
-
|
24
|
-
|
25
|
-
|
19
|
+
# A new Worker customizes itself to its WorkUnit at instantiation.
|
20
|
+
def initialize(node, unit)
|
21
|
+
@start_time = Time.now
|
22
|
+
@pid = $$
|
23
|
+
@node = node
|
24
|
+
@unit = unit
|
25
|
+
@status = @unit['status']
|
26
|
+
@retry_wait = RETRY_WAIT
|
26
27
|
end
|
27
28
|
|
28
29
|
# Return output to the central server, marking the WorkUnit done.
|
@@ -49,18 +50,20 @@ module CloudCrowd
|
|
49
50
|
def keep_trying_to(title)
|
50
51
|
begin
|
51
52
|
yield
|
53
|
+
rescue RestClient::ResourceNotFound => e
|
54
|
+
log "work unit ##{@unit['id']} doesn't exist. discarding..."
|
52
55
|
rescue Exception => e
|
53
|
-
log "failed to #{title} -- retry in #{
|
56
|
+
log "failed to #{title} -- retry in #{@retry_wait} seconds"
|
54
57
|
log e.message
|
55
58
|
log e.backtrace
|
56
|
-
sleep
|
59
|
+
sleep @retry_wait
|
57
60
|
retry
|
58
61
|
end
|
59
62
|
end
|
60
63
|
|
61
64
|
# Loggable details describing what the Worker is up to.
|
62
65
|
def display_work_unit
|
63
|
-
"unit ##{@
|
66
|
+
"unit ##{@unit['id']} (#{@unit['action']}/#{CloudCrowd.display_status(@status)})"
|
64
67
|
end
|
65
68
|
|
66
69
|
# Executes the WorkUnit by running the Action, catching all exceptions as
|
@@ -70,12 +73,13 @@ module CloudCrowd
|
|
70
73
|
@worker_thread = Thread.new do
|
71
74
|
begin
|
72
75
|
result = nil
|
73
|
-
|
74
|
-
|
76
|
+
action_class = CloudCrowd.actions[@unit['action']]
|
77
|
+
action = action_class.new(@status, @unit['input'], enhanced_unit_options, @node.asset_store)
|
78
|
+
Dir.chdir(action.work_directory) do
|
75
79
|
result = case @status
|
76
|
-
when PROCESSING then
|
77
|
-
when SPLITTING then
|
78
|
-
when MERGING then
|
80
|
+
when PROCESSING then action.process
|
81
|
+
when SPLITTING then action.split
|
82
|
+
when MERGING then action.merge
|
79
83
|
else raise Error::StatusUnspecified, "work units must specify their status"
|
80
84
|
end
|
81
85
|
end
|
@@ -83,7 +87,7 @@ module CloudCrowd
|
|
83
87
|
rescue Exception => e
|
84
88
|
fail_work_unit(e)
|
85
89
|
ensure
|
86
|
-
|
90
|
+
action.cleanup_work_directory if action
|
87
91
|
end
|
88
92
|
end
|
89
93
|
@worker_thread.join
|
@@ -91,9 +95,26 @@ module CloudCrowd
|
|
91
95
|
|
92
96
|
# Wraps run_work_unit to benchmark the execution time, if requested.
|
93
97
|
def run
|
94
|
-
|
95
|
-
|
96
|
-
|
98
|
+
trap_signals
|
99
|
+
log "starting #{display_work_unit}"
|
100
|
+
return run_work_unit unless @unit['options']['benchmark']
|
101
|
+
log("ran #{display_work_unit} in " + Benchmark.measure { run_work_unit }.to_s)
|
102
|
+
end
|
103
|
+
|
104
|
+
# There are some potentially important attributes of the WorkUnit that we'd
|
105
|
+
# like to pass into the Action -- in case it needs to know them. They will
|
106
|
+
# always be made available in the options hash.
|
107
|
+
def enhanced_unit_options
|
108
|
+
@unit['options'].merge({
|
109
|
+
'job_id' => @unit['job_id'],
|
110
|
+
'work_unit_id' => @unit['id'],
|
111
|
+
'attempts' => @unit['attempts']
|
112
|
+
})
|
113
|
+
end
|
114
|
+
|
115
|
+
# How long has this worker been running for?
|
116
|
+
def time_taken
|
117
|
+
Time.now - @start_time
|
97
118
|
end
|
98
119
|
|
99
120
|
|
@@ -103,20 +124,8 @@ module CloudCrowd
|
|
103
124
|
# regardless of success or failure.
|
104
125
|
def base_params
|
105
126
|
{ :pid => @pid,
|
106
|
-
:id => @
|
107
|
-
:time =>
|
108
|
-
end
|
109
|
-
|
110
|
-
# Extract the Worker's instance variables from a WorkUnit's JSON.
|
111
|
-
def setup_work_unit(unit)
|
112
|
-
return false unless unit
|
113
|
-
@start_time = Time.now
|
114
|
-
@action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
|
115
|
-
@options['job_id'] = unit['job_id']
|
116
|
-
@options['work_unit_id'] = unit['id']
|
117
|
-
@options['attempts'] ||= unit['attempts']
|
118
|
-
log "fetched #{display_work_unit}"
|
119
|
-
return true
|
127
|
+
:id => @unit['id'],
|
128
|
+
:time => time_taken }
|
120
129
|
end
|
121
130
|
|
122
131
|
# Log a message to the daemon log. Includes PID for identification.
|
@@ -4,6 +4,7 @@ require 'test_helper'
|
|
4
4
|
class FailingWorkUnitsTest < Test::Unit::TestCase
|
5
5
|
|
6
6
|
should "retry work units when they fail" do
|
7
|
+
WorkUnit.expects(:distribute_to_nodes).returns(true)
|
7
8
|
browser = Rack::Test::Session.new(Rack::MockSession.new(CloudCrowd::Server))
|
8
9
|
|
9
10
|
browser.post '/jobs', :job => {
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class NodeAcceptanceTest < Test::Unit::TestCase
|
4
|
+
|
5
|
+
include Rack::Test::Methods
|
6
|
+
|
7
|
+
def app
|
8
|
+
CloudCrowd::Node
|
9
|
+
end
|
10
|
+
|
11
|
+
context "The CloudCrowd::Node (Sinatra)" do
|
12
|
+
|
13
|
+
should "have a heartbeat" do
|
14
|
+
get '/heartbeat'
|
15
|
+
assert last_response.body == 'buh-bump'
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
@@ -46,6 +46,7 @@ class ServerTest < Test::Unit::TestCase
|
|
46
46
|
end
|
47
47
|
|
48
48
|
should "be able to create a job" do
|
49
|
+
WorkUnit.expects(:distribute_to_nodes).returns(true)
|
49
50
|
post('/jobs', :job => '{"action":"graphics_magick","inputs":["http://www.google.com/"]}')
|
50
51
|
assert last_response.ok?
|
51
52
|
job_info = JSON.parse(last_response.body)
|
@@ -5,16 +5,13 @@ class WordCountTest < Test::Unit::TestCase
|
|
5
5
|
context "the word_count action" do
|
6
6
|
|
7
7
|
setup do
|
8
|
+
WorkUnit.expects(:distribute_to_nodes).returns(true)
|
8
9
|
@asset_store = AssetStore.new
|
9
10
|
@browser = Rack::Test::Session.new(Rack::MockSession.new(CloudCrowd::Server))
|
10
11
|
@browser.put('/worker', :name => 'test_worker', :thread_status => 'sleeping')
|
11
12
|
post_job_to_count_words_in_this_file
|
12
13
|
@job_id = JSON.parse(@browser.last_response.body)['id']
|
13
14
|
end
|
14
|
-
|
15
|
-
teardown do
|
16
|
-
CloudCrowd::Job.destroy_all
|
17
|
-
end
|
18
15
|
|
19
16
|
should "be able to create a word_count job" do
|
20
17
|
assert @browser.last_response.ok?
|
@@ -26,7 +23,7 @@ class WordCountTest < Test::Unit::TestCase
|
|
26
23
|
should "be able to perform the processing stage of a word_count" do
|
27
24
|
action = CloudCrowd.actions['word_count'].new(1, "file://#{File.expand_path(__FILE__)}", {}, @asset_store)
|
28
25
|
count = action.process
|
29
|
-
assert count ==
|
26
|
+
assert count == 101
|
30
27
|
end
|
31
28
|
|
32
29
|
end
|
data/test/blueprints.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
Sham.url { Faker::Internet.domain_name + "/" + Faker::Internet.domain_word + ".jpg" }
|
2
|
+
Sham.host { Faker::Internet.domain_name + '.local' }
|
2
3
|
|
3
4
|
CloudCrowd::Job.blueprint do
|
4
5
|
status { CloudCrowd::PROCESSING }
|
@@ -8,9 +9,17 @@ CloudCrowd::Job.blueprint do
|
|
8
9
|
email { 'noone@example.com' }
|
9
10
|
end
|
10
11
|
|
12
|
+
CloudCrowd::NodeRecord.blueprint do
|
13
|
+
host
|
14
|
+
ip_address { '127.0.0.1' }
|
15
|
+
port { 6093 }
|
16
|
+
enabled_actions { 'graphics_magick,word_count' }
|
17
|
+
max_workers { 3 }
|
18
|
+
end
|
19
|
+
|
11
20
|
CloudCrowd::WorkUnit.blueprint do
|
12
|
-
job
|
13
|
-
status
|
14
|
-
input
|
15
|
-
action
|
21
|
+
job { CloudCrowd::Job.make }
|
22
|
+
status { CloudCrowd::PROCESSING }
|
23
|
+
input { '{"key":"value"}' }
|
24
|
+
action { 'graphics_magick' }
|
16
25
|
end
|
data/test/config/database.yml
CHANGED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class NodeUnitTest < Test::Unit::TestCase
|
4
|
+
|
5
|
+
context "A Node" do
|
6
|
+
|
7
|
+
setup do
|
8
|
+
@node = Node.new(11011).instance_variable_get(:@app)
|
9
|
+
end
|
10
|
+
|
11
|
+
should "instantiate correctly" do
|
12
|
+
assert @node.server.to_s == "http://localhost:9173"
|
13
|
+
assert @node.port == 11011
|
14
|
+
assert @node.host == Socket.gethostname
|
15
|
+
assert @node.enabled_actions.length > 2
|
16
|
+
assert @node.asset_store.is_a? AssetStore::FilesystemStore
|
17
|
+
end
|
18
|
+
|
19
|
+
should "trap signals and launch a server at start" do
|
20
|
+
Signal.expects(:trap).times(3)
|
21
|
+
Thin::Server.expects(:start)
|
22
|
+
@node.expects(:check_in)
|
23
|
+
@node.start
|
24
|
+
end
|
25
|
+
|
26
|
+
should "be able to determine if the node is overloaded" do
|
27
|
+
assert !@node.overloaded?
|
28
|
+
@node.instance_variable_set :@max_load, 0.01
|
29
|
+
assert @node.overloaded?
|
30
|
+
@node.instance_variable_set :@max_load, nil
|
31
|
+
assert !@node.overloaded?
|
32
|
+
@node.instance_variable_set :@min_memory, 8000
|
33
|
+
assert @node.overloaded?
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class NodeRecordTest < Test::Unit::TestCase
|
4
|
+
|
5
|
+
context "A NodeRecord" do
|
6
|
+
|
7
|
+
setup do
|
8
|
+
@node = CloudCrowd::NodeRecord.make
|
9
|
+
end
|
10
|
+
|
11
|
+
subject { @node }
|
12
|
+
|
13
|
+
should_have_many :work_units
|
14
|
+
|
15
|
+
should_validate_presence_of :host, :ip_address, :port, :enabled_actions
|
16
|
+
|
17
|
+
should "be available" do
|
18
|
+
assert NodeRecord.available.map(&:id).include? @node.id
|
19
|
+
end
|
20
|
+
|
21
|
+
should "know its enabled actions" do
|
22
|
+
assert @node.actions.include? 'graphics_magick'
|
23
|
+
assert @node.actions.include? 'word_count'
|
24
|
+
end
|
25
|
+
|
26
|
+
should "know if the node is busy" do
|
27
|
+
assert !@node.busy?
|
28
|
+
assert @node.display_status == 'available'
|
29
|
+
(@node.max_workers + 1).times { WorkUnit.make(:node_record => @node) }
|
30
|
+
assert @node.busy?
|
31
|
+
assert @node.display_status == 'busy'
|
32
|
+
@node.release_work_units
|
33
|
+
assert !@node.busy?
|
34
|
+
end
|
35
|
+
|
36
|
+
should "be reachable at a URL" do
|
37
|
+
assert !!URI.parse(@node.url)
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class WorkerTest < Test::Unit::TestCase
|
4
|
+
|
5
|
+
context "A CloudCrowd::Worker" do
|
6
|
+
|
7
|
+
setup do
|
8
|
+
@node = Node.new.instance_variable_get(:@app)
|
9
|
+
@unit = WorkUnit.make
|
10
|
+
@worker = Worker.new(@node, JSON.parse(@unit.to_json))
|
11
|
+
end
|
12
|
+
|
13
|
+
should "instantiate correctly" do
|
14
|
+
assert @worker.pid == $$
|
15
|
+
assert @worker.unit['id'] == @unit.id
|
16
|
+
assert @worker.status == @unit.status
|
17
|
+
assert @worker.node == @node
|
18
|
+
assert @worker.time_taken > 0
|
19
|
+
end
|
20
|
+
|
21
|
+
should "be able to retry operations that must succeed" do
|
22
|
+
@worker.instance_variable_set :@retry_wait, 0.01
|
23
|
+
@worker.expects(:log).at_least(3)
|
24
|
+
tries = 0
|
25
|
+
@worker.keep_trying_to("do something critical") do
|
26
|
+
tries += 1;
|
27
|
+
raise 'hell' unless tries > 3
|
28
|
+
assert "made it through"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
should "be able to run an action and try to complete it" do
|
33
|
+
GraphicsMagick.any_instance.expects(:process).returns('the answer')
|
34
|
+
GraphicsMagick.any_instance.expects(:cleanup_work_directory)
|
35
|
+
@worker.expects(:complete_work_unit).with({'output' => 'the answer'}.to_json)
|
36
|
+
@worker.run_work_unit
|
37
|
+
end
|
38
|
+
|
39
|
+
should "enchance the options that an action receives with extra info" do
|
40
|
+
opts = @worker.enhanced_unit_options
|
41
|
+
assert opts['work_unit_id'] == @unit.id
|
42
|
+
assert opts['job_id'] == @unit.job.id
|
43
|
+
assert opts['attempts'] == @unit.attempts
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: documentcloud-cloud-crowd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-09-
|
12
|
+
date: 2009-09-18 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -182,8 +182,9 @@ files:
|
|
182
182
|
- public/js/flot.js
|
183
183
|
- public/js/jquery.js
|
184
184
|
- README
|
185
|
-
- test/acceptance/
|
185
|
+
- test/acceptance/test_node.rb
|
186
186
|
- test/acceptance/test_failing_work_units.rb
|
187
|
+
- test/acceptance/test_server.rb
|
187
188
|
- test/acceptance/test_word_count.rb
|
188
189
|
- test/blueprints.rb
|
189
190
|
- test/config/config.ru
|
@@ -193,7 +194,10 @@ files:
|
|
193
194
|
- test/test_helper.rb
|
194
195
|
- test/unit/test_action.rb
|
195
196
|
- test/unit/test_configuration.rb
|
197
|
+
- test/unit/test_node.rb
|
198
|
+
- test/unit/test_node_record.rb
|
196
199
|
- test/unit/test_job.rb
|
200
|
+
- test/unit/test_worker.rb
|
197
201
|
- test/unit/test_work_unit.rb
|
198
202
|
- views/operations_center.erb
|
199
203
|
has_rdoc: true
|