cloud-crowd 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/cloud-crowd.gemspec +7 -7
- data/config/config.example.ru +2 -2
- data/lib/cloud-crowd.rb +31 -31
- data/lib/cloud_crowd/worker.rb +34 -34
- data/test/config/config.ru +2 -2
- metadata +2 -2
data/cloud-crowd.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'cloud-crowd'
|
3
|
-
s.version = '0.3.0'
|
4
|
-
s.date = '2009-11-
|
3
|
+
s.version = '0.3.1' # Keep version in sync with cloud-cloud.rb
|
4
|
+
s.date = '2009-11-19'
|
5
5
|
|
6
6
|
s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
|
7
7
|
s.summary = "Parallel Processing for the Rest of Us"
|
@@ -12,21 +12,21 @@ Gem::Specification.new do |s|
|
|
12
12
|
everywhere is black with people and more come streaming from all sides as though
|
13
13
|
streets had only one direction.
|
14
14
|
EOS
|
15
|
-
|
15
|
+
|
16
16
|
s.authors = ['Jeremy Ashkenas']
|
17
17
|
s.email = 'jeremy@documentcloud.org'
|
18
18
|
s.rubyforge_project = 'cloud-crowd'
|
19
|
-
|
19
|
+
|
20
20
|
s.require_paths = ['lib']
|
21
21
|
s.executables = ['crowd']
|
22
|
-
|
22
|
+
|
23
23
|
s.has_rdoc = true
|
24
24
|
s.extra_rdoc_files = ['README']
|
25
25
|
s.rdoc_options << '--title' << 'CloudCrowd | Parallel Processing for the Rest of Us' <<
|
26
26
|
'--exclude' << 'test' <<
|
27
27
|
'--main' << 'README' <<
|
28
28
|
'--all'
|
29
|
-
|
29
|
+
|
30
30
|
s.add_dependency 'sinatra', ['>= 0.9.4']
|
31
31
|
s.add_dependency 'activerecord', ['>= 2.3.3']
|
32
32
|
s.add_dependency 'json', ['>= 1.1.7']
|
@@ -41,7 +41,7 @@ Gem::Specification.new do |s|
|
|
41
41
|
s.add_development_dependency 'rack-test', ['>= 0.4.1']
|
42
42
|
s.add_development_dependency 'mocha', ['>= 0.9.7']
|
43
43
|
end
|
44
|
-
|
44
|
+
|
45
45
|
s.files = %w(
|
46
46
|
actions/graphics_magick.rb
|
47
47
|
actions/process_pdfs.rb
|
data/config/config.example.ru
CHANGED
@@ -15,8 +15,8 @@
|
|
15
15
|
require 'rubygems'
|
16
16
|
require 'cloud-crowd'
|
17
17
|
|
18
|
-
CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
|
19
|
-
CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
|
18
|
+
CloudCrowd.configure(::File.dirname(__FILE__) + '/config.yml')
|
19
|
+
CloudCrowd.configure_database(::File.dirname(__FILE__) + '/database.yml')
|
20
20
|
|
21
21
|
map '/' do
|
22
22
|
run CloudCrowd::Server
|
data/lib/cloud-crowd.rb
CHANGED
@@ -29,7 +29,7 @@ require 'socket'
|
|
29
29
|
require 'cloud_crowd/exceptions'
|
30
30
|
|
31
31
|
module CloudCrowd
|
32
|
-
|
32
|
+
|
33
33
|
# Autoload all the CloudCrowd internals.
|
34
34
|
autoload :Action, 'cloud_crowd/action'
|
35
35
|
autoload :AssetStore, 'cloud_crowd/asset_store'
|
@@ -42,53 +42,53 @@ module CloudCrowd
|
|
42
42
|
autoload :Server, 'cloud_crowd/server'
|
43
43
|
autoload :Worker, 'cloud_crowd/worker'
|
44
44
|
autoload :WorkUnit, 'cloud_crowd/models'
|
45
|
-
|
45
|
+
|
46
46
|
# Keep this version in sync with the gemspec.
|
47
|
-
VERSION = '0.3.0'
|
48
|
-
|
47
|
+
VERSION = '0.3.1'
|
48
|
+
|
49
49
|
# Increment the schema version when there's a backwards incompatible change.
|
50
50
|
SCHEMA_VERSION = 3
|
51
|
-
|
51
|
+
|
52
52
|
# Root directory of the CloudCrowd gem.
|
53
53
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
54
|
-
|
54
|
+
|
55
55
|
# Default folder to log daemonized servers and nodes into.
|
56
56
|
LOG_PATH = 'log'
|
57
|
-
|
57
|
+
|
58
58
|
# Default folder to contain the pids of daemonized servers and nodes.
|
59
59
|
PID_PATH = 'tmp/pids'
|
60
|
-
|
60
|
+
|
61
61
|
# A Job is processing if its WorkUnits are in the queue to be handled by nodes.
|
62
62
|
PROCESSING = 1
|
63
|
-
|
63
|
+
|
64
64
|
# A Job has succeeded if all of its WorkUnits have finished successfully.
|
65
65
|
SUCCEEDED = 2
|
66
|
-
|
66
|
+
|
67
67
|
# A Job has failed if even a single one of its WorkUnits has failed (they may
|
68
68
|
# be attempted multiple times on failure, however).
|
69
69
|
FAILED = 3
|
70
|
-
|
70
|
+
|
71
71
|
# A Job is splitting if it's in the process of dividing its inputs up into
|
72
72
|
# multiple WorkUnits.
|
73
73
|
SPLITTING = 4
|
74
|
-
|
74
|
+
|
75
75
|
# A Job is merging if it's busy collecting all of its successful WorkUnits
|
76
76
|
# back together into the final result.
|
77
77
|
MERGING = 5
|
78
|
-
|
78
|
+
|
79
79
|
# A Job is considered to be complete if it succeeded or if it failed.
|
80
80
|
COMPLETE = [SUCCEEDED, FAILED]
|
81
|
-
|
81
|
+
|
82
82
|
# A Job is considered incomplete if it's being processed, split up or merged.
|
83
83
|
INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
|
84
|
-
|
84
|
+
|
85
85
|
# Mapping of statuses to their display strings.
|
86
86
|
DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
|
87
|
-
|
87
|
+
|
88
88
|
class << self
|
89
89
|
attr_reader :config
|
90
90
|
attr_accessor :identity
|
91
|
-
|
91
|
+
|
92
92
|
# Configure CloudCrowd by passing in the path to <tt>config.yml</tt>.
|
93
93
|
def configure(config_path)
|
94
94
|
@config_path = File.expand_path(File.dirname(config_path))
|
@@ -96,7 +96,7 @@ module CloudCrowd
|
|
96
96
|
end
|
97
97
|
|
98
98
|
# Configure the CloudCrowd central database (and connect to it), by passing
|
99
|
-
# in a path to <tt>database.yml</tt>. The file should use the standard
|
99
|
+
# in a path to <tt>database.yml</tt>. The file should use the standard
|
100
100
|
# ActiveRecord connection format.
|
101
101
|
def configure_database(config_path, validate_schema=true)
|
102
102
|
configuration = YAML.load_file(config_path)
|
@@ -108,25 +108,25 @@ module CloudCrowd
|
|
108
108
|
exit
|
109
109
|
end
|
110
110
|
end
|
111
|
-
|
112
|
-
# Get a reference to the central server, including authentication if
|
111
|
+
|
112
|
+
# Get a reference to the central server, including authentication if
|
113
113
|
# configured.
|
114
114
|
def central_server
|
115
115
|
@central_server ||= RestClient::Resource.new(CloudCrowd.config[:central_server], CloudCrowd.client_options)
|
116
116
|
end
|
117
|
-
|
117
|
+
|
118
118
|
# The path that daemonized servers and nodes will log to.
|
119
119
|
def log_path(log_file=nil)
|
120
120
|
@log_path ||= config[:log_path] || LOG_PATH
|
121
121
|
log_file ? File.join(@log_path, log_file) : @log_path
|
122
122
|
end
|
123
|
-
|
123
|
+
|
124
124
|
# The path in which daemonized servers and nodes will store their pids.
|
125
125
|
def pid_path(pid_file=nil)
|
126
126
|
@pid_path ||= config[:pid_path] || PID_PATH
|
127
127
|
pid_file ? File.join(@pid_path, pid_file) : @pid_path
|
128
128
|
end
|
129
|
-
|
129
|
+
|
130
130
|
# The standard RestClient options for the central server talking to nodes,
|
131
131
|
# as well as the other way around. There's a timeout of 5 seconds to open
|
132
132
|
# a connection, and a timeout of 30 to finish reading it.
|
@@ -145,11 +145,11 @@ module CloudCrowd
|
|
145
145
|
def display_status(status)
|
146
146
|
DISPLAY_STATUS_MAP[status] || 'unknown'
|
147
147
|
end
|
148
|
-
|
148
|
+
|
149
149
|
# CloudCrowd::Actions are requested dynamically by name. Access them through
|
150
150
|
# this actions property, which behaves like a hash. At load time, we
|
151
151
|
# load all installed Actions and CloudCrowd's default Actions into it.
|
152
|
-
# If you wish to have certain nodes be specialized to only handle certain
|
152
|
+
# If you wish to have certain nodes be specialized to only handle certain
|
153
153
|
# Actions, then install only those into the actions directory.
|
154
154
|
def actions
|
155
155
|
return @actions if @actions
|
@@ -160,10 +160,10 @@ module CloudCrowd
|
|
160
160
|
memo
|
161
161
|
end
|
162
162
|
rescue NameError => e
|
163
|
-
adjusted_message = "One of your actions failed to load. Please ensure that the name of your action class can be deduced from the name of the file. ex: 'word_count.rb' => 'WordCount'\n#{e.message}"
|
163
|
+
adjusted_message = "One of your actions failed to load. Please ensure that the name of your action class can be deduced from the name of the file. ex: 'word_count.rb' => 'WordCount'\n#{e.message}"
|
164
164
|
raise NameError.new(adjusted_message, e.name)
|
165
165
|
end
|
166
|
-
|
166
|
+
|
167
167
|
# Retrieve the list of every installed Action for this node or server.
|
168
168
|
def action_paths
|
169
169
|
default_actions = Dir["#{ROOT}/actions/*.rb"]
|
@@ -171,18 +171,18 @@ module CloudCrowd
|
|
171
171
|
custom_actions = CloudCrowd.config[:actions_path] ? Dir["#{CloudCrowd.config[:actions_path]}/*.rb"] : []
|
172
172
|
default_actions + installed_actions + custom_actions
|
173
173
|
end
|
174
|
-
|
174
|
+
|
175
175
|
# Is this CloudCrowd instance a server? Useful for avoiding loading unneeded
|
176
176
|
# code from actions.
|
177
177
|
def server?
|
178
178
|
@identity == :server
|
179
179
|
end
|
180
|
-
|
180
|
+
|
181
181
|
# Or is it a node?
|
182
182
|
def node?
|
183
183
|
@identity == :node
|
184
184
|
end
|
185
|
-
|
185
|
+
|
186
186
|
end
|
187
|
-
|
187
|
+
|
188
188
|
end
|
data/lib/cloud_crowd/worker.rb
CHANGED
@@ -1,21 +1,21 @@
|
|
1
1
|
module CloudCrowd
|
2
|
-
|
3
|
-
# The Worker, forked off from the Node when a new WorkUnit is received,
|
2
|
+
|
3
|
+
# The Worker, forked off from the Node when a new WorkUnit is received,
|
4
4
|
# launches an Action for processing. Workers will only ever receive WorkUnits
|
5
|
-
# that they are able to handle (for which they have a corresponding action in
|
6
|
-
# their actions directory). If communication with the central server is
|
7
|
-
# interrupted, the Worker will repeatedly attempt to complete its unit --
|
8
|
-
# every Worker::RETRY_WAIT seconds. Any exceptions that take place during
|
9
|
-
# the course of the Action will cause the Worker to mark the WorkUnit as
|
5
|
+
# that they are able to handle (for which they have a corresponding action in
|
6
|
+
# their actions directory). If communication with the central server is
|
7
|
+
# interrupted, the Worker will repeatedly attempt to complete its unit --
|
8
|
+
# every Worker::RETRY_WAIT seconds. Any exceptions that take place during
|
9
|
+
# the course of the Action will cause the Worker to mark the WorkUnit as
|
10
10
|
# having failed. When finished, the Worker's process exits, minimizing the
|
11
11
|
# potential for memory leaks.
|
12
12
|
class Worker
|
13
|
-
|
13
|
+
|
14
14
|
# Wait five seconds to retry, after internal communcication errors.
|
15
15
|
RETRY_WAIT = 5
|
16
|
-
|
16
|
+
|
17
17
|
attr_reader :pid, :node, :unit, :status
|
18
|
-
|
18
|
+
|
19
19
|
# A new Worker customizes itself to its WorkUnit at instantiation.
|
20
20
|
def initialize(node, unit)
|
21
21
|
@start_time = Time.now
|
@@ -25,7 +25,7 @@ module CloudCrowd
|
|
25
25
|
@status = @unit['status']
|
26
26
|
@retry_wait = RETRY_WAIT
|
27
27
|
end
|
28
|
-
|
28
|
+
|
29
29
|
# Return output to the central server, marking the WorkUnit done.
|
30
30
|
def complete_work_unit(result)
|
31
31
|
keep_trying_to "complete work unit" do
|
@@ -34,7 +34,7 @@ module CloudCrowd
|
|
34
34
|
log "finished #{display_work_unit} in #{data[:time]} seconds"
|
35
35
|
end
|
36
36
|
end
|
37
|
-
|
37
|
+
|
38
38
|
# Mark the WorkUnit failed, returning the exception to central.
|
39
39
|
def fail_work_unit(exception)
|
40
40
|
keep_trying_to "mark work unit as failed" do
|
@@ -43,9 +43,9 @@ module CloudCrowd
|
|
43
43
|
log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
|
44
44
|
end
|
45
45
|
end
|
46
|
-
|
46
|
+
|
47
47
|
# We expect and require internal communication between the central server
|
48
|
-
# and the workers to succeed. If it fails for any reason, log it, and then
|
48
|
+
# and the workers to succeed. If it fails for any reason, log it, and then
|
49
49
|
# keep trying the same request.
|
50
50
|
def keep_trying_to(title)
|
51
51
|
begin
|
@@ -60,13 +60,13 @@ module CloudCrowd
|
|
60
60
|
retry
|
61
61
|
end
|
62
62
|
end
|
63
|
-
|
63
|
+
|
64
64
|
# Loggable details describing what the Worker is up to.
|
65
65
|
def display_work_unit
|
66
66
|
"unit ##{@unit['id']} (#{@unit['action']}/#{CloudCrowd.display_status(@status)})"
|
67
67
|
end
|
68
|
-
|
69
|
-
# Executes the WorkUnit by running the Action, catching all exceptions as
|
68
|
+
|
69
|
+
# Executes the WorkUnit by running the Action, catching all exceptions as
|
70
70
|
# failures. We capture the thread so that we can kill it from the outside,
|
71
71
|
# when exiting.
|
72
72
|
def run_work_unit
|
@@ -82,14 +82,14 @@ module CloudCrowd
|
|
82
82
|
else raise Error::StatusUnspecified, "work units must specify their status"
|
83
83
|
end
|
84
84
|
end
|
85
|
+
action.cleanup_work_directory if action
|
85
86
|
complete_work_unit({'output' => result}.to_json)
|
86
87
|
rescue Exception => e
|
87
|
-
fail_work_unit(e)
|
88
|
-
ensure
|
89
88
|
action.cleanup_work_directory if action
|
89
|
+
fail_work_unit(e)
|
90
90
|
end
|
91
91
|
end
|
92
|
-
|
92
|
+
|
93
93
|
# Run this worker inside of a fork. Attempts to exit cleanly.
|
94
94
|
# Wraps run_work_unit to benchmark the execution time, if requested.
|
95
95
|
def run
|
@@ -102,39 +102,39 @@ module CloudCrowd
|
|
102
102
|
end
|
103
103
|
Process.exit!
|
104
104
|
end
|
105
|
-
|
106
|
-
# There are some potentially important attributes of the WorkUnit that we'd
|
107
|
-
# like to pass into the Action -- in case it needs to know them. They will
|
105
|
+
|
106
|
+
# There are some potentially important attributes of the WorkUnit that we'd
|
107
|
+
# like to pass into the Action -- in case it needs to know them. They will
|
108
108
|
# always be made available in the options hash.
|
109
109
|
def enhanced_unit_options
|
110
110
|
@unit['options'].merge({
|
111
111
|
'job_id' => @unit['job_id'],
|
112
112
|
'work_unit_id' => @unit['id'],
|
113
|
-
'attempts' => @unit['attempts']
|
113
|
+
'attempts' => @unit['attempts']
|
114
114
|
})
|
115
115
|
end
|
116
|
-
|
116
|
+
|
117
117
|
# How long has this worker been running for?
|
118
118
|
def time_taken
|
119
119
|
Time.now - @start_time
|
120
120
|
end
|
121
|
-
|
122
|
-
|
121
|
+
|
122
|
+
|
123
123
|
private
|
124
|
-
|
125
|
-
# Common parameters to send back to central upon unit completion,
|
124
|
+
|
125
|
+
# Common parameters to send back to central upon unit completion,
|
126
126
|
# regardless of success or failure.
|
127
127
|
def base_params
|
128
128
|
{ :pid => @pid,
|
129
|
-
:id => @unit['id'],
|
129
|
+
:id => @unit['id'],
|
130
130
|
:time => time_taken }
|
131
131
|
end
|
132
|
-
|
132
|
+
|
133
133
|
# Log a message to the daemon log. Includes PID for identification.
|
134
134
|
def log(message)
|
135
135
|
puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
|
136
136
|
end
|
137
|
-
|
137
|
+
|
138
138
|
# When signaled to exit, make sure that the Worker shuts down without firing
|
139
139
|
# the Node's at_exit callbacks.
|
140
140
|
def trap_signals
|
@@ -143,7 +143,7 @@ module CloudCrowd
|
|
143
143
|
Signal.trap('KILL') { Process.exit! }
|
144
144
|
Signal.trap('TERM') { Process.exit! }
|
145
145
|
end
|
146
|
-
|
146
|
+
|
147
147
|
end
|
148
|
-
|
148
|
+
|
149
149
|
end
|
data/test/config/config.ru
CHANGED
@@ -9,8 +9,8 @@
|
|
9
9
|
require 'rubygems'
|
10
10
|
require 'cloud-crowd'
|
11
11
|
|
12
|
-
CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
|
13
|
-
CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
|
12
|
+
CloudCrowd.configure(::File.dirname(__FILE__) + '/config.yml')
|
13
|
+
CloudCrowd.configure_database(::File.dirname(__FILE__) + '/database.yml')
|
14
14
|
|
15
15
|
map '/' do
|
16
16
|
run CloudCrowd::Server
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cloud-crowd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-11-
|
12
|
+
date: 2009-11-19 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|