mooktakim-cloud-crowd 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/EPIGRAPHS +17 -0
- data/LICENSE +22 -0
- data/README +93 -0
- data/actions/graphics_magick.rb +43 -0
- data/actions/process_pdfs.rb +92 -0
- data/actions/word_count.rb +16 -0
- data/bin/crowd +5 -0
- data/config/config.example.ru +23 -0
- data/config/config.example.yml +55 -0
- data/config/database.example.yml +16 -0
- data/examples/graphics_magick_example.rb +44 -0
- data/examples/process_pdfs_example.rb +40 -0
- data/examples/word_count_example.rb +42 -0
- data/lib/cloud-crowd.rb +188 -0
- data/lib/cloud_crowd/action.rb +125 -0
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +39 -0
- data/lib/cloud_crowd/asset_store/s3_store.rb +43 -0
- data/lib/cloud_crowd/asset_store.rb +41 -0
- data/lib/cloud_crowd/command_line.rb +242 -0
- data/lib/cloud_crowd/exceptions.rb +46 -0
- data/lib/cloud_crowd/helpers/authorization.rb +52 -0
- data/lib/cloud_crowd/helpers/resources.rb +25 -0
- data/lib/cloud_crowd/helpers.rb +8 -0
- data/lib/cloud_crowd/inflector.rb +19 -0
- data/lib/cloud_crowd/models/job.rb +190 -0
- data/lib/cloud_crowd/models/node_record.rb +107 -0
- data/lib/cloud_crowd/models/work_unit.rb +170 -0
- data/lib/cloud_crowd/models.rb +40 -0
- data/lib/cloud_crowd/node.rb +199 -0
- data/lib/cloud_crowd/schema.rb +50 -0
- data/lib/cloud_crowd/server.rb +123 -0
- data/lib/cloud_crowd/worker.rb +149 -0
- data/mooktakim-cloud-crowd.gemspec +116 -0
- data/public/css/admin_console.css +243 -0
- data/public/css/reset.css +42 -0
- data/public/images/bullet_green.png +0 -0
- data/public/images/bullet_white.png +0 -0
- data/public/images/cloud_hand.png +0 -0
- data/public/images/header_back.png +0 -0
- data/public/images/logo.png +0 -0
- data/public/images/queue_fill.png +0 -0
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/images/server_error.png +0 -0
- data/public/images/sidebar_bottom.png +0 -0
- data/public/images/sidebar_top.png +0 -0
- data/public/images/worker_info.png +0 -0
- data/public/images/worker_info_loading.gif +0 -0
- data/public/js/admin_console.js +197 -0
- data/public/js/excanvas.js +1 -0
- data/public/js/flot.js +1 -0
- data/public/js/jquery.js +19 -0
- data/test/acceptance/test_failing_work_units.rb +33 -0
- data/test/acceptance/test_node.rb +20 -0
- data/test/acceptance/test_server.rb +66 -0
- data/test/acceptance/test_word_count.rb +40 -0
- data/test/blueprints.rb +25 -0
- data/test/config/actions/failure_testing.rb +13 -0
- data/test/config/config.ru +17 -0
- data/test/config/config.yml +6 -0
- data/test/config/database.yml +3 -0
- data/test/test_helper.rb +19 -0
- data/test/unit/test_action.rb +70 -0
- data/test/unit/test_configuration.rb +48 -0
- data/test/unit/test_job.rb +103 -0
- data/test/unit/test_node.rb +41 -0
- data/test/unit/test_node_record.rb +42 -0
- data/test/unit/test_work_unit.rb +53 -0
- data/test/unit/test_worker.rb +48 -0
- data/views/operations_center.erb +82 -0
- metadata +290 -0
@@ -0,0 +1,199 @@
|
|
1
|
+
module CloudCrowd

  # A Node is a Sinatra/Thin application that runs a single instance per-machine.
  # It registers with the central server, receives WorkUnits, and forks off
  # Workers to process them. The actions are:
  #
  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
  # [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
  class Node < Sinatra::Default

    # A Node's default port. You only run a single node per machine, so they
    # can all use the same port without any problems.
    DEFAULT_PORT = 9063

    # A list of regex scrapers, which let us extract the one-minute load
    # average and the amount of free memory on different flavors of UNIX.

    SCRAPE_UPTIME = /\d+\.\d+/
    SCRAPE_LINUX_MEMORY = /MemFree:\s+(\d+) kB/
    SCRAPE_MAC_MEMORY = /Pages free:\s+(\d+)./
    SCRAPE_MAC_PAGE = /page size of (\d+) bytes/

    # The interval, in seconds, at which the node monitors the machine's load
    # and memory use (if configured to do so in config.yml).
    MONITOR_INTERVAL = 3

    # The interval at which the node regularly checks in with central (5 min).
    CHECK_IN_INTERVAL = 300

    # The response sent back when this node is overloaded.
    OVERLOADED_MESSAGE = 'Node Overloaded'

    attr_reader :enabled_actions, :host, :port, :central

    set :root, ROOT
    set :authorization_realm, "CloudCrowd"

    helpers Helpers

    # methodoverride allows the _method param.
    enable :methodoverride

    # Enabling HTTP Authentication turns it on for all requests.
    # This works the same way as in the central CloudCrowd::Server.
    before do
      login_required if CloudCrowd.config[:http_authentication]
    end

    # To monitor a Node with Monit, God, Nagios, or another tool, you can hit
    # /heartbeat to make sure it's still online.
    get '/heartbeat' do
      "buh-bump"
    end

    # Posts a WorkUnit to this Node. Forks a Worker and returns the process id.
    # Returns a 503 if this Node is overloaded.
    post '/work' do
      throw :halt, [503, OVERLOADED_MESSAGE] if @overloaded
      unit = JSON.parse(params[:work_unit])
      pid = fork { Worker.new(self, unit).run }
      # Detach so that the finished Worker is reaped and doesn't linger as a zombie.
      Process.detach(pid)
      json :pid => pid
    end

    # When creating a node, specify the port it should run on (falls back to
    # DEFAULT_PORT) and whether it should daemonize. Unless running under
    # test, the node is started immediately -- so this call blocks until the
    # server thread finishes.
    def initialize(port=nil, daemon=false)
      require 'json'
      CloudCrowd.identity = :node
      @central = CloudCrowd.central_server
      @host = Socket.gethostname
      @enabled_actions = CloudCrowd.actions.keys
      @port = port || DEFAULT_PORT
      @daemon = daemon
      @overloaded = false
      @max_load = CloudCrowd.config[:max_load]
      @min_memory = CloudCrowd.config[:min_free_memory]
      start unless test?
    end

    # Starting up a Node registers with the central server and begins to listen
    # for incoming WorkUnits.
    def start
      # mkdir_p is a no-op when the directory already exists, so no existence
      # check is needed (File.exists? is gone from current Ruby versions).
      FileUtils.mkdir_p(CloudCrowd.log_path) if @daemon
      @server = Thin::Server.new('0.0.0.0', @port, self, :signals => false)
      @server.tag = 'cloud-crowd-node'
      @server.pid_file = CloudCrowd.pid_path('node.pid')
      @server.log_file = CloudCrowd.log_path('node.log')
      @server.daemonize if @daemon
      trap_signals
      asset_store
      @server_thread = Thread.new { @server.start }
      check_in(true)
      check_in_periodically
      monitor_system if @max_load || @min_memory
      @server_thread.join
    end

    # Checking in with the central server informs it of the location and
    # configuration of this Node. If it can't check-in, there's no point in
    # starting: pass +critical+ as true to exit the process on failure.
    # Non-critical failures are only reported on standard output.
    def check_in(critical=false)
      @central["/node/#{@host}"].put(
        :port => @port,
        :busy => @overloaded,
        :max_workers => CloudCrowd.config[:max_workers],
        :enabled_actions => @enabled_actions.join(',')
      )
    rescue RestClient::Exception, Errno::ECONNREFUSED
      puts "Failed to connect to the central server (#{@central.to_s})."
      raise SystemExit if critical
    end

    # Before exiting, the Node checks out with the central server, releasing all
    # of its WorkUnits for other Nodes to handle.
    def check_out
      @central["/node/#{@host}"].delete
    end

    # Lazy-initialize the asset_store, preferably after the Node has launched.
    def asset_store
      @asset_store ||= AssetStore.new
    end

    # Is the node overloaded? If configured, checks if the load average is
    # greater than 'max_load', or if the available RAM is less than
    # 'min_free_memory'.
    def overloaded?
      (@max_load && load_average > @max_load) ||
      (@min_memory && free_memory < @min_memory)
    end

    # The current one-minute load average, taken as the first decimal number
    # in the output of +uptime+ (0.0 if none is found).
    def load_average
      `uptime`.match(SCRAPE_UPTIME).to_s.to_f
    end

    # The current amount of free memory in megabytes. Raises NotImplementedError
    # on platforms other than Mac OS X and Linux.
    def free_memory
      case RUBY_PLATFORM
      when /darwin/
        stats = `vm_stat`
        # The page size (converted from bytes to megabytes) is memoized.
        @mac_page_size ||= stats.match(SCRAPE_MAC_PAGE)[1].to_f / 1048576.0
        stats.match(SCRAPE_MAC_MEMORY)[1].to_f * @mac_page_size
      when /linux/
        # Read the file directly instead of spawning `cat` on every check.
        File.read('/proc/meminfo').match(SCRAPE_LINUX_MEMORY)[1].to_f / 1024.0
      else
        raise NotImplementedError, "'min_free_memory' is not yet implemented on your platform"
      end
    end


    private

    # Launch a monitoring thread that periodically checks the node's load
    # average and the amount of free memory remaining. If we transition out of
    # the overloaded state, let central know.
    def monitor_system
      @monitor_thread = Thread.new do
        loop do
          was_overloaded = @overloaded
          @overloaded = overloaded?
          check_in if was_overloaded && !@overloaded
          sleep MONITOR_INTERVAL
        end
      end
    end

    # If communication is interrupted for external reasons, the central server
    # will assume that the node has gone down. Checking in will let central know
    # it's still online.
    def check_in_periodically
      @check_in_thread = Thread.new do
        loop do
          sleep CHECK_IN_INTERVAL
          check_in
        end
      end
    end

    # Trap exit signals in order to shut down cleanly. KILL is deliberately
    # absent: it cannot be caught by a process, and asking Ruby to trap it
    # raises an error instead of installing a handler.
    def trap_signals
      Signal.trap('QUIT') { shut_down }
      Signal.trap('INT') { shut_down }
      Signal.trap('TERM') { shut_down }
    end

    # At shut down, de-register with the central server before exiting.
    # De-registration is best-effort: if central can't be reached we still
    # stop the server thread and exit, rather than aborting half-way through.
    def shut_down
      @check_in_thread.kill if @check_in_thread
      @monitor_thread.kill if @monitor_thread
      begin
        check_out
      rescue RestClient::Exception, Errno::ECONNREFUSED
        puts "Failed to check out with the central server (#{@central.to_s})."
      end
      @server_thread.kill if @server_thread
      Process.exit
    end

  end

end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# Complete schema for CloudCrowd. Defines the three tables used by the central
# server: "jobs", "node_records" and "work_units".
#
# NOTE: every table is created with :force => true, so loading this schema
# drops and re-creates any table of the same name that already exists.
ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do

  # One row per submitted job.
  create_table "jobs", :force => true do |t|
    t.integer "status", :null => false
    t.text "inputs", :null => false
    t.string "action", :null => false
    t.text "options", :null => false
    t.text "outputs"
    t.float "time"
    t.string "callback_url"
    t.string "email"
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  # One row per registered node.
  create_table "node_records", :force => true do |t|
    t.string "host", :null => false
    t.string "ip_address", :null => false
    t.integer "port", :null => false
    t.string "enabled_actions", :default => '', :null => false
    t.boolean "busy", :default => false, :null => false
    t.integer "max_workers"
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  # One row per unit of work; each row points at its job via "job_id" and,
  # optionally, at a node via "node_record_id".
  create_table "work_units", :force => true do |t|
    t.integer "status", :null => false
    t.integer "job_id", :null => false
    t.text "input", :null => false
    t.string "action", :null => false
    t.integer "attempts", :default => 0, :null => false
    t.integer "node_record_id"
    t.integer "worker_pid"
    t.integer "reservation"
    t.float "time"
    t.text "output"
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  # Here be indices. After looking, it seems faster not to have them at all.
  #
  # add_index "jobs", ["status"], :name => "index_jobs_on_status"
  # add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
  # add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
  # add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
  # add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
module CloudCrowd

  # The main CloudCrowd (Sinatra) application. The actions are:
  #
  # == Admin
  # [get /] Render the admin console, with a progress meter for running jobs.
  # [get /status] Get the combined JSON of every active job and worker.
  # [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
  # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
  #
  # == Public API
  # [post /jobs] Begin a new Job. Post with a JSON representation of the job-to-be. (see examples).
  # [get /jobs/:job_id] Check the status of a Job. Response includes output, if the Job has finished.
  # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
  #
  # == Internal Workers API
  # [put /node/:host] Registers a new Node, making it available for processing.
  # [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
  # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
  class Server < Sinatra::Default

    set :root, ROOT
    set :authorization_realm, "CloudCrowd"

    # NOTE(review): +json+, +login_required+, +current_job+ and
    # +current_work_unit+ are not defined in this class -- presumably they
    # come from Helpers; confirm there.
    helpers Helpers

    # static serves files from /public, methodoverride allows the _method param.
    enable :static, :methodoverride

    # Enabling HTTP Authentication turns it on for all requests.
    before do
      login_required if CloudCrowd.config[:http_authentication]
    end

    # Render the admin console.
    get '/' do
      erb :operations_center
    end

    # Get the JSON for every active job in the queue and every active worker
    # in the system. This action may get a little worrisome as the system grows
    # larger -- keep it in mind.
    get '/status' do
      json(
        'jobs' => Job.incomplete,
        'nodes' => NodeRecord.all(:order => 'host desc'),
        'work_unit_count' => WorkUnit.incomplete.count
      )
    end

    # Get the JSON for what a worker is up to. Responds with an empty JSON
    # object when no WorkUnit is found for the given worker name.
    get '/worker/:name' do
      json WorkUnit.find_by_worker_name(params[:name]) || {}
    end

    # To monitor the central server with Monit, God, Nagios, or another
    # monitoring tool, you can hit /heartbeat to make sure it's still online.
    get '/heartbeat' do
      "buh-bump"
    end

    # PUBLIC API:

    # Start a new job. Accepts a JSON representation of the job-to-be.
    # Distributes all work units to available nodes.
    post '/jobs' do
      job = Job.create_from_request(JSON.parse(params[:job]))
      WorkUnit.distribute_to_nodes
      json job
    end

    # Check the status of a job, returning the output if finished, and the
    # number of work units remaining otherwise.
    get '/jobs/:job_id' do
      json current_job
    end

    # Cleans up a Job's saved S3 files. Delete a Job after you're done
    # downloading the results.
    delete '/jobs/:job_id' do
      current_job.destroy
      json nil
    end

    # INTERNAL NODE API:

    # A new Node will hit this action to register its location and
    # configuration with the central server. Triggers distribution of WorkUnits.
    put '/node/:host' do
      NodeRecord.check_in(params, request)
      WorkUnit.distribute_to_nodes
      json nil
    end

    # Deregisters a Node from the central server. Releases and redistributes any
    # WorkUnits it may have had checked out.
    delete '/node/:host' do
      NodeRecord.destroy_all(:host => params[:host])
      json nil
    end

    # When workers are done with their unit, either successfully or in failure,
    # they mark it back on the central server and exit. Triggers distribution
    # of pending work units. A request whose status is neither 'succeeded' nor
    # 'failed' is answered with a 500 error.
    put '/work/:work_unit_id' do
      case params[:status]
      when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
      when 'failed' then current_work_unit.fail(params[:output], params[:time])
      else error(500, "Completing a work unit must specify status.")
      end
      WorkUnit.distribute_to_nodes
      json nil
    end

    # At initialization record the identity of this Ruby instance as a server.
    def initialize(*args)
      super(*args)
      CloudCrowd.identity = :server
    end

  end

end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
require 'timeout'

module CloudCrowd

  # The Worker, forked off from the Node when a new WorkUnit is received,
  # launches an Action for processing. Workers will only ever receive WorkUnits
  # that they are able to handle (for which they have a corresponding action in
  # their actions directory). If communication with the central server is
  # interrupted, the Worker will repeatedly attempt to complete its unit --
  # every Worker::RETRY_WAIT seconds. Any exceptions that take place during
  # the course of the Action will cause the Worker to mark the WorkUnit as
  # having failed. When finished, the Worker's process exits, minimizing the
  # potential for memory leaks.
  class Worker

    # Wait five seconds to retry, after internal communication errors.
    RETRY_WAIT = 5

    attr_reader :pid, :node, :unit, :status

    # A new Worker customizes itself to its WorkUnit at instantiation.
    # +node+ is the Node that forked this worker; +unit+ is the parsed
    # WorkUnit hash (string keys such as 'id', 'status', 'action', 'options').
    def initialize(node, unit)
      @start_time = Time.now
      @pid = $$
      @node = node
      @unit = unit
      @status = @unit['status']
      @retry_wait = RETRY_WAIT
    end

    # Return output to the central server, marking the WorkUnit done.
    def complete_work_unit(result)
      keep_trying_to "complete work unit" do
        data = base_params.merge({:status => 'succeeded', :output => result})
        @node.central["/work/#{data[:id]}"].put(data)
        log "finished #{display_work_unit} in #{data[:time]} seconds"
      end
    end

    # Mark the WorkUnit failed, returning the exception to central.
    def fail_work_unit(exception)
      keep_trying_to "mark work unit as failed" do
        data = base_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
        @node.central["/work/#{data[:id]}"].put(data)
        # Join the backtrace so it's logged one frame per line, rather than as
        # an inspected Array. An exception that was never raised has none.
        log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{Array(exception.backtrace).join("\n")}"
      end
    end

    # We expect and require internal communication between the central server
    # and the workers to succeed. If it fails for any reason, log it, and then
    # keep trying the same request. A 404 from central means the work unit no
    # longer exists, so it is discarded instead of retried.
    #
    # Only ordinary errors (and timeouts) are retried. Interrupt, SystemExit
    # and other non-StandardError exceptions propagate to the caller, so the
    # retry loop can never swallow a request to stop the process.
    def keep_trying_to(title)
      begin
        yield
      rescue RestClient::ResourceNotFound
        log "work unit ##{@unit['id']} doesn't exist. discarding..."
      rescue StandardError, Timeout::Error => e
        log "failed to #{title} -- retry in #{@retry_wait} seconds"
        log e.message
        log Array(e.backtrace).join("\n")
        sleep @retry_wait
        retry
      end
    end

    # Loggable details describing what the Worker is up to.
    def display_work_unit
      "unit ##{@unit['id']} (#{@unit['action']}/#{CloudCrowd.display_status(@status)})"
    end

    # Executes the WorkUnit by running the Action, catching all exceptions as
    # failures, so that central always hears back about the unit. The work
    # directory is cleaned up before the result (or failure) is reported.
    def run_work_unit
      begin
        result = nil
        action_class = CloudCrowd.actions[@unit['action']]
        action = action_class.new(@status, @unit['input'], enhanced_unit_options, @node.asset_store)
        Dir.chdir(action.work_directory) do
          result = case @status
          when PROCESSING then action.process
          when SPLITTING then action.split
          when MERGING then action.merge
          else raise Error::StatusUnspecified, "work units must specify their status"
          end
        end
        action.cleanup_work_directory if action
        complete_work_unit({'output' => result}.to_json)
      rescue Exception => e
        # Deliberately broad: any failure inside an Action, of whatever class,
        # must be reported back as a failed work unit.
        action.cleanup_work_directory if action
        fail_work_unit(e)
      end
    end

    # Run this worker inside of a fork. Attempts to exit cleanly.
    # Wraps run_work_unit to benchmark the execution time, if requested.
    def run
      trap_signals
      log "starting #{display_work_unit}"
      if @unit['options']['benchmark']
        log("ran #{display_work_unit} in " + Benchmark.measure { run_work_unit }.to_s)
      else
        run_work_unit
      end
      # exit! skips at_exit handlers inherited from the parent Node process.
      Process.exit!
    end

    # There are some potentially important attributes of the WorkUnit that we'd
    # like to pass into the Action -- in case it needs to know them. They will
    # always be made available in the options hash.
    def enhanced_unit_options
      @unit['options'].merge({
        'job_id' => @unit['job_id'],
        'work_unit_id' => @unit['id'],
        'attempts' => @unit['attempts']
      })
    end

    # How long has this worker been running for? Returns seconds elapsed since
    # the Worker was instantiated.
    def time_taken
      Time.now - @start_time
    end


    private

    # Common parameters to send back to central upon unit completion,
    # regardless of success or failure.
    def base_params
      { :pid => @pid,
        :id => @unit['id'],
        :time => time_taken }
    end

    # Log a message to the daemon log. Includes PID for identification.
    # Silent when RACK_ENV is 'test'.
    def log(message)
      puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
    end

    # When signaled to exit, make sure that the Worker shuts down without firing
    # the Node's at_exit callbacks. KILL is deliberately absent: it cannot be
    # caught by a process, and asking Ruby to trap it raises an error instead
    # of installing a handler.
    def trap_signals
      Signal.trap('QUIT') { Process.exit! }
      Signal.trap('INT') { Process.exit! }
      Signal.trap('TERM') { Process.exit! }
    end

  end

end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
# Gem specification for mooktakim-cloud-crowd.
Gem::Specification.new do |s|
  s.name = 'mooktakim-cloud-crowd'
  s.version = '0.3.4' # Keep version in sync with cloud-crowd.rb
  s.date = '2010-02-26'

  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
  s.summary = "Parallel Processing for the Rest of Us"
  s.description = <<-EOS
The crowd, suddenly there where there was nothing before, is a mysterious and
universal phenomenon. A few people may have been standing together -- five, ten
or twelve, nor more; nothing has been announced, nothing is expected. Suddenly
everywhere is black with people and more come streaming from all sides as though
streets had only one direction.
  EOS

  s.authors = ['Jeremy Ashkenas', 'Mooktakim Ahmed']
  s.email = 'jeremy@documentcloud.org'
  s.rubyforge_project = 'cloud-crowd'

  s.require_paths = ['lib']
  s.executables = ['crowd']

  s.has_rdoc = true
  s.extra_rdoc_files = ['README']
  s.rdoc_options << '--title' << 'CloudCrowd | Parallel Processing for the Rest of Us' <<
                    '--exclude' << 'test' <<
                    '--main' << 'README' <<
                    '--all'

  # Runtime dependencies.
  s.add_dependency 'sinatra', ['>= 0.9.4']
  s.add_dependency 'activerecord', ['>= 2.3.3']
  s.add_dependency 'json', ['>= 1.1.7']
  s.add_dependency 'rest-client', ['>= 1.0.3']
  s.add_dependency 'right_aws', ['>= 1.10.0']
  s.add_dependency 'thin', ['>= 1.2.4']

  # Development dependencies are only declared when the running RubyGems
  # supports add_development_dependency.
  if s.respond_to?(:add_development_dependency)
    s.add_development_dependency 'faker', ['>= 0.3.1']
    s.add_development_dependency 'thoughtbot-shoulda', ['>= 2.10.2']
    s.add_development_dependency 'notahat-machinist', ['>= 1.0.3']
    s.add_development_dependency 'rack-test', ['>= 0.4.1']
    s.add_development_dependency 'mocha', ['>= 0.9.7']
  end

  # Explicit manifest of every file shipped in the gem.
  s.files = %w(
    actions/graphics_magick.rb
    actions/process_pdfs.rb
    actions/word_count.rb
    mooktakim-cloud-crowd.gemspec
    config/config.example.ru
    config/config.example.yml
    config/database.example.yml
    EPIGRAPHS
    examples/graphics_magick_example.rb
    examples/process_pdfs_example.rb
    examples/word_count_example.rb
    lib/cloud-crowd.rb
    lib/cloud_crowd/action.rb
    lib/cloud_crowd/asset_store/filesystem_store.rb
    lib/cloud_crowd/asset_store/s3_store.rb
    lib/cloud_crowd/asset_store.rb
    lib/cloud_crowd/command_line.rb
    lib/cloud_crowd/exceptions.rb
    lib/cloud_crowd/helpers/authorization.rb
    lib/cloud_crowd/helpers/resources.rb
    lib/cloud_crowd/helpers.rb
    lib/cloud_crowd/inflector.rb
    lib/cloud_crowd/models/job.rb
    lib/cloud_crowd/models/node_record.rb
    lib/cloud_crowd/models/work_unit.rb
    lib/cloud_crowd/models.rb
    lib/cloud_crowd/node.rb
    lib/cloud_crowd/schema.rb
    lib/cloud_crowd/server.rb
    lib/cloud_crowd/worker.rb
    LICENSE
    public/css/admin_console.css
    public/css/reset.css
    public/images/bullet_green.png
    public/images/bullet_white.png
    public/images/cloud_hand.png
    public/images/header_back.png
    public/images/logo.png
    public/images/queue_fill.png
    public/images/server.png
    public/images/server_busy.png
    public/images/server_error.png
    public/images/sidebar_bottom.png
    public/images/sidebar_top.png
    public/images/worker_info.png
    public/images/worker_info_loading.gif
    public/js/admin_console.js
    public/js/excanvas.js
    public/js/flot.js
    public/js/jquery.js
    README
    test/acceptance/test_node.rb
    test/acceptance/test_failing_work_units.rb
    test/acceptance/test_server.rb
    test/acceptance/test_word_count.rb
    test/blueprints.rb
    test/config/config.ru
    test/config/config.yml
    test/config/database.yml
    test/config/actions/failure_testing.rb
    test/test_helper.rb
    test/unit/test_action.rb
    test/unit/test_configuration.rb
    test/unit/test_node.rb
    test/unit/test_node_record.rb
    test/unit/test_job.rb
    test/unit/test_worker.rb
    test/unit/test_work_unit.rb
    views/operations_center.erb
  )
end
|