mooktakim-cloud-crowd 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/EPIGRAPHS +17 -0
- data/LICENSE +22 -0
- data/README +93 -0
- data/actions/graphics_magick.rb +43 -0
- data/actions/process_pdfs.rb +92 -0
- data/actions/word_count.rb +16 -0
- data/bin/crowd +5 -0
- data/config/config.example.ru +23 -0
- data/config/config.example.yml +55 -0
- data/config/database.example.yml +16 -0
- data/examples/graphics_magick_example.rb +44 -0
- data/examples/process_pdfs_example.rb +40 -0
- data/examples/word_count_example.rb +42 -0
- data/lib/cloud-crowd.rb +188 -0
- data/lib/cloud_crowd/action.rb +125 -0
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +39 -0
- data/lib/cloud_crowd/asset_store/s3_store.rb +43 -0
- data/lib/cloud_crowd/asset_store.rb +41 -0
- data/lib/cloud_crowd/command_line.rb +242 -0
- data/lib/cloud_crowd/exceptions.rb +46 -0
- data/lib/cloud_crowd/helpers/authorization.rb +52 -0
- data/lib/cloud_crowd/helpers/resources.rb +25 -0
- data/lib/cloud_crowd/helpers.rb +8 -0
- data/lib/cloud_crowd/inflector.rb +19 -0
- data/lib/cloud_crowd/models/job.rb +190 -0
- data/lib/cloud_crowd/models/node_record.rb +107 -0
- data/lib/cloud_crowd/models/work_unit.rb +170 -0
- data/lib/cloud_crowd/models.rb +40 -0
- data/lib/cloud_crowd/node.rb +199 -0
- data/lib/cloud_crowd/schema.rb +50 -0
- data/lib/cloud_crowd/server.rb +123 -0
- data/lib/cloud_crowd/worker.rb +149 -0
- data/mooktakim-cloud-crowd.gemspec +116 -0
- data/public/css/admin_console.css +243 -0
- data/public/css/reset.css +42 -0
- data/public/images/bullet_green.png +0 -0
- data/public/images/bullet_white.png +0 -0
- data/public/images/cloud_hand.png +0 -0
- data/public/images/header_back.png +0 -0
- data/public/images/logo.png +0 -0
- data/public/images/queue_fill.png +0 -0
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/images/server_error.png +0 -0
- data/public/images/sidebar_bottom.png +0 -0
- data/public/images/sidebar_top.png +0 -0
- data/public/images/worker_info.png +0 -0
- data/public/images/worker_info_loading.gif +0 -0
- data/public/js/admin_console.js +197 -0
- data/public/js/excanvas.js +1 -0
- data/public/js/flot.js +1 -0
- data/public/js/jquery.js +19 -0
- data/test/acceptance/test_failing_work_units.rb +33 -0
- data/test/acceptance/test_node.rb +20 -0
- data/test/acceptance/test_server.rb +66 -0
- data/test/acceptance/test_word_count.rb +40 -0
- data/test/blueprints.rb +25 -0
- data/test/config/actions/failure_testing.rb +13 -0
- data/test/config/config.ru +17 -0
- data/test/config/config.yml +6 -0
- data/test/config/database.yml +3 -0
- data/test/test_helper.rb +19 -0
- data/test/unit/test_action.rb +70 -0
- data/test/unit/test_configuration.rb +48 -0
- data/test/unit/test_job.rb +103 -0
- data/test/unit/test_node.rb +41 -0
- data/test/unit/test_node_record.rb +42 -0
- data/test/unit/test_work_unit.rb +53 -0
- data/test/unit/test_worker.rb +48 -0
- data/views/operations_center.erb +82 -0
- metadata +290 -0
@@ -0,0 +1,199 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
|
3
|
+
# A Node is a Sinatra/Thin application that runs a single instance per-machine
|
4
|
+
# It registers with the central server, receives WorkUnits, and forks off
|
5
|
+
# Workers to process them. The actions are:
|
6
|
+
#
|
7
|
+
# [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
|
8
|
+
# [post /work] The central server hits <tt>/work</tt> to dispatch a WorkUnit to this Node.
|
9
|
+
class Node < Sinatra::Default
|
10
|
+
|
11
|
+
# A Node's default port. You only run a single node per machine, so they
# can all use the same port without any problems.
DEFAULT_PORT = 9063

# A list of regex scrapers, which let us extract the one-minute load
# average and the amount of free memory on different flavors of UNIX.

# Applied to the output of `uptime`; matches the first decimal number in it.
SCRAPE_UPTIME = /\d+\.\d+/
# Applied to the contents of /proc/meminfo; captures the MemFree figure (kB).
SCRAPE_LINUX_MEMORY = /MemFree:\s+(\d+) kB/
# Applied to the output of `vm_stat`; captures the "Pages free" count.
SCRAPE_MAC_MEMORY = /Pages free:\s+(\d+)./
# Applied to the output of `vm_stat`; captures the page size in bytes.
SCRAPE_MAC_PAGE = /page size of (\d+) bytes/

# The interval at which the node monitors the machine's load and memory use
# (if configured to do so in config.yml). Passed to `sleep`, so in seconds.
MONITOR_INTERVAL = 3

# The interval at which the node regularly checks in with central (5 min).
CHECK_IN_INTERVAL = 300

# The response sent back when this node is overloaded.
OVERLOADED_MESSAGE = 'Node Overloaded'

# Read-only access to the state assigned in #initialize.
attr_reader :enabled_actions, :host, :port, :central

set :root, ROOT
set :authorization_realm, "CloudCrowd"

helpers Helpers

# methodoverride allows the _method param.
enable :methodoverride

# Enabling HTTP Authentication turns it on for all requests.
# This works the same way as in the central CloudCrowd::Server.
before do
  login_required if CloudCrowd.config[:http_authentication]
end
|
48
|
+
|
49
|
+
# Liveness probe: Monit, God, Nagios, or any other monitoring tool can hit
# /heartbeat to make sure the Node is still online.
get('/heartbeat') { "buh-bump" }

# Receives a WorkUnit (JSON in the +work_unit+ param) from the central
# server, forks a Worker to process it and responds with the child's
# process id. Halts with a 503 while this Node is flagged as overloaded.
post '/work' do
  if @overloaded
    throw :halt, [503, OVERLOADED_MESSAGE]
  end
  work_unit = JSON.parse(params[:work_unit])
  worker_pid = fork do
    Worker.new(self, work_unit).run
  end
  # Detach so that the finished child is reaped without being waited on here.
  Process.detach(worker_pid)
  json(:pid => worker_pid)
end
|
64
|
+
|
65
|
+
# Builds a Node. +port+ is the port to listen on (DEFAULT_PORT when nil) and
# +daemon+ says whether the server should be daemonized. Unless running
# under test, the Node is started right away.
def initialize(port=nil, daemon=false)
  require 'json'
  CloudCrowd.identity = :node
  @central         = CloudCrowd.central_server
  @host            = Socket.gethostname
  @enabled_actions = CloudCrowd.actions.keys
  @port            = port ? port : DEFAULT_PORT
  @daemon          = daemon
  @overloaded      = false
  # Optional thresholds; when both are nil no system monitoring takes place.
  @max_load        = CloudCrowd.config[:max_load]
  @min_memory      = CloudCrowd.config[:min_free_memory]
  return if test?
  start
end
|
79
|
+
|
80
|
+
# Starting up a Node registers with the central server and begins to listen
# for incoming WorkUnits. Blocks until the server thread finishes.
def start
  # File.exist? replaces the deprecated File.exists? alias, which no longer
  # exists in Ruby 3.2 and later.
  FileUtils.mkdir_p(CloudCrowd.log_path) if @daemon && !File.exist?(CloudCrowd.log_path)
  @server          = Thin::Server.new('0.0.0.0', @port, self, :signals => false)
  @server.tag      = 'cloud-crowd-node'
  @server.pid_file = CloudCrowd.pid_path('node.pid')
  @server.log_file = CloudCrowd.log_path('node.log')
  @server.daemonize if @daemon
  # Thin was told not to install its own signal handlers (:signals => false),
  # so the Node installs handlers that shut it down cleanly.
  trap_signals
  asset_store
  @server_thread = Thread.new { @server.start }
  # The first check-in is critical: without central there is no point in running.
  check_in(true)
  check_in_periodically
  monitor_system if @max_load || @min_memory
  @server_thread.join
end
|
97
|
+
|
98
|
+
# Reports this Node's port, busy state, worker limit and enabled actions to
# the central server. When the server cannot be reached a message is
# printed; if +critical+ is true the process then exits, because there is
# no point in starting without a successful check-in.
def check_in(critical=false)
  registration = {
    :port             => @port,
    :busy             => @overloaded,
    :max_workers      => CloudCrowd.config[:max_workers],
    :enabled_actions  => @enabled_actions.join(',')
  }
  begin
    @central["/node/#{@host}"].put(registration)
  rescue RestClient::Exception, Errno::ECONNREFUSED
    puts "Failed to connect to the central server (#{@central.to_s})."
    raise SystemExit if critical
  end
end
|
112
|
+
|
113
|
+
# De-registers this Node by deleting its record on the central server,
# which releases all of its WorkUnits for other Nodes to handle.
def check_out
  node_resource = @central["/node/#{@host}"]
  node_resource.delete
end
|
118
|
+
|
119
|
+
# The Node's AssetStore, created on first use and reused afterwards
# (preferably after the Node has launched).
def asset_store
  return @asset_store if @asset_store
  @asset_store = AssetStore.new
end
|
123
|
+
|
124
|
+
# Is the node overloaded? True when 'max_load' is configured and the load
# average exceeds it, or when 'min_free_memory' is configured and the
# available RAM has dropped below it. Falsy when neither limit is set.
def overloaded?
  return true if @max_load && load_average > @max_load
  @min_memory && free_memory < @min_memory
end
|
131
|
+
|
132
|
+
# The current one-minute load average, read as the first decimal number
# printed by `uptime`. Yields 0.0 when no such number is found.
def load_average
  uptime_report = `uptime`
  first_decimal = uptime_report[SCRAPE_UPTIME]
  first_decimal.to_f
end
|
136
|
+
|
137
|
+
# The current amount of free memory in megabytes. On Mac OS X this is the
# free page count from `vm_stat` multiplied by the page size; on Linux it is
# the MemFree figure of /proc/meminfo. Raises NotImplementedError on any
# other platform.
def free_memory
  if RUBY_PLATFORM =~ /darwin/
    vm_stats = `vm_stat`
    # The page size is cached (in megabytes) after the first lookup.
    @mac_page_size ||= vm_stats.match(SCRAPE_MAC_PAGE)[1].to_f / 1048576.0
    free_pages = vm_stats.match(SCRAPE_MAC_MEMORY)[1].to_f
    free_pages * @mac_page_size
  elsif RUBY_PLATFORM =~ /linux/
    free_kilobytes = `cat /proc/meminfo`.match(SCRAPE_LINUX_MEMORY)[1].to_f
    free_kilobytes / 1024.0
  else
    raise NotImplementedError, "'min_free_memory' is not yet implemented on your platform"
  end
end
|
150
|
+
|
151
|
+
|
152
|
+
private
|
153
|
+
|
154
|
+
# Starts a background thread that re-evaluates the node's load average and
# free memory every MONITOR_INTERVAL seconds, recording the result in
# @overloaded. When the node leaves the overloaded state, central is told
# via a check-in.
def monitor_system
  @monitor_thread = Thread.new do
    loop do
      previously_overloaded, @overloaded = @overloaded, overloaded?
      recovered = previously_overloaded && !@overloaded
      check_in if recovered
      sleep(MONITOR_INTERVAL)
    end
  end
end
|
167
|
+
|
168
|
+
# If communication is interrupted for external reasons, the central server
# will assume that the node has gone down. This starts a background thread
# that checks in every CHECK_IN_INTERVAL seconds so that central knows the
# node is still online.
def check_in_periodically
  heartbeat = lambda do
    loop do
      sleep(CHECK_IN_INTERVAL)
      check_in
    end
  end
  @check_in_thread = Thread.new(&heartbeat)
end
|
179
|
+
|
180
|
+
# Trap exit signals in order to shut down cleanly.
#
# SIGKILL is deliberately not in the list: it cannot be caught, and asking
# Ruby to trap it raises an ArgumentError on current interpreters, which
# would abort start-up.
def trap_signals
  %w[QUIT INT TERM].each do |signal|
    Signal.trap(signal) { shut_down }
  end
end
|
187
|
+
|
188
|
+
# Shuts the Node down: stops the background check-in and monitoring
# threads, de-registers with the central server, stops the server thread
# and finally exits the process.
def shut_down
  [@check_in_thread, @monitor_thread].each do |background_thread|
    background_thread.kill if background_thread
  end
  check_out
  @server_thread.kill if @server_thread
  Process.exit
end
|
196
|
+
|
197
|
+
end
|
198
|
+
|
199
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# Complete schema for CloudCrowd.
#
# NOTE: every table is created with :force => true, which makes ActiveRecord
# drop an existing table of the same name first -- loading this schema
# against a populated database discards its rows.
ActiveRecord::Schema.define(:version => CloudCrowd::SCHEMA_VERSION) do

  # Jobs: status, inputs, action and options are required; outputs, time,
  # callback_url and email may be NULL.
  create_table "jobs", :force => true do |t|
    t.integer "status", :null => false
    t.text "inputs", :null => false
    t.string "action", :null => false
    t.text "options", :null => false
    t.text "outputs"
    t.float "time"
    t.string "callback_url"
    t.string "email"
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  # Node records: one row per registered node, holding its host, address,
  # port, a string of enabled actions (default '') and a busy flag.
  create_table "node_records", :force => true do |t|
    t.string "host", :null => false
    t.string "ip_address", :null => false
    t.integer "port", :null => false
    t.string "enabled_actions", :default => '', :null => false
    t.boolean "busy", :default => false, :null => false
    t.integer "max_workers"
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  # Work units: each row references a job (job_id) and, optionally, the node
  # record and worker pid it is associated with. attempts starts at 0.
  create_table "work_units", :force => true do |t|
    t.integer "status", :null => false
    t.integer "job_id", :null => false
    t.text "input", :null => false
    t.string "action", :null => false
    t.integer "attempts", :default => 0, :null => false
    t.integer "node_record_id"
    t.integer "worker_pid"
    t.integer "reservation"
    t.float "time"
    t.text "output"
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  # Here be indices. After looking, it seems faster not to have them at all.
  #
  # add_index "jobs", ["status"], :name => "index_jobs_on_status"
  # add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
  # add_index "work_units", ["worker_pid"], :name => "index_work_units_on_worker_pid"
  # add_index "work_units", ["worker_pid", "status"], :name => "index_work_units_on_worker_pid_and_status"
  # add_index "work_units", ["worker_pid", "node_record_id"], :name => "index_work_units_on_worker_pid_and_node_record_id"
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
|
3
|
+
# The main CloudCrowd (Sinatra) application. The actions are:
|
4
|
+
#
|
5
|
+
# == Admin
|
6
|
+
# [get /] Render the admin console, with a progress meter for running jobs.
|
7
|
+
# [get /status] Get the combined JSON of every active job and worker.
|
8
|
+
# [get /worker/:name] Look up the details of a WorkUnit that a Worker is busy processing.
|
9
|
+
# [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
|
10
|
+
#
|
11
|
+
# == Public API
|
12
|
+
# [post /jobs] Begin a new Job. Post with a JSON representation of the job-to-be. (see examples).
|
13
|
+
# [get /jobs/:job_id] Check the status of a Job. Response includes output, if the Job has finished.
|
14
|
+
# [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
|
15
|
+
#
|
16
|
+
# == Internal Workers API
|
17
|
+
# [put /node/:host] Registers a new Node, making it available for processing.
|
18
|
+
# [delete /node/:host] Removes a Node from the registry, freeing up any WorkUnits that it had checked out.
|
19
|
+
# [put /work/:work_unit_id] Mark a finished WorkUnit as completed or failed, with results.
|
20
|
+
class Server < Sinatra::Default
|
21
|
+
|
22
|
+
# Sinatra settings for the central server application.
set :root, ROOT
set :authorization_realm, "CloudCrowd"

# Mix the CloudCrowd helper methods into the request context.
helpers Helpers

# static serves files from /public, methodoverride allows the _method param.
enable :static, :methodoverride

# Enabling HTTP Authentication turns it on for all requests.
before do
  login_required if CloudCrowd.config[:http_authentication]
end
|
34
|
+
|
35
|
+
# Render the admin console.
get('/') { erb :operations_center }

# Responds with JSON describing every incomplete job, every node record
# (ordered by host, descending) and the number of incomplete work units.
# This action may get a little worrisome as the system grows larger -- keep
# it in mind.
get '/status' do
  status_report = {
    'jobs'            => Job.incomplete,
    'nodes'           => NodeRecord.all(:order => 'host desc'),
    'work_unit_count' => WorkUnit.incomplete.count
  }
  json(status_report)
end

# Responds with the JSON of the WorkUnit found for the named worker, or an
# empty object when there is none.
get '/worker/:name' do
  work_unit = WorkUnit.find_by_worker_name(params[:name])
  json(work_unit || {})
end

# Liveness probe: Monit, God, Nagios, or another monitoring tool can hit
# /heartbeat to make sure the central server is up.
get('/heartbeat') { "buh-bump" }
|
61
|
+
|
62
|
+
# PUBLIC API:

# Start a new job from the JSON representation posted in the +job+ param,
# then distribute all work units to available nodes. Responds with the job.
post '/jobs' do
  job_request = JSON.parse(params[:job])
  new_job = Job.create_from_request(job_request)
  WorkUnit.distribute_to_nodes
  json(new_job)
end

# Check the status of a job, returning the output if finished, and the
# number of work units remaining otherwise.
get('/jobs/:job_id') { json current_job }

# Destroys a Job, cleaning up its saved files. Delete a Job after you're
# done downloading the results.
delete '/jobs/:job_id' do
  current_job.destroy
  json(nil)
end
|
84
|
+
|
85
|
+
# INTERNAL NODE API:

# A new Node will hit this action to register its location and
# configuration with the central server. Triggers distribution of WorkUnits.
put '/node/:host' do
  NodeRecord.check_in(params, request)
  WorkUnit.distribute_to_nodes
  json(nil)
end

# Deregisters a Node from the central server. Releases and redistributes any
# WorkUnits it may have had checked out.
delete '/node/:host' do
  NodeRecord.destroy_all(:host => params[:host])
  json(nil)
end

# When workers are done with their unit, either successfully or in failure,
# they mark it back on the central server and exit. Triggers distribution
# of pending work units. A request without a recognised status is answered
# with a 500.
put '/work/:work_unit_id' do
  reported_status = params[:status]
  if 'succeeded' == reported_status
    current_work_unit.finish(params[:output], params[:time])
  elsif 'failed' == reported_status
    current_work_unit.fail(params[:output], params[:time])
  else
    error(500, "Completing a work unit must specify status.")
  end
  WorkUnit.distribute_to_nodes
  json(nil)
end
|
114
|
+
|
115
|
+
# Hands all arguments on to the superclass, then records that this Ruby
# instance is running as the central server.
def initialize(*args)
  super
  CloudCrowd.identity = :server
end
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
|
3
|
+
# The Worker, forked off from the Node when a new WorkUnit is received,
|
4
|
+
# launches an Action for processing. Workers will only ever receive WorkUnits
|
5
|
+
# that they are able to handle (for which they have a corresponding action in
|
6
|
+
# their actions directory). If communication with the central server is
|
7
|
+
# interrupted, the Worker will repeatedly attempt to complete its unit --
|
8
|
+
# every Worker::RETRY_WAIT seconds. Any exceptions that take place during
|
9
|
+
# the course of the Action will cause the Worker to mark the WorkUnit as
|
10
|
+
# having failed. When finished, the Worker's process exits, minimizing the
|
11
|
+
# potential for memory leaks.
|
12
|
+
class Worker
|
13
|
+
|
14
|
+
# Wait five seconds to retry, after internal communication errors.
RETRY_WAIT = 5

# Read-only access to the state assigned in #initialize.
attr_reader :pid, :node, :unit, :status
|
18
|
+
|
19
|
+
# Sets a Worker up for one WorkUnit: remembers the owning +node+ and the
# +unit+ hash, takes the status from the unit, and records the start time
# and the current process id.
def initialize(node, unit)
  @start_time  = Time.now
  @node, @unit = node, unit
  @pid         = Process.pid
  @status      = unit['status']
  @retry_wait  = RETRY_WAIT
end
|
28
|
+
|
29
|
+
# Reports +result+ to the central server with a 'succeeded' status, marking
# the WorkUnit done, and logs how long the unit took. The request is
# retried (see keep_trying_to) until it gets through.
def complete_work_unit(result)
  keep_trying_to "complete work unit" do
    payload  = base_params.merge(:status => 'succeeded', :output => result)
    endpoint = "/work/#{payload[:id]}"
    @node.central[endpoint].put(payload)
    log "finished #{display_work_unit} in #{payload[:time]} seconds"
  end
end
|
37
|
+
|
38
|
+
# Reports the WorkUnit as 'failed' to the central server, sending the
# exception's message as JSON output, and logs the message and backtrace.
# The request is retried (see keep_trying_to) until it gets through.
def fail_work_unit(exception)
  keep_trying_to "mark work unit as failed" do
    failure_output = {'output' => exception.message}.to_json
    payload        = base_params.merge(:status => 'failed', :output => failure_output)
    endpoint       = "/work/#{payload[:id]}"
    @node.central[endpoint].put(payload)
    log "failed #{display_work_unit} in #{payload[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
  end
end
|
46
|
+
|
47
|
+
# We expect and require internal communication between the central server
# and the workers to succeed. Runs the given block; if the central server
# answers that the resource does not exist, the unit is discarded. Any other
# failure is logged and the block is run again after @retry_wait seconds,
# for as long as it keeps failing.
def keep_trying_to(title)
  yield
rescue RestClient::ResourceNotFound
  log "work unit ##{@unit['id']} doesn't exist. discarding..."
rescue Exception => failure
  log "failed to #{title} -- retry in #{@retry_wait} seconds"
  log failure.message
  log failure.backtrace
  sleep @retry_wait
  retry
end
|
63
|
+
|
64
|
+
# A short description of the current WorkUnit for log messages: its id,
# its action and its status in displayable form.
def display_work_unit
  readable_status = CloudCrowd.display_status(@status)
  "unit ##{@unit['id']} (#{@unit['action']}/#{readable_status})"
end
|
68
|
+
|
69
|
+
# Executes the WorkUnit: instantiates the unit's Action and, from inside the
# action's work directory, runs the step that matches the unit's status
# (process, split or merge). The result is sent to central as JSON. Any
# exception -- including an unspecified status -- marks the unit as failed
# instead. The work directory is cleaned up in both cases.
def run_work_unit
  action = nil
  begin
    action_class = CloudCrowd.actions[@unit['action']]
    action = action_class.new(@status, @unit['input'], enhanced_unit_options, @node.asset_store)
    # Dir.chdir hands back the value of its block, i.e. the action's result.
    result = Dir.chdir(action.work_directory) do
      case @status
      when PROCESSING
        action.process
      when SPLITTING
        action.split
      when MERGING
        action.merge
      else
        raise Error::StatusUnspecified, "work units must specify their status"
      end
    end
    action.cleanup_work_directory if action
    complete_work_unit({'output' => result}.to_json)
  rescue Exception => failure
    action.cleanup_work_directory if action
    fail_work_unit(failure)
  end
end
|
92
|
+
|
93
|
+
# Run this worker inside of a fork. Wraps run_work_unit to benchmark the
# execution time when the unit's 'benchmark' option is set, and then exits
# the forked process immediately, without running any exit handlers.
#
# The exit status is passed explicitly: a bare Process.exit! defaults to a
# failure status, which made every worker look as if it had crashed even
# after reporting its result to central.
def run
  trap_signals
  log "starting #{display_work_unit}"
  if @unit['options']['benchmark']
    log("ran #{display_work_unit} in " + Benchmark.measure { run_work_unit }.to_s)
  else
    run_work_unit
  end
  Process.exit!(0)
end
|
105
|
+
|
106
|
+
# The unit's options, plus the attributes of the WorkUnit an Action may
# need to know about: 'job_id', 'work_unit_id' and 'attempts'. These are
# always present in the returned hash; the unit's own options hash is not
# modified.
def enhanced_unit_options
  unit_attributes = {}
  unit_attributes['job_id']       = @unit['job_id']
  unit_attributes['work_unit_id'] = @unit['id']
  unit_attributes['attempts']     = @unit['attempts']
  @unit['options'].merge(unit_attributes)
end
|
116
|
+
|
117
|
+
# Seconds elapsed since this worker was created.
def time_taken
  right_now = Time.now
  right_now - @start_time
end
|
121
|
+
|
122
|
+
|
123
|
+
private
|
124
|
+
|
125
|
+
# The parameters sent back to central whenever a unit is finished, whether
# it succeeded or failed: the worker's pid, the unit's id and the time taken.
def base_params
  params = {}
  params[:pid]  = @pid
  params[:id]   = @unit['id']
  params[:time] = time_taken
  params
end
|
132
|
+
|
133
|
+
# Prints +message+, prefixed with this worker's PID for identification.
# Nothing is printed when RACK_ENV is 'test'.
def log(message)
  return if ENV['RACK_ENV'] == 'test'
  puts "Worker ##{@pid}: #{message}"
end
|
137
|
+
|
138
|
+
# When signaled to exit, make sure that the Worker shuts down without firing
# the Node's at_exit callbacks.
#
# SIGKILL is deliberately not in the list: it cannot be caught, and asking
# Ruby to trap it raises an ArgumentError on current interpreters, which
# would make the Worker fail before it starts on its unit.
def trap_signals
  %w[QUIT INT TERM].each do |signal|
    Signal.trap(signal) { Process.exit! }
  end
end
|
146
|
+
|
147
|
+
end
|
148
|
+
|
149
|
+
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
# Gem specification for the mooktakim-cloud-crowd package.
Gem::Specification.new do |s|
  s.name = 'mooktakim-cloud-crowd'
  s.version = '0.3.4' # Keep version in sync with cloud-crowd.rb
  s.date = '2010-02-26'

  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
  s.summary = "Parallel Processing for the Rest of Us"
  s.description = <<-EOS
The crowd, suddenly there where there was nothing before, is a mysterious and
universal phenomenon. A few people may have been standing together -- five, ten
or twelve, nor more; nothing has been announced, nothing is expected. Suddenly
everywhere is black with people and more come streaming from all sides as though
streets had only one direction.
  EOS

  s.authors = ['Jeremy Ashkenas', 'Mooktakim Ahmed']
  s.email = 'jeremy@documentcloud.org'
  s.rubyforge_project = 'cloud-crowd'

  s.require_paths = ['lib']
  s.executables = ['crowd']

  # RDoc: README is the main page, and the tests are excluded.
  s.has_rdoc = true
  s.extra_rdoc_files = ['README']
  s.rdoc_options << '--title' << 'CloudCrowd | Parallel Processing for the Rest of Us' <<
    '--exclude' << 'test' <<
    '--main' << 'README' <<
    '--all'

  # Runtime dependencies.
  s.add_dependency 'sinatra', ['>= 0.9.4']
  s.add_dependency 'activerecord', ['>= 2.3.3']
  s.add_dependency 'json', ['>= 1.1.7']
  s.add_dependency 'rest-client', ['>= 1.0.3']
  s.add_dependency 'right_aws', ['>= 1.10.0']
  s.add_dependency 'thin', ['>= 1.2.4']

  # Development dependencies, declared only when this RubyGems version
  # supports add_development_dependency.
  if s.respond_to?(:add_development_dependency)
    s.add_development_dependency 'faker', ['>= 0.3.1']
    s.add_development_dependency 'thoughtbot-shoulda', ['>= 2.10.2']
    s.add_development_dependency 'notahat-machinist', ['>= 1.0.3']
    s.add_development_dependency 'rack-test', ['>= 0.4.1']
    s.add_development_dependency 'mocha', ['>= 0.9.7']
  end

  # Explicit list of every file shipped in the gem.
  s.files = %w(
actions/graphics_magick.rb
actions/process_pdfs.rb
actions/word_count.rb
mooktakim-cloud-crowd.gemspec
config/config.example.ru
config/config.example.yml
config/database.example.yml
EPIGRAPHS
examples/graphics_magick_example.rb
examples/process_pdfs_example.rb
examples/word_count_example.rb
lib/cloud-crowd.rb
lib/cloud_crowd/action.rb
lib/cloud_crowd/asset_store/filesystem_store.rb
lib/cloud_crowd/asset_store/s3_store.rb
lib/cloud_crowd/asset_store.rb
lib/cloud_crowd/command_line.rb
lib/cloud_crowd/exceptions.rb
lib/cloud_crowd/helpers/authorization.rb
lib/cloud_crowd/helpers/resources.rb
lib/cloud_crowd/helpers.rb
lib/cloud_crowd/inflector.rb
lib/cloud_crowd/models/job.rb
lib/cloud_crowd/models/node_record.rb
lib/cloud_crowd/models/work_unit.rb
lib/cloud_crowd/models.rb
lib/cloud_crowd/node.rb
lib/cloud_crowd/schema.rb
lib/cloud_crowd/server.rb
lib/cloud_crowd/worker.rb
LICENSE
public/css/admin_console.css
public/css/reset.css
public/images/bullet_green.png
public/images/bullet_white.png
public/images/cloud_hand.png
public/images/header_back.png
public/images/logo.png
public/images/queue_fill.png
public/images/server.png
public/images/server_busy.png
public/images/server_error.png
public/images/sidebar_bottom.png
public/images/sidebar_top.png
public/images/worker_info.png
public/images/worker_info_loading.gif
public/js/admin_console.js
public/js/excanvas.js
public/js/flot.js
public/js/jquery.js
README
test/acceptance/test_node.rb
test/acceptance/test_failing_work_units.rb
test/acceptance/test_server.rb
test/acceptance/test_word_count.rb
test/blueprints.rb
test/config/config.ru
test/config/config.yml
test/config/database.yml
test/config/actions/failure_testing.rb
test/test_helper.rb
test/unit/test_action.rb
test/unit/test_configuration.rb
test/unit/test_node.rb
test/unit/test_node_record.rb
test/unit/test_job.rb
test/unit/test_worker.rb
test/unit/test_work_unit.rb
views/operations_center.erb
)
end
|