mooktakim-cloud-crowd 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/EPIGRAPHS +17 -0
- data/LICENSE +22 -0
- data/README +93 -0
- data/actions/graphics_magick.rb +43 -0
- data/actions/process_pdfs.rb +92 -0
- data/actions/word_count.rb +16 -0
- data/bin/crowd +5 -0
- data/config/config.example.ru +23 -0
- data/config/config.example.yml +55 -0
- data/config/database.example.yml +16 -0
- data/examples/graphics_magick_example.rb +44 -0
- data/examples/process_pdfs_example.rb +40 -0
- data/examples/word_count_example.rb +42 -0
- data/lib/cloud-crowd.rb +188 -0
- data/lib/cloud_crowd/action.rb +125 -0
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +39 -0
- data/lib/cloud_crowd/asset_store/s3_store.rb +43 -0
- data/lib/cloud_crowd/asset_store.rb +41 -0
- data/lib/cloud_crowd/command_line.rb +242 -0
- data/lib/cloud_crowd/exceptions.rb +46 -0
- data/lib/cloud_crowd/helpers/authorization.rb +52 -0
- data/lib/cloud_crowd/helpers/resources.rb +25 -0
- data/lib/cloud_crowd/helpers.rb +8 -0
- data/lib/cloud_crowd/inflector.rb +19 -0
- data/lib/cloud_crowd/models/job.rb +190 -0
- data/lib/cloud_crowd/models/node_record.rb +107 -0
- data/lib/cloud_crowd/models/work_unit.rb +170 -0
- data/lib/cloud_crowd/models.rb +40 -0
- data/lib/cloud_crowd/node.rb +199 -0
- data/lib/cloud_crowd/schema.rb +50 -0
- data/lib/cloud_crowd/server.rb +123 -0
- data/lib/cloud_crowd/worker.rb +149 -0
- data/mooktakim-cloud-crowd.gemspec +116 -0
- data/public/css/admin_console.css +243 -0
- data/public/css/reset.css +42 -0
- data/public/images/bullet_green.png +0 -0
- data/public/images/bullet_white.png +0 -0
- data/public/images/cloud_hand.png +0 -0
- data/public/images/header_back.png +0 -0
- data/public/images/logo.png +0 -0
- data/public/images/queue_fill.png +0 -0
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/images/server_error.png +0 -0
- data/public/images/sidebar_bottom.png +0 -0
- data/public/images/sidebar_top.png +0 -0
- data/public/images/worker_info.png +0 -0
- data/public/images/worker_info_loading.gif +0 -0
- data/public/js/admin_console.js +197 -0
- data/public/js/excanvas.js +1 -0
- data/public/js/flot.js +1 -0
- data/public/js/jquery.js +19 -0
- data/test/acceptance/test_failing_work_units.rb +33 -0
- data/test/acceptance/test_node.rb +20 -0
- data/test/acceptance/test_server.rb +66 -0
- data/test/acceptance/test_word_count.rb +40 -0
- data/test/blueprints.rb +25 -0
- data/test/config/actions/failure_testing.rb +13 -0
- data/test/config/config.ru +17 -0
- data/test/config/config.yml +6 -0
- data/test/config/database.yml +3 -0
- data/test/test_helper.rb +19 -0
- data/test/unit/test_action.rb +70 -0
- data/test/unit/test_configuration.rb +48 -0
- data/test/unit/test_job.rb +103 -0
- data/test/unit/test_node.rb +41 -0
- data/test/unit/test_node_record.rb +42 -0
- data/test/unit/test_work_unit.rb +53 -0
- data/test/unit/test_worker.rb +48 -0
- data/views/operations_center.erb +82 -0
- metadata +290 -0
module CloudCrowd

  # Base Error class which all custom CloudCrowd exceptions inherit from.
  # Rescuing CloudCrowd::Error (or RuntimeError) will get all custom exceptions.
  # If your cluster is correctly configured, you should never expect to see any
  # of these.
  class Error < RuntimeError

    # Raised when a job is created for an action that doesn't exist.
    class ActionNotFound < Error; end

    # Raised when config.yml specifies a storage back-end that doesn't exist.
    class StorageNotFound < Error; end

    # Raised if the AssetStore can't write to its scratch directory.
    class StorageNotWritable < Error; end

    # Raised when a WorkUnit returns without a valid status code.
    class StatusUnspecified < Error; end

    # Raised when we're trying to run a method that needs configuration
    # not present in config.yml.
    class MissingConfiguration < Error; end

    # Raised when an action shells out, and the external command returns a
    # non-zero exit code. The code is preserved for callers via #exit_code.
    class CommandFailed < Error

      # The exit status of the failed external command.
      attr_reader :exit_code

      def initialize(message, exit_code)
        super(message)
        @exit_code = exit_code
      end

    end

  end

end
module CloudCrowd
  module Helpers

    # Authorization takes after sinatra-authorization... See
    # http://github.com/integrity/sinatra-authorization
    # for the original.
    module Authorization

      # Ensure that the request includes the correct credentials, halting the
      # request with a 401 (missing/bad credentials) or 400 (non-Basic scheme)
      # otherwise. On success, records the username in the Rack env.
      def login_required
        return if authorized?
        basic = auth
        unauthorized! unless basic.provided?
        bad_request! unless basic.basic?
        unauthorized! unless authorize(*basic.credentials)
        request.env['REMOTE_USER'] = basic.username
      end

      # Has the request been authenticated?
      def authorized?
        request.env['REMOTE_USER'] ? true : false
      end

      # A request is authorized if its login and password match those stored
      # in config.yml, or if authentication is disabled. If authentication is
      # turned on, then every request is authenticated, including between
      # the nodes and the central server.
      def authorize(login, password)
        config = CloudCrowd.config
        return true unless config[:http_authentication]
        config[:login] == login && config[:password] == password
      end


      private

      # Provide a Rack Authorization object, memoized per request handler.
      def auth
        @auth ||= Rack::Auth::Basic::Request.new(request.env)
      end

      # Unauthorized requests will prompt the browser to provide credentials.
      def unauthorized!(realm = Server.authorization_realm)
        response['WWW-Authenticate'] = "Basic realm=\"#{realm}\""
        halt 401, 'Authorization Required'
      end

      # Halt with a 400 when the client sent a non-Basic authorization scheme.
      def bad_request!
        halt 400, 'Bad Request'
      end
    end
  end
end
module CloudCrowd
  module Helpers
    module Resources

      # Convenience method for responding with JSON. Sets the content-type,
      # serializes, and allows empty responses (nil becomes a 204 No Content
      # with an empty body).
      def json(obj)
        content_type :json
        if obj.nil?
          status(204)
          return ''
        end
        obj.to_json
      end

      # Lazy-fetch the job specified by <tt>job_id</tt>, raising a 404 when
      # no such job exists.
      def current_job
        return @job if @job
        @job = Job.find_by_id(params[:job_id])
        raise Sinatra::NotFound unless @job
        @job
      end

      # Lazy-fetch the WorkUnit specified by <tt>work_unit_id</tt>, raising a
      # 404 when no such unit exists.
      def current_work_unit
        return @work_unit if @work_unit
        @work_unit = WorkUnit.find_by_id(params[:work_unit_id])
        raise Sinatra::NotFound unless @work_unit
        @work_unit
      end

    end
  end
end
module CloudCrowd

  # Pilfered in parts from the ActiveSupport::Inflector.
  module Inflector

    # Turn an underscored/path-style name into a CamelCased constant name:
    # "cloud_crowd/node" => "CloudCrowd::Node".
    def self.camelize(word)
      word.to_s.
        gsub(%r{/(.?)}) { "::#{Regexp.last_match(1).upcase}" }.
        gsub(/(?:^|_)(.)/) { Regexp.last_match(1).upcase }
    end

    # The reverse of camelize: "CloudCrowd::Node" => "cloud_crowd/node".
    def self.underscore(word)
      result = word.to_s.gsub(/::/, '/')
      result = result.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
      result = result.gsub(/([a-z\d])([A-Z])/, '\1_\2')
      result.tr("-", "_").downcase
    end

  end
end
module CloudCrowd

  # A chunk of work that will be farmed out into many WorkUnits to be processed
  # in parallel by each active CloudCrowd::Worker. Jobs are defined by a list
  # of inputs (usually public urls to files), an action (the name of a script that
  # CloudCrowd knows how to run), and, eventually a corresponding list of output.
  class Job < ActiveRecord::Base
    include ModelStatus

    CLEANUP_GRACE_PERIOD = 7 # That's a week.

    has_many :work_units, :dependent => :destroy

    validates_presence_of :status, :inputs, :action, :options

    before_validation_on_create :set_initial_status
    after_create :queue_for_workers
    before_destroy :cleanup_assets

    # Jobs that were last updated more than N days ago.
    named_scope :older_than, lambda {|num| {:conditions => ['updated_at < ?', num.days.ago]} }

    # Create a Job from an incoming JSON request, and add it to the queue.
    # +h+ is the parsed request hash; inputs and options are re-serialized
    # for storage in their text columns.
    def self.create_from_request(h)
      self.create(
        :inputs       => h['inputs'].to_json,
        :action       => h['action'],
        :options      => (h['options'] || {}).to_json,
        :email        => h['email'],
        :callback_url => h['callback_url']
      )
    end

    # Clean up all complete jobs beyond a certain age (in days, defaulting to
    # CLEANUP_GRACE_PERIOD). Batched to keep memory use bounded.
    def self.cleanup_all(opts = {})
      days = opts[:days] || CLEANUP_GRACE_PERIOD
      self.complete.older_than(days).find_in_batches(:batch_size => 100) do |jobs|
        jobs.each {|job| job.destroy }
      end
    end

    # After work units are marked successful, we check to see if all of them have
    # finished, if so, continue on to the next phase of the job. Returns self.
    def check_for_completion
      return unless all_work_units_complete?
      set_next_status
      outs = gather_outputs_from_work_units
      # A merge phase receives the combined outputs as its single input.
      return queue_for_workers([outs]) if merging?
      if complete?
        update_attributes(:outputs => outs, :time => time_taken)
        # Fire the callback in a separate thread so the HTTP request doesn't
        # block the current transaction.
        Thread.new { fire_callback } if callback_url
      end
      self
    end

    # Transition this Job's current status to the appropriate next one, based
    # on the state of the WorkUnits and the nature of the Action.
    def set_next_status
      update_attribute(:status,
        any_work_units_failed? ? FAILED :
        self.splitting?        ? PROCESSING :
        self.mergeable?        ? MERGING :
                                 SUCCEEDED
      )
    end

    # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
    # completion. The <tt>callback_url</tt> may include HTTP basic authentication,
    # if you like:
    #   http://user:password@example.com/job_complete
    # If the callback URL returns a '201 Created' HTTP status code, CloudCrowd
    # will assume that the resource has been successfully created, and the Job
    # will be cleaned up.
    def fire_callback
      begin
        response = RestClient.post(callback_url, {:job => self.to_json})
        Thread.new { self.destroy } if response && response.code == 201
      rescue RestClient::Exception
        # Best-effort: a failed callback is logged, not raised.
        puts "Job ##{id} failed to fire callback: #{callback_url}"
      end
    end

    # Cleaning up after a job will remove all of its files from S3 or the
    # filesystem. Destroying a Job will cleanup_assets first. Run this in a
    # separate thread to get out of the transaction's way.
    # TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
    def cleanup_assets
      AssetStore.new.cleanup(self)
    end

    # Have all of the WorkUnits finished?
    def all_work_units_complete?
      self.work_units.incomplete.count <= 0
    end

    # Have any of the WorkUnits failed?
    def any_work_units_failed?
      self.work_units.failed.count > 0
    end

    # This job is splittable if its Action has a +split+ method.
    def splittable?
      self.action_class.public_instance_methods.map {|m| m.to_sym }.include? :split
    end

    # This job is done splitting if it's finished with its splitting work units.
    def done_splitting?
      splittable? && work_units.splitting.count <= 0
    end

    # This job is mergeable if its Action has a +merge+ method.
    def mergeable?
      self.processing? && self.action_class.public_instance_methods.map {|m| m.to_sym }.include?(:merge)
    end

    # Retrieve the class for this Job's Action, raising ActionNotFound when
    # no action by that name is installed.
    def action_class
      @action_class ||= CloudCrowd.actions[self.action]
      return @action_class if @action_class
      raise Error::ActionNotFound, "no action named: '#{self.action}' could be found"
    end

    # How complete is this Job?
    # Unfortunately, with the current processing sequence, the percent_complete
    # can pull a fast one and go backwards. This happens when there's a single
    # large input that takes a long time to split, and when it finally does it
    # creates a whole swarm of work units. This seems unavoidable.
    def percent_complete
      return 99  if merging?
      return 100 if complete?
      unit_count = work_units.count
      return 100 if unit_count <= 0
      (work_units.complete.count / unit_count.to_f * 100).round
    end

    # How long has this Job taken?
    def time_taken
      return self.time if self.time
      Time.now - self.created_at
    end

    # Generate a stable 8-bit Hex color code, based on the Job's id.
    def color
      @color ||= Digest::MD5.hexdigest(self.id.to_s)[-7...-1]
    end

    # A JSON representation of this job includes the statuses of its component
    # WorkUnits, as well as any completed outputs.
    def to_json(opts={})
      atts = {
        'id'               => id,
        'color'            => color,
        'status'           => display_status,
        'percent_complete' => percent_complete,
        'work_units'       => work_units.count,
        'time_taken'       => time_taken
      }
      atts['outputs'] = JSON.parse(outputs) if outputs
      atts['email']   = email if email
      atts.to_json
    end


    private

    # When the WorkUnits are all finished, gather all their outputs together
    # before removing them from the database entirely. Returns their merged JSON.
    # The completed scope is loaded once and reused, so the exact set of units
    # that was mapped is the set that gets destroyed.
    def gather_outputs_from_work_units
      complete_units = self.work_units.complete
      outs = complete_units.map {|u| u.parsed_output }
      complete_units.destroy_all
      outs.to_json
    end

    # When starting a new job, or moving to a new stage, split up the inputs
    # into WorkUnits, and queue them. Workers will start picking them up right
    # away.
    def queue_for_workers(input=nil)
      input ||= JSON.parse(self.inputs)
      input.each {|i| WorkUnit.start(self, action, i, status) }
      self
    end

    # A Job starts out either splitting or processing, depending on its action.
    def set_initial_status
      self.status = self.splittable? ? SPLITTING : PROCESSING
    end

  end
end
module CloudCrowd

  # A NodeRecord is the central server's record of a Node running remotely. We
  # can use it to assign WorkUnits to the Node, and keep track of its status.
  # When a Node exits, it destroys this record.
  class NodeRecord < ActiveRecord::Base

    has_many :work_units

    validates_presence_of :host, :ip_address, :port, :enabled_actions

    # When this record goes away, its checked-out WorkUnits must be handed
    # back to the pool (see #redistribute_work_units below).
    after_destroy :redistribute_work_units

    # Available Nodes haven't used up their maximum number of workers yet.
    # (A null max_workers means the node takes an unlimited number.) Ordered
    # by updated_at ascending so the least-recently-used node comes first.
    named_scope :available, {
      :conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
      :order => 'updated_at asc'
    }

    # Register a Node with the central server. Currently this only happens at
    # Node startup. Creates a record keyed by host, or updates the existing
    # one in place.
    def self.check_in(params, request)
      attrs = {
        :ip_address => request.ip,
        :port => params[:port],
        :busy => params[:busy],
        :max_workers => params[:max_workers],
        :enabled_actions => params[:enabled_actions]
      }
      self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
    end

    # Dispatch a WorkUnit to this node. Places the node at back at the end of
    # the rotation. If we fail to send the WorkUnit, we consider the node to be
    # down, and remove this record, freeing up all of its checked-out work units.
    # If the Node responds that it's overloaded, we mark it as busy. Returns
    # true if the WorkUnit was dispatched successfully.
    def send_work_unit(unit)
      result = node['/work'].post(:work_unit => unit.to_json)
      unit.assign_to(self, JSON.parse(result)['pid'])
      # touch bumps updated_at, rotating this node to the back of the
      # 'available' ordering.
      touch && true
    rescue RestClient::RequestFailed => e
      # A 503 with the OVERLOADED_MESSAGE body is the node's polite refusal;
      # any other failure is unexpected and should propagate.
      raise e unless e.http_code == 503 && e.http_body == Node::OVERLOADED_MESSAGE
      update_attribute(:busy, true) && false
    rescue RestClient::Exception, Errno::ECONNREFUSED, Timeout::Error
      # Couldn't post to node, assume it's gone away.
      destroy && false
    end

    # What Actions is this Node able to run? (Stored as a comma-separated
    # string in the enabled_actions column.)
    def actions
      @actions ||= enabled_actions.split(',')
    end

    # Is this Node too busy for more work? Determined by number of workers, or
    # the Node's load average, as configured in config.yml.
    def busy?
      busy || (max_workers && work_units.count >= max_workers)
    end

    # The URL at which this Node may be reached.
    # TODO: Make sure that the host actually has externally accessible DNS.
    def url
      @url ||= "http://#{host}:#{port}"
    end

    # Keep a RestClient::Resource handy for contacting the Node, including
    # HTTP authentication, if configured.
    def node
      @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
    end

    # The printable status of the Node.
    def display_status
      busy? ? 'busy' : 'available'
    end

    # A list of the process ids of the workers currently being run by the Node.
    def worker_pids
      work_units.all(:select => 'worker_pid').map(&:worker_pid)
    end

    # Release all of this Node's WorkUnits for other nodes to take.
    def release_work_units
      WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
    end

    # The JSON representation of a NodeRecord includes its worker_pids.
    def to_json(opts={})
      { 'host' => host,
        'workers' => worker_pids,
        'status' => display_status
      }.to_json
    end


    private

    # When a Node exits, release its WorkUnits and redistribute them to others.
    # Redistribute in a separate thread to avoid delaying shutdown.
    def redistribute_work_units
      release_work_units
      Thread.new { WorkUnit.distribute_to_nodes }
    end

  end
end
module CloudCrowd

  # A WorkUnit is an atomic chunk of work from a job, processing a single input
  # through a single action. The WorkUnits are run in parallel, with each worker
  # daemon processing one at a time. The splitting and merging stages of a job
  # are each run as a single WorkUnit.
  class WorkUnit < ActiveRecord::Base
    include ModelStatus

    # We use a random number in (0...MAX_RESERVATION) to reserve work units.
    # The size of the maximum signed integer in MySQL -- SQLite has no limit.
    MAX_RESERVATION = 2147483647

    # We only reserve a certain number of WorkUnits in a single go, to avoid
    # reserving the entire table.
    RESERVATION_LIMIT = 25

    belongs_to :job
    belongs_to :node_record

    validates_presence_of :job_id, :status, :input, :action

    # Available WorkUnits are waiting to be distributed to Nodes for processing.
    named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
    # Reserved WorkUnits have been marked for distribution by a central server process.
    named_scope :reserved, lambda {|reservation|
      {:conditions => {:reservation => reservation}, :order => 'updated_at asc'}
    }

    # Attempt to send a list of WorkUnits to nodes with available capacity.
    # A single central server process stops the same WorkUnit from being
    # distributed to multiple nodes by reserving it first. The algorithm used
    # should be lock-free.
    #
    # We reserve WorkUnits for this process in chunks of RESERVATION_LIMIT size,
    # and try to match them to Nodes that are capable of handling the Action.
    # WorkUnits get removed from the availability list when they are
    # successfully sent, and Nodes get removed when they are busy or have the
    # action in question disabled.
    def self.distribute_to_nodes
      reservation = nil
      loop do
        return unless reservation = WorkUnit.reserve_available(:limit => RESERVATION_LIMIT)
        work_units = WorkUnit.reserved(reservation)
        available_nodes = NodeRecord.available
        while node = available_nodes.shift and unit = work_units.shift do
          if node.actions.include? unit.action
            if node.send_work_unit(unit)
              # A node that isn't yet full goes back into the rotation.
              available_nodes.push(node) unless node.busy?
              next
            end
          end
          # The node couldn't take this unit; put it back for another node.
          work_units.push(unit)
        end
        # Stop when units remain unplaced, or when no nodes are left to try.
        return if work_units.any? || available_nodes.empty?
      end
    ensure
      WorkUnit.cancel_reservations(reservation) if reservation
    end

    # Reserves all available WorkUnits for this process. Returns false if there
    # were none available.
    def self.reserve_available(options={})
      reservation = ActiveSupport::SecureRandom.random_number(MAX_RESERVATION)
      any = WorkUnit.available.update_all("reservation = #{reservation}", nil, options) > 0
      any && reservation
    end

    # Cancels all outstanding WorkUnit reservations for this process.
    def self.cancel_reservations(reservation)
      WorkUnit.reserved(reservation).update_all('reservation = null')
    end

    # Cancels all outstanding WorkUnit reservations for all processes. (Useful
    # in the console for debugging.)
    def self.cancel_all_reservations
      WorkUnit.update_all('reservation = null')
    end

    # Look up a WorkUnit by the worker that's currently processing it. Specified
    # by <tt>pid@host</tt>.
    def self.find_by_worker_name(name)
      pid, host = name.split('@')
      node = NodeRecord.find_by_host(host)
      node && node.work_units.find_by_worker_pid(pid)
    end

    # Convenience method for starting a new WorkUnit.
    def self.start(job, action, input, status)
      input = input.to_json unless input.is_a? String
      self.create(:job => job, :action => action, :input => input, :status => status)
    end

    # Mark this unit as having finished successfully.
    # Splitting work units are handled differently (an optimization) -- they
    # immediately fire off all of their resulting WorkUnits for processing,
    # without waiting for the rest of their splitting cousins to complete.
    def finish(result, time_taken)
      if splitting?
        [parsed_output(result)].flatten.each do |new_input|
          WorkUnit.start(job, action, new_input, PROCESSING)
        end
        self.destroy
        job.set_next_status if job && job.done_splitting?
      else
        update_attributes({
          :status      => SUCCEEDED,
          :node_record => nil,
          :worker_pid  => nil,
          :attempts    => attempts + 1,
          :output      => result,
          :time        => time_taken
        })
        job && job.check_for_completion
      end
    end

    # Mark this unit as having failed. May attempt a retry.
    def fail(output, time_taken)
      tries = self.attempts + 1
      return try_again if tries < CloudCrowd.config[:work_unit_retries]
      update_attributes({
        :status      => FAILED,
        :node_record => nil,
        :worker_pid  => nil,
        :attempts    => tries,
        :output      => output,
        :time        => time_taken
      })
      job && job.check_for_completion
    end

    # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
    def try_again
      update_attributes({
        :node_record => nil,
        :worker_pid  => nil,
        :attempts    => self.attempts + 1
      })
    end

    # When a Node checks out a WorkUnit, establish the connection between
    # WorkUnit and NodeRecord and record the worker_pid.
    def assign_to(node_record, worker_pid)
      update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
    end

    # All output needs to be wrapped in a JSON object for consistency
    # (unfortunately, JSON.parse needs the top-level to be an object or array).
    # Convenience method to provide the parsed version.
    def parsed_output(out = self.output)
      JSON.parse(out)['output']
    end

    # The JSON representation of a WorkUnit shares the Job's options with all
    # its cousin WorkUnits. Accepts an options hash for consistency with
    # Job#to_json and NodeRecord#to_json (previously it took no arguments,
    # which raised ArgumentError when serialization options were passed).
    def to_json(opts={})
      {
        'id'       => self.id,
        'job_id'   => self.job_id,
        'input'    => self.input,
        'attempts' => self.attempts,
        'action'   => self.action,
        'options'  => JSON.parse(self.job.options),
        'status'   => self.status
      }.to_json
    end

  end
end
module CloudCrowd

  # Adds named scopes and query methods for every CloudCrowd status to
  # both Jobs and WorkUnits.
  module ModelStatus

    # When mixed into an ActiveRecord model, defines one named_scope per
    # status code (the status constants come from the CloudCrowd namespace).
    def self.included(klass)

      klass.class_eval do
        # Note that COMPLETE and INCOMPLETE are unions of other states.
        named_scope 'processing', :conditions => {:status => PROCESSING}
        named_scope 'succeeded', :conditions => {:status => SUCCEEDED}
        named_scope 'failed', :conditions => {:status => FAILED}
        named_scope 'splitting', :conditions => {:status => SPLITTING}
        named_scope 'merging', :conditions => {:status => MERGING}
        named_scope 'complete', :conditions => {:status => COMPLETE}
        named_scope 'incomplete', :conditions => {:status => INCOMPLETE}
      end

    end

    # Boolean predicates for each status code. COMPLETE and INCOMPLETE are
    # tested with include? because they group several individual statuses.
    def processing?; self.status == PROCESSING; end
    def succeeded?; self.status == SUCCEEDED; end
    def failed?; self.status == FAILED; end
    def splitting?; self.status == SPLITTING; end
    def merging?; self.status == MERGING; end
    def complete?; COMPLETE.include?(self.status); end
    def incomplete?; INCOMPLETE.include?(self.status); end

    # Get the displayable status name of the model's status code.
    def display_status
      CloudCrowd.display_status(self.status)
    end

  end
end

require 'cloud_crowd/models/job'
require 'cloud_crowd/models/node_record'
require 'cloud_crowd/models/work_unit'