mooktakim-cloud-crowd 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/EPIGRAPHS +17 -0
- data/LICENSE +22 -0
- data/README +93 -0
- data/actions/graphics_magick.rb +43 -0
- data/actions/process_pdfs.rb +92 -0
- data/actions/word_count.rb +16 -0
- data/bin/crowd +5 -0
- data/config/config.example.ru +23 -0
- data/config/config.example.yml +55 -0
- data/config/database.example.yml +16 -0
- data/examples/graphics_magick_example.rb +44 -0
- data/examples/process_pdfs_example.rb +40 -0
- data/examples/word_count_example.rb +42 -0
- data/lib/cloud-crowd.rb +188 -0
- data/lib/cloud_crowd/action.rb +125 -0
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +39 -0
- data/lib/cloud_crowd/asset_store/s3_store.rb +43 -0
- data/lib/cloud_crowd/asset_store.rb +41 -0
- data/lib/cloud_crowd/command_line.rb +242 -0
- data/lib/cloud_crowd/exceptions.rb +46 -0
- data/lib/cloud_crowd/helpers/authorization.rb +52 -0
- data/lib/cloud_crowd/helpers/resources.rb +25 -0
- data/lib/cloud_crowd/helpers.rb +8 -0
- data/lib/cloud_crowd/inflector.rb +19 -0
- data/lib/cloud_crowd/models/job.rb +190 -0
- data/lib/cloud_crowd/models/node_record.rb +107 -0
- data/lib/cloud_crowd/models/work_unit.rb +170 -0
- data/lib/cloud_crowd/models.rb +40 -0
- data/lib/cloud_crowd/node.rb +199 -0
- data/lib/cloud_crowd/schema.rb +50 -0
- data/lib/cloud_crowd/server.rb +123 -0
- data/lib/cloud_crowd/worker.rb +149 -0
- data/mooktakim-cloud-crowd.gemspec +116 -0
- data/public/css/admin_console.css +243 -0
- data/public/css/reset.css +42 -0
- data/public/images/bullet_green.png +0 -0
- data/public/images/bullet_white.png +0 -0
- data/public/images/cloud_hand.png +0 -0
- data/public/images/header_back.png +0 -0
- data/public/images/logo.png +0 -0
- data/public/images/queue_fill.png +0 -0
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/images/server_error.png +0 -0
- data/public/images/sidebar_bottom.png +0 -0
- data/public/images/sidebar_top.png +0 -0
- data/public/images/worker_info.png +0 -0
- data/public/images/worker_info_loading.gif +0 -0
- data/public/js/admin_console.js +197 -0
- data/public/js/excanvas.js +1 -0
- data/public/js/flot.js +1 -0
- data/public/js/jquery.js +19 -0
- data/test/acceptance/test_failing_work_units.rb +33 -0
- data/test/acceptance/test_node.rb +20 -0
- data/test/acceptance/test_server.rb +66 -0
- data/test/acceptance/test_word_count.rb +40 -0
- data/test/blueprints.rb +25 -0
- data/test/config/actions/failure_testing.rb +13 -0
- data/test/config/config.ru +17 -0
- data/test/config/config.yml +6 -0
- data/test/config/database.yml +3 -0
- data/test/test_helper.rb +19 -0
- data/test/unit/test_action.rb +70 -0
- data/test/unit/test_configuration.rb +48 -0
- data/test/unit/test_job.rb +103 -0
- data/test/unit/test_node.rb +41 -0
- data/test/unit/test_node_record.rb +42 -0
- data/test/unit/test_work_unit.rb +53 -0
- data/test/unit/test_worker.rb +48 -0
- data/views/operations_center.erb +82 -0
- metadata +290 -0
module CloudCrowd

  # Base Error class from which every custom CloudCrowd exception descends.
  # Rescuing CloudCrowd::Error (or RuntimeError) therefore catches them all.
  # A correctly configured cluster should never raise any of these.
  class Error < RuntimeError

    # Raised when a job is created for an action that doesn't exist.
    class ActionNotFound < Error; end

    # Raised when config.yml specifies a storage back-end that doesn't exist.
    class StorageNotFound < Error; end

    # Raised when the AssetStore can't write to its scratch directory.
    class StorageNotWritable < Error; end

    # Raised when a WorkUnit returns without a valid status code.
    class StatusUnspecified < Error; end

    # Raised when a method needs configuration that isn't present in
    # config.yml.
    class MissingConfiguration < Error; end

    # Raised when an action shells out and the external command exits with a
    # non-zero status. Carries the command's exit code alongside the message.
    class CommandFailed < Error

      # The exit status of the failed external command.
      attr_reader :exit_code

      def initialize(message, exit_code)
        super(message)
        @exit_code = exit_code
      end

    end

  end

end
module CloudCrowd
  module Helpers

    # HTTP Basic authentication helpers, modeled on sinatra-authorization.
    # See http://github.com/integrity/sinatra-authorization for the original.
    module Authorization

      # Halt the request unless it carries valid credentials. Each guard
      # halts the response cycle, so execution only reaches the final line
      # when the credentials check out.
      def login_required
        return if authorized?
        unauthorized! unless auth.provided?
        bad_request! unless auth.basic?
        unauthorized! unless authorize(*auth.credentials)
        request.env['REMOTE_USER'] = auth.username
      end

      # Has this request already been authenticated?
      def authorized?
        !!request.env['REMOTE_USER']
      end

      # A request is authorized when its login and password match the pair
      # stored in config.yml, or when HTTP authentication is disabled
      # entirely. When authentication is turned on, every request is
      # checked -- including requests between the nodes and the central
      # server.
      def authorize(login, password)
        config = CloudCrowd.config
        return true unless config[:http_authentication]
        config[:login] == login && config[:password] == password
      end


      private

      # Memoized Rack Basic-auth wrapper around the current request.
      def auth
        @auth ||= Rack::Auth::Basic::Request.new(request.env)
      end

      # Reject the request with a 401, prompting the browser to ask the
      # user for credentials for the given realm.
      def unauthorized!(realm = Server.authorization_realm)
        response['WWW-Authenticate'] = "Basic realm=\"#{realm}\""
        halt 401, 'Authorization Required'
      end

      # Reject a malformed (non-Basic) authorization attempt with a 400.
      def bad_request!
        halt 400, 'Bad Request'
      end

    end
  end
end
module CloudCrowd
  module Helpers
    module Resources

      # Convenience method for responding with JSON. Sets the content-type,
      # serializes the object, and responds with an empty 204 when given nil.
      def json(obj)
        content_type :json
        return obj.to_json unless obj.nil?
        status(204) && ''
      end

      # Lazy-fetch the job specified by <tt>job_id</tt>, raising a 404 when
      # no such job exists.
      def current_job
        @job ||= Job.find_by_id(params[:job_id])
        raise Sinatra::NotFound unless @job
        @job
      end

      # Lazy-fetch the WorkUnit specified by <tt>work_unit_id</tt>, raising
      # a 404 when no such unit exists.
      def current_work_unit
        @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id])
        raise Sinatra::NotFound unless @work_unit
        @work_unit
      end

    end
  end
end
module CloudCrowd

  # A minimal inflector, pilfered in parts from ActiveSupport::Inflector.
  module Inflector

    # Turn an underscored, slash-namespaced name into a Ruby constant path:
    #   "cloud_crowd/word_count" => "CloudCrowd::WordCount"
    def self.camelize(word)
      namespaced = word.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }
      namespaced.gsub(/(?:^|_)(.)/) { $1.upcase }
    end

    # The inverse of camelize -- turn a constant path into an underscored,
    # slash-namespaced file name:
    #   "CloudCrowd::WordCount" => "cloud_crowd/word_count"
    def self.underscore(word)
      result = word.to_s.gsub(/::/, '/')
      result = result.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
      result = result.gsub(/([a-z\d])([A-Z])/, '\1_\2')
      result.tr("-", "_").downcase
    end

  end
end
module CloudCrowd

  # A chunk of work that will be farmed out into many WorkUnits to be processed
  # in parallel by each active CloudCrowd::Worker. Jobs are defined by a list
  # of inputs (usually public urls to files), an action (the name of a script
  # that CloudCrowd knows how to run), and, eventually, a corresponding list
  # of outputs.
  class Job < ActiveRecord::Base
    include ModelStatus

    # Completed jobs become eligible for cleanup after this many days.
    CLEANUP_GRACE_PERIOD = 7 # That's a week.

    has_many :work_units, :dependent => :destroy

    validates_presence_of :status, :inputs, :action, :options

    before_validation_on_create :set_initial_status
    after_create :queue_for_workers
    before_destroy :cleanup_assets

    # Jobs that were last updated more than N days ago.
    named_scope :older_than, lambda {|num| {:conditions => ['updated_at < ?', num.days.ago]} }

    # Create a Job from an incoming JSON request, and add it to the queue.
    # +h+ is the parsed request hash; 'inputs' and 'options' are re-serialized
    # to JSON for storage in their columns.
    def self.create_from_request(h)
      self.create(
        :inputs       => h['inputs'].to_json,
        :action       => h['action'],
        :options      => (h['options'] || {}).to_json,
        :email        => h['email'],
        :callback_url => h['callback_url']
      )
    end

    # Clean up (destroy) all complete jobs last updated more than
    # opts[:days] days ago (default CLEANUP_GRACE_PERIOD), in batches of 100
    # to keep memory usage flat.
    def self.cleanup_all(opts = {})
      days = opts[:days] || CLEANUP_GRACE_PERIOD
      self.complete.older_than(days).find_in_batches(:batch_size => 100) do |jobs|
        jobs.each {|job| job.destroy }
      end
    end

    # After work units are marked successful, we check to see if all of them
    # have finished; if so, continue on to the next phase of the job. When the
    # job is fully complete, record the outputs and fire the callback (in a
    # separate thread, to stay off the request cycle). Returns self.
    def check_for_completion
      return unless all_work_units_complete?
      set_next_status
      outs = gather_outputs_from_work_units
      return queue_for_workers([outs]) if merging?
      if complete?
        update_attributes(:outputs => outs, :time => time_taken)
        Thread.new { fire_callback } if callback_url
      end
      self
    end

    # Transition this Job's current status to the appropriate next one, based
    # on the state of the WorkUnits and the nature of the Action.
    def set_next_status
      update_attribute(:status,
        any_work_units_failed? ? FAILED     :
        self.splitting?        ? PROCESSING :
        self.mergeable?        ? MERGING    :
                                 SUCCEEDED
      )
    end

    # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
    # completion. The <tt>callback_url</tt> may include HTTP basic
    # authentication, if you like:
    #   http://user:password@example.com/job_complete
    # If the callback URL returns a '201 Created' HTTP status code, CloudCrowd
    # will assume that the resource has been successfully created, and the Job
    # will be cleaned up. Callback failures are logged but never raised.
    def fire_callback
      begin
        response = RestClient.post(callback_url, {:job => self.to_json})
        Thread.new { self.destroy } if response && response.code == 201
      rescue RestClient::Exception
        puts "Job ##{id} failed to fire callback: #{callback_url}"
      end
    end

    # Cleaning up after a job will remove all of its files from S3 or the
    # filesystem. Destroying a Job will cleanup_assets first. Run this in a
    # separate thread to get out of the transaction's way.
    # TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
    def cleanup_assets
      AssetStore.new.cleanup(self)
    end

    # Have all of the WorkUnits finished?
    def all_work_units_complete?
      self.work_units.incomplete.count <= 0
    end

    # Have any of the WorkUnits failed?
    def any_work_units_failed?
      self.work_units.failed.count > 0
    end

    # This job is splittable if its Action has a +split+ method.
    def splittable?
      self.action_class.public_instance_methods.map {|m| m.to_sym }.include? :split
    end

    # This job is done splitting if it's finished with its splitting work units.
    def done_splitting?
      splittable? && work_units.splitting.count <= 0
    end

    # This job is mergeable if it's in the processing stage and its Action has
    # a +merge+ method.
    def mergeable?
      self.processing? && self.action_class.public_instance_methods.map {|m| m.to_sym }.include?(:merge)
    end

    # Retrieve the class for this Job's Action, memoized. Raises
    # Error::ActionNotFound when no installed action matches.
    def action_class
      @action_class ||= CloudCrowd.actions[self.action]
      return @action_class if @action_class
      raise Error::ActionNotFound, "no action named: '#{self.action}' could be found"
    end

    # How complete is this Job?
    # Unfortunately, with the current processing sequence, the percent_complete
    # can pull a fast one and go backwards. This happens when there's a single
    # large input that takes a long time to split, and when it finally does it
    # creates a whole swarm of work units. This seems unavoidable.
    def percent_complete
      return 99 if merging?
      return 100 if complete?
      unit_count = work_units.count
      return 100 if unit_count <= 0
      (work_units.complete.count / unit_count.to_f * 100).round
    end

    # How long has this Job taken? Returns the recorded duration once the job
    # is finished, and the elapsed time since creation while it's running.
    def time_taken
      return self.time if self.time
      Time.now - self.created_at
    end

    # Generate a stable six-digit hex color code, based on an MD5 digest of
    # the Job's id. Used by the admin console to tint each job consistently.
    def color
      @color ||= Digest::MD5.hexdigest(self.id.to_s)[-7...-1]
    end

    # A JSON representation of this job includes the statuses of its component
    # WorkUnits, as well as any completed outputs.
    def to_json(opts={})
      atts = {
        'id'               => id,
        'color'            => color,
        'status'           => display_status,
        'percent_complete' => percent_complete,
        'work_units'       => work_units.count,
        'time_taken'       => time_taken
      }
      atts['outputs'] = JSON.parse(outputs) if outputs
      atts['email']   = email if email
      atts.to_json
    end


    private

    # When the WorkUnits are all finished, gather all their outputs together
    # before removing them from the database entirely. Returns their merged
    # JSON. Loads the completed units once, and destroys exactly the units
    # whose outputs were gathered (the previous implementation re-queried
    # the scope for each step, so units completing in between could be
    # destroyed without their outputs being collected).
    def gather_outputs_from_work_units
      units = self.work_units.complete
      outs = units.map {|u| u.parsed_output }
      units.each {|u| u.destroy }
      outs.to_json
    end

    # When starting a new job, or moving to a new stage, split up the inputs
    # into WorkUnits, and queue them. Workers will start picking them up right
    # away. With no argument, the Job's own stored inputs are used.
    def queue_for_workers(input=nil)
      input ||= JSON.parse(self.inputs)
      input.each {|i| WorkUnit.start(self, action, i, status) }
      self
    end

    # A Job starts out either splitting or processing, depending on its action.
    def set_initial_status
      self.status = self.splittable? ? SPLITTING : PROCESSING
    end

  end
end
module CloudCrowd

  # A NodeRecord is the central server's record of a Node running remotely. We
  # can use it to assign WorkUnits to the Node, and keep track of its status.
  # When a Node exits, it destroys this record.
  class NodeRecord < ActiveRecord::Base

    has_many :work_units

    validates_presence_of :host, :ip_address, :port, :enabled_actions

    after_destroy :redistribute_work_units

    # Available Nodes haven't used up their maxiumum number of workers yet.
    # Ordered by updated_at ascending, so the least-recently-touched node is
    # offered work first (a simple round-robin via the `touch` in
    # +send_work_unit+).
    named_scope :available, {
      :conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
      :order => 'updated_at asc'
    }

    # Register a Node with the central server. Currently this only happens at
    # Node startup. Finds (or creates) the record by host name and overwrites
    # its attributes with the check-in params; update_attributes! raises on
    # validation failure.
    def self.check_in(params, request)
      attrs = {
        :ip_address      => request.ip,
        :port            => params[:port],
        :busy            => params[:busy],
        :max_workers     => params[:max_workers],
        :enabled_actions => params[:enabled_actions]
      }
      self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
    end

    # Dispatch a WorkUnit to this node. Places the node at back at the end of
    # the rotation. If we fail to send the WorkUnit, we consider the node to be
    # down, and remove this record, freeing up all of its checked-out work units.
    # If the Node responds that it's overloaded, we mark it as busy. Returns
    # true if the WorkUnit was dispatched successfully.
    def send_work_unit(unit)
      result = node['/work'].post(:work_unit => unit.to_json)
      unit.assign_to(self, JSON.parse(result)['pid'])
      # touch bumps updated_at, rotating this node to the back of the
      # `available` ordering; `&& true` normalizes the return value.
      touch && true
    rescue RestClient::RequestFailed => e
      # Only a 503 with the Node's overload sentinel means "busy" -- anything
      # else is unexpected and re-raised.
      raise e unless e.http_code == 503 && e.http_body == Node::OVERLOADED_MESSAGE
      update_attribute(:busy, true) && false
    rescue RestClient::Exception, Errno::ECONNREFUSED, Timeout::Error
      # Couldn't post to node, assume it's gone away.
      # (after_destroy then releases and redistributes its work units.)
      destroy && false
    end

    # What Actions is this Node able to run? Parsed (and memoized) from the
    # comma-separated +enabled_actions+ column.
    def actions
      @actions ||= enabled_actions.split(',')
    end

    # Is this Node too busy for more work? Determined by number of workers, or
    # the Node's load average, as configured in config.yml.
    def busy?
      busy || (max_workers && work_units.count >= max_workers)
    end

    # The URL at which this Node may be reached.
    # TODO: Make sure that the host actually has externally accessible DNS.
    def url
      @url ||= "http://#{host}:#{port}"
    end

    # Keep a RestClient::Resource handy for contacting the Node, including
    # HTTP authentication, if configured.
    def node
      @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
    end

    # The printable status of the Node.
    def display_status
      busy? ? 'busy' : 'available'
    end

    # A list of the process ids of the workers currently being run by the Node.
    def worker_pids
      work_units.all(:select => 'worker_pid').map(&:worker_pid)
    end

    # Release all of this Node's WorkUnits for other nodes to take.
    # Single UPDATE statement; the interpolated +id+ is this record's own
    # integer primary key, not user input.
    def release_work_units
      WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
    end

    # The JSON representation of a NodeRecord includes its worker_pids.
    def to_json(opts={})
      { 'host'    => host,
        'workers' => worker_pids,
        'status'  => display_status
      }.to_json
    end


    private

    # When a Node exits, release its WorkUnits and redistribute them to others.
    # Redistribute in a separate thread to avoid delaying shutdown.
    def redistribute_work_units
      release_work_units
      Thread.new { WorkUnit.distribute_to_nodes }
    end

  end
end
module CloudCrowd

  # A WorkUnit is an atomic chunk of work from a job, processing a single input
  # through a single action. The WorkUnits are run in parallel, with each worker
  # daemon processing one at a time. The splitting and merging stages of a job
  # are each run as a single WorkUnit.
  class WorkUnit < ActiveRecord::Base
    include ModelStatus

    # We use a random number in (0...MAX_RESERVATION) to reserve work units.
    # The size of the maximum signed integer in MySQL -- SQLite has no limit.
    MAX_RESERVATION = 2147483647

    # We only reserve a certain number of WorkUnits in a single go, to avoid
    # reserving the entire table.
    RESERVATION_LIMIT = 25

    belongs_to :job
    belongs_to :node_record

    validates_presence_of :job_id, :status, :input, :action

    # Available WorkUnits are waiting to be distributed to Nodes for processing.
    named_scope :available, {:conditions => {:reservation => nil, :worker_pid => nil, :status => INCOMPLETE}}
    # Reserved WorkUnits have been marked for distribution by a central server process.
    named_scope :reserved, lambda {|reservation|
      {:conditions => {:reservation => reservation}, :order => 'updated_at asc'}
    }

    # Attempt to send a list of WorkUnits to nodes with available capacity.
    # A single central server process stops the same WorkUnit from being
    # distributed to multiple nodes by reserving it first. The algorithm used
    # should be lock-free.
    #
    # We reserve WorkUnits for this process in chunks of RESERVATION_LIMIT size,
    # and try to match them to Nodes that are capable of handling the Action.
    # WorkUnits get removed from the availability list when they are
    # successfully sent, and Nodes get removed when they are busy or have the
    # action in question disabled.
    def self.distribute_to_nodes
      reservation = nil
      loop do
        # No more available units to reserve -- nothing left to distribute.
        return unless reservation = WorkUnit.reserve_available(:limit => RESERVATION_LIMIT)
        work_units = WorkUnit.reserved(reservation)
        available_nodes = NodeRecord.available
        # NOTE(review): shift/push here treat the two scopes as array-like
        # collections -- relies on the Rails 2 scope proxy delegating to the
        # loaded record array; confirm if upgrading ActiveRecord.
        while node = available_nodes.shift and unit = work_units.shift do
          if node.actions.include? unit.action
            if node.send_work_unit(unit)
              # The node accepted the unit; keep it in rotation if it still
              # has capacity.
              available_nodes.push(node) unless node.busy?
              next
            end
          end
          # Node couldn't take this unit (wrong action, busy, or gone) --
          # put the unit back for another node in this pass.
          work_units.push(unit)
        end
        # Stop when there are leftover units (no node could take them) or
        # we've run out of nodes; otherwise reserve the next chunk.
        return if work_units.any? || available_nodes.empty?
      end
    ensure
      # Always cancel leftover reservations so undistributed units become
      # available to the next distribution pass.
      WorkUnit.cancel_reservations(reservation) if reservation
    end

    # Reserves all available WorkUnits for this process. Returns false if there
    # were none available. On success, returns the (random) reservation number
    # used to tag the rows.
    def self.reserve_available(options={})
      reservation = ActiveSupport::SecureRandom.random_number(MAX_RESERVATION)
      # +reservation+ is a server-generated integer, safe to interpolate.
      any = WorkUnit.available.update_all("reservation = #{reservation}", nil, options) > 0
      any && reservation
    end

    # Cancels all outstanding WorkUnit reservations for this process.
    def self.cancel_reservations(reservation)
      WorkUnit.reserved(reservation).update_all('reservation = null')
    end

    # Cancels all outstanding WorkUnit reservations for all processes. (Useful
    # in the console for debugging.)
    def self.cancel_all_reservations
      WorkUnit.update_all('reservation = null')
    end

    # Look up a WorkUnit by the worker that's currently processing it. Specified
    # by <tt>pid@host</tt>.
    def self.find_by_worker_name(name)
      pid, host = name.split('@')
      node = NodeRecord.find_by_host(host)
      node && node.work_units.find_by_worker_pid(pid)
    end

    # Convenience method for starting a new WorkUnit. Non-string inputs are
    # serialized to JSON before storage.
    def self.start(job, action, input, status)
      input = input.to_json unless input.is_a? String
      self.create(:job => job, :action => action, :input => input, :status => status)
    end

    # Mark this unit as having finished successfully.
    # Splitting work units are handled differently (an optimization) -- they
    # immediately fire off all of their resulting WorkUnits for processing,
    # without waiting for the rest of their splitting cousins to complete.
    def finish(result, time_taken)
      if splitting?
        # Each element of the split output becomes a fresh processing unit.
        [parsed_output(result)].flatten.each do |new_input|
          WorkUnit.start(job, action, new_input, PROCESSING)
        end
        # The splitting unit itself is consumed rather than kept around.
        self.destroy
        job.set_next_status if job && job.done_splitting?
      else
        update_attributes({
          :status      => SUCCEEDED,
          :node_record => nil,
          :worker_pid  => nil,
          :attempts    => attempts + 1,
          :output      => result,
          :time        => time_taken
        })
        job && job.check_for_completion
      end
    end

    # Mark this unit as having failed. May attempt a retry (see +try_again+)
    # until CloudCrowd.config[:work_unit_retries] attempts have been used up.
    def fail(output, time_taken)
      tries = self.attempts + 1
      return try_again if tries < CloudCrowd.config[:work_unit_retries]
      update_attributes({
        :status      => FAILED,
        :node_record => nil,
        :worker_pid  => nil,
        :attempts    => tries,
        :output      => output,
        :time        => time_taken
      })
      job && job.check_for_completion
    end

    # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
    # Detaches the unit from its node so it re-enters the available pool,
    # counting the attempt.
    def try_again
      update_attributes({
        :node_record => nil,
        :worker_pid  => nil,
        :attempts    => self.attempts + 1
      })
    end

    # When a Node checks out a WorkUnit, establish the connection between
    # WorkUnit and NodeRecord and record the worker_pid.
    def assign_to(node_record, worker_pid)
      update_attributes!(:node_record => node_record, :worker_pid => worker_pid)
    end

    # All output needs to be wrapped in a JSON object for consistency
    # (unfortunately, JSON.parse needs the top-level to be an object or array).
    # Convenience method to provide the parsed version.
    def parsed_output(out = self.output)
      JSON.parse(out)['output']
    end

    # The JSON representation of a WorkUnit shares the Job's options with all
    # its cousin WorkUnits.
    def to_json
      {
        'id'       => self.id,
        'job_id'   => self.job_id,
        'input'    => self.input,
        'attempts' => self.attempts,
        'action'   => self.action,
        'options'  => JSON.parse(self.job.options),
        'status'   => self.status
      }.to_json
    end

  end
end
module CloudCrowd

  # Adds named scopes and query methods for every CloudCrowd status to
  # both Jobs and WorkUnits. The status constants (PROCESSING, SUCCEEDED,
  # FAILED, SPLITTING, MERGING, COMPLETE, INCOMPLETE) resolve through the
  # enclosing CloudCrowd namespace -- they are defined elsewhere in the gem.
  module ModelStatus

    def self.included(klass)

      klass.class_eval do
        # Note that COMPLETE and INCOMPLETE are unions of other states.
        named_scope 'processing', :conditions => {:status => PROCESSING}
        named_scope 'succeeded', :conditions => {:status => SUCCEEDED}
        named_scope 'failed', :conditions => {:status => FAILED}
        named_scope 'splitting', :conditions => {:status => SPLITTING}
        named_scope 'merging', :conditions => {:status => MERGING}
        named_scope 'complete', :conditions => {:status => COMPLETE}
        named_scope 'incomplete', :conditions => {:status => INCOMPLETE}
      end

    end

    # Predicate counterparts to the scopes above. The single-state checks
    # compare against one status code; complete?/incomplete? test membership
    # in the union constants.
    def processing?; self.status == PROCESSING; end
    def succeeded?; self.status == SUCCEEDED; end
    def failed?; self.status == FAILED; end
    def splitting?; self.status == SPLITTING; end
    def merging?; self.status == MERGING; end
    def complete?; COMPLETE.include?(self.status); end
    def incomplete?; INCOMPLETE.include?(self.status); end

    # Get the displayable status name of the model's status code.
    def display_status
      CloudCrowd.display_status(self.status)
    end

  end
end

# Load the models after ModelStatus is defined, since each includes it.
require 'cloud_crowd/models/job'
require 'cloud_crowd/models/node_record'
require 'cloud_crowd/models/work_unit'