cloud-crowd 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/actions/word_count.rb +6 -4
- data/cloud-crowd.gemspec +2 -2
- data/lib/cloud-crowd.rb +3 -3
- data/lib/cloud_crowd/models/job.rb +40 -40
- metadata +2 -2
data/actions/word_count.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
-
# A parallel WordCount. Depends on the 'wc' utility.
|
1
|
+
# A parallel WordCount. Depends on the 'wc' utility.
|
2
2
|
class WordCount < CloudCrowd::Action
|
3
|
-
|
3
|
+
|
4
4
|
# Count the words in a single book.
|
5
|
+
# Pretend that this takes longer than it really does, for demonstration purposes.
|
5
6
|
def process
|
7
|
+
sleep 5
|
6
8
|
(`wc -w #{input_path}`).match(/\A\s*(\d+)/)[1].to_i
|
7
9
|
end
|
8
|
-
|
10
|
+
|
9
11
|
# Sum the total word count.
|
10
12
|
def merge
|
11
13
|
input.inject(0) {|sum, count| sum + count }
|
12
14
|
end
|
13
|
-
|
15
|
+
|
14
16
|
end
|
data/cloud-crowd.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'cloud-crowd'
|
3
|
-
s.version = '0.3.1'
|
4
|
-
s.date = '
|
3
|
+
s.version = '0.3.2' # Keep version in sync with cloud-crowd.rb
|
4
|
+
s.date = '2010-01-08'
|
5
5
|
|
6
6
|
s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
|
7
7
|
s.summary = "Parallel Processing for the Rest of Us"
|
data/lib/cloud-crowd.rb
CHANGED
@@ -12,13 +12,13 @@ gem 'sinatra'
|
|
12
12
|
gem 'thin'
|
13
13
|
|
14
14
|
# Autoloading for all the pieces which may or may not be needed:
|
15
|
-
autoload :ActiveRecord, '
|
15
|
+
autoload :ActiveRecord, 'active_record'
|
16
16
|
autoload :Benchmark, 'benchmark'
|
17
17
|
autoload :Digest, 'digest'
|
18
18
|
autoload :ERB, 'erb'
|
19
19
|
autoload :FileUtils, 'fileutils'
|
20
20
|
autoload :JSON, 'json'
|
21
|
-
autoload :RestClient, '
|
21
|
+
autoload :RestClient, 'rest_client'
|
22
22
|
autoload :RightAws, 'right_aws'
|
23
23
|
autoload :Sinatra, 'sinatra'
|
24
24
|
autoload :Thin, 'thin'
|
@@ -44,7 +44,7 @@ module CloudCrowd
|
|
44
44
|
autoload :WorkUnit, 'cloud_crowd/models'
|
45
45
|
|
46
46
|
# Keep this version in sync with the gemspec.
|
47
|
-
VERSION = '0.3.1'
|
47
|
+
VERSION = '0.3.2'
|
48
48
|
|
49
49
|
# Increment the schema version when there's a backwards incompatible change.
|
50
50
|
SCHEMA_VERSION = 3
|
@@ -1,25 +1,25 @@
|
|
1
1
|
module CloudCrowd
|
2
|
-
|
2
|
+
|
3
3
|
# A chunk of work that will be farmed out into many WorkUnits to be processed
|
4
4
|
# in parallel by each active CloudCrowd::Worker. Jobs are defined by a list
|
5
|
-
# of inputs (usually public urls to files), an action (the name of a script that
|
5
|
+
# of inputs (usually public urls to files), an action (the name of a script that
|
6
6
|
# CloudCrowd knows how to run), and, eventually, a corresponding list of outputs.
|
7
7
|
class Job < ActiveRecord::Base
|
8
8
|
include ModelStatus
|
9
|
-
|
9
|
+
|
10
10
|
CLEANUP_GRACE_PERIOD = 7 # That's a week.
|
11
|
-
|
11
|
+
|
12
12
|
has_many :work_units, :dependent => :destroy
|
13
|
-
|
13
|
+
|
14
14
|
validates_presence_of :status, :inputs, :action, :options
|
15
|
-
|
15
|
+
|
16
16
|
before_validation_on_create :set_initial_status
|
17
17
|
after_create :queue_for_workers
|
18
18
|
before_destroy :cleanup_assets
|
19
|
-
|
19
|
+
|
20
20
|
# Jobs that were last updated more than N days ago.
|
21
21
|
named_scope :older_than, lambda {|num| {:conditions => ['updated_at < ?', num.days.ago]} }
|
22
|
-
|
22
|
+
|
23
23
|
# Create a Job from an incoming JSON request, and add it to the queue.
|
24
24
|
def self.create_from_request(h)
|
25
25
|
self.create(
|
@@ -30,7 +30,7 @@ module CloudCrowd
|
|
30
30
|
:callback_url => h['callback_url']
|
31
31
|
)
|
32
32
|
end
|
33
|
-
|
33
|
+
|
34
34
|
# Clean up all jobs beyond a certain age.
|
35
35
|
def self.cleanup_all(opts = {})
|
36
36
|
days = opts[:days] || CLEANUP_GRACE_PERIOD
|
@@ -38,12 +38,12 @@ module CloudCrowd
|
|
38
38
|
jobs.each {|job| job.destroy }
|
39
39
|
end
|
40
40
|
end
|
41
|
-
|
41
|
+
|
42
42
|
# After work units are marked successful, we check to see if all of them have
|
43
|
-
# finished, if so, continue on to the next phase of the job.
|
43
|
+
# finished, if so, continue on to the next phase of the job.
|
44
44
|
def check_for_completion
|
45
45
|
return unless all_work_units_complete?
|
46
|
-
set_next_status
|
46
|
+
set_next_status
|
47
47
|
outs = gather_outputs_from_work_units
|
48
48
|
return queue_for_workers([outs]) if merging?
|
49
49
|
if complete?
|
@@ -52,7 +52,7 @@ module CloudCrowd
|
|
52
52
|
end
|
53
53
|
self
|
54
54
|
end
|
55
|
-
|
55
|
+
|
56
56
|
# Transition this Job's current status to the appropriate next one, based
|
57
57
|
# on the state of the WorkUnits and the nature of the Action.
|
58
58
|
def set_next_status
|
@@ -63,12 +63,12 @@ module CloudCrowd
|
|
63
63
|
SUCCEEDED
|
64
64
|
)
|
65
65
|
end
|
66
|
-
|
67
|
-
# If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
|
66
|
+
|
67
|
+
# If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
|
68
68
|
# completion. The <tt>callback_url</tt> may include HTTP basic authentication,
|
69
69
|
# if you like:
|
70
70
|
# http://user:password@example.com/job_complete
|
71
|
-
# If the callback URL returns a '201 Created' HTTP status code, CloudCrowd
|
71
|
+
# If the callback URL returns a '201 Created' HTTP status code, CloudCrowd
|
72
72
|
# will assume that the resource has been successfully created, and the Job
|
73
73
|
# will be cleaned up.
|
74
74
|
def fire_callback
|
@@ -76,54 +76,54 @@ module CloudCrowd
|
|
76
76
|
response = RestClient.post(callback_url, {:job => self.to_json})
|
77
77
|
Thread.new { self.destroy } if response && response.code == 201
|
78
78
|
rescue RestClient::Exception => e
|
79
|
-
puts "
|
79
|
+
puts "Job ##{id} failed to fire callback: #{callback_url}"
|
80
80
|
end
|
81
81
|
end
|
82
|
-
|
82
|
+
|
83
83
|
# Cleaning up after a job will remove all of its files from S3 or the
|
84
|
-
# filesystem. Destroying a Job will cleanup_assets first. Run this in a
|
84
|
+
# filesystem. Destroying a Job will cleanup_assets first. Run this in a
|
85
85
|
# separate thread to get out of the transaction's way.
|
86
86
|
# TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
|
87
87
|
def cleanup_assets
|
88
88
|
AssetStore.new.cleanup(self)
|
89
89
|
end
|
90
|
-
|
91
|
-
# Have all of the WorkUnits finished?
|
90
|
+
|
91
|
+
# Have all of the WorkUnits finished?
|
92
92
|
def all_work_units_complete?
|
93
93
|
self.work_units.incomplete.count <= 0
|
94
94
|
end
|
95
|
-
|
95
|
+
|
96
96
|
# Have any of the WorkUnits failed?
|
97
97
|
def any_work_units_failed?
|
98
98
|
self.work_units.failed.count > 0
|
99
99
|
end
|
100
|
-
|
100
|
+
|
101
101
|
# This job is splittable if its Action has a +split+ method.
|
102
102
|
def splittable?
|
103
103
|
self.action_class.public_instance_methods.map {|m| m.to_sym }.include? :split
|
104
104
|
end
|
105
|
-
|
105
|
+
|
106
106
|
# This job is done splitting if it's finished with its splitting work units.
|
107
107
|
def done_splitting?
|
108
108
|
splittable? && work_units.splitting.count <= 0
|
109
109
|
end
|
110
|
-
|
110
|
+
|
111
111
|
# This job is mergeable if its Action has a +merge+ method.
|
112
112
|
def mergeable?
|
113
113
|
self.processing? && self.action_class.public_instance_methods.map {|m| m.to_sym }.include?(:merge)
|
114
114
|
end
|
115
|
-
|
115
|
+
|
116
116
|
# Retrieve the class for this Job's Action.
|
117
117
|
def action_class
|
118
118
|
@action_class ||= CloudCrowd.actions[self.action]
|
119
119
|
return @action_class if @action_class
|
120
120
|
raise Error::ActionNotFound, "no action named: '#{self.action}' could be found"
|
121
121
|
end
|
122
|
-
|
122
|
+
|
123
123
|
# How complete is this Job?
|
124
124
|
# Unfortunately, with the current processing sequence, the percent_complete
|
125
125
|
# can pull a fast one and go backwards. This happens when there's a single
|
126
|
-
# large input that takes a long time to split, and when it finally does it
|
126
|
+
# large input that takes a long time to split, and when it finally does it
|
127
127
|
# creates a whole swarm of work units. This seems unavoidable.
|
128
128
|
def percent_complete
|
129
129
|
return 99 if merging?
|
@@ -132,25 +132,25 @@ module CloudCrowd
|
|
132
132
|
return 100 if unit_count <= 0
|
133
133
|
(work_units.complete.count / unit_count.to_f * 100).round
|
134
134
|
end
|
135
|
-
|
135
|
+
|
136
136
|
# How long has this Job taken?
|
137
137
|
def time_taken
|
138
138
|
return self.time if self.time
|
139
139
|
Time.now - self.created_at
|
140
140
|
end
|
141
|
-
|
141
|
+
|
142
142
|
# Generate a stable 6-digit hex color code, based on the Job's id.
|
143
143
|
def color
|
144
144
|
@color ||= Digest::MD5.hexdigest(self.id.to_s)[-7...-1]
|
145
145
|
end
|
146
|
-
|
146
|
+
|
147
147
|
# A JSON representation of this job includes the statuses of its component
|
148
148
|
# WorkUnits, as well as any completed outputs.
|
149
149
|
def to_json(opts={})
|
150
150
|
atts = {
|
151
151
|
'id' => id,
|
152
152
|
'color' => color,
|
153
|
-
'status' => display_status,
|
153
|
+
'status' => display_status,
|
154
154
|
'percent_complete' => percent_complete,
|
155
155
|
'work_units' => work_units.count,
|
156
156
|
'time_taken' => time_taken
|
@@ -159,10 +159,10 @@ module CloudCrowd
|
|
159
159
|
atts['email'] = email if email
|
160
160
|
atts.to_json
|
161
161
|
end
|
162
|
-
|
163
|
-
|
162
|
+
|
163
|
+
|
164
164
|
private
|
165
|
-
|
165
|
+
|
166
166
|
# When the WorkUnits are all finished, gather all their outputs together
|
167
167
|
# before removing them from the database entirely. Returns their merged JSON.
|
168
168
|
def gather_outputs_from_work_units
|
@@ -171,20 +171,20 @@ module CloudCrowd
|
|
171
171
|
self.work_units.complete.destroy_all
|
172
172
|
outs.to_json
|
173
173
|
end
|
174
|
-
|
175
|
-
# When starting a new job, or moving to a new stage, split up the inputs
|
174
|
+
|
175
|
+
# When starting a new job, or moving to a new stage, split up the inputs
|
176
176
|
# into WorkUnits, and queue them. Workers will start picking them up right
|
177
177
|
# away.
|
178
178
|
def queue_for_workers(input=nil)
|
179
179
|
input ||= JSON.parse(self.inputs)
|
180
|
-
input.each {|i| WorkUnit.start(self, action, i, status) }
|
180
|
+
input.each {|i| WorkUnit.start(self, action, i, status) }
|
181
181
|
self
|
182
182
|
end
|
183
|
-
|
183
|
+
|
184
184
|
# A Job starts out either splitting or processing, depending on its action.
|
185
185
|
def set_initial_status
|
186
186
|
self.status = self.splittable? ? SPLITTING : PROCESSING
|
187
187
|
end
|
188
|
-
|
188
|
+
|
189
189
|
end
|
190
190
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cloud-crowd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-08 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|