cloud-crowd 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/actions/word_count.rb +6 -4
- data/cloud-crowd.gemspec +2 -2
- data/lib/cloud-crowd.rb +3 -3
- data/lib/cloud_crowd/models/job.rb +40 -40
- metadata +2 -2
data/actions/word_count.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
-
# A parallel WordCount. Depends on the 'wc' utility.
|
1
|
+
# A parallel WordCount. Depends on the 'wc' utility.
|
2
2
|
class WordCount < CloudCrowd::Action
|
3
|
-
|
3
|
+
|
4
4
|
# Count the words in a single book.
|
5
|
+
# Pretend that this takes longer than it really does, for demonstration purposes.
|
5
6
|
def process
|
7
|
+
sleep 5
|
6
8
|
(`wc -w #{input_path}`).match(/\A\s*(\d+)/)[1].to_i
|
7
9
|
end
|
8
|
-
|
10
|
+
|
9
11
|
# Sum the total word count.
|
10
12
|
def merge
|
11
13
|
input.inject(0) {|sum, count| sum + count }
|
12
14
|
end
|
13
|
-
|
15
|
+
|
14
16
|
end
|
data/cloud-crowd.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'cloud-crowd'
|
3
|
-
s.version = '0.3.1'
|
4
|
-
s.date = '
|
3
|
+
s.version = '0.3.2' # Keep version in sync with cloud-crowd.rb
|
4
|
+
s.date = '2010-01-08'
|
5
5
|
|
6
6
|
s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
|
7
7
|
s.summary = "Parallel Processing for the Rest of Us"
|
data/lib/cloud-crowd.rb
CHANGED
@@ -12,13 +12,13 @@ gem 'sinatra'
|
|
12
12
|
gem 'thin'
|
13
13
|
|
14
14
|
# Autoloading for all the pieces which may or may not be needed:
|
15
|
-
autoload :ActiveRecord, '
|
15
|
+
autoload :ActiveRecord, 'active_record'
|
16
16
|
autoload :Benchmark, 'benchmark'
|
17
17
|
autoload :Digest, 'digest'
|
18
18
|
autoload :ERB, 'erb'
|
19
19
|
autoload :FileUtils, 'fileutils'
|
20
20
|
autoload :JSON, 'json'
|
21
|
-
autoload :RestClient, '
|
21
|
+
autoload :RestClient, 'rest_client'
|
22
22
|
autoload :RightAws, 'right_aws'
|
23
23
|
autoload :Sinatra, 'sinatra'
|
24
24
|
autoload :Thin, 'thin'
|
@@ -44,7 +44,7 @@ module CloudCrowd
|
|
44
44
|
autoload :WorkUnit, 'cloud_crowd/models'
|
45
45
|
|
46
46
|
# Keep this version in sync with the gemspec.
|
47
|
-
VERSION = '0.3.1'
|
47
|
+
VERSION = '0.3.2'
|
48
48
|
|
49
49
|
# Increment the schema version when there's a backwards incompatible change.
|
50
50
|
SCHEMA_VERSION = 3
|
@@ -1,25 +1,25 @@
|
|
1
1
|
module CloudCrowd
|
2
|
-
|
2
|
+
|
3
3
|
# A chunk of work that will be farmed out into many WorkUnits to be processed
|
4
4
|
# in parallel by each active CloudCrowd::Worker. Jobs are defined by a list
|
5
|
-
# of inputs (usually public urls to files), an action (the name of a script that
|
5
|
+
# of inputs (usually public urls to files), an action (the name of a script that
|
6
6
|
# CloudCrowd knows how to run), and, eventually, a corresponding list of outputs.
|
7
7
|
class Job < ActiveRecord::Base
|
8
8
|
include ModelStatus
|
9
|
-
|
9
|
+
|
10
10
|
CLEANUP_GRACE_PERIOD = 7 # That's a week.
|
11
|
-
|
11
|
+
|
12
12
|
has_many :work_units, :dependent => :destroy
|
13
|
-
|
13
|
+
|
14
14
|
validates_presence_of :status, :inputs, :action, :options
|
15
|
-
|
15
|
+
|
16
16
|
before_validation_on_create :set_initial_status
|
17
17
|
after_create :queue_for_workers
|
18
18
|
before_destroy :cleanup_assets
|
19
|
-
|
19
|
+
|
20
20
|
# Jobs that were last updated more than N days ago.
|
21
21
|
named_scope :older_than, lambda {|num| {:conditions => ['updated_at < ?', num.days.ago]} }
|
22
|
-
|
22
|
+
|
23
23
|
# Create a Job from an incoming JSON request, and add it to the queue.
|
24
24
|
def self.create_from_request(h)
|
25
25
|
self.create(
|
@@ -30,7 +30,7 @@ module CloudCrowd
|
|
30
30
|
:callback_url => h['callback_url']
|
31
31
|
)
|
32
32
|
end
|
33
|
-
|
33
|
+
|
34
34
|
# Clean up all jobs beyond a certain age.
|
35
35
|
def self.cleanup_all(opts = {})
|
36
36
|
days = opts[:days] || CLEANUP_GRACE_PERIOD
|
@@ -38,12 +38,12 @@ module CloudCrowd
|
|
38
38
|
jobs.each {|job| job.destroy }
|
39
39
|
end
|
40
40
|
end
|
41
|
-
|
41
|
+
|
42
42
|
# After work units are marked successful, we check to see if all of them have
|
43
|
-
# finished, if so, continue on to the next phase of the job.
|
43
|
+
# finished, if so, continue on to the next phase of the job.
|
44
44
|
def check_for_completion
|
45
45
|
return unless all_work_units_complete?
|
46
|
-
set_next_status
|
46
|
+
set_next_status
|
47
47
|
outs = gather_outputs_from_work_units
|
48
48
|
return queue_for_workers([outs]) if merging?
|
49
49
|
if complete?
|
@@ -52,7 +52,7 @@ module CloudCrowd
|
|
52
52
|
end
|
53
53
|
self
|
54
54
|
end
|
55
|
-
|
55
|
+
|
56
56
|
# Transition this Job's current status to the appropriate next one, based
|
57
57
|
# on the state of the WorkUnits and the nature of the Action.
|
58
58
|
def set_next_status
|
@@ -63,12 +63,12 @@ module CloudCrowd
|
|
63
63
|
SUCCEEDED
|
64
64
|
)
|
65
65
|
end
|
66
|
-
|
67
|
-
# If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
|
66
|
+
|
67
|
+
# If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
|
68
68
|
# completion. The <tt>callback_url</tt> may include HTTP basic authentication,
|
69
69
|
# if you like:
|
70
70
|
# http://user:password@example.com/job_complete
|
71
|
-
# If the callback URL returns a '201 Created' HTTP status code, CloudCrowd
|
71
|
+
# If the callback URL returns a '201 Created' HTTP status code, CloudCrowd
|
72
72
|
# will assume that the resource has been successfully created, and the Job
|
73
73
|
# will be cleaned up.
|
74
74
|
def fire_callback
|
@@ -76,54 +76,54 @@ module CloudCrowd
|
|
76
76
|
response = RestClient.post(callback_url, {:job => self.to_json})
|
77
77
|
Thread.new { self.destroy } if response && response.code == 201
|
78
78
|
rescue RestClient::Exception => e
|
79
|
-
puts "
|
79
|
+
puts "Job ##{id} failed to fire callback: #{callback_url}"
|
80
80
|
end
|
81
81
|
end
|
82
|
-
|
82
|
+
|
83
83
|
# Cleaning up after a job will remove all of its files from S3 or the
|
84
|
-
# filesystem. Destroying a Job will cleanup_assets first. Run this in a
|
84
|
+
# filesystem. Destroying a Job will cleanup_assets first. Run this in a
|
85
85
|
# separate thread to get out of the transaction's way.
|
86
86
|
# TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
|
87
87
|
def cleanup_assets
|
88
88
|
AssetStore.new.cleanup(self)
|
89
89
|
end
|
90
|
-
|
91
|
-
# Have all of the WorkUnits finished?
|
90
|
+
|
91
|
+
# Have all of the WorkUnits finished?
|
92
92
|
def all_work_units_complete?
|
93
93
|
self.work_units.incomplete.count <= 0
|
94
94
|
end
|
95
|
-
|
95
|
+
|
96
96
|
# Have any of the WorkUnits failed?
|
97
97
|
def any_work_units_failed?
|
98
98
|
self.work_units.failed.count > 0
|
99
99
|
end
|
100
|
-
|
100
|
+
|
101
101
|
# This job is splittable if its Action has a +split+ method.
|
102
102
|
def splittable?
|
103
103
|
self.action_class.public_instance_methods.map {|m| m.to_sym }.include? :split
|
104
104
|
end
|
105
|
-
|
105
|
+
|
106
106
|
# This job is done splitting if it's finished with its splitting work units.
|
107
107
|
def done_splitting?
|
108
108
|
splittable? && work_units.splitting.count <= 0
|
109
109
|
end
|
110
|
-
|
110
|
+
|
111
111
|
# This job is mergeable if its Action has a +merge+ method.
|
112
112
|
def mergeable?
|
113
113
|
self.processing? && self.action_class.public_instance_methods.map {|m| m.to_sym }.include?(:merge)
|
114
114
|
end
|
115
|
-
|
115
|
+
|
116
116
|
# Retrieve the class for this Job's Action.
|
117
117
|
def action_class
|
118
118
|
@action_class ||= CloudCrowd.actions[self.action]
|
119
119
|
return @action_class if @action_class
|
120
120
|
raise Error::ActionNotFound, "no action named: '#{self.action}' could be found"
|
121
121
|
end
|
122
|
-
|
122
|
+
|
123
123
|
# How complete is this Job?
|
124
124
|
# Unfortunately, with the current processing sequence, the percent_complete
|
125
125
|
# can pull a fast one and go backwards. This happens when there's a single
|
126
|
-
# large input that takes a long time to split, and when it finally does it
|
126
|
+
# large input that takes a long time to split, and when it finally does it
|
127
127
|
# creates a whole swarm of work units. This seems unavoidable.
|
128
128
|
def percent_complete
|
129
129
|
return 99 if merging?
|
@@ -132,25 +132,25 @@ module CloudCrowd
|
|
132
132
|
return 100 if unit_count <= 0
|
133
133
|
(work_units.complete.count / unit_count.to_f * 100).round
|
134
134
|
end
|
135
|
-
|
135
|
+
|
136
136
|
# How long has this Job taken?
|
137
137
|
def time_taken
|
138
138
|
return self.time if self.time
|
139
139
|
Time.now - self.created_at
|
140
140
|
end
|
141
|
-
|
141
|
+
|
142
142
|
# Generate a stable 6-digit hex color code, based on the Job's id.
|
143
143
|
def color
|
144
144
|
@color ||= Digest::MD5.hexdigest(self.id.to_s)[-7...-1]
|
145
145
|
end
|
146
|
-
|
146
|
+
|
147
147
|
# A JSON representation of this job includes the statuses of its component
|
148
148
|
# WorkUnits, as well as any completed outputs.
|
149
149
|
def to_json(opts={})
|
150
150
|
atts = {
|
151
151
|
'id' => id,
|
152
152
|
'color' => color,
|
153
|
-
'status' => display_status,
|
153
|
+
'status' => display_status,
|
154
154
|
'percent_complete' => percent_complete,
|
155
155
|
'work_units' => work_units.count,
|
156
156
|
'time_taken' => time_taken
|
@@ -159,10 +159,10 @@ module CloudCrowd
|
|
159
159
|
atts['email'] = email if email
|
160
160
|
atts.to_json
|
161
161
|
end
|
162
|
-
|
163
|
-
|
162
|
+
|
163
|
+
|
164
164
|
private
|
165
|
-
|
165
|
+
|
166
166
|
# When the WorkUnits are all finished, gather all their outputs together
|
167
167
|
# before removing them from the database entirely. Returns their merged JSON.
|
168
168
|
def gather_outputs_from_work_units
|
@@ -171,20 +171,20 @@ module CloudCrowd
|
|
171
171
|
self.work_units.complete.destroy_all
|
172
172
|
outs.to_json
|
173
173
|
end
|
174
|
-
|
175
|
-
# When starting a new job, or moving to a new stage, split up the inputs
|
174
|
+
|
175
|
+
# When starting a new job, or moving to a new stage, split up the inputs
|
176
176
|
# into WorkUnits, and queue them. Workers will start picking them up right
|
177
177
|
# away.
|
178
178
|
def queue_for_workers(input=nil)
|
179
179
|
input ||= JSON.parse(self.inputs)
|
180
|
-
input.each {|i| WorkUnit.start(self, action, i, status) }
|
180
|
+
input.each {|i| WorkUnit.start(self, action, i, status) }
|
181
181
|
self
|
182
182
|
end
|
183
|
-
|
183
|
+
|
184
184
|
# A Job starts out either splitting or processing, depending on its action.
|
185
185
|
def set_initial_status
|
186
186
|
self.status = self.splittable? ? SPLITTING : PROCESSING
|
187
187
|
end
|
188
|
-
|
188
|
+
|
189
189
|
end
|
190
190
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cloud-crowd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-08 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|