cloud-crowd 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,14 +1,16 @@
1
- # A parallel WordCount. Depends on the 'wc' utility.
1
+ # A parallel WordCount. Depends on the 'wc' utility.
2
2
  class WordCount < CloudCrowd::Action
3
-
3
+
4
4
  # Count the words in a single book.
5
+ # Pretend that this takes longer than it really does, for demonstration purposes.
5
6
  def process
7
+ sleep 5
6
8
  (`wc -w #{input_path}`).match(/\A\s*(\d+)/)[1].to_i
7
9
  end
8
-
10
+
9
11
  # Sum the total word count.
10
12
  def merge
11
13
  input.inject(0) {|sum, count| sum + count }
12
14
  end
13
-
15
+
14
16
  end
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'cloud-crowd'
3
- s.version = '0.3.1' # Keep version in sync with cloud-cloud.rb
4
- s.date = '2009-11-19'
3
+ s.version = '0.3.2' # Keep version in sync with cloud-crowd.rb
4
+ s.date = '2010-01-08'
5
5
 
6
6
  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
7
7
  s.summary = "Parallel Processing for the Rest of Us"
@@ -12,13 +12,13 @@ gem 'sinatra'
12
12
  gem 'thin'
13
13
 
14
14
  # Autoloading for all the pieces which may or may not be needed:
15
- autoload :ActiveRecord, 'activerecord'
15
+ autoload :ActiveRecord, 'active_record'
16
16
  autoload :Benchmark, 'benchmark'
17
17
  autoload :Digest, 'digest'
18
18
  autoload :ERB, 'erb'
19
19
  autoload :FileUtils, 'fileutils'
20
20
  autoload :JSON, 'json'
21
- autoload :RestClient, 'restclient'
21
+ autoload :RestClient, 'rest_client'
22
22
  autoload :RightAws, 'right_aws'
23
23
  autoload :Sinatra, 'sinatra'
24
24
  autoload :Thin, 'thin'
@@ -44,7 +44,7 @@ module CloudCrowd
44
44
  autoload :WorkUnit, 'cloud_crowd/models'
45
45
 
46
46
  # Keep this version in sync with the gemspec.
47
- VERSION = '0.3.1'
47
+ VERSION = '0.3.2'
48
48
 
49
49
  # Increment the schema version when there's a backwards incompatible change.
50
50
  SCHEMA_VERSION = 3
@@ -1,25 +1,25 @@
1
1
  module CloudCrowd
2
-
2
+
3
3
  # A chunk of work that will be farmed out into many WorkUnits to be processed
4
4
  # in parallel by each active CloudCrowd::Worker. Jobs are defined by a list
5
- # of inputs (usually public urls to files), an action (the name of a script that
5
+ # of inputs (usually public urls to files), an action (the name of a script that
6
6
  # CloudCrowd knows how to run), and, eventually, a corresponding list of outputs.
7
7
  class Job < ActiveRecord::Base
8
8
  include ModelStatus
9
-
9
+
10
10
  CLEANUP_GRACE_PERIOD = 7 # That's a week.
11
-
11
+
12
12
  has_many :work_units, :dependent => :destroy
13
-
13
+
14
14
  validates_presence_of :status, :inputs, :action, :options
15
-
15
+
16
16
  before_validation_on_create :set_initial_status
17
17
  after_create :queue_for_workers
18
18
  before_destroy :cleanup_assets
19
-
19
+
20
20
  # Jobs that were last updated more than N days ago.
21
21
  named_scope :older_than, lambda {|num| {:conditions => ['updated_at < ?', num.days.ago]} }
22
-
22
+
23
23
  # Create a Job from an incoming JSON request, and add it to the queue.
24
24
  def self.create_from_request(h)
25
25
  self.create(
@@ -30,7 +30,7 @@ module CloudCrowd
30
30
  :callback_url => h['callback_url']
31
31
  )
32
32
  end
33
-
33
+
34
34
  # Clean up all jobs beyond a certain age.
35
35
  def self.cleanup_all(opts = {})
36
36
  days = opts[:days] || CLEANUP_GRACE_PERIOD
@@ -38,12 +38,12 @@ module CloudCrowd
38
38
  jobs.each {|job| job.destroy }
39
39
  end
40
40
  end
41
-
41
+
42
42
  # After work units are marked successful, we check to see if all of them have
43
- # finished, if so, continue on to the next phase of the job.
43
+ # finished; if so, continue on to the next phase of the job.
44
44
  def check_for_completion
45
45
  return unless all_work_units_complete?
46
- set_next_status
46
+ set_next_status
47
47
  outs = gather_outputs_from_work_units
48
48
  return queue_for_workers([outs]) if merging?
49
49
  if complete?
@@ -52,7 +52,7 @@ module CloudCrowd
52
52
  end
53
53
  self
54
54
  end
55
-
55
+
56
56
  # Transition this Job's current status to the appropriate next one, based
57
57
  # on the state of the WorkUnits and the nature of the Action.
58
58
  def set_next_status
@@ -63,12 +63,12 @@ module CloudCrowd
63
63
  SUCCEEDED
64
64
  )
65
65
  end
66
-
67
- # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
66
+
67
+ # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
68
68
  # completion. The <tt>callback_url</tt> may include HTTP basic authentication,
69
69
  # if you like:
70
70
  # http://user:password@example.com/job_complete
71
- # If the callback URL returns a '201 Created' HTTP status code, CloudCrowd
71
+ # If the callback URL returns a '201 Created' HTTP status code, CloudCrowd
72
72
  # will assume that the resource has been successfully created, and the Job
73
73
  # will be cleaned up.
74
74
  def fire_callback
@@ -76,54 +76,54 @@ module CloudCrowd
76
76
  response = RestClient.post(callback_url, {:job => self.to_json})
77
77
  Thread.new { self.destroy } if response && response.code == 201
78
78
  rescue RestClient::Exception => e
79
- puts "Failed to fire job callback. Hmmm, what should happen here?"
79
+ puts "Job ##{id} failed to fire callback: #{callback_url}"
80
80
  end
81
81
  end
82
-
82
+
83
83
  # Cleaning up after a job will remove all of its files from S3 or the
84
- # filesystem. Destroying a Job will cleanup_assets first. Run this in a
84
+ # filesystem. Destroying a Job will cleanup_assets first. Run this in a
85
85
  # separate thread to get out of the transaction's way.
86
86
  # TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
87
87
  def cleanup_assets
88
88
  AssetStore.new.cleanup(self)
89
89
  end
90
-
91
- # Have all of the WorkUnits finished?
90
+
91
+ # Have all of the WorkUnits finished?
92
92
  def all_work_units_complete?
93
93
  self.work_units.incomplete.count <= 0
94
94
  end
95
-
95
+
96
96
  # Have any of the WorkUnits failed?
97
97
  def any_work_units_failed?
98
98
  self.work_units.failed.count > 0
99
99
  end
100
-
100
+
101
101
  # This job is splittable if its Action has a +split+ method.
102
102
  def splittable?
103
103
  self.action_class.public_instance_methods.map {|m| m.to_sym }.include? :split
104
104
  end
105
-
105
+
106
106
  # This job is done splitting if it's finished with its splitting work units.
107
107
  def done_splitting?
108
108
  splittable? && work_units.splitting.count <= 0
109
109
  end
110
-
110
+
111
111
  # This job is mergeable if its Action has a +merge+ method.
112
112
  def mergeable?
113
113
  self.processing? && self.action_class.public_instance_methods.map {|m| m.to_sym }.include?(:merge)
114
114
  end
115
-
115
+
116
116
  # Retrieve the class for this Job's Action.
117
117
  def action_class
118
118
  @action_class ||= CloudCrowd.actions[self.action]
119
119
  return @action_class if @action_class
120
120
  raise Error::ActionNotFound, "no action named: '#{self.action}' could be found"
121
121
  end
122
-
122
+
123
123
  # How complete is this Job?
124
124
  # Unfortunately, with the current processing sequence, the percent_complete
125
125
  # can pull a fast one and go backwards. This happens when there's a single
126
- # large input that takes a long time to split, and when it finally does it
126
+ # large input that takes a long time to split, and when it finally does it
127
127
  # creates a whole swarm of work units. This seems unavoidable.
128
128
  def percent_complete
129
129
  return 99 if merging?
@@ -132,25 +132,25 @@ module CloudCrowd
132
132
  return 100 if unit_count <= 0
133
133
  (work_units.complete.count / unit_count.to_f * 100).round
134
134
  end
135
-
135
+
136
136
  # How long has this Job taken?
137
137
  def time_taken
138
138
  return self.time if self.time
139
139
  Time.now - self.created_at
140
140
  end
141
-
141
+
142
142
  # Generate a stable 6-digit hex color code, based on the Job's id.
143
143
  def color
144
144
  @color ||= Digest::MD5.hexdigest(self.id.to_s)[-7...-1]
145
145
  end
146
-
146
+
147
147
  # A JSON representation of this job includes the statuses of its component
148
148
  # WorkUnits, as well as any completed outputs.
149
149
  def to_json(opts={})
150
150
  atts = {
151
151
  'id' => id,
152
152
  'color' => color,
153
- 'status' => display_status,
153
+ 'status' => display_status,
154
154
  'percent_complete' => percent_complete,
155
155
  'work_units' => work_units.count,
156
156
  'time_taken' => time_taken
@@ -159,10 +159,10 @@ module CloudCrowd
159
159
  atts['email'] = email if email
160
160
  atts.to_json
161
161
  end
162
-
163
-
162
+
163
+
164
164
  private
165
-
165
+
166
166
  # When the WorkUnits are all finished, gather all their outputs together
167
167
  # before removing them from the database entirely. Returns their merged JSON.
168
168
  def gather_outputs_from_work_units
@@ -171,20 +171,20 @@ module CloudCrowd
171
171
  self.work_units.complete.destroy_all
172
172
  outs.to_json
173
173
  end
174
-
175
- # When starting a new job, or moving to a new stage, split up the inputs
174
+
175
+ # When starting a new job, or moving to a new stage, split up the inputs
176
176
  # into WorkUnits, and queue them. Workers will start picking them up right
177
177
  # away.
178
178
  def queue_for_workers(input=nil)
179
179
  input ||= JSON.parse(self.inputs)
180
- input.each {|i| WorkUnit.start(self, action, i, status) }
180
+ input.each {|i| WorkUnit.start(self, action, i, status) }
181
181
  self
182
182
  end
183
-
183
+
184
184
  # A Job starts out either splitting or processing, depending on its action.
185
185
  def set_initial_status
186
186
  self.status = self.splittable? ? SPLITTING : PROCESSING
187
187
  end
188
-
188
+
189
189
  end
190
190
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cloud-crowd
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-11-19 00:00:00 -05:00
12
+ date: 2010-01-08 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency