documentcloud-cloud-crowd 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/cloud-crowd.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'cloud-crowd'
3
- s.version = '0.0.2' # Keep version in sync with cloud-cloud.rb
3
+ s.version = '0.0.3' # Keep version in sync with cloud-cloud.rb
4
4
  s.date = '2009-08-23'
5
5
 
6
6
  s.homepage = "http://documentcloud.org" # wiki page on github?
@@ -8,7 +8,6 @@
8
8
 
9
9
  require 'rubygems'
10
10
  require 'cloud-crowd'
11
- require 'cloud_crowd/app'
12
11
 
13
12
  CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
14
13
  CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
@@ -1,16 +1,46 @@
1
- :num_workers: 4
2
- :default_worker_wait: 1
3
- :max_worker_wait: 20
4
- :worker_wait_multiplier: 1.3
5
- :worker_retry_wait: 5
6
- :work_unit_retries: 3
7
-
1
+ # The URL where you're planning on running the server/queue/database.
8
2
  :central_server: http://localhost:9173
3
+
4
+ # Please provide your AWS credentials for S3 storage of job output.
5
+ :aws_access_key: [your AWS access key]
6
+ :aws_secret_key: [your AWS secret access key]
7
+
8
+ # Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
9
+ # to keep all resulting files on S3 private. If so, you'll receive authenticated
10
+ # S3 URLs as job output, good for 24 hours. If left public, you'll get the
11
+ # straight URLs to the files on S3.
12
+ :s3_bucket: [your CloudCrowd bucket]
13
+ :use_s3_authentication: no
14
+
15
+ # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
16
+ # to the central server). If yes, specify the login and password that all
17
+ # requests must provide for authentication.
9
18
  :use_http_authentication: no
10
19
  :login: [your login name]
11
20
  :password: [your password]
12
21
 
13
- :use_s3_authentication: no
14
- :s3_bucket: [your CloudCrowd bucket]
15
- :aws_access_key: [your AWS access key]
16
- :aws_secret_key: [your AWS secret access key]
22
+ # Set the following numbers to tweak the configuration of your worker daemons.
23
+ # Optimum results will depend on the proportion of Memory/CPU/IO bottlenecks
24
+ # in your actions, the number of central servers you have running, and your
25
+ # desired balance between latency and traffic.
26
+
27
+ # The number of workers that `crowd workers start` spins up.
28
+ :num_workers: 4
29
+
30
+ # The minimum number of seconds a worker waits between checking the job queue.
31
+ :min_worker_wait: 1
32
+
33
+ # The maximum number of seconds a worker waits between checking the job queue.
34
+ :max_worker_wait: 20
35
+
36
+ # The backoff multiplier the worker uses to slow down the check interval when
37
+ # there's no work in the queue.
38
+ :worker_wait_multiplier: 1.3
39
+
40
+ # The number of seconds a worker waits to retry when there's some kind of
41
+ # internal error (i.e. the central server fails to respond)
42
+ :worker_retry_wait: 5
43
+
44
+ # The number of separate attempts that will be made to process an individual
45
+ # work unit, before marking it as having failed.
46
+ :work_unit_retries: 3
@@ -1,3 +1,6 @@
1
+ # This is a standard ActiveRecord database.yml file. You can configure it
2
+ # to use any database that ActiveRecord supports.
3
+
1
4
  :adapter: mysql
2
5
  :encoding: utf8
3
6
  :username: root
data/lib/cloud-crowd.rb CHANGED
@@ -1,21 +1,47 @@
1
+ # The Grand Central of code loading...
2
+
1
3
  $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
2
4
 
3
5
  # Common Gems:
4
- require 'json'
5
- require 'rest_client'
6
- require 'right_aws'
6
+ require 'rubygems'
7
+ gem 'activerecord'
8
+ gem 'daemons'
9
+ gem 'json'
10
+ gem 'rest-client'
11
+ gem 'right_aws'
12
+ gem 'sinatra'
7
13
 
8
14
  # Common CloudCrowd libs:
9
15
  require 'cloud_crowd/core_ext'
10
- require 'cloud_crowd/action'
16
+
17
+ # Autoloading for all the pieces which may or may not be needed:
18
+ autoload :ActiveRecord, 'activerecord'
19
+ autoload :Benchmark, 'benchmark'
20
+ autoload :Daemons, 'daemons'
21
+ autoload :ERB, 'erb'
22
+ autoload :FileUtils, 'fileutils'
23
+ autoload :JSON, 'json'
24
+ autoload :RestClient, 'rest_client'
25
+ autoload :RightAws, 'right_aws'
26
+ autoload :Sinatra, 'sinatra'
27
+ autoload :Socket, 'socket'
28
+ autoload :YAML, 'yaml'
11
29
 
12
30
  module CloudCrowd
13
31
 
32
+ # Autoload all the CloudCrowd classes which may not be required.
33
+ autoload :App, 'cloud_crowd/app'
34
+ autoload :Action, 'cloud_crowd/action'
35
+ autoload :AssetStore, 'cloud_crowd/asset_store'
36
+ autoload :Helpers, 'cloud_crowd/helpers'
37
+ autoload :Job, 'cloud_crowd/models'
38
+ autoload :WorkUnit, 'cloud_crowd/models'
39
+
14
40
  # Root directory of the CloudCrowd gem.
15
41
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
16
42
 
17
43
  # Keep the version in sync with the gemspec.
18
- VERSION = '0.0.2'
44
+ VERSION = '0.0.3'
19
45
 
20
46
  # A Job is processing if its WorkUnits are in the queue to be handled by workers.
21
47
  PROCESSING = 1
@@ -1,8 +1,3 @@
1
- require 'erb'
2
- require 'sinatra'
3
- require 'cloud_crowd/models'
4
- require 'cloud_crowd/helpers'
5
-
6
1
  module CloudCrowd
7
2
 
8
3
  class App < Sinatra::Default
@@ -10,7 +5,7 @@ module CloudCrowd
10
5
  # static serves files from /public, methodoverride allows the _method param.
11
6
  enable :static, :methodoverride
12
7
 
13
- set :root, CloudCrowd::ROOT
8
+ set :root, CloudCrowd::ROOT
14
9
  set :authorization_realm, "CloudCrowd"
15
10
 
16
11
  helpers CloudCrowd::Helpers
@@ -40,29 +35,29 @@ module CloudCrowd
40
35
  # Internal method for worker daemons to fetch the work unit at the front
41
36
  # of the queue. Work unit is marked as taken and handed off to the worker.
42
37
  get '/work' do
43
- begin
44
- unit = WorkUnit.first(:conditions => {:status => CloudCrowd::INCOMPLETE, :taken => false}, :order => "created_at desc")
45
- return status(204) && '' unless unit
46
- unit.update_attributes(:taken => true)
47
- unit.to_json
48
- rescue ActiveRecord::StaleObjectError => e
49
- return status(204) && ''
50
- end
38
+ dequeue_work_unit
51
39
  end
52
40
 
53
41
  # When workers are done with their unit, either successfully or in failure,
54
- # they mark it back on the central server.
42
+ # they mark it back on the central server and retrieve another. Failures
43
+ # pull from one down in the queue, so as to not repeat the same unit.
55
44
  put '/work/:work_unit_id' do
56
- case params[:status]
57
- when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
58
- when 'failed' then current_work_unit.fail(params[:output], params[:time])
59
- else return error(500, "Completing a work unit must specify status.")
45
+ handle_conflicts(409) do
46
+ case params[:status]
47
+ when 'succeeded'
48
+ current_work_unit.finish(params[:output], params[:time])
49
+ dequeue_work_unit
50
+ when 'failed'
51
+ current_work_unit.fail(params[:output], params[:time])
52
+ dequeue_work_unit(1)
53
+ else
54
+ return error(500, "Completing a work unit must specify status.")
55
+ end
60
56
  end
61
- return status(204) && ''
62
57
  end
63
58
 
64
59
  # To monitor the central server with Monit, God, Nagios, or another
65
- # monitoring tool, you can hit /heartbeat to check.
60
+ # monitoring tool, you can hit /heartbeat to make sure.
66
61
  get '/heartbeat' do
67
62
  "buh-bump"
68
63
  end
@@ -10,12 +10,12 @@ module CloudCrowd
10
10
  # isn't any work to be done, and speeds back up when there is.
11
11
  class Daemon
12
12
 
13
- DEFAULT_WAIT = CloudCrowd.config[:default_worker_wait]
13
+ MIN_WAIT = CloudCrowd.config[:min_worker_wait]
14
14
  MAX_WAIT = CloudCrowd.config[:max_worker_wait]
15
15
  WAIT_MULTIPLIER = CloudCrowd.config[:worker_wait_multiplier]
16
16
 
17
17
  def initialize
18
- @wait_time = DEFAULT_WAIT
18
+ @wait_time = MIN_WAIT
19
19
  @worker = CloudCrowd::Worker.new
20
20
  Signal.trap('INT', 'EXIT')
21
21
  Signal.trap('KILL', 'EXIT')
@@ -31,9 +31,11 @@ module CloudCrowd
31
31
  loop do
32
32
  @worker.fetch_work_unit
33
33
  if @worker.has_work?
34
- @worker.run
35
- @wait_time = DEFAULT_WAIT
36
- sleep 0.01 # So as to listen for incoming signals.
34
+ @wait_time = MIN_WAIT
35
+ while @worker.has_work?
36
+ @worker.run
37
+ sleep 0.01 # So as to listen for incoming signals.
38
+ end
37
39
  else
38
40
  @wait_time = [@wait_time * WAIT_MULTIPLIER, MAX_WAIT].min
39
41
  sleep @wait_time
@@ -10,6 +10,27 @@ module CloudCrowd
10
10
  @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id]) or raise Sinatra::NotFound
11
11
  end
12
12
 
13
+ # Try to fetch a work unit from the queue. If none are pending, respond
14
+ # with no content.
15
+ def dequeue_work_unit(offset=0)
16
+ handle_conflicts do
17
+ unit = WorkUnit.dequeue(offset)
18
+ return status(204) && '' unless unit
19
+ unit.to_json
20
+ end
21
+ end
22
+
23
+ # We're using ActiveRecord's optimistic locking, so stale work units
24
+ # may sometimes arise. handle_conflicts responds with the HTTP status
25
+ # code of your choosing if the update failed to be applied.
26
+ def handle_conflicts(code=204)
27
+ begin
28
+ yield
29
+ rescue ActiveRecord::StaleObjectError => e
30
+ return status(code) && ''
31
+ end
32
+ end
33
+
13
34
  end
14
35
  end
15
36
  end
@@ -1,129 +1,132 @@
1
- # A chunk of work that will be farmed out into many WorkUnits to be processed
2
- # in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
3
- # of inputs (usually public urls to files), an action (the name of a script that
4
- # CloudCrowd knows how to run), and, eventually a corresponding list of output.
5
- class Job < ActiveRecord::Base
6
- include CloudCrowd::ModelStatus
7
-
8
- has_many :work_units, :dependent => :destroy
9
-
10
- validates_presence_of :status, :inputs, :action, :options
11
-
12
- # Create a Job from an incoming JSON or XML request, and add it to the queue.
13
- # TODO: Add XML support.
14
- def self.create_from_request(h)
15
- self.create(
16
- :inputs => h['inputs'].to_json,
17
- :action => h['action'],
18
- :options => (h['options'] || {}).to_json,
19
- :owner_email => h['owner_email'],
20
- :callback_url => h['callback_url']
21
- )
22
- end
23
-
24
- def after_create
25
- self.queue_for_workers(JSON.parse(self.inputs))
26
- end
27
-
28
- def before_validation_on_create
29
- self.status = self.splittable? ? CloudCrowd::SPLITTING : CloudCrowd::PROCESSING
30
- end
31
-
32
- # After work units are marked successful, we check to see if all of them have
33
- # finished, if so, this job is complete.
34
- def check_for_completion
35
- return unless all_work_units_complete?
36
- transition_to_next_phase
37
- output_list = gather_outputs_from_work_units
1
+ module CloudCrowd
2
+
3
+ # A chunk of work that will be farmed out into many WorkUnits to be processed
4
+ # in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
5
+ # of inputs (usually public urls to files), an action (the name of a script that
6
+ # CloudCrowd knows how to run), and, eventually, a corresponding list of outputs.
7
+ class Job < ActiveRecord::Base
8
+ include CloudCrowd::ModelStatus
9
+
10
+ has_many :work_units, :dependent => :destroy
38
11
 
39
- if complete?
40
- self.outputs = output_list.to_json
41
- self.time = Time.now - self.created_at
12
+ validates_presence_of :status, :inputs, :action, :options
13
+
14
+ # Create a Job from an incoming JSON or XML request, and add it to the queue.
15
+ # TODO: Add XML support.
16
+ def self.create_from_request(h)
17
+ self.create(
18
+ :inputs => h['inputs'].to_json,
19
+ :action => h['action'],
20
+ :options => (h['options'] || {}).to_json,
21
+ :owner_email => h['owner_email'],
22
+ :callback_url => h['callback_url']
23
+ )
42
24
  end
43
- self.save
44
25
 
45
- case self.status
46
- when CloudCrowd::PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
47
- when CloudCrowd::MERGING then queue_for_workers(output_list.to_json)
48
- else fire_callback
26
+ def after_create
27
+ self.queue_for_workers(JSON.parse(self.inputs))
49
28
  end
50
- self
51
- end
52
-
53
- # Transition from the current phase to the next one.
54
- def transition_to_next_phase
55
- self.status = any_work_units_failed? ? CloudCrowd::FAILED :
56
- self.splitting? ? CloudCrowd::PROCESSING :
57
- self.should_merge? ? CloudCrowd::MERGING :
58
- CloudCrowd::SUCCEEDED
59
- end
60
-
61
- # If a callback_url is defined, post the Job's JSON to it upon completion.
62
- def fire_callback
63
- begin
64
- RestClient.post(callback_url, {:job => self.to_json}) if callback_url
65
- rescue RestClient::Exception => e
66
- puts "Failed to fire job callback. Hmmm, what should happen here?"
29
+
30
+ def before_validation_on_create
31
+ self.status = self.splittable? ? CloudCrowd::SPLITTING : CloudCrowd::PROCESSING
67
32
  end
68
- end
69
-
70
- # Cleaning up after a job will remove all of its files from S3.
71
- def cleanup
72
- CloudCrowd::AssetStore.new.cleanup_job(self)
73
- end
74
-
75
- # Have all of the WorkUnits finished? We could trade reads for writes here
76
- # by keeping a completed_count on the Job itself.
77
- def all_work_units_complete?
78
- self.work_units.incomplete.count <= 0
79
- end
80
-
81
- # Have any of the WorkUnits failed?
82
- def any_work_units_failed?
83
- self.work_units.failed.count > 0
84
- end
85
-
86
- def splittable?
87
- self.action_class.new.respond_to? :split
88
- end
89
-
90
- def should_merge?
91
- self.processing? && self.action_class.new.respond_to?(:merge)
92
- end
93
-
94
- def action_class
95
- CloudCrowd.actions(self.action)
96
- end
97
-
98
- def gather_outputs_from_work_units
99
- outs = self.work_units.complete.map {|wu| wu.output }
100
- self.work_units.complete.destroy_all
101
- outs
102
- end
103
-
104
- def display_status
105
- CloudCrowd.display_status(self.status)
106
- end
107
-
108
- def work_units_remaining
109
- self.work_units.incomplete.count
110
- end
111
-
112
- # A JSON representation of this job includes the statuses of its component
113
- # WorkUnits, as well as any completed outputs.
114
- def to_json(opts={})
115
- atts = {'id' => self.id, 'status' => self.display_status, 'work_units_remaining' => self.work_units_remaining}
116
- atts.merge!({'outputs' => JSON.parse(self.outputs)}) if self.outputs
117
- atts.merge!({'time' => self.time}) if self.time
118
- atts.to_json
119
- end
33
+
34
+ # After work units are marked successful, we check to see if all of them have
35
+ # finished, if so, this job is complete.
36
+ def check_for_completion
37
+ return unless all_work_units_complete?
38
+ transition_to_next_phase
39
+ output_list = gather_outputs_from_work_units
120
40
 
121
- # When starting a new job, or moving to a new stage, split up the inputs
122
- # into WorkUnits, and queue them.
123
- def queue_for_workers(input)
124
- [input].flatten.each do |wu_input|
125
- WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
41
+ if complete?
42
+ self.outputs = output_list.to_json
43
+ self.time = Time.now - self.created_at
44
+ end
45
+ self.save
46
+
47
+ case self.status
48
+ when CloudCrowd::PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
49
+ when CloudCrowd::MERGING then queue_for_workers(output_list.to_json)
50
+ else fire_callback
51
+ end
52
+ self
53
+ end
54
+
55
+ # Transition from the current phase to the next one.
56
+ def transition_to_next_phase
57
+ self.status = any_work_units_failed? ? CloudCrowd::FAILED :
58
+ self.splitting? ? CloudCrowd::PROCESSING :
59
+ self.should_merge? ? CloudCrowd::MERGING :
60
+ CloudCrowd::SUCCEEDED
61
+ end
62
+
63
+ # If a callback_url is defined, post the Job's JSON to it upon completion.
64
+ def fire_callback
65
+ begin
66
+ RestClient.post(callback_url, {:job => self.to_json}) if callback_url
67
+ rescue RestClient::Exception => e
68
+ puts "Failed to fire job callback. Hmmm, what should happen here?"
69
+ end
70
+ end
71
+
72
+ # Cleaning up after a job will remove all of its files from S3.
73
+ def cleanup
74
+ CloudCrowd::AssetStore.new.cleanup_job(self)
75
+ end
76
+
77
+ # Have all of the WorkUnits finished? We could trade reads for writes here
78
+ # by keeping a completed_count on the Job itself.
79
+ def all_work_units_complete?
80
+ self.work_units.incomplete.count <= 0
81
+ end
82
+
83
+ # Have any of the WorkUnits failed?
84
+ def any_work_units_failed?
85
+ self.work_units.failed.count > 0
86
+ end
87
+
88
+ def splittable?
89
+ self.action_class.new.respond_to? :split
90
+ end
91
+
92
+ def should_merge?
93
+ self.processing? && self.action_class.new.respond_to?(:merge)
94
+ end
95
+
96
+ def action_class
97
+ CloudCrowd.actions(self.action)
98
+ end
99
+
100
+ def gather_outputs_from_work_units
101
+ outs = self.work_units.complete.map {|wu| wu.output }
102
+ self.work_units.complete.destroy_all
103
+ outs
104
+ end
105
+
106
+ def display_status
107
+ CloudCrowd.display_status(self.status)
108
+ end
109
+
110
+ def work_units_remaining
111
+ self.work_units.incomplete.count
112
+ end
113
+
114
+ # A JSON representation of this job includes the statuses of its component
115
+ # WorkUnits, as well as any completed outputs.
116
+ def to_json(opts={})
117
+ atts = {'id' => self.id, 'status' => self.display_status, 'work_units_remaining' => self.work_units_remaining}
118
+ atts.merge!({'outputs' => JSON.parse(self.outputs)}) if self.outputs
119
+ atts.merge!({'time' => self.time}) if self.time
120
+ atts.to_json
121
+ end
122
+
123
+ # When starting a new job, or moving to a new stage, split up the inputs
124
+ # into WorkUnits, and queue them.
125
+ def queue_for_workers(input)
126
+ [input].flatten.each do |wu_input|
127
+ WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
128
+ end
126
129
  end
130
+
127
131
  end
128
-
129
132
  end
@@ -1,62 +1,75 @@
1
- # A WorkUnit is an atomic chunk of work from a job, processing a single input
2
- # through a single action. All WorkUnits receive the same options.
3
- class WorkUnit < ActiveRecord::Base
4
- include CloudCrowd::ModelStatus
5
-
6
- belongs_to :job
7
-
8
- validates_presence_of :job_id, :status, :input
9
-
10
- after_save :check_for_job_completion
11
-
12
- # After saving a WorkUnit, it's Job should check if it just become complete.
13
- def check_for_job_completion
14
- self.job.check_for_completion if complete?
1
+ module CloudCrowd
2
+
3
+ # A WorkUnit is an atomic chunk of work from a job, processing a single input
4
+ # through a single action. All WorkUnits receive the same options.
5
+ class WorkUnit < ActiveRecord::Base
6
+ include CloudCrowd::ModelStatus
7
+
8
+ belongs_to :job
9
+
10
+ validates_presence_of :job_id, :status, :input
11
+
12
+ after_save :check_for_job_completion
13
+
14
+ # Find the Nth available WorkUnit in the queue, and take it out.
15
+ def self.dequeue(offset=0)
16
+ unit = self.first(
17
+ :conditions => {:status => CloudCrowd::INCOMPLETE, :taken => false},
18
+ :order => "created_at asc",
19
+ :offset => offset
20
+ )
21
+ unit ? unit.update_attributes(:taken => true) && unit : nil
22
+ end
23
+
24
+ # After saving a WorkUnit, its Job should check if it just became complete.
25
+ def check_for_job_completion
26
+ self.job.check_for_completion if complete?
27
+ end
28
+
29
+ # Mark this unit as having finished successfully.
30
+ def finish(output, time_taken)
31
+ update_attributes({
32
+ :status => CloudCrowd::SUCCEEDED,
33
+ :taken => false,
34
+ :attempts => self.attempts + 1,
35
+ :output => output,
36
+ :time => time_taken
37
+ })
38
+ end
39
+
40
+ # Mark this unit as having failed. May attempt a retry.
41
+ def fail(output, time_taken)
42
+ tries = self.attempts + 1
43
+ return try_again if tries < CloudCrowd.config[:work_unit_retries]
44
+ update_attributes({
45
+ :status => CloudCrowd::FAILED,
46
+ :taken => false,
47
+ :attempts => tries,
48
+ :output => output,
49
+ :time => time_taken
50
+ })
51
+ end
52
+
53
+ # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
54
+ def try_again
55
+ update_attributes({
56
+ :taken => false,
57
+ :attempts => self.attempts + 1
58
+ })
59
+ end
60
+
61
+ # The JSON representation of a WorkUnit contains common elements of its job.
62
+ def to_json
63
+ {
64
+ 'id' => self.id,
65
+ 'job_id' => self.job_id,
66
+ 'input' => self.input,
67
+ 'attempts' => self.attempts,
68
+ 'action' => self.job.action,
69
+ 'options' => JSON.parse(self.job.options),
70
+ 'status' => self.status
71
+ }.to_json
72
+ end
73
+
15
74
  end
16
-
17
- # Mark this unit as having finished successfully.
18
- def finish(output, time_taken)
19
- update_attributes({
20
- :status => CloudCrowd::SUCCEEDED,
21
- :taken => false,
22
- :attempts => self.attempts + 1,
23
- :output => output,
24
- :time => time_taken
25
- })
26
- end
27
-
28
- # Mark this unit as having failed. May attempt a retry.
29
- def fail(output, time_taken)
30
- tries = self.attempts + 1
31
- return try_again if tries < CloudCrowd.config[:work_unit_retries]
32
- update_attributes({
33
- :status => CloudCrowd::FAILED,
34
- :taken => false,
35
- :attempts => tries,
36
- :output => output,
37
- :time => time_taken
38
- })
39
- end
40
-
41
- # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
42
- def try_again
43
- update_attributes({
44
- :taken => false,
45
- :attempts => self.attempts + 1
46
- })
47
- end
48
-
49
- # The JSON representation of a WorkUnit contains common elements of its job.
50
- def to_json
51
- {
52
- 'id' => self.id,
53
- 'job_id' => self.job_id,
54
- 'input' => self.input,
55
- 'attempts' => self.attempts,
56
- 'action' => self.job.action,
57
- 'options' => JSON.parse(self.job.options),
58
- 'status' => self.status
59
- }.to_json
60
- end
61
-
62
- end
75
+ end
@@ -1,5 +1,3 @@
1
- require 'activerecord'
2
-
3
1
  module CloudCrowd
4
2
  module ModelStatus
5
3
 
@@ -1,22 +1,10 @@
1
- # This is the script that kicks off a single CloudCrowd::Daemon. Because the
2
- # daemons don't load the entire rails stack, this file functions like a mini
3
- # environment.rb, loading all the common gems that we need.
4
-
5
- # Standard Libs
6
- require 'fileutils'
7
- require 'benchmark'
8
- require 'socket'
9
-
10
- # Gems
11
- require 'rubygems'
12
- require 'daemons'
13
- require 'yaml'
14
-
15
- FileUtils.mkdir('log') unless File.exists?('log')
1
+ # This is the script that kicks off a single CloudCrowd::Daemon. Rely on
2
+ # cloud-crowd.rb for autoloading of all the code we need.
16
3
 
17
4
  # Daemon/Worker Dependencies.
18
5
  require "#{File.dirname(__FILE__)}/../cloud-crowd"
19
- require 'cloud_crowd/asset_store'
6
+
7
+ FileUtils.mkdir('log') unless File.exists?('log')
20
8
 
21
9
  Daemons.run("#{CloudCrowd::ROOT}/lib/cloud_crowd/daemon.rb", {
22
10
  :app_name => "cloud_crowd_worker",
@@ -22,10 +22,7 @@ module CloudCrowd
22
22
  def fetch_work_unit
23
23
  keep_trying_to "fetch a new work unit" do
24
24
  unit_json = @server['/work'].get
25
- return unless unit_json # No content means no work for us.
26
- @start_time = Time.now
27
- parse_work_unit unit_json
28
- log "fetched work unit for #{@action_name}"
25
+ setup_work_unit(unit_json)
29
26
  end
30
27
  end
31
28
 
@@ -33,8 +30,10 @@ module CloudCrowd
33
30
  def complete_work_unit(result)
34
31
  keep_trying_to "complete work unit" do
35
32
  data = completion_params.merge({:status => 'succeeded', :output => result})
36
- @server["/work/#{data[:id]}"].put(data)
33
+ unit_json = @server["/work/#{data[:id]}"].put(data)
37
34
  log "finished #{@action_name} in #{data[:time]} seconds"
35
+ clear_work_unit
36
+ setup_work_unit(unit_json)
38
37
  end
39
38
  end
40
39
 
@@ -42,8 +41,10 @@ module CloudCrowd
42
41
  def fail_work_unit(exception)
43
42
  keep_trying_to "mark work unit as failed" do
44
43
  data = completion_params.merge({:status => 'failed', :output => exception.message})
45
- @server["/work/#{data[:id]}"].put(data)
44
+ unit_json = @server["/work/#{data[:id]}"].put(data)
46
45
  log "failed #{@action_name} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
46
+ clear_work_unit
47
+ setup_work_unit(unit_json)
47
48
  end
48
49
  end
49
50
 
@@ -78,8 +79,6 @@ module CloudCrowd
78
79
  complete_work_unit(result)
79
80
  rescue Exception => e
80
81
  fail_work_unit(e)
81
- ensure
82
- clear_work_unit
83
82
  end
84
83
  end
85
84
 
@@ -107,12 +106,16 @@ module CloudCrowd
107
106
  end
108
107
 
109
108
  # Extract our instance variables from a WorkUnit's JSON.
110
- def parse_work_unit(unit_json)
109
+ def setup_work_unit(unit_json)
110
+ return false unless unit_json
111
111
  unit = JSON.parse(unit_json)
112
+ @start_time = Time.now
112
113
  @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
113
114
  @options['job_id'] = unit['job_id']
114
115
  @options['work_unit_id'] = unit['id']
115
116
  @options['attempts'] ||= unit['attempts']
117
+ log "fetched work unit for #{@action_name}"
118
+ return true
116
119
  end
117
120
 
118
121
  # Log a message to the daemon log. Includes PID for identification.
@@ -13,7 +13,7 @@ class FailingWorkUnitsTest < Test::Unit::TestCase
13
13
  }.to_json
14
14
  assert browser.last_response.ok?
15
15
 
16
- job = Job.last
16
+ job = CloudCrowd::Job.last
17
17
  (CloudCrowd.config[:work_unit_retries] - 1).times do
18
18
  job.work_units.each {|unit| unit.fail('failed', 10) }
19
19
  end
data/test/blueprints.rb CHANGED
@@ -1,14 +1,14 @@
1
1
  Sham.url { Faker::Internet.domain_name + "/" + Faker::Internet.domain_word + ".jpg" }
2
2
 
3
- Job.blueprint do
3
+ CloudCrowd::Job.blueprint do
4
4
  status { CloudCrowd::PROCESSING }
5
5
  inputs { ['http://www.google.com/intl/en_ALL/images/logo.gif'].to_json }
6
6
  action { 'graphics_magick' }
7
7
  options { {}.to_json }
8
8
  end
9
9
 
10
- WorkUnit.blueprint do
11
- job { Job.make }
10
+ CloudCrowd::WorkUnit.blueprint do
11
+ job { CloudCrowd::Job.make }
12
12
  status { CloudCrowd::PROCESSING }
13
13
  taken { false }
14
14
  input { Sham.url }
@@ -1,5 +1,5 @@
1
1
  :num_workers: 4
2
- :default_worker_wait: 1
2
+ :min_worker_wait: 1
3
3
  :max_worker_wait: 20
4
4
  :worker_wait_multiplier: 1.3
5
5
  :worker_retry_wait: 5
data/test/test_helper.rb CHANGED
@@ -2,8 +2,6 @@ require 'rubygems'
2
2
 
3
3
  here = File.dirname(__FILE__)
4
4
  require File.expand_path(here + "/../lib/cloud-crowd")
5
- require 'cloud_crowd/app'
6
-
7
5
  CloudCrowd.configure(here + '/config/config.yml')
8
6
  CloudCrowd.configure_database(here + '/config/database.yml')
9
7
 
@@ -5,7 +5,7 @@ class JobTest < Test::Unit::TestCase
5
5
  context "A CloudCrowd Job" do
6
6
 
7
7
  setup do
8
- @job = Job.make
8
+ @job = CloudCrowd::Job.make
9
9
  @unit = @job.work_units.first
10
10
  end
11
11
 
@@ -32,7 +32,7 @@ class JobTest < Test::Unit::TestCase
32
32
  end
33
33
 
34
34
  should "be able to create a job from a JSON request" do
35
- job = Job.create_from_request(JSON.parse(<<-EOS
35
+ job = CloudCrowd::Job.create_from_request(JSON.parse(<<-EOS
36
36
  { "inputs" : ["one", "two", "three"],
37
37
  "action" : "graphics_magick",
38
38
  "owner_email" : "bob@example.com",
@@ -46,13 +46,13 @@ class JobTest < Test::Unit::TestCase
46
46
  end
47
47
 
48
48
  should "create jobs with a SPLITTING status for actions that have a split method defined" do
49
- job = Job.create_from_request({'inputs' => ['1'], 'action' => 'pdf_to_images'})
49
+ job = CloudCrowd::Job.create_from_request({'inputs' => ['1'], 'action' => 'pdf_to_images'})
50
50
  assert job.splittable?
51
51
  assert job.splitting?
52
52
  end
53
53
 
54
54
  should "fire a callback when a job has finished, successfully or not" do
55
- Job.any_instance.expects(:fire_callback)
55
+ CloudCrowd::Job.any_instance.expects(:fire_callback)
56
56
  @job.work_units.first.finish('output', 10)
57
57
  assert @job.all_work_units_complete?
58
58
  end
@@ -5,7 +5,7 @@ class WorkUnitTest < Test::Unit::TestCase
5
5
  context "A WorkUnit" do
6
6
 
7
7
  setup do
8
- @unit = WorkUnit.make
8
+ @unit = CloudCrowd::WorkUnit.make
9
9
  @job = @unit.job
10
10
  end
11
11
 
@@ -26,7 +26,7 @@ class WorkUnitTest < Test::Unit::TestCase
26
26
  end
27
27
 
28
28
  should "have JSON that includes job attributes" do
29
- job = Job.make
29
+ job = CloudCrowd::Job.make
30
30
  unit_data = JSON.parse(job.work_units.first.to_json)
31
31
  assert unit_data['job_id'] == job.id
32
32
  assert unit_data['action'] == job.action
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: documentcloud-cloud-crowd
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas