documentcloud-cloud-crowd 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/cloud-crowd.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'cloud-crowd'
3
- s.version = '0.0.2' # Keep version in sync with cloud-cloud.rb
3
+ s.version = '0.0.3' # Keep version in sync with cloud-crowd.rb
4
4
  s.date = '2009-08-23'
5
5
 
6
6
  s.homepage = "http://documentcloud.org" # wiki page on github?
@@ -8,7 +8,6 @@
8
8
 
9
9
  require 'rubygems'
10
10
  require 'cloud-crowd'
11
- require 'cloud_crowd/app'
12
11
 
13
12
  CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
14
13
  CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
@@ -1,16 +1,46 @@
1
- :num_workers: 4
2
- :default_worker_wait: 1
3
- :max_worker_wait: 20
4
- :worker_wait_multiplier: 1.3
5
- :worker_retry_wait: 5
6
- :work_unit_retries: 3
7
-
1
+ # The URL where you're planning on running the server/queue/database.
8
2
  :central_server: http://localhost:9173
3
+
4
+ # Please provide your AWS credentials for S3 storage of job output.
5
+ :aws_access_key: [your AWS access key]
6
+ :aws_secret_key: [your AWS secret access key]
7
+
8
+ # Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
9
+ # to keep all resulting files on S3 private. If so, you'll receive authenticated
10
+ # S3 URLs as job output, good for 24 hours. If left public, you'll get the
11
+ # straight URLs to the files on S3.
12
+ :s3_bucket: [your CloudCrowd bucket]
13
+ :use_s3_authentication: no
14
+
15
+ # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
16
+ # to the central server). If yes, specify the login and password that all
17
+ # requests must provide for authentication.
9
18
  :use_http_authentication: no
10
19
  :login: [your login name]
11
20
  :password: [your password]
12
21
 
13
- :use_s3_authentication: no
14
- :s3_bucket: [your CloudCrowd bucket]
15
- :aws_access_key: [your AWS access key]
16
- :aws_secret_key: [your AWS secret access key]
22
+ # Set the following numbers to tweak the configuration of your worker daemons.
23
+ # Optimum results will depend on proportion of the Memory/CPU/IO bottlenecks
24
+ # in your actions, the number of central servers you have running, and your
25
+ # desired balance between latency and traffic.
26
+
27
+ # The number of workers that `crowd workers start` spins up.
28
+ :num_workers: 4
29
+
30
+ # The minimum number of seconds a worker waits between checking the job queue.
31
+ :min_worker_wait: 1
32
+
33
+ # The maximum number of seconds a worker waits between checking the job queue.
34
+ :max_worker_wait: 20
35
+
36
+ # The backoff multiplier the worker uses to slow down the check interval when
37
+ # there's no work in the queue.
38
+ :worker_wait_multiplier: 1.3
39
+
40
+ # The number of seconds a worker waits to retry when there's some kind of
41
+ # internal error (ie. the central server fails to respond)
42
+ :worker_retry_wait: 5
43
+
44
+ # The number of separate attempts that will be made to process an individual
45
+ # work unit, before marking it as having failed.
46
+ :work_unit_retries: 3
@@ -1,3 +1,6 @@
1
+ # This is a standard ActiveRecord database.yml file. You can configure it
2
+ # to use any database that ActiveRecord supports.
3
+
1
4
  :adapter: mysql
2
5
  :encoding: utf8
3
6
  :username: root
data/lib/cloud-crowd.rb CHANGED
@@ -1,21 +1,47 @@
1
+ # The Grand Central of code loading...
2
+
1
3
  $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
2
4
 
3
5
  # Common Gems:
4
- require 'json'
5
- require 'rest_client'
6
- require 'right_aws'
6
+ require 'rubygems'
7
+ gem 'activerecord'
8
+ gem 'daemons'
9
+ gem 'json'
10
+ gem 'rest-client'
11
+ gem 'right_aws'
12
+ gem 'sinatra'
7
13
 
8
14
  # Common CloudCrowd libs:
9
15
  require 'cloud_crowd/core_ext'
10
- require 'cloud_crowd/action'
16
+
17
+ # Autoloading for all the pieces which may or may not be needed:
18
+ autoload :ActiveRecord, 'activerecord'
19
+ autoload :Benchmark, 'benchmark'
20
+ autoload :Daemons, 'daemons'
21
+ autoload :ERB, 'erb'
22
+ autoload :FileUtils, 'fileutils'
23
+ autoload :JSON, 'json'
24
+ autoload :RestClient, 'rest_client'
25
+ autoload :RightAws, 'right_aws'
26
+ autoload :Sinatra, 'sinatra'
27
+ autoload :Socket, 'socket'
28
+ autoload :YAML, 'yaml'
11
29
 
12
30
  module CloudCrowd
13
31
 
32
+ # Autoload all the CloudCrowd classes which may not be required.
33
+ autoload :App, 'cloud_crowd/app'
34
+ autoload :Action, 'cloud_crowd/action'
35
+ autoload :AssetStore, 'cloud_crowd/asset_store'
36
+ autoload :Helpers, 'cloud_crowd/helpers'
37
+ autoload :Job, 'cloud_crowd/models'
38
+ autoload :WorkUnit, 'cloud_crowd/models'
39
+
14
40
  # Root directory of the CloudCrowd gem.
15
41
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
16
42
 
17
43
  # Keep the version in sync with the gemspec.
18
- VERSION = '0.0.2'
44
+ VERSION = '0.0.3'
19
45
 
20
46
  # A Job is processing if its WorkUnits are in the queue to be handled by workers.
21
47
  PROCESSING = 1
@@ -1,8 +1,3 @@
1
- require 'erb'
2
- require 'sinatra'
3
- require 'cloud_crowd/models'
4
- require 'cloud_crowd/helpers'
5
-
6
1
  module CloudCrowd
7
2
 
8
3
  class App < Sinatra::Default
@@ -10,7 +5,7 @@ module CloudCrowd
10
5
  # static serves files from /public, methodoverride allows the _method param.
11
6
  enable :static, :methodoverride
12
7
 
13
- set :root, CloudCrowd::ROOT
8
+ set :root, CloudCrowd::ROOT
14
9
  set :authorization_realm, "CloudCrowd"
15
10
 
16
11
  helpers CloudCrowd::Helpers
@@ -40,29 +35,29 @@ module CloudCrowd
40
35
  # Internal method for worker daemons to fetch the work unit at the front
41
36
  # of the queue. Work unit is marked as taken and handed off to the worker.
42
37
  get '/work' do
43
- begin
44
- unit = WorkUnit.first(:conditions => {:status => CloudCrowd::INCOMPLETE, :taken => false}, :order => "created_at desc")
45
- return status(204) && '' unless unit
46
- unit.update_attributes(:taken => true)
47
- unit.to_json
48
- rescue ActiveRecord::StaleObjectError => e
49
- return status(204) && ''
50
- end
38
+ dequeue_work_unit
51
39
  end
52
40
 
53
41
  # When workers are done with their unit, either successfully or in failure,
54
- # they mark it back on the central server.
42
+ # they mark it back on the central server and retrieve another. Failures
43
+ # pull from one down in the queue, so as to not repeat the same unit.
55
44
  put '/work/:work_unit_id' do
56
- case params[:status]
57
- when 'succeeded' then current_work_unit.finish(params[:output], params[:time])
58
- when 'failed' then current_work_unit.fail(params[:output], params[:time])
59
- else return error(500, "Completing a work unit must specify status.")
45
+ handle_conflicts(409) do
46
+ case params[:status]
47
+ when 'succeeded'
48
+ current_work_unit.finish(params[:output], params[:time])
49
+ dequeue_work_unit
50
+ when 'failed'
51
+ current_work_unit.fail(params[:output], params[:time])
52
+ dequeue_work_unit(1)
53
+ else
54
+ return error(500, "Completing a work unit must specify status.")
55
+ end
60
56
  end
61
- return status(204) && ''
62
57
  end
63
58
 
64
59
  # To monitor the central server with Monit, God, Nagios, or another
65
- # monitoring tool, you can hit /heartbeat to check.
60
+ # monitoring tool, you can hit /heartbeat to make sure.
66
61
  get '/heartbeat' do
67
62
  "buh-bump"
68
63
  end
@@ -10,12 +10,12 @@ module CloudCrowd
10
10
  # isn't any work to be done, and speeds back up when there is.
11
11
  class Daemon
12
12
 
13
- DEFAULT_WAIT = CloudCrowd.config[:default_worker_wait]
13
+ MIN_WAIT = CloudCrowd.config[:min_worker_wait]
14
14
  MAX_WAIT = CloudCrowd.config[:max_worker_wait]
15
15
  WAIT_MULTIPLIER = CloudCrowd.config[:worker_wait_multiplier]
16
16
 
17
17
  def initialize
18
- @wait_time = DEFAULT_WAIT
18
+ @wait_time = MIN_WAIT
19
19
  @worker = CloudCrowd::Worker.new
20
20
  Signal.trap('INT', 'EXIT')
21
21
  Signal.trap('KILL', 'EXIT')
@@ -31,9 +31,11 @@ module CloudCrowd
31
31
  loop do
32
32
  @worker.fetch_work_unit
33
33
  if @worker.has_work?
34
- @worker.run
35
- @wait_time = DEFAULT_WAIT
36
- sleep 0.01 # So as to listen for incoming signals.
34
+ @wait_time = MIN_WAIT
35
+ while @worker.has_work?
36
+ @worker.run
37
+ sleep 0.01 # So as to listen for incoming signals.
38
+ end
37
39
  else
38
40
  @wait_time = [@wait_time * WAIT_MULTIPLIER, MAX_WAIT].min
39
41
  sleep @wait_time
@@ -10,6 +10,27 @@ module CloudCrowd
10
10
  @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id]) or raise Sinatra::NotFound
11
11
  end
12
12
 
13
+ # Try to fetch a work unit from the queue. If none are pending, respond
14
+ # with no content.
15
+ def dequeue_work_unit(offset=0)
16
+ handle_conflicts do
17
+ unit = WorkUnit.dequeue(offset)
18
+ return status(204) && '' unless unit
19
+ unit.to_json
20
+ end
21
+ end
22
+
23
+ # We're using ActiveRecord's optimistic locking, so stale work units
24
+ # may sometimes arise. handle_conflicts responds with the HTTP status
25
+ # code of your choosing if the update failed to be applied.
26
+ def handle_conflicts(code=204)
27
+ begin
28
+ yield
29
+ rescue ActiveRecord::StaleObjectError => e
30
+ return status(code) && ''
31
+ end
32
+ end
33
+
13
34
  end
14
35
  end
15
36
  end
@@ -1,129 +1,132 @@
1
- # A chunk of work that will be farmed out into many WorkUnits to be processed
2
- # in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
3
- # of inputs (usually public urls to files), an action (the name of a script that
4
- # CloudCrowd knows how to run), and, eventually a corresponding list of output.
5
- class Job < ActiveRecord::Base
6
- include CloudCrowd::ModelStatus
7
-
8
- has_many :work_units, :dependent => :destroy
9
-
10
- validates_presence_of :status, :inputs, :action, :options
11
-
12
- # Create a Job from an incoming JSON or XML request, and add it to the queue.
13
- # TODO: Add XML support.
14
- def self.create_from_request(h)
15
- self.create(
16
- :inputs => h['inputs'].to_json,
17
- :action => h['action'],
18
- :options => (h['options'] || {}).to_json,
19
- :owner_email => h['owner_email'],
20
- :callback_url => h['callback_url']
21
- )
22
- end
23
-
24
- def after_create
25
- self.queue_for_workers(JSON.parse(self.inputs))
26
- end
27
-
28
- def before_validation_on_create
29
- self.status = self.splittable? ? CloudCrowd::SPLITTING : CloudCrowd::PROCESSING
30
- end
31
-
32
- # After work units are marked successful, we check to see if all of them have
33
- # finished, if so, this job is complete.
34
- def check_for_completion
35
- return unless all_work_units_complete?
36
- transition_to_next_phase
37
- output_list = gather_outputs_from_work_units
1
+ module CloudCrowd
2
+
3
+ # A chunk of work that will be farmed out into many WorkUnits to be processed
4
+ # in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
5
+ # of inputs (usually public urls to files), an action (the name of a script that
6
+ # CloudCrowd knows how to run), and, eventually a corresponding list of output.
7
+ class Job < ActiveRecord::Base
8
+ include CloudCrowd::ModelStatus
9
+
10
+ has_many :work_units, :dependent => :destroy
38
11
 
39
- if complete?
40
- self.outputs = output_list.to_json
41
- self.time = Time.now - self.created_at
12
+ validates_presence_of :status, :inputs, :action, :options
13
+
14
+ # Create a Job from an incoming JSON or XML request, and add it to the queue.
15
+ # TODO: Add XML support.
16
+ def self.create_from_request(h)
17
+ self.create(
18
+ :inputs => h['inputs'].to_json,
19
+ :action => h['action'],
20
+ :options => (h['options'] || {}).to_json,
21
+ :owner_email => h['owner_email'],
22
+ :callback_url => h['callback_url']
23
+ )
42
24
  end
43
- self.save
44
25
 
45
- case self.status
46
- when CloudCrowd::PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
47
- when CloudCrowd::MERGING then queue_for_workers(output_list.to_json)
48
- else fire_callback
26
+ def after_create
27
+ self.queue_for_workers(JSON.parse(self.inputs))
49
28
  end
50
- self
51
- end
52
-
53
- # Transition from the current phase to the next one.
54
- def transition_to_next_phase
55
- self.status = any_work_units_failed? ? CloudCrowd::FAILED :
56
- self.splitting? ? CloudCrowd::PROCESSING :
57
- self.should_merge? ? CloudCrowd::MERGING :
58
- CloudCrowd::SUCCEEDED
59
- end
60
-
61
- # If a callback_url is defined, post the Job's JSON to it upon completion.
62
- def fire_callback
63
- begin
64
- RestClient.post(callback_url, {:job => self.to_json}) if callback_url
65
- rescue RestClient::Exception => e
66
- puts "Failed to fire job callback. Hmmm, what should happen here?"
29
+
30
+ def before_validation_on_create
31
+ self.status = self.splittable? ? CloudCrowd::SPLITTING : CloudCrowd::PROCESSING
67
32
  end
68
- end
69
-
70
- # Cleaning up after a job will remove all of its files from S3.
71
- def cleanup
72
- CloudCrowd::AssetStore.new.cleanup_job(self)
73
- end
74
-
75
- # Have all of the WorkUnits finished? We could trade reads for writes here
76
- # by keeping a completed_count on the Job itself.
77
- def all_work_units_complete?
78
- self.work_units.incomplete.count <= 0
79
- end
80
-
81
- # Have any of the WorkUnits failed?
82
- def any_work_units_failed?
83
- self.work_units.failed.count > 0
84
- end
85
-
86
- def splittable?
87
- self.action_class.new.respond_to? :split
88
- end
89
-
90
- def should_merge?
91
- self.processing? && self.action_class.new.respond_to?(:merge)
92
- end
93
-
94
- def action_class
95
- CloudCrowd.actions(self.action)
96
- end
97
-
98
- def gather_outputs_from_work_units
99
- outs = self.work_units.complete.map {|wu| wu.output }
100
- self.work_units.complete.destroy_all
101
- outs
102
- end
103
-
104
- def display_status
105
- CloudCrowd.display_status(self.status)
106
- end
107
-
108
- def work_units_remaining
109
- self.work_units.incomplete.count
110
- end
111
-
112
- # A JSON representation of this job includes the statuses of its component
113
- # WorkUnits, as well as any completed outputs.
114
- def to_json(opts={})
115
- atts = {'id' => self.id, 'status' => self.display_status, 'work_units_remaining' => self.work_units_remaining}
116
- atts.merge!({'outputs' => JSON.parse(self.outputs)}) if self.outputs
117
- atts.merge!({'time' => self.time}) if self.time
118
- atts.to_json
119
- end
33
+
34
+ # After work units are marked successful, we check to see if all of them have
35
+ # finished, if so, this job is complete.
36
+ def check_for_completion
37
+ return unless all_work_units_complete?
38
+ transition_to_next_phase
39
+ output_list = gather_outputs_from_work_units
120
40
 
121
- # When starting a new job, or moving to a new stage, split up the inputs
122
- # into WorkUnits, and queue them.
123
- def queue_for_workers(input)
124
- [input].flatten.each do |wu_input|
125
- WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
41
+ if complete?
42
+ self.outputs = output_list.to_json
43
+ self.time = Time.now - self.created_at
44
+ end
45
+ self.save
46
+
47
+ case self.status
48
+ when CloudCrowd::PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
49
+ when CloudCrowd::MERGING then queue_for_workers(output_list.to_json)
50
+ else fire_callback
51
+ end
52
+ self
53
+ end
54
+
55
+ # Transition from the current phase to the next one.
56
+ def transition_to_next_phase
57
+ self.status = any_work_units_failed? ? CloudCrowd::FAILED :
58
+ self.splitting? ? CloudCrowd::PROCESSING :
59
+ self.should_merge? ? CloudCrowd::MERGING :
60
+ CloudCrowd::SUCCEEDED
61
+ end
62
+
63
+ # If a callback_url is defined, post the Job's JSON to it upon completion.
64
+ def fire_callback
65
+ begin
66
+ RestClient.post(callback_url, {:job => self.to_json}) if callback_url
67
+ rescue RestClient::Exception => e
68
+ puts "Failed to fire job callback. Hmmm, what should happen here?"
69
+ end
70
+ end
71
+
72
+ # Cleaning up after a job will remove all of its files from S3.
73
+ def cleanup
74
+ CloudCrowd::AssetStore.new.cleanup_job(self)
75
+ end
76
+
77
+ # Have all of the WorkUnits finished? We could trade reads for writes here
78
+ # by keeping a completed_count on the Job itself.
79
+ def all_work_units_complete?
80
+ self.work_units.incomplete.count <= 0
81
+ end
82
+
83
+ # Have any of the WorkUnits failed?
84
+ def any_work_units_failed?
85
+ self.work_units.failed.count > 0
86
+ end
87
+
88
+ def splittable?
89
+ self.action_class.new.respond_to? :split
90
+ end
91
+
92
+ def should_merge?
93
+ self.processing? && self.action_class.new.respond_to?(:merge)
94
+ end
95
+
96
+ def action_class
97
+ CloudCrowd.actions(self.action)
98
+ end
99
+
100
+ def gather_outputs_from_work_units
101
+ outs = self.work_units.complete.map {|wu| wu.output }
102
+ self.work_units.complete.destroy_all
103
+ outs
104
+ end
105
+
106
+ def display_status
107
+ CloudCrowd.display_status(self.status)
108
+ end
109
+
110
+ def work_units_remaining
111
+ self.work_units.incomplete.count
112
+ end
113
+
114
+ # A JSON representation of this job includes the statuses of its component
115
+ # WorkUnits, as well as any completed outputs.
116
+ def to_json(opts={})
117
+ atts = {'id' => self.id, 'status' => self.display_status, 'work_units_remaining' => self.work_units_remaining}
118
+ atts.merge!({'outputs' => JSON.parse(self.outputs)}) if self.outputs
119
+ atts.merge!({'time' => self.time}) if self.time
120
+ atts.to_json
121
+ end
122
+
123
+ # When starting a new job, or moving to a new stage, split up the inputs
124
+ # into WorkUnits, and queue them.
125
+ def queue_for_workers(input)
126
+ [input].flatten.each do |wu_input|
127
+ WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
128
+ end
126
129
  end
130
+
127
131
  end
128
-
129
132
  end
@@ -1,62 +1,75 @@
1
- # A WorkUnit is an atomic chunk of work from a job, processing a single input
2
- # through a single action. All WorkUnits receive the same options.
3
- class WorkUnit < ActiveRecord::Base
4
- include CloudCrowd::ModelStatus
5
-
6
- belongs_to :job
7
-
8
- validates_presence_of :job_id, :status, :input
9
-
10
- after_save :check_for_job_completion
11
-
12
- # After saving a WorkUnit, it's Job should check if it just become complete.
13
- def check_for_job_completion
14
- self.job.check_for_completion if complete?
1
+ module CloudCrowd
2
+
3
+ # A WorkUnit is an atomic chunk of work from a job, processing a single input
4
+ # through a single action. All WorkUnits receive the same options.
5
+ class WorkUnit < ActiveRecord::Base
6
+ include CloudCrowd::ModelStatus
7
+
8
+ belongs_to :job
9
+
10
+ validates_presence_of :job_id, :status, :input
11
+
12
+ after_save :check_for_job_completion
13
+
14
+ # Find the Nth available WorkUnit in the queue, and take it out.
15
+ def self.dequeue(offset=0)
16
+ unit = self.first(
17
+ :conditions => {:status => CloudCrowd::INCOMPLETE, :taken => false},
18
+ :order => "created_at asc",
19
+ :offset => offset
20
+ )
21
+ unit ? unit.update_attributes(:taken => true) && unit : nil
22
+ end
23
+
24
+ # After saving a WorkUnit, its Job should check if it just became complete.
25
+ def check_for_job_completion
26
+ self.job.check_for_completion if complete?
27
+ end
28
+
29
+ # Mark this unit as having finished successfully.
30
+ def finish(output, time_taken)
31
+ update_attributes({
32
+ :status => CloudCrowd::SUCCEEDED,
33
+ :taken => false,
34
+ :attempts => self.attempts + 1,
35
+ :output => output,
36
+ :time => time_taken
37
+ })
38
+ end
39
+
40
+ # Mark this unit as having failed. May attempt a retry.
41
+ def fail(output, time_taken)
42
+ tries = self.attempts + 1
43
+ return try_again if tries < CloudCrowd.config[:work_unit_retries]
44
+ update_attributes({
45
+ :status => CloudCrowd::FAILED,
46
+ :taken => false,
47
+ :attempts => tries,
48
+ :output => output,
49
+ :time => time_taken
50
+ })
51
+ end
52
+
53
+ # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
54
+ def try_again
55
+ update_attributes({
56
+ :taken => false,
57
+ :attempts => self.attempts + 1
58
+ })
59
+ end
60
+
61
+ # The JSON representation of a WorkUnit contains common elements of its job.
62
+ def to_json
63
+ {
64
+ 'id' => self.id,
65
+ 'job_id' => self.job_id,
66
+ 'input' => self.input,
67
+ 'attempts' => self.attempts,
68
+ 'action' => self.job.action,
69
+ 'options' => JSON.parse(self.job.options),
70
+ 'status' => self.status
71
+ }.to_json
72
+ end
73
+
15
74
  end
16
-
17
- # Mark this unit as having finished successfully.
18
- def finish(output, time_taken)
19
- update_attributes({
20
- :status => CloudCrowd::SUCCEEDED,
21
- :taken => false,
22
- :attempts => self.attempts + 1,
23
- :output => output,
24
- :time => time_taken
25
- })
26
- end
27
-
28
- # Mark this unit as having failed. May attempt a retry.
29
- def fail(output, time_taken)
30
- tries = self.attempts + 1
31
- return try_again if tries < CloudCrowd.config[:work_unit_retries]
32
- update_attributes({
33
- :status => CloudCrowd::FAILED,
34
- :taken => false,
35
- :attempts => tries,
36
- :output => output,
37
- :time => time_taken
38
- })
39
- end
40
-
41
- # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
42
- def try_again
43
- update_attributes({
44
- :taken => false,
45
- :attempts => self.attempts + 1
46
- })
47
- end
48
-
49
- # The JSON representation of a WorkUnit contains common elements of its job.
50
- def to_json
51
- {
52
- 'id' => self.id,
53
- 'job_id' => self.job_id,
54
- 'input' => self.input,
55
- 'attempts' => self.attempts,
56
- 'action' => self.job.action,
57
- 'options' => JSON.parse(self.job.options),
58
- 'status' => self.status
59
- }.to_json
60
- end
61
-
62
- end
75
+ end
@@ -1,5 +1,3 @@
1
- require 'activerecord'
2
-
3
1
  module CloudCrowd
4
2
  module ModelStatus
5
3
 
@@ -1,22 +1,10 @@
1
- # This is the script that kicks off a single CloudCrowd::Daemon. Because the
2
- # daemons don't load the entire rails stack, this file functions like a mini
3
- # environment.rb, loading all the common gems that we need.
4
-
5
- # Standard Libs
6
- require 'fileutils'
7
- require 'benchmark'
8
- require 'socket'
9
-
10
- # Gems
11
- require 'rubygems'
12
- require 'daemons'
13
- require 'yaml'
14
-
15
- FileUtils.mkdir('log') unless File.exists?('log')
1
+ # This is the script that kicks off a single CloudCrowd::Daemon. Rely on
2
+ # cloud-crowd.rb for autoloading of all the code we need.
16
3
 
17
4
  # Daemon/Worker Dependencies.
18
5
  require "#{File.dirname(__FILE__)}/../cloud-crowd"
19
- require 'cloud_crowd/asset_store'
6
+
7
+ FileUtils.mkdir('log') unless File.exists?('log')
20
8
 
21
9
  Daemons.run("#{CloudCrowd::ROOT}/lib/cloud_crowd/daemon.rb", {
22
10
  :app_name => "cloud_crowd_worker",
@@ -22,10 +22,7 @@ module CloudCrowd
22
22
  def fetch_work_unit
23
23
  keep_trying_to "fetch a new work unit" do
24
24
  unit_json = @server['/work'].get
25
- return unless unit_json # No content means no work for us.
26
- @start_time = Time.now
27
- parse_work_unit unit_json
28
- log "fetched work unit for #{@action_name}"
25
+ setup_work_unit(unit_json)
29
26
  end
30
27
  end
31
28
 
@@ -33,8 +30,10 @@ module CloudCrowd
33
30
  def complete_work_unit(result)
34
31
  keep_trying_to "complete work unit" do
35
32
  data = completion_params.merge({:status => 'succeeded', :output => result})
36
- @server["/work/#{data[:id]}"].put(data)
33
+ unit_json = @server["/work/#{data[:id]}"].put(data)
37
34
  log "finished #{@action_name} in #{data[:time]} seconds"
35
+ clear_work_unit
36
+ setup_work_unit(unit_json)
38
37
  end
39
38
  end
40
39
 
@@ -42,8 +41,10 @@ module CloudCrowd
42
41
  def fail_work_unit(exception)
43
42
  keep_trying_to "mark work unit as failed" do
44
43
  data = completion_params.merge({:status => 'failed', :output => exception.message})
45
- @server["/work/#{data[:id]}"].put(data)
44
+ unit_json = @server["/work/#{data[:id]}"].put(data)
46
45
  log "failed #{@action_name} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
46
+ clear_work_unit
47
+ setup_work_unit(unit_json)
47
48
  end
48
49
  end
49
50
 
@@ -78,8 +79,6 @@ module CloudCrowd
78
79
  complete_work_unit(result)
79
80
  rescue Exception => e
80
81
  fail_work_unit(e)
81
- ensure
82
- clear_work_unit
83
82
  end
84
83
  end
85
84
 
@@ -107,12 +106,16 @@ module CloudCrowd
107
106
  end
108
107
 
109
108
  # Extract our instance variables from a WorkUnit's JSON.
110
- def parse_work_unit(unit_json)
109
+ def setup_work_unit(unit_json)
110
+ return false unless unit_json
111
111
  unit = JSON.parse(unit_json)
112
+ @start_time = Time.now
112
113
  @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
113
114
  @options['job_id'] = unit['job_id']
114
115
  @options['work_unit_id'] = unit['id']
115
116
  @options['attempts'] ||= unit['attempts']
117
+ log "fetched work unit for #{@action_name}"
118
+ return true
116
119
  end
117
120
 
118
121
  # Log a message to the daemon log. Includes PID for identification.
@@ -13,7 +13,7 @@ class FailingWorkUnitsTest < Test::Unit::TestCase
13
13
  }.to_json
14
14
  assert browser.last_response.ok?
15
15
 
16
- job = Job.last
16
+ job = CloudCrowd::Job.last
17
17
  (CloudCrowd.config[:work_unit_retries] - 1).times do
18
18
  job.work_units.each {|unit| unit.fail('failed', 10) }
19
19
  end
data/test/blueprints.rb CHANGED
@@ -1,14 +1,14 @@
1
1
  Sham.url { Faker::Internet.domain_name + "/" + Faker::Internet.domain_word + ".jpg" }
2
2
 
3
- Job.blueprint do
3
+ CloudCrowd::Job.blueprint do
4
4
  status { CloudCrowd::PROCESSING }
5
5
  inputs { ['http://www.google.com/intl/en_ALL/images/logo.gif'].to_json }
6
6
  action { 'graphics_magick' }
7
7
  options { {}.to_json }
8
8
  end
9
9
 
10
- WorkUnit.blueprint do
11
- job { Job.make }
10
+ CloudCrowd::WorkUnit.blueprint do
11
+ job { CloudCrowd::Job.make }
12
12
  status { CloudCrowd::PROCESSING }
13
13
  taken { false }
14
14
  input { Sham.url }
@@ -1,5 +1,5 @@
1
1
  :num_workers: 4
2
- :default_worker_wait: 1
2
+ :min_worker_wait: 1
3
3
  :max_worker_wait: 20
4
4
  :worker_wait_multiplier: 1.3
5
5
  :worker_retry_wait: 5
data/test/test_helper.rb CHANGED
@@ -2,8 +2,6 @@ require 'rubygems'
2
2
 
3
3
  here = File.dirname(__FILE__)
4
4
  require File.expand_path(here + "/../lib/cloud-crowd")
5
- require 'cloud_crowd/app'
6
-
7
5
  CloudCrowd.configure(here + '/config/config.yml')
8
6
  CloudCrowd.configure_database(here + '/config/database.yml')
9
7
 
@@ -5,7 +5,7 @@ class JobTest < Test::Unit::TestCase
5
5
  context "A CloudCrowd Job" do
6
6
 
7
7
  setup do
8
- @job = Job.make
8
+ @job = CloudCrowd::Job.make
9
9
  @unit = @job.work_units.first
10
10
  end
11
11
 
@@ -32,7 +32,7 @@ class JobTest < Test::Unit::TestCase
32
32
  end
33
33
 
34
34
  should "be able to create a job from a JSON request" do
35
- job = Job.create_from_request(JSON.parse(<<-EOS
35
+ job = CloudCrowd::Job.create_from_request(JSON.parse(<<-EOS
36
36
  { "inputs" : ["one", "two", "three"],
37
37
  "action" : "graphics_magick",
38
38
  "owner_email" : "bob@example.com",
@@ -46,13 +46,13 @@ class JobTest < Test::Unit::TestCase
46
46
  end
47
47
 
48
48
  should "create jobs with a SPLITTING status for actions that have a split method defined" do
49
- job = Job.create_from_request({'inputs' => ['1'], 'action' => 'pdf_to_images'})
49
+ job = CloudCrowd::Job.create_from_request({'inputs' => ['1'], 'action' => 'pdf_to_images'})
50
50
  assert job.splittable?
51
51
  assert job.splitting?
52
52
  end
53
53
 
54
54
  should "fire a callback when a job has finished, successfully or not" do
55
- Job.any_instance.expects(:fire_callback)
55
+ CloudCrowd::Job.any_instance.expects(:fire_callback)
56
56
  @job.work_units.first.finish('output', 10)
57
57
  assert @job.all_work_units_complete?
58
58
  end
@@ -5,7 +5,7 @@ class WorkUnitTest < Test::Unit::TestCase
5
5
  context "A WorkUnit" do
6
6
 
7
7
  setup do
8
- @unit = WorkUnit.make
8
+ @unit = CloudCrowd::WorkUnit.make
9
9
  @job = @unit.job
10
10
  end
11
11
 
@@ -26,7 +26,7 @@ class WorkUnitTest < Test::Unit::TestCase
26
26
  end
27
27
 
28
28
  should "have JSON that includes job attributes" do
29
- job = Job.make
29
+ job = CloudCrowd::Job.make
30
30
  unit_data = JSON.parse(job.work_units.first.to_json)
31
31
  assert unit_data['job_id'] == job.id
32
32
  assert unit_data['action'] == job.action
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: documentcloud-cloud-crowd
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas