documentcloud-cloud-crowd 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,48 @@
1
# Load the CloudCrowd configuration first, so the wait-time constants
# below can read the configured values.
CloudCrowd.configure(ENV['CLOUD_CROWD_CONFIG'])

require 'cloud_crowd/worker'

module CloudCrowd

  # A CloudCrowd::Daemon, started by the Daemons gem, runs a CloudCrowd::Worker in
  # a loop, continually fetching and processing WorkUnits from the central
  # server. The Daemon backs off and pings central less frequently when there
  # isn't any work to be done, and speeds back up when there is.
  class Daemon

    # Polling intervals (in seconds), drawn from the loaded configuration.
    DEFAULT_WAIT    = CloudCrowd.config[:default_worker_wait]
    MAX_WAIT        = CloudCrowd.config[:max_worker_wait]
    WAIT_MULTIPLIER = CloudCrowd.config[:worker_wait_multiplier]

    # Create the wrapped Worker and trap the terminating signals so the
    # daemon exits cleanly. NB: SIGKILL is reserved by the operating system
    # and can never be caught, so only INT and TERM are trapped.
    def initialize
      @wait_time = DEFAULT_WAIT
      @worker = CloudCrowd::Worker.new
      Signal.trap('INT',  'EXIT')
      Signal.trap('TERM', 'EXIT')
    end

    # Loop forever, fetching WorkUnits.
    # TODO: Workers busy with their work units won't die until the unit has
    # been finished. This should probably be wrapped in an appropriately lengthy
    # timeout, or should be killable from the outside by terminating the thread.
    # In either case, nasty un-cleaned-up bits might be left behind.
    def run
      loop do
        @worker.fetch_work_unit
        if @worker.has_work?
          @worker.run
          @wait_time = DEFAULT_WAIT # Reset the back-off once work appears.
          sleep 0.01 # So as to listen for incoming signals.
        else
          # No work available: back off geometrically, capped at MAX_WAIT.
          @wait_time = [@wait_time * WAIT_MULTIPLIER, MAX_WAIT].min
          sleep @wait_time
        end
      end
    end

  end

end

CloudCrowd::Daemon.new.run
@@ -0,0 +1,15 @@
1
module CloudCrowd
  module Helpers
    module Resources

      # Find and memoize the Job named by params[:job_id].
      # Raises Sinatra::NotFound when no such record exists.
      def current_job
        @job ||= Job.find_by_id(params[:job_id])
        raise Sinatra::NotFound unless @job
        @job
      end

      # Find and memoize the WorkUnit named by params[:work_unit_id].
      # Raises Sinatra::NotFound when no such record exists.
      def current_work_unit
        @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id])
        raise Sinatra::NotFound unless @work_unit
        @work_unit
      end

    end
  end
end
@@ -0,0 +1,7 @@
1
module CloudCrowd
  module Helpers
    # Placeholder for URL-generation helpers; currently empty.
    module Urls

    end
  end
end
@@ -0,0 +1,8 @@
1
require 'cloud_crowd/helpers/resources'
require 'cloud_crowd/helpers/urls'

module CloudCrowd
  # Aggregates the individual helper modules into a single mixin.
  module Helpers
    include Resources, Urls #, Rack::Utils
  end
end
@@ -0,0 +1,129 @@
1
# A chunk of work that will be farmed out into many WorkUnits to be processed
# in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
# of inputs (usually public urls to files), an action (the name of a script that
# CloudCrowd knows how to run), and, eventually a corresponding list of output.
class Job < ActiveRecord::Base
  include CloudCrowd::ModelStatus

  has_many :work_units, :dependent => :destroy

  validates_presence_of :status, :inputs, :action, :options

  # Create a Job from an incoming JSON or XML request, and add it to the queue.
  # 'inputs' and 'options' are serialized to JSON strings for storage.
  # TODO: Add XML support.
  def self.create_from_request(h)
    self.create(
      :inputs => h['inputs'].to_json,
      :action => h['action'],
      :options => (h['options'] || {}).to_json,
      :owner_email => h['owner_email'],
      :callback_url => h['callback_url']
    )
  end

  # Rails 2.x callback: as soon as the Job is saved, split its inputs
  # into queued WorkUnits.
  def after_create
    self.queue_for_workers(JSON.parse(self.inputs))
  end

  # Jobs whose action defines a split method begin in the SPLITTING phase;
  # everything else goes straight to PROCESSING.
  def before_validation_on_create
    self.status = self.splittable? ? CloudCrowd::SPLITTING : CloudCrowd::PROCESSING
  end

  # After work units are marked successful, we check to see if all of them have
  # finished, if so, this job is complete. Advances the phase, records outputs
  # and total time when done, and either queues the next round of WorkUnits or
  # fires the completion callback.
  def check_for_completion
    return unless all_work_units_complete?
    transition_to_next_phase
    output_list = gather_outputs_from_work_units

    if complete?
      self.outputs = output_list.to_json
      self.time = Time.now - self.created_at
    end
    self.save

    case self.status
    # PROCESSING after a split: each parsed output becomes a new unit.
    when CloudCrowd::PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
    # MERGING: the entire output list is passed as one JSON string, so the
    # merge runs as a single WorkUnit.
    when CloudCrowd::MERGING then queue_for_workers(output_list.to_json)
    else fire_callback
    end
    self
  end

  # Transition from the current phase to the next one. Any failure trumps
  # everything; SPLITTING advances to PROCESSING; PROCESSING advances to
  # MERGING when the action defines a merge method, otherwise SUCCEEDED.
  def transition_to_next_phase
    self.status = any_work_units_failed? ? CloudCrowd::FAILED :
                  self.splitting? ? CloudCrowd::PROCESSING :
                  self.should_merge? ? CloudCrowd::MERGING :
                  CloudCrowd::SUCCEEDED
  end

  # If a callback_url is defined, post the Job's JSON to it upon completion.
  # A failed callback is only logged to stdout; the job is not retried.
  def fire_callback
    begin
      RestClient.post(callback_url, {:job => self.to_json}) if callback_url
    rescue RestClient::Exception => e
      puts "Failed to fire job callback. Hmmm, what should happen here?"
    end
  end

  # Cleaning up after a job will remove all of its files from S3.
  def cleanup
    CloudCrowd::AssetStore.new.cleanup_job(self)
  end

  # Have all of the WorkUnits finished? We could trade reads for writes here
  # by keeping a completed_count on the Job itself.
  def all_work_units_complete?
    self.work_units.incomplete.count <= 0
  end

  # Have any of the WorkUnits failed?
  def any_work_units_failed?
    self.work_units.failed.count > 0
  end

  # An action is splittable when its class defines a split method.
  def splittable?
    self.action_class.new.respond_to? :split
  end

  # Merge only after the PROCESSING phase, and only when the action class
  # defines a merge method.
  def should_merge?
    self.processing? && self.action_class.new.respond_to?(:merge)
  end

  # Look up the Action class that performs this job's work.
  def action_class
    CloudCrowd.actions(self.action)
  end

  # Collect the outputs of all completed WorkUnits, destroying those units
  # afterwards (their results now live on the Job).
  def gather_outputs_from_work_units
    outs = self.work_units.complete.map {|wu| wu.output }
    self.work_units.complete.destroy_all
    outs
  end

  # Human-readable version of the numeric status code.
  def display_status
    CloudCrowd.display_status(self.status)
  end

  # Number of WorkUnits that have not yet completed.
  def work_units_remaining
    self.work_units.incomplete.count
  end

  # A JSON representation of this job includes the statuses of its component
  # WorkUnits, as well as any completed outputs.
  # NOTE(review): the opts argument is accepted but currently ignored.
  def to_json(opts={})
    atts = {'id' => self.id, 'status' => self.display_status, 'work_units_remaining' => self.work_units_remaining}
    atts.merge!({'output' => JSON.parse(self.outputs)}) if self.outputs
    atts.merge!({'time' => self.time}) if self.time
    atts.to_json
  end

  # When starting a new job, or moving to a new stage, split up the inputs
  # into WorkUnits, and queue them. Accepts either a single input or an array.
  def queue_for_workers(input)
    [input].flatten.each do |wu_input|
      WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
    end
  end

end
@@ -0,0 +1,62 @@
1
# A WorkUnit is an atomic chunk of work from a job, processing a single input
# through a single action. All WorkUnits receive the same options.
class WorkUnit < ActiveRecord::Base
  include CloudCrowd::ModelStatus

  belongs_to :job

  validates_presence_of :job_id, :status, :input

  after_save :check_for_job_completion

  # After saving a WorkUnit, its Job should check if it just became complete.
  def check_for_job_completion
    self.job.check_for_completion if complete?
  end

  # Mark this unit as having finished successfully, recording its output and
  # the time taken, and releasing the 'taken' lock.
  def finish(output, time_taken)
    update_attributes({
      :status => CloudCrowd::SUCCEEDED,
      :taken => false,
      :attempts => self.attempts + 1,
      :output => output,
      :time => time_taken
    })
  end

  # Mark this unit as having failed. May attempt a retry.
  # NOTE(review): this method shadows Kernel#fail within this class.
  def fail(output, time_taken)
    tries = self.attempts + 1
    return try_again if tries < CloudCrowd.config[:work_unit_retries]
    update_attributes({
      :status => CloudCrowd::FAILED,
      :taken => false,
      :attempts => tries,
      :output => output,
      :time => time_taken
    })
  end

  # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
  # Releases the unit back to the queue with an incremented attempt count.
  def try_again
    update_attributes({
      :taken => false,
      :attempts => self.attempts + 1
    })
  end

  # The JSON representation of a WorkUnit contains common elements of its job.
  def to_json
    {
      'id' => self.id,
      'job_id' => self.job_id,
      'input' => self.input,
      'attempts' => self.attempts,
      'action' => self.job.action,
      'options' => JSON.parse(self.job.options),
      'status' => self.status
    }.to_json
  end

end
@@ -0,0 +1,31 @@
1
module CloudCrowd
  # Shared status machinery mixed into Job and WorkUnit: a named scope and a
  # boolean predicate for each CloudCrowd status constant.
  module ModelStatus

    def self.included(klass)

      klass.class_eval do
        # Note that COMPLETE and INCOMPLETE are unions of other states.
        named_scope 'processing', :conditions => {:status => CloudCrowd::PROCESSING}
        named_scope 'succeeded', :conditions => {:status => CloudCrowd::SUCCEEDED}
        named_scope 'failed', :conditions => {:status => CloudCrowd::FAILED}
        named_scope 'splitting', :conditions => {:status => CloudCrowd::SPLITTING}
        named_scope 'merging', :conditions => {:status => CloudCrowd::MERGING}
        named_scope 'complete', :conditions => {:status => CloudCrowd::COMPLETE}
        named_scope 'incomplete', :conditions => {:status => CloudCrowd::INCOMPLETE}
      end

    end

    # Predicates for each single status; COMPLETE and INCOMPLETE are
    # collections of statuses, hence the include? checks.
    def processing?; self.status == CloudCrowd::PROCESSING; end
    def succeeded?; self.status == CloudCrowd::SUCCEEDED; end
    def failed?; self.status == CloudCrowd::FAILED; end
    def splitting?; self.status == CloudCrowd::SPLITTING; end
    def merging?; self.status == CloudCrowd::MERGING; end
    def complete?; CloudCrowd::COMPLETE.include?(self.status); end
    def incomplete?; CloudCrowd::INCOMPLETE.include?(self.status); end

  end
end

# Load the models that mix in ModelStatus.
require 'cloud_crowd/models/job'
require 'cloud_crowd/models/work_unit'
@@ -0,0 +1,29 @@
1
# This is the script that kicks off a single CloudCrowd::Daemon. Because the
# daemons don't load the entire rails stack, this file functions like a mini
# environment.rb, loading all the common gems that we need.

# CloudCrowd::App.root = File.expand_path(File.dirname(__FILE__) + '/../..') unless defined?(CloudCrowd::App.root)

# Standard Lib and Gems
require 'fileutils'
require 'rubygems'
require 'daemons'
require 'socket'
require 'yaml'
require 'json'
require 'rest_client'
require 'right_aws'

# Make sure the log directory exists before Daemons writes into it.
# (File.exist? -- the File.exists? alias is deprecated and was removed
# in Ruby 3.2.)
FileUtils.mkdir('log') unless File.exist?('log')

# Daemon/Worker Dependencies.
require "#{File.dirname(__FILE__)}/../cloud-crowd"

# Spawn the daemonized worker, keeping pidfile and logs under log/.
Daemons.run("#{CloudCrowd::App.root}/lib/cloud_crowd/daemon.rb", {
  :app_name => "cloud_crowd_worker",
  :dir_mode => :normal,
  :dir => 'log',
  :multiple => true,
  :backtrace => true,
  :log_output => true
})
@@ -0,0 +1,34 @@
1
# Complete schema for CloudCrowd.
ActiveRecord::Schema.define(:version => 1) do

  create_table "jobs", :force => true do |t|
    t.integer "status", :null => false
    t.text "inputs", :null => false       # JSON-encoded list of inputs.
    t.string "action", :null => false     # Name of the action to run.
    t.text "options", :null => false      # JSON-encoded options hash.
    t.text "outputs"                      # JSON-encoded outputs, set on completion.
    t.float "time"                        # Total seconds from creation to completion.
    t.string "callback_url"
    t.string "owner_email"
    t.integer "lock_version", :default => 0, :null => false # Optimistic locking.
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  create_table "work_units", :force => true do |t|
    t.integer "status", :null => false
    t.integer "job_id", :null => false
    t.text "input", :null => false
    t.integer "attempts", :default => 0, :null => false
    t.integer "lock_version", :default => 0, :null => false # Optimistic locking.
    t.boolean "taken", :default => false, :null => false    # Checked out by a worker?
    t.float "time"                        # Seconds the unit took to process.
    t.text "output"
    t.datetime "created_at"
    t.datetime "updated_at"
  end

  # Indexes backing the job lookup and the status/taken work queue queries.
  add_index "work_units", ["job_id"], :name => "index_work_units_on_job_id"
  add_index "work_units", ["status", "taken"], :name => "index_work_units_on_status_and_taken"

end
@@ -0,0 +1,115 @@
1
module CloudCrowd

  # A Worker fetches WorkUnits from the central server, runs the requested
  # action on them, and reports the result (success or failure) back.
  class Worker

    CENTRAL_URL = CloudCrowd.config[:central_server]
    RETRY_WAIT  = CloudCrowd.config[:worker_retry_wait]

    attr_reader :action

    # Spinning up a worker will create a new AssetStore with a persistent
    # connection to S3. This AssetStore gets passed into each action, for use
    # as it is run.
    def initialize
      @id = $$
      @hostname = Socket.gethostname
      @store = CloudCrowd::AssetStore.new
    end

    # Ask the central server for a new WorkUnit.
    def fetch_work_unit
      keep_trying_to "fetch a new work unit" do
        unit_json = RestClient.get("#{CENTRAL_URL}/work")
        return unless unit_json # No content means no work for us.
        @start_time = Time.now
        parse_work_unit unit_json
        log "fetched work unit for #{@action_name}"
      end
    end

    # Return output to the central server, marking the current work unit as done.
    def complete_work_unit(result)
      keep_trying_to "complete work unit" do
        data = completion_params.merge({:status => 'succeeded', :output => result})
        RestClient.put("#{CENTRAL_URL}/work/#{data[:id]}", data)
        log "finished #{@action_name} in #{data[:time]} seconds"
      end
    end

    # Mark the current work unit as failed, returning the exception to central.
    def fail_work_unit(exception)
      keep_trying_to "mark work unit as failed" do
        data = completion_params.merge({:status => 'failed', :output => exception.message})
        RestClient.put("#{CENTRAL_URL}/work/#{data[:id]}", data)
        log "failed #{@action_name} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
      end
    end

    # Keep retrying the given communication with central, sleeping RETRY_WAIT
    # seconds between attempts. Rescues StandardError rather than Exception so
    # that SystemExit/SignalException raised by the daemon's signal traps can
    # still terminate the worker (rescuing Exception made workers unkillable).
    def keep_trying_to(title)
      begin
        yield
      rescue StandardError => e
        log "failed to #{title} -- retry in #{RETRY_WAIT} seconds"
        log e.message
        log e.backtrace
        sleep RETRY_WAIT
        retry
      end
    end

    # Does this Worker have a job to do?
    def has_work?
      @action_name && @input && @options
    end

    # Executes the current work unit, catching standard errors as failures.
    # The unit's status selects which phase of the action to run.
    def run
      begin
        @action = CloudCrowd.actions(@action_name).new
        @action.configure(@status, @input, @options, @store)
        result = case @status
        when CloudCrowd::PROCESSING then @action.process
        when CloudCrowd::SPLITTING then @action.split
        when CloudCrowd::MERGING then @action.merge
        else raise "Work units must specify their status."
        end
        complete_work_unit(result)
      rescue StandardError => e
        fail_work_unit(e)
      ensure
        clear_work_unit
      end
    end


    private

    # Common parameters to send back to central, regardless of success or failure.
    def completion_params
      {:id => @options['work_unit_id'], :time => Time.now - @start_time}
    end

    # Extract our instance variables from a WorkUnit's JSON.
    def parse_work_unit(unit_json)
      unit = JSON.parse(unit_json)
      @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
      @options['job_id'] = unit['job_id']
      @options['work_unit_id'] = unit['id']
      @options['attempts'] ||= unit['attempts']
    end

    # Log a message to the daemon log. Includes PID for identification.
    def log(message)
      puts "Worker ##{@id}: #{message}"
    end

    # When we're done with a unit, clear out our ivars to make way for the next.
    # Also, remove all of the previous unit's temporary storage. Guard against
    # @action being nil (e.g. the action lookup itself raised), so the ensure
    # block doesn't mask the original error with a NoMethodError.
    def clear_work_unit
      @action.cleanup_work_directory if @action
      @action, @action_name, @input, @options, @start_time = nil, nil, nil, nil, nil
    end

  end

end
@@ -0,0 +1,32 @@
1
require 'test_helper'

# A Worker Daemon needs to be running to perform this integration test.
class FailingWorkUnitsTest < Test::Unit::TestCase

  should "retry work units when they fail" do
    browser = Rack::Test::Session.new(Rack::MockSession.new(CloudCrowd::App))

    # Enqueue a job with three inputs for the 'failure_testing' action.
    browser.post '/jobs', :json => {
      'action' => 'failure_testing',
      'inputs' => ['one', 'two', 'three'],
      'options' => {}
    }.to_json
    assert browser.last_response.ok?

    # Fail every unit one time short of the configured retry limit; they
    # should all be re-queued rather than marked FAILED.
    job = Job.last
    (CloudCrowd.config[:work_unit_retries] - 1).times do
      job.work_units.each {|unit| unit.fail('failed', 10) }
    end
    assert job.reload.work_units_remaining == 3
    # One final failure per unit should exhaust the retries and fail the job.
    job.work_units.reload.each_with_index do |unit, i|
      assert unit.processing?
      assert unit.attempts == CloudCrowd.config[:work_unit_retries] - 1
      unit.fail('failed', 10)
      assert unit.job.any_work_units_failed? if i == 0
    end
    assert job.reload.failed?
    assert job.work_units.count == 0
  end

end
@@ -0,0 +1,15 @@
1
# Machinist blueprints and Sham generators for the test fixtures.
Sham.url { Faker::Internet.domain_name + "/" + Faker::Internet.domain_word + ".jpg" }

Job.blueprint do
  status { CloudCrowd::PROCESSING }
  inputs { ['http://www.google.com/intl/en_ALL/images/logo.gif'].to_json }
  action { 'graphics_magick' }
  options { {}.to_json }
end

WorkUnit.blueprint do
  job { Job.make }
  status { CloudCrowd::PROCESSING }
  taken { false }
  input { Sham.url }
end
@@ -0,0 +1,10 @@
1
# Number of worker daemons to spawn.
:num_workers: 4
# Initial seconds to wait between polls when no work is available.
:default_worker_wait: 1
# Ceiling for the poll back-off, in seconds.
:max_worker_wait: 20
# Multiplier applied to the wait after each empty poll.
:worker_wait_multiplier: 1.3
# Seconds to wait before retrying failed communication with central.
:worker_retry_wait: 5
# Attempts before a WorkUnit is marked as failed.
:work_unit_retries: 3

# Location of the central CloudCrowd server.
:central_server: http://localhost:9173

# TODO: Add a somewhat oversized work unit timeout.
@@ -0,0 +1,6 @@
1
# Test database connection settings: local MySQL over a Unix socket,
# as the root user with a blank password (test environment only).
:adapter: mysql
:encoding: utf8
:username: root
:password:
:socket: /tmp/mysql.sock
:database: cloud_crowd_test
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+
3
+ here = File.dirname(__FILE__)
4
+ require File.expand_path(here + "/../lib/cloud-crowd")
5
+ CloudCrowd.configure(here + '/config/test_config.yml')
6
+ CloudCrowd.configure_database(here + '/config/test_database.yml')
7
+
8
+ require 'faker'
9
+ require 'sham'
10
+ require 'rack/test'
11
+ require 'shoulda/active_record'
12
+ require 'machinist/active_record'
13
+ require 'mocha'
14
+ require "#{CloudCrowd::App.root}/test/blueprints.rb"
15
+
16
+ class Test::Unit::TestCase
17
+ include CloudCrowd
18
+ end
@@ -0,0 +1,70 @@
1
require 'test_helper'

# Unit tests for the Job model's lifecycle: creation from requests,
# work-unit queueing, completion tracking, and status display.
class JobTest < Test::Unit::TestCase

  context "A CloudCrowd Job" do

    setup do
      @job = Job.make
      @unit = @job.work_units.first
    end

    subject { @job }

    should_have_many :work_units

    should_validate_presence_of :status, :inputs, :action, :options

    should "create all of its work units as soon as the job is created" do
      assert @job.work_units.count >= 1
      assert @job.work_units_remaining == 1
      assert @job.processing?
      assert @unit.processing?
      assert !@job.all_work_units_complete?
    end

    should "know its completion status" do
      assert !@job.all_work_units_complete?
      @unit.update_attributes(:status => CloudCrowd::SUCCEEDED, :output => 'hello')
      assert @job.reload.all_work_units_complete?
      assert @job.work_units_remaining == 0
      assert @job.outputs == "[\"hello\"]"
    end

    should "be able to create a job from a JSON request" do
      job = Job.create_from_request(JSON.parse(<<-EOS
        { "inputs" : ["one", "two", "three"],
          "action" : "graphics_magick",
          "owner_email" : "bob@example.com",
          "callback_url" : "http://example.com/callback" }
      EOS
      ))
      assert job.work_units.count == 3
      assert job.action == 'graphics_magick'
      assert job.action_class == GraphicsMagick
      assert job.callback_url == "http://example.com/callback"
    end

    should "create jobs with a SPLITTING status for actions that have a split method defined" do
      job = Job.create_from_request({'inputs' => ['1'], 'action' => 'pdf_to_images'})
      assert job.splittable?
      assert job.splitting?
    end

    should "fire a callback when a job has finished, successfully or not" do
      Job.any_instance.expects(:fire_callback)
      @job.work_units.first.finish('output', 10)
      assert @job.all_work_units_complete?
    end

    should "have a 'pretty' display of the Job's status" do
      assert @job.display_status == 'processing'
      @job.update_attribute(:status, CloudCrowd::FAILED)
      assert @job.display_status == 'failed'
      @job.update_attribute(:status, CloudCrowd::MERGING)
      assert @job.display_status == 'merging'
    end

  end

end