documentcloud-cloud-crowd 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
1
+ CloudCrowd.configure(ENV['CLOUD_CROWD_CONFIG'])
2
+
3
+ require 'cloud_crowd/worker'
4
+
5
module CloudCrowd

  # A CloudCrowd::Daemon, started by the Daemons gem, runs a CloudCrowd::Worker
  # in a loop, continually fetching and processing WorkUnits from the central
  # server. The Daemon backs off and pings central less frequently when there
  # isn't any work to be done, and speeds back up when there is.
  class Daemon

    # Polling intervals (as read from the CloudCrowd configuration) and the
    # factor by which the interval grows after each empty poll.
    DEFAULT_WAIT    = CloudCrowd.config[:default_worker_wait]
    MAX_WAIT        = CloudCrowd.config[:max_worker_wait]
    WAIT_MULTIPLIER = CloudCrowd.config[:worker_wait_multiplier]

    # Start with the default polling interval, build the Worker that does the
    # actual processing, and ask Ruby to exit the process on INT and TERM.
    #
    # SIGKILL is deliberately not trapped: it can never be caught by a
    # process, and attempting to install a handler for it raises on current
    # Ruby versions, which would prevent the daemon from starting at all.
    def initialize
      @wait_time = DEFAULT_WAIT
      @worker    = CloudCrowd::Worker.new
      %w(INT TERM).each { |signal_name| Signal.trap(signal_name, 'EXIT') }
    end

    # Loop forever, fetching WorkUnits.
    # TODO: Workers busy with their work units won't die until the unit has
    # been finished. This should probably be wrapped in an appropriately lengthy
    # timeout, or should be killable from the outside by terminating the thread.
    # In either case, nasty un-cleaned-up bits might be left behind.
    def run
      loop do
        @worker.fetch_work_unit
        if @worker.has_work?
          @worker.run
          # Work was available, so drop back to the fastest polling rate.
          @wait_time = DEFAULT_WAIT
          sleep 0.01 # So as to listen for incoming signals.
        else
          # Nothing to do: back off multiplicatively, capped at MAX_WAIT.
          @wait_time = [@wait_time * WAIT_MULTIPLIER, MAX_WAIT].min
          sleep @wait_time
        end
      end
    end

  end

end
47
+
48
+ CloudCrowd::Daemon.new.run
@@ -0,0 +1,15 @@
1
module CloudCrowd
  module Helpers
    # Request helpers that resolve the records named by the request params.
    module Resources

      # The Job identified by params[:job_id], memoized in @job.
      # Raises Sinatra::NotFound when no such Job exists.
      def current_job
        return @job if @job
        @job = Job.find_by_id(params[:job_id])
        raise Sinatra::NotFound unless @job
        @job
      end

      # The WorkUnit identified by params[:work_unit_id], memoized in
      # @work_unit. Raises Sinatra::NotFound when no such WorkUnit exists.
      def current_work_unit
        return @work_unit if @work_unit
        @work_unit = WorkUnit.find_by_id(params[:work_unit_id])
        raise Sinatra::NotFound unless @work_unit
        @work_unit
      end

    end
  end
end
@@ -0,0 +1,7 @@
1
module CloudCrowd
  module Helpers
    # Placeholder for URL helpers; no methods are defined here yet.
    module Urls; end
  end
end
@@ -0,0 +1,8 @@
1
+ require 'cloud_crowd/helpers/resources'
2
+ require 'cloud_crowd/helpers/urls'
3
+
4
module CloudCrowd
  # Umbrella module that bundles the individual helper modules together.
  module Helpers
    # Mixed in one at a time, last-listed first, which produces the same
    # ancestor order as `include Resources, Urls` (Resources ahead of Urls).
    # Rack::Utils is intentionally still left out.
    include Urls
    include Resources
  end
end
@@ -0,0 +1,129 @@
1
# A Job is a chunk of work that gets farmed out into many WorkUnits, to be
# processed in parallel by all the active CloudCrowd::Workers. A Job is defined
# by a list of inputs (usually public urls to files), an action (the name of a
# script that CloudCrowd knows how to run) and, eventually, a corresponding
# list of outputs.
class Job < ActiveRecord::Base
  include CloudCrowd::ModelStatus

  has_many :work_units, :dependent => :destroy

  validates_presence_of :status, :inputs, :action, :options

  # Create a Job from the parsed hash of an incoming JSON or XML request, and
  # add it to the queue. Inputs and options are stored as JSON strings;
  # missing options default to an empty hash.
  # TODO: Add XML support.
  def self.create_from_request(h)
    requested_options = h['options'] || {}
    create(
      :inputs       => h['inputs'].to_json,
      :action       => h['action'],
      :options      => requested_options.to_json,
      :owner_email  => h['owner_email'],
      :callback_url => h['callback_url']
    )
  end

  # Callback: once the Job row exists, queue a WorkUnit for each input.
  def after_create
    initial_inputs = JSON.parse(inputs)
    queue_for_workers(initial_inputs)
  end

  # Callback: a new Job starts out SPLITTING when its action can split, and
  # PROCESSING otherwise.
  def before_validation_on_create
    self.status = if splittable?
      CloudCrowd::SPLITTING
    else
      CloudCrowd::PROCESSING
    end
  end

  # After work units are marked successful, we check to see if all of them
  # have finished. If so, the Job moves on to its next phase: either new
  # WorkUnits are queued for that phase, or the Job is finished and its
  # callback is fired. Returns nil while units are still outstanding, and the
  # Job itself otherwise.
  def check_for_completion
    return unless all_work_units_complete?
    transition_to_next_phase
    collected = gather_outputs_from_work_units

    if complete?
      self.outputs = collected.to_json
      self.time    = Time.now - created_at
    end
    save

    if processing?
      # Each collected output is itself a JSON list of inputs for this phase.
      queue_for_workers(collected.map { |out| JSON.parse(out) }.flatten)
    elsif merging?
      # The merge phase gets one WorkUnit holding every output.
      queue_for_workers(collected.to_json)
    else
      fire_callback
    end
    self
  end

  # Transition from the current phase to the next one. Any failed WorkUnit
  # fails the whole Job; otherwise splitting leads to processing, processing
  # leads to merging (when the action can merge), and anything else succeeds.
  def transition_to_next_phase
    self.status =
      if any_work_units_failed?
        CloudCrowd::FAILED
      elsif splitting?
        CloudCrowd::PROCESSING
      elsif should_merge?
        CloudCrowd::MERGING
      else
        CloudCrowd::SUCCEEDED
      end
  end

  # If a callback_url is defined, post the Job's JSON to it upon completion.
  # RestClient failures are reported on stdout and otherwise ignored.
  def fire_callback
    return unless callback_url
    RestClient.post(callback_url, {:job => to_json})
  rescue RestClient::Exception
    puts "Failed to fire job callback. Hmmm, what should happen here?"
  end

  # Cleaning up after a job will remove all of its files from S3.
  def cleanup
    store = CloudCrowd::AssetStore.new
    store.cleanup_job(self)
  end

  # Have all of the WorkUnits finished? We could trade reads for writes here
  # by keeping a completed_count on the Job itself.
  def all_work_units_complete?
    !(work_units.incomplete.count > 0)
  end

  # Have any of the WorkUnits failed?
  def any_work_units_failed?
    failed_total = work_units.failed.count
    failed_total > 0
  end

  # Does an instance of this Job's action respond to :split?
  def splittable?
    action_class.new.respond_to?(:split)
  end

  # A Job should merge when it is currently processing and an instance of its
  # action responds to :merge.
  def should_merge?
    return false unless processing?
    action_class.new.respond_to?(:merge)
  end

  # The action class registered with CloudCrowd under this Job's action name.
  def action_class
    CloudCrowd.actions(action)
  end

  # Collect the outputs of all complete WorkUnits, then destroy those units.
  def gather_outputs_from_work_units
    collected = work_units.complete.map { |unit| unit.output }
    work_units.complete.destroy_all
    collected
  end

  # The display form of this Job's status, as provided by CloudCrowd.
  def display_status
    CloudCrowd.display_status(status)
  end

  # How many of this Job's WorkUnits are still incomplete?
  def work_units_remaining
    work_units.incomplete.count
  end

  # A JSON representation of this job includes the statuses of its component
  # WorkUnits, as well as any completed outputs and the time taken, when
  # those are present. The opts argument is accepted but not used.
  def to_json(opts={})
    atts = {
      'id'                   => self.id,
      'status'               => display_status,
      'work_units_remaining' => work_units_remaining
    }
    atts['output'] = JSON.parse(outputs) if outputs
    atts['time']   = time if time
    atts.to_json
  end

  # When starting a new job, or moving to a new stage, split up the inputs
  # into WorkUnits, and queue them. Each WorkUnit is created with the Job's
  # current status.
  def queue_for_workers(input)
    unit_inputs = [input].flatten
    unit_inputs.each do |unit_input|
      WorkUnit.create(:job => self, :input => unit_input, :status => status)
    end
  end

end
@@ -0,0 +1,62 @@
1
# A WorkUnit is an atomic chunk of work from a job, processing a single input
# through a single action. All WorkUnits receive the same options.
class WorkUnit < ActiveRecord::Base
  include CloudCrowd::ModelStatus

  belongs_to :job

  validates_presence_of :job_id, :status, :input

  after_save :check_for_job_completion

  # After saving a WorkUnit, its Job should check whether it has just become
  # complete. Only complete units trigger the check.
  def check_for_job_completion
    self.job.check_for_completion if complete?
  end

  # Mark this unit as having finished successfully, recording its output and
  # the time it took, and counting the attempt.
  def finish(output, time_taken)
    update_attributes({
      :status   => CloudCrowd::SUCCEEDED,
      :taken    => false,
      :attempts => self.attempts + 1,
      :output   => output,
      :time     => time_taken
    })
  end

  # Mark this unit as having failed. While the attempt count is still below
  # the configured :work_unit_retries, the unit is released for another try
  # instead of being marked FAILED.
  def fail(output, time_taken)
    tries = self.attempts + 1
    return try_again if tries < CloudCrowd.config[:work_unit_retries]
    update_attributes({
      :status   => CloudCrowd::FAILED,
      :taken    => false,
      :attempts => tries,
      :output   => output,
      :time     => time_taken
    })
  end

  # Ever tried. Ever failed. No matter. Try again. Fail again. Fail better.
  # Releases the unit (taken => false) and counts the attempt.
  def try_again
    update_attributes({
      :taken    => false,
      :attempts => self.attempts + 1
    })
  end

  # The JSON representation of a WorkUnit contains common elements of its job
  # (the action name and the parsed options).
  #
  # Any arguments are accepted and ignored, so that JSON generators which
  # pass state or options along (for example when serializing a collection of
  # WorkUnits) do not fail with an ArgumentError. This also matches
  # Job#to_json, which takes an optional argument.
  def to_json(*args)
    {
      'id'       => self.id,
      'job_id'   => self.job_id,
      'input'    => self.input,
      'attempts' => self.attempts,
      'action'   => self.job.action,
      'options'  => JSON.parse(self.job.options),
      'status'   => self.status
    }.to_json
  end

end
@@ -0,0 +1,31 @@
1
module CloudCrowd
  # Mixin that gives a model one named scope and one predicate method per
  # status. The including class must provide `named_scope` and `status`.
  module ModelStatus

    # Register the status scopes on the including class. The status constants
    # are looked up here, at include time.
    def self.included(klass)
      klass.class_eval do
        # Note that COMPLETE and INCOMPLETE are unions of other states.
        [
          ['processing', CloudCrowd::PROCESSING],
          ['succeeded',  CloudCrowd::SUCCEEDED],
          ['failed',     CloudCrowd::FAILED],
          ['splitting',  CloudCrowd::SPLITTING],
          ['merging',    CloudCrowd::MERGING],
          ['complete',   CloudCrowd::COMPLETE],
          ['incomplete', CloudCrowd::INCOMPLETE]
        ].each do |scope_name, status_value|
          named_scope scope_name, :conditions => {:status => status_value}
        end
      end
    end

    # Single-state predicates: true when the status equals that state.
    def processing?
      status == CloudCrowd::PROCESSING
    end

    def succeeded?
      status == CloudCrowd::SUCCEEDED
    end

    def failed?
      status == CloudCrowd::FAILED
    end

    def splitting?
      status == CloudCrowd::SPLITTING
    end

    def merging?
      status == CloudCrowd::MERGING
    end

    # Union predicates: true when the status is a member of the union.
    def complete?
      CloudCrowd::COMPLETE.include?(status)
    end

    def incomplete?
      CloudCrowd::INCOMPLETE.include?(status)
    end

  end
end
29
+
30
+ require 'cloud_crowd/models/job'
31
+ require 'cloud_crowd/models/work_unit'
@@ -0,0 +1,29 @@
1
+ # This is the script that kicks off a single CloudCrowd::Daemon. Because the
2
+ # daemons don't load the entire rails stack, this file functions like a mini
3
+ # environment.rb, loading all the common gems that we need.
4
+
5
+ # CloudCrowd::App.root = File.expand_path(File.dirname(__FILE__) + '/../..') unless defined?(CloudCrowd::App.root)
6
+
7
+ # Standard Lib and Gems
8
+ require 'fileutils'
9
+ require 'rubygems'
10
+ require 'daemons'
11
+ require 'socket'
12
+ require 'yaml'
13
+ require 'json'
14
+ require 'rest_client'
15
+ require 'right_aws'
16
+
17
# Make sure the log directory exists. File.exist? is used because the
# File.exists? alias is deprecated and is no longer available on current
# Ruby versions.
FileUtils.mkdir('log') unless File.exist?('log')
18
+
19
+ # Daemon/Worker Dependencies.
20
+ require "#{File.dirname(__FILE__)}/../cloud-crowd"
21
+
22
# Hand the daemon script over to the Daemons gem, with pid/log files kept in
# the 'log' directory.
daemon_script  = "#{CloudCrowd::App.root}/lib/cloud_crowd/daemon.rb"
daemon_options = {
  :app_name   => "cloud_crowd_worker",
  :dir_mode   => :normal,
  :dir        => 'log',
  :multiple   => true,
  :backtrace  => true,
  :log_output => true
}
Daemons.run(daemon_script, daemon_options)
@@ -0,0 +1,34 @@
1
# Complete schema for CloudCrowd.
ActiveRecord::Schema.define(:version => 1) do

  # One row per Job; inputs, options and outputs are text columns.
  create_table :jobs, :force => true do |t|
    t.integer  :status,       :null => false
    t.text     :inputs,       :null => false
    t.string   :action,       :null => false
    t.text     :options,      :null => false
    t.text     :outputs
    t.float    :time
    t.string   :callback_url
    t.string   :owner_email
    t.integer  :lock_version, :default => 0, :null => false
    t.datetime :created_at
    t.datetime :updated_at
  end

  # One row per WorkUnit, tied to its Job through job_id.
  create_table :work_units, :force => true do |t|
    t.integer  :status,       :null => false
    t.integer  :job_id,       :null => false
    t.text     :input,        :null => false
    t.integer  :attempts,     :default => 0,     :null => false
    t.integer  :lock_version, :default => 0,     :null => false
    t.boolean  :taken,        :default => false, :null => false
    t.float    :time
    t.text     :output
    t.datetime :created_at
    t.datetime :updated_at
  end

  # Indexes on the job foreign key and on the (status, taken) pair.
  add_index :work_units, [:job_id],         :name => "index_work_units_on_job_id"
  add_index :work_units, [:status, :taken], :name => "index_work_units_on_status_and_taken"

end
@@ -0,0 +1,115 @@
1
module CloudCrowd

  # A Worker asks the central server for a WorkUnit, runs the matching action
  # on it, and reports the result (or the failure) back to central.
  class Worker

    # Base URL of the central server, and the number of seconds to wait
    # before retrying a failed request to it (both read from configuration).
    CENTRAL_URL = CloudCrowd.config[:central_server]
    RETRY_WAIT  = CloudCrowd.config[:worker_retry_wait]

    attr_reader :action

    # Spinning up a worker will create a new AssetStore with a persistent
    # connection to S3. This AssetStore gets passed into each action, for use
    # as it is run. The worker identifies itself in logs by its process id.
    def initialize
      @id       = $$
      @hostname = Socket.gethostname
      @store    = CloudCrowd::AssetStore.new
    end

    # Ask the central server for a new WorkUnit. When central has nothing for
    # us, the worker's state is left untouched (so has_work? stays false).
    def fetch_work_unit
      keep_trying_to "fetch a new work unit" do
        unit_json = RestClient.get("#{CENTRAL_URL}/work")
        return unless unit_json # No content means no work for us.
        @start_time = Time.now
        parse_work_unit unit_json
        log "fetched work unit for #{@action_name}"
      end
    end

    # Return output to the central server, marking the current work unit as done.
    def complete_work_unit(result)
      keep_trying_to "complete work unit" do
        data = completion_params.merge({:status => 'succeeded', :output => result})
        RestClient.put("#{CENTRAL_URL}/work/#{data[:id]}", data)
        log "finished #{@action_name} in #{data[:time]} seconds"
      end
    end

    # Mark the current work unit as failed, returning the exception to central.
    def fail_work_unit(exception)
      keep_trying_to "mark work unit as failed" do
        data = completion_params.merge({:status => 'failed', :output => exception.message})
        RestClient.put("#{CENTRAL_URL}/work/#{data[:id]}", data)
        log "failed #{@action_name} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
      end
    end

    # Run the given block, retrying it every RETRY_WAIT seconds for as long
    # as it raises an ordinary error. `title` describes the operation in the
    # log output.
    #
    # Only StandardError is retried. Process-level exceptions such as
    # SystemExit and SignalException (raised when the daemon is told to shut
    # down) are allowed to propagate; retrying them would leave the worker
    # impossible to stop while the central server is unreachable.
    def keep_trying_to(title)
      begin
        yield
      rescue StandardError => e
        log "failed to #{title} -- retry in #{RETRY_WAIT} seconds"
        log e.message
        log e.backtrace
        sleep RETRY_WAIT
        retry
      end
    end

    # Does this Worker have a job to do?
    def has_work?
      @action_name && @input && @options
    end

    # Executes the current work unit, reporting any exception to central as a
    # failure. A shutdown request (SystemExit or a SignalException) that
    # arrives while the unit is running is reported as a failure as well, and
    # is then re-raised so that the process actually terminates instead of
    # silently carrying on with the next unit.
    def run
      begin
        @action = CloudCrowd.actions(@action_name).new
        @action.configure(@status, @input, @options, @store)
        result = case @status
        when CloudCrowd::PROCESSING then @action.process
        when CloudCrowd::SPLITTING  then @action.split
        when CloudCrowd::MERGING    then @action.merge
        else raise "Work units must specify their status."
        end
        complete_work_unit(result)
      rescue Exception => e
        fail_work_unit(e)
        raise if e.is_a?(SystemExit) || e.is_a?(SignalException)
      ensure
        clear_work_unit
      end
    end


    private

    # Common parameters to send back to central, regardless of success or failure.
    def completion_params
      {:id => @options['work_unit_id'], :time => Time.now - @start_time}
    end

    # Extract our instance variables from a WorkUnit's JSON. The job id, the
    # work unit id and (unless the options already carry one) the attempt
    # count are folded into the options hash.
    def parse_work_unit(unit_json)
      unit = JSON.parse(unit_json)
      @action_name, @input, @options, @status = unit['action'], unit['input'], unit['options'], unit['status']
      @options['job_id'] = unit['job_id']
      @options['work_unit_id'] = unit['id']
      @options['attempts'] ||= unit['attempts']
    end

    # Log a message to the daemon log. Includes PID for identification.
    def log(message)
      puts "Worker ##{@id}: #{message}"
    end

    # When we're done with a unit, clear out our ivars to make way for the next.
    # Also, remove all of the previous unit's temporary storage. The action
    # may be nil here if looking it up or instantiating it failed, in which
    # case there is no work directory to clean.
    def clear_work_unit
      @action.cleanup_work_directory if @action
      @action, @action_name, @input, @options, @start_time = nil, nil, nil, nil, nil
    end

  end

end
@@ -0,0 +1,32 @@
1
+ require 'test_helper'
2
+
3
# A Worker Daemon needs to be running to perform this integration test.
class FailingWorkUnitsTest < Test::Unit::TestCase

  should "retry work units when they fail" do
    session   = Rack::MockSession.new(CloudCrowd::App)
    browser   = Rack::Test::Session.new(session)
    max_tries = CloudCrowd.config[:work_unit_retries]

    request = {
      'action'  => 'failure_testing',
      'inputs'  => ['one', 'two', 'three'],
      'options' => {}
    }
    browser.post '/jobs', :json => request.to_json
    assert browser.last_response.ok?

    # Fail every unit until each has exactly one attempt left.
    job = Job.last
    (max_tries - 1).times do
      job.work_units.each { |work_unit| work_unit.fail('failed', 10) }
    end
    assert job.reload.work_units_remaining == 3

    # The final failure of each unit should stick.
    job.work_units.reload.each_with_index do |work_unit, index|
      assert work_unit.processing?
      assert work_unit.attempts == max_tries - 1
      work_unit.fail('failed', 10)
      assert work_unit.job.any_work_units_failed? if index == 0
    end
    assert job.reload.failed?
    assert job.work_units.count == 0
  end

end
32
+
@@ -0,0 +1,15 @@
1
# A fake image location: "<domain name>/<domain word>.jpg".
Sham.url do
  "#{Faker::Internet.domain_name}/#{Faker::Internet.domain_word}.jpg"
end

# Default Job: a processing graphics_magick job with a single input image.
Job.blueprint do
  status  { CloudCrowd::PROCESSING }
  inputs  { ['http://www.google.com/intl/en_ALL/images/logo.gif'].to_json }
  action  { 'graphics_magick' }
  options { {}.to_json }
end

# Default WorkUnit: an untaken, processing unit belonging to a fresh Job.
WorkUnit.blueprint do
  job    { Job.make }
  status { CloudCrowd::PROCESSING }
  taken  { false }
  input  { Sham.url }
end
@@ -0,0 +1,10 @@
1
+ :num_workers: 4
2
+ :default_worker_wait: 1
3
+ :max_worker_wait: 20
4
+ :worker_wait_multiplier: 1.3
5
+ :worker_retry_wait: 5
6
+ :work_unit_retries: 3
7
+
8
+ :central_server: http://localhost:9173
9
+
10
+ # TODO: Add a somewhat oversized work unit timeout.
@@ -0,0 +1,6 @@
1
+ :adapter: mysql
2
+ :encoding: utf8
3
+ :username: root
4
+ :password:
5
+ :socket: /tmp/mysql.sock
6
+ :database: cloud_crowd_test
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+
3
+ here = File.dirname(__FILE__)
4
+ require File.expand_path(here + "/../lib/cloud-crowd")
5
+ CloudCrowd.configure(here + '/config/test_config.yml')
6
+ CloudCrowd.configure_database(here + '/config/test_database.yml')
7
+
8
+ require 'faker'
9
+ require 'sham'
10
+ require 'rack/test'
11
+ require 'shoulda/active_record'
12
+ require 'machinist/active_record'
13
+ require 'mocha'
14
+ require "#{CloudCrowd::App.root}/test/blueprints.rb"
15
+
16
# Mix CloudCrowd into every test case, so tests can reference its constants
# and methods without the namespace.
Test::Unit::TestCase.send(:include, CloudCrowd)
@@ -0,0 +1,70 @@
1
+ require 'test_helper'
2
+
3
# Unit tests for the Job model and its interplay with its WorkUnits.
class JobTest < Test::Unit::TestCase

  context "A CloudCrowd Job" do

    setup do
      @job  = Job.make
      @unit = @job.work_units.first
    end

    subject { @job }

    should_have_many :work_units

    should_validate_presence_of :status, :inputs, :action, :options

    should "create all of its work units as soon as the job is created" do
      assert @job.work_units.count >= 1
      assert @job.work_units_remaining == 1
      assert @job.processing?
      assert @unit.processing?
      assert !@job.all_work_units_complete?
    end

    should "know its completion status" do
      assert !@job.all_work_units_complete?
      @unit.update_attributes(:status => CloudCrowd::SUCCEEDED, :output => 'hello')
      assert @job.reload.all_work_units_complete?
      assert @job.work_units_remaining == 0
      assert @job.outputs == "[\"hello\"]"
    end

    should "be able to create a job from a JSON request" do
      request_json = <<-EOS
        { "inputs"       : ["one", "two", "three"],
          "action"       : "graphics_magick",
          "owner_email"  : "bob@example.com",
          "callback_url" : "http://example.com/callback" }
      EOS
      job = Job.create_from_request(JSON.parse(request_json))
      assert job.work_units.count == 3
      assert job.action == 'graphics_magick'
      assert job.action_class == GraphicsMagick
      assert job.callback_url == "http://example.com/callback"
    end

    should "create jobs with a SPLITTING status for actions that have a split method defined" do
      request = {'inputs' => ['1'], 'action' => 'pdf_to_images'}
      job = Job.create_from_request(request)
      assert job.splittable?
      assert job.splitting?
    end

    should "fire a callback when a job has finished, successfully or not" do
      Job.any_instance.expects(:fire_callback)
      first_unit = @job.work_units.first
      first_unit.finish('output', 10)
      assert @job.all_work_units_complete?
    end

    should "have a 'pretty' display of the Job's status" do
      assert @job.display_status == 'processing'
      @job.update_attribute(:status, CloudCrowd::FAILED)
      assert @job.display_status == 'failed'
      @job.update_attribute(:status, CloudCrowd::MERGING)
      assert @job.display_status == 'merging'
    end

  end

end