documentcloud-cloud-crowd 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,52 +1,54 @@
1
1
  module CloudCrowd
2
2
 
3
- # Base CloudCrowd::Action class. Override this with your custom action steps.
4
- #
5
- # Public API to CloudCrowd::Action subclasses:
6
- # +input+, +input_path+, +file_name+, +work_directory+, +options+, +save+
7
- #
8
- # CloudCrowd::Actions must implement a +process+ method, which must return a
3
+ # As you write your custom actions, have them inherit from CloudCrowd::Action.
4
+ # All actions must implement a +process+ method, which should return a
9
5
  # JSON-serializeable object that will be used as the output for the work unit.
6
+ # See the default actions for examples.
7
+ #
10
8
  # Optionally, actions may define +split+ and +merge+ methods to do mapping
11
- # and reducing around the input.
12
- # +split+ must return an array of inputs.
13
- # +merge+ must return the output for the job.
14
- # All actions run inside of their individual +work_directory+.
9
+ # and reducing around the +input+. +split+ should return an array of URLs --
10
+ # to be mapped into WorkUnits and processed in parallel. In the +merge+ step,
11
+ # +input+ will be an array of all the resulting outputs from calling process.
12
+ #
13
+ # All actions have use of an individual +work_directory+, for scratch files,
14
+ # and spend their duration inside of it, so relative paths work well.
15
15
  class Action
16
16
 
17
17
  attr_reader :input, :input_path, :file_name, :options, :work_directory
18
18
 
19
- # Configuring a new Action sets up all of the read-only variables that
19
+ # Initializing an Action sets up all of the read-only variables that
20
20
  # form the bulk of the API for action subclasses. (Paths to read from and
21
- # write to). It creates the work_directory and moves into it.
22
- def configure(status, input, options, store)
21
+ # write to). It creates the +work_directory+ and moves into it.
22
+ # If we're not merging multiple results, it downloads the input file into
23
+ # the +work_directory+ before starting.
24
+ def initialize(status, input, options, store)
23
25
  @input, @options, @store = input, options, store
24
26
  @job_id, @work_unit_id = options['job_id'], options['work_unit_id']
25
27
  @work_directory = File.expand_path(File.join(@store.temp_storage_path, storage_prefix))
26
28
  FileUtils.mkdir_p(@work_directory) unless File.exists?(@work_directory)
27
29
  Dir.chdir @work_directory
28
30
  unless status == MERGING
29
- @input_path = File.join(@work_directory, File.basename(@input))
31
+ @input_path = File.join(@work_directory, safe_filename(@input))
30
32
  @file_name = File.basename(@input_path, File.extname(@input_path))
31
33
  download(@input, @input_path)
32
34
  end
33
35
  end
34
36
 
35
- # Each CloudCrowd::Action must implement a +process+ method.
37
+ # Each Action subclass must implement a +process+ method, overriding this.
36
38
  def process
37
39
  raise NotImplementedError.new("CloudCrowd::Actions must override 'process' with their own processing code.")
38
40
  end
39
41
 
40
- # Download a file to the specified path using curl.
42
+ # Download a file to the specified path with *curl*.
41
43
  def download(url, path)
42
- `curl -s "#{url}" > #{path}`
44
+ `curl -s "#{url}" > "#{path}"`
43
45
  path
44
46
  end
45
47
 
46
- # Takes a local filesystem path, and returns the public (or authenticated)
47
- # url on S3 where the file was saved.
48
+ # Takes a local filesystem path, saves the file to S3, and returns the
49
+ # public (or authenticated) url on S3 where the file can be accessed.
48
50
  def save(file_path)
49
- save_path = File.join(s3_storage_path, File.basename(file_path))
51
+ save_path = File.join(storage_prefix, File.basename(file_path))
50
52
  @store.save(file_path, save_path)
51
53
  return @store.url(save_path)
52
54
  end
@@ -61,6 +63,13 @@ module CloudCrowd
61
63
 
62
64
  private
63
65
 
66
+ # Convert an unsafe URL into a filesystem-friendly filename.
67
+ def safe_filename(url)
68
+ ext = File.extname(url)
69
+ name = File.basename(url).gsub(/%\d+/, '-').gsub(/[^a-zA-Z0-9_\-.]/, '')
70
+ File.basename(name, ext).gsub('.', '-') + ext
71
+ end
72
+
64
73
  # The directory prefix to use for both local and S3 storage.
65
74
  # [action_name]/job_[job_id]/unit_[work_unit_id]
66
75
  def storage_prefix
@@ -71,10 +80,6 @@ module CloudCrowd
71
80
  @storage_prefix ||= File.join(path_parts)
72
81
  end
73
82
 
74
- def s3_storage_path
75
- @s3_storage_path ||= storage_prefix
76
- end
77
-
78
83
  end
79
84
 
80
85
  end
@@ -1,5 +1,20 @@
1
1
  module CloudCrowd
2
2
 
3
+ # The main CloudCrowd (Sinatra) application. The actions are:
4
+ #
5
+ # == Admin
6
+ # [get /] Render the admin console, with a progress meter for running jobs.
7
+ # [get /jobs] Get the combined JSON of every active job in the queue.
8
+ # [get /heartbeat] Returns 200 OK to let monitoring tools know the server's up.
9
+ #
10
+ # == Public API
11
+ # [post /jobs] Begin a new Job. Post with a JSON representation of the job-to-be. (see examples).
12
+ # [get /jobs/:job_id] Check the status of a Job. Response includes output, if the Job has finished.
13
+ # [delete /jobs/:job_id] Clean up a Job when you're done downloading the results. Removes all intermediate files.
14
+ #
15
+ # == Internal Workers API
16
+ # [post /work] Dequeue the next WorkUnit, and hand it off to the worker.
17
+ # [put /work/:unit_id] Mark a finished WorkUnit as completed or failed, with results.
3
18
  class App < Sinatra::Default
4
19
 
5
20
  set :root, ROOT
@@ -15,28 +30,46 @@ module CloudCrowd
15
30
  login_required if CloudCrowd.config[:use_http_authentication]
16
31
  end
17
32
 
33
+ # Render the admin console.
34
+ get '/' do
35
+ erb :index
36
+ end
37
+
38
+ # Get the JSON for every active job in the queue.
39
+ get '/jobs' do
40
+ json Job.incomplete
41
+ end
42
+
43
+ # To monitor the central server with Monit, God, Nagios, or another
44
 + # monitoring tool, you can hit /heartbeat to make sure the server is up.
45
+ get '/heartbeat' do
46
+ "buh-bump"
47
+ end
48
+
49
+ # PUBLIC API:
50
+
18
51
  # Start a new job. Accepts a JSON representation of the job-to-be.
19
52
  post '/jobs' do
20
- Job.create_from_request(JSON.parse(params[:json])).to_json
53
+ json Job.create_from_request(JSON.parse(params[:job]))
21
54
  end
22
55
 
23
56
  # Check the status of a job, returning the output if finished, and the
24
57
  # number of work units remaining otherwise.
25
58
  get '/jobs/:job_id' do
26
- current_job.to_json
59
+ json current_job
27
60
  end
28
61
 
29
62
  # Cleans up a Job's saved S3 files. Delete a Job after you're done
30
63
  # downloading the results.
31
64
  delete '/jobs/:job_id' do
32
65
  current_job.cleanup
33
- ''
66
+ json nil
34
67
  end
35
68
 
36
69
  # Internal method for worker daemons to fetch the work unit at the front
37
70
  # of the queue. Work unit is marked as taken and handed off to the worker.
38
- get '/work' do
39
- dequeue_work_unit
71
+ post '/work' do
72
+ json dequeue_work_unit
40
73
  end
41
74
 
42
75
  # When workers are done with their unit, either successfully or in failure,
@@ -47,22 +80,16 @@ module CloudCrowd
47
80
  case params[:status]
48
81
  when 'succeeded'
49
82
  current_work_unit.finish(params[:output], params[:time])
50
- dequeue_work_unit
83
+ json dequeue_work_unit
51
84
  when 'failed'
52
85
  current_work_unit.fail(params[:output], params[:time])
53
- dequeue_work_unit(1)
86
+ json dequeue_work_unit(1)
54
87
  else
55
88
  error(500, "Completing a work unit must specify status.")
56
89
  end
57
90
  end
58
91
  end
59
92
 
60
- # To monitor the central server with Monit, God, Nagios, or another
61
- # monitoring tool, you can hit /heartbeat to make sure.
62
- get '/heartbeat' do
63
- "buh-bump"
64
- end
65
-
66
93
  end
67
94
 
68
95
  end
@@ -2,24 +2,31 @@ require 'tmpdir'
2
2
 
3
3
  module CloudCrowd
4
4
 
5
- # The CloudCrowd::AssetStore should provide a common API for stashing and retrieving
6
- # assets via URLs, in production this will be S3 but in development it may
7
- # be the filesystem or /tmp.
5
+ # The AssetStore provides a common API for storing files and returning URLs
6
+ # that can access them. In production this will be S3 but in development
7
+ # it may be the filesystem.
8
+ #
9
+ # You shouldn't need to use the AssetStore directly -- Action's +download+
10
+ # and +save+ methods use it behind the scenes.
8
11
  class AssetStore
9
12
  include FileUtils
10
13
 
14
 + # Creating an AssetStore will determine whether to save private or public
15
+ # files on S3, depending on the value of <tt>use_s3_authentication</tt> in
16
+ # <tt>config.yml</tt>.
11
17
  def initialize
12
18
  @use_auth = CloudCrowd.config[:use_s3_authentication]
13
19
  mkdir_p temp_storage_path unless File.exists? temp_storage_path
14
20
  end
15
21
 
16
- # Path to CloudCrowd's temporary local storage.
22
+ # Get the path to CloudCrowd's temporary local storage. All actions run
23
+ # in subdirectories of this.
17
24
  def temp_storage_path
18
25
  "#{Dir.tmpdir}/cloud_crowd_tmp"
19
26
  end
20
27
 
21
- # Copy a finished file from our local storage to S3. Save it publicly if
22
- # we're not configured to use S3 authentication.
28
+ # Copy a finished file from our local storage to S3. Save it publicly unless
29
+ # we're configured to use S3 authentication.
23
30
  def save(local_path, save_path)
24
31
  ensure_s3_connection
25
32
  permission = @use_auth ? 'private' : 'public-read'
@@ -14,6 +14,8 @@ module CloudCrowd
14
14
 
15
15
  # Command-line banner for the usage message.
16
16
  BANNER = <<-EOS
17
+ CloudCrowd is a Ruby & AWS batch processing system, MapReduce style.
18
+
17
19
  Usage: crowd COMMAND OPTIONS
18
20
 
19
21
  COMMANDS:
@@ -45,6 +47,7 @@ OPTIONS:
45
47
  def run_console
46
48
  require 'irb'
47
49
  require 'irb/completion'
50
+ require 'pp'
48
51
  load_code
49
52
  connect_to_database
50
53
  IRB.start
@@ -60,9 +63,9 @@ OPTIONS:
60
63
  require 'rubygems'
61
64
  rackup_path = File.expand_path("#{@options[:config_path]}/config.ru")
62
65
  if Gem.available? 'thin'
63
- exec "thin -e production -p #{@options[:port]} -R #{rackup_path} start"
66
+ exec "thin -e #{@options[:environment]} -p #{@options[:port]} -R #{rackup_path} start"
64
67
  else
65
- exec "rackup -E production -p #{@options[:port]} #{rackup_path}"
68
+ exec "rackup -E #{@options[:environment]} -p #{@options[:port]} #{rackup_path}"
66
69
  end
67
70
  end
68
71
 
@@ -127,7 +130,7 @@ OPTIONS:
127
130
 
128
131
  # Print `crowd` usage.
129
132
  def usage
130
- puts @option_parser
133
+ puts "\n#{@option_parser}\n"
131
134
  end
132
135
 
133
136
 
@@ -141,11 +144,11 @@ OPTIONS:
141
144
  found ? @config_dir = true : config_not_found
142
145
  end
143
146
 
144
- # Parse all options for all actions.
145
- # TODO: Think about parsing options per sub-command separately.
147
+ # Parse all options for all commands.
146
148
  def parse_options
147
149
  @options = {
148
150
  :port => 9173,
151
+ :environment => 'production',
149
152
  :config_path => ENV['CLOUD_CROWD_CONFIG'] || '.'
150
153
  }
151
154
  @option_parser = OptionParser.new do |opts|
@@ -158,6 +161,9 @@ OPTIONS:
158
161
  opts.on('-p', '--port PORT', 'central server port number') do |port_num|
159
162
  @options[:port] = port_num
160
163
  end
164
+ opts.on('-e', '--environment ENV', 'Sinatra environment (code reloading)') do |env|
165
+ @options[:environment] = env
166
+ end
161
167
  opts.on_tail('-v', '--version', 'show version') do
162
168
  load_code
163
169
  puts "CloudCrowd version #{VERSION}"
@@ -6,8 +6,13 @@ module CloudCrowd
6
6
 
7
7
  # A CloudCrowd::Daemon, started by the Daemons gem, runs a CloudCrowd::Worker in
8
8
  # a loop, continually fetching and processing WorkUnits from the central
9
- # server. The Daemon backs off and pings central less frequently when there
10
- # isn't any work to be done, and speeds back up when there is.
9
+ # server.
10
+ #
11
+ # The Daemon backs off and pings the central server less frequently
12
+ # when there isn't any work to be done, and speeds back up when there is.
13
+ #
14
+ # The `crowd` command responds to all the usual methods that the Daemons gem
15
+ # supports.
11
16
  class Daemon
12
17
 
13
18
  MIN_WAIT = CloudCrowd.config[:min_worker_wait]
@@ -0,0 +1,17 @@
1
+ module CloudCrowd
2
+
3
+ # Base Error class which all custom CloudCrowd exceptions inherit from.
4
+ class Error < RuntimeError #:nodoc:
5
+ end
6
+
7
+ # ActionNotFound is raised when a job is created for an action that doesn't
8
+ # exist.
9
+ class ActionNotFound < Error #:nodoc:
10
+ end
11
+
12
+ # StatusUnspecified is raised when a WorkUnit returns without a valid
13
+ # status code.
14
+ class StatusUnspecified < Error #:nodoc:
15
+ end
16
+
17
+ end
@@ -2,7 +2,7 @@ require 'cloud_crowd/helpers/authorization'
2
2
  require 'cloud_crowd/helpers/resources'
3
3
 
4
4
  module CloudCrowd
5
- module Helpers
5
+ module Helpers #:nodoc:
6
6
  include Authorization, Resources #, Rack::Utils
7
7
  end
8
8
  end
@@ -1,7 +1,9 @@
1
- # After sinatra-authorization...
2
-
3
1
  module CloudCrowd
4
2
  module Helpers
3
+
4
+ # Authorization takes after sinatra-authorization... See
5
+ # http://github.com/integrity/sinatra-authorization
6
+ # for the original.
5
7
  module Authorization
6
8
 
7
9
  # Ensure that the request includes the correct credentials.
@@ -19,7 +21,9 @@ module CloudCrowd
19
21
  end
20
22
 
21
23
  # A request is authorized if its login and password match those stored
22
- # in config.yml, or if authentication is disabled.
24
+ # in config.yml, or if authentication is disabled. If authentication is
25
+ # turned on, then every request is authenticated, including between
26
+ # the worker daemons and the central server.
23
27
  def authorize(login, password)
24
28
  return true unless CloudCrowd.config[:use_http_authentication]
25
29
  return CloudCrowd.config[:login] == login &&
@@ -2,10 +2,20 @@ module CloudCrowd
2
2
  module Helpers
3
3
  module Resources
4
4
 
5
+ # Convenience method for responding with JSON. Sets the content-type,
6
+ # serializes, and allows empty responses.
7
+ def json(obj)
8
+ content_type :json
9
+ return status(204) && '' if obj.nil?
10
+ obj.to_json
11
+ end
12
+
13
+ # Lazy-fetch the job specified by <tt>job_id</tt>.
5
14
  def current_job
6
15
  @job ||= Job.find_by_id(params[:job_id]) or raise Sinatra::NotFound
7
16
  end
8
17
 
18
+ # Lazy-fetch the WorkUnit specified by <tt>work_unit_id</tt>.
9
19
  def current_work_unit
10
20
  @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id]) or raise Sinatra::NotFound
11
21
  end
@@ -14,9 +24,8 @@ module CloudCrowd
14
24
  # with no content.
15
25
  def dequeue_work_unit(offset=0)
16
26
  handle_conflicts do
17
- unit = WorkUnit.dequeue(offset)
18
- return status(204) && '' unless unit
19
- unit.to_json
27
+ actions = params[:enabled_actions].split(',')
28
+ WorkUnit.dequeue(actions, offset)
20
29
  end
21
30
  end
22
31
 
@@ -1,7 +1,7 @@
1
1
  module CloudCrowd
2
2
 
3
3
  # Pilfered in parts from the ActiveSupport::Inflector.
4
- module Inflector
4
+ module Inflector #:nodoc:
5
5
 
6
6
  def self.camelize(word)
7
7
  word.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
@@ -1,7 +1,7 @@
1
1
  module CloudCrowd
2
2
 
3
3
  # A chunk of work that will be farmed out into many WorkUnits to be processed
4
- # in parallel by all the active CloudCrowd::Workers. Jobs are defined by a list
4
+ # in parallel by each active CloudCrowd::Worker. Jobs are defined by a list
5
5
  # of inputs (usually public urls to files), an action (the name of a script that
6
6
  # CloudCrowd knows how to run), and, eventually a corresponding list of output.
7
7
  class Job < ActiveRecord::Base
@@ -10,9 +10,13 @@ module CloudCrowd
10
10
  has_many :work_units, :dependent => :destroy
11
11
 
12
12
  validates_presence_of :status, :inputs, :action, :options
13
+
14
+ before_validation_on_create :set_initial_status
15
+ after_create :queue_for_workers
16
+ before_destroy :cleanup
13
17
 
14
18
  # Create a Job from an incoming JSON or XML request, and add it to the queue.
15
- # TODO: Add XML support.
19
+ # TODO: Think about XML support.
16
20
  def self.create_from_request(h)
17
21
  self.create(
18
22
  :inputs => h['inputs'].to_json,
@@ -23,16 +27,6 @@ module CloudCrowd
23
27
  )
24
28
  end
25
29
 
26
- # Creating a job creates its corresponding work units, adding them
27
- # to the queue.
28
- def after_create
29
- self.queue_for_workers(JSON.parse(self.inputs))
30
- end
31
-
32
- def before_validation_on_create
33
- self.status = self.splittable? ? SPLITTING : PROCESSING
34
- end
35
-
36
30
  # After work units are marked successful, we check to see if all of them have
37
31
  # finished, if so, continue on to the next phase of the job.
38
32
  def check_for_completion
@@ -54,15 +48,10 @@ module CloudCrowd
54
48
  self
55
49
  end
56
50
 
57
- # Transition this Job's status to the following one.
58
- def transition_to_next_phase
59
- self.status = any_work_units_failed? ? FAILED :
60
- self.splitting? ? PROCESSING :
61
- self.mergeable? ? MERGING :
62
- SUCCEEDED
63
- end
64
-
65
- # If a callback_url is defined, post the Job's JSON to it upon completion.
51
+ # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
52
+ # completion. The <tt>callback_url</tt> may include HTTP basic authentication,
53
+ # if you like:
54
+ # http://user:password@example.com/job_complete
66
55
  def fire_callback
67
56
  begin
68
57
  RestClient.post(callback_url, {:job => self.to_json}) if callback_url
@@ -71,13 +60,17 @@ module CloudCrowd
71
60
  end
72
61
  end
73
62
 
74
- # Cleaning up after a job will remove all of its files from S3.
63
+ # Cleaning up after a job will remove all of its files from S3. Destroying
64
+ # a Job calls cleanup first.
75
65
  def cleanup
76
66
  AssetStore.new.cleanup_job(self)
77
67
  end
78
68
 
79
- # Have all of the WorkUnits finished? We could trade reads for writes here
69
+ # Have all of the WorkUnits finished?
70
+ #--
71
+ # We could trade reads for writes here
80
72
  # by keeping a completed_count on the Job itself.
73
+ #++
81
74
  def all_work_units_complete?
82
75
  self.work_units.incomplete.count <= 0
83
76
  end
@@ -97,19 +90,14 @@ module CloudCrowd
97
90
  self.processing? && self.action_class.public_instance_methods.include?('merge')
98
91
  end
99
92
 
100
- # Retrieve the class for this Job's Action, loading it if necessary.
93
+ # Retrieve the class for this Job's Action.
101
94
  def action_class
102
- CloudCrowd.actions(self.action)
103
- end
104
-
105
- # When the WorkUnits are all finished, gather all their outputs together
106
- # before removing them from the database entirely.
107
- def gather_outputs_from_work_units
108
- outs = self.work_units.complete.map {|wu| wu.output }
109
- self.work_units.complete.destroy_all
110
- outs
95
+ klass = CloudCrowd.actions[self.action]
96
+ return klass if klass
97
+ raise ActionNotFound, "no action named: '#{self.action}' could be found"
111
98
  end
112
99
 
100
+ # Get the displayable status name of the Job's status code.
113
101
  def display_status
114
102
  CloudCrowd.display_status(self.status)
115
103
  end
@@ -122,22 +110,71 @@ module CloudCrowd
122
110
  (work_units.complete.count / work_units.count.to_f * 100).round
123
111
  end
124
112
 
113
+ # How long has this Job taken?
114
+ def time_taken
115
+ return self.time if self.time
116
+ Time.now - self.created_at
117
+ end
118
+
119
+ # Generate a stable 8-bit Hex color code, based on the Job's id.
120
+ def color
121
+ @color ||= Digest::MD5.hexdigest(self.id.to_s)[-7...-1]
122
+ end
123
+
125
124
  # A JSON representation of this job includes the statuses of its component
126
125
  # WorkUnits, as well as any completed outputs.
127
126
  def to_json(opts={})
128
- atts = {'id' => self.id, 'status' => self.display_status, 'percent_complete' => self.percent_complete}
127
+ atts = {
128
+ 'id' => self.id,
129
+ 'color' => self.color,
130
+ 'status' => self.display_status,
131
+ 'percent_complete' => self.percent_complete,
132
+ 'work_units' => self.work_units.count,
133
+ 'time_taken' => self.time_taken
134
+ }
129
135
  atts.merge!({'outputs' => JSON.parse(self.outputs)}) if self.outputs
130
- atts.merge!({'time' => self.time}) if self.time
131
136
  atts.to_json
132
137
  end
138
+
139
+
140
+ private
141
+
142
+ # When the WorkUnits are all finished, gather all their outputs together
143
+ # before removing them from the database entirely.
144
+ def gather_outputs_from_work_units
145
+ units = self.work_units.complete
146
+ outs = self.work_units.complete.map {|u| JSON.parse(u.output)['output'] }
147
+ self.work_units.complete.destroy_all
148
+ outs
149
+ end
150
+
151
+ # Transition this Job's status to the appropriate next status.
152
+ def transition_to_next_phase
153
+ self.status = any_work_units_failed? ? FAILED :
154
+ self.splitting? ? PROCESSING :
155
+ self.mergeable? ? MERGING :
156
+ SUCCEEDED
157
+ end
133
158
 
134
159
  # When starting a new job, or moving to a new stage, split up the inputs
135
- # into WorkUnits, and queue them.
136
- def queue_for_workers(input)
160
+ # into WorkUnits, and queue them. Workers will start picking them up right
161
+ # away.
162
+ def queue_for_workers(input=nil)
163
+ input ||= JSON.parse(self.inputs)
137
164
  [input].flatten.each do |wu_input|
138
- WorkUnit.create(:job => self, :input => wu_input, :status => self.status)
165
+ WorkUnit.create(
166
+ :job => self,
167
+ :action => self.action,
168
+ :input => wu_input,
169
+ :status => self.status
170
+ )
139
171
  end
140
172
  end
141
173
 
174
+ # A Job starts out either splitting or processing, depending on its action.
175
+ def set_initial_status
176
+ self.status = self.splittable? ? SPLITTING : PROCESSING
177
+ end
178
+
142
179
  end
143
180
  end