documentcloud-cloud-crowd 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/README +8 -8
  2. data/cloud-crowd.gemspec +8 -8
  3. data/config/config.example.ru +8 -2
  4. data/config/config.example.yml +6 -15
  5. data/examples/process_pdfs_example.rb +1 -1
  6. data/examples/word_count_example.rb +1 -0
  7. data/lib/cloud-crowd.rb +6 -5
  8. data/lib/cloud_crowd/action.rb +11 -7
  9. data/lib/cloud_crowd/asset_store/filesystem_store.rb +5 -0
  10. data/lib/cloud_crowd/asset_store/s3_store.rb +7 -3
  11. data/lib/cloud_crowd/asset_store.rb +1 -1
  12. data/lib/cloud_crowd/command_line.rb +14 -53
  13. data/lib/cloud_crowd/exceptions.rb +4 -0
  14. data/lib/cloud_crowd/helpers/authorization.rb +2 -2
  15. data/lib/cloud_crowd/helpers/resources.rb +0 -20
  16. data/lib/cloud_crowd/models/job.rb +25 -26
  17. data/lib/cloud_crowd/models/node_record.rb +81 -0
  18. data/lib/cloud_crowd/models/work_unit.rb +70 -30
  19. data/lib/cloud_crowd/models.rb +1 -1
  20. data/lib/cloud_crowd/node.rb +87 -0
  21. data/lib/cloud_crowd/schema.rb +19 -16
  22. data/lib/cloud_crowd/{app.rb → server.rb} +25 -30
  23. data/lib/cloud_crowd/worker.rb +50 -74
  24. data/public/css/admin_console.css +26 -14
  25. data/public/images/server.png +0 -0
  26. data/public/js/admin_console.js +45 -18
  27. data/test/acceptance/test_failing_work_units.rb +1 -1
  28. data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
  29. data/test/acceptance/test_word_count.rb +3 -9
  30. data/test/blueprints.rb +0 -1
  31. data/test/config/config.ru +1 -1
  32. data/test/config/config.yml +1 -3
  33. data/test/unit/test_configuration.rb +1 -1
  34. data/test/unit/test_job.rb +1 -0
  35. data/test/unit/test_work_unit.rb +2 -4
  36. data/views/index.erb +13 -8
  37. metadata +9 -9
  38. data/lib/cloud_crowd/daemon.rb +0 -95
  39. data/lib/cloud_crowd/models/worker_record.rb +0 -61
  40. data/lib/cloud_crowd/runner.rb +0 -15
data/README CHANGED
@@ -30,19 +30,19 @@
30
30
  * split -> process -> merge
31
31
  * As easy as `gem install cloud-crowd`
32
32
 
33
- Well-suited for:
34
-
35
- * Generating or resizing images.
36
- * Encoding video.
37
- * Running text extraction or OCR on PDFs.
38
- * Migrating a large file set or database.
39
- * Web scraping.
33
+ Well-suited for:
34
+
35
+ * Generating or resizing images.
36
+ * Encoding video.
37
+ * Running text extraction or OCR on PDFs.
38
+ * Migrating a large file set or database.
39
+ * Web scraping.
40
40
 
41
41
 
42
42
  ~ Documentation ~
43
43
 
44
44
  Wiki: http://wiki.github.com/documentcloud/cloud-crowd
45
- Rdoc: http://rdoc.info/projects/documentcloud/cloud-crowd
45
+ Rdoc: http://rdoc.info/projects/documentcloud/cloud-crowd
46
46
 
47
47
 
48
48
  ~ Getting started ~
data/cloud-crowd.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'cloud-crowd'
3
- s.version = '0.1.0' # Keep version in sync with cloud-cloud.rb
4
- s.date = '2009-09-01'
3
+ s.version = '0.1.1' # Keep version in sync with cloud-crowd.rb
4
+ s.date = '2009-09-15'
5
5
 
6
6
  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
7
7
  s.summary = "Parallel Processing for the Rest of Us"
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
32
32
  s.add_dependency 'json', ['>= 1.1.7']
33
33
  s.add_dependency 'rest-client', ['>= 1.0.3']
34
34
  s.add_dependency 'right_aws', ['>= 1.10.0']
35
- s.add_dependency 'daemons', ['>= 1.0.10']
35
+ s.add_dependency 'thin', ['>= 1.2.4']
36
36
 
37
37
  if s.respond_to?(:add_development_dependency)
38
38
  s.add_development_dependency 'faker', ['>= 0.3.1']
@@ -56,23 +56,22 @@ examples/process_pdfs_example.rb
56
56
  examples/word_count_example.rb
57
57
  lib/cloud-crowd.rb
58
58
  lib/cloud_crowd/action.rb
59
- lib/cloud_crowd/app.rb
60
59
  lib/cloud_crowd/asset_store/filesystem_store.rb
61
60
  lib/cloud_crowd/asset_store/s3_store.rb
62
61
  lib/cloud_crowd/asset_store.rb
63
62
  lib/cloud_crowd/command_line.rb
64
- lib/cloud_crowd/daemon.rb
65
63
  lib/cloud_crowd/exceptions.rb
66
64
  lib/cloud_crowd/helpers/authorization.rb
67
65
  lib/cloud_crowd/helpers/resources.rb
68
66
  lib/cloud_crowd/helpers.rb
69
67
  lib/cloud_crowd/inflector.rb
70
68
  lib/cloud_crowd/models/job.rb
69
+ lib/cloud_crowd/models/node_record.rb
71
70
  lib/cloud_crowd/models/work_unit.rb
72
- lib/cloud_crowd/models/worker_record.rb
73
71
  lib/cloud_crowd/models.rb
74
- lib/cloud_crowd/runner.rb
72
+ lib/cloud_crowd/node.rb
75
73
  lib/cloud_crowd/schema.rb
74
+ lib/cloud_crowd/server.rb
76
75
  lib/cloud_crowd/worker.rb
77
76
  LICENSE
78
77
  public/css/admin_console.css
@@ -83,6 +82,7 @@ public/images/cloud_hand.png
83
82
  public/images/header_back.png
84
83
  public/images/logo.png
85
84
  public/images/queue_fill.png
85
+ public/images/server.png
86
86
  public/images/server_error.png
87
87
  public/images/sidebar_bottom.png
88
88
  public/images/sidebar_top.png
@@ -93,7 +93,7 @@ public/js/excanvas.js
93
93
  public/js/flot.js
94
94
  public/js/jquery.js
95
95
  README
96
- test/acceptance/test_app.rb
96
+ test/acceptance/test_server.rb
97
97
  test/acceptance/test_failing_work_units.rb
98
98
  test/acceptance/test_word_count.rb
99
99
  test/blueprints.rb
@@ -4,7 +4,13 @@
4
4
  # using any Rack-compliant server handler. For example, start up three servers
5
5
  # with a specified port number, using Thin:
6
6
  #
7
- # thin start -R config.ru -p 9173 --servers 3
7
+ # thin start -R config.ru --servers 3
8
+ #
9
+ # Or a single server with Unicorn:
10
+ #
11
+ # unicorn config.ru
12
+ #
13
+
8
14
 
9
15
  require 'rubygems'
10
16
  require 'cloud-crowd'
@@ -13,5 +19,5 @@ CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
13
19
  CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
14
20
 
15
21
  map '/' do
16
- run CloudCrowd::App
22
+ run CloudCrowd::Server
17
23
  end
@@ -1,6 +1,11 @@
1
1
  # The URL where you're planning on running the central server/queue/database.
2
2
  :central_server: http://localhost:9173
3
3
 
4
+ # Set the maximum number of workers allowed per-node. Workers only run while
5
+ # there's work to be done. It's best to set 'max_workers' below the point where
6
+ # you'd start to swap or peg your CPU (as determined by experiment).
7
+ :max_workers: 5
8
+
4
9
  # The storage back-end that you'd like to use for intermediate and final results
5
10
  # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
6
11
  # be used in development, or on single-machine installations.
@@ -29,20 +34,6 @@
29
34
  # additional actions from a location of your choice.
30
35
  # :actions_path: /path/to/actions
31
36
 
32
- # Set the following numbers to tweak the configuration of your worker daemons.
33
- # Optimum results will depend on proportion of the Memory/CPU/IO bottlenecks
34
- # in your actions, the number of central servers you have running, and your
35
- # desired balance between latency and traffic.
36
-
37
- # The number of workers that `crowd workers start` spins up.
38
- :num_workers: 3
39
-
40
- # The minimum number of seconds a worker waits between checking the job queue.
41
- :min_worker_wait: 1
42
-
43
- # The maximum number of seconds a worker waits between checking the job queue.
44
- :max_worker_wait: 5
45
-
46
37
  # The number of separate attempts that will be made to process an individual
47
38
  # work unit, before marking it as having failed.
48
- :work_unit_retries: 3
39
+ :work_unit_retries: 3
@@ -17,7 +17,7 @@ RestClient.post('http://localhost:9173/jobs',
17
17
  'http://tigger.uic.edu/~victor/personal/futurism.pdf',
18
18
  'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
19
19
  'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
20
- 'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
20
+ 'http://www.pitt.edu/~slavic/sisc/SISC4/dadswell.pdf'
21
21
  ],
22
22
 
23
23
  'options' => {
@@ -39,3 +39,4 @@ RestClient.post('http://localhost:9173/jobs',
39
39
  )
40
40
 
41
41
  # With 23 Workers running, and over Wifi, it counted all the words in 5.5 secs.
42
+ # On a fast internet connection, you may not even see this job show up.
data/lib/cloud-crowd.rb CHANGED
@@ -5,16 +5,15 @@ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
5
5
  # Common Gems:
6
6
  require 'rubygems'
7
7
  gem 'activerecord'
8
- gem 'daemons'
9
8
  gem 'json'
10
9
  gem 'rest-client'
11
10
  gem 'right_aws'
12
11
  gem 'sinatra'
12
+ gem 'thin'
13
13
 
14
14
  # Autoloading for all the pieces which may or may not be needed:
15
15
  autoload :ActiveRecord, 'activerecord'
16
16
  autoload :Benchmark, 'benchmark'
17
- autoload :Daemons, 'daemons'
18
17
  autoload :Digest, 'digest'
19
18
  autoload :ERB, 'erb'
20
19
  autoload :FileUtils, 'fileutils'
@@ -23,6 +22,7 @@ autoload :RestClient, 'restclient'
23
22
  autoload :RightAws, 'right_aws'
24
23
  autoload :Sinatra, 'sinatra'
25
24
  autoload :Socket, 'socket'
25
+ autoload :Thin, 'thin'
26
26
  autoload :YAML, 'yaml'
27
27
 
28
28
  # Common code which should really be required in every circumstance.
@@ -31,21 +31,22 @@ require 'cloud_crowd/exceptions'
31
31
  module CloudCrowd
32
32
 
33
33
  # Autoload all the CloudCrowd classes which may not be required.
34
- autoload :App, 'cloud_crowd/app'
35
34
  autoload :Action, 'cloud_crowd/action'
36
35
  autoload :AssetStore, 'cloud_crowd/asset_store'
37
36
  autoload :Helpers, 'cloud_crowd/helpers'
38
37
  autoload :Inflector, 'cloud_crowd/inflector'
39
38
  autoload :Job, 'cloud_crowd/models'
39
+ autoload :Node, 'cloud_crowd/node'
40
+ autoload :NodeRecord, 'cloud_crowd/models'
41
+ autoload :Server, 'cloud_crowd/server'
40
42
  autoload :Worker, 'cloud_crowd/worker'
41
43
  autoload :WorkUnit, 'cloud_crowd/models'
42
- autoload :WorkerRecord, 'cloud_crowd/models'
43
44
 
44
45
  # Root directory of the CloudCrowd gem.
45
46
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
46
47
 
47
48
  # Keep the version in sync with the gemspec.
48
- VERSION = '0.1.0'
49
+ VERSION = '0.1.1'
49
50
 
50
51
  # A Job is processing if its WorkUnits are in the queue to be handled by workers.
51
52
  PROCESSING = 1
@@ -38,12 +38,16 @@ module CloudCrowd
38
38
 
39
39
  # Download a file to the specified path.
40
40
  def download(url, path)
41
- if url.match(FILE_URL)
42
- FileUtils.cp(url.sub(FILE_URL, ''), path)
43
- else
44
- resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
45
- FileUtils.mv resp.file.path, path
46
- end
41
+ URI.parse(url) # Sanity check.
42
+ `curl -s "#{url}" > "#{path}"`
43
+ # if url.match(FILE_URL)
44
+ # FileUtils.cp(url.sub(FILE_URL, ''), path)
45
+ # else
46
+ # # An alternative would be shelling out: `curl -s "#{url}" > "#{path}"`
47
+ # puts url
48
+ # resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
49
+ # FileUtils.mv resp.file.path, path
50
+ # end
47
51
  path
48
52
  end
49
53
 
@@ -55,7 +59,7 @@ module CloudCrowd
55
59
  end
56
60
 
57
61
  # After the Action has finished, we remove the work directory and return
58
- # to the root directory (where daemons run by default).
62
+ # to the root directory (where workers run by default).
59
63
  def cleanup_work_directory
60
64
  FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
61
65
  end
@@ -6,6 +6,11 @@ module CloudCrowd
6
6
  # installation.
7
7
  module FilesystemStore
8
8
 
9
+ # Make sure that local storage is writeable before starting.
10
+ def setup
11
+ raise Error::StorageNotWritable, "#{LOCAL_STORAGE_PATH} is not writable" unless File.writable?(LOCAL_STORAGE_PATH)
12
+ end
13
+
9
14
  # Save a file to somewhere semi-persistent on the filesystem. Can be used
10
15
  # in development, when offline, or if you happen to have a single-machine
11
16
  # CloudCrowd installation. To use, configure <tt>:storage => 'filesystem'</tt>.
@@ -5,11 +5,16 @@ module CloudCrowd
5
5
  # on S3 for all resulting files.
6
6
  module S3Store
7
7
 
8
+ # Configure authentication and establish a connection to S3, first thing.
9
+ def setup
10
+ @use_auth = CloudCrowd.config[:use_s3_authentication]
11
+ establish_s3_connection
12
+ end
13
+
8
14
  # Save a finished file from local storage to S3. Save it publicly unless
9
15
  # we're configured to use S3 authentication. Authenticated links expire
10
16
  # after one day by default.
11
17
  def save(local_path, save_path)
12
- ensure_s3_connection
13
18
  if @use_auth
14
19
  @bucket.put(save_path, File.open(local_path), {}, 'private')
15
20
  @s3.interface.get_link(@bucket, save_path)
@@ -21,13 +26,12 @@ module CloudCrowd
21
26
 
22
27
  # Remove all of a Job's resulting files from S3, both intermediate and finished.
23
28
  def cleanup(job)
24
- ensure_s3_connection
25
29
  @bucket.delete_folder("#{job.action}/job_#{job.id}")
26
30
  end
27
31
 
28
32
  # Workers, through the course of many WorkUnits, keep around an AssetStore.
29
33
  # Ensure we have a persistent S3 connection after first use.
30
- def ensure_s3_connection
34
+ def establish_s3_connection
31
35
  unless @s3 && @bucket
32
36
  params = {:port => 80, :protocol => 'http'}
33
37
  @s3 = RightAws::S3.new(CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key], params)
@@ -25,9 +25,9 @@ module CloudCrowd
25
25
 
26
26
  # Creating the AssetStore ensures that its scratch directory exists.
27
27
  def initialize
28
- @use_auth = CloudCrowd.config[:use_s3_authentication]
29
28
  FileUtils.mkdir_p temp_storage_path unless File.exists? temp_storage_path
30
29
  raise Error::StorageNotWritable, "#{temp_storage_path} is not writable" unless File.writable?(temp_storage_path)
30
+ setup if respond_to? :setup
31
31
  end
32
32
 
33
33
  # Get the path to CloudCrowd's temporary local storage. All actions run
@@ -9,9 +9,6 @@ module CloudCrowd
9
9
  # Reference the absolute path to the root.
10
10
  CC_ROOT = File.expand_path(File.dirname(__FILE__) + '/../..')
11
11
 
12
- # Path to the Daemons gem script which launches workers.
13
- WORKER_RUNNER = File.expand_path("#{CC_ROOT}/lib/cloud_crowd/runner.rb")
14
-
15
12
  # Command-line banner for the usage message.
16
13
  BANNER = <<-EOS
17
14
  CloudCrowd is a MapReduce-inspired Parallel Processing System for Ruby.
@@ -24,7 +21,7 @@ Usage: crowd COMMAND OPTIONS
24
21
  Commands:
25
22
  install Install the CloudCrowd configuration files to the specified directory
26
23
  server Start up the central server (requires a database)
27
- workers Control worker daemons, use: (start | stop | restart | status | run)
24
+ node Start up a worker node (only one node per machine, please)
28
25
  console Launch a CloudCrowd console, connected to the central database
29
26
  load_schema Load the schema into the database specified by database.yml
30
27
 
@@ -38,7 +35,7 @@ Options:
38
35
  case command
39
36
  when 'console' then run_console
40
37
  when 'server' then run_server
41
- when 'workers' then run_workers_command
38
+ when 'node' then run_node
42
39
  when 'load_schema' then run_load_schema
43
40
  when 'install' then run_install
44
41
  else usage
@@ -63,6 +60,7 @@ Options:
63
60
  # (Mongrel, falling back to WEBrick). The equivalent of Rails' script/server.
64
61
  def run_server
65
62
  ensure_config
63
+ @options[:port] ||= 9173
66
64
  require 'rubygems'
67
65
  rackup_path = File.expand_path("#{@options[:config_path]}/config.ru")
68
66
  if Gem.available? 'thin'
@@ -72,6 +70,14 @@ Options:
72
70
  end
73
71
  end
74
72
 
73
+ # Launch a Node. Please only run a single node per machine. The Node process
74
+ # will be long-lived, although its workers will come and go.
75
+ def run_node
76
+ ENV['RACK_ENV'] = @options['environment']
77
+ load_code
78
+ Node.new(@options[:port])
79
+ end
80
+
75
81
  # Load in the database schema to the database specified in 'database.yml'.
76
82
  def run_load_schema
77
83
  load_code
@@ -86,51 +92,11 @@ Options:
86
92
  install_path = ARGV.shift || '.'
87
93
  FileUtils.mkdir_p install_path unless File.exists?(install_path)
88
94
  install_file "#{CC_ROOT}/config/config.example.yml", "#{install_path}/config.yml"
89
- install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
90
95
  install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
96
+ install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
91
97
  install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
92
98
  end
93
99
 
94
- # Manipulate worker daemons -- handles all commands that the Daemons gem
95
- # provides: start, stop, restart, run, and status.
96
- def run_workers_command
97
- ensure_config
98
- command = ARGV.shift
99
- case command
100
- when 'start' then start_workers
101
- when 'stop' then stop_workers
102
- when 'restart' then stop_workers && start_workers
103
- when 'run' then run_worker
104
- when 'status' then show_worker_status
105
- else usage
106
- end
107
- end
108
-
109
- # Start up N workers, specified by argument or the number of workers in
110
- # config.yml.
111
- def start_workers
112
- load_code
113
- num_workers = @options[:num_workers] || CloudCrowd.config[:num_workers]
114
- num_workers.times do
115
- `CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} start`
116
- end
117
- end
118
-
119
- # For debugging, run a single worker in the current process, showing output.
120
- def run_worker
121
- exec "CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} run"
122
- end
123
-
124
- # Stop all active workers.
125
- def stop_workers
126
- `ruby #{WORKER_RUNNER} stop`
127
- end
128
-
129
- # Display the status of all active workers.
130
- def show_worker_status
131
- puts `ruby #{WORKER_RUNNER} status`
132
- end
133
-
134
100
  # Print `crowd` usage.
135
101
  def usage
136
102
  puts "\n#{@option_parser}\n"
@@ -150,7 +116,6 @@ Options:
150
116
  # Parse all options for all commands.
151
117
  def parse_options
152
118
  @options = {
153
- :port => 9173,
154
119
  :environment => 'production',
155
120
  :config_path => ENV['CLOUD_CROWD_CONFIG'] || '.'
156
121
  }
@@ -158,17 +123,14 @@ Options:
158
123
  opts.on('-c', '--config PATH', 'path to configuration directory') do |conf_path|
159
124
  @options[:config_path] = conf_path
160
125
  end
161
- opts.on('-n', '--num-workers NUM', OptionParser::DecimalInteger, 'number of worker processes') do |num|
162
- @options[:num_workers] = num
163
- end
164
- opts.on('-p', '--port PORT', 'central server port number') do |port_num|
126
+ opts.on('-p', '--port PORT', 'port number for server (central or node)') do |port_num|
165
127
  @options[:port] = port_num
166
128
  end
167
129
  opts.on('-e', '--environment ENV', 'server environment (sinatra)') do |env|
168
130
  @options[:environment] = env
169
131
  end
170
132
  opts.on_tail('-v', '--version', 'show version') do
171
- load_code
133
+ require "#{CC_ROOT}/lib/cloud-crowd"
172
134
  puts "CloudCrowd version #{VERSION}"
173
135
  exit
174
136
  end
@@ -181,7 +143,6 @@ Options:
181
143
  # Not all commands require this.
182
144
  def load_code
183
145
  ensure_config
184
- require 'rubygems'
185
146
  require "#{CC_ROOT}/lib/cloud-crowd"
186
147
  CloudCrowd.configure("#{@options[:config_path]}/config.yml")
187
148
  end
@@ -8,6 +8,10 @@ module CloudCrowd
8
8
  # exist.
9
9
  class ActionNotFound < Error
10
10
  end
11
+
12
+ # CentralServerUnavailable is raised when the central server can't be reached.
13
+ class CentralServerUnavailable < Error
14
+ end
11
15
 
12
16
  # StorageNotFound is raised when config.yml specifies a storage back end that
13
17
  # doesn't exist.
@@ -23,7 +23,7 @@ module CloudCrowd
23
23
  # A request is authorized if its login and password match those stored
24
24
  # in config.yml, or if authentication is disabled. If authentication is
25
25
  # turned on, then every request is authenticated, including between
26
- # the worker daemons and the central server.
26
+ # the nodes and the central server.
27
27
  def authorize(login, password)
28
28
  return true unless CloudCrowd.config[:use_http_authentication]
29
29
  return CloudCrowd.config[:login] == login &&
@@ -37,7 +37,7 @@ module CloudCrowd
37
37
  @auth ||= Rack::Auth::Basic::Request.new(request.env)
38
38
  end
39
39
 
40
- def unauthorized!(realm = App.authorization_realm)
40
+ def unauthorized!(realm = Server.authorization_realm)
41
41
  response['WWW-Authenticate'] = "Basic realm=\"#{realm}\""
42
42
  halt 401, 'Authorization Required'
43
43
  end
@@ -20,26 +20,6 @@ module CloudCrowd
20
20
  @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id]) or raise Sinatra::NotFound
21
21
  end
22
22
 
23
- # Try to fetch a work unit from the queue. If none are pending, respond
24
- # with no content.
25
- def dequeue_work_unit(offset=0)
26
- handle_conflicts do
27
- worker, actions = params[:worker_name], params[:worker_actions].split(',')
28
- WorkUnit.dequeue(worker, actions, offset)
29
- end
30
- end
31
-
32
- # We're using ActiveRecords optimistic locking, so stale work units
33
- # may sometimes arise. handle_conflicts responds with a the HTTP status
34
- # code of your choosing if the update failed to be applied.
35
- def handle_conflicts(code=204)
36
- begin
37
- yield
38
- rescue ActiveRecord::StaleObjectError => e
39
- return status(code) && ''
40
- end
41
- end
42
-
43
23
  end
44
24
  end
45
25
  end
@@ -31,30 +31,39 @@ module CloudCrowd
31
31
  # finished, if so, continue on to the next phase of the job.
32
32
  def check_for_completion
33
33
  return unless all_work_units_complete?
34
- transition_to_next_phase
35
- output_list = gather_outputs_from_work_units
36
-
37
- if complete?
38
- self.outputs = output_list.to_json
39
- self.time = Time.now - self.created_at
40
- end
41
- self.save
34
+ set_next_status
35
+ outs = gather_outputs_from_work_units
36
+ update_attributes(:outputs => outs.to_json, :time => time_taken) if complete?
42
37
 
43
38
  case self.status
44
- when PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
45
- when MERGING then queue_for_workers(output_list.to_json)
39
+ when PROCESSING then queue_for_workers(outs.map {|o| JSON.parse(o) }.flatten)
40
+ when MERGING then queue_for_workers(outs.to_json)
46
41
  else fire_callback
47
42
  end
48
43
  self
49
44
  end
50
45
 
46
+ # Transition this Job's status to the appropriate next status.
47
+ def set_next_status
48
+ update_attribute(:status,
49
+ any_work_units_failed? ? FAILED :
50
+ self.splitting? ? PROCESSING :
51
+ self.mergeable? ? MERGING :
52
+ SUCCEEDED
53
+ )
54
+ end
55
+
51
56
  # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
52
57
  # completion. The <tt>callback_url</tt> may include HTTP basic authentication,
53
58
  # if you like:
54
59
  # http://user:password@example.com/job_complete
60
+ # If the callback_url is successfully pinged, we proceed to clean up the job.
61
+ # TODO: This should be moved into a Work Unit...
55
62
  def fire_callback
63
+ return unless callback_url
56
64
  begin
57
- RestClient.post(callback_url, {:job => self.to_json}) if callback_url
65
+ RestClient.post(callback_url, {:job => self.to_json})
66
+ self.destroy
58
67
  rescue RestClient::Exception => e
59
68
  puts "Failed to fire job callback. Hmmm, what should happen here?"
60
69
  end
@@ -62,15 +71,12 @@ module CloudCrowd
62
71
 
63
72
  # Cleaning up after a job will remove all of its files from S3. Destroying
64
73
  # a Job calls cleanup_assets first.
74
+ # TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
65
75
  def cleanup_assets
66
76
  AssetStore.new.cleanup(self)
67
77
  end
68
78
 
69
79
  # Have all of the WorkUnits finished?
70
- #--
71
- # We could trade reads for writes here
72
- # by keeping a completed_count on the Job itself.
73
- #++
74
80
  def all_work_units_complete?
75
81
  self.work_units.incomplete.count <= 0
76
82
  end
@@ -98,10 +104,11 @@ module CloudCrowd
98
104
  end
99
105
 
100
106
  # How complete is this Job?
107
+ # Unfortunately, with the current processing sequence, the percent_complete
108
+ # can pull a fast one and go backwards.
101
109
  def percent_complete
102
- return 0 if splitting?
103
- return 100 if complete?
104
110
  return 99 if merging?
111
+ return 100 if complete?
105
112
  (work_units.complete.count / work_units.count.to_f * 100).round
106
113
  end
107
114
 
@@ -143,21 +150,13 @@ module CloudCrowd
143
150
  self.work_units.complete.destroy_all
144
151
  outs
145
152
  end
146
-
147
- # Transition this Job's status to the appropriate next status.
148
- def transition_to_next_phase
149
- self.status = any_work_units_failed? ? FAILED :
150
- self.splitting? ? PROCESSING :
151
- self.mergeable? ? MERGING :
152
- SUCCEEDED
153
- end
154
153
 
155
154
  # When starting a new job, or moving to a new stage, split up the inputs
156
155
  # into WorkUnits, and queue them. Workers will start picking them up right
157
156
  # away.
158
157
  def queue_for_workers(input=nil)
159
158
  input ||= JSON.parse(self.inputs)
160
- [input].flatten.each do |wu_input|
159
+ [input].flatten.map do |wu_input|
161
160
  WorkUnit.create(
162
161
  :job => self,
163
162
  :action => self.action,