cloud-crowd 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. data/README +16 -16
  2. data/cloud-crowd.gemspec +10 -9
  3. data/config/config.example.ru +8 -2
  4. data/config/config.example.yml +21 -25
  5. data/examples/process_pdfs_example.rb +1 -1
  6. data/examples/word_count_example.rb +1 -0
  7. data/lib/cloud-crowd.rb +47 -28
  8. data/lib/cloud_crowd/action.rb +14 -8
  9. data/lib/cloud_crowd/asset_store.rb +8 -8
  10. data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
  11. data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
  12. data/lib/cloud_crowd/command_line.rb +24 -58
  13. data/lib/cloud_crowd/exceptions.rb +7 -0
  14. data/lib/cloud_crowd/helpers/authorization.rb +5 -3
  15. data/lib/cloud_crowd/helpers/resources.rb +0 -20
  16. data/lib/cloud_crowd/models.rb +1 -1
  17. data/lib/cloud_crowd/models/job.rb +37 -40
  18. data/lib/cloud_crowd/models/node_record.rb +95 -0
  19. data/lib/cloud_crowd/models/work_unit.rb +87 -33
  20. data/lib/cloud_crowd/node.rb +105 -0
  21. data/lib/cloud_crowd/schema.rb +22 -18
  22. data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
  23. data/lib/cloud_crowd/worker.rb +68 -107
  24. data/public/css/admin_console.css +40 -18
  25. data/public/images/server.png +0 -0
  26. data/public/images/server_busy.png +0 -0
  27. data/public/js/admin_console.js +47 -18
  28. data/test/acceptance/test_failing_work_units.rb +1 -1
  29. data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
  30. data/test/acceptance/test_word_count.rb +3 -9
  31. data/test/blueprints.rb +0 -1
  32. data/test/config/config.ru +1 -1
  33. data/test/config/config.yml +2 -4
  34. data/test/unit/test_action.rb +1 -1
  35. data/test/unit/test_configuration.rb +1 -1
  36. data/test/unit/test_job.rb +3 -0
  37. data/test/unit/test_work_unit.rb +2 -4
  38. data/views/{index.erb → operations_center.erb} +13 -8
  39. metadata +11 -10
  40. data/lib/cloud_crowd/daemon.rb +0 -95
  41. data/lib/cloud_crowd/models/worker_record.rb +0 -61
  42. data/lib/cloud_crowd/runner.rb +0 -15
@@ -2,15 +2,26 @@ module CloudCrowd
2
2
  class AssetStore
3
3
 
4
4
  # The FilesystemStore is an implementation of the AssetStore, good only for
5
- # use in development, testing, or if you're only running a single-machine
6
- # installation.
5
+ # use in development, testing, if you're only running a single-machine
6
+ # installation, or are using a networked drive.
7
7
  module FilesystemStore
8
8
 
9
- # Save a file to somewhere semi-persistent on the filesystem. Can be used
10
- # in development, when offline, or if you happen to have a single-machine
11
- # CloudCrowd installation. To use, configure <tt>:storage => 'filesystem'</tt>.
9
+ DEFAULT_STORAGE_PATH = '/tmp/cloud_crowd_storage'
10
+
11
+ attr_reader :local_storage_path
12
+
13
+ # Make sure that local storage exists and is writeable before starting.
14
+ def setup
15
+ lsp = @local_storage_path = CloudCrowd.config[:local_storage_path] || DEFAULT_STORAGE_PATH
16
+ FileUtils.mkdir_p(lsp) unless File.exists?(lsp)
17
+ raise Error::StorageNotWritable, "#{lsp} is not writable" unless File.writable?(lsp)
18
+ end
19
+
20
+ # Save a file to somewhere semi-persistent on the filesystem. To use,
21
+ # configure <tt>:storage: 'filesystem'</tt> in *config.yml*, as well as
22
+ # <tt>:local_storage_path:</tt>.
12
23
  def save(local_path, save_path)
13
- save_path = File.join(LOCAL_STORAGE_PATH, save_path)
24
+ save_path = File.join(@local_storage_path, save_path)
14
25
  save_dir = File.dirname(save_path)
15
26
  FileUtils.mkdir_p save_dir unless File.exists? save_dir
16
27
  FileUtils.cp(local_path, save_path)
@@ -19,7 +30,7 @@ module CloudCrowd
19
30
 
20
31
  # Remove all of a Job's result files from the filesystem.
21
32
  def cleanup(job)
22
- path = "#{LOCAL_STORAGE_PATH}/#{job.action}/job_#{job.id}"
33
+ path = "#{@local_storage_path}/#{job.action}/job_#{job.id}"
23
34
  FileUtils.rm_r(path) if File.exists?(path)
24
35
  end
25
36
  end
@@ -5,11 +5,24 @@ module CloudCrowd
5
5
  # on S3 for all resulting files.
6
6
  module S3Store
7
7
 
8
+ # Configure authentication and establish a connection to S3, first thing.
9
+ def setup
10
+ @use_auth = CloudCrowd.config[:s3_authentication]
11
+ bucket_name = CloudCrowd.config[:s3_bucket]
12
+ key, secret = CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key]
13
+ valid_conf = [bucket_name, key, secret].all? {|s| s.is_a? String }
14
+ raise Error::MissingConfiguration, "An S3 account must be configured in 'config.yml' before 's3' storage can be used" unless valid_conf
15
+ protocol = @use_auth ? 'https' : 'http'
16
+ port = @use_auth ? 443 : 80
17
+ @s3 = RightAws::S3.new(key, secret, :protocol => protocol, :port => port)
18
+ @bucket = @s3.bucket(bucket_name)
19
+ @bucket = @s3.bucket(bucket_name, true) unless @bucket
20
+ end
21
+
8
22
  # Save a finished file from local storage to S3. Save it publicly unless
9
23
  # we're configured to use S3 authentication. Authenticated links expire
10
24
  # after one day by default.
11
25
  def save(local_path, save_path)
12
- ensure_s3_connection
13
26
  if @use_auth
14
27
  @bucket.put(save_path, File.open(local_path), {}, 'private')
15
28
  @s3.interface.get_link(@bucket, save_path)
@@ -21,19 +34,9 @@ module CloudCrowd
21
34
 
22
35
  # Remove all of a Job's resulting files from S3, both intermediate and finished.
23
36
  def cleanup(job)
24
- ensure_s3_connection
25
37
  @bucket.delete_folder("#{job.action}/job_#{job.id}")
26
38
  end
27
39
 
28
- # Workers, through the course of many WorkUnits, keep around an AssetStore.
29
- # Ensure we have a persistent S3 connection after first use.
30
- def ensure_s3_connection
31
- unless @s3 && @bucket
32
- params = {:port => 80, :protocol => 'http'}
33
- @s3 = RightAws::S3.new(CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key], params)
34
- @bucket = @s3.bucket(CloudCrowd.config[:s3_bucket], true)
35
- end
36
- end
37
40
  end
38
41
 
39
42
  end
@@ -9,9 +9,6 @@ module CloudCrowd
9
9
  # Reference the absolute path to the root.
10
10
  CC_ROOT = File.expand_path(File.dirname(__FILE__) + '/../..')
11
11
 
12
- # Path to the Daemons gem script which launches workers.
13
- WORKER_RUNNER = File.expand_path("#{CC_ROOT}/lib/cloud_crowd/runner.rb")
14
-
15
12
  # Command-line banner for the usage message.
16
13
  BANNER = <<-EOS
17
14
  CloudCrowd is a MapReduce-inspired Parallel Processing System for Ruby.
@@ -24,7 +21,7 @@ Usage: crowd COMMAND OPTIONS
24
21
  Commands:
25
22
  install Install the CloudCrowd configuration files to the specified directory
26
23
  server Start up the central server (requires a database)
27
- workers Control worker daemons, use: (start | stop | restart | status | run)
24
+ node Start up a worker node (only one node per machine, please)
28
25
  console Launch a CloudCrowd console, connected to the central database
29
26
  load_schema Load the schema into the database specified by database.yml
30
27
 
@@ -38,7 +35,7 @@ Options:
38
35
  case command
39
36
  when 'console' then run_console
40
37
  when 'server' then run_server
41
- when 'workers' then run_workers_command
38
+ when 'node' then run_node
42
39
  when 'load_schema' then run_load_schema
43
40
  when 'install' then run_install
44
41
  else usage
@@ -52,7 +49,7 @@ Options:
52
49
  require 'irb/completion'
53
50
  require 'pp'
54
51
  load_code
55
- connect_to_database
52
+ connect_to_database(true)
56
53
  IRB.start
57
54
  end
58
55
 
@@ -63,6 +60,7 @@ Options:
63
60
  # (Mongrel, falling back to WEBrick). The equivalent of Rails' script/server.
64
61
  def run_server
65
62
  ensure_config
63
+ @options[:port] ||= 9173
66
64
  require 'rubygems'
67
65
  rackup_path = File.expand_path("#{@options[:config_path]}/config.ru")
68
66
  if Gem.available? 'thin'
@@ -72,10 +70,18 @@ Options:
72
70
  end
73
71
  end
74
72
 
73
+ # Launch a Node. Please only run a single node per machine. The Node process
74
+ # will be long-lived, although its workers will come and go.
75
+ def run_node
76
+ ENV['RACK_ENV'] = @options['environment']
77
+ load_code
78
+ Node.new(@options[:port])
79
+ end
80
+
75
81
  # Load in the database schema to the database specified in 'database.yml'.
76
82
  def run_load_schema
77
83
  load_code
78
- connect_to_database
84
+ connect_to_database(false)
79
85
  require 'cloud_crowd/schema.rb'
80
86
  end
81
87
 
@@ -91,46 +97,6 @@ Options:
91
97
  install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
92
98
  end
93
99
 
94
- # Manipulate worker daemons -- handles all commands that the Daemons gem
95
- # provides: start, stop, restart, run, and status.
96
- def run_workers_command
97
- ensure_config
98
- command = ARGV.shift
99
- case command
100
- when 'start' then start_workers
101
- when 'stop' then stop_workers
102
- when 'restart' then stop_workers && start_workers
103
- when 'run' then run_worker
104
- when 'status' then show_worker_status
105
- else usage
106
- end
107
- end
108
-
109
- # Start up N workers, specified by argument or the number of workers in
110
- # config.yml.
111
- def start_workers
112
- load_code
113
- num_workers = @options[:num_workers] || CloudCrowd.config[:num_workers]
114
- num_workers.times do
115
- `CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} start`
116
- end
117
- end
118
-
119
- # For debugging, run a single worker in the current process, showing output.
120
- def run_worker
121
- exec "CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} run"
122
- end
123
-
124
- # Stop all active workers.
125
- def stop_workers
126
- `ruby #{WORKER_RUNNER} stop`
127
- end
128
-
129
- # Display the status of all active workers.
130
- def show_worker_status
131
- puts `ruby #{WORKER_RUNNER} status`
132
- end
133
-
134
100
  # Print `crowd` usage.
135
101
  def usage
136
102
  puts "\n#{@option_parser}\n"
@@ -150,7 +116,6 @@ Options:
150
116
  # Parse all options for all commands.
151
117
  def parse_options
152
118
  @options = {
153
- :port => 9173,
154
119
  :environment => 'production',
155
120
  :config_path => ENV['CLOUD_CROWD_CONFIG'] || '.'
156
121
  }
@@ -158,17 +123,14 @@ Options:
158
123
  opts.on('-c', '--config PATH', 'path to configuration directory') do |conf_path|
159
124
  @options[:config_path] = conf_path
160
125
  end
161
- opts.on('-n', '--num-workers NUM', OptionParser::DecimalInteger, 'number of worker processes') do |num|
162
- @options[:num_workers] = num
163
- end
164
- opts.on('-p', '--port PORT', 'central server port number') do |port_num|
126
+ opts.on('-p', '--port PORT', 'port number for server (central or node)') do |port_num|
165
127
  @options[:port] = port_num
166
128
  end
167
129
  opts.on('-e', '--environment ENV', 'server environment (sinatra)') do |env|
168
130
  @options[:environment] = env
169
131
  end
170
132
  opts.on_tail('-v', '--version', 'show version') do
171
- load_code
133
+ require "#{CC_ROOT}/lib/cloud-crowd"
172
134
  puts "CloudCrowd version #{VERSION}"
173
135
  exit
174
136
  end
@@ -181,26 +143,30 @@ Options:
181
143
  # Not all commands require this.
182
144
  def load_code
183
145
  ensure_config
184
- require 'rubygems'
185
146
  require "#{CC_ROOT}/lib/cloud-crowd"
186
147
  CloudCrowd.configure("#{@options[:config_path]}/config.yml")
187
148
  end
188
149
 
189
150
  # Establish a connection to the central server's database. Not all commands
190
151
  # require this.
191
- def connect_to_database
152
+ def connect_to_database(validate_schema)
192
153
  require 'cloud_crowd/models'
193
- CloudCrowd.configure_database("#{@options[:config_path]}/database.yml")
154
+ CloudCrowd.configure_database("#{@options[:config_path]}/database.yml", validate_schema)
194
155
  end
195
156
 
196
157
  # Exit with an explanation if the configuration files couldn't be found.
197
158
  def config_not_found
198
- puts "`crowd` can't find the CloudCrowd configuration directory. Please either run `crowd` from inside of the configuration directory, or use `crowd -c path/to/config`"
159
+ puts "`crowd` can't find the CloudCrowd configuration directory. Please use `crowd -c path/to/config`, or run `crowd` from inside of the configuration directory itself."
199
160
  exit(1)
200
161
  end
201
162
 
202
- # Install a file and log the installation.
163
+ # Install a file and log the installation. If we're overwriting a file,
164
+ # offer a chance to back out.
203
165
  def install_file(source, dest, is_dir=false)
166
+ if File.exists?(dest)
167
+ print "#{dest} already exists. Overwrite it? (yes/no) "
168
+ return unless ['y', 'yes', 'ok'].include? gets.chomp.downcase
169
+ end
204
170
  is_dir ? FileUtils.cp_r(source, dest) : FileUtils.cp(source, dest)
205
171
  puts "installed #{dest}"
206
172
  end
@@ -2,6 +2,8 @@ module CloudCrowd
2
2
 
3
3
  # Base Error class which all custom CloudCrowd exceptions inherit from.
4
4
  # Rescuing CloudCrowd::Error (or RuntimeError) will get all custom exceptions.
5
+ # If your cluster is correctly configured, you should never expect to see any
6
+ # of these.
5
7
  class Error < RuntimeError
6
8
 
7
9
  # ActionNotFound is raised when a job is created for an action that doesn't
@@ -23,6 +25,11 @@ module CloudCrowd
23
25
  class StatusUnspecified < Error
24
26
  end
25
27
 
28
+ # MissingConfiguration is raised when we're trying to run a method that
29
+ # needs configuration not present in config.yml.
30
+ class MissingConfiguration < Error
31
+ end
32
+
26
33
  end
27
34
 
28
35
  end
@@ -23,9 +23,9 @@ module CloudCrowd
23
23
  # A request is authorized if its login and password match those stored
24
24
  # in config.yml, or if authentication is disabled. If authentication is
25
25
  # turned on, then every request is authenticated, including between
26
- # the worker daemons and the central server.
26
+ # the nodes and the central server.
27
27
  def authorize(login, password)
28
- return true unless CloudCrowd.config[:use_http_authentication]
28
+ return true unless CloudCrowd.config[:http_authentication]
29
29
  return CloudCrowd.config[:login] == login &&
30
30
  CloudCrowd.config[:password] == password
31
31
  end
@@ -33,11 +33,13 @@ module CloudCrowd
33
33
 
34
34
  private
35
35
 
36
+ # Provide a Rack Authorization object.
36
37
  def auth
37
38
  @auth ||= Rack::Auth::Basic::Request.new(request.env)
38
39
  end
39
40
 
40
- def unauthorized!(realm = App.authorization_realm)
41
+ # Unauthorized requests will prompt the browser to provide credentials.
42
+ def unauthorized!(realm = Server.authorization_realm)
41
43
  response['WWW-Authenticate'] = "Basic realm=\"#{realm}\""
42
44
  halt 401, 'Authorization Required'
43
45
  end
@@ -20,26 +20,6 @@ module CloudCrowd
20
20
  @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id]) or raise Sinatra::NotFound
21
21
  end
22
22
 
23
- # Try to fetch a work unit from the queue. If none are pending, respond
24
- # with no content.
25
- def dequeue_work_unit(offset=0)
26
- handle_conflicts do
27
- worker, actions = params[:worker_name], params[:worker_actions].split(',')
28
- WorkUnit.dequeue(worker, actions, offset)
29
- end
30
- end
31
-
32
- # We're using ActiveRecord's optimistic locking, so stale work units
33
- # may sometimes arise. handle_conflicts responds with the HTTP status
34
- # code of your choosing if the update failed to be applied.
35
- def handle_conflicts(code=204)
36
- begin
37
- yield
38
- rescue ActiveRecord::StaleObjectError => e
39
- return status(code) && ''
40
- end
41
- end
42
-
43
23
  end
44
24
  end
45
25
  end
@@ -36,5 +36,5 @@ module CloudCrowd
36
36
  end
37
37
 
38
38
  require 'cloud_crowd/models/job'
39
+ require 'cloud_crowd/models/node_record'
39
40
  require 'cloud_crowd/models/work_unit'
40
- require 'cloud_crowd/models/worker_record'
@@ -31,30 +31,36 @@ module CloudCrowd
31
31
  # finished, if so, continue on to the next phase of the job.
32
32
  def check_for_completion
33
33
  return unless all_work_units_complete?
34
- transition_to_next_phase
35
- output_list = gather_outputs_from_work_units
36
-
34
+ set_next_status
35
+ outs = gather_outputs_from_work_units
36
+ return queue_for_workers(outs) if merging?
37
37
  if complete?
38
- self.outputs = output_list.to_json
39
- self.time = Time.now - self.created_at
40
- end
41
- self.save
42
-
43
- case self.status
44
- when PROCESSING then queue_for_workers(output_list.map {|o| JSON.parse(o) }.flatten)
45
- when MERGING then queue_for_workers(output_list.to_json)
46
- else fire_callback
38
+ update_attributes(:outputs => outs, :time => time_taken)
39
+ fire_callback if callback_url
47
40
  end
48
41
  self
49
42
  end
50
43
 
44
+ # Transition this Job's status to the appropriate next status.
45
+ def set_next_status
46
+ update_attribute(:status,
47
+ any_work_units_failed? ? FAILED :
48
+ self.splitting? ? PROCESSING :
49
+ self.mergeable? ? MERGING :
50
+ SUCCEEDED
51
+ )
52
+ end
53
+
51
54
  # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon
52
55
  # completion. The <tt>callback_url</tt> may include HTTP basic authentication,
53
56
  # if you like:
54
57
  # http://user:password@example.com/job_complete
58
+ # If the callback_url is successfully pinged, we proceed to cleanup the job.
59
+ # TODO: This should be moved into a Work Unit...
55
60
  def fire_callback
56
61
  begin
57
- RestClient.post(callback_url, {:job => self.to_json}) if callback_url
62
+ RestClient.post(callback_url, {:job => self.to_json})
63
+ self.destroy
58
64
  rescue RestClient::Exception => e
59
65
  puts "Failed to fire job callback. Hmmm, what should happen here?"
60
66
  end
@@ -62,15 +68,12 @@ module CloudCrowd
62
68
 
63
69
  # Cleaning up after a job will remove all of its files from S3. Destroying
64
70
  # a Job calls cleanup_assets first.
71
+ # TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
65
72
  def cleanup_assets
66
73
  AssetStore.new.cleanup(self)
67
74
  end
68
75
 
69
76
  # Have all of the WorkUnits finished?
70
- #--
71
- # We could trade reads for writes here
72
- # by keeping a completed_count on the Job itself.
73
- #++
74
77
  def all_work_units_complete?
75
78
  self.work_units.incomplete.count <= 0
76
79
  end
@@ -85,6 +88,11 @@ module CloudCrowd
85
88
  self.action_class.public_instance_methods.include? 'split'
86
89
  end
87
90
 
91
+ # This job is done splitting if it's finished with its splitting work units.
92
+ def done_splitting?
93
+ splittable? && work_units.splitting.count <= 0
94
+ end
95
+
88
96
  # This job is mergeable if its Action has a +merge+ method.
89
97
  def mergeable?
90
98
  self.processing? && self.action_class.public_instance_methods.include?('merge')
@@ -92,16 +100,19 @@ module CloudCrowd
92
100
 
93
101
  # Retrieve the class for this Job's Action.
94
102
  def action_class
95
- klass = CloudCrowd.actions[self.action]
96
- return klass if klass
103
+ @action_class ||= CloudCrowd.actions[self.action]
104
+ return @action_class if @action_class
97
105
  raise Error::ActionNotFound, "no action named: '#{self.action}' could be found"
98
106
  end
99
107
 
100
108
  # How complete is this Job?
109
+ # Unfortunately, with the current processing sequence, the percent_complete
110
+ # can pull a fast one and go backwards. This happens when there's a single
111
+ # large input that takes a long time to split, and when it finally does it
112
+ # creates a whole swarm of work units. This seems unavoidable.
101
113
  def percent_complete
102
- return 0 if splitting?
103
- return 100 if complete?
104
114
  return 99 if merging?
115
+ return 100 if complete?
105
116
  (work_units.complete.count / work_units.count.to_f * 100).round
106
117
  end
107
118
 
@@ -136,20 +147,12 @@ module CloudCrowd
136
147
  private
137
148
 
138
149
  # When the WorkUnits are all finished, gather all their outputs together
139
- # before removing them from the database entirely.
150
+ # before removing them from the database entirely. Returns their merged JSON.
140
151
  def gather_outputs_from_work_units
141
152
  units = self.work_units.complete
142
- outs = self.work_units.complete.map {|u| JSON.parse(u.output)['output'] }
153
+ outs = self.work_units.complete.map {|u| u.parsed_output }
143
154
  self.work_units.complete.destroy_all
144
- outs
145
- end
146
-
147
- # Transition this Job's status to the appropriate next status.
148
- def transition_to_next_phase
149
- self.status = any_work_units_failed? ? FAILED :
150
- self.splitting? ? PROCESSING :
151
- self.mergeable? ? MERGING :
152
- SUCCEEDED
155
+ outs.to_json
153
156
  end
154
157
 
155
158
  # When starting a new job, or moving to a new stage, split up the inputs
@@ -157,14 +160,8 @@ module CloudCrowd
157
160
  # away.
158
161
  def queue_for_workers(input=nil)
159
162
  input ||= JSON.parse(self.inputs)
160
- [input].flatten.each do |wu_input|
161
- WorkUnit.create(
162
- :job => self,
163
- :action => self.action,
164
- :input => wu_input,
165
- :status => self.status
166
- )
167
- end
163
+ [input].flatten.each {|i| WorkUnit.start(self, action, i, status) }
164
+ self
168
165
  end
169
166
 
170
167
  # A Job starts out either splitting or processing, depending on its action.