cloud-crowd 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/EPIGRAPHS +17 -0
  2. data/LICENSE +22 -0
  3. data/README +93 -0
  4. data/actions/graphics_magick.rb +43 -0
  5. data/actions/process_pdfs.rb +92 -0
  6. data/actions/word_count.rb +14 -0
  7. data/bin/crowd +5 -0
  8. data/cloud-crowd.gemspec +111 -0
  9. data/config/config.example.ru +17 -0
  10. data/config/config.example.yml +48 -0
  11. data/config/database.example.yml +9 -0
  12. data/examples/graphics_magick_example.rb +44 -0
  13. data/examples/process_pdfs_example.rb +40 -0
  14. data/examples/word_count_example.rb +41 -0
  15. data/lib/cloud-crowd.rb +130 -0
  16. data/lib/cloud_crowd/action.rb +101 -0
  17. data/lib/cloud_crowd/app.rb +117 -0
  18. data/lib/cloud_crowd/asset_store.rb +41 -0
  19. data/lib/cloud_crowd/asset_store/filesystem_store.rb +28 -0
  20. data/lib/cloud_crowd/asset_store/s3_store.rb +40 -0
  21. data/lib/cloud_crowd/command_line.rb +209 -0
  22. data/lib/cloud_crowd/daemon.rb +95 -0
  23. data/lib/cloud_crowd/exceptions.rb +28 -0
  24. data/lib/cloud_crowd/helpers.rb +8 -0
  25. data/lib/cloud_crowd/helpers/authorization.rb +50 -0
  26. data/lib/cloud_crowd/helpers/resources.rb +45 -0
  27. data/lib/cloud_crowd/inflector.rb +19 -0
  28. data/lib/cloud_crowd/models.rb +40 -0
  29. data/lib/cloud_crowd/models/job.rb +176 -0
  30. data/lib/cloud_crowd/models/work_unit.rb +89 -0
  31. data/lib/cloud_crowd/models/worker_record.rb +61 -0
  32. data/lib/cloud_crowd/runner.rb +15 -0
  33. data/lib/cloud_crowd/schema.rb +45 -0
  34. data/lib/cloud_crowd/worker.rb +186 -0
  35. data/public/css/admin_console.css +221 -0
  36. data/public/css/reset.css +42 -0
  37. data/public/images/bullet_green.png +0 -0
  38. data/public/images/bullet_white.png +0 -0
  39. data/public/images/cloud_hand.png +0 -0
  40. data/public/images/header_back.png +0 -0
  41. data/public/images/logo.png +0 -0
  42. data/public/images/queue_fill.png +0 -0
  43. data/public/images/server_error.png +0 -0
  44. data/public/images/sidebar_bottom.png +0 -0
  45. data/public/images/sidebar_top.png +0 -0
  46. data/public/images/worker_info.png +0 -0
  47. data/public/images/worker_info_loading.gif +0 -0
  48. data/public/js/admin_console.js +168 -0
  49. data/public/js/excanvas.js +1 -0
  50. data/public/js/flot.js +1 -0
  51. data/public/js/jquery.js +19 -0
  52. data/test/acceptance/test_app.rb +72 -0
  53. data/test/acceptance/test_failing_work_units.rb +32 -0
  54. data/test/acceptance/test_word_count.rb +49 -0
  55. data/test/blueprints.rb +17 -0
  56. data/test/config/actions/failure_testing.rb +13 -0
  57. data/test/config/config.ru +17 -0
  58. data/test/config/config.yml +7 -0
  59. data/test/config/database.yml +6 -0
  60. data/test/test_helper.rb +19 -0
  61. data/test/unit/test_action.rb +49 -0
  62. data/test/unit/test_configuration.rb +28 -0
  63. data/test/unit/test_job.rb +78 -0
  64. data/test/unit/test_work_unit.rb +55 -0
  65. data/views/index.erb +77 -0
  66. metadata +233 -0
@@ -0,0 +1,41 @@
1
+ require 'tmpdir'
2
+
3
+ module CloudCrowd
4
+
5
# The AssetStore provides a common API for storing files and returning URLs
# that can access them. In production this will be S3 but in development
# it may be the filesystem.
#
# The concrete back end is mixed in once, when this class body is evaluated,
# based on the 'storage' key of the loaded configuration.
#
# You shouldn't need to use the AssetStore directly -- Action's +download+
# and +save+ methods use it behind the scenes.
class AssetStore

  autoload :S3Store,         'cloud_crowd/asset_store/s3_store'
  autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'

  # Root directory referenced by the FilesystemStore back end.
  LOCAL_STORAGE_PATH = '/tmp/cloud_crowd_storage'

  # Configure the AssetStore with the specific storage implementation
  # specified by 'storage' in <tt>config.yml</tt>. Any other value raises
  # Error::StorageNotFound while the class is being loaded.
  case CloudCrowd.config[:storage]
  when 's3'         then include S3Store
  when 'filesystem' then include FilesystemStore
  else raise Error::StorageNotFound, "#{CloudCrowd.config[:storage]} is not a valid storage back end"
  end

  # Creating the AssetStore ensures that its scratch directory exists.
  #
  # Records the 'use_s3_authentication' configuration flag in @use_auth and
  # creates the scratch directory when it is missing.
  #
  # Raises Error::StorageNotWritable if the scratch directory is not writable.
  def initialize
    @use_auth = CloudCrowd.config[:use_s3_authentication]
    # File.exist? is used because the File.exists? alias was removed in Ruby 3.2.
    FileUtils.mkdir_p temp_storage_path unless File.exist? temp_storage_path
    raise Error::StorageNotWritable, "#{temp_storage_path} is not writable" unless File.writable?(temp_storage_path)
  end

  # Get the path to CloudCrowd's temporary local storage. All actions run
  # in subdirectories of this. Built from the system temp dir (Dir.tmpdir).
  def temp_storage_path
    "#{Dir.tmpdir}/cloud_crowd_tmp"
  end

end
40
+
41
+ end
@@ -0,0 +1,28 @@
1
+ module CloudCrowd
2
+ class AssetStore
3
+
4
# The FilesystemStore is an implementation of the AssetStore, good only for
# use in development, testing, or if you're only running a single-machine
# installation. All files live underneath LOCAL_STORAGE_PATH.
module FilesystemStore

  # Save a file to somewhere semi-persistent on the filesystem. Can be used
  # in development, when offline, or if you happen to have a single-machine
  # CloudCrowd installation. To use, configure <tt>:storage => 'filesystem'</tt>.
  #
  # local_path - path of the existing file to copy.
  # save_path  - destination path, relative to LOCAL_STORAGE_PATH; missing
  #              parent directories are created.
  #
  # Returns a <tt>file://</tt> URL pointing at the absolute path of the copy.
  def save(local_path, save_path)
    save_path = File.join(LOCAL_STORAGE_PATH, save_path)
    save_dir  = File.dirname(save_path)
    # File.exist? is used because the File.exists? alias was removed in Ruby 3.2.
    FileUtils.mkdir_p save_dir unless File.exist? save_dir
    FileUtils.cp(local_path, save_path)
    "file://#{File.expand_path(save_path)}"
  end

  # Remove all of a Job's result files from the filesystem.
  #
  # job - any object responding to +action+ and +id+; its files are expected
  #       under "<LOCAL_STORAGE_PATH>/<action>/job_<id>".
  #
  # Does nothing when that directory is absent.
  def cleanup(job)
    path = "#{LOCAL_STORAGE_PATH}/#{job.action}/job_#{job.id}"
    FileUtils.rm_r(path) if File.exist?(path)
  end

end
26
+
27
+ end
28
+ end
@@ -0,0 +1,40 @@
1
+ module CloudCrowd
2
+ class AssetStore
3
+
4
# The S3Store is an implementation of an AssetStore that uses a bucket
# on S3 for all resulting files.
module S3Store

  # Save a finished file from local storage to S3. Save it publicly unless
  # we're configured to use S3 authentication (@use_auth). Authenticated
  # links expire after one day by default.
  #
  # local_path - path of the local file to upload.
  # save_path  - key to store the file under in the bucket.
  #
  # Returns the link produced by the S3 interface for private uploads, or the
  # key's public link otherwise.
  def save(local_path, save_path)
    ensure_s3_connection
    permission = @use_auth ? 'private' : 'public-read'
    # The block form closes the file handle once the upload call returns;
    # a bare File.open would leak one descriptor per saved file.
    File.open(local_path) do |file|
      @bucket.put(save_path, file, {}, permission)
    end
    if @use_auth
      @s3.interface.get_link(@bucket, save_path)
    else
      @bucket.key(save_path).public_link
    end
  end

  # Remove all of a Job's resulting files from S3, both intermediate and finished.
  #
  # job - any object responding to +action+ and +id+; the folder
  #       "<action>/job_<id>" is deleted from the bucket.
  def cleanup(job)
    ensure_s3_connection
    @bucket.delete_folder("#{job.action}/job_#{job.id}")
  end

  # Workers, through the course of many WorkUnits, keep around an AssetStore.
  # Ensure we have a persistent S3 connection after first use: the connection
  # and bucket are only built when either @s3 or @bucket is still unset.
  # Credentials and bucket name are read from the CloudCrowd configuration.
  def ensure_s3_connection
    unless @s3 && @bucket
      params  = {:port => 80, :protocol => 'http'}
      @s3     = RightAws::S3.new(CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key], params)
      @bucket = @s3.bucket(CloudCrowd.config[:s3_bucket], true)
    end
  end

end
38
+
39
+ end
40
+ end
@@ -0,0 +1,209 @@
1
+ require 'optparse'
2
+
3
+ module CloudCrowd
4
# CommandLine implements the `crowd` executable. Instantiating it parses the
# options in ARGV and then dispatches on the first remaining argument.
class CommandLine

  # Configuration files required for the `crowd` command to function.
  CONFIG_FILES = ['config.yml', 'config.ru', 'database.yml']

  # Reference the absolute path to the root.
  CC_ROOT = File.expand_path(File.dirname(__FILE__) + '/../..')

  # Path to the Daemons gem script which launches workers.
  WORKER_RUNNER = File.expand_path("#{CC_ROOT}/lib/cloud_crowd/runner.rb")

  # Command-line banner for the usage message.
  BANNER = <<-EOS
CloudCrowd is a MapReduce-inspired Parallel Processing System for Ruby.

Wiki: http://wiki.github.com/documentcloud/cloud-crowd
Rdoc: http://rdoc.info/projects/documentcloud/cloud-crowd

Usage: crowd COMMAND OPTIONS

Commands:
install Install the CloudCrowd configuration files to the specified directory
server Start up the central server (requires a database)
workers Control worker daemons, use: (start | stop | restart | status | run)
console Launch a CloudCrowd console, connected to the central database
load_schema Load the schema into the database specified by database.yml

Options:
  EOS

  # Creating a CloudCrowd::CommandLine runs from the contents of ARGV.
  # Options are stripped from ARGV first; the next argument selects the
  # command. Unknown or missing commands print the usage message.
  def initialize
    parse_options
    command = ARGV.shift
    case command
    when 'console'     then run_console
    when 'server'      then run_server
    when 'workers'     then run_workers_command
    when 'load_schema' then run_load_schema
    when 'install'     then run_install
    else                    usage
    end
  end

  # Spin up an IRB session with the CloudCrowd code loaded in, and a database
  # connection established. The equivalent of Rails' `script/console`.
  def run_console
    require 'irb'
    require 'irb/completion'
    require 'pp'
    load_code
    connect_to_database
    IRB.start
  end

  # Convenience command for quickly spinning up the central server. More
  # sophisticated deployments, load-balancing across multiple app servers,
  # should use the config.ru rackup file directly. This method will start
  # a single Thin server, if Thin is installed, otherwise the rackup defaults
  # (Mongrel, falling back to WEBrick). The equivalent of Rails' script/server.
  #
  # Replaces the current process via +exec+, so it does not return.
  def run_server
    ensure_config
    require 'rubygems'
    rackup_path = File.expand_path("#{@options[:config_path]}/config.ru")
    if gem_installed?('thin')
      exec "thin -e #{@options[:environment]} -p #{@options[:port]} -R #{rackup_path} start"
    else
      exec "rackup -E #{@options[:environment]} -p #{@options[:port]} #{rackup_path}"
    end
  end

  # Load in the database schema to the database specified in 'database.yml'.
  def run_load_schema
    load_code
    connect_to_database
    require 'cloud_crowd/schema.rb'
  end

  # Install the required CloudCrowd configuration files into the specified
  # directory, or the current one. The target directory is taken from the
  # next ARGV entry and is created if it does not exist yet.
  def run_install
    require 'fileutils'
    install_path = ARGV.shift || '.'
    # File.exist? is used because the File.exists? alias was removed in Ruby 3.2.
    FileUtils.mkdir_p install_path unless File.exist?(install_path)
    install_file "#{CC_ROOT}/config/config.example.yml", "#{install_path}/config.yml"
    install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
    install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
    install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
  end

  # Manipulate worker daemons -- handles all commands that the Daemons gem
  # provides: start, stop, restart, run, and status. The sub-command is the
  # next ARGV entry; anything else prints the usage message.
  def run_workers_command
    ensure_config
    command = ARGV.shift
    case command
    when 'start'    then start_workers
    when 'stop'     then stop_workers
    when 'restart'  then stop_workers && start_workers
    when 'run'      then run_worker
    when 'status'   then show_worker_status
    else                 usage
    end
  end

  # Start up N workers, specified by argument or the number of workers in
  # config.yml. Each worker is launched through WORKER_RUNNER with
  # CLOUD_CROWD_CONFIG pointing at the absolute path of config.yml.
  def start_workers
    load_code
    num_workers = @options[:num_workers] || CloudCrowd.config[:num_workers]
    num_workers.times do
      `CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} start`
    end
  end

  # For debugging, run a single worker in the current process, showing output.
  def run_worker
    exec "CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} run"
  end

  # Stop all active workers. Returns the captured output of the runner.
  def stop_workers
    `ruby #{WORKER_RUNNER} stop`
  end

  # Display the status of all active workers.
  def show_worker_status
    puts `ruby #{WORKER_RUNNER} status`
  end

  # Print `crowd` usage.
  def usage
    puts "\n#{@option_parser}\n"
  end


  private

  # Check for configuration files, either in the current directory, or in
  # the CLOUD_CROWD_CONFIG environment variable. Exit if they're not found.
  #
  # The positive result is remembered in @config_found -- the same variable
  # the guard clause reads -- so the directory is only scanned once.
  def ensure_config
    return if @config_found
    found = CONFIG_FILES.all? {|f| File.exist? "#{@options[:config_path]}/#{f}" }
    found ? @config_found = true : config_not_found
  end

  # Parse all options for all commands. Fills @options (defaults: port 9173,
  # environment 'production', config path from CLOUD_CROWD_CONFIG or '.')
  # and removes the recognised switches from ARGV.
  def parse_options
    @options = {
      :port         => 9173,
      :environment  => 'production',
      :config_path  => ENV['CLOUD_CROWD_CONFIG'] || '.'
    }
    @option_parser = OptionParser.new do |opts|
      opts.on('-c', '--config PATH', 'path to configuration directory') do |conf_path|
        @options[:config_path] = conf_path
      end
      opts.on('-n', '--num-workers NUM', OptionParser::DecimalInteger, 'number of worker processes') do |num|
        @options[:num_workers] = num
      end
      opts.on('-p', '--port PORT', 'central server port number') do |port_num|
        @options[:port] = port_num
      end
      opts.on('-e', '--environment ENV', 'server environment (sinatra)') do |env|
        @options[:environment] = env
      end
      opts.on_tail('-v', '--version', 'show version') do
        load_code
        puts "CloudCrowd version #{VERSION}"
        exit
      end
    end
    @option_parser.banner = BANNER
    @option_parser.parse!(ARGV)
  end

  # Load in the CloudCrowd module code, dependencies, lib files and models.
  # Not all commands require this.
  def load_code
    ensure_config
    require 'rubygems'
    require "#{CC_ROOT}/lib/cloud-crowd"
    CloudCrowd.configure("#{@options[:config_path]}/config.yml")
  end

  # Establish a connection to the central server's database. Not all commands
  # require this.
  def connect_to_database
    require 'cloud_crowd/models'
    CloudCrowd.configure_database("#{@options[:config_path]}/database.yml")
  end

  # Exit with an explanation if the configuration files couldn't be found.
  def config_not_found
    puts "`crowd` can't find the CloudCrowd configuration directory. Please either run `crowd` from inside of the configuration directory, or use `crowd -c path/to/config`"
    exit(1)
  end

  # Install a file and log the installation. Directories (is_dir) are copied
  # recursively.
  def install_file(source, dest, is_dir=false)
    is_dir ? FileUtils.cp_r(source, dest) : FileUtils.cp(source, dest)
    puts "installed #{dest}"
  end

  # Is any version of the named gem installed? Uses Gem::Specification where
  # it is available, because Gem.available? no longer exists in current
  # RubyGems; falls back to Gem.available? on installations that predate
  # Gem::Specification.find_all_by_name.
  def gem_installed?(name)
    if Gem::Specification.respond_to?(:find_all_by_name)
      !Gem::Specification.find_all_by_name(name).empty?
    else
      Gem.available?(name)
    end
  rescue LoadError
    false
  end

end
209
+ end
@@ -0,0 +1,95 @@
1
+ CloudCrowd.configure(ENV['CLOUD_CROWD_CONFIG'])
2
+
3
+ module CloudCrowd
4
+
5
# A CloudCrowd::Daemon, started by the Daemons gem, runs a CloudCrowd::Worker in
# a loop, continually fetching and processing WorkUnits from the central
# server.
#
# The Daemon backs off and pings the central server less frequently
# when there isn't any work to be done, and speeds back up when there is.
#
# The `crowd` command responds to all the usual methods that the Daemons gem
# supports.
class Daemon

  # The back-off factor used to slow down requests for new work units
  # when the queue is empty.
  WAIT_MULTIPLIER = 1.5

  # Bounds for the wait between polls, read from the CloudCrowd
  # configuration when this class is loaded.
  MIN_WAIT = CloudCrowd.config[:min_worker_wait]
  MAX_WAIT = CloudCrowd.config[:max_worker_wait]

  # Build the worker and install handlers that shut it down on INT and TERM.
  def initialize
    @wait_time = MIN_WAIT
    @worker    = Worker.new
    # SIGKILL is deliberately not trapped: the operating system never
    # delivers it to a handler, and asking to trap it raises on current Rubies.
    Signal.trap('INT')  { kill_worker_and_exit }
    Signal.trap('TERM') { kill_worker_and_exit }
  end

  # Spin up our worker and monitoring threads. The monitor's the boss, and
  # will feel no compunction in killing the worker thread if necessary.
  # Check in before starting up. If check in fails, there's no sense in going.
  # Blocks on the monitor thread.
  def run
    @worker.check_in('starting')
    @work_thread    = run_worker
    @monitor_thread = run_monitor
    @monitor_thread.join
  end


  private

  # Loop forever, fetching WorkUnits and processing them. While work is
  # available the wait resets to MIN_WAIT; each empty fetch multiplies the
  # wait by WAIT_MULTIPLIER, capped at MAX_WAIT. Returns the new Thread.
  def run_worker
    Thread.new do
      loop do
        @worker.fetch_work_unit
        if @worker.has_work?
          @wait_time = MIN_WAIT
          while @worker.has_work?
            @worker.run
            sleep 0.01 # So as to listen for incoming signals.
          end
        else
          @wait_time = [@wait_time * WAIT_MULTIPLIER, MAX_WAIT].min
          sleep @wait_time
        end
      end
    end
  end

  # Checks in to let the central server know it's still alive every
  # CHECK_IN_INTERVAL seconds. Restarts the work_thread if it has died,
  # unless a shutdown is already in progress. Returns the new Thread.
  def run_monitor
    Thread.new do
      sleep Worker::CHECK_IN_INTERVAL
      loop do
        # A dead work thread is replaced by a fresh *worker* thread; spawning
        # another monitor here would leave nothing processing WorkUnits.
        @work_thread = run_worker unless @work_thread.alive? || @exit_started
        @worker.check_in(@work_thread.status)
        sleep Worker::CHECK_IN_INTERVAL
      end
    end
  end

  # True while either the work thread or the monitor thread is still alive.
  def running?
    @work_thread.alive? || @monitor_thread.alive?
  end

  # At exit, kill the worker thread, gently at first, then forcefully.
  # Checks the worker out, asks both threads to stop, waits for them in
  # 0.3 second steps, and then exits the process.
  def kill_worker_and_exit
    @worker.check_out
    @exit_started = Time.now
    @work_thread.kill && @monitor_thread.kill
    # NOTE(review): WORKER_EXIT_WAIT is not defined in this class; presumably
    # it resolves from the enclosing CloudCrowd module -- confirm.
    sleep 0.3 while running? && Time.now - @exit_started < WORKER_EXIT_WAIT
    return Process.exit unless running?
    # Thread#kill! only exists on Ruby 1.8; fall back to #kill elsewhere so
    # the forced shutdown does not die with NoMethodError.
    [@work_thread, @monitor_thread].each do |thread|
      thread.respond_to?(:kill!) ? thread.kill! : thread.kill
    end
    Process.exit
  end

end
92
+
93
+ end
94
+
95
+ CloudCrowd::Daemon.new.run
@@ -0,0 +1,28 @@
1
+ module CloudCrowd
2
+
3
# Root of the CloudCrowd exception hierarchy. Every custom exception below
# descends from it, so rescuing CloudCrowd::Error (or RuntimeError) catches
# all of them.
class Error < RuntimeError

  # Raised when a job is created for an action that doesn't exist.
  ActionNotFound = Class.new(self)

  # Raised when config.yml specifies a storage back end that doesn't exist.
  StorageNotFound = Class.new(self)

  # Raised if the AssetStore can't write to its scratch directory.
  StorageNotWritable = Class.new(self)

  # Raised when a WorkUnit returns without a valid status code.
  StatusUnspecified = Class.new(self)

end
27
+
28
+ end
@@ -0,0 +1,8 @@
1
+ require 'cloud_crowd/helpers/authorization'
2
+ require 'cloud_crowd/helpers/resources'
3
+
4
+ module CloudCrowd
5
# Umbrella module bundling the Authorization and Resources helper modules.
# Rack::Utils is intentionally left out, as it was in the original
# commented-out include.
module Helpers
  # Included last-to-first so that Authorization still precedes Resources in
  # the ancestor chain, exactly as a single `include Authorization, Resources`
  # would arrange it.
  include Resources
  include Authorization
end
8
+ end