cloud-crowd 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. data/README +16 -16
  2. data/cloud-crowd.gemspec +10 -9
  3. data/config/config.example.ru +8 -2
  4. data/config/config.example.yml +21 -25
  5. data/examples/process_pdfs_example.rb +1 -1
  6. data/examples/word_count_example.rb +1 -0
  7. data/lib/cloud-crowd.rb +47 -28
  8. data/lib/cloud_crowd/action.rb +14 -8
  9. data/lib/cloud_crowd/asset_store.rb +8 -8
  10. data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
  11. data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
  12. data/lib/cloud_crowd/command_line.rb +24 -58
  13. data/lib/cloud_crowd/exceptions.rb +7 -0
  14. data/lib/cloud_crowd/helpers/authorization.rb +5 -3
  15. data/lib/cloud_crowd/helpers/resources.rb +0 -20
  16. data/lib/cloud_crowd/models.rb +1 -1
  17. data/lib/cloud_crowd/models/job.rb +37 -40
  18. data/lib/cloud_crowd/models/node_record.rb +95 -0
  19. data/lib/cloud_crowd/models/work_unit.rb +87 -33
  20. data/lib/cloud_crowd/node.rb +105 -0
  21. data/lib/cloud_crowd/schema.rb +22 -18
  22. data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
  23. data/lib/cloud_crowd/worker.rb +68 -107
  24. data/public/css/admin_console.css +40 -18
  25. data/public/images/server.png +0 -0
  26. data/public/images/server_busy.png +0 -0
  27. data/public/js/admin_console.js +47 -18
  28. data/test/acceptance/test_failing_work_units.rb +1 -1
  29. data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
  30. data/test/acceptance/test_word_count.rb +3 -9
  31. data/test/blueprints.rb +0 -1
  32. data/test/config/config.ru +1 -1
  33. data/test/config/config.yml +2 -4
  34. data/test/unit/test_action.rb +1 -1
  35. data/test/unit/test_configuration.rb +1 -1
  36. data/test/unit/test_job.rb +3 -0
  37. data/test/unit/test_work_unit.rb +2 -4
  38. data/views/{index.erb → operations_center.erb} +13 -8
  39. metadata +11 -10
  40. data/lib/cloud_crowd/daemon.rb +0 -95
  41. data/lib/cloud_crowd/models/worker_record.rb +0 -61
  42. data/lib/cloud_crowd/runner.rb +0 -15
data/README CHANGED
@@ -26,7 +26,7 @@
26
26
 
27
27
  * Parallel processing for the rest of us
28
28
  * Write your scripts in Ruby
29
- * Built for Amazon EC2 and S3
29
+ * Works with Amazon EC2 and S3
30
30
  * split -> process -> merge
31
31
  * As easy as `gem install cloud-crowd`
32
32
 
@@ -63,31 +63,31 @@
63
63
  # Edit the configuration files to your satisfaction, add AWS credentials,
64
64
  # and then load the CloudCrowd schema into your configured database.
65
65
 
66
- >> mate ~/config/cloud-crowd/config.yml
67
- >> mate ~/config/cloud-crowd/database.yml
66
+ >> cd ~/config/cloud-crowd
67
+ >> mate config.yml
68
+ >> mate database.yml
69
+ >> [create the database you just configured...]
68
70
  >> crowd load_schema
69
71
 
70
72
  # Write your actions, and install them into the 'actions' subdirectory.
71
- # CloudCrowd comes with some default actions as an example.
73
+ # CloudCrowd comes with a few default actions as an example.
72
74
 
73
75
  # To launch the central server (make sure that you include its location
74
- # in config.yml), either:
76
+ # in config.yml):
75
77
 
76
78
  >> crowd server
77
79
 
78
- # or:
80
+ # The configuration folder also includes 'config.ru', which can be used by
81
+ # any Rack-compliant webserver to run your central server.
79
82
 
80
- >> thin -R config.ru --servers 3 -e production start
83
+ # Then, to launch a node of workers:
81
84
 
82
- # Any server that supports Rack should work with the rackup file.
85
+ >> crowd node
83
86
 
84
- # Then, to spin up 10 workers:
87
+ # To spin up remote nodes, install the 'cloud-crowd' gem and copy over
88
+ # your configuration directory. Run `crowd node`, and the remote machines
89
+ # will register with the central server, becoming available for processing.
85
90
 
86
- >> crowd workers start -n 10
87
-
88
- # To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
89
- # your configuration directory.
90
-
91
- # At this point you can visit your server console at localhost:9173 to
92
- # view all of your workers, ready for action.
91
+ # At this point you can visit your Operations Center at localhost:9173 to
92
+ # view all of your nodes, ready for action.
93
93
 
data/cloud-crowd.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'cloud-crowd'
3
- s.version = '0.1.0' # Keep version in sync with cloud-cloud.rb
4
- s.date = '2009-09-14'
3
+ s.version = '0.2.0' # Keep version in sync with cloud-cloud.rb
4
+ s.date = '2009-09-17'
5
5
 
6
6
  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
7
7
  s.summary = "Parallel Processing for the Rest of Us"
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
32
32
  s.add_dependency 'json', ['>= 1.1.7']
33
33
  s.add_dependency 'rest-client', ['>= 1.0.3']
34
34
  s.add_dependency 'right_aws', ['>= 1.10.0']
35
- s.add_dependency 'daemons', ['>= 1.0.10']
35
+ s.add_dependency 'thin', ['>= 1.2.4']
36
36
 
37
37
  if s.respond_to?(:add_development_dependency)
38
38
  s.add_development_dependency 'faker', ['>= 0.3.1']
@@ -56,23 +56,22 @@ examples/process_pdfs_example.rb
56
56
  examples/word_count_example.rb
57
57
  lib/cloud-crowd.rb
58
58
  lib/cloud_crowd/action.rb
59
- lib/cloud_crowd/app.rb
60
59
  lib/cloud_crowd/asset_store/filesystem_store.rb
61
60
  lib/cloud_crowd/asset_store/s3_store.rb
62
61
  lib/cloud_crowd/asset_store.rb
63
62
  lib/cloud_crowd/command_line.rb
64
- lib/cloud_crowd/daemon.rb
65
63
  lib/cloud_crowd/exceptions.rb
66
64
  lib/cloud_crowd/helpers/authorization.rb
67
65
  lib/cloud_crowd/helpers/resources.rb
68
66
  lib/cloud_crowd/helpers.rb
69
67
  lib/cloud_crowd/inflector.rb
70
68
  lib/cloud_crowd/models/job.rb
69
+ lib/cloud_crowd/models/node_record.rb
71
70
  lib/cloud_crowd/models/work_unit.rb
72
- lib/cloud_crowd/models/worker_record.rb
73
71
  lib/cloud_crowd/models.rb
74
- lib/cloud_crowd/runner.rb
72
+ lib/cloud_crowd/node.rb
75
73
  lib/cloud_crowd/schema.rb
74
+ lib/cloud_crowd/server.rb
76
75
  lib/cloud_crowd/worker.rb
77
76
  LICENSE
78
77
  public/css/admin_console.css
@@ -83,6 +82,8 @@ public/images/cloud_hand.png
83
82
  public/images/header_back.png
84
83
  public/images/logo.png
85
84
  public/images/queue_fill.png
85
+ public/images/server.png
86
+ public/images/server_busy.png
86
87
  public/images/server_error.png
87
88
  public/images/sidebar_bottom.png
88
89
  public/images/sidebar_top.png
@@ -93,7 +94,7 @@ public/js/excanvas.js
93
94
  public/js/flot.js
94
95
  public/js/jquery.js
95
96
  README
96
- test/acceptance/test_app.rb
97
+ test/acceptance/test_server.rb
97
98
  test/acceptance/test_failing_work_units.rb
98
99
  test/acceptance/test_word_count.rb
99
100
  test/blueprints.rb
@@ -106,6 +107,6 @@ test/unit/test_action.rb
106
107
  test/unit/test_configuration.rb
107
108
  test/unit/test_job.rb
108
109
  test/unit/test_work_unit.rb
109
- views/index.erb
110
+ views/operations_center.erb
110
111
  )
111
112
  end
data/config/config.example.ru CHANGED
@@ -4,7 +4,13 @@
4
4
  # using any Rack-compliant server handler. For example, start up three servers
5
5
  # with a specified port number, using Thin:
6
6
  #
7
- # thin start -R config.ru -p 9173 --servers 3
7
+ # thin start -R config.ru --servers 3
8
+ #
9
+ # Or a single server with Unicorn:
10
+ #
11
+ # unicorn config.ru
12
+ #
13
+
8
14
 
9
15
  require 'rubygems'
10
16
  require 'cloud-crowd'
@@ -13,5 +19,5 @@ CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
13
19
  CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
14
20
 
15
21
  map '/' do
16
- run CloudCrowd::App
22
+ run CloudCrowd::Server
17
23
  end
data/config/config.example.yml CHANGED
@@ -1,48 +1,44 @@
1
1
  # The URL where you're planning on running the central server/queue/database.
2
- :central_server: http://localhost:9173
2
+ :central_server: http://localhost:9173
3
+
4
+ # Set the maximum number of workers allowed per-node. Workers only run while
5
+ # there's work to be done. It's best to set 'max_workers' below the point where
6
+ # you'd start to swap or peg your CPU (as determined by experiment).
7
+ :max_workers: 5
3
8
 
4
9
  # The storage back-end that you'd like to use for intermediate and final results
5
10
  # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
6
- # be used in development, or on single-machine installations.
7
- :storage: s3
11
+ # be used in development, on single-machine installations, or networked drives.
12
+ :storage: s3
8
13
 
9
14
  # Please provide your AWS credentials for S3 storage of job output.
10
- :aws_access_key: [your AWS access key]
11
- :aws_secret_key: [your AWS secret access key]
15
+ :aws_access_key: [your AWS access key]
16
+ :aws_secret_key: [your AWS secret access key]
12
17
 
13
18
  # Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
14
19
  # to keep all resulting files on S3 private. If so, you'll receive authenticated
15
20
  # S3 URLs as job output, good for 24 hours. If left public, you'll get the
16
21
  # straight URLs to the files on S3.
17
- :s3_bucket: [your CloudCrowd bucket]
18
- :use_s3_authentication: no
22
+ :s3_bucket: [your CloudCrowd bucket]
23
+ :s3_authentication: no
24
+
25
+ # If you're using the 'filesystem' storage, perhaps with an NFS share or
26
+ # something similar, all files will be saved inside of the 'local_storage_path'.
27
+ # The default value if left unspecified is '/tmp/cloud_crowd_storage'.
28
+ :local_storage_path: /tmp/cloud_crowd_storage
19
29
 
20
30
  # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
21
31
  # to the central server). If yes, specify the login and password that all
22
32
  # requests must provide for authentication.
23
- :use_http_authentication: no
24
- :login: [your login name]
25
- :password: [your password]
33
+ :http_authentication: no
34
+ :login: [your login name]
35
+ :password: [your password]
26
36
 
27
37
  # By default, CloudCrowd looks for installed actions inside the 'actions'
28
38
  # subdirectory of this configuration folder. 'actions_path' allows you to load
29
39
  # additional actions from a location of your choice.
30
40
  # :actions_path: /path/to/actions
31
41
 
32
- # Set the following numbers to tweak the configuration of your worker daemons.
33
- # Optimum results will depend on proportion of the Memory/CPU/IO bottlenecks
34
- # in your actions, the number of central servers you have running, and your
35
- # desired balance between latency and traffic.
36
-
37
- # The number of workers that `crowd workers start` spins up.
38
- :num_workers: 3
39
-
40
- # The minimum number of seconds a worker waits between checking the job queue.
41
- :min_worker_wait: 1
42
-
43
- # The maximum number of seconds a worker waits between checking the job queue.
44
- :max_worker_wait: 5
45
-
46
42
  # The number of separate attempts that will be made to process an individual
47
43
  # work unit, before marking it as having failed.
48
- :work_unit_retries: 3
44
+ :work_unit_retries: 3
data/examples/process_pdfs_example.rb CHANGED
@@ -17,7 +17,7 @@ RestClient.post('http://localhost:9173/jobs',
17
17
  'http://tigger.uic.edu/~victor/personal/futurism.pdf',
18
18
  'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
19
19
  'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
20
- 'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
20
+ 'http://www.pitt.edu/~slavic/sisc/SISC4/dadswell.pdf'
21
21
  ],
22
22
 
23
23
  'options' => {
data/examples/word_count_example.rb CHANGED
@@ -39,3 +39,4 @@ RestClient.post('http://localhost:9173/jobs',
39
39
  )
40
40
 
41
41
  # With 23 Workers running, and over Wifi, it counted all the words in 5.5 secs.
42
+ # On a fast internet connection, you may not even see this job show up.
data/lib/cloud-crowd.rb CHANGED
@@ -5,16 +5,15 @@ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
5
5
  # Common Gems:
6
6
  require 'rubygems'
7
7
  gem 'activerecord'
8
- gem 'daemons'
9
8
  gem 'json'
10
9
  gem 'rest-client'
11
10
  gem 'right_aws'
12
11
  gem 'sinatra'
12
+ gem 'thin'
13
13
 
14
14
  # Autoloading for all the pieces which may or may not be needed:
15
15
  autoload :ActiveRecord, 'activerecord'
16
16
  autoload :Benchmark, 'benchmark'
17
- autoload :Daemons, 'daemons'
18
17
  autoload :Digest, 'digest'
19
18
  autoload :ERB, 'erb'
20
19
  autoload :FileUtils, 'fileutils'
@@ -23,6 +22,7 @@ autoload :RestClient, 'restclient'
23
22
  autoload :RightAws, 'right_aws'
24
23
  autoload :Sinatra, 'sinatra'
25
24
  autoload :Socket, 'socket'
25
+ autoload :Thin, 'thin'
26
26
  autoload :YAML, 'yaml'
27
27
 
28
28
  # Common code which should really be required in every circumstance.
@@ -30,47 +30,50 @@ require 'cloud_crowd/exceptions'
30
30
 
31
31
  module CloudCrowd
32
32
 
33
- # Autoload all the CloudCrowd classes which may not be required.
34
- autoload :App, 'cloud_crowd/app'
33
+ # Autoload all the CloudCrowd internals.
35
34
  autoload :Action, 'cloud_crowd/action'
36
35
  autoload :AssetStore, 'cloud_crowd/asset_store'
37
36
  autoload :Helpers, 'cloud_crowd/helpers'
38
37
  autoload :Inflector, 'cloud_crowd/inflector'
39
38
  autoload :Job, 'cloud_crowd/models'
39
+ autoload :Node, 'cloud_crowd/node'
40
+ autoload :NodeRecord, 'cloud_crowd/models'
41
+ autoload :Server, 'cloud_crowd/server'
40
42
  autoload :Worker, 'cloud_crowd/worker'
41
43
  autoload :WorkUnit, 'cloud_crowd/models'
42
- autoload :WorkerRecord, 'cloud_crowd/models'
43
44
 
44
- # Root directory of the CloudCrowd gem.
45
- ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
45
+ # Keep this version in sync with the gemspec.
46
+ VERSION = '0.2.0'
47
+
48
+ # Increment the schema version when there's a backwards incompatible change.
49
+ SCHEMA_VERSION = 2
46
50
 
47
- # Keep the version in sync with the gemspec.
48
- VERSION = '0.1.0'
51
+ # Root directory of the CloudCrowd gem.
52
+ ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
49
53
 
50
- # A Job is processing if its WorkUnits in the queue to be handled by workers.
51
- PROCESSING = 1
54
+ # A Job is processing if its WorkUnits are in the queue to be handled by nodes.
55
+ PROCESSING = 1
52
56
 
53
57
  # A Job has succeeded if all of its WorkUnits have finished successfully.
54
- SUCCEEDED = 2
58
+ SUCCEEDED = 2
55
59
 
56
60
  # A Job has failed if even a single one of its WorkUnits has failed (they may
57
61
  # be attempted multiple times on failure, however).
58
- FAILED = 3
62
+ FAILED = 3
59
63
 
60
64
  # A Job is splitting if it's in the process of dividing its inputs up into
61
65
  # multiple WorkUnits.
62
- SPLITTING = 4
66
+ SPLITTING = 4
63
67
 
64
68
  # A Job is merging if it's busy collecting all of its successful WorkUnits
65
69
  # back together into the final result.
66
- MERGING = 5
70
+ MERGING = 5
67
71
 
68
- # A work unit is considered to be complete if it succeeded or if it failed.
69
- COMPLETE = [SUCCEEDED, FAILED]
72
+ # A Job is considered to be complete if it succeeded or if it failed.
73
+ COMPLETE = [SUCCEEDED, FAILED]
70
74
 
71
- # A work unit is considered incomplete if it's being processed, split up or
72
- # merged together.
73
- INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
75
+ # A Job is considered incomplete if it's being processed, split up or merged.
76
+ INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
74
77
 
75
78
  # Mapping of statuses to their display strings.
76
79
  DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
@@ -87,18 +90,34 @@ module CloudCrowd
87
90
  # Configure the CloudCrowd central database (and connect to it), by passing
88
91
  # in a path to <tt>database.yml</tt>. The file should use the standard
89
92
  # ActiveRecord connection format.
90
- def configure_database(config_path)
93
+ def configure_database(config_path, validate_schema=true)
91
94
  configuration = YAML.load_file(config_path)
92
95
  ActiveRecord::Base.establish_connection(configuration)
96
+ if validate_schema
97
+ version = ActiveRecord::Base.connection.select_values('select max(version) from schema_migrations').first.to_i
98
+ return true if version == SCHEMA_VERSION
99
+ puts "Your database schema is out of date. Please use `crowd load_schema` to update it. This will wipe all the tables, so make sure that your jobs have a chance to finish first.\nexiting..."
100
+ exit
101
+ end
93
102
  end
94
103
 
95
- # Get a reference to the central server, including authentication,
96
- # if configured.
104
+ # Get a reference to the central server, including authentication if
105
+ # configured.
97
106
  def central_server
98
- return @central_server if @central_server
99
- params = [CloudCrowd.config[:central_server]]
100
- params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
101
- @central_server = RestClient::Resource.new(*params)
107
+ @central_server ||= RestClient::Resource.new(CloudCrowd.config[:central_server], CloudCrowd.client_options)
108
+ end
109
+
110
+ # The standard RestClient options for the central server talking to nodes,
111
+ # as well as the other way around. There's a timeout of 5 seconds to open
112
+ # a connection, and a timeout of 30 to finish reading it.
113
+ def client_options
114
+ return @client_options if @client_options
115
+ @client_options = {:timeout => 30, :open_timeout => 5}
116
+ if CloudCrowd.config[:http_authentication]
117
+ @client_options[:user] = CloudCrowd.config[:login]
118
+ @client_options[:password] = CloudCrowd.config[:password]
119
+ end
120
+ @client_options
102
121
  end
103
122
 
104
123
  # Return the displayable status name of an internal CloudCrowd status number.
@@ -110,7 +129,7 @@ module CloudCrowd
110
129
  # CloudCrowd::Actions are requested dynamically by name. Access them through
111
130
  # this actions property, which behaves like a hash. At load time, we
112
131
  # load all installed Actions and CloudCrowd's default Actions into it.
113
- # If you wish to have certain workers be specialized to only handle certain
132
+ # If you wish to have certain nodes be specialized to only handle certain
114
133
  # Actions, then install only those into the actions directory.
115
134
  def actions
116
135
  return @actions if @actions
data/lib/cloud_crowd/action.rb CHANGED
@@ -38,13 +38,19 @@ module CloudCrowd
38
38
 
39
39
  # Download a file to the specified path.
40
40
  def download(url, path)
41
- if url.match(FILE_URL)
42
- FileUtils.cp(url.sub(FILE_URL, ''), path)
43
- else
44
- resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
45
- FileUtils.mv resp.file.path, path
46
- end
47
- path
41
+ `curl -s "#{url}" > "#{path}"`
42
+ return path
43
+ # The previous implementation is below, and, although it would be
44
+ # wonderful not to shell out, RestClient wasn't handling URLs with encoded
45
+ # entities (%20, for example), and doesn't let you download to a given
46
+ # location. Getting a RestClient patch in would be ideal.
47
+ #
48
+ # if url.match(FILE_URL)
49
+ # FileUtils.cp(url.sub(FILE_URL, ''), path)
50
+ # else
51
+ # resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
52
+ # FileUtils.mv resp.file.path, path
53
+ # end
48
54
  end
49
55
 
50
56
  # Takes a local filesystem path, saves the file to S3, and returns the
@@ -55,7 +61,7 @@ module CloudCrowd
55
61
  end
56
62
 
57
63
  # After the Action has finished, we remove the work directory and return
58
- # to the root directory (where daemons run by default).
64
+ # to the root directory (where workers run by default).
59
65
  def cleanup_work_directory
60
66
  FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
61
67
  end
data/lib/cloud_crowd/asset_store.rb CHANGED
@@ -3,18 +3,18 @@ require 'tmpdir'
3
3
  module CloudCrowd
4
4
 
5
5
  # The AssetStore provides a common API for storing files and returning URLs
6
- # that can access them. In production this will be S3 but in development
7
- # it may be the filesystem.
6
+ # that can access them. At the moment, the files can be saved to either S3, or
7
+ # the local filesystem. You shouldn't need to use the AssetStore directly --
8
+ # Action's +download+ and +save+ methods use it behind the scenes.
8
9
  #
9
- # You shouldn't need to use the AssetStore directly -- Action's +download+
10
- # and +save+ methods use it behind the scenes.
10
+ # To implement a new back-end for the AssetStore, you must provide
11
+ # <tt>save(local_path, save_path)</tt>, <tt>cleanup(job)</tt>, and optionally,
12
+ # a <tt>setup</tt> method that will be called once at initialization.
11
13
  class AssetStore
12
14
 
13
15
  autoload :S3Store, 'cloud_crowd/asset_store/s3_store'
14
16
  autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
15
-
16
- LOCAL_STORAGE_PATH = '/tmp/cloud_crowd_storage'
17
-
17
+
18
18
  # Configure the AssetStore with the specific storage implementation
19
19
  # specified by 'storage' in <tt>config.yml</tt>.
20
20
  case CloudCrowd.config[:storage]
@@ -25,9 +25,9 @@ module CloudCrowd
25
25
 
26
26
  # Creating the AssetStore ensures that its scratch directory exists.
27
27
  def initialize
28
- @use_auth = CloudCrowd.config[:use_s3_authentication]
29
28
  FileUtils.mkdir_p temp_storage_path unless File.exists? temp_storage_path
30
29
  raise Error::StorageNotWritable, "#{temp_storage_path} is not writable" unless File.writable?(temp_storage_path)
30
+ setup if respond_to? :setup
31
31
  end
32
32
 
33
33
  # Get the path to CloudCrowd's temporary local storage. All actions run