cloud-crowd 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. data/README +16 -16
  2. data/cloud-crowd.gemspec +10 -9
  3. data/config/config.example.ru +8 -2
  4. data/config/config.example.yml +21 -25
  5. data/examples/process_pdfs_example.rb +1 -1
  6. data/examples/word_count_example.rb +1 -0
  7. data/lib/cloud-crowd.rb +47 -28
  8. data/lib/cloud_crowd/action.rb +14 -8
  9. data/lib/cloud_crowd/asset_store.rb +8 -8
  10. data/lib/cloud_crowd/asset_store/filesystem_store.rb +18 -7
  11. data/lib/cloud_crowd/asset_store/s3_store.rb +14 -11
  12. data/lib/cloud_crowd/command_line.rb +24 -58
  13. data/lib/cloud_crowd/exceptions.rb +7 -0
  14. data/lib/cloud_crowd/helpers/authorization.rb +5 -3
  15. data/lib/cloud_crowd/helpers/resources.rb +0 -20
  16. data/lib/cloud_crowd/models.rb +1 -1
  17. data/lib/cloud_crowd/models/job.rb +37 -40
  18. data/lib/cloud_crowd/models/node_record.rb +95 -0
  19. data/lib/cloud_crowd/models/work_unit.rb +87 -33
  20. data/lib/cloud_crowd/node.rb +105 -0
  21. data/lib/cloud_crowd/schema.rb +22 -18
  22. data/lib/cloud_crowd/{app.rb → server.rb} +34 -34
  23. data/lib/cloud_crowd/worker.rb +68 -107
  24. data/public/css/admin_console.css +40 -18
  25. data/public/images/server.png +0 -0
  26. data/public/images/server_busy.png +0 -0
  27. data/public/js/admin_console.js +47 -18
  28. data/test/acceptance/test_failing_work_units.rb +1 -1
  29. data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
  30. data/test/acceptance/test_word_count.rb +3 -9
  31. data/test/blueprints.rb +0 -1
  32. data/test/config/config.ru +1 -1
  33. data/test/config/config.yml +2 -4
  34. data/test/unit/test_action.rb +1 -1
  35. data/test/unit/test_configuration.rb +1 -1
  36. data/test/unit/test_job.rb +3 -0
  37. data/test/unit/test_work_unit.rb +2 -4
  38. data/views/{index.erb → operations_center.erb} +13 -8
  39. metadata +11 -10
  40. data/lib/cloud_crowd/daemon.rb +0 -95
  41. data/lib/cloud_crowd/models/worker_record.rb +0 -61
  42. data/lib/cloud_crowd/runner.rb +0 -15
data/README CHANGED
@@ -26,7 +26,7 @@
26
26
 
27
27
  * Parallel processing for the rest of us
28
28
  * Write your scripts in Ruby
29
- * Built for Amazon EC2 and S3
29
+ * Works with Amazon EC2 and S3
30
30
  * split -> process -> merge
31
31
  * As easy as `gem install cloud-crowd`
32
32
 
@@ -63,31 +63,31 @@
63
63
  # Edit the configuration files to your satisfaction, add AWS credentials,
64
64
  # and then load the CloudCrowd schema into your configured database.
65
65
 
66
- >> mate ~/config/cloud-crowd/config.yml
67
- >> mate ~/config/cloud-crowd/database.yml
66
+ >> cd ~/config/cloud-crowd
67
+ >> mate config.yml
68
+ >> mate database.yml
69
+ >> [create the database you just configured...]
68
70
  >> crowd load_schema
69
71
 
70
72
  # Write your actions, and install them into the 'actions' subdirectory.
71
- # CloudCrowd comes with some default actions as an example.
73
+ # CloudCrowd comes with a few default actions as an example.
72
74
 
73
75
  # To launch the central server (make sure that you include its location
74
- # in config.yml), either:
76
+ # in config.yml):
75
77
 
76
78
  >> crowd server
77
79
 
78
- # or:
80
+ # The configuration folder also includes 'config.ru', which can be used by
81
+ # any Rack-compliant webserver to run your central server.
79
82
 
80
- >> thin -R config.ru --servers 3 -e production start
83
+ # Then, to launch a node of workers:
81
84
 
82
- # Any server that supports Rack should work with the rackup file.
85
+ >> crowd node
83
86
 
84
- # Then, to spin up 10 workers:
87
+ # To spin up remote nodes, install the 'cloud-crowd' gem and copy over
88
+ # your configuration directory. Run `crowd node`, and the remote machines
89
+ # will register with the central server, becoming available for processing.
85
90
 
86
- >> crowd workers start -n 10
87
-
88
- # To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
89
- # your configuration directory.
90
-
91
- # At this point you can visit your server console at localhost:9173 to
92
- # view all of your workers, ready for action.
91
+ # At this point you can visit your Operations Center at localhost:9173 to
92
+ # view all of your nodes, ready for action.
93
93
 
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'cloud-crowd'
3
- s.version = '0.1.0' # Keep version in sync with cloud-cloud.rb
4
- s.date = '2009-09-14'
3
+ s.version = '0.2.0' # Keep version in sync with cloud-crowd.rb
4
+ s.date = '2009-09-17'
5
5
 
6
6
  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
7
7
  s.summary = "Parallel Processing for the Rest of Us"
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
32
32
  s.add_dependency 'json', ['>= 1.1.7']
33
33
  s.add_dependency 'rest-client', ['>= 1.0.3']
34
34
  s.add_dependency 'right_aws', ['>= 1.10.0']
35
- s.add_dependency 'daemons', ['>= 1.0.10']
35
+ s.add_dependency 'thin', ['>= 1.2.4']
36
36
 
37
37
  if s.respond_to?(:add_development_dependency)
38
38
  s.add_development_dependency 'faker', ['>= 0.3.1']
@@ -56,23 +56,22 @@ examples/process_pdfs_example.rb
56
56
  examples/word_count_example.rb
57
57
  lib/cloud-crowd.rb
58
58
  lib/cloud_crowd/action.rb
59
- lib/cloud_crowd/app.rb
60
59
  lib/cloud_crowd/asset_store/filesystem_store.rb
61
60
  lib/cloud_crowd/asset_store/s3_store.rb
62
61
  lib/cloud_crowd/asset_store.rb
63
62
  lib/cloud_crowd/command_line.rb
64
- lib/cloud_crowd/daemon.rb
65
63
  lib/cloud_crowd/exceptions.rb
66
64
  lib/cloud_crowd/helpers/authorization.rb
67
65
  lib/cloud_crowd/helpers/resources.rb
68
66
  lib/cloud_crowd/helpers.rb
69
67
  lib/cloud_crowd/inflector.rb
70
68
  lib/cloud_crowd/models/job.rb
69
+ lib/cloud_crowd/models/node_record.rb
71
70
  lib/cloud_crowd/models/work_unit.rb
72
- lib/cloud_crowd/models/worker_record.rb
73
71
  lib/cloud_crowd/models.rb
74
- lib/cloud_crowd/runner.rb
72
+ lib/cloud_crowd/node.rb
75
73
  lib/cloud_crowd/schema.rb
74
+ lib/cloud_crowd/server.rb
76
75
  lib/cloud_crowd/worker.rb
77
76
  LICENSE
78
77
  public/css/admin_console.css
@@ -83,6 +82,8 @@ public/images/cloud_hand.png
83
82
  public/images/header_back.png
84
83
  public/images/logo.png
85
84
  public/images/queue_fill.png
85
+ public/images/server.png
86
+ public/images/server_busy.png
86
87
  public/images/server_error.png
87
88
  public/images/sidebar_bottom.png
88
89
  public/images/sidebar_top.png
@@ -93,7 +94,7 @@ public/js/excanvas.js
93
94
  public/js/flot.js
94
95
  public/js/jquery.js
95
96
  README
96
- test/acceptance/test_app.rb
97
+ test/acceptance/test_server.rb
97
98
  test/acceptance/test_failing_work_units.rb
98
99
  test/acceptance/test_word_count.rb
99
100
  test/blueprints.rb
@@ -106,6 +107,6 @@ test/unit/test_action.rb
106
107
  test/unit/test_configuration.rb
107
108
  test/unit/test_job.rb
108
109
  test/unit/test_work_unit.rb
109
- views/index.erb
110
+ views/operations_center.erb
110
111
  )
111
112
  end
@@ -4,7 +4,13 @@
4
4
  # using any Rack-compliant server handler. For example, start up three servers
5
5
  # with a specified port number, using Thin:
6
6
  #
7
- # thin start -R config.ru -p 9173 --servers 3
7
+ # thin start -R config.ru --servers 3
8
+ #
9
+ # Or a single server with Unicorn:
10
+ #
11
+ # unicorn config.ru
12
+ #
13
+
8
14
 
9
15
  require 'rubygems'
10
16
  require 'cloud-crowd'
@@ -13,5 +19,5 @@ CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml')
13
19
  CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
14
20
 
15
21
  map '/' do
16
- run CloudCrowd::App
22
+ run CloudCrowd::Server
17
23
  end
@@ -1,48 +1,44 @@
1
1
  # The URL where you're planning on running the central server/queue/database.
2
- :central_server: http://localhost:9173
2
+ :central_server: http://localhost:9173
3
+
4
+ # Set the maximum number of workers allowed per-node. Workers only run while
5
+ # there's work to be done. It's best to set 'max_workers' below the point where
6
+ # you'd start to swap or peg your CPU (as determined by experiment).
7
+ :max_workers: 5
3
8
 
4
9
  # The storage back-end that you'd like to use for intermediate and final results
5
10
  # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
6
- # be used in development, or on single-machine installations.
7
- :storage: s3
11
+ # be used in development, on single-machine installations, or on networked drives.
12
+ :storage: s3
8
13
 
9
14
  # Please provide your AWS credentials for S3 storage of job output.
10
- :aws_access_key: [your AWS access key]
11
- :aws_secret_key: [your AWS secret access key]
15
+ :aws_access_key: [your AWS access key]
16
+ :aws_secret_key: [your AWS secret access key]
12
17
 
13
18
  # Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
14
19
  # to keep all resulting files on S3 private. If so, you'll receive authenticated
15
20
  # S3 URLs as job output, good for 24 hours. If left public, you'll get the
16
21
  # straight URLs to the files on S3.
17
- :s3_bucket: [your CloudCrowd bucket]
18
- :use_s3_authentication: no
22
+ :s3_bucket: [your CloudCrowd bucket]
23
+ :s3_authentication: no
24
+
25
+ # If you're using the 'filesystem' storage, perhaps with an NFS share or
26
+ # something similar, all files will be saved inside of the 'local_storage_path'.
27
+ # The default value if left unspecified is '/tmp/cloud_crowd_storage'.
28
+ :local_storage_path: /tmp/cloud_crowd_storage
19
29
 
20
30
  # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
21
31
  # to the central server). If yes, specify the login and password that all
22
32
  # requests must provide for authentication.
23
- :use_http_authentication: no
24
- :login: [your login name]
25
- :password: [your password]
33
+ :http_authentication: no
34
+ :login: [your login name]
35
+ :password: [your password]
26
36
 
27
37
  # By default, CloudCrowd looks for installed actions inside the 'actions'
28
38
  # subdirectory of this configuration folder. 'actions_path' allows you to load
29
39
  # additional actions from a location of your choice.
30
40
  # :actions_path: /path/to/actions
31
41
 
32
- # Set the following numbers to tweak the configuration of your worker daemons.
33
- # Optimum results will depend on proportion of the Memory/CPU/IO bottlenecks
34
- # in your actions, the number of central servers you have running, and your
35
- # desired balance between latency and traffic.
36
-
37
- # The number of workers that `crowd workers start` spins up.
38
- :num_workers: 3
39
-
40
- # The minimum number of seconds a worker waits between checking the job queue.
41
- :min_worker_wait: 1
42
-
43
- # The maximum number of seconds a worker waits between checking the job queue.
44
- :max_worker_wait: 5
45
-
46
42
  # The number of separate attempts that will be made to process an individual
47
43
  # work unit, before marking it as having failed.
48
- :work_unit_retries: 3
44
+ :work_unit_retries: 3
@@ -17,7 +17,7 @@ RestClient.post('http://localhost:9173/jobs',
17
17
  'http://tigger.uic.edu/~victor/personal/futurism.pdf',
18
18
  'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
19
19
  'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
20
- 'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
20
+ 'http://www.pitt.edu/~slavic/sisc/SISC4/dadswell.pdf'
21
21
  ],
22
22
 
23
23
  'options' => {
@@ -39,3 +39,4 @@ RestClient.post('http://localhost:9173/jobs',
39
39
  )
40
40
 
41
41
  # With 23 Workers running, and over Wifi, it counted all the words in 5.5 secs.
42
+ # On a fast internet connection, you may not even see this job show up.
@@ -5,16 +5,15 @@ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
5
5
  # Common Gems:
6
6
  require 'rubygems'
7
7
  gem 'activerecord'
8
- gem 'daemons'
9
8
  gem 'json'
10
9
  gem 'rest-client'
11
10
  gem 'right_aws'
12
11
  gem 'sinatra'
12
+ gem 'thin'
13
13
 
14
14
  # Autoloading for all the pieces which may or may not be needed:
15
15
  autoload :ActiveRecord, 'activerecord'
16
16
  autoload :Benchmark, 'benchmark'
17
- autoload :Daemons, 'daemons'
18
17
  autoload :Digest, 'digest'
19
18
  autoload :ERB, 'erb'
20
19
  autoload :FileUtils, 'fileutils'
@@ -23,6 +22,7 @@ autoload :RestClient, 'restclient'
23
22
  autoload :RightAws, 'right_aws'
24
23
  autoload :Sinatra, 'sinatra'
25
24
  autoload :Socket, 'socket'
25
+ autoload :Thin, 'thin'
26
26
  autoload :YAML, 'yaml'
27
27
 
28
28
  # Common code which should really be required in every circumstance.
@@ -30,47 +30,50 @@ require 'cloud_crowd/exceptions'
30
30
 
31
31
  module CloudCrowd
32
32
 
33
- # Autoload all the CloudCrowd classes which may not be required.
34
- autoload :App, 'cloud_crowd/app'
33
+ # Autoload all the CloudCrowd internals.
35
34
  autoload :Action, 'cloud_crowd/action'
36
35
  autoload :AssetStore, 'cloud_crowd/asset_store'
37
36
  autoload :Helpers, 'cloud_crowd/helpers'
38
37
  autoload :Inflector, 'cloud_crowd/inflector'
39
38
  autoload :Job, 'cloud_crowd/models'
39
+ autoload :Node, 'cloud_crowd/node'
40
+ autoload :NodeRecord, 'cloud_crowd/models'
41
+ autoload :Server, 'cloud_crowd/server'
40
42
  autoload :Worker, 'cloud_crowd/worker'
41
43
  autoload :WorkUnit, 'cloud_crowd/models'
42
- autoload :WorkerRecord, 'cloud_crowd/models'
43
44
 
44
- # Root directory of the CloudCrowd gem.
45
- ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
45
+ # Keep this version in sync with the gemspec.
46
+ VERSION = '0.2.0'
47
+
48
+ # Increment the schema version when there's a backwards incompatible change.
49
+ SCHEMA_VERSION = 2
46
50
 
47
- # Keep the version in sync with the gemspec.
48
- VERSION = '0.1.0'
51
+ # Root directory of the CloudCrowd gem.
52
+ ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
49
53
 
50
- # A Job is processing if its WorkUnits in the queue to be handled by workers.
51
- PROCESSING = 1
54
+ # A Job is processing if its WorkUnits are in the queue to be handled by nodes.
55
+ PROCESSING = 1
52
56
 
53
57
  # A Job has succeeded if all of its WorkUnits have finished successfully.
54
- SUCCEEDED = 2
58
+ SUCCEEDED = 2
55
59
 
56
60
  # A Job has failed if even a single one of its WorkUnits has failed (they may
57
61
  # be attempted multiple times on failure, however).
58
- FAILED = 3
62
+ FAILED = 3
59
63
 
60
64
  # A Job is splitting if it's in the process of dividing its inputs up into
61
65
  # multiple WorkUnits.
62
- SPLITTING = 4
66
+ SPLITTING = 4
63
67
 
64
68
  # A Job is merging if it's busy collecting all of its successful WorkUnits
65
69
  # back together into the final result.
66
- MERGING = 5
70
+ MERGING = 5
67
71
 
68
- # A work unit is considered to be complete if it succeeded or if it failed.
69
- COMPLETE = [SUCCEEDED, FAILED]
72
+ # A Job is considered to be complete if it succeeded or if it failed.
73
+ COMPLETE = [SUCCEEDED, FAILED]
70
74
 
71
- # A work unit is considered incomplete if it's being processed, split up or
72
- # merged together.
73
- INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
75
+ # A Job is considered incomplete if it's being processed, split up or merged.
76
+ INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
74
77
 
75
78
  # Mapping of statuses to their display strings.
76
79
  DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
@@ -87,18 +90,34 @@ module CloudCrowd
87
90
  # Configure the CloudCrowd central database (and connect to it), by passing
88
91
  # in a path to <tt>database.yml</tt>. The file should use the standard
89
92
  # ActiveRecord connection format.
90
- def configure_database(config_path)
93
+ def configure_database(config_path, validate_schema=true)
91
94
  configuration = YAML.load_file(config_path)
92
95
  ActiveRecord::Base.establish_connection(configuration)
96
+ if validate_schema
97
+ version = ActiveRecord::Base.connection.select_values('select max(version) from schema_migrations').first.to_i
98
+ return true if version == SCHEMA_VERSION
99
+ puts "Your database schema is out of date. Please use `crowd load_schema` to update it. This will wipe all the tables, so make sure that your jobs have a chance to finish first.\nexiting..."
100
+ exit
101
+ end
93
102
  end
94
103
 
95
- # Get a reference to the central server, including authentication,
96
- # if configured.
104
+ # Get a reference to the central server, including authentication if
105
+ # configured.
97
106
  def central_server
98
- return @central_server if @central_server
99
- params = [CloudCrowd.config[:central_server]]
100
- params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
101
- @central_server = RestClient::Resource.new(*params)
107
+ @central_server ||= RestClient::Resource.new(CloudCrowd.config[:central_server], CloudCrowd.client_options)
108
+ end
109
+
110
+ # The standard RestClient options for the central server talking to nodes,
111
+ # as well as the other way around. There's a timeout of 5 seconds to open
112
+ # a connection, and a timeout of 30 to finish reading it.
113
+ def client_options
114
+ return @client_options if @client_options
115
+ @client_options = {:timeout => 30, :open_timeout => 5}
116
+ if CloudCrowd.config[:http_authentication]
117
+ @client_options[:user] = CloudCrowd.config[:login]
118
+ @client_options[:password] = CloudCrowd.config[:password]
119
+ end
120
+ @client_options
102
121
  end
103
122
 
104
123
  # Return the displayable status name of an internal CloudCrowd status number.
@@ -110,7 +129,7 @@ module CloudCrowd
110
129
  # CloudCrowd::Actions are requested dynamically by name. Access them through
111
130
  # this actions property, which behaves like a hash. At load time, we
112
131
  # load all installed Actions and CloudCrowd's default Actions into it.
113
- # If you wish to have certain workers be specialized to only handle certain
132
+ # If you wish to have certain nodes be specialized to only handle certain
114
133
  # Actions, then install only those into the actions directory.
115
134
  def actions
116
135
  return @actions if @actions
@@ -38,13 +38,19 @@ module CloudCrowd
38
38
 
39
39
  # Download a file to the specified path.
40
40
  def download(url, path)
41
- if url.match(FILE_URL)
42
- FileUtils.cp(url.sub(FILE_URL, ''), path)
43
- else
44
- resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
45
- FileUtils.mv resp.file.path, path
46
- end
47
- path
41
+ `curl -s "#{url}" > "#{path}"`
42
+ return path
43
+ # The previous implementation is below, and, although it would be
44
+ # wonderful not to shell out, RestClient wasn't handling URLs with encoded
45
+ # entities (%20, for example), and doesn't let you download to a given
46
+ # location. Getting a RestClient patch in would be ideal.
47
+ #
48
+ # if url.match(FILE_URL)
49
+ # FileUtils.cp(url.sub(FILE_URL, ''), path)
50
+ # else
51
+ # resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
52
+ # FileUtils.mv resp.file.path, path
53
+ # end
48
54
  end
49
55
 
50
56
  # Takes a local filesystem path, saves the file to S3, and returns the
@@ -55,7 +61,7 @@ module CloudCrowd
55
61
  end
56
62
 
57
63
  # After the Action has finished, we remove the work directory and return
58
- # to the root directory (where daemons run by default).
64
+ # to the root directory (where workers run by default).
59
65
  def cleanup_work_directory
60
66
  FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
61
67
  end
@@ -3,18 +3,18 @@ require 'tmpdir'
3
3
  module CloudCrowd
4
4
 
5
5
  # The AssetStore provides a common API for storing files and returning URLs
6
- # that can access them. In production this will be S3 but in development
7
- # it may be the filesystem.
6
+ # that can access them. At the moment, the files can be saved to either S3, or
7
+ # the local filesystem. You shouldn't need to use the AssetStore directly --
8
+ # Action's +download+ and +save+ methods use it behind the scenes.
8
9
  #
9
- # You shouldn't need to use the AssetStore directly -- Action's +download+
10
- # and +save+ methods use it behind the scenes.
10
+ # To implement a new back-end for the AssetStore, you must provide
11
+ # <tt>save(local_path, save_path)</tt>, <tt>cleanup(job)</tt>, and optionally,
12
+ # a <tt>setup</tt> method that will be called once at initialization.
11
13
  class AssetStore
12
14
 
13
15
  autoload :S3Store, 'cloud_crowd/asset_store/s3_store'
14
16
  autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
15
-
16
- LOCAL_STORAGE_PATH = '/tmp/cloud_crowd_storage'
17
-
17
+
18
18
  # Configure the AssetStore with the specific storage implementation
19
19
  # specified by 'storage' in <tt>config.yml</tt>.
20
20
  case CloudCrowd.config[:storage]
@@ -25,9 +25,9 @@ module CloudCrowd
25
25
 
26
26
  # Creating the AssetStore ensures that its scratch directory exists.
27
27
  def initialize
28
- @use_auth = CloudCrowd.config[:use_s3_authentication]
29
28
  FileUtils.mkdir_p temp_storage_path unless File.exists? temp_storage_path
30
29
  raise Error::StorageNotWritable, "#{temp_storage_path} is not writable" unless File.writable?(temp_storage_path)
30
+ setup if respond_to? :setup
31
31
  end
32
32
 
33
33
  # Get the path to CloudCrowd's temporary local storage. All actions run