documentcloud-cloud-crowd 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -26,7 +26,7 @@
26
26
 
27
27
  * Parallel processing for the rest of us
28
28
  * Write your scripts in Ruby
29
- * Built for Amazon EC2 and S3
29
+ * Works with Amazon EC2 and S3
30
30
  * split -> process -> merge
31
31
  * As easy as `gem install cloud-crowd`
32
32
 
@@ -63,31 +63,31 @@
63
63
  # Edit the configuration files to your satisfaction, add AWS credentials,
64
64
  # and then load the CloudCrowd schema into your configured database.
65
65
 
66
- >> mate ~/config/cloud-crowd/config.yml
67
- >> mate ~/config/cloud-crowd/database.yml
66
+ >> cd ~/config/cloud-crowd
67
+ >> mate config.yml
68
+ >> mate database.yml
69
+ >> [create the database you just configured...]
68
70
  >> crowd load_schema
69
71
 
70
72
  # Write your actions, and install them into the 'actions' subdirectory.
71
- # CloudCrowd comes with some default actions as an example.
73
+ # CloudCrowd comes with a few default actions as examples.
72
74
 
73
75
  # To launch the central server (make sure that you include its location
74
- # in config.yml), either:
76
+ # in config.yml):
75
77
 
76
78
  >> crowd server
77
79
 
78
- # or:
80
+ # The configuration folder also includes 'config.ru', which can be used by
81
+ # any Rack-compliant webserver to run your central server.
79
82
 
80
- >> thin -R config.ru --servers 3 -e production start
83
+ # Then, to launch a node of workers:
81
84
 
82
- # Any server that supports Rack should work with the rackup file.
85
+ >> crowd node
83
86
 
84
- # Then, to spin up 10 workers:
87
+ # To spin up remote nodes, install the 'cloud-crowd' gem and copy over
88
+ # your configuration directory. Run `crowd node`, and the remote machines
89
+ # will register with the central server, becoming available for processing.
85
90
 
86
- >> crowd workers start -n 10
87
-
88
- # To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
89
- # your configuration directory.
90
-
91
- # At this point you can visit your server console at localhost:9173 to
92
- # view all of your workers, ready for action.
91
+ # At this point you can visit your Operations Center at localhost:9173 to
92
+ # view all of your nodes, ready for action.
93
93
 
data/cloud-crowd.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'cloud-crowd'
3
- s.version = '0.1.1' # Keep version in sync with cloud-crowd.rb
4
- s.date = '2009-09-15'
3
+ s.version = '0.2.0' # Keep version in sync with cloud-crowd.rb
4
+ s.date = '2009-09-17'
5
5
 
6
6
  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
7
7
  s.summary = "Parallel Processing for the Rest of Us"
@@ -83,6 +83,7 @@ public/images/header_back.png
83
83
  public/images/logo.png
84
84
  public/images/queue_fill.png
85
85
  public/images/server.png
86
+ public/images/server_busy.png
86
87
  public/images/server_error.png
87
88
  public/images/sidebar_bottom.png
88
89
  public/images/sidebar_top.png
@@ -106,6 +107,6 @@ test/unit/test_action.rb
106
107
  test/unit/test_configuration.rb
107
108
  test/unit/test_job.rb
108
109
  test/unit/test_work_unit.rb
109
- views/index.erb
110
+ views/operations_center.erb
110
111
  )
111
112
  end
@@ -1,33 +1,38 @@
1
1
  # The URL where you're planning on running the central server/queue/database.
2
- :central_server: http://localhost:9173
2
+ :central_server: http://localhost:9173
3
3
 
4
4
  # Set the maximum number of workers allowed per-node. Workers only run while
5
5
  # there's work to be done. It's best to set 'max_workers' below the point where
6
6
  # you'd start to swap or peg your CPU (as determined by experiment).
7
- :max_workers: 5
7
+ :max_workers: 5
8
8
 
9
9
  # The storage back-end that you'd like to use for intermediate and final results
10
10
  # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
11
- # be used in development, or on single-machine installations.
12
- :storage: s3
11
+ # be used in development, on single-machine installations, or on networked drives.
12
+ :storage: s3
13
13
 
14
14
  # Please provide your AWS credentials for S3 storage of job output.
15
- :aws_access_key: [your AWS access key]
16
- :aws_secret_key: [your AWS secret access key]
15
+ :aws_access_key: [your AWS access key]
16
+ :aws_secret_key: [your AWS secret access key]
17
17
 
18
18
  # Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
19
19
  # to keep all resulting files on S3 private. If so, you'll receive authenticated
20
20
  # S3 URLs as job output, good for 24 hours. If left public, you'll get the
21
21
  # straight URLs to the files on S3.
22
- :s3_bucket: [your CloudCrowd bucket]
23
- :use_s3_authentication: no
22
+ :s3_bucket: [your CloudCrowd bucket]
23
+ :s3_authentication: no
24
+
25
+ # If you're using the 'filesystem' storage, perhaps with an NFS share or
26
+ # something similar, all files will be saved inside of the 'local_storage_path'.
27
+ # The default value if left unspecified is '/tmp/cloud_crowd_storage'.
28
+ :local_storage_path: /tmp/cloud_crowd_storage
24
29
 
25
30
  # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
26
31
  # to the central server). If yes, specify the login and password that all
27
32
  # requests must provide for authentication.
28
- :use_http_authentication: no
29
- :login: [your login name]
30
- :password: [your password]
33
+ :http_authentication: no
34
+ :login: [your login name]
35
+ :password: [your password]
31
36
 
32
37
  # By default, CloudCrowd looks for installed actions inside the 'actions'
33
38
  # subdirectory of this configuration folder. 'actions_path' allows you to load
@@ -36,4 +41,4 @@
36
41
 
37
42
  # The number of separate attempts that will be made to process an individual
38
43
  # work unit, before marking it as having failed.
39
- :work_unit_retries: 3
44
+ :work_unit_retries: 3
data/lib/cloud-crowd.rb CHANGED
@@ -30,7 +30,7 @@ require 'cloud_crowd/exceptions'
30
30
 
31
31
  module CloudCrowd
32
32
 
33
- # Autoload all the CloudCrowd classes which may not be required.
33
+ # Autoload all the CloudCrowd internals.
34
34
  autoload :Action, 'cloud_crowd/action'
35
35
  autoload :AssetStore, 'cloud_crowd/asset_store'
36
36
  autoload :Helpers, 'cloud_crowd/helpers'
@@ -42,36 +42,38 @@ module CloudCrowd
42
42
  autoload :Worker, 'cloud_crowd/worker'
43
43
  autoload :WorkUnit, 'cloud_crowd/models'
44
44
 
45
- # Root directory of the CloudCrowd gem.
46
- ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
45
+ # Keep this version in sync with the gemspec.
46
+ VERSION = '0.2.0'
47
+
48
+ # Increment the schema version when there's a backwards incompatible change.
49
+ SCHEMA_VERSION = 2
47
50
 
48
- # Keep the version in sync with the gemspec.
49
- VERSION = '0.1.1'
51
+ # Root directory of the CloudCrowd gem.
52
+ ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
50
53
 
51
- # A Job is processing if its WorkUnits in the queue to be handled by workers.
52
- PROCESSING = 1
54
+ # A Job is processing if its WorkUnits are in the queue to be handled by nodes.
55
+ PROCESSING = 1
53
56
 
54
57
  # A Job has succeeded if all of its WorkUnits have finished successfully.
55
- SUCCEEDED = 2
58
+ SUCCEEDED = 2
56
59
 
57
60
  # A Job has failed if even a single one of its WorkUnits has failed (they may
58
61
  # be attempted multiple times on failure, however).
59
- FAILED = 3
62
+ FAILED = 3
60
63
 
61
64
  # A Job is splitting if it's in the process of dividing its inputs up into
62
65
  # multiple WorkUnits.
63
- SPLITTING = 4
66
+ SPLITTING = 4
64
67
 
65
68
  # A Job is merging if it's busy collecting all of its successful WorkUnits
66
69
  # back together into the final result.
67
- MERGING = 5
70
+ MERGING = 5
68
71
 
69
- # A work unit is considered to be complete if it succeeded or if it failed.
70
- COMPLETE = [SUCCEEDED, FAILED]
72
+ # A Job is considered to be complete if it succeeded or if it failed.
73
+ COMPLETE = [SUCCEEDED, FAILED]
71
74
 
72
- # A work unit is considered incomplete if it's being processed, split up or
73
- # merged together.
74
- INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
75
+ # A Job is considered incomplete if it's being processed, split up or merged.
76
+ INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
75
77
 
76
78
  # Mapping of statuses to their display strings.
77
79
  DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
@@ -88,18 +90,34 @@ module CloudCrowd
88
90
  # Configure the CloudCrowd central database (and connect to it), by passing
89
91
  # in a path to <tt>database.yml</tt>. The file should use the standard
90
92
  # ActiveRecord connection format.
91
- def configure_database(config_path)
93
+ def configure_database(config_path, validate_schema=true)
92
94
  configuration = YAML.load_file(config_path)
93
95
  ActiveRecord::Base.establish_connection(configuration)
96
+ if validate_schema
97
+ version = ActiveRecord::Base.connection.select_values('select max(version) from schema_migrations').first.to_i
98
+ return true if version == SCHEMA_VERSION
99
+ puts "Your database schema is out of date. Please use `crowd load_schema` to update it. This will wipe all the tables, so make sure that your jobs have a chance to finish first.\nexiting..."
100
+ exit
101
+ end
94
102
  end
95
103
 
96
- # Get a reference to the central server, including authentication,
97
- # if configured.
104
+ # Get a reference to the central server, including authentication if
105
+ # configured.
98
106
  def central_server
99
- return @central_server if @central_server
100
- params = [CloudCrowd.config[:central_server]]
101
- params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
102
- @central_server = RestClient::Resource.new(*params)
107
+ @central_server ||= RestClient::Resource.new(CloudCrowd.config[:central_server], CloudCrowd.client_options)
108
+ end
109
+
110
+ # The standard RestClient options for the central server talking to nodes,
111
+ # as well as the other way around. There's a timeout of 5 seconds to open
112
+ # a connection, and a timeout of 30 seconds to finish reading from it.
113
+ def client_options
114
+ return @client_options if @client_options
115
+ @client_options = {:timeout => 30, :open_timeout => 5}
116
+ if CloudCrowd.config[:http_authentication]
117
+ @client_options[:user] = CloudCrowd.config[:login]
118
+ @client_options[:password] = CloudCrowd.config[:password]
119
+ end
120
+ @client_options
103
121
  end
104
122
 
105
123
  # Return the displayable status name of an internal CloudCrowd status number.
@@ -111,7 +129,7 @@ module CloudCrowd
111
129
  # CloudCrowd::Actions are requested dynamically by name. Access them through
112
130
  # this actions property, which behaves like a hash. At load time, we
113
131
  # load all installed Actions and CloudCrowd's default Actions into it.
114
- # If you wish to have certain workers be specialized to only handle certain
132
+ # If you wish to have certain nodes be specialized to only handle certain
115
133
  # Actions, then install only those into the actions directory.
116
134
  def actions
117
135
  return @actions if @actions
@@ -38,17 +38,19 @@ module CloudCrowd
38
38
 
39
39
  # Download a file to the specified path.
40
40
  def download(url, path)
41
- URI.parse(url) # Sanity check.
42
41
  `curl -s "#{url}" > "#{path}"`
42
+ return path
43
+ # The previous implementation is below, and, although it would be
44
+ # wonderful not to shell out, RestClient wasn't handling URLs with encoded
45
+ # entities (%20, for example), and doesn't let you download to a given
46
+ # location. Getting a RestClient patch in would be ideal.
47
+ #
43
48
  # if url.match(FILE_URL)
44
49
  # FileUtils.cp(url.sub(FILE_URL, ''), path)
45
50
  # else
46
- # # An alternative would be shelling out: `curl -s "#{url}" > "#{path}"`
47
- # puts url
48
51
  # resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
49
52
  # FileUtils.mv resp.file.path, path
50
53
  # end
51
- path
52
54
  end
53
55
 
54
56
  # Takes a local filesystem path, saves the file to S3, and returns the
@@ -3,18 +3,18 @@ require 'tmpdir'
3
3
  module CloudCrowd
4
4
 
5
5
  # The AssetStore provides a common API for storing files and returning URLs
6
- # that can access them. In production this will be S3 but in development
7
- # it may be the filesystem.
6
+ # that can access them. At the moment, the files can be saved to either S3, or
7
+ # the local filesystem. You shouldn't need to use the AssetStore directly --
8
+ # Action's +download+ and +save+ methods use it behind the scenes.
8
9
  #
9
- # You shouldn't need to use the AssetStore directly -- Action's +download+
10
- # and +save+ methods use it behind the scenes.
10
+ # To implement a new back-end for the AssetStore, you must provide
11
+ # <tt>save(local_path, save_path)</tt>, <tt>cleanup(job)</tt>, and optionally,
12
+ # a <tt>setup</tt> method that will be called once at initialization.
11
13
  class AssetStore
12
14
 
13
15
  autoload :S3Store, 'cloud_crowd/asset_store/s3_store'
14
16
  autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
15
-
16
- LOCAL_STORAGE_PATH = '/tmp/cloud_crowd_storage'
17
-
17
+
18
18
  # Configure the AssetStore with the specific storage implementation
19
19
  # specified by 'storage' in <tt>config.yml</tt>.
20
20
  case CloudCrowd.config[:storage]
@@ -2,20 +2,26 @@ module CloudCrowd
2
2
  class AssetStore
3
3
 
4
4
  # The FilesystemStore is an implementation of the AssetStore, good only for
5
- # use in development, testing, or if you're only running a single-machine
6
- # installation.
5
+ # use in development, in testing, on a single-machine installation, or with
6
+ # a networked drive.
7
7
  module FilesystemStore
8
8
 
9
- # Make sure that local storage is writeable before starting.
9
+ DEFAULT_STORAGE_PATH = '/tmp/cloud_crowd_storage'
10
+
11
+ attr_reader :local_storage_path
12
+
13
+ # Make sure that local storage exists and is writeable before starting.
10
14
  def setup
11
- raise Error::StorageNotWritable, "#{LOCAL_STORAGE_PATH} is not writable" unless File.writable?(LOCAL_STORAGE_PATH)
15
+ lsp = @local_storage_path = CloudCrowd.config[:local_storage_path] || DEFAULT_STORAGE_PATH
16
+ FileUtils.mkdir_p(lsp) unless File.exists?(lsp)
17
+ raise Error::StorageNotWritable, "#{lsp} is not writable" unless File.writable?(lsp)
12
18
  end
13
19
 
14
- # Save a file to somewhere semi-persistent on the filesystem. Can be used
15
- # in development, when offline, or if you happen to have a single-machine
16
- # CloudCrowd installation. To use, configure <tt>:storage => 'filesystem'</tt>.
20
+ # Save a file to somewhere semi-persistent on the filesystem. To use,
21
+ # configure <tt>:storage: 'filesystem'</tt> in *config.yml*, as well as
22
+ # <tt>:local_storage_path:</tt>.
17
23
  def save(local_path, save_path)
18
- save_path = File.join(LOCAL_STORAGE_PATH, save_path)
24
+ save_path = File.join(@local_storage_path, save_path)
19
25
  save_dir = File.dirname(save_path)
20
26
  FileUtils.mkdir_p save_dir unless File.exists? save_dir
21
27
  FileUtils.cp(local_path, save_path)
@@ -24,7 +30,7 @@ module CloudCrowd
24
30
 
25
31
  # Remove all of a Job's result files from the filesystem.
26
32
  def cleanup(job)
27
- path = "#{LOCAL_STORAGE_PATH}/#{job.action}/job_#{job.id}"
33
+ path = "#{@local_storage_path}/#{job.action}/job_#{job.id}"
28
34
  FileUtils.rm_r(path) if File.exists?(path)
29
35
  end
30
36
  end
@@ -7,8 +7,16 @@ module CloudCrowd
7
7
 
8
8
  # Configure authentication and establish a connection to S3, first thing.
9
9
  def setup
10
- @use_auth = CloudCrowd.config[:use_s3_authentication]
11
- establish_s3_connection
10
+ @use_auth = CloudCrowd.config[:s3_authentication]
11
+ bucket_name = CloudCrowd.config[:s3_bucket]
12
+ key, secret = CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key]
13
+ valid_conf = [bucket_name, key, secret].all? {|s| s.is_a? String }
14
+ raise Error::MissingConfiguration, "An S3 account must be configured in 'config.yml' before 's3' storage can be used" unless valid_conf
15
+ protocol = @use_auth ? 'https' : 'http'
16
+ port = @use_auth ? 443 : 80
17
+ @s3 = RightAws::S3.new(key, secret, :protocol => protocol, :port => port)
18
+ @bucket = @s3.bucket(bucket_name)
19
+ @bucket = @s3.bucket(bucket_name, true) unless @bucket
12
20
  end
13
21
 
14
22
  # Save a finished file from local storage to S3. Save it publicly unless
@@ -29,15 +37,6 @@ module CloudCrowd
29
37
  @bucket.delete_folder("#{job.action}/job_#{job.id}")
30
38
  end
31
39
 
32
- # Workers, through the course of many WorkUnits, keep around an AssetStore.
33
- # Ensure we have a persistent S3 connection after first use.
34
- def establish_s3_connection
35
- unless @s3 && @bucket
36
- params = {:port => 80, :protocol => 'http'}
37
- @s3 = RightAws::S3.new(CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key], params)
38
- @bucket = @s3.bucket(CloudCrowd.config[:s3_bucket], true)
39
- end
40
- end
41
40
  end
42
41
 
43
42
  end
@@ -49,7 +49,7 @@ Options:
49
49
  require 'irb/completion'
50
50
  require 'pp'
51
51
  load_code
52
- connect_to_database
52
+ connect_to_database(true)
53
53
  IRB.start
54
54
  end
55
55
 
@@ -81,7 +81,7 @@ Options:
81
81
  # Load in the database schema to the database specified in 'database.yml'.
82
82
  def run_load_schema
83
83
  load_code
84
- connect_to_database
84
+ connect_to_database(false)
85
85
  require 'cloud_crowd/schema.rb'
86
86
  end
87
87
 
@@ -92,8 +92,8 @@ Options:
92
92
  install_path = ARGV.shift || '.'
93
93
  FileUtils.mkdir_p install_path unless File.exists?(install_path)
94
94
  install_file "#{CC_ROOT}/config/config.example.yml", "#{install_path}/config.yml"
95
- install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
96
95
  install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
96
+ install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
97
97
  install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
98
98
  end
99
99
 
@@ -149,19 +149,24 @@ Options:
149
149
 
150
150
  # Establish a connection to the central server's database. Not all commands
151
151
  # require this.
152
- def connect_to_database
152
+ def connect_to_database(validate_schema)
153
153
  require 'cloud_crowd/models'
154
- CloudCrowd.configure_database("#{@options[:config_path]}/database.yml")
154
+ CloudCrowd.configure_database("#{@options[:config_path]}/database.yml", validate_schema)
155
155
  end
156
156
 
157
157
  # Exit with an explanation if the configuration files couldn't be found.
158
158
  def config_not_found
159
- puts "`crowd` can't find the CloudCrowd configuration directory. Please either run `crowd` from inside of the configuration directory, or use `crowd -c path/to/config`"
159
+ puts "`crowd` can't find the CloudCrowd configuration directory. Please use `crowd -c path/to/config`, or run `crowd` from inside of the configuration directory itself."
160
160
  exit(1)
161
161
  end
162
162
 
163
- # Install a file and log the installation.
163
+ # Install a file and log the installation. If we're overwriting a file,
164
+ # offer a chance to back out.
164
165
  def install_file(source, dest, is_dir=false)
166
+ if File.exists?(dest)
167
+ print "#{dest} already exists. Overwrite it? (yes/no) "
168
+ return unless ['y', 'yes', 'ok'].include? gets.chomp.downcase
169
+ end
165
170
  is_dir ? FileUtils.cp_r(source, dest) : FileUtils.cp(source, dest)
166
171
  puts "installed #{dest}"
167
172
  end
@@ -2,16 +2,14 @@ module CloudCrowd
2
2
 
3
3
  # Base Error class which all custom CloudCrowd exceptions inherit from.
4
4
  # Rescuing CloudCrowd::Error (or RuntimeError) will get all custom exceptions.
5
+ # If your cluster is correctly configured, you should never expect to see any
6
+ # of these.
5
7
  class Error < RuntimeError
6
8
 
7
9
  # ActionNotFound is raised when a job is created for an action that doesn't
8
10
  # exist.
9
11
  class ActionNotFound < Error
10
12
  end
11
-
12
- # CentralServerUnavailable is used then the central server can't be reached.
13
- class CentralServerUnavailable < Error
14
- end
15
13
 
16
14
  # StorageNotFound is raised when config.yml specifies a storage back end that
17
15
  # doesn't exist.
@@ -27,6 +25,11 @@ module CloudCrowd
27
25
  class StatusUnspecified < Error
28
26
  end
29
27
 
28
+ # MissingConfiguration is raised when we're trying to run a method that
29
+ # needs configuration not present in config.yml.
30
+ class MissingConfiguration < Error
31
+ end
32
+
30
33
  end
31
34
 
32
35
  end