documentcloud-cloud-crowd 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -26,7 +26,7 @@
26
26
 
27
27
  * Parallel processing for the rest of us
28
28
  * Write your scripts in Ruby
29
- * Built for Amazon EC2 and S3
29
+ * Works with Amazon EC2 and S3
30
30
  * split -> process -> merge
31
31
  * As easy as `gem install cloud-crowd`
32
32
 
@@ -63,31 +63,31 @@
63
63
  # Edit the configuration files to your satisfaction, add AWS credentials,
64
64
  # and then load the CloudCrowd schema into your configured database.
65
65
 
66
- >> mate ~/config/cloud-crowd/config.yml
67
- >> mate ~/config/cloud-crowd/database.yml
66
+ >> cd ~/config/cloud-crowd
67
+ >> mate config.yml
68
+ >> mate database.yml
69
+ >> [create the database you just configured...]
68
70
  >> crowd load_schema
69
71
 
70
72
  # Write your actions, and install them into the 'actions' subdirectory.
71
- # CloudCrowd comes with some default actions as an example.
73
+ # CloudCrowd comes with a few default actions as an example.
72
74
 
73
75
  # To launch the central server (make sure that you include its location
74
- # in config.yml), either:
76
+ # in config.yml):
75
77
 
76
78
  >> crowd server
77
79
 
78
- # or:
80
+ # The configuration folder also includes 'config.ru', which can be used by
81
+ # any Rack-compliant webserver to run your central server.
79
82
 
80
- >> thin -R config.ru --servers 3 -e production start
83
+ # Then, to launch a node of workers:
81
84
 
82
- # Any server that supports Rack should work with the rackup file.
85
+ >> crowd node
83
86
 
84
- # Then, to spin up 10 workers:
87
+ # To spin up remote nodes, install the 'cloud-crowd' gem and copy over
88
+ # your configuration directory. Run `crowd node`, and the remote machines
89
+ # will register with the central server, becoming available for processing.
85
90
 
86
- >> crowd workers start -n 10
87
-
88
- # To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
89
- # your configuration directory.
90
-
91
- # At this point you can visit your server console at localhost:9173 to
92
- # view all of your workers, ready for action.
91
+ # At this point you can visit your Operations Center at localhost:9173 to
92
+ # view all of your nodes, ready for action.
93
93
 
data/cloud-crowd.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'cloud-crowd'
3
- s.version = '0.1.1' # Keep version in sync with cloud-cloud.rb
4
- s.date = '2009-09-15'
3
+ s.version = '0.2.0' # Keep version in sync with cloud-crowd.rb
4
+ s.date = '2009-09-17'
5
5
 
6
6
  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
7
7
  s.summary = "Parallel Processing for the Rest of Us"
@@ -83,6 +83,7 @@ public/images/header_back.png
83
83
  public/images/logo.png
84
84
  public/images/queue_fill.png
85
85
  public/images/server.png
86
+ public/images/server_busy.png
86
87
  public/images/server_error.png
87
88
  public/images/sidebar_bottom.png
88
89
  public/images/sidebar_top.png
@@ -106,6 +107,6 @@ test/unit/test_action.rb
106
107
  test/unit/test_configuration.rb
107
108
  test/unit/test_job.rb
108
109
  test/unit/test_work_unit.rb
109
- views/index.erb
110
+ views/operations_center.erb
110
111
  )
111
112
  end
@@ -1,33 +1,38 @@
1
1
  # The URL where you're planning on running the central server/queue/database.
2
- :central_server: http://localhost:9173
2
+ :central_server: http://localhost:9173
3
3
 
4
4
  # Set the maximum number of workers allowed per-node. Workers only run while
5
5
  # there's work to be done. It's best to set 'max_workers' below the point where
6
6
  # you'd start to swap or peg your CPU (as determined by experiment).
7
- :max_workers: 5
7
+ :max_workers: 5
8
8
 
9
9
  # The storage back-end that you'd like to use for intermediate and final results
10
10
  # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
11
- # be used in development, or on single-machine installations.
12
- :storage: s3
11
+ # be used in development, on single-machine installations, or with networked drives.
12
+ :storage: s3
13
13
 
14
14
  # Please provide your AWS credentials for S3 storage of job output.
15
- :aws_access_key: [your AWS access key]
16
- :aws_secret_key: [your AWS secret access key]
15
+ :aws_access_key: [your AWS access key]
16
+ :aws_secret_key: [your AWS secret access key]
17
17
 
18
18
  # Choose an S3 bucket to store all CloudCrowd output, and decide if you'd like
19
19
  # to keep all resulting files on S3 private. If so, you'll receive authenticated
20
20
  # S3 URLs as job output, good for 24 hours. If left public, you'll get the
21
21
  # straight URLs to the files on S3.
22
- :s3_bucket: [your CloudCrowd bucket]
23
- :use_s3_authentication: no
22
+ :s3_bucket: [your CloudCrowd bucket]
23
+ :s3_authentication: no
24
+
25
+ # If you're using the 'filesystem' storage, perhaps with an NFS share or
26
+ # something similar, all files will be saved inside of the 'local_storage_path'.
27
+ # The default value if left unspecified is '/tmp/cloud_crowd_storage'.
28
+ :local_storage_path: /tmp/cloud_crowd_storage
24
29
 
25
30
  # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
26
31
  # to the central server). If yes, specify the login and password that all
27
32
  # requests must provide for authentication.
28
- :use_http_authentication: no
29
- :login: [your login name]
30
- :password: [your password]
33
+ :http_authentication: no
34
+ :login: [your login name]
35
+ :password: [your password]
31
36
 
32
37
  # By default, CloudCrowd looks for installed actions inside the 'actions'
33
38
  # subdirectory of this configuration folder. 'actions_path' allows you to load
@@ -36,4 +41,4 @@
36
41
 
37
42
  # The number of separate attempts that will be made to process an individual
38
43
  # work unit, before marking it as having failed.
39
- :work_unit_retries: 3
44
+ :work_unit_retries: 3
data/lib/cloud-crowd.rb CHANGED
@@ -30,7 +30,7 @@ require 'cloud_crowd/exceptions'
30
30
 
31
31
  module CloudCrowd
32
32
 
33
- # Autoload all the CloudCrowd classes which may not be required.
33
+ # Autoload all the CloudCrowd internals.
34
34
  autoload :Action, 'cloud_crowd/action'
35
35
  autoload :AssetStore, 'cloud_crowd/asset_store'
36
36
  autoload :Helpers, 'cloud_crowd/helpers'
@@ -42,36 +42,38 @@ module CloudCrowd
42
42
  autoload :Worker, 'cloud_crowd/worker'
43
43
  autoload :WorkUnit, 'cloud_crowd/models'
44
44
 
45
- # Root directory of the CloudCrowd gem.
46
- ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
45
+ # Keep this version in sync with the gemspec.
46
+ VERSION = '0.2.0'
47
+
48
+ # Increment the schema version when there's a backwards incompatible change.
49
+ SCHEMA_VERSION = 2
47
50
 
48
- # Keep the version in sync with the gemspec.
49
- VERSION = '0.1.1'
51
+ # Root directory of the CloudCrowd gem.
52
+ ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
50
53
 
51
- # A Job is processing if its WorkUnits in the queue to be handled by workers.
52
- PROCESSING = 1
54
+ # A Job is processing if its WorkUnits are in the queue to be handled by nodes.
55
+ PROCESSING = 1
53
56
 
54
57
  # A Job has succeeded if all of its WorkUnits have finished successfully.
55
- SUCCEEDED = 2
58
+ SUCCEEDED = 2
56
59
 
57
60
  # A Job has failed if even a single one of its WorkUnits has failed (they may
58
61
  # be attempted multiple times on failure, however).
59
- FAILED = 3
62
+ FAILED = 3
60
63
 
61
64
  # A Job is splitting if it's in the process of dividing its inputs up into
62
65
  # multiple WorkUnits.
63
- SPLITTING = 4
66
+ SPLITTING = 4
64
67
 
65
68
  # A Job is merging if it's busy collecting all of its successful WorkUnits
66
69
  # back together into the final result.
67
- MERGING = 5
70
+ MERGING = 5
68
71
 
69
- # A work unit is considered to be complete if it succeeded or if it failed.
70
- COMPLETE = [SUCCEEDED, FAILED]
72
+ # A Job is considered to be complete if it succeeded or if it failed.
73
+ COMPLETE = [SUCCEEDED, FAILED]
71
74
 
72
- # A work unit is considered incomplete if it's being processed, split up or
73
- # merged together.
74
- INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
75
+ # A Job is considered incomplete if it's being processed, split up or merged.
76
+ INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
75
77
 
76
78
  # Mapping of statuses to their display strings.
77
79
  DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
@@ -88,18 +90,34 @@ module CloudCrowd
88
90
  # Configure the CloudCrowd central database (and connect to it), by passing
89
91
  # in a path to <tt>database.yml</tt>. The file should use the standard
90
92
  # ActiveRecord connection format.
91
- def configure_database(config_path)
93
+ def configure_database(config_path, validate_schema=true)
92
94
  configuration = YAML.load_file(config_path)
93
95
  ActiveRecord::Base.establish_connection(configuration)
96
+ if validate_schema
97
+ version = ActiveRecord::Base.connection.select_values('select max(version) from schema_migrations').first.to_i
98
+ return true if version == SCHEMA_VERSION
99
+ puts "Your database schema is out of date. Please use `crowd load_schema` to update it. This will wipe all the tables, so make sure that your jobs have a chance to finish first.\nexiting..."
100
+ exit
101
+ end
94
102
  end
95
103
 
96
- # Get a reference to the central server, including authentication,
97
- # if configured.
104
+ # Get a reference to the central server, including authentication if
105
+ # configured.
98
106
  def central_server
99
- return @central_server if @central_server
100
- params = [CloudCrowd.config[:central_server]]
101
- params += [CloudCrowd.config[:login], CloudCrowd.config[:password]] if CloudCrowd.config[:use_http_authentication]
102
- @central_server = RestClient::Resource.new(*params)
107
+ @central_server ||= RestClient::Resource.new(CloudCrowd.config[:central_server], CloudCrowd.client_options)
108
+ end
109
+
110
+ # The standard RestClient options for the central server talking to nodes,
111
+ # as well as the other way around. There's a timeout of 5 seconds to open
112
+ # a connection, and a timeout of 30 seconds to finish reading it.
113
+ def client_options
114
+ return @client_options if @client_options
115
+ @client_options = {:timeout => 30, :open_timeout => 5}
116
+ if CloudCrowd.config[:http_authentication]
117
+ @client_options[:user] = CloudCrowd.config[:login]
118
+ @client_options[:password] = CloudCrowd.config[:password]
119
+ end
120
+ @client_options
103
121
  end
104
122
 
105
123
  # Return the displayable status name of an internal CloudCrowd status number.
@@ -111,7 +129,7 @@ module CloudCrowd
111
129
  # CloudCrowd::Actions are requested dynamically by name. Access them through
112
130
  # this actions property, which behaves like a hash. At load time, we
113
131
  # load all installed Actions and CloudCrowd's default Actions into it.
114
- # If you wish to have certain workers be specialized to only handle certain
132
+ # If you wish to have certain nodes be specialized to only handle certain
115
133
  # Actions, then install only those into the actions directory.
116
134
  def actions
117
135
  return @actions if @actions
@@ -38,17 +38,19 @@ module CloudCrowd
38
38
 
39
39
  # Download a file to the specified path.
40
40
  def download(url, path)
41
- URI.parse(url) # Sanity check.
42
41
  `curl -s "#{url}" > "#{path}"`
42
+ return path
43
+ # The previous implementation is below, and, although it would be
44
+ # wonderful not to shell out, RestClient wasn't handling URLs with encoded
45
+ # entities (%20, for example), and doesn't let you download to a given
46
+ # location. Getting a RestClient patch in would be ideal.
47
+ #
43
48
  # if url.match(FILE_URL)
44
49
  # FileUtils.cp(url.sub(FILE_URL, ''), path)
45
50
  # else
46
- # # An alternative would be shelling out: `curl -s "#{url}" > "#{path}"`
47
- # puts url
48
51
  # resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
49
52
  # FileUtils.mv resp.file.path, path
50
53
  # end
51
- path
52
54
  end
53
55
 
54
56
  # Takes a local filesystem path, saves the file to S3, and returns the
@@ -3,18 +3,18 @@ require 'tmpdir'
3
3
  module CloudCrowd
4
4
 
5
5
  # The AssetStore provides a common API for storing files and returning URLs
6
- # that can access them. In production this will be S3 but in development
7
- # it may be the filesystem.
6
+ # that can access them. At the moment, the files can be saved to either S3, or
7
+ # the local filesystem. You shouldn't need to use the AssetStore directly --
8
+ # Action's +download+ and +save+ methods use it behind the scenes.
8
9
  #
9
- # You shouldn't need to use the AssetStore directly -- Action's +download+
10
- # and +save+ methods use it behind the scenes.
10
+ # To implement a new back-end for the AssetStore, you must provide
11
+ # <tt>save(local_path, save_path)</tt>, <tt>cleanup(job)</tt>, and optionally,
12
+ # a <tt>setup</tt> method that will be called once at initialization.
11
13
  class AssetStore
12
14
 
13
15
  autoload :S3Store, 'cloud_crowd/asset_store/s3_store'
14
16
  autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
15
-
16
- LOCAL_STORAGE_PATH = '/tmp/cloud_crowd_storage'
17
-
17
+
18
18
  # Configure the AssetStore with the specific storage implementation
19
19
  # specified by 'storage' in <tt>config.yml</tt>.
20
20
  case CloudCrowd.config[:storage]
@@ -2,20 +2,26 @@ module CloudCrowd
2
2
  class AssetStore
3
3
 
4
4
  # The FilesystemStore is an implementation of the AssetStore, good only for
5
- # use in development, testing, or if you're only running a single-machine
6
- # installation.
5
+ # use in development, in testing, on a single-machine installation, or
6
+ # with a networked drive.
7
7
  module FilesystemStore
8
8
 
9
- # Make sure that local storage is writeable before starting.
9
+ DEFAULT_STORAGE_PATH = '/tmp/cloud_crowd_storage'
10
+
11
+ attr_reader :local_storage_path
12
+
13
+ # Make sure that local storage exists and is writeable before starting.
10
14
  def setup
11
- raise Error::StorageNotWritable, "#{LOCAL_STORAGE_PATH} is not writable" unless File.writable?(LOCAL_STORAGE_PATH)
15
+ lsp = @local_storage_path = CloudCrowd.config[:local_storage_path] || DEFAULT_STORAGE_PATH
16
+ FileUtils.mkdir_p(lsp) unless File.exists?(lsp)
17
+ raise Error::StorageNotWritable, "#{lsp} is not writable" unless File.writable?(lsp)
12
18
  end
13
19
 
14
- # Save a file to somewhere semi-persistent on the filesystem. Can be used
15
- # in development, when offline, or if you happen to have a single-machine
16
- # CloudCrowd installation. To use, configure <tt>:storage => 'filesystem'</tt>.
20
+ # Save a file to somewhere semi-persistent on the filesystem. To use,
21
+ # configure <tt>:storage: 'filesystem'</tt> in *config.yml*, as well as
22
+ # <tt>:local_storage_path:</tt>.
17
23
  def save(local_path, save_path)
18
- save_path = File.join(LOCAL_STORAGE_PATH, save_path)
24
+ save_path = File.join(@local_storage_path, save_path)
19
25
  save_dir = File.dirname(save_path)
20
26
  FileUtils.mkdir_p save_dir unless File.exists? save_dir
21
27
  FileUtils.cp(local_path, save_path)
@@ -24,7 +30,7 @@ module CloudCrowd
24
30
 
25
31
  # Remove all of a Job's result files from the filesystem.
26
32
  def cleanup(job)
27
- path = "#{LOCAL_STORAGE_PATH}/#{job.action}/job_#{job.id}"
33
+ path = "#{@local_storage_path}/#{job.action}/job_#{job.id}"
28
34
  FileUtils.rm_r(path) if File.exists?(path)
29
35
  end
30
36
  end
@@ -7,8 +7,16 @@ module CloudCrowd
7
7
 
8
8
  # Configure authentication and establish a connection to S3, first thing.
9
9
  def setup
10
- @use_auth = CloudCrowd.config[:use_s3_authentication]
11
- establish_s3_connection
10
+ @use_auth = CloudCrowd.config[:s3_authentication]
11
+ bucket_name = CloudCrowd.config[:s3_bucket]
12
+ key, secret = CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key]
13
+ valid_conf = [bucket_name, key, secret].all? {|s| s.is_a? String }
14
+ raise Error::MissingConfiguration, "An S3 account must be configured in 'config.yml' before 's3' storage can be used" unless valid_conf
15
+ protocol = @use_auth ? 'https' : 'http'
16
+ port = @use_auth ? 443 : 80
17
+ @s3 = RightAws::S3.new(key, secret, :protocol => protocol, :port => port)
18
+ @bucket = @s3.bucket(bucket_name)
19
+ @bucket = @s3.bucket(bucket_name, true) unless @bucket
12
20
  end
13
21
 
14
22
  # Save a finished file from local storage to S3. Save it publicly unless
@@ -29,15 +37,6 @@ module CloudCrowd
29
37
  @bucket.delete_folder("#{job.action}/job_#{job.id}")
30
38
  end
31
39
 
32
- # Workers, through the course of many WorkUnits, keep around an AssetStore.
33
- # Ensure we have a persistent S3 connection after first use.
34
- def establish_s3_connection
35
- unless @s3 && @bucket
36
- params = {:port => 80, :protocol => 'http'}
37
- @s3 = RightAws::S3.new(CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key], params)
38
- @bucket = @s3.bucket(CloudCrowd.config[:s3_bucket], true)
39
- end
40
- end
41
40
  end
42
41
 
43
42
  end
@@ -49,7 +49,7 @@ Options:
49
49
  require 'irb/completion'
50
50
  require 'pp'
51
51
  load_code
52
- connect_to_database
52
+ connect_to_database(true)
53
53
  IRB.start
54
54
  end
55
55
 
@@ -81,7 +81,7 @@ Options:
81
81
  # Load in the database schema to the database specified in 'database.yml'.
82
82
  def run_load_schema
83
83
  load_code
84
- connect_to_database
84
+ connect_to_database(false)
85
85
  require 'cloud_crowd/schema.rb'
86
86
  end
87
87
 
@@ -92,8 +92,8 @@ Options:
92
92
  install_path = ARGV.shift || '.'
93
93
  FileUtils.mkdir_p install_path unless File.exists?(install_path)
94
94
  install_file "#{CC_ROOT}/config/config.example.yml", "#{install_path}/config.yml"
95
- install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
96
95
  install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
96
+ install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
97
97
  install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
98
98
  end
99
99
 
@@ -149,19 +149,24 @@ Options:
149
149
 
150
150
  # Establish a connection to the central server's database. Not all commands
151
151
  # require this.
152
- def connect_to_database
152
+ def connect_to_database(validate_schema)
153
153
  require 'cloud_crowd/models'
154
- CloudCrowd.configure_database("#{@options[:config_path]}/database.yml")
154
+ CloudCrowd.configure_database("#{@options[:config_path]}/database.yml", validate_schema)
155
155
  end
156
156
 
157
157
  # Exit with an explanation if the configuration files couldn't be found.
158
158
  def config_not_found
159
- puts "`crowd` can't find the CloudCrowd configuration directory. Please either run `crowd` from inside of the configuration directory, or use `crowd -c path/to/config`"
159
+ puts "`crowd` can't find the CloudCrowd configuration directory. Please use `crowd -c path/to/config`, or run `crowd` from inside of the configuration directory itself."
160
160
  exit(1)
161
161
  end
162
162
 
163
- # Install a file and log the installation.
163
+ # Install a file and log the installation. If we're overwriting a file,
164
+ # offer a chance to back out.
164
165
  def install_file(source, dest, is_dir=false)
166
+ if File.exists?(dest)
167
+ print "#{dest} already exists. Overwrite it? (yes/no) "
168
+ return unless ['y', 'yes', 'ok'].include? gets.chomp.downcase
169
+ end
165
170
  is_dir ? FileUtils.cp_r(source, dest) : FileUtils.cp(source, dest)
166
171
  puts "installed #{dest}"
167
172
  end
@@ -2,16 +2,14 @@ module CloudCrowd
2
2
 
3
3
  # Base Error class which all custom CloudCrowd exceptions inherit from.
4
4
  # Rescuing CloudCrowd::Error (or RuntimeError) will get all custom exceptions.
5
+ # If your cluster is correctly configured, you should never expect to see any
6
+ # of these.
5
7
  class Error < RuntimeError
6
8
 
7
9
  # ActionNotFound is raised when a job is created for an action that doesn't
8
10
  # exist.
9
11
  class ActionNotFound < Error
10
12
  end
11
-
12
- # CentralServerUnavailable is used then the central server can't be reached.
13
- class CentralServerUnavailable < Error
14
- end
15
13
 
16
14
  # StorageNotFound is raised when config.yml specifies a storage back end that
17
15
  # doesn't exist.
@@ -27,6 +25,11 @@ module CloudCrowd
27
25
  class StatusUnspecified < Error
28
26
  end
29
27
 
28
+ # MissingConfiguration is raised when we're trying to run a method that
29
+ # needs configuration not present in config.yml.
30
+ class MissingConfiguration < Error
31
+ end
32
+
30
33
  end
31
34
 
32
35
  end