cloud-crowd 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'cloud-crowd'
3
- s.version = '0.3.3' # Keep version in sync with cloud-cloud.rb
4
- s.date = '2010-01-27'
3
+ s.version = '0.4.0' # Keep version in sync with cloud-crowd.rb
4
+ s.date = '2010-03-31'
5
5
 
6
6
  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
7
7
  s.summary = "Parallel Processing for the Rest of Us"
@@ -27,11 +27,10 @@ Gem::Specification.new do |s|
27
27
  '--main' << 'README' <<
28
28
  '--all'
29
29
 
30
- s.add_dependency 'sinatra', ['>= 0.9.4']
31
- s.add_dependency 'activerecord', ['>= 2.3.3']
30
+ s.add_dependency 'sinatra', ['~> 0.9']
31
+ s.add_dependency 'activerecord', ['~> 2.3']
32
32
  s.add_dependency 'json', ['>= 1.1.7']
33
- s.add_dependency 'rest-client', ['>= 1.0.3']
34
- s.add_dependency 'right_aws', ['>= 1.10.0']
33
+ s.add_dependency 'rest-client', ['>= 1.4']
35
34
  s.add_dependency 'thin', ['>= 1.2.4']
36
35
 
37
36
  if s.respond_to?(:add_development_dependency)
@@ -58,6 +57,7 @@ lib/cloud-crowd.rb
58
57
  lib/cloud_crowd/action.rb
59
58
  lib/cloud_crowd/asset_store/filesystem_store.rb
60
59
  lib/cloud_crowd/asset_store/s3_store.rb
60
+ lib/cloud_crowd/asset_store/cloudfiles_store.rb
61
61
  lib/cloud_crowd/asset_store.rb
62
62
  lib/cloud_crowd/command_line.rb
63
63
  lib/cloud_crowd/exceptions.rb
@@ -2,20 +2,21 @@
2
2
  :central_server: http://localhost:9173
3
3
 
4
4
  # The following settings allow you to control the number of workers that can run
5
- # on a given node, to prevent the node from becoming overloaded. 'max_workers'
5
+ # on a given node, to prevent the node from becoming overloaded. 'max_workers'
6
6
  # is a simple cap on the maximum number of workers a node is allowed to run
7
7
  # concurrently. 'max_load' is the maximum (one-minute) load average, above which
8
8
  # a node will refuse to take new work. 'min_free_memory' is the minimum amount
9
- # of free RAM (in megabytes) a node is allowed to have, below which no new
9
+ # of free RAM (in megabytes) a node is allowed to have, below which no new
10
10
  # workers are run. These settings may be used in any combination.
11
11
  :max_workers: 5
12
12
  # :max_load: 5.0
13
13
  # :min_free_memory: 150
14
14
 
15
15
  # The storage back-end that you'd like to use for intermediate and final results
16
- # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
17
- # be used in development, on single-machine installations, or networked drives.
18
- # If you *are* developing an action, filesystem is certainly faster and easier.
16
+ # of processing. 's3', 'filesystem', and 'cloudfiles' are supported.
17
+ # 'filesystem' should only be used in development, on single-machine installations,
18
+ # or networked drives. If you *are* developing an action, filesystem is certainly
19
+ # faster and easier.
19
20
  :storage: s3
20
21
 
21
22
  # Please provide your AWS credentials for S3 storage of job output.
@@ -29,22 +30,34 @@
29
30
  :s3_bucket: [your CloudCrowd bucket]
30
31
  :s3_authentication: no
31
32
 
32
- # The following settings configure local paths. 'local_storage_path' is the
33
+ # Cloudfiles
34
+ :cloudfiles_username: [your Rackspace Cloud Files username]
35
+ :cloudfiles_api_key: [your Rackspace Cloud Files API key]
36
+ :cloudfiles_container: [your Rackspace Cloud Files container]
37
+
38
+ # The following settings configure local paths. 'local_storage_path' is the
33
39
  # directory in which all files will be saved if you're using the 'filesystem'
34
- # storage. 'log_path' and 'pid_path' are the directories in which daemonized
35
- # servers and nodes will store their process ids and log files. The default
40
+ # storage. 'log_path' and 'pid_path' are the directories in which daemonized
41
+ # servers and nodes will store their process ids and log files. The default
36
42
  # values are listed.
37
43
  # :local_storage_path: /tmp/cloud_crowd_storage
38
44
  # :log_path: log
39
45
  # :pid_path: tmp/pids
40
46
 
41
- # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
42
- # to the central server). If yes, specify the login and password that all
47
+ # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
48
+ # to the central server). If yes, specify the login and password that all
43
49
  # requests must provide for authentication.
44
50
  :http_authentication: no
45
51
  :login: [your login name]
46
52
  :password: [your password]
47
53
 
54
+ # Disable all the default built-in actions
55
+ # :disable_default_actions: true
56
+
57
+ # Disable specific actions for the node
58
+ # Use this if you want to disable a limited number of actions
59
+ # :disabled_actions: ['word_count']
60
+
48
61
  # By default, CloudCrowd looks for installed actions inside the 'actions'
49
62
  # subdirectory of this configuration folder. 'actions_path' allows you to load
50
63
  # additional actions from a location of your choice.
@@ -4,10 +4,9 @@ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
4
4
 
5
5
  # Common Gems:
6
6
  require 'rubygems'
7
- gem 'activerecord'
7
+ gem 'activerecord', '~> 2.0'
8
8
  gem 'json'
9
9
  gem 'rest-client'
10
- gem 'right_aws'
11
10
  gem 'sinatra'
12
11
  gem 'thin'
13
12
 
@@ -20,6 +19,7 @@ autoload :FileUtils, 'fileutils'
20
19
  autoload :JSON, 'json'
21
20
  autoload :RestClient, 'rest_client'
22
21
  autoload :RightAws, 'right_aws'
22
+ autoload :CloudFiles, 'cloudfiles'
23
23
  autoload :Sinatra, 'sinatra'
24
24
  autoload :Thin, 'thin'
25
25
  autoload :YAML, 'yaml'
@@ -44,7 +44,7 @@ module CloudCrowd
44
44
  autoload :WorkUnit, 'cloud_crowd/models'
45
45
 
46
46
  # Keep this version in sync with the gemspec.
47
- VERSION = '0.3.3'
47
+ VERSION = '0.4.0'
48
48
 
49
49
  # Increment the schema version when there's a backwards incompatible change.
50
50
  SCHEMA_VERSION = 3
@@ -166,7 +166,7 @@ module CloudCrowd
166
166
 
167
167
  # Retrieve the list of every installed Action for this node or server.
168
168
  def action_paths
169
- default_actions = Dir["#{ROOT}/actions/*.rb"]
169
+ default_actions = config[:disable_default_actions] ? [] : Dir["#{ROOT}/actions/*.rb"]
170
170
  installed_actions = Dir["#{@config_path}/actions/*.rb"]
171
171
  custom_actions = CloudCrowd.config[:actions_path] ? Dir["#{CloudCrowd.config[:actions_path]}/*.rb"] : []
172
172
  default_actions + installed_actions + custom_actions
@@ -1,7 +1,7 @@
1
1
  module CloudCrowd
2
-
2
+
3
3
  # As you write your custom actions, have them inherit from CloudCrowd::Action.
4
- # All actions must implement a +process+ method, which should return a
4
+ # All actions must implement a +process+ method, which should return a
5
5
  # JSON-serializable object that will be used as the output for the work unit.
6
6
  # See the default actions for examples.
7
7
  #
@@ -16,11 +16,11 @@ module CloudCrowd
16
16
  # Note that Actions inherit a backticks (`) method that raises an Exception
17
17
  # if the external command fails.
18
18
  class Action
19
-
19
+
20
20
  FILE_URL = /\Afile:\/\//
21
-
21
+
22
22
  attr_reader :input, :input_path, :file_name, :options, :work_directory
23
-
23
+
24
24
  # Initializing an Action sets up all of the read-only variables that
25
25
  # form the bulk of the API for action subclasses. (Paths to read from and
26
26
  # write to). It creates the +work_directory+ and moves into it.
@@ -34,17 +34,17 @@ module CloudCrowd
34
34
  parse_input
35
35
  download_input
36
36
  end
37
-
37
+
38
38
  # Each Action subclass must implement a +process+ method, overriding this.
39
39
  def process
40
40
  raise NotImplementedError, "CloudCrowd::Actions must override 'process' with their own processing code."
41
41
  end
42
-
42
+
43
43
  # Download a file to the specified path.
44
44
  def download(url, path)
45
45
  `curl -s "#{url}" > "#{path}"`
46
46
  return path
47
- # The previous implementation is below, and, although it would be
47
+ # The previous implementation is below, and, although it would be
48
48
  # wonderful not to shell out, RestClient wasn't handling URLs with encoded
49
49
  # entities (%20, for example), and doesn't let you download to a given
50
50
  # location. Getting a RestClient patch in would be ideal.
@@ -56,21 +56,21 @@ module CloudCrowd
56
56
  # FileUtils.mv resp.file.path, path
57
57
  # end
58
58
  end
59
-
60
- # Takes a local filesystem path, saves the file to S3, and returns the
61
- # public (or authenticated) url on S3 where the file can be accessed.
59
+
60
+ # Takes a local filesystem path, saves the file to the configured asset
61
+ # store, and returns the public (or authenticated) url where it can be accessed.
62
62
  def save(file_path)
63
63
  save_path = File.join(storage_prefix, File.basename(file_path))
64
64
  @store.save(file_path, save_path)
65
65
  end
66
-
66
+
67
67
  # After the Action has finished, we remove the work directory and return
68
68
  # to the root directory (where workers run by default).
69
69
  def cleanup_work_directory
70
70
  FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
71
71
  end
72
-
73
- # Actions have a backticks command that raises a CommandFailed exception
72
+
73
+ # Actions have a backticks command that raises a CommandFailed exception
74
74
  # on failure, so that processing doesn't just blithely continue.
75
75
  def `(command)
76
76
  result = super(command)
@@ -78,17 +78,18 @@ module CloudCrowd
78
78
  raise Error::CommandFailed.new(result, exit_code) unless exit_code == 0
79
79
  result
80
80
  end
81
-
82
-
81
+
82
+
83
83
  private
84
-
84
+
85
85
  # Convert an unsafe URL into a filesystem-friendly filename.
86
86
  def safe_filename(url)
87
+ url.sub!(/\?.*\Z/, '')
87
88
  ext = File.extname(url)
88
89
  name = URI.unescape(File.basename(url)).gsub(/[^a-zA-Z0-9_\-.]/, '-').gsub(/-+/, '-')
89
90
  File.basename(name, ext).gsub('.', '-') + ext
90
91
  end
91
-
92
+
92
93
  # The directory prefix to use for both local and S3 storage.
93
94
  # [action]/job_[job_id]/unit_[work_unit_id]
94
95
  def storage_prefix
@@ -98,18 +99,18 @@ module CloudCrowd
98
99
  path_parts << "unit_#{@work_unit_id}" if @work_unit_id
99
100
  @storage_prefix ||= File.join(path_parts)
100
101
  end
101
-
102
+
102
103
  # If we think that the input is JSON, replace it with the parsed form.
103
104
  # It would be great if the JSON module had an is_json? method.
104
105
  def parse_input
105
106
  return unless ['[', '{'].include? @input[0..0]
106
107
  @input = JSON.parse(@input) rescue @input
107
108
  end
108
-
109
+
109
110
  def input_is_url?
110
111
  !URI.parse(@input).scheme.nil? rescue false
111
112
  end
112
-
113
+
113
114
  # If the input is a URL, download the file before beginning processing.
114
115
  def download_input
115
116
  return unless input_is_url?
@@ -119,7 +120,7 @@ module CloudCrowd
119
120
  download(@input, @input_path)
120
121
  end
121
122
  end
122
-
123
+
123
124
  end
124
-
125
+
125
126
  end
@@ -14,12 +14,14 @@ module CloudCrowd
14
14
 
15
15
  autoload :S3Store, 'cloud_crowd/asset_store/s3_store'
16
16
  autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
17
+ autoload :CloudfilesStore, 'cloud_crowd/asset_store/cloudfiles_store'
17
18
 
18
19
  # Configure the AssetStore with the specific storage implementation
19
20
  # specified by 'storage' in <tt>config.yml</tt>.
20
21
  case CloudCrowd.config[:storage]
21
- when 's3' then include S3Store
22
22
  when 'filesystem' then include FilesystemStore
23
+ when 's3' then include S3Store
24
+ when 'cloudfiles' then include CloudfilesStore
23
25
  else raise Error::StorageNotFound, "#{CloudCrowd.config[:storage]} is not a valid storage back end"
24
26
  end
25
27
 
@@ -0,0 +1,41 @@
1
+ gem 'cloudfiles'
2
+
3
+ module CloudCrowd
4
+ class AssetStore
5
+
6
+ # The CloudfilesStore is an implementation of an AssetStore that uses a Rackspace Cloud Files container for all resulting files.
7
+ module CloudfilesStore
8
+
9
+ # Configure Rackspace Cloud and connect
10
+ def setup
11
+ username = CloudCrowd.config[:cloudfiles_username]
12
+ api_key = CloudCrowd.config[:cloudfiles_api_key]
13
+ container = CloudCrowd.config[:cloudfiles_container]
14
+ valid_conf = [username, api_key, container].all? {|s| s.is_a? String }
15
+ raise Error::MissingConfiguration, "A Rackspace Cloud Files account must be configured in 'config.yml' before 'cloudfiles' storage can be used" unless valid_conf
16
+
17
+ @cloud = CloudFiles::Connection.new(username, api_key)
18
+ @container = @cloud.container container
19
+ end
20
+
21
+ # Save a finished file from local storage to Cloud Files.
22
+ def save(local_path, save_path)
23
+ object = @container.create_object save_path, true
24
+ object.load_from_filename local_path
25
+ object.public_url
26
+ end
27
+
28
+ # Remove all of a Job's resulting files from Cloud Files, both intermediate and finished.
29
+ def cleanup(job)
30
+ @container.objects(:prefix => "#{job.action}/job_#{job.id}").each do |object|
31
+ begin
32
+ @container.delete_object object
33
+ rescue
34
+ log "failed to delete #{job.action}/job_#{job.id}"
35
+ end
36
+ end
37
+ end
38
+ end
39
+
40
+ end
41
+ end
@@ -1,10 +1,12 @@
1
+ gem 'right_aws'
2
+
1
3
  module CloudCrowd
2
4
  class AssetStore
3
-
5
+
4
6
  # The S3Store is an implementation of an AssetStore that uses a bucket
5
7
  # on S3 for all resulting files.
6
8
  module S3Store
7
-
9
+
8
10
  # Configure authentication and establish a connection to S3, first thing.
9
11
  def setup
10
12
  @use_auth = CloudCrowd.config[:s3_authentication]
@@ -18,8 +20,8 @@ module CloudCrowd
18
20
  @bucket = @s3.bucket(bucket_name)
19
21
  @bucket = @s3.bucket(bucket_name, true) unless @bucket
20
22
  end
21
-
22
- # Save a finished file from local storage to S3. Save it publicly unless
23
+
24
+ # Save a finished file from local storage to S3. Save it publicly unless
23
25
  # we're configured to use S3 authentication. Authenticated links expire
24
26
  # after one day by default.
25
27
  def save(local_path, save_path)
@@ -31,13 +33,13 @@ module CloudCrowd
31
33
  @bucket.key(save_path).public_link
32
34
  end
33
35
  end
34
-
36
+
35
37
  # Remove all of a Job's resulting files from S3, both intermediate and finished.
36
38
  def cleanup(job)
37
39
  @bucket.delete_folder("#{job.action}/job_#{job.id}")
38
40
  end
39
-
41
+
40
42
  end
41
-
43
+
42
44
  end
43
45
  end
@@ -1,24 +1,25 @@
1
1
  module CloudCrowd
2
2
 
3
- # A NodeRecord is the central server's record of a Node running remotely. We
3
+ # A NodeRecord is the central server's record of a Node running remotely. We
4
4
  # can use it to assign WorkUnits to the Node, and keep track of its status.
5
5
  # When a Node exits, it destroys this record.
6
6
  class NodeRecord < ActiveRecord::Base
7
-
7
+
8
8
  has_many :work_units
9
-
9
+
10
10
  validates_presence_of :host, :ip_address, :port, :enabled_actions
11
-
11
+
12
12
  after_destroy :redistribute_work_units
13
-
13
+
14
14
  # Available Nodes haven't used up their maximum number of workers yet.
15
15
  named_scope :available, {
16
16
  :conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
17
17
  :order => 'updated_at asc'
18
18
  }
19
-
20
- # Register a Node with the central server. Currently this only happens at
21
- # Node startup.
19
+
20
+ # Register a Node with the central server. This happens periodically
21
+ # (once every `Node::CHECK_IN_INTERVAL` seconds). Nodes will be de-registered
22
+ # if they have not checked in within a reasonable interval.
22
23
  def self.check_in(params, request)
23
24
  attrs = {
24
25
  :ip_address => request.ip,
@@ -29,15 +30,15 @@ module CloudCrowd
29
30
  }
30
31
  self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
31
32
  end
32
-
33
+
33
34
  # Dispatch a WorkUnit to this node. Places the node back at the end of
34
35
  # the rotation. If we fail to send the WorkUnit, we consider the node to be
35
36
  # down, and remove this record, freeing up all of its checked-out work units.
36
- # If the Node responds that it's overloaded, we mark it as busy. Returns
37
+ # If the Node responds that it's overloaded, we mark it as busy. Returns
37
38
  # true if the WorkUnit was dispatched successfully.
38
39
  def send_work_unit(unit)
39
40
  result = node['/work'].post(:work_unit => unit.to_json)
40
- unit.assign_to(self, JSON.parse(result)['pid'])
41
+ unit.assign_to(self, JSON.parse(result.body)['pid'])
41
42
  touch && true
42
43
  rescue RestClient::RequestFailed => e
43
44
  raise e unless e.http_code == 503 && e.http_body == Node::OVERLOADED_MESSAGE
@@ -46,45 +47,45 @@ module CloudCrowd
46
47
  # Couldn't post to node, assume it's gone away.
47
48
  destroy && false
48
49
  end
49
-
50
+
50
51
  # What Actions is this Node able to run?
51
52
  def actions
52
53
  @actions ||= enabled_actions.split(',')
53
54
  end
54
-
55
- # Is this Node too busy for more work? Determined by number of workers, or
55
+
56
+ # Is this Node too busy for more work? Determined by number of workers, or
56
57
  # the Node's load average, as configured in config.yml.
57
58
  def busy?
58
59
  busy || (max_workers && work_units.count >= max_workers)
59
60
  end
60
-
61
+
61
62
  # The URL at which this Node may be reached.
62
63
  # TODO: Make sure that the host actually has externally accessible DNS.
63
64
  def url
64
65
  @url ||= "http://#{host}:#{port}"
65
66
  end
66
-
67
- # Keep a RestClient::Resource handy for contacting the Node, including
67
+
68
+ # Keep a RestClient::Resource handy for contacting the Node, including
68
69
  # HTTP authentication, if configured.
69
70
  def node
70
71
  @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
71
72
  end
72
-
73
+
73
74
  # The printable status of the Node.
74
75
  def display_status
75
76
  busy? ? 'busy' : 'available'
76
77
  end
77
-
78
+
78
79
  # A list of the process ids of the workers currently being run by the Node.
79
80
  def worker_pids
80
81
  work_units.all(:select => 'worker_pid').map(&:worker_pid)
81
82
  end
82
-
83
+
83
84
  # Release all of this Node's WorkUnits for other nodes to take.
84
85
  def release_work_units
85
86
  WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
86
87
  end
87
-
88
+
88
89
  # The JSON representation of a NodeRecord includes its worker_pids.
89
90
  def to_json(opts={})
90
91
  { 'host' => host,
@@ -92,16 +93,16 @@ module CloudCrowd
92
93
  'status' => display_status
93
94
  }.to_json
94
95
  end
95
-
96
-
96
+
97
+
97
98
  private
98
-
99
- # When a Node exits, release its WorkUnits and redistribute them to others.
99
+
100
+ # When a Node exits, release its WorkUnits and redistribute them to others.
100
101
  # Redistribute in a separate thread to avoid delaying shutdown.
101
102
  def redistribute_work_units
102
103
  release_work_units
103
104
  Thread.new { WorkUnit.distribute_to_nodes }
104
105
  end
105
-
106
+
106
107
  end
107
108
  end