cloud-crowd 0.3.3 → 0.4.0

@@ -1,7 +1,7 @@
  Gem::Specification.new do |s|
  s.name = 'cloud-crowd'
- s.version = '0.3.3' # Keep version in sync with cloud-cloud.rb
- s.date = '2010-01-27'
+ s.version = '0.4.0' # Keep version in sync with cloud-cloud.rb
+ s.date = '2010-03-31'
 
  s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
  s.summary = "Parallel Processing for the Rest of Us"
@@ -27,11 +27,10 @@ Gem::Specification.new do |s|
  '--main' << 'README' <<
  '--all'
 
- s.add_dependency 'sinatra', ['>= 0.9.4']
- s.add_dependency 'activerecord', ['>= 2.3.3']
+ s.add_dependency 'sinatra', ['~> 0.9']
+ s.add_dependency 'activerecord', ['~> 2.3']
  s.add_dependency 'json', ['>= 1.1.7']
- s.add_dependency 'rest-client', ['>= 1.0.3']
- s.add_dependency 'right_aws', ['>= 1.10.0']
+ s.add_dependency 'rest-client', ['>= 1.4']
  s.add_dependency 'thin', ['>= 1.2.4']
 
  if s.respond_to?(:add_development_dependency)
@@ -58,6 +57,7 @@ lib/cloud-crowd.rb
  lib/cloud_crowd/action.rb
  lib/cloud_crowd/asset_store/filesystem_store.rb
  lib/cloud_crowd/asset_store/s3_store.rb
+ lib/cloud_crowd/asset_store/cloudfiles_store.rb
  lib/cloud_crowd/asset_store.rb
  lib/cloud_crowd/command_line.rb
  lib/cloud_crowd/exceptions.rb
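
Note on the dependency changes above: sinatra and activerecord move from open-ended '>=' floors to pessimistic '~>' constraints, and right_aws is no longer a hard gemspec dependency; the s3_store.rb hunk further down activates it with gem 'right_aws' only when S3 storage is loaded. A quick sketch of what the '~>' operator accepts, using RubyGems' own requirement class (version numbers chosen for illustration):

  require 'rubygems'
  Gem::Requirement.new('~> 0.9').satisfied_by?(Gem::Version.new('0.9.6'))  # => true  (>= 0.9, < 1.0)
  Gem::Requirement.new('~> 0.9').satisfied_by?(Gem::Version.new('1.0.0'))  # => false
  Gem::Requirement.new('~> 2.3').satisfied_by?(Gem::Version.new('2.3.5'))  # => true  (>= 2.3, < 3.0)
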
@@ -2,20 +2,21 @@
  :central_server: http://localhost:9173
 
  # The following settings allow you to control the number of workers that can run
- # on a given node, to prevent the node from becoming overloaded. 'max_workers'
+ # on a given node, to prevent the node from becoming overloaded. 'max_workers'
  # is a simple cap on the maximum number of workers a node is allowed to run
  # concurrently. 'max_load' is the maximum (one-minute) load average, above which
  # a node will refuse to take new work. 'min_free_memory' is the minimum amount
- # of free RAM (in megabytes) a node is allowed to have, below which no new
+ # of free RAM (in megabytes) a node is allowed to have, below which no new
  # workers are run. These settings may be used in any combination.
  :max_workers: 5
  # :max_load: 5.0
  # :min_free_memory: 150
 
  # The storage back-end that you'd like to use for intermediate and final results
- # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
- # be used in development, on single-machine installations, or networked drives.
- # If you *are* developing an action, filesystem is certainly faster and easier.
+ # of processing. 's3', 'filesystem', and 'cloudfiles' are supported.
+ # 'filesystem' should only be used in development, on single-machine installations,
+ # or networked drives. If you *are* developing an action, filesystem is certainly
+ # faster and easier.
  :storage: s3
 
  # Please provide your AWS credentials for S3 storage of job output.
@@ -29,22 +30,34 @@
  :s3_bucket: [your CloudCrowd bucket]
  :s3_authentication: no
 
- # The following settings configure local paths. 'local_storage_path' is the
+ # Cloudfiles
+ :cloudfiles_username: [your Rackspace Cloud Files username]
+ :cloudfiles_api_key: [your Rackspace Cloud Files API key]
+ :cloudfiles_container: [your Rackspace Cloud Files container]
+
+ # The following settings configure local paths. 'local_storage_path' is the
  # directory in which all files will be saved if you're using the 'filesystem'
- # storage. 'log_path' and 'pid_path' are the directories in which daemonized
- # servers and nodes will store their process ids and log files. The default
+ # storage. 'log_path' and 'pid_path' are the directories in which daemonized
+ # servers and nodes will store their process ids and log files. The default
  # values are listed.
  # :local_storage_path: /tmp/cloud_crowd_storage
  # :log_path: log
  # :pid_path: tmp/pids
 
- # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
- # to the central server). If yes, specify the login and password that all
+ # Use HTTP Basic Auth for all requests? (Includes all internal worker requests
+ # to the central server). If yes, specify the login and password that all
  # requests must provide for authentication.
  :http_authentication: no
  :login: [your login name]
  :password: [your password]
 
+ # Disable all the default built-in actions
+ # :disable_default_actions: true
+
+ # Disable specific actions for the node
+ # Use this if you want to disable a limited number of actions
+ # :disabled_actions: ['word_count']
+
  # By default, CloudCrowd looks for installed actions inside the 'actions'
  # subdirectory of this configuration folder. 'actions_path' allows you to load
  # additional actions from a location of your choice.
@@ -4,10 +4,9 @@ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
 
  # Common Gems:
  require 'rubygems'
- gem 'activerecord'
+ gem 'activerecord', '~> 2.0'
  gem 'json'
  gem 'rest-client'
- gem 'right_aws'
  gem 'sinatra'
  gem 'thin'
 
@@ -20,6 +19,7 @@ autoload :FileUtils, 'fileutils'
  autoload :JSON, 'json'
  autoload :RestClient, 'rest_client'
  autoload :RightAws, 'right_aws'
+ autoload :CloudFiles, 'cloudfiles'
  autoload :Sinatra, 'sinatra'
  autoload :Thin, 'thin'
  autoload :YAML, 'yaml'
@@ -44,7 +44,7 @@ module CloudCrowd
  autoload :WorkUnit, 'cloud_crowd/models'
 
  # Keep this version in sync with the gemspec.
- VERSION = '0.3.3'
+ VERSION = '0.4.0'
 
  # Increment the schema version when there's a backwards incompatible change.
  SCHEMA_VERSION = 3
@@ -166,7 +166,7 @@ module CloudCrowd
 
  # Retrieve the list of every installed Action for this node or server.
  def action_paths
- default_actions = Dir["#{ROOT}/actions/*.rb"]
+ default_actions = config[:disable_default_actions] ? [] : Dir["#{ROOT}/actions/*.rb"]
  installed_actions = Dir["#{@config_path}/actions/*.rb"]
  custom_actions = CloudCrowd.config[:actions_path] ? Dir["#{CloudCrowd.config[:actions_path]}/*.rb"] : []
  default_actions + installed_actions + custom_actions
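
The action_paths change above is what backs the new :disable_default_actions flag in config.yml: when the flag is set, the bundled actions under ROOT/actions are skipped and only installed and custom actions remain. A minimal sketch of the effect, assuming the method is called on the CloudCrowd module with a config already loaded (the result path is hypothetical):

  CloudCrowd.config[:disable_default_actions] = true
  CloudCrowd.action_paths
  # => ["/etc/cloud-crowd/actions/my_action.rb"]  # installed + custom only, no bundled "#{ROOT}/actions/*.rb" entries
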
@@ -1,7 +1,7 @@
  module CloudCrowd
-
+
  # As you write your custom actions, have them inherit from CloudCrowd::Action.
- # All actions must implement a +process+ method, which should return a
+ # All actions must implement a +process+ method, which should return a
  # JSON-serializable object that will be used as the output for the work unit.
  # See the default actions for examples.
  #
@@ -16,11 +16,11 @@ module CloudCrowd
  # Note that Actions inherit a backticks (`) method that raises an Exception
  # if the external command fails.
  class Action
-
+
  FILE_URL = /\Afile:\/\//
-
+
  attr_reader :input, :input_path, :file_name, :options, :work_directory
-
+
  # Initializing an Action sets up all of the read-only variables that
  # form the bulk of the API for action subclasses. (Paths to read from and
  # write to). It creates the +work_directory+ and moves into it.
@@ -34,17 +34,17 @@ module CloudCrowd
  parse_input
  download_input
  end
-
+
  # Each Action subclass must implement a +process+ method, overriding this.
  def process
  raise NotImplementedError, "CloudCrowd::Actions must override 'process' with their own processing code."
  end
-
+
  # Download a file to the specified path.
  def download(url, path)
  `curl -s "#{url}" > "#{path}"`
  return path
- # The previous implementation is below, and, although it would be
+ # The previous implementation is below, and, although it would be
  # wonderful not to shell out, RestClient wasn't handling URLs with encoded
  # entities (%20, for example), and doesn't let you download to a given
  # location. Getting a RestClient patch in would be ideal.
@@ -56,21 +56,21 @@ module CloudCrowd
  # FileUtils.mv resp.file.path, path
  # end
  end
-
- # Takes a local filesystem path, saves the file to S3, and returns the
- # public (or authenticated) url on S3 where the file can be accessed.
+
+ # Takes a local filesystem path, saves the file to S3, and returns the
+ # public (or authenticated) url on S3 where the file can be accessed.
  def save(file_path)
  save_path = File.join(storage_prefix, File.basename(file_path))
  @store.save(file_path, save_path)
  end
-
+
  # After the Action has finished, we remove the work directory and return
  # to the root directory (where workers run by default).
  def cleanup_work_directory
  FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
  end
-
- # Actions have a backticks command that raises a CommandFailed exception
+
+ # Actions have a backticks command that raises a CommandFailed exception
  # on failure, so that processing doesn't just blithely continue.
  def `(command)
  result = super(command)
@@ -78,17 +78,18 @@ module CloudCrowd
  raise Error::CommandFailed.new(result, exit_code) unless exit_code == 0
  result
  end
-
-
+
+
  private
-
+
  # Convert an unsafe URL into a filesystem-friendly filename.
  def safe_filename(url)
+ url.sub!(/\?.*\Z/, '')
  ext = File.extname(url)
  name = URI.unescape(File.basename(url)).gsub(/[^a-zA-Z0-9_\-.]/, '-').gsub(/-+/, '-')
  File.basename(name, ext).gsub('.', '-') + ext
  end
-
+
  # The directory prefix to use for both local and S3 storage.
  # [action]/job_[job_id]/unit_[work_unit_it]
  def storage_prefix
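
The one functional change in this hunk is the new url.sub! line in safe_filename, which strips a query string before the filename is derived; previously File.extname would swallow the query portion (of an authenticated S3 link, for instance) into the extension. A worked trace of the method above with a hypothetical URL:

  safe_filename('http://example.com/docs/My%20File.pdf?AWSAccessKeyId=abc123')
  # query string dropped -> 'http://example.com/docs/My%20File.pdf'
  # File.extname         -> '.pdf'   (previously '.pdf?AWSAccessKeyId=abc123')
  # unescape + sanitize  -> 'My-File.pdf'
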
@@ -98,18 +99,18 @@ module CloudCrowd
  path_parts << "unit_#{@work_unit_id}" if @work_unit_id
  @storage_prefix ||= File.join(path_parts)
  end
-
+
  # If we think that the input is JSON, replace it with the parsed form.
  # It would be great if the JSON module had an is_json? method.
  def parse_input
  return unless ['[', '{'].include? @input[0..0]
  @input = JSON.parse(@input) rescue @input
  end
-
+
  def input_is_url?
  !URI.parse(@input).scheme.nil? rescue false
  end
-
+
  # If the input is a URL, download the file before beginning processing.
  def download_input
  return unless input_is_url?
@@ -119,7 +120,7 @@ module CloudCrowd
  download(@input, @input_path)
  end
  end
-
+
  end
-
+
  end
@@ -14,12 +14,14 @@ module CloudCrowd
 
  autoload :S3Store, 'cloud_crowd/asset_store/s3_store'
  autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
+ autoload :CloudfilesStore, 'cloud_crowd/asset_store/cloudfiles_store'
 
  # Configure the AssetStore with the specific storage implementation
  # specified by 'storage' in <tt>config.yml</tt>.
  case CloudCrowd.config[:storage]
- when 's3' then include S3Store
  when 'filesystem' then include FilesystemStore
+ when 's3' then include S3Store
+ when 'cloudfiles' then include CloudfilesStore
  else raise Error::StorageNotFound, "#{CloudCrowd.config[:storage]} is not a valid storage back end"
  end
 
@@ -0,0 +1,41 @@
+ gem 'cloudfiles'
+
+ module CloudCrowd
+ class AssetStore
+
+ # The CloudFilesStore is an implementation of an AssetStore that uses a Rackspace Cloud
+ module CloudfilesStore
+
+ # Configure Rackspace Cloud and connect
+ def setup
+ username = CloudCrowd.config[:cloudfiles_username]
+ api_key = CloudCrowd.config[:cloudfiles_api_key]
+ container = CloudCrowd.config[:cloudfiles_container]
+ valid_conf = [username, api_key, container].all? {|s| s.is_a? String }
+ raise Error::MissingConfiguration, "A Rackspace Cloud Files account must be configured in 'config.yml' before 'cloudfiles' storage can be used" unless valid_conf
+
+ @cloud = CloudFiles::Connection.new(username, api_key)
+ @container = @cloud.container container
+ end
+
+ # Save a finished file from local storage to Cloud Files.
+ def save(local_path, save_path)
+ object = @container.create_object save_path, true
+ object.load_from_filename local_path
+ object.public_url
+ end
+
+ # Remove all of a Job's resulting files from Cloud Files, both intermediate and finished.
+ def cleanup(job)
+ @container.objects(:prefix => "#{job.action}/job_#{job.id}").each do |object|
+ begin
+ @container.delete_object object
+ rescue
+ log "failed to delete #{job.action}/job_#{job.id}"
+ end
+ end
+ end
+ end
+
+ end
+ end
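
For orientation, the new CloudfilesStore mirrors the S3Store interface (setup, save, cleanup) and reads the three :cloudfiles_* keys added to config.yml above. A minimal sketch of exercising it directly, assuming CloudCrowd.configure has already loaded a config.yml with :storage: cloudfiles and the credentials filled in; the wrapper class and file paths are hypothetical, since in normal use AssetStore mixes the module in for you:

  class DemoStore
    include CloudCrowd::AssetStore::CloudfilesStore
  end

  store = DemoStore.new
  store.setup                                                   # connects and looks up the configured container
  url = store.save('/tmp/out.txt', 'word_count/job_1/out.txt')  # uploads, returns the object's public URL
  puts url
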
@@ -1,10 +1,12 @@
+ gem 'right_aws'
+
  module CloudCrowd
  class AssetStore
-
+
  # The S3Store is an implementation of an AssetStore that uses a bucket
  # on S3 for all resulting files.
  module S3Store
-
+
  # Configure authentication and establish a connection to S3, first thing.
  def setup
  @use_auth = CloudCrowd.config[:s3_authentication]
@@ -18,8 +20,8 @@ module CloudCrowd
  @bucket = @s3.bucket(bucket_name)
  @bucket = @s3.bucket(bucket_name, true) unless @bucket
  end
-
- # Save a finished file from local storage to S3. Save it publicly unless
+
+ # Save a finished file from local storage to S3. Save it publicly unless
  # we're configured to use S3 authentication. Authenticated links expire
  # after one day by default.
  def save(local_path, save_path)
@@ -31,13 +33,13 @@ module CloudCrowd
  @bucket.key(save_path).public_link
  end
  end
-
+
  # Remove all of a Job's resulting files from S3, both intermediate and finished.
  def cleanup(job)
  @bucket.delete_folder("#{job.action}/job_#{job.id}")
  end
-
+
  end
-
+
  end
  end
@@ -1,24 +1,25 @@
  module CloudCrowd
 
- # A NodeRecord is the central server's record of a Node running remotely. We
+ # A NodeRecord is the central server's record of a Node running remotely. We
  # can use it to assign WorkUnits to the Node, and keep track of its status.
  # When a Node exits, it destroys this record.
  class NodeRecord < ActiveRecord::Base
-
+
  has_many :work_units
-
+
  validates_presence_of :host, :ip_address, :port, :enabled_actions
-
+
  after_destroy :redistribute_work_units
-
+
  # Available Nodes haven't used up their maxiumum number of workers yet.
  named_scope :available, {
  :conditions => ['(max_workers is null or (select count(*) from work_units where node_record_id = node_records.id) < max_workers)'],
  :order => 'updated_at asc'
  }
-
- # Register a Node with the central server. Currently this only happens at
- # Node startup.
+
+ # Register a Node with the central server. This happens periodically
+ # (once every `Node::CHECK_IN_INTERVAL` seconds). Nodes will be de-registered
+ # if they checked in within a reasonable interval.
  def self.check_in(params, request)
  attrs = {
  :ip_address => request.ip,
@@ -29,15 +30,15 @@ module CloudCrowd
  }
  self.find_or_create_by_host(params[:host]).update_attributes!(attrs)
  end
-
+
  # Dispatch a WorkUnit to this node. Places the node at back at the end of
  # the rotation. If we fail to send the WorkUnit, we consider the node to be
  # down, and remove this record, freeing up all of its checked-out work units.
- # If the Node responds that it's overloaded, we mark it as busy. Returns
+ # If the Node responds that it's overloaded, we mark it as busy. Returns
  # true if the WorkUnit was dispatched successfully.
  def send_work_unit(unit)
  result = node['/work'].post(:work_unit => unit.to_json)
- unit.assign_to(self, JSON.parse(result)['pid'])
+ unit.assign_to(self, JSON.parse(result.body)['pid'])
  touch && true
  rescue RestClient::RequestFailed => e
  raise e unless e.http_code == 503 && e.http_body == Node::OVERLOADED_MESSAGE
@@ -46,45 +47,45 @@ module CloudCrowd
  # Couldn't post to node, assume it's gone away.
  destroy && false
  end
-
+
  # What Actions is this Node able to run?
  def actions
  @actions ||= enabled_actions.split(',')
  end
-
- # Is this Node too busy for more work? Determined by number of workers, or
+
+ # Is this Node too busy for more work? Determined by number of workers, or
  # the Node's load average, as configured in config.yml.
  def busy?
  busy || (max_workers && work_units.count >= max_workers)
  end
-
+
  # The URL at which this Node may be reached.
  # TODO: Make sure that the host actually has externally accessible DNS.
  def url
  @url ||= "http://#{host}:#{port}"
  end
-
- # Keep a RestClient::Resource handy for contacting the Node, including
+
+ # Keep a RestClient::Resource handy for contacting the Node, including
  # HTTP authentication, if configured.
  def node
  @node ||= RestClient::Resource.new(url, CloudCrowd.client_options)
  end
-
+
  # The printable status of the Node.
  def display_status
  busy? ? 'busy' : 'available'
  end
-
+
  # A list of the process ids of the workers currently being run by the Node.
  def worker_pids
  work_units.all(:select => 'worker_pid').map(&:worker_pid)
  end
-
+
  # Release all of this Node's WorkUnits for other nodes to take.
  def release_work_units
  WorkUnit.update_all('node_record_id = null, worker_pid = null', "node_record_id = #{id}")
  end
-
+
  # The JSON representation of a NodeRecord includes its worker_pids.
  def to_json(opts={})
  { 'host' => host,
@@ -92,16 +93,16 @@ module CloudCrowd
  'status' => display_status
  }.to_json
  end
-
-
+
+
  private
-
- # When a Node exits, release its WorkUnits and redistribute them to others.
+
+ # When a Node exits, release its WorkUnits and redistribute them to others.
  # Redistribute in a separate thread to avoid delaying shutdown.
  def redistribute_work_units
  release_work_units
  Thread.new { WorkUnit.distribute_to_nodes }
  end
-
+
  end
  end