mooktakim-cloud-crowd 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/EPIGRAPHS +17 -0
- data/LICENSE +22 -0
- data/README +93 -0
- data/actions/graphics_magick.rb +43 -0
- data/actions/process_pdfs.rb +92 -0
- data/actions/word_count.rb +16 -0
- data/bin/crowd +5 -0
- data/config/config.example.ru +23 -0
- data/config/config.example.yml +55 -0
- data/config/database.example.yml +16 -0
- data/examples/graphics_magick_example.rb +44 -0
- data/examples/process_pdfs_example.rb +40 -0
- data/examples/word_count_example.rb +42 -0
- data/lib/cloud-crowd.rb +188 -0
- data/lib/cloud_crowd/action.rb +125 -0
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +39 -0
- data/lib/cloud_crowd/asset_store/s3_store.rb +43 -0
- data/lib/cloud_crowd/asset_store.rb +41 -0
- data/lib/cloud_crowd/command_line.rb +242 -0
- data/lib/cloud_crowd/exceptions.rb +46 -0
- data/lib/cloud_crowd/helpers/authorization.rb +52 -0
- data/lib/cloud_crowd/helpers/resources.rb +25 -0
- data/lib/cloud_crowd/helpers.rb +8 -0
- data/lib/cloud_crowd/inflector.rb +19 -0
- data/lib/cloud_crowd/models/job.rb +190 -0
- data/lib/cloud_crowd/models/node_record.rb +107 -0
- data/lib/cloud_crowd/models/work_unit.rb +170 -0
- data/lib/cloud_crowd/models.rb +40 -0
- data/lib/cloud_crowd/node.rb +199 -0
- data/lib/cloud_crowd/schema.rb +50 -0
- data/lib/cloud_crowd/server.rb +123 -0
- data/lib/cloud_crowd/worker.rb +149 -0
- data/mooktakim-cloud-crowd.gemspec +116 -0
- data/public/css/admin_console.css +243 -0
- data/public/css/reset.css +42 -0
- data/public/images/bullet_green.png +0 -0
- data/public/images/bullet_white.png +0 -0
- data/public/images/cloud_hand.png +0 -0
- data/public/images/header_back.png +0 -0
- data/public/images/logo.png +0 -0
- data/public/images/queue_fill.png +0 -0
- data/public/images/server.png +0 -0
- data/public/images/server_busy.png +0 -0
- data/public/images/server_error.png +0 -0
- data/public/images/sidebar_bottom.png +0 -0
- data/public/images/sidebar_top.png +0 -0
- data/public/images/worker_info.png +0 -0
- data/public/images/worker_info_loading.gif +0 -0
- data/public/js/admin_console.js +197 -0
- data/public/js/excanvas.js +1 -0
- data/public/js/flot.js +1 -0
- data/public/js/jquery.js +19 -0
- data/test/acceptance/test_failing_work_units.rb +33 -0
- data/test/acceptance/test_node.rb +20 -0
- data/test/acceptance/test_server.rb +66 -0
- data/test/acceptance/test_word_count.rb +40 -0
- data/test/blueprints.rb +25 -0
- data/test/config/actions/failure_testing.rb +13 -0
- data/test/config/config.ru +17 -0
- data/test/config/config.yml +6 -0
- data/test/config/database.yml +3 -0
- data/test/test_helper.rb +19 -0
- data/test/unit/test_action.rb +70 -0
- data/test/unit/test_configuration.rb +48 -0
- data/test/unit/test_job.rb +103 -0
- data/test/unit/test_node.rb +41 -0
- data/test/unit/test_node_record.rb +42 -0
- data/test/unit/test_work_unit.rb +53 -0
- data/test/unit/test_worker.rb +48 -0
- data/views/operations_center.erb +82 -0
- metadata +290 -0
data/lib/cloud-crowd.rb
ADDED
@@ -0,0 +1,188 @@
|
|
1
|
+
# The Grand Central of code loading...
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
|
4
|
+
|
5
|
+
# Common Gems:
|
6
|
+
require 'rubygems'
|
7
|
+
gem 'activerecord', '2.3.5'
|
8
|
+
gem 'json'
|
9
|
+
gem 'rest-client'
|
10
|
+
gem 'right_aws'
|
11
|
+
gem 'sinatra'
|
12
|
+
gem 'thin'
|
13
|
+
|
14
|
+
# Autoloading for all the pieces which may or may not be needed:
|
15
|
+
autoload :ActiveRecord, 'active_record'
|
16
|
+
autoload :Benchmark, 'benchmark'
|
17
|
+
autoload :Digest, 'digest'
|
18
|
+
autoload :ERB, 'erb'
|
19
|
+
autoload :FileUtils, 'fileutils'
|
20
|
+
autoload :JSON, 'json'
|
21
|
+
autoload :RestClient, 'rest_client'
|
22
|
+
autoload :RightAws, 'right_aws'
|
23
|
+
autoload :Sinatra, 'sinatra'
|
24
|
+
autoload :Thin, 'thin'
|
25
|
+
autoload :YAML, 'yaml'
|
26
|
+
|
27
|
+
# Common code which should really be required in every circumstance.
|
28
|
+
require 'socket'
|
29
|
+
require 'cloud_crowd/exceptions'
|
30
|
+
|
31
|
+
module CloudCrowd
|
32
|
+
|
33
|
+
# Autoload all the CloudCrowd internals.
|
34
|
+
autoload :Action, 'cloud_crowd/action'
|
35
|
+
autoload :AssetStore, 'cloud_crowd/asset_store'
|
36
|
+
autoload :CommandLine, 'cloud_crowd/command_line'
|
37
|
+
autoload :Helpers, 'cloud_crowd/helpers'
|
38
|
+
autoload :Inflector, 'cloud_crowd/inflector'
|
39
|
+
autoload :Job, 'cloud_crowd/models'
|
40
|
+
autoload :Node, 'cloud_crowd/node'
|
41
|
+
autoload :NodeRecord, 'cloud_crowd/models'
|
42
|
+
autoload :Server, 'cloud_crowd/server'
|
43
|
+
autoload :Worker, 'cloud_crowd/worker'
|
44
|
+
autoload :WorkUnit, 'cloud_crowd/models'
|
45
|
+
|
46
|
+
# Keep this version in sync with the gemspec.
|
47
|
+
VERSION = '0.3.3'
|
48
|
+
|
49
|
+
# Increment the schema version when there's a backwards incompatible change.
|
50
|
+
SCHEMA_VERSION = 3
|
51
|
+
|
52
|
+
# Root directory of the CloudCrowd gem.
|
53
|
+
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
54
|
+
|
55
|
+
# Default folder to log daemonized servers and nodes into.
|
56
|
+
LOG_PATH = 'log'
|
57
|
+
|
58
|
+
# Default folder to contain the pids of daemonized servers and nodes.
|
59
|
+
PID_PATH = 'tmp/pids'
|
60
|
+
|
61
|
+
# A Job is processing if its WorkUnits are in the queue to be handled by nodes.
|
62
|
+
PROCESSING = 1
|
63
|
+
|
64
|
+
# A Job has succeeded if all of its WorkUnits have finished successfully.
|
65
|
+
SUCCEEDED = 2
|
66
|
+
|
67
|
+
# A Job has failed if even a single one of its WorkUnits has failed (they may
|
68
|
+
# be attempted multiple times on failure, however).
|
69
|
+
FAILED = 3
|
70
|
+
|
71
|
+
# A Job is splitting if it's in the process of dividing its inputs up into
|
72
|
+
# multiple WorkUnits.
|
73
|
+
SPLITTING = 4
|
74
|
+
|
75
|
+
# A Job is merging if it's busy collecting all of its successful WorkUnits
|
76
|
+
# back together into the final result.
|
77
|
+
MERGING = 5
|
78
|
+
|
79
|
+
# A Job is considered to be complete if it succeeded or if it failed.
|
80
|
+
COMPLETE = [SUCCEEDED, FAILED]
|
81
|
+
|
82
|
+
# A Job is considered incomplete if it's being processed, split up or merged.
|
83
|
+
INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
|
84
|
+
|
85
|
+
# Mapping of statuses to their display strings.
|
86
|
+
DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
|
87
|
+
|
88
|
+
class << self
|
89
|
+
attr_reader :config
|
90
|
+
attr_accessor :identity
|
91
|
+
|
92
|
+
# Configure CloudCrowd by passing in the path to <tt>config.yml</tt>.
|
93
|
+
def configure(config_path)
  # Remember the directory that holds config.yml; action_paths looks for
  # installed actions relative to it.
  config_dir   = File.dirname(config_path)
  @config_path = File.expand_path(config_dir)
  # The parsed settings become available through the +config+ reader.
  @config      = YAML.load_file(config_path)
end
|
97
|
+
|
98
|
+
# Configure the CloudCrowd central database (and connect to it), by passing
|
99
|
+
# in a path to <tt>database.yml</tt>. The file should use the standard
|
100
|
+
# ActiveRecord connection format.
|
101
|
+
def configure_database(config_path, validate_schema=true)
  # Connect using the ActiveRecord settings stored in database.yml.
  configuration = YAML.load_file(config_path)
  ActiveRecord::Base.establish_connection(configuration)
  # Without validation there is nothing more to do (returns nil).
  return unless validate_schema
  version = ActiveRecord::Base.connection.select_values('select max(version) from schema_migrations').first.to_i
  return true if version == SCHEMA_VERSION
  puts "Your database schema is out of date. Please use `crowd load_schema` to update it. This will wipe all the tables, so make sure that your jobs have a chance to finish first.\nexiting..."
  # A stale schema is a failure: exit non-zero (as the command line's
  # config_not_found does) so shells and supervisors can detect it.
  exit(1)
end
|
111
|
+
|
112
|
+
# Get a reference to the central server, including authentication if
|
113
|
+
# configured.
|
114
|
+
def central_server
  # Build the REST resource once and reuse it on later calls.
  return @central_server if @central_server
  server_url = CloudCrowd.config[:central_server]
  @central_server = RestClient::Resource.new(server_url, CloudCrowd.client_options)
end
|
117
|
+
|
118
|
+
# The path that daemonized servers and nodes will log to.
|
119
|
+
def log_path(log_file=nil)
  # Resolve the log directory once: the configured value, else LOG_PATH.
  @log_path = config[:log_path] || LOG_PATH unless @log_path
  # With no file name the directory itself is returned.
  return @log_path unless log_file
  File.join(@log_path, log_file)
end
|
123
|
+
|
124
|
+
# The path in which daemonized servers and nodes will store their pids.
|
125
|
+
def pid_path(pid_file=nil)
  # Resolve the pid directory once: the configured value, else PID_PATH.
  @pid_path = config[:pid_path] || PID_PATH unless @pid_path
  if pid_file
    File.join(@pid_path, pid_file)
  else
    @pid_path
  end
end
|
129
|
+
|
130
|
+
# The standard RestClient options for the central server talking to nodes,
|
131
|
+
# as well as the other way around. There's a timeout of 5 seconds to open
|
132
|
+
# a connection, and a timeout of 30 to finish reading it.
|
133
|
+
def client_options
  @client_options ||= begin
    # 30 seconds to read a response, 5 seconds to open the connection.
    opts = {:timeout => 30, :open_timeout => 5}
    # Credentials are only attached when HTTP authentication is enabled.
    if CloudCrowd.config[:http_authentication]
      opts[:user]     = CloudCrowd.config[:login]
      opts[:password] = CloudCrowd.config[:password]
    end
    opts
  end
end
|
142
|
+
|
143
|
+
# Return the displayable status name of an internal CloudCrowd status number.
|
144
|
+
# (See the above constants).
|
145
|
+
def display_status(status)
  # Only non-negative integers are valid indexes into DISPLAY_STATUS_MAP.
  # A negative index would silently wrap around to the end of the array, and
  # nil or a string would raise TypeError; both are reported as 'unknown'.
  return 'unknown' unless status.is_a?(Integer) && status >= 0
  DISPLAY_STATUS_MAP[status] || 'unknown'
end
|
148
|
+
|
149
|
+
# CloudCrowd::Actions are requested dynamically by name. Access them through
|
150
|
+
# this actions property, which behaves like a hash. At load time, we
|
151
|
+
# load all installed Actions and CloudCrowd's default Actions into it.
|
152
|
+
# If you wish to have certain nodes be specialized to only handle certain
|
153
|
+
# Actions, then install only those into the actions directory.
|
154
|
+
def actions
  return @actions if @actions
  loaded = {}
  action_paths.each do |path|
    # The file name (minus extension) is both the lookup key and, once
    # camelized, the name of the constant the file is expected to define.
    name = File.basename(path, File.extname(path))
    require path
    loaded[name] = Module.const_get(Inflector.camelize(name))
  end
  # Only cache the registry once every action has loaded successfully.
  @actions = loaded
rescue NameError => e
  # Re-raise with a hint about the file-name/class-name convention.
  adjusted_message = "One of your actions failed to load. Please ensure that the name of your action class can be deduced from the name of the file. ex: 'word_count.rb' => 'WordCount'\n#{e.message}"
  raise NameError.new(adjusted_message, e.name)
end
|
166
|
+
|
167
|
+
# Retrieve the list of every installed Action for this node or server.
|
168
|
+
def action_paths
  # Search order: actions bundled with the gem, actions installed next to
  # config.yml, then the optional :actions_path directory from the config.
  search_dirs = ["#{ROOT}/actions", "#{@config_path}/actions"]
  search_dirs << CloudCrowd.config[:actions_path] if CloudCrowd.config[:actions_path]
  search_dirs.inject([]) { |paths, dir| paths + Dir["#{dir}/*.rb"] }
end
|
174
|
+
|
175
|
+
# Is this CloudCrowd instance a server? Useful for avoiding loading unneeded
|
176
|
+
# code from actions.
|
177
|
+
def server?
  # True only when the identity accessor has been set to :server.
  :server == @identity
end
|
180
|
+
|
181
|
+
# Or is it a node?
|
182
|
+
def node?
  # True only when the identity accessor has been set to :node.
  :node == @identity
end
|
185
|
+
|
186
|
+
end
|
187
|
+
|
188
|
+
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
|
3
|
+
# As you write your custom actions, have them inherit from CloudCrowd::Action.
|
4
|
+
# All actions must implement a +process+ method, which should return a
|
5
|
+
# JSON-serializable object that will be used as the output for the work unit.
|
6
|
+
# See the default actions for examples.
|
7
|
+
#
|
8
|
+
# Optionally, actions may define +split+ and +merge+ methods to do mapping
|
9
|
+
# and reducing around the +input+. +split+ should return an array of URLs --
|
10
|
+
# to be mapped into WorkUnits and processed in parallel. In the +merge+ step,
|
11
|
+
# +input+ will be an array of all the resulting outputs from calling process.
|
12
|
+
#
|
13
|
+
# All actions have use of an individual +work_directory+, for scratch files,
|
14
|
+
# and spend their duration inside of it, so relative paths work well.
|
15
|
+
#
|
16
|
+
# Note that Actions inherit a backticks (`) method that raises an Exception
|
17
|
+
# if the external command fails.
|
18
|
+
class Action
|
19
|
+
|
20
|
+
FILE_URL = /\Afile:\/\//
|
21
|
+
|
22
|
+
attr_reader :input, :input_path, :file_name, :options, :work_directory
|
23
|
+
|
24
|
+
# Initializing an Action sets up all of the read-only variables that
|
25
|
+
# form the bulk of the API for action subclasses. (Paths to read from and
|
26
|
+
# write to). It creates the +work_directory+ and moves into it.
|
27
|
+
# If we're not merging multiple results, it downloads the input file into
|
28
|
+
# the +work_directory+ before starting.
|
29
|
+
def initialize(status, input, options, store)
  # NOTE(review): +status+ is accepted but never read in this method --
  # presumably kept for the caller's signature; confirm before removing.
  @input, @options, @store = input, options, store
  @job_id, @work_unit_id = options['job_id'], options['work_unit_id']
  @work_directory = File.expand_path(File.join(@store.temp_storage_path, storage_prefix))
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  FileUtils.mkdir_p(@work_directory) unless File.exist?(@work_directory)
  # Decode JSON input first, then fetch the input file if it is a URL.
  parse_input
  download_input
end
|
37
|
+
|
38
|
+
# Each Action subclass must implement a +process+ method, overriding this.
|
39
|
+
def process
  # Abstract hook: the base class has no processing of its own.
  message = "CloudCrowd::Actions must override 'process' with their own processing code."
  raise NotImplementedError.new(message)
end
|
42
|
+
|
43
|
+
# Download a file to the specified path.
|
44
|
+
def download(url, path)
  require 'shellwords'
  # The previous implementation is below, and, although it would be
  # wonderful not to shell out, RestClient wasn't handling URLs with encoded
  # entities (%20, for example), and doesn't let you download to a given
  # location. Getting a RestClient patch in would be ideal.
  #
  # if url.match(FILE_URL)
  #   FileUtils.cp(url.sub(FILE_URL, ''), path)
  # else
  #   resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
  #   FileUtils.mv resp.file.path, path
  # end
  #
  # The url may come straight from job input (see download_input), so both
  # arguments are shell-escaped. Inside plain double quotes a value
  # containing `"`, `$(...)` or backticks would be interpreted by the shell.
  # The backticks here are this class's own backtick method.
  `curl -s #{Shellwords.escape(url)} > #{Shellwords.escape(path)}`
  path
end
|
59
|
+
|
60
|
+
# Takes a local filesystem path, saves the file through the configured
|
61
|
+
# AssetStore, and returns the url where the saved file can be accessed.
|
62
|
+
def save(file_path)
  # Keep the file's own name, filed under this action's storage prefix.
  stored_name = File.basename(file_path)
  destination = File.join(storage_prefix, stored_name)
  @store.save(file_path, destination)
end
|
66
|
+
|
67
|
+
# After the Action has finished, we remove the work directory and return
|
68
|
+
# to the root directory (where workers run by default).
|
69
|
+
def cleanup_work_directory
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  # The guard makes cleanup a no-op when the directory is already gone.
  FileUtils.rm_r(@work_directory) if File.exist?(@work_directory)
end
|
72
|
+
|
73
|
+
# Actions have a backticks command that raises a CommandFailed exception
|
74
|
+
# on failure, so that processing doesn't just blithely continue.
|
75
|
+
def `(command)
  output = super(command)
  # $?.to_i is the raw process status as an integer; zero means success.
  status = $?.to_i
  return output if status == 0
  raise Error::CommandFailed.new(output, status)
end
|
81
|
+
|
82
|
+
|
83
|
+
private
|
84
|
+
|
85
|
+
# Convert an unsafe URL into a filesystem-friendly filename.
|
86
|
+
def safe_filename(url)
  raw_name = File.basename(url)
  # URI.unescape was removed in Ruby 3.0; use the default parser's
  # equivalent where the old method is missing.
  decoded = URI.respond_to?(:unescape) ? URI.unescape(raw_name) : URI::DEFAULT_PARSER.unescape(raw_name)
  # Replace every character outside [a-zA-Z0-9_.-] and collapse dash runs.
  name = decoded.gsub(/[^a-zA-Z0-9_\-.]/, '-').gsub(/-+/, '-')
  # Take the extension from the sanitized name, not the raw URL. Otherwise
  # a query string or encoded character in the URL's extension is appended
  # to the result unsanitized.
  ext = File.extname(name)
  # Dots in the stem become dashes so only the final extension keeps its dot.
  File.basename(name, ext).gsub('.', '-') + ext
end
|
91
|
+
|
92
|
+
# The directory prefix to use for both local and S3 storage.
|
93
|
+
# [action]/job_[job_id]/unit_[work_unit_id]
|
94
|
+
def storage_prefix
  # Computed once per action instance and reused afterwards.
  return @storage_prefix if @storage_prefix
  segments = [Inflector.underscore(self.class), "job_#{@job_id}"]
  # The unit segment is only present when a work unit id was supplied.
  segments.push("unit_#{@work_unit_id}") if @work_unit_id
  @storage_prefix = File.join(segments)
end
|
101
|
+
|
102
|
+
# If we think that the input is JSON, replace it with the parsed form.
|
103
|
+
# It would be great if the JSON module had an is_json? method.
|
104
|
+
def parse_input
  # Only input whose first character opens a JSON array or object is parsed.
  case @input[0..0]
  when '[', '{'
    begin
      @input = JSON.parse(@input)
    rescue StandardError
      # Not valid JSON after all: keep the raw input untouched.
      @input = @input
    end
  else
    nil
  end
end
|
108
|
+
|
109
|
+
def input_is_url?
  # The input counts as a URL when it parses as a URI with a scheme.
  # Anything that fails to parse (e.g. already-decoded JSON) is not a URL.
  begin
    !URI.parse(@input).scheme.nil?
  rescue StandardError
    false
  end
end
|
112
|
+
|
113
|
+
# If the input is a URL, download the file before beginning processing.
|
114
|
+
def download_input
  if input_is_url?
    # The download runs with the work directory as the current directory.
    Dir.chdir(@work_directory) do
      local_name  = safe_filename(@input)
      @input_path = File.join(@work_directory, local_name)
      # +file_name+ is the local name without its extension.
      extension   = File.extname(@input_path)
      @file_name  = File.basename(@input_path, extension)
      download(@input, @input_path)
    end
  end
end
|
122
|
+
|
123
|
+
end
|
124
|
+
|
125
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
class AssetStore
|
3
|
+
|
4
|
+
# The FilesystemStore is an implementation of the AssetStore, good only for
|
5
|
+
# use in development, testing, if you're only running a single-machine
|
6
|
+
# installation, or are using a networked drive.
|
7
|
+
module FilesystemStore
|
8
|
+
|
9
|
+
DEFAULT_STORAGE_PATH = '/tmp/cloud_crowd_storage'
|
10
|
+
|
11
|
+
attr_reader :local_storage_path
|
12
|
+
|
13
|
+
# Make sure that local storage exists and is writeable before starting.
|
14
|
+
def setup
  # Configured path, else the default under /tmp.
  lsp = @local_storage_path = CloudCrowd.config[:local_storage_path] || DEFAULT_STORAGE_PATH
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  FileUtils.mkdir_p(lsp) unless File.exist?(lsp)
  raise Error::StorageNotWritable, "#{lsp} is not writable" unless File.writable?(lsp)
end
|
19
|
+
|
20
|
+
# Save a file to somewhere semi-persistent on the filesystem. To use,
|
21
|
+
# configure <tt>:storage: 'filesystem'</tt> in *config.yml*, as well as
|
22
|
+
# <tt>:local_storage_path:</tt>.
|
23
|
+
def save(local_path, save_path)
  # Resolve the destination inside the local storage root.
  save_path = File.join(@local_storage_path, save_path)
  save_dir = File.dirname(save_path)
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  FileUtils.mkdir_p save_dir unless File.exist? save_dir
  FileUtils.cp(local_path, save_path)
  # The stored file is returned as a file:// URL.
  "file://#{File.expand_path(save_path)}"
end
|
30
|
+
|
31
|
+
# Remove all of a Job's result files from the filesystem.
|
32
|
+
def cleanup(job)
  # Results live under <storage root>/<action>/job_<id>.
  path = "#{@local_storage_path}/#{job.action}/job_#{job.id}"
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  FileUtils.rm_r(path) if File.exist?(path)
end
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module CloudCrowd
|
2
|
+
class AssetStore
|
3
|
+
|
4
|
+
# The S3Store is an implementation of an AssetStore that uses a bucket
|
5
|
+
# on S3 for all resulting files.
|
6
|
+
module S3Store
|
7
|
+
|
8
|
+
# Configure authentication and establish a connection to S3, first thing.
|
9
|
+
def setup
  settings    = CloudCrowd.config
  @use_auth   = settings[:s3_authentication]
  bucket_name = settings[:s3_bucket]
  key         = settings[:aws_access_key]
  secret      = settings[:aws_secret_key]
  # All three settings must be present as strings.
  unless [bucket_name, key, secret].all? {|s| s.is_a? String }
    raise Error::MissingConfiguration, "An S3 account must be configured in 'config.yml' before 's3' storage can be used"
  end
  # Authenticated storage uses https on 443, otherwise plain http on 80.
  protocol, port = @use_auth ? ['https', 443] : ['http', 80]
  @s3 = RightAws::S3.new(key, secret, :protocol => protocol, :port => port)
  @bucket = @s3.bucket(bucket_name)
  # NOTE(review): the second argument presumably asks RightAws to create
  # the bucket when the first lookup returned nothing -- confirm.
  @bucket = @s3.bucket(bucket_name, true) if !@bucket
end
|
21
|
+
|
22
|
+
# Save a finished file from local storage to S3. Save it publicly unless
|
23
|
+
# we're configured to use S3 authentication. Authenticated links expire
|
24
|
+
# after one day by default.
|
25
|
+
def save(local_path, save_path)
  # Private objects when S3 authentication is on, public-read otherwise.
  permissions = @use_auth ? 'private' : 'public-read'
  # Use the block form so the file handle is closed once the upload
  # returns; a bare File.open leaked one descriptor per saved file.
  File.open(local_path) do |file|
    @bucket.put(save_path, file, {}, permissions)
  end
  if @use_auth
    @s3.interface.get_link(@bucket, save_path)
  else
    @bucket.key(save_path).public_link
  end
end
|
34
|
+
|
35
|
+
# Remove all of a Job's resulting files from S3, both intermediate and finished.
|
36
|
+
def cleanup(job)
  # A job's files are stored under the "<action>/job_<id>" folder.
  job_folder = "#{job.action}/job_#{job.id}"
  @bucket.delete_folder(job_folder)
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'tmpdir'
|
2
|
+
|
3
|
+
module CloudCrowd
|
4
|
+
|
5
|
+
# The AssetStore provides a common API for storing files and returning URLs
|
6
|
+
# that can access them. At the moment, the files can be saved to either S3, or
|
7
|
+
# the local filesystem. You shouldn't need to use the AssetStore directly --
|
8
|
+
# Action's +download+ and +save+ methods use it behind the scenes.
|
9
|
+
#
|
10
|
+
# To implement a new back-end for the AssetStore, you must provide
|
11
|
+
# <tt>save(local_path, save_path)</tt>, <tt>cleanup(job)</tt>, and optionally,
|
12
|
+
# a <tt>setup</tt> method that will be called once at initialization.
|
13
|
+
class AssetStore
|
14
|
+
|
15
|
+
autoload :S3Store, 'cloud_crowd/asset_store/s3_store'
|
16
|
+
autoload :FilesystemStore, 'cloud_crowd/asset_store/filesystem_store'
|
17
|
+
|
18
|
+
# Configure the AssetStore with the specific storage implementation
|
19
|
+
# specified by 'storage' in <tt>config.yml</tt>.
|
20
|
+
# Mix in the back end named by :storage; any other value is a
# configuration error raised while this class body is evaluated.
storage = CloudCrowd.config[:storage]
if storage == 's3'
  include S3Store
elsif storage == 'filesystem'
  include FilesystemStore
else
  raise Error::StorageNotFound, "#{storage} is not a valid storage back end"
end
|
25
|
+
|
26
|
+
# Creating the AssetStore ensures that its scratch directory exists.
|
27
|
+
def initialize
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  FileUtils.mkdir_p temp_storage_path unless File.exist? temp_storage_path
  raise Error::StorageNotWritable, "#{temp_storage_path} is not writable" unless File.writable?(temp_storage_path)
  # Back ends may define an optional +setup+ hook; run it when present.
  setup if respond_to? :setup
end
|
32
|
+
|
33
|
+
# Get the path to CloudCrowd's temporary local storage. All actions run
|
34
|
+
# in subdirectories of this.
|
35
|
+
def temp_storage_path
  return @temp_storage_path if @temp_storage_path
  # Prefer the configured path; otherwise use a folder in the system tmpdir.
  configured = CloudCrowd.config[:temp_storage_path]
  @temp_storage_path = configured || "#{Dir.tmpdir}/cloud_crowd_tmp"
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
@@ -0,0 +1,242 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
|
3
|
+
module CloudCrowd
|
4
|
+
class CommandLine
|
5
|
+
|
6
|
+
# Configuration files required for the `crowd` command to function.
|
7
|
+
CONFIG_FILES = ['config.yml', 'database.yml']
|
8
|
+
|
9
|
+
# Reference the absolute path to the root.
|
10
|
+
CC_ROOT = File.expand_path(File.dirname(__FILE__) + '/../..')
|
11
|
+
|
12
|
+
# Command-line banner for the usage message.
|
13
|
+
BANNER = <<-EOS
|
14
|
+
CloudCrowd is a MapReduce-inspired Parallel Processing System for Ruby.
|
15
|
+
|
16
|
+
Wiki: http://wiki.github.com/documentcloud/cloud-crowd
|
17
|
+
Rdoc: http://rdoc.info/projects/documentcloud/cloud-crowd
|
18
|
+
|
19
|
+
Usage: crowd COMMAND OPTIONS
|
20
|
+
|
21
|
+
Commands:
|
22
|
+
install Install the CloudCrowd configuration files to the specified directory
|
23
|
+
server Start up the central server (requires a database)
|
24
|
+
node Start up a worker node (only one node per machine, please)
|
25
|
+
console Launch a CloudCrowd console, connected to the central database
|
26
|
+
load_schema Load the schema into the database specified by database.yml
|
27
|
+
cleanup Removes jobs that were finished over --days (7 by default) ago
|
28
|
+
|
29
|
+
server -d [start | stop | restart] Servers and nodes can be launched as
|
30
|
+
node -d [start | stop | restart] daemons, then stopped or restarted.
|
31
|
+
|
32
|
+
Options:
|
33
|
+
EOS
|
34
|
+
|
35
|
+
# Creating a CloudCrowd::CommandLine runs from the contents of ARGV.
|
36
|
+
def initialize
  parse_options
  # parse_options strips the switches from ARGV, leaving the command and
  # its optional subcommand.
  command, subcommand = ARGV.shift, ARGV.shift
  if command == 'console'
    run_console
  elsif command == 'server'
    run_server(subcommand)
  elsif command == 'node'
    run_node(subcommand)
  elsif command == 'load_schema'
    run_load_schema
  elsif command == 'install'
    run_install(subcommand)
  elsif command == 'cleanup'
    run_cleanup
  else
    # Unknown or missing command: show the help text.
    usage
  end
end
|
50
|
+
|
51
|
+
# Spin up an IRB session with the CloudCrowd code loaded in, and a database
|
52
|
+
# connection established. The equivalent of Rails' `script/console`.
|
53
|
+
def run_console
  %w(irb irb/completion pp).each {|lib| require lib }
  load_code
  connect_to_database(true)
  # Referencing the constant triggers its autoload before IRB starts.
  CloudCrowd::Server
  # Mix CloudCrowd into Object so its constants resolve at the IRB prompt.
  Object.send(:include, CloudCrowd)
  IRB.start
end
|
63
|
+
|
64
|
+
# `crowd server` can either 'start', 'stop', or 'restart'.
|
65
|
+
def run_server(subcommand)
  load_code
  # No subcommand means 'start'.
  case (subcommand || 'start')
  when 'start'   then start_server
  when 'stop'    then stop_server
  when 'restart' then restart_server
  # An unrecognised subcommand used to be ignored silently; print the
  # usage text, as the top-level command dispatch does.
  else usage
  end
end
|
74
|
+
|
75
|
+
# Convenience command for quickly spinning up the central server. More
|
76
|
+
# sophisticated deployments, load-balancing across multiple app servers,
|
77
|
+
# should use the config.ru rackup file directly. This method will start
|
78
|
+
# a single Thin server.
|
79
|
+
def start_server
  port        = @options[:port] || 9173
  log_path    = CloudCrowd.log_path('server.log')
  pid_path    = CloudCrowd.pid_path('server.pid')
  rackup_path = File.expand_path("#{@options[:config_path]}/config.ru")
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  FileUtils.mkdir_p(CloudCrowd.log_path) if @options[:daemonize] && !File.exist?(CloudCrowd.log_path)
  puts "Starting CloudCrowd Central Server on port #{port}..."
  # Pass the command to exec as an argument list, not one interpolated
  # string. That way a config, log or pid path containing spaces or shell
  # metacharacters reaches thin intact and is never interpreted by a shell.
  command = ['thin', '-e', @options[:environment].to_s, '-p', port.to_s]
  command << '-d' if @options[:daemonize]
  command += ['--tag', 'cloud-crowd-server', '--log', log_path.to_s, '--pid', pid_path.to_s, '-R', rackup_path, 'start']
  exec(*command)
end
|
89
|
+
|
90
|
+
# Stop the daemonized central server, if it exists.
|
91
|
+
def stop_server
  # The server's pid file tells Thin which process to kill; 0 is passed
  # as Thin's second argument.
  pid_file = CloudCrowd.pid_path('server.pid')
  Thin::Server.kill(pid_file, 0)
end
|
94
|
+
|
95
|
+
# Restart the daemonized central server.
|
96
|
+
def restart_server
  stop_server
  # Pause one second between stop and start -- presumably to let the old
  # process exit first.
  sleep(1)
  start_server
end
|
101
|
+
|
102
|
+
# `crowd node` can either 'start', 'stop', or 'restart'.
|
103
|
+
def run_node(subcommand)
  load_code
  # Export the chosen environment through RACK_ENV.
  ENV['RACK_ENV'] = @options[:environment]
  # No subcommand means 'start'.
  case (subcommand || 'start')
  when 'start'   then start_node
  when 'stop'    then stop_node
  when 'restart' then restart_node
  # An unrecognised subcommand used to be ignored silently; print the
  # usage text, as the top-level command dispatch does.
  else usage
  end
end
|
112
|
+
|
113
|
+
# Launch a Node. Please only run a single node per machine. The Node process
|
114
|
+
# will be long-lived, although its workers will come and go.
|
115
|
+
def start_node
  # Use the --port value when given, otherwise the Node default.
  node_port = @options[:port]
  node_port = Node::DEFAULT_PORT unless node_port
  puts "Starting CloudCrowd Node on port #{node_port}..."
  Node.new(node_port, @options[:daemonize])
end
|
120
|
+
|
121
|
+
# If the daemonized Node is running, stop it.
|
122
|
+
def stop_node
  # The node's pid file tells Thin which process to kill.
  pid_file = CloudCrowd.pid_path('node.pid')
  Thin::Server.kill(pid_file)
end
|
125
|
+
|
126
|
+
# Restart the daemonized Node, if it exists.
|
127
|
+
def restart_node
  stop_node
  # Pause one second between stop and start -- presumably to let the old
  # process exit first.
  sleep(1)
  start_node
end
|
132
|
+
|
133
|
+
# Load in the database schema to the database specified in 'database.yml'.
|
134
|
+
def run_load_schema
  load_code
  # Connect without schema validation.
  connect_to_database(false)
  # The schema file is loaded with a plain require.
  require('cloud_crowd/schema.rb')
end
|
139
|
+
|
140
|
+
# Install the required CloudCrowd configuration files into the specified
|
141
|
+
# directory, or the current one.
|
142
|
+
def run_install(install_path)
  require 'fileutils'
  # Default to the current directory when no path was given.
  install_path ||= '.'
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  FileUtils.mkdir_p install_path unless File.exist?(install_path)
  # The example files are installed under their live names.
  install_file "#{CC_ROOT}/config/config.example.yml", "#{install_path}/config.yml"
  install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
  install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
  install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
end
|
151
|
+
|
152
|
+
# Clean up all Jobs in the CloudCrowd database older than --days old.
|
153
|
+
def run_cleanup
  load_code
  connect_to_database(true)
  # --days is passed through as given (nil when the flag was absent).
  grace_period = @options[:days]
  Job.cleanup_all(:days => grace_period)
end
|
158
|
+
|
159
|
+
# Print `crowd` usage.
|
160
|
+
def usage
  # The option parser renders the banner plus the option summary.
  help_text = @option_parser.to_s
  puts "\n" + help_text + "\n"
end
|
163
|
+
|
164
|
+
|
165
|
+
private
|
166
|
+
|
167
|
+
# Check for configuration files, either in the current directory, or in
|
168
|
+
# the CLOUD_CROWD_CONFIG environment variable. Exit if they're not found.
|
169
|
+
def ensure_config
  return if @config_found
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  found = CONFIG_FILES.all? {|f| File.exist? "#{@options[:config_path]}/#{f}" }
  # Record success in @config_found, the variable the guard above reads.
  # The old code set @config_dir, so the check was repeated on every call.
  found ? @config_found = true : config_not_found
end
|
174
|
+
|
175
|
+
# Parse all options for all commands.
|
176
|
+
# Valid options are: --config --port --environment --daemonize --days.
|
177
|
+
def parse_options
  # Defaults; the config directory falls back to CLOUD_CROWD_CONFIG or '.'.
  @options = {
    :environment => 'production',
    :config_path => ENV['CLOUD_CROWD_CONFIG'] || '.',
    :daemonize   => false
  }
  parser = OptionParser.new
  parser.on('-c', '--config PATH', 'path to configuration directory') {|path| @options[:config_path] = path }
  parser.on('-p', '--port PORT', 'port number for server (central or node)') {|number| @options[:port] = number }
  parser.on('-e', '--environment ENV', 'server environment (defaults to production)') {|name| @options[:environment] = name }
  parser.on('-d', '--daemonize', 'run as a background daemon') {|flag| @options[:daemonize] = flag }
  parser.on('--days NUM_DAYS', 'grace period before cleanup (7 by default)') do |count|
    # Non-numeric values are ignored and :days stays unset.
    @options[:days] = count.to_i if count.match(/\A\d+\Z/)
  end
  parser.on_tail('-v', '--version', 'show version') do
    require "#{CC_ROOT}/lib/cloud-crowd"
    puts "CloudCrowd version #{VERSION}"
    exit
  end
  parser.banner = BANNER
  @option_parser = parser
  # parse! removes the recognised switches from ARGV, leaving the command.
  @option_parser.parse!(ARGV)
end
|
208
|
+
|
209
|
+
# Load in the CloudCrowd module code, dependencies, lib files and models.
|
210
|
+
# Not all commands require this.
|
211
|
+
def load_code
  # Bail out early (via ensure_config) when the config files are missing.
  ensure_config
  require "#{CC_ROOT}/lib/cloud-crowd"
  settings_file = "#{@options[:config_path]}/config.yml"
  CloudCrowd.configure(settings_file)
end
|
216
|
+
|
217
|
+
# Establish a connection to the central server's database. Not all commands
|
218
|
+
# require this.
|
219
|
+
def connect_to_database(validate_schema)
  require 'cloud_crowd/models'
  # database.yml lives in the same directory as config.yml.
  database_file = "#{@options[:config_path]}/database.yml"
  CloudCrowd.configure_database(database_file, validate_schema)
end
|
223
|
+
|
224
|
+
# Exit with an explanation if the configuration files couldn't be found.
|
225
|
+
def config_not_found
  message = "`crowd` can't find the CloudCrowd configuration directory. Please use `crowd -c path/to/config`, or run `crowd` from inside of the configuration directory itself."
  puts message
  # Non-zero status signals the failure to the calling shell.
  exit(1)
end
|
229
|
+
|
230
|
+
# Install a file and log the installation. If we're overwriting a file,
|
231
|
+
# offer a chance to back out.
|
232
|
+
def install_file(source, dest, is_dir=false)
  # File.exist? replaces the File.exists? alias, which Ruby 3.2 removed.
  if File.exist?(dest)
    print "#{dest} already exists. Overwrite it? (yes/no) "
    # Read from $stdin explicitly: a bare `gets` reads from ARGF, which
    # opens leftover command-line arguments as files. EOF (nil) counts as
    # "no" instead of raising NoMethodError.
    answer = $stdin.gets.to_s.chomp.downcase
    return unless ['y', 'yes', 'ok'].include?(answer)
  end
  if is_dir
    if File.directory?(dest)
      # cp_r into an existing directory would nest a second copy inside it
      # (dest/actions/actions); copying "source/." merges the contents.
      FileUtils.cp_r(File.join(source, '.'), dest)
    else
      FileUtils.cp_r(source, dest)
    end
  else
    FileUtils.cp(source, dest)
  end
  puts "installed #{dest}" unless ENV['RACK_ENV'] == 'test'
end
|
240
|
+
|
241
|
+
end
|
242
|
+
end
|