documentcloud-cloud-crowd 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README +59 -50
- data/actions/process_pdfs.rb +3 -3
- data/actions/word_count.rb +14 -0
- data/cloud-crowd.gemspec +27 -13
- data/config/config.example.yml +8 -11
- data/examples/graphics_magick_example.rb +40 -44
- data/examples/process_pdfs_example.rb +39 -29
- data/examples/word_count_example.rb +41 -0
- data/lib/cloud-crowd.rb +20 -17
- data/lib/cloud_crowd/action.rb +26 -9
- data/lib/cloud_crowd/app.rb +26 -4
- data/lib/cloud_crowd/asset_store.rb +69 -40
- data/lib/cloud_crowd/command_line.rb +6 -4
- data/lib/cloud_crowd/daemon.rb +65 -25
- data/lib/cloud_crowd/exceptions.rb +5 -0
- data/lib/cloud_crowd/helpers/resources.rb +2 -2
- data/lib/cloud_crowd/models/job.rb +9 -13
- data/lib/cloud_crowd/models/work_unit.rb +23 -15
- data/lib/cloud_crowd/models/worker_record.rb +61 -0
- data/lib/cloud_crowd/models.rb +7 -1
- data/lib/cloud_crowd/schema.rb +12 -3
- data/lib/cloud_crowd/worker.rb +48 -10
- data/public/css/admin_console.css +174 -4
- data/public/css/reset.css +17 -27
- data/public/images/bullet_green.png +0 -0
- data/public/images/bullet_white.png +0 -0
- data/public/images/cloud_hand.png +0 -0
- data/public/images/header_back.png +0 -0
- data/public/images/logo.png +0 -0
- data/public/images/server_error.png +0 -0
- data/public/images/sidebar_bottom.png +0 -0
- data/public/images/sidebar_top.png +0 -0
- data/public/images/worker_info.png +0 -0
- data/public/images/worker_info_loading.gif +0 -0
- data/public/js/admin_console.js +127 -10
- data/public/js/excanvas.pack.js +1 -0
- data/public/js/jquery-1.3.2.min.js +19 -0
- data/public/js/jquery.flot.pack.js +1 -0
- data/test/acceptance/test_word_count.rb +49 -0
- data/test/blueprints.rb +6 -5
- data/test/config/config.yml +1 -4
- data/test/test_helper.rb +1 -0
- data/test/unit/test_job.rb +12 -4
- data/test/unit/test_work_unit.rb +2 -2
- data/views/index.erb +69 -14
- metadata +23 -6
- data/public/js/jquery-1.3.2.js +0 -4376
data/README
CHANGED
@@ -22,54 +22,63 @@
|
|
22
22
|
|
23
23
|
|
24
24
|
|
25
|
-
|
25
|
+
~ CloudCrowd ~
|
26
26
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
27
|
+
* Parallel processing for the rest of us
|
28
|
+
* Write your scripts in Ruby
|
29
|
+
* Built for Amazon EC2 and S3
|
30
|
+
* split -> process -> merge
|
31
|
+
* As easy as `gem install cloud-crowd`
|
32
|
+
|
33
|
+
|
34
|
+
~ Wiki ~
|
35
|
+
|
36
|
+
http://wiki.github.com/documentcloud/cloud-crowd
|
37
|
+
|
38
|
+
|
39
|
+
~ Getting started ~
|
40
|
+
|
41
|
+
# Install the gem.
|
42
|
+
|
43
|
+
>> sudo gem install cloud-crowd
|
44
|
+
|
45
|
+
# Install the CloudCrowd configuration files to a location of your choosing.
|
46
|
+
|
47
|
+
>> crowd install ~/config/cloud-crowd
|
48
|
+
|
49
|
+
# Now, you can use the full complement of `crowd` commands from inside of
|
50
|
+
# this configuration directory. To see the available commands:
|
51
|
+
|
52
|
+
>> crowd --help
|
53
|
+
|
54
|
+
# Edit the configuration files to your satisfaction, add AWS credentials,
|
55
|
+
# and then load the CloudCrowd schema into your configured database.
|
56
|
+
|
57
|
+
>> mate ~/config/cloud-crowd/config.yml
|
58
|
+
>> mate ~/config/cloud-crowd/database.yml
|
59
|
+
>> crowd load_schema
|
60
|
+
|
61
|
+
# Write your actions, and install them into the 'actions' subdirectory.
|
62
|
+
# CloudCrowd comes with some default actions as an example.
|
63
|
+
|
64
|
+
# To launch the central server (make sure that you include its location
|
65
|
+
# in config.yml), either:
|
66
|
+
|
67
|
+
>> crowd server
|
68
|
+
|
69
|
+
# or:
|
70
|
+
|
71
|
+
>> thin -R config.ru --servers 3 -e production start
|
72
|
+
|
73
|
+
# Any server that supports Rack should work with the rackup file.
|
74
|
+
|
75
|
+
# Then, to spin up 10 workers:
|
76
|
+
|
77
|
+
>> crowd workers start -n 10
|
78
|
+
|
79
|
+
# To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
|
80
|
+
# your configuration directory.
|
81
|
+
|
82
|
+
# At this point you can visit your server console at localhost:9173 to
|
83
|
+
# view all of your workers, ready for action.
|
84
|
+
|
data/actions/process_pdfs.rb
CHANGED
@@ -6,8 +6,8 @@
|
|
6
6
|
# See <tt>examples/process_pdfs_example.rb</tt> for more information.
|
7
7
|
class ProcessPdfs < CloudCrowd::Action
|
8
8
|
|
9
|
-
# Split up a large pdf into single-page pdfs.
|
10
|
-
# The double pdftk shuffle fixes the document xrefs.
|
9
|
+
# Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
|
10
|
+
# chunks for processing. The double pdftk shuffle fixes the document xrefs.
|
11
11
|
def split
|
12
12
|
`pdftk #{input_path} burst output "#{file_name}_%05d.pdf_temp"`
|
13
13
|
FileUtils.rm input_path
|
@@ -41,7 +41,7 @@ class ProcessPdfs < CloudCrowd::Action
|
|
41
41
|
# the concatenated merge of the full-text into a single tar archive, ready
|
42
42
|
# for download.
|
43
43
|
def merge
|
44
|
-
|
44
|
+
input.each do |batch_url|
|
45
45
|
batch_path = File.basename(batch_url)
|
46
46
|
download(batch_url, batch_path)
|
47
47
|
`tar -xzf #{batch_path}`
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# A parallel WordCount. Depends on the 'wc' utility.
|
2
|
+
class WordCount < CloudCrowd::Action
|
3
|
+
|
4
|
+
# Count the words in a single book.
|
5
|
+
def process
|
6
|
+
(`wc -w #{input_path}`).match(/\A\s*(\d+)/)[1].to_i
|
7
|
+
end
|
8
|
+
|
9
|
+
# Sum the total word count.
|
10
|
+
def merge
|
11
|
+
input.inject(0) {|sum, count| sum + count }
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
data/cloud-crowd.gemspec
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'cloud-crowd'
|
3
|
-
s.version = '0.0.5'
|
3
|
+
s.version = '0.0.6' # Keep version in sync with cloud-crowd.rb
|
4
4
|
s.date = '2009-09-01'
|
5
5
|
|
6
|
-
s.homepage = "http://documentcloud
|
7
|
-
s.summary = "
|
6
|
+
s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
|
7
|
+
s.summary = "Parallel Processing for the Rest of Us"
|
8
8
|
s.description = <<-EOS
|
9
9
|
The crowd, suddenly there where there was nothing before, is a mysterious and
|
10
10
|
universal phenomenon. A few people may have been standing together -- five, ten
|
@@ -13,18 +13,16 @@ Gem::Specification.new do |s|
|
|
13
13
|
streets had only one direction.
|
14
14
|
EOS
|
15
15
|
|
16
|
-
s.authors
|
17
|
-
s.email
|
18
|
-
s.rubyforge_project
|
19
|
-
|
20
|
-
s.require_paths = ['lib']
|
21
|
-
s.executables = ['crowd']
|
22
|
-
|
23
|
-
# s.post_install_message = "Run `crowd --help` for information on using CloudCrowd."
|
16
|
+
s.authors = ['Jeremy Ashkenas']
|
17
|
+
s.email = 'jeremy@documentcloud.org'
|
18
|
+
s.rubyforge_project = 'cloud-crowd'
|
24
19
|
|
20
|
+
s.require_paths = ['lib']
|
21
|
+
s.executables = ['crowd']
|
22
|
+
|
25
23
|
s.has_rdoc = true
|
26
24
|
s.extra_rdoc_files = ['README']
|
27
|
-
s.rdoc_options << '--title' << 'CloudCrowd |
|
25
|
+
s.rdoc_options << '--title' << 'CloudCrowd | Parallel Processing for the Rest of Us' <<
|
28
26
|
'--exclude' << 'test' <<
|
29
27
|
'--main' << 'README' <<
|
30
28
|
'--all'
|
@@ -47,6 +45,7 @@ Gem::Specification.new do |s|
|
|
47
45
|
s.files = %w(
|
48
46
|
actions/graphics_magick.rb
|
49
47
|
actions/process_pdfs.rb
|
48
|
+
actions/word_count.rb
|
50
49
|
cloud-crowd.gemspec
|
51
50
|
config/config.example.ru
|
52
51
|
config/config.example.yml
|
@@ -54,6 +53,7 @@ config/database.example.yml
|
|
54
53
|
EPIGRAPHS
|
55
54
|
examples/graphics_magick_example.rb
|
56
55
|
examples/process_pdfs_example.rb
|
56
|
+
examples/word_count_example.rb
|
57
57
|
lib/cloud-crowd.rb
|
58
58
|
lib/cloud_crowd/action.rb
|
59
59
|
lib/cloud_crowd/app.rb
|
@@ -67,6 +67,7 @@ lib/cloud_crowd/helpers.rb
|
|
67
67
|
lib/cloud_crowd/inflector.rb
|
68
68
|
lib/cloud_crowd/models/job.rb
|
69
69
|
lib/cloud_crowd/models/work_unit.rb
|
70
|
+
lib/cloud_crowd/models/worker_record.rb
|
70
71
|
lib/cloud_crowd/models.rb
|
71
72
|
lib/cloud_crowd/runner.rb
|
72
73
|
lib/cloud_crowd/schema.rb
|
@@ -74,11 +75,24 @@ lib/cloud_crowd/worker.rb
|
|
74
75
|
LICENSE
|
75
76
|
public/css/admin_console.css
|
76
77
|
public/css/reset.css
|
78
|
+
public/images/bullet_green.png
|
79
|
+
public/images/bullet_white.png
|
80
|
+
public/images/cloud_hand.png
|
81
|
+
public/images/header_back.png
|
82
|
+
public/images/logo.png
|
77
83
|
public/images/queue_fill.png
|
84
|
+
public/images/server_error.png
|
85
|
+
public/images/sidebar_bottom.png
|
86
|
+
public/images/sidebar_top.png
|
87
|
+
public/images/worker_info.png
|
88
|
+
public/images/worker_info_loading.gif
|
78
89
|
public/js/admin_console.js
|
79
|
-
public/js/
|
90
|
+
public/js/excanvas.pack.js
|
91
|
+
public/js/jquery.flot.pack.js
|
92
|
+
public/js/jquery-1.3.2.min.js
|
80
93
|
README
|
81
94
|
test/acceptance/test_failing_work_units.rb
|
95
|
+
test/acceptance/test_word_count.rb
|
82
96
|
test/blueprints.rb
|
83
97
|
test/config/config.ru
|
84
98
|
test/config/config.yml
|
data/config/config.example.yml
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
-
# The URL where you're planning on running the server/queue/database.
|
1
|
+
# The URL where you're planning on running the central server/queue/database.
|
2
2
|
:central_server: http://localhost:9173
|
3
3
|
|
4
|
+
# The storage back-end that you'd like to use for intermediate and final results
|
5
|
+
# of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
|
6
|
+
# be used in development, or on single-machine installations.
|
7
|
+
:storage: s3
|
8
|
+
|
4
9
|
# Please provide your AWS credentials for S3 storage of job output.
|
5
10
|
:aws_access_key: [your AWS access key]
|
6
11
|
:aws_secret_key: [your AWS secret access key]
|
@@ -20,8 +25,8 @@
|
|
20
25
|
:password: [your password]
|
21
26
|
|
22
27
|
# By default, CloudCrowd looks for installed actions inside the 'actions'
|
23
|
-
# subdirectory of this configuration folder. 'actions_path' allows you to
|
24
|
-
#
|
28
|
+
# subdirectory of this configuration folder. 'actions_path' allows you to load
|
29
|
+
# additional actions from a location of your choice.
|
25
30
|
# :actions_path: /path/to/actions
|
26
31
|
|
27
32
|
# Set the following numbers to tweak the configuration of your worker daemons.
|
@@ -38,14 +43,6 @@
|
|
38
43
|
# The maximum number of seconds a worker waits between checking the job queue.
|
39
44
|
:max_worker_wait: 20
|
40
45
|
|
41
|
-
# The backoff multiplier the worker uses to slow down the check interval when
|
42
|
-
# there's no work in the queue.
|
43
|
-
:worker_wait_multiplier: 1.3
|
44
|
-
|
45
|
-
# The number of seconds a worker waits to retry when there's some kind of
|
46
|
-
# internal error (ie. the central server fails to respond)
|
47
|
-
:worker_retry_wait: 5
|
48
|
-
|
49
46
|
# The number of separate attempts that will be made to process an individual
|
50
47
|
# work unit, before marking it as having failed.
|
51
48
|
:work_unit_retries: 3
|
@@ -1,48 +1,44 @@
|
|
1
|
-
|
2
|
-
# This is a fancy example that produces black and white, annotated, and blurred
|
3
|
-
# versions of a list of URLs downloaded from the web.
|
1
|
+
#!/usr/bin/env ruby -rubygems
|
4
2
|
|
3
|
+
require 'restclient'
|
5
4
|
require 'json'
|
6
5
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
'action' => 'graphics_magick',
|
12
|
-
|
13
|
-
'inputs' => [
|
14
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2008/10/dan_mcpharlin_the_land_of_sleeping_things.jpg',
|
15
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread01.jpg',
|
16
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread03.jpg',
|
17
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread02.jpg',
|
18
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/02/dan_mcpharlin_untitled.jpg'
|
19
|
-
],
|
20
|
-
|
21
|
-
'options' => {
|
22
|
-
'steps' => [{
|
23
|
-
'name' => 'annotated',
|
24
|
-
'command' => 'convert',
|
25
|
-
'options' => '-font helvetica -fill red -draw "font-size 35; text 75,75 CloudCrowd!"',
|
26
|
-
'extension' => 'jpg'
|
27
|
-
},{
|
28
|
-
'name' => 'blurred',
|
29
|
-
'command' => 'convert',
|
30
|
-
'options' => '-blur 10x5',
|
31
|
-
'extension' => 'png'
|
32
|
-
},{
|
33
|
-
'name' => 'bw',
|
34
|
-
'input' => 'blurred',
|
35
|
-
'command' => 'convert',
|
36
|
-
'options' => '-monochrome',
|
37
|
-
'extension' => 'jpg'
|
38
|
-
}]
|
39
|
-
}
|
40
|
-
|
41
|
-
}.to_json}
|
42
|
-
)
|
6
|
+
# This example demonstrates the GraphicsMagick action by taking in a list of
|
7
|
+
# five images, and producing annotated, blurred, and black and white versions
|
8
|
+
# of each image. See actions/graphics_magick.rb
|
43
9
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
10
|
+
RestClient.post('http://localhost:9173/jobs',
|
11
|
+
{:job => {
|
12
|
+
|
13
|
+
'action' => 'graphics_magick',
|
14
|
+
|
15
|
+
'inputs' => [
|
16
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2008/10/dan_mcpharlin_the_land_of_sleeping_things.jpg',
|
17
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread01.jpg',
|
18
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread03.jpg',
|
19
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread02.jpg',
|
20
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/02/dan_mcpharlin_untitled.jpg'
|
21
|
+
],
|
22
|
+
|
23
|
+
'options' => {
|
24
|
+
'steps' => [{
|
25
|
+
'name' => 'annotated',
|
26
|
+
'command' => 'convert',
|
27
|
+
'options' => '-font helvetica -fill red -draw "font-size 35; text 75,75 CloudCrowd!"',
|
28
|
+
'extension' => 'jpg'
|
29
|
+
},{
|
30
|
+
'name' => 'blurred',
|
31
|
+
'command' => 'convert',
|
32
|
+
'options' => '-blur 10x5',
|
33
|
+
'extension' => 'png'
|
34
|
+
},{
|
35
|
+
'name' => 'bw',
|
36
|
+
'input' => 'blurred',
|
37
|
+
'command' => 'convert',
|
38
|
+
'options' => '-monochrome',
|
39
|
+
'extension' => 'jpg'
|
40
|
+
}]
|
41
|
+
}
|
42
|
+
|
43
|
+
}.to_json}
|
44
|
+
)
|
@@ -1,30 +1,40 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
1
|
+
#!/usr/bin/env ruby -rubygems
|
2
|
+
|
3
|
+
require 'restclient'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
# This example demonstrates a fairly complicated PDF-processing action, designed
|
7
|
+
# to extract the PDF's text, and produce GIF versions of each page. The action
|
8
|
+
# (actions/process_pdfs.rb) shows an example of using all three steps,
|
9
|
+
# split, process, and merge.
|
10
|
+
|
11
|
+
RestClient.post('http://localhost:9173/jobs',
|
12
|
+
{:job => {
|
13
|
+
|
14
|
+
'action' => 'process_pdfs',
|
15
|
+
|
16
|
+
'inputs' => [
|
17
|
+
'http://tigger.uic.edu/~victor/personal/futurism.pdf',
|
18
|
+
'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
|
19
|
+
'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
|
20
|
+
'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
|
21
|
+
],
|
22
|
+
|
23
|
+
'options' => {
|
24
|
+
|
25
|
+
'batch_size' => 7,
|
26
|
+
|
27
|
+
'images' => [{
|
28
|
+
'name' => '700',
|
29
|
+
'options' => '-resize 700x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
|
30
|
+
'extension' => 'gif'
|
31
|
+
},{
|
32
|
+
'name' => '1000',
|
33
|
+
'options' => '-resize 1000x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
|
34
|
+
'extension' => 'gif'
|
35
|
+
}]
|
36
|
+
|
37
|
+
}
|
38
|
+
|
39
|
+
}.to_json}
|
30
40
|
)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby -rubygems
|
2
|
+
|
3
|
+
require 'restclient'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
# Let's count all the words in Shakespeare.
|
7
|
+
|
8
|
+
RestClient.post('http://localhost:9173/jobs',
|
9
|
+
{:job => {
|
10
|
+
|
11
|
+
'action' => 'word_count',
|
12
|
+
|
13
|
+
'inputs' => [
|
14
|
+
'http://www.gutenberg.org/dirs/etext97/1ws3010.txt', # All's Well That Ends Well
|
15
|
+
'http://www.gutenberg.org/dirs/etext99/1ws3511.txt', # Antony and Cleopatra
|
16
|
+
'http://www.gutenberg.org/dirs/etext97/1ws2510.txt', # As You Like It
|
17
|
+
'http://www.gutenberg.org/dirs/etext97/1ws0610.txt', # The Comedy of Errors
|
18
|
+
'http://www.gutenberg.org/dirs/etext99/1ws3911.txt', # Cymbeline
|
19
|
+
'http://www.gutenberg.org/dirs/etext00/0ws2610.txt', # Hamlet
|
20
|
+
'http://www.gutenberg.org/dirs/etext00/0ws1910.txt', # Henry IV
|
21
|
+
'http://www.gutenberg.org/dirs/etext99/1ws2411.txt', # Julius Caesar
|
22
|
+
'http://www.gutenberg.org/dirs/etext98/2ws3310.txt', # King Lear
|
23
|
+
'http://www.gutenberg.org/dirs/etext99/1ws1211j.txt', # Love's Labour's Lost
|
24
|
+
'http://www.gutenberg.org/dirs/etext98/2ws3410.txt', # Macbeth
|
25
|
+
'http://www.gutenberg.org/dirs/etext98/2ws1810.txt', # The Merchant of Venice
|
26
|
+
'http://www.gutenberg.org/dirs/etext99/1ws1711.txt', # Midsummer Night's Dream
|
27
|
+
'http://www.gutenberg.org/dirs/etext98/3ws2210.txt', # Much Ado About Nothing
|
28
|
+
'http://www.gutenberg.org/dirs/etext00/0ws3210.txt', # Othello
|
29
|
+
'http://www.gutenberg.org/dirs/etext98/2ws1610.txt', # Romeo and Juliet
|
30
|
+
'http://www.gutenberg.org/dirs/etext98/2ws1010.txt', # The Taming of the Shrew
|
31
|
+
'http://www.gutenberg.org/dirs/etext99/1ws4111.txt', # The Tempest
|
32
|
+
'http://www.gutenberg.org/dirs/etext00/0ws0910.txt', # Titus Andronicus
|
33
|
+
'http://www.gutenberg.org/dirs/etext99/1ws2911.txt', # Troilus and Cressida
|
34
|
+
'http://www.gutenberg.org/dirs/etext98/3ws2810.txt', # Twelfth Night
|
35
|
+
'http://www.gutenberg.org/files/1539/1539.txt' # The Winter's Tale
|
36
|
+
]
|
37
|
+
|
38
|
+
}.to_json}
|
39
|
+
)
|
40
|
+
|
41
|
+
# With 23 Workers running, and over Wifi, it counted all the words in 5.5 secs.
|
data/lib/cloud-crowd.rb
CHANGED
@@ -19,28 +19,33 @@ autoload :Digest, 'digest'
|
|
19
19
|
autoload :ERB, 'erb'
|
20
20
|
autoload :FileUtils, 'fileutils'
|
21
21
|
autoload :JSON, 'json'
|
22
|
-
autoload :RestClient, '
|
22
|
+
autoload :RestClient, 'restclient'
|
23
23
|
autoload :RightAws, 'right_aws'
|
24
24
|
autoload :Sinatra, 'sinatra'
|
25
25
|
autoload :Socket, 'socket'
|
26
26
|
autoload :YAML, 'yaml'
|
27
27
|
|
28
|
+
# Common code which should really be required in every circumstance.
|
29
|
+
require 'cloud_crowd/exceptions'
|
30
|
+
|
28
31
|
module CloudCrowd
|
29
32
|
|
30
33
|
# Autoload all the CloudCrowd classes which may not be required.
|
31
|
-
autoload :App,
|
32
|
-
autoload :Action,
|
33
|
-
autoload :AssetStore,
|
34
|
-
autoload :Helpers,
|
35
|
-
autoload :Inflector,
|
36
|
-
autoload :Job,
|
37
|
-
autoload :
|
34
|
+
autoload :App, 'cloud_crowd/app'
|
35
|
+
autoload :Action, 'cloud_crowd/action'
|
36
|
+
autoload :AssetStore, 'cloud_crowd/asset_store'
|
37
|
+
autoload :Helpers, 'cloud_crowd/helpers'
|
38
|
+
autoload :Inflector, 'cloud_crowd/inflector'
|
39
|
+
autoload :Job, 'cloud_crowd/models'
|
40
|
+
autoload :Worker, 'cloud_crowd/worker'
|
41
|
+
autoload :WorkUnit, 'cloud_crowd/models'
|
42
|
+
autoload :WorkerRecord, 'cloud_crowd/models'
|
38
43
|
|
39
44
|
# Root directory of the CloudCrowd gem.
|
40
45
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
41
46
|
|
42
47
|
# Keep the version in sync with the gemspec.
|
43
|
-
VERSION = '0.0.5'
|
48
|
+
VERSION = '0.0.6'
|
44
49
|
|
45
50
|
# A Job is processing if its WorkUnits are in the queue to be handled by workers.
|
46
51
|
PROCESSING = 1
|
@@ -68,9 +73,7 @@ module CloudCrowd
|
|
68
73
|
INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
|
69
74
|
|
70
75
|
# Mapping of statuses to their display strings.
|
71
|
-
DISPLAY_STATUS_MAP = {
|
72
|
-
1 => 'processing', 2 => 'succeeded', 3 => 'failed', 4 => 'splitting', 5 => 'merging'
|
73
|
-
}
|
76
|
+
DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
|
74
77
|
|
75
78
|
class << self
|
76
79
|
attr_reader :config
|
@@ -101,7 +104,7 @@ module CloudCrowd
|
|
101
104
|
# Return the displayable status name of an internal CloudCrowd status number.
|
102
105
|
# (See the above constants).
|
103
106
|
def display_status(status)
|
104
|
-
DISPLAY_STATUS_MAP[status]
|
107
|
+
DISPLAY_STATUS_MAP[status] || 'unknown'
|
105
108
|
end
|
106
109
|
|
107
110
|
# CloudCrowd::Actions are requested dynamically by name. Access them through
|
@@ -112,10 +115,10 @@ module CloudCrowd
|
|
112
115
|
def actions
|
113
116
|
return @actions if @actions
|
114
117
|
@actions = {}
|
115
|
-
default_actions
|
116
|
-
|
117
|
-
|
118
|
-
(default_actions + custom_actions).each do |path|
|
118
|
+
default_actions = Dir["#{ROOT}/actions/*.rb"]
|
119
|
+
installed_actions = Dir["#{@config_path}/actions/*.rb"]
|
120
|
+
custom_actions = Dir["#{CloudCrowd.config[:actions_path]}/*.rb"]
|
121
|
+
(default_actions + installed_actions + custom_actions).each do |path|
|
119
122
|
name = File.basename(path, File.extname(path))
|
120
123
|
require path
|
121
124
|
@actions[name] = Module.const_get(Inflector.camelize(name))
|
data/lib/cloud_crowd/action.rb
CHANGED
@@ -2,7 +2,7 @@ module CloudCrowd
|
|
2
2
|
|
3
3
|
# As you write your custom actions, have them inherit from CloudCrowd::Action.
|
4
4
|
# All actions must implement a +process+ method, which should return a
|
5
|
-
# JSON-
|
5
|
+
# JSON-serializable object that will be used as the output for the work unit.
|
6
6
|
# See the default actions for examples.
|
7
7
|
#
|
8
8
|
# Optionally, actions may define +split+ and +merge+ methods to do mapping
|
@@ -14,6 +14,8 @@ module CloudCrowd
|
|
14
14
|
# and spend their duration inside of it, so relative paths work well.
|
15
15
|
class Action
|
16
16
|
|
17
|
+
FILE_URL = /\Afile:\/\//
|
18
|
+
|
17
19
|
attr_reader :input, :input_path, :file_name, :options, :work_directory
|
18
20
|
|
19
21
|
# Initializing an Action sets up all of the read-only variables that
|
@@ -27,11 +29,7 @@ module CloudCrowd
|
|
27
29
|
@work_directory = File.expand_path(File.join(@store.temp_storage_path, storage_prefix))
|
28
30
|
FileUtils.mkdir_p(@work_directory) unless File.exists?(@work_directory)
|
29
31
|
Dir.chdir @work_directory
|
30
|
-
|
31
|
-
@input_path = File.join(@work_directory, safe_filename(@input))
|
32
|
-
@file_name = File.basename(@input_path, File.extname(@input_path))
|
33
|
-
download(@input, @input_path)
|
34
|
-
end
|
32
|
+
status == MERGING ? parse_input : download_input
|
35
33
|
end
|
36
34
|
|
37
35
|
# Each Action subclass must implement a +process+ method, overriding this.
|
@@ -39,9 +37,14 @@ module CloudCrowd
|
|
39
37
|
raise NotImplementedError.new("CloudCrowd::Actions must override 'process' with their own processing code.")
|
40
38
|
end
|
41
39
|
|
42
|
-
# Download a file to the specified path
|
40
|
+
# Download a file to the specified path.
|
43
41
|
def download(url, path)
|
44
|
-
|
42
|
+
if url.match(FILE_URL)
|
43
|
+
FileUtils.cp(url.sub(FILE_URL, ''), path)
|
44
|
+
else
|
45
|
+
resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
|
46
|
+
FileUtils.mv resp.file.path, path
|
47
|
+
end
|
45
48
|
path
|
46
49
|
end
|
47
50
|
|
@@ -57,7 +60,7 @@ module CloudCrowd
|
|
57
60
|
# to the root directory (where daemons run by default).
|
58
61
|
def cleanup_work_directory
|
59
62
|
Dir.chdir '/'
|
60
|
-
FileUtils.rm_r(@work_directory)
|
63
|
+
FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
|
61
64
|
end
|
62
65
|
|
63
66
|
|
@@ -80,6 +83,20 @@ module CloudCrowd
|
|
80
83
|
@storage_prefix ||= File.join(path_parts)
|
81
84
|
end
|
82
85
|
|
86
|
+
# If we know that the input is JSON, replace it with the parsed form.
|
87
|
+
def parse_input
|
88
|
+
@input = JSON.parse(@input)
|
89
|
+
end
|
90
|
+
|
91
|
+
# If the input is a URL, download the file before beginning processing.
|
92
|
+
def download_input
|
93
|
+
input_is_url = !!URI.parse(@input) rescue false
|
94
|
+
return unless input_is_url
|
95
|
+
@input_path = File.join(@work_directory, safe_filename(@input))
|
96
|
+
@file_name = File.basename(@input_path, File.extname(@input_path))
|
97
|
+
download(@input, @input_path)
|
98
|
+
end
|
99
|
+
|
83
100
|
end
|
84
101
|
|
85
102
|
end
|