documentcloud-cloud-crowd 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +59 -50
- data/actions/process_pdfs.rb +3 -3
- data/actions/word_count.rb +14 -0
- data/cloud-crowd.gemspec +27 -13
- data/config/config.example.yml +8 -11
- data/examples/graphics_magick_example.rb +40 -44
- data/examples/process_pdfs_example.rb +39 -29
- data/examples/word_count_example.rb +41 -0
- data/lib/cloud-crowd.rb +20 -17
- data/lib/cloud_crowd/action.rb +26 -9
- data/lib/cloud_crowd/app.rb +26 -4
- data/lib/cloud_crowd/asset_store.rb +69 -40
- data/lib/cloud_crowd/command_line.rb +6 -4
- data/lib/cloud_crowd/daemon.rb +65 -25
- data/lib/cloud_crowd/exceptions.rb +5 -0
- data/lib/cloud_crowd/helpers/resources.rb +2 -2
- data/lib/cloud_crowd/models/job.rb +9 -13
- data/lib/cloud_crowd/models/work_unit.rb +23 -15
- data/lib/cloud_crowd/models/worker_record.rb +61 -0
- data/lib/cloud_crowd/models.rb +7 -1
- data/lib/cloud_crowd/schema.rb +12 -3
- data/lib/cloud_crowd/worker.rb +48 -10
- data/public/css/admin_console.css +174 -4
- data/public/css/reset.css +17 -27
- data/public/images/bullet_green.png +0 -0
- data/public/images/bullet_white.png +0 -0
- data/public/images/cloud_hand.png +0 -0
- data/public/images/header_back.png +0 -0
- data/public/images/logo.png +0 -0
- data/public/images/server_error.png +0 -0
- data/public/images/sidebar_bottom.png +0 -0
- data/public/images/sidebar_top.png +0 -0
- data/public/images/worker_info.png +0 -0
- data/public/images/worker_info_loading.gif +0 -0
- data/public/js/admin_console.js +127 -10
- data/public/js/excanvas.pack.js +1 -0
- data/public/js/jquery-1.3.2.min.js +19 -0
- data/public/js/jquery.flot.pack.js +1 -0
- data/test/acceptance/test_word_count.rb +49 -0
- data/test/blueprints.rb +6 -5
- data/test/config/config.yml +1 -4
- data/test/test_helper.rb +1 -0
- data/test/unit/test_job.rb +12 -4
- data/test/unit/test_work_unit.rb +2 -2
- data/views/index.erb +69 -14
- metadata +23 -6
- data/public/js/jquery-1.3.2.js +0 -4376
data/README
CHANGED
@@ -22,54 +22,63 @@
|
|
22
22
|
|
23
23
|
|
24
24
|
|
25
|
-
|
25
|
+
~ CloudCrowd ~
|
26
26
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
27
|
+
* Parallel processing for the rest of us
|
28
|
+
* Write your scripts in Ruby
|
29
|
+
* Built for Amazon EC2 and S3
|
30
|
+
* split -> process -> merge
|
31
|
+
* As easy as `gem install cloud-crowd`
|
32
|
+
|
33
|
+
|
34
|
+
~ Wiki ~
|
35
|
+
|
36
|
+
http://wiki.github.com/documentcloud/cloud-crowd
|
37
|
+
|
38
|
+
|
39
|
+
~ Getting started ~
|
40
|
+
|
41
|
+
# Install the gem.
|
42
|
+
|
43
|
+
>> sudo gem install cloud-crowd
|
44
|
+
|
45
|
+
# Install the CloudCrowd configuration files to a location of your choosing.
|
46
|
+
|
47
|
+
>> crowd install ~/config/cloud-crowd
|
48
|
+
|
49
|
+
# Now, you can use the full complement of `crowd` commands from inside of
|
50
|
+
# this configuration directory. To see the available commands:
|
51
|
+
|
52
|
+
>> crowd --help
|
53
|
+
|
54
|
+
# Edit the configuration files to your satisfaction, add AWS credentials,
|
55
|
+
# and then load the CloudCrowd schema into your configured database.
|
56
|
+
|
57
|
+
>> mate ~/config/cloud-crowd/config.yml
|
58
|
+
>> mate ~/config/cloud-crowd/database.yml
|
59
|
+
>> crowd load_schema
|
60
|
+
|
61
|
+
# Write your actions, and install them into the 'actions' subdirectory.
|
62
|
+
# CloudCrowd comes with some default actions as an example.
|
63
|
+
|
64
|
+
# To launch the central server (make sure that you include its location
|
65
|
+
# in config.yml), either:
|
66
|
+
|
67
|
+
>> crowd server
|
68
|
+
|
69
|
+
# or:
|
70
|
+
|
71
|
+
>> thin -R config.ru --servers 3 -e production start
|
72
|
+
|
73
|
+
# Any server that supports Rack should work with the rackup file.
|
74
|
+
|
75
|
+
# Then, to spin up 10 workers:
|
76
|
+
|
77
|
+
>> crowd workers start -n 10
|
78
|
+
|
79
|
+
# To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
|
80
|
+
# your configuration directory.
|
81
|
+
|
82
|
+
# At this point you can visit your server console at localhost:9173 to
|
83
|
+
# view all of your workers, ready for action.
|
84
|
+
|
data/actions/process_pdfs.rb
CHANGED
@@ -6,8 +6,8 @@
|
|
6
6
|
# See <tt>examples/process_pdfs_example.rb</tt> for more information.
|
7
7
|
class ProcessPdfs < CloudCrowd::Action
|
8
8
|
|
9
|
-
# Split up a large pdf into single-page pdfs.
|
10
|
-
# The double pdftk shuffle fixes the document xrefs.
|
9
|
+
# Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
|
10
|
+
# chunks for processing. The double pdftk shuffle fixes the document xrefs.
|
11
11
|
def split
|
12
12
|
`pdftk #{input_path} burst output "#{file_name}_%05d.pdf_temp"`
|
13
13
|
FileUtils.rm input_path
|
@@ -41,7 +41,7 @@ class ProcessPdfs < CloudCrowd::Action
|
|
41
41
|
# the concatenated merge of the full-text into a single tar archive, ready
|
42
42
|
# for download.
|
43
43
|
def merge
|
44
|
-
|
44
|
+
input.each do |batch_url|
|
45
45
|
batch_path = File.basename(batch_url)
|
46
46
|
download(batch_url, batch_path)
|
47
47
|
`tar -xzf #{batch_path}`
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# A parallel WordCount. Depends on the 'wc' utility.
|
2
|
+
class WordCount < CloudCrowd::Action
|
3
|
+
|
4
|
+
# Count the words in a single book.
|
5
|
+
def process
|
6
|
+
(`wc -w #{input_path}`).match(/\A\s*(\d+)/)[1].to_i
|
7
|
+
end
|
8
|
+
|
9
|
+
# Sum the total word count.
|
10
|
+
def merge
|
11
|
+
input.inject(0) {|sum, count| sum + count }
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
data/cloud-crowd.gemspec
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'cloud-crowd'
|
3
|
-
s.version = '0.0.
|
3
|
+
s.version = '0.0.6' # Keep version in sync with cloud-crowd.rb
|
4
4
|
s.date = '2009-09-01'
|
5
5
|
|
6
|
-
s.homepage = "http://documentcloud
|
7
|
-
s.summary = "
|
6
|
+
s.homepage = "http://wiki.github.com/documentcloud/cloud-crowd"
|
7
|
+
s.summary = "Parallel Processing for the Rest of Us"
|
8
8
|
s.description = <<-EOS
|
9
9
|
The crowd, suddenly there where there was nothing before, is a mysterious and
|
10
10
|
universal phenomenon. A few people may have been standing together -- five, ten
|
@@ -13,18 +13,16 @@ Gem::Specification.new do |s|
|
|
13
13
|
streets had only one direction.
|
14
14
|
EOS
|
15
15
|
|
16
|
-
s.authors
|
17
|
-
s.email
|
18
|
-
s.rubyforge_project
|
19
|
-
|
20
|
-
s.require_paths = ['lib']
|
21
|
-
s.executables = ['crowd']
|
22
|
-
|
23
|
-
# s.post_install_message = "Run `crowd --help` for information on using CloudCrowd."
|
16
|
+
s.authors = ['Jeremy Ashkenas']
|
17
|
+
s.email = 'jeremy@documentcloud.org'
|
18
|
+
s.rubyforge_project = 'cloud-crowd'
|
24
19
|
|
20
|
+
s.require_paths = ['lib']
|
21
|
+
s.executables = ['crowd']
|
22
|
+
|
25
23
|
s.has_rdoc = true
|
26
24
|
s.extra_rdoc_files = ['README']
|
27
|
-
s.rdoc_options << '--title' << 'CloudCrowd |
|
25
|
+
s.rdoc_options << '--title' << 'CloudCrowd | Parallel Processing for the Rest of Us' <<
|
28
26
|
'--exclude' << 'test' <<
|
29
27
|
'--main' << 'README' <<
|
30
28
|
'--all'
|
@@ -47,6 +45,7 @@ Gem::Specification.new do |s|
|
|
47
45
|
s.files = %w(
|
48
46
|
actions/graphics_magick.rb
|
49
47
|
actions/process_pdfs.rb
|
48
|
+
actions/word_count.rb
|
50
49
|
cloud-crowd.gemspec
|
51
50
|
config/config.example.ru
|
52
51
|
config/config.example.yml
|
@@ -54,6 +53,7 @@ config/database.example.yml
|
|
54
53
|
EPIGRAPHS
|
55
54
|
examples/graphics_magick_example.rb
|
56
55
|
examples/process_pdfs_example.rb
|
56
|
+
examples/word_count_example.rb
|
57
57
|
lib/cloud-crowd.rb
|
58
58
|
lib/cloud_crowd/action.rb
|
59
59
|
lib/cloud_crowd/app.rb
|
@@ -67,6 +67,7 @@ lib/cloud_crowd/helpers.rb
|
|
67
67
|
lib/cloud_crowd/inflector.rb
|
68
68
|
lib/cloud_crowd/models/job.rb
|
69
69
|
lib/cloud_crowd/models/work_unit.rb
|
70
|
+
lib/cloud_crowd/models/worker_record.rb
|
70
71
|
lib/cloud_crowd/models.rb
|
71
72
|
lib/cloud_crowd/runner.rb
|
72
73
|
lib/cloud_crowd/schema.rb
|
@@ -74,11 +75,24 @@ lib/cloud_crowd/worker.rb
|
|
74
75
|
LICENSE
|
75
76
|
public/css/admin_console.css
|
76
77
|
public/css/reset.css
|
78
|
+
public/images/bullet_green.png
|
79
|
+
public/images/bullet_white.png
|
80
|
+
public/images/cloud_hand.png
|
81
|
+
public/images/header_back.png
|
82
|
+
public/images/logo.png
|
77
83
|
public/images/queue_fill.png
|
84
|
+
public/images/server_error.png
|
85
|
+
public/images/sidebar_bottom.png
|
86
|
+
public/images/sidebar_top.png
|
87
|
+
public/images/worker_info.png
|
88
|
+
public/images/worker_info_loading.gif
|
78
89
|
public/js/admin_console.js
|
79
|
-
public/js/
|
90
|
+
public/js/excanvas.pack.js
|
91
|
+
public/js/jquery.flot.pack.js
|
92
|
+
public/js/jquery-1.3.2.min.js
|
80
93
|
README
|
81
94
|
test/acceptance/test_failing_work_units.rb
|
95
|
+
test/acceptance/test_word_count.rb
|
82
96
|
test/blueprints.rb
|
83
97
|
test/config/config.ru
|
84
98
|
test/config/config.yml
|
data/config/config.example.yml
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
-
# The URL where you're planning on running the server/queue/database.
|
1
|
+
# The URL where you're planning on running the central server/queue/database.
|
2
2
|
:central_server: http://localhost:9173
|
3
3
|
|
4
|
+
# The storage back-end that you'd like to use for intermediate and final results
|
5
|
+
# of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
|
6
|
+
# be used in development, or on single-machine installations.
|
7
|
+
:storage: s3
|
8
|
+
|
4
9
|
# Please provide your AWS credentials for S3 storage of job output.
|
5
10
|
:aws_access_key: [your AWS access key]
|
6
11
|
:aws_secret_key: [your AWS secret access key]
|
@@ -20,8 +25,8 @@
|
|
20
25
|
:password: [your password]
|
21
26
|
|
22
27
|
# By default, CloudCrowd looks for installed actions inside the 'actions'
|
23
|
-
# subdirectory of this configuration folder. 'actions_path' allows you to
|
24
|
-
#
|
28
|
+
# subdirectory of this configuration folder. 'actions_path' allows you to load
|
29
|
+
# additional actions from a location of your choice.
|
25
30
|
# :actions_path: /path/to/actions
|
26
31
|
|
27
32
|
# Set the following numbers to tweak the configuration of your worker daemons.
|
@@ -38,14 +43,6 @@
|
|
38
43
|
# The maximum number of seconds a worker waits between checking the job queue.
|
39
44
|
:max_worker_wait: 20
|
40
45
|
|
41
|
-
# The backoff multiplier the worker uses to slow down the check interval when
|
42
|
-
# there's no work in the queue.
|
43
|
-
:worker_wait_multiplier: 1.3
|
44
|
-
|
45
|
-
# The number of seconds a worker waits to retry when there's some kind of
|
46
|
-
# internal error (ie. the central server fails to respond)
|
47
|
-
:worker_retry_wait: 5
|
48
|
-
|
49
46
|
# The number of separate attempts that will be made to process an individual
|
50
47
|
# work unit, before marking it as having failed.
|
51
48
|
:work_unit_retries: 3
|
@@ -1,48 +1,44 @@
|
|
1
|
-
|
2
|
-
# This is a fancy example that produces black and white, annotated, and blurred
|
3
|
-
# versions of a list of URLs downloaded from the web.
|
1
|
+
#!/usr/bin/env ruby -rubygems
|
4
2
|
|
3
|
+
require 'restclient'
|
5
4
|
require 'json'
|
6
5
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
'action' => 'graphics_magick',
|
12
|
-
|
13
|
-
'inputs' => [
|
14
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2008/10/dan_mcpharlin_the_land_of_sleeping_things.jpg',
|
15
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread01.jpg',
|
16
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread03.jpg',
|
17
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread02.jpg',
|
18
|
-
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/02/dan_mcpharlin_untitled.jpg'
|
19
|
-
],
|
20
|
-
|
21
|
-
'options' => {
|
22
|
-
'steps' => [{
|
23
|
-
'name' => 'annotated',
|
24
|
-
'command' => 'convert',
|
25
|
-
'options' => '-font helvetica -fill red -draw "font-size 35; text 75,75 CloudCrowd!"',
|
26
|
-
'extension' => 'jpg'
|
27
|
-
},{
|
28
|
-
'name' => 'blurred',
|
29
|
-
'command' => 'convert',
|
30
|
-
'options' => '-blur 10x5',
|
31
|
-
'extension' => 'png'
|
32
|
-
},{
|
33
|
-
'name' => 'bw',
|
34
|
-
'input' => 'blurred',
|
35
|
-
'command' => 'convert',
|
36
|
-
'options' => '-monochrome',
|
37
|
-
'extension' => 'jpg'
|
38
|
-
}]
|
39
|
-
}
|
40
|
-
|
41
|
-
}.to_json}
|
42
|
-
)
|
6
|
+
# This example demonstrates the GraphicsMagick action by taking in a list of
|
7
|
+
# five images, and producing annotated, blurred, and black and white versions
|
8
|
+
# of each image. See actions/graphics_magick.rb
|
43
9
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
10
|
+
RestClient.post('http://localhost:9173/jobs',
|
11
|
+
{:job => {
|
12
|
+
|
13
|
+
'action' => 'graphics_magick',
|
14
|
+
|
15
|
+
'inputs' => [
|
16
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2008/10/dan_mcpharlin_the_land_of_sleeping_things.jpg',
|
17
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread01.jpg',
|
18
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread03.jpg',
|
19
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread02.jpg',
|
20
|
+
'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/02/dan_mcpharlin_untitled.jpg'
|
21
|
+
],
|
22
|
+
|
23
|
+
'options' => {
|
24
|
+
'steps' => [{
|
25
|
+
'name' => 'annotated',
|
26
|
+
'command' => 'convert',
|
27
|
+
'options' => '-font helvetica -fill red -draw "font-size 35; text 75,75 CloudCrowd!"',
|
28
|
+
'extension' => 'jpg'
|
29
|
+
},{
|
30
|
+
'name' => 'blurred',
|
31
|
+
'command' => 'convert',
|
32
|
+
'options' => '-blur 10x5',
|
33
|
+
'extension' => 'png'
|
34
|
+
},{
|
35
|
+
'name' => 'bw',
|
36
|
+
'input' => 'blurred',
|
37
|
+
'command' => 'convert',
|
38
|
+
'options' => '-monochrome',
|
39
|
+
'extension' => 'jpg'
|
40
|
+
}]
|
41
|
+
}
|
42
|
+
|
43
|
+
}.to_json}
|
44
|
+
)
|
@@ -1,30 +1,40 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
1
|
+
#!/usr/bin/env ruby -rubygems
|
2
|
+
|
3
|
+
require 'restclient'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
# This example demonstrates a fairly complicated PDF-processing action, designed
|
7
|
+
# to extract the PDF's text, and produce GIF versions of each page. The action
|
8
|
+
# (actions/process_pdfs.rb) shows an example of using all three steps,
|
9
|
+
# split, process, and merge.
|
10
|
+
|
11
|
+
RestClient.post('http://localhost:9173/jobs',
|
12
|
+
{:job => {
|
13
|
+
|
14
|
+
'action' => 'process_pdfs',
|
15
|
+
|
16
|
+
'inputs' => [
|
17
|
+
'http://tigger.uic.edu/~victor/personal/futurism.pdf',
|
18
|
+
'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
|
19
|
+
'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
|
20
|
+
'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
|
21
|
+
],
|
22
|
+
|
23
|
+
'options' => {
|
24
|
+
|
25
|
+
'batch_size' => 7,
|
26
|
+
|
27
|
+
'images' => [{
|
28
|
+
'name' => '700',
|
29
|
+
'options' => '-resize 700x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
|
30
|
+
'extension' => 'gif'
|
31
|
+
},{
|
32
|
+
'name' => '1000',
|
33
|
+
'options' => '-resize 1000x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
|
34
|
+
'extension' => 'gif'
|
35
|
+
}]
|
36
|
+
|
37
|
+
}
|
38
|
+
|
39
|
+
}.to_json}
|
30
40
|
)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby -rubygems
|
2
|
+
|
3
|
+
require 'restclient'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
# Let's count all the words in Shakespeare.
|
7
|
+
|
8
|
+
RestClient.post('http://localhost:9173/jobs',
|
9
|
+
{:job => {
|
10
|
+
|
11
|
+
'action' => 'word_count',
|
12
|
+
|
13
|
+
'inputs' => [
|
14
|
+
'http://www.gutenberg.org/dirs/etext97/1ws3010.txt', # All's Well That Ends Well
|
15
|
+
'http://www.gutenberg.org/dirs/etext99/1ws3511.txt', # Antony and Cleopatra
|
16
|
+
'http://www.gutenberg.org/dirs/etext97/1ws2510.txt', # As You Like It
|
17
|
+
'http://www.gutenberg.org/dirs/etext97/1ws0610.txt', # The Comedy of Errors
|
18
|
+
'http://www.gutenberg.org/dirs/etext99/1ws3911.txt', # Cymbeline
|
19
|
+
'http://www.gutenberg.org/dirs/etext00/0ws2610.txt', # Hamlet
|
20
|
+
'http://www.gutenberg.org/dirs/etext00/0ws1910.txt', # Henry IV
|
21
|
+
'http://www.gutenberg.org/dirs/etext99/1ws2411.txt', # Julius Caesar
|
22
|
+
'http://www.gutenberg.org/dirs/etext98/2ws3310.txt', # King Lear
|
23
|
+
'http://www.gutenberg.org/dirs/etext99/1ws1211j.txt', # Love's Labour's Lost
|
24
|
+
'http://www.gutenberg.org/dirs/etext98/2ws3410.txt', # Macbeth
|
25
|
+
'http://www.gutenberg.org/dirs/etext98/2ws1810.txt', # The Merchant of Venice
|
26
|
+
'http://www.gutenberg.org/dirs/etext99/1ws1711.txt', # Midsummer Night's Dream
|
27
|
+
'http://www.gutenberg.org/dirs/etext98/3ws2210.txt', # Much Ado About Nothing
|
28
|
+
'http://www.gutenberg.org/dirs/etext00/0ws3210.txt', # Othello
|
29
|
+
'http://www.gutenberg.org/dirs/etext98/2ws1610.txt', # Romeo and Juliet
|
30
|
+
'http://www.gutenberg.org/dirs/etext98/2ws1010.txt', # The Taming of the Shrew
|
31
|
+
'http://www.gutenberg.org/dirs/etext99/1ws4111.txt', # The Tempest
|
32
|
+
'http://www.gutenberg.org/dirs/etext00/0ws0910.txt', # Titus Andronicus
|
33
|
+
'http://www.gutenberg.org/dirs/etext99/1ws2911.txt', # Troilus and Cressida
|
34
|
+
'http://www.gutenberg.org/dirs/etext98/3ws2810.txt', # Twelfth Night
|
35
|
+
'http://www.gutenberg.org/files/1539/1539.txt' # The Winter's Tale
|
36
|
+
]
|
37
|
+
|
38
|
+
}.to_json}
|
39
|
+
)
|
40
|
+
|
41
|
+
# With 23 Workers running, and over Wifi, it counted all the words in 5.5 secs.
|
data/lib/cloud-crowd.rb
CHANGED
@@ -19,28 +19,33 @@ autoload :Digest, 'digest'
|
|
19
19
|
autoload :ERB, 'erb'
|
20
20
|
autoload :FileUtils, 'fileutils'
|
21
21
|
autoload :JSON, 'json'
|
22
|
-
autoload :RestClient, '
|
22
|
+
autoload :RestClient, 'restclient'
|
23
23
|
autoload :RightAws, 'right_aws'
|
24
24
|
autoload :Sinatra, 'sinatra'
|
25
25
|
autoload :Socket, 'socket'
|
26
26
|
autoload :YAML, 'yaml'
|
27
27
|
|
28
|
+
# Common code which should really be required in every circumstance.
|
29
|
+
require 'cloud_crowd/exceptions'
|
30
|
+
|
28
31
|
module CloudCrowd
|
29
32
|
|
30
33
|
# Autoload all the CloudCrowd classes which may not be required.
|
31
|
-
autoload :App,
|
32
|
-
autoload :Action,
|
33
|
-
autoload :AssetStore,
|
34
|
-
autoload :Helpers,
|
35
|
-
autoload :Inflector,
|
36
|
-
autoload :Job,
|
37
|
-
autoload :
|
34
|
+
autoload :App, 'cloud_crowd/app'
|
35
|
+
autoload :Action, 'cloud_crowd/action'
|
36
|
+
autoload :AssetStore, 'cloud_crowd/asset_store'
|
37
|
+
autoload :Helpers, 'cloud_crowd/helpers'
|
38
|
+
autoload :Inflector, 'cloud_crowd/inflector'
|
39
|
+
autoload :Job, 'cloud_crowd/models'
|
40
|
+
autoload :Worker, 'cloud_crowd/worker'
|
41
|
+
autoload :WorkUnit, 'cloud_crowd/models'
|
42
|
+
autoload :WorkerRecord, 'cloud_crowd/models'
|
38
43
|
|
39
44
|
# Root directory of the CloudCrowd gem.
|
40
45
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
41
46
|
|
42
47
|
# Keep the version in sync with the gemspec.
|
43
|
-
VERSION = '0.0.
|
48
|
+
VERSION = '0.0.6'
|
44
49
|
|
45
50
|
# A Job is processing if its WorkUnits are in the queue to be handled by workers.
|
46
51
|
PROCESSING = 1
|
@@ -68,9 +73,7 @@ module CloudCrowd
|
|
68
73
|
INCOMPLETE = [PROCESSING, SPLITTING, MERGING]
|
69
74
|
|
70
75
|
# Mapping of statuses to their display strings.
|
71
|
-
DISPLAY_STATUS_MAP =
|
72
|
-
1 => 'processing', 2 => 'succeeded', 3 => 'failed', 4 => 'splitting', 5 => 'merging'
|
73
|
-
}
|
76
|
+
DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
|
74
77
|
|
75
78
|
class << self
|
76
79
|
attr_reader :config
|
@@ -101,7 +104,7 @@ module CloudCrowd
|
|
101
104
|
# Return the displayable status name of an internal CloudCrowd status number.
|
102
105
|
# (See the above constants).
|
103
106
|
def display_status(status)
|
104
|
-
DISPLAY_STATUS_MAP[status]
|
107
|
+
DISPLAY_STATUS_MAP[status] || 'unknown'
|
105
108
|
end
|
106
109
|
|
107
110
|
# CloudCrowd::Actions are requested dynamically by name. Access them through
|
@@ -112,10 +115,10 @@ module CloudCrowd
|
|
112
115
|
def actions
|
113
116
|
return @actions if @actions
|
114
117
|
@actions = {}
|
115
|
-
default_actions
|
116
|
-
|
117
|
-
|
118
|
-
(default_actions + custom_actions).each do |path|
|
118
|
+
default_actions = Dir["#{ROOT}/actions/*.rb"]
|
119
|
+
installed_actions = Dir["#{@config_path}/actions/*.rb"]
|
120
|
+
custom_actions = Dir["#{CloudCrowd.config[:actions_path]}/*.rb"]
|
121
|
+
(default_actions + installed_actions + custom_actions).each do |path|
|
119
122
|
name = File.basename(path, File.extname(path))
|
120
123
|
require path
|
121
124
|
@actions[name] = Module.const_get(Inflector.camelize(name))
|
data/lib/cloud_crowd/action.rb
CHANGED
@@ -2,7 +2,7 @@ module CloudCrowd
|
|
2
2
|
|
3
3
|
# As you write your custom actions, have them inherit from CloudCrowd::Action.
|
4
4
|
# All actions must implement a +process+ method, which should return a
|
5
|
-
# JSON-
|
5
|
+
# JSON-serializable object that will be used as the output for the work unit.
|
6
6
|
# See the default actions for examples.
|
7
7
|
#
|
8
8
|
# Optionally, actions may define +split+ and +merge+ methods to do mapping
|
@@ -14,6 +14,8 @@ module CloudCrowd
|
|
14
14
|
# and spend their duration inside of it, so relative paths work well.
|
15
15
|
class Action
|
16
16
|
|
17
|
+
FILE_URL = /\Afile:\/\//
|
18
|
+
|
17
19
|
attr_reader :input, :input_path, :file_name, :options, :work_directory
|
18
20
|
|
19
21
|
# Initializing an Action sets up all of the read-only variables that
|
@@ -27,11 +29,7 @@ module CloudCrowd
|
|
27
29
|
@work_directory = File.expand_path(File.join(@store.temp_storage_path, storage_prefix))
|
28
30
|
FileUtils.mkdir_p(@work_directory) unless File.exists?(@work_directory)
|
29
31
|
Dir.chdir @work_directory
|
30
|
-
|
31
|
-
@input_path = File.join(@work_directory, safe_filename(@input))
|
32
|
-
@file_name = File.basename(@input_path, File.extname(@input_path))
|
33
|
-
download(@input, @input_path)
|
34
|
-
end
|
32
|
+
status == MERGING ? parse_input : download_input
|
35
33
|
end
|
36
34
|
|
37
35
|
# Each Action subclass must implement a +process+ method, overriding this.
|
@@ -39,9 +37,14 @@ module CloudCrowd
|
|
39
37
|
raise NotImplementedError.new("CloudCrowd::Actions must override 'process' with their own processing code.")
|
40
38
|
end
|
41
39
|
|
42
|
-
# Download a file to the specified path
|
40
|
+
# Download a file to the specified path.
|
43
41
|
def download(url, path)
|
44
|
-
|
42
|
+
if url.match(FILE_URL)
|
43
|
+
FileUtils.cp(url.sub(FILE_URL, ''), path)
|
44
|
+
else
|
45
|
+
resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
|
46
|
+
FileUtils.mv resp.file.path, path
|
47
|
+
end
|
45
48
|
path
|
46
49
|
end
|
47
50
|
|
@@ -57,7 +60,7 @@ module CloudCrowd
|
|
57
60
|
# to the root directory (where daemons run by default).
|
58
61
|
def cleanup_work_directory
|
59
62
|
Dir.chdir '/'
|
60
|
-
FileUtils.rm_r(@work_directory)
|
63
|
+
FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
|
61
64
|
end
|
62
65
|
|
63
66
|
|
@@ -80,6 +83,20 @@ module CloudCrowd
|
|
80
83
|
@storage_prefix ||= File.join(path_parts)
|
81
84
|
end
|
82
85
|
|
86
|
+
# If we know that the input is JSON, replace it with the parsed form.
|
87
|
+
def parse_input
|
88
|
+
@input = JSON.parse(@input)
|
89
|
+
end
|
90
|
+
|
91
|
+
# If the input is a URL, download the file before beginning processing.
|
92
|
+
def download_input
|
93
|
+
input_is_url = !!URI.parse(@input) rescue false
|
94
|
+
return unless input_is_url
|
95
|
+
@input_path = File.join(@work_directory, safe_filename(@input))
|
96
|
+
@file_name = File.basename(@input_path, File.extname(@input_path))
|
97
|
+
download(@input, @input_path)
|
98
|
+
end
|
99
|
+
|
83
100
|
end
|
84
101
|
|
85
102
|
end
|