RubyGems - documentcloud-cloud-crowd - Versions diffs - 0.0.5 → 0.0.6 - Mend

documentcloud-cloud-crowd 0.0.5 → 0.0.6

Files changed (47) hide show

data/README +59 -50
data/actions/process_pdfs.rb +3 -3
data/actions/word_count.rb +14 -0
data/cloud-crowd.gemspec +27 -13
data/config/config.example.yml +8 -11
data/examples/graphics_magick_example.rb +40 -44
data/examples/process_pdfs_example.rb +39 -29
data/examples/word_count_example.rb +41 -0
data/lib/cloud-crowd.rb +20 -17
data/lib/cloud_crowd/action.rb +26 -9
data/lib/cloud_crowd/app.rb +26 -4
data/lib/cloud_crowd/asset_store.rb +69 -40
data/lib/cloud_crowd/command_line.rb +6 -4
data/lib/cloud_crowd/daemon.rb +65 -25
data/lib/cloud_crowd/exceptions.rb +5 -0
data/lib/cloud_crowd/helpers/resources.rb +2 -2
data/lib/cloud_crowd/models/job.rb +9 -13
data/lib/cloud_crowd/models/work_unit.rb +23 -15
data/lib/cloud_crowd/models/worker_record.rb +61 -0
data/lib/cloud_crowd/models.rb +7 -1
data/lib/cloud_crowd/schema.rb +12 -3
data/lib/cloud_crowd/worker.rb +48 -10
data/public/css/admin_console.css +174 -4
data/public/css/reset.css +17 -27
data/public/images/bullet_green.png +0 -0
data/public/images/bullet_white.png +0 -0
data/public/images/cloud_hand.png +0 -0
data/public/images/header_back.png +0 -0
data/public/images/logo.png +0 -0
data/public/images/server_error.png +0 -0
data/public/images/sidebar_bottom.png +0 -0
data/public/images/sidebar_top.png +0 -0
data/public/images/worker_info.png +0 -0
data/public/images/worker_info_loading.gif +0 -0
data/public/js/admin_console.js +127 -10
data/public/js/excanvas.pack.js +1 -0
data/public/js/jquery-1.3.2.min.js +19 -0
data/public/js/jquery.flot.pack.js +1 -0
data/test/acceptance/test_word_count.rb +49 -0
data/test/blueprints.rb +6 -5
data/test/config/config.yml +1 -4
data/test/test_helper.rb +1 -0
data/test/unit/test_job.rb +12 -4
data/test/unit/test_work_unit.rb +2 -2
data/views/index.erb +69 -14
metadata +23 -6
data/public/js/jquery-1.3.2.js +0 -4376

data/README CHANGED Viewed

@@ -22,54 +22,63 @@
-	~ CloudCrowd ~
+  ~ CloudCrowd ~
-		* A batch-processing system, map-reduce style
-		* Write your scripts in Ruby
-		* Built for Amazon EC2 and S3
-		* split -> process -> merge
-		* As easy as `gem install cloud-crowd`
-	~ Getting started ~
-		# Install the gem (documentcloud-cloud-crowd until the first official release).
-		>> sudo gem install cloud-crowd
-		# Install the CloudCrowd configuration files to a location of your choosing.
-		>> crowd install ~/config/cloud-crowd
-		# Now, you can use the full complement of `crowd` commands from inside of
-		# this configuration directory. To see the available commands:
-		>> crowd --help
-		# Edit the configuration files to your satisfaction, and add AWS credentials.
-		>> mate ~/config/cloud-crowd/config.yml
-		>> mate ~/config/cloud-crowd/database.yml
-		# Write your actions, and install them into the 'actions' subdirectory.
-		# CloudCrowd comes with some default actions as an example.
-		# To spin up the central server (make sure that you include its location
-		# in config.yml), either:
-		>> crowd server
-		# or:
-		>> thin -R config.ru --servers 3 -e production start
-		# Any server that supports Rack should work with the rackup file.
-		# Then, to spin up 10 workers:
-		>> crowd workers start -n 10
-		# To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
-		# your configuration directory.
+    * Parallel processing for the rest of us
+    * Write your scripts in Ruby
+    * Built for Amazon EC2 and S3
+    * split -> process -> merge
+    * As easy as `gem install cloud-crowd`
+  ~ Wiki ~
+    http://wiki.github.com/documentcloud/cloud-crowd
+  ~ Getting started ~
+    # Install the gem.
+      >> sudo gem install cloud-crowd
+    # Install the CloudCrowd configuration files to a location of your choosing.
+      >> crowd install ~/config/cloud-crowd
+    # Now, you can use the full complement of `crowd` commands from inside of
+    # this configuration directory. To see the available commands:
+      >> crowd --help
+    # Edit the configuration files to your satisfaction, add AWS credentials,
+    # and then load the CloudCrowd schema into your configured database.
+      >> mate ~/config/cloud-crowd/config.yml
+      >> mate ~/config/cloud-crowd/database.yml
+      >> crowd load_schema
+    # Write your actions, and install them into the 'actions' subdirectory.
+    # CloudCrowd comes with some default actions as an example.
+    # To launch the central server (make sure that you include its location
+    # in config.yml), either:
+      >> crowd server
+    # or:
+      >> thin -R config.ru --servers 3 -e production start
+    # Any server that supports Rack should work with the rackup file.
+    # Then, to spin up 10 workers:
+      >> crowd workers start -n 10
+    # To spin up workers remotely, install the 'cloud-crowd' gem, and copy over
+    # your configuration directory.
+    # At this point you can visit your server console at localhost:9173 to
+    # view all of your workers, ready for action.

data/actions/process_pdfs.rb CHANGED Viewed

@@ -6,8 +6,8 @@
 # See <tt>examples/process_pdfs_example.rb</tt> for more information.
 class ProcessPdfs < CloudCrowd::Action
-  # Split up a large pdf into single-page pdfs.
-  # The double pdftk shuffle fixes the document xrefs.
+  # Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
+  # chunks for processing. The double pdftk shuffle fixes the document xrefs.
   def split
     `pdftk #{input_path} burst output "#{file_name}_%05d.pdf_temp"`
     FileUtils.rm input_path
@@ -41,7 +41,7 @@ class ProcessPdfs < CloudCrowd::Action
   # the concatenated merge of the full-text into a single tar archive, ready
   # for download.
   def merge
-    JSON.parse(input).each do |batch_url|
+    input.each do |batch_url|
       batch_path = File.basename(batch_url)
       download(batch_url, batch_path)
       `tar -xzf #{batch_path}`

data/actions/word_count.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# A parallel WordCount. Depends on the 'wc' utility.
+class WordCount < CloudCrowd::Action
+  # Count the words in a single book.
+  def process
+    (`wc -w #{input_path}`).match(/\A\s*(\d+)/)[1].to_i
+  end
+  # Sum the total word count.
+  def merge
+    input.inject(0) {|sum, count| sum + count }
+  end
+end

data/cloud-crowd.gemspec CHANGED Viewed

@@ -1,10 +1,10 @@
 Gem::Specification.new do |s|
   s.name      = 'cloud-crowd'
-  s.version   = '0.0.5'         # Keep version in sync with cloud-crowd.rb
+  s.version   = '0.0.6'         # Keep version in sync with cloud-crowd.rb
   s.date      = '2009-09-01'
-  s.homepage    = "http://documentcloud.org" # wiki page on github?
-  s.summary     = "Better living through Map --> Ruby --> Reduce"
+  s.homepage    = "http://wiki.github.com/documentcloud/cloud-crowd"
+  s.summary     = "Parallel Processing for the Rest of Us"
   s.description = <<-EOS
     The crowd, suddenly there where there was nothing before, is a mysterious and
     universal phenomenon. A few people may have been standing together -- five, ten
@@ -13,18 +13,16 @@ Gem::Specification.new do |s|
     streets had only one direction.
   EOS
-  s.authors     = ['Jeremy Ashkenas']
-  s.email       = 'jeremy@documentcloud.org'
-  s.rubyforge_project    = 'cloud-crowd'
-  s.require_paths = ['lib']
-  s.executables   = ['crowd']
-  # s.post_install_message = "Run `crowd --help` for information on using CloudCrowd."
+  s.authors           = ['Jeremy Ashkenas']
+  s.email             = 'jeremy@documentcloud.org'
+  s.rubyforge_project = 'cloud-crowd'
+  s.require_paths     = ['lib']
+  s.executables       = ['crowd']
   s.has_rdoc          = true
   s.extra_rdoc_files  = ['README']
-  s.rdoc_options      << '--title'    << 'CloudCrowd | Better Living through Map --> Ruby --> Reduce' <<
+  s.rdoc_options      << '--title'    << 'CloudCrowd | Parallel Processing for the Rest of Us' <<
                          '--exclude'  << 'test' <<
                          '--main'     << 'README' <<
                          '--all'
@@ -47,6 +45,7 @@ Gem::Specification.new do |s|
   s.files = %w(
 actions/graphics_magick.rb
 actions/process_pdfs.rb
+actions/word_count.rb
 cloud-crowd.gemspec
 config/config.example.ru
 config/config.example.yml
@@ -54,6 +53,7 @@ config/database.example.yml
 EPIGRAPHS
 examples/graphics_magick_example.rb
 examples/process_pdfs_example.rb
+examples/word_count_example.rb
 lib/cloud-crowd.rb
 lib/cloud_crowd/action.rb
 lib/cloud_crowd/app.rb
@@ -67,6 +67,7 @@ lib/cloud_crowd/helpers.rb
 lib/cloud_crowd/inflector.rb
 lib/cloud_crowd/models/job.rb
 lib/cloud_crowd/models/work_unit.rb
+lib/cloud_crowd/models/worker_record.rb
 lib/cloud_crowd/models.rb
 lib/cloud_crowd/runner.rb
 lib/cloud_crowd/schema.rb
@@ -74,11 +75,24 @@ lib/cloud_crowd/worker.rb
 LICENSE
 public/css/admin_console.css
 public/css/reset.css
+public/images/bullet_green.png
+public/images/bullet_white.png
+public/images/cloud_hand.png
+public/images/header_back.png
+public/images/logo.png
 public/images/queue_fill.png
+public/images/server_error.png
+public/images/sidebar_bottom.png
+public/images/sidebar_top.png
+public/images/worker_info.png
+public/images/worker_info_loading.gif
 public/js/admin_console.js
-public/js/jquery-1.3.2.js
+public/js/excanvas.pack.js
+public/js/jquery.flot.pack.js
+public/js/jquery-1.3.2.min.js
 README
 test/acceptance/test_failing_work_units.rb
+test/acceptance/test_word_count.rb
 test/blueprints.rb
 test/config/config.ru
 test/config/config.yml

data/config/config.example.yml CHANGED Viewed

@@ -1,6 +1,11 @@
-# The URL where you're planning on running the server/queue/database.
+# The URL where you're planning on running the central server/queue/database.
 :central_server:          http://localhost:9173
+# The storage back-end that you'd like to use for intermediate and final results
+# of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
+# be used in development, or on single-machine installations.
+:storage:                 s3
 # Please provide your AWS credentials for S3 storage of job output.
 :aws_access_key:          [your AWS access key]
 :aws_secret_key:          [your AWS secret access key]
@@ -20,8 +25,8 @@
 :password:                [your password]
 # By default, CloudCrowd looks for installed actions inside the 'actions'
-# subdirectory of this configuration folder. 'actions_path' allows you to install
-# them in a different location.
+# subdirectory of this configuration folder. 'actions_path' allows you to load
+# additional actions from a location of your choice.
 # :actions_path: /path/to/actions
 # Set the following numbers to tweak the configuration of your worker daemons.
@@ -38,14 +43,6 @@
 # The maximum number of seconds a worker waits between checking the job queue.
 :max_worker_wait:         20
-# The backoff multiplier the worker uses to slow down the check interval when
-# there's no work in the queue.
-:worker_wait_multiplier:  1.3
-# The number of seconds a worker waits to retry when there's some kind of
-# internal error (ie. the central server fails to respond)
-:worker_retry_wait:       5
 # The number of separate attempts that will be made to process an individual
 # work unit, before marking it as having failed.
 :work_unit_retries:       3

data/examples/graphics_magick_example.rb CHANGED Viewed

@@ -1,48 +1,44 @@
-# Inside of a restclient session:
-# This is a fancy example that produces black and white, annotated, and blurred
-# versions of a list of URLs downloaded from the web.
+#!/usr/bin/env ruby -rubygems
+require 'restclient'
 require 'json'
-RestClient.post(
-	'http://localhost:9173/jobs',
-	{:job => {
-		'action' => 'graphics_magick',
-		'inputs' => [
-			'http://www.sci-fi-o-rama.com/wp-content/uploads/2008/10/dan_mcpharlin_the_land_of_sleeping_things.jpg',
-			'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread01.jpg',
-			'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread03.jpg',
-			'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread02.jpg',
-			'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/02/dan_mcpharlin_untitled.jpg'
-		],
-		'options' => {
-			'steps' => [{
-				'name' 			=> 'annotated',
-				'command' 	=> 'convert',
-				'options'		=> '-font helvetica -fill red -draw "font-size 35; text 75,75 CloudCrowd!"',
-				'extension' => 'jpg'
-			},{
-				'name'			=> 'blurred',
-				'command' 	=> 'convert',
-				'options'		=> '-blur 10x5',
-				'extension' => 'png'
-			},{
-				'name' 			=> 'bw',
-				'input'			=> 'blurred',
-				'command' 	=> 'convert',
-				'options' 	=> '-monochrome',
-				'extension' => 'jpg'
-			}]
-		}
-	}.to_json}
-)
+# This example demonstrates the GraphicsMagick action by taking in a list of
+# five images, and producing annotated, blurred, and black and white versions
+# of each image. See actions/graphics_magick.rb
-# status = RestClient.get('http://localhost:9173/jobs/[job_id]')
-# puts JSON.parse(RestClient.get('http://localhost:9173/jobs/[job_id]'))['outputs'].values.map {|v|
-#		JSON.parse(v).map {|v| v['url']}
-#	}.flatten.join("\n")
+RestClient.post('http://localhost:9173/jobs',
+  {:job => {
+    'action' => 'graphics_magick',
+    'inputs' => [
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2008/10/dan_mcpharlin_the_land_of_sleeping_things.jpg',
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread01.jpg',
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread03.jpg',
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/07/dan_mcpharlin_wired_spread02.jpg',
+      'http://www.sci-fi-o-rama.com/wp-content/uploads/2009/02/dan_mcpharlin_untitled.jpg'
+    ],
+    'options' => {
+      'steps' => [{
+        'name'      => 'annotated',
+        'command'   => 'convert',
+        'options'   => '-font helvetica -fill red -draw "font-size 35; text 75,75 CloudCrowd!"',
+        'extension' => 'jpg'
+      },{
+        'name'      => 'blurred',
+        'command'   => 'convert',
+        'options'   => '-blur 10x5',
+        'extension' => 'png'
+      },{
+        'name'      => 'bw',
+        'input'     => 'blurred',
+        'command'   => 'convert',
+        'options'   => '-monochrome',
+        'extension' => 'jpg'
+      }]
+    }
+  }.to_json}
+)

data/examples/process_pdfs_example.rb CHANGED Viewed

@@ -1,30 +1,40 @@
-RestClient.post(
-	'http://localhost:9173/jobs',
-	{:job => {
-		'action' => 'process_pdfs',
-		'inputs' => [
-		  'http://tigger.uic.edu/~victor/personal/futurism.pdf',
-		  'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
-		  'http://www.dzignism.com/articles/Futurist.Manifesto.pdf'
-		],
-		'options' => {
-		  'batch_size' => 7,
-		  'images' => [{
-				'name' 			=> '700',
-				'options'		=> '-resize 700x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
-				'extension' => 'gif'
-			},{
-				'name' 			=> '1000',
-				'options'		=> '-resize 1000x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
-				'extension' => 'gif'
-			}]
-		}
-	}.to_json}
+#!/usr/bin/env ruby -rubygems
+require 'restclient'
+require 'json'
+# This example demonstrates a fairly complicated PDF-processing action, designed
+# to extract the PDF's text, and produce GIF versions of each page. The action
+# (actions/process_pdfs.rb) shows an example of using all three steps,
+# split, process, and merge.
+RestClient.post('http://localhost:9173/jobs',
+  {:job => {
+    'action' => 'process_pdfs',
+    'inputs' => [
+      'http://tigger.uic.edu/~victor/personal/futurism.pdf',
+      'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
+      'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
+      'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
+    ],
+    'options' => {
+      'batch_size' => 7,
+      'images' => [{
+        'name'      => '700',
+        'options'   => '-resize 700x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
+        'extension' => 'gif'
+      },{
+        'name'      => '1000',
+        'options'   => '-resize 1000x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
+        'extension' => 'gif'
+      }]
+    }
+  }.to_json}
 )

data/examples/word_count_example.rb ADDED Viewed

@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby -rubygems
+require 'restclient'
+require 'json'
+# Let's count all the words in Shakespeare.
+RestClient.post('http://localhost:9173/jobs',
+  {:job => {
+    'action' => 'word_count',
+    'inputs' => [
+      'http://www.gutenberg.org/dirs/etext97/1ws3010.txt',  # All's Well That Ends Well
+      'http://www.gutenberg.org/dirs/etext99/1ws3511.txt',  # Antony and Cleopatra
+      'http://www.gutenberg.org/dirs/etext97/1ws2510.txt',  # As You Like It
+      'http://www.gutenberg.org/dirs/etext97/1ws0610.txt',  # The Comedy of Errors
+      'http://www.gutenberg.org/dirs/etext99/1ws3911.txt',  # Cymbeline
+      'http://www.gutenberg.org/dirs/etext00/0ws2610.txt',  # Hamlet
+      'http://www.gutenberg.org/dirs/etext00/0ws1910.txt',  # Henry IV
+      'http://www.gutenberg.org/dirs/etext99/1ws2411.txt',  # Julius Caesar
+      'http://www.gutenberg.org/dirs/etext98/2ws3310.txt',  # King Lear
+      'http://www.gutenberg.org/dirs/etext99/1ws1211j.txt', # Love's Labour's Lost
+      'http://www.gutenberg.org/dirs/etext98/2ws3410.txt',  # Macbeth
+      'http://www.gutenberg.org/dirs/etext98/2ws1810.txt',  # The Merchant of Venice
+      'http://www.gutenberg.org/dirs/etext99/1ws1711.txt',  # Midsummer Night's Dream
+      'http://www.gutenberg.org/dirs/etext98/3ws2210.txt',  # Much Ado About Nothing
+      'http://www.gutenberg.org/dirs/etext00/0ws3210.txt',  # Othello
+      'http://www.gutenberg.org/dirs/etext98/2ws1610.txt',  # Romeo and Juliet
+      'http://www.gutenberg.org/dirs/etext98/2ws1010.txt',  # The Taming of the Shrew
+      'http://www.gutenberg.org/dirs/etext99/1ws4111.txt',  # The Tempest
+      'http://www.gutenberg.org/dirs/etext00/0ws0910.txt',  # Titus Andronicus
+      'http://www.gutenberg.org/dirs/etext99/1ws2911.txt',  # Troilus and Cressida
+      'http://www.gutenberg.org/dirs/etext98/3ws2810.txt',  # Twelfth Night
+      'http://www.gutenberg.org/files/1539/1539.txt'        # The Winter's Tale
+    ]
+  }.to_json}
+)
+# With 23 Workers running, and over Wifi, it counted all the words in 5.5 secs.

data/lib/cloud-crowd.rb CHANGED Viewed

@@ -19,28 +19,33 @@ autoload :Digest,       'digest'
 autoload :ERB,          'erb'
 autoload :FileUtils,    'fileutils'
 autoload :JSON,         'json'
-autoload :RestClient,   'rest_client'
+autoload :RestClient,   'restclient'
 autoload :RightAws,     'right_aws'
 autoload :Sinatra,      'sinatra'
 autoload :Socket,       'socket'
 autoload :YAML,         'yaml'
+# Common code which should really be required in every circumstance.
+require 'cloud_crowd/exceptions'
 module CloudCrowd
   # Autoload all the CloudCrowd classes which may not be required.
-  autoload :App,        'cloud_crowd/app'
-  autoload :Action,     'cloud_crowd/action'
-  autoload :AssetStore, 'cloud_crowd/asset_store'
-  autoload :Helpers,    'cloud_crowd/helpers'
-  autoload :Inflector,  'cloud_crowd/inflector'
-  autoload :Job,        'cloud_crowd/models'
-  autoload :WorkUnit,   'cloud_crowd/models'
+  autoload :App,          'cloud_crowd/app'
+  autoload :Action,       'cloud_crowd/action'
+  autoload :AssetStore,   'cloud_crowd/asset_store'
+  autoload :Helpers,      'cloud_crowd/helpers'
+  autoload :Inflector,    'cloud_crowd/inflector'
+  autoload :Job,          'cloud_crowd/models'
+  autoload :Worker,       'cloud_crowd/worker'
+  autoload :WorkUnit,     'cloud_crowd/models'
+  autoload :WorkerRecord, 'cloud_crowd/models'
   # Root directory of the CloudCrowd gem.
   ROOT        = File.expand_path(File.dirname(__FILE__) + '/..')
   # Keep the version in sync with the gemspec.
-  VERSION     = '0.0.5'
+  VERSION     = '0.0.6'
   # A Job is processing if its WorkUnits are in the queue to be handled by workers.
   PROCESSING  = 1
@@ -68,9 +73,7 @@ module CloudCrowd
   INCOMPLETE  = [PROCESSING, SPLITTING, MERGING]
   # Mapping of statuses to their display strings.
-  DISPLAY_STATUS_MAP = {
-    1 => 'processing', 2 => 'succeeded', 3 => 'failed', 4 => 'splitting', 5 => 'merging'
-  }
+  DISPLAY_STATUS_MAP = ['unknown', 'processing', 'succeeded', 'failed', 'splitting', 'merging']
   class << self
     attr_reader :config
@@ -101,7 +104,7 @@ module CloudCrowd
     # Return the displayable status name of an internal CloudCrowd status number.
     # (See the above constants).
     def display_status(status)
-      DISPLAY_STATUS_MAP[status]
+      DISPLAY_STATUS_MAP[status] || 'unknown'
     end
     # CloudCrowd::Actions are requested dynamically by name. Access them through
@@ -112,10 +115,10 @@ module CloudCrowd
     def actions
       return @actions if @actions
       @actions = {}
-      default_actions = Dir["#{ROOT}/actions/*.rb"]
-      custom_actions  = Dir["#{CloudCrowd.config[:actions_path]}/*.rb"] ||
-                        Dir["#{@config_path}/actions/*.rb"]
-      (default_actions + custom_actions).each do |path|
+      default_actions   = Dir["#{ROOT}/actions/*.rb"]
+      installed_actions = Dir["#{@config_path}/actions/*.rb"]
+      custom_actions    = Dir["#{CloudCrowd.config[:actions_path]}/*.rb"]
+      (default_actions + installed_actions + custom_actions).each do |path|
         name = File.basename(path, File.extname(path))
         require path
         @actions[name] = Module.const_get(Inflector.camelize(name))

data/lib/cloud_crowd/action.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module CloudCrowd
   # As you write your custom actions, have them inherit from CloudCrowd::Action.
   # All actions must implement a +process+ method, which should return a
-  # JSON-serializeable object that will be used as the output for the work unit.
+  # JSON-serializable object that will be used as the output for the work unit.
   # See the default actions for examples.
   #
   # Optionally, actions may define +split+ and +merge+ methods to do mapping
@@ -14,6 +14,8 @@ module CloudCrowd
   # and spend their duration inside of it, so relative paths work well.
   class Action
+    FILE_URL = /\Afile:\/\//
     attr_reader :input, :input_path, :file_name, :options, :work_directory
     # Initializing an Action sets up all of the read-only variables that
@@ -27,11 +29,7 @@ module CloudCrowd
       @work_directory = File.expand_path(File.join(@store.temp_storage_path, storage_prefix))
       FileUtils.mkdir_p(@work_directory) unless File.exists?(@work_directory)
       Dir.chdir @work_directory
-      unless status == MERGING
-        @input_path = File.join(@work_directory, safe_filename(@input))
-        @file_name = File.basename(@input_path, File.extname(@input_path))
-        download(@input, @input_path)
-      end
+      status == MERGING ? parse_input : download_input
     end
     # Each Action subclass must implement a +process+ method, overriding this.
@@ -39,9 +37,14 @@ module CloudCrowd
       raise NotImplementedError.new("CloudCrowd::Actions must override 'process' with their own processing code.")
     end
-    # Download a file to the specified path with *curl*.
+    # Download a file to the specified path.
     def download(url, path)
-      `curl -s "#{url}" > "#{path}"`
+      if url.match(FILE_URL)
+        FileUtils.cp(url.sub(FILE_URL, ''), path)
+      else
+        resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
+        FileUtils.mv resp.file.path, path
+      end
       path
     end
@@ -57,7 +60,7 @@ module CloudCrowd
     # to the root directory (where daemons run by default).
     def cleanup_work_directory
       Dir.chdir '/'
-      FileUtils.rm_r(@work_directory)
+      FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
     end
@@ -80,6 +83,20 @@ module CloudCrowd
       @storage_prefix ||= File.join(path_parts)
     end
+    # If we know that the input is JSON, replace it with the parsed form.
+    def parse_input
+      @input = JSON.parse(@input)
+    end
+    # If the input is a URL, download the file before beginning processing.
+    def download_input
+      input_is_url = !!URI.parse(@input) rescue false
+      return unless input_is_url
+      @input_path = File.join(@work_directory, safe_filename(@input))
+      @file_name = File.basename(@input_path, File.extname(@input_path))
+      download(@input, @input_path)
+    end
   end
 end