documentcloud-cloud-crowd 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +8 -8
- data/cloud-crowd.gemspec +8 -8
- data/config/config.example.ru +8 -2
- data/config/config.example.yml +6 -15
- data/examples/process_pdfs_example.rb +1 -1
- data/examples/word_count_example.rb +1 -0
- data/lib/cloud-crowd.rb +6 -5
- data/lib/cloud_crowd/action.rb +11 -7
- data/lib/cloud_crowd/asset_store/filesystem_store.rb +5 -0
- data/lib/cloud_crowd/asset_store/s3_store.rb +7 -3
- data/lib/cloud_crowd/asset_store.rb +1 -1
- data/lib/cloud_crowd/command_line.rb +14 -53
- data/lib/cloud_crowd/exceptions.rb +4 -0
- data/lib/cloud_crowd/helpers/authorization.rb +2 -2
- data/lib/cloud_crowd/helpers/resources.rb +0 -20
- data/lib/cloud_crowd/models/job.rb +25 -26
- data/lib/cloud_crowd/models/node_record.rb +81 -0
- data/lib/cloud_crowd/models/work_unit.rb +70 -30
- data/lib/cloud_crowd/models.rb +1 -1
- data/lib/cloud_crowd/node.rb +87 -0
- data/lib/cloud_crowd/schema.rb +19 -16
- data/lib/cloud_crowd/{app.rb → server.rb} +25 -30
- data/lib/cloud_crowd/worker.rb +50 -74
- data/public/css/admin_console.css +26 -14
- data/public/images/server.png +0 -0
- data/public/js/admin_console.js +45 -18
- data/test/acceptance/test_failing_work_units.rb +1 -1
- data/test/acceptance/{test_app.rb → test_server.rb} +15 -15
- data/test/acceptance/test_word_count.rb +3 -9
- data/test/blueprints.rb +0 -1
- data/test/config/config.ru +1 -1
- data/test/config/config.yml +1 -3
- data/test/unit/test_configuration.rb +1 -1
- data/test/unit/test_job.rb +1 -0
- data/test/unit/test_work_unit.rb +2 -4
- data/views/index.erb +13 -8
- metadata +9 -9
- data/lib/cloud_crowd/daemon.rb +0 -95
- data/lib/cloud_crowd/models/worker_record.rb +0 -61
- data/lib/cloud_crowd/runner.rb +0 -15
    
        data/README
    CHANGED
    
    | @@ -30,19 +30,19 @@ | |
| 30 30 | 
             
                * split -> process -> merge
         | 
| 31 31 | 
             
                * As easy as `gem install cloud-crowd`
         | 
| 32 32 |  | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 33 | 
            +
                Well-suited for:
         | 
| 34 | 
            +
                
         | 
| 35 | 
            +
                * Generating or resizing images.
         | 
| 36 | 
            +
                * Encoding video.
         | 
| 37 | 
            +
                * Running text extraction or OCR on PDFs.
         | 
| 38 | 
            +
                * Migrating a large file set or database.
         | 
| 39 | 
            +
                * Web scraping.
         | 
| 40 40 |  | 
| 41 41 |  | 
| 42 42 | 
             
              ~ Documentation ~
         | 
| 43 43 |  | 
| 44 44 | 
             
                Wiki: http://wiki.github.com/documentcloud/cloud-crowd
         | 
| 45 | 
            -
             | 
| 45 | 
            +
                Rdoc: http://rdoc.info/projects/documentcloud/cloud-crowd
         | 
| 46 46 |  | 
| 47 47 |  | 
| 48 48 | 
             
              ~ Getting started ~
         | 
    
        data/cloud-crowd.gemspec
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            Gem::Specification.new do |s|
         | 
| 2 2 | 
             
              s.name      = 'cloud-crowd'
         | 
| 3 | 
            -
              s.version   = '0.1. | 
| 4 | 
            -
              s.date      = '2009-09- | 
| 3 | 
            +
              s.version   = '0.1.1'         # Keep version in sync with cloud-crowd.rb
         | 
| 4 | 
            +
              s.date      = '2009-09-15'
         | 
| 5 5 |  | 
| 6 6 | 
             
              s.homepage    = "http://wiki.github.com/documentcloud/cloud-crowd"
         | 
| 7 7 | 
             
              s.summary     = "Parallel Processing for the Rest of Us"
         | 
| @@ -32,7 +32,7 @@ Gem::Specification.new do |s| | |
| 32 32 | 
             
              s.add_dependency 'json',          ['>= 1.1.7']
         | 
| 33 33 | 
             
              s.add_dependency 'rest-client',   ['>= 1.0.3']
         | 
| 34 34 | 
             
              s.add_dependency 'right_aws',     ['>= 1.10.0']
         | 
| 35 | 
            -
              s.add_dependency ' | 
| 35 | 
            +
              s.add_dependency 'thin',          ['>= 1.2.4']
         | 
| 36 36 |  | 
| 37 37 | 
             
              if s.respond_to?(:add_development_dependency)
         | 
| 38 38 | 
             
                s.add_development_dependency 'faker',               ['>= 0.3.1']
         | 
| @@ -56,23 +56,22 @@ examples/process_pdfs_example.rb | |
| 56 56 | 
             
            examples/word_count_example.rb
         | 
| 57 57 | 
             
            lib/cloud-crowd.rb
         | 
| 58 58 | 
             
            lib/cloud_crowd/action.rb
         | 
| 59 | 
            -
            lib/cloud_crowd/app.rb
         | 
| 60 59 | 
             
            lib/cloud_crowd/asset_store/filesystem_store.rb
         | 
| 61 60 | 
             
            lib/cloud_crowd/asset_store/s3_store.rb
         | 
| 62 61 | 
             
            lib/cloud_crowd/asset_store.rb
         | 
| 63 62 | 
             
            lib/cloud_crowd/command_line.rb
         | 
| 64 | 
            -
            lib/cloud_crowd/daemon.rb
         | 
| 65 63 | 
             
            lib/cloud_crowd/exceptions.rb
         | 
| 66 64 | 
             
            lib/cloud_crowd/helpers/authorization.rb
         | 
| 67 65 | 
             
            lib/cloud_crowd/helpers/resources.rb
         | 
| 68 66 | 
             
            lib/cloud_crowd/helpers.rb
         | 
| 69 67 | 
             
            lib/cloud_crowd/inflector.rb
         | 
| 70 68 | 
             
            lib/cloud_crowd/models/job.rb
         | 
| 69 | 
            +
            lib/cloud_crowd/models/node_record.rb
         | 
| 71 70 | 
             
            lib/cloud_crowd/models/work_unit.rb
         | 
| 72 | 
            -
            lib/cloud_crowd/models/worker_record.rb
         | 
| 73 71 | 
             
            lib/cloud_crowd/models.rb
         | 
| 74 | 
            -
            lib/cloud_crowd/ | 
| 72 | 
            +
            lib/cloud_crowd/node.rb
         | 
| 75 73 | 
             
            lib/cloud_crowd/schema.rb
         | 
| 74 | 
            +
            lib/cloud_crowd/server.rb
         | 
| 76 75 | 
             
            lib/cloud_crowd/worker.rb
         | 
| 77 76 | 
             
            LICENSE
         | 
| 78 77 | 
             
            public/css/admin_console.css
         | 
| @@ -83,6 +82,7 @@ public/images/cloud_hand.png | |
| 83 82 | 
             
            public/images/header_back.png
         | 
| 84 83 | 
             
            public/images/logo.png
         | 
| 85 84 | 
             
            public/images/queue_fill.png
         | 
| 85 | 
            +
            public/images/server.png
         | 
| 86 86 | 
             
            public/images/server_error.png
         | 
| 87 87 | 
             
            public/images/sidebar_bottom.png
         | 
| 88 88 | 
             
            public/images/sidebar_top.png
         | 
| @@ -93,7 +93,7 @@ public/js/excanvas.js | |
| 93 93 | 
             
            public/js/flot.js
         | 
| 94 94 | 
             
            public/js/jquery.js
         | 
| 95 95 | 
             
            README
         | 
| 96 | 
            -
            test/acceptance/ | 
| 96 | 
            +
            test/acceptance/test_server.rb
         | 
| 97 97 | 
             
            test/acceptance/test_failing_work_units.rb
         | 
| 98 98 | 
             
            test/acceptance/test_word_count.rb
         | 
| 99 99 | 
             
            test/blueprints.rb
         | 
    
        data/config/config.example.ru
    CHANGED
    
    | @@ -4,7 +4,13 @@ | |
| 4 4 | 
             
            # using any Rack-compliant server handler. For example, start up three servers 
         | 
| 5 5 | 
             
            # with a specified port number, using Thin:
         | 
| 6 6 | 
             
            #
         | 
| 7 | 
            -
            # thin start -R config.ru  | 
| 7 | 
            +
            # thin start -R config.ru --servers 3
         | 
| 8 | 
            +
            #
         | 
| 9 | 
            +
            # Or a single server with Unicorn:
         | 
| 10 | 
            +
            #
         | 
| 11 | 
            +
            # unicorn config.ru
         | 
| 12 | 
            +
            #
         | 
| 13 | 
            +
             | 
| 8 14 |  | 
| 9 15 | 
             
            require 'rubygems'
         | 
| 10 16 | 
             
            require 'cloud-crowd'
         | 
| @@ -13,5 +19,5 @@ CloudCrowd.configure(File.dirname(__FILE__) + '/config.yml') | |
| 13 19 | 
             
            CloudCrowd.configure_database(File.dirname(__FILE__) + '/database.yml')
         | 
| 14 20 |  | 
| 15 21 | 
             
            map '/' do
         | 
| 16 | 
            -
              run CloudCrowd:: | 
| 22 | 
            +
              run CloudCrowd::Server
         | 
| 17 23 | 
             
            end
         | 
    
        data/config/config.example.yml
    CHANGED
    
    | @@ -1,6 +1,11 @@ | |
| 1 1 | 
             
            # The URL where you're planning on running the central server/queue/database.
         | 
| 2 2 | 
             
            :central_server:          http://localhost:9173
         | 
| 3 3 |  | 
| 4 | 
            +
            # Set the maximum number of workers allowed per-node. Workers only run while 
         | 
| 5 | 
            +
            # there's work to be done. It's best to set 'max_workers' below the point where 
         | 
| 6 | 
            +
            # you'd start to swap or peg your CPU (as determined by experiment).
         | 
| 7 | 
            +
            :max_workers:             5
         | 
| 8 | 
            +
             | 
| 4 9 | 
             
            # The storage back-end that you'd like to use for intermediate and final results
         | 
| 5 10 | 
             
            # of processing. 's3' and 'filesystem' are supported. 'filesystem' should only
         | 
| 6 11 | 
             
            # be used in development, or on single-machine installations.
         | 
| @@ -29,20 +34,6 @@ | |
| 29 34 | 
             
            # additional actions from a location of your choice.
         | 
| 30 35 | 
             
            # :actions_path: /path/to/actions
         | 
| 31 36 |  | 
| 32 | 
            -
            # Set the following numbers to tweak the configuration of your worker daemons. 
         | 
| 33 | 
            -
            # Optimum results will depend on proportion of the Memory/CPU/IO bottlenecks
         | 
| 34 | 
            -
            # in your actions, the number of central servers you have running, and your
         | 
| 35 | 
            -
            # desired balance between latency and traffic.
         | 
| 36 | 
            -
              
         | 
| 37 | 
            -
            # The number of workers that `crowd workers start` spins up.
         | 
| 38 | 
            -
            :num_workers:             3
         | 
| 39 | 
            -
             | 
| 40 | 
            -
            # The minimum number of seconds a worker waits between checking the job queue.
         | 
| 41 | 
            -
            :min_worker_wait:         1
         | 
| 42 | 
            -
             | 
| 43 | 
            -
            # The maximum number of seconds a worker waits between checking the job queue.
         | 
| 44 | 
            -
            :max_worker_wait:         5
         | 
| 45 | 
            -
             | 
| 46 37 | 
             
            # The number of separate attempts that will be made to process an individual
         | 
| 47 38 | 
             
            # work unit, before marking it as having failed.
         | 
| 48 | 
            -
            :work_unit_retries:       3
         | 
| 39 | 
            +
            :work_unit_retries:       3
         | 
| @@ -17,7 +17,7 @@ RestClient.post('http://localhost:9173/jobs', | |
| 17 17 | 
             
                  'http://tigger.uic.edu/~victor/personal/futurism.pdf',
         | 
| 18 18 | 
             
                  'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
         | 
| 19 19 | 
             
                  'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
         | 
| 20 | 
            -
                  'http:// | 
| 20 | 
            +
                  'http://www.pitt.edu/~slavic/sisc/SISC4/dadswell.pdf'
         | 
| 21 21 | 
             
                ],
         | 
| 22 22 |  | 
| 23 23 | 
             
                'options' => {
         | 
    
        data/lib/cloud-crowd.rb
    CHANGED
    
    | @@ -5,16 +5,15 @@ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__)) | |
| 5 5 | 
             
            # Common Gems:
         | 
| 6 6 | 
             
            require 'rubygems'
         | 
| 7 7 | 
             
            gem 'activerecord'
         | 
| 8 | 
            -
            gem 'daemons'
         | 
| 9 8 | 
             
            gem 'json'
         | 
| 10 9 | 
             
            gem 'rest-client'
         | 
| 11 10 | 
             
            gem 'right_aws'
         | 
| 12 11 | 
             
            gem 'sinatra'
         | 
| 12 | 
            +
            gem 'thin'
         | 
| 13 13 |  | 
| 14 14 | 
             
            # Autoloading for all the pieces which may or may not be needed:
         | 
| 15 15 | 
             
            autoload :ActiveRecord, 'activerecord'
         | 
| 16 16 | 
             
            autoload :Benchmark,    'benchmark'
         | 
| 17 | 
            -
            autoload :Daemons,      'daemons'
         | 
| 18 17 | 
             
            autoload :Digest,       'digest'
         | 
| 19 18 | 
             
            autoload :ERB,          'erb'
         | 
| 20 19 | 
             
            autoload :FileUtils,    'fileutils'
         | 
| @@ -23,6 +22,7 @@ autoload :RestClient,   'restclient' | |
| 23 22 | 
             
            autoload :RightAws,     'right_aws'
         | 
| 24 23 | 
             
            autoload :Sinatra,      'sinatra'
         | 
| 25 24 | 
             
            autoload :Socket,       'socket'
         | 
| 25 | 
            +
            autoload :Thin,         'thin'
         | 
| 26 26 | 
             
            autoload :YAML,         'yaml'
         | 
| 27 27 |  | 
| 28 28 | 
             
            # Common code which should really be required in every circumstance.
         | 
| @@ -31,21 +31,22 @@ require 'cloud_crowd/exceptions' | |
| 31 31 | 
             
            module CloudCrowd
         | 
| 32 32 |  | 
| 33 33 | 
             
              # Autoload all the CloudCrowd classes which may not be required.
         | 
| 34 | 
            -
              autoload :App,          'cloud_crowd/app'
         | 
| 35 34 | 
             
              autoload :Action,       'cloud_crowd/action'
         | 
| 36 35 | 
             
              autoload :AssetStore,   'cloud_crowd/asset_store'
         | 
| 37 36 | 
             
              autoload :Helpers,      'cloud_crowd/helpers'
         | 
| 38 37 | 
             
              autoload :Inflector,    'cloud_crowd/inflector'
         | 
| 39 38 | 
             
              autoload :Job,          'cloud_crowd/models'
         | 
| 39 | 
            +
              autoload :Node,         'cloud_crowd/node'
         | 
| 40 | 
            +
              autoload :NodeRecord,   'cloud_crowd/models'
         | 
| 41 | 
            +
              autoload :Server,       'cloud_crowd/server'
         | 
| 40 42 | 
             
              autoload :Worker,       'cloud_crowd/worker'
         | 
| 41 43 | 
             
              autoload :WorkUnit,     'cloud_crowd/models'
         | 
| 42 | 
            -
              autoload :WorkerRecord, 'cloud_crowd/models'
         | 
| 43 44 |  | 
| 44 45 | 
             
              # Root directory of the CloudCrowd gem.
         | 
| 45 46 | 
             
              ROOT        = File.expand_path(File.dirname(__FILE__) + '/..')
         | 
| 46 47 |  | 
| 47 48 | 
             
              # Keep the version in sync with the gemspec.
         | 
| 48 | 
            -
              VERSION     = '0.1. | 
| 49 | 
            +
              VERSION     = '0.1.1'
         | 
| 49 50 |  | 
| 50 51 | 
             
              # A Job is processing if its WorkUnits are in the queue to be handled by workers.
         | 
| 51 52 | 
             
              PROCESSING  = 1
         | 
    
        data/lib/cloud_crowd/action.rb
    CHANGED
    
    | @@ -38,12 +38,16 @@ module CloudCrowd | |
| 38 38 |  | 
| 39 39 | 
             
                # Download a file to the specified path.
         | 
| 40 40 | 
             
                def download(url, path)
         | 
| 41 | 
            -
                   | 
| 42 | 
            -
             | 
| 43 | 
            -
                   | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
                   | 
| 41 | 
            +
                  URI.parse(url) # Sanity check.
         | 
| 42 | 
            +
                  `curl -s "#{url}" > "#{path}"`
         | 
| 43 | 
            +
                  # if url.match(FILE_URL)
         | 
| 44 | 
            +
                  #   FileUtils.cp(url.sub(FILE_URL, ''), path)
         | 
| 45 | 
            +
                  # else
         | 
| 46 | 
            +
                  #   # An alternative would be shelling out: `curl -s "#{url}" > "#{path}"`
         | 
| 47 | 
            +
                  #   puts url
         | 
| 48 | 
            +
                  #   resp = RestClient::Request.execute(:url => url, :method => :get, :raw_response => true)
         | 
| 49 | 
            +
                  #   FileUtils.mv resp.file.path, path
         | 
| 50 | 
            +
                  # end
         | 
| 47 51 | 
             
                  path
         | 
| 48 52 | 
             
                end
         | 
| 49 53 |  | 
| @@ -55,7 +59,7 @@ module CloudCrowd | |
| 55 59 | 
             
                end
         | 
| 56 60 |  | 
| 57 61 | 
             
                # After the Action has finished, we remove the work directory and return
         | 
| 58 | 
            -
                # to the root directory (where  | 
| 62 | 
            +
                # to the root directory (where workers run by default).
         | 
| 59 63 | 
             
                def cleanup_work_directory
         | 
| 60 64 | 
             
                  FileUtils.rm_r(@work_directory) if File.exists?(@work_directory)
         | 
| 61 65 | 
             
                end
         | 
| @@ -6,6 +6,11 @@ module CloudCrowd | |
| 6 6 | 
             
                # installation.
         | 
| 7 7 | 
             
                module FilesystemStore
         | 
| 8 8 |  | 
| 9 | 
            +
                  # Make sure that local storage is writeable before starting.
         | 
| 10 | 
            +
                  def setup
         | 
| 11 | 
            +
                    raise Error::StorageNotWritable, "#{LOCAL_STORAGE_PATH} is not writable" unless File.writable?(LOCAL_STORAGE_PATH)
         | 
| 12 | 
            +
                  end
         | 
| 13 | 
            +
                  
         | 
| 9 14 | 
             
                  # Save a file to somewhere semi-persistent on the filesystem. Can be used
         | 
| 10 15 | 
             
                  # in development, when offline, or if you happen to have a single-machine
         | 
| 11 16 | 
             
                  # CloudCrowd installation. To use, configure <tt>:storage => 'filesystem'</tt>.
         | 
| @@ -5,11 +5,16 @@ module CloudCrowd | |
| 5 5 | 
             
                # on S3 for all resulting files.
         | 
| 6 6 | 
             
                module S3Store
         | 
| 7 7 |  | 
| 8 | 
            +
                  # Configure authentication and establish a connection to S3, first thing.
         | 
| 9 | 
            +
                  def setup
         | 
| 10 | 
            +
                    @use_auth = CloudCrowd.config[:use_s3_authentication]
         | 
| 11 | 
            +
                    establish_s3_connection
         | 
| 12 | 
            +
                  end
         | 
| 13 | 
            +
                  
         | 
| 8 14 | 
             
                  # Save a finished file from local storage to S3. Save it publicly unless 
         | 
| 9 15 | 
             
                  # we're configured to use S3 authentication. Authenticated links expire
         | 
| 10 16 | 
             
                  # after one day by default.
         | 
| 11 17 | 
             
                  def save(local_path, save_path)
         | 
| 12 | 
            -
                    ensure_s3_connection
         | 
| 13 18 | 
             
                    if @use_auth
         | 
| 14 19 | 
             
                      @bucket.put(save_path, File.open(local_path), {}, 'private')
         | 
| 15 20 | 
             
                      @s3.interface.get_link(@bucket, save_path)
         | 
| @@ -21,13 +26,12 @@ module CloudCrowd | |
| 21 26 |  | 
| 22 27 | 
             
                  # Remove all of a Job's resulting files from S3, both intermediate and finished.
         | 
| 23 28 | 
             
                  def cleanup(job)
         | 
| 24 | 
            -
                    ensure_s3_connection
         | 
| 25 29 | 
             
                    @bucket.delete_folder("#{job.action}/job_#{job.id}")
         | 
| 26 30 | 
             
                  end
         | 
| 27 31 |  | 
| 28 32 | 
             
                  # Workers, through the course of many WorkUnits, keep around an AssetStore.
         | 
| 29 33 | 
             
                  # Ensure we have a persistent S3 connection after first use.
         | 
| 30 | 
            -
                  def  | 
| 34 | 
            +
                  def establish_s3_connection
         | 
| 31 35 | 
             
                    unless @s3 && @bucket
         | 
| 32 36 | 
             
                      params = {:port => 80, :protocol => 'http'}
         | 
| 33 37 | 
             
                      @s3 = RightAws::S3.new(CloudCrowd.config[:aws_access_key], CloudCrowd.config[:aws_secret_key], params)
         | 
| @@ -25,9 +25,9 @@ module CloudCrowd | |
| 25 25 |  | 
| 26 26 | 
             
                # Creating the AssetStore ensures that its scratch directory exists.
         | 
| 27 27 | 
             
                def initialize
         | 
| 28 | 
            -
                  @use_auth = CloudCrowd.config[:use_s3_authentication]
         | 
| 29 28 | 
             
                  FileUtils.mkdir_p temp_storage_path unless File.exists? temp_storage_path
         | 
| 30 29 | 
             
                  raise Error::StorageNotWritable, "#{temp_storage_path} is not writable" unless File.writable?(temp_storage_path)
         | 
| 30 | 
            +
                  setup if respond_to? :setup
         | 
| 31 31 | 
             
                end
         | 
| 32 32 |  | 
| 33 33 | 
             
                # Get the path to CloudCrowd's temporary local storage. All actions run
         | 
| @@ -9,9 +9,6 @@ module CloudCrowd | |
| 9 9 | 
             
                # Reference the absolute path to the root.
         | 
| 10 10 | 
             
                CC_ROOT = File.expand_path(File.dirname(__FILE__) + '/../..')
         | 
| 11 11 |  | 
| 12 | 
            -
                # Path to the Daemons gem script which launches workers.
         | 
| 13 | 
            -
                WORKER_RUNNER = File.expand_path("#{CC_ROOT}/lib/cloud_crowd/runner.rb")
         | 
| 14 | 
            -
                
         | 
| 15 12 | 
             
                # Command-line banner for the usage message.
         | 
| 16 13 | 
             
                BANNER = <<-EOS
         | 
| 17 14 | 
             
            CloudCrowd is a MapReduce-inspired Parallel Processing System for Ruby.
         | 
| @@ -24,7 +21,7 @@ Usage: crowd COMMAND OPTIONS | |
| 24 21 | 
             
            Commands:
         | 
| 25 22 | 
             
              install       Install the CloudCrowd configuration files to the specified directory
         | 
| 26 23 | 
             
              server        Start up the central server (requires a database)
         | 
| 27 | 
            -
               | 
| 24 | 
            +
              node          Start up a worker node (only one node per machine, please)
         | 
| 28 25 | 
             
              console       Launch a CloudCrowd console, connected to the central database
         | 
| 29 26 | 
             
              load_schema   Load the schema into the database specified by database.yml
         | 
| 30 27 |  | 
| @@ -38,7 +35,7 @@ Options: | |
| 38 35 | 
             
                  case command
         | 
| 39 36 | 
             
                  when 'console'      then run_console
         | 
| 40 37 | 
             
                  when 'server'       then run_server
         | 
| 41 | 
            -
                  when ' | 
| 38 | 
            +
                  when 'node'         then run_node
         | 
| 42 39 | 
             
                  when 'load_schema'  then run_load_schema
         | 
| 43 40 | 
             
                  when 'install'      then run_install
         | 
| 44 41 | 
             
                  else                     usage
         | 
| @@ -63,6 +60,7 @@ Options: | |
| 63 60 | 
             
                # (Mongrel, falling back to WEBrick). The equivalent of Rails' script/server.
         | 
| 64 61 | 
             
                def run_server
         | 
| 65 62 | 
             
                  ensure_config
         | 
| 63 | 
            +
                  @options[:port] ||= 9173
         | 
| 66 64 | 
             
                  require 'rubygems'
         | 
| 67 65 | 
             
                  rackup_path = File.expand_path("#{@options[:config_path]}/config.ru")
         | 
| 68 66 | 
             
                  if Gem.available? 'thin'
         | 
| @@ -72,6 +70,14 @@ Options: | |
| 72 70 | 
             
                  end
         | 
| 73 71 | 
             
                end
         | 
| 74 72 |  | 
| 73 | 
            +
                # Launch a Node. Please only run a single node per machine. The Node process
         | 
| 74 | 
            +
                # will be long-lived, although its workers will come and go.
         | 
| 75 | 
            +
                def run_node
         | 
| 76 | 
            +
                  ENV['RACK_ENV'] = @options['environment']
         | 
| 77 | 
            +
                  load_code
         | 
| 78 | 
            +
                  Node.new(@options[:port])
         | 
| 79 | 
            +
                end
         | 
| 80 | 
            +
                
         | 
| 75 81 | 
             
                # Load in the database schema to the database specified in 'database.yml'.
         | 
| 76 82 | 
             
                def run_load_schema
         | 
| 77 83 | 
             
                  load_code
         | 
| @@ -86,51 +92,11 @@ Options: | |
| 86 92 | 
             
                  install_path = ARGV.shift || '.'
         | 
| 87 93 | 
             
                  FileUtils.mkdir_p install_path unless File.exists?(install_path)
         | 
| 88 94 | 
             
                  install_file "#{CC_ROOT}/config/config.example.yml", "#{install_path}/config.yml"
         | 
| 89 | 
            -
                  install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
         | 
| 90 95 | 
             
                  install_file "#{CC_ROOT}/config/database.example.yml", "#{install_path}/database.yml"
         | 
| 96 | 
            +
                  install_file "#{CC_ROOT}/config/config.example.ru", "#{install_path}/config.ru"
         | 
| 91 97 | 
             
                  install_file "#{CC_ROOT}/actions", "#{install_path}/actions", true
         | 
| 92 98 | 
             
                end
         | 
| 93 99 |  | 
| 94 | 
            -
                # Manipulate worker daemons -- handles all commands that the Daemons gem
         | 
| 95 | 
            -
                # provides: start, stop, restart, run, and status.
         | 
| 96 | 
            -
                def run_workers_command
         | 
| 97 | 
            -
                  ensure_config
         | 
| 98 | 
            -
                  command = ARGV.shift
         | 
| 99 | 
            -
                  case command
         | 
| 100 | 
            -
                  when 'start'    then start_workers
         | 
| 101 | 
            -
                  when 'stop'     then stop_workers
         | 
| 102 | 
            -
                  when 'restart'  then stop_workers && start_workers
         | 
| 103 | 
            -
                  when 'run'      then run_worker
         | 
| 104 | 
            -
                  when 'status'   then show_worker_status
         | 
| 105 | 
            -
                  else                 usage
         | 
| 106 | 
            -
                  end
         | 
| 107 | 
            -
                end
         | 
| 108 | 
            -
                
         | 
| 109 | 
            -
                # Start up N workers, specified by argument or the number of workers in
         | 
| 110 | 
            -
                # config.yml.
         | 
| 111 | 
            -
                def start_workers
         | 
| 112 | 
            -
                  load_code
         | 
| 113 | 
            -
                  num_workers = @options[:num_workers] || CloudCrowd.config[:num_workers]
         | 
| 114 | 
            -
                  num_workers.times do
         | 
| 115 | 
            -
                    `CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} start`
         | 
| 116 | 
            -
                  end
         | 
| 117 | 
            -
                end
         | 
| 118 | 
            -
                
         | 
| 119 | 
            -
                # For debugging, run a single worker in the current process, showing output.
         | 
| 120 | 
            -
                def run_worker
         | 
| 121 | 
            -
                  exec "CLOUD_CROWD_CONFIG='#{File.expand_path(@options[:config_path] + "/config.yml")}' ruby #{WORKER_RUNNER} run"
         | 
| 122 | 
            -
                end
         | 
| 123 | 
            -
                
         | 
| 124 | 
            -
                # Stop all active workers.
         | 
| 125 | 
            -
                def stop_workers
         | 
| 126 | 
            -
                  `ruby #{WORKER_RUNNER} stop`
         | 
| 127 | 
            -
                end
         | 
| 128 | 
            -
             | 
| 129 | 
            -
                # Display the status of all active workers.
         | 
| 130 | 
            -
                def show_worker_status
         | 
| 131 | 
            -
                  puts `ruby #{WORKER_RUNNER} status`
         | 
| 132 | 
            -
                end
         | 
| 133 | 
            -
                
         | 
| 134 100 | 
             
                # Print `crowd` usage.
         | 
| 135 101 | 
             
                def usage
         | 
| 136 102 | 
             
                  puts "\n#{@option_parser}\n"
         | 
| @@ -150,7 +116,6 @@ Options: | |
| 150 116 | 
             
                # Parse all options for all commands.
         | 
| 151 117 | 
             
                def parse_options
         | 
| 152 118 | 
             
                  @options = {
         | 
| 153 | 
            -
                    :port         => 9173,
         | 
| 154 119 | 
             
                    :environment  => 'production',
         | 
| 155 120 | 
             
                    :config_path  => ENV['CLOUD_CROWD_CONFIG'] || '.'
         | 
| 156 121 | 
             
                  }
         | 
| @@ -158,17 +123,14 @@ Options: | |
| 158 123 | 
             
                    opts.on('-c', '--config PATH', 'path to configuration directory') do |conf_path|
         | 
| 159 124 | 
             
                      @options[:config_path] = conf_path
         | 
| 160 125 | 
             
                    end
         | 
| 161 | 
            -
                    opts.on('- | 
| 162 | 
            -
                      @options[:num_workers] = num
         | 
| 163 | 
            -
                    end
         | 
| 164 | 
            -
                    opts.on('-p', '--port PORT', 'central server port number') do |port_num|
         | 
| 126 | 
            +
                    opts.on('-p', '--port PORT', 'port number for server (central or node)') do |port_num|
         | 
| 165 127 | 
             
                      @options[:port] = port_num
         | 
| 166 128 | 
             
                    end
         | 
| 167 129 | 
             
                    opts.on('-e', '--environment ENV', 'server environment (sinatra)') do |env|
         | 
| 168 130 | 
             
                      @options[:environment] = env
         | 
| 169 131 | 
             
                    end
         | 
| 170 132 | 
             
                    opts.on_tail('-v', '--version', 'show version') do
         | 
| 171 | 
            -
                       | 
| 133 | 
            +
                      require "#{CC_ROOT}/lib/cloud-crowd"
         | 
| 172 134 | 
             
                      puts "CloudCrowd version #{VERSION}"
         | 
| 173 135 | 
             
                      exit
         | 
| 174 136 | 
             
                    end
         | 
| @@ -181,7 +143,6 @@ Options: | |
| 181 143 | 
             
                # Not all commands require this.
         | 
| 182 144 | 
             
                def load_code
         | 
| 183 145 | 
             
                  ensure_config
         | 
| 184 | 
            -
                  require 'rubygems'
         | 
| 185 146 | 
             
                  require "#{CC_ROOT}/lib/cloud-crowd"
         | 
| 186 147 | 
             
                  CloudCrowd.configure("#{@options[:config_path]}/config.yml")
         | 
| 187 148 | 
             
                end
         | 
| @@ -8,6 +8,10 @@ module CloudCrowd | |
| 8 8 | 
             
                # exist.
         | 
| 9 9 | 
             
                class ActionNotFound < Error
         | 
| 10 10 | 
             
                end
         | 
| 11 | 
            +
                
         | 
| 12 | 
            +
                # CentralServerUnavailable is used when the central server can't be reached.
         | 
| 13 | 
            +
                class CentralServerUnavailable < Error
         | 
| 14 | 
            +
                end
         | 
| 11 15 |  | 
| 12 16 | 
             
                # StorageNotFound is raised when config.yml specifies a storage back end that
         | 
| 13 17 | 
             
                # doesn't exist.
         | 
| @@ -23,7 +23,7 @@ module CloudCrowd | |
| 23 23 | 
             
                  # A request is authorized if its login and password match those stored
         | 
| 24 24 | 
             
                  # in config.yml, or if authentication is disabled. If authentication is
         | 
| 25 25 | 
             
                  # turned on, then every request is authenticated, including between 
         | 
| 26 | 
            -
                  # the  | 
| 26 | 
            +
                  # the nodes and the central server.
         | 
| 27 27 | 
             
                  def authorize(login, password)
         | 
| 28 28 | 
             
                    return true unless CloudCrowd.config[:use_http_authentication]
         | 
| 29 29 | 
             
                    return CloudCrowd.config[:login] == login &&
         | 
| @@ -37,7 +37,7 @@ module CloudCrowd | |
| 37 37 | 
             
                    @auth ||= Rack::Auth::Basic::Request.new(request.env)
         | 
| 38 38 | 
             
                  end
         | 
| 39 39 |  | 
| 40 | 
            -
                  def unauthorized!(realm =  | 
| 40 | 
            +
                  def unauthorized!(realm = Server.authorization_realm)
         | 
| 41 41 | 
             
                    response['WWW-Authenticate'] = "Basic realm=\"#{realm}\""
         | 
| 42 42 | 
             
                    halt 401, 'Authorization Required'
         | 
| 43 43 | 
             
                  end
         | 
| @@ -20,26 +20,6 @@ module CloudCrowd | |
| 20 20 | 
             
                    @work_unit ||= WorkUnit.find_by_id(params[:work_unit_id]) or raise Sinatra::NotFound
         | 
| 21 21 | 
             
                  end
         | 
| 22 22 |  | 
| 23 | 
            -
                  # Try to fetch a work unit from the queue. If none are pending, respond
         | 
| 24 | 
            -
                  # with no content.
         | 
| 25 | 
            -
                  def dequeue_work_unit(offset=0)
         | 
| 26 | 
            -
                    handle_conflicts do
         | 
| 27 | 
            -
                      worker, actions = params[:worker_name], params[:worker_actions].split(',')
         | 
| 28 | 
            -
                      WorkUnit.dequeue(worker, actions, offset)
         | 
| 29 | 
            -
                    end
         | 
| 30 | 
            -
                  end
         | 
| 31 | 
            -
                  
         | 
| 32 | 
            -
                  # We're using ActiveRecord's optimistic locking, so stale work units
         | 
| 33 | 
            -
                  # may sometimes arise. handle_conflicts responds with the HTTP status
         | 
| 34 | 
            -
                  # code of your choosing if the update failed to be applied.
         | 
| 35 | 
            -
                  def handle_conflicts(code=204)
         | 
| 36 | 
            -
                    begin
         | 
| 37 | 
            -
                      yield
         | 
| 38 | 
            -
                    rescue ActiveRecord::StaleObjectError => e
         | 
| 39 | 
            -
                      return status(code) && ''
         | 
| 40 | 
            -
                    end
         | 
| 41 | 
            -
                  end
         | 
| 42 | 
            -
                  
         | 
| 43 23 | 
             
                end
         | 
| 44 24 | 
             
              end
         | 
| 45 25 | 
             
            end
         | 
| @@ -31,30 +31,39 @@ module CloudCrowd | |
| 31 31 | 
             
                # finished, if so, continue on to the next phase of the job. 
         | 
| 32 32 | 
             
                def check_for_completion
         | 
| 33 33 | 
             
                  return unless all_work_units_complete?
         | 
| 34 | 
            -
                   | 
| 35 | 
            -
                   | 
| 36 | 
            -
                  
         | 
| 37 | 
            -
                  if complete?
         | 
| 38 | 
            -
                    self.outputs = output_list.to_json
         | 
| 39 | 
            -
                    self.time = Time.now - self.created_at
         | 
| 40 | 
            -
                  end
         | 
| 41 | 
            -
                  self.save
         | 
| 34 | 
            +
                  set_next_status    
         | 
| 35 | 
            +
                  outs = gather_outputs_from_work_units
         | 
| 36 | 
            +
                  update_attributes(:outputs => outs.to_json, :time => time_taken) if complete?
         | 
| 42 37 |  | 
| 43 38 | 
             
                  case self.status
         | 
| 44 | 
            -
                  when PROCESSING then queue_for_workers( | 
| 45 | 
            -
                  when MERGING    then queue_for_workers( | 
| 39 | 
            +
                  when PROCESSING then queue_for_workers(outs.map {|o| JSON.parse(o) }.flatten)
         | 
| 40 | 
            +
                  when MERGING    then queue_for_workers(outs.to_json)
         | 
| 46 41 | 
             
                  else                 fire_callback
         | 
| 47 42 | 
             
                  end
         | 
| 48 43 | 
             
                  self
         | 
| 49 44 | 
             
                end
         | 
| 50 45 |  | 
| 46 | 
            +
                # Transition this Job's status to the appropriate next status.
         | 
| 47 | 
            +
                def set_next_status
         | 
| 48 | 
            +
                  update_attribute(:status,
         | 
| 49 | 
            +
                    any_work_units_failed? ? FAILED     :
         | 
| 50 | 
            +
                    self.splitting?        ? PROCESSING :
         | 
| 51 | 
            +
                    self.mergeable?        ? MERGING    :
         | 
| 52 | 
            +
                                             SUCCEEDED
         | 
| 53 | 
            +
                  )
         | 
| 54 | 
            +
                end
         | 
| 55 | 
            +
                
         | 
| 51 56 | 
             
                # If a <tt>callback_url</tt> is defined, post the Job's JSON to it upon 
         | 
| 52 57 | 
             
                # completion. The <tt>callback_url</tt> may include HTTP basic authentication,
         | 
| 53 58 | 
             
                # if you like:
         | 
| 54 59 | 
             
                #   http://user:password@example.com/job_complete
         | 
| 60 | 
            +
                # If the callback_url is successfully pinged, we proceed to clean up the job.
         | 
| 61 | 
            +
                # TODO: This should be moved into a Work Unit...
         | 
| 55 62 | 
             
                def fire_callback
         | 
| 63 | 
            +
                  return unless callback_url
         | 
| 56 64 | 
             
                  begin
         | 
| 57 | 
            -
                    RestClient.post(callback_url, {:job => self.to_json}) | 
| 65 | 
            +
                    RestClient.post(callback_url, {:job => self.to_json})
         | 
| 66 | 
            +
                    self.destroy
         | 
| 58 67 | 
             
                  rescue RestClient::Exception => e
         | 
| 59 68 | 
             
                    puts "Failed to fire job callback. Hmmm, what should happen here?"
         | 
| 60 69 | 
             
                  end
         | 
| @@ -62,15 +71,12 @@ module CloudCrowd | |
| 62 71 |  | 
| 63 72 | 
             
                # Cleaning up after a job will remove all of its files from S3. Destroying
         | 
| 64 73 | 
             
                # a Job calls cleanup_assets first.
         | 
| 74 | 
            +
                # TODO: Convert this into a 'cleanup' work unit that gets run by a worker.
         | 
| 65 75 | 
             
                def cleanup_assets
         | 
| 66 76 | 
             
                  AssetStore.new.cleanup(self)
         | 
| 67 77 | 
             
                end
         | 
| 68 78 |  | 
| 69 79 | 
             
                # Have all of the WorkUnits finished? 
         | 
| 70 | 
            -
                #--
         | 
| 71 | 
            -
                # We could trade reads for writes here
         | 
| 72 | 
            -
                # by keeping a completed_count on the Job itself.
         | 
| 73 | 
            -
                #++
         | 
| 74 80 | 
             
                def all_work_units_complete?
         | 
| 75 81 | 
             
                  self.work_units.incomplete.count <= 0
         | 
| 76 82 | 
             
                end
         | 
| @@ -98,10 +104,11 @@ module CloudCrowd | |
| 98 104 | 
             
                end
         | 
| 99 105 |  | 
| 100 106 | 
             
                # How complete is this Job?
         | 
| 107 | 
            +
                # Unfortunately, with the current processing sequence, the percent_complete
         | 
| 108 | 
            +
                # can pull a fast one and go backwards.
         | 
| 101 109 | 
             
                def percent_complete
         | 
| 102 | 
            -
                  return 0   if splitting?
         | 
| 103 | 
            -
                  return 100 if complete?
         | 
| 104 110 | 
             
                  return 99  if merging?
         | 
| 111 | 
            +
                  return 100 if complete?
         | 
| 105 112 | 
             
                  (work_units.complete.count / work_units.count.to_f * 100).round
         | 
| 106 113 | 
             
                end
         | 
| 107 114 |  | 
| @@ -143,21 +150,13 @@ module CloudCrowd | |
| 143 150 | 
             
                  self.work_units.complete.destroy_all
         | 
| 144 151 | 
             
                  outs
         | 
| 145 152 | 
             
                end
         | 
| 146 | 
            -
                
         | 
| 147 | 
            -
                # Transition this Job's status to the appropriate next status.
         | 
| 148 | 
            -
                def transition_to_next_phase
         | 
| 149 | 
            -
                  self.status = any_work_units_failed? ? FAILED     :
         | 
| 150 | 
            -
                                self.splitting?        ? PROCESSING :
         | 
| 151 | 
            -
                                self.mergeable?        ? MERGING    :
         | 
| 152 | 
            -
                                                         SUCCEEDED
         | 
| 153 | 
            -
                end
         | 
| 154 153 |  | 
| 155 154 | 
             
                # When starting a new job, or moving to a new stage, split up the inputs 
         | 
| 156 155 | 
             
                # into WorkUnits, and queue them. Workers will start picking them up right
         | 
| 157 156 | 
             
                # away.
         | 
| 158 157 | 
             
                def queue_for_workers(input=nil)
         | 
| 159 158 | 
             
                  input ||= JSON.parse(self.inputs)
         | 
| 160 | 
            -
                  [input].flatten. | 
| 159 | 
            +
                  [input].flatten.map do |wu_input|
         | 
| 161 160 | 
             
                    WorkUnit.create(
         | 
| 162 161 | 
             
                      :job    => self, 
         | 
| 163 162 | 
             
                      :action => self.action, 
         |