scbi_mapreduce 0.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +49 -0
- data/Manifest.txt +46 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +295 -0
- data/Rakefile +28 -0
- data/bin/scbi_mapreduce +52 -0
- data/lib/scbi_mapreduce.rb +15 -0
- data/lib/scbi_mapreduce/error_handler.rb +15 -0
- data/lib/scbi_mapreduce/main_worker.rb +50 -0
- data/lib/scbi_mapreduce/manager.rb +110 -0
- data/lib/scbi_mapreduce/work_manager.rb +405 -0
- data/lib/scbi_mapreduce/worker.rb +163 -0
- data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
- data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/skeleton/dummy_calcs/README.txt +25 -0
- data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
- data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
- data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
- data/skeleton/dummy_calcs/main.rb +67 -0
- data/skeleton/dummy_calcs/my_worker.rb +56 -0
- data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
- data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
- data/skeleton/remove_mids/README.txt +30 -0
- data/skeleton/remove_mids/launch_only_workers.rb +29 -0
- data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
- data/skeleton/remove_mids/lib/find_mids.rb +191 -0
- data/skeleton/remove_mids/lib/global_match.rb +97 -0
- data/skeleton/remove_mids/linear_implementation.rb +87 -0
- data/skeleton/remove_mids/main.rb +89 -0
- data/skeleton/remove_mids/my_worker.rb +59 -0
- data/skeleton/remove_mids/my_worker_manager.rb +68 -0
- data/skeleton/simple/README.txt +16 -0
- data/skeleton/simple/main.rb +41 -0
- data/skeleton/simple/my_worker.rb +53 -0
- data/skeleton/simple/my_worker_manager.rb +55 -0
- data/test/drb_test/main.rb +31 -0
- data/test/drb_test/my_worker.rb +36 -0
- data/test/drb_test/my_worker_manager.rb +41 -0
- data/test/drb_test/scbi_drb_checkpoint +1 -0
- data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
- data/test/test_helper.rb +3 -0
- data/test/test_scbi_drb.rb +11 -0
- metadata +127 -0
    
        data/History.txt
    ADDED
    
    | @@ -0,0 +1,49 @@ | |
| 1 | 
            +
            === 0.0.29 2011-06-13
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            First rubygems release
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            === 0.0.21 2011-05-19
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            added zlib serialization
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            === 0.0.20 2011-05-18
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            Own serializer
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            === 0.0.19 2011-05-11
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            Added dummy_calculations skeleton
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            === 0.0.18 2011-05-11
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            Added fibo skeleton
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            === 0.0.17 2011-05-09
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            New sequence Skeleton
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            === 0.0.16 2011-05-05
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            Automatically create log directory
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            === 0.0.4 2010-08-26
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            * 1 minor fix:
         | 
| 32 | 
            +
              * add initial config interchange between server and workers
         | 
| 33 | 
            +
             | 
| 34 | 
            +
             | 
| 35 | 
            +
            === 0.0.3 2010-08-24
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            * 1 minor fix:
         | 
| 38 | 
            +
              * changed start_worker from post_init
         | 
| 39 | 
            +
             | 
| 40 | 
            +
             | 
| 41 | 
            +
            === 0.0.2 2010-08-06
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            * 1 minor fix:
         | 
| 44 | 
            +
              * changed logs names
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            === 0.0.1 2010-06-11
         | 
| 47 | 
            +
             | 
| 48 | 
            +
            * 1 major enhancement:
         | 
| 49 | 
            +
              * Initial release
         | 
    
        data/Manifest.txt
    ADDED
    
    | @@ -0,0 +1,46 @@ | |
| 1 | 
            +
            History.txt
         | 
| 2 | 
            +
            lib/scbi_mapreduce/error_handler.rb
         | 
| 3 | 
            +
            lib/scbi_mapreduce/main_worker.rb
         | 
| 4 | 
            +
            lib/scbi_mapreduce/manager.rb
         | 
| 5 | 
            +
            lib/scbi_mapreduce/work_manager.rb
         | 
| 6 | 
            +
            lib/scbi_mapreduce/worker.rb
         | 
| 7 | 
            +
            lib/scbi_mapreduce/zlib_serializer.rb
         | 
| 8 | 
            +
            lib/scbi_mapreduce/worker_launcher.rb
         | 
| 9 | 
            +
            lib/scbi_mapreduce.rb
         | 
| 10 | 
            +
            Manifest.txt
         | 
| 11 | 
            +
            PostInstall.txt
         | 
| 12 | 
            +
            Rakefile
         | 
| 13 | 
            +
            README.rdoc
         | 
| 14 | 
            +
            script/console
         | 
| 15 | 
            +
            script/destroy
         | 
| 16 | 
            +
            script/generate
         | 
| 17 | 
            +
            test/drb_test/logs
         | 
| 18 | 
            +
            test/drb_test/main.rb
         | 
| 19 | 
            +
            test/drb_test/my_worker.rb
         | 
| 20 | 
            +
            test/drb_test/my_worker_manager.rb
         | 
| 21 | 
            +
            test/drb_test/scbi_drb_checkpoint
         | 
| 22 | 
            +
            test/drb_test/scbi_mapreduce_checkpoint
         | 
| 23 | 
            +
            test/test_helper.rb
         | 
| 24 | 
            +
            test/test_scbi_drb.rb
         | 
| 25 | 
            +
            bin/scbi_mapreduce
         | 
| 26 | 
            +
            skeleton/simple/main.rb
         | 
| 27 | 
            +
            skeleton/simple/my_worker.rb
         | 
| 28 | 
            +
            skeleton/simple/my_worker_manager.rb
         | 
| 29 | 
            +
            skeleton/simple/README.txt
         | 
| 30 | 
            +
            skeleton/remove_mids/launch_only_workers.rb
         | 
| 31 | 
            +
            skeleton/remove_mids/lib/db/mids.fasta
         | 
| 32 | 
            +
            skeleton/remove_mids/lib/find_mids.rb
         | 
| 33 | 
            +
            skeleton/remove_mids/lib/global_match.rb
         | 
| 34 | 
            +
            skeleton/remove_mids/linear_implementation.rb
         | 
| 35 | 
            +
            skeleton/remove_mids/main.rb
         | 
| 36 | 
            +
            skeleton/remove_mids/my_worker.rb
         | 
| 37 | 
            +
            skeleton/remove_mids/my_worker_manager.rb
         | 
| 38 | 
            +
            skeleton/remove_mids/README.txt
         | 
| 39 | 
            +
            skeleton/dummy_calcs/lib/calculations.rb
         | 
| 40 | 
            +
            skeleton/dummy_calcs/lib/thread_pool.rb
         | 
| 41 | 
            +
            skeleton/dummy_calcs/linear_implementation.rb
         | 
| 42 | 
            +
            skeleton/dummy_calcs/main.rb
         | 
| 43 | 
            +
            skeleton/dummy_calcs/my_worker.rb
         | 
| 44 | 
            +
            skeleton/dummy_calcs/my_worker_manager.rb
         | 
| 45 | 
            +
            skeleton/dummy_calcs/README.txt
         | 
| 46 | 
            +
            skeleton/dummy_calcs/threads_implementation.rb
         | 
    
        data/PostInstall.txt
    ADDED
    
    
    
        data/README.rdoc
    ADDED
    
    | @@ -0,0 +1,295 @@ | |
| 1 | 
            +
            = scbi_mapreduce
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            * http://www.scbi.uma.es/downloads
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            == DESCRIPTION:
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            scbi_mapreduce brings parallel and distributed computing capabilities to your code, with a very easy to use framework that allows you to exploit your clustered or cloud computational resources.
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            == FEATURES:
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            scbi_mapreduce provides black-box distributed programming. Users only need to code some predefined methods in order to achieve distribution. Programming remains sequential at user level (this avoids the hassle of handling threads or processes).
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            When a project using scbi_mapreduce is run, a Manager process and a bunch of workers are created (workers can be on different machines). Manager will dispatch new data to available workers (mapping phase), each worker receives its data, manipulates it and returns the data again to Manager that will aggregate it as desired (reduction phase).
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            The manager is always waiting for worker connections or requests. When a new worker connects, it automatically receives some initial params from the server. After the initial configuration, each worker receives a first chunk of work data. Once a worker has done its job with the received data, it sends the results back to the manager, the manager saves the data, and sends a new assignment to the worker. This process is repeated until the manager has no more data to be processed.
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            === Some cool features of scbi_mapreduce are:
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            - Automatic project creation using a generator and templates (you only need to modify some methods since a scaffold is automatically created for you)
         | 
| 20 | 
            +
            - Variable data-chunksizes: data can be grouped on variable size chunks in order to optimize network transfers and processing
         | 
| 21 | 
            +
            - Fixed order: order of input data can be maintained after the parallel execution (uses a cache to store out of order data until it is needed)
         | 
| 22 | 
            +
            - Checkpoint: current processing status can be committed to disk, allowing an interrupted job to be resumed from the last committed point
         | 
| 23 | 
            +
            - Compression: data transfers can be automatically compressed
         | 
| 24 | 
            +
            - Encryption: data transfers can be automatically encrypted
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            === Worker-specific features:
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            - Workers are automatically spawned over the cluster (be sure to configure automatic login via ssh with ssh keys)
         | 
| 29 | 
            +
            - Additional workers can be launched/stopped at any time
         | 
| 30 | 
            +
            - Workers can be executed over a mixture of architectures and operating systems simultaneously (x86 64, ia64, i686 - OSX, Linux, UNIX)
         | 
| 31 | 
            +
            - Workers of different speeds work at full capacity all the time, without producing delays on faster workers
         | 
| 32 | 
            +
            - scbi_mapreduce uses tcp/ip and because of that it can be used over a wide variety of interconnection networks (ethernet, Gigabit, InfiniBand, Myrinet, optic-fiber with ip, etc...), and of course, over the internet (although performance will be restricted by network latency and speed)
         | 
| 33 | 
            +
            - High work throughput. About 18000 works (1 kb of data) per second with a single core manager
         | 
| 34 | 
            +
            - Number of workers is highly scalable. Tested with up to 80 distributed cores.
         | 
| 35 | 
            +
            - Same solution works on standalone machines, clusters, cloud, SMP machines, or a mixture of them
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            === Other features
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            - Exhaustive log option: manager and per-worker logs are very useful at development stages
         | 
| 40 | 
            +
            - Processing stats: scbi_mapreduce calculates individual performance statistics for each worker and a global one for manager process.
         | 
| 41 | 
            +
            - scbi_mapreduce makes use of evented IO (EventMachine), which makes it efficient at networked I/O operations
         | 
| 42 | 
            +
            - Reduced disk I/O: data is read only once, subsequent transfers and splitting are done in RAM (this is very appropriate when disk I/O is already quoted in Cloud or pay per use services)
         | 
| 43 | 
            +
            - There is no need to use shared storage, (although the software must be installed on all worker machines)
         | 
| 44 | 
            +
            - Worker error handling: when an exception raises in a worker, it is reported to manager, where it can be handled appropriately
         | 
| 45 | 
            +
            - High error rate aborting: if a high error rate is detected, execution is aborted in order to preserve computational resources so the user doesn’t need to execute the whole dataset to find that there was a programming mistake (very useful with pay per use services)
         | 
| 46 | 
            +
             | 
| 47 | 
            +
             | 
| 48 | 
            +
            scbi_mapreduce has been tested on production with PBS and Moab/Slurm queue systems, but it can be easily adapted to other ones.
         | 
| 49 | 
            +
             | 
| 50 | 
            +
            == SYNOPSIS:
         | 
| 51 | 
            +
             | 
| 52 | 
            +
            scbi_mapreduce provides an automated code generator like rails. To use it, you only need to issue this command:
         | 
| 53 | 
            +
             | 
| 54 | 
            +
              scbi_mapreduce app_name template
         | 
| 55 | 
            +
              
         | 
| 56 | 
            +
            E.g.: To create a simple app demo (other templates are available, to list them execute scbi_mapreduce without arguments):
         | 
| 57 | 
            +
             | 
| 58 | 
            +
              scbi_mapreduce my_app simple
         | 
| 59 | 
            +
              
         | 
| 60 | 
            +
            A full project template will be created for you with (at least) the following files:
         | 
| 61 | 
            +
             | 
| 62 | 
            +
              my_app/main.rb
         | 
| 63 | 
            +
              my_app/my_worker.rb
         | 
| 64 | 
            +
              my_app/my_worker_manager.rb
         | 
| 65 | 
            +
              my_app/README.txt
         | 
| 66 | 
            +
             | 
| 67 | 
            +
             | 
| 68 | 
            +
            You can run main.rb as any other ruby script.
         | 
| 69 | 
            +
             | 
| 70 | 
            +
              cd my_app
         | 
| 71 | 
            +
              ruby main.rb
         | 
| 72 | 
            +
              
         | 
| 73 | 
            +
            Now that everything is working, you must modify +my_worker+ and +my_worker_manager+ in order to do the desired work.
         | 
| 74 | 
            +
             | 
| 75 | 
            +
            === my_worker_manager.rb
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            In my_worker_manager you open input files, split data into chunks that are automatically sent to workers, and later on write data down to disk once workers have finished processing it. Here are the basic methods that can be personalized.
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            The most important ones are +next_work+ (where data is split into chunks), and +work_received+ (where processed data is received from workers):
         | 
| 80 | 
            +
             | 
| 81 | 
            +
              # next_work method is called every time a worker needs a new work
         | 
| 82 | 
            +
              # Here you can read data from disk
         | 
| 83 | 
            +
              # This method must return the work data or nil if no more data is available
         | 
| 84 | 
            +
              def next_work
         | 
| 85 | 
            +
                @@remaining_data -= 1
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                e = @@basic_string
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                e = nil if @@remaining_data<0
         | 
| 90 | 
            +
                return e
         | 
| 91 | 
            +
             | 
| 92 | 
            +
              end
         | 
| 93 | 
            +
              
         | 
| 94 | 
            +
            -
         | 
| 95 | 
            +
             | 
| 96 | 
            +
              # work_received is executed each time a worker has finished a job.
         | 
| 97 | 
            +
              # Here you can write results down to disk, perform some aggregated statistics, etc...
         | 
| 98 | 
            +
              def work_received(results)
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                # write_data_to_disk(results)
         | 
| 101 | 
            +
              end
         | 
| 102 | 
            +
             | 
| 103 | 
            +
             | 
| 104 | 
            +
            There are also some other methods that can be used to send initial configuration parameters, open and close files, etc...
         | 
| 105 | 
            +
             | 
| 106 | 
            +
              # init_work_manager is executed at the start, prior to any processing.
         | 
| 107 | 
            +
              # You can use init_work_manager to initialize global variables, open files, etc...
         | 
| 108 | 
            +
              # Note that an instance of MyWorkerManager will be created for each
         | 
| 109 | 
            +
              # worker connection, and thus, all global variables here should be
         | 
| 110 | 
            +
              # class variables (starting with @@)
         | 
| 111 | 
            +
              def self.init_work_manager
         | 
| 112 | 
            +
              
         | 
| 113 | 
            +
                # use 200000 strings
         | 
| 114 | 
            +
                @@remaining_data = 200000
         | 
| 115 | 
            +
              
         | 
| 116 | 
            +
                # of 1024 characters each
         | 
| 117 | 
            +
                @@basic_string='a'*1024
         | 
| 118 | 
            +
             | 
| 119 | 
            +
              end
         | 
| 120 | 
            +
             | 
| 121 | 
            +
            -
         | 
| 122 | 
            +
             | 
| 123 | 
            +
              # end_work_manager is executed at the end, when all the process is done.
         | 
| 124 | 
            +
              # You can use it to close files opened in init_work_manager
         | 
| 125 | 
            +
              def self.end_work_manager
         | 
| 126 | 
            +
             | 
| 127 | 
            +
              end
         | 
| 128 | 
            +
             | 
| 129 | 
            +
            -
         | 
| 130 | 
            +
             | 
| 131 | 
            +
              # worker_initial_config is used to send initial parameters to workers.
         | 
| 132 | 
            +
              # The method is executed once per each worker
         | 
| 133 | 
            +
              def worker_initial_config
         | 
| 134 | 
            +
             | 
| 135 | 
            +
              end
         | 
| 136 | 
            +
             | 
| 137 | 
            +
             
         | 
| 138 | 
            +
            === my_worker.rb
         | 
| 139 | 
            +
             | 
| 140 | 
            +
            The main method that needs to be modified on my_worker.rb is +process_object+. It is executed each time new data is available, and is where the real distributed processing takes place since it is executed simultaneously on different machines.
         | 
| 141 | 
            +
             | 
| 142 | 
            +
              def process_object(objs)
         | 
| 143 | 
            +
             | 
| 144 | 
            +
                # iterate over all objects received
         | 
| 145 | 
            +
                objs.each do |obj|
         | 
| 146 | 
            +
                  # convert to uppercase
         | 
| 147 | 
            +
                  obj.upcase!
         | 
| 148 | 
            +
                end
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                # return objs back to manager
         | 
| 151 | 
            +
                return objs
         | 
| 152 | 
            +
              end
         | 
| 153 | 
            +
             | 
| 154 | 
            +
             | 
| 155 | 
            +
            There are other useful methods:
         | 
| 156 | 
            +
             | 
| 157 | 
            +
               # starting_worker method is called one time at initialization
         | 
| 158 | 
            +
               # and allows you to initialize your variables
         | 
| 159 | 
            +
               def starting_worker
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                 # You can use worker logs at any time in this way:
         | 
| 162 | 
            +
                 # $WORKER_LOG.info "Starting a worker"
         | 
| 163 | 
            +
             | 
| 164 | 
            +
               end
         | 
| 165 | 
            +
             | 
| 166 | 
            +
            -
         | 
| 167 | 
            +
             | 
| 168 | 
            +
               # receive_initial_config is called only once just after
         | 
| 169 | 
            +
               # the first connection, when initial parameters are
         | 
| 170 | 
            +
               # received from manager
         | 
| 171 | 
            +
               def receive_initial_config(parameters)
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                 # Reads the parameters
         | 
| 174 | 
            +
             | 
| 175 | 
            +
                 # You can use worker logs at any time in this way:
         | 
| 176 | 
            +
                 # $WORKER_LOG.info "Params received"
         | 
| 177 | 
            +
             | 
| 178 | 
            +
                 # save received parameters, if any
         | 
| 179 | 
            +
                 # @params = parameters
         | 
| 180 | 
            +
               end
         | 
| 181 | 
            +
             | 
| 182 | 
            +
            -
         | 
| 183 | 
            +
             | 
| 184 | 
            +
               # process_object method is called for each received object.
         | 
| 185 | 
            +
               # Be aware that objs is always an array, and you must iterate
         | 
| 186 | 
            +
               # over it if you need to process it independently
         | 
| 187 | 
            +
               #
         | 
| 188 | 
            +
               # The value returned here will be received by the work_received
         | 
| 189 | 
            +
               # method at your worker_manager subclass.
         | 
| 190 | 
            +
               def process_object(objs)
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                 # iterate over all objects received
         | 
| 193 | 
            +
                 objs.each do |obj|
         | 
| 194 | 
            +
             | 
| 195 | 
            +
                   # convert to uppercase
         | 
| 196 | 
            +
                   obj.upcase!
         | 
| 197 | 
            +
                 end
         | 
| 198 | 
            +
             | 
| 199 | 
            +
                 # return objs back to manager
         | 
| 200 | 
            +
                 return objs
         | 
| 201 | 
            +
               end
         | 
| 202 | 
            +
             | 
| 203 | 
            +
            -
         | 
| 204 | 
            +
             | 
| 205 | 
            +
               # called once, when the worker is about to be closed
         | 
| 206 | 
            +
               def closing_worker
         | 
| 207 | 
            +
             | 
| 208 | 
            +
               end
         | 
| 209 | 
            +
             
         | 
| 210 | 
            +
            === main.rb
         | 
| 211 | 
            +
             | 
| 212 | 
            +
             | 
| 213 | 
            +
            main.rb is where the manager and workers are launched. Here you define the listening ip and port.
         | 
| 214 | 
            +
             | 
| 215 | 
            +
              # listen on all ips at port 50000
         | 
| 216 | 
            +
              ip='0.0.0.0'
         | 
| 217 | 
            +
              port = 50000
         | 
| 218 | 
            +
              
         | 
| 219 | 
            +
            If you are using a cluster and thus don't know where manager will be executed, you can specify the initial part of the ip interface. E.g.: if you specify ip='10.16', scbi_mapreduce will use the network interface that matches this ip.
         | 
| 220 | 
            +
             | 
| 221 | 
            +
            The number of workers can be a number (workers are launched on the same machine as Manager), or a list of machine names, in which case workers are launched via ssh on remote machines and automatically connected to Manager.
         | 
| 222 | 
            +
             | 
| 223 | 
            +
              # set number of workers. You can also provide an array with worker names.
         | 
| 224 | 
            +
              # Those workers names can be read from a file produced by the existing
         | 
| 225 | 
            +
              # queue system, if any.
         | 
| 226 | 
            +
              workers = 8
         | 
| 227 | 
            +
             | 
| 228 | 
            +
            Your worker file will be used to launch workers.
         | 
| 229 | 
            +
             | 
| 230 | 
            +
              # we need the path to my_worker in order to launch it when necessary
         | 
| 231 | 
            +
              custom_worker_file = File.join(File.dirname(__FILE__),'my_worker.rb')
         | 
| 232 | 
            +
             | 
| 233 | 
            +
              # initialize the work manager. Here you can pass parameters like file names
         | 
| 234 | 
            +
              MyWorkerManager.init_work_manager
         | 
| 235 | 
            +
             | 
| 236 | 
            +
              # launch processor server
         | 
| 237 | 
            +
              mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)
         | 
| 238 | 
            +
             | 
| 239 | 
            +
            You can also set additional properties:
         | 
| 240 | 
            +
             | 
| 241 | 
            +
             | 
| 242 | 
            +
              # if you want basic checkpointing. Some performance drop should be expected
         | 
| 243 | 
            +
              # mgr.checkpointing=true
         | 
| 244 | 
            +
             | 
| 245 | 
            +
              # if you want to keep the order of input data. Some performance drop should be expected
         | 
| 246 | 
            +
              # mgr.keep_order=true
         | 
| 247 | 
            +
             | 
| 248 | 
            +
              # you can set the size of packets of data sent to workers
         | 
| 249 | 
            +
              mgr.chunk_size=100
         | 
| 250 | 
            +
             | 
| 251 | 
            +
             | 
| 252 | 
            +
            And finally, start the server:
         | 
| 253 | 
            +
             | 
| 254 | 
            +
              # start processing
         | 
| 255 | 
            +
              mgr.start_server
         | 
| 256 | 
            +
             | 
| 257 | 
            +
             | 
| 258 | 
            +
              # this line is reached when all data has been processed
         | 
| 259 | 
            +
              puts "Program finished"
         | 
| 260 | 
            +
             | 
| 261 | 
            +
             | 
| 262 | 
            +
            == REQUIREMENTS:
         | 
| 263 | 
            +
             | 
| 264 | 
            +
            * Ruby 1.9.2 (you can install it by: rvm install 1.9.2)
         | 
| 265 | 
            +
            * OSX, Linux, UNIX and other UNIX-like operating systems. (Windows may work if ssh is available to spawn jobs. Not tested)
         | 
| 266 | 
            +
            * eventmachine gem (is automatically installed)
         | 
| 267 | 
            +
             | 
| 268 | 
            +
            == INSTALL:
         | 
| 269 | 
            +
             | 
| 270 | 
            +
            * gem install scbi_mapreduce
         | 
| 271 | 
            +
             | 
| 272 | 
            +
            == LICENSE:
         | 
| 273 | 
            +
             | 
| 274 | 
            +
            (The MIT License)
         | 
| 275 | 
            +
             | 
| 276 | 
            +
            Copyright (c) 2010 Dario Guerrero
         | 
| 277 | 
            +
             | 
| 278 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining
         | 
| 279 | 
            +
            a copy of this software and associated documentation files (the
         | 
| 280 | 
            +
            'Software'), to deal in the Software without restriction, including
         | 
| 281 | 
            +
            without limitation the rights to use, copy, modify, merge, publish,
         | 
| 282 | 
            +
            distribute, sublicense, and/or sell copies of the Software, and to
         | 
| 283 | 
            +
            permit persons to whom the Software is furnished to do so, subject to
         | 
| 284 | 
            +
            the following conditions:
         | 
| 285 | 
            +
             | 
| 286 | 
            +
            The above copyright notice and this permission notice shall be
         | 
| 287 | 
            +
            included in all copies or substantial portions of the Software.
         | 
| 288 | 
            +
             | 
| 289 | 
            +
            THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
         | 
| 290 | 
            +
            EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         | 
| 291 | 
            +
            MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
         | 
| 292 | 
            +
            IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
         | 
| 293 | 
            +
            CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
         | 
| 294 | 
            +
            TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
         | 
| 295 | 
            +
            SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         | 
    
        data/Rakefile
    ADDED
    
    | @@ -0,0 +1,28 @@ | |
| 1 | 
            +
            require 'rubygems'
         | 
| 2 | 
            +
            gem 'hoe', '>= 2.1.0'
         | 
| 3 | 
            +
            require 'hoe'
         | 
| 4 | 
            +
            require 'fileutils'
         | 
| 5 | 
            +
            require './lib/scbi_mapreduce'
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            # Hoe plugins: only newgem is enabled; the others are kept for reference.
            Hoe.plugin(:newgem)
            # Hoe.plugin :website
            # Hoe.plugin :cucumberfeatures

            # Declare the gem specification; the Rake tasks are generated from it.
            # Run 'rake -T' from the gem root directory to list the generated tasks.
            $hoe = Hoe.spec('scbi_mapreduce') do
              developer('Dario Guerrero', 'dariogf@gmail.com')
              self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
              self.rubyforge_name = name # TODO this is default value
              # self.extra_deps = [['activesupport','>= 2.0.2']]
              self.extra_deps = [['eventmachine', '>= 0.12.0']]
            end

            # Pull in newgem's tasks, then every project-local .rake file under tasks/.
            require 'newgem/tasks'
            Dir.glob('tasks/**/*.rake').each do |rake_file|
              load rake_file
            end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            # TODO - want other tests/tasks run by default? Add them to the list
         | 
| 27 | 
            +
            # remove_task :default
         | 
| 28 | 
            +
            # task :default => [:spec, :features]
         | 
    
        data/bin/scbi_mapreduce
    ADDED
    
    | @@ -0,0 +1,52 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Creates a new scbi_mapreduce demo application by copying one of the
            # bundled skeleton templates into a new directory.
            #
            # Usage: scbi_mapreduce app_name template
            #
            # Exit status: 0 after printing the usage text or creating the application,
            # -1 when app_name already exists or the template cannot be found.
            require 'fileutils'
            require 'find'

            # Locate the bundled templates relative to this script. expand_path keeps
            # the lookup correct even when __FILE__ is a bare file name (script run
            # from inside bin/), where File.dirname(File.dirname(__FILE__)) yields '.'.
            source_base = File.expand_path('../../skeleton', __FILE__)

            # Names of the non-hidden entries in the skeleton directory, sorted;
            # an empty list when the directory is missing.
            list_templates = lambda do
              if File.directory?(source_base)
                Dir.entries(source_base).reject { |entry| entry.start_with?('.') }.sort
              else
                []
              end
            end

            if ARGV.size < 2
              puts "Tool to create a scbi_mapreduce demo application that you can use as a template"
              puts
              puts "Usage #{$0} app_name template"
              puts
              puts "E.g.: #{$0} my_app simple"
              puts
              puts "====== AVAILABLE TEMPLATES ======"
              puts list_templates.call
              exit
            end

            app_name, template = ARGV

            # Never overwrite an existing file or directory.
            if File.exist?(app_name)
              puts "#{app_name} already exists, aborting"
              exit(-1)
            end

            template_dir = File.join(source_base, template)

            # Stop early on an unknown template instead of continuing after a failed copy.
            unless File.directory?(template_dir)
              puts "Template #{template} not found, aborting"
              puts
              puts "====== AVAILABLE TEMPLATES ======"
              puts list_templates.call
              exit(-1)
            end

            puts "Creating scbi_mapreduce application: #{app_name}"
            puts
            puts "Creating files:"
            puts "="*20

            # Copy without going through a shell, so names containing spaces or shell
            # metacharacters are handled literally.
            FileUtils.cp_r(template_dir, app_name)

            # List everything that was created, including the top-level directory.
            Find.find(app_name) { |path| puts path }

            # Show the template's README, when it ships one.
            description_file = File.join(template_dir, 'README.txt')

            if File.exist?(description_file)
              puts
              puts File.read(description_file)
            end
         |