bulk-processor 0.5.1 → 0.6.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5938f4133413fe9183607008301eda98d5961a3a
-  data.tar.gz: 60f3090d882a96290250a59ccce5d1ff7bd49139
+  metadata.gz: 77320b807b3cd9862490408058611d9b461cf83f
+  data.tar.gz: 203e565ab7f722c6f639527b4065e2e5f495aa57
 SHA512:
-  metadata.gz: 5541b7a3bc23bccd4842fbf7a5439ac40d06a0497ac6cf033368546c59b72f0d7e3d637e6e063597ad726d3ea48928fdbf6631e6d7d647719469336afcfed1cd
-  data.tar.gz: 1538237aafa5c71cb1c8a04f64eb0b921dae8aa4153235ba8386fd01e4a43478c9a7074b7ffc12eb656ef4cd1a1916dbc11b531a55c1c463e87e616c52f27489
+  metadata.gz: 4f89dd796184485f44d0018a9819cf42ec5c147ff20693c63e7055b43fd2e0807e5b268f622e8555157e1ac36b01207c9b23fe6f0212eb2a744924b6e3533d91
+  data.tar.gz: 2373584252697f040d460070a93958cc944c2f0045948233125f4e0ee06d39daf5080191dd0f07e6ad0432e461c7947e01e821a5e81e7dbc994c558571a1da44
data/README.md CHANGED
@@ -65,7 +65,7 @@ The CSV file passed to BulkProcessor will be persisted on AWS S3 so that the job
 can access it. This requires configuring AWS credentials, the S3 bucket in which
 to store the file, and a local temp directory to hold the file locally.
 
-### Setting up the processor and handler
+### Setting up the processor
 
 You will need to supply a class for CSV processing. This class must respond to the
 `start` instance method, the `required_columns` and `optional_columns` class methods,
@@ -229,6 +229,67 @@ else
 end
 ```
 
+#### Parallelization
+
+For larger CSV files, you may wish to process rows in parallel. You can scale up
+to an arbitrary number of parallel processes by passing an optional argument to
+`#start`. Doing so splits the input CSV file into *N* smaller CSV files, each of
+which is processed in its own process. Note that if you use a boundary column
+(described below), the file *must* be sorted by that column for the split to
+behave as intended.
+
+```ruby
+processor = BulkProcessor.new(
+  key: file_name,
+  stream: file_stream,
+  processor_class: PetCSVProcessor,
+  payload: { recipient: current_user.email }
+)
+if processor.start(5)
+  # Split the main CSV into 5 smaller files and process in parallel.
+else
+  # Something went wrong, alert the file uploader
+  handle_invalid_file(processor.errors)
+end
+```
+
+By default, the file is split into equal-sized partitions. If you need every row
+that shares a value in a particular column to land in the same partition, define
+`.boundary_column` on the processor class to return the name of that column. E.g.
+
+```csv
+pet_id,meal,meal_date
+1,kibble,2015-11-02
+1,bits,2015-11-03
+...
+1,alpo,2015-12-31
+2,alpo,2015-11-01
+...
+```
+
+```ruby
+class PetCSVProcessor
+  def self.boundary_column
+    'pet_id'
+  end
+  ...
+end
+```
+
+Finally, to be notified of any failures during splitting, define `.handler_class`
+on your processor class to return a class that implements the Handler role. If an
+error is raised while splitting, `#fail!` will be called on that handler with the
+error.
+
+```ruby
+class PetCSVProcessor
+  def self.handler_class
+    PetHandler
+  end
+  ...
+end
+```
+
 ### BulkProcessor::CSVProcessor::Result
 
 The result instances passed from BulkProcessor::CSVProcessor to the Handler
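
Putting the new hooks together, a 0.6.0 processor might look like the sketch below. This is an illustration rather than code from the gem: `PetHandler` is assumed to implement the Handler role, and the column names simply mirror the CSV example above.

```ruby
# Hypothetical processor wiring up the parallelization hooks.
class PetCSVProcessor
  def self.required_columns
    %w(pet_id meal meal_date)
  end

  def self.optional_columns
    []
  end

  # Rows sharing a pet_id land in the same chunk; the CSV must be sorted by pet_id.
  def self.boundary_column
    'pet_id'
  end

  # Receives #fail!(error) if splitting the file raises.
  def self.handler_class
    PetHandler
  end

  def initialize(csv, payload:)
    @csv = csv
    @payload = payload
  end

  def start
    @csv.each do |row|
      # ... process one row ...
    end
  end
end
```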
data/lib/bulk_processor/back_end/active_job.rb CHANGED
@@ -1,25 +1,25 @@
 class BulkProcessor
   module BackEnd
+    # Execute jobs via ActiveJob, e.g. Resque
     class ActiveJob
-      def initialize(processor_class:, payload:, file_class:, key:)
+      def initialize(processor_class:, payload:, key:)
         @processor_class = processor_class
         @payload = payload
-        @file_class = file_class
         @key = key
       end
 
       def start
-        Job.perform_later(
-          processor_class.name,
-          PayloadSerializer.serialize(payload),
-          file_class.name,
-          key
-        )
+        Job::ProcessCSV.perform_later(processor_class.name, payload, key)
+      end
+
+      def split(num_processes)
+        Job::SplitCSV.perform_later(processor_class.name, payload,
+                                    key, num_processes)
       end
 
       private
 
-      attr_reader :processor_class, :payload, :file_class, :key
+      attr_reader :processor_class, :payload, :key
     end
   end
 end
data/lib/bulk_processor/back_end/dynosaur.rb CHANGED
@@ -2,11 +2,11 @@ require 'dynosaur'
 
 class BulkProcessor
   module BackEnd
+    # Execute jobs via rake tasks that will spawn a new Heroku dyno
     class Dynosaur
-      def initialize(processor_class:, payload:, file_class:, key:)
+      def initialize(processor_class:, payload:, key:)
         @processor_class = processor_class
         @payload = payload
-        @file_class = file_class
         @key = key
         configure_dynosaur
       end
@@ -14,19 +14,22 @@ class BulkProcessor
       def start
         args = {
           task: 'bulk_processor:start',
-          args: [
-            processor_class.name,
-            PayloadSerializer.serialize(payload),
-            file_class.name,
-            key
-          ]
+          args: [processor_class.name, payload, key]
+        }
+        ::Dynosaur::Process::Heroku.new(args).start
+      end
+
+      def split(num_processes)
+        args = {
+          task: 'bulk_processor:split',
+          args: [processor_class.name, payload, key, num_processes]
         }
         ::Dynosaur::Process::Heroku.new(args).start
       end
 
       private
 
-      attr_reader :processor_class, :payload, :file_class, :key
+      attr_reader :processor_class, :payload, :key
 
       def configure_dynosaur
         ::Dynosaur::Client::HerokuClient.configure do |config|
data/lib/bulk_processor/back_end.rb CHANGED
@@ -1,14 +1,13 @@
 class BulkProcessor
   module BackEnd
     class << self
-      def start(processor_class:, payload:, file_class:, key:)
+      def start(processor_class:, payload:, key:, num_processes: 1)
         back_end = back_end_class.new(
           processor_class: processor_class,
-          payload: payload,
-          file_class: file_class,
+          payload: PayloadSerializer.serialize(payload),
           key: key
         )
-        back_end.start
+        num_processes > 1 ? back_end.split(num_processes) : back_end.start
       end
 
      private
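
For orientation, a hedged sketch of how the new `num_processes:` keyword changes dispatch. `BackEnd.start` is normally invoked internally by `BulkProcessor#start_backend`; the processor class and payload here are placeholders.

```ruby
# num_processes omitted (or 1): a single ProcessCSV job handles the whole file.
BulkProcessor::BackEnd.start(
  processor_class: PetCSVProcessor,
  payload: { 'recipient' => 'owner@example.com' },
  key: 'pets.csv'
)

# num_processes > 1: a SplitCSV job is enqueued instead, which later enqueues
# one ProcessCSV job per chunk.
BulkProcessor::BackEnd.start(
  processor_class: PetCSVProcessor,
  payload: { 'recipient' => 'owner@example.com' },
  key: 'pets.csv',
  num_processes: 5
)
```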
data/lib/bulk_processor/config.rb CHANGED
@@ -2,12 +2,17 @@ class BulkProcessor
   # Store configuration data set by clients
   class Config
     attr_reader :queue_adapter
+    attr_writer :file_class
     attr_accessor :back_end, :temp_directory
 
     def queue_adapter=(adapter)
       ActiveJob::Base.queue_adapter = @queue_adapter = adapter
     end
 
+    def file_class
+      @file_class || BulkProcessor::S3File
+    end
+
     def aws
       @aws ||= Struct.new(:access_key_id, :secret_access_key, :bucket).new
     end
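
The new `file_class` setting defaults to `BulkProcessor::S3File` and can be overridden via the writer added above. A minimal sketch, where `LocalFile` is a hypothetical class exposing the same interface the gem relies on (`write`, `open`, `exists?`, `delete`):

```ruby
BulkProcessor.config.file_class              # => BulkProcessor::S3File (default)

# e.g. in test or development, swap in a local-disk implementation
BulkProcessor.config.file_class = LocalFile  # LocalFile is hypothetical
BulkProcessor.config.file_class              # => LocalFile
```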
data/lib/bulk_processor/file_splitter.rb ADDED
@@ -0,0 +1,59 @@
+class BulkProcessor
+  # Split a CSV file on S3 using the specified chunker
+  class FileSplitter
+    def initialize(key:, row_chunker:)
+      @key = key
+      @row_chunker = row_chunker
+    end
+
+    # Generate multiple files on S3, composed of chunks of the input file.
+    #
+    # @return [Array<String>] the S3 keys for each new file
+    def split!
+      return @keys if instance_variable_defined?('@keys')
+      ranges = row_chunker.ranges_for(input_csv)
+      @keys = ranges.map.with_index do |range, index|
+        chunk_key = key_from_index(index, ranges.count)
+        contents = csv_from_range(range)
+        BulkProcessor.config.file_class.new(chunk_key).write(contents)
+        chunk_key
+      end
+    end
+
+    private
+
+    attr_reader :key, :row_chunker
+
+    def headers
+      input_csv.headers
+    end
+
+    def input_csv
+      return @input_csv if instance_variable_defined?('@input_csv')
+      BulkProcessor.config.file_class.new(key).open do |input_file|
+        @input_csv = CSV.parse(input_file, headers: true)
+      end
+      @input_csv
+    end
+
+    def csv_from_range(range)
+      return CSV.generate { |csv| csv << headers } if range.count == 0
+      CSV.generate(headers: headers, write_headers: true) do |csv|
+        range.each { |row_num| csv << input_csv[row_num] }
+      end
+    end
+
+    def key_from_index(index, total)
+      parts = key.split('.')
+      if parts.length == 1
+        name_part = key
+        ext_part = ''
+      else
+        name_part = parts[0..-2].join('.')
+        ext_part = ".#{parts.last}"
+      end
+
+      "#{name_part}_#{index + 1}-of-#{total}#{ext_part}"
+    end
+  end
+end
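
A usage sketch following directly from the code above: `key_from_index` derives chunk keys from the original key, so a three-way split of `pets.csv` produces the names shown in the comment.

```ruby
splitter = BulkProcessor::FileSplitter.new(
  key: 'pets.csv',
  row_chunker: BulkProcessor::RowChunker::Balanced.new(3)
)
splitter.split!
# => ['pets_1-of-3.csv', 'pets_2-of-3.csv', 'pets_3-of-3.csv']
# Each chunk is written via BulkProcessor.config.file_class and contains the
# header row plus its slice of the input; a second call returns the memoized keys.
```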
data/lib/bulk_processor/job/process_csv.rb ADDED
@@ -0,0 +1,22 @@
+require 'active_job'
+
+class BulkProcessor
+  # ActiveJob to handle processing the CSV in the background
+  module Job
+    class ProcessCSV < ActiveJob::Base
+      queue_as 'bulk_processor'
+
+      def perform(processor_class, payload, key)
+        file = BulkProcessor.config.file_class.new(key)
+        payload = PayloadSerializer.deserialize(payload).merge('key' => key)
+        file.open do |f|
+          csv = CSV.parse(f.read, headers: true)
+          processor = processor_class.constantize.new(csv, payload: payload)
+          processor.start
+        end
+      ensure
+        file.try(:delete)
+      end
+    end
+  end
+end
data/lib/bulk_processor/job/split_csv.rb ADDED
@@ -0,0 +1,41 @@
+require 'active_job'
+
+class BulkProcessor
+  # ActiveJob to handle splitting the CSV in the background
+  module Job
+    class SplitCSV < ActiveJob::Base
+      queue_as 'bulk_processor'
+
+      def perform(processor_class, payload, key, num_chunks)
+        processor_class = processor_class.constantize
+        chunker = row_chunker(processor_class, num_chunks)
+        payload = PayloadSerializer.deserialize(payload)
+        splitter = FileSplitter.new(key: key, row_chunker: chunker)
+        keys = splitter.split!
+        keys.each do |key|
+          BackEnd.start(processor_class: processor_class, payload: payload, key: key)
+        end
+      rescue Exception => error
+        if processor_class.respond_to?(:handler_class)
+          payload = payload.merge('key' => key)
+          handler = processor_class.handler_class.new(payload: payload, results: [])
+          handler.fail!(error)
+        end
+        raise
+      ensure
+        BulkProcessor.config.file_class.new(key).delete
+      end
+
+      private
+
+      def row_chunker(processor_class, num_chunks)
+        if processor_class.respond_to?(:boundary_column)
+          boundary_column = processor_class.boundary_column
+          RowChunker::Boundary.new(num_chunks, boundary_column: boundary_column)
+        else
+          RowChunker::Balanced.new(num_chunks)
+        end
+      end
+    end
+  end
+end
data/lib/bulk_processor/row_chunker/balanced.rb ADDED
@@ -0,0 +1,29 @@
+class BulkProcessor
+  module RowChunker
+    # Determine the partitions for a balanced break up of the input CSV file.
+    # All partitions will have a size within 1 row of every other partition.
+    class Balanced
+      def initialize(num_chunks)
+        @num_chunks = num_chunks
+      end
+
+      def ranges_for(csv)
+        ideal_size = csv.count / num_chunks
+        num_chunks.times.map do |index|
+          start_index = index * ideal_size
+          if index == num_chunks - 1
+            # force the last chunk to go to the very last row
+            end_index = csv.count - 1
+          else
+            end_index = start_index + ideal_size - 1
+          end
+          (start_index..end_index)
+        end
+      end
+
+      private
+
+      attr_reader :num_chunks
+    end
+  end
+end
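
A worked example of the arithmetic above: with integer division the final range absorbs any remainder, so a CSV with 10 data rows split into 3 chunks (`ideal_size = 10 / 3 = 3`) yields:

```ruby
chunker = BulkProcessor::RowChunker::Balanced.new(3)
chunker.ranges_for(csv)  # csv parsed with headers, 10 data rows
# => [0..2, 3..5, 6..9]  # the last range always ends at csv.count - 1
```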
data/lib/bulk_processor/row_chunker/boundary.rb ADDED
@@ -0,0 +1,55 @@
+class BulkProcessor
+  module RowChunker
+    # Determine the partitions that ensure all consecutive rows with the same
+    # value for boundary_column are in the same partition. The CSV must be sorted
+    # on this column to get the desired results. This class makes an attempt to
+    # keep the partition sizes equal, but obviously prioritizes the boundary
+    # column values over partition size.
+    class Boundary
+      def initialize(num_chunks, boundary_column:)
+        @num_chunks = num_chunks
+        @boundary_column = boundary_column
+      end
+
+      def ranges_for(csv)
+        @ranges ||= begin
+          # Start with a balanced partition, then make adjustments from there
+          chunker = Balanced.new(num_chunks)
+          adjust_for_boundaries(chunker.ranges_for(csv), csv)
+        end
+      end
+
+      private
+
+      attr_reader :num_chunks, :boundary_column
+
+      def adjust_for_boundaries(balanced_ranges, csv)
+        balanced_endings = balanced_ranges.map(&:last)
+
+        last_indexes = []
+        while balanced_endings.any?
+          last_index = [last_indexes.last, balanced_endings.shift].compact.max
+          last_index += 1 until at_boundary?(csv, last_index)
+          last_indexes << last_index
+        end
+
+        to_ranges(last_indexes)
+      end
+
+      def to_ranges(last_indexes)
+        first_indexes = last_indexes.dup
+        first_indexes.pop
+        first_indexes.map! { |index| index + 1 }
+        first_indexes.unshift(0)
+        first_indexes.map.with_index do |first_index, index|
+          (first_index..last_indexes[index])
+        end
+      end
+
+      def at_boundary?(csv, index)
+        return true if index == csv.count - 1
+        csv[index][boundary_column] != csv[index + 1][boundary_column]
+      end
+    end
+  end
+end
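
A worked example of the boundary adjustment, assuming 6 rows sorted by `pet_id` with the values `1, 1, 1, 1, 2, 2` and 2 chunks:

```ruby
chunker = BulkProcessor::RowChunker::Boundary.new(2, boundary_column: 'pet_id')
chunker.ranges_for(csv)
# The balanced starting point is [0..2, 3..5], but rows 2 and 3 share pet_id 1,
# so the first chunk's end is pushed forward to the boundary at row 3:
# => [0..3, 4..5]
```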
data/lib/bulk_processor/tasks.rb CHANGED
@@ -7,14 +7,23 @@ class BulkProcessor
     def install_tasks
       namespace :bulk_processor do
         desc 'Start processing a CSV file'
-        task :start, [:processor_class, :payload, :file_class, :key] => :environment do |_task, args|
-          Job.new.perform(
+        task :start, [:processor_class, :payload, :key] => :environment do |_task, args|
+          Job::ProcessCSV.new.perform(
            args[:processor_class],
            args[:payload],
-            args[:file_class],
            args[:key]
          )
        end
+
+        desc 'Split a CSV file and process each piece'
+        task :split, [:processor_class, :payload, :key, :num_chunks] => :environment do |_task, args|
+          Job::SplitCSV.new.perform(
+            args[:processor_class],
+            args[:payload],
+            args[:key],
+            args[:num_chunks]
+          )
+        end
       end
     end
   end
data/lib/bulk_processor/version.rb CHANGED
@@ -1,3 +1,3 @@
 class BulkProcessor
-  VERSION = '0.5.1'.freeze
+  VERSION = '0.6.0'.freeze
 end
data/lib/bulk_processor.rb CHANGED
@@ -2,8 +2,12 @@ require 'bulk_processor/back_end'
 require 'bulk_processor/back_end/active_job'
 require 'bulk_processor/back_end/dynosaur'
 require 'bulk_processor/config'
-require 'bulk_processor/job'
+require 'bulk_processor/file_splitter'
+require 'bulk_processor/job/process_csv'
+require 'bulk_processor/job/split_csv'
 require 'bulk_processor/payload_serializer'
+require 'bulk_processor/row_chunker/balanced'
+require 'bulk_processor/row_chunker/boundary'
 require 'bulk_processor/s3_file'
 require 'bulk_processor/stream_encoder'
 require 'bulk_processor/validated_csv'
@@ -32,8 +36,8 @@ class BulkProcessor
   end
 
   # Validate the CSV and enqueue if for processing in the background.
-  def start(file_class: S3File)
-    if file_class.new(key).exists?
+  def start(num_processes = 1)
+    if BulkProcessor.config.file_class.new(key).exists?
       errors << "Already processing #{key}, please wait for it to finish"
       return false
     end
@@ -47,7 +51,7 @@ class BulkProcessor
     )
 
     if csv.valid?
-      start_backend(file_class, encoded_contents)
+      start_backend(encoded_contents, num_processes)
     else
      errors.concat(csv.errors)
    end
@@ -58,11 +62,11 @@ class BulkProcessor
 
   attr_reader :key, :stream, :processor_class, :payload
 
-  def start_backend(file_class, contents)
-    file = file_class.new(key)
+  def start_backend(contents, num_processes)
+    file = BulkProcessor.config.file_class.new(key)
     file.write(contents)
-    BackEnd.start(processor_class: processor_class, payload: payload,
-                  file_class: file_class, key: key)
+    BackEnd.start(processor_class: processor_class, payload: payload, key: key,
+                  num_processes: num_processes)
   rescue Exception
     # Clean up the file, which is treated as a lock, if we bail out of here
     # unexpectedly.
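
In short, this file's changes shift the public entry point: the per-call `file_class:` keyword is gone (the file class now comes from `BulkProcessor.config.file_class`) and `#start` instead takes an optional process count. A hedged before/after sketch, with `MyFile` standing in for a custom file class:

```ruby
# 0.5.x (removed):
#   processor.start(file_class: MyFile)

# 0.6.0:
BulkProcessor.config.file_class = MyFile  # optional; defaults to BulkProcessor::S3File
processor.start                           # validate, upload, and process in one job
processor.start(5)                        # split into 5 chunks and process in parallel
```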
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulk-processor
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.6.0
 platform: ruby
 authors:
 - Tom Collier, Justin Richard
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-21 00:00:00.000000000 Z
+date: 2016-01-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activejob
@@ -152,8 +152,12 @@ files:
 - lib/bulk_processor/csv_processor/no_op_post_processor.rb
 - lib/bulk_processor/csv_processor/result.rb
 - lib/bulk_processor/csv_processor/row_processor.rb
-- lib/bulk_processor/job.rb
+- lib/bulk_processor/file_splitter.rb
+- lib/bulk_processor/job/process_csv.rb
+- lib/bulk_processor/job/split_csv.rb
 - lib/bulk_processor/payload_serializer.rb
+- lib/bulk_processor/row_chunker/balanced.rb
+- lib/bulk_processor/row_chunker/boundary.rb
 - lib/bulk_processor/s3_file.rb
 - lib/bulk_processor/stream_encoder.rb
 - lib/bulk_processor/tasks.rb
data/lib/bulk_processor/job.rb DELETED
@@ -1,20 +0,0 @@
-require 'active_job'
-
-class BulkProcessor
-  # ActiveJob to handle processing the CSV in the background
-  class Job < ActiveJob::Base
-    queue_as 'bulk_processor'
-
-    def perform(processor_class, payload, file_class, key)
-      file = file_class.constantize.new(key)
-      payload = PayloadSerializer.deserialize(payload)
-      file.open do |f|
-        csv = CSV.parse(f.read, headers: true)
-        processor = processor_class.constantize.new(csv, payload: payload)
-        processor.start
-      end
-    ensure
-      file.try(:delete)
-    end
-  end
-end