RubyGems - bulk-processor - Versions diffs - 0.1.0 → 0.2.0 - Mend

bulk-processor 0.1.0 → 0.2.0

Files changed (10) hide show

checksums.yaml +4 -4
data/README.md +81 -31
data/lib/bulk_processor/config.rb +1 -0
data/lib/bulk_processor/csv_processor.rb +98 -0
data/lib/bulk_processor/job.rb +6 -19
data/lib/bulk_processor/no_op_handler.rb +12 -0
data/lib/bulk_processor/validated_csv.rb +30 -11
data/lib/bulk_processor/version.rb +1 -1
data/lib/bulk_processor.rb +13 -12
metadata +6 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 81b6cbba22963959c1e355f71f78790cbc0b6592
-  data.tar.gz: 8859344abc19572527f69ba9259c5b1289bde273
+  metadata.gz: 56580f10538cc75b8fbfb8006248905cb0cfeb71
+  data.tar.gz: 3ce3bc03cf878836f5a768a3be0fad169bfabdc6
 SHA512:
-  metadata.gz: 584baf85c25c5a741eb8392ab068a3e1798a71398dd3f53e0da6e24aca144648fb77c8442865a4f5c134193fbf255f4b5cf55f630bc1915241e1f2d229189ce7
-  data.tar.gz: 7a5cc788e2b8bc7caf6d4960aa28fd2233661b3e8f5d1b2d90ed4a474ba8885c35ee63fcf077b0b25a1a0c2cb9408ca1c22846105592a7de9a87b9330e04c93f
+  metadata.gz: 1c4fa2fdf92ec038c73c6eec886a682e9cb82f8fdb1ef0ade84a0eefa0eb1fffbd29cd510c3c555c6456b5ee7e600c2e52999031169dfbd773a73a593ee543ee
+  data.tar.gz: b32c0129b95fa2a44e7d2dca5455bca2a60137aac6d0f97b260b60e90d8c79d28acd4e91a12f9d9a6b14e0a1ac4735d505b31bea182499f14c9c19991fbfa930

data/README.md CHANGED Viewed

@@ -34,11 +34,48 @@ The default is `:inline`, which skips queueing and processes synchronously. Sinc
 this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters](http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html)
 are supported, including `:resque`.
-You will also need to supply a class for item processing and a class/module for completion handling.
-The item processor instance must respond to the following messages:
+You will also need to supply a class for CSV processing. This class must respond to the
+`start` instance method, the `required_columns` and `optional_columns` class methods,
+and have the following signature for initialize:
+```ruby
+class PetCSVProcessor
+  # @return [Array<String>] column headers that must be present
+  def self.required_columns
+    ['species', 'name', 'age']
+  end
+  # @return [Array<String>] column headers that may be present. If a column
+  #   header is present that is not in 'required_columns' or 'optional_columns',
+  #   the file will be considered invalid and no rows will be processed.
+  def self.optional_columns
+    ['favorite_toy', 'talents']
+  end
+  def initialize(records, payload:)
+    # Assign instance variables and do any other setup
+  end
+  def start
+    # Process the records
+  end
+end
 ```
-class PetItemProcessor
+To account for a common use case, a base `BulkProcessor::CSVProcessor` class is provided,
+though it must be explicitly required. This base class can be subclassed to build a CSV processor.
+This base class implements the initializer and `#start` methods and returns an empty set for `.optional_columns`.
+The `#start` method iterates over each record, processes it using a `RowProcessor`,
+and accumulates the results, which are then passed off to a `Handler`. An example
+implementation could look like:
+```ruby
+require 'bulk_processor/csv_processor'
+class PetCSVProcessor < BulkProcessor::CSVProcessor
+  # Note: this must be overridden in a subclass
+  #
   # @return [Array<String>] column headers that must be present
   def self.required_columns
     ['species', 'name', 'age']
@@ -51,18 +88,27 @@ class PetItemProcessor
     ['favorite_toy', 'talents']
   end
-  # Instantiate the processor with a single row from the CSV represented by
-  # a Hash<String, String>
-  def initialize(record_hash, payload)
-    @record_hash = record_hash
-    @payload = payload
-    @messages = []
-    @success = false
+  # Note: this must be overridden in a subclass
+  #
+  # @return [RowProcessor] a class that implements the RowProcessor role
+  def self.row_processor_class
+    PetRowProcessor
+  end
+  # @return [Handler] a class that implements the Handler role
+  def self.handler_class
+    PetHandler
+  end
+end
+class PetRowProcessor
+  def initialize(record, payload:)
+    # Assign instance variables and do any other setup
   end
   # Process the row, e.g. create a new record in the DB, send an email, etc
   def process!
-    pet = Pet.new(record_hash)
+    pet = Pet.new(record)
     if pet.save
       @success = true
     else
@@ -72,25 +118,17 @@ class PetItemProcessor
   # @return [true|false] true iff the item was processed completely
   def success?
-    @success
+    @success == true
   end
   # @return [Array<String>] list of messages for this item to pass back to the
   #   completion handler.
   def messages
-    @messages
+    @messages || []
   end
 end
-```
-A completion handler must respond to the following messages
-```ruby
-module NotificationHandler
-  # Handle full or partial processing of records. Unless there was a fatal
-  # error, all row indexes will be present either successes or errors, but not
-  # both.
-  #
+class PetHandler
   # @param payload [Hash] the payload passed into 'BulkProcessor.process', can
   #   be used to pass metadata around, e.g. the email address to send a
   #   completion report to
@@ -100,25 +138,37 @@ module NotificationHandler
   #   (may be empty), e.g. { 0 => [], 1 => ['pet ID = 22 created'] }
   # @param errors [Hash<Fixnum, Array<String>>] similar structure to successes,
   #   but rows that were not completed successfully.
+  def initialize(payload:, successes:, errors:)
+    # Assign instance variables and do any other setup
+  end
+  # Notify the owner that their pets were processed
+  def complete!
+    OwnerMailer.completed(successes, errors)
+  end
+  # Notify the owner that processing failed
+  #
   # @param fatal_error [StandardError] if nil, then all rows were processed,
   #   else the error that was raised is passed in here
-  def self.complete(payload, successes, errors, fatal_error = nil)
-    if fatal_error
-      PetProcessorMailer.fail(payload['recipient'], successes, errors, fatal_error)
-    else
-      PetProcessorMailer.complete(payload['recipient'], successes, errors)
-    end
+  def fail!(fatal_error)
+    OwnerMailer.failed(fatal_error)
   end
 end
 ```
-Requesting file processing
+Putting it all together
 ```ruby
-processor = BulkProcessor.new(file_stream, PetItemProcessor, NotificationHandler, {recipient: current_user.email})
-if processor.process
+processor = BulkProcessor.new(
+              stream: file_stream,
+              processor_class: PetCSVProcessor,
+              payload: {recipient: current_user.email}
+            )
+if processor.start
   # The job has been enqueued, go get a coffee and wait
 else
+  # Something went wrong, alert the file uploader
   handle_invalid_file(processor.errors)
 end
 ```

data/lib/bulk_processor/config.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 class BulkProcessor
+  # Store configuration data set by clients
   class Config
     attr_reader :queue_adapter

data/lib/bulk_processor/csv_processor.rb ADDED Viewed

@@ -0,0 +1,98 @@
+require_relative 'no_op_handler'
+class BulkProcessor
+  # An abstract implementation of the CSVProcessor role. Provides
+  #
+  #   * A default implementation of `.optional_columns`, returning []
+  #   * An initializer that assigns the arguments as instance attributes
+  #   * An implementation of #start to cover a common use case
+  #
+  # The common use case covered by this class' implementation of `#start` is
+  #
+  #   1. Iteratively process each record
+  #   2. Accumulate the results (did the processing succeed? what were the error
+  #      messages?)
+  #   3. Send the results to an instance of the Handler role.
+  #
+  # This class adds 2 class methods (one required, one optional) that can be
+  # overridden in any subclass
+  #
+  #   * row_processor_class - (required) Returns the class that implements the
+  #     RowProcessor role to process rows of the CSV
+  #   * handler_class - (optional) Returns the class that implements the Handler
+  #     role,  which handles results from the completion (or failure) of
+  #     processing the entire CSV.
+  #
+  # The `required_columns` method must still be implemented in a subclass
+  #
+  class CSVProcessor
+    # @return [RowProcessor] a class that implements the RowProcessor interface
+    def self.row_processor_class
+      raise NotImplementedError,
+            "#{self.class.name} must implement #{__method__}"
+    end
+    # @return [Handler] a class that implements the Handler role
+    def self.handler_class
+      NoOpHandler
+    end
+    # @return [Array<String>] column headers that must be present
+    def self.required_columns
+      raise NotImplementedError,
+            "#{self.class.name} must implement #{__method__}"
+    end
+    # @return [Array<String>] column headers that may be present. If a column
+    #   header is present that is not in 'required_columns' or
+    #   'optional_columns', the file will be considered invalid and no rows will
+    #   be processed.
+    def self.optional_columns
+      []
+    end
+    def initialize(records, payload: {})
+      @records = records
+      @payload = payload
+      @successes = {}
+      @errors = {}
+    end
+    # Iteratively process each record, accumulate the results, and pass those
+    # off to the handler. If an unrescued error is raised for any record,
+    # processing will halt for all remaining records and `#fail!` will be
+    # invoked on the handler.
+    def start
+      records.each_with_index do |record, index|
+        processor = row_processor(record)
+        processor.process!
+        if processor.success?
+          successes[index] = processor.messages
+        else
+          errors[index] = processor.messages
+        end
+      end
+      handler.complete!
+    rescue Exception => exception
+      handler.fail!(exception)
+      # Swallow any StandardError, since we are already reporting it to the
+      # user. However, we must re-raise Exceptions, such as SIGTERMs since they
+      # need to be handled at a level above this gem.
+      raise unless exception.is_a?(StandardError)
+    end
+    private
+    attr_reader :records, :payload, :successes, :errors
+    def handler
+      self.class.handler_class.new(payload: payload, successes: successes,
+                                   errors: errors)
+    end
+    def row_processor(record)
+      self.class.row_processor_class.new(record, payload: payload)
+    end
+  end
+end

data/lib/bulk_processor/job.rb CHANGED Viewed

@@ -1,26 +1,13 @@
+require 'active_job'
 class BulkProcessor
+  # ActiveJob to handle processing the CSV in the background
   class Job < ActiveJob::Base
     queue_as 'bulk_processor'
-    def perform(records, item_proccessor, handler, payload)
-      item_proccessor_class = item_proccessor.constantize
-      handler_class = handler.constantize
-      successes = {}
-      failures = {}
-      records.each_with_index do |record, index|
-        processor = item_proccessor_class.new(record, payload)
-        processor.process!
-        if processor.success?
-          successes[index] = processor.messages
-        else
-          failures[index] = processor.messages
-        end
-      end
-      handler_class.complete(payload, successes, failures, nil)
-    rescue Exception => exception
-      handler_class.complete(payload, successes, failures, exception)
-      raise unless exception.is_a?(StandardError)
+    def perform(records, processor_class, payload)
+      processor = processor_class.constantize.new(records, payload: payload)
+      processor.start
     end
   end
 end

data/lib/bulk_processor/no_op_handler.rb ADDED Viewed

@@ -0,0 +1,12 @@
+class BulkProcessor
+  class NoOpHandler
+    def initialize(payload:, successes:, errors:)
+    end
+    def complete!
+    end
+    def fail!(fatal_error)
+    end
+  end
+end

data/lib/bulk_processor/validated_csv.rb CHANGED Viewed

@@ -1,11 +1,20 @@
+require 'csv'
 class BulkProcessor
+  # A Wrapper on CSV that validates column headers.
   class ValidatedCSV
     PARSING_OPTIONS  = { headers: true, header_converters: :downcase }
     private_constant :PARSING_OPTIONS
+    # This cryptic message usually just means that the header row contains a
+    # blank field; in Ruby ~> 2.1.5 it is the error message for a NoMethodError
+    # raised when parsing a CSV.
     BAD_HEADERS_ERROR_MSG = "undefined method `encode' for nil:NilClass"
     private_constant :BAD_HEADERS_ERROR_MSG
+    MISSING_COLUMN_MESSAGE = 'Missing or malformed column header, is one of them blank?'
+    private_constant :MISSING_COLUMN_MESSAGE
     attr_reader :errors, :records
     def initialize(stream, required_headers, optional_headers)
@@ -15,7 +24,12 @@ class BulkProcessor
       @errors = []
     end
+    # @return [true|false] true iff:
+    #   * All required columns are present
+    #   * No column exists that isn't a required or optional column
+    #   * No column heading is blank
     def valid?
+      return false if csv.nil?
       @errors = []
       if missing_headers.any?
@@ -26,20 +40,17 @@ class BulkProcessor
         errors << "Unrecognized column(s) found: #{extra_headers.join(', ')}"
       end
-      unless csv.headers.all?
-        errors << 'Missing or malformed column header, is one of them blank?'
-      end
-    rescue NoMethodError => error
-      if error.message == BAD_HEADERS_ERROR_MSG
-        errors << 'Missing or malformed column header, is one of them blank?'
-      else
-        raise error
+      if csv.headers.any? { |header| header.nil? || header.strip == '' }
+        errors << MISSING_COLUMN_MESSAGE
       end
-    ensure
-      return errors.empty?
+      errors.empty?
     end
+    # @return [Array<Hash<String, String>>] a serializable representation of the
+    #   CSV that will be passed to the background job.
     def row_hashes
+      return [] unless valid?
       csv.map(&:to_hash)
     end
@@ -48,7 +59,15 @@ class BulkProcessor
     attr_reader :stream, :required_headers, :optional_headers
     def csv
-      @csv ||= CSV.parse(stream, PARSING_OPTIONS)
+      return @csv if instance_variable_defined?('@csv')
+      @csv = CSV.parse(stream, PARSING_OPTIONS)
+    rescue NoMethodError => error
+      if error.message == BAD_HEADERS_ERROR_MSG
+        errors << MISSING_COLUMN_MESSAGE
+        @csv = nil
+      else
+        raise error
+      end
     end
     def missing_headers

data/lib/bulk_processor/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class BulkProcessor
-  VERSION = '0.1.0'
+  VERSION = '0.2.0'
 end

data/lib/bulk_processor.rb CHANGED Viewed

@@ -1,12 +1,10 @@
-require 'active_job'
-require 'csv'
 require 'bulk_processor/config'
 require 'bulk_processor/job'
 require 'bulk_processor/stream_encoder'
 require 'bulk_processor/validated_csv'
 require 'bulk_processor/version'
+# Process large CSV files in the background.
 class BulkProcessor
   class << self
     def config
@@ -16,31 +14,34 @@ class BulkProcessor
     def configure
       yield config
     end
   end
-  attr_reader :stream, :item_processor, :handler, :payload, :errors
+  attr_reader :errors
-  def initialize(stream, item_processor, handler, payload = {})
+  def initialize(stream:, processor_class:, payload: {})
     @stream = stream
-    @item_processor = item_processor
-    @handler = handler
+    @processor_class = processor_class
     @payload = payload
     @errors = []
   end
-  def process
+  # Validate the CSV and enqueue it for processing in the background.
+  def start
     csv = ValidatedCSV.new(
       StreamEncoder.new(stream).encoded,
-      item_processor.required_columns,
-      item_processor.optional_columns
+      processor_class.required_columns,
+      processor_class.optional_columns
     )
     if csv.valid?
-      Job.perform_later(csv.row_hashes, item_processor.to_s, handler.to_s, payload)
+      Job.perform_later(csv.row_hashes, processor_class.name, payload)
     else
       @errors = csv.errors
     end
     @errors.empty?
   end
+  private
+  attr_reader :stream, :processor_class, :payload
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulk-processor
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Tom Collier, Justin Richard
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-14 00:00:00.000000000 Z
+date: 2016-01-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activejob
@@ -102,7 +102,9 @@ files:
 - bulk-processor.gemspec
 - lib/bulk_processor.rb
 - lib/bulk_processor/config.rb
+- lib/bulk_processor/csv_processor.rb
 - lib/bulk_processor/job.rb
+- lib/bulk_processor/no_op_handler.rb
 - lib/bulk_processor/stream_encoder.rb
 - lib/bulk_processor/validated_csv.rb
 - lib/bulk_processor/version.rb
@@ -126,8 +128,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.3
+rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
 summary: Background process CSV data
 test_files: []
+has_rdoc: