RubyGems - bulk-processor - Versions diffs - 0.1.0 → 0.2.0 - Mend

bulk-processor 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/README.md +81 -31
data/lib/bulk_processor/config.rb +1 -0
data/lib/bulk_processor/csv_processor.rb +98 -0
data/lib/bulk_processor/job.rb +6 -19
data/lib/bulk_processor/no_op_handler.rb +12 -0
data/lib/bulk_processor/validated_csv.rb +30 -11
data/lib/bulk_processor/version.rb +1 -1
data/lib/bulk_processor.rb +13 -12
metadata +6 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 81b6cbba22963959c1e355f71f78790cbc0b6592
-  data.tar.gz: 8859344abc19572527f69ba9259c5b1289bde273
+  metadata.gz: 56580f10538cc75b8fbfb8006248905cb0cfeb71
+  data.tar.gz: 3ce3bc03cf878836f5a768a3be0fad169bfabdc6
 SHA512:
-  metadata.gz: 584baf85c25c5a741eb8392ab068a3e1798a71398dd3f53e0da6e24aca144648fb77c8442865a4f5c134193fbf255f4b5cf55f630bc1915241e1f2d229189ce7
-  data.tar.gz: 7a5cc788e2b8bc7caf6d4960aa28fd2233661b3e8f5d1b2d90ed4a474ba8885c35ee63fcf077b0b25a1a0c2cb9408ca1c22846105592a7de9a87b9330e04c93f
+  metadata.gz: 1c4fa2fdf92ec038c73c6eec886a682e9cb82f8fdb1ef0ade84a0eefa0eb1fffbd29cd510c3c555c6456b5ee7e600c2e52999031169dfbd773a73a593ee543ee
+  data.tar.gz: b32c0129b95fa2a44e7d2dca5455bca2a60137aac6d0f97b260b60e90d8c79d28acd4e91a12f9d9a6b14e0a1ac4735d505b31bea182499f14c9c19991fbfa930

data/README.md CHANGED Viewed

@@ -34,11 +34,48 @@ The default is `:inline`, which skips queueing and processes synchronously. Sinc
 this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters]( http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html ),
 including `:resque`.
-You will also need to supply a class for item processing and a class/module for completion handling.
-The item processor instance must respond to the following messages:
+You will also need to supply a class for CSV processing. This class must respond to the
+`start` instance method, the `required_columns` and `optional_columns` class methods,
+and have the following signature for initialize:
+```ruby
+class PetCSVProcessor
+  # @return [Array<String>] column headers that must be present
+  def self.required_columns
+    ['species', 'name', 'age']
+  end
+  # @return [Array<String>] column headers that may be present. If a column
+  #   header is present that is not in 'required_columns' or 'optional_columns',
+  #   the file will be considered invalid and no rows will be processed.
+  def self.optional_columns
+    ['favorite_toy', 'talents']
+  end
+  def initialize(records, payload:)
+    # Assign instance variables and do any other setup
+  end
+  def start
+    # Process the records
+  end
+end
 ```
-class PetItemProcessor
+To account for a common use case, a base `BulkProcessor::CSVProcessor` class is provided,
+though it must be explicitly required. This base class can be subclassed to build a CSV processor.
+This base class implements the initializer and `#start` methods and returns an empty set for `.optional_columns`.
+The `#start` method iterates over each record, processes it using a `RowProcessor`,
+and accumulates the results, which are then passed off to a `Handler`. An example
+implementation could look like:
+```ruby
+require 'bulk_processor/csv_processor'
+class PetCSVProcessor < BulkProcessor::CSVProcessor
+  # Note: this must be overridden in a subclass
+  #
   # @return [Array<String>] column headers that must be present
   def self.required_columns
     ['species', 'name', 'age']
@@ -51,18 +88,27 @@ class PetItemProcessor
     ['favorite_toy', 'talents']
   end
-  # Instantiate the processor with a single row from the CSV represented by
-  # a Hash<String, String>
-  def initialize(record_hash, payload)
-    @record_hash = record_hash
-    @payload = payload
-    @messages = []
-    @success = false
+  # Note: this must be overridden in a subclass
+  #
+  # @return [RowProcessor] a class that implements the RowProcessor role
+  def self.row_processor_class
+    PetRowProcessor
+  end
+  # @return [Handler] a class that implements the Handler role
+  def self.handler_class
+    PetHandler
+  end
+end
+class PetRowProcessor
+  def initialize(record, payload:)
+    # Assign instance variables and do any other setup
   end
   # Process the row, e.g. create a new record in the DB, send an email, etc
   def process!
-    pet = Pet.new(record_hash)
+    pet = Pet.new(record)
     if pet.save
       @success = true
     else
@@ -72,25 +118,17 @@ class PetItemProcessor
   # @return [true|false] true iff the item was processed completely
   def success?
-    @success
+    @success == true
   end
   # @return [Array<String>] list of messages for this item to pass back to the
   #   completion handler.
   def messages
-    @messages
+    @messages || []
   end
 end
-```
-A completion handler must respond to the following messages
-```ruby
-module NotificationHandler
-  # Handle full or partial processing of records. Unless there was a fatal
-  # error, all row indexes will be present either successes or errors, but not
-  # both.
-  #
+class PetHandler
   # @param payload [Hash] the payload passed into 'BulkProcessor.process', can
   #   be used to pass metadata around, e.g. the email address to send a
   #   completion report to
@@ -100,25 +138,37 @@ module NotificationHandler
   #   (may be empty), e.g. { 0 => [], 1 => ['pet ID = 22 created'] }
   # @param errors [Hash<Fixnum, Array<String>>] similar structure to successes,
   #   but rows that were not completed successfully.
+  def initialize(payload:, successes:, errors:)
+    # Assign instance variables and do any other setup
+  end
+  # Notify the owner that their pets were processed
+  def complete!
+    OwnerMailer.competed(successes, errors)
+  end
+  # Notify the owner that processing failed
+  #
   # @param fatal_error [StandardError] if nil, then all rows were processed,
   #   else the error that was raised is passed in here
-  def self.complete(payload, successes, errors, fatal_error = nil)
-    if fatal_error
-      PetProcessorMailer.fail(payload['recipient'], successes, errors, fatal_error)
-    else
-      PetProcessorMailer.complete(payload['recipient'], successes, errors)
-    end
+  def fail!(fatal_error)
+    OwnerMailer.failed(fatal_error)
   end
 end
 ```
-Requesting file processing
+Putting it all together
 ```ruby
-processor = BulkProcessor.new(file_stream, PetItemProcessor, NotificationHandler, {recipient: current_user.email})
-if processor.process
+processor = BulkProcessor.new(
+              stream: file_stream,
+              processor_class: PetCSVProcessor,
+              payload: {recipient: current_user.email}
+            )
+if processor.start
   # The job has been enqueued, go get a coffee and wait
 else
+  # Something went wrong, alert the file uploader
   handle_invalid_file(processor.errors)
 end
 ```

data/lib/bulk_processor/config.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 class BulkProcessor
+  # Store configuration data set by clients
   class Config
     attr_reader :queue_adapter

data/lib/bulk_processor/csv_processor.rb ADDED Viewed

@@ -0,0 +1,98 @@
+require_relative 'no_op_handler'
+class BulkProcessor
+  # An abstract implementation of the CSVProcessor role. Provides
+  #
+  #   * A default implementation of `.optional_columns`, returning []
+  #   * An initializer that assigns the arguments as instance attributes
+  #   * An implementation of #start to cover a common use case
+  #
+  # The common use case covered by this class' implementation of `#start` is
+  #
+  #   1. Iteratively process each record
+  #   2. Accumulate the results (did the processing succeed? what were the error
+  #      messages?)
+  #   3. Send the results to an instance of the Handler role.
+  #
+  # This class adds 2 class methods (one required, one optional) that can be
+  # overridden in any subclass
+  #
+  #   * row_processor_class - (required) Returns the class that implements the
+  #     RowProcessor role to process rows of the CSV
+  #   * handler_class - (optional) Returns the class that implements the Handler
+  #     role,  which handles results from the completion (or failure) of
+  #     processing the entire CSV.
+  #
+  # The `required_columns` method must still be implemented in a subclass
+  #
+  class CSVProcessor
+    # @return [RowProcessor] a class that implements the RowProcessor interface
+    def self.row_processor_class
+      raise NotImplementedError,
+            "#{self.class.name} must implement #{__method__}"
+    end
+    # @return [Handler] a class that implements the Handler role
+    def self.handler_class
+      NoOpHandler
+    end
+    # @return [Array<String>] column headers that must be present
+    def self.required_columns
+      raise NotImplementedError,
+            "#{self.class.name} must implement #{__method__}"
+    end
+    # @return [Array<String>] column headers that may be present. If a column
+    #   header is present that is not in 'required_columns' or
+    #   'optional_columns', the file will be considered invalid and no rows will
+    #   be processed.
+    def self.optional_columns
+      []
+    end
+    def initialize(records, payload: {})
+      @records = records
+      @payload = payload
+      @successes = {}
+      @errors = {}
+    end
+    # Iteratively process each record, accumulate the results, and pass those
+    # off to the handler. If an unrescued error is raised for any record,
+    # processing will halt for all remaining records and `#fail!` will be
+    # invoked on the handler.
+    def start
+      records.each_with_index do |record, index|
+        processor = row_processor(record)
+        processor.process!
+        if processor.success?
+          successes[index] = processor.messages
+        else
+          errors[index] = processor.messages
+        end
+      end
+      handler.complete!
+    rescue Exception => exception
+      handler.fail!(exception)
+      # Swallow any StandardError, since we are already reporting it to the
+      # user. However, we must re-raise Exceptions, such as SIGTERMs since they
+      # need to be handled at a level above this gem.
+      raise unless exception.is_a?(StandardError)
+    end
+    private
+    attr_reader :records, :payload, :successes, :errors
+    def handler
+      self.class.handler_class.new(payload: payload, successes: successes,
+                                   errors: errors)
+    end
+    def row_processor(record)
+      self.class.row_processor_class.new(record, payload: payload)
+    end
+  end
+end

data/lib/bulk_processor/job.rb CHANGED Viewed

@@ -1,26 +1,13 @@
+require 'active_job'
 class BulkProcessor
+  # ActiveJob to handle processing the CSV in the background
   class Job < ActiveJob::Base
     queue_as 'bulk_processor'
-    def perform(records, item_proccessor, handler, payload)
-      item_proccessor_class = item_proccessor.constantize
-      handler_class = handler.constantize
-      successes = {}
-      failures = {}
-      records.each_with_index do |record, index|
-        processor = item_proccessor_class.new(record, payload)
-        processor.process!
-        if processor.success?
-          successes[index] = processor.messages
-        else
-          failures[index] = processor.messages
-        end
-      end
-      handler_class.complete(payload, successes, failures, nil)
-    rescue Exception => exception
-      handler_class.complete(payload, successes, failures, exception)
-      raise unless exception.is_a?(StandardError)
+    def perform(records, processor_class, payload)
+      processor = processor_class.constantize.new(records, payload: payload)
+      processor.start
     end
   end
 end

data/lib/bulk_processor/no_op_handler.rb ADDED Viewed

@@ -0,0 +1,12 @@
+class BulkProcessor
+  class NoOpHandler
+    def initialize(payload:, successes:, errors:)
+    end
+    def complete!
+    end
+    def fail!(fatal_error)
+    end
+  end
+end

data/lib/bulk_processor/validated_csv.rb CHANGED Viewed

@@ -1,11 +1,20 @@
+require 'csv'
 class BulkProcessor
+  # A Wrapper on CSV that validates column headers.
   class ValidatedCSV
     PARSING_OPTIONS  = { headers: true, header_converters: :downcase }
     private_constant :PARSING_OPTIONS
+    # This cryptic message usually just means that the header row contains a
+    # blank field; in ruby ~> 2.1.5 it is the error message for a NoMethodError
+    # raised when parsing a CSV.
     BAD_HEADERS_ERROR_MSG = "undefined method `encode' for nil:NilClass"
     private_constant :BAD_HEADERS_ERROR_MSG
+    MISSING_COLUMN_MESSAGE = 'Missing or malformed column header, is one of them blank?'
+    private_constant :MISSING_COLUMN_MESSAGE
     attr_reader :errors, :records
     def initialize(stream, required_headers, optional_headers)
@@ -15,7 +24,12 @@ class BulkProcessor
       @errors = []
     end
+    # @return [true|false] true iff:
+    #   * All required columns are present
+    #   * No column exists that isn't a required or optional column
+    #   * No column heading is blank
     def valid?
+      return false if csv.nil?
       @errors = []
       if missing_headers.any?
@@ -26,20 +40,17 @@ class BulkProcessor
         errors << "Unrecognized column(s) found: #{extra_headers.join(', ')}"
       end
-      unless csv.headers.all?
-        errors << 'Missing or malformed column header, is one of them blank?'
-      end
-    rescue NoMethodError => error
-      if error.message == BAD_HEADERS_ERROR_MSG
-        errors << 'Missing or malformed column header, is one of them blank?'
-      else
-        raise error
+      if csv.headers.any? { |header| header.nil? || header.strip == '' }
+        errors << MISSING_COLUMN_MESSAGE
       end
-    ensure
-      return errors.empty?
+      errors.empty?
     end
+    # @return [Array<Hash<String, String>>] a serializable representation of the
+    #   CSV that will be passed to the background job.
     def row_hashes
+      return [] unless valid?
       csv.map(&:to_hash)
     end
@@ -48,7 +59,15 @@ class BulkProcessor
     attr_reader :stream, :required_headers, :optional_headers
     def csv
-      @csv ||= CSV.parse(stream, PARSING_OPTIONS)
+      return @csv if instance_variable_defined?('@csv')
+      @csv = CSV.parse(stream, PARSING_OPTIONS)
+    rescue NoMethodError => error
+      if error.message == BAD_HEADERS_ERROR_MSG
+        errors << MISSING_COLUMN_MESSAGE
+        @csv = nil
+      else
+        raise error
+      end
     end
     def missing_headers

data/lib/bulk_processor/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class BulkProcessor
-  VERSION = '0.1.0'
+  VERSION = '0.2.0'
 end

data/lib/bulk_processor.rb CHANGED Viewed

@@ -1,12 +1,10 @@
-require 'active_job'
-require 'csv'
 require 'bulk_processor/config'
 require 'bulk_processor/job'
 require 'bulk_processor/stream_encoder'
 require 'bulk_processor/validated_csv'
 require 'bulk_processor/version'
+# Process large CSV files in the background.
 class BulkProcessor
   class << self
     def config
@@ -16,31 +14,34 @@ class BulkProcessor
     def configure
       yield config
     end
   end
-  attr_reader :stream, :item_processor, :handler, :payload, :errors
+  attr_reader :errors
-  def initialize(stream, item_processor, handler, payload = {})
+  def initialize(stream:, processor_class:, payload: {})
     @stream = stream
-    @item_processor = item_processor
-    @handler = handler
+    @processor_class = processor_class
     @payload = payload
     @errors = []
   end
-  def process
+  # Validate the CSV and enqueue it for processing in the background.
+  def start
     csv = ValidatedCSV.new(
       StreamEncoder.new(stream).encoded,
-      item_processor.required_columns,
-      item_processor.optional_columns
+      processor_class.required_columns,
+      processor_class.optional_columns
     )
     if csv.valid?
-      Job.perform_later(csv.row_hashes, item_processor.to_s, handler.to_s, payload)
+      Job.perform_later(csv.row_hashes, processor_class.name, payload)
     else
       @errors = csv.errors
     end
     @errors.empty?
   end
+  private
+  attr_reader :stream, :processor_class, :payload
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulk-processor
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Tom Collier, Justin Richard
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-14 00:00:00.000000000 Z
+date: 2016-01-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activejob
@@ -102,7 +102,9 @@ files:
 - bulk-processor.gemspec
 - lib/bulk_processor.rb
 - lib/bulk_processor/config.rb
+- lib/bulk_processor/csv_processor.rb
 - lib/bulk_processor/job.rb
+- lib/bulk_processor/no_op_handler.rb
 - lib/bulk_processor/stream_encoder.rb
 - lib/bulk_processor/validated_csv.rb
 - lib/bulk_processor/version.rb
@@ -126,8 +128,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.3
+rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
 summary: Background process CSV data
 test_files: []
+has_rdoc: