bulk-processor 0.3.0 → 0.4.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +19 -4
- data/bulk-processor.gemspec +1 -0
- data/lib/bulk_processor.rb +27 -7
- data/lib/bulk_processor/config.rb +5 -0
- data/lib/bulk_processor/csv_processor/row_processor.rb +1 -1
- data/lib/bulk_processor/job.rb +9 -3
- data/lib/bulk_processor/s3_file.rb +83 -0
- data/lib/bulk_processor/validated_csv.rb +1 -8
- data/lib/bulk_processor/version.rb +1 -1
- metadata +16 -1
checksums.yaml
CHANGED
````diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 163fb73da29263963c492e14b85b35290e2f8d64
+  data.tar.gz: 352d9b5659f885f238a922361d6f341496ff36f1
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f908072ce3303676c8c8440dff4f8d47646571be81ea71bc668e523f6bce7e94d5e4553c3f2a31a1d9eeba797a630b47a6c010b9bf97d495f575cdf844a1f4df
+  data.tar.gz: af56ed66e8a44edcabd9962f16ba0cfc89e2d784285506ecc34ec792f636592009e302a970403eb2dca45f21e77bc944718d18108cfd0018905d1f81cc243aad
````
data/.gitignore
CHANGED
data/README.md
CHANGED
````diff
@@ -24,17 +24,29 @@ Or install it yourself as:
 
 ## Usage
 
+### Configuration
+
 Bulk processor requires the following configuration
 
 ```ruby
 BulkProcessor.queue_adapter = <adapter>
+BulkProcessor.temp_directory = '/tmp'
+BulkProcessor.aws.access_key_id = 'my-aws-access-key'
+BulkProcessor.aws.secret_access_key = 'my-aws-secret'
+BulkProcessor.aws.bucket = 'my-s3-bucket'
 ```
 
-The default is `:inline`, which skips queueing and processes synchronously. Since
-this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters](http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html),
+The default queue_adapter is `:inline`, which skips queueing and processes synchronously. Since
+this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters](http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html) are supported,
 including `:resque`.
 
-
+The CSV file passed to BulkProcessor will be persisted on AWS S3 so that the job
+can access it. This requires configuring AWS credentials, the S3 bucket in which
+to store the file, and a local temp directory to hold the file locally.
+
+### Setting up the processor and handler
+
+You will need to supply a class for CSV processing. This class must respond to the
 `start` instance method, the `required_columns` and `optional_columns` class methods,
 and have the following signature for initialize:
 
@@ -62,6 +74,8 @@ class PetCSVProcessor
 end
 ```
 
+#### Swiss Army Knife base class
+
 To account for a common use case, a base `BulkProcessor::CSVProcessor` class is provided,
 though it must be explicitly required. This base class can be subclassed to build a CSV processor.
 This base class implements the initializer and `#start` methods and returns an empty set for `.optional_columns`.
@@ -177,10 +191,11 @@ class PetHandler
 end
 ```
 
-
+### Kicking off the process
 
 ```ruby
 processor = BulkProcessor.new(
+  key: file_name,
   stream: file_stream,
   processor_class: PetCSVProcessor,
   payload: { recipient: current_user.email }
````
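Pulling the README changes together: a 0.4.0 caller configures AWS once and then supplies the new `key:` argument when kicking off a run. A minimal end-to-end sketch, borrowing the README's `PetCSVProcessor` example; credentials and file names are placeholders:

```ruby
require 'bulk_processor'

# One-time configuration (see the new Configuration section above).
BulkProcessor.queue_adapter = :resque
BulkProcessor.temp_directory = '/tmp'
BulkProcessor.aws.access_key_id = 'my-aws-access-key'
BulkProcessor.aws.secret_access_key = 'my-aws-secret'
BulkProcessor.aws.bucket = 'my-s3-bucket'

# Kicking off the process. `key` names the CSV on S3 and doubles as a
# lock: a second start with the same key fails until the job finishes.
processor = BulkProcessor.new(
  key: 'pets.csv',
  stream: File.open('pets.csv'),
  processor_class: PetCSVProcessor,
  payload: { recipient: 'owner@example.com' }
)

puts processor.errors.join(', ') unless processor.start
```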
data/bulk-processor.gemspec
CHANGED
````diff
@@ -22,6 +22,7 @@ success or failure report
   spec.required_ruby_version = '>= 2.1'
 
   spec.add_runtime_dependency 'activejob', '~> 4'
+  spec.add_runtime_dependency 'aws-sdk', '~> 2.1'
 
   spec.add_development_dependency 'bundler'
   spec.add_development_dependency 'pry-byebug', '~> 3'
````
data/lib/bulk_processor.rb
CHANGED
````diff
@@ -1,5 +1,6 @@
 require 'bulk_processor/config'
 require 'bulk_processor/job'
+require 'bulk_processor/s3_file'
 require 'bulk_processor/stream_encoder'
 require 'bulk_processor/validated_csv'
 require 'bulk_processor/version'
@@ -18,7 +19,8 @@ class BulkProcessor
 
   attr_reader :errors
 
-  def initialize(stream:, processor_class:, payload: {})
+  def initialize(key:, stream:, processor_class:, payload: {})
+    @key = key
     @stream = stream
     @processor_class = processor_class
     @payload = payload
@@ -26,22 +28,40 @@ class BulkProcessor
   end
 
   # Validate the CSV and enqueue if for processing in the background.
-  def start
+  def start(file_class: S3File)
+    if file_class.new(key).exists?
+      errors << "Already processing #{key}, please wait for it to finish"
+      return false
+    end
+
+    encoded_contents = StreamEncoder.new(stream).encoded
+
     csv = ValidatedCSV.new(
-
+      encoded_contents,
       processor_class.required_columns,
       processor_class.optional_columns
     )
 
     if csv.valid?
-
+      perform_later(file_class, encoded_contents)
     else
-
+      errors.concat(csv.errors)
     end
-
+    errors.empty?
   end
 
   private
 
-  attr_reader :stream, :processor_class, :payload
+  attr_reader :key, :stream, :processor_class, :payload
+
+  def perform_later(file_class, contents)
+    file = file_class.new(key)
+    file.write(contents)
+    Job.perform_later(processor_class.name, payload, file_class.name, key)
+  rescue Exception
+    # Clean up the file, which is treated as a lock, if we bail out of here
+    # unexpectedly.
+    file.try(:delete)
+    raise
+  end
 end
````
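The reworked `start` treats the S3 object as a lock: the existence check runs before any CSV validation, so a key that is already in flight fails fast. A sketch of that behavior with a hypothetical in-memory stand-in for `S3File` (only the methods the check needs; `file_stream` and `PetCSVProcessor` are the README's examples):

```ruby
# FakeFile is a hypothetical test double, not part of the gem.
class FakeFile
  @@store = {}

  def initialize(key)
    @key = key
  end

  def exists?
    @@store.key?(@key)
  end

  def write(contents)
    @@store[@key] = contents
  end
end

# Simulate an in-flight job by pre-writing the key, then watch #start refuse it.
FakeFile.new('pets.csv').write("name,species\nFido,dog\n")

processor = BulkProcessor.new(
  key: 'pets.csv',
  stream: file_stream,
  processor_class: PetCSVProcessor
)
processor.start(file_class: FakeFile)  # => false, nothing is enqueued
processor.errors
# => ["Already processing pets.csv, please wait for it to finish"]
```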
data/lib/bulk_processor/config.rb
CHANGED
````diff
@@ -2,9 +2,14 @@ class BulkProcessor
   # Store configuration data set by clients
   class Config
     attr_reader :queue_adapter
+    attr_accessor :temp_directory
 
     def queue_adapter=(adapter)
       ActiveJob::Base.queue_adapter = @queue_adapter = adapter
     end
+
+    def aws
+      @aws ||= Struct.new(:access_key_id, :secret_access_key, :bucket).new
+    end
   end
 end
````
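The new `aws` reader lazily memoizes a bare three-field `Struct` instance, which is what makes the `BulkProcessor.aws.access_key_id = ...` style from the README work. A quick sketch of the resulting behavior:

```ruby
config = BulkProcessor::Config.new

config.temp_directory = '/tmp'      # new plain accessor in 0.4.0
config.aws.bucket                   # => nil (struct built lazily, fields unset)
config.aws.bucket = 'my-s3-bucket'
config.aws.bucket                   # => "my-s3-bucket"
```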
data/lib/bulk_processor/job.rb
CHANGED
````diff
@@ -5,9 +5,15 @@ class BulkProcessor
   class Job < ActiveJob::Base
     queue_as 'bulk_processor'
 
-    def perform(
-
-
+    def perform(processor_class, payload, file_class, key)
+      file = file_class.constantize.new(key)
+      file.open do |f|
+        csv = CSV.parse(f.read, headers: true)
+        processor = processor_class.constantize.new(csv, payload: payload)
+        processor.start
+      end
+    ensure
+      file.try(:delete)
     end
   end
 end
````
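Because ActiveJob arguments must be simple serializable values, `Job#perform` now receives the processor and file classes as name strings (constantized on the worker) plus the S3 key, instead of the pre-parsed row hashes that 0.3.0 serialized into the job. The enqueue performed by `BulkProcessor#perform_later` therefore looks roughly like this (argument values are illustrative):

```ruby
BulkProcessor::Job.perform_later(
  'PetCSVProcessor',        # processor_class.name, constantized in #perform
  { recipient: 'owner@example.com' },
  'BulkProcessor::S3File',  # file_class.name
  'pets.csv'                # key; the file is deleted in the ensure block
)
```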
data/lib/bulk_processor/s3_file.rb
ADDED
````diff
@@ -0,0 +1,83 @@
+require 'aws-sdk'
+
+class BulkProcessor
+  # Read and write files in a pre-configured S3 bucket.
+  class S3File
+    NAMESPACE = 'bulk_processor'.freeze
+    private_constant :NAMESPACE
+
+    # @param key [String] the unique identifier (within the bucket) used to
+    #   access the file
+    def initialize(key)
+      @key = "#{NAMESPACE}/#{key}"
+    end
+
+    def exists?
+      client.get_object(bucket: bucket, key: key)
+      true
+    rescue Aws::S3::Errors::NoSuchKey
+      false
+    end
+
+    # Yield the file stored in the bucket identified by the key. The file is
+    # only guaranteed to exist locally within the block, any attempts to access
+    # the file outside of the block will fail.
+    #
+    # @yields [File] a local copy of the remote file
+    def open
+      with_temp_file do |local_file|
+        client.get_object({ bucket: bucket, key: key }, target: local_file)
+        local_file.rewind
+        yield local_file
+      end
+    end
+
+    # Write a new file to the bucket on S3
+    #
+    # @param contents [String] the contents of the file to create
+    # @return [String] the URL of the new file
+    def write(contents)
+      remote_file = resource.bucket(bucket).object(key)
+      remote_file.put(body: contents)
+      remote_file.public_url
+    end
+
+    def delete
+      client.delete_object(bucket: bucket, key: key)
+    end
+
+    private
+
+    attr_reader :bucket, :key
+
+    def bucket
+      BulkProcessor.config.aws.bucket || raise('AWS bucket must be set in the config')
+    end
+
+    def access_key_id
+      BulkProcessor.config.aws.access_key_id || raise('AWS access_key_id must be set in the config')
+    end
+
+    def secret_access_key
+      BulkProcessor.config.aws.secret_access_key || raise('AWS secret_access_key must be set in the config')
+    end
+
+    def resource
+      Aws::S3::Resource.new(client: client)
+    end
+
+    def client
+      credentials = Aws::Credentials.new(access_key_id, secret_access_key)
+      Aws::S3::Client.new(credentials: credentials)
+    end
+
+    def with_temp_file
+      base_dir = Pathname.new(BulkProcessor.config.temp_directory)
+      file = Tempfile.new('aws_utils', base_dir)
+      yield file
+    ensure
+      file.close if file && !file.closed?
+      file.try(:unlink)
+    end
+  end
+end
````
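Note that `S3File` silently namespaces every key under `bulk_processor/` inside the configured bucket. A usage sketch of the new class (assumes the AWS settings from the README are configured; file contents are placeholders):

```ruby
file = BulkProcessor::S3File.new('pets.csv')  # stored at 'bulk_processor/pets.csv'

url = file.write("name,species\nFido,dog\n")  # uploads and returns the public URL
file.exists?                                  # => true

# A local copy is downloaded into config.temp_directory and is only
# valid inside the block; the Tempfile is unlinked afterwards.
file.open { |f| puts f.read }

file.delete
file.exists?                                  # => false
```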
data/lib/bulk_processor/validated_csv.rb
CHANGED
````diff
@@ -3,7 +3,7 @@ require 'csv'
 class BulkProcessor
   # A Wrapper on CSV that validates column headers.
   class ValidatedCSV
-    PARSING_OPTIONS = { headers: true
+    PARSING_OPTIONS = { headers: true }.freeze
     private_constant :PARSING_OPTIONS
 
     # This cryptic message usually just means that the header row contains a
@@ -48,13 +48,6 @@ class BulkProcessor
       errors.empty?
     end
 
-    # @return [Array<Hash<String, String>>] a serializable representation of the
-    #   CSV that will be passed to the background job.
-    def row_hashes
-      return [] unless valid?
-      csv.map(&:to_hash)
-    end
-
     private
 
     attr_reader :stream, :required_headers, :optional_headers
````
metadata
CHANGED
````diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bulk-processor
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.4.0
 platform: ruby
 authors:
 - Tom Collier, Justin Richard
@@ -24,6 +24,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '4'
+- !ruby/object:Gem::Dependency
+  name: aws-sdk
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.1'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -108,6 +122,7 @@ files:
 - lib/bulk_processor/csv_processor/result.rb
 - lib/bulk_processor/csv_processor/row_processor.rb
 - lib/bulk_processor/job.rb
+- lib/bulk_processor/s3_file.rb
 - lib/bulk_processor/stream_encoder.rb
 - lib/bulk_processor/validated_csv.rb
 - lib/bulk_processor/version.rb
````