bulk-processor 0.3.0 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: d7b6dc445bf46f3f35477510449e5f56aed3f854
-   data.tar.gz: ef53879ba52375923ba2b55460b5fb217665d68c
+   metadata.gz: 163fb73da29263963c492e14b85b35290e2f8d64
+   data.tar.gz: 352d9b5659f885f238a922361d6f341496ff36f1
  SHA512:
-   metadata.gz: a4b9727d06824b5cf68789a4f3fb00d5b039d079201c33f4f68bfdb9ee720a9517df2e78d12614dcf1f1ca2673e7580dfd914b8963d3b463747026ce4681fec6
-   data.tar.gz: 67c87c2517515fd5912d05f494ce2d3454409cdb9417466a112b48be00ab9a4d9cfb1b72fe57f4139403bbdc9980803f5e3140405efdae5c1c1ab54339f95e42
+   metadata.gz: f908072ce3303676c8c8440dff4f8d47646571be81ea71bc668e523f6bce7e94d5e4553c3f2a31a1d9eeba797a630b47a6c010b9bf97d495f575cdf844a1f4df
+   data.tar.gz: af56ed66e8a44edcabd9962f16ba0cfc89e2d784285506ecc34ec792f636592009e302a970403eb2dca45f21e77bc944718d18108cfd0018905d1f81cc243aad
data/.gitignore CHANGED
@@ -1,2 +1,3 @@
  /Gemfile.lock
  bulk-processor-*.gem
+ /tmp
data/README.md CHANGED
@@ -24,17 +24,29 @@ Or install it yourself as:
 
  ## Usage
 
+ ### Configuration
+
  Bulk processor requires the following configuration:
 
  ```ruby
  BulkProcessor.queue_adapter = <adapter>
+ BulkProcessor.temp_directory = '/tmp'
+ BulkProcessor.aws.access_key_id = 'my-aws-access-key'
+ BulkProcessor.aws.secret_access_key = 'my-aws-secret'
+ BulkProcessor.aws.bucket = 'my-s3-bucket'
  ```
 
- The default is `:inline`, which skips queueing and processes synchronously. Since
- this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters]( http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html ),
+ The default `queue_adapter` is `:inline`, which skips queueing and processes synchronously. Since
+ this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters]( http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html ) are supported,
  including `:resque`.
 
- You will also need to supply a class for CSV processing. This class must respond to the
+ The CSV file passed to BulkProcessor will be persisted on AWS S3 so that the job
+ can access it. This requires configuring AWS credentials, the S3 bucket in which
+ to store the file, and a temp directory for holding the file locally.
+
+ ### Setting up the processor and handler
+
+ You will need to supply a class for CSV processing. This class must respond to the
  `start` instance method, the `required_columns` and `optional_columns` class methods,
  and have the following signature for initialize:
 
@@ -62,6 +74,8 @@ class PetCSVProcessor
  end
  ```
 
+ #### Swiss Army Knife base class
+
  To account for a common use case, a base `BulkProcessor::CSVProcessor` class is provided,
  though it must be explicitly required. This base class can be subclassed to build a CSV processor.
  This base class implements the initializer and `#start` methods and returns an empty set for `.optional_columns`.
@@ -177,10 +191,11 @@ class PetHandler
  end
  ```
 
- Putting it all together
+ ### Kicking off the process
 
  ```ruby
  processor = BulkProcessor.new(
+   key: file_name,
    stream: file_stream,
    processor_class: PetCSVProcessor,
    payload: { recipient: current_user.email }
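
For orientation, a complete kickoff under the new 0.4.0 API might look like the sketch below; `PetCSVProcessor` comes from the README example, while `file_name`, `file_stream`, and `current_user` are assumed to be supplied by the caller (a Rails controller handling an upload, say).

```ruby
# Illustrative sketch only: build a processor with the new required key:
# argument, start it, and surface validation errors to the caller.
processor = BulkProcessor.new(
  key: file_name,                             # also used as the S3 key and lock
  stream: file_stream,
  processor_class: PetCSVProcessor,
  payload: { recipient: current_user.email }
)

if processor.start
  # The CSV headers were valid: the file was uploaded to S3 and a job enqueued.
  head :accepted
else
  # Header validation failed, or a file with this key is already being processed.
  render json: { errors: processor.errors }, status: :unprocessable_entity
end
```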
data/bulk-processor.gemspec CHANGED
@@ -22,6 +22,7 @@ success or failure report
    spec.required_ruby_version = '>= 2.1'
 
    spec.add_runtime_dependency 'activejob', '~> 4'
+   spec.add_runtime_dependency 'aws-sdk', '~> 2.1'
 
    spec.add_development_dependency 'bundler'
    spec.add_development_dependency 'pry-byebug', '~> 3'
data/lib/bulk_processor.rb CHANGED
@@ -1,5 +1,6 @@
  require 'bulk_processor/config'
  require 'bulk_processor/job'
+ require 'bulk_processor/s3_file'
  require 'bulk_processor/stream_encoder'
  require 'bulk_processor/validated_csv'
  require 'bulk_processor/version'
@@ -18,7 +19,8 @@ class BulkProcessor
 
    attr_reader :errors
 
-   def initialize(stream:, processor_class:, payload: {})
+   def initialize(key:, stream:, processor_class:, payload: {})
+     @key = key
      @stream = stream
      @processor_class = processor_class
      @payload = payload
@@ -26,22 +28,40 @@ class BulkProcessor
    end
 
    # Validate the CSV and enqueue it for processing in the background.
-   def start
+   def start(file_class: S3File)
+     if file_class.new(key).exists?
+       errors << "Already processing #{key}, please wait for it to finish"
+       return false
+     end
+
+     encoded_contents = StreamEncoder.new(stream).encoded
+
      csv = ValidatedCSV.new(
-       StreamEncoder.new(stream).encoded,
+       encoded_contents,
        processor_class.required_columns,
        processor_class.optional_columns
      )
 
      if csv.valid?
-       Job.perform_later(csv.row_hashes, processor_class.name, payload)
+       perform_later(file_class, encoded_contents)
      else
-       @errors = csv.errors
+       errors.concat(csv.errors)
      end
-     @errors.empty?
+     errors.empty?
    end
 
    private
 
-   attr_reader :stream, :processor_class, :payload
+   attr_reader :key, :stream, :processor_class, :payload
+
+   def perform_later(file_class, contents)
+     file = file_class.new(key)
+     file.write(contents)
+     Job.perform_later(processor_class.name, payload, file_class.name, key)
+   rescue Exception
+     # Clean up the file, which is treated as a lock, if we bail out of here
+     # unexpectedly.
+     file.try(:delete)
+     raise
+   end
  end
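
Because `#start` now takes the file class as a keyword argument (defaulting to `S3File`), a test suite can pass in a stand-in that satisfies the same small interface (`exists?`, `write`, `open`, `delete`) instead of talking to S3. A minimal sketch, assuming a hypothetical in-memory fake and an IO-like stream:

```ruby
require 'stringio'

# Hypothetical in-memory stand-in for BulkProcessor::S3File, for tests only.
class FakeFile
  STORE = {}

  def initialize(key)
    @key = key
  end

  def exists?
    STORE.key?(@key)
  end

  def write(contents)
    STORE[@key] = contents
  end

  def open
    yield StringIO.new(STORE.fetch(@key))
  end

  def delete
    STORE.delete(@key)
  end
end

processor = BulkProcessor.new(
  key: 'pets.csv',
  stream: StringIO.new("name,species\nFido,dog\n"),
  processor_class: PetCSVProcessor,
  payload: {}
)
processor.start(file_class: FakeFile)
```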
data/lib/bulk_processor/config.rb CHANGED
@@ -2,9 +2,14 @@ class BulkProcessor
    # Store configuration data set by clients
    class Config
      attr_reader :queue_adapter
+     attr_accessor :temp_directory
 
      def queue_adapter=(adapter)
        ActiveJob::Base.queue_adapter = @queue_adapter = adapter
      end
+
+     def aws
+       @aws ||= Struct.new(:access_key_id, :secret_access_key, :bucket).new
+     end
    end
  end
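
The new `aws` reader memoizes an anonymous `Struct`, so clients assign each credential field directly, as the README shows. A sketch of how an application might wire this up (the initializer file and environment variable names are assumptions):

```ruby
# e.g. config/initializers/bulk_processor.rb in a Rails app (illustrative)
BulkProcessor.queue_adapter  = :resque                  # also sets ActiveJob::Base.queue_adapter
BulkProcessor.temp_directory = Rails.root.join('tmp').to_s

BulkProcessor.aws.access_key_id     = ENV['AWS_ACCESS_KEY_ID']
BulkProcessor.aws.secret_access_key = ENV['AWS_SECRET_ACCESS_KEY']
BulkProcessor.aws.bucket            = ENV['BULK_PROCESSOR_S3_BUCKET']
```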
data/lib/bulk_processor/csv_processor/row_processor.rb CHANGED
@@ -54,7 +54,7 @@ class BulkProcessor
        # @return [Hash<String, String>] the set of primary keys and their values
        # for this row
        def primary_attrs
-         row.slice(*primary_keys)
+         row.to_hash.slice(*primary_keys)
        end
      end
    end
data/lib/bulk_processor/job.rb CHANGED
@@ -5,9 +5,15 @@ class BulkProcessor
    class Job < ActiveJob::Base
      queue_as 'bulk_processor'
 
-     def perform(records, processor_class, payload)
-       processor = processor_class.constantize.new(records, payload: payload)
-       processor.start
+     def perform(processor_class, payload, file_class, key)
+       file = file_class.constantize.new(key)
+       file.open do |f|
+         csv = CSV.parse(f.read, headers: true)
+         processor = processor_class.constantize.new(csv, payload: payload)
+         processor.start
+       end
+     ensure
+       file.try(:delete)
      end
    end
  end
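
Because the job now receives only serializable values (class names and the S3 key, rather than an array of parsed row hashes), the enqueued payload stays small and works with any ActiveJob adapter. Roughly, the call made by `BulkProcessor#perform_later` looks like this (argument values are illustrative):

```ruby
# Illustrative: the four serializable arguments handed to ActiveJob.
BulkProcessor::Job.perform_later(
  'PetCSVProcessor',               # processor_class.name, constantized in #perform
  { 'recipient' => 'user@example.com' },
  'BulkProcessor::S3File',         # file_class.name, constantized in #perform
  'pets.csv'                       # key of the CSV previously written to S3
)
```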
data/lib/bulk_processor/s3_file.rb ADDED
@@ -0,0 +1,83 @@
+ require 'aws-sdk'
+
+ class BulkProcessor
+   # Read and write files in a pre-configured S3 bucket.
+   class S3File
+     NAMESPACE = 'bulk_processor'.freeze
+     private_constant :NAMESPACE
+
+     # @param key [String] the unique identifier (within the bucket) used to
+     # access the file
+     def initialize(key)
+       @key = "#{NAMESPACE}/#{key}"
+     end
+
+     def exists?
+       client.get_object(bucket: bucket, key: key)
+       true
+     rescue Aws::S3::Errors::NoSuchKey
+       false
+     end
+
+     # Yield the file stored in the bucket identified by the key. The file is
+     # only guaranteed to exist locally within the block; any attempts to access
+     # the file outside of the block will fail.
+     #
+     # @yield [File] a local copy of the remote file
+     def open
+       with_temp_file do |local_file|
+         client.get_object({ bucket: bucket, key: key }, target: local_file)
+         local_file.rewind
+         yield local_file
+       end
+     end
+
+     # Write a new file to the bucket on S3.
+     #
+     # @param contents [String] the contents of the file to create
+     # @return [String] the URL of the new file
+     def write(contents)
+       remote_file = resource.bucket(bucket).object(key)
+       remote_file.put(body: contents)
+       remote_file.public_url
+     end
+
+     def delete
+       client.delete_object(bucket: bucket, key: key)
+     end
+
+     private
+
+     attr_reader :bucket, :key
+
+     def bucket
+       BulkProcessor.config.aws.bucket || raise('AWS bucket must be set in the config')
+     end
+
+     def access_key_id
+       BulkProcessor.config.aws.access_key_id || raise('AWS access_key_id must be set in the config')
+     end
+
+     def secret_access_key
+       BulkProcessor.config.aws.secret_access_key || raise('AWS secret_access_key must be set in the config')
+     end
+
+     def resource
+       Aws::S3::Resource.new(client: client)
+     end
+
+     def client
+       credentials = Aws::Credentials.new(access_key_id, secret_access_key)
+       Aws::S3::Client.new(credentials: credentials)
+     end
+
+     def with_temp_file
+       base_dir = Pathname.new(BulkProcessor.config.temp_directory)
+       file = Tempfile.new('aws_utils', base_dir)
+       yield file
+     ensure
+       file.close if file && !file.closed?
+       file.try(:unlink)
+     end
+   end
+ end
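
On its own, `S3File` is a small wrapper over the aws-sdk v2 client. A usage sketch, assuming the AWS credentials, bucket, and temp directory from the Configuration section have been set:

```ruby
file = BulkProcessor::S3File.new('pets.csv')  # stored as 'bulk_processor/pets.csv'

file.write("name,species\nFido,dog\n")        # uploads the contents, returns the public URL
file.exists?                                  # => true

file.open do |local|
  local.read                                  # the temp copy only exists inside this block
end

file.delete                                   # removes the S3 object (and the implicit lock)
```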
data/lib/bulk_processor/validated_csv.rb CHANGED
@@ -3,7 +3,7 @@ require 'csv'
  class BulkProcessor
    # A wrapper on CSV that validates column headers.
    class ValidatedCSV
-     PARSING_OPTIONS = { headers: true, header_converters: :downcase }.freeze
+     PARSING_OPTIONS = { headers: true }.freeze
      private_constant :PARSING_OPTIONS
 
      # This cryptic message usually just means that the header row contains a
@@ -48,13 +48,6 @@ class BulkProcessor
        errors.empty?
      end
 
-     # @return [Array<Hash<String, String>>] a serializable representation of the
-     # CSV that will be passed to the background job.
-     def row_hashes
-       return [] unless valid?
-       csv.map(&:to_hash)
-     end
-
      private
 
      attr_reader :stream, :required_headers, :optional_headers
data/lib/bulk_processor/version.rb CHANGED
@@ -1,3 +1,3 @@
  class BulkProcessor
-   VERSION = '0.3.0'.freeze
+   VERSION = '0.4.0'.freeze
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: bulk-processor
  version: !ruby/object:Gem::Version
-   version: 0.3.0
+   version: 0.4.0
  platform: ruby
  authors:
  - Tom Collier, Justin Richard
@@ -24,6 +24,20 @@ dependencies:
      - - "~>"
        - !ruby/object:Gem::Version
          version: '4'
+ - !ruby/object:Gem::Dependency
+   name: aws-sdk
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.1'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.1'
  - !ruby/object:Gem::Dependency
    name: bundler
    requirement: !ruby/object:Gem::Requirement
@@ -108,6 +122,7 @@ files:
  - lib/bulk_processor/csv_processor/result.rb
  - lib/bulk_processor/csv_processor/row_processor.rb
  - lib/bulk_processor/job.rb
+ - lib/bulk_processor/s3_file.rb
  - lib/bulk_processor/stream_encoder.rb
  - lib/bulk_processor/validated_csv.rb
  - lib/bulk_processor/version.rb