bulk-processor 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +19 -4
- data/bulk-processor.gemspec +1 -0
- data/lib/bulk_processor.rb +27 -7
- data/lib/bulk_processor/config.rb +5 -0
- data/lib/bulk_processor/csv_processor/row_processor.rb +1 -1
- data/lib/bulk_processor/job.rb +9 -3
- data/lib/bulk_processor/s3_file.rb +83 -0
- data/lib/bulk_processor/validated_csv.rb +1 -8
- data/lib/bulk_processor/version.rb +1 -1
- metadata +16 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 163fb73da29263963c492e14b85b35290e2f8d64
|
4
|
+
data.tar.gz: 352d9b5659f885f238a922361d6f341496ff36f1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f908072ce3303676c8c8440dff4f8d47646571be81ea71bc668e523f6bce7e94d5e4553c3f2a31a1d9eeba797a630b47a6c010b9bf97d495f575cdf844a1f4df
|
7
|
+
data.tar.gz: af56ed66e8a44edcabd9962f16ba0cfc89e2d784285506ecc34ec792f636592009e302a970403eb2dca45f21e77bc944718d18108cfd0018905d1f81cc243aad
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -24,17 +24,29 @@ Or install it yourself as:
|
|
24
24
|
|
25
25
|
## Usage
|
26
26
|
|
27
|
+
### Configuration
|
28
|
+
|
27
29
|
Bulk processor requires the following configuration
|
28
30
|
|
29
31
|
```ruby
|
30
32
|
BulkProcessor.queue_adapter = <adapter>
|
33
|
+
BulkProcessor.temp_directory = '/tmp'
|
34
|
+
BulkProcessor.aws.access_key_id = 'my-aws-access-key'
|
35
|
+
BulkProcessor.aws.secret_access_key = 'my-aws-secret'
|
36
|
+
BulkProcessor.aws.bucket = 'my-s3-bucket'
|
31
37
|
```
|
32
38
|
|
33
|
-
The default is `:inline`, which skips queueing and processes synchronously. Since
|
34
|
-
this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters]( http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html ),
|
39
|
+
The default queue_adapter is `:inline`, which skips queueing and processes synchronously. Since
|
40
|
+
this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters]( http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html ) are supported,
|
35
41
|
including `:resque`.
|
36
42
|
|
37
|
-
|
43
|
+
The CSV file passed to BulkProcessor will be persisted on AWS S3 so that the job
|
44
|
+
can access it. This requires configuring AWS credentials, the S3 bucket in which
|
45
|
+
to store the file, and a local temp directory to hold a downloaded copy of the file.
|
46
|
+
|
47
|
+
### Setting up the processor and handler
|
48
|
+
|
49
|
+
You will need to supply a class for CSV processing. This class must respond to the
|
38
50
|
`start` instance method, the `required_columns` and `optional_columns` class methods,
|
39
51
|
and have the following signature for initialize:
|
40
52
|
|
@@ -62,6 +74,8 @@ class PetCSVProcessor
|
|
62
74
|
end
|
63
75
|
```
|
64
76
|
|
77
|
+
#### Swiss Army Knife base class
|
78
|
+
|
65
79
|
To account for a common use case, a base `BulkProcessor::CSVProcessor` class is provided,
|
66
80
|
though it must be explicitly required. This base class can be subclassed to build a CSV processor.
|
67
81
|
This base class implements the initializer and `#start` methods and returns an empty set for `.optional_columns`.
|
@@ -177,10 +191,11 @@ class PetHandler
|
|
177
191
|
end
|
178
192
|
```
|
179
193
|
|
180
|
-
|
194
|
+
### Kicking off the process
|
181
195
|
|
182
196
|
```ruby
|
183
197
|
processor = BulkProcessor.new(
|
198
|
+
key: file_name,
|
184
199
|
stream: file_stream,
|
185
200
|
processor_class: PetCSVProcessor,
|
186
201
|
payload: { recipient: current_user.email }
|
data/bulk-processor.gemspec
CHANGED
@@ -22,6 +22,7 @@ success or failure report
|
|
22
22
|
spec.required_ruby_version = '>= 2.1'
|
23
23
|
|
24
24
|
spec.add_runtime_dependency 'activejob', '~> 4'
|
25
|
+
spec.add_runtime_dependency 'aws-sdk', '~> 2.1'
|
25
26
|
|
26
27
|
spec.add_development_dependency 'bundler'
|
27
28
|
spec.add_development_dependency 'pry-byebug', '~> 3'
|
data/lib/bulk_processor.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'bulk_processor/config'
|
2
2
|
require 'bulk_processor/job'
|
3
|
+
require 'bulk_processor/s3_file'
|
3
4
|
require 'bulk_processor/stream_encoder'
|
4
5
|
require 'bulk_processor/validated_csv'
|
5
6
|
require 'bulk_processor/version'
|
@@ -18,7 +19,8 @@ class BulkProcessor
|
|
18
19
|
|
19
20
|
attr_reader :errors
|
20
21
|
|
21
|
-
def initialize(stream:, processor_class:, payload: {})
|
22
|
+
def initialize(key:, stream:, processor_class:, payload: {})
|
23
|
+
@key = key
|
22
24
|
@stream = stream
|
23
25
|
@processor_class = processor_class
|
24
26
|
@payload = payload
|
@@ -26,22 +28,40 @@ class BulkProcessor
|
|
26
28
|
end
|
27
29
|
|
28
30
|
# Validate the CSV and enqueue it for processing in the background.
|
29
|
-
def start
|
31
|
+
def start(file_class: S3File)
|
32
|
+
if file_class.new(key).exists?
|
33
|
+
errors << "Already processing #{key}, please wait for it to finish"
|
34
|
+
return false
|
35
|
+
end
|
36
|
+
|
37
|
+
encoded_contents = StreamEncoder.new(stream).encoded
|
38
|
+
|
30
39
|
csv = ValidatedCSV.new(
|
31
|
-
|
40
|
+
encoded_contents,
|
32
41
|
processor_class.required_columns,
|
33
42
|
processor_class.optional_columns
|
34
43
|
)
|
35
44
|
|
36
45
|
if csv.valid?
|
37
|
-
|
46
|
+
perform_later(file_class, encoded_contents)
|
38
47
|
else
|
39
|
-
|
48
|
+
errors.concat(csv.errors)
|
40
49
|
end
|
41
|
-
|
50
|
+
errors.empty?
|
42
51
|
end
|
43
52
|
|
44
53
|
private
|
45
54
|
|
46
|
-
attr_reader :stream, :processor_class, :payload
|
55
|
+
attr_reader :key, :stream, :processor_class, :payload
|
56
|
+
|
57
|
+
def perform_later(file_class, contents)
|
58
|
+
file = file_class.new(key)
|
59
|
+
file.write(contents)
|
60
|
+
Job.perform_later(processor_class.name, payload, file_class.name, key)
|
61
|
+
rescue Exception
|
62
|
+
# Clean up the file, which is treated as a lock, if we bail out of here
|
63
|
+
# unexpectedly.
|
64
|
+
file.try(:delete)
|
65
|
+
raise
|
66
|
+
end
|
47
67
|
end
|
@@ -2,9 +2,14 @@ class BulkProcessor
|
|
2
2
|
# Store configuration data set by clients
|
3
3
|
class Config
|
4
4
|
attr_reader :queue_adapter
|
5
|
+
attr_accessor :temp_directory
|
5
6
|
|
6
7
|
def queue_adapter=(adapter)
|
7
8
|
ActiveJob::Base.queue_adapter = @queue_adapter = adapter
|
8
9
|
end
|
10
|
+
|
11
|
+
def aws
|
12
|
+
@aws ||= Struct.new(:access_key_id, :secret_access_key, :bucket).new
|
13
|
+
end
|
9
14
|
end
|
10
15
|
end
|
data/lib/bulk_processor/job.rb
CHANGED
@@ -5,9 +5,15 @@ class BulkProcessor
|
|
5
5
|
class Job < ActiveJob::Base
|
6
6
|
queue_as 'bulk_processor'
|
7
7
|
|
8
|
-
def perform(
|
9
|
-
|
10
|
-
|
8
|
+
def perform(processor_class, payload, file_class, key)
|
9
|
+
file = file_class.constantize.new(key)
|
10
|
+
file.open do |f|
|
11
|
+
csv = CSV.parse(f.read, headers: true)
|
12
|
+
processor = processor_class.constantize.new(csv, payload: payload)
|
13
|
+
processor.start
|
14
|
+
end
|
15
|
+
ensure
|
16
|
+
file.try(:delete)
|
11
17
|
end
|
12
18
|
end
|
13
19
|
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'aws-sdk'
|
2
|
+
|
3
|
+
class BulkProcessor
|
4
|
+
# Read and write files in a pre-configured S3 bucket.
|
5
|
+
class S3File
|
6
|
+
NAMESPACE = 'bulk_processor'.freeze
|
7
|
+
private_constant :NAMESPACE
|
8
|
+
|
9
|
+
# @param key [String] the unique identifier (within the bucket) used to
|
10
|
+
# access the file
|
11
|
+
def initialize(key)
|
12
|
+
@key = "#{NAMESPACE}/#{key}"
|
13
|
+
end
|
14
|
+
|
15
|
+
def exists?
|
16
|
+
client.get_object(bucket: bucket, key: key)
|
17
|
+
true
|
18
|
+
rescue Aws::S3::Errors::NoSuchKey
|
19
|
+
false
|
20
|
+
end
|
21
|
+
|
22
|
+
# Yield the file stored in the bucket identified by the key. The file is
|
23
|
+
# only guaranteed to exist locally within the block; any attempts to access
|
24
|
+
# the file outside of the block will fail.
|
25
|
+
#
|
26
|
+
# @yieldparam local_file [File] a local copy of the remote file
|
27
|
+
def open
|
28
|
+
with_temp_file do |local_file|
|
29
|
+
client.get_object({ bucket: bucket, key: key }, target: local_file)
|
30
|
+
local_file.rewind
|
31
|
+
yield local_file
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Write a new file to the bucket on S3
|
36
|
+
#
|
37
|
+
# @param contents [String] the contents of the file to create
|
38
|
+
# @return [String] the URL of the new file
|
39
|
+
def write(contents)
|
40
|
+
remote_file = resource.bucket(bucket).object(key)
|
41
|
+
remote_file.put(body: contents)
|
42
|
+
remote_file.public_url
|
43
|
+
end
|
44
|
+
|
45
|
+
def delete
|
46
|
+
client.delete_object(bucket: bucket, key: key)
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
attr_reader :bucket, :key
|
52
|
+
|
53
|
+
def bucket
|
54
|
+
BulkProcessor.config.aws.bucket || raise('AWS bucket must be set in the config')
|
55
|
+
end
|
56
|
+
|
57
|
+
def access_key_id
|
58
|
+
BulkProcessor.config.aws.access_key_id || raise('AWS access_key_id must be set in the config')
|
59
|
+
end
|
60
|
+
|
61
|
+
def secret_access_key
|
62
|
+
BulkProcessor.config.aws.secret_access_key || raise('AWS secret_access_key must be set in the config')
|
63
|
+
end
|
64
|
+
|
65
|
+
def resource
|
66
|
+
Aws::S3::Resource.new(client: client)
|
67
|
+
end
|
68
|
+
|
69
|
+
def client
|
70
|
+
credentials = Aws::Credentials.new(access_key_id, secret_access_key)
|
71
|
+
Aws::S3::Client.new(credentials: credentials)
|
72
|
+
end
|
73
|
+
|
74
|
+
def with_temp_file
|
75
|
+
base_dir = Pathname.new(BulkProcessor.config.temp_directory)
|
76
|
+
file = Tempfile.new('aws_utils', base_dir)
|
77
|
+
yield file
|
78
|
+
ensure
|
79
|
+
file.close if file && !file.closed?
|
80
|
+
file.try(:unlink)
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
@@ -3,7 +3,7 @@ require 'csv'
|
|
3
3
|
class BulkProcessor
|
4
4
|
# A Wrapper on CSV that validates column headers.
|
5
5
|
class ValidatedCSV
|
6
|
-
PARSING_OPTIONS = { headers: true
|
6
|
+
PARSING_OPTIONS = { headers: true }.freeze
|
7
7
|
private_constant :PARSING_OPTIONS
|
8
8
|
|
9
9
|
# This cryptic message usually just means that the header row contains a
|
@@ -48,13 +48,6 @@ class BulkProcessor
|
|
48
48
|
errors.empty?
|
49
49
|
end
|
50
50
|
|
51
|
-
# @return [Array<Hash<String, String>>] a serializable representation of the
|
52
|
-
# CSV that will be passed to the background job.
|
53
|
-
def row_hashes
|
54
|
-
return [] unless valid?
|
55
|
-
csv.map(&:to_hash)
|
56
|
-
end
|
57
|
-
|
58
51
|
private
|
59
52
|
|
60
53
|
attr_reader :stream, :required_headers, :optional_headers
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bulk-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Collier, Justin Richard
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '4'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: aws-sdk
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.1'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.1'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: bundler
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,6 +122,7 @@ files:
|
|
108
122
|
- lib/bulk_processor/csv_processor/result.rb
|
109
123
|
- lib/bulk_processor/csv_processor/row_processor.rb
|
110
124
|
- lib/bulk_processor/job.rb
|
125
|
+
- lib/bulk_processor/s3_file.rb
|
111
126
|
- lib/bulk_processor/stream_encoder.rb
|
112
127
|
- lib/bulk_processor/validated_csv.rb
|
113
128
|
- lib/bulk_processor/version.rb
|