bulk-processor 0.3.0 → 0.4.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +19 -4
- data/bulk-processor.gemspec +1 -0
- data/lib/bulk_processor.rb +27 -7
- data/lib/bulk_processor/config.rb +5 -0
- data/lib/bulk_processor/csv_processor/row_processor.rb +1 -1
- data/lib/bulk_processor/job.rb +9 -3
- data/lib/bulk_processor/s3_file.rb +83 -0
- data/lib/bulk_processor/validated_csv.rb +1 -8
- data/lib/bulk_processor/version.rb +1 -1
- metadata +16 -1
checksums.yaml
CHANGED
````diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 163fb73da29263963c492e14b85b35290e2f8d64
+  data.tar.gz: 352d9b5659f885f238a922361d6f341496ff36f1
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f908072ce3303676c8c8440dff4f8d47646571be81ea71bc668e523f6bce7e94d5e4553c3f2a31a1d9eeba797a630b47a6c010b9bf97d495f575cdf844a1f4df
+  data.tar.gz: af56ed66e8a44edcabd9962f16ba0cfc89e2d784285506ecc34ec792f636592009e302a970403eb2dca45f21e77bc944718d18108cfd0018905d1f81cc243aad
````
data/.gitignore
CHANGED
data/README.md
CHANGED
````diff
@@ -24,17 +24,29 @@ Or install it yourself as:
 
 ## Usage
 
+### Configuration
+
 Bulk processor requires the following configuration
 
 ```ruby
 BulkProcessor.queue_adapter = <adapter>
+BulkProcessor.temp_directory = '/tmp'
+BulkProcessor.aws.access_key_id = 'my-aws-access-key'
+BulkProcessor.aws.secret_access_key = 'my-aws-secret'
+BulkProcessor.aws.bucket = 'my-s3-bucket'
 ```
 
-The default is `:inline`, which skips queueing and processes synchronously. Since
-this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters](http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html),
+The default queue_adapter is `:inline`, which skips queueing and processes synchronously. Since
+this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters](http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html) are supported,
 including `:resque`.
 
-
+The CSV file passed to BulkProcessor will be persisted on AWS S3 so that the job
+can access it. This requires configuring AWS credentials, the S3 bucket in which
+to store the file, and a local temp directory to hold the file locally.
+
+### Setting up the processor and handler
+
+You will need to supply a class for CSV processing. This class must respond to the
 `start` instance method, the `required_columns` and `optional_columns` class methods,
 and have the following signature for initialize:
 
@@ -62,6 +74,8 @@ class PetCSVProcessor
 end
 ```
 
+#### Swiss Army Knife base class
+
 To account for a common use case, a base `BulkProcessor::CSVProcessor` class is provided,
 though it must be explicitly required. This base class can be subclassed to build a CSV processor.
 This base class implements the initializer and `#start` methods and returns an empty set for `.optional_columns`.
@@ -177,10 +191,11 @@ class PetHandler
 end
 ```
 
-
+### Kicking off the process
 
 ```ruby
 processor = BulkProcessor.new(
+  key: file_name,
   stream: file_stream,
   processor_class: PetCSVProcessor,
   payload: { recipient: current_user.email }
````
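Pulling the README changes together: a 0.4.0 caller configures AWS once and then supplies the new `key:` argument when kicking off a run. A minimal end-to-end sketch, borrowing the README's `PetCSVProcessor` example; credentials and file names are placeholders:

```ruby
require 'bulk_processor'

# One-time configuration (see the new Configuration section above).
BulkProcessor.queue_adapter = :resque
BulkProcessor.temp_directory = '/tmp'
BulkProcessor.aws.access_key_id = 'my-aws-access-key'
BulkProcessor.aws.secret_access_key = 'my-aws-secret'
BulkProcessor.aws.bucket = 'my-s3-bucket'

# Kicking off the process. `key` names the CSV on S3 and doubles as a
# lock: a second start with the same key fails until the job finishes.
processor = BulkProcessor.new(
  key: 'pets.csv',
  stream: File.open('pets.csv'),
  processor_class: PetCSVProcessor,
  payload: { recipient: 'owner@example.com' }
)

puts processor.errors.join(', ') unless processor.start
```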
data/bulk-processor.gemspec
CHANGED
````diff
@@ -22,6 +22,7 @@ success or failure report
   spec.required_ruby_version = '>= 2.1'
 
   spec.add_runtime_dependency 'activejob', '~> 4'
+  spec.add_runtime_dependency 'aws-sdk', '~> 2.1'
 
   spec.add_development_dependency 'bundler'
   spec.add_development_dependency 'pry-byebug', '~> 3'
````
data/lib/bulk_processor.rb
CHANGED
````diff
@@ -1,5 +1,6 @@
 require 'bulk_processor/config'
 require 'bulk_processor/job'
+require 'bulk_processor/s3_file'
 require 'bulk_processor/stream_encoder'
 require 'bulk_processor/validated_csv'
 require 'bulk_processor/version'
@@ -18,7 +19,8 @@ class BulkProcessor
 
   attr_reader :errors
 
-  def initialize(stream:, processor_class:, payload: {})
+  def initialize(key:, stream:, processor_class:, payload: {})
+    @key = key
     @stream = stream
     @processor_class = processor_class
     @payload = payload
@@ -26,22 +28,40 @@ class BulkProcessor
   end
 
   # Validate the CSV and enqueue if for processing in the background.
-  def start
+  def start(file_class: S3File)
+    if file_class.new(key).exists?
+      errors << "Already processing #{key}, please wait for it to finish"
+      return false
+    end
+
+    encoded_contents = StreamEncoder.new(stream).encoded
+
     csv = ValidatedCSV.new(
-
+      encoded_contents,
       processor_class.required_columns,
       processor_class.optional_columns
     )
 
     if csv.valid?
-
+      perform_later(file_class, encoded_contents)
     else
-
+      errors.concat(csv.errors)
     end
-
+    errors.empty?
   end
 
   private
 
-  attr_reader :stream, :processor_class, :payload
+  attr_reader :key, :stream, :processor_class, :payload
+
+  def perform_later(file_class, contents)
+    file = file_class.new(key)
+    file.write(contents)
+    Job.perform_later(processor_class.name, payload, file_class.name, key)
+  rescue Exception
+    # Clean up the file, which is treated as a lock, if we bail out of here
+    # unexpectedly.
+    file.try(:delete)
+    raise
+  end
 end
````
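The reworked `start` treats the S3 object as a lock: the existence check runs before any CSV validation, so a key that is already in flight fails fast. A sketch of that behavior with a hypothetical in-memory stand-in for `S3File` (only the methods the check needs; `file_stream` and `PetCSVProcessor` are the README's examples):

```ruby
# FakeFile is a hypothetical test double, not part of the gem.
class FakeFile
  @@store = {}

  def initialize(key)
    @key = key
  end

  def exists?
    @@store.key?(@key)
  end

  def write(contents)
    @@store[@key] = contents
  end
end

# Simulate an in-flight job by pre-writing the key, then watch #start refuse it.
FakeFile.new('pets.csv').write("name,species\nFido,dog\n")

processor = BulkProcessor.new(
  key: 'pets.csv',
  stream: file_stream,
  processor_class: PetCSVProcessor
)
processor.start(file_class: FakeFile)  # => false, nothing is enqueued
processor.errors
# => ["Already processing pets.csv, please wait for it to finish"]
```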
data/lib/bulk_processor/config.rb
CHANGED
````diff
@@ -2,9 +2,14 @@ class BulkProcessor
   # Store configuration data set by clients
   class Config
     attr_reader :queue_adapter
+    attr_accessor :temp_directory
 
     def queue_adapter=(adapter)
       ActiveJob::Base.queue_adapter = @queue_adapter = adapter
     end
+
+    def aws
+      @aws ||= Struct.new(:access_key_id, :secret_access_key, :bucket).new
+    end
   end
 end
````
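The new `aws` reader lazily memoizes a bare three-field `Struct` instance, which is what makes the `BulkProcessor.aws.access_key_id = ...` style from the README work. A quick sketch of the resulting behavior:

```ruby
config = BulkProcessor::Config.new

config.temp_directory = '/tmp'      # new plain accessor in 0.4.0
config.aws.bucket                   # => nil (struct built lazily, fields unset)
config.aws.bucket = 'my-s3-bucket'
config.aws.bucket                   # => "my-s3-bucket"
```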
data/lib/bulk_processor/job.rb
CHANGED
````diff
@@ -5,9 +5,15 @@ class BulkProcessor
   class Job < ActiveJob::Base
     queue_as 'bulk_processor'
 
-    def perform(
-
-
+    def perform(processor_class, payload, file_class, key)
+      file = file_class.constantize.new(key)
+      file.open do |f|
+        csv = CSV.parse(f.read, headers: true)
+        processor = processor_class.constantize.new(csv, payload: payload)
+        processor.start
+      end
+    ensure
+      file.try(:delete)
     end
   end
 end
````
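Because ActiveJob arguments must be simple serializable values, `Job#perform` now receives the processor and file classes as name strings (constantized on the worker) plus the S3 key, instead of the pre-parsed row hashes that 0.3.0 serialized into the job. The enqueue performed by `BulkProcessor#perform_later` therefore looks roughly like this (argument values are illustrative):

```ruby
BulkProcessor::Job.perform_later(
  'PetCSVProcessor',        # processor_class.name, constantized in #perform
  { recipient: 'owner@example.com' },
  'BulkProcessor::S3File',  # file_class.name
  'pets.csv'                # key; the file is deleted in the ensure block
)
```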
data/lib/bulk_processor/s3_file.rb
ADDED
````diff
@@ -0,0 +1,83 @@
+require 'aws-sdk'
+
+class BulkProcessor
+  # Read and write files in a pre-configured S3 bucket.
+  class S3File
+    NAMESPACE = 'bulk_processor'.freeze
+    private_constant :NAMESPACE
+
+    # @param key [String] the unique identifier (within the bucket) used to
+    #   access the file
+    def initialize(key)
+      @key = "#{NAMESPACE}/#{key}"
+    end
+
+    def exists?
+      client.get_object(bucket: bucket, key: key)
+      true
+    rescue Aws::S3::Errors::NoSuchKey
+      false
+    end
+
+    # Yield the file stored in the bucket identified by the key. The file is
+    # only guaranteed to exist locally within the block, any attempts to access
+    # the file outside of the block will fail.
+    #
+    # @yields [File] a local copy of the remote file
+    def open
+      with_temp_file do |local_file|
+        client.get_object({ bucket: bucket, key: key }, target: local_file)
+        local_file.rewind
+        yield local_file
+      end
+    end
+
+    # Write a new file to the bucket on S3
+    #
+    # @param contents [String] the contents of the file to create
+    # @return [String] the URL of the new file
+    def write(contents)
+      remote_file = resource.bucket(bucket).object(key)
+      remote_file.put(body: contents)
+      remote_file.public_url
+    end
+
+    def delete
+      client.delete_object(bucket: bucket, key: key)
+    end
+
+    private
+
+    attr_reader :bucket, :key
+
+    def bucket
+      BulkProcessor.config.aws.bucket || raise('AWS bucket must be set in the config')
+    end
+
+    def access_key_id
+      BulkProcessor.config.aws.access_key_id || raise('AWS access_key_id must be set in the config')
+    end
+
+    def secret_access_key
+      BulkProcessor.config.aws.secret_access_key || raise('AWS secret_access_key must be set in the config')
+    end
+
+    def resource
+      Aws::S3::Resource.new(client: client)
+    end
+
+    def client
+      credentials = Aws::Credentials.new(access_key_id, secret_access_key)
+      Aws::S3::Client.new(credentials: credentials)
+    end
+
+    def with_temp_file
+      base_dir = Pathname.new(BulkProcessor.config.temp_directory)
+      file = Tempfile.new('aws_utils', base_dir)
+      yield file
+    ensure
+      file.close if file && !file.closed?
+      file.try(:unlink)
+    end
+  end
+end
````
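Note that `S3File` silently namespaces every key under `bulk_processor/` inside the configured bucket. A usage sketch of the new class (assumes the AWS settings from the README are configured; file contents are placeholders):

```ruby
file = BulkProcessor::S3File.new('pets.csv')  # stored at 'bulk_processor/pets.csv'

url = file.write("name,species\nFido,dog\n")  # uploads and returns the public URL
file.exists?                                  # => true

# A local copy is downloaded into config.temp_directory and is only
# valid inside the block; the Tempfile is unlinked afterwards.
file.open { |f| puts f.read }

file.delete
file.exists?                                  # => false
```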
data/lib/bulk_processor/validated_csv.rb
CHANGED
````diff
@@ -3,7 +3,7 @@ require 'csv'
 class BulkProcessor
   # A Wrapper on CSV that validates column headers.
   class ValidatedCSV
-    PARSING_OPTIONS = { headers: true
+    PARSING_OPTIONS = { headers: true }.freeze
     private_constant :PARSING_OPTIONS
 
     # This cryptic message usually just means that the header row contains a
@@ -48,13 +48,6 @@ class BulkProcessor
       errors.empty?
     end
 
-    # @return [Array<Hash<String, String>>] a serializable representation of the
-    #   CSV that will be passed to the background job.
-    def row_hashes
-      return [] unless valid?
-      csv.map(&:to_hash)
-    end
-
     private
 
     attr_reader :stream, :required_headers, :optional_headers
````
metadata
CHANGED
````diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bulk-processor
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.4.0
 platform: ruby
 authors:
 - Tom Collier, Justin Richard
@@ -24,6 +24,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '4'
+- !ruby/object:Gem::Dependency
+  name: aws-sdk
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.1'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -108,6 +122,7 @@ files:
 - lib/bulk_processor/csv_processor/result.rb
 - lib/bulk_processor/csv_processor/row_processor.rb
 - lib/bulk_processor/job.rb
+- lib/bulk_processor/s3_file.rb
 - lib/bulk_processor/stream_encoder.rb
 - lib/bulk_processor/validated_csv.rb
 - lib/bulk_processor/version.rb
````