bulk-processor 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +81 -31
- data/lib/bulk_processor/config.rb +1 -0
- data/lib/bulk_processor/csv_processor.rb +98 -0
- data/lib/bulk_processor/job.rb +6 -19
- data/lib/bulk_processor/no_op_handler.rb +12 -0
- data/lib/bulk_processor/validated_csv.rb +30 -11
- data/lib/bulk_processor/version.rb +1 -1
- data/lib/bulk_processor.rb +13 -12
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 56580f10538cc75b8fbfb8006248905cb0cfeb71
|
4
|
+
data.tar.gz: 3ce3bc03cf878836f5a768a3be0fad169bfabdc6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c4fa2fdf92ec038c73c6eec886a682e9cb82f8fdb1ef0ade84a0eefa0eb1fffbd29cd510c3c555c6456b5ee7e600c2e52999031169dfbd773a73a593ee543ee
|
7
|
+
data.tar.gz: b32c0129b95fa2a44e7d2dca5455bca2a60137aac6d0f97b260b60e90d8c79d28acd4e91a12f9d9a6b14e0a1ac4735d505b31bea182499f14c9c19991fbfa930
|
data/README.md
CHANGED
@@ -34,11 +34,48 @@ The default is `:inline`, which skips queueing and processes synchronously. Sinc
|
|
34
34
|
this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters]( http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html ),
|
35
35
|
including `:resque`.
|
36
36
|
|
37
|
-
You will also need to supply a class for
|
38
|
-
|
37
|
+
You will also need to supply a class for CSV processing. This class must respond to the
|
38
|
+
`start` instance method, the `required_columns` and `optional_columns` class methods,
|
39
|
+
and have the following signature for initialize:
|
39
40
|
|
41
|
+
```ruby
|
42
|
+
class PetCSVProcessor
|
43
|
+
# @return [Array<String>] column headers that must be present
|
44
|
+
def self.required_columns
|
45
|
+
['species', 'name', 'age']
|
46
|
+
end
|
47
|
+
|
48
|
+
# @return [Array<String>] column headers that may be present. If a column
|
49
|
+
# header is present that is not in 'required_columns' or 'optional_columns',
|
50
|
+
# the file will be considered invalid and no rows will be processed.
|
51
|
+
def self.optional_columns
|
52
|
+
['favorite_toy', 'talents']
|
53
|
+
end
|
54
|
+
|
55
|
+
def initialize(records, payload:)
|
56
|
+
# Assign instance variables and do any other setup
|
57
|
+
end
|
58
|
+
|
59
|
+
def start
|
60
|
+
# Process the records
|
61
|
+
end
|
62
|
+
end
|
40
63
|
```
|
41
|
-
|
64
|
+
|
65
|
+
To account for a common use case, a base `BulkProcessor::CSVProcessor` class is provided,
|
66
|
+
though it must be explicitly required. This base class can be subclassed to build a CSV processor.
|
67
|
+
This base class implements the initializer and `#start` methods and returns an empty set for `.optional_columns`.
|
68
|
+
|
69
|
+
The `#start` method iterates over each record, processes it using a `RowProcessor`,
|
70
|
+
and accumulates the results, which are then passed off to a `Handler`. An example
|
71
|
+
implementation could look like:
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
require 'bulk_processor/csv_processor'
|
75
|
+
|
76
|
+
class PetCSVProcessor < BulkProcessor::CSVProcessor
|
77
|
+
# Note: this must be overridden in a subclass
|
78
|
+
#
|
42
79
|
# @return [Array<String>] column headers that must be present
|
43
80
|
def self.required_columns
|
44
81
|
['species', 'name', 'age']
|
@@ -51,18 +88,27 @@ class PetItemProcessor
|
|
51
88
|
['favorite_toy', 'talents']
|
52
89
|
end
|
53
90
|
|
54
|
-
#
|
55
|
-
#
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
91
|
+
# Note: this must be overridden in a subclass
|
92
|
+
#
|
93
|
+
# @return [RowProcessor] a class that implements the RowProcessor role
|
94
|
+
def self.row_processor_class
|
95
|
+
PetRowProcessor
|
96
|
+
end
|
97
|
+
|
98
|
+
# @return [Handler] a class that implements the Handler role
|
99
|
+
def self.handler_class
|
100
|
+
PetHandler
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
class PetRowProcessor
|
105
|
+
def initialize(record, payload:)
|
106
|
+
# Assign instance variables and do any other setup
|
61
107
|
end
|
62
108
|
|
63
109
|
# Process the row, e.g. create a new record in the DB, send an email, etc
|
64
110
|
def process!
|
65
|
-
pet = Pet.new(
|
111
|
+
pet = Pet.new(record)
|
66
112
|
if pet.save
|
67
113
|
@success = true
|
68
114
|
else
|
@@ -72,25 +118,17 @@ class PetItemProcessor
|
|
72
118
|
|
73
119
|
# @return [true|false] true iff the item was processed completely
|
74
120
|
def success?
|
75
|
-
@success
|
121
|
+
@success == true
|
76
122
|
end
|
77
123
|
|
78
124
|
# @return [Array<String>] list of messages for this item to pass back to the
|
79
125
|
# completion handler.
|
80
126
|
def messages
|
81
|
-
@messages
|
127
|
+
@messages || []
|
82
128
|
end
|
83
129
|
end
|
84
|
-
```
|
85
130
|
|
86
|
-
|
87
|
-
|
88
|
-
```ruby
|
89
|
-
module NotificationHandler
|
90
|
-
# Handle full or partial processing of records. Unless there was a fatal
|
91
|
-
# error, all row indexes will be present either successes or errors, but not
|
92
|
-
# both.
|
93
|
-
#
|
131
|
+
class PetHandler
|
94
132
|
# @param payload [Hash] the payload passed into 'BulkProcessor.process', can
|
95
133
|
# be used to pass metadata around, e.g. the email address to send a
|
96
134
|
# completion report to
|
@@ -100,25 +138,37 @@ module NotificationHandler
|
|
100
138
|
# (may be empty), e.g. { 0 => [], 1 => ['pet ID = 22 created'] }
|
101
139
|
# @param errors [Hash<Fixnum, Array<String>>] similar structure to successes,
|
102
140
|
# but rows that were not completed successfully.
|
141
|
+
def initialize(payload:, successes:, errors:)
|
142
|
+
# Assign instance variables and do any other setup
|
143
|
+
end
|
144
|
+
|
145
|
+
# Notify the owner that their pets were processed
|
146
|
+
def complete!
|
147
|
+
OwnerMailer.competed(successes, errors)
|
148
|
+
end
|
149
|
+
|
150
|
+
# Notify the owner that processing failed
|
151
|
+
#
|
103
152
|
# @param fatal_error [StandardError] if nil, then all rows were processed,
|
104
153
|
# else the error that was raised is passed in here
|
105
|
-
def
|
106
|
-
|
107
|
-
PetProcessorMailer.fail(payload['recipient'], successes, errors, fatal_error)
|
108
|
-
else
|
109
|
-
PetProcessorMailer.complete(payload['recipient'], successes, errors)
|
110
|
-
end
|
154
|
+
def fail!(fatal_error)
|
155
|
+
OwnerMailer.failed(fatal_error)
|
111
156
|
end
|
112
157
|
end
|
113
158
|
```
|
114
159
|
|
115
|
-
|
160
|
+
Putting it all together
|
116
161
|
|
117
162
|
```ruby
|
118
|
-
processor = BulkProcessor.new(
|
119
|
-
|
163
|
+
processor = BulkProcessor.new(
|
164
|
+
stream: file_stream,
|
165
|
+
processor_class: PetCSVProcessor,
|
166
|
+
payload: {recipient: current_user.email}
|
167
|
+
)
|
168
|
+
if processor.start
|
120
169
|
# The job has been enqueued, go get a coffee and wait
|
121
170
|
else
|
171
|
+
# Something went wrong, alert the file uploader
|
122
172
|
handle_invalid_file(processor.errors)
|
123
173
|
end
|
124
174
|
```
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require_relative 'no_op_handler'
|
2
|
+
|
3
|
+
class BulkProcessor
|
4
|
+
# An abstract implementation of the CSVProcessor role. Provides
|
5
|
+
#
|
6
|
+
# * A default implementation of `.optional_columns`, returning []
|
7
|
+
# * An initializer that assigns the arguments as instance attributes
|
8
|
+
# * An implementation of #start to cover a common use case
|
9
|
+
#
|
10
|
+
# The common use case covered by this class' implementation of `#start` is
|
11
|
+
#
|
12
|
+
# 1. Iteratively process each record
|
13
|
+
# 2. Accumulate the results (did the processing succeed? what were the error
|
14
|
+
# messages?)
|
15
|
+
# 3. Send the results to an instance of the Handler role.
|
16
|
+
#
|
17
|
+
# This class adds 2 class methods that can be overridden in any
|
18
|
+
# subclass
|
19
|
+
#
|
20
|
+
# * row_processor_class - (required) Returns the class that implements the
|
21
|
+
# RowProcessor role to process rows of the CSV
|
22
|
+
# * handler_class - (optional) Returns the class that implements the Handler
|
23
|
+
# role, which handles results from the completion (or failure) of
|
24
|
+
# processing the entire CSV.
|
25
|
+
#
|
26
|
+
# The `required_columns` method must still be implemented in a subclass
|
27
|
+
#
|
28
|
+
class CSVProcessor
|
29
|
+
# @return [RowProcessor] a class that implements the RowProcessor interface
|
30
|
+
def self.row_processor_class
|
31
|
+
raise NotImplementedError,
|
32
|
+
"#{self.class.name} must implement #{__method__}"
|
33
|
+
end
|
34
|
+
|
35
|
+
# @return [Handler] a class that implements the Handler role
|
36
|
+
def self.handler_class
|
37
|
+
NoOpHandler
|
38
|
+
end
|
39
|
+
|
40
|
+
# @return [Array<String>] column headers that must be present
|
41
|
+
def self.required_columns
|
42
|
+
raise NotImplementedError,
|
43
|
+
"#{self.class.name} must implement #{__method__}"
|
44
|
+
end
|
45
|
+
|
46
|
+
# @return [Array<String>] column headers that may be present. If a column
|
47
|
+
# header is present that is not in 'required_columns' or
|
48
|
+
# 'optional_columns', the file will be considered invalid and no rows will
|
49
|
+
# be processed.
|
50
|
+
def self.optional_columns
|
51
|
+
[]
|
52
|
+
end
|
53
|
+
|
54
|
+
def initialize(records, payload: {})
|
55
|
+
@records = records
|
56
|
+
@payload = payload
|
57
|
+
@successes = {}
|
58
|
+
@errors = {}
|
59
|
+
end
|
60
|
+
|
61
|
+
# Iteratively process each record, accumulate the results, and pass those
|
62
|
+
# off to the handler. If an unrescued error is raised for any record,
|
63
|
+
# processing will halt for all remaining records and `#fail!` will be
|
64
|
+
# invoked on the handler.
|
65
|
+
def start
|
66
|
+
records.each_with_index do |record, index|
|
67
|
+
processor = row_processor(record)
|
68
|
+
processor.process!
|
69
|
+
if processor.success?
|
70
|
+
successes[index] = processor.messages
|
71
|
+
else
|
72
|
+
errors[index] = processor.messages
|
73
|
+
end
|
74
|
+
end
|
75
|
+
handler.complete!
|
76
|
+
rescue Exception => exception
|
77
|
+
handler.fail!(exception)
|
78
|
+
|
79
|
+
# Swallow any StandardError, since we are already reporting it to the
|
80
|
+
# user. However, we must re-raise Exceptions, such as SIGTERMs since they
|
81
|
+
# need to be handled at a level above this gem.
|
82
|
+
raise unless exception.is_a?(StandardError)
|
83
|
+
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
attr_reader :records, :payload, :successes, :errors
|
88
|
+
|
89
|
+
def handler
|
90
|
+
self.class.handler_class.new(payload: payload, successes: successes,
|
91
|
+
errors: errors)
|
92
|
+
end
|
93
|
+
|
94
|
+
def row_processor(record)
|
95
|
+
self.class.row_processor_class.new(record, payload: payload)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
data/lib/bulk_processor/job.rb
CHANGED
@@ -1,26 +1,13 @@
|
|
1
|
+
require 'active_job'
|
2
|
+
|
1
3
|
class BulkProcessor
|
4
|
+
# ActiveJob to handle processing the CSV in the background
|
2
5
|
class Job < ActiveJob::Base
|
3
6
|
queue_as 'bulk_processor'
|
4
7
|
|
5
|
-
def perform(records,
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
successes = {}
|
10
|
-
failures = {}
|
11
|
-
records.each_with_index do |record, index|
|
12
|
-
processor = item_proccessor_class.new(record, payload)
|
13
|
-
processor.process!
|
14
|
-
if processor.success?
|
15
|
-
successes[index] = processor.messages
|
16
|
-
else
|
17
|
-
failures[index] = processor.messages
|
18
|
-
end
|
19
|
-
end
|
20
|
-
handler_class.complete(payload, successes, failures, nil)
|
21
|
-
rescue Exception => exception
|
22
|
-
handler_class.complete(payload, successes, failures, exception)
|
23
|
-
raise unless exception.is_a?(StandardError)
|
8
|
+
def perform(records, processor_class, payload)
|
9
|
+
processor = processor_class.constantize.new(records, payload: payload)
|
10
|
+
processor.start
|
24
11
|
end
|
25
12
|
end
|
26
13
|
end
|
@@ -1,11 +1,20 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
1
3
|
class BulkProcessor
|
4
|
+
# A Wrapper on CSV that validates column headers.
|
2
5
|
class ValidatedCSV
|
3
6
|
PARSING_OPTIONS = { headers: true, header_converters: :downcase }
|
4
7
|
private_constant :PARSING_OPTIONS
|
5
8
|
|
9
|
+
# This cryptic message usually just means that the header row contains a
|
10
|
+
# blank field; in ruby ~> 2.1.5 it is the error message for a NoMethodError
|
11
|
+
# raised when parsing a CSV.
|
6
12
|
BAD_HEADERS_ERROR_MSG = "undefined method `encode' for nil:NilClass"
|
7
13
|
private_constant :BAD_HEADERS_ERROR_MSG
|
8
14
|
|
15
|
+
MISSING_COLUMN_MESSAGE = 'Missing or malformed column header, is one of them blank?'
|
16
|
+
private_constant :MISSING_COLUMN_MESSAGE
|
17
|
+
|
9
18
|
attr_reader :errors, :records
|
10
19
|
|
11
20
|
def initialize(stream, required_headers, optional_headers)
|
@@ -15,7 +24,12 @@ class BulkProcessor
|
|
15
24
|
@errors = []
|
16
25
|
end
|
17
26
|
|
27
|
+
# @return [true|false] true iff:
|
28
|
+
# * All required columns are present
|
29
|
+
# * No column exists that isn't a required or optional column
|
30
|
+
# * No column heading is blank
|
18
31
|
def valid?
|
32
|
+
return false if csv.nil?
|
19
33
|
@errors = []
|
20
34
|
|
21
35
|
if missing_headers.any?
|
@@ -26,20 +40,17 @@ class BulkProcessor
|
|
26
40
|
errors << "Unrecognized column(s) found: #{extra_headers.join(', ')}"
|
27
41
|
end
|
28
42
|
|
29
|
-
|
30
|
-
errors <<
|
31
|
-
end
|
32
|
-
rescue NoMethodError => error
|
33
|
-
if error.message == BAD_HEADERS_ERROR_MSG
|
34
|
-
errors << 'Missing or malformed column header, is one of them blank?'
|
35
|
-
else
|
36
|
-
raise error
|
43
|
+
if csv.headers.any? { |header| header.nil? || header.strip == '' }
|
44
|
+
errors << MISSING_COLUMN_MESSAGE
|
37
45
|
end
|
38
|
-
|
39
|
-
|
46
|
+
|
47
|
+
errors.empty?
|
40
48
|
end
|
41
49
|
|
50
|
+
# @return [Array<Hash<String, String>>] a serializable representation of the
|
51
|
+
# CSV that will be passed to the background job.
|
42
52
|
def row_hashes
|
53
|
+
return [] unless valid?
|
43
54
|
csv.map(&:to_hash)
|
44
55
|
end
|
45
56
|
|
@@ -48,7 +59,15 @@ class BulkProcessor
|
|
48
59
|
attr_reader :stream, :required_headers, :optional_headers
|
49
60
|
|
50
61
|
def csv
|
51
|
-
@csv
|
62
|
+
return @csv if instance_variable_defined?('@csv')
|
63
|
+
@csv = CSV.parse(stream, PARSING_OPTIONS)
|
64
|
+
rescue NoMethodError => error
|
65
|
+
if error.message == BAD_HEADERS_ERROR_MSG
|
66
|
+
errors << MISSING_COLUMN_MESSAGE
|
67
|
+
@csv = nil
|
68
|
+
else
|
69
|
+
raise error
|
70
|
+
end
|
52
71
|
end
|
53
72
|
|
54
73
|
def missing_headers
|
data/lib/bulk_processor.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
|
-
require 'active_job'
|
2
|
-
require 'csv'
|
3
|
-
|
4
1
|
require 'bulk_processor/config'
|
5
2
|
require 'bulk_processor/job'
|
6
3
|
require 'bulk_processor/stream_encoder'
|
7
4
|
require 'bulk_processor/validated_csv'
|
8
5
|
require 'bulk_processor/version'
|
9
6
|
|
7
|
+
# Process large CSV files in the background.
|
10
8
|
class BulkProcessor
|
11
9
|
class << self
|
12
10
|
def config
|
@@ -16,31 +14,34 @@ class BulkProcessor
|
|
16
14
|
def configure
|
17
15
|
yield config
|
18
16
|
end
|
19
|
-
|
20
17
|
end
|
21
18
|
|
22
|
-
attr_reader :
|
19
|
+
attr_reader :errors
|
23
20
|
|
24
|
-
def initialize(stream
|
21
|
+
def initialize(stream:, processor_class:, payload: {})
|
25
22
|
@stream = stream
|
26
|
-
@
|
27
|
-
@handler = handler
|
23
|
+
@processor_class = processor_class
|
28
24
|
@payload = payload
|
29
25
|
@errors = []
|
30
26
|
end
|
31
27
|
|
32
|
-
|
28
|
+
# Validate the CSV and enqueue it for processing in the background.
|
29
|
+
def start
|
33
30
|
csv = ValidatedCSV.new(
|
34
31
|
StreamEncoder.new(stream).encoded,
|
35
|
-
|
36
|
-
|
32
|
+
processor_class.required_columns,
|
33
|
+
processor_class.optional_columns
|
37
34
|
)
|
38
35
|
|
39
36
|
if csv.valid?
|
40
|
-
Job.perform_later(csv.row_hashes,
|
37
|
+
Job.perform_later(csv.row_hashes, processor_class.name, payload)
|
41
38
|
else
|
42
39
|
@errors = csv.errors
|
43
40
|
end
|
44
41
|
@errors.empty?
|
45
42
|
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
attr_reader :stream, :processor_class, :payload
|
46
47
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bulk-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Collier, Justin Richard
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activejob
|
@@ -102,7 +102,9 @@ files:
|
|
102
102
|
- bulk-processor.gemspec
|
103
103
|
- lib/bulk_processor.rb
|
104
104
|
- lib/bulk_processor/config.rb
|
105
|
+
- lib/bulk_processor/csv_processor.rb
|
105
106
|
- lib/bulk_processor/job.rb
|
107
|
+
- lib/bulk_processor/no_op_handler.rb
|
106
108
|
- lib/bulk_processor/stream_encoder.rb
|
107
109
|
- lib/bulk_processor/validated_csv.rb
|
108
110
|
- lib/bulk_processor/version.rb
|
@@ -126,8 +128,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
126
128
|
version: '0'
|
127
129
|
requirements: []
|
128
130
|
rubyforge_project:
|
129
|
-
rubygems_version: 2.4.
|
131
|
+
rubygems_version: 2.4.5
|
130
132
|
signing_key:
|
131
133
|
specification_version: 4
|
132
134
|
summary: Background process CSV data
|
133
135
|
test_files: []
|
136
|
+
has_rdoc:
|