bulk-processor 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +59 -28
- data/lib/bulk_processor/csv_processor/no_op_handler.rb +15 -0
- data/lib/bulk_processor/csv_processor/no_op_post_processor.rb +16 -0
- data/lib/bulk_processor/csv_processor/result.rb +20 -0
- data/lib/bulk_processor/csv_processor/row_processor.rb +61 -0
- data/lib/bulk_processor/csv_processor.rb +36 -21
- data/lib/bulk_processor/stream_encoder.rb +1 -1
- data/lib/bulk_processor/validated_csv.rb +7 -6
- data/lib/bulk_processor/version.rb +1 -1
- metadata +6 -3
- data/lib/bulk_processor/no_op_handler.rb +0 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d7b6dc445bf46f3f35477510449e5f56aed3f854
|
4
|
+
data.tar.gz: ef53879ba52375923ba2b55460b5fb217665d68c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a4b9727d06824b5cf68789a4f3fb00d5b039d079201c33f4f68bfdb9ee720a9517df2e78d12614dcf1f1ca2673e7580dfd914b8963d3b463747026ce4681fec6
|
7
|
+
data.tar.gz: 67c87c2517515fd5912d05f494ce2d3454409cdb9417466a112b48be00ab9a4d9cfb1b72fe57f4139403bbdc9980803f5e3140405efdae5c1c1ab54339f95e42
|
data/README.md
CHANGED
@@ -52,12 +52,12 @@ class PetCSVProcessor
|
|
52
52
|
['favorite_toy', 'talents']
|
53
53
|
end
|
54
54
|
|
55
|
-
def initialize(
|
55
|
+
def initialize(csv, payload:)
|
56
56
|
# Assign instance variables and do any other setup
|
57
57
|
end
|
58
58
|
|
59
59
|
def start
|
60
|
-
# Process the
|
60
|
+
# Process the CSV
|
61
61
|
end
|
62
62
|
end
|
63
63
|
```
|
@@ -66,7 +66,7 @@ To account for a common use case, a base `BulkProcessor::CSVProcessor` class is
|
|
66
66
|
though it must be explicitly required. This base class can be subclassed to build a CSV processor.
|
67
67
|
This base class implements the initializer and `#start` methods and returns an empty set for `.optional_columns`.
|
68
68
|
|
69
|
-
The `#start` method iterates over each
|
69
|
+
The `#start` method iterates over each row, processes it using a `RowProcessor`,
|
70
70
|
accumulates the results, which are passed off to a `Handler`. An example
|
71
71
|
implementation could look like:
|
72
72
|
|
@@ -95,56 +95,76 @@ class PetCSVProcessor < BulkProcessor::CSVProcessor
|
|
95
95
|
PetRowProcessor
|
96
96
|
end
|
97
97
|
|
98
|
+
# @return [PostProcessor] a class that implements the PostProcessor role
|
99
|
+
def self.post_processor_class
|
100
|
+
PetPostProcessor
|
101
|
+
end
|
102
|
+
|
98
103
|
# @return [Handler] a class that implements the Handler role
|
99
104
|
def self.handler_class
|
100
105
|
PetHandler
|
101
106
|
end
|
102
107
|
end
|
108
|
+
```
|
103
109
|
|
104
|
-
|
105
|
-
|
106
|
-
# Assign instance variables and do any other setup
|
107
|
-
end
|
108
|
-
|
110
|
+
```ruby
|
111
|
+
class PetRowProcessor < BulkProcessor::CSVProcessor::RowProcessor
|
109
112
|
# Process the row, e.g. create a new record in the DB, send an email, etc
|
110
113
|
def process!
|
111
|
-
pet = Pet.new(
|
114
|
+
pet = Pet.new(row)
|
112
115
|
if pet.save
|
113
|
-
|
116
|
+
self.successful = true
|
114
117
|
else
|
115
|
-
|
118
|
+
messages.concat(pet.errors.full_messages)
|
116
119
|
end
|
117
120
|
end
|
118
121
|
|
119
|
-
#
|
120
|
-
|
121
|
-
|
122
|
+
# Setting these allows us to identify error messages by these key/values for
|
123
|
+
# a row, rather than using the row number
|
124
|
+
def primary_keys
|
125
|
+
['species', 'name']
|
122
126
|
end
|
127
|
+
end
|
128
|
+
```
|
123
129
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
130
|
+
```ruby
|
131
|
+
class PetPostProcessor
|
132
|
+
attr_reader :results
|
133
|
+
|
134
|
+
def initialize(row_processors)
|
135
|
+
# Assign instance variables and do any other setup
|
136
|
+
end
|
137
|
+
|
138
|
+
def start
|
139
|
+
cat_count = 0
|
140
|
+
@results = []
|
141
|
+
row_processors.each do |row_processor|
|
142
|
+
cat_count += 1 if row_processor.cat?
|
143
|
+
end
|
144
|
+
|
145
|
+
if cat_count > 2
|
146
|
+
@results << BulkProcessor::CSVProcessor::Result.new(messages: ['Too many cats!'],
|
147
|
+
successful: false)
|
148
|
+
end
|
128
149
|
end
|
129
150
|
end
|
151
|
+
```
|
130
152
|
|
153
|
+
```ruby
|
131
154
|
class PetHandler
|
132
155
|
# @param payload [Hash] the payload passed into 'BulkProcessor.process', can
|
133
156
|
# be used to pass metadata around, e.g. the email address to send a
|
134
157
|
# completion report to
|
135
|
-
# @param
|
136
|
-
#
|
137
|
-
#
|
138
|
-
|
139
|
-
# @param errors [Hash<Fixnum, Array<String>>] similar structure to successes,
|
140
|
-
# but rows that were not completed successfully.
|
141
|
-
def initialize(payload:, successes:, errors:)
|
158
|
+
# @param results [Array<BulkProcessor::CSVProcessor::Result>] results
|
159
|
+
# for processing the rows (there will be one per row in the CSV plus zero
|
160
|
+
# or more from post-processing)
|
161
|
+
def initialize(payload:, results:)
|
142
162
|
# Assign instance variables and do any other setup
|
143
163
|
end
|
144
164
|
|
145
165
|
# Notify the owner that their pets were processed
|
146
166
|
def complete!
|
147
|
-
OwnerMailer.
|
167
|
+
OwnerMailer.completed(results, payload)
|
148
168
|
end
|
149
169
|
|
150
170
|
# Notify the owner that processing failed
|
@@ -152,7 +172,7 @@ class PetHandler
|
|
152
172
|
# @param fatal_error [StandardError] if nil, then all rows were processed,
|
153
173
|
# else the error that was raised is passed in here
|
154
174
|
def fail!(fatal_error)
|
155
|
-
OwnerMailer.failed(fatal_error)
|
175
|
+
OwnerMailer.failed(fatal_error, payload)
|
156
176
|
end
|
157
177
|
end
|
158
178
|
```
|
@@ -163,7 +183,7 @@ Putting it all together
|
|
163
183
|
processor = BulkProcessor.new(
|
164
184
|
stream: file_stream,
|
165
185
|
processor_class: PetCSVProcessor,
|
166
|
-
payload: {recipient: current_user.email}
|
186
|
+
payload: { recipient: current_user.email }
|
167
187
|
)
|
168
188
|
if processor.start
|
169
189
|
# The job has been enqueued, go get a coffee and wait
|
@@ -173,6 +193,17 @@ else
|
|
173
193
|
end
|
174
194
|
```
|
175
195
|
|
196
|
+
### BulkProcessor::CSVProcessor::Result
|
197
|
+
|
198
|
+
The result instances passed from BulkProcessor::CSVProcessor to the Handler
|
199
|
+
respond to the following messages:
|
200
|
+
|
201
|
+
* `#messages [Array<String>]` - zero or more messages generated when processing the row
|
202
|
+
* `#row_num [Fixnum|nil]` - the CSV row number (starting with 2) or nil if result is from post-processing
|
203
|
+
* `#primary_attributes [Hash]` - a set of values that can be used to identify which row the messages are for.
|
204
|
+
You must override `#primary_keys` to use this.
|
205
|
+
* `#successful?` - true iff the processing happened with no errors
|
206
|
+
|
176
207
|
## Development
|
177
208
|
|
178
209
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -0,0 +1,20 @@
|
|
1
|
+
class BulkProcessor
|
2
|
+
class CSVProcessor
|
3
|
+
# A container for messages generated by processing that need to be passed
|
4
|
+
# back to the handler.
|
5
|
+
class Result
|
6
|
+
attr_reader :messages, :primary_attributes, :row_num
|
7
|
+
|
8
|
+
def initialize(messages:, successful:, row_num: nil, primary_attributes: nil)
|
9
|
+
@messages = messages
|
10
|
+
@successful = successful
|
11
|
+
@row_num = row_num
|
12
|
+
@primary_attributes = primary_attributes
|
13
|
+
end
|
14
|
+
|
15
|
+
def successful?
|
16
|
+
@successful
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
class BulkProcessor
|
2
|
+
class CSVProcessor
|
3
|
+
# An abstract implementation of the RowProcessor role. This class implements
|
4
|
+
# `#result` by returning a `Result`. To subclass, just implement
|
5
|
+
# `#process!` to handle the row.
|
6
|
+
#
|
7
|
+
# The row will be considered a failure by default. After a row is successfully
|
8
|
+
# processed, set `self.successful = true`. If there are any messages that
|
9
|
+
# should be passed back to the Handler, add them to the `#messages` array.
|
10
|
+
#
|
11
|
+
# You can optionally override `#primary_keys` so that the result returned
|
12
|
+
# has more natural identifiers than just the row number. For example, by
|
13
|
+
# setting this to ['species', 'name'] (for the PetRowProcessor example from
|
14
|
+
# the README), the result would have `#primary_attributes` like
|
15
|
+
#
|
16
|
+
# { 'species' => 'dog', 'name' => 'Fido' }
|
17
|
+
#
|
18
|
+
class RowProcessor
|
19
|
+
attr_reader :messages
|
20
|
+
|
21
|
+
def initialize(row, row_num:, payload:)
|
22
|
+
@row = row
|
23
|
+
@row_num = row_num
|
24
|
+
@payload = payload
|
25
|
+
@successful = false
|
26
|
+
@messages = []
|
27
|
+
end
|
28
|
+
|
29
|
+
def process!
|
30
|
+
raise NotImplementedError,
|
31
|
+
"#{self.class.name} must implement #{__method__}"
|
32
|
+
end
|
33
|
+
|
34
|
+
def successful?
|
35
|
+
@successful
|
36
|
+
end
|
37
|
+
|
38
|
+
def result
|
39
|
+
Result.new(messages: messages, row_num: row_num,
|
40
|
+
primary_attributes: primary_attrs, successful: @successful)
|
41
|
+
end
|
42
|
+
|
43
|
+
private
|
44
|
+
|
45
|
+
attr_reader :row, :row_num, :payload
|
46
|
+
attr_writer :successful
|
47
|
+
|
48
|
+
# Override this with an array of column names that can be used to uniquely
|
49
|
+
# identify a row, if you'd prefer to not identify rows by row number
|
50
|
+
def primary_keys
|
51
|
+
[]
|
52
|
+
end
|
53
|
+
|
54
|
+
# @return [Hash<String, String>] the set of primary keys and their values
|
55
|
+
# for this row
|
56
|
+
def primary_attrs
|
57
|
+
row.slice(*primary_keys)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -1,4 +1,7 @@
|
|
1
|
-
require_relative 'no_op_handler'
|
1
|
+
require_relative 'csv_processor/no_op_handler'
|
2
|
+
require_relative 'csv_processor/no_op_post_processor'
|
3
|
+
require_relative 'csv_processor/result'
|
4
|
+
require_relative 'csv_processor/row_processor'
|
2
5
|
|
3
6
|
class BulkProcessor
|
4
7
|
# An abstract implementation of the CSVProcessor role. Provides
|
@@ -9,7 +12,7 @@ class BulkProcessor
|
|
9
12
|
#
|
10
13
|
# The common use case covered by this class' implementation of `#start` is
|
11
14
|
#
|
12
|
-
# 1. Iteratively process each
|
15
|
+
# 1. Iteratively process each row
|
13
16
|
# 2. Accumulate the results (did the processing succeed? what were the error
|
14
17
|
# messages?)
|
15
18
|
# 3. Send the results to an instance of the Handler role.
|
@@ -26,6 +29,12 @@ class BulkProcessor
|
|
26
29
|
# The `required_columns` method must still be implemented in a subclass
|
27
30
|
#
|
28
31
|
class CSVProcessor
|
32
|
+
# Since the first data row in a CSV is row 2, but will have index 0 in
|
33
|
+
# the items array, we need to offset the index by 2 when we add a row
|
34
|
+
# identifier to all error messages.
|
35
|
+
FIRST_ROW_OFFSET = 2
|
36
|
+
private_constant :FIRST_ROW_OFFSET
|
37
|
+
|
29
38
|
# @return [RowProcessor] a class that implements the RowProcessor interface
|
30
39
|
def self.row_processor_class
|
31
40
|
raise NotImplementedError,
|
@@ -37,6 +46,11 @@ class BulkProcessor
|
|
37
46
|
NoOpHandler
|
38
47
|
end
|
39
48
|
|
49
|
+
# @return [PostProcessor] a class that implements the PostProcessor role
|
50
|
+
def self.post_processor_class
|
51
|
+
NoOpPostProcessor
|
52
|
+
end
|
53
|
+
|
40
54
|
# @return [Array<String>] column headers that must be present
|
41
55
|
def self.required_columns
|
42
56
|
raise NotImplementedError,
|
@@ -51,27 +65,22 @@ class BulkProcessor
|
|
51
65
|
[]
|
52
66
|
end
|
53
67
|
|
54
|
-
def initialize(
|
55
|
-
@records = records
|
68
|
+
def initialize(csv, payload: {})
|
56
69
|
@payload = payload
|
57
|
-
@
|
58
|
-
@
|
70
|
+
@row_processors = csv.map.with_index(&method(:row_processor))
|
71
|
+
@results = []
|
59
72
|
end
|
60
73
|
|
61
|
-
# Iteratively process each
|
62
|
-
# off to the handler. If an unrescued error is raised for any
|
63
|
-
# processing will halt for all remaining
|
74
|
+
# Iteratively process each row, accumulate the results, and pass those
|
75
|
+
# off to the handler. If an unrescued error is raised for any row,
|
76
|
+
# processing will halt for all remaining rows and the `#fail!` will be
|
64
77
|
# invoked on the handler.
|
65
78
|
def start
|
66
|
-
|
67
|
-
processor = row_processor(record)
|
79
|
+
row_processors.each do |processor|
|
68
80
|
processor.process!
|
69
|
-
|
70
|
-
successes[index] = processor.messages
|
71
|
-
else
|
72
|
-
errors[index] = processor.messages
|
73
|
-
end
|
81
|
+
results << processor.result
|
74
82
|
end
|
83
|
+
post_processes
|
75
84
|
handler.complete!
|
76
85
|
rescue Exception => exception
|
77
86
|
handler.fail!(exception)
|
@@ -84,15 +93,21 @@ class BulkProcessor
|
|
84
93
|
|
85
94
|
private
|
86
95
|
|
87
|
-
attr_reader :
|
96
|
+
attr_reader :row_processors, :payload, :results
|
88
97
|
|
89
98
|
def handler
|
90
|
-
self.class.handler_class.new(payload: payload,
|
91
|
-
|
99
|
+
self.class.handler_class.new(payload: payload, results: results)
|
100
|
+
end
|
101
|
+
|
102
|
+
def row_processor(row, index)
|
103
|
+
row_num = index + FIRST_ROW_OFFSET
|
104
|
+
self.class.row_processor_class.new(row, row_num: row_num, payload: payload)
|
92
105
|
end
|
93
106
|
|
94
|
-
def
|
95
|
-
self.class.
|
107
|
+
def post_processes
|
108
|
+
post_processor = self.class.post_processor_class.new(row_processors)
|
109
|
+
post_processor.start
|
110
|
+
results.concat(post_processor.results)
|
96
111
|
end
|
97
112
|
end
|
98
113
|
end
|
@@ -2,7 +2,7 @@ class BulkProcessor
|
|
2
2
|
# Force encode a stream into UTF-8 by removing invalid and undefined
|
3
3
|
# characters.
|
4
4
|
class StreamEncoder
|
5
|
-
ENCODING_OPTIONS = { undef: :replace, invalid: :replace, replace: '' }
|
5
|
+
ENCODING_OPTIONS = { undef: :replace, invalid: :replace, replace: '' }.freeze
|
6
6
|
private_constant :ENCODING_OPTIONS
|
7
7
|
|
8
8
|
def initialize(stream)
|
@@ -3,19 +3,20 @@ require 'csv'
|
|
3
3
|
class BulkProcessor
|
4
4
|
# A Wrapper on CSV that validates column headers.
|
5
5
|
class ValidatedCSV
|
6
|
-
PARSING_OPTIONS = { headers: true, header_converters: :downcase }
|
6
|
+
PARSING_OPTIONS = { headers: true, header_converters: :downcase }.freeze
|
7
7
|
private_constant :PARSING_OPTIONS
|
8
8
|
|
9
9
|
# This cryptic message usually just means that the header row contains a
|
10
10
|
# blank field; in ruby ~> 2.1.5 It is the error message for a NoMethodError
|
11
11
|
# raised when parsing a CSV.
|
12
|
-
BAD_HEADERS_ERROR_MSG = "undefined method `encode' for nil:NilClass"
|
12
|
+
BAD_HEADERS_ERROR_MSG = "undefined method `encode' for nil:NilClass".freeze
|
13
13
|
private_constant :BAD_HEADERS_ERROR_MSG
|
14
14
|
|
15
|
-
MISSING_COLUMN_MESSAGE =
|
15
|
+
MISSING_COLUMN_MESSAGE =
|
16
|
+
'Missing or malformed column header, is one of them blank?'.freeze
|
16
17
|
private_constant :MISSING_COLUMN_MESSAGE
|
17
18
|
|
18
|
-
attr_reader :errors
|
19
|
+
attr_reader :errors
|
19
20
|
|
20
21
|
def initialize(stream, required_headers, optional_headers)
|
21
22
|
@stream = stream
|
@@ -33,11 +34,11 @@ class BulkProcessor
|
|
33
34
|
@errors = []
|
34
35
|
|
35
36
|
if missing_headers.any?
|
36
|
-
errors << "Missing required column(s): #{missing_headers.join(', ')}"
|
37
|
+
errors << "Missing required column(s): #{missing_headers.join(', ')}".freeze
|
37
38
|
end
|
38
39
|
|
39
40
|
if extra_headers.any?
|
40
|
-
errors << "Unrecognized column(s) found: #{extra_headers.join(', ')}"
|
41
|
+
errors << "Unrecognized column(s) found: #{extra_headers.join(', ')}".freeze
|
41
42
|
end
|
42
43
|
|
43
44
|
if csv.headers.any? { |header| header.nil? || header.strip == '' }
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bulk-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Collier, Justin Richard
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activejob
|
@@ -103,8 +103,11 @@ files:
|
|
103
103
|
- lib/bulk_processor.rb
|
104
104
|
- lib/bulk_processor/config.rb
|
105
105
|
- lib/bulk_processor/csv_processor.rb
|
106
|
+
- lib/bulk_processor/csv_processor/no_op_handler.rb
|
107
|
+
- lib/bulk_processor/csv_processor/no_op_post_processor.rb
|
108
|
+
- lib/bulk_processor/csv_processor/result.rb
|
109
|
+
- lib/bulk_processor/csv_processor/row_processor.rb
|
106
110
|
- lib/bulk_processor/job.rb
|
107
|
-
- lib/bulk_processor/no_op_handler.rb
|
108
111
|
- lib/bulk_processor/stream_encoder.rb
|
109
112
|
- lib/bulk_processor/validated_csv.rb
|
110
113
|
- lib/bulk_processor/version.rb
|