bulk-processor 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +81 -31
- data/lib/bulk_processor/config.rb +1 -0
- data/lib/bulk_processor/csv_processor.rb +98 -0
- data/lib/bulk_processor/job.rb +6 -19
- data/lib/bulk_processor/no_op_handler.rb +12 -0
- data/lib/bulk_processor/validated_csv.rb +30 -11
- data/lib/bulk_processor/version.rb +1 -1
- data/lib/bulk_processor.rb +13 -12
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 56580f10538cc75b8fbfb8006248905cb0cfeb71
|
4
|
+
data.tar.gz: 3ce3bc03cf878836f5a768a3be0fad169bfabdc6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c4fa2fdf92ec038c73c6eec886a682e9cb82f8fdb1ef0ade84a0eefa0eb1fffbd29cd510c3c555c6456b5ee7e600c2e52999031169dfbd773a73a593ee543ee
|
7
|
+
data.tar.gz: b32c0129b95fa2a44e7d2dca5455bca2a60137aac6d0f97b260b60e90d8c79d28acd4e91a12f9d9a6b14e0a1ac4735d505b31bea182499f14c9c19991fbfa930
|
data/README.md
CHANGED
@@ -34,11 +34,48 @@ The default is `:inline`, which skips queueing and processes synchronously. Sinc
|
|
34
34
|
this is backed by ActiveJob, all of the adapters in [ActiveJob::QueueAdapters]( http://api.rubyonrails.org/classes/ActiveJob/QueueAdapters.html ),
|
35
35
|
including `:resque`.
|
36
36
|
|
37
|
-
You will also need to supply a class for
|
38
|
-
|
37
|
+
You will also need to supply a class for CSV processing. This class must respond to the
|
38
|
+
`start` instance method, the `required_columns` and `optional_columns` class methods,
|
39
|
+
and have the following signature for initialize:
|
39
40
|
|
41
|
+
```ruby
|
42
|
+
class PetCSVProcessor
|
43
|
+
# @return [Array<String>] column headers that must be present
|
44
|
+
def self.required_columns
|
45
|
+
['species', 'name', 'age']
|
46
|
+
end
|
47
|
+
|
48
|
+
# @return [Array<String>] column headers that may be present. If a column
|
49
|
+
# header is present that is not in 'required_columns' or 'optional_columns',
|
50
|
+
# the file will be considered invalid and no rows will be processed.
|
51
|
+
def self.optional_columns
|
52
|
+
['favorite_toy', 'talents']
|
53
|
+
end
|
54
|
+
|
55
|
+
def initialize(records, payload:)
|
56
|
+
# Assign instance variables and do any other setup
|
57
|
+
end
|
58
|
+
|
59
|
+
def start
|
60
|
+
# Process the records
|
61
|
+
end
|
62
|
+
end
|
40
63
|
```
|
41
|
-
|
64
|
+
|
65
|
+
To account for a common use case, a base `BulkProcessor::CSVProcessor` class is provided,
|
66
|
+
though it must be explicitly required. This base class can be subclassed to build a CSV processor.
|
67
|
+
This base class implements the initializer and `#start` methods and returns an empty set for `.optional_columns`.
|
68
|
+
|
69
|
+
The `#start` method iterates over each record, processes it using a `RowProcessor`,
|
70
|
+
and accumulates the results, which are passed off to a `Handler`. An example
|
71
|
+
implementation could look like:
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
require 'bulk_processor/csv_processor'
|
75
|
+
|
76
|
+
class PetCSVProcessor < BulkProcessor::CSVProcessor
|
77
|
+
# Note: this must be overridden in a subclass
|
78
|
+
#
|
42
79
|
# @return [Array<String>] column headers that must be present
|
43
80
|
def self.required_columns
|
44
81
|
['species', 'name', 'age']
|
@@ -51,18 +88,27 @@ class PetItemProcessor
|
|
51
88
|
['favorite_toy', 'talents']
|
52
89
|
end
|
53
90
|
|
54
|
-
#
|
55
|
-
#
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
91
|
+
# Note: this must be overridden in a subclass
|
92
|
+
#
|
93
|
+
# @return [RowProcessor] a class that implements the RowProcessor role
|
94
|
+
def self.row_processor_class
|
95
|
+
PetRowProcessor
|
96
|
+
end
|
97
|
+
|
98
|
+
# @return [Handler] a class that implements the Handler role
|
99
|
+
def self.handler_class
|
100
|
+
PetHandler
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
class PetRowProcessor
|
105
|
+
def initialize(record, payload:)
|
106
|
+
# Assign instance variables and do any other setup
|
61
107
|
end
|
62
108
|
|
63
109
|
# Process the row, e.g. create a new record in the DB, send an email, etc
|
64
110
|
def process!
|
65
|
-
pet = Pet.new(
|
111
|
+
pet = Pet.new(record)
|
66
112
|
if pet.save
|
67
113
|
@success = true
|
68
114
|
else
|
@@ -72,25 +118,17 @@ class PetItemProcessor
|
|
72
118
|
|
73
119
|
# @return [true|false] true iff the item was processed completely
|
74
120
|
def success?
|
75
|
-
@success
|
121
|
+
@success == true
|
76
122
|
end
|
77
123
|
|
78
124
|
# @return [Array<String>] list of messages for this item to pass back to the
|
79
125
|
# completion handler.
|
80
126
|
def messages
|
81
|
-
@messages
|
127
|
+
@messages || []
|
82
128
|
end
|
83
129
|
end
|
84
|
-
```
|
85
130
|
|
86
|
-
|
87
|
-
|
88
|
-
```ruby
|
89
|
-
module NotificationHandler
|
90
|
-
# Handle full or partial processing of records. Unless there was a fatal
|
91
|
-
# error, all row indexes will be present either successes or errors, but not
|
92
|
-
# both.
|
93
|
-
#
|
131
|
+
class PetHandler
|
94
132
|
# @param payload [Hash] the payload passed into 'BulkProcessor.process', can
|
95
133
|
# be used to pass metadata around, e.g. the email address to send a
|
96
134
|
# completion report to
|
@@ -100,25 +138,37 @@ module NotificationHandler
|
|
100
138
|
# (may be empty), e.g. { 0 => [], 1 => ['pet ID = 22 created'] }
|
101
139
|
# @param errors [Hash<Fixnum, Array<String>>] similar structure to successes,
|
102
140
|
# but rows that were not completed successfully.
|
141
|
+
def initialize(payload:, successes:, errors:)
|
142
|
+
# Assign instance variables and do any other setup
|
143
|
+
end
|
144
|
+
|
145
|
+
# Notify the owner that their pets were processed
|
146
|
+
def complete!
|
147
|
+
OwnerMailer.competed(successes, errors)
|
148
|
+
end
|
149
|
+
|
150
|
+
# Notify the owner that processing failed
|
151
|
+
#
|
103
152
|
# @param fatal_error [StandardError] if nil, then all rows were processed,
|
104
153
|
# else the error that was raised is passed in here
|
105
|
-
def
|
106
|
-
|
107
|
-
PetProcessorMailer.fail(payload['recipient'], successes, errors, fatal_error)
|
108
|
-
else
|
109
|
-
PetProcessorMailer.complete(payload['recipient'], successes, errors)
|
110
|
-
end
|
154
|
+
def fail!(fatal_error)
|
155
|
+
OwnerMailer.failed(fatal_error)
|
111
156
|
end
|
112
157
|
end
|
113
158
|
```
|
114
159
|
|
115
|
-
|
160
|
+
Putting it all together
|
116
161
|
|
117
162
|
```ruby
|
118
|
-
processor = BulkProcessor.new(
|
119
|
-
|
163
|
+
processor = BulkProcessor.new(
|
164
|
+
stream: file_stream,
|
165
|
+
processor_class: PetCSVProcessor,
|
166
|
+
payload: {recipient: current_user.email}
|
167
|
+
)
|
168
|
+
if processor.start
|
120
169
|
# The job has been enqueued, go get a coffee and wait
|
121
170
|
else
|
171
|
+
# Something went wrong, alert the file uploader
|
122
172
|
handle_invalid_file(processor.errors)
|
123
173
|
end
|
124
174
|
```
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require_relative 'no_op_handler'
|
2
|
+
|
3
|
+
class BulkProcessor
|
4
|
+
# An abstract implementation of the CSVProcessor role. Provides
|
5
|
+
#
|
6
|
+
# * A default implementation of `.optional_columns`, returning []
|
7
|
+
# * An initializer that assigns the arguments as instance attributes
|
8
|
+
# * An implementation of #start to cover a common use case
|
9
|
+
#
|
10
|
+
# The common use case covered by this class' implementation of `#start` is
|
11
|
+
#
|
12
|
+
# 1. Iteratively process each record
|
13
|
+
# 2. Accumulate the results (did the processing succeed? what were the error
|
14
|
+
# messages?)
|
15
|
+
# 3. Send the results to an instance of the Handler role.
|
16
|
+
#
|
17
|
+
# This class adds 2 class methods that can be overridden in any
|
18
|
+
# subclass
|
19
|
+
#
|
20
|
+
# * row_processor_class - (required) Returns the class that implements the
|
21
|
+
# RowProcessor role to process rows of the CSV
|
22
|
+
# * handler_class - (optional) Returns the class that implements the Handler
|
23
|
+
# role, which handles results from the completion (or failure) of
|
24
|
+
# processing the entire CSV.
|
25
|
+
#
|
26
|
+
# The `required_columns` method must still be implemented in a subclass
|
27
|
+
#
|
28
|
+
class CSVProcessor
|
29
|
+
# @return [RowProcessor] a class that implements the RowProcessor interface
|
30
|
+
def self.row_processor_class
|
31
|
+
raise NotImplementedError,
|
32
|
+
"#{self.class.name} must implement #{__method__}"
|
33
|
+
end
|
34
|
+
|
35
|
+
# @return [Handler] a class that implements the Handler role
|
36
|
+
def self.handler_class
|
37
|
+
NoOpHandler
|
38
|
+
end
|
39
|
+
|
40
|
+
# @return [Array<String>] column headers that must be present
|
41
|
+
def self.required_columns
|
42
|
+
raise NotImplementedError,
|
43
|
+
"#{self.class.name} must implement #{__method__}"
|
44
|
+
end
|
45
|
+
|
46
|
+
# @return [Array<String>] column headers that may be present. If a column
|
47
|
+
# header is present that is not in 'required_columns' or
|
48
|
+
# 'optional_columns', the file will be considered invalid and no rows will
|
49
|
+
# be processed.
|
50
|
+
def self.optional_columns
|
51
|
+
[]
|
52
|
+
end
|
53
|
+
|
54
|
+
def initialize(records, payload: {})
|
55
|
+
@records = records
|
56
|
+
@payload = payload
|
57
|
+
@successes = {}
|
58
|
+
@errors = {}
|
59
|
+
end
|
60
|
+
|
61
|
+
# Iteratively process each record, accumulate the results, and pass those
|
62
|
+
# off to the handler. If an unrescued error is raised for any record,
|
63
|
+
# processing will halt for all remaining records and `#fail!` will be
|
64
|
+
# invoked on the handler.
|
65
|
+
def start
|
66
|
+
records.each_with_index do |record, index|
|
67
|
+
processor = row_processor(record)
|
68
|
+
processor.process!
|
69
|
+
if processor.success?
|
70
|
+
successes[index] = processor.messages
|
71
|
+
else
|
72
|
+
errors[index] = processor.messages
|
73
|
+
end
|
74
|
+
end
|
75
|
+
handler.complete!
|
76
|
+
rescue Exception => exception
|
77
|
+
handler.fail!(exception)
|
78
|
+
|
79
|
+
# Swallow any StandardError, since we are already reporting it to the
|
80
|
+
# user. However, we must re-raise Exceptions, such as SIGTERMs since they
|
81
|
+
# need to be handled at a level above this gem.
|
82
|
+
raise unless exception.is_a?(StandardError)
|
83
|
+
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
attr_reader :records, :payload, :successes, :errors
|
88
|
+
|
89
|
+
def handler
|
90
|
+
self.class.handler_class.new(payload: payload, successes: successes,
|
91
|
+
errors: errors)
|
92
|
+
end
|
93
|
+
|
94
|
+
def row_processor(record)
|
95
|
+
self.class.row_processor_class.new(record, payload: payload)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
data/lib/bulk_processor/job.rb
CHANGED
@@ -1,26 +1,13 @@
|
|
1
|
+
require 'active_job'
|
2
|
+
|
1
3
|
class BulkProcessor
|
4
|
+
# ActiveJob to handle processing the CSV in the background
|
2
5
|
class Job < ActiveJob::Base
|
3
6
|
queue_as 'bulk_processor'
|
4
7
|
|
5
|
-
def perform(records,
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
successes = {}
|
10
|
-
failures = {}
|
11
|
-
records.each_with_index do |record, index|
|
12
|
-
processor = item_proccessor_class.new(record, payload)
|
13
|
-
processor.process!
|
14
|
-
if processor.success?
|
15
|
-
successes[index] = processor.messages
|
16
|
-
else
|
17
|
-
failures[index] = processor.messages
|
18
|
-
end
|
19
|
-
end
|
20
|
-
handler_class.complete(payload, successes, failures, nil)
|
21
|
-
rescue Exception => exception
|
22
|
-
handler_class.complete(payload, successes, failures, exception)
|
23
|
-
raise unless exception.is_a?(StandardError)
|
8
|
+
def perform(records, processor_class, payload)
|
9
|
+
processor = processor_class.constantize.new(records, payload: payload)
|
10
|
+
processor.start
|
24
11
|
end
|
25
12
|
end
|
26
13
|
end
|
@@ -1,11 +1,20 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
1
3
|
class BulkProcessor
|
4
|
+
# A Wrapper on CSV that validates column headers.
|
2
5
|
class ValidatedCSV
|
3
6
|
PARSING_OPTIONS = { headers: true, header_converters: :downcase }
|
4
7
|
private_constant :PARSING_OPTIONS
|
5
8
|
|
9
|
+
# This cryptic message usually just means that the header row contains a
|
10
|
+
# blank field; in ruby ~> 2.1.5 it is the error message for a NoMethodError
|
11
|
+
# raised when parsing a CSV.
|
6
12
|
BAD_HEADERS_ERROR_MSG = "undefined method `encode' for nil:NilClass"
|
7
13
|
private_constant :BAD_HEADERS_ERROR_MSG
|
8
14
|
|
15
|
+
MISSING_COLUMN_MESSAGE = 'Missing or malformed column header, is one of them blank?'
|
16
|
+
private_constant :MISSING_COLUMN_MESSAGE
|
17
|
+
|
9
18
|
attr_reader :errors, :records
|
10
19
|
|
11
20
|
def initialize(stream, required_headers, optional_headers)
|
@@ -15,7 +24,12 @@ class BulkProcessor
|
|
15
24
|
@errors = []
|
16
25
|
end
|
17
26
|
|
27
|
+
# @return [true|false] true iff:
|
28
|
+
# * All required columns are present
|
29
|
+
# * No column exists that isn't a required or optional column
|
30
|
+
# * No column heading is blank
|
18
31
|
def valid?
|
32
|
+
return false if csv.nil?
|
19
33
|
@errors = []
|
20
34
|
|
21
35
|
if missing_headers.any?
|
@@ -26,20 +40,17 @@ class BulkProcessor
|
|
26
40
|
errors << "Unrecognized column(s) found: #{extra_headers.join(', ')}"
|
27
41
|
end
|
28
42
|
|
29
|
-
|
30
|
-
errors <<
|
31
|
-
end
|
32
|
-
rescue NoMethodError => error
|
33
|
-
if error.message == BAD_HEADERS_ERROR_MSG
|
34
|
-
errors << 'Missing or malformed column header, is one of them blank?'
|
35
|
-
else
|
36
|
-
raise error
|
43
|
+
if csv.headers.any? { |header| header.nil? || header.strip == '' }
|
44
|
+
errors << MISSING_COLUMN_MESSAGE
|
37
45
|
end
|
38
|
-
|
39
|
-
|
46
|
+
|
47
|
+
errors.empty?
|
40
48
|
end
|
41
49
|
|
50
|
+
# @return [Array<Hash<String, String>>] a serializable representation of the
|
51
|
+
# CSV that will be passed to the background job.
|
42
52
|
def row_hashes
|
53
|
+
return [] unless valid?
|
43
54
|
csv.map(&:to_hash)
|
44
55
|
end
|
45
56
|
|
@@ -48,7 +59,15 @@ class BulkProcessor
|
|
48
59
|
attr_reader :stream, :required_headers, :optional_headers
|
49
60
|
|
50
61
|
def csv
|
51
|
-
@csv
|
62
|
+
return @csv if instance_variable_defined?('@csv')
|
63
|
+
@csv = CSV.parse(stream, PARSING_OPTIONS)
|
64
|
+
rescue NoMethodError => error
|
65
|
+
if error.message == BAD_HEADERS_ERROR_MSG
|
66
|
+
errors << MISSING_COLUMN_MESSAGE
|
67
|
+
@csv = nil
|
68
|
+
else
|
69
|
+
raise error
|
70
|
+
end
|
52
71
|
end
|
53
72
|
|
54
73
|
def missing_headers
|
data/lib/bulk_processor.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
|
-
require 'active_job'
|
2
|
-
require 'csv'
|
3
|
-
|
4
1
|
require 'bulk_processor/config'
|
5
2
|
require 'bulk_processor/job'
|
6
3
|
require 'bulk_processor/stream_encoder'
|
7
4
|
require 'bulk_processor/validated_csv'
|
8
5
|
require 'bulk_processor/version'
|
9
6
|
|
7
|
+
# Process large CSV files in the background.
|
10
8
|
class BulkProcessor
|
11
9
|
class << self
|
12
10
|
def config
|
@@ -16,31 +14,34 @@ class BulkProcessor
|
|
16
14
|
def configure
|
17
15
|
yield config
|
18
16
|
end
|
19
|
-
|
20
17
|
end
|
21
18
|
|
22
|
-
attr_reader :
|
19
|
+
attr_reader :errors
|
23
20
|
|
24
|
-
def initialize(stream
|
21
|
+
def initialize(stream:, processor_class:, payload: {})
|
25
22
|
@stream = stream
|
26
|
-
@
|
27
|
-
@handler = handler
|
23
|
+
@processor_class = processor_class
|
28
24
|
@payload = payload
|
29
25
|
@errors = []
|
30
26
|
end
|
31
27
|
|
32
|
-
|
28
|
+
# Validate the CSV and enqueue it for processing in the background.
|
29
|
+
def start
|
33
30
|
csv = ValidatedCSV.new(
|
34
31
|
StreamEncoder.new(stream).encoded,
|
35
|
-
|
36
|
-
|
32
|
+
processor_class.required_columns,
|
33
|
+
processor_class.optional_columns
|
37
34
|
)
|
38
35
|
|
39
36
|
if csv.valid?
|
40
|
-
Job.perform_later(csv.row_hashes,
|
37
|
+
Job.perform_later(csv.row_hashes, processor_class.name, payload)
|
41
38
|
else
|
42
39
|
@errors = csv.errors
|
43
40
|
end
|
44
41
|
@errors.empty?
|
45
42
|
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
attr_reader :stream, :processor_class, :payload
|
46
47
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bulk-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Collier, Justin Richard
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activejob
|
@@ -102,7 +102,9 @@ files:
|
|
102
102
|
- bulk-processor.gemspec
|
103
103
|
- lib/bulk_processor.rb
|
104
104
|
- lib/bulk_processor/config.rb
|
105
|
+
- lib/bulk_processor/csv_processor.rb
|
105
106
|
- lib/bulk_processor/job.rb
|
107
|
+
- lib/bulk_processor/no_op_handler.rb
|
106
108
|
- lib/bulk_processor/stream_encoder.rb
|
107
109
|
- lib/bulk_processor/validated_csv.rb
|
108
110
|
- lib/bulk_processor/version.rb
|
@@ -126,8 +128,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
126
128
|
version: '0'
|
127
129
|
requirements: []
|
128
130
|
rubyforge_project:
|
129
|
-
rubygems_version: 2.4.
|
131
|
+
rubygems_version: 2.4.5
|
130
132
|
signing_key:
|
131
133
|
specification_version: 4
|
132
134
|
summary: Background process CSV data
|
133
135
|
test_files: []
|
136
|
+
has_rdoc:
|