bulk-processor 0.5.1 → 0.6.0
- checksums.yaml +4 -4
- data/README.md +62 -1
- data/lib/bulk_processor/back_end/active_job.rb +9 -9
- data/lib/bulk_processor/back_end/dynosaur.rb +12 -9
- data/lib/bulk_processor/back_end.rb +3 -4
- data/lib/bulk_processor/config.rb +5 -0
- data/lib/bulk_processor/file_splitter.rb +59 -0
- data/lib/bulk_processor/job/process_csv.rb +22 -0
- data/lib/bulk_processor/job/split_csv.rb +41 -0
- data/lib/bulk_processor/row_chunker/balanced.rb +29 -0
- data/lib/bulk_processor/row_chunker/boundary.rb +55 -0
- data/lib/bulk_processor/tasks.rb +12 -3
- data/lib/bulk_processor/version.rb +1 -1
- data/lib/bulk_processor.rb +12 -8
- metadata +7 -3
- data/lib/bulk_processor/job.rb +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77320b807b3cd9862490408058611d9b461cf83f
+  data.tar.gz: 203e565ab7f722c6f639527b4065e2e5f495aa57
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4f89dd796184485f44d0018a9819cf42ec5c147ff20693c63e7055b43fd2e0807e5b268f622e8555157e1ac36b01207c9b23fe6f0212eb2a744924b6e3533d91
+  data.tar.gz: 2373584252697f040d460070a93958cc944c2f0045948233125f4e0ee06d39daf5080191dd0f07e6ad0432e461c7947e01e821a5e81e7dbc994c558571a1da44
data/README.md
CHANGED
@@ -65,7 +65,7 @@ The CSV file passed to BulkProcessor will be persisted on AWS S3 so that the job
 can access it. This requires configuring AWS credentials, the S3 bucket in which
 to store the file, and a local temp directory to hold the file locally.
 
-### Setting up the processor
+### Setting up the processor
 
 You will need to supply a class for CSV processing. This class must respond to the
 `start` instance method, the `required_columns` and `optional_columns` class methods,
@@ -229,6 +229,67 @@ else
 end
 ```
 
+#### Parallelization
+
+For larger CSV files, you may wish to process rows in parallel. This gem allows
+you to scale up to an arbitrary number of parallel processes by providing an optional
+argument to `#start`. Doing this will cause the input CSV file to be split into
+*N* number of smaller CSV files, each one being processed in separate processes.
+It is important to note that the file *must* be sorted by the boundary column for
+it to deliver on its promise.
+
+```ruby
+processor = BulkProcessor.new(
+  key: file_name,
+  stream: file_stream,
+  processor_class: PetCSVProcessor,
+  payload: { recipient: current_user.email }
+)
+if processor.start(5)
+  # Split the main CSV into 5 smaller files and process in parallel.
+else
+  # Something went wrong, alert the file uploader
+  handle_invalid_file(processor.errors)
+end
+```
+
+By default, the file will be split into equal-sized partitions. If you need the partitions
+to keep all rows with the same value for a column into the same partition, define `.boundary_column`
+on the processor class to return the name of that column. E.g.
+
+```csv
+pet_id,meal,mead_date
+1,kibble,2015-11-02
+1,bits,2015-11-03
+...
+1,alpo,2015-12-31
+2,alpo,2015-11-01
+...
+```
+
+```ruby
+class PetCSVProcessor
+  def self.boundary_column
+    'pet_id'
+  end
+  ...
+end
+```
+
+Finally, to be notified of any failures in the splitting process, you can define
+`.handler_class` on your processor class to return a class that implements the Handler role.
+If an error is raised in the splitting, `#fail!` will be called on the Handler with
+the error.
+
+```ruby
+class PetCSVProcessor
+  def self.handler_class
+    PetHandler
+  end
+  ...
+end
+```
+
 ### BulkProcessor::CSVProcessor::Result
 
 The result instances passed from BulkProcessor::CSVProcessor to the Handler
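To make the Handler role above concrete, here is a minimal sketch of a class that would satisfy the failure path only. The constructor keywords and the `#fail!(error)` signature mirror how `Job::SplitCSV` invokes the handler later in this diff; the class name comes from the README example, and the warning message is purely hypothetical.

```ruby
# Minimal Handler sketch: BulkProcessor's split job instantiates it with
# payload: and results: keywords and calls #fail!(error) if splitting raises.
class PetHandler
  def initialize(payload:, results:)
    @payload = payload
    @results = results
  end

  def fail!(error)
    # Hypothetical alerting; any notification mechanism would do here.
    warn "CSV split for #{@payload['recipient']} failed: #{error.message}"
  end
end
```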
data/lib/bulk_processor/back_end/active_job.rb
CHANGED
@@ -1,25 +1,25 @@
 class BulkProcessor
   module BackEnd
+    # Execute jobs via ActiveJob, e.g. Resque
     class ActiveJob
-      def initialize(processor_class:, payload:,
+      def initialize(processor_class:, payload:, key:)
         @processor_class = processor_class
         @payload = payload
-        @file_class = file_class
         @key = key
       end
 
       def start
-        Job.perform_later(
-
-
-
-
-
+        Job::ProcessCSV.perform_later(processor_class.name, payload, key)
+      end
+
+      def split(num_processes)
+        Job::SplitCSV.perform_later(processor_class.name, payload,
+                                    key, num_processes)
       end
 
       private
 
-      attr_reader :processor_class, :payload, :
+      attr_reader :processor_class, :payload, :key
     end
   end
 end
data/lib/bulk_processor/back_end/dynosaur.rb
CHANGED
@@ -2,11 +2,11 @@ require 'dynosaur'
 
 class BulkProcessor
   module BackEnd
+    # Execute jobs via rake tasks that will spawn a new Heroku dyno
     class Dynosaur
-      def initialize(processor_class:, payload:,
+      def initialize(processor_class:, payload:, key:)
         @processor_class = processor_class
         @payload = payload
-        @file_class = file_class
         @key = key
         configure_dynosaur
       end
@@ -14,19 +14,22 @@ class BulkProcessor
       def start
         args = {
           task: 'bulk_processor:start',
-          args: [
-
-
-
-
-
+          args: [processor_class.name, payload, key]
+        }
+        ::Dynosaur::Process::Heroku.new(args).start
+      end
+
+      def split(num_processes)
+        args = {
+          task: 'bulk_processor:split',
+          args: [processor_class.name, payload, key, num_processes]
         }
         ::Dynosaur::Process::Heroku.new(args).start
       end
 
       private
 
-      attr_reader :processor_class, :payload, :
+      attr_reader :processor_class, :payload, :key
 
       def configure_dynosaur
         ::Dynosaur::Client::HerokuClient.configure do |config|
data/lib/bulk_processor/back_end.rb
CHANGED
@@ -1,14 +1,13 @@
 class BulkProcessor
   module BackEnd
     class << self
-      def start(processor_class:, payload:,
+      def start(processor_class:, payload:, key:, num_processes: 1)
        back_end = back_end_class.new(
          processor_class: processor_class,
-          payload: payload,
-          file_class: file_class,
+          payload: PayloadSerializer.serialize(payload),
          key: key
        )
-        back_end.start
+        num_processes > 1 ? back_end.split(num_processes) : back_end.start
      end
 
      private
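A usage sketch of the updated `BackEnd.start`, assuming a back end has been configured: with the new `num_processes` argument it serializes the payload and dispatches to the back end's `#split`; when the argument is omitted it falls back to the old single-process `#start`. The processor class and key are the hypothetical ones from the README example.

```ruby
# Fan-out path: payload is serialized, then back_end.split(4) is called.
BulkProcessor::BackEnd.start(
  processor_class: PetCSVProcessor,
  payload: { recipient: 'owner@example.com' },
  key: 'pets.csv',
  num_processes: 4
)

# Unchanged single-process path: num_processes defaults to 1, so back_end.start runs.
BulkProcessor::BackEnd.start(
  processor_class: PetCSVProcessor,
  payload: { recipient: 'owner@example.com' },
  key: 'pets.csv'
)
```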
data/lib/bulk_processor/config.rb
CHANGED
@@ -2,12 +2,17 @@ class BulkProcessor
   # Store configuration data set by clients
   class Config
     attr_reader :queue_adapter
+    attr_writer :file_class
     attr_accessor :back_end, :temp_directory
 
     def queue_adapter=(adapter)
       ActiveJob::Base.queue_adapter = @queue_adapter = adapter
     end
 
+    def file_class
+      @file_class || BulkProcessor::S3File
+    end
+
     def aws
       @aws ||= Struct.new(:access_key_id, :secret_access_key, :bucket).new
     end
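The new `file_class` setting defaults to `BulkProcessor::S3File` but can be swapped for any class with the same interface. A hypothetical local-disk replacement, limited to the methods this diff actually calls (`new(key)`, `exists?`, `write`, `open`, and `delete`), might look like:

```ruby
require 'fileutils'

# Hypothetical drop-in for BulkProcessor::S3File that keeps files on local disk.
class LocalFile
  def initialize(key)
    @path = File.join('/tmp/bulk_processor', key)
  end

  def exists?
    File.exist?(@path)
  end

  def write(contents)
    FileUtils.mkdir_p(File.dirname(@path))
    File.write(@path, contents)
  end

  def open(&block)
    File.open(@path, 'r', &block)
  end

  def delete
    File.delete(@path) if File.exist?(@path)
  end
end

BulkProcessor.config.file_class = LocalFile
```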
data/lib/bulk_processor/file_splitter.rb
ADDED
@@ -0,0 +1,59 @@
+class BulkProcessor
+  # Split a CSV file on S3 using the specified chunker
+  class FileSplitter
+    def initialize(key:, row_chunker:)
+      @key = key
+      @row_chunker = row_chunker
+    end
+
+    # Generate multiple files on S3, composed of chunks of the input file.
+    #
+    # @return [Array<String>] the S3 keys for each new file
+    def split!
+      return @keys if instance_variable_defined?('@keys')
+      ranges = row_chunker.ranges_for(input_csv)
+      @keys = ranges.map.with_index do |range, index|
+        chunk_key = key_from_index(index, ranges.count)
+        contents = csv_from_range(range)
+        BulkProcessor.config.file_class.new(chunk_key).write(contents)
+        chunk_key
+      end
+    end
+
+    private
+
+    attr_reader :key, :row_chunker
+
+    def headers
+      input_csv.headers
+    end
+
+    def input_csv
+      return @input_csv if instance_variable_defined?('@input_csv')
+      BulkProcessor.config.file_class.new(key).open do |input_file|
+        @input_csv = CSV.parse(input_file, headers: true)
+      end
+      @input_csv
+    end
+
+    def csv_from_range(range)
+      return CSV.generate { |csv| csv << headers } if range.count == 0
+      CSV.generate(headers: headers, write_headers: true) do |csv|
+        range.each { |row_num| csv << input_csv[row_num] }
+      end
+    end
+
+    def key_from_index(index, total)
+      parts = key.split('.')
+      if parts.length == 1
+        name_part = key
+        ext_part = ''
+      else
+        name_part = parts[0..-2].join('.')
+        ext_part = ".#{parts.last}"
+      end
+
+      "#{name_part}_#{index + 1}-of-#{total}#{ext_part}"
+    end
+  end
+end
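For a sense of the naming scheme `key_from_index` produces, splitting a hypothetical `pets.csv` three ways yields keys like the following (assuming a file class is configured and the input file exists):

```ruby
splitter = BulkProcessor::FileSplitter.new(
  key: 'pets.csv',
  row_chunker: BulkProcessor::RowChunker::Balanced.new(3)
)
splitter.split!
# => ["pets_1-of-3.csv", "pets_2-of-3.csv", "pets_3-of-3.csv"]
# A key without an extension, e.g. 'pets', would become 'pets_1-of-3', etc.
```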
data/lib/bulk_processor/job/process_csv.rb
ADDED
@@ -0,0 +1,22 @@
+require 'active_job'
+
+class BulkProcessor
+  # ActiveJob to handle processing the CSV in the background
+  module Job
+    class ProcessCSV < ActiveJob::Base
+      queue_as 'bulk_processor'
+
+      def perform(processor_class, payload, key)
+        file = BulkProcessor.config.file_class.new(key)
+        payload = PayloadSerializer.deserialize(payload).merge('key' => key)
+        file.open do |f|
+          csv = CSV.parse(f.read, headers: true)
+          processor = processor_class.constantize.new(csv, payload: payload)
+          processor.start
+        end
+      ensure
+        file.try(:delete)
+      end
+    end
+  end
+end
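The job constructs the processor with `new(csv, payload:)` and then calls `#start`, after merging the file's key into the payload under `'key'`. A minimal processor skeleton compatible with that call, plus the `required_columns`/`optional_columns` class methods the README asks for; the column names and the body of `#start` are hypothetical:

```ruby
class PetCSVProcessor
  def self.required_columns
    %w[pet_id meal]
  end

  def self.optional_columns
    []
  end

  def initialize(csv, payload:)
    @csv = csv
    @payload = payload  # includes 'key' merged in by Job::ProcessCSV
  end

  def start
    @csv.each do |row|
      # Hypothetical row handling, just to show the shape of the loop.
      puts "#{@payload['key']}: feeding pet #{row['pet_id']} #{row['meal']}"
    end
  end
end
```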
data/lib/bulk_processor/job/split_csv.rb
ADDED
@@ -0,0 +1,41 @@
+require 'active_job'
+
+class BulkProcessor
+  # ActiveJob to handle processing the CSV in the background
+  module Job
+    class SplitCSV < ActiveJob::Base
+      queue_as 'bulk_processor'
+
+      def perform(processor_class, payload, key, num_chunks)
+        processor_class = processor_class.constantize
+        chunker = row_chunker(processor_class, num_chunks)
+        payload = PayloadSerializer.deserialize(payload)
+        splitter = FileSplitter.new(key: key, row_chunker: chunker)
+        keys = splitter.split!
+        keys.each do |key|
+          BackEnd.start(processor_class: processor_class, payload: payload, key: key)
+        end
+      rescue Exception => error
+        if processor_class.respond_to?(:handler_class)
+          payload = payload.merge('key' => key)
+          handler = processor_class.handler_class.new(payload: payload, results: [])
+          handler.fail!(error)
+        end
+        raise
+      ensure
+        BulkProcessor.config.file_class.new(key).delete
+      end
+
+      private
+
+      def row_chunker(processor_class, num_chunks)
+        if processor_class.respond_to?(:boundary_column)
+          boundary_column = processor_class.boundary_column
+          RowChunker::Boundary.new(num_chunks, boundary_column: boundary_column)
+        else
+          RowChunker::Balanced.new(num_chunks)
+        end
+      end
+    end
+  end
+end
data/lib/bulk_processor/row_chunker/balanced.rb
ADDED
@@ -0,0 +1,29 @@
+class BulkProcessor
+  module RowChunker
+    # Determine the partitions for a balanced break up of the input CSV file.
+    # All partitions will have a size within 1 row of every other partition.
+    class Balanced
+      def initialize(num_chunks)
+        @num_chunks = num_chunks
+      end
+
+      def ranges_for(csv)
+        ideal_size = csv.count / num_chunks
+        num_chunks.times.map do |index|
+          start_index = index * ideal_size
+          if index == num_chunks - 1
+            # force the last chunk to go to the very last row
+            end_index = csv.count - 1
+          else
+            end_index = start_index + ideal_size - 1
+          end
+          (start_index..end_index)
+        end
+      end
+
+      private
+
+      attr_reader :num_chunks
+    end
+  end
+end
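A worked example of the balanced chunker, assuming the gem is loaded: ten data rows split three ways gives `ideal_size = 10 / 3 = 3`, with the final chunk absorbing the remainder.

```ruby
require 'csv'

# Ten data rows under a single 'pet_id' header column.
csv = CSV.parse("pet_id\n" + (1..10).map(&:to_s).join("\n"), headers: true)
chunker = BulkProcessor::RowChunker::Balanced.new(3)
chunker.ranges_for(csv)
# => [0..2, 3..5, 6..9]  (the last range runs to the final row)
```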
data/lib/bulk_processor/row_chunker/boundary.rb
ADDED
@@ -0,0 +1,55 @@
+class BulkProcessor
+  module RowChunker
+    # Determine the partitions that ensure all consecutive rows with the same
+    # value for boundary_column are in the same partion. The CSV must be sorted
+    # on this column to get the desired results. This class makes an attempt to
+    # keep the partion sizes equal, but obviously prioritizes the boundary
+    # column values over partition size.
+    class Boundary
+      def initialize(num_chunks, boundary_column:)
+        @num_chunks = num_chunks
+        @boundary_column = boundary_column
+      end
+
+      def ranges_for(csv)
+        @ranges ||= begin
+          # Start with a balanced partition, then make adjustments from there
+          chunker = Balanced.new(num_chunks)
+          adjust_for_boundaries(chunker.ranges_for(csv), csv)
+        end
+      end
+
+      private
+
+      attr_reader :num_chunks, :boundary_column
+
+      def adjust_for_boundaries(balanced_ranges, csv)
+        balanced_endings = balanced_ranges.map(&:last)
+
+        last_indexes = []
+        while balanced_endings.any?
+          last_index = [last_indexes.last, balanced_endings.shift].compact.max
+          last_index += 1 until at_boundary?(csv, last_index)
+          last_indexes << last_index
+        end
+
+        to_ranges(last_indexes)
+      end
+
+      def to_ranges(last_indexes)
+        first_indexes = last_indexes.dup
+        first_indexes.pop
+        first_indexes.map! { |index| index + 1 }
+        first_indexes.unshift(0)
+        first_indexes.map.with_index do |first_index, index|
+          (first_index..last_indexes[index])
+        end
+      end
+
+      def at_boundary?(csv, index)
+        return true if index == csv.count - 1
+        csv[index][boundary_column] != csv[index + 1][boundary_column]
+      end
+    end
+  end
+end
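A worked example of the boundary chunker, assuming the gem is loaded: with rows sorted on `pet_id`, a balanced 3-way split of ten rows would end chunks at indexes 2, 5, and 9; the first ending is pushed forward to index 3 so all `pet_id == 1` rows stay together.

```ruby
require 'csv'

values = %w[1 1 1 1 2 2 3 3 3 3]
csv = CSV.parse("pet_id\n" + values.join("\n"), headers: true)
chunker = BulkProcessor::RowChunker::Boundary.new(3, boundary_column: 'pet_id')
chunker.ranges_for(csv)
# => [0..3, 4..5, 6..9]
```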
data/lib/bulk_processor/tasks.rb
CHANGED
@@ -7,14 +7,23 @@ class BulkProcessor
     def install_tasks
       namespace :bulk_processor do
         desc 'Start processing a CSV file'
-        task :start, [:processor_class, :payload, :
-          Job.new.perform(
+        task :start, [:processor_class, :payload, :key] => :environment do |_task, args|
+          Job::ProcessCSV.new.perform(
             args[:processor_class],
             args[:payload],
-            args[:file_class],
             args[:key]
           )
         end
+
+        desc 'Split a CSV file and process each piece'
+        task :split, [:processor_class, :payload, :key, :num_chunks] => :environment do |_task, args|
+          Job::SplitCSV.new.perform(
+            args[:processor_class],
+            args[:payload],
+            args[:key],
+            args[:num_chunks]
+          )
+        end
       end
     end
   end
data/lib/bulk_processor.rb
CHANGED
@@ -2,8 +2,12 @@ require 'bulk_processor/back_end'
 require 'bulk_processor/back_end/active_job'
 require 'bulk_processor/back_end/dynosaur'
 require 'bulk_processor/config'
-require 'bulk_processor/
+require 'bulk_processor/file_splitter'
+require 'bulk_processor/job/process_csv'
+require 'bulk_processor/job/split_csv'
 require 'bulk_processor/payload_serializer'
+require 'bulk_processor/row_chunker/balanced'
+require 'bulk_processor/row_chunker/boundary'
 require 'bulk_processor/s3_file'
 require 'bulk_processor/stream_encoder'
 require 'bulk_processor/validated_csv'
@@ -32,8 +36,8 @@ class BulkProcessor
   end
 
   # Validate the CSV and enqueue if for processing in the background.
-  def start(
-    if file_class.new(key).exists?
+  def start(num_processes = 1)
+    if BulkProcessor.config.file_class.new(key).exists?
       errors << "Already processing #{key}, please wait for it to finish"
       return false
     end
@@ -47,7 +51,7 @@ class BulkProcessor
     )
 
     if csv.valid?
-      start_backend(
+      start_backend(encoded_contents, num_processes)
     else
       errors.concat(csv.errors)
     end
@@ -58,11 +62,11 @@ class BulkProcessor
 
   attr_reader :key, :stream, :processor_class, :payload
 
-  def start_backend(
-    file = file_class.new(key)
+  def start_backend(contents, num_processes)
+    file = BulkProcessor.config.file_class.new(key)
     file.write(contents)
-    BackEnd.start(processor_class: processor_class, payload: payload,
-
+    BackEnd.start(processor_class: processor_class, payload: payload, key: key,
+                  num_processes: num_processes)
  rescue Exception
    # Clean up the file, which is treated as a lock, if we bail out of here
    # unexpectedly.
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulk-processor
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.6.0
 platform: ruby
 authors:
 - Tom Collier, Justin Richard
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-
+date: 2016-01-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activejob
@@ -152,8 +152,12 @@ files:
 - lib/bulk_processor/csv_processor/no_op_post_processor.rb
 - lib/bulk_processor/csv_processor/result.rb
 - lib/bulk_processor/csv_processor/row_processor.rb
-- lib/bulk_processor/
+- lib/bulk_processor/file_splitter.rb
+- lib/bulk_processor/job/process_csv.rb
+- lib/bulk_processor/job/split_csv.rb
 - lib/bulk_processor/payload_serializer.rb
+- lib/bulk_processor/row_chunker/balanced.rb
+- lib/bulk_processor/row_chunker/boundary.rb
 - lib/bulk_processor/s3_file.rb
 - lib/bulk_processor/stream_encoder.rb
 - lib/bulk_processor/tasks.rb
data/lib/bulk_processor/job.rb
DELETED
@@ -1,20 +0,0 @@
-require 'active_job'
-
-class BulkProcessor
-  # ActiveJob to handle processing the CSV in the background
-  class Job < ActiveJob::Base
-    queue_as 'bulk_processor'
-
-    def perform(processor_class, payload, file_class, key)
-      file = file_class.constantize.new(key)
-      payload = PayloadSerializer.deserialize(payload)
-      file.open do |f|
-        csv = CSV.parse(f.read, headers: true)
-        processor = processor_class.constantize.new(csv, payload: payload)
-        processor.start
-      end
-    ensure
-      file.try(:delete)
-    end
-  end
-end