bulk-processor 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +62 -1
- data/lib/bulk_processor/back_end/active_job.rb +9 -9
- data/lib/bulk_processor/back_end/dynosaur.rb +12 -9
- data/lib/bulk_processor/back_end.rb +3 -4
- data/lib/bulk_processor/config.rb +5 -0
- data/lib/bulk_processor/file_splitter.rb +59 -0
- data/lib/bulk_processor/job/process_csv.rb +22 -0
- data/lib/bulk_processor/job/split_csv.rb +41 -0
- data/lib/bulk_processor/row_chunker/balanced.rb +29 -0
- data/lib/bulk_processor/row_chunker/boundary.rb +55 -0
- data/lib/bulk_processor/tasks.rb +12 -3
- data/lib/bulk_processor/version.rb +1 -1
- data/lib/bulk_processor.rb +12 -8
- metadata +7 -3
- data/lib/bulk_processor/job.rb +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77320b807b3cd9862490408058611d9b461cf83f
+  data.tar.gz: 203e565ab7f722c6f639527b4065e2e5f495aa57
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4f89dd796184485f44d0018a9819cf42ec5c147ff20693c63e7055b43fd2e0807e5b268f622e8555157e1ac36b01207c9b23fe6f0212eb2a744924b6e3533d91
+  data.tar.gz: 2373584252697f040d460070a93958cc944c2f0045948233125f4e0ee06d39daf5080191dd0f07e6ad0432e461c7947e01e821a5e81e7dbc994c558571a1da44

data/README.md
CHANGED
@@ -65,7 +65,7 @@ The CSV file passed to BulkProcessor will be persisted on AWS S3 so that the job
 can access it. This requires configuring AWS credentials, the S3 bucket in which
 to store the file, and a local temp directory to hold the file locally.
 
-### Setting up the processor
+### Setting up the processor
 
 You will need to supply a class for CSV processing. This class must respond to the
 `start` instance method, the `required_columns` and `optional_columns` class methods,
@@ -229,6 +229,67 @@ else
 end
 ```
 
+#### Parallelization
+
+For larger CSV files, you may wish to process rows in parallel. This gem allows
+you to scale up to an arbitrary number of parallel processes by providing an optional
+argument to `#start`. Doing this will cause the input CSV file to be split into
+*N* number of smaller CSV files, each one being processed in separate processes.
+It is important to note that the file *must* be sorted by the boundary column for
+it to deliver on its promise.
+
+```ruby
+processor = BulkProcessor.new(
+  key: file_name,
+  stream: file_stream,
+  processor_class: PetCSVProcessor,
+  payload: { recipient: current_user.email }
+)
+if processor.start(5)
+  # Split the main CSV into 5 smaller files and process in parallel.
+else
+  # Something went wrong, alert the file uploader
+  handle_invalid_file(processor.errors)
+end
+```
+
+By default, the file will be split into equal-sized partitions. If you need the partitions
+to keep all rows with the same value for a column into the same partition, define `.boundary_column`
+on the processor class to return the name of that column. E.g.
+
+```csv
+pet_id,meal,mead_date
+1,kibble,2015-11-02
+1,bits,2015-11-03
+...
+1,alpo,2015-12-31
+2,alpo,2015-11-01
+...
+```
+
+```ruby
+class PetCSVProcessor
+  def self.boundary_column
+    'pet_id'
+  end
+  ...
+end
+```
+
+Finally, to be notified of any failures in the splitting process, you can define
+`.handler_class` on your processor class to return a class that implements the Handler role.
+If an error is raised in the splitting, `#fail!` will be called on the Handler with
+the error.
+
+```ruby
+class PetCSVProcessor
+  def self.handler_class
+    PetHandler
+  end
+  ...
+end
+```
+
 ### BulkProcessor::CSVProcessor::Result
 
 The result instances passed from BulkProcessor::CSVProcessor to the Handler

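The Handler role referenced above is only described in prose, so here is a minimal sketch of a handler that would satisfy the `#fail!` call made by the new split job. The class name and notification logic are illustrative, not part of the gem; only the constructor keywords and `#fail!` signature are taken from the diff below.

```ruby
class PetHandler
  # The split job builds the handler with the deserialized payload (including
  # the 'key' of the file being split) and an empty results array.
  def initialize(payload:, results:)
    @payload = payload
    @results = results
  end

  # Called with the raised error if splitting the CSV fails.
  def fail!(error)
    # Hypothetical side effect; anything appropriate to your app works here.
    warn "Split of #{@payload['key']} failed: #{error.message}"
  end
end
```
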
data/lib/bulk_processor/back_end/active_job.rb
CHANGED
@@ -1,25 +1,25 @@
 class BulkProcessor
   module BackEnd
+    # Execute jobs via ActiveJob, e.g. Resque
     class ActiveJob
-      def initialize(processor_class:, payload:,
+      def initialize(processor_class:, payload:, key:)
         @processor_class = processor_class
         @payload = payload
-        @file_class = file_class
         @key = key
       end
 
       def start
-        Job.perform_later(
-
-
-
-
-
+        Job::ProcessCSV.perform_later(processor_class.name, payload, key)
+      end
+
+      def split(num_processes)
+        Job::SplitCSV.perform_later(processor_class.name, payload,
+                                    key, num_processes)
       end
 
       private
 
-      attr_reader :processor_class, :payload, :
+      attr_reader :processor_class, :payload, :key
     end
   end
 end

data/lib/bulk_processor/back_end/dynosaur.rb
CHANGED
@@ -2,11 +2,11 @@ require 'dynosaur'
 
 class BulkProcessor
   module BackEnd
+    # Execute jobs via rake tasks that will spawn a new Heroku dyno
     class Dynosaur
-      def initialize(processor_class:, payload:,
+      def initialize(processor_class:, payload:, key:)
         @processor_class = processor_class
         @payload = payload
-        @file_class = file_class
         @key = key
         configure_dynosaur
       end
@@ -14,19 +14,22 @@ class BulkProcessor
       def start
         args = {
           task: 'bulk_processor:start',
-          args: [
-
-
-
-
-
+          args: [processor_class.name, payload, key]
+        }
+        ::Dynosaur::Process::Heroku.new(args).start
+      end
+
+      def split(num_processes)
+        args = {
+          task: 'bulk_processor:split',
+          args: [processor_class.name, payload, key, num_processes]
         }
         ::Dynosaur::Process::Heroku.new(args).start
       end
 
       private
 
-      attr_reader :processor_class, :payload, :
+      attr_reader :processor_class, :payload, :key
 
       def configure_dynosaur
         ::Dynosaur::Client::HerokuClient.configure do |config|

data/lib/bulk_processor/back_end.rb
CHANGED
@@ -1,14 +1,13 @@
 class BulkProcessor
   module BackEnd
    class << self
-      def start(processor_class:, payload:,
+      def start(processor_class:, payload:, key:, num_processes: 1)
        back_end = back_end_class.new(
          processor_class: processor_class,
-          payload: payload,
-          file_class: file_class,
+          payload: PayloadSerializer.serialize(payload),
          key: key
        )
-        back_end.start
+        num_processes > 1 ? back_end.split(num_processes) : back_end.start
      end
 
      private

data/lib/bulk_processor/config.rb
CHANGED
@@ -2,12 +2,17 @@ class BulkProcessor
   # Store configuration data set by clients
   class Config
     attr_reader :queue_adapter
+    attr_writer :file_class
     attr_accessor :back_end, :temp_directory
 
     def queue_adapter=(adapter)
       ActiveJob::Base.queue_adapter = @queue_adapter = adapter
     end
 
+    def file_class
+      @file_class || BulkProcessor::S3File
+    end
+
     def aws
       @aws ||= Struct.new(:access_key_id, :secret_access_key, :bucket).new
     end

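The new `file_class` accessor makes file storage pluggable: clients may assign any class exposing the interface the gem uses elsewhere in this diff (`new(key)`, `#write`, `#open`, `#exists?`, `#delete`), and the reader falls back to `BulkProcessor::S3File` when nothing is set. A sketch of overriding it; `LocalTempFile` is illustrative, not part of the gem, and the block semantics of `#open` are assumed to mirror `S3File`:

```ruby
require 'tmpdir'

# Hypothetical local stand-in for BulkProcessor::S3File, e.g. for tests.
class LocalTempFile
  def initialize(key)
    @path = File.join(Dir.tmpdir, key)
  end

  def write(contents)
    File.write(@path, contents)
  end

  def open(&block)
    File.open(@path, 'r', &block)   # yields an IO, like the S3-backed file
  end

  def exists?
    File.exist?(@path)
  end

  def delete
    File.delete(@path) if exists?
  end
end

BulkProcessor.config.file_class = LocalTempFile
BulkProcessor.config.file_class  # => LocalTempFile (BulkProcessor::S3File when unset)
```
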
data/lib/bulk_processor/file_splitter.rb
ADDED
@@ -0,0 +1,59 @@
+class BulkProcessor
+  # Split a CSV file on S3 using the specified chunker
+  class FileSplitter
+    def initialize(key:, row_chunker:)
+      @key = key
+      @row_chunker = row_chunker
+    end
+
+    # Generate multiple files on S3, composed of chunks of the input file.
+    #
+    # @return [Array<String>] the S3 keys for each new file
+    def split!
+      return @keys if instance_variable_defined?('@keys')
+      ranges = row_chunker.ranges_for(input_csv)
+      @keys = ranges.map.with_index do |range, index|
+        chunk_key = key_from_index(index, ranges.count)
+        contents = csv_from_range(range)
+        BulkProcessor.config.file_class.new(chunk_key).write(contents)
+        chunk_key
+      end
+    end
+
+    private
+
+    attr_reader :key, :row_chunker
+
+    def headers
+      input_csv.headers
+    end
+
+    def input_csv
+      return @input_csv if instance_variable_defined?('@input_csv')
+      BulkProcessor.config.file_class.new(key).open do |input_file|
+        @input_csv = CSV.parse(input_file, headers: true)
+      end
+      @input_csv
+    end
+
+    def csv_from_range(range)
+      return CSV.generate { |csv| csv << headers } if range.count == 0
+      CSV.generate(headers: headers, write_headers: true) do |csv|
+        range.each { |row_num| csv << input_csv[row_num] }
+      end
+    end
+
+    def key_from_index(index, total)
+      parts = key.split('.')
+      if parts.length == 1
+        name_part = key
+        ext_part = ''
+      else
+        name_part = parts[0..-2].join('.')
+        ext_part = ".#{parts.last}"
+      end
+
+      "#{name_part}_#{index + 1}-of-#{total}#{ext_part}"
+    end
+  end
+end

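For orientation, `key_from_index` embeds the chunk position and total count into each new key. An illustrative use of the splitter; the key and chunk count are made up, and the file behind the key is assumed to already exist via the configured `file_class`:

```ruby
splitter = BulkProcessor::FileSplitter.new(
  key: 'pets.csv',
  row_chunker: BulkProcessor::RowChunker::Balanced.new(3)
)
splitter.split!
# => ['pets_1-of-3.csv', 'pets_2-of-3.csv', 'pets_3-of-3.csv']
# A key without an extension, e.g. 'pets', would yield 'pets_1-of-3', and so on.
```
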
data/lib/bulk_processor/job/process_csv.rb
ADDED
@@ -0,0 +1,22 @@
+require 'active_job'
+
+class BulkProcessor
+  # ActiveJob to handle processing the CSV in the background
+  module Job
+    class ProcessCSV < ActiveJob::Base
+      queue_as 'bulk_processor'
+
+      def perform(processor_class, payload, key)
+        file = BulkProcessor.config.file_class.new(key)
+        payload = PayloadSerializer.deserialize(payload).merge('key' => key)
+        file.open do |f|
+          csv = CSV.parse(f.read, headers: true)
+          processor = processor_class.constantize.new(csv, payload: payload)
+          processor.start
+        end
+      ensure
+        file.try(:delete)
+      end
+    end
+  end
+end

data/lib/bulk_processor/job/split_csv.rb
ADDED
@@ -0,0 +1,41 @@
+require 'active_job'
+
+class BulkProcessor
+  # ActiveJob to handle processing the CSV in the background
+  module Job
+    class SplitCSV < ActiveJob::Base
+      queue_as 'bulk_processor'
+
+      def perform(processor_class, payload, key, num_chunks)
+        processor_class = processor_class.constantize
+        chunker = row_chunker(processor_class, num_chunks)
+        payload = PayloadSerializer.deserialize(payload)
+        splitter = FileSplitter.new(key: key, row_chunker: chunker)
+        keys = splitter.split!
+        keys.each do |key|
+          BackEnd.start(processor_class: processor_class, payload: payload, key: key)
+        end
+      rescue Exception => error
+        if processor_class.respond_to?(:handler_class)
+          payload = payload.merge('key' => key)
+          handler = processor_class.handler_class.new(payload: payload, results: [])
+          handler.fail!(error)
+        end
+        raise
+      ensure
+        BulkProcessor.config.file_class.new(key).delete
+      end
+
+      private
+
+      def row_chunker(processor_class, num_chunks)
+        if processor_class.respond_to?(:boundary_column)
+          boundary_column = processor_class.boundary_column
+          RowChunker::Boundary.new(num_chunks, boundary_column: boundary_column)
+        else
+          RowChunker::Balanced.new(num_chunks)
+        end
+      end
+    end
+  end
+end

data/lib/bulk_processor/row_chunker/balanced.rb
ADDED
@@ -0,0 +1,29 @@
+class BulkProcessor
+  module RowChunker
+    # Determine the partitions for a balanced break up of the input CSV file.
+    # All partitions will have a size within 1 row of every other partition.
+    class Balanced
+      def initialize(num_chunks)
+        @num_chunks = num_chunks
+      end
+
+      def ranges_for(csv)
+        ideal_size = csv.count / num_chunks
+        num_chunks.times.map do |index|
+          start_index = index * ideal_size
+          if index == num_chunks - 1
+            # force the last chunk to go to the very last row
+            end_index = csv.count - 1
+          else
+            end_index = start_index + ideal_size - 1
+          end
+          (start_index..end_index)
+        end
+      end
+
+      private
+
+      attr_reader :num_chunks
+    end
+  end
+end

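A quick worked example of the balanced chunker (row count and chunk count are illustrative): for a 10-row CSV split 3 ways, `ideal_size` is 10 / 3 = 3 by integer division, so the first two chunks take 3 rows each and the last chunk runs to the final row.

```ruby
require 'csv'

# Build a 10-row CSV table; the contents are irrelevant to the chunker.
csv = CSV.parse("id\n" + (1..10).map(&:to_s).join("\n"), headers: true)

chunker = BulkProcessor::RowChunker::Balanced.new(3)
chunker.ranges_for(csv)
# => [0..2, 3..5, 6..9]   # the remainder lands in the last chunk
```
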
data/lib/bulk_processor/row_chunker/boundary.rb
ADDED
@@ -0,0 +1,55 @@
+class BulkProcessor
+  module RowChunker
+    # Determine the partitions that ensure all consecutive rows with the same
+    # value for boundary_column are in the same partion. The CSV must be sorted
+    # on this column to get the desired results. This class makes an attempt to
+    # keep the partion sizes equal, but obviously prioritizes the boundary
+    # column values over partition size.
+    class Boundary
+      def initialize(num_chunks, boundary_column:)
+        @num_chunks = num_chunks
+        @boundary_column = boundary_column
+      end
+
+      def ranges_for(csv)
+        @ranges ||= begin
+          # Start with a balanced partition, then make adjustments from there
+          chunker = Balanced.new(num_chunks)
+          adjust_for_boundaries(chunker.ranges_for(csv), csv)
+        end
+      end
+
+      private
+
+      attr_reader :num_chunks, :boundary_column
+
+      def adjust_for_boundaries(balanced_ranges, csv)
+        balanced_endings = balanced_ranges.map(&:last)
+
+        last_indexes = []
+        while balanced_endings.any?
+          last_index = [last_indexes.last, balanced_endings.shift].compact.max
+          last_index += 1 until at_boundary?(csv, last_index)
+          last_indexes << last_index
+        end
+
+        to_ranges(last_indexes)
+      end
+
+      def to_ranges(last_indexes)
+        first_indexes = last_indexes.dup
+        first_indexes.pop
+        first_indexes.map! { |index| index + 1 }
+        first_indexes.unshift(0)
+        first_indexes.map.with_index do |first_index, index|
+          (first_index..last_indexes[index])
+        end
+      end
+
+      def at_boundary?(csv, index)
+        return true if index == csv.count - 1
+        csv[index][boundary_column] != csv[index + 1][boundary_column]
+      end
+    end
+  end
+end

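And a worked example of the boundary chunker, using data shaped like the README's `pet_id` sample (values are illustrative): a balanced split of the 6 rows below would end the first chunk at index 2, but rows 2 and 3 share `pet_id` 1, so that ending is advanced to index 3 and all of pet 1's rows stay together.

```ruby
require 'csv'

csv = CSV.parse(<<~DATA, headers: true)
  pet_id,meal
  1,kibble
  1,bits
  1,alpo
  1,alpo
  2,alpo
  2,kibble
DATA

chunker = BulkProcessor::RowChunker::Boundary.new(2, boundary_column: 'pet_id')
chunker.ranges_for(csv)
# => [0..3, 4..5]   # balanced would have proposed [0..2, 3..5]
```
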
data/lib/bulk_processor/tasks.rb
CHANGED
@@ -7,14 +7,23 @@ class BulkProcessor
     def install_tasks
       namespace :bulk_processor do
         desc 'Start processing a CSV file'
-        task :start, [:processor_class, :payload, :
-          Job.new.perform(
+        task :start, [:processor_class, :payload, :key] => :environment do |_task, args|
+          Job::ProcessCSV.new.perform(
            args[:processor_class],
            args[:payload],
-            args[:file_class],
            args[:key]
          )
        end
+
+        desc 'Split a CSV file and process each piece'
+        task :split, [:processor_class, :payload, :key, :num_chunks] => :environment do |_task, args|
+          Job::SplitCSV.new.perform(
+            args[:processor_class],
+            args[:payload],
+            args[:key],
+            args[:num_chunks]
+          )
+        end
      end
    end
  end

data/lib/bulk_processor.rb
CHANGED
@@ -2,8 +2,12 @@ require 'bulk_processor/back_end'
 require 'bulk_processor/back_end/active_job'
 require 'bulk_processor/back_end/dynosaur'
 require 'bulk_processor/config'
-require 'bulk_processor/
+require 'bulk_processor/file_splitter'
+require 'bulk_processor/job/process_csv'
+require 'bulk_processor/job/split_csv'
 require 'bulk_processor/payload_serializer'
+require 'bulk_processor/row_chunker/balanced'
+require 'bulk_processor/row_chunker/boundary'
 require 'bulk_processor/s3_file'
 require 'bulk_processor/stream_encoder'
 require 'bulk_processor/validated_csv'
@@ -32,8 +36,8 @@ class BulkProcessor
   end
 
   # Validate the CSV and enqueue if for processing in the background.
-  def start(
-    if file_class.new(key).exists?
+  def start(num_processes = 1)
+    if BulkProcessor.config.file_class.new(key).exists?
      errors << "Already processing #{key}, please wait for it to finish"
      return false
    end
@@ -47,7 +51,7 @@ class BulkProcessor
   )
 
   if csv.valid?
-    start_backend(
+    start_backend(encoded_contents, num_processes)
   else
     errors.concat(csv.errors)
   end
@@ -58,11 +62,11 @@ class BulkProcessor
 
   attr_reader :key, :stream, :processor_class, :payload
 
-  def start_backend(
-    file = file_class.new(key)
+  def start_backend(contents, num_processes)
+    file = BulkProcessor.config.file_class.new(key)
     file.write(contents)
-    BackEnd.start(processor_class: processor_class, payload: payload,
-
+    BackEnd.start(processor_class: processor_class, payload: payload, key: key,
+                  num_processes: num_processes)
   rescue Exception
     # Clean up the file, which is treated as a lock, if we bail out of here
     # unexpectedly.

metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulk-processor
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.6.0
 platform: ruby
 authors:
 - Tom Collier, Justin Richard
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-
+date: 2016-01-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activejob
@@ -152,8 +152,12 @@ files:
 - lib/bulk_processor/csv_processor/no_op_post_processor.rb
 - lib/bulk_processor/csv_processor/result.rb
 - lib/bulk_processor/csv_processor/row_processor.rb
-- lib/bulk_processor/
+- lib/bulk_processor/file_splitter.rb
+- lib/bulk_processor/job/process_csv.rb
+- lib/bulk_processor/job/split_csv.rb
 - lib/bulk_processor/payload_serializer.rb
+- lib/bulk_processor/row_chunker/balanced.rb
+- lib/bulk_processor/row_chunker/boundary.rb
 - lib/bulk_processor/s3_file.rb
 - lib/bulk_processor/stream_encoder.rb
 - lib/bulk_processor/tasks.rb

data/lib/bulk_processor/job.rb
DELETED
@@ -1,20 +0,0 @@
-require 'active_job'
-
-class BulkProcessor
-  # ActiveJob to handle processing the CSV in the background
-  class Job < ActiveJob::Base
-    queue_as 'bulk_processor'
-
-    def perform(processor_class, payload, file_class, key)
-      file = file_class.constantize.new(key)
-      payload = PayloadSerializer.deserialize(payload)
-      file.open do |f|
-        csv = CSV.parse(f.read, headers: true)
-        processor = processor_class.constantize.new(csv, payload: payload)
-        processor.start
-      end
-    ensure
-      file.try(:delete)
-    end
-  end
-end