bulk-processor 0.5.1 → 0.6.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5938f4133413fe9183607008301eda98d5961a3a
-  data.tar.gz: 60f3090d882a96290250a59ccce5d1ff7bd49139
+  metadata.gz: 77320b807b3cd9862490408058611d9b461cf83f
+  data.tar.gz: 203e565ab7f722c6f639527b4065e2e5f495aa57
 SHA512:
-  metadata.gz: 5541b7a3bc23bccd4842fbf7a5439ac40d06a0497ac6cf033368546c59b72f0d7e3d637e6e063597ad726d3ea48928fdbf6631e6d7d647719469336afcfed1cd
-  data.tar.gz: 1538237aafa5c71cb1c8a04f64eb0b921dae8aa4153235ba8386fd01e4a43478c9a7074b7ffc12eb656ef4cd1a1916dbc11b531a55c1c463e87e616c52f27489
+  metadata.gz: 4f89dd796184485f44d0018a9819cf42ec5c147ff20693c63e7055b43fd2e0807e5b268f622e8555157e1ac36b01207c9b23fe6f0212eb2a744924b6e3533d91
+  data.tar.gz: 2373584252697f040d460070a93958cc944c2f0045948233125f4e0ee06d39daf5080191dd0f07e6ad0432e461c7947e01e821a5e81e7dbc994c558571a1da44
data/README.md CHANGED
@@ -65,7 +65,7 @@ The CSV file passed to BulkProcessor will be persisted on AWS S3 so that the job
 can access it. This requires configuring AWS credentials, the S3 bucket in which
 to store the file, and a local temp directory to hold the file locally.
 
-### Setting up the processor and handler
+### Setting up the processor
 
 You will need to supply a class for CSV processing. This class must respond to the
 `start` instance method, the `required_columns` and `optional_columns` class methods,
@@ -229,6 +229,67 @@ else
 end
 ```
 
+#### Parallelization
+
+For larger CSV files, you may wish to process rows in parallel. You can scale up
+to an arbitrary number of parallel processes by passing an optional argument to
+`#start`. Doing so splits the input CSV file into *N* smaller CSV files, each of
+which is processed in its own process. Note that if you use a boundary column
+(described below), the file *must* be sorted by that column for the split to
+behave as intended.
+
+```ruby
+processor = BulkProcessor.new(
+  key: file_name,
+  stream: file_stream,
+  processor_class: PetCSVProcessor,
+  payload: { recipient: current_user.email }
+)
+if processor.start(5)
+  # Split the main CSV into 5 smaller files and process in parallel.
+else
+  # Something went wrong, alert the file uploader
+  handle_invalid_file(processor.errors)
+end
+```
+
+By default, the file is split into equal-sized partitions. If you need every row
+that shares a value in a particular column to land in the same partition, define
+`.boundary_column` on the processor class to return the name of that column. E.g.
+
+```csv
+pet_id,meal,meal_date
+1,kibble,2015-11-02
+1,bits,2015-11-03
+...
+1,alpo,2015-12-31
+2,alpo,2015-11-01
+...
+```
+
+```ruby
+class PetCSVProcessor
+  def self.boundary_column
+    'pet_id'
+  end
+  ...
+end
+```
+
+Finally, to be notified of any failures during splitting, define `.handler_class`
+on your processor class to return a class that implements the Handler role. If an
+error is raised while splitting, `#fail!` will be called on that handler with the
+error.
+
+```ruby
+class PetCSVProcessor
+  def self.handler_class
+    PetHandler
+  end
+  ...
+end
+```
+
 ### BulkProcessor::CSVProcessor::Result
 
 The result instances passed from BulkProcessor::CSVProcessor to the Handler
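
Putting the new hooks together, a 0.6.0 processor might look like the sketch below. This is an illustration rather than code from the gem: `PetHandler` is assumed to implement the Handler role, and the column names simply mirror the CSV example above.

```ruby
# Hypothetical processor wiring up the parallelization hooks.
class PetCSVProcessor
  def self.required_columns
    %w(pet_id meal meal_date)
  end

  def self.optional_columns
    []
  end

  # Rows sharing a pet_id land in the same chunk; the CSV must be sorted by pet_id.
  def self.boundary_column
    'pet_id'
  end

  # Receives #fail!(error) if splitting the file raises.
  def self.handler_class
    PetHandler
  end

  def initialize(csv, payload:)
    @csv = csv
    @payload = payload
  end

  def start
    @csv.each do |row|
      # ... process one row ...
    end
  end
end
```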
data/lib/bulk_processor/back_end/active_job.rb CHANGED
@@ -1,25 +1,25 @@
 class BulkProcessor
   module BackEnd
+    # Execute jobs via ActiveJob, e.g. Resque
     class ActiveJob
-      def initialize(processor_class:, payload:, file_class:, key:)
+      def initialize(processor_class:, payload:, key:)
         @processor_class = processor_class
         @payload = payload
-        @file_class = file_class
         @key = key
       end
 
       def start
-        Job.perform_later(
-          processor_class.name,
-          PayloadSerializer.serialize(payload),
-          file_class.name,
-          key
-        )
+        Job::ProcessCSV.perform_later(processor_class.name, payload, key)
+      end
+
+      def split(num_processes)
+        Job::SplitCSV.perform_later(processor_class.name, payload,
+                                    key, num_processes)
       end
 
       private
 
-      attr_reader :processor_class, :payload, :file_class, :key
+      attr_reader :processor_class, :payload, :key
     end
   end
 end
data/lib/bulk_processor/back_end/dynosaur.rb CHANGED
@@ -2,11 +2,11 @@ require 'dynosaur'
 
 class BulkProcessor
   module BackEnd
+    # Execute jobs via rake tasks that will spawn a new Heroku dyno
     class Dynosaur
-      def initialize(processor_class:, payload:, file_class:, key:)
+      def initialize(processor_class:, payload:, key:)
         @processor_class = processor_class
         @payload = payload
-        @file_class = file_class
         @key = key
         configure_dynosaur
       end
@@ -14,19 +14,22 @@ class BulkProcessor
       def start
         args = {
           task: 'bulk_processor:start',
-          args: [
-            processor_class.name,
-            PayloadSerializer.serialize(payload),
-            file_class.name,
-            key
-          ]
+          args: [processor_class.name, payload, key]
+        }
+        ::Dynosaur::Process::Heroku.new(args).start
+      end
+
+      def split(num_processes)
+        args = {
+          task: 'bulk_processor:split',
+          args: [processor_class.name, payload, key, num_processes]
         }
         ::Dynosaur::Process::Heroku.new(args).start
       end
 
       private
 
-      attr_reader :processor_class, :payload, :file_class, :key
+      attr_reader :processor_class, :payload, :key
 
       def configure_dynosaur
         ::Dynosaur::Client::HerokuClient.configure do |config|
data/lib/bulk_processor/back_end.rb CHANGED
@@ -1,14 +1,13 @@
 class BulkProcessor
   module BackEnd
     class << self
-      def start(processor_class:, payload:, file_class:, key:)
+      def start(processor_class:, payload:, key:, num_processes: 1)
         back_end = back_end_class.new(
           processor_class: processor_class,
-          payload: payload,
-          file_class: file_class,
+          payload: PayloadSerializer.serialize(payload),
           key: key
         )
-        back_end.start
+        num_processes > 1 ? back_end.split(num_processes) : back_end.start
       end
 
      private
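
For orientation, a hedged sketch of how the new `num_processes:` keyword changes dispatch. `BackEnd.start` is normally invoked internally by `BulkProcessor#start_backend`; the processor class and payload here are placeholders.

```ruby
# num_processes omitted (or 1): a single ProcessCSV job handles the whole file.
BulkProcessor::BackEnd.start(
  processor_class: PetCSVProcessor,
  payload: { 'recipient' => 'owner@example.com' },
  key: 'pets.csv'
)

# num_processes > 1: a SplitCSV job is enqueued instead, which later enqueues
# one ProcessCSV job per chunk.
BulkProcessor::BackEnd.start(
  processor_class: PetCSVProcessor,
  payload: { 'recipient' => 'owner@example.com' },
  key: 'pets.csv',
  num_processes: 5
)
```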
data/lib/bulk_processor/config.rb CHANGED
@@ -2,12 +2,17 @@ class BulkProcessor
   # Store configuration data set by clients
   class Config
     attr_reader :queue_adapter
+    attr_writer :file_class
     attr_accessor :back_end, :temp_directory
 
     def queue_adapter=(adapter)
       ActiveJob::Base.queue_adapter = @queue_adapter = adapter
     end
 
+    def file_class
+      @file_class || BulkProcessor::S3File
+    end
+
     def aws
       @aws ||= Struct.new(:access_key_id, :secret_access_key, :bucket).new
     end
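
The new `file_class` setting defaults to `BulkProcessor::S3File` and can be overridden via the writer added above. A minimal sketch, where `LocalFile` is a hypothetical class exposing the same interface the gem relies on (`write`, `open`, `exists?`, `delete`):

```ruby
BulkProcessor.config.file_class              # => BulkProcessor::S3File (default)

# e.g. in test or development, swap in a local-disk implementation
BulkProcessor.config.file_class = LocalFile  # LocalFile is hypothetical
BulkProcessor.config.file_class              # => LocalFile
```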
data/lib/bulk_processor/file_splitter.rb ADDED
@@ -0,0 +1,59 @@
+class BulkProcessor
+  # Split a CSV file on S3 using the specified chunker
+  class FileSplitter
+    def initialize(key:, row_chunker:)
+      @key = key
+      @row_chunker = row_chunker
+    end
+
+    # Generate multiple files on S3, composed of chunks of the input file.
+    #
+    # @return [Array<String>] the S3 keys for each new file
+    def split!
+      return @keys if instance_variable_defined?('@keys')
+      ranges = row_chunker.ranges_for(input_csv)
+      @keys = ranges.map.with_index do |range, index|
+        chunk_key = key_from_index(index, ranges.count)
+        contents = csv_from_range(range)
+        BulkProcessor.config.file_class.new(chunk_key).write(contents)
+        chunk_key
+      end
+    end
+
+    private
+
+    attr_reader :key, :row_chunker
+
+    def headers
+      input_csv.headers
+    end
+
+    def input_csv
+      return @input_csv if instance_variable_defined?('@input_csv')
+      BulkProcessor.config.file_class.new(key).open do |input_file|
+        @input_csv = CSV.parse(input_file, headers: true)
+      end
+      @input_csv
+    end
+
+    def csv_from_range(range)
+      return CSV.generate { |csv| csv << headers } if range.count == 0
+      CSV.generate(headers: headers, write_headers: true) do |csv|
+        range.each { |row_num| csv << input_csv[row_num] }
+      end
+    end
+
+    def key_from_index(index, total)
+      parts = key.split('.')
+      if parts.length == 1
+        name_part = key
+        ext_part = ''
+      else
+        name_part = parts[0..-2].join('.')
+        ext_part = ".#{parts.last}"
+      end
+
+      "#{name_part}_#{index + 1}-of-#{total}#{ext_part}"
+    end
+  end
+end
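
A usage sketch following directly from the code above: `key_from_index` derives chunk keys from the original key, so a three-way split of `pets.csv` produces the names shown in the comment.

```ruby
splitter = BulkProcessor::FileSplitter.new(
  key: 'pets.csv',
  row_chunker: BulkProcessor::RowChunker::Balanced.new(3)
)
splitter.split!
# => ['pets_1-of-3.csv', 'pets_2-of-3.csv', 'pets_3-of-3.csv']
# Each chunk is written via BulkProcessor.config.file_class and contains the
# header row plus its slice of the input; a second call returns the memoized keys.
```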
data/lib/bulk_processor/job/process_csv.rb ADDED
@@ -0,0 +1,22 @@
+require 'active_job'
+
+class BulkProcessor
+  # ActiveJob to handle processing the CSV in the background
+  module Job
+    class ProcessCSV < ActiveJob::Base
+      queue_as 'bulk_processor'
+
+      def perform(processor_class, payload, key)
+        file = BulkProcessor.config.file_class.new(key)
+        payload = PayloadSerializer.deserialize(payload).merge('key' => key)
+        file.open do |f|
+          csv = CSV.parse(f.read, headers: true)
+          processor = processor_class.constantize.new(csv, payload: payload)
+          processor.start
+        end
+      ensure
+        file.try(:delete)
+      end
+    end
+  end
+end
data/lib/bulk_processor/job/split_csv.rb ADDED
@@ -0,0 +1,41 @@
+require 'active_job'
+
+class BulkProcessor
+  # ActiveJob to handle splitting the CSV in the background
+  module Job
+    class SplitCSV < ActiveJob::Base
+      queue_as 'bulk_processor'
+
+      def perform(processor_class, payload, key, num_chunks)
+        processor_class = processor_class.constantize
+        chunker = row_chunker(processor_class, num_chunks)
+        payload = PayloadSerializer.deserialize(payload)
+        splitter = FileSplitter.new(key: key, row_chunker: chunker)
+        keys = splitter.split!
+        keys.each do |key|
+          BackEnd.start(processor_class: processor_class, payload: payload, key: key)
+        end
+      rescue Exception => error
+        if processor_class.respond_to?(:handler_class)
+          payload = payload.merge('key' => key)
+          handler = processor_class.handler_class.new(payload: payload, results: [])
+          handler.fail!(error)
+        end
+        raise
+      ensure
+        BulkProcessor.config.file_class.new(key).delete
+      end
+
+      private
+
+      def row_chunker(processor_class, num_chunks)
+        if processor_class.respond_to?(:boundary_column)
+          boundary_column = processor_class.boundary_column
+          RowChunker::Boundary.new(num_chunks, boundary_column: boundary_column)
+        else
+          RowChunker::Balanced.new(num_chunks)
+        end
+      end
+    end
+  end
+end
data/lib/bulk_processor/row_chunker/balanced.rb ADDED
@@ -0,0 +1,29 @@
+class BulkProcessor
+  module RowChunker
+    # Determine the partitions for a balanced break up of the input CSV file.
+    # All partitions will have a size within 1 row of every other partition.
+    class Balanced
+      def initialize(num_chunks)
+        @num_chunks = num_chunks
+      end
+
+      def ranges_for(csv)
+        ideal_size = csv.count / num_chunks
+        num_chunks.times.map do |index|
+          start_index = index * ideal_size
+          if index == num_chunks - 1
+            # force the last chunk to go to the very last row
+            end_index = csv.count - 1
+          else
+            end_index = start_index + ideal_size - 1
+          end
+          (start_index..end_index)
+        end
+      end
+
+      private
+
+      attr_reader :num_chunks
+    end
+  end
+end
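
A worked example of the arithmetic above: with integer division the final range absorbs any remainder, so a CSV with 10 data rows split into 3 chunks (`ideal_size = 10 / 3 = 3`) yields:

```ruby
chunker = BulkProcessor::RowChunker::Balanced.new(3)
chunker.ranges_for(csv)  # csv parsed with headers, 10 data rows
# => [0..2, 3..5, 6..9]  # the last range always ends at csv.count - 1
```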
data/lib/bulk_processor/row_chunker/boundary.rb ADDED
@@ -0,0 +1,55 @@
+class BulkProcessor
+  module RowChunker
+    # Determine the partitions that ensure all consecutive rows with the same
+    # value for boundary_column are in the same partition. The CSV must be sorted
+    # on this column to get the desired results. This class makes an attempt to
+    # keep the partition sizes equal, but obviously prioritizes the boundary
+    # column values over partition size.
+    class Boundary
+      def initialize(num_chunks, boundary_column:)
+        @num_chunks = num_chunks
+        @boundary_column = boundary_column
+      end
+
+      def ranges_for(csv)
+        @ranges ||= begin
+          # Start with a balanced partition, then make adjustments from there
+          chunker = Balanced.new(num_chunks)
+          adjust_for_boundaries(chunker.ranges_for(csv), csv)
+        end
+      end
+
+      private
+
+      attr_reader :num_chunks, :boundary_column
+
+      def adjust_for_boundaries(balanced_ranges, csv)
+        balanced_endings = balanced_ranges.map(&:last)
+
+        last_indexes = []
+        while balanced_endings.any?
+          last_index = [last_indexes.last, balanced_endings.shift].compact.max
+          last_index += 1 until at_boundary?(csv, last_index)
+          last_indexes << last_index
+        end
+
+        to_ranges(last_indexes)
+      end
+
+      def to_ranges(last_indexes)
+        first_indexes = last_indexes.dup
+        first_indexes.pop
+        first_indexes.map! { |index| index + 1 }
+        first_indexes.unshift(0)
+        first_indexes.map.with_index do |first_index, index|
+          (first_index..last_indexes[index])
+        end
+      end
+
+      def at_boundary?(csv, index)
+        return true if index == csv.count - 1
+        csv[index][boundary_column] != csv[index + 1][boundary_column]
+      end
+    end
+  end
+end
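
A worked example of the boundary adjustment, assuming 6 rows sorted by `pet_id` with the values `1, 1, 1, 1, 2, 2` and 2 chunks:

```ruby
chunker = BulkProcessor::RowChunker::Boundary.new(2, boundary_column: 'pet_id')
chunker.ranges_for(csv)
# The balanced starting point is [0..2, 3..5], but rows 2 and 3 share pet_id 1,
# so the first chunk's end is pushed forward to the boundary at row 3:
# => [0..3, 4..5]
```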
data/lib/bulk_processor/tasks.rb CHANGED
@@ -7,14 +7,23 @@ class BulkProcessor
     def install_tasks
       namespace :bulk_processor do
         desc 'Start processing a CSV file'
-        task :start, [:processor_class, :payload, :file_class, :key] => :environment do |_task, args|
-          Job.new.perform(
+        task :start, [:processor_class, :payload, :key] => :environment do |_task, args|
+          Job::ProcessCSV.new.perform(
            args[:processor_class],
            args[:payload],
-            args[:file_class],
            args[:key]
          )
        end
+
+        desc 'Split a CSV file and process each piece'
+        task :split, [:processor_class, :payload, :key, :num_chunks] => :environment do |_task, args|
+          Job::SplitCSV.new.perform(
+            args[:processor_class],
+            args[:payload],
+            args[:key],
+            args[:num_chunks]
+          )
+        end
       end
     end
   end
data/lib/bulk_processor/version.rb CHANGED
@@ -1,3 +1,3 @@
 class BulkProcessor
-  VERSION = '0.5.1'.freeze
+  VERSION = '0.6.0'.freeze
 end
data/lib/bulk_processor.rb CHANGED
@@ -2,8 +2,12 @@ require 'bulk_processor/back_end'
 require 'bulk_processor/back_end/active_job'
 require 'bulk_processor/back_end/dynosaur'
 require 'bulk_processor/config'
-require 'bulk_processor/job'
+require 'bulk_processor/file_splitter'
+require 'bulk_processor/job/process_csv'
+require 'bulk_processor/job/split_csv'
 require 'bulk_processor/payload_serializer'
+require 'bulk_processor/row_chunker/balanced'
+require 'bulk_processor/row_chunker/boundary'
 require 'bulk_processor/s3_file'
 require 'bulk_processor/stream_encoder'
 require 'bulk_processor/validated_csv'
@@ -32,8 +36,8 @@ class BulkProcessor
   end
 
   # Validate the CSV and enqueue if for processing in the background.
-  def start(file_class: S3File)
-    if file_class.new(key).exists?
+  def start(num_processes = 1)
+    if BulkProcessor.config.file_class.new(key).exists?
       errors << "Already processing #{key}, please wait for it to finish"
       return false
     end
@@ -47,7 +51,7 @@ class BulkProcessor
     )
 
     if csv.valid?
-      start_backend(file_class, encoded_contents)
+      start_backend(encoded_contents, num_processes)
     else
      errors.concat(csv.errors)
    end
@@ -58,11 +62,11 @@ class BulkProcessor
 
   attr_reader :key, :stream, :processor_class, :payload
 
-  def start_backend(file_class, contents)
-    file = file_class.new(key)
+  def start_backend(contents, num_processes)
+    file = BulkProcessor.config.file_class.new(key)
     file.write(contents)
-    BackEnd.start(processor_class: processor_class, payload: payload,
-                  file_class: file_class, key: key)
+    BackEnd.start(processor_class: processor_class, payload: payload, key: key,
+                  num_processes: num_processes)
   rescue Exception
     # Clean up the file, which is treated as a lock, if we bail out of here
     # unexpectedly.
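
In short, this file's changes shift the public entry point: the per-call `file_class:` keyword is gone (the file class now comes from `BulkProcessor.config.file_class`) and `#start` instead takes an optional process count. A hedged before/after sketch, with `MyFile` standing in for a custom file class:

```ruby
# 0.5.x (removed):
#   processor.start(file_class: MyFile)

# 0.6.0:
BulkProcessor.config.file_class = MyFile  # optional; defaults to BulkProcessor::S3File
processor.start                           # validate, upload, and process in one job
processor.start(5)                        # split into 5 chunks and process in parallel
```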
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulk-processor
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.6.0
 platform: ruby
 authors:
 - Tom Collier, Justin Richard
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-21 00:00:00.000000000 Z
+date: 2016-01-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activejob
@@ -152,8 +152,12 @@ files:
 - lib/bulk_processor/csv_processor/no_op_post_processor.rb
 - lib/bulk_processor/csv_processor/result.rb
 - lib/bulk_processor/csv_processor/row_processor.rb
-- lib/bulk_processor/job.rb
+- lib/bulk_processor/file_splitter.rb
+- lib/bulk_processor/job/process_csv.rb
+- lib/bulk_processor/job/split_csv.rb
 - lib/bulk_processor/payload_serializer.rb
+- lib/bulk_processor/row_chunker/balanced.rb
+- lib/bulk_processor/row_chunker/boundary.rb
 - lib/bulk_processor/s3_file.rb
 - lib/bulk_processor/stream_encoder.rb
 - lib/bulk_processor/tasks.rb
data/lib/bulk_processor/job.rb DELETED
@@ -1,20 +0,0 @@
-require 'active_job'
-
-class BulkProcessor
-  # ActiveJob to handle processing the CSV in the background
-  class Job < ActiveJob::Base
-    queue_as 'bulk_processor'
-
-    def perform(processor_class, payload, file_class, key)
-      file = file_class.constantize.new(key)
-      payload = PayloadSerializer.deserialize(payload)
-      file.open do |f|
-        csv = CSV.parse(f.read, headers: true)
-        processor = processor_class.constantize.new(csv, payload: payload)
-        processor.start
-      end
-    ensure
-      file.try(:delete)
-    end
-  end
-end