bulk-processor 0.6.0 → 0.7.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 77320b807b3cd9862490408058611d9b461cf83f
-   data.tar.gz: 203e565ab7f722c6f639527b4065e2e5f495aa57
+   metadata.gz: f5d0cde0e86097416d0234ead33d2836f4e08c7a
+   data.tar.gz: d176ea1309c0e618ccf718fd8c4320a2b3948831
  SHA512:
-   metadata.gz: 4f89dd796184485f44d0018a9819cf42ec5c147ff20693c63e7055b43fd2e0807e5b268f622e8555157e1ac36b01207c9b23fe6f0212eb2a744924b6e3533d91
-   data.tar.gz: 2373584252697f040d460070a93958cc944c2f0045948233125f4e0ee06d39daf5080191dd0f07e6ad0432e461c7947e01e821a5e81e7dbc994c558571a1da44
+   metadata.gz: 5dfea7cabd9b210116fd21644d34740793f785df799b954c79bdc452f0ec11155575465f0d9ac6bd6b4c1fc2df2a8ca96902c142c99ff15c6a5182761e970f55
+   data.tar.gz: 9b8dc6e9fcd908b6bf0ddc2e6e9c4a36c89ba398ccb11016ed2668c188339b953a4e8c4c58bf8c8025b7ae1b3903170871617fd8e6a808a6d68164ede85af3aa
data/README.md CHANGED
@@ -30,6 +30,15 @@ Bulk processor requires the following configuration
 
  #### Back end: ActiveJob
 
+ Include the `activejob` and back-end queueing gems in your Gemfile, e.g.
+
+ ```ruby
+ # Gemfile
+ gem 'activejob'
+ gem 'bulk-processor'
+ gem 'resque'
+ ```
+
  ```ruby
  BulkProcessor.back_end = :active_job
  BulkProcessor.queue_adapter = <adapter>
@@ -41,6 +50,14 @@ including `:resque`.
 
  #### Back end: Dynosaur
 
+ Include the `dynosaur` gem in your Gemfile, e.g.
+
+ ```ruby
+ # Gemfile
+ gem 'dynosaur'
+ gem 'resque'
+ ```
+
  ```ruby
  BulkProcessor.back_end = :dynosaur
  BulkProcessor.heroku.api_key = 'my-heroku-api-key'
@@ -49,7 +66,11 @@ BulkProcessor.heroku.app_name = 'my-heroku-app-name'
 
  ```ruby
  # Rakefile
- require 'bulk_processor/tasks'
+ require 'bulk_processor/back_end/dynosaur/tasks'
+
+ # If you do not already have an :environment rake task, create a no-op one,
+ # as the Dynosaur tasks depend on it.
+ task :environment
  ```
 
  #### AWS S3
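Taken together, the README changes above amount to a short per-back-end setup. A minimal sketch of the ActiveJob/Resque combination; the initializer path is hypothetical:

```ruby
# config/initializers/bulk_processor.rb (hypothetical location)
require 'bulk_processor'

BulkProcessor.back_end = :active_job   # loads the matching back-end file on demand
BulkProcessor.queue_adapter = :resque  # any adapter ActiveJob supports, per the README
```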
data/bulk-processor.gemspec CHANGED
@@ -21,12 +21,12 @@ success or failure report
  spec.require_paths = ['lib']
  spec.required_ruby_version = '>= 2.1'
 
- spec.add_runtime_dependency 'activejob', '~> 4'
  spec.add_runtime_dependency 'aws-sdk', '~> 2.1'
- spec.add_runtime_dependency 'dynosaur', '~> 0.2.1'
  spec.add_runtime_dependency 'rack', '~> 1.5'
 
+ spec.add_development_dependency 'activejob', '~> 4'
  spec.add_development_dependency 'bundler'
+ spec.add_development_dependency 'dynosaur', '~> 0.2.1'
  spec.add_development_dependency 'pry-byebug', '~> 3'
  spec.add_development_dependency 'rake', '~> 10.4'
  spec.add_development_dependency 'rspec', '~> 3.3'
data/lib/bulk_processor/back_end/active_job/process_csv_job.rb ADDED
@@ -0,0 +1,18 @@
+ class BulkProcessor
+   module BackEnd
+     class ActiveJob
+       # ActiveJob to handle processing the CSV in the background
+       class ProcessCSVJob < ::ActiveJob::Base
+         queue_as 'bulk_processor'
+
+         def perform(processor_class, payload, key)
+           BulkProcessor::ProcessCSV.new(
+             processor_class.constantize,
+             PayloadSerializer.deserialize(payload),
+             key
+           ).perform
+         end
+       end
+     end
+   end
+ end
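`SplitCSVJob` below is wired identically, with an extra `num_chunks` argument. A sketch of a direct enqueue, showing the queue-safe primitives the job expects; `MyCSVProcessor` and the key are hypothetical, and normally `BackEnd::ActiveJob#start` builds these arguments for you:

```ruby
BulkProcessor::BackEnd::ActiveJob::ProcessCSVJob.perform_later(
  'MyCSVProcessor',                                              # class name as a String
  BulkProcessor::PayloadSerializer.serialize('locale' => 'en'),  # payload as a String
  'uploads/accounts.csv'                                         # file key as a String
)
```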
data/lib/bulk_processor/back_end/active_job/split_csv_job.rb ADDED
@@ -0,0 +1,19 @@
+ class BulkProcessor
+   module BackEnd
+     class ActiveJob
+       # ActiveJob to handle processing the CSV in the background
+       class SplitCSVJob < ::ActiveJob::Base
+         queue_as 'bulk_processor'
+
+         def perform(processor_class, payload, key, num_chunks)
+           BulkProcessor::SplitCSV.new(
+             processor_class.constantize,
+             PayloadSerializer.deserialize(payload),
+             key,
+             num_chunks
+           ).perform
+         end
+       end
+     end
+   end
+ end
data/lib/bulk_processor/back_end/active_job.rb CHANGED
@@ -1,20 +1,25 @@
+ require 'active_job'
+
+ require_relative 'active_job/process_csv_job'
+ require_relative 'active_job/split_csv_job'
+
  class BulkProcessor
    module BackEnd
      # Execute jobs via ActiveJob, e.g. Resque
      class ActiveJob
        def initialize(processor_class:, payload:, key:)
-         @processor_class = processor_class
-         @payload = payload
+         @processor_class = processor_class.name
+         @payload = PayloadSerializer.serialize(payload)
          @key = key
        end
 
        def start
-         Job::ProcessCSV.perform_later(processor_class.name, payload, key)
+         ActiveJob::ProcessCSVJob.perform_later(processor_class, payload, key)
        end
 
        def split(num_processes)
-         Job::SplitCSV.perform_later(processor_class.name, payload,
-                                     key, num_processes)
+         ActiveJob::SplitCSVJob.perform_later(processor_class, payload, key,
+                                              num_processes)
        end
 
        private
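The serialization that used to happen in `BackEnd.start` now happens here: the back end stores the class name and the serialized payload, so only primitives cross the queue boundary. A usage sketch with a hypothetical `MyCSVProcessor`:

```ruby
back_end = BulkProcessor::BackEnd::ActiveJob.new(
  processor_class: MyCSVProcessor,  # stored as its String name
  payload: { 'locale' => 'en' },    # stored in serialized form
  key: 'uploads/accounts.csv'
)
back_end.start     # enqueues ProcessCSVJob
back_end.split(4)  # or: enqueues SplitCSVJob with num_chunks = 4
```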
data/lib/bulk_processor/back_end/dynosaur/process_csv_task.rb ADDED
@@ -0,0 +1,26 @@
+ require 'rake'
+
+ class BulkProcessor
+   module BackEnd
+     class ActiveJob
+       class ProcessCSVTask
+         include Rake::DSL
+
+         def install_task
+           namespace :bulk_processor do
+             desc 'Start processing a CSV file'
+             task :start, [:processor_class, :payload, :key] => :environment do |_task, args|
+               BulkProcessor::ProcessCSV.new(
+                 args[:processor_class].constantize,
+                 PayloadSerializer.deserialize(args[:payload]),
+                 args[:key]
+               ).perform
+             end
+           end
+         end
+       end
+     end
+   end
+ end
+
+ BulkProcessor::BackEnd::ActiveJob::ProcessCSVTask.new.install_task
data/lib/bulk_processor/back_end/dynosaur/split_csv_task.rb ADDED
@@ -0,0 +1,27 @@
+ require 'rake'
+
+ class BulkProcessor
+   module BackEnd
+     class ActiveJob
+       class SplitCSVTask
+         include Rake::DSL
+
+         def install_task
+           namespace :bulk_processor do
+             desc 'Split a CSV file and process each piece'
+             task :split, [:processor_class, :payload, :key, :num_chunks] => :environment do |_task, args|
+               BulkProcessor::SplitCSV.new(
+                 args[:processor_class].constantize,
+                 PayloadSerializer.deserialize(args[:payload]),
+                 args[:key],
+                 args[:num_chunks].to_i
+               ).perform
+             end
+           end
+         end
+       end
+     end
+   end
+ end
+
+ BulkProcessor::BackEnd::ActiveJob::SplitCSVTask.new.install_task
data/lib/bulk_processor/back_end/dynosaur/tasks.rb ADDED
@@ -0,0 +1,2 @@
+ require_relative 'process_csv_task'
+ require_relative 'split_csv_task'
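These tasks are normally spawned on a fresh dyno by the Dynosaur back end (next file), but they can be exercised by hand. A minimal sketch, assuming the Rakefile from the README:

```ruby
# Rakefile
require 'bulk_processor/back_end/dynosaur/tasks'

# No-op :environment task; the bulk_processor tasks depend on it.
task :environment
```

With that in place, something like `rake 'bulk_processor:start[MyCSVProcessor,<serialized payload>,uploads/accounts.csv]'` runs the processing inline; `MyCSVProcessor` and the key are hypothetical, and the payload must be the exact string `PayloadSerializer.serialize` produced (comma-free, since rake splits bracketed arguments on commas).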
data/lib/bulk_processor/back_end/dynosaur.rb CHANGED
@@ -1,12 +1,14 @@
  require 'dynosaur'
 
+ require_relative 'dynosaur/tasks'
+
  class BulkProcessor
    module BackEnd
      # Execute jobs via rake tasks that will spawn a new Heroku dyno
      class Dynosaur
        def initialize(processor_class:, payload:, key:)
-         @processor_class = processor_class
-         @payload = payload
+         @processor_class = processor_class.name
+         @payload = PayloadSerializer.serialize(payload)
          @key = key
          configure_dynosaur
        end
@@ -14,7 +16,7 @@ class BulkProcessor
        def start
          args = {
            task: 'bulk_processor:start',
-           args: [processor_class.name, payload, key]
+           args: [processor_class, payload, key]
          }
          ::Dynosaur::Process::Heroku.new(args).start
        end
@@ -22,7 +24,7 @@ class BulkProcessor
        def split(num_processes)
          args = {
            task: 'bulk_processor:split',
-           args: [processor_class.name, payload, key, num_processes]
+           args: [processor_class, payload, key, num_processes.to_s]
          }
          ::Dynosaur::Process::Heroku.new(args).start
        end
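Note the `num_processes.to_s` in `split`: arguments handed to a rake task on a fresh dyno arrive as strings, and the split task above converts back with `.to_i`. A sketch of the Dynosaur path end to end, with `MyCSVProcessor` hypothetical:

```ruby
BulkProcessor.back_end = :dynosaur
BulkProcessor.heroku.api_key = 'my-heroku-api-key'
BulkProcessor.heroku.app_name = 'my-heroku-app-name'

back_end = BulkProcessor::BackEnd::Dynosaur.new(
  processor_class: MyCSVProcessor,
  payload: { 'locale' => 'en' },
  key: 'uploads/accounts.csv'
)
back_end.split(4)  # spawns a dyno running the bulk_processor:split task
```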
data/lib/bulk_processor/back_end.rb CHANGED
@@ -4,7 +4,7 @@ class BulkProcessor
      def start(processor_class:, payload:, key:, num_processes: 1)
        back_end = back_end_class.new(
          processor_class: processor_class,
-         payload: PayloadSerializer.serialize(payload),
+         payload: payload,
          key: key
        )
        num_processes > 1 ? back_end.split(num_processes) : back_end.start
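`BackEnd.start` remains the single entry point; it now hands the payload through untouched and lets each back end serialize it in its own way. A calling sketch (`MyCSVProcessor` hypothetical):

```ruby
BulkProcessor::BackEnd.start(
  processor_class: MyCSVProcessor,
  payload: { 'notify' => 'admin@example.com' },
  key: 'uploads/accounts.csv',
  num_processes: 4  # anything > 1 routes to back_end.split
)
```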
data/lib/bulk_processor/config.rb CHANGED
@@ -1,9 +1,17 @@
  class BulkProcessor
    # Store configuration data set by clients
    class Config
-     attr_reader :queue_adapter
+     attr_reader :back_end, :queue_adapter
      attr_writer :file_class
-     attr_accessor :back_end, :temp_directory
+     attr_accessor :temp_directory
+
+     def back_end=(back_end)
+       require_relative "back_end/#{back_end}"
+       @back_end = back_end
+     rescue LoadError => error
+       puts error.message
+       raise ArgumentError, "Invalid back-end: #{back_end}"
+     end
 
      def queue_adapter=(adapter)
        ActiveJob::Base.queue_adapter = @queue_adapter = adapter
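Because `back_end=` now requires the matching back-end file on assignment, an application only needs the gems for the back end it actually selects, and an unknown name fails fast. A sketch:

```ruby
BulkProcessor.back_end = :active_job  # requires bulk_processor/back_end/active_job

begin
  BulkProcessor.back_end = :no_such_back_end
rescue ArgumentError => error
  error.message  # => "Invalid back-end: no_such_back_end"
end
```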
data/lib/bulk_processor/process_csv.rb ADDED
@@ -0,0 +1,24 @@
+ class BulkProcessor
+   class ProcessCSV
+     def initialize(processor_class, payload, key)
+       @processor_class = processor_class
+       @payload = payload
+       @key = key
+     end
+
+     def perform
+       file = BulkProcessor.config.file_class.new(key)
+       file.open do |f|
+         csv = CSV.parse(f.read, headers: true)
+         processor = processor_class.new(csv, payload: payload.merge('key' => key))
+         processor.start
+       end
+     ensure
+       file.try(:delete)
+     end
+
+     private
+
+     attr_reader :processor_class, :payload, :key
+   end
+ end
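`ProcessCSV` is the shared worker both back ends ultimately delegate to: it opens the file by key, parses it as a headered CSV, and runs the processor with the key merged into the payload, deleting the file afterwards. A direct-use sketch (`MyCSVProcessor` and the key are hypothetical):

```ruby
BulkProcessor::ProcessCSV.new(
  MyCSVProcessor,         # a CSVProcessor subclass
  { 'locale' => 'en' },   # payload; 'key' is merged in automatically
  'uploads/accounts.csv'  # storage key, e.g. an S3 key
).perform
```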
data/lib/bulk_processor/split_csv.rb ADDED
@@ -0,0 +1,46 @@
+ class BulkProcessor
+   class SplitCSV
+     def initialize(processor_class, payload, key, num_chunks)
+       @processor_class = processor_class
+       @payload = payload
+       @key = key
+       @num_chunks = num_chunks
+     end
+
+     def perform
+       splitter = FileSplitter.new(key: key, row_chunker: row_chunker)
+       keys = splitter.split!
+       keys.each do |key|
+         BackEnd.start(processor_class: processor_class, payload: payload, key: key)
+       end
+     rescue Exception => error
+       handle_error(error)
+       raise
+     ensure
+       BulkProcessor.config.file_class.new(key).delete
+     end
+
+     private
+
+     attr_reader :processor_class, :payload, :key, :num_chunks
+
+     def row_chunker
+       if processor_class.respond_to?(:boundary_column)
+         boundary_column = processor_class.boundary_column
+         RowChunker::Boundary.new(num_chunks, boundary_column: boundary_column)
+       else
+         RowChunker::Balanced.new(num_chunks)
+       end
+     end
+
+     def handle_error(error)
+       if processor_class.respond_to?(:handler_class)
+         handler = processor_class.handler_class.new(
+           payload: payload.merge('key' => key),
+           results: []
+         )
+         handler.fail!(error)
+       end
+     end
+   end
+ end
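The chunker choice above hinges on whether the processor class responds to `boundary_column`. A hypothetical processor opting in, so that rows sharing a value in that column are kept in the same chunk by `RowChunker::Boundary`:

```ruby
class AccountCSVProcessor < BulkProcessor::CSVProcessor
  # SplitCSV sees this and picks RowChunker::Boundary instead of
  # RowChunker::Balanced, keeping rows with equal account_id together.
  def self.boundary_column
    'account_id'
  end
end
```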
data/lib/bulk_processor/version.rb CHANGED
@@ -1,3 +1,3 @@
  class BulkProcessor
-   VERSION = '0.6.0'.freeze
+   VERSION = '0.7.0'.freeze
  end
data/lib/bulk_processor.rb CHANGED
@@ -1,14 +1,12 @@
  require 'bulk_processor/back_end'
- require 'bulk_processor/back_end/active_job'
- require 'bulk_processor/back_end/dynosaur'
  require 'bulk_processor/config'
  require 'bulk_processor/file_splitter'
- require 'bulk_processor/job/process_csv'
- require 'bulk_processor/job/split_csv'
  require 'bulk_processor/payload_serializer'
+ require 'bulk_processor/process_csv'
  require 'bulk_processor/row_chunker/balanced'
  require 'bulk_processor/row_chunker/boundary'
  require 'bulk_processor/s3_file'
+ require 'bulk_processor/split_csv'
  require 'bulk_processor/stream_encoder'
  require 'bulk_processor/validated_csv'
  require 'bulk_processor/version'
metadata CHANGED
@@ -1,29 +1,15 @@
  --- !ruby/object:Gem::Specification
  name: bulk-processor
  version: !ruby/object:Gem::Version
-   version: 0.6.0
+   version: 0.7.0
  platform: ruby
  authors:
  - Tom Collier, Justin Richard
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-01-22 00:00:00.000000000 Z
+ date: 2016-01-26 00:00:00.000000000 Z
  dependencies:
- - !ruby/object:Gem::Dependency
-   name: activejob
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '4'
-   type: :runtime
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '4'
  - !ruby/object:Gem::Dependency
    name: aws-sdk
    requirement: !ruby/object:Gem::Requirement
@@ -39,33 +25,33 @@ dependencies:
      - !ruby/object:Gem::Version
        version: '2.1'
  - !ruby/object:Gem::Dependency
-   name: dynosaur
+   name: rack
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.2.1
+         version: '1.5'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.2.1
+         version: '1.5'
  - !ruby/object:Gem::Dependency
-   name: rack
+   name: activejob
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '1.5'
-   type: :runtime
+         version: '4'
+   type: :development
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '1.5'
+         version: '4'
  - !ruby/object:Gem::Dependency
    name: bundler
    requirement: !ruby/object:Gem::Requirement
@@ -80,6 +66,20 @@ dependencies:
      - - ">="
        - !ruby/object:Gem::Version
          version: '0'
+ - !ruby/object:Gem::Dependency
+   name: dynosaur
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.2.1
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.2.1
  - !ruby/object:Gem::Dependency
    name: pry-byebug
    requirement: !ruby/object:Gem::Requirement
@@ -145,7 +145,12 @@ files:
  - lib/bulk_processor.rb
  - lib/bulk_processor/back_end.rb
  - lib/bulk_processor/back_end/active_job.rb
+ - lib/bulk_processor/back_end/active_job/process_csv_job.rb
+ - lib/bulk_processor/back_end/active_job/split_csv_job.rb
  - lib/bulk_processor/back_end/dynosaur.rb
+ - lib/bulk_processor/back_end/dynosaur/process_csv_task.rb
+ - lib/bulk_processor/back_end/dynosaur/split_csv_task.rb
+ - lib/bulk_processor/back_end/dynosaur/tasks.rb
  - lib/bulk_processor/config.rb
  - lib/bulk_processor/csv_processor.rb
  - lib/bulk_processor/csv_processor/no_op_handler.rb
@@ -153,14 +158,13 @@ files:
  - lib/bulk_processor/csv_processor/result.rb
  - lib/bulk_processor/csv_processor/row_processor.rb
  - lib/bulk_processor/file_splitter.rb
- - lib/bulk_processor/job/process_csv.rb
- - lib/bulk_processor/job/split_csv.rb
  - lib/bulk_processor/payload_serializer.rb
+ - lib/bulk_processor/process_csv.rb
  - lib/bulk_processor/row_chunker/balanced.rb
  - lib/bulk_processor/row_chunker/boundary.rb
  - lib/bulk_processor/s3_file.rb
+ - lib/bulk_processor/split_csv.rb
  - lib/bulk_processor/stream_encoder.rb
- - lib/bulk_processor/tasks.rb
  - lib/bulk_processor/validated_csv.rb
  - lib/bulk_processor/version.rb
  homepage:
data/lib/bulk_processor/job/process_csv.rb DELETED
@@ -1,22 +0,0 @@
- require 'active_job'
-
- class BulkProcessor
-   # ActiveJob to handle processing the CSV in the background
-   module Job
-     class ProcessCSV < ActiveJob::Base
-       queue_as 'bulk_processor'
-
-       def perform(processor_class, payload, key)
-         file = BulkProcessor.config.file_class.new(key)
-         payload = PayloadSerializer.deserialize(payload).merge('key' => key)
-         file.open do |f|
-           csv = CSV.parse(f.read, headers: true)
-           processor = processor_class.constantize.new(csv, payload: payload)
-           processor.start
-         end
-       ensure
-         file.try(:delete)
-       end
-     end
-   end
- end
data/lib/bulk_processor/job/split_csv.rb DELETED
@@ -1,41 +0,0 @@
- require 'active_job'
-
- class BulkProcessor
-   # ActiveJob to handle processing the CSV in the background
-   module Job
-     class SplitCSV < ActiveJob::Base
-       queue_as 'bulk_processor'
-
-       def perform(processor_class, payload, key, num_chunks)
-         processor_class = processor_class.constantize
-         chunker = row_chunker(processor_class, num_chunks)
-         payload = PayloadSerializer.deserialize(payload)
-         splitter = FileSplitter.new(key: key, row_chunker: chunker)
-         keys = splitter.split!
-         keys.each do |key|
-           BackEnd.start(processor_class: processor_class, payload: payload, key: key)
-         end
-       rescue Exception => error
-         if processor_class.respond_to?(:handler_class)
-           payload = payload.merge('key' => key)
-           handler = processor_class.handler_class.new(payload: payload, results: [])
-           handler.fail!(error)
-         end
-         raise
-       ensure
-         BulkProcessor.config.file_class.new(key).delete
-       end
-
-       private
-
-       def row_chunker(processor_class, num_chunks)
-         if processor_class.respond_to?(:boundary_column)
-           boundary_column = processor_class.boundary_column
-           RowChunker::Boundary.new(num_chunks, boundary_column: boundary_column)
-         else
-           RowChunker::Balanced.new(num_chunks)
-         end
-       end
-     end
-   end
- end
@@ -1,32 +0,0 @@
1
- require 'rake'
2
-
3
- class BulkProcessor
4
- class Tasks
5
- include Rake::DSL
6
-
7
- def install_tasks
8
- namespace :bulk_processor do
9
- desc 'Start processing a CSV file'
10
- task :start, [:processor_class, :payload, :key] => :environment do |_task, args|
11
- Job::ProcessCSV.new.perform(
12
- args[:processor_class],
13
- args[:payload],
14
- args[:key]
15
- )
16
- end
17
-
18
- desc 'Split a CSV file and process each piece'
19
- task :split, [:processor_class, :payload, :key, :num_chunks] => :environment do |_task, args|
20
- Job::SplitCSV.new.perform(
21
- args[:processor_class],
22
- args[:payload],
23
- args[:key],
24
- args[:num_chunks]
25
- )
26
- end
27
- end
28
- end
29
- end
30
- end
31
-
32
- BulkProcessor::Tasks.new.install_tasks