bulk-processor 0.6.0 → 0.7.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 77320b807b3cd9862490408058611d9b461cf83f
- data.tar.gz: 203e565ab7f722c6f639527b4065e2e5f495aa57
+ metadata.gz: f5d0cde0e86097416d0234ead33d2836f4e08c7a
+ data.tar.gz: d176ea1309c0e618ccf718fd8c4320a2b3948831
  SHA512:
- metadata.gz: 4f89dd796184485f44d0018a9819cf42ec5c147ff20693c63e7055b43fd2e0807e5b268f622e8555157e1ac36b01207c9b23fe6f0212eb2a744924b6e3533d91
- data.tar.gz: 2373584252697f040d460070a93958cc944c2f0045948233125f4e0ee06d39daf5080191dd0f07e6ad0432e461c7947e01e821a5e81e7dbc994c558571a1da44
+ metadata.gz: 5dfea7cabd9b210116fd21644d34740793f785df799b954c79bdc452f0ec11155575465f0d9ac6bd6b4c1fc2df2a8ca96902c142c99ff15c6a5182761e970f55
+ data.tar.gz: 9b8dc6e9fcd908b6bf0ddc2e6e9c4a36c89ba398ccb11016ed2668c188339b953a4e8c4c58bf8c8025b7ae1b3903170871617fd8e6a808a6d68164ede85af3aa
data/README.md CHANGED
@@ -30,6 +30,15 @@ Bulk processor requires the following configuration

  #### Back end: ActiveJob

+ Include the `activejob` and back-end queueing gems in your Gemfile, e.g.
+
+ ```ruby
+ # Gemfile
+ gem 'activejob'
+ gem 'bulk-processor'
+ gem 'resque'
+ ```
+
  ```ruby
  BulkProcessor.back_end = :active_job
  BulkProcessor.queue_adapter = <adapter>
@@ -41,6 +50,14 @@ including `:resque`.

  #### Back end: Dynosaur

+ Include the `dynosaur` gem in your Gemfile, e.g.
+
+ ```ruby
+ # Gemfile
+ gem 'dynosaur'
+ gem 'resque'
+ ```
+
  ```ruby
  BulkProcessor.back_end = :dynosaur
  BulkProcessor.heroku.api_key = 'my-heroku-api-key'
@@ -49,7 +66,12 @@ BulkProcessor.heroku.app_name = 'my-heroku-app-name'

  ```ruby
  # Rakefile
- require 'bulk_processor/tasks'
+ require 'bulk_processor/back_end/dynosaur/tasks'
+
+ # If you do not already have an :environment rake task, create a no-op one as
+ # Dynosaur tasks depend on it.
+ task :environment do
+ end
  ```

  #### AWS S3
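For orientation, the `dynosaur/tasks` file required in the Rakefile snippet above (added in this release, see below) installs two rake tasks, `bulk_processor:start` and `bulk_processor:split`. A minimal sketch of a working Rakefile follows; `MyCSVProcessor` and the S3 key are placeholders, and the payload argument must be a string in whatever format `PayloadSerializer` expects:

```ruby
# Rakefile (a sketch; MyCSVProcessor and the key below are placeholders)
require 'bulk_processor/back_end/dynosaur/tasks'

# No-op :environment task, as the README advises
task :environment do
end

# A spawned dyno would then run, for example:
#   bundle exec rake "bulk_processor:start[MyCSVProcessor,<payload>,uploads/pets.csv]"
```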
bulk-processor.gemspec CHANGED
@@ -21,12 +21,12 @@ success or failure report
  spec.require_paths = ['lib']
  spec.required_ruby_version = '>= 2.1'

- spec.add_runtime_dependency 'activejob', '~> 4'
  spec.add_runtime_dependency 'aws-sdk', '~> 2.1'
- spec.add_runtime_dependency 'dynosaur', '~> 0.2.1'
  spec.add_runtime_dependency 'rack', '~> 1.5'

+ spec.add_development_dependency 'activejob', '~> 4'
  spec.add_development_dependency 'bundler'
+ spec.add_development_dependency 'dynosaur', '~> 0.2.1'
  spec.add_development_dependency 'pry-byebug', '~> 3'
  spec.add_development_dependency 'rake', '~> 10.4'
  spec.add_development_dependency 'rspec', '~> 3.3'
lib/bulk_processor/back_end/active_job/process_csv_job.rb ADDED
@@ -0,0 +1,18 @@
+ class BulkProcessor
+   module BackEnd
+     class ActiveJob
+       # ActiveJob to handle processing the CSV in the background
+       class ProcessCSVJob < ::ActiveJob::Base
+         queue_as 'bulk_processor'
+
+         def perform(processor_class, payload, key)
+           BulkProcessor::ProcessCSV.new(
+             processor_class.constantize,
+             PayloadSerializer.deserialize(payload),
+             key
+           ).perform
+         end
+       end
+     end
+   end
+ end
lib/bulk_processor/back_end/active_job/split_csv_job.rb ADDED
@@ -0,0 +1,19 @@
+ class BulkProcessor
+   module BackEnd
+     class ActiveJob
+       # ActiveJob to handle splitting the CSV in the background
+       class SplitCSVJob < ::ActiveJob::Base
+         queue_as 'bulk_processor'
+
+         def perform(processor_class, payload, key, num_chunks)
+           BulkProcessor::SplitCSV.new(
+             processor_class.constantize,
+             PayloadSerializer.deserialize(payload),
+             key,
+             num_chunks
+           ).perform
+         end
+       end
+     end
+   end
+ end
lib/bulk_processor/back_end/active_job.rb CHANGED
@@ -1,20 +1,25 @@
+ require 'active_job'
+
+ require_relative 'active_job/process_csv_job'
+ require_relative 'active_job/split_csv_job'
+
  class BulkProcessor
    module BackEnd
      # Execute jobs via ActiveJob, e.g. Resque
      class ActiveJob
        def initialize(processor_class:, payload:, key:)
-         @processor_class = processor_class
-         @payload = payload
+         @processor_class = processor_class.name
+         @payload = PayloadSerializer.serialize(payload)
          @key = key
        end

        def start
-         Job::ProcessCSV.perform_later(processor_class.name, payload, key)
+         ActiveJob::ProcessCSVJob.perform_later(processor_class, payload, key)
        end

        def split(num_processes)
-         Job::SplitCSV.perform_later(processor_class.name, payload,
-                                     key, num_processes)
+         ActiveJob::SplitCSVJob.perform_later(processor_class, payload, key,
+                                              num_processes)
        end

        private
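Note how the refactored back end above now owns serialization: `BackEnd.start` (changed further down) passes the raw payload hash through, and the constructor stores the class name and the serialized payload, so only plain strings cross the queue boundary. A rough illustration of the new flow; `PetCSVProcessor` and the payload hash are invented for the example:

```ruby
# Sketch only: the processor class, payload, and key here are placeholders
back_end = BulkProcessor::BackEnd::ActiveJob.new(
  processor_class: PetCSVProcessor,            # stored as the string 'PetCSVProcessor'
  payload: { 'recipient' => 'a@example.com' }, # stored via PayloadSerializer.serialize
  key: 'uploads/pets.csv'
)
back_end.start # enqueues ProcessCSVJob with string-only arguments
```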
lib/bulk_processor/back_end/dynosaur/process_csv_task.rb ADDED
@@ -0,0 +1,26 @@
+ require 'rake'
+
+ class BulkProcessor
+   module BackEnd
+     class Dynosaur
+       class ProcessCSVTask
+         include Rake::DSL
+
+         def install_task
+           namespace :bulk_processor do
+             desc 'Start processing a CSV file'
+             task :start, [:processor_class, :payload, :key] => :environment do |_task, args|
+               BulkProcessor::ProcessCSV.new(
+                 args[:processor_class].constantize,
+                 PayloadSerializer.deserialize(args[:payload]),
+                 args[:key]
+               ).perform
+             end
+           end
+         end
+       end
+     end
+   end
+ end
+
+ BulkProcessor::BackEnd::Dynosaur::ProcessCSVTask.new.install_task
lib/bulk_processor/back_end/dynosaur/split_csv_task.rb ADDED
@@ -0,0 +1,27 @@
+ require 'rake'
+
+ class BulkProcessor
+   module BackEnd
+     class Dynosaur
+       class SplitCSVTask
+         include Rake::DSL
+
+         def install_task
+           namespace :bulk_processor do
+             desc 'Split a CSV file and process each piece'
+             task :split, [:processor_class, :payload, :key, :num_chunks] => :environment do |_task, args|
+               BulkProcessor::SplitCSV.new(
+                 args[:processor_class].constantize,
+                 PayloadSerializer.deserialize(args[:payload]),
+                 args[:key],
+                 args[:num_chunks].to_i
+               ).perform
+             end
+           end
+         end
+       end
+     end
+   end
+ end
+
+ BulkProcessor::BackEnd::Dynosaur::SplitCSVTask.new.install_task
lib/bulk_processor/back_end/dynosaur/tasks.rb ADDED
@@ -0,0 +1,2 @@
+ require_relative 'process_csv_task'
+ require_relative 'split_csv_task'
lib/bulk_processor/back_end/dynosaur.rb CHANGED
@@ -1,12 +1,14 @@
  require 'dynosaur'

+ require_relative 'dynosaur/tasks'
+
  class BulkProcessor
    module BackEnd
      # Execute jobs via rake tasks that will spawn a new Heroku dyno
      class Dynosaur
        def initialize(processor_class:, payload:, key:)
-         @processor_class = processor_class
-         @payload = payload
+         @processor_class = processor_class.name
+         @payload = PayloadSerializer.serialize(payload)
          @key = key
          configure_dynosaur
        end
@@ -14,7 +16,7 @@ class BulkProcessor
        def start
          args = {
            task: 'bulk_processor:start',
-           args: [processor_class.name, payload, key]
+           args: [processor_class, payload, key]
          }
          ::Dynosaur::Process::Heroku.new(args).start
        end
@@ -22,7 +24,7 @@ class BulkProcessor
        def split(num_processes)
          args = {
            task: 'bulk_processor:split',
-           args: [processor_class.name, payload, key, num_processes]
+           args: [processor_class, payload, key, num_processes.to_s]
          }
          ::Dynosaur::Process::Heroku.new(args).start
        end
lib/bulk_processor/back_end.rb CHANGED
@@ -4,7 +4,7 @@ class BulkProcessor
      def start(processor_class:, payload:, key:, num_processes: 1)
        back_end = back_end_class.new(
          processor_class: processor_class,
-         payload: PayloadSerializer.serialize(payload),
+         payload: payload,
          key: key
        )
        num_processes > 1 ? back_end.split(num_processes) : back_end.start
lib/bulk_processor/config.rb CHANGED
@@ -1,9 +1,17 @@
  class BulkProcessor
    # Store configuration data set by clients
    class Config
-     attr_reader :queue_adapter
+     attr_reader :back_end, :queue_adapter
      attr_writer :file_class
-     attr_accessor :back_end, :temp_directory
+     attr_accessor :temp_directory
+
+     def back_end=(back_end)
+       require_relative "back_end/#{back_end}"
+       @back_end = back_end
+     rescue LoadError => error
+       puts error.message
+       raise ArgumentError, "Invalid back-end: #{back_end}"
+     end

      def queue_adapter=(adapter)
        ActiveJob::Base.queue_adapter = @queue_adapter = adapter
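With `back_end=` now requiring its implementation file on demand, only the chosen back end's dependencies need to be installed, which is what allows `activejob` and `dynosaur` to drop to development dependencies in the gemspec. A sketch of the observable behavior; `:sidekiq` is just an arbitrary unsupported value:

```ruby
BulkProcessor.back_end = :active_job # loads back_end/active_job on demand
BulkProcessor.back_end = :sidekiq    # raises ArgumentError, "Invalid back-end: sidekiq"
```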
lib/bulk_processor/process_csv.rb ADDED
@@ -0,0 +1,24 @@
+ class BulkProcessor
+   class ProcessCSV
+     def initialize(processor_class, payload, key)
+       @processor_class = processor_class
+       @payload = payload
+       @key = key
+     end
+
+     def perform
+       file = BulkProcessor.config.file_class.new(key)
+       file.open do |f|
+         csv = CSV.parse(f.read, headers: true)
+         processor = processor_class.new(csv, payload: payload.merge('key' => key))
+         processor.start
+       end
+     ensure
+       file.try(:delete)
+     end
+
+     private
+
+     attr_reader :processor_class, :payload, :key
+   end
+ end
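`ProcessCSV` touches the configured `file_class` only through `new(key)`, `#open`, and `#delete` (`SplitCSV` below uses the same surface), so a local stand-in for `S3File` can be as small as this hypothetical sketch:

```ruby
# Hypothetical local-disk substitute for BulkProcessor.config.file_class,
# implementing only the three calls ProcessCSV and SplitCSV actually make
class LocalFile
  def initialize(key)
    @path = File.join('tmp', key)
  end

  def open(&block)
    File.open(@path, 'r', &block)
  end

  def delete
    File.delete(@path) if File.exist?(@path)
  end
end

BulkProcessor.config.file_class = LocalFile
```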
lib/bulk_processor/split_csv.rb ADDED
@@ -0,0 +1,46 @@
+ class BulkProcessor
+   class SplitCSV
+     def initialize(processor_class, payload, key, num_chunks)
+       @processor_class = processor_class
+       @payload = payload
+       @key = key
+       @num_chunks = num_chunks
+     end
+
+     def perform
+       splitter = FileSplitter.new(key: key, row_chunker: row_chunker)
+       keys = splitter.split!
+       keys.each do |key|
+         BackEnd.start(processor_class: processor_class, payload: payload, key: key)
+       end
+     rescue Exception => error
+       handle_error(error)
+       raise
+     ensure
+       BulkProcessor.config.file_class.new(key).delete
+     end
+
+     private
+
+     attr_reader :processor_class, :payload, :key, :num_chunks
+
+     def row_chunker
+       if processor_class.respond_to?(:boundary_column)
+         boundary_column = processor_class.boundary_column
+         RowChunker::Boundary.new(num_chunks, boundary_column: boundary_column)
+       else
+         RowChunker::Balanced.new(num_chunks)
+       end
+     end
+
+     def handle_error(error)
+       if processor_class.respond_to?(:handler_class)
+         handler = processor_class.handler_class.new(
+           payload: payload.merge('key' => key),
+           results: []
+         )
+         handler.fail!(error)
+       end
+     end
+   end
+ end
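`SplitCSV#row_chunker` picks a strategy by duck-typing on the processor class: declaring `boundary_column` opts into boundary-aware chunking, otherwise rows are split into evenly sized chunks. A hypothetical opt-in, with an invented class and column name:

```ruby
class PetCSVProcessor < BulkProcessor::CSVProcessor
  # Rows that share a value in this column stay in the same chunk
  def self.boundary_column
    'owner_id'
  end
end
```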
lib/bulk_processor/version.rb CHANGED
@@ -1,3 +1,3 @@
  class BulkProcessor
-   VERSION = '0.6.0'.freeze
+   VERSION = '0.7.0'.freeze
  end
lib/bulk_processor.rb CHANGED
@@ -1,14 +1,12 @@
  require 'bulk_processor/back_end'
- require 'bulk_processor/back_end/active_job'
- require 'bulk_processor/back_end/dynosaur'
  require 'bulk_processor/config'
  require 'bulk_processor/file_splitter'
- require 'bulk_processor/job/process_csv'
- require 'bulk_processor/job/split_csv'
  require 'bulk_processor/payload_serializer'
+ require 'bulk_processor/process_csv'
  require 'bulk_processor/row_chunker/balanced'
  require 'bulk_processor/row_chunker/boundary'
  require 'bulk_processor/s3_file'
+ require 'bulk_processor/split_csv'
  require 'bulk_processor/stream_encoder'
  require 'bulk_processor/validated_csv'
  require 'bulk_processor/version'
metadata CHANGED
@@ -1,29 +1,15 @@
  --- !ruby/object:Gem::Specification
  name: bulk-processor
  version: !ruby/object:Gem::Version
-   version: 0.6.0
+   version: 0.7.0
  platform: ruby
  authors:
  - Tom Collier, Justin Richard
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-01-22 00:00:00.000000000 Z
+ date: 2016-01-26 00:00:00.000000000 Z
  dependencies:
- - !ruby/object:Gem::Dependency
-   name: activejob
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '4'
-   type: :runtime
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '4'
  - !ruby/object:Gem::Dependency
    name: aws-sdk
    requirement: !ruby/object:Gem::Requirement
@@ -39,33 +25,33 @@ dependencies:
        - !ruby/object:Gem::Version
          version: '2.1'
  - !ruby/object:Gem::Dependency
-   name: dynosaur
+   name: rack
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.2.1
+         version: '1.5'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.2.1
+         version: '1.5'
  - !ruby/object:Gem::Dependency
-   name: rack
+   name: activejob
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '1.5'
-   type: :runtime
+         version: '4'
+   type: :development
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '1.5'
+         version: '4'
  - !ruby/object:Gem::Dependency
    name: bundler
    requirement: !ruby/object:Gem::Requirement
@@ -80,6 +66,20 @@ dependencies:
        - - ">="
          - !ruby/object:Gem::Version
            version: '0'
+ - !ruby/object:Gem::Dependency
+   name: dynosaur
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.2.1
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.2.1
  - !ruby/object:Gem::Dependency
    name: pry-byebug
    requirement: !ruby/object:Gem::Requirement
@@ -145,7 +145,12 @@ files:
  - lib/bulk_processor.rb
  - lib/bulk_processor/back_end.rb
  - lib/bulk_processor/back_end/active_job.rb
+ - lib/bulk_processor/back_end/active_job/process_csv_job.rb
+ - lib/bulk_processor/back_end/active_job/split_csv_job.rb
  - lib/bulk_processor/back_end/dynosaur.rb
+ - lib/bulk_processor/back_end/dynosaur/process_csv_task.rb
+ - lib/bulk_processor/back_end/dynosaur/split_csv_task.rb
+ - lib/bulk_processor/back_end/dynosaur/tasks.rb
  - lib/bulk_processor/config.rb
  - lib/bulk_processor/csv_processor.rb
  - lib/bulk_processor/csv_processor/no_op_handler.rb
@@ -153,14 +158,13 @@ files:
  - lib/bulk_processor/csv_processor/result.rb
  - lib/bulk_processor/csv_processor/row_processor.rb
  - lib/bulk_processor/file_splitter.rb
- - lib/bulk_processor/job/process_csv.rb
- - lib/bulk_processor/job/split_csv.rb
  - lib/bulk_processor/payload_serializer.rb
+ - lib/bulk_processor/process_csv.rb
  - lib/bulk_processor/row_chunker/balanced.rb
  - lib/bulk_processor/row_chunker/boundary.rb
  - lib/bulk_processor/s3_file.rb
+ - lib/bulk_processor/split_csv.rb
  - lib/bulk_processor/stream_encoder.rb
- - lib/bulk_processor/tasks.rb
  - lib/bulk_processor/validated_csv.rb
  - lib/bulk_processor/version.rb
  homepage:
lib/bulk_processor/job/process_csv.rb DELETED
@@ -1,22 +0,0 @@
- require 'active_job'
-
- class BulkProcessor
-   # ActiveJob to handle processing the CSV in the background
-   module Job
-     class ProcessCSV < ActiveJob::Base
-       queue_as 'bulk_processor'
-
-       def perform(processor_class, payload, key)
-         file = BulkProcessor.config.file_class.new(key)
-         payload = PayloadSerializer.deserialize(payload).merge('key' => key)
-         file.open do |f|
-           csv = CSV.parse(f.read, headers: true)
-           processor = processor_class.constantize.new(csv, payload: payload)
-           processor.start
-         end
-       ensure
-         file.try(:delete)
-       end
-     end
-   end
- end
lib/bulk_processor/job/split_csv.rb DELETED
@@ -1,41 +0,0 @@
- require 'active_job'
-
- class BulkProcessor
-   # ActiveJob to handle splitting the CSV in the background
-   module Job
-     class SplitCSV < ActiveJob::Base
-       queue_as 'bulk_processor'
-
-       def perform(processor_class, payload, key, num_chunks)
-         processor_class = processor_class.constantize
-         chunker = row_chunker(processor_class, num_chunks)
-         payload = PayloadSerializer.deserialize(payload)
-         splitter = FileSplitter.new(key: key, row_chunker: chunker)
-         keys = splitter.split!
-         keys.each do |key|
-           BackEnd.start(processor_class: processor_class, payload: payload, key: key)
-         end
-       rescue Exception => error
-         if processor_class.respond_to?(:handler_class)
-           payload = payload.merge('key' => key)
-           handler = processor_class.handler_class.new(payload: payload, results: [])
-           handler.fail!(error)
-         end
-         raise
-       ensure
-         BulkProcessor.config.file_class.new(key).delete
-       end
-
-       private
-
-       def row_chunker(processor_class, num_chunks)
-         if processor_class.respond_to?(:boundary_column)
-           boundary_column = processor_class.boundary_column
-           RowChunker::Boundary.new(num_chunks, boundary_column: boundary_column)
-         else
-           RowChunker::Balanced.new(num_chunks)
-         end
-       end
-     end
-   end
- end
lib/bulk_processor/tasks.rb DELETED
@@ -1,32 +0,0 @@
- require 'rake'
-
- class BulkProcessor
-   class Tasks
-     include Rake::DSL
-
-     def install_tasks
-       namespace :bulk_processor do
-         desc 'Start processing a CSV file'
-         task :start, [:processor_class, :payload, :key] => :environment do |_task, args|
-           Job::ProcessCSV.new.perform(
-             args[:processor_class],
-             args[:payload],
-             args[:key]
-           )
-         end
-
-         desc 'Split a CSV file and process each piece'
-         task :split, [:processor_class, :payload, :key, :num_chunks] => :environment do |_task, args|
-           Job::SplitCSV.new.perform(
-             args[:processor_class],
-             args[:payload],
-             args[:key],
-             args[:num_chunks]
-           )
-         end
-       end
-     end
-   end
- end
-
- BulkProcessor::Tasks.new.install_tasks