rocketjob 6.0.0.rc2 → 6.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/README.md +164 -8
  3. data/lib/rocket_job/batch/categories.rb +26 -24
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/batch.rb +0 -1
  7. data/lib/rocket_job/category/base.rb +10 -7
  8. data/lib/rocket_job/category/input.rb +61 -1
  9. data/lib/rocket_job/category/output.rb +9 -0
  10. data/lib/rocket_job/dirmon_entry.rb +1 -1
  11. data/lib/rocket_job/job_exception.rb +1 -1
  12. data/lib/rocket_job/jobs/conversion_job.rb +43 -0
  13. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  14. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  15. data/lib/rocket_job/jobs/on_demand_batch_job.rb +15 -11
  16. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  17. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  18. data/lib/rocket_job/plugins/cron.rb +60 -20
  19. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  20. data/lib/rocket_job/plugins/restart.rb +3 -110
  21. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  22. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +10 -5
  23. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  24. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  25. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  26. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  27. data/lib/rocket_job/sliced/input.rb +42 -54
  28. data/lib/rocket_job/sliced/slice.rb +7 -3
  29. data/lib/rocket_job/sliced/slices.rb +12 -9
  30. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  31. data/lib/rocket_job/sliced.rb +1 -19
  32. data/lib/rocket_job/subscribers/secret_config.rb +17 -0
  33. data/lib/rocket_job/supervisor.rb +10 -8
  34. data/lib/rocket_job/version.rb +1 -1
  35. data/lib/rocketjob.rb +4 -3
  36. metadata +12 -12
  37. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  38. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  39. data/lib/rocket_job/batch/tabular.rb +0 -58
data/lib/rocket_job/plugins/restart.rb
@@ -2,128 +2,21 @@ require "active_support/concern"
 
 module RocketJob
   module Plugins
-    # Automatically starts a new instance of this job anytime it fails, aborts, or completes.
-    #
-    # Notes:
-    # * Restartable jobs automatically abort if they fail. This prevents the failed job from being retried.
-    #   - To disable this behavior, add the following empty method:
-    #       def rocket_job_restart_abort
-    #       end
-    # * On destroy this job is destroyed without starting a new instance.
-    # * On Abort a new instance is created.
-    # * Include `RocketJob::Plugins::Singleton` to prevent multiple copies of a job from running at
-    #   the same time.
-    # * The job will not be restarted if:
-    #   - A validation fails after creating the new instance of this job.
-    #   - The job has expired.
-    # * Only the fields that have `copy_on_restart: true` will be passed onto the new instance of this job.
-    #
-    # Example:
-    #
-    # class RestartableJob < RocketJob::Job
-    #   include RocketJob::Plugins::Restart
-    #
-    #   # Retain the completed job under the completed tab in Rocket Job Web Interface.
-    #   self.destroy_on_complete = false
-    #
-    #   # Will be copied to the new job on restart.
-    #   field :limit, type: Integer, copy_on_restart: true
-    #
-    #   # Will _not_ be copied to the new job on restart.
-    #   field :list, type: Array, default: [1,2,3]
-    #
-    #   # Set run_at every time a new instance of the job is created.
-    #   after_initialize set_run_at, if: :new_record?
-    #
-    #   def perform
-    #     puts "The limit is #{limit}"
-    #     puts "The list is #{list}"
-    #     'DONE'
-    #   end
-    #
-    #   private
-    #
-    #   # Run this job in 30 minutes.
-    #   def set_run_at
-    #     self.run_at = 30.minutes.from_now
-    #   end
-    # end
-    #
-    # job = RestartableJob.create!(limit: 10, list: [4,5,6])
-    # job.reload.state
-    # # => :queued
-    #
-    # job.limit
-    # # => 10
-    #
-    # job.list
-    # # => [4,5,6]
-    #
-    # # Wait 30 minutes ...
-    #
-    # job.reload.state
-    # # => :completed
-    #
-    # # A new instance was automatically created.
-    # job2 = RestartableJob.last
-    # job2.state
-    # # => :queued
-    #
-    # job2.limit
-    # # => 10
-    #
-    # job2.list
-    # # => [1,2,3]
+    # @deprecated
     module Restart
       extend ActiveSupport::Concern
 
       included do
-        after_abort :rocket_job_restart_new_instance
-        after_complete :rocket_job_restart_new_instance
+        after_abort :create_restart!
+        after_complete :create_restart!
         after_fail :rocket_job_restart_abort
       end
 
      private
 
-      # Run again in the future, even if this run fails with an exception
-      def rocket_job_restart_new_instance
-        if expired?
-          logger.info("Job has expired. Not creating a new instance.")
-          return
-        end
-        job_attrs =
-          rocket_job_restart_attributes.each_with_object({}) { |attr, attrs| attrs[attr] = send(attr) }
-        job = self.class.new(job_attrs)
-
-        # Copy across input and output categories to new scheduled job so that all of the
-        # settings are remembered between instance. Example: slice_size
-        job.input_categories = input_categories if respond_to?(:input_categories)
-        job.output_categories = output_categories if respond_to?(:output_categories)
-
-        rocket_job_restart_save(job)
-      end
-
       def rocket_job_restart_abort
         new_record? ? abort : abort!
       end
-
-      # Allow Singleton to prevent the creation of a new job if one is already running
-      # Retry since the delete may not have persisted to disk yet.
-      def rocket_job_restart_save(job, retry_limit = 10, sleep_interval = 0.5)
-        count = 0
-        while count < retry_limit
-          if job.save
-            logger.info("Created a new job instance: #{job.id}")
-            return true
-          else
-            logger.info("Job already active, retrying after a short sleep")
-            sleep(sleep_interval)
-          end
-          count += 1
-        end
-        logger.error("New job instance not started: #{job.errors.messages.inspect}")
-        false
-      end
     end
   end
 end
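
The Restart plugin is now a deprecated shim: its lifecycle hooks delegate to `create_restart!`, which, going by the file list above, now lives in `data/lib/rocket_job/plugins/job/persistence.rb` (+36). A minimal sketch of a job that still uses the deprecated plugin under 6.x, assuming `create_restart!` preserves the old `copy_on_restart` semantics (class and field names are illustrative):

    class NightlyRebuildJob < RocketJob::Job
      # Deprecated as of this release; for recurring jobs, prefer the reworked
      # RocketJob::Plugins::Cron (+60 -20 in this release).
      include RocketJob::Plugins::Restart

      # Keep the completed job visible in the web interface.
      self.destroy_on_complete = false

      # Copied onto the replacement job when this one completes or aborts.
      field :limit, type: Integer, copy_on_restart: true

      def perform
        # ... work bounded by `limit` ...
      end
    end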
data/lib/rocket_job/plugins/state_machine.rb
@@ -36,8 +36,8 @@ module RocketJob
       raise(ArgumentError, "Cannot supply both a method name and a block") if methods.size.positive? && block
       raise(ArgumentError, "Must supply either a method name or a block") unless methods.size.positive? || block
 
-      # TODO: Somehow get AASM to support options such as :if and :unless to be consistent with other callbacks
-      # For example:
+      # Limitation with AASM. It only supports guards on event transitions, not for callbacks.
+      # For example, AASM does not support callback options such as :if and :unless, yet Rails callbacks do.
       #   before_start :my_callback, unless: :encrypted?
       #   before_start :my_callback, if: :encrypted?
       event = aasm.state_machine.events[event_name]
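
Because these callbacks are backed by AASM events, `:if`/`:unless` guards are not available on them; the usual workaround is to guard inside the callback method itself. An illustrative sketch (`encrypted?` is a hypothetical predicate on the job):

    class ImportFilesJob < RocketJob::Job
      before_start :notify_encrypted

      private

      # Emulates `before_start :notify_encrypted, if: :encrypted?`,
      # which the AASM-backed callbacks cannot express directly.
      def notify_encrypted
        return unless encrypted?

        logger.info("Starting an encrypted import")
      end
    end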
data/lib/rocket_job/plugins/throttle_dependent_jobs.rb
@@ -11,17 +11,22 @@ module RocketJob
     extend ActiveSupport::Concern
 
     included do
-      class_attribute :dependent_jobs
-      self.dependent_jobs = nil
+      field :dependent_jobs, type: Array, class_attribute: true, user_editable: true, copy_on_restart: true
 
-      define_throttle :dependent_job_exists?
-      define_batch_throttle :dependent_job_exists? if respond_to?(:define_batch_throttle)
+      define_throttle :dependent_jobs_running?
+      define_batch_throttle :dependent_jobs_running? if respond_to?(:define_batch_throttle)
+    end
+
+    class_methods do
+      def depends_on_job(*jobs)
+        self.dependent_jobs = Array(jobs).collect(&:to_s)
+      end
     end
 
     private
 
     # Checks if there are any dependent jobs are running
-    def dependent_job_exists?
+    def dependent_jobs_running?
      return false if dependent_jobs.blank?
 
      jobs_count = RocketJob::Job.running.where(:_type.in => dependent_jobs).count
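
A usage sketch for the new `depends_on_job` DSL (the job class names are hypothetical): the throttle holds this job back while any of the named jobs is in the `running` state.

    class DailyReportJob < RocketJob::Job
      include RocketJob::Plugins::ThrottleDependentJobs

      # Accepts classes or strings; stored as class names in `dependent_jobs`.
      depends_on_job "NightlyExtractJob", "SettlementJob"

      def perform
        # Runs only once no dependent job is running.
      end
    end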
data/lib/rocket_job/sliced/bzip2_output_slice.rb
@@ -7,36 +7,35 @@ module RocketJob
    #   * The `bzip2` linux command line utility supports multiple embedded BZip2 stream,
    #     but some other custom implementations may not. They may only read the first slice and stop.
    #   * It is only designed for use on output collections.
-   #
-   # To download the output when using this slice:
-   #
-   #   # Download the binary BZip2 streams into a single file
-   #   IOStreams.path(output_file_name).stream(:none).writer do |io|
-   #     job.download { |slice| io << slice[:binary] }
-   #   end
    class BZip2OutputSlice < ::RocketJob::Sliced::Slice
-      # This is a specialized binary slice for creating binary data from each slice
+      # This is a specialized binary slice for creating BZip2 binary data from each slice
       # that must be downloaded as-is into output files.
-      def self.binary?
-        true
+      def self.binary_format
+        :bz2
+      end
+
+      # Compress the supplied records with BZip2
+      def self.to_binary(records, record_delimiter = "\n")
+        return [] if records.blank?
+
+        lines = Array(records).join(record_delimiter) + record_delimiter
+        s     = StringIO.new
+        IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+        s.string
      end
 
      private
 
+      # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
      def parse_records
-        records = attributes.delete("records")
-
        # Convert BSON::Binary to a string
-        @records = [{binary: records.data}]
+        @records = [attributes.delete("records").data]
      end
 
+      # Returns [BSON::Binary] the records compressed using BZip2 into a string.
      def serialize_records
-        return [] if @records.nil? || @records.empty?
-
-        lines = records.to_a.join("\n") + "\n"
-        s = StringIO.new
-        IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
-        BSON::Binary.new(s.string)
+        # TODO: Make the line terminator configurable
+        BSON::Binary.new(self.class.to_binary(@records))
      end
    end
  end
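
A quick round trip through the new class method; the read side is a sketch assuming the matching `IOStreams::Bzip2::Reader` API from the iostreams gem:

    require "stringio"

    # Join three records with the delimiter and BZip2-compress them.
    data = RocketJob::Sliced::BZip2OutputSlice.to_binary(%w[alpha beta gamma])

    # Reading the stream back yields the original newline-delimited records.
    IOStreams::Bzip2::Reader.stream(StringIO.new(data)) do |io|
      print io.read # => "alpha\nbeta\ngamma\n"
    end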
data/lib/rocket_job/sliced/compressed_slice.rb
@@ -6,13 +6,10 @@ module RocketJob
      private
 
      def parse_records
-        records = attributes.delete("records")
-
        # Convert BSON::Binary to a string
-        binary_str = records.data
-
-        str = Zlib::Inflate.inflate(binary_str)
-        @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+        compressed_str   = attributes.delete("records").data
+        decompressed_str = Zlib::Inflate.inflate(compressed_str)
+        @records         = Hash.from_bson(BSON::ByteBuffer.new(decompressed_str))["r"]
      end
 
      def serialize_records
data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb
@@ -0,0 +1,49 @@
+module RocketJob
+  module Sliced
+    # This is a specialized output serializer that renders each output slice as a single BZip2 compressed stream.
+    # BZip2 allows multiple output streams to be written into a single BZip2 file.
+    #
+    # Notes:
+    # * The `bzip2` linux command line utility supports multiple embedded BZip2 stream,
+    #   but some other custom implementations may not. They may only read the first slice and stop.
+    # * It is only designed for use on output collections.
+    class EncryptedBZip2OutputSlice < ::RocketJob::Sliced::Slice
+      # This is a specialized binary slice for creating BZip2 binary data from each slice
+      # that must be downloaded as-is into output files.
+      def self.binary_format
+        :bz2
+      end
+
+      private
+
+      # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
+      def parse_records
+        # Convert BSON::Binary to a string
+        encrypted_str = attributes.delete("records").data
+
+        # Decrypt string
+        header = SymmetricEncryption::Header.new
+        header.parse(encrypted_str)
+        # Use the header that is present to decrypt the data, since its version could be different
+        decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
+
+        @records = [decrypted_str]
+      end
+
+      # Returns [BSON::Binary] the records compressed using BZip2 into a string.
+      def serialize_records
+        return [] if @records.nil? || @records.empty?
+
+        # TODO: Make the line terminator configurable
+        lines = records.to_a.join("\n") + "\n"
+        s     = StringIO.new
+        IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+
+        # Encrypt to binary without applying an encoding such as Base64
+        # Use a random_iv with each encryption for better security
+        data = SymmetricEncryption.cipher.binary_encrypt(s.string, random_iv: true, compress: false)
+        BSON::Binary.new(data)
+      end
+    end
+  end
+end
data/lib/rocket_job/sliced/encrypted_slice.rb
@@ -6,17 +6,15 @@ module RocketJob
      private
 
      def parse_records
-        records = attributes.delete("records")
-
        # Convert BSON::Binary to a string
-        binary_str = records.data
+        encrypted_str = attributes.delete("records").data
 
        header = SymmetricEncryption::Header.new
-        header.parse(binary_str)
+        header.parse(encrypted_str)
        # Use the header that is present to decrypt the data, since its version could be different
-        str = header.cipher.binary_decrypt(binary_str, header: header)
+        decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
 
-        @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+        @records = Hash.from_bson(BSON::ByteBuffer.new(decrypted_str))["r"]
      end
 
      def serialize_records
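
The decrypt path is self-describing: the header embedded in the ciphertext names the cipher version, so slices written under an older key still decrypt. A round-trip sketch, assuming SymmetricEncryption is configured with a key:

    # Encrypt to binary without Base64, using a random IV (as the slices above do).
    ciphertext = SymmetricEncryption.cipher.binary_encrypt("hello", random_iv: true, compress: false)

    # Decrypt with whichever cipher version the embedded header specifies.
    header = SymmetricEncryption::Header.new
    header.parse(ciphertext)
    header.cipher.binary_decrypt(ciphertext, header: header) # => "hello"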
data/lib/rocket_job/sliced/input.rb
@@ -1,16 +1,16 @@
 module RocketJob
   module Sliced
     class Input < Slices
-      def upload(on_first: nil, &block)
+      def upload(**args, &block)
         # Create indexes before uploading
         create_indexes
-        Writer::Input.collect(self, on_first: on_first, &block)
+        Writer::Input.collect(self, **args, &block)
       rescue Exception => e
         drop
         raise(e)
       end
 
-      def upload_mongo_query(criteria, *column_names, &block)
+      def upload_mongo_query(criteria, columns: [], slice_batch_size: nil, &block)
         options = criteria.options
 
         # Without a block extract the fields from the supplied criteria
@@ -18,23 +18,21 @@ module RocketJob
           # Criteria is returning old school :fields instead of :projections
           options[:projection] = options.delete(:fields) if options.key?(:fields)
         else
-          column_names = column_names.collect(&:to_s)
-          column_names << "_id" if column_names.size.zero?
-
-          fields = options.delete(:fields) || {}
-          column_names.each { |col| fields[col] = 1 }
+          columns = columns.blank? ? ["_id"] : columns.collect(&:to_s)
+          fields  = options.delete(:fields) || {}
+          columns.each { |col| fields[col] = 1 }
           options[:projection] = fields
 
           block =
-            if column_names.size == 1
-              column = column_names.first
+            if columns.size == 1
+              column = columns.first
               ->(document) { document[column] }
            else
-              ->(document) { column_names.collect { |c| document[c] } }
+              ->(document) { columns.collect { |c| document[c] } }
            end
        end
 
-        upload do |records|
+        upload(slice_batch_size: slice_batch_size) do |records|
          # Drop down to the mongo driver level to avoid constructing a Model for each document returned
          criteria.klass.collection.find(criteria.selector, options).each do |document|
            records << block.call(document)
@@ -42,58 +40,48 @@ module RocketJob
          end
        end
      end
 
-      def upload_arel(arel, *column_names, &block)
+      def upload_arel(arel, columns: nil, slice_batch_size: nil, &block)
        unless block
-          column_names = column_names.empty? ? [:id] : column_names.collect(&:to_sym)
+          columns = columns.blank? ? [:id] : columns.collect(&:to_sym)
 
          block =
-            if column_names.size == 1
-              column = column_names.first
-              ->(model) { model.send(column) }
+            if columns.size == 1
+              column = columns.first
+              ->(model) { model.public_send(column) }
            else
-              ->(model) { column_names.collect { |c| model.send(c) } }
+              ->(model) { columns.collect { |c| model.public_send(c) } }
            end
          # find_each requires the :id column in the query
-          selection = column_names.include?(:id) ? column_names : column_names + [:id]
+          selection = columns.include?(:id) ? columns : columns + [:id]
          arel = arel.select(selection)
        end
 
-        upload { |records| arel.find_each { |model| records << block.call(model) } }
+        upload(slice_batch_size: slice_batch_size) { |records| arel.find_each { |model| records << block.call(model) } }
      end
 
-      def upload_integer_range(start_id, last_id)
-        # Create indexes before uploading
-        create_indexes
-        count = 0
-        while start_id <= last_id
-          end_id = start_id + slice_size - 1
-          end_id = last_id if end_id > last_id
-          create!(records: [[start_id, end_id]])
-          start_id += slice_size
-          count += 1
+      def upload_integer_range(start_id, last_id, slice_batch_size: 1_000)
+        # Each "record" is actually a range of Integers which makes up each slice
+        upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+          while start_id <= last_id
+            end_id = start_id + slice_size - 1
+            end_id = last_id if end_id > last_id
+            records << [start_id, end_id]
+            start_id += slice_size
+          end
        end
-        count
-      rescue Exception => e
-        drop
-        raise(e)
      end
 
-      def upload_integer_range_in_reverse_order(start_id, last_id)
-        # Create indexes before uploading
-        create_indexes
-        end_id = last_id
-        count = 0
-        while end_id >= start_id
-          first_id = end_id - slice_size + 1
-          first_id = start_id if first_id.negative? || (first_id < start_id)
-          create!(records: [[first_id, end_id]])
-          end_id -= slice_size
-          count += 1
+      def upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: 1_000)
+        # Each "record" is actually a range of Integers which makes up each slice
+        upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+          end_id = last_id
+          while end_id >= start_id
+            first_id = end_id - slice_size + 1
+            first_id = start_id if first_id.negative? || (first_id < start_id)
+            records << [first_id, end_id]
+            end_id -= slice_size
+          end
        end
      end
 
      # Iterate over each failed record, if any
@@ -137,11 +125,11 @@ module RocketJob
      # TODO: Will it perform faster without the id sort?
      # I.e. Just process on a FIFO basis?
      document = all.queued.
-        sort("_id" => 1).
-        find_one_and_update(
-          {"$set" => {worker_name: worker_name, state: "running", started_at: Time.now}},
-          return_document: :after
-        )
+                 sort("_id" => 1).
+                 find_one_and_update(
+                   {"$set" => {worker_name: worker_name, state: "running", started_at: Time.now}},
+                   return_document: :after
+                 )
      document.collection_name = collection_name if document
      document
    end
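
Caller-side sketch of the reworked signatures; the model, scope, and counts are illustrative, and `job.input` assumes a batch job:

    # 6.0.0.rc2: job.input.upload_arel(User.active, :id, :email)
    # 6.0.2:     column names move to a keyword argument.
    job.input.upload_arel(User.active, columns: [:id, :email])

    # Integer ranges now flow through the same batched writer: one range per
    # slice, bulk-inserted in groups of `slice_batch_size`.
    job.input.upload_integer_range(1, 1_000_000, slice_batch_size: 500)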
data/lib/rocket_job/sliced/slice.rb
@@ -95,9 +95,13 @@ module RocketJob
      end
 
      # Returns whether this is a specialized binary slice for creating binary data from each slice
-      # that is then just downloaded as-is into output files.
-      def self.binary?
-        false
+      # that is downloaded without conversion into output files.
+      def self.binary_format
+      end
+
+      # For binary formats only, format the supplied records into the binary format for this slice
+      def self.to_binary(_records)
+        raise NotImplementedError
      end
 
      # `records` array has special handling so that it can be modified in place instead of having
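
`binary_format`/`to_binary` form the contract that `BZip2OutputSlice` and the new `EncryptedBZip2OutputSlice` implement. A hypothetical serializer following the same contract (Gzip output is not shipped in this release; the class is illustrative and assumes `IOStreams::Gzip::Writer`):

    class GzipOutputSlice < RocketJob::Sliced::Slice
      # Advertise the binary format so downloads skip re-conversion.
      def self.binary_format
        :gz
      end

      # Render the slice's records into a single Gzip stream.
      def self.to_binary(records, record_delimiter = "\n")
        return [] if records.blank?

        lines = Array(records).join(record_delimiter) + record_delimiter
        s     = StringIO.new
        IOStreams::Gzip::Writer.stream(s) { |io| io.write(lines) }
        s.string
      end
    end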
data/lib/rocket_job/sliced/slices.rb
@@ -42,12 +42,6 @@ module RocketJob
        slice
      end
 
-      # Returns whether this collection contains specialized binary slices for creating binary data from each slice
-      # that is then just downloaded as-is into output files.
-      def binary?
-        slice_class.binary?
-      end
-
      # Returns output slices in the order of their id
      # which is usually the order in which they were written.
      def each(&block)
@@ -96,6 +90,11 @@ module RocketJob
        slice
      end
 
+      def insert_many(slices)
+        documents = slices.collect(&:as_document)
+        all.collection.insert_many(documents)
+      end
+
      # Append to an existing slice if already present
      def append(slice, input_slice)
        existing_slice = all.where(id: input_slice.id).first
@@ -111,9 +110,13 @@ module RocketJob
 
      # Index for find_and_modify only if it is not already present
      def create_indexes
-        all.collection.indexes.create_one(state: 1, _id: 1) if all.collection.indexes.none? { |i| i["name"] == "state_1__id_1" }
-      rescue Mongo::Error::OperationFailure
-        all.collection.indexes.create_one(state: 1, _id: 1)
+        missing =
+          begin
+            all.collection.indexes.none? { |i| i["name"] == "state_1__id_1" }
+          rescue Mongo::Error::OperationFailure
+            true
+          end
+        all.collection.indexes.create_one({state: 1, _id: 1}, unique: true) if missing
      end
 
      # Forward additional methods.
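
`insert_many` is how the reworked `Writer::Input` (below) persists an accumulated batch of slices in a single Mongo round trip instead of one insert per slice. A minimal sketch against a batch job's `input` collection:

    slices = 2.times.collect do |i|
      slice = job.input.new(first_record_number: i * 2 + 1)
      slice << "record #{i * 2 + 1}"
      slice << "record #{i * 2 + 2}"
      slice
    end

    # One bulk write instead of two single-document inserts.
    job.input.insert_many(slices)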
data/lib/rocket_job/sliced/writer/input.rb
@@ -12,43 +12,71 @@ module RocketJob
      #   Block to call on the first line only, instead of storing in the slice.
      #   Useful for extracting the header row
      #   Default: nil
-      def self.collect(input, **args)
-        writer = new(input, **args)
+      #
+      # slice_size: [Integer]
+      #   Override the slice size when uploading for example ranges, where slice is the size
+      #   of the range itself.
+      #
+      # slice_batch_size: [Integer]
+      #   The number of slices to batch up and to bulk load.
+      #   For smaller slices this significantly improves upload performance.
+      #   Note: If `slice_batch_size` is too high, it can exceed the maximum BSON block size.
+      def self.collect(data_store, **args)
+        writer = new(data_store, **args)
        yield(writer)
        writer.record_count
      ensure
-        writer&.close
+        writer&.flush
      end
 
-      def initialize(input, on_first: nil)
-        @on_first      = on_first
-        @batch_count   = 0
-        @record_count  = 0
-        @input         = input
-        @record_number = 1
-        @slice         = @input.new(first_record_number: @record_number)
+      def initialize(data_store, on_first: nil, slice_size: nil, slice_batch_size: nil)
+        @on_first         = on_first
+        @record_count     = 0
+        @data_store       = data_store
+        @slice_size       = slice_size || @data_store.slice_size
+        @slice_batch_size = slice_batch_size || 20
+        @batch            = []
+        @batch_count      = 0
+        new_slice
      end
 
      def <<(line)
-        @record_number += 1
        if @on_first
          @on_first.call(line)
          @on_first = nil
          return self
        end
        @slice << line
-        @batch_count += 1
        @record_count += 1
-        if @batch_count >= @input.slice_size
-          @input.insert(@slice)
-          @batch_count = 0
-          @slice = @input.new(first_record_number: @record_number)
+        if @slice.size >= @slice_size
+          save_slice
+          new_slice
        end
        self
      end
 
-      def close
-        @input.insert(@slice) if @slice.size.positive?
+      def flush
+        if @slice_batch_size
+          @batch << @slice if @slice.size.positive?
+          @data_store.insert_many(@batch)
+          @batch = []
+          @batch_count = 0
+        elsif @slice.size.positive?
+          @data_store.insert(@slice)
+        end
+      end
+
+      def new_slice
+        @slice = @data_store.new(first_record_number: @record_count + 1)
+      end
+
+      def save_slice
+        return flush unless @slice_batch_size
+
+        @batch_count += 1
+        return flush if @batch_count >= @slice_batch_size
+
+        @batch << @slice
      end
    end
  end
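
End-to-end sketch of the batched writer; the file name is illustrative and `each(:line)` assumes the iostreams path API:

    count = RocketJob::Sliced::Writer::Input.collect(job.input, slice_batch_size: 100) do |records|
      IOStreams.path("import.csv").each(:line) { |line| records << line }
    end
    # Slices accumulate in memory in groups of 100 and are written via insert_many;
    # `collect` flushes the final partial batch and returns the record count.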
data/lib/rocket_job/sliced.rb
@@ -2,6 +2,7 @@ module RocketJob
  module Sliced
    autoload :BZip2OutputSlice,          "rocket_job/sliced/bzip2_output_slice"
    autoload :CompressedSlice,           "rocket_job/sliced/compressed_slice"
+    autoload :EncryptedBZip2OutputSlice, "rocket_job/sliced/encrypted_bzip2_output_slice"
    autoload :EncryptedSlice,            "rocket_job/sliced/encrypted_slice"
    autoload :Input,                     "rocket_job/sliced/input"
    autoload :Output,                    "rocket_job/sliced/output"
@@ -13,24 +14,5 @@ module RocketJob
      autoload :Input,  "rocket_job/sliced/writer/input"
      autoload :Output, "rocket_job/sliced/writer/output"
    end
-
-    # Returns [RocketJob::Sliced::Slices] for the relevant direction and category.
-    def self.factory(direction, category, job)
-      collection_name = "rocket_job.#{direction}s.#{job.id}"
-      collection_name << ".#{category.name}" unless category.name == :main
-
-      case direction
-      when :input
-        RocketJob::Sliced::Input.new(
-          collection_name: collection_name,
-          slice_class: category.serializer_class,
-          slice_size: category.slice_size
-        )
-      when :output
-        RocketJob::Sliced::Output.new(collection_name: collection_name, slice_class: category.serializer_class)
-      else
-        raise(ArgumentError, "Unknown direction: #{direction.inspect}")
-      end
-    end
  end
end
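
The removed factory documents how the sliced collections are named and wired; constructing one directly still uses the same pieces (taken from the removed code, with an illustrative slice size):

    input = RocketJob::Sliced::Input.new(
      collection_name: "rocket_job.inputs.#{job.id}",
      slice_class:     RocketJob::Sliced::CompressedSlice,
      slice_size:      100
    )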