rocketjob 6.0.0.rc3 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +24 -20
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  11. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  12. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  13. data/lib/rocket_job/jobs/on_demand_batch_job.rb +7 -5
  14. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  15. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  16. data/lib/rocket_job/plugins/cron.rb +60 -20
  17. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  18. data/lib/rocket_job/plugins/restart.rb +3 -110
  19. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  20. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +1 -2
  21. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  22. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  23. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  24. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  25. data/lib/rocket_job/sliced/input.rb +42 -54
  26. data/lib/rocket_job/sliced/slice.rb +7 -3
  27. data/lib/rocket_job/sliced/slices.rb +12 -9
  28. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  29. data/lib/rocket_job/sliced.rb +1 -19
  30. data/lib/rocket_job/version.rb +1 -1
  31. data/lib/rocketjob.rb +2 -2
  32. metadata +8 -10
  33. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  34. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  35. data/lib/rocket_job/batch/tabular.rb +0 -58
data/lib/rocket_job/sliced/slice.rb CHANGED
@@ -95,9 +95,13 @@ module RocketJob
       end

       # Returns whether this is a specialized binary slice for creating binary data from each slice
-      # that is then just downloaded as-is into output files.
-      def self.binary?
-        false
+      # that is downloaded without conversion into output files.
+      def self.binary_format
+      end
+
+      # For binary formats only, format the supplied records into the binary format for this slice
+      def self.to_binary(_records)
+        raise NotImplementedError
       end

       # `records` array has special handling so that it can be modified in place instead of having
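Note: the boolean `binary?` flag is replaced above by two class-level hooks, `binary_format` and `to_binary`. A minimal, hypothetical sketch of a custom output slice built on these hooks; the class name and the gzip format are illustrative only and not part of the gem:

    require "zlib"

    # Hypothetical slice that emits gzip-compressed binary output.
    class GzipOutputSlice < RocketJob::Sliced::Slice
      # Marks this slice as binary; the symbol is assumed to describe the
      # binary file format this slice produces.
      def self.binary_format
        :gz
      end

      # Collapse the slice's records into a single binary blob that can be
      # written to the output file as-is.
      def self.to_binary(records)
        Zlib.gzip(Array(records).join("\n") + "\n")
      end
    end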
data/lib/rocket_job/sliced/slices.rb CHANGED
@@ -42,12 +42,6 @@ module RocketJob
        slice
      end

-      # Returns whether this collection contains specialized binary slices for creating binary data from each slice
-      # that is then just downloaded as-is into output files.
-      def binary?
-        slice_class.binary?
-      end
-
      # Returns output slices in the order of their id
      # which is usually the order in which they were written.
      def each(&block)
@@ -96,6 +90,11 @@ module RocketJob
        slice
      end

+      def insert_many(slices)
+        documents = slices.collect(&:as_document)
+        all.collection.insert_many(documents)
+      end
+
      # Append to an existing slice if already present
      def append(slice, input_slice)
        existing_slice = all.where(id: input_slice.id).first
@@ -111,9 +110,13 @@ module RocketJob

      # Index for find_and_modify only if it is not already present
      def create_indexes
-        all.collection.indexes.create_one(state: 1, _id: 1) if all.collection.indexes.none? { |i| i["name"] == "state_1__id_1" }
-      rescue Mongo::Error::OperationFailure
-        all.collection.indexes.create_one(state: 1, _id: 1)
+        missing =
+          begin
+            all.collection.indexes.none? { |i| i["name"] == "state_1__id_1" }
+          rescue Mongo::Error::OperationFailure
+            true
+          end
+        all.collection.indexes.create_one({state: 1, _id: 1}, unique: true) if missing
      end

      # Forward additional methods.
data/lib/rocket_job/sliced/writer/input.rb CHANGED
@@ -12,43 +12,71 @@ module RocketJob
        # Block to call on the first line only, instead of storing in the slice.
        # Useful for extracting the header row
        # Default: nil
-        def self.collect(input, **args)
-          writer = new(input, **args)
+        #
+        # slice_size: [Integer]
+        #   Override the slice size when uploading for example ranges, where slice is the size
+        #   of the range itself.
+        #
+        # slice_batch_size: [Integer]
+        #   The number of slices to batch up and to bulk load.
+        #   For smaller slices this significantly improves upload performance.
+        #   Note: If `slice_batch_size` is too high, it can exceed the maximum BSON block size.
+        def self.collect(data_store, **args)
+          writer = new(data_store, **args)
          yield(writer)
          writer.record_count
        ensure
-          writer&.close
+          writer&.flush
        end

-        def initialize(input, on_first: nil)
-          @on_first      = on_first
-          @batch_count   = 0
-          @record_count  = 0
-          @input         = input
-          @record_number = 1
-          @slice         = @input.new(first_record_number: @record_number)
+        def initialize(data_store, on_first: nil, slice_size: nil, slice_batch_size: nil)
+          @on_first         = on_first
+          @record_count     = 0
+          @data_store       = data_store
+          @slice_size       = slice_size || @data_store.slice_size
+          @slice_batch_size = slice_batch_size || 20
+          @batch            = []
+          @batch_count      = 0
+          new_slice
        end

        def <<(line)
-          @record_number += 1
          if @on_first
            @on_first.call(line)
            @on_first = nil
            return self
          end
          @slice << line
-          @batch_count += 1
          @record_count += 1
-          if @batch_count >= @input.slice_size
-            @input.insert(@slice)
-            @batch_count = 0
-            @slice       = @input.new(first_record_number: @record_number)
+          if @slice.size >= @slice_size
+            save_slice
+            new_slice
          end
          self
        end

-        def close
-          @input.insert(@slice) if @slice.size.positive?
+        def flush
+          if @slice_batch_size
+            @batch << @slice if @slice.size.positive?
+            @data_store.insert_many(@batch)
+            @batch       = []
+            @batch_count = 0
+          elsif @slice.size.positive?
+            @data_store.insert(@slice)
+          end
+        end
+
+        def new_slice
+          @slice = @data_store.new(first_record_number: @record_count + 1)
+        end
+
+        def save_slice
+          return flush unless @slice_batch_size
+
+          @batch_count += 1
+          return flush if @batch_count >= @slice_batch_size
+
+          @batch << @slice
        end
      end
    end
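Note: the writer now accumulates slices and bulk-inserts them through `insert_many` (added to `Sliced::Slices` above), flushing whenever `slice_batch_size` slices have built up. A usage sketch, assuming `job.input` returns the job's `RocketJob::Sliced::Input` data store as it does for batch jobs, and `lines` is any enumerable of records:

    record_count = RocketJob::Sliced::Writer::Input.collect(
      job.input,              # Data store that receives the slices.
      slice_batch_size: 100   # Bulk insert 100 slices per Mongo round trip.
    ) do |writer|
      lines.each { |line| writer << line }
    end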
data/lib/rocket_job/sliced.rb CHANGED
@@ -2,6 +2,7 @@ module RocketJob
   module Sliced
     autoload :BZip2OutputSlice, "rocket_job/sliced/bzip2_output_slice"
     autoload :CompressedSlice, "rocket_job/sliced/compressed_slice"
+    autoload :EncryptedBZip2OutputSlice, "rocket_job/sliced/encrypted_bzip2_output_slice"
     autoload :EncryptedSlice, "rocket_job/sliced/encrypted_slice"
     autoload :Input, "rocket_job/sliced/input"
     autoload :Output, "rocket_job/sliced/output"
@@ -13,24 +14,5 @@ module RocketJob
       autoload :Input, "rocket_job/sliced/writer/input"
       autoload :Output, "rocket_job/sliced/writer/output"
     end
-
-    # Returns [RocketJob::Sliced::Slices] for the relevant direction and category.
-    def self.factory(direction, category, job)
-      collection_name = "rocket_job.#{direction}s.#{job.id}"
-      collection_name << ".#{category.name}" unless category.name == :main
-
-      case direction
-      when :input
-        RocketJob::Sliced::Input.new(
-          collection_name: collection_name,
-          slice_class: category.serializer_class,
-          slice_size: category.slice_size
-        )
-      when :output
-        RocketJob::Sliced::Output.new(collection_name: collection_name, slice_class: category.serializer_class)
-      else
-        raise(ArgumentError, "Unknown direction: #{direction.inspect}")
-      end
-    end
   end
 end
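Note: with `Sliced.factory` removed, an input slice collection can still be constructed directly with the same arguments the factory used to assemble; in 6.0.0 this wiring appears to move into the category classes changed above. A sketch with illustrative values:

    input = RocketJob::Sliced::Input.new(
      collection_name: "rocket_job.inputs.#{job.id}", # Per-job collection name, as the factory built it.
      slice_class:     RocketJob::Sliced::CompressedSlice,
      slice_size:      100
    )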
data/lib/rocket_job/version.rb CHANGED
@@ -1,3 +1,3 @@
 module RocketJob
-  VERSION = "6.0.0.rc3".freeze
+  VERSION = "6.0.0".freeze
 end
data/lib/rocketjob.rb CHANGED
@@ -63,7 +63,6 @@ module RocketJob
     autoload :Cron, "rocket_job/plugins/cron"
     autoload :Document, "rocket_job/plugins/document"
     autoload :ProcessingWindow, "rocket_job/plugins/processing_window"
-    autoload :Restart, "rocket_job/plugins/restart"
     autoload :Retry, "rocket_job/plugins/retry"
     autoload :Singleton, "rocket_job/plugins/singleton"
     autoload :StateMachine, "rocket_job/plugins/state_machine"
@@ -73,11 +72,12 @@ module RocketJob

   module Jobs
     autoload :ActiveJob, "rocket_job/jobs/active_job"
+    autoload :ConversionJob, "rocket_job/jobs/conversion_job"
     autoload :CopyFileJob, "rocket_job/jobs/copy_file_job"
     autoload :DirmonJob, "rocket_job/jobs/dirmon_job"
+    autoload :HousekeepingJob, "rocket_job/jobs/housekeeping_job"
     autoload :OnDemandBatchJob, "rocket_job/jobs/on_demand_batch_job"
     autoload :OnDemandJob, "rocket_job/jobs/on_demand_job"
-    autoload :HousekeepingJob, "rocket_job/jobs/housekeeping_job"
     autoload :PerformanceJob, "rocket_job/jobs/performance_job"
     autoload :SimpleJob, "rocket_job/jobs/simple_job"
     autoload :UploadFileJob, "rocket_job/jobs/upload_file_job"
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rocketjob
 version: !ruby/object:Gem::Version
-  version: 6.0.0.rc3
+  version: 6.0.0
 platform: ruby
 authors:
 - Reid Morrison
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-06-23 00:00:00.000000000 Z
+date: 2021-08-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: aasm
@@ -58,14 +58,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.6'
+        version: '1.9'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.6'
+        version: '1.9'
 - !ruby/object:Gem::Dependency
   name: mongoid
   requirement: !ruby/object:Gem::Requirement
@@ -134,9 +134,6 @@ files:
 - lib/rocket_job/batch/results.rb
 - lib/rocket_job/batch/state_machine.rb
 - lib/rocket_job/batch/statistics.rb
-- lib/rocket_job/batch/tabular.rb
-- lib/rocket_job/batch/tabular/input.rb
-- lib/rocket_job/batch/tabular/output.rb
 - lib/rocket_job/batch/throttle.rb
 - lib/rocket_job/batch/throttle_running_workers.rb
 - lib/rocket_job/batch/throttle_windows.rb
@@ -198,6 +195,7 @@ files:
 - lib/rocket_job/sliced.rb
 - lib/rocket_job/sliced/bzip2_output_slice.rb
 - lib/rocket_job/sliced/compressed_slice.rb
+- lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb
 - lib/rocket_job/sliced/encrypted_slice.rb
 - lib/rocket_job/sliced/input.rb
 - lib/rocket_job/sliced/output.rb
@@ -233,11 +231,11 @@ required_ruby_version: !ruby/object:Gem::Requirement
     version: '2.5'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">"
+  - - ">="
     - !ruby/object:Gem::Version
-      version: 1.3.1
+      version: '0'
 requirements: []
-rubygems_version: 3.2.15
+rubygems_version: 3.2.22
 signing_key:
 specification_version: 4
 summary: Ruby's missing batch processing system.
data/lib/rocket_job/batch/tabular/input.rb DELETED
@@ -1,133 +0,0 @@
-require "active_support/concern"
-
-module RocketJob
-  module Batch
-    class Tabular
-      # @deprecated
-      module Input
-        extend ActiveSupport::Concern
-
-        included do
-          warn "#{name} is using RocketJob::Batch::Tabular::Input which is deprecated"
-
-          field :tabular_input_header, type: Array, class_attribute: true, user_editable: true
-          field :tabular_input_format, type: Mongoid::StringifiedSymbol, default: :csv, class_attribute: true, user_editable: true
-          field :tabular_input_options, type: Hash, class_attribute: true
-
-          # tabular_input_mode: [:line | :array | :hash]
-          #   :line
-          #     Uploads the file a line (String) at a time for processing by workers.
-          #   :array
-          #     Parses each line from the file as an Array and uploads each array for processing by workers.
-          #   :hash
-          #     Parses each line from the file into a Hash and uploads each hash for processing by workers.
-          # See IOStreams#each.
-          field :tabular_input_mode, type: Mongoid::StringifiedSymbol, default: :line, class_attribute: true, user_editable: true, copy_on_restart: true
-
-          validates_inclusion_of :tabular_input_format, in: IOStreams::Tabular.registered_formats
-          validates_inclusion_of :tabular_input_mode, in: %i[line array hash row record]
-          validate :tabular_input_header_present
-
-          class_attribute :tabular_input_white_list
-          class_attribute :tabular_input_required
-          class_attribute :tabular_input_skip_unknown
-
-          # Cleanse all uploaded data by removing non-printable characters
-          # and any characters that cannot be converted to UTF-8
-          class_attribute :tabular_input_type
-
-          self.tabular_input_white_list = nil
-          self.tabular_input_required = nil
-          self.tabular_input_skip_unknown = true
-          self.tabular_input_type = :text
-
-          before_perform :tabular_input_render
-        end
-
-        # Extract the header line during the upload.
-        #
-        # Overrides: RocketJob::Batch::IO#upload
-        #
-        # Notes:
-        # - When supplying a block the header must be set manually
-        def upload(stream = nil, **args, &block)
-          input_stream = stream.nil? ? nil : IOStreams.new(stream)
-
-          if stream && (tabular_input_type == :text)
-            # Cannot change the length of fixed width lines
-            replace = tabular_input_format == :fixed ? " " : ""
-            input_stream.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
-          end
-
-          # If an input header is not required, then we don't extract it'
-          return super(input_stream, stream_mode: tabular_input_mode, **args, &block) unless tabular_input.header?
-
-          # If the header is already set then it is not expected in the file
-          if tabular_input_header.present?
-            tabular_input_cleanse_header
-            return super(input_stream, stream_mode: tabular_input_mode, **args, &block)
-          end
-
-          case tabular_input_mode
-          when :line
-            parse_header = lambda do |line|
-              tabular_input.parse_header(line)
-              tabular_input_cleanse_header
-              self.tabular_input_header = tabular_input.header.columns
-            end
-            super(input_stream, on_first: parse_header, stream_mode: :line, **args, &block)
-          when :array, :row
-            set_header = lambda do |row|
-              tabular_input.header.columns = row
-              tabular_input_cleanse_header
-              self.tabular_input_header = tabular_input.header.columns
-            end
-            super(input_stream, on_first: set_header, stream_mode: :array, **args, &block)
-          when :hash, :record
-            super(input_stream, stream_mode: :hash, **args, &block)
-          else
-            raise(ArgumentError, "Invalid tabular_input_mode: #{stream_mode.inspect}")
-          end
-        end
-
-        private
-
-        # Shared instance used for this slice, by a single worker (thread)
-        def tabular_input
-          @tabular_input ||= IOStreams::Tabular.new(
-            columns: tabular_input_header,
-            allowed_columns: tabular_input_white_list,
-            required_columns: tabular_input_required,
-            skip_unknown: tabular_input_skip_unknown,
-            format: tabular_input_format,
-            format_options: tabular_input_options&.deep_symbolize_keys
-          )
-        end
-
-        def tabular_input_render
-          return if tabular_input_header.blank? && tabular_input.header?
-
-          @rocket_job_input = tabular_input.record_parse(@rocket_job_input)
-        end
-
-        # Cleanse custom input header if supplied.
-        def tabular_input_cleanse_header
-          ignored_columns = tabular_input.header.cleanse!
-          logger.warn("Stripped out invalid columns from custom header", ignored_columns) unless ignored_columns.empty?
-
-          self.tabular_input_header = tabular_input.header.columns
-        end
-
-        def tabular_input_header_present
-          if tabular_input_header.present? ||
-             !tabular_input.header? ||
-             (tabular_input_mode == :hash || tabular_input_mode == :record)
-            return
-          end
-
-          errors.add(:tabular_input_header, "is required when tabular_input_format is #{tabular_input_format.inspect}")
-        end
-      end
-    end
-  end
-end
data/lib/rocket_job/batch/tabular/output.rb DELETED
@@ -1,67 +0,0 @@
-require "active_support/concern"
-
-module RocketJob
-  module Batch
-    class Tabular
-      # For the simple case where all `output_categories` have the same format,
-      # If multiple output categories are used with different formats, then use IOStreams::Tabular directly
-      # instead of this plugin.
-      module Output
-        extend ActiveSupport::Concern
-
-        included do
-          warn "#{name} is using RocketJob::Batch::Tabular::Output which is deprecated"
-
-          field :tabular_output_header, type: Array, class_attribute: true, user_editable: true, copy_on_restart: true
-          field :tabular_output_format, type: Mongoid::StringifiedSymbol, default: :csv, class_attribute: true, user_editable: true, copy_on_restart: true
-          field :tabular_output_options, type: Hash, class_attribute: true
-
-          validates_inclusion_of :tabular_output_format, in: IOStreams::Tabular.registered_formats
-
-          after_perform :tabular_output_render
-        end
-
-        # Clear out cached tabular_output any time header or format is changed.
-        def tabular_output_header=(tabular_output_header)
-          super(tabular_output_header)
-          @tabular_output = nil
-        end
-
-        def tabular_output_format=(tabular_output_format)
-          super(tabular_output_format)
-          @tabular_output = nil
-        end
-
-        # Overrides: `RocketJob::Batch::IO#download` to add the `tabular_output_header`.
-        def download(file_name_or_io = nil, category: :main, **args, &block)
-          unless tabular_output.requires_header?(category)
-            return super(file_name_or_io, category: category, **args, &block)
-          end
-
-          header = tabular_output.render_header(category)
-          super(file_name_or_io, header_line: header, category: category, **args, &block)
-        end
-
-        private
-
-        # Delimited instance used for this slice, by a single worker (thread)
-        def tabular_output
-          @tabular_output ||= Tabular.new(
-            main: IOStreams::Tabular.new(
-              columns: tabular_output_header,
-              format: tabular_output_format,
-              format_options: tabular_output_options&.deep_symbolize_keys
-            )
-          )
-        end
-
-        # Render the output from the perform.
-        def tabular_output_render
-          return unless output_categories.present?
-
-          @rocket_job_output = tabular_output.render(@rocket_job_output)
-        end
-      end
-    end
-  end
-end