rocketjob 6.0.0.rc3 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +24 -20
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  11. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  12. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  13. data/lib/rocket_job/jobs/on_demand_batch_job.rb +7 -5
  14. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  15. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  16. data/lib/rocket_job/plugins/cron.rb +60 -20
  17. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  18. data/lib/rocket_job/plugins/restart.rb +3 -110
  19. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  20. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +1 -2
  21. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  22. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  23. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  24. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  25. data/lib/rocket_job/sliced/input.rb +42 -54
  26. data/lib/rocket_job/sliced/slice.rb +7 -3
  27. data/lib/rocket_job/sliced/slices.rb +12 -9
  28. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  29. data/lib/rocket_job/sliced.rb +1 -19
  30. data/lib/rocket_job/version.rb +1 -1
  31. data/lib/rocketjob.rb +2 -2
  32. metadata +8 -10
  33. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  34. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  35. data/lib/rocket_job/batch/tabular.rb +0 -58
@@ -95,9 +95,13 @@ module RocketJob
95
95
  end
96
96
 
97
97
  # Returns whether this is a specialized binary slice for creating binary data from each slice
98
- # that is then just downloaded as-is into output files.
99
- def self.binary?
100
- false
98
+ # that is downloaded without conversion into output files.
99
+ def self.binary_format
100
+ end
101
+
102
+ # For binary formats only, format the supplied records into the binary format for this slice
103
+ def self.to_binary(_records)
104
+ raise NotImplementedError
101
105
  end
102
106
 
103
107
  # `records` array has special handling so that it can be modified in place instead of having
@@ -42,12 +42,6 @@ module RocketJob
42
42
  slice
43
43
  end
44
44
 
45
- # Returns whether this collection contains specialized binary slices for creating binary data from each slice
46
- # that is then just downloaded as-is into output files.
47
- def binary?
48
- slice_class.binary?
49
- end
50
-
51
45
  # Returns output slices in the order of their id
52
46
  # which is usually the order in which they were written.
53
47
  def each(&block)
@@ -96,6 +90,11 @@ module RocketJob
96
90
  slice
97
91
  end
98
92
 
93
+ def insert_many(slices)
94
+ documents = slices.collect(&:as_document)
95
+ all.collection.insert_many(documents)
96
+ end
97
+
99
98
  # Append to an existing slice if already present
100
99
  def append(slice, input_slice)
101
100
  existing_slice = all.where(id: input_slice.id).first
@@ -111,9 +110,13 @@ module RocketJob
111
110
 
112
111
  # Index for find_and_modify only if it is not already present
113
112
  def create_indexes
114
- all.collection.indexes.create_one(state: 1, _id: 1) if all.collection.indexes.none? { |i| i["name"] == "state_1__id_1" }
115
- rescue Mongo::Error::OperationFailure
116
- all.collection.indexes.create_one(state: 1, _id: 1)
113
+ missing =
114
+ begin
115
+ all.collection.indexes.none? { |i| i["name"] == "state_1__id_1" }
116
+ rescue Mongo::Error::OperationFailure
117
+ true
118
+ end
119
+ all.collection.indexes.create_one({state: 1, _id: 1}, unique: true) if missing
117
120
  end
118
121
 
119
122
  # Forward additional methods.
@@ -12,43 +12,71 @@ module RocketJob
12
12
  # Block to call on the first line only, instead of storing in the slice.
13
13
  # Useful for extracting the header row
14
14
  # Default: nil
15
- def self.collect(input, **args)
16
- writer = new(input, **args)
15
+ #
16
+ # slice_size: [Integer]
17
+ # Override the slice size, for example when uploading ranges, where slice is the size
18
+ # of the range itself.
19
+ #
20
+ # slice_batch_size: [Integer]
21
+ # The number of slices to batch up and to bulk load.
22
+ # For smaller slices this significantly improves upload performance.
23
+ # Note: If `slice_batch_size` is too high, it can exceed the maximum BSON block size.
24
+ def self.collect(data_store, **args)
25
+ writer = new(data_store, **args)
17
26
  yield(writer)
18
27
  writer.record_count
19
28
  ensure
20
- writer&.close
29
+ writer&.flush
21
30
  end
22
31
 
23
- def initialize(input, on_first: nil)
24
- @on_first = on_first
25
- @batch_count = 0
26
- @record_count = 0
27
- @input = input
28
- @record_number = 1
29
- @slice = @input.new(first_record_number: @record_number)
32
+ def initialize(data_store, on_first: nil, slice_size: nil, slice_batch_size: nil)
33
+ @on_first = on_first
34
+ @record_count = 0
35
+ @data_store = data_store
36
+ @slice_size = slice_size || @data_store.slice_size
37
+ @slice_batch_size = slice_batch_size || 20
38
+ @batch = []
39
+ @batch_count = 0
40
+ new_slice
30
41
  end
31
42
 
32
43
  def <<(line)
33
- @record_number += 1
34
44
  if @on_first
35
45
  @on_first.call(line)
36
46
  @on_first = nil
37
47
  return self
38
48
  end
39
49
  @slice << line
40
- @batch_count += 1
41
50
  @record_count += 1
42
- if @batch_count >= @input.slice_size
43
- @input.insert(@slice)
44
- @batch_count = 0
45
- @slice = @input.new(first_record_number: @record_number)
51
+ if @slice.size >= @slice_size
52
+ save_slice
53
+ new_slice
46
54
  end
47
55
  self
48
56
  end
49
57
 
50
- def close
51
- @input.insert(@slice) if @slice.size.positive?
58
+ def flush
59
+ if @slice_batch_size
60
+ @batch << @slice if @slice.size.positive?
61
+ @data_store.insert_many(@batch)
62
+ @batch = []
63
+ @batch_count = 0
64
+ elsif @slice.size.positive?
65
+ @data_store.insert(@slice)
66
+ end
67
+ end
68
+
69
+ def new_slice
70
+ @slice = @data_store.new(first_record_number: @record_count + 1)
71
+ end
72
+
73
+ def save_slice
74
+ return flush unless @slice_batch_size
75
+
76
+ @batch_count += 1
77
+ return flush if @batch_count >= @slice_batch_size
78
+
79
+ @batch << @slice
52
80
  end
53
81
  end
54
82
  end
@@ -2,6 +2,7 @@ module RocketJob
2
2
  module Sliced
3
3
  autoload :BZip2OutputSlice, "rocket_job/sliced/bzip2_output_slice"
4
4
  autoload :CompressedSlice, "rocket_job/sliced/compressed_slice"
5
+ autoload :EncryptedBZip2OutputSlice, "rocket_job/sliced/encrypted_bzip2_output_slice"
5
6
  autoload :EncryptedSlice, "rocket_job/sliced/encrypted_slice"
6
7
  autoload :Input, "rocket_job/sliced/input"
7
8
  autoload :Output, "rocket_job/sliced/output"
@@ -13,24 +14,5 @@ module RocketJob
13
14
  autoload :Input, "rocket_job/sliced/writer/input"
14
15
  autoload :Output, "rocket_job/sliced/writer/output"
15
16
  end
16
-
17
- # Returns [RocketJob::Sliced::Slices] for the relevant direction and category.
18
- def self.factory(direction, category, job)
19
- collection_name = "rocket_job.#{direction}s.#{job.id}"
20
- collection_name << ".#{category.name}" unless category.name == :main
21
-
22
- case direction
23
- when :input
24
- RocketJob::Sliced::Input.new(
25
- collection_name: collection_name,
26
- slice_class: category.serializer_class,
27
- slice_size: category.slice_size
28
- )
29
- when :output
30
- RocketJob::Sliced::Output.new(collection_name: collection_name, slice_class: category.serializer_class)
31
- else
32
- raise(ArgumentError, "Unknown direction: #{direction.inspect}")
33
- end
34
- end
35
17
  end
36
18
  end
@@ -1,3 +1,3 @@
1
1
  module RocketJob
2
- VERSION = "6.0.0.rc3".freeze
2
+ VERSION = "6.0.0".freeze
3
3
  end
data/lib/rocketjob.rb CHANGED
@@ -63,7 +63,6 @@ module RocketJob
63
63
  autoload :Cron, "rocket_job/plugins/cron"
64
64
  autoload :Document, "rocket_job/plugins/document"
65
65
  autoload :ProcessingWindow, "rocket_job/plugins/processing_window"
66
- autoload :Restart, "rocket_job/plugins/restart"
67
66
  autoload :Retry, "rocket_job/plugins/retry"
68
67
  autoload :Singleton, "rocket_job/plugins/singleton"
69
68
  autoload :StateMachine, "rocket_job/plugins/state_machine"
@@ -73,11 +72,12 @@ module RocketJob
73
72
 
74
73
  module Jobs
75
74
  autoload :ActiveJob, "rocket_job/jobs/active_job"
75
+ autoload :ConversionJob, "rocket_job/jobs/conversion_job"
76
76
  autoload :CopyFileJob, "rocket_job/jobs/copy_file_job"
77
77
  autoload :DirmonJob, "rocket_job/jobs/dirmon_job"
78
+ autoload :HousekeepingJob, "rocket_job/jobs/housekeeping_job"
78
79
  autoload :OnDemandBatchJob, "rocket_job/jobs/on_demand_batch_job"
79
80
  autoload :OnDemandJob, "rocket_job/jobs/on_demand_job"
80
- autoload :HousekeepingJob, "rocket_job/jobs/housekeeping_job"
81
81
  autoload :PerformanceJob, "rocket_job/jobs/performance_job"
82
82
  autoload :SimpleJob, "rocket_job/jobs/simple_job"
83
83
  autoload :UploadFileJob, "rocket_job/jobs/upload_file_job"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rocketjob
3
3
  version: !ruby/object:Gem::Version
4
- version: 6.0.0.rc3
4
+ version: 6.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Reid Morrison
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-06-23 00:00:00.000000000 Z
11
+ date: 2021-08-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: aasm
@@ -58,14 +58,14 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '1.6'
61
+ version: '1.9'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '1.6'
68
+ version: '1.9'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: mongoid
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -134,9 +134,6 @@ files:
134
134
  - lib/rocket_job/batch/results.rb
135
135
  - lib/rocket_job/batch/state_machine.rb
136
136
  - lib/rocket_job/batch/statistics.rb
137
- - lib/rocket_job/batch/tabular.rb
138
- - lib/rocket_job/batch/tabular/input.rb
139
- - lib/rocket_job/batch/tabular/output.rb
140
137
  - lib/rocket_job/batch/throttle.rb
141
138
  - lib/rocket_job/batch/throttle_running_workers.rb
142
139
  - lib/rocket_job/batch/throttle_windows.rb
@@ -198,6 +195,7 @@ files:
198
195
  - lib/rocket_job/sliced.rb
199
196
  - lib/rocket_job/sliced/bzip2_output_slice.rb
200
197
  - lib/rocket_job/sliced/compressed_slice.rb
198
+ - lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb
201
199
  - lib/rocket_job/sliced/encrypted_slice.rb
202
200
  - lib/rocket_job/sliced/input.rb
203
201
  - lib/rocket_job/sliced/output.rb
@@ -233,11 +231,11 @@ required_ruby_version: !ruby/object:Gem::Requirement
233
231
  version: '2.5'
234
232
  required_rubygems_version: !ruby/object:Gem::Requirement
235
233
  requirements:
236
- - - ">"
234
+ - - ">="
237
235
  - !ruby/object:Gem::Version
238
- version: 1.3.1
236
+ version: '0'
239
237
  requirements: []
240
- rubygems_version: 3.2.15
238
+ rubygems_version: 3.2.22
241
239
  signing_key:
242
240
  specification_version: 4
243
241
  summary: Ruby's missing batch processing system.
@@ -1,133 +0,0 @@
1
- require "active_support/concern"
2
-
3
- module RocketJob
4
- module Batch
5
- class Tabular
6
- # @deprecated
7
- module Input
8
- extend ActiveSupport::Concern
9
-
10
- included do
11
- warn "#{name} is using RocketJob::Batch::Tabular::Input which is deprecated"
12
-
13
- field :tabular_input_header, type: Array, class_attribute: true, user_editable: true
14
- field :tabular_input_format, type: Mongoid::StringifiedSymbol, default: :csv, class_attribute: true, user_editable: true
15
- field :tabular_input_options, type: Hash, class_attribute: true
16
-
17
- # tabular_input_mode: [:line | :array | :hash]
18
- # :line
19
- # Uploads the file a line (String) at a time for processing by workers.
20
- # :array
21
- # Parses each line from the file as an Array and uploads each array for processing by workers.
22
- # :hash
23
- # Parses each line from the file into a Hash and uploads each hash for processing by workers.
24
- # See IOStreams#each.
25
- field :tabular_input_mode, type: Mongoid::StringifiedSymbol, default: :line, class_attribute: true, user_editable: true, copy_on_restart: true
26
-
27
- validates_inclusion_of :tabular_input_format, in: IOStreams::Tabular.registered_formats
28
- validates_inclusion_of :tabular_input_mode, in: %i[line array hash row record]
29
- validate :tabular_input_header_present
30
-
31
- class_attribute :tabular_input_white_list
32
- class_attribute :tabular_input_required
33
- class_attribute :tabular_input_skip_unknown
34
-
35
- # Cleanse all uploaded data by removing non-printable characters
36
- # and any characters that cannot be converted to UTF-8
37
- class_attribute :tabular_input_type
38
-
39
- self.tabular_input_white_list = nil
40
- self.tabular_input_required = nil
41
- self.tabular_input_skip_unknown = true
42
- self.tabular_input_type = :text
43
-
44
- before_perform :tabular_input_render
45
- end
46
-
47
- # Extract the header line during the upload.
48
- #
49
- # Overrides: RocketJob::Batch::IO#upload
50
- #
51
- # Notes:
52
- # - When supplying a block the header must be set manually
53
- def upload(stream = nil, **args, &block)
54
- input_stream = stream.nil? ? nil : IOStreams.new(stream)
55
-
56
- if stream && (tabular_input_type == :text)
57
- # Cannot change the length of fixed width lines
58
- replace = tabular_input_format == :fixed ? " " : ""
59
- input_stream.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
60
- end
61
-
62
- # If an input header is not required, then we don't extract it.
63
- return super(input_stream, stream_mode: tabular_input_mode, **args, &block) unless tabular_input.header?
64
-
65
- # If the header is already set then it is not expected in the file
66
- if tabular_input_header.present?
67
- tabular_input_cleanse_header
68
- return super(input_stream, stream_mode: tabular_input_mode, **args, &block)
69
- end
70
-
71
- case tabular_input_mode
72
- when :line
73
- parse_header = lambda do |line|
74
- tabular_input.parse_header(line)
75
- tabular_input_cleanse_header
76
- self.tabular_input_header = tabular_input.header.columns
77
- end
78
- super(input_stream, on_first: parse_header, stream_mode: :line, **args, &block)
79
- when :array, :row
80
- set_header = lambda do |row|
81
- tabular_input.header.columns = row
82
- tabular_input_cleanse_header
83
- self.tabular_input_header = tabular_input.header.columns
84
- end
85
- super(input_stream, on_first: set_header, stream_mode: :array, **args, &block)
86
- when :hash, :record
87
- super(input_stream, stream_mode: :hash, **args, &block)
88
- else
89
- raise(ArgumentError, "Invalid tabular_input_mode: #{stream_mode.inspect}")
90
- end
91
- end
92
-
93
- private
94
-
95
- # Shared instance used for this slice, by a single worker (thread)
96
- def tabular_input
97
- @tabular_input ||= IOStreams::Tabular.new(
98
- columns: tabular_input_header,
99
- allowed_columns: tabular_input_white_list,
100
- required_columns: tabular_input_required,
101
- skip_unknown: tabular_input_skip_unknown,
102
- format: tabular_input_format,
103
- format_options: tabular_input_options&.deep_symbolize_keys
104
- )
105
- end
106
-
107
- def tabular_input_render
108
- return if tabular_input_header.blank? && tabular_input.header?
109
-
110
- @rocket_job_input = tabular_input.record_parse(@rocket_job_input)
111
- end
112
-
113
- # Cleanse custom input header if supplied.
114
- def tabular_input_cleanse_header
115
- ignored_columns = tabular_input.header.cleanse!
116
- logger.warn("Stripped out invalid columns from custom header", ignored_columns) unless ignored_columns.empty?
117
-
118
- self.tabular_input_header = tabular_input.header.columns
119
- end
120
-
121
- def tabular_input_header_present
122
- if tabular_input_header.present? ||
123
- !tabular_input.header? ||
124
- (tabular_input_mode == :hash || tabular_input_mode == :record)
125
- return
126
- end
127
-
128
- errors.add(:tabular_input_header, "is required when tabular_input_format is #{tabular_input_format.inspect}")
129
- end
130
- end
131
- end
132
- end
133
- end
@@ -1,67 +0,0 @@
1
- require "active_support/concern"
2
-
3
- module RocketJob
4
- module Batch
5
- class Tabular
6
- # For the simple case where all `output_categories` have the same format.
7
- # If multiple output categories are used with different formats, then use IOStreams::Tabular directly
8
- # instead of this plugin.
9
- module Output
10
- extend ActiveSupport::Concern
11
-
12
- included do
13
- warn "#{name} is using RocketJob::Batch::Tabular::Output which is deprecated"
14
-
15
- field :tabular_output_header, type: Array, class_attribute: true, user_editable: true, copy_on_restart: true
16
- field :tabular_output_format, type: Mongoid::StringifiedSymbol, default: :csv, class_attribute: true, user_editable: true, copy_on_restart: true
17
- field :tabular_output_options, type: Hash, class_attribute: true
18
-
19
- validates_inclusion_of :tabular_output_format, in: IOStreams::Tabular.registered_formats
20
-
21
- after_perform :tabular_output_render
22
- end
23
-
24
- # Clear out cached tabular_output any time header or format is changed.
25
- def tabular_output_header=(tabular_output_header)
26
- super(tabular_output_header)
27
- @tabular_output = nil
28
- end
29
-
30
- def tabular_output_format=(tabular_output_format)
31
- super(tabular_output_format)
32
- @tabular_output = nil
33
- end
34
-
35
- # Overrides: `RocketJob::Batch::IO#download` to add the `tabular_output_header`.
36
- def download(file_name_or_io = nil, category: :main, **args, &block)
37
- unless tabular_output.requires_header?(category)
38
- return super(file_name_or_io, category: category, **args, &block)
39
- end
40
-
41
- header = tabular_output.render_header(category)
42
- super(file_name_or_io, header_line: header, category: category, **args, &block)
43
- end
44
-
45
- private
46
-
47
- # Delimited instance used for this slice, by a single worker (thread)
48
- def tabular_output
49
- @tabular_output ||= Tabular.new(
50
- main: IOStreams::Tabular.new(
51
- columns: tabular_output_header,
52
- format: tabular_output_format,
53
- format_options: tabular_output_options&.deep_symbolize_keys
54
- )
55
- )
56
- end
57
-
58
- # Render the output from the perform.
59
- def tabular_output_render
60
- return unless output_categories.present?
61
-
62
- @rocket_job_output = tabular_output.render(@rocket_job_output)
63
- end
64
- end
65
- end
66
- end
67
- end