rocketjob 5.4.1 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +4 -4
  2. data/README.md +175 -5
  3. data/bin/rocketjob_batch_perf +1 -1
  4. data/bin/rocketjob_perf +1 -1
  5. data/lib/rocket_job/batch/categories.rb +345 -0
  6. data/lib/rocket_job/batch/io.rb +174 -106
  7. data/lib/rocket_job/batch/model.rb +20 -68
  8. data/lib/rocket_job/batch/performance.rb +19 -7
  9. data/lib/rocket_job/batch/statistics.rb +34 -12
  10. data/lib/rocket_job/batch/throttle_running_workers.rb +2 -6
  11. data/lib/rocket_job/batch/worker.rb +31 -26
  12. data/lib/rocket_job/batch.rb +3 -1
  13. data/lib/rocket_job/category/base.rb +81 -0
  14. data/lib/rocket_job/category/input.rb +170 -0
  15. data/lib/rocket_job/category/output.rb +34 -0
  16. data/lib/rocket_job/cli.rb +25 -17
  17. data/lib/rocket_job/dirmon_entry.rb +23 -13
  18. data/lib/rocket_job/event.rb +1 -1
  19. data/lib/rocket_job/extensions/iostreams/path.rb +32 -0
  20. data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
  21. data/lib/rocket_job/extensions/mongoid/factory.rb +4 -12
  22. data/lib/rocket_job/extensions/mongoid/stringified_symbol.rb +50 -0
  23. data/lib/rocket_job/extensions/psych/yaml_tree.rb +8 -0
  24. data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
  25. data/lib/rocket_job/jobs/conversion_job.rb +43 -0
  26. data/lib/rocket_job/jobs/dirmon_job.rb +25 -36
  27. data/lib/rocket_job/jobs/housekeeping_job.rb +11 -12
  28. data/lib/rocket_job/jobs/on_demand_batch_job.rb +24 -11
  29. data/lib/rocket_job/jobs/on_demand_job.rb +3 -4
  30. data/lib/rocket_job/jobs/performance_job.rb +3 -1
  31. data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -96
  32. data/lib/rocket_job/jobs/upload_file_job.rb +48 -8
  33. data/lib/rocket_job/lookup_collection.rb +69 -0
  34. data/lib/rocket_job/plugins/cron.rb +60 -20
  35. data/lib/rocket_job/plugins/job/model.rb +25 -50
  36. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  37. data/lib/rocket_job/plugins/job/throttle.rb +2 -2
  38. data/lib/rocket_job/plugins/job/throttle_running_jobs.rb +1 -1
  39. data/lib/rocket_job/plugins/job/worker.rb +2 -7
  40. data/lib/rocket_job/plugins/restart.rb +3 -103
  41. data/lib/rocket_job/plugins/state_machine.rb +4 -3
  42. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +37 -0
  43. data/lib/rocket_job/ractor_worker.rb +42 -0
  44. data/lib/rocket_job/server/model.rb +1 -1
  45. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  46. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  47. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  48. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  49. data/lib/rocket_job/sliced/input.rb +42 -54
  50. data/lib/rocket_job/sliced/slice.rb +12 -16
  51. data/lib/rocket_job/sliced/slices.rb +26 -11
  52. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  53. data/lib/rocket_job/sliced/writer/output.rb +33 -45
  54. data/lib/rocket_job/sliced.rb +1 -74
  55. data/lib/rocket_job/subscribers/server.rb +1 -1
  56. data/lib/rocket_job/thread_worker.rb +46 -0
  57. data/lib/rocket_job/throttle_definitions.rb +7 -1
  58. data/lib/rocket_job/version.rb +1 -1
  59. data/lib/rocket_job/worker.rb +21 -55
  60. data/lib/rocket_job/worker_pool.rb +5 -7
  61. data/lib/rocketjob.rb +53 -43
  62. metadata +36 -28
  63. data/lib/rocket_job/batch/tabular/input.rb +0 -131
  64. data/lib/rocket_job/batch/tabular/output.rb +0 -65
  65. data/lib/rocket_job/batch/tabular.rb +0 -56
  66. data/lib/rocket_job/extensions/mongoid/remove_warnings.rb +0 -12
  67. data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb +0 -28

data/lib/rocket_job/plugins/state_machine.rb
@@ -36,8 +36,8 @@ module RocketJob
         raise(ArgumentError, "Cannot supply both a method name and a block") if methods.size.positive? && block
         raise(ArgumentError, "Must supply either a method name or a block") unless methods.size.positive? || block
 
-        # TODO: Somehow get AASM to support options such as :if and :unless to be consistent with other callbacks
-        # For example:
+        # Limitation with AASM. It only supports guards on event transitions, not for callbacks.
+        # For example, AASM does not support callback options such as :if and :unless, yet Rails callbacks do.
         #   before_start :my_callback, unless: :encrypted?
         #   before_start :my_callback, if: :encrypted?
         event = aasm.state_machine.events[event_name]
@@ -51,7 +51,8 @@ module RocketJob
         # Validate methods are any of Symbol, String, Proc
         methods.each do |method|
           unless method.is_a?(Symbol) || method.is_a?(String)
-            raise(ArgumentError, "#{action}_#{event_name} currently does not support any options. Only Symbol and String method names can be supplied.")
+            raise(ArgumentError,
+                  "#{action}_#{event_name} currently does not support any options. Only Symbol and String method names can be supplied.")
          end
        end
        methods
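
In practice the guard above only rejects callbacks that pass options; bare method names and blocks are unaffected. An illustrative contrast (hypothetical job class):

    class MyJob < RocketJob::Job
      before_start :my_callback                    # supported: Symbol method name
      before_start { logger.info("starting") }     # supported: block
      # before_start :my_callback, if: :encrypted? # raises ArgumentError, per the AASM limitation above
    end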

data/lib/rocket_job/plugins/throttle_dependent_jobs.rb
@@ -0,0 +1,37 @@
+require "active_support/concern"
+module RocketJob
+  module Plugins
+    # Prevent this job, or a batch slice of this job, from starting while any of its dependent jobs are running.
+    #
+    # Features:
+    # - Ensures dependent jobs won't run.
+    #   When the throttle has been exceeded, all jobs of this class will be ignored until the
+    #   next refresh: `RocketJob::Config::re_check_seconds`, which by default is 60 seconds.
+    module ThrottleDependentJobs
+      extend ActiveSupport::Concern
+
+      included do
+        field :dependent_jobs, type: Array, class_attribute: true, user_editable: true, copy_on_restart: true
+
+        define_throttle :dependent_job_exists?
+        define_batch_throttle :dependent_job_exists? if respond_to?(:define_batch_throttle)
+      end
+
+      private
+
+      # Checks whether any of the dependent jobs are running
+      def dependent_job_exists?
+        return false if dependent_jobs.blank?
+
+        jobs_count = RocketJob::Job.running.where(:_type.in => dependent_jobs).count
+        return false if jobs_count.zero?
+
+        logger.info(
+          message: "#{jobs_count} Dependent Jobs are running from #{dependent_jobs.join(', ')}",
+          metric:  "#{self.class.name}/dependent_jobs_throttle"
+        )
+        true
+      end
+    end
+  end
+end
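
A job opts into this throttle roughly as follows; the job and class names are illustrative, and the class-level writer comes from `class_attribute: true` on the field above:

    class NightlyBillingJob < RocketJob::Job
      include RocketJob::Plugins::ThrottleDependentJobs

      # Hold this job back while either of these jobs is running
      self.dependent_jobs = ["ImportAccountsJob", "ImportTransactionsJob"]

      def perform
        # ... work that must not overlap the jobs above ...
      end
    end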

data/lib/rocket_job/ractor_worker.rb
@@ -0,0 +1,42 @@
+module RocketJob
+  # Run each worker in its own "Ractor".
+  class RactorWorker < Worker
+    attr_reader :thread
+
+    def initialize(id:, server_name:)
+      super(id: id, server_name: server_name)
+      @shutdown = Concurrent::Event.new
+      @thread   = Ractor.new(name: "rocketjob-#{id}") { run }
+    end
+
+    def alive?
+      @thread.alive?
+    end
+
+    def backtrace
+      @thread.backtrace
+    end
+
+    def join(*args)
+      @thread.join(*args)
+    end
+
+    # Send each active worker the RocketJob::Shutdown exception so that it stops processing immediately.
+    def kill
+      @thread.raise(Shutdown, "Shutdown due to kill request for worker: #{name}") if @thread.alive?
+    end
+
+    def shutdown?
+      @shutdown.set?
+    end
+
+    def shutdown!
+      @shutdown.set
+    end
+
+    # Returns [true|false] whether the shutdown indicator was set
+    def wait_for_shutdown?(timeout = nil)
+      @shutdown.wait(timeout)
+    end
+  end
+end
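
The shutdown signalling is the standard concurrent-ruby event handshake; the same pattern in isolation, outside RocketJob:

    require "concurrent"

    shutdown = Concurrent::Event.new
    worker = Thread.new do
      # Like wait_for_shutdown?(1): returns true once the event has been set
      until shutdown.wait(1)
        # ... process the next unit of work ...
      end
    end
    shutdown.set # the equivalent of shutdown!
    worker.join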

data/lib/rocket_job/server/model.rb
@@ -28,7 +28,7 @@ module RocketJob
 
     # Current state
     # Internal use only. Do not set this field directly
-    field :state, type: Symbol, default: :starting
+    field :state, type: Mongoid::StringifiedSymbol, default: :starting
 
     index({name: 1}, background: true, unique: true)
 
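
`Mongoid::StringifiedSymbol` (native in Mongoid 7.2+, and backported by the `stringified_symbol.rb` extension added in this release, file 22 above) persists the value as a String in MongoDB while reading it back as a Symbol, avoiding the deprecated BSON Symbol type. A rough round-trip illustration with a hypothetical model:

    class Example
      include Mongoid::Document
      field :state, type: Mongoid::StringifiedSymbol, default: :starting
    end

    doc = Example.new(state: "paused")
    doc.state               # => :paused  (always exposed as a Symbol)
    doc.attributes["state"] # => "paused" (persisted as a String)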

data/lib/rocket_job/sliced/bzip2_output_slice.rb
@@ -7,36 +7,35 @@ module RocketJob
     #   * The `bzip2` linux command line utility supports multiple embedded BZip2 streams,
     #     but some other custom implementations may not. They may only read the first slice and stop.
     #   * It is only designed for use on output collections.
-    #
-    # To download the output when using this slice:
-    #
-    #   # Download the binary BZip2 streams into a single file
-    #   IOStreams.path(output_file_name).stream(:none).writer do |io|
-    #     job.download { |slice| io << slice[:binary] }
-    #   end
     class BZip2OutputSlice < ::RocketJob::Sliced::Slice
-      # This is a specialized binary slice for creating binary data from each slice
+      # This is a specialized binary slice for creating BZip2 binary data from each slice
       # that must be downloaded as-is into output files.
-      def self.binary?
-        true
+      def self.binary_format
+        :bz2
+      end
+
+      # Compress the supplied records with BZip2
+      def self.to_binary(records, record_delimiter = "\n")
+        return [] if records.blank?
+
+        lines = Array(records).join(record_delimiter) + record_delimiter
+        s = StringIO.new
+        IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+        s.string
      end
 
      private
 
+      # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
      def parse_records
-        records = attributes.delete("records")
-
        # Convert BSON::Binary to a string
-        @records = [{binary: records.data}]
+        @records = [attributes.delete("records").data]
      end
 
+      # Returns [BSON::Binary] the records compressed using BZip2 into a string.
      def serialize_records
-        return [] if @records.nil? || @records.empty?
-
-        lines = records.to_a.join("\n") + "\n"
-        s = StringIO.new
-        IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
-        BSON::Binary.new(s.string)
+        # TODO: Make the line terminator configurable
+        BSON::Binary.new(self.class.to_binary(@records))
      end
    end
  end
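
Since `serialize_records` now delegates to the class-level `to_binary`, the compression step can be exercised on its own. An illustrative call:

    # One slice's records rendered as a single BZip2 stream (returned as a binary String)
    data = RocketJob::Sliced::BZip2OutputSlice.to_binary(%w[line1 line2])
    # Decompressing `data` yields "line1\nline2\n"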

data/lib/rocket_job/sliced/compressed_slice.rb
@@ -6,13 +6,10 @@ module RocketJob
       private
 
       def parse_records
-        records = attributes.delete("records")
-
         # Convert BSON::Binary to a string
-        binary_str = records.data
-
-        str = Zlib::Inflate.inflate(binary_str)
-        @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+        compressed_str   = attributes.delete("records").data
+        decompressed_str = Zlib::Inflate.inflate(compressed_str)
+        @records = Hash.from_bson(BSON::ByteBuffer.new(decompressed_str))["r"]
       end
 
       def serialize_records

data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb
@@ -0,0 +1,49 @@
+module RocketJob
+  module Sliced
+    # This is a specialized output serializer that renders each output slice as a single BZip2 compressed stream.
+    # BZip2 allows multiple output streams to be written into a single BZip2 file.
+    #
+    # Notes:
+    # * The `bzip2` linux command line utility supports multiple embedded BZip2 streams,
+    #   but some other custom implementations may not. They may only read the first slice and stop.
+    # * It is only designed for use on output collections.
+    class EncryptedBZip2OutputSlice < ::RocketJob::Sliced::Slice
+      # This is a specialized binary slice for creating BZip2 binary data from each slice
+      # that must be downloaded as-is into output files.
+      def self.binary_format
+        :bz2
+      end
+
+      private
+
+      # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
+      def parse_records
+        # Convert BSON::Binary to a string
+        encrypted_str = attributes.delete("records").data
+
+        # Decrypt string
+        header = SymmetricEncryption::Header.new
+        header.parse(encrypted_str)
+        # Use the header that is present to decrypt the data, since its version could be different
+        decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
+
+        @records = [decrypted_str]
+      end
+
+      # Returns [BSON::Binary] the records compressed using BZip2 into a string.
+      def serialize_records
+        return [] if @records.nil? || @records.empty?
+
+        # TODO: Make the line terminator configurable
+        lines = records.to_a.join("\n") + "\n"
+        s = StringIO.new
+        IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+
+        # Encrypt to binary without applying an encoding such as Base64
+        # Use a random_iv with each encryption for better security
+        data = SymmetricEncryption.cipher.binary_encrypt(s.string, random_iv: true, compress: false)
+        BSON::Binary.new(data)
+      end
+    end
+  end
+end
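
The decrypt path parses the embedded header first, so slices written under an older cipher version still decrypt correctly. The same pattern in isolation (assumes SymmetricEncryption is already configured with a cipher):

    encrypted = SymmetricEncryption.cipher.binary_encrypt("hello", random_iv: true, compress: false)

    header = SymmetricEncryption::Header.new
    header.parse(encrypted)
    # Decrypt with whichever cipher version the header names
    header.cipher.binary_decrypt(encrypted, header: header) # => "hello"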

data/lib/rocket_job/sliced/encrypted_slice.rb
@@ -6,17 +6,15 @@ module RocketJob
       private
 
       def parse_records
-        records = attributes.delete("records")
-
         # Convert BSON::Binary to a string
-        binary_str = records.data
+        encrypted_str = attributes.delete("records").data
 
         header = SymmetricEncryption::Header.new
-        header.parse(binary_str)
+        header.parse(encrypted_str)
         # Use the header that is present to decrypt the data, since its version could be different
-        str = header.cipher.binary_decrypt(binary_str, header: header)
+        decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
 
-        @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+        @records = Hash.from_bson(BSON::ByteBuffer.new(decrypted_str))["r"]
       end
 
       def serialize_records

data/lib/rocket_job/sliced/input.rb
@@ -1,16 +1,16 @@
 module RocketJob
   module Sliced
     class Input < Slices
-      def upload(on_first: nil, &block)
+      def upload(**args, &block)
         # Create indexes before uploading
         create_indexes
-        Writer::Input.collect(self, on_first: on_first, &block)
+        Writer::Input.collect(self, **args, &block)
       rescue Exception => e
         drop
         raise(e)
       end
 
-      def upload_mongo_query(criteria, *column_names, &block)
+      def upload_mongo_query(criteria, columns: [], slice_batch_size: nil, &block)
         options = criteria.options
 
         # Without a block extract the fields from the supplied criteria

@@ -18,23 +18,21 @@ module RocketJob
           # Criteria is returning old school :fields instead of :projections
           options[:projection] = options.delete(:fields) if options.key?(:fields)
         else
-          column_names = column_names.collect(&:to_s)
-          column_names << "_id" if column_names.size.zero?
-
-          fields = options.delete(:fields) || {}
-          column_names.each { |col| fields[col] = 1 }
+          columns = columns.blank? ? ["_id"] : columns.collect(&:to_s)
+          fields  = options.delete(:fields) || {}
+          columns.each { |col| fields[col] = 1 }
           options[:projection] = fields
 
           block =
-            if column_names.size == 1
-              column = column_names.first
+            if columns.size == 1
+              column = columns.first
               ->(document) { document[column] }
             else
-              ->(document) { column_names.collect { |c| document[c] } }
+              ->(document) { columns.collect { |c| document[c] } }
             end
         end
 
-        upload do |records|
+        upload(slice_batch_size: slice_batch_size) do |records|
           # Drop down to the mongo driver level to avoid constructing a Model for each document returned
           criteria.klass.collection.find(criteria.selector, options).each do |document|
             records << block.call(document)

@@ -42,58 +40,48 @@ module RocketJob
         end
       end
 
-      def upload_arel(arel, *column_names, &block)
+      def upload_arel(arel, columns: nil, slice_batch_size: nil, &block)
         unless block
-          column_names = column_names.empty? ? [:id] : column_names.collect(&:to_sym)
+          columns = columns.blank? ? [:id] : columns.collect(&:to_sym)
 
           block =
-            if column_names.size == 1
-              column = column_names.first
-              ->(model) { model.send(column) }
+            if columns.size == 1
+              column = columns.first
+              ->(model) { model.public_send(column) }
             else
-              ->(model) { column_names.collect { |c| model.send(c) } }
+              ->(model) { columns.collect { |c| model.public_send(c) } }
             end
           # find_each requires the :id column in the query
-          selection = column_names.include?(:id) ? column_names : column_names + [:id]
+          selection = columns.include?(:id) ? columns : columns + [:id]
           arel = arel.select(selection)
         end
 
-        upload { |records| arel.find_each { |model| records << block.call(model) } }
+        upload(slice_batch_size: slice_batch_size) { |records| arel.find_each { |model| records << block.call(model) } }
       end
 
-      def upload_integer_range(start_id, last_id)
-        # Create indexes before uploading
-        create_indexes
-        count = 0
-        while start_id <= last_id
-          end_id = start_id + slice_size - 1
-          end_id = last_id if end_id > last_id
-          create!(records: [[start_id, end_id]])
-          start_id += slice_size
-          count += 1
+      def upload_integer_range(start_id, last_id, slice_batch_size: 1_000)
+        # Each "record" is actually a range of Integers which makes up each slice
+        upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+          while start_id <= last_id
+            end_id = start_id + slice_size - 1
+            end_id = last_id if end_id > last_id
+            records << [start_id, end_id]
+            start_id += slice_size
+          end
         end
-        count
-      rescue Exception => e
-        drop
-        raise(e)
       end
 
-      def upload_integer_range_in_reverse_order(start_id, last_id)
-        # Create indexes before uploading
-        create_indexes
-        end_id = last_id
-        count = 0
-        while end_id >= start_id
-          first_id = end_id - slice_size + 1
-          first_id = start_id if first_id.negative? || (first_id < start_id)
-          create!(records: [[first_id, end_id]])
-          end_id -= slice_size
-          count += 1
+      def upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: 1_000)
+        # Each "record" is actually a range of Integers which makes up each slice
+        upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+          end_id = last_id
+          while end_id >= start_id
+            first_id = end_id - slice_size + 1
+            first_id = start_id if first_id.negative? || (first_id < start_id)
+            records << [first_id, end_id]
+            end_id -= slice_size
+          end
         end
-        count
-      rescue Exception => e
-        drop
-        raise(e)
       end
 
       # Iterate over each failed record, if any

@@ -137,11 +125,11 @@ module RocketJob
         # TODO: Will it perform faster without the id sort?
         # I.e. Just process on a FIFO basis?
         document = all.queued.
-          sort("_id" => 1).
-          find_one_and_update(
-            {"$set" => {worker_name: worker_name, state: :running, started_at: Time.now}},
-            return_document: :after
-          )
+                   sort("_id" => 1).
+                   find_one_and_update(
+                     {"$set" => {worker_name: worker_name, state: "running", started_at: Time.now}},
+                     return_document: :after
+                   )
         document.collection_name = collection_name if document
         document
       end
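
Taken together, the upload_* changes above replace the 5.x positional column lists with keyword arguments and add a `slice_batch_size` tuning knob. Hypothetical call sites showing the before/after shape (assuming the usual `job.input` accessor and an illustrative `User` model):

    # 5.4: job.input.upload_arel(User.all, :id, :email)
    # 6.0: columns are now a keyword argument
    job.input.upload_arel(User.all, columns: [:id, :email])

    # Upload [first_id, last_id] ranges as single-record slices,
    # inserting (an assumed) 500 slices per batch write
    job.input.upload_integer_range(1, 100_000, slice_batch_size: 500)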

data/lib/rocket_job/sliced/slice.rb
@@ -33,7 +33,7 @@ module RocketJob
       #
 
       # Current state, as set by AASM
-      field :state, type: Symbol, default: :queued
+      field :state, type: Mongoid::StringifiedSymbol, default: :queued
 
       # When processing started on this slice
       field :started_at, type: Time

@@ -95,9 +95,13 @@ module RocketJob
       end
 
       # Returns whether this is a specialized binary slice for creating binary data from each slice
-      # that is then just downloaded as-is into output files.
-      def self.binary?
-        false
+      # that is downloaded without conversion into output files.
+      def self.binary_format
+      end
+
+      # For binary formats only, format the supplied records into the binary format for this slice
+      def self.to_binary(_records)
+        raise NotImplementedError
       end
 
       # `records` array has special handling so that it can be modified in place instead of having

@@ -139,18 +143,10 @@ module RocketJob
 
       # Returns [Hash] the slice as a Hash for storage purposes
       # Compresses / Encrypts the slice according to the job setting
-      if ::Mongoid::VERSION.to_i >= 6
-        def as_attributes
-          attrs = super
-          attrs["records"] = serialize_records if @records
-          attrs
-        end
-      else
-        def as_document
-          attrs = super
-          attrs["records"] = serialize_records if @records
-          attrs
-        end
+      def as_attributes
+        attrs = super
+        attrs["records"] = serialize_records if @records
+        attrs
       end
 
       def inspect
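
The `binary_format` / `to_binary` pair above forms the contract that specialized slices such as BZip2OutputSlice implement. A skeletal subclass showing the shape (entirely hypothetical format and class name):

    class TsvOutputSlice < RocketJob::Sliced::Slice
      def self.binary_format
        :tsv # a symbol naming the download format (assumed semantics)
      end

      def self.to_binary(records)
        Array(records).join("\n") + "\n"
      end
    end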