rocketjob 6.0.0.rc3 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +24 -20
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  11. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  12. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  13. data/lib/rocket_job/jobs/on_demand_batch_job.rb +7 -5
  14. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  15. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  16. data/lib/rocket_job/plugins/cron.rb +60 -20
  17. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  18. data/lib/rocket_job/plugins/restart.rb +3 -110
  19. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  20. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +1 -2
  21. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  22. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  23. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  24. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  25. data/lib/rocket_job/sliced/input.rb +42 -54
  26. data/lib/rocket_job/sliced/slice.rb +7 -3
  27. data/lib/rocket_job/sliced/slices.rb +12 -9
  28. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  29. data/lib/rocket_job/sliced.rb +1 -19
  30. data/lib/rocket_job/version.rb +1 -1
  31. data/lib/rocketjob.rb +2 -2
  32. metadata +8 -10
  33. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  34. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  35. data/lib/rocket_job/batch/tabular.rb +0 -58
data/lib/rocket_job/plugins/job/persistence.rb
@@ -70,6 +70,29 @@ module RocketJob
  end
  end
 
+ # Create a new instance of this job, copying across only the `copy_on_restart` attributes.
+ # Copy across input and output categories to new scheduled job so that all of the
+ # settings are remembered between instance. Example: slice_size
+ def create_restart!(**overrides)
+ if expired?
+ logger.info("Job has expired. Not creating a new instance.")
+ return
+ end
+
+ job_attrs = self.class.rocket_job_restart_attributes.each_with_object({}) do |attr, attrs|
+ attrs[attr] = send(attr)
+ end
+ job_attrs.merge!(overrides)
+
+ job = self.class.new(job_attrs)
+ job.input_categories = input_categories if respond_to?(:input_categories)
+ job.output_categories = output_categories if respond_to?(:output_categories)
+
+ job.save_with_retry!
+
+ logger.info("Created a new job instance: #{job.id}")
+ end
+
  # Set in-memory job to complete if `destroy_on_complete` and the job has been destroyed
  def reload
  return super unless destroy_on_complete
@@ -85,6 +108,19 @@ module RocketJob
  self
  end
  end
+
+ # Save with retry in case persistence takes a moment.
+ def save_with_retry!(retry_limit = 10, sleep_interval = 0.5)
+ count = 0
+ while count < retry_limit
+ return true if save
+
+ logger.info("Retrying to persist new scheduled instance: #{errors.messages.inspect}")
+ sleep(sleep_interval)
+ count += 1
+ end
+ save!
+ end
  end
  end
  end
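
A minimal usage sketch of the new `create_restart!` helper above. The `ReportJob` class and its `limit` field are hypothetical; the `copy_on_restart: true` flag, the keyword overrides, and the retrying save come from the code in this hunk.

    # Hypothetical job used only to illustrate create_restart!.
    class ReportJob < RocketJob::Job
      # Fields flagged with copy_on_restart: true are carried over to the new instance.
      field :limit, type: Integer, copy_on_restart: true

      def perform
        # ... do the work ...
      end
    end

    job = ReportJob.create!(limit: 10)

    # Builds a new queued instance, copying `limit` across and applying the override,
    # then persists it via save_with_retry! (up to 10 saves, 0.5s apart, then save!).
    # Returns without creating anything if the job has expired.
    job.create_restart!(limit: 20)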
data/lib/rocket_job/plugins/restart.rb
@@ -2,128 +2,21 @@ require "active_support/concern"
 
  module RocketJob
  module Plugins
- # Automatically starts a new instance of this job anytime it fails, aborts, or completes.
- #
- # Notes:
- # * Restartable jobs automatically abort if they fail. This prevents the failed job from being retried.
- # - To disable this behavior, add the following empty method:
- # def rocket_job_restart_abort
- # end
- # * On destroy this job is destroyed without starting a new instance.
- # * On Abort a new instance is created.
- # * Include `RocketJob::Plugins::Singleton` to prevent multiple copies of a job from running at
- # the same time.
- # * The job will not be restarted if:
- # - A validation fails after creating the new instance of this job.
- # - The job has expired.
- # * Only the fields that have `copy_on_restart: true` will be passed onto the new instance of this job.
- #
- # Example:
- #
- # class RestartableJob < RocketJob::Job
- # include RocketJob::Plugins::Restart
- #
- # # Retain the completed job under the completed tab in Rocket Job Web Interface.
- # self.destroy_on_complete = false
- #
- # # Will be copied to the new job on restart.
- # field :limit, type: Integer, copy_on_restart: true
- #
- # # Will _not_ be copied to the new job on restart.
- # field :list, type: Array, default: [1,2,3]
- #
- # # Set run_at every time a new instance of the job is created.
- # after_initialize set_run_at, if: :new_record?
- #
- # def perform
- # puts "The limit is #{limit}"
- # puts "The list is #{list}"
- # 'DONE'
- # end
- #
- # private
- #
- # # Run this job in 30 minutes.
- # def set_run_at
- # self.run_at = 30.minutes.from_now
- # end
- # end
- #
- # job = RestartableJob.create!(limit: 10, list: [4,5,6])
- # job.reload.state
- # # => :queued
- #
- # job.limit
- # # => 10
- #
- # job.list
- # # => [4,5,6]
- #
- # # Wait 30 minutes ...
- #
- # job.reload.state
- # # => :completed
- #
- # # A new instance was automatically created.
- # job2 = RestartableJob.last
- # job2.state
- # # => :queued
- #
- # job2.limit
- # # => 10
- #
- # job2.list
- # # => [1,2,3]
+ # @deprecated
  module Restart
  extend ActiveSupport::Concern
 
  included do
- after_abort :rocket_job_restart_new_instance
- after_complete :rocket_job_restart_new_instance
+ after_abort :create_restart!
+ after_complete :create_restart!
  after_fail :rocket_job_restart_abort
  end
 
  private
 
- # Run again in the future, even if this run fails with an exception
- def rocket_job_restart_new_instance
- if expired?
- logger.info("Job has expired. Not creating a new instance.")
- return
- end
- job_attrs =
- rocket_job_restart_attributes.each_with_object({}) { |attr, attrs| attrs[attr] = send(attr) }
- job = self.class.new(job_attrs)
-
- # Copy across input and output categories to new scheduled job so that all of the
- # settings are remembered between instance. Example: slice_size
- job.input_categories = input_categories if respond_to?(:input_categories)
- job.output_categories = output_categories if respond_to?(:output_categories)
-
- rocket_job_restart_save(job)
- end
-
  def rocket_job_restart_abort
  new_record? ? abort : abort!
  end
-
- # Allow Singleton to prevent the creation of a new job if one is already running
- # Retry since the delete may not have persisted to disk yet.
- def rocket_job_restart_save(job, retry_limit = 10, sleep_interval = 0.5)
- count = 0
- while count < retry_limit
- if job.save
- logger.info("Created a new job instance: #{job.id}")
- return true
- else
- logger.info("Job already active, retrying after a short sleep")
- sleep(sleep_interval)
- end
- count += 1
- end
- logger.error("New job instance not started: #{job.errors.messages.inspect}")
- false
- end
  end
  end
  end
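
The long usage example removed above still describes how the plugin behaves; as a rough sketch under that assumption, including the now-deprecated plugin simply wires the restart callbacks to `create_restart!`. `NightlyExtractJob` and its field are hypothetical.

    # Sketch only: the plugin is marked @deprecated, but including it still restarts
    # the job after it completes or aborts via the create_restart! shown earlier.
    class NightlyExtractJob < RocketJob::Job
      include RocketJob::Plugins::Restart

      # Keep completed jobs visible in the Rocket Job Web Interface.
      self.destroy_on_complete = false

      # Copied to each new instance on restart.
      field :extract_date, type: Date, copy_on_restart: true

      def perform
        # ... extract data for extract_date ...
      end
    end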
data/lib/rocket_job/plugins/state_machine.rb
@@ -36,8 +36,8 @@ module RocketJob
  raise(ArgumentError, "Cannot supply both a method name and a block") if methods.size.positive? && block
  raise(ArgumentError, "Must supply either a method name or a block") unless methods.size.positive? || block
 
- # TODO: Somehow get AASM to support options such as :if and :unless to be consistent with other callbacks
- # For example:
+ # Limitation with AASM. It only supports guards on event transitions, not for callbacks.
+ # For example, AASM does not support callback options such as :if and :unless, yet Rails callbacks do.
  # before_start :my_callback, unless: :encrypted?
  # before_start :my_callback, if: :encrypted?
  event = aasm.state_machine.events[event_name]
data/lib/rocket_job/plugins/throttle_dependent_jobs.rb
@@ -11,8 +11,7 @@ module RocketJob
  extend ActiveSupport::Concern
 
  included do
- class_attribute :dependent_jobs
- self.dependent_jobs = nil
+ field :dependent_jobs, type: Array, class_attribute: true, user_editable: true, copy_on_restart: true
 
  define_throttle :dependent_job_exists?
  define_batch_throttle :dependent_job_exists? if respond_to?(:define_batch_throttle)
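
A short sketch of how the reworked `dependent_jobs` setting could be used now that it is a field rather than a bare `class_attribute`. The `ImportJob` class and the job names in the list are hypothetical; the include path is assumed from the file name.

    # Sketch: dependent_jobs is now an Array field (class_attribute: true,
    # user_editable, copy_on_restart), so it can be set per class or per instance.
    class ImportJob < RocketJob::Job
      include RocketJob::Plugins::ThrottleDependentJobs

      # Hold this job while any of these jobs are still running.
      self.dependent_jobs = ["ExtractJob", "TransformJob"]

      def perform
        # ... import ...
      end
    end

    # Because it is a field, it can also be overridden when enqueuing a single job:
    ImportJob.create!(dependent_jobs: ["ExtractJob"])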
data/lib/rocket_job/sliced/bzip2_output_slice.rb
@@ -7,36 +7,35 @@ module RocketJob
  # * The `bzip2` linux command line utility supports multiple embedded BZip2 stream,
  # but some other custom implementations may not. They may only read the first slice and stop.
  # * It is only designed for use on output collections.
- #
- # To download the output when using this slice:
- #
- # # Download the binary BZip2 streams into a single file
- # IOStreams.path(output_file_name).stream(:none).writer do |io|
- # job.download { |slice| io << slice[:binary] }
- # end
  class BZip2OutputSlice < ::RocketJob::Sliced::Slice
- # This is a specialized binary slice for creating binary data from each slice
+ # This is a specialized binary slice for creating BZip2 binary data from each slice
  # that must be downloaded as-is into output files.
- def self.binary?
- true
+ def self.binary_format
+ :bz2
+ end
+
+ # Compress the supplied records with BZip2
+ def self.to_binary(records, record_delimiter = "\n")
+ return [] if records.blank?
+
+ lines = Array(records).join(record_delimiter) + record_delimiter
+ s = StringIO.new
+ IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+ s.string
  end
 
  private
 
+ # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
  def parse_records
- records = attributes.delete("records")
-
  # Convert BSON::Binary to a string
- @records = [{binary: records.data}]
+ @records = [attributes.delete("records").data]
  end
 
+ # Returns [BSON::Binary] the records compressed using BZip2 into a string.
  def serialize_records
- return [] if @records.nil? || @records.empty?
-
- lines = records.to_a.join("\n") + "\n"
- s = StringIO.new
- IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
- BSON::Binary.new(s.string)
+ # TODO: Make the line terminator configurable
+ BSON::Binary.new(self.class.to_binary(@records))
  end
  end
  end
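
The new class-level `to_binary` helper can be exercised directly; a small sketch, with the record values and the output file name being illustrative only.

    records = ["line 1", "line 2", "line 3"]

    # Compress one slice's records into a single newline-delimited BZip2 stream.
    compressed = RocketJob::Sliced::BZip2OutputSlice.to_binary(records)

    # Streams can be appended to one .bz2 file, since the format allows multiple
    # concatenated BZip2 streams (see the note above about other readers).
    File.open("output.csv.bz2", "ab") { |file| file.write(compressed) }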
data/lib/rocket_job/sliced/compressed_slice.rb
@@ -6,13 +6,10 @@ module RocketJob
  private
 
  def parse_records
- records = attributes.delete("records")
-
  # Convert BSON::Binary to a string
- binary_str = records.data
-
- str = Zlib::Inflate.inflate(binary_str)
- @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+ compressed_str = attributes.delete("records").data
+ decompressed_str = Zlib::Inflate.inflate(compressed_str)
+ @records = Hash.from_bson(BSON::ByteBuffer.new(decompressed_str))["r"]
  end
 
  def serialize_records
data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb
@@ -0,0 +1,49 @@
+ module RocketJob
+ module Sliced
+ # This is a specialized output serializer that renders each output slice as a single BZip2 compressed stream.
+ # BZip2 allows multiple output streams to be written into a single BZip2 file.
+ #
+ # Notes:
+ # * The `bzip2` linux command line utility supports multiple embedded BZip2 stream,
+ # but some other custom implementations may not. They may only read the first slice and stop.
+ # * It is only designed for use on output collections.
+ class EncryptedBZip2OutputSlice < ::RocketJob::Sliced::Slice
+ # This is a specialized binary slice for creating BZip2 binary data from each slice
+ # that must be downloaded as-is into output files.
+ def self.binary_format
+ :bz2
+ end
+
+ private
+
+ # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
+ def parse_records
+ # Convert BSON::Binary to a string
+ encrypted_str = attributes.delete("records").data
+
+ # Decrypt string
+ header = SymmetricEncryption::Header.new
+ header.parse(encrypted_str)
+ # Use the header that is present to decrypt the data, since its version could be different
+ decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
+
+ @records = [decrypted_str]
+ end
+
+ # Returns [BSON::Binary] the records compressed using BZip2 into a string.
+ def serialize_records
+ return [] if @records.nil? || @records.empty?
+
+ # TODO: Make the line terminator configurable
+ lines = records.to_a.join("\n") + "\n"
+ s = StringIO.new
+ IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+
+ # Encrypt to binary without applying an encoding such as Base64
+ # Use a random_iv with each encryption for better security
+ data = SymmetricEncryption.cipher.binary_encrypt(s.string, random_iv: true, compress: false)
+ BSON::Binary.new(data)
+ end
+ end
+ end
+ end
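
A sketch of how the new slice class might be selected for a batch job's output. The `:encrypted_bz2` serializer symbol and the `ExportJob` class are assumptions based on the class name; the exact serializer mapping lives in `RocketJob::Category::Base`, which also changed in this release.

    # Assumed wiring: output_category picks EncryptedBZip2OutputSlice when the
    # serializer is set to :encrypted_bz2 (symbol not confirmed by this diff).
    class ExportJob < RocketJob::Job
      include RocketJob::Batch

      # Each output slice is stored encrypted and BZip2 compressed.
      output_category serializer: :encrypted_bz2

      def perform(record)
        record.to_s
      end
    end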
data/lib/rocket_job/sliced/encrypted_slice.rb
@@ -6,17 +6,15 @@ module RocketJob
  private
 
  def parse_records
- records = attributes.delete("records")
-
  # Convert BSON::Binary to a string
- binary_str = records.data
+ encrypted_str = attributes.delete("records").data
 
  header = SymmetricEncryption::Header.new
- header.parse(binary_str)
+ header.parse(encrypted_str)
  # Use the header that is present to decrypt the data, since its version could be different
- str = header.cipher.binary_decrypt(binary_str, header: header)
+ decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
 
- @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+ @records = Hash.from_bson(BSON::ByteBuffer.new(decrypted_str))["r"]
  end
 
  def serialize_records
data/lib/rocket_job/sliced/input.rb
@@ -1,16 +1,16 @@
  module RocketJob
  module Sliced
  class Input < Slices
- def upload(on_first: nil, &block)
+ def upload(**args, &block)
  # Create indexes before uploading
  create_indexes
- Writer::Input.collect(self, on_first: on_first, &block)
+ Writer::Input.collect(self, **args, &block)
  rescue Exception => e
  drop
  raise(e)
  end
 
- def upload_mongo_query(criteria, *column_names, &block)
+ def upload_mongo_query(criteria, columns: [], slice_batch_size: nil, &block)
  options = criteria.options
 
  # Without a block extract the fields from the supplied criteria
@@ -18,23 +18,21 @@ module RocketJob
  # Criteria is returning old school :fields instead of :projections
  options[:projection] = options.delete(:fields) if options.key?(:fields)
  else
- column_names = column_names.collect(&:to_s)
- column_names << "_id" if column_names.size.zero?
-
- fields = options.delete(:fields) || {}
- column_names.each { |col| fields[col] = 1 }
+ columns = columns.blank? ? ["_id"] : columns.collect(&:to_s)
+ fields = options.delete(:fields) || {}
+ columns.each { |col| fields[col] = 1 }
  options[:projection] = fields
 
  block =
- if column_names.size == 1
- column = column_names.first
+ if columns.size == 1
+ column = columns.first
  ->(document) { document[column] }
  else
- ->(document) { column_names.collect { |c| document[c] } }
+ ->(document) { columns.collect { |c| document[c] } }
  end
  end
 
- upload do |records|
+ upload(slice_batch_size: slice_batch_size) do |records|
  # Drop down to the mongo driver level to avoid constructing a Model for each document returned
  criteria.klass.collection.find(criteria.selector, options).each do |document|
  records << block.call(document)
@@ -42,58 +40,48 @@ module RocketJob
  end
  end
 
- def upload_arel(arel, *column_names, &block)
+ def upload_arel(arel, columns: nil, slice_batch_size: nil, &block)
  unless block
- column_names = column_names.empty? ? [:id] : column_names.collect(&:to_sym)
+ columns = columns.blank? ? [:id] : columns.collect(&:to_sym)
 
  block =
- if column_names.size == 1
- column = column_names.first
- ->(model) { model.send(column) }
+ if columns.size == 1
+ column = columns.first
+ ->(model) { model.public_send(column) }
  else
- ->(model) { column_names.collect { |c| model.send(c) } }
+ ->(model) { columns.collect { |c| model.public_send(c) } }
  end
  # find_each requires the :id column in the query
- selection = column_names.include?(:id) ? column_names : column_names + [:id]
+ selection = columns.include?(:id) ? columns : columns + [:id]
  arel = arel.select(selection)
  end
 
- upload { |records| arel.find_each { |model| records << block.call(model) } }
+ upload(slice_batch_size: slice_batch_size) { |records| arel.find_each { |model| records << block.call(model) } }
  end
 
- def upload_integer_range(start_id, last_id)
- # Create indexes before uploading
- create_indexes
- count = 0
- while start_id <= last_id
- end_id = start_id + slice_size - 1
- end_id = last_id if end_id > last_id
- create!(records: [[start_id, end_id]])
- start_id += slice_size
- count += 1
+ def upload_integer_range(start_id, last_id, slice_batch_size: 1_000)
+ # Each "record" is actually a range of Integers which makes up each slice
+ upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+ while start_id <= last_id
+ end_id = start_id + slice_size - 1
+ end_id = last_id if end_id > last_id
+ records << [start_id, end_id]
+ start_id += slice_size
+ end
  end
- count
- rescue Exception => e
- drop
- raise(e)
  end
 
- def upload_integer_range_in_reverse_order(start_id, last_id)
- # Create indexes before uploading
- create_indexes
- end_id = last_id
- count = 0
- while end_id >= start_id
- first_id = end_id - slice_size + 1
- first_id = start_id if first_id.negative? || (first_id < start_id)
- create!(records: [[first_id, end_id]])
- end_id -= slice_size
- count += 1
+ def upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: 1_000)
+ # Each "record" is actually a range of Integers which makes up each slice
+ upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+ end_id = last_id
+ while end_id >= start_id
+ first_id = end_id - slice_size + 1
+ first_id = start_id if first_id.negative? || (first_id < start_id)
+ records << [first_id, end_id]
+ end_id -= slice_size
+ end
  end
- count
- rescue Exception => e
- drop
- raise(e)
  end
 
  # Iterate over each failed record, if any
@@ -137,11 +125,11 @@ module RocketJob
  # TODO: Will it perform faster without the id sort?
  # I.e. Just process on a FIFO basis?
  document = all.queued.
- sort("_id" => 1).
- find_one_and_update(
- {"$set" => {worker_name: worker_name, state: "running", started_at: Time.now}},
- return_document: :after
- )
+ sort("_id" => 1).
+ find_one_and_update(
+ {"$set" => {worker_name: worker_name, state: "running", started_at: Time.now}},
+ return_document: :after
+ )
  document.collection_name = collection_name if document
  document
  end
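
The upload helpers above now take keyword arguments instead of splatted column names; a rough sketch of the new call style against a batch job's input collection. `MyBatchJob`, the `User` and `Order` models, column names, and batch sizes are illustrative only.

    job = MyBatchJob.new # hypothetical batch job with a main input category

    # Active Record relations: columns are now a keyword argument.
    job.input.upload_arel(User.where(active: true), columns: [:id, :email], slice_batch_size: 100)

    # Mongoid criteria follow the same keyword style.
    job.input.upload_mongo_query(Order.where(state: "pending"), columns: ["_id", "total"])

    # Integer ranges now go through upload() as well, one range per slice.
    job.input.upload_integer_range(1, 1_000_000, slice_batch_size: 1_000)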