rocketjob 6.0.0.rc3 → 6.0.0

Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +24 -20
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  11. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  12. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  13. data/lib/rocket_job/jobs/on_demand_batch_job.rb +7 -5
  14. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  15. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  16. data/lib/rocket_job/plugins/cron.rb +60 -20
  17. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  18. data/lib/rocket_job/plugins/restart.rb +3 -110
  19. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  20. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +1 -2
  21. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  22. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  23. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  24. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  25. data/lib/rocket_job/sliced/input.rb +42 -54
  26. data/lib/rocket_job/sliced/slice.rb +7 -3
  27. data/lib/rocket_job/sliced/slices.rb +12 -9
  28. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  29. data/lib/rocket_job/sliced.rb +1 -19
  30. data/lib/rocket_job/version.rb +1 -1
  31. data/lib/rocketjob.rb +2 -2
  32. metadata +8 -10
  33. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  34. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  35. data/lib/rocket_job/batch/tabular.rb +0 -58

data/lib/rocket_job/plugins/job/persistence.rb
@@ -70,6 +70,29 @@ module RocketJob
           end
         end
 
+        # Create a new instance of this job, copying across only the `copy_on_restart` attributes.
+        # Copy across the input and output categories to the new scheduled job so that all of the
+        # settings are remembered between instances. Example: slice_size
+        def create_restart!(**overrides)
+          if expired?
+            logger.info("Job has expired. Not creating a new instance.")
+            return
+          end
+
+          job_attrs = self.class.rocket_job_restart_attributes.each_with_object({}) do |attr, attrs|
+            attrs[attr] = send(attr)
+          end
+          job_attrs.merge!(overrides)
+
+          job = self.class.new(job_attrs)
+          job.input_categories = input_categories if respond_to?(:input_categories)
+          job.output_categories = output_categories if respond_to?(:output_categories)
+
+          job.save_with_retry!
+
+          logger.info("Created a new job instance: #{job.id}")
+        end
+
         # Set in-memory job to complete if `destroy_on_complete` and the job has been destroyed
         def reload
           return super unless destroy_on_complete
@@ -85,6 +108,19 @@ module RocketJob
             self
           end
         end
+
+        # Save with retry in case persistence takes a moment.
+        def save_with_retry!(retry_limit = 10, sleep_interval = 0.5)
+          count = 0
+          while count < retry_limit
+            return true if save
+
+            logger.info("Retrying to persist new scheduled instance: #{errors.messages.inspect}")
+            sleep(sleep_interval)
+            count += 1
+          end
+          save!
+        end
       end
     end
   end
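
A minimal usage sketch (illustrative only, not part of the diff): a job that carries a `copy_on_restart` field onto the next instance via the new `create_restart!` helper. The class name and the `run_at` override shown here are assumptions.

  class RestartableJob < RocketJob::Job
    # Keep the completed job visible in the Rocket Job Web Interface.
    self.destroy_on_complete = false

    # Copied onto the new instance created by create_restart!
    field :limit, type: Integer, copy_on_restart: true

    def perform
      "The limit is #{limit}"
    end
  end

  job = RestartableJob.create!(limit: 10)
  # Once the job has finished, a follow-on instance can be created,
  # optionally overriding attributes on the new instance:
  job.create_restart!(run_at: 30.minutes.from_now)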

data/lib/rocket_job/plugins/restart.rb
@@ -2,128 +2,21 @@ require "active_support/concern"
 
 module RocketJob
   module Plugins
-    # Automatically starts a new instance of this job anytime it fails, aborts, or completes.
-    #
-    # Notes:
-    # * Restartable jobs automatically abort if they fail. This prevents the failed job from being retried.
-    #   - To disable this behavior, add the following empty method:
-    #       def rocket_job_restart_abort
-    #       end
-    # * On destroy this job is destroyed without starting a new instance.
-    # * On Abort a new instance is created.
-    # * Include `RocketJob::Plugins::Singleton` to prevent multiple copies of a job from running at
-    #   the same time.
-    # * The job will not be restarted if:
-    #   - A validation fails after creating the new instance of this job.
-    #   - The job has expired.
-    # * Only the fields that have `copy_on_restart: true` will be passed onto the new instance of this job.
-    #
-    # Example:
-    #
-    # class RestartableJob < RocketJob::Job
-    #   include RocketJob::Plugins::Restart
-    #
-    #   # Retain the completed job under the completed tab in Rocket Job Web Interface.
-    #   self.destroy_on_complete = false
-    #
-    #   # Will be copied to the new job on restart.
-    #   field :limit, type: Integer, copy_on_restart: true
-    #
-    #   # Will _not_ be copied to the new job on restart.
-    #   field :list, type: Array, default: [1,2,3]
-    #
-    #   # Set run_at every time a new instance of the job is created.
-    #   after_initialize set_run_at, if: :new_record?
-    #
-    #   def perform
-    #     puts "The limit is #{limit}"
-    #     puts "The list is #{list}"
-    #     'DONE'
-    #   end
-    #
-    #   private
-    #
-    #   # Run this job in 30 minutes.
-    #   def set_run_at
-    #     self.run_at = 30.minutes.from_now
-    #   end
-    # end
-    #
-    # job = RestartableJob.create!(limit: 10, list: [4,5,6])
-    # job.reload.state
-    # # => :queued
-    #
-    # job.limit
-    # # => 10
-    #
-    # job.list
-    # # => [4,5,6]
-    #
-    # # Wait 30 minutes ...
-    #
-    # job.reload.state
-    # # => :completed
-    #
-    # # A new instance was automatically created.
-    # job2 = RestartableJob.last
-    # job2.state
-    # # => :queued
-    #
-    # job2.limit
-    # # => 10
-    #
-    # job2.list
-    # # => [1,2,3]
+    # @deprecated
     module Restart
       extend ActiveSupport::Concern
 
       included do
-        after_abort :rocket_job_restart_new_instance
-        after_complete :rocket_job_restart_new_instance
+        after_abort :create_restart!
+        after_complete :create_restart!
         after_fail :rocket_job_restart_abort
       end
 
       private
 
-      # Run again in the future, even if this run fails with an exception
-      def rocket_job_restart_new_instance
-        if expired?
-          logger.info("Job has expired. Not creating a new instance.")
-          return
-        end
-        job_attrs =
-          rocket_job_restart_attributes.each_with_object({}) { |attr, attrs| attrs[attr] = send(attr) }
-        job = self.class.new(job_attrs)
-
-        # Copy across input and output categories to new scheduled job so that all of the
-        # settings are remembered between instance. Example: slice_size
-        job.input_categories = input_categories if respond_to?(:input_categories)
-        job.output_categories = output_categories if respond_to?(:output_categories)
-
-        rocket_job_restart_save(job)
-      end
-
       def rocket_job_restart_abort
         new_record? ? abort : abort!
       end
-
-      # Allow Singleton to prevent the creation of a new job if one is already running
-      # Retry since the delete may not have persisted to disk yet.
-      def rocket_job_restart_save(job, retry_limit = 10, sleep_interval = 0.5)
-        count = 0
-        while count < retry_limit
-          if job.save
-            logger.info("Created a new job instance: #{job.id}")
-            return true
-          else
-            logger.info("Job already active, retrying after a short sleep")
-            sleep(sleep_interval)
-          end
-          count += 1
-        end
-        logger.error("New job instance not started: #{job.errors.messages.inspect}")
-        false
-      end
     end
   end
 end

data/lib/rocket_job/plugins/state_machine.rb
@@ -36,8 +36,8 @@ module RocketJob
         raise(ArgumentError, "Cannot supply both a method name and a block") if methods.size.positive? && block
         raise(ArgumentError, "Must supply either a method name or a block") unless methods.size.positive? || block
 
-        # TODO: Somehow get AASM to support options such as :if and :unless to be consistent with other callbacks
-        # For example:
+        # Limitation with AASM. It only supports guards on event transitions, not for callbacks.
+        # For example, AASM does not support callback options such as :if and :unless, yet Rails callbacks do.
         #   before_start :my_callback, unless: :encrypted?
         #   before_start :my_callback, if: :encrypted?
         event = aasm.state_machine.events[event_name]

data/lib/rocket_job/plugins/throttle_dependent_jobs.rb
@@ -11,8 +11,7 @@ module RocketJob
     extend ActiveSupport::Concern
 
     included do
-      class_attribute :dependent_jobs
-      self.dependent_jobs = nil
+      field :dependent_jobs, type: Array, class_attribute: true, user_editable: true, copy_on_restart: true
 
       define_throttle :dependent_job_exists?
       define_batch_throttle :dependent_job_exists? if respond_to?(:define_batch_throttle)
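
An illustrative sketch (not part of the diff) of how the new `dependent_jobs` field might be used; the job class names are assumptions, and it relies on the class-level setter that `class_attribute: true` fields provide:

  class NightlyExtractJob < RocketJob::Job
    # Hold this job while any instances of these job classes are still active.
    self.dependent_jobs = ["ImportJob", "CleanupJob"]

    def perform
      # ... runs only once the dependent jobs above have finished
    end
  end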

data/lib/rocket_job/sliced/bzip2_output_slice.rb
@@ -7,36 +7,35 @@ module RocketJob
     # * The `bzip2` linux command line utility supports multiple embedded BZip2 stream,
     #   but some other custom implementations may not. They may only read the first slice and stop.
     # * It is only designed for use on output collections.
-    #
-    # To download the output when using this slice:
-    #
-    #   # Download the binary BZip2 streams into a single file
-    #   IOStreams.path(output_file_name).stream(:none).writer do |io|
-    #     job.download { |slice| io << slice[:binary] }
-    #   end
     class BZip2OutputSlice < ::RocketJob::Sliced::Slice
-      # This is a specialized binary slice for creating binary data from each slice
+      # This is a specialized binary slice for creating BZip2 binary data from each slice
       # that must be downloaded as-is into output files.
-      def self.binary?
-        true
+      def self.binary_format
+        :bz2
+      end
+
+      # Compress the supplied records with BZip2
+      def self.to_binary(records, record_delimiter = "\n")
+        return [] if records.blank?
+
+        lines = Array(records).join(record_delimiter) + record_delimiter
+        s = StringIO.new
+        IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+        s.string
       end
 
       private
 
+      # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
      def parse_records
-        records = attributes.delete("records")
-
        # Convert BSON::Binary to a string
-        @records = [{binary: records.data}]
+        @records = [attributes.delete("records").data]
      end
 
+      # Returns [BSON::Binary] the records compressed using BZip2 into a string.
      def serialize_records
-        return [] if @records.nil? || @records.empty?
-
-        lines = records.to_a.join("\n") + "\n"
-        s = StringIO.new
-        IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
-        BSON::Binary.new(s.string)
+        # TODO: Make the line terminator configurable
+        BSON::Binary.new(self.class.to_binary(@records))
      end
    end
  end
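
A small illustrative call (not part of the diff) of the new class-level `to_binary` helper, which joins a set of records and compresses them into a single BZip2 stream:

  compressed = RocketJob::Sliced::BZip2OutputSlice.to_binary(%w[line1 line2 line3])
  # => a String holding one BZip2 stream of "line1\nline2\nline3\n"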

data/lib/rocket_job/sliced/compressed_slice.rb
@@ -6,13 +6,10 @@ module RocketJob
       private
 
       def parse_records
-        records = attributes.delete("records")
-
         # Convert BSON::Binary to a string
-        binary_str = records.data
-
-        str = Zlib::Inflate.inflate(binary_str)
-        @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+        compressed_str = attributes.delete("records").data
+        decompressed_str = Zlib::Inflate.inflate(compressed_str)
+        @records = Hash.from_bson(BSON::ByteBuffer.new(decompressed_str))["r"]
       end
 
       def serialize_records

data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb (new file)
@@ -0,0 +1,49 @@
+module RocketJob
+  module Sliced
+    # This is a specialized output serializer that renders each output slice as a single BZip2 compressed stream.
+    # BZip2 allows multiple output streams to be written into a single BZip2 file.
+    #
+    # Notes:
+    # * The `bzip2` linux command line utility supports multiple embedded BZip2 stream,
+    #   but some other custom implementations may not. They may only read the first slice and stop.
+    # * It is only designed for use on output collections.
+    class EncryptedBZip2OutputSlice < ::RocketJob::Sliced::Slice
+      # This is a specialized binary slice for creating BZip2 binary data from each slice
+      # that must be downloaded as-is into output files.
+      def self.binary_format
+        :bz2
+      end
+
+      private
+
+      # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
+      def parse_records
+        # Convert BSON::Binary to a string
+        encrypted_str = attributes.delete("records").data
+
+        # Decrypt string
+        header = SymmetricEncryption::Header.new
+        header.parse(encrypted_str)
+        # Use the header that is present to decrypt the data, since its version could be different
+        decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
+
+        @records = [decrypted_str]
+      end
+
+      # Returns [BSON::Binary] the records compressed using BZip2 into a string.
+      def serialize_records
+        return [] if @records.nil? || @records.empty?
+
+        # TODO: Make the line terminator configurable
+        lines = records.to_a.join("\n") + "\n"
+        s = StringIO.new
+        IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+
+        # Encrypt to binary without applying an encoding such as Base64
+        # Use a random_iv with each encryption for better security
+        data = SymmetricEncryption.cipher.binary_encrypt(s.string, random_iv: true, compress: false)
+        BSON::Binary.new(data)
+      end
+    end
+  end
+end
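
For reference, an illustrative sketch (not part of the diff) of the encrypt/decrypt round trip this slice relies on, calling SymmetricEncryption directly; it assumes the cipher keys are already configured:

  require "symmetric-encryption"

  encrypted = SymmetricEncryption.cipher.binary_encrypt("hello world", random_iv: true, compress: false)

  header = SymmetricEncryption::Header.new
  header.parse(encrypted)
  # Decrypt with the cipher named in the header, since the key version may differ
  header.cipher.binary_decrypt(encrypted, header: header)
  # => "hello world"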

data/lib/rocket_job/sliced/encrypted_slice.rb
@@ -6,17 +6,15 @@ module RocketJob
       private
 
       def parse_records
-        records = attributes.delete("records")
-
         # Convert BSON::Binary to a string
-        binary_str = records.data
+        encrypted_str = attributes.delete("records").data
 
         header = SymmetricEncryption::Header.new
-        header.parse(binary_str)
+        header.parse(encrypted_str)
         # Use the header that is present to decrypt the data, since its version could be different
-        str = header.cipher.binary_decrypt(binary_str, header: header)
+        decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
 
-        @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+        @records = Hash.from_bson(BSON::ByteBuffer.new(decrypted_str))["r"]
       end
 
       def serialize_records

data/lib/rocket_job/sliced/input.rb
@@ -1,16 +1,16 @@
 module RocketJob
   module Sliced
     class Input < Slices
-      def upload(on_first: nil, &block)
+      def upload(**args, &block)
         # Create indexes before uploading
         create_indexes
-        Writer::Input.collect(self, on_first: on_first, &block)
+        Writer::Input.collect(self, **args, &block)
       rescue Exception => e
         drop
         raise(e)
       end
 
-      def upload_mongo_query(criteria, *column_names, &block)
+      def upload_mongo_query(criteria, columns: [], slice_batch_size: nil, &block)
         options = criteria.options
 
         # Without a block extract the fields from the supplied criteria
@@ -18,23 +18,21 @@ module RocketJob
           # Criteria is returning old school :fields instead of :projections
           options[:projection] = options.delete(:fields) if options.key?(:fields)
         else
-          column_names = column_names.collect(&:to_s)
-          column_names << "_id" if column_names.size.zero?
-
-          fields = options.delete(:fields) || {}
-          column_names.each { |col| fields[col] = 1 }
+          columns = columns.blank? ? ["_id"] : columns.collect(&:to_s)
+          fields = options.delete(:fields) || {}
+          columns.each { |col| fields[col] = 1 }
           options[:projection] = fields
 
           block =
-            if column_names.size == 1
-              column = column_names.first
+            if columns.size == 1
+              column = columns.first
               ->(document) { document[column] }
             else
-              ->(document) { column_names.collect { |c| document[c] } }
+              ->(document) { columns.collect { |c| document[c] } }
             end
         end
 
-        upload do |records|
+        upload(slice_batch_size: slice_batch_size) do |records|
           # Drop down to the mongo driver level to avoid constructing a Model for each document returned
           criteria.klass.collection.find(criteria.selector, options).each do |document|
             records << block.call(document)
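
An illustrative call (not part of the diff) using the new keyword form; the `Account` model and the `job.input` accessor on the calling batch job are assumptions:

  job.input.upload_mongo_query(Account.where(active: true), columns: ["_id", "state"])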
@@ -42,58 +40,48 @@ module RocketJob
           end
         end
 
-      def upload_arel(arel, *column_names, &block)
+      def upload_arel(arel, columns: nil, slice_batch_size: nil, &block)
         unless block
-          column_names = column_names.empty? ? [:id] : column_names.collect(&:to_sym)
+          columns = columns.blank? ? [:id] : columns.collect(&:to_sym)
 
           block =
-            if column_names.size == 1
-              column = column_names.first
-              ->(model) { model.send(column) }
+            if columns.size == 1
+              column = columns.first
+              ->(model) { model.public_send(column) }
             else
-              ->(model) { column_names.collect { |c| model.send(c) } }
+              ->(model) { columns.collect { |c| model.public_send(c) } }
             end
           # find_each requires the :id column in the query
-          selection = column_names.include?(:id) ? column_names : column_names + [:id]
+          selection = columns.include?(:id) ? columns : columns + [:id]
           arel = arel.select(selection)
         end
 
-        upload { |records| arel.find_each { |model| records << block.call(model) } }
+        upload(slice_batch_size: slice_batch_size) { |records| arel.find_each { |model| records << block.call(model) } }
       end
 
-      def upload_integer_range(start_id, last_id)
-        # Create indexes before uploading
-        create_indexes
-        count = 0
-        while start_id <= last_id
-          end_id = start_id + slice_size - 1
-          end_id = last_id if end_id > last_id
-          create!(records: [[start_id, end_id]])
-          start_id += slice_size
-          count += 1
+      def upload_integer_range(start_id, last_id, slice_batch_size: 1_000)
+        # Each "record" is actually a range of Integers which makes up each slice
+        upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+          while start_id <= last_id
+            end_id = start_id + slice_size - 1
+            end_id = last_id if end_id > last_id
+            records << [start_id, end_id]
+            start_id += slice_size
+          end
         end
-        count
-      rescue Exception => e
-        drop
-        raise(e)
       end
 
-      def upload_integer_range_in_reverse_order(start_id, last_id)
-        # Create indexes before uploading
-        create_indexes
-        end_id = last_id
-        count = 0
-        while end_id >= start_id
-          first_id = end_id - slice_size + 1
-          first_id = start_id if first_id.negative? || (first_id < start_id)
-          create!(records: [[first_id, end_id]])
-          end_id -= slice_size
-          count += 1
+      def upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: 1_000)
+        # Each "record" is actually a range of Integers which makes up each slice
+        upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+          end_id = last_id
+          while end_id >= start_id
+            first_id = end_id - slice_size + 1
+            first_id = start_id if first_id.negative? || (first_id < start_id)
+            records << [first_id, end_id]
+            end_id -= slice_size
+          end
         end
-        count
-      rescue Exception => e
-        drop
-        raise(e)
       end
 
       # Iterate over each failed record, if any
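
Illustrative calls (not part of the diff) showing the new keyword arguments; the `User` model and the `job.input` accessor on the calling batch job are assumptions:

  # Upload the id and email of matching users, inserting slices in batches of 100.
  job.input.upload_arel(User.where(active: true), columns: [:id, :email], slice_batch_size: 100)

  # Upload ranges of integers; each slice holds a single [first_id, last_id] range.
  job.input.upload_integer_range(1, 1_000_000)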
@@ -137,11 +125,11 @@ module RocketJob
         # TODO: Will it perform faster without the id sort?
         # I.e. Just process on a FIFO basis?
         document = all.queued.
-          sort("_id" => 1).
-          find_one_and_update(
-            {"$set" => {worker_name: worker_name, state: "running", started_at: Time.now}},
-            return_document: :after
-          )
+                   sort("_id" => 1).
+                   find_one_and_update(
+                     {"$set" => {worker_name: worker_name, state: "running", started_at: Time.now}},
+                     return_document: :after
+                   )
         document.collection_name = collection_name if document
         document
       end