rocketjob 6.0.0.rc3 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +24 -20
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  11. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  12. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  13. data/lib/rocket_job/jobs/on_demand_batch_job.rb +7 -5
  14. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  15. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  16. data/lib/rocket_job/plugins/cron.rb +60 -20
  17. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  18. data/lib/rocket_job/plugins/restart.rb +3 -110
  19. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  20. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +1 -2
  21. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  22. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  23. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  24. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  25. data/lib/rocket_job/sliced/input.rb +42 -54
  26. data/lib/rocket_job/sliced/slice.rb +7 -3
  27. data/lib/rocket_job/sliced/slices.rb +12 -9
  28. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  29. data/lib/rocket_job/sliced.rb +1 -19
  30. data/lib/rocket_job/version.rb +1 -1
  31. data/lib/rocketjob.rb +2 -2
  32. metadata +8 -10
  33. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  34. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  35. data/lib/rocket_job/batch/tabular.rb +0 -58
data/lib/rocket_job/plugins/job/persistence.rb
@@ -70,6 +70,29 @@ module RocketJob
  end
  end
 
+ # Create a new instance of this job, copying across only the `copy_on_restart` attributes.
+ # Copy across input and output categories to new scheduled job so that all of the
+ # settings are remembered between instance. Example: slice_size
+ def create_restart!(**overrides)
+ if expired?
+ logger.info("Job has expired. Not creating a new instance.")
+ return
+ end
+
+ job_attrs = self.class.rocket_job_restart_attributes.each_with_object({}) do |attr, attrs|
+ attrs[attr] = send(attr)
+ end
+ job_attrs.merge!(overrides)
+
+ job = self.class.new(job_attrs)
+ job.input_categories = input_categories if respond_to?(:input_categories)
+ job.output_categories = output_categories if respond_to?(:output_categories)
+
+ job.save_with_retry!
+
+ logger.info("Created a new job instance: #{job.id}")
+ end
+
  # Set in-memory job to complete if `destroy_on_complete` and the job has been destroyed
  def reload
  return super unless destroy_on_complete
@@ -85,6 +108,19 @@ module RocketJob
  self
  end
  end
+
+ # Save with retry in case persistence takes a moment.
+ def save_with_retry!(retry_limit = 10, sleep_interval = 0.5)
+ count = 0
+ while count < retry_limit
+ return true if save
+
+ logger.info("Retrying to persist new scheduled instance: #{errors.messages.inspect}")
+ sleep(sleep_interval)
+ count += 1
+ end
+ save!
+ end
  end
  end
  end
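
A minimal usage sketch of the new `create_restart!` helper above. The `ReportJob` class and its `limit` field are hypothetical; the `copy_on_restart: true` flag, the keyword overrides, and the retrying save come from the code in this hunk.

    # Hypothetical job used only to illustrate create_restart!.
    class ReportJob < RocketJob::Job
      # Fields flagged with copy_on_restart: true are carried over to the new instance.
      field :limit, type: Integer, copy_on_restart: true

      def perform
        # ... do the work ...
      end
    end

    job = ReportJob.create!(limit: 10)

    # Builds a new queued instance, copying `limit` across and applying the override,
    # then persists it via save_with_retry! (up to 10 saves, 0.5s apart, then save!).
    # Returns without creating anything if the job has expired.
    job.create_restart!(limit: 20)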
data/lib/rocket_job/plugins/restart.rb
@@ -2,128 +2,21 @@ require "active_support/concern"
 
  module RocketJob
  module Plugins
- # Automatically starts a new instance of this job anytime it fails, aborts, or completes.
- #
- # Notes:
- # * Restartable jobs automatically abort if they fail. This prevents the failed job from being retried.
- # - To disable this behavior, add the following empty method:
- # def rocket_job_restart_abort
- # end
- # * On destroy this job is destroyed without starting a new instance.
- # * On Abort a new instance is created.
- # * Include `RocketJob::Plugins::Singleton` to prevent multiple copies of a job from running at
- # the same time.
- # * The job will not be restarted if:
- # - A validation fails after creating the new instance of this job.
- # - The job has expired.
- # * Only the fields that have `copy_on_restart: true` will be passed onto the new instance of this job.
- #
- # Example:
- #
- # class RestartableJob < RocketJob::Job
- # include RocketJob::Plugins::Restart
- #
- # # Retain the completed job under the completed tab in Rocket Job Web Interface.
- # self.destroy_on_complete = false
- #
- # # Will be copied to the new job on restart.
- # field :limit, type: Integer, copy_on_restart: true
- #
- # # Will _not_ be copied to the new job on restart.
- # field :list, type: Array, default: [1,2,3]
- #
- # # Set run_at every time a new instance of the job is created.
- # after_initialize set_run_at, if: :new_record?
- #
- # def perform
- # puts "The limit is #{limit}"
- # puts "The list is #{list}"
- # 'DONE'
- # end
- #
- # private
- #
- # # Run this job in 30 minutes.
- # def set_run_at
- # self.run_at = 30.minutes.from_now
- # end
- # end
- #
- # job = RestartableJob.create!(limit: 10, list: [4,5,6])
- # job.reload.state
- # # => :queued
- #
- # job.limit
- # # => 10
- #
- # job.list
- # # => [4,5,6]
- #
- # # Wait 30 minutes ...
- #
- # job.reload.state
- # # => :completed
- #
- # # A new instance was automatically created.
- # job2 = RestartableJob.last
- # job2.state
- # # => :queued
- #
- # job2.limit
- # # => 10
- #
- # job2.list
- # # => [1,2,3]
+ # @deprecated
  module Restart
  extend ActiveSupport::Concern
 
  included do
- after_abort :rocket_job_restart_new_instance
- after_complete :rocket_job_restart_new_instance
+ after_abort :create_restart!
+ after_complete :create_restart!
  after_fail :rocket_job_restart_abort
  end
 
  private
 
- # Run again in the future, even if this run fails with an exception
- def rocket_job_restart_new_instance
- if expired?
- logger.info("Job has expired. Not creating a new instance.")
- return
- end
- job_attrs =
- rocket_job_restart_attributes.each_with_object({}) { |attr, attrs| attrs[attr] = send(attr) }
- job = self.class.new(job_attrs)
-
- # Copy across input and output categories to new scheduled job so that all of the
- # settings are remembered between instance. Example: slice_size
- job.input_categories = input_categories if respond_to?(:input_categories)
- job.output_categories = output_categories if respond_to?(:output_categories)
-
- rocket_job_restart_save(job)
- end
-
  def rocket_job_restart_abort
  new_record? ? abort : abort!
  end
-
- # Allow Singleton to prevent the creation of a new job if one is already running
- # Retry since the delete may not have persisted to disk yet.
- def rocket_job_restart_save(job, retry_limit = 10, sleep_interval = 0.5)
- count = 0
- while count < retry_limit
- if job.save
- logger.info("Created a new job instance: #{job.id}")
- return true
- else
- logger.info("Job already active, retrying after a short sleep")
- sleep(sleep_interval)
- end
- count += 1
- end
- logger.error("New job instance not started: #{job.errors.messages.inspect}")
- false
- end
  end
  end
  end
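
The long usage example removed above still describes how the plugin behaves; as a rough sketch under that assumption, including the now-deprecated plugin simply wires the restart callbacks to `create_restart!`. `NightlyExtractJob` and its field are hypothetical.

    # Sketch only: the plugin is marked @deprecated, but including it still restarts
    # the job after it completes or aborts via the create_restart! shown earlier.
    class NightlyExtractJob < RocketJob::Job
      include RocketJob::Plugins::Restart

      # Keep completed jobs visible in the Rocket Job Web Interface.
      self.destroy_on_complete = false

      # Copied to each new instance on restart.
      field :extract_date, type: Date, copy_on_restart: true

      def perform
        # ... extract data for extract_date ...
      end
    end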
data/lib/rocket_job/plugins/state_machine.rb
@@ -36,8 +36,8 @@ module RocketJob
  raise(ArgumentError, "Cannot supply both a method name and a block") if methods.size.positive? && block
  raise(ArgumentError, "Must supply either a method name or a block") unless methods.size.positive? || block
 
- # TODO: Somehow get AASM to support options such as :if and :unless to be consistent with other callbacks
- # For example:
+ # Limitation with AASM. It only supports guards on event transitions, not for callbacks.
+ # For example, AASM does not support callback options such as :if and :unless, yet Rails callbacks do.
  # before_start :my_callback, unless: :encrypted?
  # before_start :my_callback, if: :encrypted?
  event = aasm.state_machine.events[event_name]
data/lib/rocket_job/plugins/throttle_dependent_jobs.rb
@@ -11,8 +11,7 @@ module RocketJob
  extend ActiveSupport::Concern
 
  included do
- class_attribute :dependent_jobs
- self.dependent_jobs = nil
+ field :dependent_jobs, type: Array, class_attribute: true, user_editable: true, copy_on_restart: true
 
  define_throttle :dependent_job_exists?
  define_batch_throttle :dependent_job_exists? if respond_to?(:define_batch_throttle)
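
A short sketch of how the reworked `dependent_jobs` setting could be used now that it is a field rather than a bare `class_attribute`. The `ImportJob` class and the job names in the list are hypothetical; the include path is assumed from the file name.

    # Sketch: dependent_jobs is now an Array field (class_attribute: true,
    # user_editable, copy_on_restart), so it can be set per class or per instance.
    class ImportJob < RocketJob::Job
      include RocketJob::Plugins::ThrottleDependentJobs

      # Hold this job while any of these jobs are still running.
      self.dependent_jobs = ["ExtractJob", "TransformJob"]

      def perform
        # ... import ...
      end
    end

    # Because it is a field, it can also be overridden when enqueuing a single job:
    ImportJob.create!(dependent_jobs: ["ExtractJob"])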
data/lib/rocket_job/sliced/bzip2_output_slice.rb
@@ -7,36 +7,35 @@ module RocketJob
  # * The `bzip2` linux command line utility supports multiple embedded BZip2 stream,
  # but some other custom implementations may not. They may only read the first slice and stop.
  # * It is only designed for use on output collections.
- #
- # To download the output when using this slice:
- #
- # # Download the binary BZip2 streams into a single file
- # IOStreams.path(output_file_name).stream(:none).writer do |io|
- # job.download { |slice| io << slice[:binary] }
- # end
  class BZip2OutputSlice < ::RocketJob::Sliced::Slice
- # This is a specialized binary slice for creating binary data from each slice
+ # This is a specialized binary slice for creating BZip2 binary data from each slice
  # that must be downloaded as-is into output files.
- def self.binary?
- true
+ def self.binary_format
+ :bz2
+ end
+
+ # Compress the supplied records with BZip2
+ def self.to_binary(records, record_delimiter = "\n")
+ return [] if records.blank?
+
+ lines = Array(records).join(record_delimiter) + record_delimiter
+ s = StringIO.new
+ IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+ s.string
  end
 
  private
 
+ # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
  def parse_records
- records = attributes.delete("records")
-
  # Convert BSON::Binary to a string
- @records = [{binary: records.data}]
+ @records = [attributes.delete("records").data]
  end
 
+ # Returns [BSON::Binary] the records compressed using BZip2 into a string.
  def serialize_records
- return [] if @records.nil? || @records.empty?
-
- lines = records.to_a.join("\n") + "\n"
- s = StringIO.new
- IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
- BSON::Binary.new(s.string)
+ # TODO: Make the line terminator configurable
+ BSON::Binary.new(self.class.to_binary(@records))
  end
  end
  end
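
The new class-level `to_binary` helper can be exercised directly; a small sketch, with the record values and the output file name being illustrative only.

    records = ["line 1", "line 2", "line 3"]

    # Compress one slice's records into a single newline-delimited BZip2 stream.
    compressed = RocketJob::Sliced::BZip2OutputSlice.to_binary(records)

    # Streams can be appended to one .bz2 file, since the format allows multiple
    # concatenated BZip2 streams (see the note above about other readers).
    File.open("output.csv.bz2", "ab") { |file| file.write(compressed) }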
data/lib/rocket_job/sliced/compressed_slice.rb
@@ -6,13 +6,10 @@ module RocketJob
  private
 
  def parse_records
- records = attributes.delete("records")
-
  # Convert BSON::Binary to a string
- binary_str = records.data
-
- str = Zlib::Inflate.inflate(binary_str)
- @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+ compressed_str = attributes.delete("records").data
+ decompressed_str = Zlib::Inflate.inflate(compressed_str)
+ @records = Hash.from_bson(BSON::ByteBuffer.new(decompressed_str))["r"]
  end
 
  def serialize_records
data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb
@@ -0,0 +1,49 @@
+ module RocketJob
+ module Sliced
+ # This is a specialized output serializer that renders each output slice as a single BZip2 compressed stream.
+ # BZip2 allows multiple output streams to be written into a single BZip2 file.
+ #
+ # Notes:
+ # * The `bzip2` linux command line utility supports multiple embedded BZip2 stream,
+ # but some other custom implementations may not. They may only read the first slice and stop.
+ # * It is only designed for use on output collections.
+ class EncryptedBZip2OutputSlice < ::RocketJob::Sliced::Slice
+ # This is a specialized binary slice for creating BZip2 binary data from each slice
+ # that must be downloaded as-is into output files.
+ def self.binary_format
+ :bz2
+ end
+
+ private
+
+ # Returns [Hash] the BZip2 compressed binary data in binary form when reading back from Mongo.
+ def parse_records
+ # Convert BSON::Binary to a string
+ encrypted_str = attributes.delete("records").data
+
+ # Decrypt string
+ header = SymmetricEncryption::Header.new
+ header.parse(encrypted_str)
+ # Use the header that is present to decrypt the data, since its version could be different
+ decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
+
+ @records = [decrypted_str]
+ end
+
+ # Returns [BSON::Binary] the records compressed using BZip2 into a string.
+ def serialize_records
+ return [] if @records.nil? || @records.empty?
+
+ # TODO: Make the line terminator configurable
+ lines = records.to_a.join("\n") + "\n"
+ s = StringIO.new
+ IOStreams::Bzip2::Writer.stream(s) { |io| io.write(lines) }
+
+ # Encrypt to binary without applying an encoding such as Base64
+ # Use a random_iv with each encryption for better security
+ data = SymmetricEncryption.cipher.binary_encrypt(s.string, random_iv: true, compress: false)
+ BSON::Binary.new(data)
+ end
+ end
+ end
+ end
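
A sketch of how the new slice class might be selected for a batch job's output. The `:encrypted_bz2` serializer symbol and the `ExportJob` class are assumptions based on the class name; the exact serializer mapping lives in `RocketJob::Category::Base`, which also changed in this release.

    # Assumed wiring: output_category picks EncryptedBZip2OutputSlice when the
    # serializer is set to :encrypted_bz2 (symbol not confirmed by this diff).
    class ExportJob < RocketJob::Job
      include RocketJob::Batch

      # Each output slice is stored encrypted and BZip2 compressed.
      output_category serializer: :encrypted_bz2

      def perform(record)
        record.to_s
      end
    end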
data/lib/rocket_job/sliced/encrypted_slice.rb
@@ -6,17 +6,15 @@ module RocketJob
  private
 
  def parse_records
- records = attributes.delete("records")
-
  # Convert BSON::Binary to a string
- binary_str = records.data
+ encrypted_str = attributes.delete("records").data
 
  header = SymmetricEncryption::Header.new
- header.parse(binary_str)
+ header.parse(encrypted_str)
  # Use the header that is present to decrypt the data, since its version could be different
- str = header.cipher.binary_decrypt(binary_str, header: header)
+ decrypted_str = header.cipher.binary_decrypt(encrypted_str, header: header)
 
- @records = Hash.from_bson(BSON::ByteBuffer.new(str))["r"]
+ @records = Hash.from_bson(BSON::ByteBuffer.new(decrypted_str))["r"]
  end
 
  def serialize_records
data/lib/rocket_job/sliced/input.rb
@@ -1,16 +1,16 @@
  module RocketJob
  module Sliced
  class Input < Slices
- def upload(on_first: nil, &block)
+ def upload(**args, &block)
  # Create indexes before uploading
  create_indexes
- Writer::Input.collect(self, on_first: on_first, &block)
+ Writer::Input.collect(self, **args, &block)
  rescue Exception => e
  drop
  raise(e)
  end
 
- def upload_mongo_query(criteria, *column_names, &block)
+ def upload_mongo_query(criteria, columns: [], slice_batch_size: nil, &block)
  options = criteria.options
 
  # Without a block extract the fields from the supplied criteria
@@ -18,23 +18,21 @@ module RocketJob
  # Criteria is returning old school :fields instead of :projections
  options[:projection] = options.delete(:fields) if options.key?(:fields)
  else
- column_names = column_names.collect(&:to_s)
- column_names << "_id" if column_names.size.zero?
-
- fields = options.delete(:fields) || {}
- column_names.each { |col| fields[col] = 1 }
+ columns = columns.blank? ? ["_id"] : columns.collect(&:to_s)
+ fields = options.delete(:fields) || {}
+ columns.each { |col| fields[col] = 1 }
  options[:projection] = fields
 
  block =
- if column_names.size == 1
- column = column_names.first
+ if columns.size == 1
+ column = columns.first
  ->(document) { document[column] }
  else
- ->(document) { column_names.collect { |c| document[c] } }
+ ->(document) { columns.collect { |c| document[c] } }
  end
  end
 
- upload do |records|
+ upload(slice_batch_size: slice_batch_size) do |records|
  # Drop down to the mongo driver level to avoid constructing a Model for each document returned
  criteria.klass.collection.find(criteria.selector, options).each do |document|
  records << block.call(document)
@@ -42,58 +40,48 @@ module RocketJob
  end
  end
 
- def upload_arel(arel, *column_names, &block)
+ def upload_arel(arel, columns: nil, slice_batch_size: nil, &block)
  unless block
- column_names = column_names.empty? ? [:id] : column_names.collect(&:to_sym)
+ columns = columns.blank? ? [:id] : columns.collect(&:to_sym)
 
  block =
- if column_names.size == 1
- column = column_names.first
- ->(model) { model.send(column) }
+ if columns.size == 1
+ column = columns.first
+ ->(model) { model.public_send(column) }
  else
- ->(model) { column_names.collect { |c| model.send(c) } }
+ ->(model) { columns.collect { |c| model.public_send(c) } }
  end
  # find_each requires the :id column in the query
- selection = column_names.include?(:id) ? column_names : column_names + [:id]
+ selection = columns.include?(:id) ? columns : columns + [:id]
  arel = arel.select(selection)
  end
 
- upload { |records| arel.find_each { |model| records << block.call(model) } }
+ upload(slice_batch_size: slice_batch_size) { |records| arel.find_each { |model| records << block.call(model) } }
  end
 
- def upload_integer_range(start_id, last_id)
- # Create indexes before uploading
- create_indexes
- count = 0
- while start_id <= last_id
- end_id = start_id + slice_size - 1
- end_id = last_id if end_id > last_id
- create!(records: [[start_id, end_id]])
- start_id += slice_size
- count += 1
+ def upload_integer_range(start_id, last_id, slice_batch_size: 1_000)
+ # Each "record" is actually a range of Integers which makes up each slice
+ upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+ while start_id <= last_id
+ end_id = start_id + slice_size - 1
+ end_id = last_id if end_id > last_id
+ records << [start_id, end_id]
+ start_id += slice_size
+ end
  end
- count
- rescue Exception => e
- drop
- raise(e)
  end
 
- def upload_integer_range_in_reverse_order(start_id, last_id)
- # Create indexes before uploading
- create_indexes
- end_id = last_id
- count = 0
- while end_id >= start_id
- first_id = end_id - slice_size + 1
- first_id = start_id if first_id.negative? || (first_id < start_id)
- create!(records: [[first_id, end_id]])
- end_id -= slice_size
- count += 1
+ def upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: 1_000)
+ # Each "record" is actually a range of Integers which makes up each slice
+ upload(slice_size: 1, slice_batch_size: slice_batch_size) do |records|
+ end_id = last_id
+ while end_id >= start_id
+ first_id = end_id - slice_size + 1
+ first_id = start_id if first_id.negative? || (first_id < start_id)
+ records << [first_id, end_id]
+ end_id -= slice_size
+ end
  end
- count
- rescue Exception => e
- drop
- raise(e)
  end
 
  # Iterate over each failed record, if any
@@ -137,11 +125,11 @@ module RocketJob
  # TODO: Will it perform faster without the id sort?
  # I.e. Just process on a FIFO basis?
  document = all.queued.
- sort("_id" => 1).
- find_one_and_update(
- {"$set" => {worker_name: worker_name, state: "running", started_at: Time.now}},
- return_document: :after
- )
+ sort("_id" => 1).
+ find_one_and_update(
+ {"$set" => {worker_name: worker_name, state: "running", started_at: Time.now}},
+ return_document: :after
+ )
  document.collection_name = collection_name if document
  document
  end
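
The upload helpers above now take keyword arguments instead of splatted column names; a rough sketch of the new call style against a batch job's input collection. `MyBatchJob`, the `User` and `Order` models, column names, and batch sizes are illustrative only.

    job = MyBatchJob.new # hypothetical batch job with a main input category

    # Active Record relations: columns are now a keyword argument.
    job.input.upload_arel(User.where(active: true), columns: [:id, :email], slice_batch_size: 100)

    # Mongoid criteria follow the same keyword style.
    job.input.upload_mongo_query(Order.where(state: "pending"), columns: ["_id", "total"])

    # Integer ranges now go through upload() as well, one range per slice.
    job.input.upload_integer_range(1, 1_000_000, slice_batch_size: 1_000)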