rocketjob 6.0.0.rc1 → 6.0.1

Files changed (48)
  1. checksums.yaml +4 -4
  2. data/README.md +164 -8
  3. data/lib/rocket_job/batch/categories.rb +25 -18
  4. data/lib/rocket_job/batch/io.rb +130 -130
  5. data/lib/rocket_job/batch/performance.rb +2 -2
  6. data/lib/rocket_job/batch/statistics.rb +2 -2
  7. data/lib/rocket_job/batch/throttle_running_workers.rb +1 -1
  8. data/lib/rocket_job/batch/worker.rb +14 -12
  9. data/lib/rocket_job/batch.rb +0 -1
  10. data/lib/rocket_job/category/base.rb +10 -7
  11. data/lib/rocket_job/category/input.rb +61 -1
  12. data/lib/rocket_job/category/output.rb +9 -0
  13. data/lib/rocket_job/cli.rb +1 -1
  14. data/lib/rocket_job/dirmon_entry.rb +1 -1
  15. data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
  16. data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
  17. data/lib/rocket_job/job_exception.rb +1 -1
  18. data/lib/rocket_job/jobs/conversion_job.rb +43 -0
  19. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  20. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  21. data/lib/rocket_job/jobs/on_demand_batch_job.rb +15 -11
  22. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  23. data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -97
  24. data/lib/rocket_job/jobs/upload_file_job.rb +6 -3
  25. data/lib/rocket_job/lookup_collection.rb +4 -3
  26. data/lib/rocket_job/plugins/cron.rb +60 -20
  27. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  28. data/lib/rocket_job/plugins/job/throttle.rb +2 -2
  29. data/lib/rocket_job/plugins/restart.rb +3 -110
  30. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  31. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +43 -0
  32. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  33. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  34. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  35. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  36. data/lib/rocket_job/sliced/input.rb +42 -54
  37. data/lib/rocket_job/sliced/slice.rb +7 -3
  38. data/lib/rocket_job/sliced/slices.rb +12 -9
  39. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  40. data/lib/rocket_job/sliced/writer/output.rb +0 -1
  41. data/lib/rocket_job/sliced.rb +1 -19
  42. data/lib/rocket_job/throttle_definitions.rb +7 -1
  43. data/lib/rocket_job/version.rb +1 -1
  44. data/lib/rocketjob.rb +4 -5
  45. metadata +12 -12
  46. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  47. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  48. data/lib/rocket_job/batch/tabular.rb +0 -58
@@ -67,6 +67,8 @@ module RocketJob
     # Returns [Integer] the number of records processed in the slice
     #
     # Note: The slice will be removed from processing when this method completes
+    #
+    # @deprecated Please open a ticket if you need this behavior.
     def work_first_slice(&block)
       raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before
 
@@ -96,7 +98,7 @@ module RocketJob
       case sub_state
       when :before, :after
         if running? && (server_name.nil? || worker_on_server?(server_name))
-          servers << ActiveWorker.new(worker_name, started_at, self) if running?
+          servers << ActiveWorker.new(worker_name, started_at, self)
         end
       when :processing
         query = input.running
@@ -142,19 +144,19 @@ module RocketJob
     # Perform individual slice without callbacks
     def rocket_job_perform_slice(slice, &block)
       slice.processing_record_number ||= 0
-      records = []
       append = false
 
-      # Skip processed records in this slice if it has no output categpries.
-      if slice.processing_record_number > 1
-        records = slice.records[slice.processing_record_number - 1..-1]
-        append = true
-        logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
-      else
-        # Reprocess all records in this slice.
-        slice.processing_record_number = 0
-        records = slice.records
-      end
+      # Skip processed records in this slice if it has no output categories.
+      records =
+        if slice.processing_record_number.to_i > 1
+          append = true
+          logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
+          slice.records[slice.processing_record_number - 1..-1]
+        else
+          # Reprocess all records in this slice.
+          slice.processing_record_number = 0
+          slice.records
+        end
 
       count = 0
       RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
@@ -30,6 +30,5 @@ module RocketJob
     autoload :ThrottleWindows, "rocket_job/batch/throttle_windows"
     autoload :Result, "rocket_job/batch/result"
     autoload :Results, "rocket_job/batch/results"
-    autoload :Tabular, "rocket_job/batch/tabular"
   end
 end
@@ -11,7 +11,6 @@ module RocketJob
 
       # Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
       field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
-      validates_inclusion_of :serializer, in: [:none, :compress, :encrypt, :bzip2]
 
       # The header columns when the file does not include a header row.
       # Note:
@@ -49,10 +48,12 @@ module RocketJob
           Sliced::CompressedSlice
         when :encrypt
           Sliced::EncryptedSlice
-        when :bzip2
+        when :bzip2, :bz2
           Sliced::BZip2OutputSlice
+        when :encrypted_bz2
+          Sliced::EncryptedBZip2OutputSlice
         else
-          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, or :bzip2")
+          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
         end
       end
 
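For context, a sketch of opting into the new serializer from a job class. The `output_category` class-level DSL is assumed from the wider RocketJob 6 API (it appears in the ConversionJob added elsewhere in this release), not from this hunk:

    # Sketch: select the new encrypted bzip2 output serializer on a batch job.
    # Illustrative only; class and perform body are placeholders.
    class ExportJob < RocketJob::Job
      include RocketJob::Batch

      output_category serializer: :encrypted_bz2

      def perform(record)
        record
      end
    end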
@@ -65,14 +66,16 @@ module RocketJob
         )
       end
 
-      def reset_tabular
-        @tabular = nil
-      end
-
       # Returns [true|false] whether this category has the attributes defined for tabular to work.
       def tabular?
         format.present?
       end
+
+      def build_collection_name(direction, job)
+        collection_name = "rocket_job.#{direction}s.#{job.id}"
+        collection_name << ".#{name}" unless name == :main
+        collection_name
+      end
     end
   end
 end
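For reference, a sketch of the names this helper produces (job id shown as a placeholder; no suffix is appended for the default :main category):

    # Illustrative only: collection names built by build_collection_name.
    input_category.build_collection_name(:input, job)
    # => "rocket_job.inputs.<job.id>"           (name == :main)
    errors_category.build_collection_name(:output, job)
    # => "rocket_job.outputs.<job.id>.errors"   (name == :errors)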
@@ -10,6 +10,7 @@ module RocketJob
 
       # Slice size for this input collection
       field :slice_size, type: Integer, default: 100
+      validates_presence_of :slice_size
 
       #
       # The fields below only apply if the field `format` has been set:
@@ -82,7 +83,7 @@ module RocketJob
       field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
       validates :header_cleanser, inclusion: %i[default none]
 
-      validates_presence_of :slice_size
+      validates_inclusion_of :serializer, in: %i[none compress encrypt]
 
       # Cleanses the header column names when `cleanse_header` is true
       def cleanse_header!
@@ -105,6 +106,65 @@ module RocketJob
           skip_unknown: skip_unknown
         )
       end
+
+      def data_store(job)
+        RocketJob::Sliced::Input.new(
+          collection_name: build_collection_name(:input, job),
+          slice_class:     serializer_class,
+          slice_size:      slice_size
+        )
+      end
+
+      # Returns [IOStreams::Path] of file to upload.
+      # Auto-detects file format from file name when format is :auto.
+      def upload_path(stream = nil, original_file_name: nil)
+        unless stream || file_name
+          raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
+        end
+
+        path           = IOStreams.new(stream || file_name)
+        path.file_name = original_file_name if original_file_name
+        self.file_name = path.file_name
+
+        # Auto detect the format based on the upload file name if present.
+        if format == :auto
+          self.format = path.format || :csv
+          # Rebuild tabular with new values.
+          @tabular = nil
+        end
+
+        # Remove non-printable characters from tabular input formats.
+        if tabular?
+          # Cannot change the length of fixed width lines.
+          replace = format == :fixed ? " " : ""
+          path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
+        end
+        path
+      end
+
+      # Return a lambda to extract the header row from the uploaded file.
+      def extract_header_callback(on_first)
+        return on_first unless tabular? && tabular.header?
+
+        case mode
+        when :line
+          lambda do |line|
+            tabular.parse_header(line)
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        when :array
+          lambda do |row|
+            tabular.header.columns = row
+            cleanse_header!
+            self.columns = category.tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        end
+      end
     end
   end
 end
@@ -13,6 +13,8 @@ module RocketJob
       # false: do not save nil values to the output categories.
       field :nils, type: ::Mongoid::Boolean, default: false
 
+      validates_inclusion_of :serializer, in: %i[none compress encrypt bz2 encrypted_bz2 bzip2]
+
       # Renders [String] the header line.
       # Returns [nil] if no header is needed.
       def render_header
@@ -20,6 +22,13 @@ module RocketJob
 
         tabular.render_header
       end
+
+      def data_store(job)
+        RocketJob::Sliced::Output.new(
+          collection_name: build_collection_name(:output, job),
+          slice_class:     serializer_class
+        )
+      end
     end
   end
 end
@@ -233,7 +233,7 @@ module RocketJob
 
     # Parse command line options placing results in the corresponding instance variables
     def parse(argv)
-      parser        = OptionParser.new do |o|
+      parser = OptionParser.new do |o|
        o.on("-n", "--name NAME", "Unique Name of this server (Default: host_name:PID)") do |arg|
          Config.name = arg
        end
@@ -173,7 +173,7 @@ module RocketJob
       counts
     end
 
-    # Passes each filename [Pathname] found that matches the pattern into the supplied block
+    # Yields [IOStreams::Path] for each file found that matches the current pattern.
     def each
       SemanticLogger.named_tagged(dirmon_entry: id.to_s) do
         # Case insensitive filename matching
@@ -4,8 +4,8 @@ module Mongoid
   class Mongo
     def initialize(criteria)
       @criteria = criteria
-      @klass = criteria.klass
-      @cache = criteria.options[:cache]
+      @klass    = criteria.klass
+      @cache    = criteria.options[:cache]
       # Only line changed is here, get collection name from criteria, not @klass
       # @collection = @klass.collection
       @collection = criteria.collection
@@ -55,13 +55,13 @@ module ActiveJob
   # - Completed jobs will not appear in completed since the Active Job adapter
   #   uses the default Rocket Job `destroy_on_completion` of `false`.
   class RocketJobAdapter
-    def self.enqueue(active_job) #:nodoc:
+    def self.enqueue(active_job)
       job = RocketJob::Jobs::ActiveJob.create!(active_job_params(active_job))
       active_job.provider_job_id = job.id.to_s if active_job.respond_to?(:provider_job_id=)
       job
     end
 
-    def self.enqueue_at(active_job, timestamp) #:nodoc:
+    def self.enqueue_at(active_job, timestamp)
       params = active_job_params(active_job)
       params[:run_at] = Time.at(timestamp).utc
 
@@ -23,7 +23,7 @@ module RocketJob
       new(
         args.merge(
           class_name: exc.class.name,
-          message: exc.message,
+          message: exc.message.to_s.encode("UTF-8", replace: ""),
           backtrace: exc.backtrace || []
         )
       )
@@ -0,0 +1,43 @@
+# Convert to and from CSV, JSON, xlsx, and PSV files.
+#
+# Example, Convert CSV file to JSON.
+#   job = RocketJob::Jobs::ConversionJob.new
+#   job.input_category.file_name  = "data.csv"
+#   job.output_category.file_name = "data.json"
+#   job.save!
+#
+# Example, Convert JSON file to PSV and compress it with GZip.
+#   job = RocketJob::Jobs::ConversionJob.new
+#   job.input_category.file_name  = "data.json"
+#   job.output_category.file_name = "data.psv.gz"
+#   job.save!
+#
+# Example, Read a CSV file that has been zipped from a remote website and then convert it to a GZipped json file.
+#   job = RocketJob::Jobs::ConversionJob.new
+#   job.input_category.file_name  = "https://example.org/file.zip"
+#   job.output_category.file_name = "data.json.gz"
+#   job.save!
+#
+module RocketJob
+  module Jobs
+    class ConversionJob < RocketJob::Job
+      include RocketJob::Batch
+
+      self.destroy_on_complete = false
+
+      # Detects file extension for its type
+      input_category format: :auto
+      output_category format: :auto
+
+      # Upload the file specified in `input_category.file_name` unless already uploaded.
+      before_batch :upload, unless: :record_count
+
+      # When the job completes it will write the result to `output_category.file_name`.
+      after_batch :cleanup!, :download
+
+      def perform(hash)
+        hash
+      end
+    end
+  end
+end
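A quick way to exercise the new job from a console (a sketch: `perform_now` is assumed from the core RocketJob::Job API for inline execution, not from this diff):

    # Illustrative: convert CSV to JSON without waiting for a worker process.
    job = RocketJob::Jobs::ConversionJob.new
    job.input_category.file_name  = "data.csv"
    job.output_category.file_name = "data.json"
    job.perform_now  # uploads, performs each slice, then downloads the output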
@@ -30,59 +30,48 @@ module RocketJob
   #
   # If another DirmonJob instance is already queued or running, then the create
   # above will fail with:
-  #   MongoMapper::DocumentNotValid: Validation failed: State Another instance of this job is already queued or running
+  #   Validation failed: State Another instance of this job is already queued or running
   #
   # Or to start DirmonJob and ignore errors if already running
   #   RocketJob::Jobs::DirmonJob.create
   class DirmonJob < RocketJob::Job
-    # Only allow one DirmonJob instance to be running at a time
-    include RocketJob::Plugins::Singleton
-    # Start a new job when this one completes, fails, or aborts
-    include RocketJob::Plugins::Restart
+    include RocketJob::Plugins::Cron
 
-    self.priority = 30
-
-    # Number of seconds between directory scans. Default 5 mins
-    field :check_seconds, type: Float, default: 300.0, copy_on_restart: true
+    # Runs every 5 minutes by default
+    self.cron_schedule = "*/5 * * * * UTC"
+    self.description   = "Directory Monitor"
+    self.priority      = 30
 
     # Hash[file_name, size]
     field :previous_file_names, type: Hash, default: {}, copy_on_restart: true
 
-    before_create :set_run_at
-
-    # Iterate over each Dirmon entry looking for new files
-    # If a new file is found, it is not processed immediately, instead
-    # it is passed to the next run of this job along with the file size.
-    # If the file size has not changed, the Job is kicked off.
+    # Checks the directories for new files, starting jobs if files have not changed since the last run.
    def perform
      check_directories
    end
 
    private
 
-    # Set a run_at when a new instance of this job is created
-    def set_run_at
-      self.run_at = Time.now + check_seconds
-    end
-
-    # Checks the directories for new files, starting jobs if files have not changed
-    # since the last run
+    # Iterate over each Dirmon Entry looking for new files
+    # If a new file is found, it is not processed immediately, instead
+    # it is passed to the next run of this job along with the file size.
+    # If the file size has not changed, the Job is kicked off.
    def check_directories
      new_file_names = {}
-      DirmonEntry.enabled.each do |entry|
-        entry.each do |iopath|
-          # S3 files are only visible once completely uploaded.
-          unless iopath.partial_files_visible?
-            logger.info("File: #{iopath}. Starting: #{entry.job_class_name}")
-            entry.later(iopath)
+      DirmonEntry.enabled.each do |dirmon_entry|
+        dirmon_entry.each do |path|
+          # Skip file size checking since S3 files are only visible once completely uploaded.
+          unless path.partial_files_visible?
+            logger.info("File: #{path}. Starting: #{dirmon_entry.job_class_name}")
+            dirmon_entry.later(path)
            next
          end
 
          # BSON Keys cannot contain periods
-          key = iopath.to_s.tr(".", "_")
+          key = path.to_s.tr(".", "_")
          previous_size = previous_file_names[key]
          # Check every few minutes for a file size change before trying to process the file.
-          size = check_file(entry, iopath, previous_size)
+          size = check_file(dirmon_entry, path, previous_size)
          new_file_names[key] = size if size
        end
      end
@@ -91,14 +80,14 @@ module RocketJob
 
    # Checks if a file should result in starting a job
    # Returns [Integer] file size, or nil if the file started a job
-    def check_file(entry, iopath, previous_size)
-      size = iopath.size
+    def check_file(dirmon_entry, path, previous_size)
+      size = path.size
      if previous_size && (previous_size == size)
-        logger.info("File stabilized: #{iopath}. Starting: #{entry.job_class_name}")
-        entry.later(iopath)
+        logger.info("File stabilized: #{path}. Starting: #{dirmon_entry.job_class_name}")
+        dirmon_entry.later(path)
        nil
      else
-        logger.info("Found file: #{iopath}. File size: #{size}")
+        logger.info("Found file: #{path}. File size: #{size}")
        # Keep for the next run
        size
      end
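Since DirmonJob now uses the Cron plugin instead of Restart/Singleton, the scan interval is controlled by `cron_schedule` rather than the removed `check_seconds` field. A sketch (schedule value illustrative):

    # Start the directory monitor, overriding the default 5-minute schedule.
    RocketJob::Jobs::DirmonJob.create!(cron_schedule: "* * * * * UTC")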
@@ -27,12 +27,11 @@ module RocketJob
   #   )
   class HousekeepingJob < RocketJob::Job
     include RocketJob::Plugins::Cron
-    include RocketJob::Plugins::Singleton
 
-    self.priority = 25
-    self.description = "Cleans out historical jobs, and zombie servers."
-    # Runs every 15 minutes
-    self.cron_schedule = "*/15 * * * * UTC"
+    # Runs every 15 minutes on the 15 minute period
+    self.cron_schedule = "0,15,30,45 * * * * UTC"
+    self.description   = "Cleans out historical jobs, and zombie servers."
+    self.priority      = 25
 
     # Whether to destroy zombie servers automatically
     field :destroy_zombies, type: Mongoid::Boolean, default: true, user_editable: true, copy_on_restart: true
@@ -65,27 +65,29 @@ module RocketJob
   module Jobs
     class OnDemandBatchJob < RocketJob::Job
       include RocketJob::Plugins::Cron
+      include RocketJob::Plugins::Retry
       include RocketJob::Batch
       include RocketJob::Batch::Statistics
 
       self.priority            = 90
-      self.description         = "Batch Job"
+      self.description         = "On Demand Batch Job"
       self.destroy_on_complete = false
+      self.retry_limit         = 0
 
       # Code that is performed against every row / record.
-      field :code, type: String
+      field :code, type: String, user_editable: true, copy_on_restart: true
 
       # Optional code to execute before the batch is run.
       # Usually to upload data into the job.
-      field :before_code, type: String
+      field :before_code, type: String, user_editable: true, copy_on_restart: true
 
       # Optional code to execute after the batch is run.
       # Usually to upload data into the job.
-      field :after_code, type: String
+      field :after_code, type: String, user_editable: true, copy_on_restart: true
 
       # Data that is made available to the job during the perform.
       # Be sure to store key names only as Strings, not Symbols.
-      field :data, type: Hash, default: {}
+      field :data, type: Hash, default: {}, user_editable: true, copy_on_restart: true
 
       validates :code, presence: true
       validate :validate_code
@@ -96,12 +98,14 @@ module RocketJob
       before_batch :run_before_code
       after_batch :run_after_code
 
-      # Make this job collect its output
-      #   :nils [true|false]
-      #     Whether to skip the output from `code` when it is nil
-      #     Default: false
-      def collect_output(nils: false)
-        self.output_categories = [RocketJob::Category::Output.new(nils: nils)]
+      # Shortcut for setting the slice_size
+      def slice_size=(slice_size)
+        input_category.slice_size = slice_size
+      end
+
+      # Add a new output category and collect output for it.
+      def add_output_category(**args)
+        self.output_categories << RocketJob::Category::Output.new(**args)
       end
 
       private
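A sketch of the new helpers in use. The `code` string and the block form of `upload` are assumptions drawn from the wider OnDemandBatchJob and Batch::IO APIs, not from this hunk:

    # Illustrative: build an on-demand batch job with the helpers added above.
    job = RocketJob::Jobs::OnDemandBatchJob.new(code: "row * 2")
    job.slice_size = 1_000           # delegates to input_category.slice_size
    job.add_output_category(nils: false)
    job.upload do |records|          # assumed Batch::IO block-style upload
      records << 1
      records << 2
    end
    job.save!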
@@ -78,8 +78,8 @@ module RocketJob
       self.retry_limit = 0
 
       # Be sure to store key names only as Strings, not Symbols
-      field :data, type: Hash, default: {}, copy_on_restart: true
-      field :code, type: String, copy_on_restart: true
+      field :data, type: Hash, default: {}, user_editable: true, copy_on_restart: true
+      field :code, type: String, user_editable: true, copy_on_restart: true
 
       validates :code, presence: true
       validate :validate_code