rocketjob 6.0.0.rc3 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +24 -20
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  11. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  12. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  13. data/lib/rocket_job/jobs/on_demand_batch_job.rb +7 -5
  14. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  15. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  16. data/lib/rocket_job/plugins/cron.rb +60 -20
  17. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  18. data/lib/rocket_job/plugins/restart.rb +3 -110
  19. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  20. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +1 -2
  21. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  22. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  23. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  24. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  25. data/lib/rocket_job/sliced/input.rb +42 -54
  26. data/lib/rocket_job/sliced/slice.rb +7 -3
  27. data/lib/rocket_job/sliced/slices.rb +12 -9
  28. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  29. data/lib/rocket_job/sliced.rb +1 -19
  30. data/lib/rocket_job/version.rb +1 -1
  31. data/lib/rocketjob.rb +2 -2
  32. metadata +8 -10
  33. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  34. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  35. data/lib/rocket_job/batch/tabular.rb +0 -58
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6a04a33b0cd03bdf0a7cb948fc87dd6c7d7bb3b392e566a8c15df50b73e27459
4
- data.tar.gz: fc62e740a0a92bae8daf1f4ffbe199af1debcb84f8859aed10ea5954dc44c7b6
3
+ metadata.gz: e313f192b854d066258a614ceac1131851c8df94c7e08f7cea6681fff6946d69
4
+ data.tar.gz: 10804682bee08715671696db4610ce4f93679398398bc385e93619f5a3aca715
5
5
  SHA512:
6
- metadata.gz: 74cac01d253cf21a856e1ca4a5cf63d5e90320303bdf310cf90325c9cca242c4ed1b7a0a1c43ca00764f2f40d29822df6e6bee499c1bff56c9ddaa2401bc3862
7
- data.tar.gz: 1bbc47c7d869ef28fd578a7b2575f62957aa2f83f9fc927af1d6fba7866270b15fd21cef30007b78d58847137357c75ccae2d03545560d2ffe0d674fe34c1d0e
6
+ metadata.gz: 158675e5ddec87a8b277708887b037746e3f1573569edd5a8959eabdf5668b144553cbe6164844f2cf69bc603798b3b5b052697506dad8a29e8477afc62cc45f
7
+ data.tar.gz: 680efe5603de3649b7e09340a545d2e1df1e02697d34af9451869310e9f2a87bbd05423686bb7d40d60f104553f7311721870e52ac2171bb1491b3a8decaf439
data/README.md CHANGED
@@ -49,6 +49,32 @@ require "rocket_job/batch/tabular"
49
49
 
50
50
  It is important to migrate away from these plugins, since they will be removed in a future release.
51
51
 
52
+ #### Scheduled Jobs
53
+
54
+ For any scheduled jobs that include the `RocketJob::Plugins::Cron` plugin, the default behavior has changed
55
+ so that the scheduled job instance is created immediately after the currently scheduled instance starts.
56
+
57
+ To maintain the old behavior of creating the job when it fails, aborts, or completes, add the following line
58
+ to each of the applicable jobs:
59
+
60
+ ~~~ruby
61
+ self.cron_after_start = false
62
+ ~~~
63
+
64
+ Additionally, scheduled jobs will now prevent a new one from being created when another scheduled instance
65
+ of the same job is already queued, or running with the _same_ `cron_schedule`.
66
+
67
+ To maintain the old behavior of allowing multiple instances with the same cron schedule, add the following
68
+ line to each of the applicable jobs:
69
+
70
+ ~~~ruby
71
+ self.cron_singleton = false
72
+ ~~~
73
+
74
+ ##### Singleton
75
+
76
+ Since Scheduled jobs now implement their own singleton logic, remove the singleton plugin from any scheduled jobs.
77
+
52
78
  #### Upgrading Batch Jobs to Rocket Job v6
53
79
 
54
80
  Rocket Job v6 replaces the array of symbol type for `input_categories` and `output_categories`
@@ -72,34 +72,38 @@ module RocketJob
72
72
  end
73
73
 
74
74
  def input_category(category_name = :main)
75
+ return category_name if category_name.is_a?(Category::Input)
76
+ raise(ArgumentError, "Cannot supply Output Category to input category") if category_name.is_a?(Category::Output)
77
+
75
78
  category_name = category_name.to_sym
76
- category = nil
77
- # .find does not work against this association
78
- input_categories.each { |catg| category = catg if catg.name == category_name }
79
- unless category
80
- # Auto-register main input category if missing
81
- if category_name == :main
82
- category = Category::Input.new
83
- self.input_categories = [category]
84
- else
85
- raise(ArgumentError,
86
- "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}")
87
- end
79
+ # find does not work against this association
80
+ input_categories.each { |category| return category if category.name == category_name }
81
+
82
+ unless category_name == :main
83
+ raise(
84
+ ArgumentError,
85
+ "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}"
86
+ )
88
87
  end
88
+
89
+ # Auto-register main input category when not defined
90
+ category = Category::Input.new(job: self)
91
+ self.input_categories << category
89
92
  category
90
93
  end
91
94
 
92
95
  def output_category(category_name = :main)
96
+ return category_name if category_name.is_a?(Category::Output)
97
+ raise(ArgumentError, "Cannot supply Input Category to output category") if category_name.is_a?(Category::Input)
98
+
93
99
  category_name = category_name.to_sym
94
- category = nil
95
100
  # .find does not work against this association
96
- output_categories.each { |catg| category = catg if catg.name == category_name }
97
- unless category
98
- raise(ArgumentError,
99
- "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}")
100
- end
101
+ output_categories.each { |category| return category if category.name == category_name }
101
102
 
102
- category
103
+ raise(
104
+ ArgumentError,
105
+ "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}"
106
+ )
103
107
  end
104
108
 
105
109
  # Returns [true|false] whether the named category has already been defined
@@ -214,7 +218,7 @@ module RocketJob
214
218
  category.tabular.render(row)
215
219
  end
216
220
 
217
- # Migrate existing v4 batch jobs to v5.0
221
+ # Migrate existing v5 batch jobs to v6
218
222
  def rocketjob_categories_migrate
219
223
  return unless attribute_present?(:input_categories) && self[:input_categories]&.first.is_a?(Symbol)
220
224
 
@@ -14,11 +14,9 @@ module RocketJob
14
14
  # Default: None ( Uses the single default input collection for this job )
15
15
  # Validates: This value must be one of those listed in #input_categories
16
16
  def input(category = :main)
17
- raise(ArgumentError, "Cannot supply Output Category to input category") if category.is_a?(Category::Output)
17
+ category = input_category(category)
18
18
 
19
- category = input_category(category) unless category.is_a?(Category::Input)
20
-
21
- (@inputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:input, category, self)
19
+ (@inputs ||= {})[category.name] ||= category.data_store(self)
22
20
  end
23
21
 
24
22
  # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -30,11 +28,9 @@ module RocketJob
30
28
  # Default: None ( Uses the single default output collection for this job )
31
29
  # Validates: This value must be one of those listed in #output_categories
32
30
  def output(category = :main)
33
- raise(ArgumentError, "Cannot supply Input Category to output category") if category.is_a?(Category::Input)
34
-
35
- category = output_category(category) unless category.is_a?(Category::Output)
31
+ category = output_category(category)
36
32
 
37
- (@outputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:output, category, self)
33
+ (@outputs ||= {})[category.name] ||= category.data_store(self)
38
34
  end
39
35
 
40
36
  # Rapidly upload individual records in batches.
@@ -59,19 +55,19 @@ module RocketJob
59
55
  # The category or the name of the category to access or download data from
60
56
  # Default: None ( Uses the single default output collection for this job )
61
57
  # Validates: This value must be one of those listed in #input_categories
62
- def lookup_collection(category = :main)
63
- category = input_category(category) unless category.is_a?(Category::Input)
64
-
65
- collection = (@lookup_collections ||= {})[category.name]
66
-
67
- unless collection
68
- collection_name = "rocket_job.inputs.#{id}"
69
- collection_name << ".#{category.name}" unless category.name == :main
70
-
71
- @lookup_collections[category.name] ||=
72
- LookupCollection.new(Sliced::Slice.collection.database, collection_name)
73
- end
74
- end
58
+ # def lookup_collection(category = :main)
59
+ # category = input_category(category) unless category.is_a?(Category::Input)
60
+ #
61
+ # collection = (@lookup_collections ||= {})[category.name]
62
+ #
63
+ # unless collection
64
+ # collection_name = "rocket_job.inputs.#{id}"
65
+ # collection_name << ".#{category.name}" unless category.name == :main
66
+ #
67
+ # @lookup_collections[category.name] ||=
68
+ # LookupCollection.new(Sliced::Slice.collection.database, collection_name)
69
+ # end
70
+ # end
75
71
 
76
72
  # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
77
73
  #
@@ -154,53 +150,7 @@ module RocketJob
154
150
  # * If an io stream is supplied, it is read until it returns nil.
155
151
  # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
156
152
  # * CSV parsing is slow, so it is usually left for the workers to do.
157
- def upload(stream = nil, file_name: nil, category: :main, stream_mode: :line, on_first: nil, **args, &block)
158
- raise(ArgumentError, "Either stream, or a block must be supplied") unless stream || block
159
-
160
- category = input_category(category) unless category.is_a?(Category::Input)
161
- stream ||= category.file_name
162
- path = nil
163
-
164
- if stream
165
- path = IOStreams.new(stream)
166
- path.file_name = file_name if file_name
167
- category.file_name = path.file_name
168
-
169
- # Auto detect the format based on the upload file name if present.
170
- if category.format == :auto
171
- format = path.format
172
- if format
173
- # Rebuild tabular with the above file name
174
- category.reset_tabular
175
- category.format = format
176
- end
177
- end
178
- end
179
-
180
- # Tabular transformations required for upload?
181
- if category.tabular?
182
- # Remove non-printable characters from tabular input formats
183
- # Cannot change the length of fixed width lines
184
- replace = category.format == :fixed ? " " : ""
185
- path&.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
186
-
187
- # Extract the header line during the file upload when needed.
188
- on_first = rocket_job_upload_header_lambda(category, on_first) if category.tabular.header?
189
- end
190
-
191
- count =
192
- if block
193
- input(category).upload(on_first: on_first, &block)
194
- else
195
- input(category).upload(on_first: on_first) do |io|
196
- path.each(stream_mode, **args) { |line| io << line }
197
- end
198
- end
199
-
200
- self.record_count = (record_count || 0) + count
201
- count
202
- end
203
-
153
+ #
204
154
  # Upload results from an Arel into RocketJob::SlicedJob.
205
155
  #
206
156
  # Params
@@ -227,18 +177,13 @@ module RocketJob
227
177
  #
228
178
  # Example: Upload user_name and zip_code
229
179
  # arel = User.where(country_code: 'US')
230
- # job.upload_arel(arel, :user_name, :zip_code)
180
+ # job.upload_arel(arel, columns: [:user_name, :zip_code])
231
181
  #
232
182
  # Notes:
233
183
  # * Only call from one thread at a time against a single instance of this job.
234
184
  # * The record_count for the job is set to the number of records returned by the arel.
235
185
  # * If an exception is raised while uploading data, the input collection is cleared out
236
186
  # so that if a job is retried during an upload failure, data is not duplicated.
237
- def upload_arel(arel, *column_names, category: :main, &block)
238
- count = input(category).upload_arel(arel, *column_names, &block)
239
- self.record_count = (record_count || 0) + count
240
- count
241
- end
242
187
 
243
188
  # Upload the result of a MongoDB query to the input collection for processing
244
189
  # Useful when an entire MongoDB collection, or part thereof needs to be
@@ -266,24 +211,19 @@ module RocketJob
266
211
  # criteria = User.where(state: 'FL')
267
212
  # job.record_count = job.upload_mongo_query(criteria)
268
213
  #
269
- # Example: Upload just the supplied column
214
+ # Example: Upload only the specified column(s)
270
215
  # criteria = User.where(state: 'FL')
271
- # job.record_count = job.upload_mongo_query(criteria, :zip_code)
216
+ # job.record_count = job.upload_mongo_query(criteria, columns: [:zip_code])
272
217
  #
273
218
  # Notes:
274
219
  # * Only call from one thread at a time against a single instance of this job.
275
220
  # * The record_count for the job is set to the number of records returned by the mongo query.
276
221
  # * If an exception is raised while uploading data, the input collection is cleared out
277
222
  # so that if a job is retried during an upload failure, data is not duplicated.
278
- def upload_mongo_query(criteria, *column_names, category: :main, &block)
279
- count = input(category).upload_mongo_query(criteria, *column_names, &block)
280
- self.record_count = (record_count || 0) + count
281
- count
282
- end
283
223
 
284
224
  # Upload sliced range of integer requests as arrays of start and end ids.
285
225
  #
286
- # Returns [Integer] last_id - start_id + 1.
226
+ # Returns [Integer] the number of slices uploaded.
287
227
  #
288
228
  # Uploads one range per slice so that the response can return multiple records
289
229
  # for each slice processed
@@ -302,17 +242,11 @@ module RocketJob
302
242
  # * The record_count for the job is set to: last_id - start_id + 1.
303
243
  # * If an exception is raised while uploading data, the input collection is cleared out
304
244
  # so that if a job is retried during an upload failure, data is not duplicated.
305
- def upload_integer_range(start_id, last_id, category: :main)
306
- input(category).upload_integer_range(start_id, last_id)
307
- count = last_id - start_id + 1
308
- self.record_count = (record_count || 0) + count
309
- count
310
- end
311
245
 
312
246
  # Upload sliced range of integer requests as an arrays of start and end ids
313
247
  # starting with the last range first
314
248
  #
315
- # Returns [Integer] last_id - start_id + 1.
249
+ # Returns [Integer] the number of slices uploaded.
316
250
  #
317
251
  # Uploads one range per slice so that the response can return multiple records
318
252
  # for each slice processed.
@@ -334,14 +268,102 @@ module RocketJob
334
268
  # * The record_count for the job is set to: last_id - start_id + 1.
335
269
  # * If an exception is raised while uploading data, the input collection is cleared out
336
270
  # so that if a job is retried during an upload failure, data is not duplicated.
337
- def upload_integer_range_in_reverse_order(start_id, last_id, category: :main)
338
- input(category).upload_integer_range_in_reverse_order(start_id, last_id)
339
- count = last_id - start_id + 1
271
+
272
+ def upload(object = nil, category: :main, file_name: nil, stream_mode: nil, on_first: nil, columns: nil, slice_batch_size: nil, **args, &block)
273
+ input_collection = input(category)
274
+
275
+ if block
276
+ raise(ArgumentError, "Cannot supply both an object to upload, and a block.") if object
277
+ if stream_mode || columns || slice_batch_size || args.size > 0
278
+ raise(ArgumentError, "Unknown keyword arguments when uploading a block. Only accepts :category, :file_name, or :on_first")
279
+ end
280
+
281
+ category = input_category(category)
282
+ category.file_name = file_name if file_name
283
+
284
+ # Extract the header line during the upload when applicable.
285
+ extract_header = category.extract_header_callback(on_first)
286
+
287
+ count = input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size, &block)
288
+ self.record_count = (record_count || 0) + count
289
+ return count
290
+ end
291
+
292
+ count =
293
+ case object
294
+ when Range
295
+ if file_name || stream_mode || on_first || args.size > 0
296
+ raise(ArgumentError, "Unknown keyword arguments when uploading a Range. Only accepts :category, :columns, or :slice_batch_size")
297
+ end
298
+
299
+ first = object.first
300
+ last = object.last
301
+ if first < last
302
+ input_collection.upload_integer_range(first, last, slice_batch_size: slice_batch_size || 1_000)
303
+ else
304
+ input_collection.upload_integer_range_in_reverse_order(last, first, slice_batch_size: slice_batch_size || 1_000)
305
+ end
306
+ when Mongoid::Criteria
307
+ if file_name || stream_mode || on_first || args.size > 0
308
+ raise(ArgumentError, "Unknown keyword arguments when uploading a Mongoid::Criteria. Only accepts :category, :columns, or :slice_batch_size")
309
+ end
310
+
311
+ input_collection.upload_mongo_query(object, columns: columns, slice_batch_size: slice_batch_size, &block)
312
+ when defined?(ActiveRecord::Relation) ? ActiveRecord::Relation : false
313
+ if file_name || stream_mode || on_first || args.size > 0
314
+ raise(ArgumentError, "Unknown keyword arguments when uploading an ActiveRecord::Relation. Only accepts :category, :columns, or :slice_batch_size")
315
+ end
316
+
317
+ input_collection.upload_arel(object, columns: columns, slice_batch_size: slice_batch_size, &block)
318
+
319
+ else
320
+ raise(ArgumentError, "Unknown keyword argument :columns when uploading a file") if columns
321
+
322
+ category = input_category(category)
323
+
324
+ # Extract the header line during the upload when applicable.
325
+ extract_header = category.extract_header_callback(on_first)
326
+ path = category.upload_path(object, original_file_name: file_name)
327
+
328
+ input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size) do |io|
329
+ path.each(stream_mode || :line, **args) { |line| io << line }
330
+ end
331
+
332
+ end
333
+
334
+ self.record_count = (record_count || 0) + count
335
+ count
336
+ end
337
+
338
+ # @deprecated
339
+ def upload_arel(arel, *column_names, category: :main, &block)
340
+ count = input(category).upload_arel(arel, columns: column_names, &block)
340
341
  self.record_count = (record_count || 0) + count
341
342
  count
342
343
  end
343
344
 
344
- # Upload the supplied slices for processing by workers
345
+ # @deprecated
346
+ def upload_mongo_query(criteria, *column_names, category: :main, &block)
347
+ count = input(category).upload_mongo_query(criteria, columns: column_names, &block)
348
+ self.record_count = (record_count || 0) + count
349
+ count
350
+ end
351
+
352
+ # @deprecated
353
+ def upload_integer_range(start_id, last_id, category: :main, slice_batch_size: 1_000)
354
+ count = input(category).upload_integer_range(start_id, last_id, slice_batch_size: slice_batch_size)
355
+ self.record_count = (record_count || 0) + count
356
+ count
357
+ end
358
+
359
+ # @deprecated
360
+ def upload_integer_range_in_reverse_order(start_id, last_id, category: :main, slice_batch_size: 1_000)
361
+ count = input(category).upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: slice_batch_size)
362
+ self.record_count = (record_count || 0) + count
363
+ count
364
+ end
365
+
366
+ # Upload the supplied slice for processing by workers
345
367
  #
346
368
  # Updates the record_count after adding the records
347
369
  #
@@ -427,50 +449,28 @@ module RocketJob
427
449
  # Store the output file name in the category
428
450
  category.file_name = stream if !block && (stream.is_a?(String) || stream.is_a?(IOStreams::Path))
429
451
 
430
- if output_collection.binary?
431
- raise(ArgumentError, "A `header_line` is not supported with binary output collections") if header_line
432
-
433
- return output_collection.download(&block) if block
452
+ header_line ||= category.render_header
434
453
 
435
- IOStreams.new(stream || category.file_name).stream(:none).writer(**args) do |io|
436
- output_collection.download { |record| io << record[:binary] }
437
- end
438
- else
439
- header_line ||= category.render_header
454
+ return output_collection.download(header_line: header_line, &block) if block
440
455
 
441
- return output_collection.download(header_line: header_line, &block) if block
456
+ raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
442
457
 
443
- raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
458
+ if output_collection.slice_class.binary_format
459
+ binary_header_line = output_collection.slice_class.to_binary(header_line) if header_line
444
460
 
461
+ # Don't overwrite supplied stream options if any
462
+ stream = stream&.is_a?(IOStreams::Stream) ? stream.dup : IOStreams.new(category.file_name)
463
+ stream.remove_from_pipeline(output_collection.slice_class.binary_format)
464
+ stream.writer(**args) do |io|
465
+ # TODO: Binary formats should return the record count, instead of the slice count.
466
+ output_collection.download(header_line: binary_header_line) { |record| io.write(record) }
467
+ end
468
+ else
445
469
  IOStreams.new(stream || category.file_name).writer(:line, **args) do |io|
446
470
  output_collection.download(header_line: header_line) { |record| io << record }
447
471
  end
448
472
  end
449
473
  end
450
-
451
- private
452
-
453
- # Return a lambda to extract the header row from the uploaded file.
454
- def rocket_job_upload_header_lambda(category, on_first)
455
- case category.mode
456
- when :line
457
- lambda do |line|
458
- category.tabular.parse_header(line)
459
- category.cleanse_header!
460
- category.columns = category.tabular.header.columns
461
- # Call chained on_first if present
462
- on_first&.call(line)
463
- end
464
- when :array
465
- lambda do |row|
466
- category.tabular.header.columns = row
467
- category.cleanse_header!
468
- category.columns = category.tabular.header.columns
469
- # Call chained on_first if present
470
- on_first&.call(line)
471
- end
472
- end
473
- end
474
474
  end
475
475
  end
476
476
  end