RubyGems - rocketjob - Versions diffs - 6.0.0.rc3 → 6.0.3 - Mend

rocketjob 6.0.0.rc3 → 6.0.3

Files changed (38) hide show

checksums.yaml +4 -4
data/README.md +26 -0
data/lib/rocket_job/batch/categories.rb +26 -24
data/lib/rocket_job/batch/io.rb +128 -128
data/lib/rocket_job/batch/worker.rb +14 -12
data/lib/rocket_job/category/base.rb +10 -7
data/lib/rocket_job/category/input.rb +61 -1
data/lib/rocket_job/category/output.rb +9 -0
data/lib/rocket_job/dirmon_entry.rb +1 -1
data/lib/rocket_job/job_exception.rb +1 -1
data/lib/rocket_job/jobs/conversion_job.rb +21 -17
data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
data/lib/rocket_job/jobs/on_demand_batch_job.rb +11 -5
data/lib/rocket_job/jobs/on_demand_job.rb +6 -2
data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
data/lib/rocket_job/plugins/cron.rb +60 -20
data/lib/rocket_job/plugins/job/persistence.rb +36 -0
data/lib/rocket_job/plugins/restart.rb +3 -110
data/lib/rocket_job/plugins/state_machine.rb +2 -2
data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +10 -5
data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
data/lib/rocket_job/sliced/input.rb +42 -54
data/lib/rocket_job/sliced/slice.rb +7 -3
data/lib/rocket_job/sliced/slices.rb +12 -9
data/lib/rocket_job/sliced/writer/input.rb +46 -18
data/lib/rocket_job/sliced.rb +1 -19
data/lib/rocket_job/subscribers/secret_config.rb +17 -0
data/lib/rocket_job/supervisor.rb +1 -0
data/lib/rocket_job/version.rb +1 -1
data/lib/rocketjob.rb +4 -3
metadata +11 -12
data/lib/rocket_job/batch/tabular/input.rb +0 -133
data/lib/rocket_job/batch/tabular/output.rb +0 -67
data/lib/rocket_job/batch/tabular.rb +0 -58

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6a04a33b0cd03bdf0a7cb948fc87dd6c7d7bb3b392e566a8c15df50b73e27459
-  data.tar.gz: fc62e740a0a92bae8daf1f4ffbe199af1debcb84f8859aed10ea5954dc44c7b6
+  metadata.gz: d37fc69678a20d2ab48a22daccbbff3238a4511d9fc711873c70640abe4b5d81
+  data.tar.gz: df5bc46066bf3359c7e75549cd75545f52fde6c4b5e6c0a4dfae0b63705ca3fe
 SHA512:
-  metadata.gz: 74cac01d253cf21a856e1ca4a5cf63d5e90320303bdf310cf90325c9cca242c4ed1b7a0a1c43ca00764f2f40d29822df6e6bee499c1bff56c9ddaa2401bc3862
-  data.tar.gz: 1bbc47c7d869ef28fd578a7b2575f62957aa2f83f9fc927af1d6fba7866270b15fd21cef30007b78d58847137357c75ccae2d03545560d2ffe0d674fe34c1d0e
+  metadata.gz: 18c563035502f1c1b9c0a37fb87f0563544ba83de0db5f7dec0ba663540a1db2a32b686cfc64c8056b99d7a3ce503b508ab206270feb277c42e79ca010debdf7
+  data.tar.gz: 71e00ec619377e422934ae0682b4322ffea63b3db307c06ab4a33a2e24fa149f599a9a579ab3b4d699cf5684d296d861b1cbbf2abc802b55002e721e35d4070e

data/README.md CHANGED Viewed

@@ -49,6 +49,32 @@ require "rocket_job/batch/tabular"
 It is important to migrate away from these plugins, since they will be removed in a future release.
+#### Scheduled Jobs
+For any scheduled jobs that include the `RocketJob::Plugins::Cron` plugin, the default behavior has changed
+so that the scheduled job instance is created immediately after the currently scheduled instance starts.
+To maintain the old behavior of creating the job when it fails, aborts, or completes, add the following line
+to each of the applicable jobs:
+~~~ruby
+self.cron_after_start = false
+~~~
+Additionally, scheduled jobs will now prevent a new one from being created when another scheduled instance
+of the same job is already queued, or running with the _same_ `cron_schedule`.
+To maintain the old behavior of allowing multiple instances with the same cron schedule, add the following
+line to each of the applicable jobs:
+~~~ruby
+self.cron_singleton = false
+~~~
+##### Singleton
+Since Scheduled jobs now implement their own singleton logic, remove the singleton plugin from any scheduled jobs.
 #### Upgrading Batch Jobs to Rocket Job v6
 Rocket Job v6 replaces the array of symbol type for `input_categories` and `output_categories`

data/lib/rocket_job/batch/categories.rb CHANGED Viewed

@@ -72,34 +72,37 @@ module RocketJob
       end
       def input_category(category_name = :main)
+        return category_name if category_name.is_a?(Category::Input)
+        raise(ArgumentError, "Cannot supply Output Category to input category") if category_name.is_a?(Category::Output)
+        # Initialize categories when this method is called before initialization is complete
+        rocketjob_categories_assign if input_categories.empty?
         category_name = category_name.to_sym
-        category      = nil
-        # .find does not work against this association
-        input_categories.each { |catg| category = catg if catg.name == category_name }
-        unless category
-          # Auto-register main input category if missing
-          if category_name == :main
-            category              = Category::Input.new
-            self.input_categories = [category]
-          else
-            raise(ArgumentError,
-                  "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}")
-          end
-        end
-        category
+        # find does not work against this association
+        input_categories.each { |category| return category if category.name == category_name }
+        raise(
+          ArgumentError,
+          "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}"
+        )
       end
       def output_category(category_name = :main)
+        return category_name if category_name.is_a?(Category::Output)
+        raise(ArgumentError, "Cannot supply Input Category to output category") if category_name.is_a?(Category::Input)
+        # Initialize categories when this method is called before initialization is complete
+        rocketjob_categories_assign if output_categories.empty? && self.class.defined_output_categories
         category_name = category_name.to_sym
-        category      = nil
         # .find does not work against this association
-        output_categories.each { |catg| category = catg if catg.name == category_name }
-        unless category
-          raise(ArgumentError,
-                "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}")
-        end
+        output_categories.each { |category| return category if category.name == category_name }
-        category
+        raise(
+          ArgumentError,
+          "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}"
+        )
       end
       # Returns [true|false] whether the named category has already been defined
@@ -150,7 +153,7 @@ module RocketJob
             end
         end
-        return if !self.class.defined_output_categories || !output_categories.empty?
+        return if !output_categories.empty? || !self.class.defined_output_categories
         # Output categories default to nil if none were set in the class
         self.output_categories = self.class.defined_output_categories.deep_dup
@@ -160,7 +163,6 @@ module RocketJob
       def rocketjob_categories_output_render
         return if @rocket_job_output.nil?
-        # TODO: ..
         return unless output_categories
         return if output_categories.empty?
@@ -214,7 +216,7 @@ module RocketJob
         category.tabular.render(row)
       end
-      # Migrate existing v4 batch jobs to v5.0
+      # Migrate existing v5 batch jobs to v6
       def rocketjob_categories_migrate
         return unless attribute_present?(:input_categories) && self[:input_categories]&.first.is_a?(Symbol)

data/lib/rocket_job/batch/io.rb CHANGED Viewed

@@ -14,11 +14,9 @@ module RocketJob
       #     Default: None ( Uses the single default input collection for this job )
       #     Validates: This value must be one of those listed in #input_categories
       def input(category = :main)
-        raise(ArgumentError, "Cannot supply Output Category to input category") if category.is_a?(Category::Output)
+        category = input_category(category)
-        category = input_category(category) unless category.is_a?(Category::Input)
-        (@inputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:input, category, self)
+        (@inputs ||= {})[category.name] ||= category.data_store(self)
       end
       # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -30,11 +28,9 @@ module RocketJob
       #     Default: None ( Uses the single default output collection for this job )
       #     Validates: This value must be one of those listed in #output_categories
       def output(category = :main)
-        raise(ArgumentError, "Cannot supply Input Category to output category") if category.is_a?(Category::Input)
-        category = output_category(category) unless category.is_a?(Category::Output)
+        category = output_category(category)
-        (@outputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:output, category, self)
+        (@outputs ||= {})[category.name] ||= category.data_store(self)
       end
       # Rapidly upload individual records in batches.
@@ -59,19 +55,19 @@ module RocketJob
       #     The category or the name of the category to access or download data from
       #     Default: None ( Uses the single default output collection for this job )
       #     Validates: This value must be one of those listed in #input_categories
-      def lookup_collection(category = :main)
-        category = input_category(category) unless category.is_a?(Category::Input)
-        collection = (@lookup_collections ||= {})[category.name]
-        unless collection
-          collection_name = "rocket_job.inputs.#{id}"
-          collection_name << ".#{category.name}" unless category.name == :main
-          @lookup_collections[category.name] ||=
-            LookupCollection.new(Sliced::Slice.collection.database, collection_name)
-        end
-      end
+      # def lookup_collection(category = :main)
+      #   category = input_category(category) unless category.is_a?(Category::Input)
+      #
+      #   collection = (@lookup_collections ||= {})[category.name]
+      #
+      #   unless collection
+      #     collection_name = "rocket_job.inputs.#{id}"
+      #     collection_name << ".#{category.name}" unless category.name == :main
+      #
+      #     @lookup_collections[category.name] ||=
+      #       LookupCollection.new(Sliced::Slice.collection.database, collection_name)
+      #   end
+      # end
       # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
       #
@@ -154,53 +150,7 @@ module RocketJob
       # * If an io stream is supplied, it is read until it returns nil.
       # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
       # * CSV parsing is slow, so it is usually left for the workers to do.
-      def upload(stream = nil, file_name: nil, category: :main, stream_mode: :line, on_first: nil, **args, &block)
-        raise(ArgumentError, "Either stream, or a block must be supplied") unless stream || block
-        category = input_category(category) unless category.is_a?(Category::Input)
-        stream ||= category.file_name
-        path     = nil
-        if stream
-          path               = IOStreams.new(stream)
-          path.file_name     = file_name if file_name
-          category.file_name = path.file_name
-          # Auto detect the format based on the upload file name if present.
-          if category.format == :auto
-            format = path.format
-            if format
-              # Rebuild tabular with the above file name
-              category.reset_tabular
-              category.format = format
-            end
-          end
-        end
-        # Tabular transformations required for upload?
-        if category.tabular?
-          # Remove non-printable characters from tabular input formats
-          # Cannot change the length of fixed width lines
-          replace = category.format == :fixed ? " " : ""
-          path&.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
-          # Extract the header line during the file upload when needed.
-          on_first = rocket_job_upload_header_lambda(category, on_first) if category.tabular.header?
-        end
-        count =
-          if block
-            input(category).upload(on_first: on_first, &block)
-          else
-            input(category).upload(on_first: on_first) do |io|
-              path.each(stream_mode, **args) { |line| io << line }
-            end
-          end
-        self.record_count = (record_count || 0) + count
-        count
-      end
+      #
       # Upload results from an Arel into RocketJob::SlicedJob.
       #
       # Params
@@ -227,18 +177,13 @@ module RocketJob
       #
       # Example: Upload user_name and zip_code
       #   arel = User.where(country_code: 'US')
-      #   job.upload_arel(arel, :user_name, :zip_code)
+      #   job.upload_arel(arel, columns: [:user_name, :zip_code])
       #
       # Notes:
       # * Only call from one thread at a time against a single instance of this job.
       # * The record_count for the job is set to the number of records returned by the arel.
       # * If an exception is raised while uploading data, the input collection is cleared out
       #   so that if a job is retried during an upload failure, data is not duplicated.
-      def upload_arel(arel, *column_names, category: :main, &block)
-        count             = input(category).upload_arel(arel, *column_names, &block)
-        self.record_count = (record_count || 0) + count
-        count
-      end
       # Upload the result of a MongoDB query to the input collection for processing
       # Useful when an entire MongoDB collection, or part thereof needs to be
@@ -266,24 +211,19 @@ module RocketJob
       #   criteria = User.where(state: 'FL')
       #   job.record_count = job.upload_mongo_query(criteria)
       #
-      # Example: Upload just the supplied column
+      # Example: Upload only the specified column(s)
       #   criteria = User.where(state: 'FL')
-      #   job.record_count = job.upload_mongo_query(criteria, :zip_code)
+      #   job.record_count = job.upload_mongo_query(criteria, columns: [:zip_code])
       #
       # Notes:
       # * Only call from one thread at a time against a single instance of this job.
       # * The record_count for the job is set to the number of records returned by the mongo query.
       # * If an exception is raised while uploading data, the input collection is cleared out
       #   so that if a job is retried during an upload failure, data is not duplicated.
-      def upload_mongo_query(criteria, *column_names, category: :main, &block)
-        count             = input(category).upload_mongo_query(criteria, *column_names, &block)
-        self.record_count = (record_count || 0) + count
-        count
-      end
       # Upload sliced range of integer requests as arrays of start and end ids.
       #
-      # Returns [Integer] last_id - start_id + 1.
+      # Returns [Integer] the number of slices uploaded.
       #
       # Uploads one range per slice so that the response can return multiple records
       # for each slice processed
@@ -302,17 +242,11 @@ module RocketJob
       # * The record_count for the job is set to: last_id - start_id + 1.
       # * If an exception is raised while uploading data, the input collection is cleared out
       #   so that if a job is retried during an upload failure, data is not duplicated.
-      def upload_integer_range(start_id, last_id, category: :main)
-        input(category).upload_integer_range(start_id, last_id)
-        count             = last_id - start_id + 1
-        self.record_count = (record_count || 0) + count
-        count
-      end
       # Upload sliced range of integer requests as arrays of start and end ids
       # starting with the last range first
       #
-      # Returns [Integer] last_id - start_id + 1.
+      # Returns [Integer] the number of slices uploaded.
       #
       # Uploads one range per slice so that the response can return multiple records
       # for each slice processed.
@@ -334,14 +268,102 @@ module RocketJob
       # * The record_count for the job is set to: last_id - start_id + 1.
       # * If an exception is raised while uploading data, the input collection is cleared out
       #   so that if a job is retried during an upload failure, data is not duplicated.
-      def upload_integer_range_in_reverse_order(start_id, last_id, category: :main)
-        input(category).upload_integer_range_in_reverse_order(start_id, last_id)
-        count             = last_id - start_id + 1
+      def upload(object = nil, category: :main, file_name: nil, stream_mode: nil, on_first: nil, columns: nil, slice_batch_size: nil, **args, &block)
+        input_collection = input(category)
+        if block
+          raise(ArgumentError, "Cannot supply both an object to upload, and a block.") if object
+          if stream_mode || columns || slice_batch_size || args.size > 0
+            raise(ArgumentError, "Unknown keyword arguments when uploading a block. Only accepts :category, :file_name, or :on_first")
+          end
+          category           = input_category(category)
+          category.file_name = file_name if file_name
+          # Extract the header line during the upload when applicable.
+          extract_header = category.extract_header_callback(on_first)
+          count             = input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size, &block)
+          self.record_count = (record_count || 0) + count
+          return count
+        end
+        count =
+          case object
+          when Range
+            if file_name || stream_mode || on_first || args.size > 0
+              raise(ArgumentError, "Unknown keyword arguments when uploading a Range. Only accepts :category, :columns, or :slice_batch_size")
+            end
+            first = object.first
+            last  = object.last
+            if first < last
+              input_collection.upload_integer_range(first, last, slice_batch_size: slice_batch_size || 1_000)
+            else
+              input_collection.upload_integer_range_in_reverse_order(last, first, slice_batch_size: slice_batch_size || 1_000)
+            end
+          when Mongoid::Criteria
+            if file_name || stream_mode || on_first || args.size > 0
+              raise(ArgumentError, "Unknown keyword arguments when uploading a Mongoid::Criteria. Only accepts :category, :columns, or :slice_batch_size")
+            end
+            input_collection.upload_mongo_query(object, columns: columns, slice_batch_size: slice_batch_size, &block)
+          when defined?(ActiveRecord::Relation) ? ActiveRecord::Relation : false
+            if file_name || stream_mode || on_first || args.size > 0
+              raise(ArgumentError, "Unknown keyword arguments when uploading an ActiveRecord::Relation. Only accepts :category, :columns, or :slice_batch_size")
+            end
+            input_collection.upload_arel(object, columns: columns, slice_batch_size: slice_batch_size, &block)
+          else
+            raise(ArgumentError, "Unknown keyword argument :columns when uploading a file") if columns
+            category = input_category(category)
+            # Extract the header line during the upload when applicable.
+            extract_header = category.extract_header_callback(on_first)
+            path = category.upload_path(object, original_file_name: file_name)
+            input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size) do |io|
+              path.each(stream_mode || :line, **args) { |line| io << line }
+            end
+          end
+        self.record_count = (record_count || 0) + count
+        count
+      end
+      # @deprecated
+      def upload_arel(arel, *column_names, category: :main, &block)
+        count             = input(category).upload_arel(arel, columns: column_names, &block)
         self.record_count = (record_count || 0) + count
         count
       end
-      # Upload the supplied slices for processing by workers
+      # @deprecated
+      def upload_mongo_query(criteria, *column_names, category: :main, &block)
+        count             = input(category).upload_mongo_query(criteria, columns: column_names, &block)
+        self.record_count = (record_count || 0) + count
+        count
+      end
+      # @deprecated
+      def upload_integer_range(start_id, last_id, category: :main, slice_batch_size: 1_000)
+        count             = input(category).upload_integer_range(start_id, last_id, slice_batch_size: slice_batch_size)
+        self.record_count = (record_count || 0) + count
+        count
+      end
+      # @deprecated
+      def upload_integer_range_in_reverse_order(start_id, last_id, category: :main, slice_batch_size: 1_000)
+        count             = input(category).upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: slice_batch_size)
+        self.record_count = (record_count || 0) + count
+        count
+      end
+      # Upload the supplied slice for processing by workers
       #
       # Updates the record_count after adding the records
       #
@@ -427,50 +449,28 @@ module RocketJob
         # Store the output file name in the category
         category.file_name = stream if !block && (stream.is_a?(String) || stream.is_a?(IOStreams::Path))
-        if output_collection.binary?
-          raise(ArgumentError, "A `header_line` is not supported with binary output collections") if header_line
-          return output_collection.download(&block) if block
+        header_line ||= category.render_header
-          IOStreams.new(stream || category.file_name).stream(:none).writer(**args) do |io|
-            output_collection.download { |record| io << record[:binary] }
-          end
-        else
-          header_line ||= category.render_header
+        return output_collection.download(header_line: header_line, &block) if block
-          return output_collection.download(header_line: header_line, &block) if block
+        raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
-          raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
+        if output_collection.slice_class.binary_format
+          binary_header_line = output_collection.slice_class.to_binary(header_line) if header_line
+          # Don't overwrite supplied stream options if any
+          stream = stream&.is_a?(IOStreams::Stream) ? stream.dup : IOStreams.new(category.file_name)
+          stream.remove_from_pipeline(output_collection.slice_class.binary_format)
+          stream.writer(**args) do |io|
+            # TODO: Binary formats should return the record count, instead of the slice count.
+            output_collection.download(header_line: binary_header_line) { |record| io.write(record) }
+          end
+        else
           IOStreams.new(stream || category.file_name).writer(:line, **args) do |io|
             output_collection.download(header_line: header_line) { |record| io << record }
           end
         end
       end
-      private
-      # Return a lambda to extract the header row from the uploaded file.
-      def rocket_job_upload_header_lambda(category, on_first)
-        case category.mode
-        when :line
-          lambda do |line|
-            category.tabular.parse_header(line)
-            category.cleanse_header!
-            category.columns = category.tabular.header.columns
-            # Call chained on_first if present
-            on_first&.call(line)
-          end
-        when :array
-          lambda do |row|
-            category.tabular.header.columns = row
-            category.cleanse_header!
-            category.columns = category.tabular.header.columns
-            # Call chained on_first if present
-            on_first&.call(line)
-          end
-        end
-      end
     end
   end
 end

data/lib/rocket_job/batch/worker.rb CHANGED Viewed

@@ -67,6 +67,8 @@ module RocketJob
       # Returns [Integer] the number of records processed in the slice
       #
       # Note: The slice will be removed from processing when this method completes
+      #
+      # @deprecated Please open a ticket if you need this behavior.
       def work_first_slice(&block)
         raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before
@@ -142,19 +144,19 @@ module RocketJob
       # Perform individual slice without callbacks
       def rocket_job_perform_slice(slice, &block)
         slice.processing_record_number ||= 0
-        records                        = []
         append                         = false
-        # Skip processed records in this slice if it has no output categpries.
-        if slice.processing_record_number > 1
-          records = slice.records[slice.processing_record_number - 1..-1]
-          append  = true
-          logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
-        else
-          # Reprocess all records in this slice.
-          slice.processing_record_number = 0
-          records                        = slice.records
-        end
+        # Skip processed records in this slice if it has no output categories.
+        records =
+          if slice.processing_record_number.to_i > 1
+            append = true
+            logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
+            slice.records[slice.processing_record_number - 1..-1]
+          else
+            # Reprocess all records in this slice.
+            slice.processing_record_number = 0
+            slice.records
+          end
         count = 0
         RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
@@ -246,7 +248,7 @@ module RocketJob
         unless new_record?
           # Fail job iff no other worker has already finished it
           # Must set write concern to at least 1 since we need the nModified back
-          result = self.class.with(write: {w: 1}) do |query|
+          result   = self.class.with(write: {w: 1}) do |query|
             query.
               where(id: id, state: :running, sub_state: :processing).
               update({"$set" => {state: :failed, worker_name: worker_name}})

data/lib/rocket_job/category/base.rb CHANGED Viewed

@@ -11,7 +11,6 @@ module RocketJob
         # Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
         field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
-        validates_inclusion_of :serializer, in: %i[none compress encrypt bzip2]
         # The header columns when the file does not include a header row.
         # Note:
@@ -49,10 +48,12 @@ module RocketJob
           Sliced::CompressedSlice
         when :encrypt
           Sliced::EncryptedSlice
-        when :bzip2
+        when :bzip2, :bz2
           Sliced::BZip2OutputSlice
+        when :encrypted_bz2
+          Sliced::EncryptedBZip2OutputSlice
         else
-          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, or :bzip2")
+          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
         end
       end
@@ -65,14 +66,16 @@ module RocketJob
         )
       end
-      def reset_tabular
-        @tabular = nil
-      end
       # Returns [true|false] whether this category has the attributes defined for tabular to work.
       def tabular?
         format.present?
       end
+      def build_collection_name(direction, job)
+        collection_name = "rocket_job.#{direction}s.#{job.id}"
+        collection_name << ".#{name}" unless name == :main
+        collection_name
+      end
     end
   end
 end

data/lib/rocket_job/category/input.rb CHANGED Viewed

@@ -10,6 +10,7 @@ module RocketJob
       # Slice size for this input collection
       field :slice_size, type: Integer, default: 100
+      validates_presence_of :slice_size
       #
       # The fields below only apply if the field `format` has been set:
@@ -82,7 +83,7 @@ module RocketJob
       field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
       validates :header_cleanser, inclusion: %i[default none]
-      validates_presence_of :slice_size
+      validates_inclusion_of :serializer, in: %i[none compress encrypt]
       # Cleanses the header column names when `cleanse_header` is true
       def cleanse_header!
@@ -105,6 +106,65 @@ module RocketJob
           skip_unknown:     skip_unknown
         )
       end
+      def data_store(job)
+        RocketJob::Sliced::Input.new(
+          collection_name: build_collection_name(:input, job),
+          slice_class:     serializer_class,
+          slice_size:      slice_size
+        )
+      end
+      # Returns [IOStreams::Path] of file to upload.
+      # Auto-detects file format from file name when format is :auto.
+      def upload_path(stream = nil, original_file_name: nil)
+        unless stream || file_name
+          raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
+        end
+        path           = IOStreams.new(stream || file_name)
+        path.file_name = original_file_name if original_file_name
+        self.file_name = path.file_name
+        # Auto detect the format based on the upload file name if present.
+        if format == :auto
+          self.format = path.format || :csv
+          # Rebuild tabular with new values.
+          @tabular = nil
+        end
+        # Remove non-printable characters from tabular input formats.
+        if tabular?
+          # Cannot change the length of fixed width lines.
+          replace = format == :fixed ? " " : ""
+          path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
+        end
+        path
+      end
+      # Returns a lambda that extracts the header from the first uploaded line or row,
+      # stores the cleansed header columns in `columns`, and then calls the chained
+      # `on_first` callback when one was supplied.
+      #
+      # Returns `on_first` unchanged unless `tabular?` and `tabular.header?` are both true.
+      # Returns nil when `mode` is neither :line nor :array.
+      def extract_header_callback(on_first)
+        return on_first unless tabular? && tabular.header?
+        case mode
+        when :line
+          lambda do |line|
+            tabular.parse_header(line)
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        when :array
+          lambda do |row|
+            tabular.header.columns = row
+            cleanse_header!
+            # Read the header from this category's own tabular, as the :line branch does.
+            self.columns = tabular.header.columns
+            # Call chained on_first if present, passing the row this lambda received.
+            on_first&.call(row)
+          end
+        end
+      end
     end
   end
 end