RubyGems - sidekiq-iteration - Versions diffs - 0.1.0 - Mend

sidekiq-iteration 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +5 -0
data/LICENSE.txt +21 -0
data/README.md +265 -0
data/guides/best-practices.md +71 -0
data/guides/custom-enumerator.md +98 -0
data/guides/iteration-how-it-works.md +71 -0
data/guides/throttling.md +42 -0
data/lib/sidekiq-iteration.rb +3 -0
data/lib/sidekiq_iteration/active_record_batch_enumerator.rb +127 -0
data/lib/sidekiq_iteration/active_record_cursor.rb +89 -0
data/lib/sidekiq_iteration/active_record_enumerator.rb +69 -0
data/lib/sidekiq_iteration/csv_enumerator.rb +85 -0
data/lib/sidekiq_iteration/enumerators.rb +187 -0
data/lib/sidekiq_iteration/iteration.rb +267 -0
data/lib/sidekiq_iteration/job_retry_patch.rb +30 -0
data/lib/sidekiq_iteration/nested_enumerator.rb +39 -0
data/lib/sidekiq_iteration/throttling.rb +45 -0
data/lib/sidekiq_iteration/version.rb +5 -0
data/lib/sidekiq_iteration.rb +40 -0
metadata +80 -0

data/lib/sidekiq_iteration/active_record_batch_enumerator.rb ADDED Viewed

@@ -0,0 +1,127 @@
+# frozen_string_literal: true
+module SidekiqIteration
+  # Batch Enumerator based on ActiveRecord Relation.
+  # @private
+  class ActiveRecordBatchEnumerator
+    include Enumerable
+    SQL_DATETIME_WITH_NSEC = "%Y-%m-%d %H:%M:%S.%N"
+    def initialize(relation, columns: nil, batch_size: 100, cursor: nil)
+      @primary_key = "#{relation.table_name}.#{relation.primary_key}"
+      @columns = Array(columns&.map(&:to_s) || @primary_key)
+      @primary_key_index = @columns.index(@primary_key) || @columns.index(relation.primary_key)
+      @pluck_columns = if @primary_key_index
+                         @columns
+                       else
+                         @columns + [@primary_key]
+                       end
+      @batch_size = batch_size
+      @cursor = Array.wrap(cursor)
+      @initial_cursor = @cursor
+      raise ArgumentError, "Must specify at least one column" if @columns.empty?
+      if relation.joins_values.present? && !@columns.all?(/\./)
+        raise ArgumentError, "You need to specify fully-qualified columns if you join a table"
+      end
+      if relation.arel.orders.present? || relation.arel.taken.present?
+        raise ArgumentError,
+          "The relation cannot use ORDER BY or LIMIT due to the way how iteration with a cursor is designed. " \
+          "You can use other ways to limit the number of rows, e.g. a WHERE condition on the primary key column."
+      end
+      @base_relation = relation.reorder(@columns.join(", "))
+    end
+    def each
+      return to_enum { size } unless block_given?
+      while (relation = next_batch)
+        yield relation, cursor_value
+      end
+    end
+    def size
+      (@base_relation.count(:all) + @batch_size - 1) / @batch_size # ceiling division
+    end
+    private
+      def next_batch
+        relation = @base_relation.limit(@batch_size)
+        if conditions.any?
+          relation = relation.where(*conditions)
+        end
+        cursor_values, ids = relation.uncached do
+          pluck_columns(relation)
+        end
+        cursor = cursor_values.last
+        unless cursor.present?
+          @cursor = @initial_cursor
+          return
+        end
+        # The primary key was plucked, but original cursor did not include it, so we should remove it
+        cursor.pop unless @primary_key_index
+        @cursor = Array.wrap(cursor)
+        # Yields relations by selecting the primary keys of records in the batch.
+        # Post.where(published: nil) results in an enumerator of relations like:
+        # Post.where(published: nil, ids: batch_of_ids)
+        @base_relation.where(@primary_key => ids)
+      end
+      def pluck_columns(relation)
+        if @pluck_columns.size == 1 # only the primary key
+          column_values = relation.pluck(*@pluck_columns)
+          return [column_values, column_values]
+        end
+        column_values = relation.pluck(*@pluck_columns)
+        primary_key_index = @primary_key_index || -1
+        primary_key_values = column_values.map { |values| values[primary_key_index] }
+        serialize_column_values!(column_values)
+        [column_values, primary_key_values]
+      end
+      def cursor_value
+        if @cursor.size == 1
+          @cursor.first
+        else
+          @cursor
+        end
+      end
+      def conditions
+        column_index = @cursor.size - 1
+        column = @columns[column_index]
+        where_clause = if @columns.size == @cursor.size
+                         "#{column} > ?"
+                       else
+                         "#{column} >= ?"
+                       end
+        while column_index > 0
+          column_index -= 1
+          column = @columns[column_index]
+          where_clause = "#{column} > ? OR (#{column} = ? AND (#{where_clause}))"
+        end
+        ret = @cursor.reduce([where_clause]) { |params, value| params << value << value }
+        ret.pop
+        ret
+      end
+      def serialize_column_values!(column_values)
+        column_values.map! { |values| values.map! { |value| column_value(value) } }
+      end
+      def column_value(value)
+        if value.is_a?(Time)
+          value.strftime(SQL_DATETIME_WITH_NSEC)
+        else
+          value
+        end
+      end
+  end
+end

data/lib/sidekiq_iteration/active_record_cursor.rb ADDED Viewed

@@ -0,0 +1,89 @@
+# frozen_string_literal: true
+module SidekiqIteration
+  # Cursor over an ActiveRecord relation ordered by one or more columns.
+  # Remembers the column values of the last loaded row and loads the rows that follow them.
+  # @private
+  class ActiveRecordCursor
+    include Comparable
+    # position: Array with the current value of each cursor column (empty before the first batch
+    # when no start position was given).
+    # reached_end: true once a batch came back with fewer rows than requested.
+    attr_reader :position, :reached_end
+    # @param relation the relation to iterate; must not carry ORDER BY or LIMIT
+    # @param columns column name(s) to order and paginate by; defaults to the table-qualified primary key
+    # @param position value(s) of +columns+ to resume from
+    # @raise [ArgumentError] if no column is given, if the position contains nil, if the relation
+    #   has joins and a column is not fully qualified, or if the relation uses ORDER BY or LIMIT
+    def initialize(relation, columns = nil, position = nil)
+      columns ||= "#{relation.table_name}.#{relation.primary_key}"
+      @columns = Array.wrap(columns)
+      raise ArgumentError, "Must specify at least one column" if @columns.empty?
+      self.position = Array.wrap(position)
+      # With joins, every column must contain a "." (i.e. be table-qualified).
+      if relation.joins_values.present? && !@columns.all?(/\./)
+        raise ArgumentError, "You need to specify fully-qualified columns if you join a table"
+      end
+      if relation.arel.orders.present? || relation.arel.taken.present?
+        raise ArgumentError,
+          "The relation cannot use ORDER BY or LIMIT due to the way how iteration with a cursor is designed. " \
+          "You can use other ways to limit the number of rows, e.g. a WHERE condition on the primary key column."
+      end
+      @base_relation = relation.reorder(@columns.join(", "))
+      @reached_end = false
+    end
+    # Orders cursors by position; when only one of the two has reached the end,
+    # that one sorts last.
+    def <=>(other)
+      if reached_end == other.reached_end
+        position <=> other.position
+      else
+        reached_end ? 1 : -1
+      end
+    end
+    # Sets the cursor position.
+    # @param position [Array] one value per cursor column
+    # @raise [ArgumentError] if any of the values is nil
+    def position=(position)
+      raise ArgumentError, "Cursor position cannot contain nil values" if position.any?(&:nil?)
+      @position = position
+    end
+    # Loads up to +batch_size+ records located after the current position and moves the
+    # position to the last loaded record.
+    #
+    # @return [Array, nil] the loaded records, or nil when there are none left
+    def next_batch(batch_size)
+      return if @reached_end
+      relation = @base_relation.limit(batch_size)
+      if (conditions = self.conditions).any?
+        relation = relation.where(*conditions)
+      end
+      records = relation.uncached do
+        relation.to_a
+      end
+      update_from_record(records.last) if records.any?
+      # A short batch means the data is exhausted, so the next call returns nil without querying.
+      @reached_end = records.size < batch_size
+      records if records.any?
+    end
+    private
+      # Builds the arguments for +where+ that select rows positioned after the current position:
+      # an SQL fragment followed by its bind values. Returns an empty Array when the position is empty.
+      def conditions
+        i = @position.size - 1
+        column = @columns[i]
+        # A position covering every column is exclusive (>); a partial position is inclusive (>=)
+        # on its last supplied column.
+        conditions = if @columns.size == @position.size
+                       "#{column} > ?"
+                     else
+                       "#{column} >= ?"
+                     end
+        # Wrap the clause from the innermost column outwards:
+        #   col > ? OR (col = ? AND (<clause for the following columns>))
+        while i > 0
+          i -= 1
+          column = @columns[i]
+          conditions = "#{column} > ? OR (#{column} = ? AND (#{conditions}))"
+        end
+        # Every value is bound twice, except the last one which is bound once; the final pop
+        # drops that surplus value (or the lone SQL string when the position is empty).
+        ret = @position.reduce([conditions]) { |params, value| params << value << value }
+        ret.pop
+        ret
+      end
+      # Moves the position to the given record by calling, for each cursor column, the record
+      # method named after the part of the column name that follows the last ".".
+      def update_from_record(record)
+        self.position = @columns.map do |column|
+          method = column.to_s.split(".").last
+          record.send(method)
+        end
+      end
+  end
+end

data/lib/sidekiq_iteration/active_record_enumerator.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+require_relative "active_record_cursor"
+module SidekiqIteration
+  # Builds Enumerator based on ActiveRecord Relation. Supports enumerating on rows and batches.
+  # @private
+  class ActiveRecordEnumerator
+    SQL_DATETIME_WITH_NSEC = "%Y-%m-%d %H:%M:%S.%N"
+    def initialize(relation, columns: nil, batch_size: 100, cursor: nil)
+      unless relation.is_a?(ActiveRecord::Relation)
+        raise ArgumentError, "relation must be an ActiveRecord::Relation"
+      end
+      @relation = relation
+      @batch_size = batch_size
+      @columns = Array(columns || "#{relation.table_name}.#{relation.primary_key}")
+      @cursor = cursor
+    end
+    def records
+      Enumerator.new(-> { size }) do |yielder|
+        batches.each do |batch, _|
+          batch.each do |record|
+            yielder.yield(record, cursor_value(record))
+          end
+        end
+      end
+    end
+    def batches
+      cursor = ActiveRecordCursor.new(@relation, @columns, @cursor)
+      Enumerator.new(-> { size }) do |yielder|
+        while (records = cursor.next_batch(@batch_size))
+          yielder.yield(records, cursor_value(records.last)) if records.any?
+        end
+      end
+    end
+    def size
+      @relation.count(:all)
+    end
+    private
+      def cursor_value(record)
+        positions = @columns.map do |column|
+          attribute_name = column.to_s.split(".").last
+          column_value(record, attribute_name)
+        end
+        if positions.size == 1
+          positions.first
+        else
+          positions
+        end
+      end
+      def column_value(record, attribute)
+        value = record.read_attribute(attribute.to_sym)
+        case record.class.columns_hash.fetch(attribute).type
+        when :datetime
+          value.strftime(SQL_DATETIME_WITH_NSEC)
+        else
+          value
+        end
+      end
+  end
+end

data/lib/sidekiq_iteration/csv_enumerator.rb ADDED Viewed

@@ -0,0 +1,85 @@
+# frozen_string_literal: true
+module SidekiqIteration
+  # @private
+  # CsvEnumerator makes it possible to write an Iteration job
+  # that uses CSV file as a collection to iterate.
+  #
+  # @example Enumerator to iterate on rows
+  #   def build_enumerator(cursor:)
+  #     csv = CSV.open('tmp/files', { converters: :integer, headers: true })
+  #     csv_enumerator(csv, cursor: cursor)
+  #   end
+  #
+  #   def each_iteration(row)
+  #     ...
+  #   end
+  #
+  # @example Enumerator to iterate on batches of rows
+  #   def build_enumerator(cursor:)
+  #     csv = CSV.open('tmp/files', { converters: :integer, headers: true })
+  #     csv_batches_enumerator(csv, cursor: cursor)
+  #   end
+  #
+  #   def each_iteration(row)
+  #     ...
+  #   end
+  #
+  class CsvEnumerator
+    # Constructs CsvEnumerator instance based on a CSV file.
+    #
+    # @param [CSV] csv An instance of CSV object
+    # @return [SidekiqIteration::CsvEnumerator]
+    #
+    # @example
+    #   csv = CSV.open('tmp/files', { converters: :integer, headers: true })
+    #   SidekiqIteration::CsvEnumerator.new(csv).rows(cursor: cursor)
+    #
+    def initialize(csv)
+      unless csv.instance_of?(CSV)
+        raise ArgumentError, "CsvEnumerator.new takes CSV object"
+      end
+      @csv = csv
+    end
+    # Constructs a enumerator on CSV rows
+    # @return [Enumerator] Enumerator instance
+    #
+    def rows(cursor:)
+      @csv.lazy
+        .each_with_index
+        .drop(count_of_processed_rows(cursor))
+        .to_enum { count_of_rows_in_file }
+    end
+    # Constructs a enumerator on batches of CSV rows
+    # @return [Enumerator] Enumerator instance
+    #
+    def batches(cursor:, batch_size: 100)
+      @csv.lazy
+        .each_slice(batch_size)
+        .with_index
+        .drop(count_of_processed_rows(cursor))
+        .to_enum { (count_of_rows_in_file.to_f / batch_size).ceil }
+    end
+    private
+      def count_of_rows_in_file
+        filepath = @csv.path
+        return unless filepath
+        count = `wc -l < #{filepath}`.strip.to_i
+        count -= 1 if @csv.headers
+        count
+      end
+      def count_of_processed_rows(cursor)
+        if cursor
+          cursor + 1
+        else
+          0
+        end
+      end
+  end
+end

data/lib/sidekiq_iteration/enumerators.rb ADDED Viewed

@@ -0,0 +1,187 @@
+# frozen_string_literal: true
+require_relative "active_record_enumerator"
+require_relative "active_record_batch_enumerator"
+require_relative "csv_enumerator"
+require_relative "nested_enumerator"
+module SidekiqIteration
+  module Enumerators
+    # Builds Enumerator object from a given array, using +cursor+ as an offset.
+    #
+    # @param array [Array]
+    # @param cursor [Integer] offset to start iteration from
+    #
+    # @example
+    #   array_enumerator(['build', 'enumerator', 'from', 'any', 'array'], cursor: cursor)
+    #
+    def array_enumerator(array, cursor:)
+      raise ArgumentError, "array must be an Array" unless array.is_a?(Array)
+      if defined?(ActiveRecord) && array.any?(ActiveRecord::Base)
+        raise ArgumentError, "array cannot contain ActiveRecord objects"
+      end
+      drop = cursor ? cursor + 1 : 0
+      array.each_with_index.drop(drop).to_enum { array.size }
+    end
+    # Builds Enumerator from Active Record Relation. Each Enumerator tick moves the cursor one row forward.
+    #
+    # @param scope [ActiveRecord::Relation] scope to iterate
+    # @param cursor [Object] offset to start iteration from, usually an id
+    # @option options :columns [Array<String, Symbol>] used to build the actual query for iteration,
+    #   defaults to primary key
+    # @option options :batch_size [Integer] (100) size of the batch
+    #
+    # +columns:+ argument is used to build the actual query for iteration. +columns+: defaults to primary key:
+    #
+    #   1) SELECT * FROM users ORDER BY id LIMIT 100
+    #
+    # When iteration is resumed, +cursor:+ and +columns:+ values will be used to continue from the point
+    # where iteration stopped:
+    #
+    #   2) SELECT * FROM users WHERE id > $CURSOR ORDER BY id LIMIT 100
+    #
+    # +columns:+ can also take more than one column. In that case, +cursor+ will contain serialized values
+    # of all columns at the point where iteration stopped.
+    #
+    # Consider this example with +columns: [:created_at, :id]+. Here's the query will use on the first iteration:
+    #
+    #   1) SELECT * FROM "products" ORDER BY created_at, id LIMIT 100
+    #
+    # And the query on the next iteration:
+    #
+    #   2) SELECT * FROM "products"
+    #        WHERE (created_at > '$LAST_CREATED_AT_CURSOR'
+    #          OR (created_at = '$LAST_CREATED_AT_CURSOR' AND (id > '$LAST_ID_CURSOR')))
+    #        ORDER BY created_at, id LIMIT 100
+    #
+    # As a result of this query pattern, if the values in these columns change for the records in scope during
+    # iteration, they may be skipped or yielded multiple times depending on the nature of the update and the
+    # cursor's value. If the value gets updated to a greater value than the cursor's value, it will get yielded
+    # again. Similarly, if the value gets updated to a lesser value than the curor's value, it will get skipped.
+    #
+    # @example
+    #   def build_enumerator(cursor:)
+    #     active_record_records_enumerator(User.all, cursor: cursor)
+    #   end
+    #
+    #   def each_iteration(user)
+    #     user.notify_about_something
+    #   end
+    #
+    def active_record_records_enumerator(scope, cursor:, **options)
+      ActiveRecordEnumerator.new(scope, cursor: cursor, **options).records
+    end
+    # Builds Enumerator from Active Record Relation and enumerates on batches of records.
+    # Each Enumerator tick moves the cursor +batch_size+ rows forward.
+    # @see #active_record_records_enumerator
+    #
+    # @example
+    #   def build_enumerator(product_id, cursor:)
+    #     active_record_batches_enumerator(
+    #       Comment.where(product_id: product_id).select(:id),
+    #       cursor: cursor,
+    #       batch_size: 100
+    #     )
+    #   end
+    #
+    #   def each_iteration(batch_of_comments, product_id)
+    #     comment_ids = batch_of_comments.map(&:id)
+    #     CommentService.call(comment_ids: comment_ids)
+    #   end
+    #
+    def active_record_batches_enumerator(scope, cursor:, **options)
+      ActiveRecordEnumerator.new(scope, cursor: cursor, **options).batches
+    end
+    # Builds Enumerator from Active Record Relation and enumerates on batches, yielding Active Record Relations.
+    # @see #active_record_records_enumerator
+    #
+    # @example
+    #   def build_enumerator(product_id, cursor:)
+    #     active_record_relations_enumerator(
+    #       Product.find(product_id).comments,
+    #       cursor: cursor,
+    #       batch_size: 100,
+    #     )
+    #   end
+    #
+    #   def each_iteration(batch_of_comments, product_id)
+    #     # batch_of_comments will be a Comment::ActiveRecord_Relation
+    #     batch_of_comments.update_all(deleted: true)
+    #   end
+    #
+    def active_record_relations_enumerator(scope, cursor:, **options)
+      ActiveRecordBatchEnumerator.new(scope, cursor: cursor, **options).each
+    end
+    # Builds Enumerator from a CSV file.
+    #
+    # @param csv [CSV] an instance of CSV object
+    # @param cursor [Integer] offset to start iteration from
+    #
+    # @example
+    #   def build_enumerator(import_id, cursor:)
+    #     import = Import.find(import_id)
+    #     csv_enumerator(import.csv, cursor: cursor)
+    #   end
+    #
+    #   def each_iteration(csv_row)
+    #     # insert csv_row to database
+    #   end
+    #
+    def csv_enumerator(csv, cursor:)
+      CsvEnumerator.new(csv).rows(cursor: cursor)
+    end
+    # Builds Enumerator from a CSV file and enumerates on batches of records.
+    #
+    # @param csv [CSV] an instance of CSV object
+    # @param cursor [Integer] offset to start iteration from
+    # @option options :batch_size [Integer] (100) size of the batch
+    #
+    # @example
+    #   def build_enumerator(import_id, cursor:)
+    #     import = Import.find(import_id)
+    #     csv_batches_enumerator(import.csv, cursor: cursor)
+    #   end
+    #
+    #   def each_iteration(batch_of_csv_rows)
+    #     # ...
+    #   end
+    #
+    def csv_batches_enumerator(csv, cursor:, **options)
+      CsvEnumerator.new(csv).batches(cursor: cursor, **options)
+    end
+    # Builds Enumerator for nested iteration.
+    #
+    # @param enums [Array<Proc>] an Array of Procs, each should return an Enumerator.
+    #   Each proc from enums should accept the yielded items from the parent enumerators and the `cursor` as its arguments.
+    #   Each proc's `cursor` argument is its part from the `build_enumerator`'s `cursor` array.
+    # @param cursor [Array<Object>] array of offsets for each of the enums to start iteration from
+    #
+    # @example
+    #   def build_enumerator(cursor:)
+    #     nested_enumerator(
+    #       [
+    #         ->(cursor) { active_record_records_enumerator(Shop.all, cursor: cursor) },
+    #         ->(shop, cursor) { active_record_records_enumerator(shop.products, cursor: cursor) },
+    #         ->(_shop, product, cursor) { active_record_relations_enumerator(product.product_variants, cursor: cursor) }
+    #       ],
+    #       cursor: cursor
+    #     )
+    #   end
+    #
+    #   def each_iteration(product_variants_relation)
+    #     # do something
+    #   end
+    #
+    def nested_enumerator(enums, cursor:)
+      NestedEnumerator.new(enums, cursor: cursor).each
+    end
+  end
+end