karafka 2.1.5.beta1 → 2.1.6
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/CHANGELOG.md +20 -1
- data/Gemfile.lock +9 -9
- data/karafka.gemspec +2 -2
- data/lib/karafka/admin.rb +34 -3
- data/lib/karafka/base_consumer.rb +16 -3
- data/lib/karafka/connection/client.rb +110 -88
- data/lib/karafka/errors.rb +4 -1
- data/lib/karafka/messages/seek.rb +3 -0
- data/lib/karafka/pro/iterator/expander.rb +95 -0
- data/lib/karafka/pro/iterator/tpl_builder.rb +145 -0
- data/lib/karafka/pro/iterator.rb +2 -87
- data/lib/karafka/pro/processing/filters_applier.rb +1 -0
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +3 -1
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +3 -1
- data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +3 -1
- data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +3 -1
- data/lib/karafka/pro/processing/strategies/aj/ftr_lrj_mom_vp.rb +3 -1
- data/lib/karafka/pro/processing/strategies/aj/lrj_mom_vp.rb +4 -1
- data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +2 -2
- data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +2 -2
- data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +2 -1
- data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +3 -1
- data/lib/karafka/pro/processing/strategies/ftr/default.rb +8 -1
- data/lib/karafka/pro/processing/strategies/lrj/default.rb +1 -1
- data/lib/karafka/pro/processing/strategies/lrj/ftr.rb +2 -2
- data/lib/karafka/pro/processing/strategies/lrj/ftr_mom.rb +2 -2
- data/lib/karafka/pro/processing/strategies/lrj/mom.rb +3 -1
- data/lib/karafka/pro/processing/virtual_offset_manager.rb +1 -1
- data/lib/karafka/processing/coordinator.rb +14 -0
- data/lib/karafka/processing/strategies/default.rb +12 -14
- data/lib/karafka/railtie.rb +2 -2
- data/lib/karafka/version.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +10 -8
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a6994a6d579728a877f84c87086d093aae8a1f830b891fcb4904883085432fe4
+  data.tar.gz: 13b21009a471194a72971ca81ddc718e044bb96587db0e8f186974f554e9ec62
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e4711880bde1d2cd1cb34959f740459979b74ff4d28a671a232f88adbe7473cf67e366fc2b492fac761c572f3a6dfc147a59d46fc08e1c5e18df8ac5f108afdd
+  data.tar.gz: c094600c2bd421ce309c0125d60ea82ed0106d5ce4566b3bb8c1aab13c553e7bd2f6651b98029e42ac831b132563b2c502dd1c76defbf8307cd9bd2393b258f7
checksums.yaml.gz.sig
CHANGED
Binary file
data/CHANGELOG.md
CHANGED
@@ -1,11 +1,30 @@
 # Karafka framework changelog

-## 2.1.
+## 2.1.6 (2023-06-29)
+- [Improvement] Provide time support for iterator
+- [Improvement] Provide time support for admin `#read_topic`
+- [Improvement] Provide time support for consumer `#seek`.
+- [Improvement] Remove no longer needed locks for client operations.
+- [Improvement] Raise `Karafka::Errors::TopicNotFoundError` when trying to iterate over non-existing topic.
+- [Improvement] Ensure that Kafka multi-command operations run under mutex together.
+- [Change] Require `waterdrop` `>= 2.6.2`
+- [Change] Require `karafka-core` `>= 2.1.1`
+- [Refactor] Clean-up iterator code.
+- [Fix] Improve performance in dev environment for a Rails app (juike)
+- [Fix] Rename `InvalidRealOffsetUsage` to `InvalidRealOffsetUsageError` to align with naming of other errors.
+- [Fix] Fix unstable spec.
+- [Fix] Fix a case where automatic `#seek` would overwrite manual seek of a user when running LRJ.
+- [Fix] Make sure, that user direct `#seek` and `#pause` operations take precedence over system actions.
+- [Fix] Make sure, that `#pause` and `#resume` with one underlying connection do not race-condition.
+
+## 2.1.5 (2023-06-19)
 - [Improvement] Drastically improve `#revoked?` response quality by checking the real time assignment lost state on librdkafka.
 - [Improvement] Improve eviction of saturated jobs that would run on already revoked assignments.
 - [Improvement] Expose `#commit_offsets` and `#commit_offsets!` methods in the consumer to provide ability to commit offsets directly to Kafka without having to mark new messages as consumed.
 - [Improvement] No longer skip offset commit when no messages marked as consumed as `librdkafka` has fixed the crashes there.
 - [Improvement] Remove no longer needed patches.
+- [Improvement] Ensure, that the coordinator revocation status is switched upon revocation detection when using `#revoked?`
+- [Improvement] Add benchmarks for marking as consumed (sync and async).
 - [Change] Require `karafka-core` `>= 2.1.0`
 - [Change] Require `waterdrop` `>= 2.6.1`

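The headline 2.1.6 additions are time-based offsets for the iterator, for `Karafka::Admin#read_topic` and for the consumer `#seek`. A minimal usage sketch, assuming a configured Karafka app, a Pro license for the iterator, and a placeholder `'events'` topic:

```ruby
# Iterate over every partition of 'events', starting from messages produced
# roughly one hour ago (the Time is resolved to real offsets per partition)
Karafka::Pro::Iterator.new({ 'events' => Time.now - 3600 }).each do |message|
  puts "#{message.partition}:#{message.offset} => #{message.raw_payload}"
end

# Read up to 10 messages from partition 0, starting at a point in time
messages = Karafka::Admin.read_topic('events', 0, 10, Time.now - 3600)
messages.each { |message| puts message.raw_payload }
```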
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    karafka (2.1.
-      karafka-core (>= 2.1.
+    karafka (2.1.6)
+      karafka-core (>= 2.1.1, < 2.2.0)
       thor (>= 0.20)
-      waterdrop (>= 2.6.
+      waterdrop (>= 2.6.2, < 3.0.0)
       zeitwerk (~> 2.3)

 GEM
@@ -30,14 +30,14 @@ GEM
       activesupport (>= 5.0)
     i18n (1.14.1)
       concurrent-ruby (~> 1.0)
-    karafka-core (2.1.
+    karafka-core (2.1.1)
       concurrent-ruby (>= 1.1)
-      karafka-rdkafka (>= 0.13.
-    karafka-rdkafka (0.13.
+      karafka-rdkafka (>= 0.13.1, < 0.14.0)
+    karafka-rdkafka (0.13.1)
       ffi (~> 1.15)
       mini_portile2 (~> 2.6)
       rake (> 12)
-    karafka-web (0.6.
+    karafka-web (0.6.1)
       erubi (~> 1.4)
       karafka (>= 2.1.4, < 3.0.0)
       karafka-core (>= 2.0.13, < 3.0.0)
@@ -72,8 +72,8 @@ GEM
     tilt (2.2.0)
     tzinfo (2.0.6)
       concurrent-ruby (~> 1.0)
-    waterdrop (2.6.
-      karafka-core (>= 2.1.0
+    waterdrop (2.6.2)
+      karafka-core (>= 2.1.0, < 3.0.0)
       zeitwerk (~> 2.3)
     zeitwerk (2.6.8)

data/karafka.gemspec
CHANGED
@@ -21,9 +21,9 @@ Gem::Specification.new do |spec|
     without having to focus on things that are not your business domain.
   DESC

-  spec.add_dependency 'karafka-core', '>= 2.1.
+  spec.add_dependency 'karafka-core', '>= 2.1.1', '< 2.2.0'
   spec.add_dependency 'thor', '>= 0.20'
-  spec.add_dependency 'waterdrop', '>= 2.6.
+  spec.add_dependency 'waterdrop', '>= 2.6.2', '< 3.0.0'
   spec.add_dependency 'zeitwerk', '~> 2.3'

   if $PROGRAM_NAME.end_with?('gem')
data/lib/karafka/admin.rb
CHANGED
@@ -18,6 +18,9 @@ module Karafka
     # retry after checking that the operation was finished or failed using external factor.
     MAX_WAIT_TIMEOUT = 1

+    # Max time for a TPL request. We increase it to compensate for remote clusters latency
+    TPL_REQUEST_TIMEOUT = 2_000
+
     # How many times should be try. 1 x 60 => 60 seconds wait in total
     MAX_ATTEMPTS = 60

@@ -34,7 +37,8 @@ module Karafka
       'enable.auto.commit': false
     }.freeze

-    private_constant :Topic, :CONFIG_DEFAULTS, :MAX_WAIT_TIMEOUT, :
+    private_constant :Topic, :CONFIG_DEFAULTS, :MAX_WAIT_TIMEOUT, :TPL_REQUEST_TIMEOUT,
+                     :MAX_ATTEMPTS

     class << self
       # Allows us to read messages from the topic
@@ -42,8 +46,9 @@ module Karafka
       # @param name [String, Symbol] topic name
       # @param partition [Integer] partition
       # @param count [Integer] how many messages we want to get at most
-      # @param start_offset [Integer] offset from which we should start. If -1 is provided
-      #   (default) we will start from the latest offset
+      # @param start_offset [Integer, Time] offset from which we should start. If -1 is provided
+      #   (default) we will start from the latest offset. If time is provided, the appropriate
+      #   offset will be resolved.
       # @param settings [Hash] kafka extra settings (optional)
       #
       # @return [Array<Karafka::Messages::Message>] array with messages
@@ -53,6 +58,9 @@ module Karafka
        low_offset, high_offset = nil

        with_consumer(settings) do |consumer|
+          # Convert the time offset (if needed)
+          start_offset = resolve_offset(consumer, name.to_s, partition, start_offset)
+
          low_offset, high_offset = consumer.query_watermark_offsets(name, partition)

          # Select offset dynamically if -1 or less
@@ -243,6 +251,29 @@ module Karafka

        ::Rdkafka::Config.new(config_hash)
      end
+
+      # Resolves the offset if offset is in a time format. Otherwise returns the offset without
+      #   resolving.
+      # @param consumer [::Rdkafka::Consumer]
+      # @param name [String, Symbol] expected topic name
+      # @param partition [Integer]
+      # @param offset [Integer, Time]
+      # @return [Integer] expected offset
+      def resolve_offset(consumer, name, partition, offset)
+        if offset.is_a?(Time)
+          tpl = ::Rdkafka::Consumer::TopicPartitionList.new
+          tpl.add_topic_and_partitions_with_offsets(
+            name, partition => offset
+          )
+
+          real_offsets = consumer.offsets_for_times(tpl, TPL_REQUEST_TIMEOUT)
+          detected_offset = real_offsets.to_h.dig(name, partition)
+
+          detected_offset&.offset || raise(Errors::InvalidTimeBasedOffsetError)
+        else
+          offset
+        end
+      end
     end
   end
 end
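`resolve_offset` is used internally by `#read_topic`, so the different `start_offset` forms documented above can be exercised like this (the `'events'` topic is a placeholder):

```ruby
# Integer offsets are passed through unchanged
Karafka::Admin.read_topic('events', 0, 5, 128)

# -1 (the default) starts from the latest offsets
Karafka::Admin.read_topic('events', 0, 5)

# A Time is resolved via rdkafka's offsets_for_times to the first offset with an
# equal or greater timestamp; if nothing can be resolved,
# Karafka::Errors::InvalidTimeBasedOffsetError is raised
Karafka::Admin.read_topic('events', 0, 5, Time.now - 300)
```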
data/lib/karafka/base_consumer.rb
CHANGED
@@ -70,6 +70,7 @@ module Karafka
     #
     # @return [Boolean] true if there was no exception, otherwise false.
     #
+    # @private
     # @note We keep the seek offset tracking, and use it to compensate for async offset flushing
     #   that may not yet kick in when error occurs. That way we pause always on the last processed
     #   message.
@@ -203,8 +204,15 @@ module Karafka

     # Seeks in the context of current topic and partition
     #
-    # @param offset [Integer] offset where we want to seek
-
+    # @param offset [Integer, Time] offset where we want to seek or time of the offset where we
+    #   want to seek.
+    # @param manual_seek [Boolean] Flag to differentiate between user seek and system/strategy
+    #   based seek. User seek operations should take precedence over system actions, hence we need
+    #   to know who invoked it.
+    # @note Please note, that if you are seeking to a time offset, getting the offset is blocking
+    def seek(offset, manual_seek = true)
+      coordinator.manual_seek if manual_seek
+
       client.seek(
         Karafka::Messages::Seek.new(
           topic.name,
@@ -221,7 +229,12 @@ module Karafka
     # even before we poll but it gets reset when polling happens, hence we also need to switch
     # the coordinator state after the revocation (but prior to running more jobs)
     def revoked?
-
+      return true if coordinator.revoked?
+      return false unless client.assignment_lost?
+
+      coordinator.revoke
+
+      true
     end

     # @return [Boolean] are we retrying processing after an error. This can be used to provide a
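From a consumer's perspective the updated `#seek` accepts either an integer offset or a `Time`, and counts as a manual seek by default, so it takes precedence over automatic strategy seeks (LRJ and friends). An illustrative consumer, assuming it is wired into a routed topic:

```ruby
class EventsConsumer < Karafka::BaseConsumer
  def consume
    messages.each { |message| puts message.raw_payload }

    # Rewind this topic partition to the first message at or after one minute ago.
    # Resolving a Time to an offset is a blocking call against the cluster and the
    # seek is registered as manual, so automatic seeks will not overwrite it.
    seek(Time.now - 60)
  end
end
```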
data/lib/karafka/connection/client.rb
CHANGED
@@ -20,11 +20,14 @@ module Karafka
       # How many times should we retry polling in case of a failure
       MAX_POLL_RETRIES = 20

+      # Max time for a TPL request. We increase it to compensate for remote clusters latency
+      TPL_REQUEST_TIMEOUT = 2_000
+
       # We want to make sure we never close several clients in the same moment to prevent
       # potential race conditions and other issues
       SHUTDOWN_MUTEX = Mutex.new

-      private_constant :MAX_POLL_RETRIES, :SHUTDOWN_MUTEX
+      private_constant :MAX_POLL_RETRIES, :SHUTDOWN_MUTEX, :TPL_REQUEST_TIMEOUT

       # Creates a new consumer instance.
       #
@@ -35,12 +38,16 @@ module Karafka
         @id = SecureRandom.hex(6)
         # Name is set when we build consumer
         @name = ''
-        @mutex = Mutex.new
         @closed = false
         @subscription_group = subscription_group
         @buffer = RawMessagesBuffer.new
         @rebalance_manager = RebalanceManager.new
         @kafka = build_consumer
+        # There are few operations that can happen in parallel from the listener threads as well
+        # as from the workers. They are not fully thread-safe because they may be composed out of
+        # few calls to Kafka or out of few internal state changes. That is why we mutex them.
+        # It mostly revolves around pausing and resuming.
+        @mutex = Mutex.new
         # We need to keep track of what we have paused for resuming
         # In case we loose partition, we still need to resume it, otherwise it won't be fetched
         # again if we get reassigned to it later on. We need to keep them as after revocation we
@@ -101,16 +108,12 @@ module Karafka
       #
       # @param message [Karafka::Messages::Message]
       def store_offset(message)
-
-          internal_store_offset(message)
-        end
+        internal_store_offset(message)
       end

       # @return [Boolean] true if our current assignment has been lost involuntarily.
       def assignment_lost?
-        @
-          @kafka.assignment_lost?
-        end
+        @kafka.assignment_lost?
       end

       # Commits the offset on a current consumer in a non-blocking or blocking way.
@@ -122,12 +125,12 @@ module Karafka
       # @note This will commit all the offsets for the whole consumer. In order to achieve
       #   granular control over where the offset should be for particular topic partitions, the
       #   store_offset should be used to only store new offset when we want them to be flushed
+      #
+      # @note This method for async may return `true` despite involuntary partition revocation as
+      #   it does **not** resolve to `lost_assignment?`. It returns only the commit state operation
+      #   result.
       def commit_offsets(async: true)
-        @mutex.lock
-
         internal_commit_offsets(async: async)
-      ensure
-        @mutex.unlock
       end

       # Commits offset in a synchronous way.
@@ -140,13 +143,11 @@ module Karafka
       # Seek to a particular message. The next poll on the topic/partition will return the
       # message at the given offset.
       #
-      # @param message [Messages::Message, Messages::Seek] message to which we want to seek to
+      # @param message [Messages::Message, Messages::Seek] message to which we want to seek to.
+      #   It can have the time based offset.
+      # @note Please note, that if you are seeking to a time offset, getting the offset is blocking
       def seek(message)
-        @mutex.
-
-        @kafka.seek(message)
-      ensure
-        @mutex.unlock
+        @mutex.synchronize { internal_seek(message) }
       end

       # Pauses given partition and moves back to last successful offset processed.
@@ -157,37 +158,34 @@ module Karafka
       #   be reprocessed after getting back to processing)
       # @note This will pause indefinitely and requires manual `#resume`
       def pause(topic, partition, offset)
-        @mutex.
-
-
-        return if @closed
-
-        pause_msg = Messages::Seek.new(topic, partition, offset)
+        @mutex.synchronize do
+          # Do not pause if the client got closed, would not change anything
+          return if @closed

-
+          pause_msg = Messages::Seek.new(topic, partition, offset)

-
-        # not own anymore.
-        tpl = topic_partition_list(topic, partition)
+          internal_commit_offsets(async: true)

-
+          # Here we do not use our cached tpls because we should not try to pause something we do
+          # not own anymore.
+          tpl = topic_partition_list(topic, partition)

-
-          'client.pause',
-          caller: self,
-          subscription_group: @subscription_group,
-          topic: topic,
-          partition: partition,
-          offset: offset
-        )
+          return unless tpl

-
+          Karafka.monitor.instrument(
+            'client.pause',
+            caller: self,
+            subscription_group: @subscription_group,
+            topic: topic,
+            partition: partition,
+            offset: offset
+          )

-
+          @paused_tpls[topic][partition] = tpl

-
-
-
+          @kafka.pause(tpl)
+          internal_seek(pause_msg)
+        end
       end

       # Resumes processing of a give topic partition after it was paused.
@@ -195,33 +193,31 @@ module Karafka
       # @param topic [String] topic name
       # @param partition [Integer] partition
       def resume(topic, partition)
-        @mutex.
-
-        return if @closed
+        @mutex.synchronize do
+          return if @closed

-
-
+          # We now commit offsets on rebalances, thus we can do it async just to make sure
+          internal_commit_offsets(async: true)

-
-
+          # If we were not able, let's try to reuse the one we have (if we have)
+          tpl = topic_partition_list(topic, partition) || @paused_tpls[topic][partition]

-
+          return unless tpl

-
-
-
+          # If we did not have it, it means we never paused this partition, thus no resume should
+          # happen in the first place
+          return unless @paused_tpls[topic].delete(partition)

-
-
-
-
-
-
-
+          Karafka.monitor.instrument(
+            'client.resume',
+            caller: self,
+            subscription_group: @subscription_group,
+            topic: topic,
+            partition: partition
+          )

-
-
-        @mutex.unlock
+          @kafka.resume(tpl)
+        end
       end

       # Gracefully stops topic consumption.
@@ -238,9 +234,10 @@ module Karafka
       # @param [Karafka::Messages::Message] message that we want to mark as processed
       # @return [Boolean] true if successful. False if we no longer own given partition
       # @note This method won't trigger automatic offsets commits, rather relying on the offset
-      #   check-pointing trigger that happens with each batch processed
+      #   check-pointing trigger that happens with each batch processed. It will however check the
+      #   `librdkafka` assignment ownership to increase accuracy for involuntary revocations.
       def mark_as_consumed(message)
-        store_offset(message)
+        store_offset(message) && !assignment_lost?
       end

       # Marks a given message as consumed and commits the offsets in a blocking way.
@@ -257,11 +254,9 @@ module Karafka
       def reset
         close

-        @
-
-
-          @kafka = build_consumer
-        end
+        @closed = false
+        @paused_tpls.clear
+        @kafka = build_consumer
       end

       # Runs a single poll ignoring all the potential errors
@@ -318,28 +313,55 @@ module Karafka
         raise e
       end

+      # Non-mutexed seek that should be used only internally. Outside we expose `#seek` that is
+      # wrapped with a mutex.
+      #
+      # @param message [Messages::Message, Messages::Seek] message to which we want to seek to.
+      #   It can have the time based offset.
+      def internal_seek(message)
+        # If the seek message offset is in a time format, we need to find the closest "real"
+        # offset matching before we seek
+        if message.offset.is_a?(Time)
+          tpl = ::Rdkafka::Consumer::TopicPartitionList.new
+          tpl.add_topic_and_partitions_with_offsets(
+            message.topic,
+            message.partition => message.offset
+          )
+
+          # Now we can overwrite the seek message offset with our resolved offset and we can
+          # then seek to the appropriate message
+          # We set the timeout to 2_000 to make sure that remote clusters handle this well
+          real_offsets = @kafka.offsets_for_times(tpl, TPL_REQUEST_TIMEOUT)
+          detected_partition = real_offsets.to_h.dig(message.topic, message.partition)
+
+          # There always needs to be an offset. In case we seek into the future, where there
+          # are no offsets yet, we get -1 which indicates the most recent offset
+          # We should always detect offset, whether it is 0, -1 or a corresponding
+          message.offset = detected_partition&.offset || raise(Errors::InvalidTimeBasedOffsetError)
+        end
+
+        @kafka.seek(message)
+      end
+
       # Commits the stored offsets in a sync way and closes the consumer.
       def close
         # Allow only one client to be closed at the same time
         SHUTDOWN_MUTEX.synchronize do
-          #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          # @note We do not clear rebalance manager here as we may still have revocation info
-          #   here that we want to consider valid prior to running another reconnection
-          end
+          # Once client is closed, we should not close it again
+          # This could only happen in case of a race-condition when forceful shutdown happens
+          # and triggers this from a different thread
+          return if @closed
+
+          @closed = true
+
+          # Remove callbacks runners that were registered
+          ::Karafka::Core::Instrumentation.statistics_callbacks.delete(@subscription_group.id)
+          ::Karafka::Core::Instrumentation.error_callbacks.delete(@subscription_group.id)
+
+          @kafka.close
+          @buffer.clear
+          # @note We do not clear rebalance manager here as we may still have revocation info
+          # here that we want to consider valid prior to running another reconnection
        end
      end

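The time-to-offset resolution that `internal_seek` (and `Admin#resolve_offset`) perform boils down to a single `offsets_for_times` call. A standalone sketch against a raw rdkafka consumer, assuming a local broker and a placeholder `'events'` topic:

```ruby
require 'rdkafka'

config = Rdkafka::Config.new({ 'bootstrap.servers': 'localhost:9092', 'group.id': 'example' })
consumer = config.consumer

# Ask for the first offset at or after the given time in partition 0
tpl = Rdkafka::Consumer::TopicPartitionList.new
tpl.add_topic_and_partitions_with_offsets('events', 0 => Time.now - 3600)

# 2_000 ms timeout, mirroring the TPL_REQUEST_TIMEOUT used above
resolved = consumer.offsets_for_times(tpl, 2_000)
offset = resolved.to_h.dig('events', 0)&.offset

# -1 indicates there are no messages at or after that time yet (a seek into the future)
puts offset
consumer.close
```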
data/lib/karafka/errors.rb
CHANGED
@@ -48,6 +48,9 @@ module Karafka
     StrategyNotFoundError = Class.new(BaseError)

     # This should never happen. Please open an issue if it does.
-
+    InvalidRealOffsetUsageError = Class.new(BaseError)
+
+    # This should never happen. Please open an issue if it does.
+    InvalidTimeBasedOffsetError = Class.new(BaseError)
   end
 end
data/lib/karafka/messages/seek.rb
CHANGED
@@ -4,6 +4,9 @@ module Karafka
   module Messages
     # "Fake" message that we use as an abstraction layer when seeking back.
     # This allows us to encapsulate a seek with a simple abstraction
+    #
+    # @note `#offset` can be either the offset value or the time of the offset
+    #   (first equal or greater)
     Seek = Struct.new(:topic, :partition, :offset)
   end
 end
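Since `Seek` stays a plain `Struct`, both offset forms travel through the same abstraction, for example:

```ruby
Karafka::Messages::Seek.new('events', 0, 42)            # a concrete offset
Karafka::Messages::Seek.new('events', 0, Time.now - 60) # resolved to the first equal-or-greater offset
```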
data/lib/karafka/pro/iterator/expander.rb
ADDED
@@ -0,0 +1,95 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    class Iterator
+      # There are various ways you can provide topics information for iterating.
+      #
+      # This mapper normalizes this data, resolves offsets and maps the time based offsets into
+      # appropriate once
+      #
+      # Following formats are accepted:
+      #
+      # - 'topic1' - just a string with one topic name
+      # - ['topic1', 'topic2'] - just the names
+      # - { 'topic1' => -100 } - names with negative lookup offset
+      # - { 'topic1' => { 0 => 5 } } - names with exact partitions offsets
+      # - { 'topic1' => { 0 => -5 }, 'topic2' => { 1 => 5 } } - with per partition negative offsets
+      # - { 'topic1' => 100 } - means we run all partitions from the offset 100
+      # - { 'topic1' => Time.now - 60 } - we run all partitions from the message from 60s ago
+      # - { 'topic1' => { 1 => Time.now - 60 } } - partition1 from message 60s ago
+      #
+      class Expander
+        # Expands topics to which we want to subscribe with partitions information in case this
+        # info is not provided.
+        #
+        # @param topics [Array, Hash, String] topics definitions
+        # @return [Hash] expanded and normalized requested topics and partitions data
+        def call(topics)
+          expanded = Hash.new { |h, k| h[k] = {} }
+
+          normalize_format(topics).map do |topic, details|
+            if details.is_a?(Hash)
+              details.each do |partition, offset|
+                expanded[topic][partition] = offset
+              end
+            else
+              partition_count(topic).times do |partition|
+                # If no offsets are provided, we just start from zero
+                expanded[topic][partition] = details || 0
+              end
+            end
+          end
+
+          expanded
+        end
+
+        private
+
+        # Input can be provided in multiple formats. Here we normalize it to one (hash).
+        #
+        # @param topics [Array, Hash, String] requested topics
+        # @return [Hash] normalized hash with topics data
+        def normalize_format(topics)
+          # Simplification for the single topic case
+          topics = [topics] if topics.is_a?(String)
+
+          # If we've got just array with topics, we need to convert that into a representation
+          # that we can expand with offsets
+          topics = topics.map { |name| [name, false] }.to_h if topics.is_a?(Array)
+          # We remap by creating new hash, just in case the hash came as the argument for this
+          # expanded. We do not want to modify user provided hash
+          topics.transform_keys(&:to_s)
+        end
+
+        # List of topics with their partition information for expansion
+        # We cache it so we do not have to run consecutive requests to obtain data about multiple
+        # topics
+        def topics
+          @topics ||= Admin.cluster_info.topics
+        end
+
+        # @param name [String] topic name
+        # @return [Integer] number of partitions of the topic we want to iterate over
+        def partition_count(name)
+          topics
+            .find { |topic| topic.fetch(:topic_name) == name }
+            .tap { |topic| topic || raise(Errors::TopicNotFoundError, name) }
+            .fetch(:partitions)
+            .count
+        end
+      end
+    end
+  end
+end
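A sketch of how the accepted formats expand, assuming an illustrative `'events'` topic with two partitions (partition counts for bare names are fetched from the cluster via `Admin.cluster_info`):

```ruby
expander = Karafka::Pro::Iterator::Expander.new

# Per-partition details pass through as provided (offsets or times)
expander.call({ 'events' => { 0 => 5, 1 => Time.now - 60 } })
# => { 'events' => { 0 => 5, 1 => <Time> } }

# A bare topic name expands to all partitions, starting from offset 0
expander.call('events')
# => { 'events' => { 0 => 0, 1 => 0 } }

# A single negative lookup offset is applied to every partition
expander.call({ 'events' => -100 })
# => { 'events' => { 0 => -100, 1 => -100 } }
```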