karafka 2.0.0.alpha4 → 2.0.0.beta1
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.ruby-version +1 -1
- data/CHANGELOG.md +26 -3
- data/Gemfile.lock +11 -11
- data/bin/integrations +55 -43
- data/docker-compose.yml +4 -1
- data/karafka.gemspec +1 -1
- data/lib/karafka/base_consumer.rb +65 -12
- data/lib/karafka/connection/client.rb +35 -5
- data/lib/karafka/connection/listener.rb +11 -7
- data/lib/karafka/connection/messages_buffer.rb +44 -13
- data/lib/karafka/connection/pauses_manager.rb +2 -2
- data/lib/karafka/connection/rebalance_manager.rb +35 -20
- data/lib/karafka/contracts/config.rb +1 -0
- data/lib/karafka/instrumentation/{stdout_listener.rb → logger_listener.rb} +1 -1
- data/lib/karafka/instrumentation/monitor.rb +2 -1
- data/lib/karafka/pro/active_job/dispatcher.rb +9 -9
- data/lib/karafka/pro/active_job/job_options_contract.rb +9 -9
- data/lib/karafka/pro/loader.rb +13 -8
- data/lib/karafka/pro/performance_tracker.rb +80 -0
- data/lib/karafka/processing/executor.rb +15 -10
- data/lib/karafka/processing/jobs/base.rb +16 -0
- data/lib/karafka/processing/jobs/consume.rb +7 -2
- data/lib/karafka/processing/jobs_queue.rb +18 -9
- data/lib/karafka/processing/worker.rb +23 -0
- data/lib/karafka/railtie.rb +12 -0
- data/lib/karafka/scheduler.rb +21 -0
- data/lib/karafka/setup/config.rb +3 -1
- data/lib/karafka/templates/karafka.rb.erb +1 -1
- data/lib/karafka/time_trackers/pause.rb +10 -2
- data/lib/karafka/version.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +8 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f108cb4288d0ed0510381f51c77d49e052b947f6180c9b9c0b06e0ac2b599894
+  data.tar.gz: 3d79066d0107c08f450ca9f4c3b5c4a39aae497836c80bf8380c65f1406b82c0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4aae257010c992c59ce4b01ead54ff2cfd4e8ccd8cbe6b52214b3cedf8f879690e0d577f2b41f44b1ab6888d7e27bbc92f3ba4a69e8b127687fb4c43bff51fbc
+  data.tar.gz: f65e425cb84152d20a055bdb9a94fd98280597cdf5e431337cb8604040534cacbfdd03efd6dc23b86c9ecf25721c860bd55ca75ad3f98e4c66136a88c1efc4e7
checksums.yaml.gz.sig
CHANGED
Binary file
data/.ruby-version
CHANGED
@@ -1 +1 @@
-3.1.
+3.1.2
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,29 @@
 # Karafka framework changelog
 
-## 2.0.0-
+## 2.0.0-beta1 (2022-05-22)
+- Update the jobs queue blocking engine and allow for non-blocking jobs execution
+- Provide `#prepared` hook that always runs before the fetching loop is unblocked
+- [Pro] Introduce performance tracker for scheduling optimizer
+- Provide the ability to pause (`#pause`) and resume (`#resume`) given partitions from the consumers
+- Small integration specs refactoring + specs for pausing scenarios
+
+## 2.0.0-alpha6 (2022-04-17)
+- Fix a bug where, upon a missing boot file and Rails, the railtie would fail with a generic exception (#818)
+- Fix an issue with parallel pristine specs colliding with each other during `bundle install` (#820)
+- Replace the `consumer.consume` event with `consumer.consumed` to match the behaviour
+- Make sure that offset committing happens before the `consumer.consumed` event is propagated
+- Fix for failing when not installed (just a dependency) (#817)
+- Evict messages from partitions that were lost upon rebalancing (#825)
+- Do **not** run `#revoked` on partitions that were lost and assigned back upon rebalancing (#825)
+- Remove potential duplicates that could occur upon rebalance with re-assigned partitions (#825)
+- Optimize the integration test suite additional consumers shutdown process (#828)
+- Optimize messages eviction and duplicates removal on polls stopped due to lack of messages
+- Add static group membership integration spec
+
+## 2.0.0-alpha5 (2022-04-03)
+- Rename StdoutListener to LoggerListener (#811)
+
+## 2.0.0-alpha4 (2022-03-20)
 - Rails support without ActiveJob queue adapter usage (#805)
 
 ## 2.0.0-alpha3 (2022-03-16)
@@ -10,12 +33,12 @@
 
 ## 2.0.0-alpha2 (2022-02-19)
 - Require `kafka` keys to be symbols
-- Added ActiveJob Pro adapter
+- [Pro] Added ActiveJob Pro adapter
 - Small updates to the license and docs
 
 ## 2.0.0-alpha1 (2022-01-30)
 - Change license to `LGPL-3.0`
-- Introduce a Pro subscription
+- [Pro] Introduce a Pro subscription
 - Switch from `ruby-kafka` to `librdkafka` as an underlying driver
 - Introduce fully automatic integration tests that go through the whole server lifecycle
 - Integrate WaterDrop tightly with autoconfiguration inheritance and an option to redefine it
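Since alpha6 the `consumer.consume` event is published as `consumer.consumed`. A minimal subscription sketch; the handler body is illustrative, and `event[:caller]` / `event[:time]` assume the payload that `instrument('consumer.consumed', caller: self)` publishes:

Karafka.monitor.subscribe('consumer.consumed') do |event|
  # The consumer instance is passed in the payload as `caller: self`
  consumer = event[:caller]
  Karafka.logger.info("#{consumer.class} consumed a batch in #{event[:time]}ms")
end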
data/Gemfile.lock
CHANGED
@@ -1,31 +1,31 @@
 PATH
   remote: .
   specs:
-    karafka (2.0.0.alpha4)
+    karafka (2.0.0.beta1)
       dry-configurable (~> 0.13)
       dry-monitor (~> 0.5)
       dry-validation (~> 1.7)
       rdkafka (>= 0.10)
       thor (>= 0.20)
-      waterdrop (>= 2.
+      waterdrop (>= 2.3.0, < 3.0.0)
       zeitwerk (~> 2.3)
 
 GEM
   remote: https://rubygems.org/
   specs:
-    activejob (7.0.
-      activesupport (= 7.0.
+    activejob (7.0.3)
+      activesupport (= 7.0.3)
       globalid (>= 0.3.6)
-    activesupport (7.0.
+    activesupport (7.0.3)
       concurrent-ruby (~> 1.0, >= 1.0.2)
       i18n (>= 1.6, < 2)
       minitest (>= 5.1)
       tzinfo (~> 2.0)
     byebug (11.1.3)
-    concurrent-ruby (1.1.
+    concurrent-ruby (1.1.10)
     diff-lcs (1.5.0)
     docile (1.4.0)
-    dry-configurable (0.
+    dry-configurable (0.15.0)
       concurrent-ruby (~> 1.0)
       dry-core (~> 0.6)
     dry-container (0.9.0)
@@ -64,7 +64,7 @@ GEM
       dry-core (~> 0.5, >= 0.5)
       dry-initializer (~> 3.0)
       dry-schema (~> 1.9, >= 1.9.1)
-    factory_bot (6.2.
+    factory_bot (6.2.1)
       activesupport (>= 5.0.0)
     ffi (1.15.5)
     globalid (1.0.0)
@@ -87,7 +87,7 @@ GEM
     rspec-expectations (3.11.0)
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.11.0)
-    rspec-mocks (3.11.
+    rspec-mocks (3.11.1)
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.11.0)
     rspec-support (3.11.0)
@@ -100,7 +100,7 @@ GEM
     thor (1.2.1)
     tzinfo (2.0.4)
       concurrent-ruby (~> 1.0)
-    waterdrop (2.
+    waterdrop (2.3.0)
       concurrent-ruby (>= 1.1)
       dry-configurable (~> 0.13)
       dry-monitor (~> 0.5)
@@ -121,4 +121,4 @@ DEPENDENCIES
   simplecov
 
 BUNDLED WITH
-   2.3.
+   2.3.11
data/bin/integrations
CHANGED
@@ -44,17 +44,30 @@ class Scenario
   # @param path [String] path to the scenarios file
   def initialize(path)
     @path = path
-    @stdin, @stdout, @stderr, @wait_thr = Open3.popen3(init_and_build_cmd)
-    @started_at = current_time
     # Last 1024 characters from stdout
     @stdout_tail = ''
   end
 
+  # Starts running given scenario in a separate process
+  def start
+    @stdin, @stdout, @stderr, @wait_thr = Open3.popen3(init_and_build_cmd)
+    @started_at = current_time
+  end
+
   # @return [String] integration spec name
   def name
     @path.gsub("#{ROOT_PATH}/spec/integrations/", '')
   end
 
+  # @return [Boolean] true if spec is pristine
+  def pristine?
+    scenario_dir = File.dirname(@path)
+
+    # If there is a Gemfile in a scenario directory, it means it is a pristine spec and we need
+    # to run bundle install, etc in order to run it
+    File.exist?(File.join(scenario_dir, 'Gemfile'))
+  end
+
   # @return [Boolean] did this scenario finish or is it still running
   def finished?
     # If the thread is running too long, kill it
@@ -73,6 +86,13 @@ class Scenario
     !@wait_thr.alive?
   end
 
+  # @return [Boolean] did this scenario finish successfully or not
+  def success?
+    expected_exit_codes = EXIT_CODES[name] || EXIT_CODES[:default]
+
+    expected_exit_codes.include?(exit_code)
+  end
+
   # @return [Integer] pid of the process of this scenario
   def pid
     @wait_thr.pid
@@ -84,13 +104,6 @@ class Scenario
     @wait_thr.value&.exitstatus || 123
   end
 
-  # @return [Boolean] did this scenario finish successfully or not
-  def success?
-    expected_exit_codes = EXIT_CODES[name] || EXIT_CODES[:default]
-
-    expected_exit_codes.include?(exit_code)
-  end
-
   # Prints a status report when scenario is finished and stdout if it failed
   def report
     result = success? ? "\e[#{32}m#{'OK'}\e[0m" : "\e[#{31}m#{'FAILED'}\e[0m"
@@ -109,11 +122,10 @@ class Scenario
   # Sets up a proper environment for a given spec to run and returns the run command
   # @return [String] run command
   def init_and_build_cmd
-    scenario_dir = File.dirname(@path)
-
     # If there is a Gemfile in a scenario directory, it means it is a pristine spec and we need
     # to run bundle install, etc in order to run it
-    if
+    if pristine?
+      scenario_dir = File.dirname(@path)
      # We copy the spec into a temp dir, not to pollute the spec location with logs, etc
      temp_dir = Dir.mktmpdir
      file_name = File.basename(@path)
@@ -141,31 +153,6 @@ class Scenario
   end
 end
 
-# Simple array to keep track of active integration processes threads running with info on which
-# test scenario is running
-active_scenarios = []
-
-# Finished runners
-finished_scenarios = []
-
-# Waits for any of the processes to be finished and tracks exit codes
-#
-# @param active_scenarios [Array] active runners
-# @param finished_scenarios [Hash] finished forks exit codes
-def wait_and_track(active_scenarios, finished_scenarios)
-  exited = active_scenarios.find(&:finished?)
-
-  if exited
-    scenario = active_scenarios.delete(exited)
-
-    scenario.report
-
-    finished_scenarios << scenario
-  else
-    Thread.pass
-  end
-end
-
 # Load all the specs
 specs = Dir[ROOT_PATH.join('spec/integrations/**/*.rb')]
 
@@ -182,15 +169,40 @@ seed = (ENV['SEED'] || rand(0..10_000)).to_i
 
 puts "Random seed: #{seed}"
 
-
-
+scenarios = specs
+  .shuffle(random: Random.new(seed))
+  .map { |integration_test| Scenario.new(integration_test) }
 
-
+regulars = scenarios.reject(&:pristine?)
+pristine = scenarios.select(&:pristine?)
 
-
-
+active_scenarios = []
+finished_scenarios = []
+
+while finished_scenarios.size < scenarios.size
+  # If we have space to run another scenario, we add it
+  if active_scenarios.size < CONCURRENCY
+    scenario = nil
+    # We can run only one pristine at the same time due to concurrency issues within bundler
+    # Since they usually take longer than others, we try to run them as fast as possible when
+    # there is a slot
+    scenario = pristine.pop unless active_scenarios.any?(&:pristine?)
+    scenario ||= regulars.pop
+
+    if scenario
+      scenario.start
+      active_scenarios << scenario
+    end
+  end
 
-
+  active_scenarios.select(&:finished?).each do |exited|
+    scenario = active_scenarios.delete(exited)
+    scenario.report
+    finished_scenarios << scenario
+  end
+
+  sleep(0.1)
+end
 
 # Fail all if any of the tests does not have expected exit code
 raise IntegrationTestError unless finished_scenarios.all?(&:success?)
data/docker-compose.yml
CHANGED
@@ -14,7 +14,10 @@ services:
       KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
       KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'true'
       KAFKA_CREATE_TOPICS:
-        "
+        "integrations_0_02:2:1,\
+        integrations_1_02:2:1,\
+        integrations_2_02:2:1,\
+        integrations_0_03:3:1,\
         integrations_1_03:3:1,\
         integrations_2_03:3:1,\
         integrations_0_10:10:1,\
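Each entry in `KAFKA_CREATE_TOPICS` follows the `name:partitions:replicas` convention of the Kafka image's topic bootstrap, so the new `integrations_0_02:2:1` entries pre-create two-partition topics with a replication factor of 1, presumably for the new pausing integration specs.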
data/karafka.gemspec
CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'dry-validation', '~> 1.7'
   spec.add_dependency 'rdkafka', '>= 0.10'
   spec.add_dependency 'thor', '>= 0.20'
-  spec.add_dependency 'waterdrop', '>= 2.
+  spec.add_dependency 'waterdrop', '>= 2.3.0', '< 3.0.0'
   spec.add_dependency 'zeitwerk', '~> 2.3'
 
   spec.required_ruby_version = '>= 2.6.0'
data/lib/karafka/base_consumer.rb
CHANGED
@@ -10,8 +10,8 @@ module Karafka
     attr_accessor :messages
     # @return [Karafka::Connection::Client] kafka connection client
     attr_accessor :client
-    # @return [Karafka::TimeTrackers::Pause] current topic partition pause
-    attr_accessor :
+    # @return [Karafka::TimeTrackers::Pause] current topic partition pause tracker
+    attr_accessor :pause_tracker
     # @return [Waterdrop::Producer] producer instance
     attr_accessor :producer
@@ -21,18 +21,18 @@ module Karafka
     # that may not yet kick in when error occurs. That way we pause always on the last processed
     # message.
     def on_consume
-      Karafka.monitor.instrument('consumer.
+      Karafka.monitor.instrument('consumer.consumed', caller: self) do
         consume
-      end
 
-
+        pause_tracker.reset
 
-
-
+        # Mark as consumed only if manual offset management is not on
+        return if topic.manual_offset_management
 
-
-
-
+        # We use the non-blocking one here. If someone needs the blocking one, they can
+        # implement it with manual offset management
+        mark_as_consumed(messages.last)
+      end
     rescue StandardError => e
       Karafka.monitor.instrument(
         'error.occurred',
@@ -40,8 +40,8 @@ module Karafka
         caller: self,
         type: 'consumer.consume.error'
       )
-
-      pause.
+
+      pause(@seek_offset || messages.first.offset)
     end
 
     # Trigger method for running on shutdown.
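Net effect of the two hunks above: on error, the consumer now pauses the partition at the first offset that was not yet marked as consumed (`@seek_offset` when part of the batch already succeeded, otherwise the batch's first message), so a retry resumes exactly where processing stopped instead of re-consuming the whole batch.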
@@ -76,8 +76,31 @@ module Karafka
       )
     end
 
+    # Can be used to run preparation code
+    #
+    # @private
+    # @note This should not be used by the end users as it is part of the lifecycle of things
+    #   but not part of the public API. This can act as a hook when creating non-blocking
+    #   consumers and doing other advanced stuff
+    def on_prepared
+      Karafka.monitor.instrument('consumer.prepared', caller: self) do
+        prepared
+      end
+    rescue StandardError => e
+      Karafka.monitor.instrument(
+        'error.occurred',
+        error: e,
+        caller: self,
+        type: 'consumer.prepared.error'
+      )
+    end
+
     private
 
+    # Method that gets called in the blocking flow allowing to setup any type of resources or to
+    # send additional commands to Kafka before the proper execution starts.
+    def prepared; end
+
     # Method that will perform business logic on data received from Kafka (it will consume
     # the data)
     # @note This method needs to be implemented in a subclass. We stub it here as a failover if
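The hook pairs with an override in a consumer subclass. A minimal sketch; `EventsConsumer` and the log lines are illustrative, not part of the gem:

class EventsConsumer < Karafka::BaseConsumer
  def consume
    messages.each { |message| Karafka.logger.info(message.raw_payload) }
  end

  private

  # Runs in the blocking flow before the fetching loop is unblocked, so it can
  # set up per-batch resources ahead of the actual #consume job
  def prepared
    Karafka.logger.info("About to process #{messages.count} messages")
  end
end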
@@ -97,6 +120,10 @@ module Karafka
     # Marks message as consumed in an async way.
     #
     # @param message [Messages::Message] last successfully processed message.
+    # @note We keep track of this offset in case we would mark as consumed and get an error when
+    #   processing another message. In a case like this we do not pause on the message we've
+    #   already processed but rather on the next one. This applies to both sync and async
+    #   versions of this method.
     def mark_as_consumed(message)
       client.mark_as_consumed(message)
       @seek_offset = message.offset + 1
@@ -110,6 +137,32 @@ module Karafka
       @seek_offset = message.offset + 1
     end
 
+    # Pauses processing on a given offset for the current topic partition
+    #
+    # After given partition is resumed, it will continue processing from the given offset
+    # @param offset [Integer] offset from which we want to restart the processing
+    # @param timeout [Integer, nil] how long in milliseconds do we want to pause or nil to use
+    #   the default exponential pausing strategy defined for retries
+    def pause(offset, timeout = nil)
+      client.pause(
+        messages.metadata.topic,
+        messages.metadata.partition,
+        offset
+      )
+
+      timeout ? pause_tracker.pause(timeout) : pause_tracker.pause
+    end
+
+    # Resumes processing of the current topic partition
+    def resume
+      client.resume(
+        messages.metadata.topic,
+        messages.metadata.partition
+      )
+
+      pause_tracker.expire
+    end
+
     # Seeks in the context of current topic and partition
     #
     # @param offset [Integer] offset where we want to seek
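These methods back the changelog entry about pausing and resuming partitions from consumers. A throttling sketch, assuming the default behaviour where an expired pause gets resumed by the listener; the class name and the 1000 ms value are illustrative:

class ThrottledConsumer < Karafka::BaseConsumer
  def consume
    messages.each { |message| Karafka.logger.info(message.raw_payload) }

    # Back off for one second, then continue from the offset right after
    # the last message we processed
    pause(messages.last.offset + 1, 1_000)
  end
end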
data/lib/karafka/connection/client.rb
CHANGED
@@ -48,6 +48,7 @@ module Karafka
       time_poll.start
 
       @buffer.clear
+      @rebalance_manager.clear
 
       loop do
         # Don't fetch more messages if we do not have any time left
@@ -58,13 +59,23 @@ module Karafka
         # Fetch message within our time boundaries
         message = poll(time_poll.remaining)
 
-        #
-
-
-        @buffer << message
+        # Put a message to the buffer if there is one
+        @buffer << message if message
 
         # Track time spent on all of the processing and polling
         time_poll.checkpoint
+
+        # Upon polling, the rebalance manager might have been updated.
+        # If partition revocation happens, we need to remove messages from revoked partitions
+        # as well as ensure we do not have duplicates due to the offset reset for partitions
+        # that we got assigned
+        remove_revoked_and_duplicated_messages if @rebalance_manager.revoked_partitions?
+
+        # Finally, once we've (potentially) removed revoked partitions, etc, if no messages
+        # were returned we can break.
+        # Worth keeping in mind that the rebalance manager might have been updated despite no
+        # messages being returned during a poll
+        break unless message
       end
 
       @buffer
@@ -84,6 +95,9 @@ module Karafka
     # Ignoring a case where there would not be an offset (for example when rebalance occurs).
     #
     # @param async [Boolean] should the commit happen async or sync (async by default)
+    # @return [Boolean] whether committing was successful. It may not be, when we no longer own
+    #   the given partition.
+    #
     # @note This will commit all the offsets for the whole consumer. In order to achieve
     #   granular control over where the offset should be for particular topic partitions, the
     #   store_offset should be used to only store a new offset when we want it to be flushed
@@ -212,6 +226,8 @@ module Karafka
       ::Karafka::Instrumentation.error_callbacks.delete(@subscription_group.id)
 
       @kafka.close
+      @buffer.clear
+      @rebalance_manager.clear
     end
   end
 
@@ -232,7 +248,7 @@ module Karafka
     # Performs a single poll operation.
     #
     # @param timeout [Integer] timeout for a single poll
-    # @return [
+    # @return [Rdkafka::Consumer::Message, nil] fetched message or nil if nothing polled
     def poll(timeout)
       time_poll ||= TimeTrackers::Poll.new(timeout)
 
@@ -301,6 +317,20 @@ module Karafka
 
       consumer
     end
+
+    # We may have a case where in the middle of data polling, we've lost a partition.
+    # In a case like this we should remove all the pre-buffered messages from lost partitions
+    # as we are no longer responsible in a given process for processing those messages and
+    # they should have been picked up by a different process.
+    def remove_revoked_and_duplicated_messages
+      @rebalance_manager.revoked_partitions.each do |topic, partitions|
+        partitions.each do |partition|
+          @buffer.delete(topic, partition)
+        end
+      end
+
+      @buffer.uniq!
+    end
   end
 end
 end
data/lib/karafka/connection/listener.rb
CHANGED
@@ -15,6 +15,8 @@ module Karafka
       @pauses_manager = PausesManager.new
       @client = Client.new(@subscription_group)
       @executors = Processing::ExecutorsBuffer.new(@client, subscription_group)
+      # We reference scheduler here as it is much faster than fetching this each time
+      @scheduler = ::Karafka::App.config.internal.scheduler
     end
 
     # Runs the main listener fetch loop.
@@ -66,9 +68,9 @@ module Karafka
       # distributing consuming jobs as upon revoking, we might get assigned to the same
       # partitions, thus getting their jobs. The revoking jobs need to finish before
       # appropriate consumers are taken down and re-created
-      wait(@subscription_group) if
+      wait(@subscription_group) if schedule_revoke_lost_partitions_jobs
 
-
+      schedule_partitions_jobs(messages_buffer)
 
       # We wait only on jobs from our subscription group. Other groups are independent.
       wait(@subscription_group)
@@ -103,15 +105,17 @@ module Karafka
 
     # Enqueues revoking jobs for partitions that were taken away from the running process.
     # @return [Boolean] was there anything to revoke
-
+    # @note We do not use the scheduler here as those jobs are not meant to be order optimized
+    #   in any way. Since they operate occasionally, it is irrelevant.
+    def schedule_revoke_lost_partitions_jobs
       revoked_partitions = @client.rebalance_manager.revoked_partitions
 
       return false if revoked_partitions.empty?
 
       revoked_partitions.each do |topic, partitions|
         partitions.each do |partition|
-
-          executor = @executors.fetch(topic, partition,
+          pause_tracker = @pauses_manager.fetch(topic, partition)
+          executor = @executors.fetch(topic, partition, pause_tracker)
          @jobs_queue << Processing::Jobs::Revoked.new(executor)
        end
      end
@@ -122,8 +126,8 @@ module Karafka
     # Takes the messages per topic partition and enqueues processing jobs in threads.
     #
     # @param messages_buffer [Karafka::Connection::MessagesBuffer] buffer with messages
-    def
-      messages_buffer
+    def schedule_partitions_jobs(messages_buffer)
+      @scheduler.call(messages_buffer) do |topic, partition, messages|
        pause = @pauses_manager.fetch(topic, partition)
 
        next if pause.paused?
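`schedule_partitions_jobs` now routes ordering decisions through the configured scheduler (`internal.scheduler`; see the new `data/lib/karafka/scheduler.rb` in the file list above). A sketch of an object satisfying the `#call(messages_buffer)` contract assumed above; this is an illustration, not the gem's implementation:

class FifoScheduler
  # Enqueues jobs in plain FIFO order
  # @param messages_buffer [Karafka::Connection::MessagesBuffer]
  def call(messages_buffer)
    # MessagesBuffer#each now delegates to the underlying groups hash,
    # yielding topic => partitions pairs (see the buffer changes below)
    messages_buffer.each do |topic, partitions|
      partitions.each do |partition, messages|
        yield(topic, partition, messages)
      end
    end
  end
end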
data/lib/karafka/connection/messages_buffer.rb
CHANGED
@@ -10,6 +10,10 @@ module Karafka
     class MessagesBuffer
       attr_reader :size
 
+      extend Forwardable
+
+      def_delegators :@groups, :each
+
       # @return [Karafka::Connection::MessagesBuffer] buffer instance
       def initialize
         @size = 0
@@ -20,19 +24,6 @@ module Karafka
         end
       end
 
-      # Iterates over aggregated data providing messages per topic partition.
-      #
-      # @yieldparam [String] topic name
-      # @yieldparam [Integer] partition number
-      # @yieldparam [Array<Rdkafka::Consumer::Message>] topic partition aggregated results
-      def each
-        @groups.each do |topic, partitions|
-          partitions.each do |partition, messages|
-            yield(topic, partition, messages)
-          end
-        end
-      end
-
       # Adds a message to the buffer.
       #
       # @param message [Rdkafka::Consumer::Message] raw rdkafka message
@@ -42,6 +33,37 @@ module Karafka
         @groups[message.topic][message.partition] << message
       end
 
+      # Removes given topic and partition data out of the buffer
+      # This is used when there's a partition revocation
+      # @param topic [String] topic we're interested in
+      # @param partition [Integer] partition of which data we want to remove
+      def delete(topic, partition)
+        return unless @groups.key?(topic)
+        return unless @groups.fetch(topic).key?(partition)
+
+        topic_data = @groups.fetch(topic)
+        topic_data.delete(partition)
+
+        recount!
+
+        # If there are no more partitions to handle in a given topic, remove it completely
+        @groups.delete(topic) if topic_data.empty?
+      end
+
+      # Removes duplicated messages from the same partitions
+      # This should be used only when a rebalance occurs, as we may get data we already have
+      # due to reprocessing from the last committed offset. In cases like this we want to
+      # ensure as few duplications as possible
+      def uniq!
+        @groups.each_value do |partitions|
+          partitions.each_value do |messages|
+            messages.uniq!(&:offset)
+          end
+        end
+
+        recount!
+      end
+
       # Removes all the data from the buffer.
       #
       # @note We do not clear the whole groups hash but rather we clear the partition hashes, so
@@ -52,6 +74,15 @@ module Karafka
         @size = 0
         @groups.each_value(&:clear)
       end
+
+      private
+
+      # Updates the messages count if we performed any operations that could change the state
+      def recount!
+        @size = @groups.each_value.sum do |partitions|
+          partitions.each_value.map(&:count).sum
+        end
+      end
     end
   end
 end
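The offset-based dedup inside `#uniq!` can be shown in isolation; `Message` below is a stand-in struct, not the rdkafka message class:

Message = Struct.new(:offset, :payload)

batch = [Message.new(0, 'a'), Message.new(1, 'b'), Message.new(0, 'a')]

# Array#uniq! keeps the first occurrence per block value
batch.uniq!(&:offset)

batch.map(&:offset) # => [0, 1]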
data/lib/karafka/connection/pauses_manager.rb
CHANGED
@@ -12,11 +12,11 @@ module Karafka
       end
     end
 
-    # Creates or fetches pause of a given topic partition.
+    # Creates or fetches pause tracker of a given topic partition.
     #
     # @param topic [String] topic name
     # @param partition [Integer] partition number
-    # @return [Karafka::TimeTrackers::Pause] pause instance
+    # @return [Karafka::TimeTrackers::Pause] pause tracker instance
     def fetch(topic, partition)
       @pauses[topic][partition] ||= TimeTrackers::Pause.new(
         timeout: Karafka::App.config.pause_timeout,