karafka 2.5.6 → 2.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 11edff86c8615652130786431e179d242dcc33130e5aabbbf5e0f5ed6d4138fe
- data.tar.gz: 69244021709283a153da19907a424c230a3597832beec6acbb39dbc02e738256
+ metadata.gz: 9a0dd67b39af3fbad16ce4c96f061309d97e21ae95bcace2bab1e7879823a622
+ data.tar.gz: 0061a806b411a0526826327be18946fb39fe90c324a82e45d329ef9a45e3168a
  SHA512:
- metadata.gz: 843548637c77ace03cde5c1f4150226c244b1b3343c6acd17d16c19cdc798c6d770611522eed7ebef76830f3e551c75afa176f2937c1fd63045e66ccb6276701
- data.tar.gz: a0696c28b998c8e13d4b0e0d3f4252245f27e6f93e0d75d2153a93cc1dcc53b45fbe8416ba0a9e6025753ec5d107088d578f03d0a4cada3bb5ed2eb4e5341518
+ metadata.gz: 7cae09aabcea7cf6eb692bc5747dc51327601eebd680314a62c064c626f79db17a7a6c3f0e49a6a95859eeb4d6e30a51b7a6cb3b7acd1c69400734981e926db6
+ data.tar.gz: 1c1c031bd16d1da91fa7fc3a2627e157db698b51883c91197c6cf5311d19abd8d9be75742db447d588e73c1a30dca196f991a4ec9ee49f31090fcb2863f45b47
data/CHANGELOG.md CHANGED
@@ -1,5 +1,21 @@
  # Karafka Framework Changelog

+ ## 2.5.8 (2026-03-23)
+ - **[Feature]** Add `Karafka::Admin::Recovery` for coordinator-bypass offset reading and consumer group migration when the Kafka group coordinator is in a FAILED state (Pro).
+
+ ## 2.5.7 (2026-03-16)
+ - [Enhancement] Report detailed blocking information (active listeners, alive workers, and in-processing jobs) during forceful shutdown instead of only aggregate counts.
+ - [Enhancement] Improve `ForcefulShutdownError` description to clearly explain when and why it is raised.
+ - [Enhancement] Cache `messages.last` in `BatchMetadata` builder to avoid duplicate array traversal.
+ - [Enhancement] Optimize `VirtualOffsetManager#mark` to use a single array scan instead of separate `include?` and `index` calls (Pro).
+ - [Enhancement] Optimize `VirtualOffsetManager#materialize_real_offset` to use `keys.sort` instead of `to_a.sort_by` with tuple destructuring (Pro).
+ - [Enhancement] Optimize `IntervalRunner#call` to use a single `monotonic_now` call instead of two per invocation.
+ - [Enhancement] Support WaterDrop `:fd` mode in Swarm.
+ - [Maintenance] Use both `:fd` and `:thread` producer backends in CI.
+ - [Maintenance] Include spec file hash in integration test topic names for easier traceability in Kafka logs (#3056).
+ - [Fix] Remove duplicate topic creation in multi-broker health integration specs (#3056).
+ - [Fix] Preserve producer-specific kafka settings (e.g., `enable.idempotence`) when recreating the producer in swarm forks.
+
  ## 2.5.6 (2026-02-28)
  - **[Feature]** Add `karafka topics health` command to check Kafka topics for replication and durability issues, detecting no redundancy (RF=1), zero fault tolerance (RF≤min.insync), and low durability (min.insync=1) configurations with color-coded severity grouping and actionable recommendations (Pro).
  - [Enhancement] Optimize license loading process by reading license files directly from the gem directory instead of requiring the entire gem, reducing initialization overhead and adding support for user-defined License modules.
@@ -51,7 +51,9 @@ module Karafka
  end
  end

- # Raised when we've waited enough for shutting down a non-responsive process
+ # Raised when the graceful shutdown timeout has been exceeded and Karafka must forcefully
+ # terminate remaining listeners and workers. This typically happens when consumer processing
+ # or shutdown jobs take longer than the configured `shutdown_timeout`.
  ForcefulShutdownError = Class.new(BaseError)

  # Raised when the jobs queue receives a job that should not be received as it would cause
@@ -26,9 +26,11 @@ module Karafka

  # Runs the requested code if it was not executed previously recently
  def call
- return if monotonic_now - @last_called_at < @interval
+ now = monotonic_now

- @last_called_at = monotonic_now
+ return if now - @last_called_at < @interval
+
+ @last_called_at = now

  @block.call
  end
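The change above trims `IntervalRunner#call` to a single monotonic clock read per invocation. A minimal standalone sketch of the same gating pattern is shown below; the injectable `clock` lambda and the `IntervalGate` name are assumptions added here for testability, not part of Karafka's API:

```ruby
# A minimal interval gate: runs the block at most once per `interval_ms`.
# Reads the clock once per call and reuses the value for both the
# "too soon?" check and the timestamp update, mirroring the optimization.
class IntervalGate
  def initialize(interval_ms, clock:, &block)
    @interval = interval_ms
    @clock = clock
    @last_called_at = -Float::INFINITY
    @block = block
  end

  def call
    now = @clock.call # single clock read per invocation

    return if now - @last_called_at < @interval

    @last_called_at = now
    @block.call
  end
end
```

With a fake clock, the gate runs the block on the first call, skips calls inside the interval, and runs again once the interval has elapsed.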
@@ -385,21 +385,34 @@ module Karafka
  fatal "Runner crashed due to an error: #{details}"
  fatal backtrace
  when "app.stopping.error"
- # Counts number of workers and listeners that were still active when forcing the
- # shutdown. Please note, that unless all listeners are closed, workers will not finalize
- # their operations as well.
- # We need to check if listeners and workers are assigned as during super early stages of
- # boot they are not.
- listeners = Server.listeners ? Server.listeners.count(&:active?) : 0
- workers = Server.workers ? Server.workers.count(&:alive?) : 0
+ active_listeners = event.payload[:active_listeners]
+ alive_workers = event.payload[:alive_workers]
+ in_processing = event.payload[:in_processing]

  message = <<~MSG.tr("\n", " ").strip!
  Forceful Karafka server stop with:
- #{workers} active workers and
- #{listeners} active listeners
+ #{alive_workers.size} active workers and
+ #{active_listeners.size} active listeners
  MSG

  error message
+
+ active_listeners.each do |listener|
+ error "Listener #{listener.id} for #{listener.subscription_group.name} still active"
+ end
+
+ in_processing.each do |group_id, jobs|
+ next if jobs.empty?
+
+ jobs.each do |job|
+ job_class = job.class.name.split("::").last
+ topic_name = job.executor.topic.name
+ partition = job.executor.partition
+
+ error "In processing: #{job_class} job for #{topic_name}/#{partition} " \
+ "(group: #{group_id})"
+ end
+ end
  when "app.forceful_stopping.error"
  error "Forceful shutdown error occurred: #{details}"
  error backtrace
@@ -117,7 +117,12 @@ module Karafka
  when "runner.call.error"
  fatal "Runner crashed due to an error: #{error}"
  when "app.stopping.error"
- error "Forceful Karafka server stop"
+ active_listeners = event.payload[:active_listeners]
+ alive_workers = event.payload[:alive_workers]
+
+ error "Forceful Karafka server stop with: " \
+ "#{alive_workers.size} active workers and " \
+ "#{active_listeners.size} active listeners"
  when "app.forceful_stopping.error"
  error "Forceful shutdown error occurred: #{error}"
  when "librdkafka.error"
@@ -17,16 +17,18 @@ module Karafka
  # @note We do not set `processed_at` as this needs to be assigned when the batch is
  # picked up for processing.
  def call(messages, topic, partition, scheduled_at)
+ last_message = messages.last
+
  Karafka::Messages::BatchMetadata.new(
  size: messages.size,
  first_offset: messages.first&.offset || -1001,
- last_offset: messages.last&.offset || -1001,
+ last_offset: last_message&.offset || -1001,
  deserializers: topic.deserializers,
  partition: partition,
  topic: topic.name,
  # We go with the assumption that the creation of the whole batch is the last message
  # creation time
- created_at: local_created_at(messages.last),
+ created_at: local_created_at(last_message),
  # When this batch was built and scheduled for execution
  scheduled_at: scheduled_at,
  # This needs to be set to a correct value prior to processing starting
@@ -0,0 +1,43 @@
+ # frozen_string_literal: true
+
+ # Karafka Pro - Source Available Commercial Software
+ # Copyright (c) 2017-present Maciej Mensfeld. All rights reserved.
+ #
+ # This software is NOT open source. It is source-available commercial software
+ # requiring a paid license for use. It is NOT covered by LGPL.
+ #
+ # PROHIBITED:
+ # - Use without a valid commercial license
+ # - Redistribution, modification, or derivative works without authorization
+ # - Use as training data for AI/ML models or inclusion in datasets
+ # - Scraping, crawling, or automated collection for any purpose
+ #
+ # PERMITTED:
+ # - Reading, referencing, and linking for personal or commercial use
+ # - Runtime retrieval by AI assistants, coding agents, and RAG systems
+ # for the purpose of providing contextual help to Karafka users
+ #
+ # License: https://karafka.io/docs/Pro-License-Comm/
+ # Contact: contact@karafka.io
+
+ module Karafka
+ module Pro
+ # Pro Admin utilities
+ module Admin
+ class Recovery < Karafka::Admin
+ # Recovery related errors
+ module Errors
+ # Base for all the recovery errors
+ BaseError = Class.new(::Karafka::Errors::BaseError)
+
+ # Raised when required cluster metadata cannot be retrieved (topic, partition, or
+ # broker not found)
+ MetadataError = Class.new(BaseError)
+
+ # Raised when a partition number is outside the valid range for __consumer_offsets
+ PartitionOutOfRangeError = Class.new(BaseError)
+ end
+ end
+ end
+ end
+ end
@@ -0,0 +1,478 @@
+ # frozen_string_literal: true
+
+ # Karafka Pro - Source Available Commercial Software
+ # Copyright (c) 2017-present Maciej Mensfeld. All rights reserved.
+ #
+ # This software is NOT open source. It is source-available commercial software
+ # requiring a paid license for use. It is NOT covered by LGPL.
+ #
+ # PROHIBITED:
+ # - Use without a valid commercial license
+ # - Redistribution, modification, or derivative works without authorization
+ # - Use as training data for AI/ML models or inclusion in datasets
+ # - Scraping, crawling, or automated collection for any purpose
+ #
+ # PERMITTED:
+ # - Reading, referencing, and linking for personal or commercial use
+ # - Runtime retrieval by AI assistants, coding agents, and RAG systems
+ # for the purpose of providing contextual help to Karafka users
+ #
+ # License: https://karafka.io/docs/Pro-License-Comm/
+ # Contact: contact@karafka.io
+
+ module Karafka
+ module Pro
+ # Pro Admin utilities
+ module Admin
+ # Consumer group recovery toolkit.
+ #
+ # Provides coordinator-bypass offset reading and blast-radius assessment for scenarios
+ # where the Kafka group coordinator is in a FAILED state and normal admin APIs return
+ # NOT_COORDINATOR or time out.
+ #
+ # Works for any coordinator failure scenario:
+ # - KAFKA-19862 (compaction race during coordinator load)
+ # - Broker OOM / GC pause making coordinator unreachable
+ # - Network partition isolating the coordinator broker
+ # - Any future bug that transitions a coordinator shard to FAILED
+ #
+ # Each consumer group is assigned to a specific __consumer_offsets partition (and therefore
+ # a specific coordinator broker) based on its name. When that coordinator enters a FAILED
+ # state, all operations for the group - joins, heartbeats, offset commits, and offset
+ # fetches - are stuck until the coordinator recovers.
+ #
+ # A common recovery strategy is migrating to a new consumer group with a different name,
+ # which causes Kafka to hash it to a (likely) different __consumer_offsets partition served
+ # by a healthy coordinator. This class provides the tools to:
+ # 1. Read committed offsets directly from the raw __consumer_offsets log (bypassing the
+ # broken coordinator) via {#read_committed_offsets}
+ # 2. Assess blast radius: which broker coordinates a group ({#coordinator_for}), which
+ # partitions a broker leads ({#affected_partitions}), and which groups are affected
+ # ({#affected_groups})
+ #
+ # To complete the migration, use {Karafka::Admin::ConsumerGroups.seek} to write the
+ # recovered offsets to the new group.
+ #
+ # All reads go through the fetch API and never touch the group coordinator.
+ #
+ # @note These methods should NOT be used unless you are experiencing issues that require
+ # manual intervention. Misuse can lead to data loss or other problems.
+ class Recovery < Karafka::Admin
+ # Internal topic where Kafka stores committed offsets and group metadata
+ OFFSETS_TOPIC = "__consumer_offsets"
+
+ # Default lookback window for offset scanning (1 hour). Covers any normal commit interval.
+ # Provide an earlier Time if your group commits infrequently or the incident has been
+ # ongoing for longer than 1 hour.
+ DEFAULT_LAST_COMMITTED_AT_OFFSET = 3_600
+
+ private_constant :OFFSETS_TOPIC, :DEFAULT_LAST_COMMITTED_AT_OFFSET
+
+ class << self
+ # @param consumer_group_id [String] consumer group to read offsets for
+ # @param last_committed_at [Time] approximate time of last successful offset commit
+ # (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
+ # @return [Hash{String => Hash{Integer => Integer}}]
+ # @see #read_committed_offsets
+ def read_committed_offsets(
+ consumer_group_id,
+ last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET
+ )
+ new.read_committed_offsets(consumer_group_id, last_committed_at: last_committed_at)
+ end
+
+ # @param consumer_group_id [String] consumer group id
+ # @return [Integer] __consumer_offsets partition number
+ # @see #offsets_partition_for
+ def offsets_partition_for(consumer_group_id)
+ new.offsets_partition_for(consumer_group_id)
+ end
+
+ # @param consumer_group_id [String] consumer group to look up
+ # @return [Hash] coordinator broker info
+ # @see #coordinator_for
+ def coordinator_for(consumer_group_id)
+ new.coordinator_for(consumer_group_id)
+ end
+
+ # @param partition [Integer] __consumer_offsets partition to scan
+ # @param last_committed_at [Time] approximate time of last successful offset commit
+ # (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
+ # @return [Array<String>] sorted consumer group names
+ # @see #affected_groups
+ def affected_groups(partition, last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET)
+ new.affected_groups(partition, last_committed_at: last_committed_at)
+ end
+
+ # @param broker_id [Integer] broker node id
+ # @return [Array<Integer>] sorted partition numbers
+ # @see #affected_partitions
+ def affected_partitions(broker_id)
+ new.affected_partitions(broker_id)
+ end
+ end
+
+ # Reads committed offsets for a consumer group directly from the __consumer_offsets internal
+ # topic, bypassing the group coordinator. Only scans the single __consumer_offsets partition
+ # that holds data for the given group (determined by Java's String#hashCode mod partition
+ # count), starting from last_committed_at and reading forward to EOF. Later records
+ # overwrite earlier ones so the result always reflects the most recent committed offset per
+ # partition.
+ #
+ # @note All consumers in this group should be fully stopped before calling this method.
+ # While normally they would already be stopped due to a coordinator failure, if the
+ # cluster recovers concurrently, active consumers may commit newer offsets that this scan
+ # will not capture, resulting in stale data.
+ #
+ # @note This method may take a noticeable amount of time to complete because it scans
+ # the raw __consumer_offsets log from last_committed_at forward to the end. The duration
+ # depends on the volume of offset commits in the scan window across all consumer groups
+ # that hash to the same __consumer_offsets partition.
+ #
+ # @note The result only contains topic-partitions that had offsets committed after
+ # last_committed_at. If a partition never had an offset committed, or if the commit
+ # happened before last_committed_at, it will be absent from the result. It is the
+ # caller's responsibility to verify that all expected topic-partitions are present before
+ # using the result for migration or other operations.
+ #
+ # @param consumer_group_id [String] consumer group to read offsets for
+ # @param last_committed_at [Time] approximate time of last successful offset commit
+ # (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
+ # @return [Hash{String => Hash{Integer => Integer}}]
+ # { topic => { partition => committed_offset } }
+ #
+ # @example Read offsets for the last hour (default)
+ # Karafka::Admin::Recovery.read_committed_offsets('sync')
+ # #=> { 'events' => { 0 => 1400, 1 => 1402, ... } }
+ #
+ # @example Read offsets for the last 6 hours
+ # Karafka::Admin::Recovery.read_committed_offsets(
+ # 'sync', last_committed_at: Time.now - 6 * 3600
+ # )
+ #
+ # @example Read offsets from a specific point in time
+ # Karafka::Admin::Recovery.read_committed_offsets('sync', last_committed_at: Time.new(2025, 3, 1))
+ #
+ # @example Migrate a stuck consumer group to a new name (two-step workflow)
+ # # Step 1: Read committed offsets from the broken group (bypasses coordinator)
+ # offsets = Karafka::Admin::Recovery.read_committed_offsets('sync')
+ # #=> { 'events' => { 0 => 1400, 1 => 1402 }, 'orders' => { 0 => 890 } }
+ #
+ # # Step 2: Inspect the recovered offsets — verify all expected topics and partitions
+ # # are present and the offset values look reasonable before committing them
+ #
+ # # Step 3: Write the offsets to the target group using standard Admin APIs
+ # Karafka::Admin::ConsumerGroups.seek('sync_v2', offsets)
+ #
+ # # Now reconfigure your consumers to use 'sync_v2' and restart them
+ def read_committed_offsets(
+ consumer_group_id,
+ last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET
+ )
+ committed = Hash.new { |h, k| h[k] = {} }
+ target_partition = offsets_partition_for(consumer_group_id)
+
+ iterator = Pro::Iterator.new(
+ { OFFSETS_TOPIC => { target_partition => last_committed_at } },
+ settings: @custom_kafka
+ )
+
+ iterator.each do |message|
+ next unless message.raw_key
+
+ parsed = parse_offset_commit(message)
+ next unless parsed
+ next unless parsed[:group] == consumer_group_id
+
+ if parsed[:offset].nil?
+ # Tombstone — offset was deleted, remove from results
+ committed[parsed[:topic]].delete(parsed[:partition])
+ committed.delete(parsed[:topic]) if committed[parsed[:topic]].empty?
+ else
+ # Last write wins — scanning forward means we naturally end up with the most
+ # recent commit per partition
+ committed[parsed[:topic]][parsed[:partition]] = parsed[:offset]
+ end
+ end
+
+ committed.sort.to_h.transform_values { |parts| parts.sort.to_h }
+ end
+
+ # Determines which __consumer_offsets partition holds data for a given consumer group. Kafka
+ # uses Utils.abs(String#hashCode) % numPartitions where hashCode is Java's 32-bit signed
+ # hash: s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1], computed with int32 overflow
+ # semantics. Utils.abs maps Integer.MIN_VALUE to 0.
+ #
+ # @param consumer_group_id [String] consumer group id
+ # @return [Integer] __consumer_offsets partition number
+ #
+ # @example Check which partition stores offsets for a group
+ # Karafka::Admin::Recovery.offsets_partition_for('my-group')
+ # #=> 17
+ def offsets_partition_for(consumer_group_id)
+ h = java_hash_code(consumer_group_id)
+ # Kafka's Utils.abs: Integer.MIN_VALUE maps to 0
+ h = (h == -2_147_483_648) ? 0 : h.abs
+ h % offsets_partition_count
+ end
+
+ # Returns which broker is the coordinator for a consumer group. The coordinator is the
+ # leader of the __consumer_offsets partition assigned to this group. Pure metadata lookup
+ # that does not scan any topic data.
+ #
+ # Use this to quickly identify which broker is responsible for a consumer group. During an
+ # incident, this tells you whether a specific group is affected by a broker outage. If the
+ # returned broker is the one that is down or in a FAILED state, the group is stuck and
+ # needs migration.
+ #
+ # @param consumer_group_id [String] consumer group to look up
+ # @return [Hash{Symbol => Object}] coordinator info with :partition, :broker_id,
+ # and :broker_host keys
+ #
+ # @example Find coordinator for a group
+ # Karafka::Admin::Recovery.coordinator_for('my-group')
+ # #=> { partition: 17, broker_id: 2, broker_host: "broker2:9092" }
+ #
+ # @example Check if a group is affected by a broker outage
+ # info = Karafka::Admin::Recovery.coordinator_for('my-group')
+ # if info[:broker_id] == failed_broker_id
+ # puts "Group 'my-group' is stuck on failed broker #{info[:broker_host]}"
+ # end
+ def coordinator_for(consumer_group_id)
+ target_partition = offsets_partition_for(consumer_group_id)
+ metadata = cluster_info
+
+ offsets_topic = metadata.topics.find { |t| t[:topic_name] == OFFSETS_TOPIC }
+
+ unless offsets_topic
+ raise(
+ Errors::MetadataError,
+ "Could not retrieve metadata for '#{OFFSETS_TOPIC}'"
+ )
+ end
+
+ partitions = offsets_topic[:partitions]
+ partition_info = partitions.find { |p| p[:partition_id] == target_partition }
+
+ unless partition_info
+ raise(
+ Errors::MetadataError,
+ "Could not find partition #{target_partition} in '#{OFFSETS_TOPIC}'"
+ )
+ end
+
+ leader_id = partition_info[:leader]
+
+ broker = metadata.brokers.find do |b|
+ if b.is_a?(Hash)
+ (b[:broker_id] || b[:node_id]) == leader_id
+ else
+ b.node_id == leader_id
+ end
+ end
+
+ unless broker
+ raise(
+ Errors::MetadataError,
+ "Could not find broker #{leader_id} in cluster metadata"
+ )
+ end
+
+ if broker.is_a?(Hash)
+ host = broker[:broker_name] || broker[:host]
+ port = broker[:broker_port] || broker[:port]
+ broker_host = "#{host}:#{port}"
+ broker_id = broker[:broker_id] || broker[:node_id]
+ else
+ broker_host = "#{broker.host}:#{broker.port}"
+ broker_id = broker.node_id
+ end
+
+ { partition: target_partition, broker_id: broker_id, broker_host: broker_host }
+ end
+
+ # Scans a __consumer_offsets partition and returns consumer group names that have active
+ # committed offsets. Groups where all offsets have been tombstoned (deleted) within the
+ # scan window are excluded.
+ #
+ # Use this to discover which consumer groups are affected when a coordinator broker fails.
+ # Combined with {#affected_partitions}, this gives the full blast radius of a broker
+ # outage: first find which __consumer_offsets partitions the failed broker leads, then
+ # scan each partition to discover all affected consumer groups.
+ #
+ # @param partition [Integer] __consumer_offsets partition to scan
+ # @param last_committed_at [Time] approximate time of last successful offset commit
+ # (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
+ # @return [Array<String>] sorted list of consumer group names with active offsets
+ #
+ # @example Find all groups on partition 17
+ # Karafka::Admin::Recovery.affected_groups(17)
+ # #=> ["group-a", "group-b", "group-c"]
+ #
+ # @example Full blast radius of a broker outage
+ # partitions = Karafka::Admin::Recovery.affected_partitions(failed_broker_id)
+ # all_affected = partitions.flat_map do |p|
+ # Karafka::Admin::Recovery.affected_groups(p)
+ # end.uniq
+ def affected_groups(partition, last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET)
+ count = offsets_partition_count
+
+ unless partition >= 0 && partition < count
+ raise(
+ Errors::PartitionOutOfRangeError,
+ "Partition #{partition} is out of range (0...#{count})"
+ )
+ end
+
+ # Track offsets per group with last-write-wins so fully tombstoned groups
+ # (all offsets deleted) are excluded from the result
+ committed = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = {} } }
+
+ iterator = Pro::Iterator.new(
+ { OFFSETS_TOPIC => { partition => last_committed_at } },
+ settings: @custom_kafka
+ )
+
+ iterator.each do |message|
+ next unless message.raw_key
+
+ parsed = parse_offset_commit(message)
+ next unless parsed
+
+ group = parsed[:group]
+
+ if parsed[:offset].nil?
+ committed[group][parsed[:topic]].delete(parsed[:partition])
+ committed[group].delete(parsed[:topic]) if committed[group][parsed[:topic]].empty?
+ else
+ committed[group][parsed[:topic]][parsed[:partition]] = parsed[:offset]
+ end
+ end
+
+ committed.select { |_, topics| !topics.empty? }.keys.sort
+ end
+
+ # Returns all __consumer_offsets partitions led by a given broker. Pure metadata lookup
+ # that does not scan any topic data.
+ #
+ # Use this as the first step in assessing the blast radius of a broker outage. The
+ # returned partition numbers can be passed to {#affected_groups} to discover all consumer
+ # groups that need recovery or migration.
+ #
+ # @param broker_id [Integer] broker node id
+ # @return [Array<Integer>] sorted list of __consumer_offsets partition numbers
+ #
+ # @example Find partitions led by broker 2
+ # Karafka::Admin::Recovery.affected_partitions(2)
+ # #=> [3, 17, 28, 42]
+ def affected_partitions(broker_id)
+ metadata = cluster_info
+
+ offsets_topic = metadata.topics.find { |t| t[:topic_name] == OFFSETS_TOPIC }
+
+ unless offsets_topic
+ raise(
+ Errors::MetadataError,
+ "Could not retrieve metadata for '#{OFFSETS_TOPIC}'"
+ )
+ end
+
+ offsets_topic[:partitions]
+ .select { |p| p[:leader] == broker_id }
+ .map { |p| p[:partition_id] }
+ .sort
+ end
+
+ private
+
+ # Parses a raw __consumer_offsets message into structured offset commit data.
+ # Handles both v0 and v1 offset commit key formats (both use the same layout for
+ # group/topic/partition). Tombstone records (nil payload) indicate offset deletion and
+ # are returned with offset: nil so callers can remove stale entries.
+ #
+ # @param message [Karafka::Messages::Message] raw message from __consumer_offsets
+ # @return [Hash, nil] parsed offset commit or nil if not an offset commit record.
+ # When the record is a tombstone (deletion), the :offset value will be nil.
+ def parse_offset_commit(message)
+ return nil unless message.raw_key
+
+ key = message.raw_key.b
+ key_version = key[0, 2].unpack1("n")
+
+ # Versions 0 and 1 are offset commit records with identical key layout
+ return nil unless key_version <= 1
+
+ pos = 2
+ gl = key[pos, 2].unpack1("n")
+ pos += 2
+ group = key[pos, gl].force_encoding("UTF-8")
+ pos += gl
+ tl = key[pos, 2].unpack1("n")
+ pos += 2
+ topic = key[pos, tl].force_encoding("UTF-8")
+ pos += tl
+ partition = key[pos, 4].unpack1("N")
+
+ # Tombstone (nil payload) means the offset was deleted
+ unless message.raw_payload
+ return { group: group, topic: topic, partition: partition, offset: nil }
+ end
+
+ val = message.raw_payload.b
+
+ # value layout: int16 version | int64 offset | ...
+ offset = val[2, 8].unpack1("q>")
+
+ { group: group, topic: topic, partition: partition, offset: offset }
+ end
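The key layout handled above (int16 version, int16 group length, group bytes, int16 topic length, topic bytes, int32 partition) can be exercised standalone. The sketch below re-implements only the key parsing; the `parse_offset_commit_key` name and the synthetic key bytes are fabricated here for illustration, not real `__consumer_offsets` records:

```ruby
# Parses a v0/v1 offset-commit key:
# int16 version | int16 group_len | group | int16 topic_len | topic | int32 partition
def parse_offset_commit_key(key)
  version = key[0, 2].unpack1("n")
  # Versions above 1 are group metadata records, not offset commits
  return nil unless version <= 1

  pos = 2
  group_len = key[pos, 2].unpack1("n")
  pos += 2
  group = key[pos, group_len].force_encoding("UTF-8")
  pos += group_len
  topic_len = key[pos, 2].unpack1("n")
  pos += 2
  topic = key[pos, topic_len].force_encoding("UTF-8")
  pos += topic_len
  partition = key[pos, 4].unpack1("N")

  { group: group, topic: topic, partition: partition }
end

# Build a synthetic v1 key for group "sync", topic "events", partition 3
key = [1].pack("n") + [4].pack("n") + "sync" + [6].pack("n") + "events" + [3].pack("N")
```

Feeding the synthetic key through the parser recovers the group, topic, and partition, while a version-2 key (group metadata) is rejected with `nil`.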
+
+ # Computes Java's String#hashCode for a given string. Java hashes UTF-16 code units
+ # (char values), not raw bytes. For ASCII-only strings this is identical to byte-level
+ # hashing, but non-ASCII characters (accented letters, CJK, emoji) require encoding to
+ # UTF-16 and hashing each 16-bit code unit (including surrogate pairs for characters
+ # above U+FFFF).
+ #
+ # @param str [String] input string
+ # @return [Integer] signed 32-bit hash value matching Java's String#hashCode
+ def java_hash_code(str)
+ hash = 0
+
+ # Encode to UTF-16BE to get Java's char sequence, then hash each 16-bit code unit
+ str.encode("UTF-16BE").bytes.each_slice(2) do |hi, lo|
+ code_unit = (hi << 8) | lo
+ hash = (hash * 31 + code_unit) & 0xFFFFFFFF
+ end
+
+ # Convert unsigned 32-bit to signed 32-bit (Java int semantics)
+ (hash >= 0x80000000) ? hash - 0x100000000 : hash
+ end
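The hash and partition mapping above can be verified standalone against Java's well-known `String#hashCode` values (for example, `"abc".hashCode()` is 96354). The sketch below mirrors the same arithmetic outside the class; the free-standing method names are assumptions for illustration:

```ruby
# Java's String#hashCode over UTF-16 code units, with int32 overflow semantics
def java_hash_code(str)
  hash = 0

  # UTF-16BE bytes pair up into the 16-bit code units Java iterates over
  str.encode("UTF-16BE").bytes.each_slice(2) do |hi, lo|
    hash = (hash * 31 + ((hi << 8) | lo)) & 0xFFFFFFFF
  end

  # Reinterpret the unsigned 32-bit result as a signed Java int
  (hash >= 0x80000000) ? hash - 0x100000000 : hash
end

# Kafka's group -> __consumer_offsets partition mapping: Utils.abs(hashCode) % count,
# where Utils.abs maps Integer.MIN_VALUE to 0
def offsets_partition_for(group_id, partition_count)
  h = java_hash_code(group_id)
  h = (h == -2_147_483_648) ? 0 : h.abs
  h % partition_count
end
```

With the default 50 partitions of `__consumer_offsets`, `offsets_partition_for("abc", 50)` is `96354 % 50`, i.e. partition 4.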
+
+ # Returns the partition count of the __consumer_offsets topic. Memoized per instance since
+ # this value never changes at runtime.
+ #
+ # @return [Integer] number of partitions
+ # @raise [Errors::MetadataError] when topic metadata cannot be retrieved
+ def offsets_partition_count
+ @offsets_partition_count ||= begin
+ topic_info = cluster_info.topics.find do |t|
+ t[:topic_name] == OFFSETS_TOPIC
+ end
+
+ unless topic_info
+ raise(
+ Errors::MetadataError,
+ "Could not retrieve partition count for '#{OFFSETS_TOPIC}'"
+ )
+ end
+
+ topic_info[:partition_count]
+ end
+ end
+ end
+ end
+ end
+ end
+
+ # We alias this for Pro users so we don't end up having two Admin namespaces from the end
+ # user perspective. This enhances the UX.
+ Karafka::Admin::Recovery = Karafka::Pro::Admin::Recovery
@@ -79,8 +79,8 @@ module Karafka
  # @param topic_name [String] name of the topic
  # @return [Integer] min.insync.replicas value
  def fetch_min_insync_replicas(topic_name)
- configs = Admin::Configs.describe(
- Admin::Configs::Resource.new(type: :topic, name: topic_name)
+ configs = Karafka::Admin::Configs.describe(
+ Karafka::Admin::Configs::Resource.new(type: :topic, name: topic_name)
  ).first.configs

  configs.find { |c| c.name == "min.insync.replicas" }.value.to_i
@@ -89,7 +89,7 @@ module Karafka
  # We cache it so we do not have to run consecutive requests to obtain data about multiple
  # topics
  def topics
- @topics ||= Admin.cluster_info.topics
+ @topics ||= ::Karafka::Admin.cluster_info.topics
  end

  # @param name [String] topic name
@@ -79,7 +79,7 @@ module Karafka
  # the partitions but once we found it, given partition data is no longer needed and would
  # only eat up resources.
  def each
- Admin.with_consumer(@settings) do |consumer|
+ ::Karafka::Admin.with_consumer(@settings) do |consumer|
  tpl = TplBuilder.new(consumer, @topics_with_partitions).call
  consumer.assign(tpl)

@@ -91,17 +91,27 @@ module Karafka
  @offsets_metadata[offset] = offset_metadata
  @current_offset_metadata = offset_metadata

- group = @groups.find { |reg_group| reg_group.include?(offset) }
+ group = nil
+ position = nil
+
+ @groups.each do |reg_group|
+ pos = reg_group.index(offset)
+
+ if pos
+ group = reg_group
+ position = pos
+ break
+ end
+ end

  # This case can happen when someone uses MoM and wants to mark message from a previous
  # batch as consumed. We can add it, since the real offset refresh will point to it
  unless group
  group = [offset]
+ position = 0
  @groups << group
  end

- position = group.index(offset)
-
  # Mark all previous messages from the same group also as virtually consumed
  group[0..position].each do |markable_offset|
  # Set previous messages metadata offset as the offset of higher one for overwrites
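The rewritten lookup above replaces `find { include? }` plus a later `index` call with a single pass that captures both the group and the position together. A minimal standalone sketch (a hypothetical `locate` helper, not the real virtual offset manager class) of that logic:

```ruby
# Hypothetical helper mirroring the single-pass lookup: scan each group once
# and return both the matching group and the offset's position within it
def locate(groups, offset)
  groups.each do |reg_group|
    pos = reg_group.index(offset)

    return [reg_group, pos] if pos
  end

  # MoM case: an offset from a previous batch gets registered as its own group
  group = [offset]
  groups << group

  [group, 0]
end

groups = [[0, 1, 2], [5, 6]]
group, position = locate(groups, 6)
```

With the previous `find` + `index` approach, the matching group was traversed twice; here each group is scanned at most once.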
@@ -135,7 +145,7 @@ module Karafka

  # @return [Array<Integer>] Offsets of messages already marked as consumed virtually
  def marked
- @marked.select { |_, status| status }.map(&:first).sort
+ @marked.select { |_, status| status }.map { |offset, _| offset }.sort
  end

  # Is there a real offset we can mark as consumed
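The `marked` change above swaps `map(&:first)` for an explicit block. Both extract the keys of the Hash that `select` returns; the block form just makes that intent obvious. A tiny sketch with made-up marker data:

```ruby
# Made-up marker state: offset => virtually-consumed flag
marked = { 3 => true, 1 => true, 2 => false }

# Hash#select returns a Hash, so mapping with |offset, _| extracts the keys
# of the entries still marked as consumed
offsets = marked.select { |_, status| status }.map { |offset, _| offset }.sort
```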
@@ -171,11 +181,11 @@ module Karafka
  private

  # Recomputes the biggest possible real offset we can have.
- # It picks the the biggest offset that has uninterrupted stream of virtually marked as
+ # It picks the biggest offset that has uninterrupted stream of virtually marked as
  # consumed because this will be the collective offset.
  def materialize_real_offset
- @marked.to_a.sort_by(&:first).each do |offset, marked|
- break unless marked
+ @marked.keys.sort.each do |offset|
+ break unless @marked[offset]

  @real_offset = offset
  end
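The rewritten loop above sorts the keys directly instead of materializing `to_a` pairs. A sketch of the materialization rule with hypothetical data: the real offset is the highest offset reachable through an uninterrupted run of virtually consumed offsets.

```ruby
# Hypothetical marker state: offset 3 is marked, but the gap at 2 stops
# materialization, so the collective real offset stays at 1
marked = { 0 => true, 1 => true, 3 => true, 2 => false }
real_offset = nil

marked.keys.sort.each do |offset|
  break unless marked[offset]

  real_offset = offset
end
```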
@@ -180,6 +180,16 @@ module Karafka
  end
  end

+ # Returns a snapshot of all jobs currently in processing per group.
+ # Useful for diagnostics during forceful shutdown to understand what is blocking.
+ #
+ # @return [Hash{String => Array<Jobs::Base>}] hash mapping group ids to arrays of jobs
+ def in_processing
+ @mutex.synchronize do
+ @in_processing.transform_values(&:dup).freeze
+ end
+ end
+
  private

  # @param group_id [String] id of the group in which jobs we're interested.
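A minimal stand-in (not the real jobs queue class) illustrating the snapshot semantics of the new `in_processing` reader: the per-group arrays are duplicated under the mutex, so later mutations by workers don't leak into the returned hash.

```ruby
# Simplified jobs queue sketch; only the pieces needed to show the snapshot
class JobsQueueSketch
  def initialize
    @mutex = Mutex.new
    @in_processing = Hash.new { |hash, key| hash[key] = [] }
  end

  # Registers a job as being processed for the given group
  def add(group_id, job)
    @mutex.synchronize { @in_processing[group_id] << job }
  end

  # Frozen snapshot: each array is duplicated so callers see a stable view
  def in_processing
    @mutex.synchronize { @in_processing.transform_values(&:dup).freeze }
  end
end
```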
@@ -126,10 +126,19 @@ module Karafka

  raise Errors::ForcefulShutdownError
  rescue Errors::ForcefulShutdownError => e
+ active_listeners = listeners.select(&:active?)
+ alive_workers = workers.select(&:alive?)
+
+ # Collect details about subscription groups that still have jobs in processing
+ in_processing = jobs_queue ? jobs_queue.in_processing : {}
+
  Karafka.monitor.instrument(
  "error.occurred",
  caller: self,
  error: e,
+ active_listeners: active_listeners,
+ alive_workers: alive_workers,
+ in_processing: in_processing,
  type: "app.stopping.error"
  )

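The diagnostics gathered above enrich the `error.occurred` payload during forceful shutdown. A sketch with hypothetical stand-ins for listeners, workers, and the jobs queue (the real objects expose `active?`/`alive?` the same way):

```ruby
# Hypothetical stand-ins for the real listener/worker objects
Listener = Struct.new(:active) do
  def active?
    active
  end
end

Worker = Struct.new(:alive) do
  def alive?
    alive
  end
end

listeners = [Listener.new(true), Listener.new(false)]
workers = [Worker.new(true)]
jobs_queue = nil # may be nil when shutdown happens before full boot

# Assembled diagnostics payload, mirroring the instrumented keys
payload = {
  active_listeners: listeners.select(&:active?),
  alive_workers: workers.select(&:alive?),
  in_processing: jobs_queue ? jobs_queue.in_processing : {}
}
```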
@@ -27,18 +27,6 @@ module Karafka
  # @return [Integer] pid of the node
  attr_reader :pid

- # When re-creating a producer in the fork, those are not attributes we want to inherit
- # from the parent process because they are updated in the fork. If user wants to take those
- # from the parent process, he should redefine them by overwriting the whole producer.
- SKIPPABLE_NEW_PRODUCER_ATTRIBUTES = %i[
- id
- kafka
- logger
- oauth
- ].freeze
-
- private_constant :SKIPPABLE_NEW_PRODUCER_ATTRIBUTES
-
  # @param id [Integer] number of the fork. Used for uniqueness setup for group client ids and
  # other stuff where we need to know a unique reference of the fork in regards to the rest
  # of them.
@@ -70,24 +58,7 @@ module Karafka
  config.producer.close

  old_producer = config.producer
- old_producer_config = old_producer.config
-
- # Supervisor producer is closed, hence we need a new one here
- config.producer = WaterDrop::Producer.new do |p_config|
- p_config.kafka = Setup::AttributesMap.producer(kafka.dup)
- p_config.logger = config.logger
-
- old_producer_config.to_h.each do |key, value|
- next if SKIPPABLE_NEW_PRODUCER_ATTRIBUTES.include?(key)
-
- p_config.public_send("#{key}=", value)
- end
-
- # Namespaced attributes need to be migrated directly on their config node
- old_producer_config.oauth.to_h.each do |key, value|
- p_config.oauth.public_send("#{key}=", value)
- end
- end
+ config.producer = ProducerReplacer.new.call(old_producer, kafka, config.logger)

  @pid = ::Process.pid
  @reader.close
@@ -0,0 +1,110 @@
+ # frozen_string_literal: true
+
+ module Karafka
+ module Swarm
+ # Builds a new WaterDrop producer that inherits configuration from an old one
+ #
+ # When a swarm node forks, the parent's producer must be replaced with a new one.
+ # This class encapsulates the logic for building that new producer, inheriting all relevant
+ # settings from the old one while generating fresh connection-level configuration.
+ class ProducerReplacer
+ # Attributes that should not be directly copied from the old producer config because they
+ # are either regenerated fresh (kafka, logger, id) or handled via their own namespaced
+ # migration (oauth, polling, polling.fd).
+ SKIPPABLE_ATTRIBUTES = %i[
+ id
+ kafka
+ logger
+ oauth
+ polling
+ fd
+ ].freeze
+
+ private_constant :SKIPPABLE_ATTRIBUTES
+
+ # Builds a new WaterDrop producer inheriting configuration from the old one
+ #
+ # @param old_producer [WaterDrop::Producer] the old producer to inherit settings from
+ # @param kafka [Hash] app-level kafka configuration
+ # @param logger [Object] logger instance for the new producer
+ # @return [WaterDrop::Producer] new producer with inherited configuration
+ def call(old_producer, kafka, logger)
+ old_producer_config = old_producer.config
+
+ WaterDrop::Producer.new do |p_config|
+ p_config.logger = logger
+
+ migrate_kafka(p_config, old_producer_config, kafka)
+ migrate_root(p_config, old_producer_config)
+ migrate_oauth(p_config, old_producer_config)
+ migrate_polling(p_config, old_producer_config)
+ migrate_polling_fd(p_config, old_producer_config)
+ end
+ end
+
+ private
+
+ # Migrates root-level producer attributes from the old producer, skipping those that are
+ # regenerated fresh or handled by their own namespaced migration
+ #
+ # @param p_config [WaterDrop::Config] new producer config being built
+ # @param old_producer_config [WaterDrop::Config] old producer config to inherit from
+ def migrate_root(p_config, old_producer_config)
+ old_producer_config.to_h.each do |key, value|
+ next if SKIPPABLE_ATTRIBUTES.include?(key)
+
+ p_config.public_send("#{key}=", value)
+ end
+ end
+
+ # Builds fresh kafka config from app-level settings and preserves any producer-specific
+ # kafka settings from the old producer (e.g., enable.idempotence) that aren't in the
+ # base app kafka config
+ #
+ # @param p_config [WaterDrop::Config] new producer config being built
+ # @param old_producer_config [WaterDrop::Config] old producer config to inherit from
+ # @param kafka [Hash] app-level kafka configuration
+ def migrate_kafka(p_config, old_producer_config, kafka)
+ p_config.kafka = Setup::AttributesMap.producer(kafka.dup)
+
+ old_producer_config.kafka.each do |key, value|
+ next if p_config.kafka.key?(key)
+
+ p_config.kafka[key] = value
+ end
+ end
+
+ # Migrates oauth configuration from the old producer
+ #
+ # @param p_config [WaterDrop::Config] new producer config being built
+ # @param old_producer_config [WaterDrop::Config] old producer config to inherit from
+ def migrate_oauth(p_config, old_producer_config)
+ old_producer_config.oauth.to_h.each do |key, value|
+ p_config.oauth.public_send("#{key}=", value)
+ end
+ end
+
+ # Migrates polling configuration from the old producer
+ #
+ # @param p_config [WaterDrop::Config] new producer config being built
+ # @param old_producer_config [WaterDrop::Config] old producer config to inherit from
+ def migrate_polling(p_config, old_producer_config)
+ old_producer_config.polling.to_h.each do |key, value|
+ next if SKIPPABLE_ATTRIBUTES.include?(key)
+
+ p_config.polling.public_send("#{key}=", value)
+ end
+ end
+
+ # Migrates polling fd configuration from the old producer
+ #
+ # @param p_config [WaterDrop::Config] new producer config being built
+ # @param old_producer_config [WaterDrop::Config] old producer config to inherit from
+ def migrate_polling_fd(p_config, old_producer_config)
+ old_producer_config.polling.fd.to_h.each do |key, value|
+ p_config.polling.fd.public_send("#{key}=", value)
+ end
+ end
+ end
+ end
+ end
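The migration helpers in the new file follow one rule: skip-listed root attributes are never copied, and the fresh app-level kafka section wins over keys inherited from the old producer. A hash-based sketch of that rule (plain hashes and hypothetical settings, not real WaterDrop config objects):

```ruby
# Attributes regenerated fresh or migrated on their own namespaced node
SKIPPABLE = %i[id kafka logger oauth polling fd].freeze

# Copy root attributes except the skip list
def migrate_root(new_config, old_config)
  old_config.each do |key, value|
    next if SKIPPABLE.include?(key)

    new_config[key] = value
  end

  new_config
end

# Fresh app-level kafka settings take precedence; old producer-specific
# extras (e.g. enable.idempotence) are only backfilled
def migrate_kafka(fresh_kafka, old_kafka)
  old_kafka.each do |key, value|
    fresh_kafka[key] = value unless fresh_kafka.key?(key)
  end

  fresh_kafka
end
```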
@@ -152,6 +152,9 @@ module Karafka
  caller: self,
  error: e,
  manager: manager,
+ active_listeners: [],
+ alive_workers: [],
+ in_processing: {},
  type: "app.stopping.error"
  )

@@ -3,5 +3,5 @@
  # Main module namespace
  module Karafka
  # Current Karafka version
- VERSION = "2.5.6"
+ VERSION = "2.5.8"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: karafka
  version: !ruby/object:Gem::Version
- version: 2.5.6
+ version: 2.5.8
  platform: ruby
  authors:
  - Maciej Mensfeld
@@ -206,6 +206,8 @@ files:
  - lib/karafka/pro/active_job/consumer.rb
  - lib/karafka/pro/active_job/dispatcher.rb
  - lib/karafka/pro/active_job/job_options_contract.rb
+ - lib/karafka/pro/admin/recovery.rb
+ - lib/karafka/pro/admin/recovery/errors.rb
  - lib/karafka/pro/base_consumer.rb
  - lib/karafka/pro/cleaner.rb
  - lib/karafka/pro/cleaner/errors.rb
@@ -542,6 +544,7 @@ files:
  - lib/karafka/swarm/liveness_listener.rb
  - lib/karafka/swarm/manager.rb
  - lib/karafka/swarm/node.rb
+ - lib/karafka/swarm/producer_replacer.rb
  - lib/karafka/swarm/supervisor.rb
  - lib/karafka/templates/application_consumer.rb.erb
  - lib/karafka/templates/example_consumer.rb.erb
@@ -576,7 +579,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 4.0.3
+ rubygems_version: 4.0.6
  specification_version: 4
  summary: Karafka is Ruby and Rails efficient Kafka processing framework.
  test_files: []