karafka 2.5.6 → 2.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/lib/karafka/errors.rb +3 -1
- data/lib/karafka/helpers/interval_runner.rb +4 -2
- data/lib/karafka/instrumentation/logger_listener.rb +22 -9
- data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +6 -1
- data/lib/karafka/messages/builders/batch_metadata.rb +4 -2
- data/lib/karafka/pro/admin/recovery/errors.rb +43 -0
- data/lib/karafka/pro/admin/recovery.rb +478 -0
- data/lib/karafka/pro/cli/topics/health.rb +2 -2
- data/lib/karafka/pro/iterator/expander.rb +1 -1
- data/lib/karafka/pro/iterator.rb +1 -1
- data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +17 -7
- data/lib/karafka/processing/jobs_queue.rb +10 -0
- data/lib/karafka/server.rb +9 -0
- data/lib/karafka/swarm/node.rb +1 -30
- data/lib/karafka/swarm/producer_replacer.rb +110 -0
- data/lib/karafka/swarm/supervisor.rb +3 -0
- data/lib/karafka/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9a0dd67b39af3fbad16ce4c96f061309d97e21ae95bcace2bab1e7879823a622
|
|
4
|
+
data.tar.gz: 0061a806b411a0526826327be18946fb39fe90c324a82e45d329ef9a45e3168a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7cae09aabcea7cf6eb692bc5747dc51327601eebd680314a62c064c626f79db17a7a6c3f0e49a6a95859eeb4d6e30a51b7a6cb3b7acd1c69400734981e926db6
|
|
7
|
+
data.tar.gz: 1c1c031bd16d1da91fa7fc3a2627e157db698b51883c91197c6cf5311d19abd8d9be75742db447d588e73c1a30dca196f991a4ec9ee49f31090fcb2863f45b47
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,21 @@
|
|
|
1
1
|
# Karafka Framework Changelog
|
|
2
2
|
|
|
3
|
+
## 2.5.8 (2026-03-23)
|
|
4
|
+
- **[Feature]** Add `Karafka::Admin::Recovery` for coordinator-bypass offset reading and consumer group migration when the Kafka group coordinator is in a FAILED state (Pro).
|
|
5
|
+
|
|
6
|
+
## 2.5.7 (2026-03-16)
|
|
7
|
+
- [Enhancement] Report detailed blocking information (active listeners, alive workers, and in-processing jobs) during forceful shutdown instead of only aggregate counts.
|
|
8
|
+
- [Enhancement] Improve `ForcefulShutdownError` description to clearly explain when and why it is raised.
|
|
9
|
+
- [Enhancement] Cache `messages.last` in `BatchMetadata` builder to avoid duplicate array traversal.
|
|
10
|
+
- [Enhancement] Optimize `VirtualOffsetManager#mark` to use a single array scan instead of separate `include?` and `index` calls (Pro).
|
|
11
|
+
- [Enhancement] Optimize `VirtualOffsetManager#materialize_real_offset` to use `keys.sort` instead of `to_a.sort_by` with tuple destructuring (Pro).
|
|
12
|
+
- [Enhancement] Optimize `IntervalRunner#call` to use a single `monotonic_now` call instead of two per invocation.
|
|
13
|
+
- [Enhancement] Support WaterDrop `:fd` mode in Swarm.
|
|
14
|
+
- [Maintenance] Use both `:fd` and `:thread` producer backends in CI.
|
|
15
|
+
- [Maintenance] Include spec file hash in integration test topic names for easier traceability in Kafka logs (#3056).
|
|
16
|
+
- [Fix] Remove duplicate topic creation in multi-broker health integration specs (#3056).
|
|
17
|
+
- [Fix] Preserve producer-specific kafka settings (e.g., `enable.idempotence`) when recreating the producer in swarm forks.
|
|
18
|
+
|
|
3
19
|
## 2.5.6 (2026-02-28)
|
|
4
20
|
- **[Feature]** Add `karafka topics health` command to check Kafka topics for replication and durability issues, detecting no redundancy (RF=1), zero fault tolerance (RF≤min.insync), and low durability (min.insync=1) configurations with color-coded severity grouping and actionable recommendations (Pro).
|
|
5
21
|
- [Enhancement] Optimize license loading process by reading license files directly from the gem directory instead of requiring the entire gem, reducing initialization overhead and adding support for user-defined License modules.
|
data/lib/karafka/errors.rb
CHANGED
|
@@ -51,7 +51,9 @@ module Karafka
|
|
|
51
51
|
end
|
|
52
52
|
end
|
|
53
53
|
|
|
54
|
-
# Raised when
|
|
54
|
+
# Raised when the graceful shutdown timeout has been exceeded and Karafka must forcefully
|
|
55
|
+
# terminate remaining listeners and workers. This typically happens when consumer processing
|
|
56
|
+
# or shutdown jobs take longer than the configured `shutdown_timeout`.
|
|
55
57
|
ForcefulShutdownError = Class.new(BaseError)
|
|
56
58
|
|
|
57
59
|
# Raised when the jobs queue receives a job that should not be received as it would cause
|
|
@@ -26,9 +26,11 @@ module Karafka
|
|
|
26
26
|
|
|
27
27
|
# Runs the requested code unless it was already executed within the last interval
|
|
28
28
|
def call
|
|
29
|
-
|
|
29
|
+
now = monotonic_now
|
|
30
30
|
|
|
31
|
-
@last_called_at
|
|
31
|
+
return if now - @last_called_at < @interval
|
|
32
|
+
|
|
33
|
+
@last_called_at = now
|
|
32
34
|
|
|
33
35
|
@block.call
|
|
34
36
|
end
|
|
@@ -385,21 +385,34 @@ module Karafka
|
|
|
385
385
|
fatal "Runner crashed due to an error: #{details}"
|
|
386
386
|
fatal backtrace
|
|
387
387
|
when "app.stopping.error"
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
# We need to check if listeners and workers are assigned as during super early stages of
|
|
392
|
-
# boot they are not.
|
|
393
|
-
listeners = Server.listeners ? Server.listeners.count(&:active?) : 0
|
|
394
|
-
workers = Server.workers ? Server.workers.count(&:alive?) : 0
|
|
388
|
+
active_listeners = event.payload[:active_listeners]
|
|
389
|
+
alive_workers = event.payload[:alive_workers]
|
|
390
|
+
in_processing = event.payload[:in_processing]
|
|
395
391
|
|
|
396
392
|
message = <<~MSG.tr("\n", " ").strip!
|
|
397
393
|
Forceful Karafka server stop with:
|
|
398
|
-
#{
|
|
399
|
-
#{
|
|
394
|
+
#{alive_workers.size} active workers and
|
|
395
|
+
#{active_listeners.size} active listeners
|
|
400
396
|
MSG
|
|
401
397
|
|
|
402
398
|
error message
|
|
399
|
+
|
|
400
|
+
active_listeners.each do |listener|
|
|
401
|
+
error "Listener #{listener.id} for #{listener.subscription_group.name} still active"
|
|
402
|
+
end
|
|
403
|
+
|
|
404
|
+
in_processing.each do |group_id, jobs|
|
|
405
|
+
next if jobs.empty?
|
|
406
|
+
|
|
407
|
+
jobs.each do |job|
|
|
408
|
+
job_class = job.class.name.split("::").last
|
|
409
|
+
topic_name = job.executor.topic.name
|
|
410
|
+
partition = job.executor.partition
|
|
411
|
+
|
|
412
|
+
error "In processing: #{job_class} job for #{topic_name}/#{partition} " \
|
|
413
|
+
"(group: #{group_id})"
|
|
414
|
+
end
|
|
415
|
+
end
|
|
403
416
|
when "app.forceful_stopping.error"
|
|
404
417
|
error "Forceful shutdown error occurred: #{details}"
|
|
405
418
|
error backtrace
|
|
@@ -117,7 +117,12 @@ module Karafka
|
|
|
117
117
|
when "runner.call.error"
|
|
118
118
|
fatal "Runner crashed due to an error: #{error}"
|
|
119
119
|
when "app.stopping.error"
|
|
120
|
-
|
|
120
|
+
active_listeners = event.payload[:active_listeners]
|
|
121
|
+
alive_workers = event.payload[:alive_workers]
|
|
122
|
+
|
|
123
|
+
error "Forceful Karafka server stop with: " \
|
|
124
|
+
"#{alive_workers.size} active workers and " \
|
|
125
|
+
"#{active_listeners.size} active listeners"
|
|
121
126
|
when "app.forceful_stopping.error"
|
|
122
127
|
error "Forceful shutdown error occurred: #{error}"
|
|
123
128
|
when "librdkafka.error"
|
|
@@ -17,16 +17,18 @@ module Karafka
|
|
|
17
17
|
# @note We do not set `processed_at` as this needs to be assigned when the batch is
|
|
18
18
|
# picked up for processing.
|
|
19
19
|
def call(messages, topic, partition, scheduled_at)
|
|
20
|
+
last_message = messages.last
|
|
21
|
+
|
|
20
22
|
Karafka::Messages::BatchMetadata.new(
|
|
21
23
|
size: messages.size,
|
|
22
24
|
first_offset: messages.first&.offset || -1001,
|
|
23
|
-
last_offset:
|
|
25
|
+
last_offset: last_message&.offset || -1001,
|
|
24
26
|
deserializers: topic.deserializers,
|
|
25
27
|
partition: partition,
|
|
26
28
|
topic: topic.name,
|
|
27
29
|
# We assume that the creation time of the whole batch is the same as the last message
|
|
28
30
|
# creation time
|
|
29
|
-
created_at: local_created_at(
|
|
31
|
+
created_at: local_created_at(last_message),
|
|
30
32
|
# When this batch was built and scheduled for execution
|
|
31
33
|
scheduled_at: scheduled_at,
|
|
32
34
|
# This needs to be set to a correct value prior to processing starting
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Karafka Pro - Source Available Commercial Software
|
|
4
|
+
# Copyright (c) 2017-present Maciej Mensfeld. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# This software is NOT open source. It is source-available commercial software
|
|
7
|
+
# requiring a paid license for use. It is NOT covered by LGPL.
|
|
8
|
+
#
|
|
9
|
+
# PROHIBITED:
|
|
10
|
+
# - Use without a valid commercial license
|
|
11
|
+
# - Redistribution, modification, or derivative works without authorization
|
|
12
|
+
# - Use as training data for AI/ML models or inclusion in datasets
|
|
13
|
+
# - Scraping, crawling, or automated collection for any purpose
|
|
14
|
+
#
|
|
15
|
+
# PERMITTED:
|
|
16
|
+
# - Reading, referencing, and linking for personal or commercial use
|
|
17
|
+
# - Runtime retrieval by AI assistants, coding agents, and RAG systems
|
|
18
|
+
# for the purpose of providing contextual help to Karafka users
|
|
19
|
+
#
|
|
20
|
+
# License: https://karafka.io/docs/Pro-License-Comm/
|
|
21
|
+
# Contact: contact@karafka.io
|
|
22
|
+
|
|
23
|
+
module Karafka
|
|
24
|
+
module Pro
|
|
25
|
+
# Pro Admin utilities
|
|
26
|
+
module Admin
|
|
27
|
+
class Recovery < Karafka::Admin
|
|
28
|
+
# Recovery related errors
|
|
29
|
+
module Errors
|
|
30
|
+
# Base for all the recovery errors
|
|
31
|
+
BaseError = Class.new(::Karafka::Errors::BaseError)
|
|
32
|
+
|
|
33
|
+
# Raised when required cluster metadata cannot be retrieved (topic, partition, or
|
|
34
|
+
# broker not found)
|
|
35
|
+
MetadataError = Class.new(BaseError)
|
|
36
|
+
|
|
37
|
+
# Raised when a partition number is outside the valid range for __consumer_offsets
|
|
38
|
+
PartitionOutOfRangeError = Class.new(BaseError)
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
@@ -0,0 +1,478 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Karafka Pro - Source Available Commercial Software
|
|
4
|
+
# Copyright (c) 2017-present Maciej Mensfeld. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# This software is NOT open source. It is source-available commercial software
|
|
7
|
+
# requiring a paid license for use. It is NOT covered by LGPL.
|
|
8
|
+
#
|
|
9
|
+
# PROHIBITED:
|
|
10
|
+
# - Use without a valid commercial license
|
|
11
|
+
# - Redistribution, modification, or derivative works without authorization
|
|
12
|
+
# - Use as training data for AI/ML models or inclusion in datasets
|
|
13
|
+
# - Scraping, crawling, or automated collection for any purpose
|
|
14
|
+
#
|
|
15
|
+
# PERMITTED:
|
|
16
|
+
# - Reading, referencing, and linking for personal or commercial use
|
|
17
|
+
# - Runtime retrieval by AI assistants, coding agents, and RAG systems
|
|
18
|
+
# for the purpose of providing contextual help to Karafka users
|
|
19
|
+
#
|
|
20
|
+
# License: https://karafka.io/docs/Pro-License-Comm/
|
|
21
|
+
# Contact: contact@karafka.io
|
|
22
|
+
|
|
23
|
+
module Karafka
|
|
24
|
+
module Pro
|
|
25
|
+
# Pro Admin utilities
|
|
26
|
+
module Admin
|
|
27
|
+
# Consumer group recovery toolkit.
|
|
28
|
+
#
|
|
29
|
+
# Provides coordinator-bypass offset reading and blast-radius assessment for scenarios
|
|
30
|
+
# where the Kafka group coordinator is in a FAILED state and normal admin APIs return
|
|
31
|
+
# NOT_COORDINATOR or time out.
|
|
32
|
+
#
|
|
33
|
+
# Works for any coordinator failure scenario:
|
|
34
|
+
# - KAFKA-19862 (compaction race during coordinator load)
|
|
35
|
+
# - Broker OOM / GC pause making coordinator unreachable
|
|
36
|
+
# - Network partition isolating the coordinator broker
|
|
37
|
+
# - Any future bug that transitions a coordinator shard to FAILED
|
|
38
|
+
#
|
|
39
|
+
# Each consumer group is assigned to a specific __consumer_offsets partition (and therefore
|
|
40
|
+
# a specific coordinator broker) based on its name. When that coordinator enters a FAILED
|
|
41
|
+
# state, all operations for the group - joins, heartbeats, offset commits, and offset
|
|
42
|
+
# fetches - are stuck until the coordinator recovers.
|
|
43
|
+
#
|
|
44
|
+
# A common recovery strategy is migrating to a new consumer group with a different name,
|
|
45
|
+
# which causes Kafka to hash it to a (likely) different __consumer_offsets partition served
|
|
46
|
+
# by a healthy coordinator. This class provides the tools to:
|
|
47
|
+
# 1. Read committed offsets directly from the raw __consumer_offsets log (bypassing the
|
|
48
|
+
# broken coordinator) via {#read_committed_offsets}
|
|
49
|
+
# 2. Assess blast radius: which broker coordinates a group ({#coordinator_for}), which
|
|
50
|
+
# partitions a broker leads ({#affected_partitions}), and which groups are affected
|
|
51
|
+
# ({#affected_groups})
|
|
52
|
+
#
|
|
53
|
+
# To complete the migration, use {Karafka::Admin::ConsumerGroups.seek} to write the
|
|
54
|
+
# recovered offsets to the new group.
|
|
55
|
+
#
|
|
56
|
+
# All reads go through the fetch API and never touch the group coordinator.
|
|
57
|
+
#
|
|
58
|
+
# @note These methods should NOT be used unless you are experiencing issues that require
|
|
59
|
+
# manual intervention. Misuse can lead to data loss or other problems.
|
|
60
|
+
class Recovery < Karafka::Admin
|
|
61
|
+
# Internal topic where Kafka stores committed offsets and group metadata
|
|
62
|
+
OFFSETS_TOPIC = "__consumer_offsets"
|
|
63
|
+
|
|
64
|
+
# Default lookback window for offset scanning, in seconds (1 hour). Covers any normal commit interval.
|
|
65
|
+
# Provide an earlier Time if your group commits infrequently or the incident has been
|
|
66
|
+
# ongoing for longer than 1 hour.
|
|
67
|
+
DEFAULT_LAST_COMMITTED_AT_OFFSET = 3_600
|
|
68
|
+
|
|
69
|
+
private_constant :OFFSETS_TOPIC, :DEFAULT_LAST_COMMITTED_AT_OFFSET
|
|
70
|
+
|
|
71
|
+
class << self
|
|
72
|
+
# @param consumer_group_id [String] consumer group to read offsets for
|
|
73
|
+
# @param last_committed_at [Time] approximate time of last successful offset commit
|
|
74
|
+
# (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
|
|
75
|
+
# @return [Hash{String => Hash{Integer => Integer}}]
|
|
76
|
+
# @see #read_committed_offsets
|
|
77
|
+
def read_committed_offsets(
|
|
78
|
+
consumer_group_id,
|
|
79
|
+
last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET
|
|
80
|
+
)
|
|
81
|
+
new.read_committed_offsets(consumer_group_id, last_committed_at: last_committed_at)
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# @param consumer_group_id [String] consumer group id
|
|
85
|
+
# @return [Integer] __consumer_offsets partition number
|
|
86
|
+
# @see #offsets_partition_for
|
|
87
|
+
def offsets_partition_for(consumer_group_id)
|
|
88
|
+
new.offsets_partition_for(consumer_group_id)
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# @param consumer_group_id [String] consumer group to look up
|
|
92
|
+
# @return [Hash] coordinator broker info
|
|
93
|
+
# @see #coordinator_for
|
|
94
|
+
def coordinator_for(consumer_group_id)
|
|
95
|
+
new.coordinator_for(consumer_group_id)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
# @param partition [Integer] __consumer_offsets partition to scan
|
|
99
|
+
# @param last_committed_at [Time] approximate time of last successful offset commit
|
|
100
|
+
# (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
|
|
101
|
+
# @return [Array<String>] sorted consumer group names
|
|
102
|
+
# @see #affected_groups
|
|
103
|
+
def affected_groups(partition, last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET)
|
|
104
|
+
new.affected_groups(partition, last_committed_at: last_committed_at)
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
# @param broker_id [Integer] broker node id
|
|
108
|
+
# @return [Array<Integer>] sorted partition numbers
|
|
109
|
+
# @see #affected_partitions
|
|
110
|
+
def affected_partitions(broker_id)
|
|
111
|
+
new.affected_partitions(broker_id)
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
# Reads committed offsets for a consumer group directly from the __consumer_offsets internal
|
|
116
|
+
# topic, bypassing the group coordinator. Only scans the single __consumer_offsets partition
|
|
117
|
+
# that holds data for the given group (determined by Java's String#hashCode mod partition
|
|
118
|
+
# count), starting from last_committed_at and reading forward to EOF. Later records
|
|
119
|
+
# overwrite earlier ones so the result always reflects the most recent committed offset per
|
|
120
|
+
# partition.
|
|
121
|
+
#
|
|
122
|
+
# @note All consumers in this group should be fully stopped before calling this method.
|
|
123
|
+
# While normally they would already be stopped due to a coordinator failure, if the
|
|
124
|
+
# cluster recovers concurrently, active consumers may commit newer offsets that this scan
|
|
125
|
+
# will not capture, resulting in stale data.
|
|
126
|
+
#
|
|
127
|
+
# @note This method may take a noticeable amount of time to complete because it scans
|
|
128
|
+
# the raw __consumer_offsets log from last_committed_at forward to the end. The duration
|
|
129
|
+
# depends on the volume of offset commits in the scan window across all consumer groups
|
|
130
|
+
# that hash to the same __consumer_offsets partition.
|
|
131
|
+
#
|
|
132
|
+
# @note The result only contains topic-partitions that had offsets committed after
|
|
133
|
+
# last_committed_at. If a partition never had an offset committed, or if the commit
|
|
134
|
+
# happened before last_committed_at, it will be absent from the result. It is the
|
|
135
|
+
# caller's responsibility to verify that all expected topic-partitions are present before
|
|
136
|
+
# using the result for migration or other operations.
|
|
137
|
+
#
|
|
138
|
+
# @param consumer_group_id [String] consumer group to read offsets for
|
|
139
|
+
# @param last_committed_at [Time] approximate time of last successful offset commit
|
|
140
|
+
# (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
|
|
141
|
+
# @return [Hash{String => Hash{Integer => Integer}}]
|
|
142
|
+
# { topic => { partition => committed_offset } }
|
|
143
|
+
#
|
|
144
|
+
# @example Read offsets for the last hour (default)
|
|
145
|
+
# Karafka::Admin::Recovery.read_committed_offsets('sync')
|
|
146
|
+
# #=> { 'events' => { 0 => 1400, 1 => 1402, ... } }
|
|
147
|
+
#
|
|
148
|
+
# @example Read offsets for the last 6 hours
|
|
149
|
+
# Karafka::Admin::Recovery.read_committed_offsets(
|
|
150
|
+
# 'sync', last_committed_at: Time.now - 6 * 3600
|
|
151
|
+
# )
|
|
152
|
+
#
|
|
153
|
+
# @example Read offsets from a specific point in time
|
|
154
|
+
# Karafka::Admin::Recovery.read_committed_offsets('sync', last_committed_at: Time.new(2025, 3, 1))
|
|
155
|
+
#
|
|
156
|
+
# @example Migrate a stuck consumer group to a new name (three-step workflow)
|
|
157
|
+
# # Step 1: Read committed offsets from the broken group (bypasses coordinator)
|
|
158
|
+
# offsets = Karafka::Admin::Recovery.read_committed_offsets('sync')
|
|
159
|
+
# #=> { 'events' => { 0 => 1400, 1 => 1402 }, 'orders' => { 0 => 890 } }
|
|
160
|
+
#
|
|
161
|
+
# # Step 2: Inspect the recovered offsets — verify all expected topics and partitions
|
|
162
|
+
# # are present and the offset values look reasonable before committing them
|
|
163
|
+
#
|
|
164
|
+
# # Step 3: Write the offsets to the target group using standard Admin APIs
|
|
165
|
+
# Karafka::Admin::ConsumerGroups.seek('sync_v2', offsets)
|
|
166
|
+
#
|
|
167
|
+
# # Now reconfigure your consumers to use 'sync_v2' and restart them
|
|
168
|
+
def read_committed_offsets(
|
|
169
|
+
consumer_group_id,
|
|
170
|
+
last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET
|
|
171
|
+
)
|
|
172
|
+
committed = Hash.new { |h, k| h[k] = {} }
|
|
173
|
+
target_partition = offsets_partition_for(consumer_group_id)
|
|
174
|
+
|
|
175
|
+
iterator = Pro::Iterator.new(
|
|
176
|
+
{ OFFSETS_TOPIC => { target_partition => last_committed_at } },
|
|
177
|
+
settings: @custom_kafka
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
iterator.each do |message|
|
|
181
|
+
next unless message.raw_key
|
|
182
|
+
|
|
183
|
+
parsed = parse_offset_commit(message)
|
|
184
|
+
next unless parsed
|
|
185
|
+
next unless parsed[:group] == consumer_group_id
|
|
186
|
+
|
|
187
|
+
if parsed[:offset].nil?
|
|
188
|
+
# Tombstone — offset was deleted, remove from results
|
|
189
|
+
committed[parsed[:topic]].delete(parsed[:partition])
|
|
190
|
+
committed.delete(parsed[:topic]) if committed[parsed[:topic]].empty?
|
|
191
|
+
else
|
|
192
|
+
# Last write wins — scanning forward means we naturally end up with the most
|
|
193
|
+
# recent commit per partition
|
|
194
|
+
committed[parsed[:topic]][parsed[:partition]] = parsed[:offset]
|
|
195
|
+
end
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
committed.sort.to_h.transform_values { |parts| parts.sort.to_h }
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
# Determines which __consumer_offsets partition holds data for a given consumer group. Kafka
|
|
202
|
+
# uses Utils.abs(String#hashCode) % numPartitions where hashCode is Java's 32-bit signed
|
|
203
|
+
# hash: s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1], computed with int32 overflow
|
|
204
|
+
# semantics. Utils.abs maps Integer.MIN_VALUE to 0.
|
|
205
|
+
#
|
|
206
|
+
# @param consumer_group_id [String] consumer group id
|
|
207
|
+
# @return [Integer] __consumer_offsets partition number
|
|
208
|
+
#
|
|
209
|
+
# @example Check which partition stores offsets for a group
|
|
210
|
+
# Karafka::Admin::Recovery.offsets_partition_for('my-group')
|
|
211
|
+
# #=> 17
|
|
212
|
+
def offsets_partition_for(consumer_group_id)
|
|
213
|
+
h = java_hash_code(consumer_group_id)
|
|
214
|
+
# Kafka's Utils.abs: Integer.MIN_VALUE maps to 0
|
|
215
|
+
h = (h == -2_147_483_648) ? 0 : h.abs
|
|
216
|
+
h % offsets_partition_count
|
|
217
|
+
end
|
|
218
|
+
|
|
219
|
+
# Returns which broker is the coordinator for a consumer group. The coordinator is the
|
|
220
|
+
# leader of the __consumer_offsets partition assigned to this group. Pure metadata lookup
|
|
221
|
+
# that does not scan any topic data.
|
|
222
|
+
#
|
|
223
|
+
# Use this to quickly identify which broker is responsible for a consumer group. During an
|
|
224
|
+
# incident, this tells you whether a specific group is affected by a broker outage. If the
|
|
225
|
+
# returned broker is the one that is down or in a FAILED state, the group is stuck and
|
|
226
|
+
# needs migration.
|
|
227
|
+
#
|
|
228
|
+
# @param consumer_group_id [String] consumer group to look up
|
|
229
|
+
# @return [Hash{Symbol => Object}] coordinator info with :partition, :broker_id,
|
|
230
|
+
# and :broker_host keys
|
|
231
|
+
#
|
|
232
|
+
# @example Find coordinator for a group
|
|
233
|
+
# Karafka::Admin::Recovery.coordinator_for('my-group')
|
|
234
|
+
# #=> { partition: 17, broker_id: 2, broker_host: "broker2:9092" }
|
|
235
|
+
#
|
|
236
|
+
# @example Check if a group is affected by a broker outage
|
|
237
|
+
# info = Karafka::Admin::Recovery.coordinator_for('my-group')
|
|
238
|
+
# if info[:broker_id] == failed_broker_id
|
|
239
|
+
# puts "Group 'my-group' is stuck on failed broker #{info[:broker_host]}"
|
|
240
|
+
# end
|
|
241
|
+
def coordinator_for(consumer_group_id)
|
|
242
|
+
target_partition = offsets_partition_for(consumer_group_id)
|
|
243
|
+
metadata = cluster_info
|
|
244
|
+
|
|
245
|
+
offsets_topic = metadata.topics.find { |t| t[:topic_name] == OFFSETS_TOPIC }
|
|
246
|
+
|
|
247
|
+
unless offsets_topic
|
|
248
|
+
raise(
|
|
249
|
+
Errors::MetadataError,
|
|
250
|
+
"Could not retrieve metadata for '#{OFFSETS_TOPIC}'"
|
|
251
|
+
)
|
|
252
|
+
end
|
|
253
|
+
|
|
254
|
+
partitions = offsets_topic[:partitions]
|
|
255
|
+
partition_info = partitions.find { |p| p[:partition_id] == target_partition }
|
|
256
|
+
|
|
257
|
+
unless partition_info
|
|
258
|
+
raise(
|
|
259
|
+
Errors::MetadataError,
|
|
260
|
+
"Could not find partition #{target_partition} in '#{OFFSETS_TOPIC}'"
|
|
261
|
+
)
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
leader_id = partition_info[:leader]
|
|
265
|
+
|
|
266
|
+
broker = metadata.brokers.find do |b|
|
|
267
|
+
if b.is_a?(Hash)
|
|
268
|
+
(b[:broker_id] || b[:node_id]) == leader_id
|
|
269
|
+
else
|
|
270
|
+
b.node_id == leader_id
|
|
271
|
+
end
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
unless broker
|
|
275
|
+
raise(
|
|
276
|
+
Errors::MetadataError,
|
|
277
|
+
"Could not find broker #{leader_id} in cluster metadata"
|
|
278
|
+
)
|
|
279
|
+
end
|
|
280
|
+
|
|
281
|
+
if broker.is_a?(Hash)
|
|
282
|
+
host = broker[:broker_name] || broker[:host]
|
|
283
|
+
port = broker[:broker_port] || broker[:port]
|
|
284
|
+
broker_host = "#{host}:#{port}"
|
|
285
|
+
broker_id = broker[:broker_id] || broker[:node_id]
|
|
286
|
+
else
|
|
287
|
+
broker_host = "#{broker.host}:#{broker.port}"
|
|
288
|
+
broker_id = broker.node_id
|
|
289
|
+
end
|
|
290
|
+
|
|
291
|
+
{ partition: target_partition, broker_id: broker_id, broker_host: broker_host }
|
|
292
|
+
end
|
|
293
|
+
|
|
294
|
+
# Scans a __consumer_offsets partition and returns consumer group names that have active
|
|
295
|
+
# committed offsets. Groups where all offsets have been tombstoned (deleted) within the
|
|
296
|
+
# scan window are excluded.
|
|
297
|
+
#
|
|
298
|
+
# Use this to discover which consumer groups are affected when a coordinator broker fails.
|
|
299
|
+
# Combined with {#affected_partitions}, this gives the full blast radius of a broker
|
|
300
|
+
# outage: first find which __consumer_offsets partitions the failed broker leads, then
|
|
301
|
+
# scan each partition to discover all affected consumer groups.
|
|
302
|
+
#
|
|
303
|
+
# @param partition [Integer] __consumer_offsets partition to scan
|
|
304
|
+
# @param last_committed_at [Time] approximate time of last successful offset commit
|
|
305
|
+
# (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
|
|
306
|
+
# @return [Array<String>] sorted list of consumer group names with active offsets
|
|
307
|
+
#
|
|
308
|
+
# @example Find all groups on partition 17
|
|
309
|
+
# Karafka::Admin::Recovery.affected_groups(17)
|
|
310
|
+
# #=> ["group-a", "group-b", "group-c"]
|
|
311
|
+
#
|
|
312
|
+
# @example Full blast radius of a broker outage
|
|
313
|
+
# partitions = Karafka::Admin::Recovery.affected_partitions(failed_broker_id)
|
|
314
|
+
# all_affected = partitions.flat_map do |p|
|
|
315
|
+
# Karafka::Admin::Recovery.affected_groups(p)
|
|
316
|
+
# end.uniq
|
|
317
|
+
def affected_groups(partition, last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET)
|
|
318
|
+
count = offsets_partition_count
|
|
319
|
+
|
|
320
|
+
unless partition >= 0 && partition < count
|
|
321
|
+
raise(
|
|
322
|
+
Errors::PartitionOutOfRangeError,
|
|
323
|
+
"Partition #{partition} is out of range (0...#{count})"
|
|
324
|
+
)
|
|
325
|
+
end
|
|
326
|
+
|
|
327
|
+
# Track offsets per group with last-write-wins so fully tombstoned groups
|
|
328
|
+
# (all offsets deleted) are excluded from the result
|
|
329
|
+
committed = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = {} } }
|
|
330
|
+
|
|
331
|
+
iterator = Pro::Iterator.new(
|
|
332
|
+
{ OFFSETS_TOPIC => { partition => last_committed_at } },
|
|
333
|
+
settings: @custom_kafka
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
iterator.each do |message|
|
|
337
|
+
next unless message.raw_key
|
|
338
|
+
|
|
339
|
+
parsed = parse_offset_commit(message)
|
|
340
|
+
next unless parsed
|
|
341
|
+
|
|
342
|
+
group = parsed[:group]
|
|
343
|
+
|
|
344
|
+
if parsed[:offset].nil?
|
|
345
|
+
committed[group][parsed[:topic]].delete(parsed[:partition])
|
|
346
|
+
committed[group].delete(parsed[:topic]) if committed[group][parsed[:topic]].empty?
|
|
347
|
+
else
|
|
348
|
+
committed[group][parsed[:topic]][parsed[:partition]] = parsed[:offset]
|
|
349
|
+
end
|
|
350
|
+
end
|
|
351
|
+
|
|
352
|
+
committed.select { |_, topics| !topics.empty? }.keys.sort
|
|
353
|
+
end
|
|
354
|
+
|
|
355
|
+
# Returns all __consumer_offsets partitions led by a given broker. Pure metadata lookup
|
|
356
|
+
# that does not scan any topic data.
|
|
357
|
+
#
|
|
358
|
+
# Use this as the first step in assessing the blast radius of a broker outage. The
|
|
359
|
+
# returned partition numbers can be passed to {#affected_groups} to discover all consumer
|
|
360
|
+
# groups that need recovery or migration.
|
|
361
|
+
#
|
|
362
|
+
# @param broker_id [Integer] broker node id
|
|
363
|
+
# @return [Array<Integer>] sorted list of __consumer_offsets partition numbers
|
|
364
|
+
#
|
|
365
|
+
# @example Find partitions led by broker 2
|
|
366
|
+
# Karafka::Admin::Recovery.affected_partitions(2)
|
|
367
|
+
# #=> [3, 17, 28, 42]
|
|
368
|
+
def affected_partitions(broker_id)
|
|
369
|
+
metadata = cluster_info
|
|
370
|
+
|
|
371
|
+
offsets_topic = metadata.topics.find { |t| t[:topic_name] == OFFSETS_TOPIC }
|
|
372
|
+
|
|
373
|
+
unless offsets_topic
|
|
374
|
+
raise(
|
|
375
|
+
Errors::MetadataError,
|
|
376
|
+
"Could not retrieve metadata for '#{OFFSETS_TOPIC}'"
|
|
377
|
+
)
|
|
378
|
+
end
|
|
379
|
+
|
|
380
|
+
offsets_topic[:partitions]
|
|
381
|
+
.select { |p| p[:leader] == broker_id }
|
|
382
|
+
.map { |p| p[:partition_id] }
|
|
383
|
+
.sort
|
|
384
|
+
end
|
|
385
|
+
|
|
386
|
+
private
|
|
387
|
+
|
|
388
|
+
# Parses a raw __consumer_offsets message into structured offset commit data.
|
|
389
|
+
# Handles both v0 and v1 offset commit key formats (both use the same layout for
|
|
390
|
+
# group/topic/partition). Tombstone records (nil payload) indicate offset deletion and
|
|
391
|
+
# are returned with offset: nil so callers can remove stale entries.
|
|
392
|
+
#
|
|
393
|
+
# @param message [Karafka::Messages::Message] raw message from __consumer_offsets
|
|
394
|
+
# @return [Hash, nil] parsed offset commit or nil if not an offset commit record.
|
|
395
|
+
# When the record is a tombstone (deletion), the :offset value will be nil.
|
|
396
|
+
def parse_offset_commit(message)
|
|
397
|
+
return nil unless message.raw_key
|
|
398
|
+
|
|
399
|
+
key = message.raw_key.b
|
|
400
|
+
key_version = key[0, 2].unpack1("n")
|
|
401
|
+
|
|
402
|
+
# Versions 0 and 1 are offset commit records with identical key layout
|
|
403
|
+
return nil unless key_version <= 1
|
|
404
|
+
|
|
405
|
+
pos = 2
|
|
406
|
+
gl = key[pos, 2].unpack1("n")
|
|
407
|
+
pos += 2
|
|
408
|
+
group = key[pos, gl].force_encoding("UTF-8")
|
|
409
|
+
pos += gl
|
|
410
|
+
tl = key[pos, 2].unpack1("n")
|
|
411
|
+
pos += 2
|
|
412
|
+
topic = key[pos, tl].force_encoding("UTF-8")
|
|
413
|
+
pos += tl
|
|
414
|
+
partition = key[pos, 4].unpack1("N")
|
|
415
|
+
|
|
416
|
+
# Tombstone (nil payload) means the offset was deleted
|
|
417
|
+
unless message.raw_payload
|
|
418
|
+
return { group: group, topic: topic, partition: partition, offset: nil }
|
|
419
|
+
end
|
|
420
|
+
|
|
421
|
+
val = message.raw_payload.b
|
|
422
|
+
|
|
423
|
+
# value layout: int16 version | int64 offset | ...
|
|
424
|
+
offset = val[2, 8].unpack1("q>")
|
|
425
|
+
|
|
426
|
+
{ group: group, topic: topic, partition: partition, offset: offset }
|
|
427
|
+
end
|
|
428
|
+
|
|
429
|
+
# Computes Java's String#hashCode for a given string. Java hashes UTF-16 code units
|
|
430
|
+
# (char values), not raw bytes. For ASCII-only strings this is identical to byte-level
|
|
431
|
+
# hashing, but non-ASCII characters (accented letters, CJK, emoji) require encoding to
|
|
432
|
+
# UTF-16 and hashing each 16-bit code unit (including surrogate pairs for characters
|
|
433
|
+
# above U+FFFF).
|
|
434
|
+
#
|
|
435
|
+
# @param str [String] input string
|
|
436
|
+
# @return [Integer] signed 32-bit hash value matching Java's String#hashCode
|
|
437
|
+
def java_hash_code(str)
|
|
438
|
+
hash = 0
|
|
439
|
+
|
|
440
|
+
# Encode to UTF-16BE to get Java's char sequence, then hash each 16-bit code unit
|
|
441
|
+
str.encode("UTF-16BE").bytes.each_slice(2) do |hi, lo|
|
|
442
|
+
code_unit = (hi << 8) | lo
|
|
443
|
+
hash = (hash * 31 + code_unit) & 0xFFFFFFFF
|
|
444
|
+
end
|
|
445
|
+
|
|
446
|
+
# Convert unsigned 32-bit to signed 32-bit (Java int semantics)
|
|
447
|
+
(hash >= 0x80000000) ? hash - 0x100000000 : hash
|
|
448
|
+
end
|
|
449
|
+
|
|
450
|
+
# Returns the partition count of the __consumer_offsets topic. Memoized per instance since
|
|
451
|
+
# this value never changes at runtime.
|
|
452
|
+
#
|
|
453
|
+
# @return [Integer] number of partitions
|
|
454
|
+
# @raise [Errors::MetadataError] when topic metadata cannot be retrieved
|
|
455
|
+
def offsets_partition_count
|
|
456
|
+
@offsets_partition_count ||= begin
|
|
457
|
+
topic_info = cluster_info.topics.find do |t|
|
|
458
|
+
t[:topic_name] == OFFSETS_TOPIC
|
|
459
|
+
end
|
|
460
|
+
|
|
461
|
+
unless topic_info
|
|
462
|
+
raise(
|
|
463
|
+
Errors::MetadataError,
|
|
464
|
+
"Could not retrieve partition count for '#{OFFSETS_TOPIC}'"
|
|
465
|
+
)
|
|
466
|
+
end
|
|
467
|
+
|
|
468
|
+
topic_info[:partition_count]
|
|
469
|
+
end
|
|
470
|
+
end
|
|
471
|
+
end
|
|
472
|
+
end
|
|
473
|
+
end
|
|
474
|
+
end
|
|
475
|
+
|
|
476
|
+
# We alias this for Pro users so we don't end up having two Admin namespaces from the end
|
|
477
|
+
# user perspective. This enhances the UX.
|
|
478
|
+
Karafka::Admin::Recovery = Karafka::Pro::Admin::Recovery
|
|
@@ -79,8 +79,8 @@ module Karafka
|
|
|
79
79
|
# @param topic_name [String] name of the topic
|
|
80
80
|
# @return [Integer] min.insync.replicas value
|
|
81
81
|
def fetch_min_insync_replicas(topic_name)
|
|
82
|
-
configs = Admin::Configs.describe(
|
|
83
|
-
Admin::Configs::Resource.new(type: :topic, name: topic_name)
|
|
82
|
+
configs = Karafka::Admin::Configs.describe(
|
|
83
|
+
Karafka::Admin::Configs::Resource.new(type: :topic, name: topic_name)
|
|
84
84
|
).first.configs
|
|
85
85
|
|
|
86
86
|
configs.find { |c| c.name == "min.insync.replicas" }.value.to_i
|
|
@@ -89,7 +89,7 @@ module Karafka
|
|
|
89
89
|
# We cache it so we do not have to run consecutive requests to obtain data about multiple
|
|
90
90
|
# topics
|
|
91
91
|
def topics
|
|
92
|
-
@topics ||= Admin.cluster_info.topics
|
|
92
|
+
@topics ||= ::Karafka::Admin.cluster_info.topics
|
|
93
93
|
end
|
|
94
94
|
|
|
95
95
|
# @param name [String] topic name
|
data/lib/karafka/pro/iterator.rb
CHANGED
|
@@ -79,7 +79,7 @@ module Karafka
|
|
|
79
79
|
# the partitions but once we found it, given partition data is no longer needed and would
|
|
80
80
|
# only eat up resources.
|
|
81
81
|
def each
|
|
82
|
-
Admin.with_consumer(@settings) do |consumer|
|
|
82
|
+
::Karafka::Admin.with_consumer(@settings) do |consumer|
|
|
83
83
|
tpl = TplBuilder.new(consumer, @topics_with_partitions).call
|
|
84
84
|
consumer.assign(tpl)
|
|
85
85
|
|
|
@@ -91,17 +91,27 @@ module Karafka
|
|
|
91
91
|
@offsets_metadata[offset] = offset_metadata
|
|
92
92
|
@current_offset_metadata = offset_metadata
|
|
93
93
|
|
|
94
|
-
group =
|
|
94
|
+
group = nil
|
|
95
|
+
position = nil
|
|
96
|
+
|
|
97
|
+
@groups.each do |reg_group|
|
|
98
|
+
pos = reg_group.index(offset)
|
|
99
|
+
|
|
100
|
+
if pos
|
|
101
|
+
group = reg_group
|
|
102
|
+
position = pos
|
|
103
|
+
break
|
|
104
|
+
end
|
|
105
|
+
end
|
|
95
106
|
|
|
96
107
|
# This case can happen when someone uses MoM and wants to mark message from a previous
|
|
97
108
|
# batch as consumed. We can add it, since the real offset refresh will point to it
|
|
98
109
|
unless group
|
|
99
110
|
group = [offset]
|
|
111
|
+
position = 0
|
|
100
112
|
@groups << group
|
|
101
113
|
end
|
|
102
114
|
|
|
103
|
-
position = group.index(offset)
|
|
104
|
-
|
|
105
115
|
# Mark all previous messages from the same group also as virtually consumed
|
|
106
116
|
group[0..position].each do |markable_offset|
|
|
107
117
|
# Set previous messages metadata offset as the offset of higher one for overwrites
|
|
@@ -135,7 +145,7 @@ module Karafka
|
|
|
135
145
|
|
|
136
146
|
# @return [Array<Integer>] Offsets of messages already marked as consumed virtually
|
|
137
147
|
def marked
|
|
138
|
-
@marked.select { |_, status| status }.map
|
|
148
|
+
@marked.select { |_, status| status }.map { |offset, _| offset }.sort
|
|
139
149
|
end
|
|
140
150
|
|
|
141
151
|
# Is there a real offset we can mark as consumed
|
|
@@ -171,11 +181,11 @@ module Karafka
|
|
|
171
181
|
private
|
|
172
182
|
|
|
173
183
|
# Recomputes the biggest possible real offset we can have.
|
|
174
|
-
# It picks the
|
|
184
|
+
# It picks the biggest offset that has an uninterrupted stream of offsets virtually marked as
|
|
175
185
|
# consumed because this will be the collective offset.
|
|
176
186
|
def materialize_real_offset
|
|
177
|
-
@marked.
|
|
178
|
-
break unless marked
|
|
187
|
+
@marked.keys.sort.each do |offset|
|
|
188
|
+
break unless @marked[offset]
|
|
179
189
|
|
|
180
190
|
@real_offset = offset
|
|
181
191
|
end
|
|
@@ -180,6 +180,16 @@ module Karafka
|
|
|
180
180
|
end
|
|
181
181
|
end
|
|
182
182
|
|
|
183
|
+
# Returns a snapshot of all jobs currently in processing per group.
|
|
184
|
+
# Useful for diagnostics during forceful shutdown to understand what is blocking.
|
|
185
|
+
#
|
|
186
|
+
# @return [Hash{String => Array<Jobs::Base>}] hash mapping group ids to arrays of jobs
|
|
187
|
+
def in_processing
|
|
188
|
+
@mutex.synchronize do
|
|
189
|
+
@in_processing.transform_values(&:dup).freeze
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
|
|
183
193
|
private
|
|
184
194
|
|
|
185
195
|
# @param group_id [String] id of the group in which jobs we're interested.
|
data/lib/karafka/server.rb
CHANGED
|
@@ -126,10 +126,19 @@ module Karafka
|
|
|
126
126
|
|
|
127
127
|
raise Errors::ForcefulShutdownError
|
|
128
128
|
rescue Errors::ForcefulShutdownError => e
|
|
129
|
+
active_listeners = listeners.select(&:active?)
|
|
130
|
+
alive_workers = workers.select(&:alive?)
|
|
131
|
+
|
|
132
|
+
# Collect details about subscription groups that still have jobs in processing
|
|
133
|
+
in_processing = jobs_queue ? jobs_queue.in_processing : {}
|
|
134
|
+
|
|
129
135
|
Karafka.monitor.instrument(
|
|
130
136
|
"error.occurred",
|
|
131
137
|
caller: self,
|
|
132
138
|
error: e,
|
|
139
|
+
active_listeners: active_listeners,
|
|
140
|
+
alive_workers: alive_workers,
|
|
141
|
+
in_processing: in_processing,
|
|
133
142
|
type: "app.stopping.error"
|
|
134
143
|
)
|
|
135
144
|
|
data/lib/karafka/swarm/node.rb
CHANGED
|
@@ -27,18 +27,6 @@ module Karafka
|
|
|
27
27
|
# @return [Integer] pid of the node
|
|
28
28
|
attr_reader :pid
|
|
29
29
|
|
|
30
|
-
# When re-creating a producer in the fork, those are not attributes we want to inherit
|
|
31
|
-
# from the parent process because they are updated in the fork. If user wants to take those
|
|
32
|
-
# from the parent process, he should redefine them by overwriting the whole producer.
|
|
33
|
-
SKIPPABLE_NEW_PRODUCER_ATTRIBUTES = %i[
|
|
34
|
-
id
|
|
35
|
-
kafka
|
|
36
|
-
logger
|
|
37
|
-
oauth
|
|
38
|
-
].freeze
|
|
39
|
-
|
|
40
|
-
private_constant :SKIPPABLE_NEW_PRODUCER_ATTRIBUTES
|
|
41
|
-
|
|
42
30
|
# @param id [Integer] number of the fork. Used for uniqueness setup for group client ids and
|
|
43
31
|
# other stuff where we need to know a unique reference of the fork in regards to the rest
|
|
44
32
|
# of them.
|
|
@@ -70,24 +58,7 @@ module Karafka
|
|
|
70
58
|
config.producer.close
|
|
71
59
|
|
|
72
60
|
old_producer = config.producer
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
# Supervisor producer is closed, hence we need a new one here
|
|
76
|
-
config.producer = WaterDrop::Producer.new do |p_config|
|
|
77
|
-
p_config.kafka = Setup::AttributesMap.producer(kafka.dup)
|
|
78
|
-
p_config.logger = config.logger
|
|
79
|
-
|
|
80
|
-
old_producer_config.to_h.each do |key, value|
|
|
81
|
-
next if SKIPPABLE_NEW_PRODUCER_ATTRIBUTES.include?(key)
|
|
82
|
-
|
|
83
|
-
p_config.public_send("#{key}=", value)
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
# Namespaced attributes need to be migrated directly on their config node
|
|
87
|
-
old_producer_config.oauth.to_h.each do |key, value|
|
|
88
|
-
p_config.oauth.public_send("#{key}=", value)
|
|
89
|
-
end
|
|
90
|
-
end
|
|
61
|
+
config.producer = ProducerReplacer.new.call(old_producer, kafka, config.logger)
|
|
91
62
|
|
|
92
63
|
@pid = ::Process.pid
|
|
93
64
|
@reader.close
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Karafka
|
|
4
|
+
module Swarm
|
|
5
|
+
# Builds a new WaterDrop producer that inherits configuration from an old one
|
|
6
|
+
#
|
|
7
|
+
# When a swarm node forks, the parent's producer must be replaced with a new one.
|
|
8
|
+
# This class encapsulates the logic for building that new producer, inheriting all relevant
|
|
9
|
+
# settings from the old one while generating fresh connection-level configuration.
|
|
10
|
+
class ProducerReplacer
|
|
11
|
+
# Attributes that should not be directly copied from the old producer config because they
|
|
12
|
+
# are either regenerated fresh (kafka, logger, id) or handled via their own namespaced
|
|
13
|
+
# migration (oauth, polling, polling.fd).
|
|
14
|
+
SKIPPABLE_ATTRIBUTES = %i[
|
|
15
|
+
id
|
|
16
|
+
kafka
|
|
17
|
+
logger
|
|
18
|
+
oauth
|
|
19
|
+
polling
|
|
20
|
+
fd
|
|
21
|
+
].freeze
|
|
22
|
+
|
|
23
|
+
private_constant :SKIPPABLE_ATTRIBUTES
|
|
24
|
+
|
|
25
|
+
# Builds a new WaterDrop producer inheriting configuration from the old one
|
|
26
|
+
#
|
|
27
|
+
# @param old_producer [WaterDrop::Producer] the old producer to inherit settings from
|
|
28
|
+
# @param kafka [Hash] app-level kafka configuration
|
|
29
|
+
# @param logger [Object] logger instance for the new producer
|
|
30
|
+
# @return [WaterDrop::Producer] new producer with inherited configuration
|
|
31
|
+
def call(old_producer, kafka, logger)
|
|
32
|
+
old_producer_config = old_producer.config
|
|
33
|
+
|
|
34
|
+
WaterDrop::Producer.new do |p_config|
|
|
35
|
+
p_config.logger = logger
|
|
36
|
+
|
|
37
|
+
migrate_kafka(p_config, old_producer_config, kafka)
|
|
38
|
+
migrate_root(p_config, old_producer_config)
|
|
39
|
+
migrate_oauth(p_config, old_producer_config)
|
|
40
|
+
migrate_polling(p_config, old_producer_config)
|
|
41
|
+
migrate_polling_fd(p_config, old_producer_config)
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
private
|
|
46
|
+
|
|
47
|
+
# Migrates root-level producer attributes from the old producer, skipping those that are
|
|
48
|
+
# regenerated fresh or handled by their own namespaced migration
|
|
49
|
+
#
|
|
50
|
+
# @param p_config [WaterDrop::Config] new producer config being built
|
|
51
|
+
# @param old_producer_config [WaterDrop::Config] old producer config to inherit from
|
|
52
|
+
def migrate_root(p_config, old_producer_config)
|
|
53
|
+
old_producer_config.to_h.each do |key, value|
|
|
54
|
+
next if SKIPPABLE_ATTRIBUTES.include?(key)
|
|
55
|
+
|
|
56
|
+
p_config.public_send("#{key}=", value)
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Builds fresh kafka config from app-level settings and preserves any producer-specific
|
|
61
|
+
# kafka settings from the old producer (e.g., enable.idempotence) that aren't in the
|
|
62
|
+
# base app kafka config
|
|
63
|
+
#
|
|
64
|
+
# @param p_config [WaterDrop::Config] new producer config being built
|
|
65
|
+
# @param old_producer_config [WaterDrop::Config] old producer config to inherit from
|
|
66
|
+
# @param kafka [Hash] app-level kafka configuration
|
|
67
|
+
def migrate_kafka(p_config, old_producer_config, kafka)
|
|
68
|
+
p_config.kafka = Setup::AttributesMap.producer(kafka.dup)
|
|
69
|
+
|
|
70
|
+
old_producer_config.kafka.each do |key, value|
|
|
71
|
+
next if p_config.kafka.key?(key)
|
|
72
|
+
|
|
73
|
+
p_config.kafka[key] = value
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# Migrates oauth configuration from the old producer
|
|
78
|
+
#
|
|
79
|
+
# @param p_config [WaterDrop::Config] new producer config being built
|
|
80
|
+
# @param old_producer_config [WaterDrop::Config] old producer config to inherit from
|
|
81
|
+
def migrate_oauth(p_config, old_producer_config)
|
|
82
|
+
old_producer_config.oauth.to_h.each do |key, value|
|
|
83
|
+
p_config.oauth.public_send("#{key}=", value)
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# Migrates polling configuration from the old producer
|
|
88
|
+
#
|
|
89
|
+
# @param p_config [WaterDrop::Config] new producer config being built
|
|
90
|
+
# @param old_producer_config [WaterDrop::Config] old producer config to inherit from
|
|
91
|
+
def migrate_polling(p_config, old_producer_config)
|
|
92
|
+
old_producer_config.polling.to_h.each do |key, value|
|
|
93
|
+
next if SKIPPABLE_ATTRIBUTES.include?(key)
|
|
94
|
+
|
|
95
|
+
p_config.polling.public_send("#{key}=", value)
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# Migrates polling fd configuration from the old producer
|
|
100
|
+
#
|
|
101
|
+
# @param p_config [WaterDrop::Config] new producer config being built
|
|
102
|
+
# @param old_producer_config [WaterDrop::Config] old producer config to inherit from
|
|
103
|
+
def migrate_polling_fd(p_config, old_producer_config)
|
|
104
|
+
old_producer_config.polling.fd.to_h.each do |key, value|
|
|
105
|
+
p_config.polling.fd.public_send("#{key}=", value)
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
end
|
data/lib/karafka/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: karafka
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.5.6
|
|
4
|
+
version: 2.5.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Maciej Mensfeld
|
|
@@ -206,6 +206,8 @@ files:
|
|
|
206
206
|
- lib/karafka/pro/active_job/consumer.rb
|
|
207
207
|
- lib/karafka/pro/active_job/dispatcher.rb
|
|
208
208
|
- lib/karafka/pro/active_job/job_options_contract.rb
|
|
209
|
+
- lib/karafka/pro/admin/recovery.rb
|
|
210
|
+
- lib/karafka/pro/admin/recovery/errors.rb
|
|
209
211
|
- lib/karafka/pro/base_consumer.rb
|
|
210
212
|
- lib/karafka/pro/cleaner.rb
|
|
211
213
|
- lib/karafka/pro/cleaner/errors.rb
|
|
@@ -542,6 +544,7 @@ files:
|
|
|
542
544
|
- lib/karafka/swarm/liveness_listener.rb
|
|
543
545
|
- lib/karafka/swarm/manager.rb
|
|
544
546
|
- lib/karafka/swarm/node.rb
|
|
547
|
+
- lib/karafka/swarm/producer_replacer.rb
|
|
545
548
|
- lib/karafka/swarm/supervisor.rb
|
|
546
549
|
- lib/karafka/templates/application_consumer.rb.erb
|
|
547
550
|
- lib/karafka/templates/example_consumer.rb.erb
|
|
@@ -576,7 +579,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
576
579
|
- !ruby/object:Gem::Version
|
|
577
580
|
version: '0'
|
|
578
581
|
requirements: []
|
|
579
|
-
rubygems_version: 4.0.
|
|
582
|
+
rubygems_version: 4.0.6
|
|
580
583
|
specification_version: 4
|
|
581
584
|
summary: Karafka is Ruby and Rails efficient Kafka processing framework.
|
|
582
585
|
test_files: []
|