karafka 2.5.7 → 2.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/lib/karafka/pro/admin/recovery/errors.rb +43 -0
- data/lib/karafka/pro/admin/recovery.rb +478 -0
- data/lib/karafka/pro/cli/topics/health.rb +2 -2
- data/lib/karafka/pro/iterator/expander.rb +1 -1
- data/lib/karafka/pro/iterator.rb +1 -1
- data/lib/karafka/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9a0dd67b39af3fbad16ce4c96f061309d97e21ae95bcace2bab1e7879823a622
|
|
4
|
+
data.tar.gz: 0061a806b411a0526826327be18946fb39fe90c324a82e45d329ef9a45e3168a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7cae09aabcea7cf6eb692bc5747dc51327601eebd680314a62c064c626f79db17a7a6c3f0e49a6a95859eeb4d6e30a51b7a6cb3b7acd1c69400734981e926db6
|
|
7
|
+
data.tar.gz: 1c1c031bd16d1da91fa7fc3a2627e157db698b51883c91197c6cf5311d19abd8d9be75742db447d588e73c1a30dca196f991a4ec9ee49f31090fcb2863f45b47
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# Karafka Framework Changelog
|
|
2
2
|
|
|
3
|
+
## 2.5.8 (2026-03-23)
|
|
4
|
+
- **[Feature]** Add `Karafka::Admin::Recovery` for coordinator-bypass offset reading and consumer group migration when the Kafka group coordinator is in a FAILED state (Pro).
|
|
5
|
+
|
|
3
6
|
## 2.5.7 (2026-03-16)
|
|
4
7
|
- [Enhancement] Report detailed blocking information (active listeners, alive workers, and in-processing jobs) during forceful shutdown instead of only aggregate counts.
|
|
5
8
|
- [Enhancement] Improve `ForcefulShutdownError` description to clearly explain when and why it is raised.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Karafka Pro - Source Available Commercial Software
|
|
4
|
+
# Copyright (c) 2017-present Maciej Mensfeld. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# This software is NOT open source. It is source-available commercial software
|
|
7
|
+
# requiring a paid license for use. It is NOT covered by LGPL.
|
|
8
|
+
#
|
|
9
|
+
# PROHIBITED:
|
|
10
|
+
# - Use without a valid commercial license
|
|
11
|
+
# - Redistribution, modification, or derivative works without authorization
|
|
12
|
+
# - Use as training data for AI/ML models or inclusion in datasets
|
|
13
|
+
# - Scraping, crawling, or automated collection for any purpose
|
|
14
|
+
#
|
|
15
|
+
# PERMITTED:
|
|
16
|
+
# - Reading, referencing, and linking for personal or commercial use
|
|
17
|
+
# - Runtime retrieval by AI assistants, coding agents, and RAG systems
|
|
18
|
+
# for the purpose of providing contextual help to Karafka users
|
|
19
|
+
#
|
|
20
|
+
# License: https://karafka.io/docs/Pro-License-Comm/
|
|
21
|
+
# Contact: contact@karafka.io
|
|
22
|
+
|
|
23
|
+
module Karafka
  module Pro
    # Pro Admin utilities
    module Admin
      class Recovery < Karafka::Admin
        # Namespace for errors raised by the recovery toolkit
        module Errors
          # Common ancestor of every recovery error, so callers can rescue them as a family
          class BaseError < ::Karafka::Errors::BaseError
          end

          # Signals that required cluster metadata (topic, partition, or broker) could not
          # be found
          class MetadataError < BaseError
          end

          # Signals that a requested partition number lies outside the valid range of
          # __consumer_offsets partitions
          class PartitionOutOfRangeError < BaseError
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,478 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Karafka Pro - Source Available Commercial Software
|
|
4
|
+
# Copyright (c) 2017-present Maciej Mensfeld. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# This software is NOT open source. It is source-available commercial software
|
|
7
|
+
# requiring a paid license for use. It is NOT covered by LGPL.
|
|
8
|
+
#
|
|
9
|
+
# PROHIBITED:
|
|
10
|
+
# - Use without a valid commercial license
|
|
11
|
+
# - Redistribution, modification, or derivative works without authorization
|
|
12
|
+
# - Use as training data for AI/ML models or inclusion in datasets
|
|
13
|
+
# - Scraping, crawling, or automated collection for any purpose
|
|
14
|
+
#
|
|
15
|
+
# PERMITTED:
|
|
16
|
+
# - Reading, referencing, and linking for personal or commercial use
|
|
17
|
+
# - Runtime retrieval by AI assistants, coding agents, and RAG systems
|
|
18
|
+
# for the purpose of providing contextual help to Karafka users
|
|
19
|
+
#
|
|
20
|
+
# License: https://karafka.io/docs/Pro-License-Comm/
|
|
21
|
+
# Contact: contact@karafka.io
|
|
22
|
+
|
|
23
|
+
module Karafka
|
|
24
|
+
module Pro
|
|
25
|
+
# Pro Admin utilities
|
|
26
|
+
module Admin
|
|
27
|
+
# Consumer group recovery toolkit.
|
|
28
|
+
#
|
|
29
|
+
# Provides coordinator-bypass offset reading and blast-radius assessment for scenarios
|
|
30
|
+
# where the Kafka group coordinator is in a FAILED state and normal admin APIs return
|
|
31
|
+
# NOT_COORDINATOR or time out.
|
|
32
|
+
#
|
|
33
|
+
# Works for any coordinator failure scenario:
|
|
34
|
+
# - KAFKA-19862 (compaction race during coordinator load)
|
|
35
|
+
# - Broker OOM / GC pause making coordinator unreachable
|
|
36
|
+
# - Network partition isolating the coordinator broker
|
|
37
|
+
# - Any future bug that transitions a coordinator shard to FAILED
|
|
38
|
+
#
|
|
39
|
+
# Each consumer group is assigned to a specific __consumer_offsets partition (and therefore
|
|
40
|
+
# a specific coordinator broker) based on its name. When that coordinator enters a FAILED
|
|
41
|
+
# state, all operations for the group - joins, heartbeats, offset commits, and offset
|
|
42
|
+
# fetches - are stuck until the coordinator recovers.
|
|
43
|
+
#
|
|
44
|
+
# A common recovery strategy is migrating to a new consumer group with a different name,
|
|
45
|
+
# which causes Kafka to hash it to a (likely) different __consumer_offsets partition served
|
|
46
|
+
# by a healthy coordinator. This class provides the tools to:
|
|
47
|
+
# 1. Read committed offsets directly from the raw __consumer_offsets log (bypassing the
|
|
48
|
+
# broken coordinator) via {#read_committed_offsets}
|
|
49
|
+
# 2. Assess blast radius: which broker coordinates a group ({#coordinator_for}), which
|
|
50
|
+
# partitions a broker leads ({#affected_partitions}), and which groups are affected
|
|
51
|
+
# ({#affected_groups})
|
|
52
|
+
#
|
|
53
|
+
# To complete the migration, use {Karafka::Admin::ConsumerGroups.seek} to write the
|
|
54
|
+
# recovered offsets to the new group.
|
|
55
|
+
#
|
|
56
|
+
# All reads go through the fetch API and never touch the group coordinator.
|
|
57
|
+
#
|
|
58
|
+
# @note These methods should NOT be used unless you are experiencing issues that require
|
|
59
|
+
# manual intervention. Misuse can lead to data loss or other problems.
|
|
60
|
+
class Recovery < Karafka::Admin
|
|
61
|
+
# Internal topic where Kafka stores committed offsets and group metadata
|
|
62
|
+
OFFSETS_TOPIC = "__consumer_offsets"
|
|
63
|
+
|
|
64
|
+
# Default lookback window for offset scanning (1 hour). Covers any normal commit interval.
|
|
65
|
+
# Provide an earlier Time if your group commits infrequently or the incident has been
|
|
66
|
+
# ongoing for longer than 1 hour.
|
|
67
|
+
DEFAULT_LAST_COMMITTED_AT_OFFSET = 3_600
|
|
68
|
+
|
|
69
|
+
private_constant :OFFSETS_TOPIC, :DEFAULT_LAST_COMMITTED_AT_OFFSET
|
|
70
|
+
|
|
71
|
+
class << self
  # Class-level shortcut that builds a fresh instance and delegates to it.
  #
  # @param consumer_group_id [String] consumer group to read offsets for
  # @param last_committed_at [Time] approximate time of last successful offset commit
  #   (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
  # @return [Hash{String => Hash{Integer => Integer}}]
  # @see #read_committed_offsets
  def read_committed_offsets(
    consumer_group_id,
    last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET
  )
    recovery = new
    recovery.read_committed_offsets(consumer_group_id, last_committed_at: last_committed_at)
  end

  # Class-level shortcut that builds a fresh instance and delegates to it.
  #
  # @param consumer_group_id [String] consumer group id
  # @return [Integer] __consumer_offsets partition number
  # @see #offsets_partition_for
  def offsets_partition_for(consumer_group_id)
    recovery = new
    recovery.offsets_partition_for(consumer_group_id)
  end

  # Class-level shortcut that builds a fresh instance and delegates to it.
  #
  # @param consumer_group_id [String] consumer group to look up
  # @return [Hash] coordinator broker info
  # @see #coordinator_for
  def coordinator_for(consumer_group_id)
    recovery = new
    recovery.coordinator_for(consumer_group_id)
  end

  # Class-level shortcut that builds a fresh instance and delegates to it.
  #
  # @param partition [Integer] __consumer_offsets partition to scan
  # @param last_committed_at [Time] approximate time of last successful offset commit
  #   (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
  # @return [Array<String>] sorted consumer group names
  # @see #affected_groups
  def affected_groups(partition, last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET)
    recovery = new
    recovery.affected_groups(partition, last_committed_at: last_committed_at)
  end

  # Class-level shortcut that builds a fresh instance and delegates to it.
  #
  # @param broker_id [Integer] broker node id
  # @return [Array<Integer>] sorted partition numbers
  # @see #affected_partitions
  def affected_partitions(broker_id)
    recovery = new
    recovery.affected_partitions(broker_id)
  end
end
|
|
114
|
+
|
|
115
|
+
# Reads committed offsets for a consumer group directly from the __consumer_offsets internal
|
|
116
|
+
# topic, bypassing the group coordinator. Only scans the single __consumer_offsets partition
|
|
117
|
+
# that holds data for the given group (determined by Java's String#hashCode mod partition
|
|
118
|
+
# count), starting from last_committed_at and reading forward to EOF. Later records
|
|
119
|
+
# overwrite earlier ones so the result always reflects the most recent committed offset per
|
|
120
|
+
# partition.
|
|
121
|
+
#
|
|
122
|
+
# @note All consumers in this group should be fully stopped before calling this method.
|
|
123
|
+
# While normally they would already be stopped due to a coordinator failure, if the
|
|
124
|
+
# cluster recovers concurrently, active consumers may commit newer offsets that this scan
|
|
125
|
+
# will not capture, resulting in stale data.
|
|
126
|
+
#
|
|
127
|
+
# @note This method may take a noticeable amount of time to complete because it scans
|
|
128
|
+
# the raw __consumer_offsets log from last_committed_at forward to the end. The duration
|
|
129
|
+
# depends on the volume of offset commits in the scan window across all consumer groups
|
|
130
|
+
# that hash to the same __consumer_offsets partition.
|
|
131
|
+
#
|
|
132
|
+
# @note The result only contains topic-partitions that had offsets committed after
|
|
133
|
+
# last_committed_at. If a partition never had an offset committed, or if the commit
|
|
134
|
+
# happened before last_committed_at, it will be absent from the result. It is the
|
|
135
|
+
# caller's responsibility to verify that all expected topic-partitions are present before
|
|
136
|
+
# using the result for migration or other operations.
|
|
137
|
+
#
|
|
138
|
+
# @param consumer_group_id [String] consumer group to read offsets for
|
|
139
|
+
# @param last_committed_at [Time] approximate time of last successful offset commit
|
|
140
|
+
# (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
|
|
141
|
+
# @return [Hash{String => Hash{Integer => Integer}}]
|
|
142
|
+
# { topic => { partition => committed_offset } }
|
|
143
|
+
#
|
|
144
|
+
# @example Read offsets for the last hour (default)
|
|
145
|
+
# Karafka::Admin::Recovery.read_committed_offsets('sync')
|
|
146
|
+
# #=> { 'events' => { 0 => 1400, 1 => 1402, ... } }
|
|
147
|
+
#
|
|
148
|
+
# @example Read offsets for the last 6 hours
|
|
149
|
+
# Karafka::Admin::Recovery.read_committed_offsets(
|
|
150
|
+
# 'sync', last_committed_at: Time.now - 6 * 3600
|
|
151
|
+
# )
|
|
152
|
+
#
|
|
153
|
+
# @example Read offsets from a specific point in time
|
|
154
|
+
# Karafka::Admin::Recovery.read_committed_offsets('sync', last_committed_at: Time.new(2025, 3, 1))
|
|
155
|
+
#
|
|
156
|
+
# @example Migrate a stuck consumer group to a new name (three-step workflow)
|
|
157
|
+
# # Step 1: Read committed offsets from the broken group (bypasses coordinator)
|
|
158
|
+
# offsets = Karafka::Admin::Recovery.read_committed_offsets('sync')
|
|
159
|
+
# #=> { 'events' => { 0 => 1400, 1 => 1402 }, 'orders' => { 0 => 890 } }
|
|
160
|
+
#
|
|
161
|
+
# # Step 2: Inspect the recovered offsets — verify all expected topics and partitions
|
|
162
|
+
# # are present and the offset values look reasonable before committing them
|
|
163
|
+
#
|
|
164
|
+
# # Step 3: Write the offsets to the target group using standard Admin APIs
|
|
165
|
+
# Karafka::Admin::ConsumerGroups.seek('sync_v2', offsets)
|
|
166
|
+
#
|
|
167
|
+
# # Now reconfigure your consumers to use 'sync_v2' and restart them
|
|
168
|
+
def read_committed_offsets(
  consumer_group_id,
  last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET
)
  # Only the single __consumer_offsets partition this group hashes to is scanned
  scan_target = {
    OFFSETS_TOPIC => { offsets_partition_for(consumer_group_id) => last_committed_at }
  }

  # topic => { partition => offset }, per-topic hashes are created on first access
  latest = Hash.new { |topics, topic| topics[topic] = {} }

  Pro::Iterator.new(scan_target, settings: @custom_kafka).each do |message|
    next unless message.raw_key

    commit = parse_offset_commit(message)
    # Skip records that are not offset commits or that belong to another group
    next unless commit && commit[:group] == consumer_group_id

    topic, partition, offset = commit.values_at(:topic, :partition, :offset)
    per_topic = latest[topic]

    if offset.nil?
      # Tombstone: the commit was deleted, so forget it and drop the topic once it is empty
      per_topic.delete(partition)
      latest.delete(topic) if per_topic.empty?
    else
      # Records arrive in log order, so a later commit simply overwrites an earlier one
      per_topic[partition] = offset
    end
  end

  # Return plain hashes ordered by topic name and then by partition number
  latest.keys.sort.each_with_object({}) do |topic, sorted|
    sorted[topic] = latest[topic].sort.to_h
  end
end
|
|
200
|
+
|
|
201
|
+
# Determines which __consumer_offsets partition holds data for a given consumer group. Kafka
|
|
202
|
+
# uses Utils.abs(String#hashCode) % numPartitions where hashCode is Java's 32-bit signed
|
|
203
|
+
# hash: s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1], computed with int32 overflow
|
|
204
|
+
# semantics. Utils.abs maps Integer.MIN_VALUE to 0.
|
|
205
|
+
#
|
|
206
|
+
# @param consumer_group_id [String] consumer group id
|
|
207
|
+
# @return [Integer] __consumer_offsets partition number
|
|
208
|
+
#
|
|
209
|
+
# @example Check which partition stores offsets for a group
|
|
210
|
+
# Karafka::Admin::Recovery.offsets_partition_for('my-group')
|
|
211
|
+
# #=> 17
|
|
212
|
+
def offsets_partition_for(consumer_group_id)
  signed_hash = java_hash_code(consumer_group_id)

  # Mirror Kafka's Utils.abs: Integer.MIN_VALUE has no positive int32 counterpart and maps to 0
  magnitude = signed_hash == -2_147_483_648 ? 0 : signed_hash.abs

  magnitude.modulo(offsets_partition_count)
end
|
|
218
|
+
|
|
219
|
+
# Returns which broker is the coordinator for a consumer group. The coordinator is the
|
|
220
|
+
# leader of the __consumer_offsets partition assigned to this group. Pure metadata lookup
|
|
221
|
+
# that does not scan any topic data.
|
|
222
|
+
#
|
|
223
|
+
# Use this to quickly identify which broker is responsible for a consumer group. During an
|
|
224
|
+
# incident, this tells you whether a specific group is affected by a broker outage. If the
|
|
225
|
+
# returned broker is the one that is down or in a FAILED state, the group is stuck and
|
|
226
|
+
# needs migration.
|
|
227
|
+
#
|
|
228
|
+
# @param consumer_group_id [String] consumer group to look up
|
|
229
|
+
# @return [Hash{Symbol => Object}] coordinator info with :partition, :broker_id,
|
|
230
|
+
# and :broker_host keys
|
|
231
|
+
#
|
|
232
|
+
# @example Find coordinator for a group
|
|
233
|
+
# Karafka::Admin::Recovery.coordinator_for('my-group')
|
|
234
|
+
# #=> { partition: 17, broker_id: 2, broker_host: "broker2:9092" }
|
|
235
|
+
#
|
|
236
|
+
# @example Check if a group is affected by a broker outage
|
|
237
|
+
# info = Karafka::Admin::Recovery.coordinator_for('my-group')
|
|
238
|
+
# if info[:broker_id] == failed_broker_id
|
|
239
|
+
# puts "Group 'my-group' is stuck on failed broker #{info[:broker_host]}"
|
|
240
|
+
# end
|
|
241
|
+
def coordinator_for(consumer_group_id)
  target_partition = offsets_partition_for(consumer_group_id)
  metadata = cluster_info

  offsets_topic = metadata.topics.find { |topic| topic[:topic_name] == OFFSETS_TOPIC }

  unless offsets_topic
    raise(Errors::MetadataError, "Could not retrieve metadata for '#{OFFSETS_TOPIC}'")
  end

  partition_info = offsets_topic[:partitions].find do |info|
    info[:partition_id] == target_partition
  end

  unless partition_info
    raise(
      Errors::MetadataError,
      "Could not find partition #{target_partition} in '#{OFFSETS_TOPIC}'"
    )
  end

  # The coordinator is whichever broker leads the group's __consumer_offsets partition
  leader_id = partition_info[:leader]

  # Broker entries may be hashes (:broker_id or :node_id keys) or objects exposing #node_id
  id_of = lambda do |node|
    node.is_a?(Hash) ? (node[:broker_id] || node[:node_id]) : node.node_id
  end

  broker = metadata.brokers.find { |node| id_of.call(node) == leader_id }

  unless broker
    raise(Errors::MetadataError, "Could not find broker #{leader_id} in cluster metadata")
  end

  host, port =
    if broker.is_a?(Hash)
      [broker[:broker_name] || broker[:host], broker[:broker_port] || broker[:port]]
    else
      [broker.host, broker.port]
    end

  {
    partition: target_partition,
    broker_id: id_of.call(broker),
    broker_host: "#{host}:#{port}"
  }
end
|
|
293
|
+
|
|
294
|
+
# Scans a __consumer_offsets partition and returns consumer group names that have active
|
|
295
|
+
# committed offsets. Groups where all offsets have been tombstoned (deleted) within the
|
|
296
|
+
# scan window are excluded.
|
|
297
|
+
#
|
|
298
|
+
# Use this to discover which consumer groups are affected when a coordinator broker fails.
|
|
299
|
+
# Combined with {#affected_partitions}, this gives the full blast radius of a broker
|
|
300
|
+
# outage: first find which __consumer_offsets partitions the failed broker leads, then
|
|
301
|
+
# scan each partition to discover all affected consumer groups.
|
|
302
|
+
#
|
|
303
|
+
# @param partition [Integer] __consumer_offsets partition to scan
|
|
304
|
+
# @param last_committed_at [Time] approximate time of last successful offset commit
|
|
305
|
+
# (default: 1 hour ago). A good rule of thumb is the crash time minus 10 minutes
|
|
306
|
+
# @return [Array<String>] sorted list of consumer group names with active offsets
|
|
307
|
+
#
|
|
308
|
+
# @example Find all groups on partition 17
|
|
309
|
+
# Karafka::Admin::Recovery.affected_groups(17)
|
|
310
|
+
# #=> ["group-a", "group-b", "group-c"]
|
|
311
|
+
#
|
|
312
|
+
# @example Full blast radius of a broker outage
|
|
313
|
+
# partitions = Karafka::Admin::Recovery.affected_partitions(failed_broker_id)
|
|
314
|
+
# all_affected = partitions.flat_map do |p|
|
|
315
|
+
# Karafka::Admin::Recovery.affected_groups(p)
|
|
316
|
+
# end.uniq
|
|
317
|
+
def affected_groups(partition, last_committed_at: Time.now - DEFAULT_LAST_COMMITTED_AT_OFFSET)
  partitions_total = offsets_partition_count
  within_bounds = partition >= 0 && partition < partitions_total

  unless within_bounds
    raise(
      Errors::PartitionOutOfRangeError,
      "Partition #{partition} is out of range (0...#{partitions_total})"
    )
  end

  # group => topic => { partition => offset }, with every level created on first access.
  # Keeping the latest state per partition lets fully tombstoned groups end up empty.
  live = Hash.new do |groups, group|
    groups[group] = Hash.new { |topics, topic| topics[topic] = {} }
  end

  scan = Pro::Iterator.new(
    { OFFSETS_TOPIC => { partition => last_committed_at } },
    settings: @custom_kafka
  )

  scan.each do |message|
    next unless message.raw_key

    commit = parse_offset_commit(message)
    next unless commit

    topics = live[commit[:group]]
    offsets = topics[commit[:topic]]

    if commit[:offset].nil?
      # Tombstone: remove the deleted commit and drop the topic once nothing is left in it
      offsets.delete(commit[:partition])
      topics.delete(commit[:topic]) if offsets.empty?
    else
      offsets[commit[:partition]] = commit[:offset]
    end
  end

  # Only groups that still hold at least one offset are reported
  live.reject { |_group, topics| topics.empty? }.keys.sort
end
|
|
354
|
+
|
|
355
|
+
# Returns all __consumer_offsets partitions led by a given broker. Pure metadata lookup
|
|
356
|
+
# that does not scan any topic data.
|
|
357
|
+
#
|
|
358
|
+
# Use this as the first step in assessing the blast radius of a broker outage. The
|
|
359
|
+
# returned partition numbers can be passed to {#affected_groups} to discover all consumer
|
|
360
|
+
# groups that need recovery or migration.
|
|
361
|
+
#
|
|
362
|
+
# @param broker_id [Integer] broker node id
|
|
363
|
+
# @return [Array<Integer>] sorted list of __consumer_offsets partition numbers
|
|
364
|
+
#
|
|
365
|
+
# @example Find partitions led by broker 2
|
|
366
|
+
# Karafka::Admin::Recovery.affected_partitions(2)
|
|
367
|
+
# #=> [3, 17, 28, 42]
|
|
368
|
+
def affected_partitions(broker_id)
  offsets_topic = cluster_info.topics.find { |topic| topic[:topic_name] == OFFSETS_TOPIC }

  unless offsets_topic
    raise(Errors::MetadataError, "Could not retrieve metadata for '#{OFFSETS_TOPIC}'")
  end

  # Collect the ids of the partitions whose leader is the requested broker
  led_by_broker = offsets_topic[:partitions].each_with_object([]) do |info, ids|
    ids << info[:partition_id] if info[:leader] == broker_id
  end

  led_by_broker.sort
end
|
|
385
|
+
|
|
386
|
+
private
|
|
387
|
+
|
|
388
|
+
# Parses a raw __consumer_offsets message into structured offset commit data.
|
|
389
|
+
# Handles both v0 and v1 offset commit key formats (both use the same layout for
|
|
390
|
+
# group/topic/partition). Tombstone records (nil payload) indicate offset deletion and
|
|
391
|
+
# are returned with offset: nil so callers can remove stale entries.
|
|
392
|
+
#
|
|
393
|
+
# @param message [Karafka::Messages::Message] raw message from __consumer_offsets
|
|
394
|
+
# @return [Hash, nil] parsed offset commit or nil if not an offset commit record.
|
|
395
|
+
# When the record is a tombstone (deletion), the :offset value will be nil.
|
|
396
|
+
def parse_offset_commit(message)
  raw_key = message.raw_key
  return nil unless raw_key

  # Work on a binary copy so every slice below is byte-indexed
  key = raw_key.b

  # Key layout: int16 version | int16 group length | group | int16 topic length | topic |
  # int32 partition. Any slice that runs past the end of the key makes unpack1 return nil;
  # such records are reported as "not an offset commit" instead of raising NoMethodError
  # and aborting the whole scan.
  key_version = key[0, 2].unpack1("n")

  # Versions 0 and 1 are offset commit records with identical key layout
  return nil unless key_version && key_version <= 1

  pos = 2
  group_length = key[pos, 2]&.unpack1("n")
  return nil unless group_length

  pos += 2
  group = key[pos, group_length]
  return nil unless group && group.bytesize == group_length

  pos += group_length
  topic_length = key[pos, 2]&.unpack1("n")
  return nil unless topic_length

  pos += 2
  topic = key[pos, topic_length]
  return nil unless topic && topic.bytesize == topic_length

  pos += topic_length
  partition = key[pos, 4]&.unpack1("N")
  return nil unless partition

  # The slices are fresh strings, so re-tagging them does not touch the message itself
  group.force_encoding("UTF-8")
  topic.force_encoding("UTF-8")

  payload = message.raw_payload

  # Tombstone (nil payload) means the offset was deleted
  return { group: group, topic: topic, partition: partition, offset: nil } unless payload

  # value layout: int16 version | int64 offset | ...
  offset = payload.b[2, 8]&.unpack1("q>")

  # A payload too short to hold the offset is malformed. Returning offset: nil here would
  # make callers treat it as a tombstone and drop a valid commit, so skip the record.
  return nil unless offset

  { group: group, topic: topic, partition: partition, offset: offset }
end
|
|
428
|
+
|
|
429
|
+
# Computes Java's String#hashCode for a given string. Java hashes UTF-16 code units
|
|
430
|
+
# (char values), not raw bytes. For ASCII-only strings this is identical to byte-level
|
|
431
|
+
# hashing, but non-ASCII characters (accented letters, CJK, emoji) require encoding to
|
|
432
|
+
# UTF-16 and hashing each 16-bit code unit (including surrogate pairs for characters
|
|
433
|
+
# above U+FFFF).
|
|
434
|
+
#
|
|
435
|
+
# @param str [String] input string
|
|
436
|
+
# @return [Integer] signed 32-bit hash value matching Java's String#hashCode
|
|
437
|
+
def java_hash_code(str)
  # Java hashes UTF-16 code units, so transcode first; "n*" then decodes the big-endian
  # byte stream straight into 16-bit values (surrogate halves stay separate units)
  code_units = str.encode("UTF-16BE").unpack("n*")

  # h = 31 * h + unit, truncated to the low 32 bits on every step to emulate the wrap-around
  # of Java int arithmetic
  unsigned = code_units.reduce(0) { |acc, unit| (acc * 31 + unit) & 0xFFFFFFFF }

  # Reinterpret the unsigned 32-bit pattern as a signed Java int
  unsigned < 0x80000000 ? unsigned : unsigned - 0x100000000
end
|
|
449
|
+
|
|
450
|
+
# Returns the partition count of the __consumer_offsets topic. Memoized per instance since
|
|
451
|
+
# this value never changes at runtime.
|
|
452
|
+
#
|
|
453
|
+
# @return [Integer] number of partitions
|
|
454
|
+
# @raise [Errors::MetadataError] when topic metadata cannot be retrieved
|
|
455
|
+
def offsets_partition_count
  # Reuse the value stored by an earlier call on this instance
  return @offsets_partition_count if @offsets_partition_count

  offsets_topic = cluster_info.topics.find { |topic| topic[:topic_name] == OFFSETS_TOPIC }

  unless offsets_topic
    raise(Errors::MetadataError, "Could not retrieve partition count for '#{OFFSETS_TOPIC}'")
  end

  @offsets_partition_count = offsets_topic[:partition_count]
end
|
|
471
|
+
end
|
|
472
|
+
end
|
|
473
|
+
end
|
|
474
|
+
end
|
|
475
|
+
|
|
476
|
+
# We alias this for Pro users so we don't end up having two Admin namespaces from the end
|
|
477
|
+
# user perspective. This enhances the UX.
|
|
478
|
+
Karafka::Admin::Recovery = Karafka::Pro::Admin::Recovery
|
|
@@ -79,8 +79,8 @@ module Karafka
|
|
|
79
79
|
# @param topic_name [String] name of the topic
|
|
80
80
|
# @return [Integer] min.insync.replicas value
|
|
81
81
|
def fetch_min_insync_replicas(topic_name)
|
|
82
|
-
configs = Admin::Configs.describe(
|
|
83
|
-
Admin::Configs::Resource.new(type: :topic, name: topic_name)
|
|
82
|
+
configs = Karafka::Admin::Configs.describe(
|
|
83
|
+
Karafka::Admin::Configs::Resource.new(type: :topic, name: topic_name)
|
|
84
84
|
).first.configs
|
|
85
85
|
|
|
86
86
|
configs.find { |c| c.name == "min.insync.replicas" }.value.to_i
|
|
@@ -89,7 +89,7 @@ module Karafka
|
|
|
89
89
|
# We cache it so we do not have to run consecutive requests to obtain data about multiple
|
|
90
90
|
# topics
|
|
91
91
|
def topics
|
|
92
|
-
@topics ||= Admin.cluster_info.topics
|
|
92
|
+
@topics ||= ::Karafka::Admin.cluster_info.topics
|
|
93
93
|
end
|
|
94
94
|
|
|
95
95
|
# @param name [String] topic name
|
data/lib/karafka/pro/iterator.rb
CHANGED
|
@@ -79,7 +79,7 @@ module Karafka
|
|
|
79
79
|
# the partitions but once we found it, given partition data is no longer needed and would
|
|
80
80
|
# only eat up resources.
|
|
81
81
|
def each
|
|
82
|
-
Admin.with_consumer(@settings) do |consumer|
|
|
82
|
+
::Karafka::Admin.with_consumer(@settings) do |consumer|
|
|
83
83
|
tpl = TplBuilder.new(consumer, @topics_with_partitions).call
|
|
84
84
|
consumer.assign(tpl)
|
|
85
85
|
|
data/lib/karafka/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: karafka
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.5.7
|
|
4
|
+
version: 2.5.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Maciej Mensfeld
|
|
@@ -206,6 +206,8 @@ files:
|
|
|
206
206
|
- lib/karafka/pro/active_job/consumer.rb
|
|
207
207
|
- lib/karafka/pro/active_job/dispatcher.rb
|
|
208
208
|
- lib/karafka/pro/active_job/job_options_contract.rb
|
|
209
|
+
- lib/karafka/pro/admin/recovery.rb
|
|
210
|
+
- lib/karafka/pro/admin/recovery/errors.rb
|
|
209
211
|
- lib/karafka/pro/base_consumer.rb
|
|
210
212
|
- lib/karafka/pro/cleaner.rb
|
|
211
213
|
- lib/karafka/pro/cleaner/errors.rb
|
|
@@ -577,7 +579,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
577
579
|
- !ruby/object:Gem::Version
|
|
578
580
|
version: '0'
|
|
579
581
|
requirements: []
|
|
580
|
-
rubygems_version: 4.0.
|
|
582
|
+
rubygems_version: 4.0.6
|
|
581
583
|
specification_version: 4
|
|
582
584
|
summary: Karafka is Ruby and Rails efficient Kafka processing framework.
|
|
583
585
|
test_files: []
|