karafka 2.0.0.beta2 → 2.0.0.beta5
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.github/workflows/ci.yml +18 -15
- data/CHANGELOG.md +49 -0
- data/Gemfile.lock +8 -8
- data/bin/benchmarks +2 -2
- data/bin/integrations +44 -15
- data/bin/scenario +29 -0
- data/bin/{stress → stress_many} +0 -0
- data/bin/stress_one +13 -0
- data/bin/wait_for_kafka +20 -0
- data/docker-compose.yml +28 -11
- data/karafka.gemspec +2 -2
- data/lib/karafka/active_job/routing/extensions.rb +12 -2
- data/lib/karafka/app.rb +2 -1
- data/lib/karafka/base_consumer.rb +75 -45
- data/lib/karafka/connection/client.rb +88 -22
- data/lib/karafka/connection/listener.rb +60 -18
- data/lib/karafka/connection/pauses_manager.rb +8 -0
- data/lib/karafka/connection/rebalance_manager.rb +20 -19
- data/lib/karafka/contracts/config.rb +17 -3
- data/lib/karafka/contracts/server_cli_options.rb +1 -1
- data/lib/karafka/errors.rb +3 -0
- data/lib/karafka/instrumentation/logger_listener.rb +34 -10
- data/lib/karafka/instrumentation/monitor.rb +3 -1
- data/lib/karafka/licenser.rb +26 -7
- data/lib/karafka/pro/active_job/consumer.rb +30 -9
- data/lib/karafka/pro/active_job/dispatcher.rb +9 -9
- data/lib/karafka/pro/active_job/job_options_contract.rb +9 -9
- data/lib/karafka/pro/base_consumer.rb +73 -0
- data/lib/karafka/pro/loader.rb +38 -20
- data/lib/karafka/pro/performance_tracker.rb +9 -9
- data/lib/karafka/pro/processing/coordinator.rb +12 -0
- data/lib/karafka/pro/processing/jobs/consume_non_blocking.rb +10 -11
- data/lib/karafka/pro/processing/jobs_builder.rb +32 -0
- data/lib/karafka/pro/processing/scheduler.rb +56 -0
- data/lib/karafka/pro/routing/extensions.rb +32 -0
- data/lib/karafka/processing/coordinator.rb +84 -0
- data/lib/karafka/processing/coordinators_buffer.rb +58 -0
- data/lib/karafka/processing/executor.rb +23 -9
- data/lib/karafka/processing/executors_buffer.rb +46 -15
- data/lib/karafka/processing/jobs/base.rb +8 -3
- data/lib/karafka/processing/jobs/consume.rb +11 -4
- data/lib/karafka/processing/jobs_builder.rb +29 -0
- data/lib/karafka/processing/result.rb +29 -0
- data/lib/karafka/processing/scheduler.rb +22 -0
- data/lib/karafka/processing/worker.rb +17 -9
- data/lib/karafka/routing/consumer_group.rb +1 -1
- data/lib/karafka/routing/subscription_group.rb +1 -1
- data/lib/karafka/routing/topic.rb +14 -0
- data/lib/karafka/setup/config.rb +19 -9
- data/lib/karafka/status.rb +1 -3
- data/lib/karafka/version.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +19 -7
- metadata.gz.sig +0 -0
- data/lib/karafka/pro/scheduler.rb +0 -54
- data/lib/karafka/scheduler.rb +0 -20
data/lib/karafka/base_consumer.rb
@@ -10,44 +10,74 @@ module Karafka
     attr_accessor :messages
     # @return [Karafka::Connection::Client] kafka connection client
     attr_accessor :client
-    # @return [Karafka::TimeTrackers::Pause] current topic partition pause tracker
-    attr_accessor :pause_tracker
+    # @return [Karafka::Processing::Coordinator] coordinator
+    attr_accessor :coordinator
     # @return [Waterdrop::Producer] producer instance
     attr_accessor :producer
 
+    # Can be used to run preparation code
+    #
+    # @private
+    # @note This should not be used by the end users as it is part of the lifecycle of things but
+    #   not as part of the public api. This can act as a hook when creating non-blocking
+    #   consumers and doing other advanced stuff
+    def on_before_consume; end
+
     # Executes the default consumer flow.
     #
+    # @return [Boolean] true if there was no exception, otherwise false.
+    #
     # @note We keep the seek offset tracking, and use it to compensate for async offset flushing
     #   that may not yet kick in when error occurs. That way we pause always on the last processed
     #   message.
     def on_consume
       Karafka.monitor.instrument('consumer.consumed', caller: self) do
         consume
-
-        pause_tracker.reset
-
-        # Mark as consumed only if manual offset management is not on
-        next if topic.manual_offset_management
-
-        # We use the non-blocking one here. If someone needs the blocking one, can implement it
-        #   with manual offset management
-        mark_as_consumed(messages.last)
       end
+
+      @coordinator.consumption(self).success!
     rescue StandardError => e
+      @coordinator.consumption(self).failure!
+
       Karafka.monitor.instrument(
         'error.occurred',
        error: e,
        caller: self,
        type: 'consumer.consume.error'
      )
+    ensure
+      # We need to decrease number of jobs that this coordinator coordinates as it has finished
+      @coordinator.decrement
+    end
+
+    # @private
+    # @note This should not be used by the end users as it is part of the lifecycle of things but
+    #   not as part of the public api.
+    def on_after_consume
+      return if revoked?
+
+      if @coordinator.success?
+        coordinator.pause_tracker.reset
+
+        # Mark as consumed only if manual offset management is not on
+        return if topic.manual_offset_management?
 
-
+        # We use the non-blocking one here. If someone needs the blocking one, can implement it
+        #   with manual offset management
+        mark_as_consumed(messages.last)
+      else
+        pause(@seek_offset || messages.first.offset)
+      end
     end
 
     # Trigger method for running on shutdown.
     #
     # @private
     def on_revoked
+      coordinator.revoke
+
+      resume
+
       Karafka.monitor.instrument('consumer.revoked', caller: self) do
         revoked
       end
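The net effect of this split is that `consume` now carries only business logic, while the success and failure bookkeeping drives offset handling in `on_after_consume`. A minimal sketch of a consumer under this flow (the `Event.store!` call is hypothetical):

class EventsConsumer < Karafka::BaseConsumer
  def consume
    messages.each do |message|
      # Any error raised here marks the batch as failed in the coordinator,
      # so on_after_consume pauses on @seek_offset instead of committing
      Event.store!(message.payload)
    end
  end
end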
@@ -76,31 +106,8 @@ module Karafka
       )
     end
 
-    # Can be used to run preparation code
-    #
-    # @private
-    # @note This should not be used by the end users as it is part of the lifecycle of things but
-    #   not as part of the public api. This can act as a hook when creating non-blocking
-    #   consumers and doing other advanced stuff
-    def on_prepared
-      Karafka.monitor.instrument('consumer.prepared', caller: self) do
-        prepared
-      end
-    rescue StandardError => e
-      Karafka.monitor.instrument(
-        'error.occurred',
-        error: e,
-        caller: self,
-        type: 'consumer.prepared.error'
-      )
-    end
-
     private
 
-    # Method that gets called in the blocking flow allowing to setup any type of resources or to
-    #   send additional commands to Kafka before the proper execution starts.
-    def prepared; end
-
     # Method that will perform business logic and on data received from Kafka (it will consume
     #   the data)
     # @note This method needs to be implemented in a subclass. We stub it here as a failover if
@@ -120,21 +127,40 @@ module Karafka
     # Marks message as consumed in an async way.
     #
     # @param message [Messages::Message] last successfully processed message.
+    # @return [Boolean] true if we were able to mark the offset, false otherwise. False indicates
+    #   that we were not able and that we have lost the partition.
+    #
     # @note We keep track of this offset in case we would mark as consumed and got error when
     #   processing another message. In case like this we do not pause on the message we've already
     #   processed but rather at the next one. This applies to both sync and async versions of this
     #   method.
     def mark_as_consumed(message)
-      client.mark_as_consumed(message)
+      unless client.mark_as_consumed(message)
+        coordinator.revoke
+
+        return false
+      end
+
       @seek_offset = message.offset + 1
+
+      true
     end
 
     # Marks message as consumed in a sync way.
     #
     # @param message [Messages::Message] last successfully processed message.
+    # @return [Boolean] true if we were able to mark the offset, false otherwise. False indicates
+    #   that we were not able and that we have lost the partition.
     def mark_as_consumed!(message)
-      client.mark_as_consumed!(message)
+      unless client.mark_as_consumed!(message)
+        coordinator.revoke
+
+        return false
+      end
+
       @seek_offset = message.offset + 1
+
+      true
     end
 
     # Pauses processing on a given offset for the current topic partition
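Because both marking methods now report whether the offset could be stored, consumers doing manual offset management can stop working on a batch as soon as the partition is lost. A hedged sketch (the `process` helper is hypothetical):

class OrdersConsumer < Karafka::BaseConsumer
  def consume
    messages.each do |message|
      process(message)

      # false means the partition was lost; offsets can no longer be stored
      return unless mark_as_consumed(message)
    end
  end

  private

  def process(message)
    # hypothetical business logic
  end
end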
@@ -144,23 +170,20 @@ module Karafka
     # @param timeout [Integer, nil] how long in milliseconds do we want to pause or nil to use the
     #   default exponential pausing strategy defined for retries
     def pause(offset, timeout = nil)
+      timeout ? coordinator.pause_tracker.pause(timeout) : coordinator.pause_tracker.pause
+
       client.pause(
         messages.metadata.topic,
         messages.metadata.partition,
         offset
       )
-
-      timeout ? pause_tracker.pause(timeout) : pause_tracker.pause
     end
 
     # Resumes processing of the current topic partition
     def resume
-      client.resume(
-        messages.metadata.topic,
-        messages.metadata.partition
-      )
-
-      pause_tracker.expire
+      # This is sufficient to expire a partition pause, as with it, the partition will be resumed
+      #   by the listener thread before the next poll.
+      coordinator.pause_tracker.expire
     end
 
     # Seeks in the context of current topic and partition
@@ -175,5 +198,12 @@ module Karafka
         )
       )
     end
+
+    # @return [Boolean] true if partition was revoked from the current consumer
+    # @note We know that partition got revoked because when we try to mark message as consumed,
+    #   unless it is successful, it will return false
+    def revoked?
+      coordinator.revoked?
+    end
   end
 end
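With `revoked?` exposed, long-running batch processing can bail out early once the assignment is gone. A sketch (the `persist` helper is hypothetical):

class AuditConsumer < Karafka::BaseConsumer
  def consume
    messages.each do |message|
      # Stop as soon as the partition has been taken away from this process
      break if revoked?

      persist(message)
    end
  end

  private

  def persist(message)
    # hypothetical storage call
  end
end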
data/lib/karafka/connection/client.rb
@@ -36,6 +36,12 @@ module Karafka
       # Marks if we need to offset. If we did not store offsets, we should not commit the offset
       #   position as it will crash rdkafka
       @offsetting = false
+      # We need to keep track of what we have paused for resuming
+      # In case we lose a partition, we still need to resume it, otherwise it won't be fetched
+      #   again if we get reassigned to it later on. We need to keep them as after revocation we
+      #   no longer may be able to fetch them from Kafka. We could build them but it is easier
+      #   to just keep them here and use if needed when cannot be obtained
+      @paused_tpls = Hash.new { |h, k| h[k] = {} }
     end
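The `@paused_tpls` cache above uses a Hash default proc, so the per-topic hash springs into existence on first access. A standalone illustration of that idiom:

paused_tpls = Hash.new { |hash, topic| hash[topic] = {} }

paused_tpls['events'][0] = :tpl   # nested hash created on demand
paused_tpls['orders']             # => {} (created even on a bare read)
paused_tpls['events'].delete(0)   # => :tpl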
@@ -45,12 +51,13 @@ module Karafka
       # @note This method should not be executed from many threads at the same time
       def batch_poll
         time_poll = TimeTrackers::Poll.new(@subscription_group.max_wait_time)
-        time_poll.start
 
         @buffer.clear
         @rebalance_manager.clear
 
         loop do
+          time_poll.start
+
           # Don't fetch more messages if we do not have any time left
           break if time_poll.exceeded?
           # Don't fetch more messages if we've fetched max as we've wanted
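Moving `time_poll.start` into the loop re-arms the timer on every iteration instead of only once per batch. A rough, simplified stand-in for `TimeTrackers::Poll` (names and internals assumed here, the real class also handles backoff) showing the shape of that budget check:

class PollBudget
  def initialize(max_wait_ms)
    @remaining = max_wait_ms / 1000.0
  end

  # Re-arm the clock; called at the top of every loop iteration
  def start
    @started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # Subtract the time spent since the matching start
  def checkpoint
    @remaining -= Process.clock_gettime(Process::CLOCK_MONOTONIC) - @started
  end

  def exceeded?
    @remaining <= 0
  end
end

budget = PollBudget.new(100)

3.times do
  budget.start  # re-armed per iteration, as in batch_poll above
  sleep(0.04)   # stand-in for a single rdkafka poll
  budget.checkpoint
  break if budget.exceeded?
end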
@@ -69,7 +76,11 @@ module Karafka
           # If partition revocation happens, we need to remove messages from revoked partitions
           #   as well as ensure we do not have duplicated due to the offset reset for partitions
           #   that we got assigned
-          remove_revoked_and_duplicated_messages if @rebalance_manager.revoked?
+          # We also do early break, so the information about rebalance is used as soon as possible
+          if @rebalance_manager.changed?
+            remove_revoked_and_duplicated_messages
+            break
+          end
 
           # Finally once we've (potentially) removed revoked, etc, if no messages were returned
           #   we can break.
@@ -86,8 +97,7 @@ module Karafka
       # @param message [Karafka::Messages::Message]
       def store_offset(message)
         @mutex.synchronize do
-          @offsetting = true
-          @kafka.store_offset(message)
+          internal_store_offset(message)
         end
       end
@@ -104,14 +114,7 @@ module Karafka
       def commit_offsets(async: true)
         @mutex.lock
 
-        return unless @offsetting
-
-        @kafka.commit(nil, async)
-        @offsetting = false
-      rescue Rdkafka::RdkafkaError => e
-        return if e.code == :no_offset
-
-        raise e
+        internal_commit_offsets(async: async)
       ensure
         @mutex.unlock
       end
@@ -128,7 +131,11 @@ module Karafka
       #
       # @param message [Messages::Message, Messages::Seek] message to which we want to seek to
       def seek(message)
+        @mutex.lock
+
         @kafka.seek(message)
+      ensure
+        @mutex.unlock
       end
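`seek` now takes the same lock/ensure-unlock route as the other mutating methods in this class. The pattern is the explicit form of `Mutex#synchronize`:

mutex = Mutex.new

# Pattern used in seek above:
mutex.lock
begin
  :critical_section
ensure
  mutex.unlock
end

# Equivalent shorthand:
mutex.synchronize { :critical_section }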
@@ -144,15 +151,21 @@ module Karafka
         # Do not pause if the client got closed, would not change anything
         return if @closed
 
+        pause_msg = Messages::Seek.new(topic, partition, offset)
+
+        internal_commit_offsets(async: false)
+
+        # Here we do not use our cached tpls because we should not try to pause something we do
+        #   not own anymore.
         tpl = topic_partition_list(topic, partition)
 
         return unless tpl
 
-        @kafka.pause(tpl)
+        @paused_tpls[topic][partition] = tpl
 
-        pause_msg = Messages::Seek.new(topic, partition, offset)
+        @kafka.pause(tpl)
 
-        seek(pause_msg)
+        @kafka.seek(pause_msg)
       ensure
         @mutex.unlock
       end
@@ -166,9 +179,18 @@ module Karafka
 
         return if @closed
 
-        tpl = topic_partition_list(topic, partition)
+        # Always commit synchronously offsets if any when we resume
+        # This prevents resuming without offset in case it would not be committed prior
+        # We can skip performance penalty since resuming should not happen too often
+        internal_commit_offsets(async: false)
+
+        # If we were not able, let's try to reuse the one we have (if we have)
+        tpl = topic_partition_list(topic, partition) || @paused_tpls[topic][partition]
 
         return unless tpl
+        # If we did not have it, it means we never paused this partition, thus no resume should
+        #   happen in the first place
+        return unless @paused_tpls[topic].delete(partition)
 
         @kafka.resume(tpl)
       ensure
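Taken together, the resume path prefers the live assignment, falls back to the tpl cached at pause time, and refuses to resume anything it never paused. A self-contained sketch of that guard logic with stubbed objects:

paused_tpls = Hash.new { |h, k| h[k] = {} }
paused_tpls['events'][0] = :cached_tpl

def resume_guard(paused_tpls, live_tpl, topic, partition)
  # Prefer the live assignment, fall back to what we cached when pausing
  tpl = live_tpl || paused_tpls[topic][partition]
  return :skipped unless tpl

  # Only resume partitions we actually paused; delete prevents double resumes
  return :skipped unless paused_tpls[topic].delete(partition)

  [:resumed, tpl]
end

p resume_guard(paused_tpls, nil, 'events', 0) # => [:resumed, :cached_tpl]
p resume_guard(paused_tpls, nil, 'events', 0) # => :skipped (already resumed)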
@@ -187,6 +209,7 @@ module Karafka
       # Marks given message as consumed.
       #
       # @param [Karafka::Messages::Message] message that we want to mark as processed
+      # @return [Boolean] true if successful. False if we no longer own given partition
       # @note This method won't trigger automatic offsets commits, rather relying on the offset
       #   check-pointing trigger that happens with each batch processed
       def mark_as_consumed(message)
@@ -196,8 +219,10 @@ module Karafka
       # Marks a given message as consumed and commits the offsets in a blocking way.
       #
       # @param [Karafka::Messages::Message] message that we want to mark as processed
+      # @return [Boolean] true if successful. False if we no longer own given partition
       def mark_as_consumed!(message)
-        mark_as_consumed(message)
+        return false unless mark_as_consumed(message)
+
         commit_offsets!
       end
@@ -208,17 +233,51 @@ module Karafka
         @mutex.synchronize do
           @closed = false
           @offsetting = false
+          @paused_tpls.clear
           @kafka = build_consumer
         end
       end
 
       private
 
+      # When we cannot store an offset, it means we no longer own the partition
+      #
+      # Non thread-safe offset storing method
+      # @param message [Karafka::Messages::Message]
+      # @return [Boolean] true if we could store the offset (if we still own the partition)
+      def internal_store_offset(message)
+        @offsetting = true
+        @kafka.store_offset(message)
+        true
+      rescue Rdkafka::RdkafkaError => e
+        return false if e.code == :assignment_lost
+        return false if e.code == :state
+
+        raise e
+      end
+
+      # Non thread-safe message committing method
+      # @param async [Boolean] should the commit happen async or sync (async by default)
+      # @return [Boolean] true if offset commit worked, false if we've lost the assignment
+      def internal_commit_offsets(async: true)
+        return true unless @offsetting
+
+        @kafka.commit(nil, async)
+        @offsetting = false
+
+        true
+      rescue Rdkafka::RdkafkaError => e
+        return false if e.code == :assignment_lost
+        return false if e.code == :no_offset
+
+        raise e
+      end
+
       # Commits the stored offsets in a sync way and closes the consumer.
       def close
-        commit_offsets!
-
         @mutex.synchronize do
+          internal_commit_offsets(async: false)
+
           @closed = true
 
           # Remove callbacks runners that were registered
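Both private helpers share one triage rule: error codes that signal a lost assignment map to `false`, anything else re-raises. A runnable sketch with a stub standing in for `Rdkafka::RdkafkaError` (the real class comes from rdkafka-ruby):

class FakeRdkafkaError < StandardError
  attr_reader :code

  def initialize(code)
    @code = code
    super(code.to_s)
  end
end

def with_ownership_check
  yield
  true
rescue FakeRdkafkaError => e
  # :assignment_lost / :state mean we no longer own the partition
  return false if %i[assignment_lost state].include?(e.code)

  raise e
end

p with_ownership_check { :stored }                                  # => true
p with_ownership_check { raise FakeRdkafkaError, :assignment_lost } # => false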
@@ -227,7 +286,8 @@ module Karafka
 
           @kafka.close
           @buffer.clear
-          @rebalance_manager.clear
+          # @note We do not clear rebalance manager here as we may still have revocation info here
+          #   that we want to consider valid prior to running another reconnection
         end
       end
@@ -280,7 +340,13 @@ module Karafka
 
         time_poll.backoff
 
-        retry
+        # We return nil, so we do not restart until running the whole loop
+        # This allows us to run revocation jobs and other things and we will pick up new work
+        #   next time after dispatching all the things that are needed
+        #
+        # If we would retry here, the client reset would become transparent and we would not have
+        #   a chance to take any actions
+        nil
       end
@@ -323,7 +389,7 @@ module Karafka
       #   we are no longer responsible in a given process for processing those messages and they
       #   should have been picked up by a different process.
       def remove_revoked_and_duplicated_messages
-        @rebalance_manager.revoked_partitions.each do |topic, partitions|
+        @rebalance_manager.lost_partitions.each do |topic, partitions|
           partitions.each do |partition|
             @buffer.delete(topic, partition)
           end
data/lib/karafka/connection/listener.rb
@@ -10,17 +10,23 @@ module Karafka
     class Listener
       include Helpers::Async
 
+      # Can be useful for logging
+      # @return [String] id of this listener
+      attr_reader :id
+
       # @param subscription_group [Karafka::Routing::SubscriptionGroup]
       # @param jobs_queue [Karafka::Processing::JobsQueue] queue where we should push work
       # @return [Karafka::Connection::Listener] listener instance
       def initialize(subscription_group, jobs_queue)
+        @id = SecureRandom.uuid
         @subscription_group = subscription_group
         @jobs_queue = jobs_queue
-        @pauses_manager = PausesManager.new
+        @jobs_builder = ::Karafka::App.config.internal.processing.jobs_builder
+        @coordinators = Processing::CoordinatorsBuffer.new
         @client = Client.new(@subscription_group)
         @executors = Processing::ExecutorsBuffer.new(@client, subscription_group)
         # We reference scheduler here as it is much faster than fetching this each time
-        @scheduler = ::Karafka::App.config.internal.scheduler
+        @scheduler = ::Karafka::App.config.internal.processing.scheduler
         # We keep one buffer for messages to preserve memory and not allocate extra objects
         # We can do this that way because we always first schedule jobs using messages before we
         #   fetch another batch.
@@ -62,16 +68,20 @@ module Karafka
 
           resume_paused_partitions
 
-          # We need to fetch data before we revoke lost partitions details as during the polling
-          #   the callbacks for tracking lost partitions are triggered. Otherwise we would be always
-          #   one batch behind.
-          poll_and_remap_messages
-
           Karafka.monitor.instrument(
             'connection.listener.fetch_loop.received',
             caller: self,
             messages_buffer: @messages_buffer
-          )
+          ) do
+            # We need to fetch data before we revoke lost partitions details as during the polling
+            #   the callbacks for tracking lost partitions are triggered. Otherwise we would be
+            #   always one batch behind.
+            poll_and_remap_messages
+          end
+
+          # This will ensure, that in the next poll, we continue processing (if we get them back)
+          #   partitions that we have paused
+          resume_assigned_partitions
 
           # If there were revoked partitions, we need to wait on their jobs to finish before
           #   distributing consuming jobs as upon revoking, we might get assigned to the same
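Wrapping the poll in the instrument block means subscribers timing `connection.listener.fetch_loop.received` now measure the fetch itself rather than an already-filled buffer. A tiny stand-in monitor showing the block-passing shape:

def instrument(event_id, payload = {})
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  result = yield
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  puts format('%s took %.4fs (payload: %p)', event_id, elapsed, payload)
  result
end

instrument('connection.listener.fetch_loop.received', caller: :listener) do
  sleep(0.01) # stand-in for poll_and_remap_messages
  :messages
end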
@@ -80,6 +90,9 @@ module Karafka
           build_and_schedule_revoke_lost_partitions_jobs
 
           # We wait only on jobs from our subscription group. Other groups are independent.
+          # This will block on revoked jobs until they are finished. Those are not meant to last
+          #   long and should not have any bigger impact on the system. Doing this in a blocking way
+          #   simplifies the overall design and prevents from race conditions
           wait
 
           build_and_schedule_consumption_jobs
@@ -130,7 +143,7 @@ module Karafka
 
       # Resumes processing of partitions that were paused due to an error.
       def resume_paused_partitions
-        @pauses_manager.resume do |topic, partition|
+        @coordinators.resume do |topic, partition|
          @client.resume(topic, partition)
        end
      end
@@ -146,9 +159,23 @@ module Karafka
 
         revoked_partitions.each do |topic, partitions|
           partitions.each do |partition|
-            executor = @executors.revoke(topic, partition)
-
-            jobs << Processing::Jobs::Revoked.new(executor)
+            # We revoke the coordinator here, so we do not have to revoke it in the revoke job
+            #   itself (this happens prior to scheduling those jobs)
+            @coordinators.revoke(topic, partition)
+
+            # There may be a case where we have lost partition of which data we have never
+            #   processed (if it was assigned and revoked really fast), thus we may not have it
+            #   here. In cases like this, we do not run a revocation job
+            @executors.find_all(topic, partition).each do |executor|
+              jobs << @jobs_builder.revoked(executor)
+            end
+
+            # We need to remove all the executors of a given topic partition that we have lost, so
+            #   next time we pick up its work, new executors kick in. This may be needed especially
+            #   for LRJ where we could end up with a race condition
+            # This revocation needs to happen after the jobs are scheduled, otherwise they would
+            #   be scheduled with new executors instead of old
+            @executors.revoke(topic, partition)
           end
         end
@@ -160,7 +187,7 @@ module Karafka
         jobs = []
 
         @executors.each do |_, _, executor|
-          jobs << Processing::Jobs::Shutdown.new(executor)
+          jobs << @jobs_builder.shutdown(executor)
         end
 
         @scheduler.schedule_shutdown(@jobs_queue, jobs)
@@ -177,6 +204,17 @@ module Karafka
         )
       end
 
+      # Revoked partition needs to be resumed if we were processing them earlier. This will do
+      #   nothing to things that we are planning to process. Without this, things we get
+      #   re-assigned would not be polled.
+      def resume_assigned_partitions
+        @client.rebalance_manager.assigned_partitions.each do |topic, partitions|
+          partitions.each do |partition|
+            @client.resume(topic, partition)
+          end
+        end
+      end
+
       # Takes the messages per topic partition and enqueues processing jobs in threads using
       #   given scheduler.
       def build_and_schedule_consumption_jobs
@@ -185,13 +223,17 @@ module Karafka
         jobs = []
 
         @messages_buffer.each do |topic, partition, messages|
-          pause = @pauses_manager.fetch(topic, partition)
+          coordinator = @coordinators.find_or_create(topic, partition)
+
+          # Start work coordination for this topic partition
+          coordinator.start
 
-          next if pause.paused?
+          # Count the job we're going to create here
+          coordinator.increment
 
-          executor = @executors.fetch(topic, partition, pause)
+          executor = @executors.find_or_create(topic, partition, 0)
 
-          jobs << Processing::Jobs::Consume.new(executor, messages)
+          jobs << @jobs_builder.consume(executor, messages, coordinator)
         end
 
         @scheduler.schedule_consumption(@jobs_queue, jobs)
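The `start` / `increment` calls here pair with the `decrement` in `BaseConsumer#on_consume`'s ensure block: the coordinator is, at its core, a thread-safe per-partition job counter plus a failure flag. A minimal sketch of that contract (the real `Processing::Coordinator` also owns the pause tracker and more):

class MiniCoordinator
  def initialize
    @mutex = Mutex.new
    @running_jobs = 0
    @failed = false
  end

  def increment
    @mutex.synchronize { @running_jobs += 1 }
  end

  def decrement
    @mutex.synchronize { @running_jobs -= 1 }
  end

  def failure!
    @mutex.synchronize { @failed = true }
  end

  # Done only when all counted jobs finished and none of them failed
  def success?
    @mutex.synchronize { @running_jobs.zero? && !@failed }
  end
end

coordinator = MiniCoordinator.new
coordinator.increment  # listener counts the job it schedules
coordinator.decrement  # consumer's ensure block, once the job finished
p coordinator.success? # => true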
@@ -227,7 +269,7 @@ module Karafka
         @jobs_queue.wait(@subscription_group.id)
         @jobs_queue.clear(@subscription_group.id)
         @client.reset
-        @pauses_manager = PausesManager.new
+        @coordinators.reset
         @executors = Processing::ExecutorsBuffer.new(@client, @subscription_group)
       end
     end
data/lib/karafka/connection/pauses_manager.rb
@@ -25,6 +25,14 @@ module Karafka
         )
       end
 
+      # Revokes pause tracker for a given topic partition
+      #
+      # @param topic [String] topic name
+      # @param partition [Integer] partition number
+      def revoke(topic, partition)
+        @pauses[topic].delete(partition)
+      end
+
       # Resumes processing of partitions for which pause time has ended.
       #
       # @yieldparam [String] topic name
data/lib/karafka/connection/rebalance_manager.rb
@@ -18,13 +18,15 @@ module Karafka
       # Empty array for internal usage not to create new objects
       EMPTY_ARRAY = [].freeze
 
+      attr_reader :assigned_partitions, :revoked_partitions
+
       private_constant :EMPTY_ARRAY
 
       # @return [RebalanceManager]
       def initialize
         @assigned_partitions = {}
         @revoked_partitions = {}
-        @lost_partitions = {}
+        @changed = false
       end
 
       # Resets the rebalance manager state
@@ -33,26 +35,12 @@ module Karafka
       def clear
         @assigned_partitions.clear
         @revoked_partitions.clear
-        @lost_partitions.clear
-      end
-
-      # @return [Hash<String, Array<Integer>>] hash where the keys are the names of topics for
-      #   which we've lost partitions and array with ids of the partitions as the value
-      # @note We do not consider as lost topics and partitions that got revoked and assigned
-      def revoked_partitions
-        return @revoked_partitions if @revoked_partitions.empty?
-        return @lost_partitions unless @lost_partitions.empty?
-
-        @revoked_partitions.each do |topic, partitions|
-          @lost_partitions[topic] = partitions - @assigned_partitions.fetch(topic, EMPTY_ARRAY)
-        end
-
-        @lost_partitions
+        @changed = false
       end
 
-      # @return [Boolean]
-      def revoked?
-        !@revoked_partitions.empty?
+      # @return [Boolean] indicates a state change in the partitions assignment
+      def changed?
+        @changed
       end
 
       # Callback that kicks in inside of rdkafka, when new partitions are assigned.
@@ -62,6 +50,7 @@ module Karafka
       # @param partitions [Rdkafka::Consumer::TopicPartitionList]
       def on_partitions_assigned(_, partitions)
         @assigned_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
+        @changed = true
       end
 
       # Callback that kicks in inside of rdkafka, when partitions are revoked.
@@ -71,6 +60,18 @@ module Karafka
       # @param partitions [Rdkafka::Consumer::TopicPartitionList]
       def on_partitions_revoked(_, partitions)
         @revoked_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
+        @changed = true
+      end
+
+      # We consider as lost only partitions that were taken away and not re-assigned back to us
+      def lost_partitions
+        lost_partitions = {}
+
+        revoked_partitions.each do |topic, partitions|
+          lost_partitions[topic] = partitions - assigned_partitions.fetch(topic, EMPTY_ARRAY)
+        end
+
+        lost_partitions
       end
     end
   end
 end
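`lost_partitions` is a per-topic set difference: whatever was revoked and not re-assigned within the same rebalance counts as lost. Standalone:

revoked  = { 'events' => [0, 1, 2], 'orders' => [5] }
assigned = { 'events' => [2] }

lost = revoked.each_with_object({}) do |(topic, partitions), acc|
  acc[topic] = partitions - assigned.fetch(topic, [])
end

p lost # => {"events"=>[0, 1], "orders"=>[5]}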