karafka 2.0.0.beta2 → 2.0.0.beta5

Files changed (58)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.github/workflows/ci.yml +18 -15
  4. data/CHANGELOG.md +49 -0
  5. data/Gemfile.lock +8 -8
  6. data/bin/benchmarks +2 -2
  7. data/bin/integrations +44 -15
  8. data/bin/scenario +29 -0
  9. data/bin/{stress → stress_many} +0 -0
  10. data/bin/stress_one +13 -0
  11. data/bin/wait_for_kafka +20 -0
  12. data/docker-compose.yml +28 -11
  13. data/karafka.gemspec +2 -2
  14. data/lib/karafka/active_job/routing/extensions.rb +12 -2
  15. data/lib/karafka/app.rb +2 -1
  16. data/lib/karafka/base_consumer.rb +75 -45
  17. data/lib/karafka/connection/client.rb +88 -22
  18. data/lib/karafka/connection/listener.rb +60 -18
  19. data/lib/karafka/connection/pauses_manager.rb +8 -0
  20. data/lib/karafka/connection/rebalance_manager.rb +20 -19
  21. data/lib/karafka/contracts/config.rb +17 -3
  22. data/lib/karafka/contracts/server_cli_options.rb +1 -1
  23. data/lib/karafka/errors.rb +3 -0
  24. data/lib/karafka/instrumentation/logger_listener.rb +34 -10
  25. data/lib/karafka/instrumentation/monitor.rb +3 -1
  26. data/lib/karafka/licenser.rb +26 -7
  27. data/lib/karafka/pro/active_job/consumer.rb +30 -9
  28. data/lib/karafka/pro/active_job/dispatcher.rb +9 -9
  29. data/lib/karafka/pro/active_job/job_options_contract.rb +9 -9
  30. data/lib/karafka/pro/base_consumer.rb +73 -0
  31. data/lib/karafka/pro/loader.rb +38 -20
  32. data/lib/karafka/pro/performance_tracker.rb +9 -9
  33. data/lib/karafka/pro/processing/coordinator.rb +12 -0
  34. data/lib/karafka/pro/processing/jobs/consume_non_blocking.rb +10 -11
  35. data/lib/karafka/pro/processing/jobs_builder.rb +32 -0
  36. data/lib/karafka/pro/processing/scheduler.rb +56 -0
  37. data/lib/karafka/pro/routing/extensions.rb +32 -0
  38. data/lib/karafka/processing/coordinator.rb +84 -0
  39. data/lib/karafka/processing/coordinators_buffer.rb +58 -0
  40. data/lib/karafka/processing/executor.rb +23 -9
  41. data/lib/karafka/processing/executors_buffer.rb +46 -15
  42. data/lib/karafka/processing/jobs/base.rb +8 -3
  43. data/lib/karafka/processing/jobs/consume.rb +11 -4
  44. data/lib/karafka/processing/jobs_builder.rb +29 -0
  45. data/lib/karafka/processing/result.rb +29 -0
  46. data/lib/karafka/processing/scheduler.rb +22 -0
  47. data/lib/karafka/processing/worker.rb +17 -9
  48. data/lib/karafka/routing/consumer_group.rb +1 -1
  49. data/lib/karafka/routing/subscription_group.rb +1 -1
  50. data/lib/karafka/routing/topic.rb +14 -0
  51. data/lib/karafka/setup/config.rb +19 -9
  52. data/lib/karafka/status.rb +1 -3
  53. data/lib/karafka/version.rb +1 -1
  54. data.tar.gz.sig +0 -0
  55. metadata +19 -7
  56. metadata.gz.sig +0 -0
  57. data/lib/karafka/pro/scheduler.rb +0 -54
  58. data/lib/karafka/scheduler.rb +0 -20
data/lib/karafka/base_consumer.rb
@@ -10,44 +10,74 @@ module Karafka
     attr_accessor :messages
     # @return [Karafka::Connection::Client] kafka connection client
     attr_accessor :client
-    # @return [Karafka::TimeTrackers::Pause] current topic partition pause tracker
-    attr_accessor :pause_tracker
+    # @return [Karafka::Processing::Coordinator] coordinator
+    attr_accessor :coordinator
     # @return [Waterdrop::Producer] producer instance
     attr_accessor :producer
 
+    # Can be used to run preparation code
+    #
+    # @private
+    # @note This should not be used by the end users as it is part of the lifecycle of things but
+    #   not as part of the public api. This can act as a hook when creating non-blocking
+    #   consumers and doing other advanced stuff
+    def on_before_consume; end
+
     # Executes the default consumer flow.
     #
+    # @return [Boolean] true if there was no exception, otherwise false.
+    #
     # @note We keep the seek offset tracking, and use it to compensate for async offset flushing
     #   that may not yet kick in when error occurs. That way we pause always on the last processed
     #   message.
     def on_consume
       Karafka.monitor.instrument('consumer.consumed', caller: self) do
         consume
-
-        pause_tracker.reset
-
-        # Mark as consumed only if manual offset management is not on
-        next if topic.manual_offset_management
-
-        # We use the non-blocking one here. If someone needs the blocking one, can implement it
-        # with manual offset management
-        mark_as_consumed(messages.last)
       end
+
+      @coordinator.consumption(self).success!
     rescue StandardError => e
+      @coordinator.consumption(self).failure!
+
       Karafka.monitor.instrument(
         'error.occurred',
         error: e,
         caller: self,
         type: 'consumer.consume.error'
       )
+    ensure
+      # We need to decrease number of jobs that this coordinator coordinates as it has finished
+      @coordinator.decrement
+    end
+
+    # @private
+    # @note This should not be used by the end users as it is part of the lifecycle of things but
+    #   not as part of the public api.
+    def on_after_consume
+      return if revoked?
+
+      if @coordinator.success?
+        coordinator.pause_tracker.reset
+
+        # Mark as consumed only if manual offset management is not on
+        return if topic.manual_offset_management?
 
-      pause(@seek_offset || messages.first.offset)
+        # We use the non-blocking one here. If someone needs the blocking one, can implement it
+        # with manual offset management
+        mark_as_consumed(messages.last)
+      else
+        pause(@seek_offset || messages.first.offset)
+      end
     end
 
     # Trigger method for running on shutdown.
     #
     # @private
     def on_revoked
+      coordinator.revoke
+
+      resume
+
       Karafka.monitor.instrument('consumer.revoked', caller: self) do
         revoked
       end
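
The three hooks above are driven by the processing executor (its changes live in data/lib/karafka/processing/executor.rb and are not part of this hunk). A rough, illustrative sketch of the order in which a consume job invokes them, assuming a heavily simplified executor:

```ruby
# Illustrative only - approximate call order for a consume job; the real executor
# also handles consumer instantiation, message assignment and instrumentation.
def run_consume_job(consumer)
  consumer.on_before_consume # internal lifecycle hook, not public API
  consumer.on_consume        # runs #consume, records success!/failure! on the coordinator
                             # and always decrements the coordinator's job counter
  consumer.on_after_consume  # on success: resets the pause tracker and marks offsets
                             # on failure: pauses on @seek_offset or the first message offset
end
```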
@@ -76,31 +106,8 @@ module Karafka
       )
     end
 
-    # Can be used to run preparation code
-    #
-    # @private
-    # @note This should not be used by the end users as it is part of the lifecycle of things but
-    #   not as part of the public api. This can act as a hook when creating non-blocking
-    #   consumers and doing other advanced stuff
-    def on_prepared
-      Karafka.monitor.instrument('consumer.prepared', caller: self) do
-        prepared
-      end
-    rescue StandardError => e
-      Karafka.monitor.instrument(
-        'error.occurred',
-        error: e,
-        caller: self,
-        type: 'consumer.prepared.error'
-      )
-    end
-
     private
 
-    # Method that gets called in the blocking flow allowing to setup any type of resources or to
-    #   send additional commands to Kafka before the proper execution starts.
-    def prepared; end
-
     # Method that will perform business logic and on data received from Kafka (it will consume
     #   the data)
     # @note This method needs bo be implemented in a subclass. We stub it here as a failover if
@@ -120,21 +127,40 @@ module Karafka
     # Marks message as consumed in an async way.
     #
     # @param message [Messages::Message] last successfully processed message.
+    # @return [Boolean] true if we were able to mark the offset, false otherwise. False indicates
+    #   that we were not able and that we have lost the partition.
+    #
     # @note We keep track of this offset in case we would mark as consumed and got error when
     #   processing another message. In case like this we do not pause on the message we've already
     #   processed but rather at the next one. This applies to both sync and async versions of this
     #   method.
     def mark_as_consumed(message)
-      client.mark_as_consumed(message)
+      unless client.mark_as_consumed(message)
+        coordinator.revoke
+
+        return false
+      end
+
       @seek_offset = message.offset + 1
+
+      true
     end
 
     # Marks message as consumed in a sync way.
     #
     # @param message [Messages::Message] last successfully processed message.
+    # @return [Boolean] true if we were able to mark the offset, false otherwise. False indicates
+    #   that we were not able and that we have lost the partition.
     def mark_as_consumed!(message)
-      client.mark_as_consumed!(message)
+      unless client.mark_as_consumed!(message)
+        coordinator.revoke
+
+        return false
+      end
+
       @seek_offset = message.offset + 1
+
+      true
     end
 
     # Pauses processing on a given offset for the current topic partition
@@ -144,23 +170,20 @@ module Karafka
     # @param timeout [Integer, nil] how long in milliseconds do we want to pause or nil to use the
     #   default exponential pausing strategy defined for retries
     def pause(offset, timeout = nil)
+      timeout ? coordinator.pause_tracker.pause(timeout) : coordinator.pause_tracker.pause
+
       client.pause(
         messages.metadata.topic,
         messages.metadata.partition,
         offset
       )
-
-      timeout ? pause_tracker.pause(timeout) : pause_tracker.pause
     end
 
     # Resumes processing of the current topic partition
     def resume
-      client.resume(
-        messages.metadata.topic,
-        messages.metadata.partition
-      )
-
-      pause_tracker.expire
+      # This is sufficient to expire a partition pause, as with it will be resumed by the listener
+      #   thread before the next poll.
+      coordinator.pause_tracker.expire
     end
 
     # Seeks in the context of current topic and partition
@@ -175,5 +198,12 @@ module Karafka
       )
     end
+
+    # @return [Boolean] true if partition was revoked from the current consumer
+    # @note We know that partition got revoked because when we try to mark message as consumed,
+    #   unless if is successful, it will return false
+    def revoked?
+      coordinator.revoked?
+    end
   end
 end
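
With the boolean return values above, a consumer can notice a lost partition mid-batch. A hedged sketch of such usage (the consumer class and its helper are made up for illustration and are not part of this release):

```ruby
# Hypothetical consumer relying on mark_as_consumed returning false once the
# partition has been lost; revoked? flips to true at the same moment.
class OrdersConsumer < Karafka::BaseConsumer
  def consume
    messages.each do |message|
      persist(message.payload)

      # Stop early instead of processing data for a partition we no longer own
      break unless mark_as_consumed(message)
    end
  end

  private

  def persist(payload)
    # business logic placeholder
  end
end
```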
data/lib/karafka/connection/client.rb
@@ -36,6 +36,12 @@ module Karafka
         # Marks if we need to offset. If we did not store offsets, we should not commit the offset
         #   position as it will crash rdkafka
         @offsetting = false
+        # We need to keep track of what we have paused for resuming
+        # In case we loose partition, we still need to resume it, otherwise it won't be fetched
+        # again if we get reassigned to it later on. We need to keep them as after revocation we
+        # no longer may be able to fetch them from Kafka. We could build them but it is easier
+        # to just keep them here and use if needed when cannot be obtained
+        @paused_tpls = Hash.new { |h, k| h[k] = {} }
       end
 
       # Fetches messages within boundaries defined by the settings (time, size, topics, etc).
@@ -45,12 +51,13 @@ module Karafka
       # @note This method should not be executed from many threads at the same time
       def batch_poll
         time_poll = TimeTrackers::Poll.new(@subscription_group.max_wait_time)
-        time_poll.start
 
         @buffer.clear
         @rebalance_manager.clear
 
         loop do
+          time_poll.start
+
           # Don't fetch more messages if we do not have any time left
           break if time_poll.exceeded?
           # Don't fetch more messages if we've fetched max as we've wanted
@@ -69,7 +76,11 @@ module Karafka
           # If partition revocation happens, we need to remove messages from revoked partitions
           # as well as ensure we do not have duplicated due to the offset reset for partitions
           # that we got assigned
-          remove_revoked_and_duplicated_messages if @rebalance_manager.revoked_partitions?
+          # We also do early break, so the information about rebalance is used as soon as possible
+          if @rebalance_manager.changed?
+            remove_revoked_and_duplicated_messages
+            break
+          end
 
           # Finally once we've (potentially) removed revoked, etc, if no messages were returned
           # we can break.
@@ -86,8 +97,7 @@ module Karafka
       # @param message [Karafka::Messages::Message]
       def store_offset(message)
         @mutex.synchronize do
-          @offsetting = true
-          @kafka.store_offset(message)
+          internal_store_offset(message)
         end
       end
 
@@ -104,14 +114,7 @@ module Karafka
       def commit_offsets(async: true)
         @mutex.lock
 
-        return unless @offsetting
-
-        @kafka.commit(nil, async)
-        @offsetting = false
-      rescue Rdkafka::RdkafkaError => e
-        return if e.code == :no_offset
-
-        raise e
+        internal_commit_offsets(async: async)
       ensure
         @mutex.unlock
       end
@@ -128,7 +131,11 @@ module Karafka
       #
       # @param message [Messages::Message, Messages::Seek] message to which we want to seek to
       def seek(message)
+        @mutex.lock
+
         @kafka.seek(message)
+      ensure
+        @mutex.unlock
       end
 
       # Pauses given partition and moves back to last successful offset processed.
@@ -144,15 +151,21 @@ module Karafka
        # Do not pause if the client got closed, would not change anything
        return if @closed
 
+        pause_msg = Messages::Seek.new(topic, partition, offset)
+
+        internal_commit_offsets(async: false)
+
+        # Here we do not use our cached tpls because we should not try to pause something we do
+        #   not own anymore.
        tpl = topic_partition_list(topic, partition)
 
        return unless tpl
 
-        @kafka.pause(tpl)
+        @paused_tpls[topic][partition] = tpl
 
-        pause_msg = Messages::Seek.new(topic, partition, offset)
+        @kafka.pause(tpl)
 
-        seek(pause_msg)
+        @kafka.seek(pause_msg)
      ensure
        @mutex.unlock
      end
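
The @paused_tpls cache introduced in the initializer and filled in during pause above is a nested Hash with a default block, so topic/partition lookups need no pre-initialization. A minimal plain-Ruby illustration of that behaviour (values stubbed with a symbol):

```ruby
# Auto-vivifying nested hash, as built in Client#initialize above
paused_tpls = Hash.new { |h, k| h[k] = {} }

paused_tpls['orders'][0] = :cached_tpl # stored on pause
paused_tpls['orders'].delete(0)        # => :cached_tpl, consumed on resume
paused_tpls['orders'].delete(0)        # => nil, partition was never (re)paused
paused_tpls['payments'][5]             # => nil, yet the 'payments' key now exists
```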
@@ -166,9 +179,18 @@ module Karafka
 
        return if @closed
 
-        tpl = topic_partition_list(topic, partition)
+        # Always commit synchronously offsets if any when we resume
+        # This prevents resuming without offset in case it would not be committed prior
+        # We can skip performance penalty since resuming should not happen too often
+        internal_commit_offsets(async: false)
+
+        # If we were not able, let's try to reuse the one we have (if we have)
+        tpl = topic_partition_list(topic, partition) || @paused_tpls[topic][partition]
 
        return unless tpl
+        # If we did not have it, it means we never paused this partition, thus no resume should
+        #   happen in the first place
+        return unless @paused_tpls[topic].delete(partition)
 
        @kafka.resume(tpl)
      ensure
@@ -187,6 +209,7 @@ module Karafka
      # Marks given message as consumed.
      #
      # @param [Karafka::Messages::Message] message that we want to mark as processed
+      # @return [Boolean] true if successful. False if we no longer own given partition
      # @note This method won't trigger automatic offsets commits, rather relying on the offset
      #   check-pointing trigger that happens with each batch processed
      def mark_as_consumed(message)
@@ -196,8 +219,10 @@ module Karafka
      # Marks a given message as consumed and commits the offsets in a blocking way.
      #
      # @param [Karafka::Messages::Message] message that we want to mark as processed
+      # @return [Boolean] true if successful. False if we no longer own given partition
      def mark_as_consumed!(message)
-        mark_as_consumed(message)
+        return false unless mark_as_consumed(message)
+
        commit_offsets!
      end
 
@@ -208,17 +233,51 @@ module Karafka
        @mutex.synchronize do
          @closed = false
          @offsetting = false
+          @paused_tpls.clear
          @kafka = build_consumer
        end
      end
 
      private
 
+      # When we cannot store an offset, it means we no longer own the partition
+      #
+      # Non thread-safe offset storing method
+      # @param message [Karafka::Messages::Message]
+      # @return [Boolean] true if we could store the offset (if we still own the partition)
+      def internal_store_offset(message)
+        @offsetting = true
+        @kafka.store_offset(message)
+        true
+      rescue Rdkafka::RdkafkaError => e
+        return false if e.code == :assignment_lost
+        return false if e.code == :state
+
+        raise e
+      end
+
+      # Non thread-safe message committing method
+      # @param async [Boolean] should the commit happen async or sync (async by default)
+      # @return [Boolean] true if offset commit worked, false if we've lost the assignment
+      def internal_commit_offsets(async: true)
+        return true unless @offsetting
+
+        @kafka.commit(nil, async)
+        @offsetting = false
+
+        true
+      rescue Rdkafka::RdkafkaError => e
+        return false if e.code == :assignment_lost
+        return false if e.code == :no_offset
+
+        raise e
+      end
+
      # Commits the stored offsets in a sync way and closes the consumer.
      def close
-        commit_offsets!
-
        @mutex.synchronize do
+          internal_commit_offsets(async: false)
+
          @closed = true
 
          # Remove callbacks runners that were registered
@@ -227,7 +286,8 @@ module Karafka
 
          @kafka.close
          @buffer.clear
-          @rebalance_manager.clear
+          # @note We do not clear rebalance manager here as we may still have revocation info here
+          #   that we want to consider valid prior to running another reconnection
        end
      end
 
@@ -280,7 +340,13 @@ module Karafka
 
        time_poll.backoff
 
-        retry
+        # We return nil, so we do not restart until running the whole loop
+        # This allows us to run revocation jobs and other things and we will pick up new work
+        # next time after dispatching all the things that are needed
+        #
+        # If we would retry here, the client reset would become transparent and we would not have
+        # a chance to take any actions
+        nil
      end
 
      # Builds a new rdkafka consumer instance based on the subscription group configuration
@@ -323,7 +389,7 @@ module Karafka
      #   we are no longer responsible in a given process for processing those messages and they
      #   should have been picked up by a different process.
      def remove_revoked_and_duplicated_messages
-        @rebalance_manager.revoked_partitions.each do |topic, partitions|
+        @rebalance_manager.lost_partitions.each do |topic, partitions|
          partitions.each do |partition|
            @buffer.delete(topic, partition)
          end
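
The two internal_* methods above share one pattern: selected Rdkafka error codes are treated as "we no longer own the assignment" and surface as a false return value instead of an exception. A hedged sketch of that pattern; the helper name is invented for illustration and is not part of the client:

```ruby
# Generic form of the rescue clauses used by internal_store_offset and
# internal_commit_offsets: swallow the listed error codes, re-raise anything else.
def false_on_codes(ignored_codes)
  yield
  true
rescue Rdkafka::RdkafkaError => e
  return false if ignored_codes.include?(e.code)

  raise e
end

# false_on_codes(%i[assignment_lost state]) { @kafka.store_offset(message) }
# false_on_codes(%i[assignment_lost no_offset]) { @kafka.commit(nil, async) }
```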
data/lib/karafka/connection/listener.rb
@@ -10,17 +10,23 @@ module Karafka
    class Listener
      include Helpers::Async
 
+      # Can be useful for logging
+      # @return [String] id of this listener
+      attr_reader :id
+
      # @param subscription_group [Karafka::Routing::SubscriptionGroup]
      # @param jobs_queue [Karafka::Processing::JobsQueue] queue where we should push work
      # @return [Karafka::Connection::Listener] listener instance
      def initialize(subscription_group, jobs_queue)
+        @id = SecureRandom.uuid
        @subscription_group = subscription_group
        @jobs_queue = jobs_queue
-        @pauses_manager = PausesManager.new
+        @jobs_builder = ::Karafka::App.config.internal.processing.jobs_builder
+        @coordinators = Processing::CoordinatorsBuffer.new
        @client = Client.new(@subscription_group)
        @executors = Processing::ExecutorsBuffer.new(@client, subscription_group)
        # We reference scheduler here as it is much faster than fetching this each time
-        @scheduler = ::Karafka::App.config.internal.scheduler
+        @scheduler = ::Karafka::App.config.internal.processing.scheduler
        # We keep one buffer for messages to preserve memory and not allocate extra objects
        # We can do this that way because we always first schedule jobs using messages before we
        # fetch another batch.
@@ -62,16 +68,20 @@ module Karafka
 
          resume_paused_partitions
 
-          # We need to fetch data before we revoke lost partitions details as during the polling
-          # the callbacks for tracking lost partitions are triggered. Otherwise we would be always
-          # one batch behind.
-          poll_and_remap_messages
-
          Karafka.monitor.instrument(
            'connection.listener.fetch_loop.received',
            caller: self,
            messages_buffer: @messages_buffer
-          )
+          ) do
+            # We need to fetch data before we revoke lost partitions details as during the polling
+            # the callbacks for tracking lost partitions are triggered. Otherwise we would be
+            # always one batch behind.
+            poll_and_remap_messages
+          end
+
+          # This will ensure, that in the next poll, we continue processing (if we get them back)
+          # partitions that we have paused
+          resume_assigned_partitions
 
          # If there were revoked partitions, we need to wait on their jobs to finish before
          # distributing consuming jobs as upon revoking, we might get assigned to the same
@@ -80,6 +90,9 @@ module Karafka
          build_and_schedule_revoke_lost_partitions_jobs
 
          # We wait only on jobs from our subscription group. Other groups are independent.
+          # This will block on revoked jobs until they are finished. Those are not meant to last
+          # long and should not have any bigger impact on the system. Doing this in a blocking way
+          # simplifies the overall design and prevents from race conditions
          wait
 
          build_and_schedule_consumption_jobs
@@ -130,7 +143,7 @@ module Karafka
 
      # Resumes processing of partitions that were paused due to an error.
      def resume_paused_partitions
-        @pauses_manager.resume do |topic, partition|
+        @coordinators.resume do |topic, partition|
          @client.resume(topic, partition)
        end
      end
@@ -146,9 +159,23 @@ module Karafka
 
        revoked_partitions.each do |topic, partitions|
          partitions.each do |partition|
-            pause_tracker = @pauses_manager.fetch(topic, partition)
-            executor = @executors.fetch(topic, partition, pause_tracker)
-            jobs << Processing::Jobs::Revoked.new(executor)
+            # We revoke the coordinator here, so we do not have to revoke it in the revoke job
+            # itself (this happens prior to scheduling those jobs)
+            @coordinators.revoke(topic, partition)
+
+            # There may be a case where we have lost partition of which data we have never
+            # processed (if it was assigned and revoked really fast), thus we may not have it
+            # here. In cases like this, we do not run a revocation job
+            @executors.find_all(topic, partition).each do |executor|
+              jobs << @jobs_builder.revoked(executor)
+            end
+
+            # We need to remove all the executors of a given topic partition that we have lost, so
+            # next time we pick up it's work, new executors kick in. This may be needed especially
+            # for LRJ where we could end up with a race condition
+            # This revocation needs to happen after the jobs are scheduled, otherwise they would
+            # be scheduled with new executors instead of old
+            @executors.revoke(topic, partition)
          end
        end
 
@@ -160,7 +187,7 @@ module Karafka
        jobs = []
 
        @executors.each do |_, _, executor|
-          jobs << Processing::Jobs::Shutdown.new(executor)
+          jobs << @jobs_builder.shutdown(executor)
        end
 
        @scheduler.schedule_shutdown(@jobs_queue, jobs)
@@ -177,6 +204,17 @@ module Karafka
        )
      end
 
+      # Revoked partition needs to be resumed if we were processing them earlier. This will do
+      # nothing to things that we are planning to process. Without this, things we get
+      # re-assigned would not be polled.
+      def resume_assigned_partitions
+        @client.rebalance_manager.assigned_partitions.each do |topic, partitions|
+          partitions.each do |partition|
+            @client.resume(topic, partition)
+          end
+        end
+      end
+
      # Takes the messages per topic partition and enqueues processing jobs in threads using
      #   given scheduler.
      def build_and_schedule_consumption_jobs
@@ -185,13 +223,17 @@ module Karafka
        jobs = []
 
        @messages_buffer.each do |topic, partition, messages|
-          pause = @pauses_manager.fetch(topic, partition)
+          coordinator = @coordinators.find_or_create(topic, partition)
+
+          # Start work coordination for this topic partition
+          coordinator.start
 
-          next if pause.paused?
+          # Count the job we're going to create here
+          coordinator.increment
 
-          executor = @executors.fetch(topic, partition, pause)
+          executor = @executors.find_or_create(topic, partition, 0)
 
-          jobs << Processing::Jobs::Consume.new(executor, messages)
+          jobs << @jobs_builder.consume(executor, messages, coordinator)
        end
 
        @scheduler.schedule_consumption(@jobs_queue, jobs)
@@ -227,7 +269,7 @@ module Karafka
        @jobs_queue.wait(@subscription_group.id)
        @jobs_queue.clear(@subscription_group.id)
        @client.reset
-        @pauses_manager = PausesManager.new
+        @coordinators.reset
        @executors = Processing::ExecutorsBuffer.new(@client, @subscription_group)
      end
    end
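
The listener no longer instantiates job classes directly; it asks the jobs builder configured under config.internal.processing.jobs_builder, which is what lets Karafka Pro (see data/lib/karafka/pro/processing/jobs_builder.rb in the file list) swap in its own job types. A hedged sketch of the interface this listener expects; the signatures are inferred from the calls above, not copied from data/lib/karafka/processing/jobs_builder.rb:

```ruby
# Assumed shape of a jobs builder, based only on how the listener calls it
class SketchedJobsBuilder
  # Builds a consumption job for a topic partition batch
  def consume(executor, messages, coordinator)
    Karafka::Processing::Jobs::Consume.new(executor, messages, coordinator)
  end

  # Builds a job that runs the revocation flow on a lost partition
  def revoked(executor)
    Karafka::Processing::Jobs::Revoked.new(executor)
  end

  # Builds a job that runs the shutdown flow for an executor
  def shutdown(executor)
    Karafka::Processing::Jobs::Shutdown.new(executor)
  end
end
```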
data/lib/karafka/connection/pauses_manager.rb
@@ -25,6 +25,14 @@ module Karafka
        )
      end
 
+      # Revokes pause tracker for a given topic partition
+      #
+      # @param topic [String] topic name
+      # @param partition [Integer] partition number
+      def revoke(topic, partition)
+        @pauses[topic].delete(partition)
+      end
+
      # Resumes processing of partitions for which pause time has ended.
      #
      # @yieldparam [String] topic name
data/lib/karafka/connection/rebalance_manager.rb
@@ -18,13 +18,15 @@ module Karafka
      # Empty array for internal usage not to create new objects
      EMPTY_ARRAY = [].freeze
 
+      attr_reader :assigned_partitions, :revoked_partitions
+
      private_constant :EMPTY_ARRAY
 
      # @return [RebalanceManager]
      def initialize
        @assigned_partitions = {}
        @revoked_partitions = {}
-        @lost_partitions = {}
+        @changed = false
      end
 
      # Resets the rebalance manager state
@@ -33,26 +35,12 @@ module Karafka
      def clear
        @assigned_partitions.clear
        @revoked_partitions.clear
-        @lost_partitions.clear
-      end
-
-      # @return [Hash<String, Array<Integer>>] hash where the keys are the names of topics for
-      #   which we've lost partitions and array with ids of the partitions as the value
-      # @note We do not consider as lost topics and partitions that got revoked and assigned
-      def revoked_partitions
-        return @revoked_partitions if @revoked_partitions.empty?
-        return @lost_partitions unless @lost_partitions.empty?
-
-        @revoked_partitions.each do |topic, partitions|
-          @lost_partitions[topic] = partitions - @assigned_partitions.fetch(topic, EMPTY_ARRAY)
-        end
-
-        @lost_partitions
+        @changed = false
      end
 
-      # @return [Boolean] true if any partitions were revoked
-      def revoked_partitions?
-        !revoked_partitions.empty?
+      # @return [Boolean] indicates a state change in the partitions assignment
+      def changed?
+        @changed
      end
 
      # Callback that kicks in inside of rdkafka, when new partitions are assigned.
@@ -62,6 +50,7 @@ module Karafka
      # @param partitions [Rdkafka::Consumer::TopicPartitionList]
      def on_partitions_assigned(_, partitions)
        @assigned_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
+        @changed = true
      end
 
      # Callback that kicks in inside of rdkafka, when partitions are revoked.
@@ -71,6 +60,18 @@ module Karafka
      # @param partitions [Rdkafka::Consumer::TopicPartitionList]
      def on_partitions_revoked(_, partitions)
        @revoked_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
+        @changed = true
+      end
+
+      # We consider as lost only partitions that were taken away and not re-assigned back to us
+      def lost_partitions
+        lost_partitions = {}
+
+        revoked_partitions.each do |topic, partitions|
+          lost_partitions[topic] = partitions - assigned_partitions.fetch(topic, EMPTY_ARRAY)
+        end
+
+        lost_partitions
      end
    end
  end
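
A worked example of the lost_partitions calculation above: partitions that were revoked but re-assigned back to this process in the same rebalance are not treated as lost (topic name and partition ids below are made up):

```ruby
revoked  = { 'orders' => [0, 1, 2] } # from on_partitions_revoked
assigned = { 'orders' => [2] }       # from on_partitions_assigned

lost = {}
revoked.each do |topic, partitions|
  lost[topic] = partitions - assigned.fetch(topic, [])
end

lost # => { "orders" => [0, 1] }
```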