karafka 2.0.0.beta2 → 2.0.0.beta5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.github/workflows/ci.yml +18 -15
  4. data/CHANGELOG.md +49 -0
  5. data/Gemfile.lock +8 -8
  6. data/bin/benchmarks +2 -2
  7. data/bin/integrations +44 -15
  8. data/bin/scenario +29 -0
  9. data/bin/{stress → stress_many} +0 -0
  10. data/bin/stress_one +13 -0
  11. data/bin/wait_for_kafka +20 -0
  12. data/docker-compose.yml +28 -11
  13. data/karafka.gemspec +2 -2
  14. data/lib/karafka/active_job/routing/extensions.rb +12 -2
  15. data/lib/karafka/app.rb +2 -1
  16. data/lib/karafka/base_consumer.rb +75 -45
  17. data/lib/karafka/connection/client.rb +88 -22
  18. data/lib/karafka/connection/listener.rb +60 -18
  19. data/lib/karafka/connection/pauses_manager.rb +8 -0
  20. data/lib/karafka/connection/rebalance_manager.rb +20 -19
  21. data/lib/karafka/contracts/config.rb +17 -3
  22. data/lib/karafka/contracts/server_cli_options.rb +1 -1
  23. data/lib/karafka/errors.rb +3 -0
  24. data/lib/karafka/instrumentation/logger_listener.rb +34 -10
  25. data/lib/karafka/instrumentation/monitor.rb +3 -1
  26. data/lib/karafka/licenser.rb +26 -7
  27. data/lib/karafka/pro/active_job/consumer.rb +30 -9
  28. data/lib/karafka/pro/active_job/dispatcher.rb +9 -9
  29. data/lib/karafka/pro/active_job/job_options_contract.rb +9 -9
  30. data/lib/karafka/pro/base_consumer.rb +73 -0
  31. data/lib/karafka/pro/loader.rb +38 -20
  32. data/lib/karafka/pro/performance_tracker.rb +9 -9
  33. data/lib/karafka/pro/processing/coordinator.rb +12 -0
  34. data/lib/karafka/pro/processing/jobs/consume_non_blocking.rb +10 -11
  35. data/lib/karafka/pro/processing/jobs_builder.rb +32 -0
  36. data/lib/karafka/pro/processing/scheduler.rb +56 -0
  37. data/lib/karafka/pro/routing/extensions.rb +32 -0
  38. data/lib/karafka/processing/coordinator.rb +84 -0
  39. data/lib/karafka/processing/coordinators_buffer.rb +58 -0
  40. data/lib/karafka/processing/executor.rb +23 -9
  41. data/lib/karafka/processing/executors_buffer.rb +46 -15
  42. data/lib/karafka/processing/jobs/base.rb +8 -3
  43. data/lib/karafka/processing/jobs/consume.rb +11 -4
  44. data/lib/karafka/processing/jobs_builder.rb +29 -0
  45. data/lib/karafka/processing/result.rb +29 -0
  46. data/lib/karafka/processing/scheduler.rb +22 -0
  47. data/lib/karafka/processing/worker.rb +17 -9
  48. data/lib/karafka/routing/consumer_group.rb +1 -1
  49. data/lib/karafka/routing/subscription_group.rb +1 -1
  50. data/lib/karafka/routing/topic.rb +14 -0
  51. data/lib/karafka/setup/config.rb +19 -9
  52. data/lib/karafka/status.rb +1 -3
  53. data/lib/karafka/version.rb +1 -1
  54. data.tar.gz.sig +0 -0
  55. metadata +19 -7
  56. metadata.gz.sig +0 -0
  57. data/lib/karafka/pro/scheduler.rb +0 -54
  58. data/lib/karafka/scheduler.rb +0 -20
data/lib/karafka/base_consumer.rb
@@ -10,44 +10,74 @@ module Karafka
  attr_accessor :messages
  # @return [Karafka::Connection::Client] kafka connection client
  attr_accessor :client
- # @return [Karafka::TimeTrackers::Pause] current topic partition pause tracker
- attr_accessor :pause_tracker
+ # @return [Karafka::Processing::Coordinator] coordinator
+ attr_accessor :coordinator
  # @return [Waterdrop::Producer] producer instance
  attr_accessor :producer

+ # Can be used to run preparation code
+ #
+ # @private
+ # @note This should not be used by the end users as it is part of the lifecycle of things but
+ # not as part of the public api. This can act as a hook when creating non-blocking
+ # consumers and doing other advanced stuff
+ def on_before_consume; end
+
  # Executes the default consumer flow.
  #
+ # @return [Boolean] true if there was no exception, otherwise false.
+ #
  # @note We keep the seek offset tracking, and use it to compensate for async offset flushing
  # that may not yet kick in when error occurs. That way we pause always on the last processed
  # message.
  def on_consume
  Karafka.monitor.instrument('consumer.consumed', caller: self) do
  consume
-
- pause_tracker.reset
-
- # Mark as consumed only if manual offset management is not on
- next if topic.manual_offset_management
-
- # We use the non-blocking one here. If someone needs the blocking one, can implement it
- # with manual offset management
- mark_as_consumed(messages.last)
  end
+
+ @coordinator.consumption(self).success!
  rescue StandardError => e
+ @coordinator.consumption(self).failure!
+
  Karafka.monitor.instrument(
  'error.occurred',
  error: e,
  caller: self,
  type: 'consumer.consume.error'
  )
+ ensure
+ # We need to decrease number of jobs that this coordinator coordinates as it has finished
+ @coordinator.decrement
+ end
+
+ # @private
+ # @note This should not be used by the end users as it is part of the lifecycle of things but
+ # not as part of the public api.
+ def on_after_consume
+ return if revoked?
+
+ if @coordinator.success?
+ coordinator.pause_tracker.reset
+
+ # Mark as consumed only if manual offset management is not on
+ return if topic.manual_offset_management?

- pause(@seek_offset || messages.first.offset)
+ # We use the non-blocking one here. If someone needs the blocking one, can implement it
+ # with manual offset management
+ mark_as_consumed(messages.last)
+ else
+ pause(@seek_offset || messages.first.offset)
+ end
  end

  # Trigger method for running on shutdown.
  #
  # @private
  def on_revoked
+ coordinator.revoke
+
+ resume
+
  Karafka.monitor.instrument('consumer.revoked', caller: self) do
  revoked
  end
@@ -76,31 +106,8 @@ module Karafka
  )
  end

- # Can be used to run preparation code
- #
- # @private
- # @note This should not be used by the end users as it is part of the lifecycle of things but
- # not as part of the public api. This can act as a hook when creating non-blocking
- # consumers and doing other advanced stuff
- def on_prepared
- Karafka.monitor.instrument('consumer.prepared', caller: self) do
- prepared
- end
- rescue StandardError => e
- Karafka.monitor.instrument(
- 'error.occurred',
- error: e,
- caller: self,
- type: 'consumer.prepared.error'
- )
- end
-
  private

- # Method that gets called in the blocking flow allowing to setup any type of resources or to
- # send additional commands to Kafka before the proper execution starts.
- def prepared; end
-
  # Method that will perform business logic and on data received from Kafka (it will consume
  # the data)
  # @note This method needs bo be implemented in a subclass. We stub it here as a failover if
@@ -120,21 +127,40 @@ module Karafka
  # Marks message as consumed in an async way.
  #
  # @param message [Messages::Message] last successfully processed message.
+ # @return [Boolean] true if we were able to mark the offset, false otherwise. False indicates
+ # that we were not able and that we have lost the partition.
+ #
  # @note We keep track of this offset in case we would mark as consumed and got error when
  # processing another message. In case like this we do not pause on the message we've already
  # processed but rather at the next one. This applies to both sync and async versions of this
  # method.
  def mark_as_consumed(message)
- client.mark_as_consumed(message)
+ unless client.mark_as_consumed(message)
+ coordinator.revoke
+
+ return false
+ end
+
  @seek_offset = message.offset + 1
+
+ true
  end

  # Marks message as consumed in a sync way.
  #
  # @param message [Messages::Message] last successfully processed message.
+ # @return [Boolean] true if we were able to mark the offset, false otherwise. False indicates
+ # that we were not able and that we have lost the partition.
  def mark_as_consumed!(message)
- client.mark_as_consumed!(message)
+ unless client.mark_as_consumed!(message)
+ coordinator.revoke
+
+ return false
+ end
+
  @seek_offset = message.offset + 1
+
+ true
  end

  # Pauses processing on a given offset for the current topic partition
@@ -144,23 +170,20 @@ module Karafka
  # @param timeout [Integer, nil] how long in milliseconds do we want to pause or nil to use the
  # default exponential pausing strategy defined for retries
  def pause(offset, timeout = nil)
+ timeout ? coordinator.pause_tracker.pause(timeout) : coordinator.pause_tracker.pause
+
  client.pause(
  messages.metadata.topic,
  messages.metadata.partition,
  offset
  )
-
- timeout ? pause_tracker.pause(timeout) : pause_tracker.pause
  end

  # Resumes processing of the current topic partition
  def resume
- client.resume(
- messages.metadata.topic,
- messages.metadata.partition
- )
-
- pause_tracker.expire
+ # This is sufficient to expire a partition pause, as with it will be resumed by the listener
+ # thread before the next poll.
+ coordinator.pause_tracker.expire
  end

  # Seeks in the context of current topic and partition
@@ -175,5 +198,12 @@ module Karafka
  )
  )
  end
+
+ # @return [Boolean] true if partition was revoked from the current consumer
+ # @note We know that partition got revoked because when we try to mark message as consumed,
+ # unless if is successful, it will return false
+ def revoked?
+ coordinator.revoked?
+ end
  end
  end
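
Note on the `base_consumer.rb` changes above: offset handling and pausing move out of `on_consume` into the coordinator-driven `on_after_consume`, and `mark_as_consumed`/`mark_as_consumed!` now return a boolean that turns false once the partition has been lost, with `revoked?` exposing the same state. A minimal, hypothetical consumer sketch of that surface (the class name and the `persist` helper are illustrative, not part of this diff):

```ruby
# Hypothetical consumer using the beta3+ API shown above: mark_as_consumed
# returns false once the partition is lost, and revoked? exposes the
# coordinator state, so a long batch can stop early after a rebalance.
class EventsConsumer < Karafka::BaseConsumer
  def consume
    messages.each do |message|
      # Stop early if the partition was taken away during a rebalance
      break if revoked?

      persist(message.payload)

      # A false return means the offset could not be stored because we no
      # longer own this topic partition
      break unless mark_as_consumed(message)
    end
  end

  private

  # Placeholder for the actual business logic
  def persist(payload); end
end
```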
data/lib/karafka/connection/client.rb
@@ -36,6 +36,12 @@ module Karafka
  # Marks if we need to offset. If we did not store offsets, we should not commit the offset
  # position as it will crash rdkafka
  @offsetting = false
+ # We need to keep track of what we have paused for resuming
+ # In case we loose partition, we still need to resume it, otherwise it won't be fetched
+ # again if we get reassigned to it later on. We need to keep them as after revocation we
+ # no longer may be able to fetch them from Kafka. We could build them but it is easier
+ # to just keep them here and use if needed when cannot be obtained
+ @paused_tpls = Hash.new { |h, k| h[k] = {} }
  end

  # Fetches messages within boundaries defined by the settings (time, size, topics, etc).
@@ -45,12 +51,13 @@ module Karafka
  # @note This method should not be executed from many threads at the same time
  def batch_poll
  time_poll = TimeTrackers::Poll.new(@subscription_group.max_wait_time)
- time_poll.start

  @buffer.clear
  @rebalance_manager.clear

  loop do
+ time_poll.start
+
  # Don't fetch more messages if we do not have any time left
  break if time_poll.exceeded?
  # Don't fetch more messages if we've fetched max as we've wanted
@@ -69,7 +76,11 @@ module Karafka
  # If partition revocation happens, we need to remove messages from revoked partitions
  # as well as ensure we do not have duplicated due to the offset reset for partitions
  # that we got assigned
- remove_revoked_and_duplicated_messages if @rebalance_manager.revoked_partitions?
+ # We also do early break, so the information about rebalance is used as soon as possible
+ if @rebalance_manager.changed?
+ remove_revoked_and_duplicated_messages
+ break
+ end

  # Finally once we've (potentially) removed revoked, etc, if no messages were returned
  # we can break.
@@ -86,8 +97,7 @@ module Karafka
  # @param message [Karafka::Messages::Message]
  def store_offset(message)
  @mutex.synchronize do
- @offsetting = true
- @kafka.store_offset(message)
+ internal_store_offset(message)
  end
  end

@@ -104,14 +114,7 @@ module Karafka
  def commit_offsets(async: true)
  @mutex.lock

- return unless @offsetting
-
- @kafka.commit(nil, async)
- @offsetting = false
- rescue Rdkafka::RdkafkaError => e
- return if e.code == :no_offset
-
- raise e
+ internal_commit_offsets(async: async)
  ensure
  @mutex.unlock
  end
@@ -128,7 +131,11 @@ module Karafka
  #
  # @param message [Messages::Message, Messages::Seek] message to which we want to seek to
  def seek(message)
+ @mutex.lock
+
  @kafka.seek(message)
+ ensure
+ @mutex.unlock
  end

  # Pauses given partition and moves back to last successful offset processed.
@@ -144,15 +151,21 @@ module Karafka
  # Do not pause if the client got closed, would not change anything
  return if @closed

+ pause_msg = Messages::Seek.new(topic, partition, offset)
+
+ internal_commit_offsets(async: false)
+
+ # Here we do not use our cached tpls because we should not try to pause something we do
+ # not own anymore.
  tpl = topic_partition_list(topic, partition)

  return unless tpl

- @kafka.pause(tpl)
+ @paused_tpls[topic][partition] = tpl

- pause_msg = Messages::Seek.new(topic, partition, offset)
+ @kafka.pause(tpl)

- seek(pause_msg)
+ @kafka.seek(pause_msg)
  ensure
  @mutex.unlock
  end
@@ -166,9 +179,18 @@ module Karafka

  return if @closed

- tpl = topic_partition_list(topic, partition)
+ # Always commit synchronously offsets if any when we resume
+ # This prevents resuming without offset in case it would not be committed prior
+ # We can skip performance penalty since resuming should not happen too often
+ internal_commit_offsets(async: false)
+
+ # If we were not able, let's try to reuse the one we have (if we have)
+ tpl = topic_partition_list(topic, partition) || @paused_tpls[topic][partition]

  return unless tpl
+ # If we did not have it, it means we never paused this partition, thus no resume should
+ # happen in the first place
+ return unless @paused_tpls[topic].delete(partition)

  @kafka.resume(tpl)
  ensure
@@ -187,6 +209,7 @@ module Karafka
  # Marks given message as consumed.
  #
  # @param [Karafka::Messages::Message] message that we want to mark as processed
+ # @return [Boolean] true if successful. False if we no longer own given partition
  # @note This method won't trigger automatic offsets commits, rather relying on the offset
  # check-pointing trigger that happens with each batch processed
  def mark_as_consumed(message)
@@ -196,8 +219,10 @@ module Karafka
  # Marks a given message as consumed and commits the offsets in a blocking way.
  #
  # @param [Karafka::Messages::Message] message that we want to mark as processed
+ # @return [Boolean] true if successful. False if we no longer own given partition
  def mark_as_consumed!(message)
- mark_as_consumed(message)
+ return false unless mark_as_consumed(message)
+
  commit_offsets!
  end

@@ -208,17 +233,51 @@ module Karafka
  @mutex.synchronize do
  @closed = false
  @offsetting = false
+ @paused_tpls.clear
  @kafka = build_consumer
  end
  end

  private

+ # When we cannot store an offset, it means we no longer own the partition
+ #
+ # Non thread-safe offset storing method
+ # @param message [Karafka::Messages::Message]
+ # @return [Boolean] true if we could store the offset (if we still own the partition)
+ def internal_store_offset(message)
+ @offsetting = true
+ @kafka.store_offset(message)
+ true
+ rescue Rdkafka::RdkafkaError => e
+ return false if e.code == :assignment_lost
+ return false if e.code == :state
+
+ raise e
+ end
+
+ # Non thread-safe message committing method
+ # @param async [Boolean] should the commit happen async or sync (async by default)
+ # @return [Boolean] true if offset commit worked, false if we've lost the assignment
+ def internal_commit_offsets(async: true)
+ return true unless @offsetting
+
+ @kafka.commit(nil, async)
+ @offsetting = false
+
+ true
+ rescue Rdkafka::RdkafkaError => e
+ return false if e.code == :assignment_lost
+ return false if e.code == :no_offset
+
+ raise e
+ end
+
  # Commits the stored offsets in a sync way and closes the consumer.
  def close
- commit_offsets!
-
  @mutex.synchronize do
+ internal_commit_offsets(async: false)
+
  @closed = true

  # Remove callbacks runners that were registered
@@ -227,7 +286,8 @@ module Karafka

  @kafka.close
  @buffer.clear
- @rebalance_manager.clear
+ # @note We do not clear rebalance manager here as we may still have revocation info here
+ # that we want to consider valid prior to running another reconnection
  end
  end

@@ -280,7 +340,13 @@ module Karafka

  time_poll.backoff

- retry
+ # We return nil, so we do not restart until running the whole loop
+ # This allows us to run revocation jobs and other things and we will pick up new work
+ # next time after dispatching all the things that are needed
+ #
+ # If we would retry here, the client reset would become transparent and we would not have
+ # a chance to take any actions
+ nil
  end

  # Builds a new rdkafka consumer instance based on the subscription group configuration
@@ -323,7 +389,7 @@ module Karafka
  # we are no longer responsible in a given process for processing those messages and they
  # should have been picked up by a different process.
  def remove_revoked_and_duplicated_messages
- @rebalance_manager.revoked_partitions.each do |topic, partitions|
+ @rebalance_manager.lost_partitions.each do |topic, partitions|
  partitions.each do |partition|
  @buffer.delete(topic, partition)
  end
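
The new private `internal_store_offset`/`internal_commit_offsets` helpers above translate rebalance-related `Rdkafka::RdkafkaError` codes into boolean results instead of raising, which is what lets `mark_as_consumed` report a lost partition. A standalone sketch of that error-mapping pattern, assuming rdkafka-ruby's `store_offset` and symbolic error codes (the `OffsetStoring` module is illustrative, not library code):

```ruby
require 'rdkafka'

# Sketch of the pattern used by internal_store_offset: expected
# rebalance-related rdkafka errors become `false`, anything else re-raises.
module OffsetStoring
  # @param consumer [Rdkafka::Consumer]
  # @param message [#topic, #partition, #offset] a consumed message
  # @return [Boolean] true when stored, false when the assignment was lost
  def self.store(consumer, message)
    consumer.store_offset(message)
    true
  rescue Rdkafka::RdkafkaError => e
    # :assignment_lost and :state indicate we no longer own this partition
    return false if %i[assignment_lost state].include?(e.code)

    raise
  end
end
```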
data/lib/karafka/connection/listener.rb
@@ -10,17 +10,23 @@ module Karafka
  class Listener
  include Helpers::Async

+ # Can be useful for logging
+ # @return [String] id of this listener
+ attr_reader :id
+
  # @param subscription_group [Karafka::Routing::SubscriptionGroup]
  # @param jobs_queue [Karafka::Processing::JobsQueue] queue where we should push work
  # @return [Karafka::Connection::Listener] listener instance
  def initialize(subscription_group, jobs_queue)
+ @id = SecureRandom.uuid
  @subscription_group = subscription_group
  @jobs_queue = jobs_queue
- @pauses_manager = PausesManager.new
+ @jobs_builder = ::Karafka::App.config.internal.processing.jobs_builder
+ @coordinators = Processing::CoordinatorsBuffer.new
  @client = Client.new(@subscription_group)
  @executors = Processing::ExecutorsBuffer.new(@client, subscription_group)
  # We reference scheduler here as it is much faster than fetching this each time
- @scheduler = ::Karafka::App.config.internal.scheduler
+ @scheduler = ::Karafka::App.config.internal.processing.scheduler
  # We keep one buffer for messages to preserve memory and not allocate extra objects
  # We can do this that way because we always first schedule jobs using messages before we
  # fetch another batch.
@@ -62,16 +68,20 @@ module Karafka

  resume_paused_partitions

- # We need to fetch data before we revoke lost partitions details as during the polling
- # the callbacks for tracking lost partitions are triggered. Otherwise we would be always
- # one batch behind.
- poll_and_remap_messages
-
  Karafka.monitor.instrument(
  'connection.listener.fetch_loop.received',
  caller: self,
  messages_buffer: @messages_buffer
- )
+ ) do
+ # We need to fetch data before we revoke lost partitions details as during the polling
+ # the callbacks for tracking lost partitions are triggered. Otherwise we would be
+ # always one batch behind.
+ poll_and_remap_messages
+ end
+
+ # This will ensure, that in the next poll, we continue processing (if we get them back)
+ # partitions that we have paused
+ resume_assigned_partitions

  # If there were revoked partitions, we need to wait on their jobs to finish before
  # distributing consuming jobs as upon revoking, we might get assigned to the same
@@ -80,6 +90,9 @@ module Karafka
  build_and_schedule_revoke_lost_partitions_jobs

  # We wait only on jobs from our subscription group. Other groups are independent.
+ # This will block on revoked jobs until they are finished. Those are not meant to last
+ # long and should not have any bigger impact on the system. Doing this in a blocking way
+ # simplifies the overall design and prevents from race conditions
  wait

  build_and_schedule_consumption_jobs
@@ -130,7 +143,7 @@ module Karafka

  # Resumes processing of partitions that were paused due to an error.
  def resume_paused_partitions
- @pauses_manager.resume do |topic, partition|
+ @coordinators.resume do |topic, partition|
  @client.resume(topic, partition)
  end
  end
@@ -146,9 +159,23 @@ module Karafka

  revoked_partitions.each do |topic, partitions|
  partitions.each do |partition|
- pause_tracker = @pauses_manager.fetch(topic, partition)
- executor = @executors.fetch(topic, partition, pause_tracker)
- jobs << Processing::Jobs::Revoked.new(executor)
+ # We revoke the coordinator here, so we do not have to revoke it in the revoke job
+ # itself (this happens prior to scheduling those jobs)
+ @coordinators.revoke(topic, partition)
+
+ # There may be a case where we have lost partition of which data we have never
+ # processed (if it was assigned and revoked really fast), thus we may not have it
+ # here. In cases like this, we do not run a revocation job
+ @executors.find_all(topic, partition).each do |executor|
+ jobs << @jobs_builder.revoked(executor)
+ end
+
+ # We need to remove all the executors of a given topic partition that we have lost, so
+ # next time we pick up it's work, new executors kick in. This may be needed especially
+ # for LRJ where we could end up with a race condition
+ # This revocation needs to happen after the jobs are scheduled, otherwise they would
+ # be scheduled with new executors instead of old
+ @executors.revoke(topic, partition)
  end
  end

@@ -160,7 +187,7 @@ module Karafka
  jobs = []

  @executors.each do |_, _, executor|
- jobs << Processing::Jobs::Shutdown.new(executor)
+ jobs << @jobs_builder.shutdown(executor)
  end

  @scheduler.schedule_shutdown(@jobs_queue, jobs)
@@ -177,6 +204,17 @@ module Karafka
  )
  end

+ # Revoked partition needs to be resumed if we were processing them earlier. This will do
+ # nothing to things that we are planning to process. Without this, things we get
+ # re-assigned would not be polled.
+ def resume_assigned_partitions
+ @client.rebalance_manager.assigned_partitions.each do |topic, partitions|
+ partitions.each do |partition|
+ @client.resume(topic, partition)
+ end
+ end
+ end
+
  # Takes the messages per topic partition and enqueues processing jobs in threads using
  # given scheduler.
  def build_and_schedule_consumption_jobs
@@ -185,13 +223,17 @@ module Karafka
  jobs = []

  @messages_buffer.each do |topic, partition, messages|
- pause = @pauses_manager.fetch(topic, partition)
+ coordinator = @coordinators.find_or_create(topic, partition)
+
+ # Start work coordination for this topic partition
+ coordinator.start

- next if pause.paused?
+ # Count the job we're going to create here
+ coordinator.increment

- executor = @executors.fetch(topic, partition, pause)
+ executor = @executors.find_or_create(topic, partition, 0)

- jobs << Processing::Jobs::Consume.new(executor, messages)
+ jobs << @jobs_builder.consume(executor, messages, coordinator)
  end

  @scheduler.schedule_consumption(@jobs_queue, jobs)
@@ -227,7 +269,7 @@ module Karafka
  @jobs_queue.wait(@subscription_group.id)
  @jobs_queue.clear(@subscription_group.id)
  @client.reset
- @pauses_manager = PausesManager.new
+ @coordinators.reset
  @executors = Processing::ExecutorsBuffer.new(@client, @subscription_group)
  end
  end
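
The listener now resolves both the jobs builder and the scheduler from `config.internal.processing` instead of hard-coding classes, which is what allows the Pro loader to swap in its own implementations. A hedged sketch of a scheduler shaped to the two calls visible above, `schedule_consumption` and `schedule_shutdown` (the class name, the plain FIFO ordering, and the wiring comment are assumptions, not part of this diff):

```ruby
# Illustrative custom scheduler matching the calls the listener makes above.
# It assumes the jobs queue accepts jobs via #<<, as the default scheduler
# in this release appears to do.
class FifoScheduler
  # @param queue [Karafka::Processing::JobsQueue]
  # @param jobs [Array<Karafka::Processing::Jobs::Base>]
  def schedule_consumption(queue, jobs)
    # Push jobs in the order they were built
    jobs.each { |job| queue << job }
  end

  # Shutdown jobs are scheduled the same way in this sketch
  alias schedule_shutdown schedule_consumption
end

# Hypothetical wiring, assuming the internal.processing.scheduler setting
# shown in the diff can be overridden during setup:
# Karafka::App.config.internal.processing.scheduler = FifoScheduler.new
```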
data/lib/karafka/connection/pauses_manager.rb
@@ -25,6 +25,14 @@ module Karafka
  )
  end

+ # Revokes pause tracker for a given topic partition
+ #
+ # @param topic [String] topic name
+ # @param partition [Integer] partition number
+ def revoke(topic, partition)
+ @pauses[topic].delete(partition)
+ end
+
  # Resumes processing of partitions for which pause time has ended.
  #
  # @yieldparam [String] topic name
data/lib/karafka/connection/rebalance_manager.rb
@@ -18,13 +18,15 @@ module Karafka
  # Empty array for internal usage not to create new objects
  EMPTY_ARRAY = [].freeze

+ attr_reader :assigned_partitions, :revoked_partitions
+
  private_constant :EMPTY_ARRAY

  # @return [RebalanceManager]
  def initialize
  @assigned_partitions = {}
  @revoked_partitions = {}
- @lost_partitions = {}
+ @changed = false
  end

  # Resets the rebalance manager state
@@ -33,26 +35,12 @@ module Karafka
  def clear
  @assigned_partitions.clear
  @revoked_partitions.clear
- @lost_partitions.clear
- end
-
- # @return [Hash<String, Array<Integer>>] hash where the keys are the names of topics for
- # which we've lost partitions and array with ids of the partitions as the value
- # @note We do not consider as lost topics and partitions that got revoked and assigned
- def revoked_partitions
- return @revoked_partitions if @revoked_partitions.empty?
- return @lost_partitions unless @lost_partitions.empty?
-
- @revoked_partitions.each do |topic, partitions|
- @lost_partitions[topic] = partitions - @assigned_partitions.fetch(topic, EMPTY_ARRAY)
- end
-
- @lost_partitions
+ @changed = false
  end

- # @return [Boolean] true if any partitions were revoked
- def revoked_partitions?
- !revoked_partitions.empty?
+ # @return [Boolean] indicates a state change in the partitions assignment
+ def changed?
+ @changed
  end

  # Callback that kicks in inside of rdkafka, when new partitions are assigned.
@@ -62,6 +50,7 @@ module Karafka
  # @param partitions [Rdkafka::Consumer::TopicPartitionList]
  def on_partitions_assigned(_, partitions)
  @assigned_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
+ @changed = true
  end

  # Callback that kicks in inside of rdkafka, when partitions are revoked.
@@ -71,6 +60,18 @@ module Karafka
  # @param partitions [Rdkafka::Consumer::TopicPartitionList]
  def on_partitions_revoked(_, partitions)
  @revoked_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
+ @changed = true
+ end
+
+ # We consider as lost only partitions that were taken away and not re-assigned back to us
+ def lost_partitions
+ lost_partitions = {}
+
+ revoked_partitions.each do |topic, partitions|
+ lost_partitions[topic] = partitions - assigned_partitions.fetch(topic, EMPTY_ARRAY)
+ end
+
+ lost_partitions
  end
  end
  end
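
`lost_partitions` above is now derived on demand as the revoked partitions minus those assigned back to the same process within the rebalance, so a partition that bounces straight back is not treated as lost. A worked example of that arithmetic with made-up topic and partition numbers:

```ruby
# Worked example of the lost_partitions set difference above, using
# made-up data: revoked during a rebalance minus re-assigned back to us.
revoked  = { 'events' => [0, 1, 2], 'logs' => [5] }
assigned = { 'events' => [2] } # partition 2 came straight back

lost = {}
revoked.each do |topic, partitions|
  lost[topic] = partitions - assigned.fetch(topic, [])
end

lost # => { 'events' => [0, 1], 'logs' => [5] }
```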