karafka 2.2.13 → 2.3.0.alpha1

Files changed (125)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.github/workflows/ci.yml +38 -12
  4. data/.ruby-version +1 -1
  5. data/CHANGELOG.md +161 -125
  6. data/Gemfile.lock +12 -12
  7. data/README.md +0 -2
  8. data/SECURITY.md +23 -0
  9. data/config/locales/errors.yml +7 -1
  10. data/config/locales/pro_errors.yml +22 -0
  11. data/docker-compose.yml +3 -1
  12. data/karafka.gemspec +2 -2
  13. data/lib/karafka/admin/acl.rb +287 -0
  14. data/lib/karafka/admin.rb +118 -16
  15. data/lib/karafka/app.rb +12 -3
  16. data/lib/karafka/base_consumer.rb +32 -31
  17. data/lib/karafka/cli/base.rb +1 -1
  18. data/lib/karafka/connection/client.rb +94 -84
  19. data/lib/karafka/connection/conductor.rb +28 -0
  20. data/lib/karafka/connection/listener.rb +165 -46
  21. data/lib/karafka/connection/listeners_batch.rb +5 -11
  22. data/lib/karafka/connection/manager.rb +72 -0
  23. data/lib/karafka/connection/messages_buffer.rb +12 -0
  24. data/lib/karafka/connection/proxy.rb +17 -0
  25. data/lib/karafka/connection/status.rb +75 -0
  26. data/lib/karafka/contracts/config.rb +14 -10
  27. data/lib/karafka/contracts/consumer_group.rb +9 -1
  28. data/lib/karafka/contracts/topic.rb +3 -1
  29. data/lib/karafka/errors.rb +13 -0
  30. data/lib/karafka/instrumentation/assignments_tracker.rb +96 -0
  31. data/lib/karafka/instrumentation/callbacks/rebalance.rb +10 -7
  32. data/lib/karafka/instrumentation/logger_listener.rb +3 -9
  33. data/lib/karafka/instrumentation/notifications.rb +19 -9
  34. data/lib/karafka/instrumentation/vendors/appsignal/metrics_listener.rb +31 -28
  35. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +22 -3
  36. data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +15 -12
  37. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +39 -36
  38. data/lib/karafka/pro/base_consumer.rb +47 -0
  39. data/lib/karafka/pro/connection/manager.rb +300 -0
  40. data/lib/karafka/pro/connection/multiplexing/listener.rb +40 -0
  41. data/lib/karafka/pro/instrumentation/performance_tracker.rb +85 -0
  42. data/lib/karafka/pro/iterator/tpl_builder.rb +1 -1
  43. data/lib/karafka/pro/iterator.rb +1 -6
  44. data/lib/karafka/pro/loader.rb +16 -2
  45. data/lib/karafka/pro/processing/coordinator.rb +2 -1
  46. data/lib/karafka/pro/processing/executor.rb +37 -0
  47. data/lib/karafka/pro/processing/expansions_selector.rb +32 -0
  48. data/lib/karafka/pro/processing/jobs/periodic.rb +41 -0
  49. data/lib/karafka/pro/processing/jobs/periodic_non_blocking.rb +32 -0
  50. data/lib/karafka/pro/processing/jobs_builder.rb +14 -3
  51. data/lib/karafka/pro/processing/offset_metadata/consumer.rb +44 -0
  52. data/lib/karafka/pro/processing/offset_metadata/fetcher.rb +131 -0
  53. data/lib/karafka/pro/processing/offset_metadata/listener.rb +46 -0
  54. data/lib/karafka/pro/processing/schedulers/base.rb +143 -0
  55. data/lib/karafka/pro/processing/schedulers/default.rb +107 -0
  56. data/lib/karafka/pro/processing/strategies/aj/lrj_mom_vp.rb +1 -1
  57. data/lib/karafka/pro/processing/strategies/default.rb +136 -3
  58. data/lib/karafka/pro/processing/strategies/dlq/default.rb +35 -0
  59. data/lib/karafka/pro/processing/strategies/lrj/default.rb +1 -1
  60. data/lib/karafka/pro/processing/strategies/lrj/mom.rb +1 -1
  61. data/lib/karafka/pro/processing/strategies/vp/default.rb +60 -26
  62. data/lib/karafka/pro/processing/virtual_offset_manager.rb +41 -11
  63. data/lib/karafka/pro/routing/features/long_running_job/topic.rb +2 -0
  64. data/lib/karafka/pro/routing/features/multiplexing/config.rb +38 -0
  65. data/lib/karafka/pro/routing/features/multiplexing/contracts/topic.rb +114 -0
  66. data/lib/karafka/pro/routing/features/multiplexing/patches/contracts/consumer_group.rb +42 -0
  67. data/lib/karafka/pro/routing/features/multiplexing/proxy.rb +38 -0
  68. data/lib/karafka/pro/routing/features/multiplexing/subscription_group.rb +42 -0
  69. data/lib/karafka/pro/routing/features/multiplexing/subscription_groups_builder.rb +40 -0
  70. data/lib/karafka/pro/routing/features/multiplexing.rb +59 -0
  71. data/lib/karafka/pro/routing/features/non_blocking_job/topic.rb +32 -0
  72. data/lib/karafka/pro/routing/features/non_blocking_job.rb +37 -0
  73. data/lib/karafka/pro/routing/features/offset_metadata/config.rb +33 -0
  74. data/lib/karafka/pro/routing/features/offset_metadata/contracts/topic.rb +42 -0
  75. data/lib/karafka/pro/routing/features/offset_metadata/topic.rb +65 -0
  76. data/lib/karafka/pro/routing/features/offset_metadata.rb +40 -0
  77. data/lib/karafka/pro/routing/features/patterns/contracts/consumer_group.rb +4 -0
  78. data/lib/karafka/pro/routing/features/patterns/detector.rb +18 -10
  79. data/lib/karafka/pro/routing/features/periodic_job/config.rb +37 -0
  80. data/lib/karafka/pro/routing/features/periodic_job/contracts/topic.rb +44 -0
  81. data/lib/karafka/pro/routing/features/periodic_job/topic.rb +94 -0
  82. data/lib/karafka/pro/routing/features/periodic_job.rb +27 -0
  83. data/lib/karafka/pro/routing/features/virtual_partitions/config.rb +1 -0
  84. data/lib/karafka/pro/routing/features/virtual_partitions/contracts/topic.rb +1 -0
  85. data/lib/karafka/pro/routing/features/virtual_partitions/topic.rb +7 -2
  86. data/lib/karafka/process.rb +5 -3
  87. data/lib/karafka/processing/coordinator.rb +5 -1
  88. data/lib/karafka/processing/executor.rb +43 -13
  89. data/lib/karafka/processing/executors_buffer.rb +22 -7
  90. data/lib/karafka/processing/jobs/base.rb +19 -2
  91. data/lib/karafka/processing/jobs/consume.rb +3 -3
  92. data/lib/karafka/processing/jobs/idle.rb +5 -0
  93. data/lib/karafka/processing/jobs/revoked.rb +5 -0
  94. data/lib/karafka/processing/jobs/shutdown.rb +5 -0
  95. data/lib/karafka/processing/jobs_queue.rb +19 -8
  96. data/lib/karafka/processing/schedulers/default.rb +42 -0
  97. data/lib/karafka/processing/strategies/base.rb +13 -4
  98. data/lib/karafka/processing/strategies/default.rb +23 -7
  99. data/lib/karafka/processing/strategies/dlq.rb +36 -0
  100. data/lib/karafka/processing/worker.rb +4 -1
  101. data/lib/karafka/routing/builder.rb +12 -2
  102. data/lib/karafka/routing/consumer_group.rb +5 -5
  103. data/lib/karafka/routing/features/base.rb +44 -8
  104. data/lib/karafka/routing/features/dead_letter_queue/config.rb +6 -1
  105. data/lib/karafka/routing/features/dead_letter_queue/contracts/topic.rb +1 -0
  106. data/lib/karafka/routing/features/dead_letter_queue/topic.rb +9 -2
  107. data/lib/karafka/routing/proxy.rb +4 -3
  108. data/lib/karafka/routing/subscription_group.rb +2 -2
  109. data/lib/karafka/routing/subscription_groups_builder.rb +11 -2
  110. data/lib/karafka/routing/topic.rb +8 -10
  111. data/lib/karafka/routing/topics.rb +1 -1
  112. data/lib/karafka/runner.rb +13 -3
  113. data/lib/karafka/server.rb +5 -9
  114. data/lib/karafka/setup/config.rb +21 -1
  115. data/lib/karafka/status.rb +23 -14
  116. data/lib/karafka/templates/karafka.rb.erb +7 -0
  117. data/lib/karafka/time_trackers/partition_usage.rb +56 -0
  118. data/lib/karafka/version.rb +1 -1
  119. data.tar.gz.sig +0 -0
  120. metadata +47 -13
  121. metadata.gz.sig +0 -0
  122. data/lib/karafka/connection/consumer_group_coordinator.rb +0 -48
  123. data/lib/karafka/pro/performance_tracker.rb +0 -84
  124. data/lib/karafka/pro/processing/scheduler.rb +0 -74
  125. data/lib/karafka/processing/scheduler.rb +0 -38
data/lib/karafka/connection/listener.rb

@@ -7,6 +7,8 @@ module Karafka
     # critical errors by restarting everything in a safe manner.
     #
     # This is the heart of the consumption process.
+    #
+    # It provides an async API for management, so all status changes are expected to be async.
     class Listener
       include Helpers::Async
 
@@ -14,22 +16,23 @@ module Karafka
       # @return [String] id of this listener
       attr_reader :id
 
+      # @return [Karafka::Routing::SubscriptionGroup] subscription group that this listener handles
+      attr_reader :subscription_group
+
       # How long to wait in the initial events poll. Increases chances of having the initial events
       # immediately available
       INITIAL_EVENTS_POLL_TIMEOUT = 100
 
       private_constant :INITIAL_EVENTS_POLL_TIMEOUT
 
-      # @param consumer_group_coordinator [Karafka::Connection::ConsumerGroupCoordinator]
       # @param subscription_group [Karafka::Routing::SubscriptionGroup]
       # @param jobs_queue [Karafka::Processing::JobsQueue] queue where we should push work
       # @param scheduler [Karafka::Processing::Scheduler] scheduler we want to use
       # @return [Karafka::Connection::Listener] listener instance
-      def initialize(consumer_group_coordinator, subscription_group, jobs_queue, scheduler)
+      def initialize(subscription_group, jobs_queue, scheduler)
         proc_config = ::Karafka::App.config.internal.processing
 
         @id = SecureRandom.hex(6)
-        @consumer_group_coordinator = consumer_group_coordinator
         @subscription_group = subscription_group
         @jobs_queue = jobs_queue
         @coordinators = Processing::CoordinatorsBuffer.new(subscription_group.topics)
@@ -43,8 +46,11 @@ module Karafka
         # We can do this that way because we always first schedule jobs using messages before we
         # fetch another batch.
         @messages_buffer = MessagesBuffer.new(subscription_group)
+        @usage_tracker = TimeTrackers::PartitionUsage.new
         @mutex = Mutex.new
-        @stopped = false
+        @status = Status.new
+
+        @jobs_queue.register(@subscription_group.id)
       end
 
       # Runs the main listener fetch loop.
@@ -60,6 +66,44 @@ module Karafka
         )
 
         fetch_loop
+
+        Karafka.monitor.instrument(
+          'connection.listener.after_fetch_loop',
+          caller: self,
+          client: @client,
+          subscription_group: @subscription_group
+        )
+      end
+
+      # Aliases all status operations directly on the listener so we have a listener-facing API
+      Status::STATES.each do |state, transition|
+        # @return [Boolean] is the listener in a given state
+        define_method "#{state}?" do
+          @status.public_send("#{state}?")
+        end
+
+        # Moves listener to a given state
+        define_method transition do
+          @status.public_send(transition)
+        end
+      end
+
+      # @return [Boolean] is this listener active (not stopped and not pending)
+      def active?
+        @status.active?
+      end
+
+      # We overwrite the state `#start!` because on start we also need to run the listener in
+      # its async thread. While other state transitions happen automatically and a status state
+      # change is enough, here we need to run the background threads
+      def start!
+        if stopped?
+          @client.reset
+          @status.reset!
+        end
+
+        @status.start!
+        async_call
       end
 
       # Stops the jobs queue, triggers shutdown on all the executors (sync), commits offsets and
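The `Status::STATES` loop above generates one predicate and one transition method per state. A hypothetical usage sketch of the resulting listener-facing API (illustration only, not part of the diff; `listener` stands for any `Connection::Listener`):

listener.start!    # moves the status to :starting and spawns the async fetch thread
listener.running?  # => true once #fetch_loop flips the status via running!
listener.quiet!    # request quieting: stay connected but stop yielding messages
listener.stop!     # request a full stop once everything is quiet
listener.stopped?  # => true after #shutdown has completed
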
@@ -70,13 +114,16 @@ module Karafka
       #
       # @note We wrap it with a mutex exactly because of the above case of forceful shutdown
       def shutdown
-        return if @stopped
-
         @mutex.synchronize do
-          @stopped = true
+          return if stopped?
+          # Nothing to clear if it was not even running
+          return stopped! if pending?
+
           @executors.clear
           @coordinators.reset
           @client.stop
+
+          stopped!
         end
       end
 
@@ -91,6 +138,7 @@ module Karafka
       # Kafka connections / Internet connection issues / Etc. Business logic problems should not
       # propagate this far.
       def fetch_loop
+        running!
         # Run the initial events fetch to improve chances of having metrics and initial callbacks
         # triggers on start.
         #
@@ -101,7 +149,7 @@ module Karafka
         @client.events_poll(INITIAL_EVENTS_POLL_TIMEOUT)
 
         # Run the main loop as long as we are not stopping or moving into quiet mode
-        until Karafka::App.done?
+        while running?
           Karafka.monitor.instrument(
             'connection.listener.fetch_loop',
             caller: self,
@@ -136,7 +184,11 @@ module Karafka
           # simplifies the overall design and prevents race conditions
           wait
 
-          build_and_schedule_consumption_jobs
+          build_and_schedule_flow_jobs
+
+          # Periodic jobs never run on topics and partitions that were scheduled, so there is
+          # no risk in having a collective wait after both
+          build_and_schedule_periodic_jobs if Karafka.pro?
 
           wait
         end
@@ -168,18 +220,11 @@ module Karafka
         # Wait until all the shutdown jobs are done
         wait_pinging(wait_until: -> { @jobs_queue.empty?(@subscription_group.id) })
 
-        # Once all the work is done, we need to decrement counter of active subscription groups
-        # within this consumer group
-        @consumer_group_coordinator.finish_work(id)
+        quieted!
 
         # Wait if we're in the process of finishing started work or finished all the work and
         # just sitting and being quiet
-        wait_pinging(wait_until: -> { !(Karafka::App.quieting? || Karafka::App.quiet?) })
-
-        # We need to wait until all the work in the whole consumer group (local to the process)
-        # is done. Otherwise we may end up with locks and `Timed out LeaveGroupRequest in flight`
-        # warning notifications.
-        wait_pinging(wait_until: -> { @consumer_group_coordinator.shutdown? })
+        wait_pinging(wait_until: -> { !quiet? })
 
         # This extra ping will make sure we've refreshed the rebalance state after other instances
         # potentially shutdown. This will prevent us from closing with a dangling callback
@@ -198,11 +243,9 @@ module Karafka
           type: 'connection.listener.fetch_loop.error'
         )
 
-        restart
+        reset
 
         sleep(1) && retry
-      ensure
-        @consumer_group_coordinator.unlock
       end
 
       # Resumes processing of partitions that were paused due to an error.
@@ -212,6 +255,17 @@ module Karafka
         end
       end
 
+      # Polls messages within the time and amount boundaries defined in the settings and then
+      # builds karafka messages based on the raw rdkafka messages buffer returned by the
+      # `#batch_poll` method.
+      #
+      # @note There are two buffers, one for raw messages and one for "built" karafka messages
+      def poll_and_remap_messages
+        @messages_buffer.remap(
+          @client.batch_poll
+        )
+      end
+
       # Enqueues revoking jobs for partitions that were taken away from the running process.
       def build_and_schedule_revoked_jobs_for_revoked_partitions
         revoked_partitions = @client.rebalance_manager.revoked_partitions
@@ -223,6 +277,7 @@ module Karafka
 
         revoked_partitions.each do |topic, partitions|
           partitions.each do |partition|
+            @usage_tracker.revoke(topic, partition)
             @coordinators.revoke(topic, partition)
 
             # There may be a case where we have lost partition of which data we have never
@@ -230,7 +285,6 @@ module Karafka
             # here. In cases like this, we do not run a revocation job
             @executors.find_all(topic, partition).each do |executor|
               job = @jobs_builder.revoked(executor)
-              job.before_enqueue
               jobs << job
             end
 
@@ -243,7 +297,10 @@ module Karafka
           end
         end
 
-        @scheduler.schedule_revocation(jobs)
+        return if jobs.empty?
+
+        jobs.each(&:before_schedule)
+        @scheduler.on_schedule_revocation(jobs)
       end
 
       # Enqueues the shutdown jobs for all the executors that exist in our subscription group
@@ -252,32 +309,27 @@ module Karafka
 
         @executors.each do |executor|
           job = @jobs_builder.shutdown(executor)
-          job.before_enqueue
           jobs << job
         end
 
-        @scheduler.schedule_shutdown(jobs)
-      end
+        return if jobs.empty?
 
-      # Polls messages within the time and amount boundaries defined in the settings and then
-      # builds karafka messages based on the raw rdkafka messages buffer returned by the
-      # `#batch_poll` method.
-      #
-      # @note There are two buffers, one for raw messages and one for "built" karafka messages
-      def poll_and_remap_messages
-        @messages_buffer.remap(
-          @client.batch_poll
-        )
+        jobs.each(&:before_schedule)
+        @scheduler.on_schedule_shutdown(jobs)
       end
 
       # Takes the messages per topic partition and enqueues processing jobs in threads using
-      # given scheduler.
-      def build_and_schedule_consumption_jobs
+      # given scheduler. It also handles the idle jobs when the filtering API removed all
+      # messages and we need to run house-keeping
+      def build_and_schedule_flow_jobs
         return if @messages_buffer.empty?
 
-        jobs = []
+        consume_jobs = []
+        idle_jobs = []
 
         @messages_buffer.each do |topic, partition, messages|
+          @usage_tracker.track(topic, partition)
+
           coordinator = @coordinators.find_or_create(topic, partition)
           # Start work coordination for this topic partition
           coordinator.start(messages)
@@ -286,26 +338,93 @@ module Karafka
           # and it will not go through a standard lifecycle. Same applies to revoked and shutdown
           if messages.empty?
             executor = @executors.find_or_create(topic, partition, 0, coordinator)
-            jobs << @jobs_builder.idle(executor)
+            idle_jobs << @jobs_builder.idle(executor)
           else
             @partitioner.call(topic, messages, coordinator) do |group_id, partition_messages|
               executor = @executors.find_or_create(topic, partition, group_id, coordinator)
               coordinator.increment
-              jobs << @jobs_builder.consume(executor, partition_messages)
+              consume_jobs << @jobs_builder.consume(executor, partition_messages)
+            end
+          end
+        end
+
+        # We schedule the idle jobs before running the `#before_schedule` on the consume jobs so
+        # workers can already pick up the idle jobs while the `#before_schedule` on consumption
+        # jobs runs
+        unless idle_jobs.empty?
+          idle_jobs.each(&:before_schedule)
+          @scheduler.on_schedule_idle(idle_jobs)
+        end
+
+        unless consume_jobs.empty?
+          consume_jobs.each(&:before_schedule)
+          @scheduler.on_schedule_consumption(consume_jobs)
+        end
+      end
+
+      # Builds and schedules periodic jobs for topic partitions for which no messages were
+      # received recently. In case an `Idle` job is invoked, we do not run periodic jobs. Idle
+      # means that a complex flow kicked in and it was a user choice not to run consumption,
+      # even though messages were shipped.
+      def build_and_schedule_periodic_jobs
+        # Shortcut if periodic jobs are not used at all. No need to run the complex flow when
+        # it will never end up with anything. If periodics are not defined on any of the
+        # topics, we can finish fast
+        @periodic_jobs ||= @subscription_group.topics.count(&:periodic_job?)
+
+        return if @periodic_jobs.zero?
+
+        jobs = []
+
+        # We select only currently assigned topics and partitions from the current subscription
+        # group as only those are of our interest. We then filter that to only pick those for
+        # which we want to run periodic jobs and then we select only those that did not receive
+        # any messages recently. This ensures that we do not tick close to a recent arrival of
+        # messages but rather after a certain period of inactivity
+        Karafka::App.assignments.each do |topic, partitions|
+          # Skip for assignments not from our subscription group
+          next unless topic.subscription_group == @subscription_group
+          # Skip if this topic does not have periodic jobs enabled
+          next unless topic.periodic_job?
+
+          topic_name = topic.name
+          interval = topic.periodic_job.interval
+
+          partitions.each do |partition|
+            # Skip if we were operating on a given topic partition recently
+            next if @usage_tracker.active?(topic_name, partition, interval)
+
+            coordinator = @coordinators.find_or_create(topic_name, partition)
+
+            # Do not tick if we do not want to tick during pauses
+            next if coordinator.paused? && !topic.periodic_job.during_pause?
+
+            # If we do not want to run periodics during retry flows, we should not.
+            # Since this counter is incremented before processing, here it is always -1 from
+            # what we see in the consumer flow. This is why attempt 0 means that we will have
+            # the first run (ok) but attempt 1 means there was an error and we will retry
+            next if coordinator.attempt.positive? && !topic.periodic_job.during_retry?
+
+            # Track so we do not run the periodic job again too soon
+            @usage_tracker.track(topic_name, partition)
+
+            @executors.find_all_or_create(topic_name, partition, coordinator).each do |executor|
+              jobs << @jobs_builder.periodic(executor)
             end
           end
         end
 
-        jobs.each(&:before_enqueue)
+        return if jobs.empty?
 
-        @scheduler.schedule_consumption(jobs)
+        jobs.each(&:before_schedule)
+        @scheduler.on_schedule_periodic(jobs)
       end
 
       # Waits for all the jobs from a given subscription group to finish before moving forward
       def wait
         @jobs_queue.wait(@subscription_group.id) do
           @events_poller.call
-          @scheduler.manage
+          @scheduler.on_manage
         end
       end
 
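Condensed, the per-partition gating in `build_and_schedule_periodic_jobs` reduces to one predicate. A standalone sketch, assuming the collaborators respond as in the diff (names are illustrative, not part of the gem):

# Should a periodic (tick) job run for this topic partition right now?
def tickable?(topic, partition, usage_tracker, coordinator)
  # No tick when the partition saw messages or ticks within the interval
  return false if usage_tracker.active?(topic.name, partition, topic.periodic_job.interval)
  # No tick during a pause unless explicitly enabled
  return false if coordinator.paused? && !topic.periodic_job.during_pause?
  # No tick during a retry flow unless explicitly enabled
  return false if coordinator.attempt.positive? && !topic.periodic_job.during_retry?

  true
end
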
@@ -322,7 +441,7 @@ module Karafka
       def wait_pinging(wait_until:, after_ping: -> {})
         until wait_until.call
           @client.ping
-          @scheduler.manage
+          @scheduler.on_manage
 
           after_ping.call
           sleep(0.2)
@@ -333,13 +452,13 @@ module Karafka
       # `#fetch_loop` again. We just need to remember to also reset the runner as it is a long
       # running one, so with a new connection to Kafka, we need to initialize the state of the
      # runner and underlying consumers once again.
-      def restart
+      def reset
        # If there was any problem with processing, before we reset things we need to make sure
        # there are no jobs in the queue. Otherwise it could lead to leakage in between client
        # resetting.
        @jobs_queue.wait(@subscription_group.id)
        @jobs_queue.clear(@subscription_group.id)
-        @scheduler.clear(@subscription_group.id)
+        @scheduler.on_clear(@subscription_group.id)
        @events_poller.reset
        @client.reset
        @coordinators.reset
data/lib/karafka/connection/listeners_batch.rb

@@ -6,8 +6,6 @@ module Karafka
     class ListenersBatch
       include Enumerable
 
-      attr_reader :coordinators
-
       # @param jobs_queue [JobsQueue]
       # @return [ListenersBatch]
       def initialize(jobs_queue)
@@ -15,18 +13,9 @@ module Karafka
         # should be able to distribute work whenever any work is done in any of the listeners
         scheduler = App.config.internal.processing.scheduler_class.new(jobs_queue)
 
-        @coordinators = []
-
         @batch = App.subscription_groups.flat_map do |_consumer_group, subscription_groups|
-          consumer_group_coordinator = Connection::ConsumerGroupCoordinator.new(
-            subscription_groups.size
-          )
-
-          @coordinators << consumer_group_coordinator
-
           subscription_groups.map do |subscription_group|
             Connection::Listener.new(
-              consumer_group_coordinator,
               subscription_group,
               jobs_queue,
               scheduler
@@ -40,6 +29,11 @@ module Karafka
       def each(&block)
         @batch.each(&block)
       end
+
+      # @return [Array<Listener>] active listeners
+      def active
+        select(&:active?)
+      end
     end
   end
 end
data/lib/karafka/connection/manager.rb

@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+module Karafka
+  # Namespace for Kafka connection related logic
+  module Connection
+    # Connections manager responsible for starting and managing listeners connections
+    #
+    # In the OSS version it starts listeners as they are, without any connection management or
+    # resources utilization supervision, and shuts them down or quiets them when the time comes
+    class Manager
+      def initialize
+        @once_executions = Set.new
+      end
+
+      # Registers provided listeners and starts all of them
+      #
+      # @param listeners [Connection::ListenersBatch]
+      def register(listeners)
+        @listeners = listeners
+        @listeners.each(&:start!)
+      end
+
+      # @return [Boolean] true if all listeners are stopped
+      def done?
+        @listeners.all?(&:stopped?)
+      end
+
+      # Controls the state of listeners upon shutdown and quiet requests.
+      # In both cases (quieting and shutdown) we first need to stop processing more work and
+      # tell listeners to become quiet (connected but not yielding messages) and then, depending
+      # on whether we want to stop fully or just keep quiet, we apply a different flow.
+      #
+      # @note It is important to ensure that all listeners from the same consumer group are
+      #   always all quiet before we can fully shutdown a given consumer group. Skipping this
+      #   can cause `Timed out LeaveGroupRequest in flight` and other errors. For
+      #   simplification, we just quiet all and only then move forward.
+      #
+      # @note This manager works with the assumption that all listeners are executed on register.
+      def control
+        # Do nothing until shutdown or quiet
+        return unless Karafka::App.done?
+
+        # When we are done processing, immediately quiet all the listeners so they do not pick
+        # up new work to do
+        once(:quiet!) { @listeners.each(&:quiet!) }
+
+        return unless @listeners.all?(&:quiet?)
+
+        # If we are in the process of moving to quiet state, we need to check it.
+        # Switch to quieted status only when all listeners are fully quieted and do nothing
+        # after that until further state changes
+        once(:quieted!) { Karafka::App.quieted! } if Karafka::App.quieting?
+
+        return if Karafka::App.quiet?
+
+        once(:stop!) { @listeners.each(&:stop!) }
+      end
+
+      private
+
+      # Runs code only once and never again
+      # @param args [Object] anything we want to use as a set of unique keys for given execution
+      def once(*args)
+        return if @once_executions.include?(args)
+
+        @once_executions << args
+
+        yield
+      end
+    end
+  end
+end
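The private `once` helper is a run-once guard keyed by its argument list. A self-contained sketch of the same pattern (a standalone class for illustration, not the gem's API):

require 'set'

# Run-once guard: the argument list acts as the deduplication key
class OnceGuard
  def initialize
    @executions = Set.new
  end

  def once(*key)
    return if @executions.include?(key)

    @executions << key
    yield
  end
end

guard = OnceGuard.new
3.times { guard.once(:quiet!) { puts 'quieting listeners' } } # prints once
guard.once(:stop!) { puts 'stopping listeners' }              # different key, runs again
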
data/lib/karafka/connection/messages_buffer.rb

@@ -67,6 +67,18 @@ module Karafka
         end
       end
 
+      # Checks if there are any messages from a given topic partition in the buffer
+      # @param topic [String] topic name
+      # @param partition [Integer] partition number
+      # @return [Boolean] true if there is at least one message from this topic partition,
+      #   otherwise false
+      def present?(topic, partition)
+        return false unless @groups.include?(topic)
+        return false unless @groups[topic].include?(partition)
+
+        true
+      end
+
       # @return [Boolean] is the buffer empty or does it contain any messages
       def empty?
         @size.zero?
data/lib/karafka/connection/proxy.rb

@@ -68,6 +68,23 @@ module Karafka
         end
       end
 
+      # Similar to `#query_watermark_offsets`.
+      #
+      # @param tpl [Rdkafka::Consumer::TopicPartitionList, nil] tpl or nil for full current
+      #   assignment tpl usage
+      # @return [Rdkafka::Consumer::TopicPartitionList] tpl with committed offsets and metadata
+      def committed(tpl = nil)
+        c_config = @config.committed
+
+        with_broker_errors_retry(
+          # required to be in seconds, not ms
+          wait_time: c_config.wait_time / 1_000.to_f,
+          max_attempts: c_config.max_attempts
+        ) do
+          @wrapped.committed(tpl, c_config.timeout)
+        end
+      end
+
       private
 
       # Runs expected block of code with few retries on all_brokers_down
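A hypothetical usage sketch of the new `#committed` wrapper, assuming the proxy wraps an rdkafka consumer as elsewhere in Karafka (the configured `wait_time` is in milliseconds, hence the division above):

proxy = Karafka::Connection::Proxy.new(rdkafka_consumer)

# nil tpl => query the full current assignment; retries on transient broker errors
tpl = proxy.committed

tpl.to_h.each do |topic, partitions|
  partitions.each do |partition|
    puts "#{topic}##{partition.partition} => offset: #{partition.offset}"
  end
end
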
data/lib/karafka/connection/status.rb

@@ -0,0 +1,75 @@
+# frozen_string_literal: true
+
+module Karafka
+  # Namespace for Kafka connection related logic
+  module Connection
+    # Listener connection status representation
+    class Status
+      # Available states and their transitions.
+      STATES = {
+        pending: :pending!,
+        starting: :start!,
+        running: :running!,
+        quieting: :quiet!,
+        quiet: :quieted!,
+        stopping: :stop!,
+        stopped: :stopped!
+      }.freeze
+
+      STATES.each do |state, transition|
+        class_eval <<~RUBY, __FILE__, __LINE__ + 1
+          # Moves status to a different state
+          def #{transition}
+            @mutex.synchronize do
+              # Do not allow reverse state transitions (we always go one way) or transition
+              # to the same state as currently
+              return if @status && STATES.keys.index(:#{state}) <= STATES.keys.index(@status)
+
+              @status = :#{state}
+              @conductor.signal
+            end
+          end
+
+          # @return [Boolean] are we in a given state
+          def #{state}?
+            @status == :#{state}
+          end
+        RUBY
+      end
+
+      def initialize
+        @mutex = Mutex.new
+        @conductor = Karafka::App.config.internal.connection.conductor
+        pending!
+      end
+
+      # If this listener was not even running, will just move it through states until final.
+      # If it was running, will start the stopping procedures.
+      # Will do nothing if it was already stopped
+      def stop!
+        if pending?
+          @status = :stopping
+          stopped!
+        elsif stopped?
+          nil
+        else
+          @status = :stopping
+        end
+      end
+
+      # Moves status back from stopped to pending (and only that). We should not be able to
+      # reset listeners that are not stopped
+      def reset!
+        return unless stopped?
+
+        @status = :pending
+      end
+
+      # @return [Boolean] listener is considered active when it has a client reference that
+      #   may be active and connected to Kafka
+      def active?
+        !pending? && !stopped?
+      end
+    end
+  end
+end
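The generated transitions are strictly forward-only: the guard compares positions in the `STATES` key order. A standalone sketch of just that guard (the real `Status` additionally synchronizes on a mutex and signals the conductor):

ORDER = %i[pending starting running quieting quiet stopping stopped].freeze

# Returns the new state, ignoring backward or same-state transitions
def advance(current, target)
  return current if current && ORDER.index(target) <= ORDER.index(current)

  target
end

state = advance(nil, :pending)    # => :pending
state = advance(state, :running)  # => :running (skipping ahead is allowed)
state = advance(state, :starting) # => :running (backward transition ignored)
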
data/lib/karafka/contracts/config.rb

@@ -51,17 +51,21 @@ module Karafka
       required(:tick_interval) { |val| val.is_a?(Integer) && val >= 1_000 }
 
       nested(:connection) do
-        nested(:proxy) do
-          nested(:query_watermark_offsets) do
-            required(:timeout) { |val| val.is_a?(Integer) && val.positive? }
-            required(:max_attempts) { |val| val.is_a?(Integer) && val.positive? }
-            required(:wait_time) { |val| val.is_a?(Integer) && val.positive? }
-          end
+        required(:manager) { |val| !val.nil? }
+        required(:conductor) { |val| !val.nil? }
 
-          nested(:offsets_for_times) do
-            required(:timeout) { |val| val.is_a?(Integer) && val.positive? }
-            required(:max_attempts) { |val| val.is_a?(Integer) && val.positive? }
-            required(:wait_time) { |val| val.is_a?(Integer) && val.positive? }
+        nested(:proxy) do
+          # All of them have the same requirements
+          %i[
+            query_watermark_offsets
+            offsets_for_times
+            committed
+          ].each do |scope|
+            nested(scope) do
+              required(:timeout) { |val| val.is_a?(Integer) && val.positive? }
+              required(:max_attempts) { |val| val.is_a?(Integer) && val.positive? }
+              required(:wait_time) { |val| val.is_a?(Integer) && val.positive? }
+            end
          end
        end
      end
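Since all three proxy scopes now share one shape, a config fragment that satisfies this contract could look as follows (a hedged sketch; the values are illustrative, and the actual defaults live in karafka/setup/config.rb):

{
  internal: {
    connection: {
      manager: Karafka::Connection::Manager.new,
      conductor: Karafka::Connection::Conductor.new,
      proxy: {
        query_watermark_offsets: { timeout: 5_000, max_attempts: 3, wait_time: 1_000 },
        offsets_for_times: { timeout: 5_000, max_attempts: 3, wait_time: 1_000 },
        committed: { timeout: 5_000, max_attempts: 3, wait_time: 1_000 }
      }
    }
  }
}
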
data/lib/karafka/contracts/consumer_group.rb

@@ -18,7 +18,7 @@ module Karafka
       virtual do |data, errors|
         next unless errors.empty?
 
-        names = data.fetch(:topics).map { |topic| topic[:name] }
+        names = data.fetch(:topics).map { |topic| topic_unique_key(topic) }
 
         next if names.size == names.uniq.size
 
@@ -51,6 +51,14 @@ module Karafka
 
         [[%i[topics], :topics_namespaced_names_not_unique]]
       end
+
+      class << self
+        # @param topic [Hash] topic config hash
+        # @return [String] topic unique key for validators
+        def topic_unique_key(topic)
+          topic[:name]
+        end
+      end
     end
   end
 end
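`topic_unique_key` is extracted into a class method precisely so that it can be overridden. A hypothetical override sketch: the Pro multiplexing patch listed in this changeset presumably does something along these lines, but its exact implementation is not shown in this diff:

# Hypothetical: scope uniqueness by subscription group so the same topic may
# legally appear in several subscription groups of one consumer group
def self.topic_unique_key(topic)
  [topic[:name], topic[:subscription_group_details]]
end
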
data/lib/karafka/contracts/topic.rb

@@ -20,7 +20,9 @@ module Karafka
       required(:max_wait_time) { |val| val.is_a?(Integer) && val >= 10 }
       required(:name) { |val| val.is_a?(String) && Contracts::TOPIC_REGEXP.match?(val) }
       required(:active) { |val| [true, false].include?(val) }
-      required(:subscription_group_name) { |val| val.is_a?(String) && !val.empty? }
+      nested(:subscription_group_details) do
+        required(:name) { |val| val.is_a?(String) && !val.empty? }
+      end
 
       # Consumer needs to be present only if topic is active
       # We allow not to define consumer for non-active because they may be only used via admin