karafka 2.2.13 → 2.3.0.alpha1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.github/workflows/ci.yml +38 -12
  4. data/.ruby-version +1 -1
  5. data/CHANGELOG.md +161 -125
  6. data/Gemfile.lock +12 -12
  7. data/README.md +0 -2
  8. data/SECURITY.md +23 -0
  9. data/config/locales/errors.yml +7 -1
  10. data/config/locales/pro_errors.yml +22 -0
  11. data/docker-compose.yml +3 -1
  12. data/karafka.gemspec +2 -2
  13. data/lib/karafka/admin/acl.rb +287 -0
  14. data/lib/karafka/admin.rb +118 -16
  15. data/lib/karafka/app.rb +12 -3
  16. data/lib/karafka/base_consumer.rb +32 -31
  17. data/lib/karafka/cli/base.rb +1 -1
  18. data/lib/karafka/connection/client.rb +94 -84
  19. data/lib/karafka/connection/conductor.rb +28 -0
  20. data/lib/karafka/connection/listener.rb +165 -46
  21. data/lib/karafka/connection/listeners_batch.rb +5 -11
  22. data/lib/karafka/connection/manager.rb +72 -0
  23. data/lib/karafka/connection/messages_buffer.rb +12 -0
  24. data/lib/karafka/connection/proxy.rb +17 -0
  25. data/lib/karafka/connection/status.rb +75 -0
  26. data/lib/karafka/contracts/config.rb +14 -10
  27. data/lib/karafka/contracts/consumer_group.rb +9 -1
  28. data/lib/karafka/contracts/topic.rb +3 -1
  29. data/lib/karafka/errors.rb +13 -0
  30. data/lib/karafka/instrumentation/assignments_tracker.rb +96 -0
  31. data/lib/karafka/instrumentation/callbacks/rebalance.rb +10 -7
  32. data/lib/karafka/instrumentation/logger_listener.rb +3 -9
  33. data/lib/karafka/instrumentation/notifications.rb +19 -9
  34. data/lib/karafka/instrumentation/vendors/appsignal/metrics_listener.rb +31 -28
  35. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +22 -3
  36. data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +15 -12
  37. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +39 -36
  38. data/lib/karafka/pro/base_consumer.rb +47 -0
  39. data/lib/karafka/pro/connection/manager.rb +300 -0
  40. data/lib/karafka/pro/connection/multiplexing/listener.rb +40 -0
  41. data/lib/karafka/pro/instrumentation/performance_tracker.rb +85 -0
  42. data/lib/karafka/pro/iterator/tpl_builder.rb +1 -1
  43. data/lib/karafka/pro/iterator.rb +1 -6
  44. data/lib/karafka/pro/loader.rb +16 -2
  45. data/lib/karafka/pro/processing/coordinator.rb +2 -1
  46. data/lib/karafka/pro/processing/executor.rb +37 -0
  47. data/lib/karafka/pro/processing/expansions_selector.rb +32 -0
  48. data/lib/karafka/pro/processing/jobs/periodic.rb +41 -0
  49. data/lib/karafka/pro/processing/jobs/periodic_non_blocking.rb +32 -0
  50. data/lib/karafka/pro/processing/jobs_builder.rb +14 -3
  51. data/lib/karafka/pro/processing/offset_metadata/consumer.rb +44 -0
  52. data/lib/karafka/pro/processing/offset_metadata/fetcher.rb +131 -0
  53. data/lib/karafka/pro/processing/offset_metadata/listener.rb +46 -0
  54. data/lib/karafka/pro/processing/schedulers/base.rb +143 -0
  55. data/lib/karafka/pro/processing/schedulers/default.rb +107 -0
  56. data/lib/karafka/pro/processing/strategies/aj/lrj_mom_vp.rb +1 -1
  57. data/lib/karafka/pro/processing/strategies/default.rb +136 -3
  58. data/lib/karafka/pro/processing/strategies/dlq/default.rb +35 -0
  59. data/lib/karafka/pro/processing/strategies/lrj/default.rb +1 -1
  60. data/lib/karafka/pro/processing/strategies/lrj/mom.rb +1 -1
  61. data/lib/karafka/pro/processing/strategies/vp/default.rb +60 -26
  62. data/lib/karafka/pro/processing/virtual_offset_manager.rb +41 -11
  63. data/lib/karafka/pro/routing/features/long_running_job/topic.rb +2 -0
  64. data/lib/karafka/pro/routing/features/multiplexing/config.rb +38 -0
  65. data/lib/karafka/pro/routing/features/multiplexing/contracts/topic.rb +114 -0
  66. data/lib/karafka/pro/routing/features/multiplexing/patches/contracts/consumer_group.rb +42 -0
  67. data/lib/karafka/pro/routing/features/multiplexing/proxy.rb +38 -0
  68. data/lib/karafka/pro/routing/features/multiplexing/subscription_group.rb +42 -0
  69. data/lib/karafka/pro/routing/features/multiplexing/subscription_groups_builder.rb +40 -0
  70. data/lib/karafka/pro/routing/features/multiplexing.rb +59 -0
  71. data/lib/karafka/pro/routing/features/non_blocking_job/topic.rb +32 -0
  72. data/lib/karafka/pro/routing/features/non_blocking_job.rb +37 -0
  73. data/lib/karafka/pro/routing/features/offset_metadata/config.rb +33 -0
  74. data/lib/karafka/pro/routing/features/offset_metadata/contracts/topic.rb +42 -0
  75. data/lib/karafka/pro/routing/features/offset_metadata/topic.rb +65 -0
  76. data/lib/karafka/pro/routing/features/offset_metadata.rb +40 -0
  77. data/lib/karafka/pro/routing/features/patterns/contracts/consumer_group.rb +4 -0
  78. data/lib/karafka/pro/routing/features/patterns/detector.rb +18 -10
  79. data/lib/karafka/pro/routing/features/periodic_job/config.rb +37 -0
  80. data/lib/karafka/pro/routing/features/periodic_job/contracts/topic.rb +44 -0
  81. data/lib/karafka/pro/routing/features/periodic_job/topic.rb +94 -0
  82. data/lib/karafka/pro/routing/features/periodic_job.rb +27 -0
  83. data/lib/karafka/pro/routing/features/virtual_partitions/config.rb +1 -0
  84. data/lib/karafka/pro/routing/features/virtual_partitions/contracts/topic.rb +1 -0
  85. data/lib/karafka/pro/routing/features/virtual_partitions/topic.rb +7 -2
  86. data/lib/karafka/process.rb +5 -3
  87. data/lib/karafka/processing/coordinator.rb +5 -1
  88. data/lib/karafka/processing/executor.rb +43 -13
  89. data/lib/karafka/processing/executors_buffer.rb +22 -7
  90. data/lib/karafka/processing/jobs/base.rb +19 -2
  91. data/lib/karafka/processing/jobs/consume.rb +3 -3
  92. data/lib/karafka/processing/jobs/idle.rb +5 -0
  93. data/lib/karafka/processing/jobs/revoked.rb +5 -0
  94. data/lib/karafka/processing/jobs/shutdown.rb +5 -0
  95. data/lib/karafka/processing/jobs_queue.rb +19 -8
  96. data/lib/karafka/processing/schedulers/default.rb +42 -0
  97. data/lib/karafka/processing/strategies/base.rb +13 -4
  98. data/lib/karafka/processing/strategies/default.rb +23 -7
  99. data/lib/karafka/processing/strategies/dlq.rb +36 -0
  100. data/lib/karafka/processing/worker.rb +4 -1
  101. data/lib/karafka/routing/builder.rb +12 -2
  102. data/lib/karafka/routing/consumer_group.rb +5 -5
  103. data/lib/karafka/routing/features/base.rb +44 -8
  104. data/lib/karafka/routing/features/dead_letter_queue/config.rb +6 -1
  105. data/lib/karafka/routing/features/dead_letter_queue/contracts/topic.rb +1 -0
  106. data/lib/karafka/routing/features/dead_letter_queue/topic.rb +9 -2
  107. data/lib/karafka/routing/proxy.rb +4 -3
  108. data/lib/karafka/routing/subscription_group.rb +2 -2
  109. data/lib/karafka/routing/subscription_groups_builder.rb +11 -2
  110. data/lib/karafka/routing/topic.rb +8 -10
  111. data/lib/karafka/routing/topics.rb +1 -1
  112. data/lib/karafka/runner.rb +13 -3
  113. data/lib/karafka/server.rb +5 -9
  114. data/lib/karafka/setup/config.rb +21 -1
  115. data/lib/karafka/status.rb +23 -14
  116. data/lib/karafka/templates/karafka.rb.erb +7 -0
  117. data/lib/karafka/time_trackers/partition_usage.rb +56 -0
  118. data/lib/karafka/version.rb +1 -1
  119. data.tar.gz.sig +0 -0
  120. metadata +47 -13
  121. metadata.gz.sig +0 -0
  122. data/lib/karafka/connection/consumer_group_coordinator.rb +0 -48
  123. data/lib/karafka/pro/performance_tracker.rb +0 -84
  124. data/lib/karafka/pro/processing/scheduler.rb +0 -74
  125. data/lib/karafka/processing/scheduler.rb +0 -38
@@ -7,6 +7,8 @@ module Karafka
7
7
  # critical errors by restarting everything in a safe manner.
8
8
  #
9
9
  # This is the heart of the consumption process.
10
+ #
11
+ # It provides async API for managing, so all status changes are expected to be async.
10
12
  class Listener
11
13
  include Helpers::Async
12
14
 
@@ -14,22 +16,23 @@ module Karafka
14
16
  # @return [String] id of this listener
15
17
  attr_reader :id
16
18
 
19
+ # @return [Karafka::Routing::SubscriptionGroup] subscription group that this listener handles
20
+ attr_reader :subscription_group
21
+
17
22
  # How long to wait in the initial events poll. Increases chances of having the initial events
18
23
  # immediately available
19
24
  INITIAL_EVENTS_POLL_TIMEOUT = 100
20
25
 
21
26
  private_constant :INITIAL_EVENTS_POLL_TIMEOUT
22
27
 
23
- # @param consumer_group_coordinator [Karafka::Connection::ConsumerGroupCoordinator]
24
28
  # @param subscription_group [Karafka::Routing::SubscriptionGroup]
25
29
  # @param jobs_queue [Karafka::Processing::JobsQueue] queue where we should push work
26
30
  # @param scheduler [Karafka::Processing::Scheduler] scheduler we want to use
27
31
  # @return [Karafka::Connection::Listener] listener instance
28
- def initialize(consumer_group_coordinator, subscription_group, jobs_queue, scheduler)
32
+ def initialize(subscription_group, jobs_queue, scheduler)
29
33
  proc_config = ::Karafka::App.config.internal.processing
30
34
 
31
35
  @id = SecureRandom.hex(6)
32
- @consumer_group_coordinator = consumer_group_coordinator
33
36
  @subscription_group = subscription_group
34
37
  @jobs_queue = jobs_queue
35
38
  @coordinators = Processing::CoordinatorsBuffer.new(subscription_group.topics)
@@ -43,8 +46,11 @@ module Karafka
43
46
  # We can do this that way because we always first schedule jobs using messages before we
44
47
  # fetch another batch.
45
48
  @messages_buffer = MessagesBuffer.new(subscription_group)
49
+ @usage_tracker = TimeTrackers::PartitionUsage.new
46
50
  @mutex = Mutex.new
47
- @stopped = false
51
+ @status = Status.new
52
+
53
+ @jobs_queue.register(@subscription_group.id)
48
54
  end
49
55
 
50
56
  # Runs the main listener fetch loop.
@@ -60,6 +66,44 @@ module Karafka
60
66
  )
61
67
 
62
68
  fetch_loop
69
+
70
+ Karafka.monitor.instrument(
71
+ 'connection.listener.after_fetch_loop',
72
+ caller: self,
73
+ client: @client,
74
+ subscription_group: @subscription_group
75
+ )
76
+ end
77
+
78
+ # Aliases all statuses operations directly on the listener so we have a listener-facing API
79
+ Status::STATES.each do |state, transition|
80
+ # @return [Boolean] is the listener in a given state
81
+ define_method "#{state}?" do
82
+ @status.public_send("#{state}?")
83
+ end
84
+
85
+ # Moves listener to a given state
86
+ define_method transition do
87
+ @status.public_send(transition)
88
+ end
89
+ end
90
+
91
+ # @return [Boolean] is this listener active (not stopped and not pending)
92
+ def active?
93
+ @status.active?
94
+ end
95
+
96
+ # We overwrite the state `#start` because on start we need to also start running listener in
97
+ # the async thread. While other state transitions happen automatically and status state
98
+ # change is enough, here we need to run the background threads
99
+ def start!
100
+ if stopped?
101
+ @client.reset
102
+ @status.reset!
103
+ end
104
+
105
+ @status.start!
106
+ async_call
63
107
  end
64
108
 
65
109
  # Stops the jobs queue, triggers shutdown on all the executors (sync), commits offsets and
@@ -70,13 +114,16 @@ module Karafka
70
114
  #
71
115
  # @note We wrap it with a mutex exactly because of the above case of forceful shutdown
72
116
  def shutdown
73
- return if @stopped
74
-
75
117
  @mutex.synchronize do
76
- @stopped = true
118
+ return if stopped?
119
+ # Nothing to clear if it was not even running
120
+ return stopped! if pending?
121
+
77
122
  @executors.clear
78
123
  @coordinators.reset
79
124
  @client.stop
125
+
126
+ stopped!
80
127
  end
81
128
  end
82
129
 
@@ -91,6 +138,7 @@ module Karafka
91
138
  # Kafka connections / Internet connection issues / Etc. Business logic problems should not
92
139
  # propagate this far.
93
140
  def fetch_loop
141
+ running!
94
142
  # Run the initial events fetch to improve chances of having metrics and initial callbacks
95
143
  # triggers on start.
96
144
  #
@@ -101,7 +149,7 @@ module Karafka
101
149
  @client.events_poll(INITIAL_EVENTS_POLL_TIMEOUT)
102
150
 
103
151
  # Run the main loop as long as we are not stopping or moving into quiet mode
104
- until Karafka::App.done?
152
+ while running?
105
153
  Karafka.monitor.instrument(
106
154
  'connection.listener.fetch_loop',
107
155
  caller: self,
@@ -136,7 +184,11 @@ module Karafka
136
184
  # simplifies the overall design and prevents from race conditions
137
185
  wait
138
186
 
139
- build_and_schedule_consumption_jobs
187
+ build_and_schedule_flow_jobs
188
+
189
+ # periodic jobs never run on topics and partitions that were scheduled, so no risk in
190
+ # having collective wait after both
191
+ build_and_schedule_periodic_jobs if Karafka.pro?
140
192
 
141
193
  wait
142
194
  end
@@ -168,18 +220,11 @@ module Karafka
168
220
  # Wait until all the shutdown jobs are done
169
221
  wait_pinging(wait_until: -> { @jobs_queue.empty?(@subscription_group.id) })
170
222
 
171
- # Once all the work is done, we need to decrement counter of active subscription groups
172
- # within this consumer group
173
- @consumer_group_coordinator.finish_work(id)
223
+ quieted!
174
224
 
175
225
  # Wait if we're in the process of finishing started work or finished all the work and
176
226
  # just sitting and being quiet
177
- wait_pinging(wait_until: -> { !(Karafka::App.quieting? || Karafka::App.quiet?) })
178
-
179
- # We need to wait until all the work in the whole consumer group (local to the process)
180
- # is done. Otherwise we may end up with locks and `Timed out LeaveGroupRequest in flight`
181
- # warning notifications.
182
- wait_pinging(wait_until: -> { @consumer_group_coordinator.shutdown? })
227
+ wait_pinging(wait_until: -> { !quiet? })
183
228
 
184
229
  # This extra ping will make sure we've refreshed the rebalance state after other instances
185
230
  # potentially shutdown. This will prevent us from closing with a dangling callback
@@ -198,11 +243,9 @@ module Karafka
198
243
  type: 'connection.listener.fetch_loop.error'
199
244
  )
200
245
 
201
- restart
246
+ reset
202
247
 
203
248
  sleep(1) && retry
204
- ensure
205
- @consumer_group_coordinator.unlock
206
249
  end
207
250
 
208
251
  # Resumes processing of partitions that were paused due to an error.
@@ -212,6 +255,17 @@ module Karafka
212
255
  end
213
256
  end
214
257
 
258
+ # Polls messages within the time and amount boundaries defined in the settings and then
259
+ # builds karafka messages based on the raw rdkafka messages buffer returned by the
260
+ # `#batch_poll` method.
261
+ #
262
+ # @note There are two buffers, one for raw messages and one for "built" karafka messages
263
+ def poll_and_remap_messages
264
+ @messages_buffer.remap(
265
+ @client.batch_poll
266
+ )
267
+ end
268
+
215
269
  # Enqueues revoking jobs for partitions that were taken away from the running process.
216
270
  def build_and_schedule_revoked_jobs_for_revoked_partitions
217
271
  revoked_partitions = @client.rebalance_manager.revoked_partitions
@@ -223,6 +277,7 @@ module Karafka
223
277
 
224
278
  revoked_partitions.each do |topic, partitions|
225
279
  partitions.each do |partition|
280
+ @usage_tracker.revoke(topic, partition)
226
281
  @coordinators.revoke(topic, partition)
227
282
 
228
283
  # There may be a case where we have lost partition of which data we have never
@@ -230,7 +285,6 @@ module Karafka
230
285
  # here. In cases like this, we do not run a revocation job
231
286
  @executors.find_all(topic, partition).each do |executor|
232
287
  job = @jobs_builder.revoked(executor)
233
- job.before_enqueue
234
288
  jobs << job
235
289
  end
236
290
 
@@ -243,7 +297,10 @@ module Karafka
243
297
  end
244
298
  end
245
299
 
246
- @scheduler.schedule_revocation(jobs)
300
+ return if jobs.empty?
301
+
302
+ jobs.each(&:before_schedule)
303
+ @scheduler.on_schedule_revocation(jobs)
247
304
  end
248
305
 
249
306
  # Enqueues the shutdown jobs for all the executors that exist in our subscription group
@@ -252,32 +309,27 @@ module Karafka
252
309
 
253
310
  @executors.each do |executor|
254
311
  job = @jobs_builder.shutdown(executor)
255
- job.before_enqueue
256
312
  jobs << job
257
313
  end
258
314
 
259
- @scheduler.schedule_shutdown(jobs)
260
- end
315
+ return if jobs.empty?
261
316
 
262
- # Polls messages within the time and amount boundaries defined in the settings and then
263
- # builds karafka messages based on the raw rdkafka messages buffer returned by the
264
- # `#batch_poll` method.
265
- #
266
- # @note There are two buffers, one for raw messages and one for "built" karafka messages
267
- def poll_and_remap_messages
268
- @messages_buffer.remap(
269
- @client.batch_poll
270
- )
317
+ jobs.each(&:before_schedule)
318
+ @scheduler.on_schedule_shutdown(jobs)
271
319
  end
272
320
 
273
321
  # Takes the messages per topic partition and enqueues processing jobs in threads using
274
- # given scheduler.
275
- def build_and_schedule_consumption_jobs
322
+ # given scheduler. It also handles the idle jobs when filtering API removed all messages
323
+ # and we need to run house-keeping
324
+ def build_and_schedule_flow_jobs
276
325
  return if @messages_buffer.empty?
277
326
 
278
- jobs = []
327
+ consume_jobs = []
328
+ idle_jobs = []
279
329
 
280
330
  @messages_buffer.each do |topic, partition, messages|
331
+ @usage_tracker.track(topic, partition)
332
+
281
333
  coordinator = @coordinators.find_or_create(topic, partition)
282
334
  # Start work coordination for this topic partition
283
335
  coordinator.start(messages)
@@ -286,26 +338,93 @@ module Karafka
286
338
  # and it will not go through a standard lifecycle. Same applies to revoked and shutdown
287
339
  if messages.empty?
288
340
  executor = @executors.find_or_create(topic, partition, 0, coordinator)
289
- jobs << @jobs_builder.idle(executor)
341
+ idle_jobs << @jobs_builder.idle(executor)
290
342
  else
291
343
  @partitioner.call(topic, messages, coordinator) do |group_id, partition_messages|
292
344
  executor = @executors.find_or_create(topic, partition, group_id, coordinator)
293
345
  coordinator.increment
294
- jobs << @jobs_builder.consume(executor, partition_messages)
346
+ consume_jobs << @jobs_builder.consume(executor, partition_messages)
347
+ end
348
+ end
349
+ end
350
+
351
+ # We schedule the idle jobs before running the `#before_schedule` on the consume jobs so
352
+ # workers can already pick up the idle jobs while the `#before_schedule` on consumption
353
+ # jobs runs
354
+ unless idle_jobs.empty?
355
+ idle_jobs.each(&:before_schedule)
356
+ @scheduler.on_schedule_idle(idle_jobs)
357
+ end
358
+
359
+ unless consume_jobs.empty?
360
+ consume_jobs.each(&:before_schedule)
361
+ @scheduler.on_schedule_consumption(consume_jobs)
362
+ end
363
+ end
364
+
365
+ # Builds and schedules periodic jobs for topics partitions for which no messages were
366
+ # received recently. In case `Idle` job is invoked, we do not run periodic. Idle means that
367
+ # a complex flow kicked in and it was a user choice not to run consumption but messages were
368
+ # shipped.
369
+ def build_and_schedule_periodic_jobs
370
+ # Shortcut if periodic jobs are not used at all. No need to run the complex flow when it
371
+ # will never end up with anything. If periodics on any of the topics are not even defined,
372
+ # we can finish fast
373
+ @periodic_jobs ||= @subscription_group.topics.count(&:periodic_job?)
374
+
375
+ return if @periodic_jobs.zero?
376
+
377
+ jobs = []
378
+
379
+ # We select only currently assigned topics and partitions from the current subscription
380
+ # group as only those are of our interest. We then filter that to only pick those for whom
381
+ # we want to run periodic jobs and then we select only those that did not receive any
382
+ # messages recently. This ensures, that we do not tick close to recent arrival of messages
383
+ # but rather after certain period of inactivity
384
+ Karafka::App.assignments.each do |topic, partitions|
385
+ # Skip for assignments not from our subscription group
386
+ next unless topic.subscription_group == @subscription_group
387
+ # Skip if this topic does not have periodic jobs enabled
388
+ next unless topic.periodic_job?
389
+
390
+ topic_name = topic.name
391
+ interval = topic.periodic_job.interval
392
+
393
+ partitions.each do |partition|
394
+ # Skip if we were operating on a given topic partition recently
395
+ next if @usage_tracker.active?(topic_name, partition, interval)
396
+
397
+ coordinator = @coordinators.find_or_create(topic_name, partition)
398
+
399
+ # Do not tick if we do not want to tick during pauses
400
+ next if coordinator.paused? && !topic.periodic_job.during_pause?
401
+
402
+ # If we do not want to run periodics during retry flows, we should not
403
+ # Since this counter is incremented before processing, here it is always -1 from what
404
+ # we see in the consumer flow. This is why attempt 0 means that we will have first
405
+ # run (ok) but attempt 1 means, there was an error and we will retry
406
+ next if coordinator.attempt.positive? && !topic.periodic_job.during_retry?
407
+
408
+ # Track so we do not run periodic job again too soon
409
+ @usage_tracker.track(topic_name, partition)
410
+
411
+ @executors.find_all_or_create(topic_name, partition, coordinator).each do |executor|
412
+ jobs << @jobs_builder.periodic(executor)
295
413
  end
296
414
  end
297
415
  end
298
416
 
299
- jobs.each(&:before_enqueue)
417
+ return if jobs.empty?
300
418
 
301
- @scheduler.schedule_consumption(jobs)
419
+ jobs.each(&:before_schedule)
420
+ @scheduler.on_schedule_periodic(jobs)
302
421
  end
303
422
 
304
423
  # Waits for all the jobs from a given subscription group to finish before moving forward
305
424
  def wait
306
425
  @jobs_queue.wait(@subscription_group.id) do
307
426
  @events_poller.call
308
- @scheduler.manage
427
+ @scheduler.on_manage
309
428
  end
310
429
  end
311
430
 
@@ -322,7 +441,7 @@ module Karafka
322
441
  def wait_pinging(wait_until:, after_ping: -> {})
323
442
  until wait_until.call
324
443
  @client.ping
325
- @scheduler.manage
444
+ @scheduler.on_manage
326
445
 
327
446
  after_ping.call
328
447
  sleep(0.2)
@@ -333,13 +452,13 @@ module Karafka
333
452
  # `#fetch_loop` again. We just need to remember to also reset the runner as it is a long
334
453
  # running one, so with a new connection to Kafka, we need to initialize the state of the
335
454
  # runner and underlying consumers once again.
336
- def restart
455
+ def reset
337
456
  # If there was any problem with processing, before we reset things we need to make sure,
338
457
  # there are no jobs in the queue. Otherwise it could lead to leakage in between client
339
458
  # resetting.
340
459
  @jobs_queue.wait(@subscription_group.id)
341
460
  @jobs_queue.clear(@subscription_group.id)
342
- @scheduler.clear(@subscription_group.id)
461
+ @scheduler.on_clear(@subscription_group.id)
343
462
  @events_poller.reset
344
463
  @client.reset
345
464
  @coordinators.reset
@@ -6,8 +6,6 @@ module Karafka
6
6
  class ListenersBatch
7
7
  include Enumerable
8
8
 
9
- attr_reader :coordinators
10
-
11
9
  # @param jobs_queue [JobsQueue]
12
10
  # @return [ListenersBatch]
13
11
  def initialize(jobs_queue)
@@ -15,18 +13,9 @@ module Karafka
15
13
  # should be able to distribute work whenever any work is done in any of the listeners
16
14
  scheduler = App.config.internal.processing.scheduler_class.new(jobs_queue)
17
15
 
18
- @coordinators = []
19
-
20
16
  @batch = App.subscription_groups.flat_map do |_consumer_group, subscription_groups|
21
- consumer_group_coordinator = Connection::ConsumerGroupCoordinator.new(
22
- subscription_groups.size
23
- )
24
-
25
- @coordinators << consumer_group_coordinator
26
-
27
17
  subscription_groups.map do |subscription_group|
28
18
  Connection::Listener.new(
29
- consumer_group_coordinator,
30
19
  subscription_group,
31
20
  jobs_queue,
32
21
  scheduler
@@ -40,6 +29,11 @@ module Karafka
40
29
  def each(&block)
41
30
  @batch.each(&block)
42
31
  end
32
+
33
+ # @return [Array<Listener>] active listeners
34
+ def active
35
+ select(&:active?)
36
+ end
43
37
  end
44
38
  end
45
39
  end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ # Namespace for Kafka connection related logic
5
+ module Connection
6
+ # Connections manager responsible for starting and managing listeners connections
7
+ #
8
+ # In the OSS version it starts listeners as they are without any connection management or
9
+ # resources utilization supervision and shuts them down or quiets when time has come
10
+ class Manager
11
+ def initialize
12
+ @once_executions = Set.new
13
+ end
14
+
15
+ # Registers provided listeners and starts all of them
16
+ #
17
+ # @param listeners [Connection::ListenersBatch]
18
+ def register(listeners)
19
+ @listeners = listeners
20
+ @listeners.each(&:start!)
21
+ end
22
+
23
+ # @return [Boolean] true if all listeners are stopped
24
+ def done?
25
+ @listeners.all?(&:stopped?)
26
+ end
27
+
28
+ # Controls the state of listeners upon shutdown and quiet requests
29
+ # In both cases (quieting and shutdown) we first need to stop processing more work and tell
30
+ # listeners to become quiet (connected but not yielding messages) and then depending on
31
+ # whether we want to stop fully or just keep quiet we apply different flow.
32
+ #
33
+ # @note It is important to ensure, that all listeners from the same consumer group are always
34
+ # all quiet before we can fully shutdown given consumer group. Skipping this can cause
35
+ # `Timed out LeaveGroupRequest in flight` and other errors. For the simplification, we just
36
+ # quiet all and only then move forward.
37
+ #
38
+ # @note This manager works with the assumption, that all listeners are executed on register.
39
+ def control
40
+ # Do nothing until shutdown or quiet
41
+ return unless Karafka::App.done?
42
+
43
+ # When we are done processing, immediately quiet all the listeners so they do not pick up
44
+ # new work to do
45
+ once(:quiet!) { @listeners.each(&:quiet!) }
46
+
47
+ return unless @listeners.all?(&:quiet?)
48
+
49
+ # If we are in the process of moving to quiet state, we need to check it.
50
+ # Switch to quieted status only when all listeners are fully quieted and do nothing after
51
+ # that until further state changes
52
+ once(:quieted!) { Karafka::App.quieted! } if Karafka::App.quieting?
53
+
54
+ return if Karafka::App.quiet?
55
+
56
+ once(:stop!) { @listeners.each(&:stop!) }
57
+ end
58
+
59
+ private
60
+
61
+ # Runs code only once and never again
62
+ # @param args [Object] anything we want to use as a set of unique keys for given execution
63
+ def once(*args)
64
+ return if @once_executions.include?(args)
65
+
66
+ @once_executions << args
67
+
68
+ yield
69
+ end
70
+ end
71
+ end
72
+ end
@@ -67,6 +67,18 @@ module Karafka
67
67
  end
68
68
  end
69
69
 
70
+ # Checks if there are any messages from a given topic partition in the buffer
71
+ # @param topic [String] topic name
72
+ # @param partition [Integer] partition number
73
+ # @return [Boolean] true if there is at least one message from this topic partition,
74
+ # otherwise false
75
+ def present?(topic, partition)
76
+ return false unless @groups.include?(topic)
77
+ return false unless @groups[topic].include?(partition)
78
+
79
+ true
80
+ end
81
+
70
82
  # @return [Boolean] is the buffer empty or does it contain any messages
71
83
  def empty?
72
84
  @size.zero?
@@ -68,6 +68,23 @@ module Karafka
68
68
  end
69
69
  end
70
70
 
71
+ # Similar to `#query_watermark_offsets`.
72
+ #
73
+ # @param tpl [Rdkafka::Consumer::TopicPartitionList, nil] tpl or nil for full current
74
+ # assignment tpl usage
75
+ # @return [Rdkafka::Consumer::TopicPartitionList] tpl with committed offsets and metadata
76
+ def committed(tpl = nil)
77
+ c_config = @config.committed
78
+
79
+ with_broker_errors_retry(
80
+ # required to be in seconds, not ms
81
+ wait_time: c_config.wait_time / 1_000.to_f,
82
+ max_attempts: c_config.max_attempts
83
+ ) do
84
+ @wrapped.committed(tpl, c_config.timeout)
85
+ end
86
+ end
87
+
71
88
  private
72
89
 
73
90
  # Runs expected block of code with few retries on all_brokers_down
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ # Namespace for Kafka connection related logic
5
+ module Connection
6
+ # Listener connection status representation
7
+ class Status
8
+ # Available states and their transitions.
9
+ STATES = {
10
+ pending: :pending!,
11
+ starting: :start!,
12
+ running: :running!,
13
+ quieting: :quiet!,
14
+ quiet: :quieted!,
15
+ stopping: :stop!,
16
+ stopped: :stopped!
17
+ }.freeze
18
+
19
+ STATES.each do |state, transition|
20
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
21
+ # Moves status to a different state
22
+ def #{transition}
23
+ @mutex.synchronize do
24
+ # Do not allow reverse state transitions (we always go one way) or transition to the
25
+ # same state as currently
26
+ return if @status && STATES.keys.index(:#{state}) <= STATES.keys.index(@status)
27
+
28
+ @status = :#{state}
29
+ @conductor.signal
30
+ end
31
+ end
32
+
33
+ # @return [Boolean] are we in a given state
34
+ def #{state}?
35
+ @status == :#{state}
36
+ end
37
+ RUBY
38
+ end
39
+
40
+ def initialize
41
+ @mutex = Mutex.new
42
+ @conductor = Karafka::App.config.internal.connection.conductor
43
+ pending!
44
+ end
45
+
46
+ # If this listener was not even running, will just move it through states until final.
47
+ # If it was running, will start the stopping procedures.
48
+ # Will do nothing if it was already stopped
49
+ def stop!
50
+ if pending?
51
+ @status = :stopping
52
+ stopped!
53
+ elsif stopped?
54
+ nil
55
+ else
56
+ @status = :stopping
57
+ end
58
+ end
59
+
60
+ # Moves status back from stopped to pending (and only that). We should not be able to reset
61
+ # listeners that are not stopped
62
+ def reset!
63
+ return unless stopped?
64
+
65
+ @status = :pending
66
+ end
67
+
68
+ # @return [Boolean] listener is considered active when it has a client reference that may
69
+ # be active and connected to Kafka
70
+ def active?
71
+ !pending? && !stopped?
72
+ end
73
+ end
74
+ end
75
+ end
@@ -51,17 +51,21 @@ module Karafka
51
51
  required(:tick_interval) { |val| val.is_a?(Integer) && val >= 1_000 }
52
52
 
53
53
  nested(:connection) do
54
- nested(:proxy) do
55
- nested(:query_watermark_offsets) do
56
- required(:timeout) { |val| val.is_a?(Integer) && val.positive? }
57
- required(:max_attempts) { |val| val.is_a?(Integer) && val.positive? }
58
- required(:wait_time) { |val| val.is_a?(Integer) && val.positive? }
59
- end
54
+ required(:manager) { |val| !val.nil? }
55
+ required(:conductor) { |val| !val.nil? }
60
56
 
61
- nested(:offsets_for_times) do
62
- required(:timeout) { |val| val.is_a?(Integer) && val.positive? }
63
- required(:max_attempts) { |val| val.is_a?(Integer) && val.positive? }
64
- required(:wait_time) { |val| val.is_a?(Integer) && val.positive? }
57
+ nested(:proxy) do
58
+ # All of them have the same requirements
59
+ %i[
60
+ query_watermark_offsets
61
+ offsets_for_times
62
+ committed
63
+ ].each do |scope|
64
+ nested(scope) do
65
+ required(:timeout) { |val| val.is_a?(Integer) && val.positive? }
66
+ required(:max_attempts) { |val| val.is_a?(Integer) && val.positive? }
67
+ required(:wait_time) { |val| val.is_a?(Integer) && val.positive? }
68
+ end
65
69
  end
66
70
  end
67
71
  end
@@ -18,7 +18,7 @@ module Karafka
18
18
  virtual do |data, errors|
19
19
  next unless errors.empty?
20
20
 
21
- names = data.fetch(:topics).map { |topic| topic[:name] }
21
+ names = data.fetch(:topics).map { |topic| topic_unique_key(topic) }
22
22
 
23
23
  next if names.size == names.uniq.size
24
24
 
@@ -51,6 +51,14 @@ module Karafka
51
51
 
52
52
  [[%i[topics], :topics_namespaced_names_not_unique]]
53
53
  end
54
+
55
+ class << self
56
+ # @param topic [Hash] topic config hash
57
+ # @return [String] topic unique key for validators
58
+ def topic_unique_key(topic)
59
+ topic[:name]
60
+ end
61
+ end
54
62
  end
55
63
  end
56
64
  end
@@ -20,7 +20,9 @@ module Karafka
20
20
  required(:max_wait_time) { |val| val.is_a?(Integer) && val >= 10 }
21
21
  required(:name) { |val| val.is_a?(String) && Contracts::TOPIC_REGEXP.match?(val) }
22
22
  required(:active) { |val| [true, false].include?(val) }
23
- required(:subscription_group_name) { |val| val.is_a?(String) && !val.empty? }
23
+ nested(:subscription_group_details) do
24
+ required(:name) { |val| val.is_a?(String) && !val.empty? }
25
+ end
24
26
 
25
27
  # Consumer needs to be present only if topic is active
26
28
  # We allow not to define consumer for non-active because they may be only used via admin