karafka 2.2.14 → 2.3.0.alpha2

Files changed (107)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.github/workflows/ci.yml +38 -12
  4. data/.ruby-version +1 -1
  5. data/CHANGELOG.md +24 -0
  6. data/Gemfile.lock +16 -16
  7. data/README.md +0 -2
  8. data/SECURITY.md +23 -0
  9. data/bin/integrations +1 -1
  10. data/config/locales/errors.yml +7 -1
  11. data/config/locales/pro_errors.yml +22 -0
  12. data/docker-compose.yml +1 -1
  13. data/karafka.gemspec +2 -2
  14. data/lib/karafka/admin/acl.rb +287 -0
  15. data/lib/karafka/admin.rb +9 -13
  16. data/lib/karafka/app.rb +5 -3
  17. data/lib/karafka/base_consumer.rb +9 -1
  18. data/lib/karafka/cli/base.rb +1 -1
  19. data/lib/karafka/connection/client.rb +83 -76
  20. data/lib/karafka/connection/conductor.rb +28 -0
  21. data/lib/karafka/connection/listener.rb +159 -42
  22. data/lib/karafka/connection/listeners_batch.rb +5 -11
  23. data/lib/karafka/connection/manager.rb +72 -0
  24. data/lib/karafka/connection/messages_buffer.rb +12 -0
  25. data/lib/karafka/connection/proxy.rb +17 -0
  26. data/lib/karafka/connection/status.rb +75 -0
  27. data/lib/karafka/contracts/config.rb +14 -10
  28. data/lib/karafka/contracts/consumer_group.rb +9 -1
  29. data/lib/karafka/contracts/topic.rb +3 -1
  30. data/lib/karafka/errors.rb +17 -0
  31. data/lib/karafka/instrumentation/logger_listener.rb +3 -0
  32. data/lib/karafka/instrumentation/notifications.rb +13 -5
  33. data/lib/karafka/instrumentation/vendors/appsignal/metrics_listener.rb +31 -28
  34. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +20 -1
  35. data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +15 -12
  36. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +39 -36
  37. data/lib/karafka/pro/base_consumer.rb +47 -0
  38. data/lib/karafka/pro/connection/manager.rb +269 -0
  39. data/lib/karafka/pro/connection/multiplexing/listener.rb +40 -0
  40. data/lib/karafka/pro/iterator/tpl_builder.rb +1 -1
  41. data/lib/karafka/pro/iterator.rb +1 -6
  42. data/lib/karafka/pro/loader.rb +14 -0
  43. data/lib/karafka/pro/processing/coordinator.rb +2 -1
  44. data/lib/karafka/pro/processing/executor.rb +37 -0
  45. data/lib/karafka/pro/processing/expansions_selector.rb +32 -0
  46. data/lib/karafka/pro/processing/jobs/periodic.rb +41 -0
  47. data/lib/karafka/pro/processing/jobs/periodic_non_blocking.rb +32 -0
  48. data/lib/karafka/pro/processing/jobs_builder.rb +14 -3
  49. data/lib/karafka/pro/processing/offset_metadata/consumer.rb +44 -0
  50. data/lib/karafka/pro/processing/offset_metadata/fetcher.rb +131 -0
  51. data/lib/karafka/pro/processing/offset_metadata/listener.rb +46 -0
  52. data/lib/karafka/pro/processing/schedulers/base.rb +39 -23
  53. data/lib/karafka/pro/processing/schedulers/default.rb +12 -14
  54. data/lib/karafka/pro/processing/strategies/default.rb +154 -1
  55. data/lib/karafka/pro/processing/strategies/dlq/default.rb +39 -0
  56. data/lib/karafka/pro/processing/strategies/vp/default.rb +65 -25
  57. data/lib/karafka/pro/processing/virtual_offset_manager.rb +41 -11
  58. data/lib/karafka/pro/routing/features/long_running_job/topic.rb +2 -0
  59. data/lib/karafka/pro/routing/features/multiplexing/config.rb +38 -0
  60. data/lib/karafka/pro/routing/features/multiplexing/contracts/topic.rb +114 -0
  61. data/lib/karafka/pro/routing/features/multiplexing/patches/contracts/consumer_group.rb +42 -0
  62. data/lib/karafka/pro/routing/features/multiplexing/proxy.rb +38 -0
  63. data/lib/karafka/pro/routing/features/multiplexing/subscription_group.rb +42 -0
  64. data/lib/karafka/pro/routing/features/multiplexing/subscription_groups_builder.rb +40 -0
  65. data/lib/karafka/pro/routing/features/multiplexing.rb +59 -0
  66. data/lib/karafka/pro/routing/features/non_blocking_job/topic.rb +32 -0
  67. data/lib/karafka/pro/routing/features/non_blocking_job.rb +37 -0
  68. data/lib/karafka/pro/routing/features/offset_metadata/config.rb +33 -0
  69. data/lib/karafka/pro/routing/features/offset_metadata/contracts/topic.rb +42 -0
  70. data/lib/karafka/pro/routing/features/offset_metadata/topic.rb +65 -0
  71. data/lib/karafka/pro/routing/features/offset_metadata.rb +40 -0
  72. data/lib/karafka/pro/routing/features/patterns/contracts/consumer_group.rb +4 -0
  73. data/lib/karafka/pro/routing/features/patterns/detector.rb +18 -10
  74. data/lib/karafka/pro/routing/features/periodic_job/config.rb +37 -0
  75. data/lib/karafka/pro/routing/features/periodic_job/contracts/topic.rb +44 -0
  76. data/lib/karafka/pro/routing/features/periodic_job/topic.rb +94 -0
  77. data/lib/karafka/pro/routing/features/periodic_job.rb +27 -0
  78. data/lib/karafka/pro/routing/features/virtual_partitions/config.rb +1 -0
  79. data/lib/karafka/pro/routing/features/virtual_partitions/contracts/topic.rb +1 -0
  80. data/lib/karafka/pro/routing/features/virtual_partitions/topic.rb +7 -2
  81. data/lib/karafka/process.rb +5 -3
  82. data/lib/karafka/processing/coordinator.rb +5 -1
  83. data/lib/karafka/processing/executor.rb +16 -10
  84. data/lib/karafka/processing/executors_buffer.rb +19 -4
  85. data/lib/karafka/processing/schedulers/default.rb +3 -2
  86. data/lib/karafka/processing/strategies/default.rb +6 -0
  87. data/lib/karafka/processing/strategies/dlq.rb +36 -0
  88. data/lib/karafka/routing/builder.rb +12 -2
  89. data/lib/karafka/routing/consumer_group.rb +5 -5
  90. data/lib/karafka/routing/features/base.rb +44 -8
  91. data/lib/karafka/routing/features/dead_letter_queue/config.rb +6 -1
  92. data/lib/karafka/routing/features/dead_letter_queue/contracts/topic.rb +1 -0
  93. data/lib/karafka/routing/features/dead_letter_queue/topic.rb +9 -2
  94. data/lib/karafka/routing/subscription_group.rb +2 -2
  95. data/lib/karafka/routing/subscription_groups_builder.rb +11 -2
  96. data/lib/karafka/routing/topic.rb +8 -10
  97. data/lib/karafka/runner.rb +13 -3
  98. data/lib/karafka/server.rb +5 -9
  99. data/lib/karafka/setup/config.rb +17 -0
  100. data/lib/karafka/status.rb +23 -14
  101. data/lib/karafka/templates/karafka.rb.erb +7 -0
  102. data/lib/karafka/time_trackers/partition_usage.rb +56 -0
  103. data/lib/karafka/version.rb +1 -1
  104. data.tar.gz.sig +0 -0
  105. metadata +42 -10
  106. metadata.gz.sig +0 -0
  107. data/lib/karafka/connection/consumer_group_coordinator.rb +0 -48
data/lib/karafka/connection/client.rb
@@ -10,6 +10,10 @@ module Karafka
  class Client
  attr_reader :rebalance_manager
 
+ # @return [Karafka::Routing::SubscriptionGroup] subscription group to which this client
+ # belongs to
+ attr_reader :subscription_group
+
  # @return [String] underlying consumer name
  # @note Consumer name may change in case we regenerate it
  attr_reader :name
@@ -20,16 +24,7 @@ module Karafka
  # How many times should we retry polling in case of a failure
  MAX_POLL_RETRIES = 20
 
- # 1 minute of max wait for the first rebalance before a forceful attempt
- # This applies only to a case when a short-lived Karafka instance with a client would be
- # closed before first rebalance. Mitigates a librdkafka bug.
- COOPERATIVE_STICKY_MAX_WAIT = 60_000
-
- # We want to make sure we never close several clients in the same moment to prevent
- # potential race conditions and other issues
- SHUTDOWN_MUTEX = Mutex.new
-
- private_constant :MAX_POLL_RETRIES, :SHUTDOWN_MUTEX, :COOPERATIVE_STICKY_MAX_WAIT
+ private_constant :MAX_POLL_RETRIES
 
  # Creates a new consumer instance.
  #
@@ -47,7 +42,6 @@ module Karafka
  @rebalance_manager = RebalanceManager.new(@subscription_group.id)
  @rebalance_callback = Instrumentation::Callbacks::Rebalance.new(@subscription_group)
  @events_poller = Helpers::IntervalRunner.new { events_poll }
- @kafka = build_consumer
  # There are few operations that can happen in parallel from the listener threads as well
  # as from the workers. They are not fully thread-safe because they may be composed out of
  # few calls to Kafka or out of few internal state changes. That is why we mutex them.
@@ -122,13 +116,19 @@ module Karafka
  # Stores offset for a given partition of a given topic based on the provided message.
  #
  # @param message [Karafka::Messages::Message]
- def store_offset(message)
- internal_store_offset(message)
+ # @param offset_metadata [String, nil] offset storage metadata or nil if none
+ def store_offset(message, offset_metadata = nil)
+ internal_store_offset(message, offset_metadata)
  end
 
  # @return [Boolean] true if our current assignment has been lost involuntarily.
  def assignment_lost?
- @kafka.assignment_lost?
+ kafka.assignment_lost?
+ end
+
+ # @return [Rdkafka::Consumer::TopicPartitionList] current active assignment
+ def assignment
+ kafka.assignment
  end
 
  # Commits the offset on a current consumer in a non-blocking or blocking way.
@@ -199,7 +199,7 @@ module Karafka
 
  @paused_tpls[topic][partition] = tpl
 
- @kafka.pause(tpl)
+ kafka.pause(tpl)
 
  # If offset is not provided, will pause where it finished.
  # This makes librdkafka not purge buffers and can provide significant network savings
@@ -240,43 +240,23 @@ module Karafka
  partition: partition
  )
 
- @kafka.resume(tpl)
+ kafka.resume(tpl)
  end
  end
 
  # Gracefully stops topic consumption.
- #
- # @note Stopping running consumers without a really important reason is not recommended
- # as until all the consumers are stopped, the server will keep running serving only
- # part of the messages
  def stop
- # This ensures, that we do not stop the underlying client until it passes the first
- # rebalance for cooperative-sticky. Otherwise librdkafka may crash
- #
- # We set a timeout just in case the rebalance would never happen or would last for an
- # extensive time period.
- #
- # @see https://github.com/confluentinc/librdkafka/issues/4312
+ # In case of cooperative-sticky, there is a bug in librdkafka that may hang it.
+ # To mitigate it we first need to unsubscribe so we will not receive any assignments and
+ # only then we should be good to go.
+ # @see https://github.com/confluentinc/librdkafka/issues/4527
  if @subscription_group.kafka[:'partition.assignment.strategy'] == 'cooperative-sticky'
- active_wait = false
-
- (COOPERATIVE_STICKY_MAX_WAIT / 100).times do
- # If we're past the first rebalance, no need to wait
- if @rebalance_manager.active?
- # We give it a a bit of time because librdkafka has a tendency to do some-post
- # callback work that from its perspective is still under rebalance
- sleep(5) if active_wait
-
- break
- end
-
- active_wait = true
-
- # poll to trigger potential rebalances that could occur during stopping and to trigger
- # potential callbacks
- poll(100)
+ unsubscribe
 
+ until assignment.empty?
  sleep(0.1)
+
+ ping
  end
  end
 
@@ -285,21 +265,23 @@ module Karafka
 
  # Marks given message as consumed.
  #
- # @param [Karafka::Messages::Message] message that we want to mark as processed
+ # @param message [Karafka::Messages::Message] message that we want to mark as processed
+ # @param metadata [String, nil] offset storage metadata or nil if none
  # @return [Boolean] true if successful. False if we no longer own given partition
  # @note This method won't trigger automatic offsets commits, rather relying on the offset
  # check-pointing trigger that happens with each batch processed. It will however check the
  # `librdkafka` assignment ownership to increase accuracy for involuntary revocations.
- def mark_as_consumed(message)
- store_offset(message) && !assignment_lost?
+ def mark_as_consumed(message, metadata = nil)
+ store_offset(message, metadata) && !assignment_lost?
  end
 
  # Marks a given message as consumed and commits the offsets in a blocking way.
  #
- # @param [Karafka::Messages::Message] message that we want to mark as processed
+ # @param message [Karafka::Messages::Message] message that we want to mark as processed
+ # @param metadata [String, nil] offset storage metadata or nil if none
  # @return [Boolean] true if successful. False if we no longer own given partition
- def mark_as_consumed!(message)
- return false unless mark_as_consumed(message)
+ def mark_as_consumed!(message, metadata = nil)
+ return false unless mark_as_consumed(message, metadata)
 
  commit_offsets!
  end
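
For orientation, a minimal usage sketch of the new metadata argument. The `client` and `message` variables are hypothetical stand-ins for a `Karafka::Connection::Client` instance and a consumed `Karafka::Messages::Message`; the JSON payload is only an example of what one might store, not something the diff prescribes.

```ruby
require 'json'
require 'time'

# Hypothetical example: attach a small JSON payload as offset storage metadata.
# It is stored in Kafka next to the committed offset and can be read back later
# (see the new #committed method further down in this diff).
metadata = { 'processed_at' => Time.now.utc.iso8601, 'worker' => 'worker-1' }.to_json

# Non-blocking: stores the offset plus metadata for the periodic offset commit
client.mark_as_consumed(message, metadata)

# Blocking variant: stores the offset plus metadata and commits synchronously
client.mark_as_consumed!(message, metadata)
```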
@@ -316,7 +298,6 @@ module Karafka
  @events_poller.reset
  @closed = false
  @paused_tpls.clear
- @kafka = build_consumer
  end
  end
 
@@ -343,7 +324,27 @@ module Karafka
  # @note It is non-blocking when timeout 0 and will not wait if queue empty. It costs up to
  # 2ms when no callbacks are triggered.
  def events_poll(timeout = 0)
- @kafka.events_poll(timeout)
+ kafka.events_poll(timeout)
+ end
+
+ # Returns pointer to the consumer group metadata. It is used only in the context of
+ # exactly-once-semantics in transactions, this is why it is never remapped to Ruby
+ # @return [FFI::Pointer]
+ def consumer_group_metadata_pointer
+ kafka.consumer_group_metadata_pointer
+ end
+
+ # Return the current committed offset per partition for this consumer group.
+ # The offset field of each requested partition will either be set to stored offset or to
+ # -1001 in case there was no stored offset for that partition.
+ #
+ # @param tpl [Rdkafka::Consumer::TopicPartitionList] for which we want to get committed
+ # @return [Rdkafka::Consumer::TopicPartitionList]
+ # @raise [Rdkafka::RdkafkaError] When getting the committed positions fails.
+ # @note It is recommended to use this only on rebalances to get positions with metadata
+ # when working with metadata as this is synchronous
+ def committed(tpl = nil)
+ Proxy.new(kafka).committed(tpl)
  end
 
  private
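
A rough sketch of how the new `#committed` wrapper might be used to inspect committed positions for the current assignment. The `client` variable is a hypothetical `Karafka::Connection::Client` instance; presumably the Pro offset-metadata fetcher listed in the files above builds on this call.

```ruby
# Hypothetical example: on a rebalance, read back the committed positions for
# whatever is currently assigned to this consumer. Per the docs above, partitions
# without a stored offset come back with offset -1001.
tpl = client.assignment
committed = client.committed(tpl)

committed.to_h.each do |topic, partitions|
  partitions.each do |partition|
    puts "#{topic}##{partition.partition} committed at #{partition.offset}"
  end
end
```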
@@ -352,9 +353,10 @@ module Karafka
  #
  # Non thread-safe offset storing method
  # @param message [Karafka::Messages::Message]
+ # @param metadata [String, nil] offset storage metadata or nil if none
  # @return [Boolean] true if we could store the offset (if we still own the partition)
- def internal_store_offset(message)
- @kafka.store_offset(message)
+ def internal_store_offset(message, metadata)
+ kafka.store_offset(message, metadata)
  true
  rescue Rdkafka::RdkafkaError => e
  return false if e.code == :assignment_lost
@@ -370,7 +372,7 @@ module Karafka
  # even when no stored, because with sync commit, it refreshes the ownership state of the
  # consumer in a sync way.
  def internal_commit_offsets(async: true)
- @kafka.commit(nil, async)
+ kafka.commit(nil, async)
 
  true
  rescue Rdkafka::RdkafkaError => e
@@ -407,7 +409,7 @@ module Karafka
  message.partition => message.offset
  )
 
- proxy = Proxy.new(@kafka)
+ proxy = Proxy.new(kafka)
 
  # Now we can overwrite the seek message offset with our resolved offset and we can
  # then seek to the appropriate message
@@ -429,29 +431,29 @@ module Karafka
  # seeking and pausing
  return if message.offset == topic_partition_position(message.topic, message.partition)
 
- @kafka.seek(message)
+ kafka.seek(message)
  end
 
  # Commits the stored offsets in a sync way and closes the consumer.
  def close
- # Allow only one client to be closed at the same time
- SHUTDOWN_MUTEX.synchronize do
- # Once client is closed, we should not close it again
- # This could only happen in case of a race-condition when forceful shutdown happens
- # and triggers this from a different thread
- return if @closed
+ # Once client is closed, we should not close it again
+ # This could only happen in case of a race-condition when forceful shutdown happens
+ # and triggers this from a different thread
+ return if @closed
 
- @closed = true
+ @closed = true
 
- # Remove callbacks runners that were registered
- ::Karafka::Core::Instrumentation.statistics_callbacks.delete(@subscription_group.id)
- ::Karafka::Core::Instrumentation.error_callbacks.delete(@subscription_group.id)
+ return unless @kafka
 
- @kafka.close
- @buffer.clear
- # @note We do not clear rebalance manager here as we may still have revocation info
- # here that we want to consider valid prior to running another reconnection
- end
+ # Remove callbacks runners that were registered
+ ::Karafka::Core::Instrumentation.statistics_callbacks.delete(@subscription_group.id)
+ ::Karafka::Core::Instrumentation.error_callbacks.delete(@subscription_group.id)
+
+ kafka.close
+ @kafka = nil
+ @buffer.clear
+ # @note We do not clear rebalance manager here as we may still have revocation info
+ # here that we want to consider valid prior to running another reconnection
  end
 
  # Unsubscribes from all the subscriptions
@@ -459,7 +461,7 @@ module Karafka
  # @note We do not re-raise since this is supposed to be only used on close and can be safely
  # ignored. We do however want to instrument on it
  def unsubscribe
- @kafka.unsubscribe
+ kafka.unsubscribe
  rescue ::Rdkafka::RdkafkaError => e
  Karafka.monitor.instrument(
  'error.occurred',
@@ -473,7 +475,7 @@ module Karafka
  # @param partition [Integer]
  # @return [Rdkafka::Consumer::TopicPartitionList]
  def topic_partition_list(topic, partition)
- rdkafka_partition = @kafka
+ rdkafka_partition = kafka
  .assignment
  .to_h[topic]
  &.detect { |part| part.partition == partition }
@@ -492,7 +494,7 @@ module Karafka
  rd_partition = ::Rdkafka::Consumer::Partition.new(partition, nil, 0)
  tpl = ::Rdkafka::Consumer::TopicPartitionList.new(topic => [rd_partition])
 
- @kafka.position(tpl).to_h.fetch(topic).first.offset || -1
+ kafka.position(tpl).to_h.fetch(topic).first.offset || -1
  end
 
  # Performs a single poll operation and handles retries and errors
@@ -520,7 +522,7 @@ module Karafka
  # blocking events from being handled.
  poll_tick = timeout > @tick_interval ? @tick_interval : timeout
 
- result = @kafka.poll(poll_tick)
+ result = kafka.poll(poll_tick)
 
  # If we've got a message, we can return it
  return result if result
@@ -647,6 +649,11 @@ module Karafka
 
  @buffer.uniq!
  end
+
+ # @return [Rdkafka::Consumer] librdkafka consumer instance
+ def kafka
+ @kafka ||= build_consumer
+ end
  end
  end
  end
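
With `build_consumer` no longer invoked in `initialize` or `reset`, the rdkafka consumer is now created lazily on first use and dropped on `close` (note the `@kafka = nil` above). A simplified, self-contained sketch of that pattern; `LazyClient` and `FakeConsumer` are made-up names for illustration, not part of Karafka.

```ruby
# Simplified illustration of the lazy build/teardown pattern used by the client.
FakeConsumer = Struct.new(:id) do
  def close; end
end

class LazyClient
  # Closes the underlying consumer (if it was ever built) and drops the
  # reference, so the next #kafka call transparently rebuilds it
  def close
    return unless @kafka

    @kafka.close
    @kafka = nil
  end

  private

  # Builds the underlying consumer only when something actually needs it
  def kafka
    @kafka ||= FakeConsumer.new(rand(1000))
  end
end
```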
data/lib/karafka/connection/conductor.rb (new file)
@@ -0,0 +1,28 @@
+ # frozen_string_literal: true
+
+ module Karafka
+ module Connection
+ # Conductor is responsible for time orchestration of listeners manager.
+ # It blocks when manager is not needed as there were no state changes that could cause any
+ # listeners config changes and unblocks when things change or when certain time passed.
+ # The time based unblocking allows for building of complex managers that could be state aware
+ class Conductor
+ # @param max_interval [Integer] after how many milliseconds of doing nothing should we wake
+ # up the manager despite no state changes
+ def initialize(max_interval = 30_000)
+ @lock = RUBY_VERSION < '3.2' ? Processing::TimedQueue.new : Queue.new
+ @timeout = max_interval / 1_000.0
+ end
+
+ # Waits in a blocking way until it is time to manage listeners
+ def wait
+ @lock.pop(timeout: @timeout)
+ end
+
+ # Releases wait lock on state change
+ def signal
+ @lock << true
+ end
+ end
+ end
+ end
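
The `RUBY_VERSION` branch exists because `Queue#pop` only gained its `timeout:` keyword in Ruby 3.2, hence the `Processing::TimedQueue` fallback for older Rubies. Below is a rough, hypothetical sketch of how a manager loop could be driven by `wait`/`signal`; only the two Conductor methods come from this diff, everything else is made up for illustration and assumes the class above is loaded under Ruby 3.2+.

```ruby
# Hypothetical orchestration sketch for Conductor#wait / #signal.
conductor = Karafka::Connection::Conductor.new(5_000)
running = true

manager = Thread.new do
  while running
    # Blocks for up to 5 seconds, or until something calls conductor.signal
    conductor.wait
    puts 'rechecking listeners...'
  end
end

# Any relevant state change (e.g. a quiet or stop request) wakes the manager
# immediately instead of waiting out the interval
conductor.signal

sleep(0.2)
running = false
conductor.signal
manager.join
```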