karafka 2.2.14 → 2.3.0.alpha2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.github/workflows/ci.yml +38 -12
  4. data/.ruby-version +1 -1
  5. data/CHANGELOG.md +24 -0
  6. data/Gemfile.lock +16 -16
  7. data/README.md +0 -2
  8. data/SECURITY.md +23 -0
  9. data/bin/integrations +1 -1
  10. data/config/locales/errors.yml +7 -1
  11. data/config/locales/pro_errors.yml +22 -0
  12. data/docker-compose.yml +1 -1
  13. data/karafka.gemspec +2 -2
  14. data/lib/karafka/admin/acl.rb +287 -0
  15. data/lib/karafka/admin.rb +9 -13
  16. data/lib/karafka/app.rb +5 -3
  17. data/lib/karafka/base_consumer.rb +9 -1
  18. data/lib/karafka/cli/base.rb +1 -1
  19. data/lib/karafka/connection/client.rb +83 -76
  20. data/lib/karafka/connection/conductor.rb +28 -0
  21. data/lib/karafka/connection/listener.rb +159 -42
  22. data/lib/karafka/connection/listeners_batch.rb +5 -11
  23. data/lib/karafka/connection/manager.rb +72 -0
  24. data/lib/karafka/connection/messages_buffer.rb +12 -0
  25. data/lib/karafka/connection/proxy.rb +17 -0
  26. data/lib/karafka/connection/status.rb +75 -0
  27. data/lib/karafka/contracts/config.rb +14 -10
  28. data/lib/karafka/contracts/consumer_group.rb +9 -1
  29. data/lib/karafka/contracts/topic.rb +3 -1
  30. data/lib/karafka/errors.rb +17 -0
  31. data/lib/karafka/instrumentation/logger_listener.rb +3 -0
  32. data/lib/karafka/instrumentation/notifications.rb +13 -5
  33. data/lib/karafka/instrumentation/vendors/appsignal/metrics_listener.rb +31 -28
  34. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +20 -1
  35. data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +15 -12
  36. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +39 -36
  37. data/lib/karafka/pro/base_consumer.rb +47 -0
  38. data/lib/karafka/pro/connection/manager.rb +269 -0
  39. data/lib/karafka/pro/connection/multiplexing/listener.rb +40 -0
  40. data/lib/karafka/pro/iterator/tpl_builder.rb +1 -1
  41. data/lib/karafka/pro/iterator.rb +1 -6
  42. data/lib/karafka/pro/loader.rb +14 -0
  43. data/lib/karafka/pro/processing/coordinator.rb +2 -1
  44. data/lib/karafka/pro/processing/executor.rb +37 -0
  45. data/lib/karafka/pro/processing/expansions_selector.rb +32 -0
  46. data/lib/karafka/pro/processing/jobs/periodic.rb +41 -0
  47. data/lib/karafka/pro/processing/jobs/periodic_non_blocking.rb +32 -0
  48. data/lib/karafka/pro/processing/jobs_builder.rb +14 -3
  49. data/lib/karafka/pro/processing/offset_metadata/consumer.rb +44 -0
  50. data/lib/karafka/pro/processing/offset_metadata/fetcher.rb +131 -0
  51. data/lib/karafka/pro/processing/offset_metadata/listener.rb +46 -0
  52. data/lib/karafka/pro/processing/schedulers/base.rb +39 -23
  53. data/lib/karafka/pro/processing/schedulers/default.rb +12 -14
  54. data/lib/karafka/pro/processing/strategies/default.rb +154 -1
  55. data/lib/karafka/pro/processing/strategies/dlq/default.rb +39 -0
  56. data/lib/karafka/pro/processing/strategies/vp/default.rb +65 -25
  57. data/lib/karafka/pro/processing/virtual_offset_manager.rb +41 -11
  58. data/lib/karafka/pro/routing/features/long_running_job/topic.rb +2 -0
  59. data/lib/karafka/pro/routing/features/multiplexing/config.rb +38 -0
  60. data/lib/karafka/pro/routing/features/multiplexing/contracts/topic.rb +114 -0
  61. data/lib/karafka/pro/routing/features/multiplexing/patches/contracts/consumer_group.rb +42 -0
  62. data/lib/karafka/pro/routing/features/multiplexing/proxy.rb +38 -0
  63. data/lib/karafka/pro/routing/features/multiplexing/subscription_group.rb +42 -0
  64. data/lib/karafka/pro/routing/features/multiplexing/subscription_groups_builder.rb +40 -0
  65. data/lib/karafka/pro/routing/features/multiplexing.rb +59 -0
  66. data/lib/karafka/pro/routing/features/non_blocking_job/topic.rb +32 -0
  67. data/lib/karafka/pro/routing/features/non_blocking_job.rb +37 -0
  68. data/lib/karafka/pro/routing/features/offset_metadata/config.rb +33 -0
  69. data/lib/karafka/pro/routing/features/offset_metadata/contracts/topic.rb +42 -0
  70. data/lib/karafka/pro/routing/features/offset_metadata/topic.rb +65 -0
  71. data/lib/karafka/pro/routing/features/offset_metadata.rb +40 -0
  72. data/lib/karafka/pro/routing/features/patterns/contracts/consumer_group.rb +4 -0
  73. data/lib/karafka/pro/routing/features/patterns/detector.rb +18 -10
  74. data/lib/karafka/pro/routing/features/periodic_job/config.rb +37 -0
  75. data/lib/karafka/pro/routing/features/periodic_job/contracts/topic.rb +44 -0
  76. data/lib/karafka/pro/routing/features/periodic_job/topic.rb +94 -0
  77. data/lib/karafka/pro/routing/features/periodic_job.rb +27 -0
  78. data/lib/karafka/pro/routing/features/virtual_partitions/config.rb +1 -0
  79. data/lib/karafka/pro/routing/features/virtual_partitions/contracts/topic.rb +1 -0
  80. data/lib/karafka/pro/routing/features/virtual_partitions/topic.rb +7 -2
  81. data/lib/karafka/process.rb +5 -3
  82. data/lib/karafka/processing/coordinator.rb +5 -1
  83. data/lib/karafka/processing/executor.rb +16 -10
  84. data/lib/karafka/processing/executors_buffer.rb +19 -4
  85. data/lib/karafka/processing/schedulers/default.rb +3 -2
  86. data/lib/karafka/processing/strategies/default.rb +6 -0
  87. data/lib/karafka/processing/strategies/dlq.rb +36 -0
  88. data/lib/karafka/routing/builder.rb +12 -2
  89. data/lib/karafka/routing/consumer_group.rb +5 -5
  90. data/lib/karafka/routing/features/base.rb +44 -8
  91. data/lib/karafka/routing/features/dead_letter_queue/config.rb +6 -1
  92. data/lib/karafka/routing/features/dead_letter_queue/contracts/topic.rb +1 -0
  93. data/lib/karafka/routing/features/dead_letter_queue/topic.rb +9 -2
  94. data/lib/karafka/routing/subscription_group.rb +2 -2
  95. data/lib/karafka/routing/subscription_groups_builder.rb +11 -2
  96. data/lib/karafka/routing/topic.rb +8 -10
  97. data/lib/karafka/runner.rb +13 -3
  98. data/lib/karafka/server.rb +5 -9
  99. data/lib/karafka/setup/config.rb +17 -0
  100. data/lib/karafka/status.rb +23 -14
  101. data/lib/karafka/templates/karafka.rb.erb +7 -0
  102. data/lib/karafka/time_trackers/partition_usage.rb +56 -0
  103. data/lib/karafka/version.rb +1 -1
  104. data.tar.gz.sig +0 -0
  105. metadata +42 -10
  106. metadata.gz.sig +0 -0
  107. data/lib/karafka/connection/consumer_group_coordinator.rb +0 -48
@@ -247,6 +247,9 @@ module Karafka
247
247
  when 'consumer.shutdown.error'
248
248
  error "Consumer on shutdown failed due to an error: #{error}"
249
249
  error details
250
+ when 'consumer.tick.error'
251
+ error "Consumer tick failed due to an error: #{error}"
252
+ error details
250
253
  when 'worker.process.error'
251
254
  fatal "Worker processing failed due to an error: #{error}"
252
255
  fatal details
@@ -36,22 +36,25 @@ module Karafka
36
36
  connection.listener.before_fetch_loop
37
37
  connection.listener.fetch_loop
38
38
  connection.listener.fetch_loop.received
39
-
40
- rebalance.partitions_assign
41
- rebalance.partitions_assigned
42
- rebalance.partitions_revoke
43
- rebalance.partitions_revoked
39
+ connection.listener.after_fetch_loop
44
40
 
45
41
  consumer.before_schedule_consume
46
42
  consumer.consume
47
43
  consumer.consumed
48
44
  consumer.consuming.pause
49
45
  consumer.consuming.retry
46
+
50
47
  consumer.before_schedule_idle
51
48
  consumer.idle
49
+
52
50
  consumer.before_schedule_revoked
53
51
  consumer.revoke
54
52
  consumer.revoked
53
+
54
+ consumer.before_schedule_tick
55
+ consumer.tick
56
+ consumer.ticked
57
+
55
58
  consumer.before_schedule_shutdown
56
59
  consumer.shutting_down
57
60
  consumer.shutdown
@@ -63,6 +66,11 @@ module Karafka
63
66
 
64
67
  process.notice_signal
65
68
 
69
+ rebalance.partitions_assign
70
+ rebalance.partitions_assigned
71
+ rebalance.partitions_revoke
72
+ rebalance.partitions_revoked
73
+
66
74
  statistics.emitted
67
75
 
68
76
  worker.process
@@ -42,6 +42,16 @@ module Karafka
42
42
 
43
43
  configure
44
44
 
45
+ # Types of errors originating from user code in the consumer flow
46
+ USER_CONSUMER_ERROR_TYPES = %w[
47
+ consumer.consume.error
48
+ consumer.revoked.error
49
+ consumer.shutdown.error
50
+ consumer.tick.error
51
+ ].freeze
52
+
53
+ private_constant :USER_CONSUMER_ERROR_TYPES
54
+
45
55
  # Before each consumption process, lets start a transaction associated with it
46
56
  # We also set some basic metadata about the given consumption that can be useful for
47
57
  # debugging
@@ -94,34 +104,27 @@ module Karafka
94
104
  client.register_probe(:karafka, -> { minute_probe })
95
105
  end
96
106
 
97
- # Keeps track of revocation user code execution
98
- #
99
- # @param event [Karafka::Core::Monitoring::Event]
100
- def on_consumer_revoke(event)
101
- consumer = event.payload[:caller]
102
- start_transaction(consumer, 'revoked')
103
- end
104
-
105
- # Finishes the revocation transaction
106
- #
107
- # @param _event [Karafka::Core::Monitoring::Event]
108
- def on_consumer_revoked(_event)
109
- stop_transaction
110
- end
111
-
112
- # Keeps track of revocation user code execution
113
- #
114
- # @param event [Karafka::Core::Monitoring::Event]
115
- def on_consumer_shutting_down(event)
116
- consumer = event.payload[:caller]
117
- start_transaction(consumer, 'shutdown')
118
- end
107
+ [
108
+ %i[revoke revoked revoked],
109
+ %i[shutting_down shutdown shutdown],
110
+ %i[tick ticked tick]
111
+ ].each do |before, after, name|
112
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
113
+ # Keeps track of user code execution
114
+ #
115
+ # @param event [Karafka::Core::Monitoring::Event]
116
+ def on_consumer_#{before}(event)
117
+ consumer = event.payload[:caller]
118
+ start_transaction(consumer, '#{name}')
119
+ end
119
120
 
120
- # Finishes the shutdown transaction
121
- #
122
- # @param _event [Karafka::Core::Monitoring::Event]
123
- def on_consumer_shutdown(_event)
124
- stop_transaction
121
+ # Finishes the transaction
122
+ #
123
+ # @param _event [Karafka::Core::Monitoring::Event]
124
+ def on_consumer_#{after}(_event)
125
+ stop_transaction
126
+ end
127
+ RUBY
125
128
  end
126
129
 
127
130
  # Counts DLQ dispatches
@@ -141,7 +144,7 @@ module Karafka
141
144
  # @param event [Karafka::Core::Monitoring::Event] error event details
142
145
  def on_error_occurred(event)
143
146
  # If this is a user consumption related error, we bump the counters for metrics
144
- if event[:type] == 'consumer.consume.error'
147
+ if USER_CONSUMER_ERROR_TYPES.include?(event[:type])
145
148
  consumer = event.payload[:caller]
146
149
 
147
150
  with_multiple_resolutions(consumer) do |tags|
@@ -55,7 +55,24 @@ module Karafka
55
55
  consumer = job.executor.topic.consumer
56
56
  topic = job.executor.topic.name
57
57
 
58
- current_span.resource = "#{consumer}#consume"
58
+ action = case job_type
59
+ when 'Periodic'
60
+ 'tick'
61
+ when 'PeriodicNonBlocking'
62
+ 'tick'
63
+ when 'Shutdown'
64
+ 'shutdown'
65
+ when 'Revoked'
66
+ 'revoked'
67
+ when 'RevokedNonBlocking'
68
+ 'revoked'
69
+ when 'Idle'
70
+ 'idle'
71
+ else
72
+ 'consume'
73
+ end
74
+
75
+ current_span.resource = "#{consumer}##{action}"
59
76
  info "[#{job.id}] #{job_type} job for #{consumer} on #{topic} started"
60
77
 
61
78
  pop_tags
@@ -102,6 +119,8 @@ module Karafka
102
119
  error "Consumer after consume failed due to an error: #{error}"
103
120
  when 'consumer.shutdown.error'
104
121
  error "Consumer on shutdown failed due to an error: #{error}"
122
+ when 'consumer.tick.error'
123
+ error "Consumer tick failed due to an error: #{error}"
105
124
  when 'worker.process.error'
106
125
  fatal "Worker processing failed due to an error: #{error}"
107
126
  when 'connection.listener.fetch_loop.error'
@@ -128,18 +128,21 @@ module Karafka
128
128
  histogram('consumer.consumption_lag', metadata.consumption_lag, tags: tags)
129
129
  end
130
130
 
131
- # @param event [Karafka::Core::Monitoring::Event]
132
- def on_consumer_revoked(event)
133
- tags = default_tags + consumer_tags(event.payload[:caller])
134
-
135
- count('consumer.revoked', 1, tags: tags)
136
- end
137
-
138
- # @param event [Karafka::Core::Monitoring::Event]
139
- def on_consumer_shutdown(event)
140
- tags = default_tags + consumer_tags(event.payload[:caller])
141
-
142
- count('consumer.shutdown', 1, tags: tags)
131
+ {
132
+ revoked: :revoked,
133
+ shutdown: :shutdown,
134
+ ticked: :tick
135
+ }.each do |after, name|
136
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
137
+ # Keeps track of user code execution
138
+ #
139
+ # @param event [Karafka::Core::Monitoring::Event]
140
+ def on_consumer_#{after}(event)
141
+ tags = default_tags + consumer_tags(event.payload[:caller])
142
+
143
+ count('consumer.#{name}', 1, tags: tags)
144
+ end
145
+ RUBY
143
146
  end
144
147
 
145
148
  # Worker related metrics
@@ -15,6 +15,14 @@ module Karafka
15
15
  # data would be processed, but process itself would still be active. This listener allows
16
16
  # for defining of a ttl that gets bumped on each poll loop and before and after processing
17
17
  # of a given messages batch.
18
+ #
19
+ # @note This listener will bind itself only when Karafka actually attempts to start
20
+ and moves from initializing to running. Before that, the TCP server will NOT be active.
21
+ # This is done on purpose to mitigate a case where users would subscribe this listener
22
+ # in `karafka.rb` without checking the recommendations of conditional assignment.
23
+ #
24
+ # @note In case of usage within an embedding with Puma, you need to select a different port
25
+ than the one used by Puma itself.
18
26
  class LivenessListener
19
27
  include ::Karafka::Core::Helpers::Time
20
28
 
@@ -40,12 +48,18 @@ module Karafka
40
48
  consuming_ttl: 5 * 60 * 1_000,
41
49
  polling_ttl: 5 * 60 * 1_000
42
50
  )
43
- @server = TCPServer.new(*[hostname, port].compact)
51
+ @hostname = hostname
52
+ @port = port
44
53
  @polling_ttl = polling_ttl
45
54
  @consuming_ttl = consuming_ttl
46
55
  @mutex = Mutex.new
47
56
  @pollings = {}
48
57
  @consumptions = {}
58
+ end
59
+
60
+ # @param _event [Karafka::Core::Monitoring::Event]
61
+ def on_app_running(_event)
62
+ @server = TCPServer.new(*[@hostname, @port].compact)
49
63
 
50
64
  Thread.new do
51
65
  loop do
@@ -54,42 +68,37 @@ module Karafka
54
68
  end
55
69
  end
56
70
 
57
- # Tick on each fetch
58
- # @param _event [Karafka::Core::Monitoring::Event]
59
- def on_connection_listener_fetch_loop(_event)
60
- mark_polling_tick
61
- end
62
-
63
- # Tick on starting work
64
- # @param _event [Karafka::Core::Monitoring::Event]
65
- def on_consumer_consume(_event)
66
- mark_consumption_tick
67
- end
68
-
69
- # Tick on finished work
70
- # @param _event [Karafka::Core::Monitoring::Event]
71
- def on_consumer_consumed(_event)
72
- clear_consumption_tick
73
- end
74
-
71
+ # Stop the http server when we stop the process
75
72
  # @param _event [Karafka::Core::Monitoring::Event]
76
- def on_consumer_revoke(_event)
77
- mark_consumption_tick
73
+ def on_app_stopped(_event)
74
+ @server.close
78
75
  end
79
76
 
77
+ # Tick on each fetch
80
78
  # @param _event [Karafka::Core::Monitoring::Event]
81
- def on_consumer_revoked(_event)
82
- clear_consumption_tick
79
+ def on_connection_listener_fetch_loop(_event)
80
+ mark_polling_tick
83
81
  end
84
82
 
85
- # @param _event [Karafka::Core::Monitoring::Event]
86
- def on_consumer_shutting_down(_event)
87
- mark_consumption_tick
88
- end
83
+ {
84
+ consume: :consumed,
85
+ revoke: :revoked,
86
+ shutting_down: :shutdown,
87
+ tick: :ticked
88
+ }.each do |before, after|
89
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
90
+ # Tick on starting work
91
+ # @param _event [Karafka::Core::Monitoring::Event]
92
+ def on_consumer_#{before}(_event)
93
+ mark_consumption_tick
94
+ end
89
95
 
90
- # @param _event [Karafka::Core::Monitoring::Event]
91
- def on_consumer_shutdown(_event)
92
- clear_consumption_tick
96
+ # Tick on finished work
97
+ # @param _event [Karafka::Core::Monitoring::Event]
98
+ def on_consumer_#{after}(_event)
99
+ clear_consumption_tick
100
+ end
101
+ RUBY
93
102
  end
94
103
 
95
104
  # @param _event [Karafka::Core::Monitoring::Event]
@@ -98,12 +107,6 @@ module Karafka
98
107
  clear_polling_tick
99
108
  end
100
109
 
101
- # Stop the http server when we stop the process
102
- # @param _event [Karafka::Core::Monitoring::Event]
103
- def on_app_stopped(_event)
104
- @server.close
105
- end
106
-
107
110
  private
108
111
 
109
112
  # Wraps the logic with a mutex
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
module Karafka
  module Pro
    # Extra methods always used in the base consumer in the pro mode
    #
    # We do not define those methods as part of the strategies flows, because they are injected
    # (strategies) on singletons and often used only in one of the strategy variants
    #
    # Methods here are supposed to be always available or are expected to be redefined
    module BaseConsumer
      # Runs the on-schedule tick periodic operations.
      #
      # This is an alias kept for naming-convention parity with the other flows, which is why
      # we go through this method instead of calling `handle_before_schedule_tick` directly.
      def on_before_schedule_tick
        handle_before_schedule_tick
      end

      # Used by the executor to trigger the consumer tick flow.
      #
      # Any StandardError raised by user tick code is captured and reported via the monitor
      # under the `consumer.tick.error` type instead of being propagated.
      #
      # @private
      def on_tick
        handle_tick
      rescue StandardError => error
        Karafka.monitor.instrument(
          'error.occurred',
          error: error,
          caller: self,
          type: 'consumer.tick.error'
        )
      end

      # Default tick is a no-op; consumers override this to run periodic work
      def tick
      end
    end
  end
end
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
module Karafka
  module Pro
    module Connection
      # Manager that can handle working with multiplexed connections.
      #
      # This manager takes into consideration the number of partitions assigned to the topics and
      # does its best to balance. Additional connections may not always be utilized because
      # alongside of them, other processes may "hijack" the assignment. In such cases those extra
      # empty connections will be turned off after a while.
      #
      # @note Manager operations relate to consumer groups and not subscription groups. Since
      #   cluster operations can cause consumer group wide effects, we always apply only one
      #   change on a consumer group.
      class Manager < Karafka::Connection::Manager
        include Core::Helpers::Time

        # How long should we wait after a rebalance before doing anything on a consumer group
        #
        # @param scale_delay [Integer] How long should we wait before making any changes. Any
        #   change related to this consumer group will postpone the scaling operations. This is
        #   done that way to prevent too much friction in the cluster. It is 1 minute by default
        def initialize(scale_delay = 60 * 1_000)
          super()
          @scale_delay = scale_delay
          # NOTE(review): @mutex is assigned but no synchronization is visible in this class —
          # presumably used by the parent class or kept for future use; confirm before removing
          @mutex = Mutex.new
          # Per-subscription-group librdkafka state bookkeeping used to decide if scaling is
          # safe. Entries are built lazily with `changed_at` set to now, so a fresh group is
          # not considered stable immediately
          @changes = Hash.new do |h, k|
            h[k] = {
              state: '',
              join_state: '',
              state_age: 0,
              changed_at: monotonic_now
            }
          end
        end

        # Registers listeners and starts the scaling procedures
        #
        # When using dynamic multiplexing, it will start the absolute minimum of connections for
        # subscription group available.
        #
        # @param listeners [Connection::ListenersBatch]
        def register(listeners)
          @listeners = listeners

          # Preload all the keys into the hash so we never add keys to changes but just change them
          listeners.each { |listener| @changes[listener.subscription_group.id] }

          in_sg_families do |first_subscription_group, sg_listeners|
            multiplexing = first_subscription_group.multiplexing

            if multiplexing.active? && multiplexing.dynamic?
              # Start as many boot listeners as user wants
              sg_listeners.first(multiplexing.boot).each(&:start!)
            else
              # Without dynamic multiplexing all listeners of the family start right away
              sg_listeners.each(&:start!)
            end
          end
        end

        # Collects data from the statistics about given subscription group. This is used to ensure
        # that we do not rescale short after rebalances, deployments, etc.
        #
        # @param subscription_group_id [String] id of the subscription group for which statistics
        #   were emitted
        # @param statistics [Hash] emitted statistics
        #
        # @note Please note that while we collect here per subscription group, we use those metrics
        #   collectively on a whole consumer group. This reduces the friction.
        def notice(subscription_group_id, statistics)
          times = []
          # stateage is in microseconds
          # We monitor broker changes to make sure we do not introduce extra friction
          times << statistics['brokers'].values.map { |stats| stats['stateage'] }.min / 1_000
          times << statistics['cgrp']['rebalance_age']
          times << statistics['cgrp']['stateage']

          # Keep the previous change age for changes that were triggered by us
          previous_changed_at = @changes[subscription_group_id][:changed_at]

          @changes[subscription_group_id].merge!(
            state_age: times.min,
            changed_at: previous_changed_at,
            join_state: statistics['cgrp']['join_state'],
            state: statistics['cgrp']['state']
          )
        end

        # Shuts down all the listeners when it is time (including moving to quiet) or rescales
        # when it is needed
        def control
          Karafka::App.done? ? shutdown : rescale
        end

        private

        # Handles the shutdown and quiet flows
        def shutdown
          active_listeners = @listeners.active

          # When we are done processing immediately quiet all the listeners so they do not pick up
          # new work to do
          once(:quiet!) { active_listeners.each(&:quiet!) }

          # If we are in the process of moving to quiet state, we need to check it.
          if Karafka::App.quieting? && active_listeners.all?(&:quiet?)
            once(:quieted!) { Karafka::App.quieted! }
          end

          return if Karafka::App.quiet?

          # Since separate subscription groups are subscribed to different topics, there is no risk
          # in shutting them down independently even if they operate in the same subscription group
          in_sg_families do |first_subscription_group, sg_listeners|
            active_sg_listeners = sg_listeners.select(&:active?)

            # Do nothing until all listeners from the same consumer group are quiet. Otherwise we
            # could have problems with in-flight rebalances during shutdown
            next unless active_sg_listeners.all?(&:quiet?)

            # Do not stop the same family twice
            once(:stop!, first_subscription_group.name) { active_sg_listeners.each(&:stop!) }
          end

          return unless @listeners.active.all?(&:stopped?)

          # All listeners including pending need to be moved at the end to stopped state for
          # the whole server to stop
          once(:stop!) { @listeners.each(&:stopped!) }
        end

        # Handles two scenarios:
        #   - Selects subscriptions that could benefit from having more parallel connections
        #     to kafka and then upscales them
        #   - Selects subscriptions that are idle (have nothing subscribed to them) and then shuts
        #     them down
        #
        # We always run scaling down and up because it may be applicable to different CGs
        def rescale
          scale_down
          scale_up
        end

        # Checks for connections without any assignments and scales them down.
        # Does that only for dynamically multiplexed subscription groups
        def scale_down
          sgs_in_use = Karafka::App.assignments.keys.map(&:subscription_group).uniq

          # Select connections for scaling down
          in_sg_families do |first_subscription_group, sg_listeners|
            next unless stable?(sg_listeners)

            multiplexing = first_subscription_group.multiplexing

            next unless multiplexing.active?
            next unless multiplexing.dynamic?

            # If we cannot downscale, do not
            next if sg_listeners.count(&:active?) <= multiplexing.min

            sg_listeners.each do |sg_listener|
              # Do not stop connections with subscriptions
              next if sgs_in_use.include?(sg_listener.subscription_group)
              # Skip listeners that are already in standby
              next unless sg_listener.active?

              touch(sg_listener.subscription_group.id)

              # Shut down not used connection
              sg_listener.stop!

              # Only one change per family per control cycle to limit cluster friction
              break
            end
          end
        end

        # Checks if we have space to scale and if there are any assignments with multiple topics
        # partitions assigned in sgs that can be scaled. If that is the case, we scale up.
        def scale_up
          multi_part_sgs_families = Karafka::App
                                    .assignments
                                    .select { |_, partitions| partitions.size > 1 }
                                    .keys
                                    .map(&:subscription_group)
                                    .map(&:name)
                                    .uniq

          # Select connections for scaling up
          in_sg_families do |first_subscription_group, sg_listeners|
            next unless stable?(sg_listeners)

            multiplexing = first_subscription_group.multiplexing

            next unless multiplexing.active?
            next unless multiplexing.dynamic?
            # If we cannot upscale, do not
            next if sg_listeners.count(&:active?) >= multiplexing.max

            sg_listeners.each do |sg_listener|
              next unless multi_part_sgs_families.include?(sg_listener.subscription_group.name)
              # Skip already active connections
              next unless sg_listener.pending? || sg_listener.stopped?

              touch(sg_listener.subscription_group.id)
              sg_listener.start!

              # Only one change per family per control cycle to limit cluster friction
              break
            end
          end
        end

        # Indicates, that something has changed on a subscription group. We consider every single
        # change we make as a change to the setup as well.
        #
        # NOTE(review): setting `changed_at` to 0 makes the `(monotonic_now - changed_at)` age
        # check in #stable? pass immediately — presumably intentional because stability is
        # re-gated by `state_age` once the triggered rebalance shows up in statistics; confirm
        #
        # @param subscription_group_id [String]
        def touch(subscription_group_id)
          @changes[subscription_group_id][:changed_at] = 0
        end

        # @param sg_listeners [Array<Listener>] listeners from one multiplexed sg
        # @return [Boolean] is given subscription group listeners set stable. It is considered
        #   stable when it had no changes happening on it recently and all relevant states in it
        #   are also stable. This is a strong indicator that no rebalances or other operations are
        #   happening at a given moment.
        def stable?(sg_listeners)
          sg_listeners.all? do |sg_listener|
            # If a listener is not active, we do not take it into consideration when looking at
            # the stability data
            next true unless sg_listener.active?

            state = @changes[sg_listener.subscription_group.id]

            state[:state_age] >= @scale_delay &&
              (monotonic_now - state[:changed_at]) >= @scale_delay &&
              state[:state] == 'up' &&
              state[:join_state] == 'steady'
          end
        end

        # Yields listeners in groups based on their subscription groups
        #
        # @yieldparam [Karafka::Routing::SubscriptionGroup] first subscription group out of the
        #   family
        # @yieldparam [Array<Listener>] listeners of a single subscription group
        def in_sg_families
          grouped = @listeners.group_by { |listener| listener.subscription_group.name }

          grouped.each_value do |listeners|
            listener = listeners.first

            yield(
              listener.subscription_group,
              listeners
            )
          end
        end
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
module Karafka
  module Pro
    # Namespace for Pro connections related components
    module Connection
      # Namespace for Multiplexing management related components
      module Multiplexing
        # Listener used to connect listeners manager to the lifecycle events that are significant
        # to its operations
        class Listener
          # Resolves the connection manager from the app config so statistics can be relayed
          def initialize
            @manager = App.config.internal.connection.manager
          end

          # Triggers connection manager subscription groups details noticing
          #
          # @param event [Karafka::Core::Monitoring::Event] event with statistics
          def on_statistics_emitted(event)
            subscription_group_id = event[:subscription_group_id]
            statistics = event[:statistics]

            @manager.notice(subscription_group_id, statistics)
          end
        end
      end
    end
  end
end