karafka 2.2.14 → 2.3.0.alpha2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.github/workflows/ci.yml +38 -12
  4. data/.ruby-version +1 -1
  5. data/CHANGELOG.md +24 -0
  6. data/Gemfile.lock +16 -16
  7. data/README.md +0 -2
  8. data/SECURITY.md +23 -0
  9. data/bin/integrations +1 -1
  10. data/config/locales/errors.yml +7 -1
  11. data/config/locales/pro_errors.yml +22 -0
  12. data/docker-compose.yml +1 -1
  13. data/karafka.gemspec +2 -2
  14. data/lib/karafka/admin/acl.rb +287 -0
  15. data/lib/karafka/admin.rb +9 -13
  16. data/lib/karafka/app.rb +5 -3
  17. data/lib/karafka/base_consumer.rb +9 -1
  18. data/lib/karafka/cli/base.rb +1 -1
  19. data/lib/karafka/connection/client.rb +83 -76
  20. data/lib/karafka/connection/conductor.rb +28 -0
  21. data/lib/karafka/connection/listener.rb +159 -42
  22. data/lib/karafka/connection/listeners_batch.rb +5 -11
  23. data/lib/karafka/connection/manager.rb +72 -0
  24. data/lib/karafka/connection/messages_buffer.rb +12 -0
  25. data/lib/karafka/connection/proxy.rb +17 -0
  26. data/lib/karafka/connection/status.rb +75 -0
  27. data/lib/karafka/contracts/config.rb +14 -10
  28. data/lib/karafka/contracts/consumer_group.rb +9 -1
  29. data/lib/karafka/contracts/topic.rb +3 -1
  30. data/lib/karafka/errors.rb +17 -0
  31. data/lib/karafka/instrumentation/logger_listener.rb +3 -0
  32. data/lib/karafka/instrumentation/notifications.rb +13 -5
  33. data/lib/karafka/instrumentation/vendors/appsignal/metrics_listener.rb +31 -28
  34. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +20 -1
  35. data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +15 -12
  36. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +39 -36
  37. data/lib/karafka/pro/base_consumer.rb +47 -0
  38. data/lib/karafka/pro/connection/manager.rb +269 -0
  39. data/lib/karafka/pro/connection/multiplexing/listener.rb +40 -0
  40. data/lib/karafka/pro/iterator/tpl_builder.rb +1 -1
  41. data/lib/karafka/pro/iterator.rb +1 -6
  42. data/lib/karafka/pro/loader.rb +14 -0
  43. data/lib/karafka/pro/processing/coordinator.rb +2 -1
  44. data/lib/karafka/pro/processing/executor.rb +37 -0
  45. data/lib/karafka/pro/processing/expansions_selector.rb +32 -0
  46. data/lib/karafka/pro/processing/jobs/periodic.rb +41 -0
  47. data/lib/karafka/pro/processing/jobs/periodic_non_blocking.rb +32 -0
  48. data/lib/karafka/pro/processing/jobs_builder.rb +14 -3
  49. data/lib/karafka/pro/processing/offset_metadata/consumer.rb +44 -0
  50. data/lib/karafka/pro/processing/offset_metadata/fetcher.rb +131 -0
  51. data/lib/karafka/pro/processing/offset_metadata/listener.rb +46 -0
  52. data/lib/karafka/pro/processing/schedulers/base.rb +39 -23
  53. data/lib/karafka/pro/processing/schedulers/default.rb +12 -14
  54. data/lib/karafka/pro/processing/strategies/default.rb +154 -1
  55. data/lib/karafka/pro/processing/strategies/dlq/default.rb +39 -0
  56. data/lib/karafka/pro/processing/strategies/vp/default.rb +65 -25
  57. data/lib/karafka/pro/processing/virtual_offset_manager.rb +41 -11
  58. data/lib/karafka/pro/routing/features/long_running_job/topic.rb +2 -0
  59. data/lib/karafka/pro/routing/features/multiplexing/config.rb +38 -0
  60. data/lib/karafka/pro/routing/features/multiplexing/contracts/topic.rb +114 -0
  61. data/lib/karafka/pro/routing/features/multiplexing/patches/contracts/consumer_group.rb +42 -0
  62. data/lib/karafka/pro/routing/features/multiplexing/proxy.rb +38 -0
  63. data/lib/karafka/pro/routing/features/multiplexing/subscription_group.rb +42 -0
  64. data/lib/karafka/pro/routing/features/multiplexing/subscription_groups_builder.rb +40 -0
  65. data/lib/karafka/pro/routing/features/multiplexing.rb +59 -0
  66. data/lib/karafka/pro/routing/features/non_blocking_job/topic.rb +32 -0
  67. data/lib/karafka/pro/routing/features/non_blocking_job.rb +37 -0
  68. data/lib/karafka/pro/routing/features/offset_metadata/config.rb +33 -0
  69. data/lib/karafka/pro/routing/features/offset_metadata/contracts/topic.rb +42 -0
  70. data/lib/karafka/pro/routing/features/offset_metadata/topic.rb +65 -0
  71. data/lib/karafka/pro/routing/features/offset_metadata.rb +40 -0
  72. data/lib/karafka/pro/routing/features/patterns/contracts/consumer_group.rb +4 -0
  73. data/lib/karafka/pro/routing/features/patterns/detector.rb +18 -10
  74. data/lib/karafka/pro/routing/features/periodic_job/config.rb +37 -0
  75. data/lib/karafka/pro/routing/features/periodic_job/contracts/topic.rb +44 -0
  76. data/lib/karafka/pro/routing/features/periodic_job/topic.rb +94 -0
  77. data/lib/karafka/pro/routing/features/periodic_job.rb +27 -0
  78. data/lib/karafka/pro/routing/features/virtual_partitions/config.rb +1 -0
  79. data/lib/karafka/pro/routing/features/virtual_partitions/contracts/topic.rb +1 -0
  80. data/lib/karafka/pro/routing/features/virtual_partitions/topic.rb +7 -2
  81. data/lib/karafka/process.rb +5 -3
  82. data/lib/karafka/processing/coordinator.rb +5 -1
  83. data/lib/karafka/processing/executor.rb +16 -10
  84. data/lib/karafka/processing/executors_buffer.rb +19 -4
  85. data/lib/karafka/processing/schedulers/default.rb +3 -2
  86. data/lib/karafka/processing/strategies/default.rb +6 -0
  87. data/lib/karafka/processing/strategies/dlq.rb +36 -0
  88. data/lib/karafka/routing/builder.rb +12 -2
  89. data/lib/karafka/routing/consumer_group.rb +5 -5
  90. data/lib/karafka/routing/features/base.rb +44 -8
  91. data/lib/karafka/routing/features/dead_letter_queue/config.rb +6 -1
  92. data/lib/karafka/routing/features/dead_letter_queue/contracts/topic.rb +1 -0
  93. data/lib/karafka/routing/features/dead_letter_queue/topic.rb +9 -2
  94. data/lib/karafka/routing/subscription_group.rb +2 -2
  95. data/lib/karafka/routing/subscription_groups_builder.rb +11 -2
  96. data/lib/karafka/routing/topic.rb +8 -10
  97. data/lib/karafka/runner.rb +13 -3
  98. data/lib/karafka/server.rb +5 -9
  99. data/lib/karafka/setup/config.rb +17 -0
  100. data/lib/karafka/status.rb +23 -14
  101. data/lib/karafka/templates/karafka.rb.erb +7 -0
  102. data/lib/karafka/time_trackers/partition_usage.rb +56 -0
  103. data/lib/karafka/version.rb +1 -1
  104. data.tar.gz.sig +0 -0
  105. metadata +42 -10
  106. metadata.gz.sig +0 -0
  107. data/lib/karafka/connection/consumer_group_coordinator.rb +0 -48
@@ -247,6 +247,9 @@ module Karafka
247
247
  when 'consumer.shutdown.error'
248
248
  error "Consumer on shutdown failed due to an error: #{error}"
249
249
  error details
250
+ when 'consumer.tick.error'
251
+ error "Consumer tick failed due to an error: #{error}"
252
+ error details
250
253
  when 'worker.process.error'
251
254
  fatal "Worker processing failed due to an error: #{error}"
252
255
  fatal details
@@ -36,22 +36,25 @@ module Karafka
36
36
  connection.listener.before_fetch_loop
37
37
  connection.listener.fetch_loop
38
38
  connection.listener.fetch_loop.received
39
-
40
- rebalance.partitions_assign
41
- rebalance.partitions_assigned
42
- rebalance.partitions_revoke
43
- rebalance.partitions_revoked
39
+ connection.listener.after_fetch_loop
44
40
 
45
41
  consumer.before_schedule_consume
46
42
  consumer.consume
47
43
  consumer.consumed
48
44
  consumer.consuming.pause
49
45
  consumer.consuming.retry
46
+
50
47
  consumer.before_schedule_idle
51
48
  consumer.idle
49
+
52
50
  consumer.before_schedule_revoked
53
51
  consumer.revoke
54
52
  consumer.revoked
53
+
54
+ consumer.before_schedule_tick
55
+ consumer.tick
56
+ consumer.ticked
57
+
55
58
  consumer.before_schedule_shutdown
56
59
  consumer.shutting_down
57
60
  consumer.shutdown
@@ -63,6 +66,11 @@ module Karafka
63
66
 
64
67
  process.notice_signal
65
68
 
69
+ rebalance.partitions_assign
70
+ rebalance.partitions_assigned
71
+ rebalance.partitions_revoke
72
+ rebalance.partitions_revoked
73
+
66
74
  statistics.emitted
67
75
 
68
76
  worker.process
@@ -42,6 +42,16 @@ module Karafka
42
42
 
43
43
  configure
44
44
 
45
+ # Types of errors originating from user code in the consumer flow
46
+ USER_CONSUMER_ERROR_TYPES = %w[
47
+ consumer.consume.error
48
+ consumer.revoked.error
49
+ consumer.shutdown.error
50
+ consumer.tick.error
51
+ ].freeze
52
+
53
+ private_constant :USER_CONSUMER_ERROR_TYPES
54
+
45
55
  # Before each consumption process, lets start a transaction associated with it
46
56
  # We also set some basic metadata about the given consumption that can be useful for
47
57
  # debugging
@@ -94,34 +104,27 @@ module Karafka
94
104
  client.register_probe(:karafka, -> { minute_probe })
95
105
  end
96
106
 
97
- # Keeps track of revocation user code execution
98
- #
99
- # @param event [Karafka::Core::Monitoring::Event]
100
- def on_consumer_revoke(event)
101
- consumer = event.payload[:caller]
102
- start_transaction(consumer, 'revoked')
103
- end
104
-
105
- # Finishes the revocation transaction
106
- #
107
- # @param _event [Karafka::Core::Monitoring::Event]
108
- def on_consumer_revoked(_event)
109
- stop_transaction
110
- end
111
-
112
- # Keeps track of revocation user code execution
113
- #
114
- # @param event [Karafka::Core::Monitoring::Event]
115
- def on_consumer_shutting_down(event)
116
- consumer = event.payload[:caller]
117
- start_transaction(consumer, 'shutdown')
118
- end
107
+ [
108
+ %i[revoke revoked revoked],
109
+ %i[shutting_down shutdown shutdown],
110
+ %i[tick ticked tick]
111
+ ].each do |before, after, name|
112
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
113
+ # Keeps track of user code execution
114
+ #
115
+ # @param event [Karafka::Core::Monitoring::Event]
116
+ def on_consumer_#{before}(event)
117
+ consumer = event.payload[:caller]
118
+ start_transaction(consumer, '#{name}')
119
+ end
119
120
 
120
- # Finishes the shutdown transaction
121
- #
122
- # @param _event [Karafka::Core::Monitoring::Event]
123
- def on_consumer_shutdown(_event)
124
- stop_transaction
121
+ # Finishes the transaction
122
+ #
123
+ # @param _event [Karafka::Core::Monitoring::Event]
124
+ def on_consumer_#{after}(_event)
125
+ stop_transaction
126
+ end
127
+ RUBY
125
128
  end
126
129
 
127
130
  # Counts DLQ dispatches
@@ -141,7 +144,7 @@ module Karafka
141
144
  # @param event [Karafka::Core::Monitoring::Event] error event details
142
145
  def on_error_occurred(event)
143
146
  # If this is a user consumption related error, we bump the counters for metrics
144
- if event[:type] == 'consumer.consume.error'
147
+ if USER_CONSUMER_ERROR_TYPES.include?(event[:type])
145
148
  consumer = event.payload[:caller]
146
149
 
147
150
  with_multiple_resolutions(consumer) do |tags|
@@ -55,7 +55,24 @@ module Karafka
55
55
  consumer = job.executor.topic.consumer
56
56
  topic = job.executor.topic.name
57
57
 
58
- current_span.resource = "#{consumer}#consume"
58
+ action = case job_type
59
+ when 'Periodic'
60
+ 'tick'
61
+ when 'PeriodicNonBlocking'
62
+ 'tick'
63
+ when 'Shutdown'
64
+ 'shutdown'
65
+ when 'Revoked'
66
+ 'revoked'
67
+ when 'RevokedNonBlocking'
68
+ 'revoked'
69
+ when 'Idle'
70
+ 'idle'
71
+ else
72
+ 'consume'
73
+ end
74
+
75
+ current_span.resource = "#{consumer}##{action}"
59
76
  info "[#{job.id}] #{job_type} job for #{consumer} on #{topic} started"
60
77
 
61
78
  pop_tags
@@ -102,6 +119,8 @@ module Karafka
102
119
  error "Consumer after consume failed due to an error: #{error}"
103
120
  when 'consumer.shutdown.error'
104
121
  error "Consumer on shutdown failed due to an error: #{error}"
122
+ when 'consumer.tick.error'
123
+ error "Consumer tick failed due to an error: #{error}"
105
124
  when 'worker.process.error'
106
125
  fatal "Worker processing failed due to an error: #{error}"
107
126
  when 'connection.listener.fetch_loop.error'
@@ -128,18 +128,21 @@ module Karafka
128
128
  histogram('consumer.consumption_lag', metadata.consumption_lag, tags: tags)
129
129
  end
130
130
 
131
- # @param event [Karafka::Core::Monitoring::Event]
132
- def on_consumer_revoked(event)
133
- tags = default_tags + consumer_tags(event.payload[:caller])
134
-
135
- count('consumer.revoked', 1, tags: tags)
136
- end
137
-
138
- # @param event [Karafka::Core::Monitoring::Event]
139
- def on_consumer_shutdown(event)
140
- tags = default_tags + consumer_tags(event.payload[:caller])
141
-
142
- count('consumer.shutdown', 1, tags: tags)
131
+ {
132
+ revoked: :revoked,
133
+ shutdown: :shutdown,
134
+ ticked: :tick
135
+ }.each do |after, name|
136
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
137
+ # Keeps track of user code execution
138
+ #
139
+ # @param event [Karafka::Core::Monitoring::Event]
140
+ def on_consumer_#{after}(event)
141
+ tags = default_tags + consumer_tags(event.payload[:caller])
142
+
143
+ count('consumer.#{name}', 1, tags: tags)
144
+ end
145
+ RUBY
143
146
  end
144
147
 
145
148
  # Worker related metrics
@@ -15,6 +15,14 @@ module Karafka
15
15
  # data would be processed, but process itself would still be active. This listener allows
16
16
  # for defining of a ttl that gets bumped on each poll loop and before and after processing
17
17
  # of a given messages batch.
18
+ #
19
+ # @note This listener will bind itself only when Karafka will actually attempt to start
20
+ # and moves from initializing to running. Before that, the TCP server will NOT be active.
21
+ # This is done on purpose to mitigate a case where users would subscribe this listener
22
+ # in `karafka.rb` without checking the recommendations of conditional assignment.
23
+ #
24
+ # @note In case of usage within an embedding with Puma, you need to select different port
25
+ # than the one used by Puma itself.
18
26
  class LivenessListener
19
27
  include ::Karafka::Core::Helpers::Time
20
28
 
@@ -40,12 +48,18 @@ module Karafka
40
48
  consuming_ttl: 5 * 60 * 1_000,
41
49
  polling_ttl: 5 * 60 * 1_000
42
50
  )
43
- @server = TCPServer.new(*[hostname, port].compact)
51
+ @hostname = hostname
52
+ @port = port
44
53
  @polling_ttl = polling_ttl
45
54
  @consuming_ttl = consuming_ttl
46
55
  @mutex = Mutex.new
47
56
  @pollings = {}
48
57
  @consumptions = {}
58
+ end
59
+
60
+ # @param _event [Karafka::Core::Monitoring::Event]
61
+ def on_app_running(_event)
62
+ @server = TCPServer.new(*[@hostname, @port].compact)
49
63
 
50
64
  Thread.new do
51
65
  loop do
@@ -54,42 +68,37 @@ module Karafka
54
68
  end
55
69
  end
56
70
 
57
- # Tick on each fetch
58
- # @param _event [Karafka::Core::Monitoring::Event]
59
- def on_connection_listener_fetch_loop(_event)
60
- mark_polling_tick
61
- end
62
-
63
- # Tick on starting work
64
- # @param _event [Karafka::Core::Monitoring::Event]
65
- def on_consumer_consume(_event)
66
- mark_consumption_tick
67
- end
68
-
69
- # Tick on finished work
70
- # @param _event [Karafka::Core::Monitoring::Event]
71
- def on_consumer_consumed(_event)
72
- clear_consumption_tick
73
- end
74
-
71
+ # Stop the http server when we stop the process
75
72
  # @param _event [Karafka::Core::Monitoring::Event]
76
- def on_consumer_revoke(_event)
77
- mark_consumption_tick
73
+ def on_app_stopped(_event)
74
+ @server.close
78
75
  end
79
76
 
77
+ # Tick on each fetch
80
78
  # @param _event [Karafka::Core::Monitoring::Event]
81
- def on_consumer_revoked(_event)
82
- clear_consumption_tick
79
+ def on_connection_listener_fetch_loop(_event)
80
+ mark_polling_tick
83
81
  end
84
82
 
85
- # @param _event [Karafka::Core::Monitoring::Event]
86
- def on_consumer_shutting_down(_event)
87
- mark_consumption_tick
88
- end
83
+ {
84
+ consume: :consumed,
85
+ revoke: :revoked,
86
+ shutting_down: :shutdown,
87
+ tick: :ticked
88
+ }.each do |before, after|
89
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
90
+ # Tick on starting work
91
+ # @param _event [Karafka::Core::Monitoring::Event]
92
+ def on_consumer_#{before}(_event)
93
+ mark_consumption_tick
94
+ end
89
95
 
90
- # @param _event [Karafka::Core::Monitoring::Event]
91
- def on_consumer_shutdown(_event)
92
- clear_consumption_tick
96
+ # Tick on finished work
97
+ # @param _event [Karafka::Core::Monitoring::Event]
98
+ def on_consumer_#{after}(_event)
99
+ clear_consumption_tick
100
+ end
101
+ RUBY
93
102
  end
94
103
 
95
104
  # @param _event [Karafka::Core::Monitoring::Event]
@@ -98,12 +107,6 @@ module Karafka
98
107
  clear_polling_tick
99
108
  end
100
109
 
101
- # Stop the http server when we stop the process
102
- # @param _event [Karafka::Core::Monitoring::Event]
103
- def on_app_stopped(_event)
104
- @server.close
105
- end
106
-
107
110
  private
108
111
 
109
112
  # Wraps the logic with a mutex
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
+ module Karafka
15
+ module Pro
16
+ # Extra methods always used in the base consumer in the pro mode
17
+ #
18
+ # We do not define those methods as part of the strategies flows, because they are injected
19
+ # (strategies) on singletons and often used only in one of the strategy variants
20
+ #
21
+ # Methods here are suppose to be always available or are expected to be redefined
22
+ module BaseConsumer
23
+ # Runs the on-schedule tick periodic operations
24
+ # This method is an alias but is part of the naming convention used for other flows, this
25
+ # is why we do not reference the `handle_before_schedule_tick` directly
26
+ def on_before_schedule_tick
27
+ handle_before_schedule_tick
28
+ end
29
+
30
+ # Used by the executor to trigger consumer tick
31
+ # @private
32
+ def on_tick
33
+ handle_tick
34
+ rescue StandardError => e
35
+ Karafka.monitor.instrument(
36
+ 'error.occurred',
37
+ error: e,
38
+ caller: self,
39
+ type: 'consumer.tick.error'
40
+ )
41
+ end
42
+
43
+ # By default we do nothing when ticking
44
+ def tick; end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
+ module Karafka
15
+ module Pro
16
+ module Connection
17
+ # Manager that can handle working with multiplexed connections.
18
+ #
19
+ # This manager takes into consideration the number of partitions assigned to the topics and
20
+ # does its best to balance. Additional connections may not always be utilized because
21
+ # alongside of them, other processes may "hijack" the assignment. In such cases those extra
22
+ # empty connections will be turned off after a while.
23
+ #
24
+ # @note Manager operations relate to consumer groups and not subscription groups. Since
25
+ # cluster operations can cause consumer group wide effects, we always apply only one
26
+ # change on a consumer group.
27
+ class Manager < Karafka::Connection::Manager
28
+ include Core::Helpers::Time
29
+
30
+ # How long should we wait after a rebalance before doing anything on a consumer group
31
+ #
32
+ # @param scale_delay [Integer] How long should we wait before making any changes. Any
33
+ # change related to this consumer group will postpone the scaling operations. This is
34
+ # done that way to prevent too much friction in the cluster. It is 1 minute by default
35
+ def initialize(scale_delay = 60 * 1_000)
36
+ super()
37
+ @scale_delay = scale_delay
38
+ @mutex = Mutex.new
39
+ @changes = Hash.new do |h, k|
40
+ h[k] = {
41
+ state: '',
42
+ join_state: '',
43
+ state_age: 0,
44
+ changed_at: monotonic_now
45
+ }
46
+ end
47
+ end
48
+
49
+ # Registers listeners and starts the scaling procedures
50
+ #
51
+ # When using dynamic multiplexing, it will start the absolute minimum of connections for
52
+ # subscription group available.
53
+ #
54
+ # @param listeners [Connection::ListenersBatch]
55
+ def register(listeners)
56
+ @listeners = listeners
57
+
58
+ # Preload all the keys into the hash so we never add keys to changes but just change them
59
+ listeners.each { |listener| @changes[listener.subscription_group.id] }
60
+
61
+ in_sg_families do |first_subscription_group, sg_listeners|
62
+ multiplexing = first_subscription_group.multiplexing
63
+
64
+ if multiplexing.active? && multiplexing.dynamic?
65
+ # Start as many boot listeners as user wants. If not configured, starts half of max.
66
+ sg_listeners.first(multiplexing.boot).each(&:start!)
67
+ else
68
+ sg_listeners.each(&:start!)
69
+ end
70
+ end
71
+ end
72
+
73
+ # Collects data from the statistics about given subscription group. This is used to ensure
74
+ # that we do not rescale shortly after rebalances, deployments, etc.
75
+ # @param subscription_group_id [String] id of the subscription group for which statistics
76
+ # were emitted
77
+ # @param statistics [Hash] emitted statistics
78
+ #
79
+ # @note Please note that while we collect here per subscription group, we use those metrics
80
+ # collectively on a whole consumer group. This reduces the friction.
81
+ def notice(subscription_group_id, statistics)
82
+ times = []
83
+ # stateage is in microseconds
84
+ # We monitor broker changes to make sure we do not introduce extra friction
85
+ times << statistics['brokers'].values.map { |stats| stats['stateage'] }.min / 1_000
86
+ times << statistics['cgrp']['rebalance_age']
87
+ times << statistics['cgrp']['stateage']
88
+
89
+ # Keep the previous change age for changes that were triggered by us
90
+ previous_changed_at = @changes[subscription_group_id][:changed_at]
91
+
92
+ @changes[subscription_group_id].merge!(
93
+ state_age: times.min,
94
+ changed_at: previous_changed_at,
95
+ join_state: statistics['cgrp']['join_state'],
96
+ state: statistics['cgrp']['state']
97
+ )
98
+ end
99
+
100
+ # Shuts down all the listeners when it is time (including moving to quiet) or rescales
101
+ # when it is needed
102
+ def control
103
+ Karafka::App.done? ? shutdown : rescale
104
+ end
105
+
106
+ private
107
+
108
+ # Handles the shutdown and quiet flows
109
+ def shutdown
110
+ active_listeners = @listeners.active
111
+
112
+ # When we are done processing immediately quiet all the listeners so they do not pick up
113
+ # new work to do
114
+ once(:quiet!) { active_listeners.each(&:quiet!) }
115
+
116
+ # If we are in the process of moving to quiet state, we need to check it.
117
+ if Karafka::App.quieting? && active_listeners.all?(&:quiet?)
118
+ once(:quieted!) { Karafka::App.quieted! }
119
+ end
120
+
121
+ return if Karafka::App.quiet?
122
+
123
+ # Since separate subscription groups are subscribed to different topics, there is no risk
124
+ # in shutting them down independently even if they operate in the same consumer group
125
+ in_sg_families do |first_subscription_group, sg_listeners|
126
+ active_sg_listeners = sg_listeners.select(&:active?)
127
+
128
+ # Do nothing until all listeners from the same consumer group are quiet. Otherwise we
129
+ # could have problems with in-flight rebalances during shutdown
130
+ next unless active_sg_listeners.all?(&:quiet?)
131
+
132
+ # Do not stop the same family twice
133
+ once(:stop!, first_subscription_group.name) { active_sg_listeners.each(&:stop!) }
134
+ end
135
+
136
+ return unless @listeners.active.all?(&:stopped?)
137
+
138
+ # All listeners including pending need to be moved at the end to stopped state for
139
+ # the whole server to stop
140
+ once(:stop!) { @listeners.each(&:stopped!) }
141
+ end
142
+
143
+ # Handles two scenarios:
144
+ # - Selects subscriptions that could benefit from having more parallel connections
145
+ # to kafka and then upscales them
146
+ # - Selects subscriptions that are idle (have nothing subscribed to them) and then shuts
147
+ # them down
148
+ #
149
+ # We always run scaling down and up because it may be applicable to different CGs
150
+ def rescale
151
+ scale_down
152
+ scale_up
153
+ end
154
+
155
+ # Checks for connections without any assignments and scales them down.
156
+ # Does that only for dynamically multiplexed subscription groups
157
+ def scale_down
158
+ sgs_in_use = Karafka::App.assignments.keys.map(&:subscription_group).uniq
159
+
160
+ # Select connections for scaling down
161
+ in_sg_families do |first_subscription_group, sg_listeners|
162
+ next unless stable?(sg_listeners)
163
+
164
+ multiplexing = first_subscription_group.multiplexing
165
+
166
+ next unless multiplexing.active?
167
+ next unless multiplexing.dynamic?
168
+
169
+ # If we cannot downscale, do not
170
+ next if sg_listeners.count(&:active?) <= multiplexing.min
171
+
172
+ sg_listeners.each do |sg_listener|
173
+ # Do not stop connections with subscriptions
174
+ next if sgs_in_use.include?(sg_listener.subscription_group)
175
+ # Skip listeners that are already in standby
176
+ next unless sg_listener.active?
177
+
178
+ touch(sg_listener.subscription_group.id)
179
+
180
+ # Shut down not used connection
181
+ sg_listener.stop!
182
+
183
+ break
184
+ end
185
+ end
186
+ end
187
+
188
+ # Checks if we have space to scale and if there are any assignments with multiple topics
189
+ # partitions assigned in sgs that can be scaled. If that is the case, we scale up.
190
+ def scale_up
191
+ multi_part_sgs_families = Karafka::App
192
+ .assignments
193
+ .select { |_, partitions| partitions.size > 1 }
194
+ .keys
195
+ .map(&:subscription_group)
196
+ .map(&:name)
197
+ .uniq
198
+
199
+ # Select connections for scaling up
200
+ in_sg_families do |first_subscription_group, sg_listeners|
201
+ next unless stable?(sg_listeners)
202
+
203
+ multiplexing = first_subscription_group.multiplexing
204
+
205
+ next unless multiplexing.active?
206
+ next unless multiplexing.dynamic?
207
+ # If we cannot upscale, do not
208
+ next if sg_listeners.count(&:active?) >= multiplexing.max
209
+
210
+ sg_listeners.each do |sg_listener|
211
+ next unless multi_part_sgs_families.include?(sg_listener.subscription_group.name)
212
+ # Skip already active connections
213
+ next unless sg_listener.pending? || sg_listener.stopped?
214
+
215
+ touch(sg_listener.subscription_group.id)
216
+ sg_listener.start!
217
+
218
+ break
219
+ end
220
+ end
221
+ end
222
+
223
+ # Indicates, that something has changed on a subscription group. We consider every single
224
+ # change we make as a change to the setup as well.
225
+ # @param subscription_group_id [String]
226
+ def touch(subscription_group_id)
227
+ @changes[subscription_group_id][:changed_at] = 0
228
+ end
229
+
230
+ # @param sg_listeners [Array<Listener>] listeners from one multiplexed sg
231
+ # @return [Boolean] is given subscription group listeners set stable. It is considered
232
+ # stable when it had no changes happening on it recently and all relevant states in it
233
+ # are also stable. This is a strong indicator that no rebalances or other operations are
234
+ # happening at a given moment.
235
+ def stable?(sg_listeners)
236
+ sg_listeners.all? do |sg_listener|
237
+ # If a listener is not active, we do not take it into consideration when looking at
238
+ # the stability data
239
+ next true unless sg_listener.active?
240
+
241
+ state = @changes[sg_listener.subscription_group.id]
242
+
243
+ state[:state_age] >= @scale_delay &&
244
+ (monotonic_now - state[:changed_at]) >= @scale_delay &&
245
+ state[:state] == 'up' &&
246
+ state[:join_state] == 'steady'
247
+ end
248
+ end
249
+
250
+ # Yields listeners in groups based on their subscription groups
251
+ # @yieldparam [Karafka::Routing::SubscriptionGroup] first subscription group out of the
252
+ # family
253
+ # @yieldparam [Array<Listener>] listeners of a single subscription group
254
+ def in_sg_families
255
+ grouped = @listeners.group_by { |listener| listener.subscription_group.name }
256
+
257
+ grouped.each_value do |listeners|
258
+ listener = listeners.first
259
+
260
+ yield(
261
+ listener.subscription_group,
262
+ listeners
263
+ )
264
+ end
265
+ end
266
+ end
267
+ end
268
+ end
269
+ end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
+ module Karafka
15
+ module Pro
16
+ # Namespace for Pro connections related components
17
+ module Connection
18
+ # Namespace for Multiplexing management related components
19
+ module Multiplexing
20
+ # Listener used to connect listeners manager to the lifecycle events that are significant
21
+ # to its operations
22
+ class Listener
23
+ def initialize
24
+ @manager = App.config.internal.connection.manager
25
+ end
26
+
27
+ # Passes the emitted subscription group statistics to the connection manager
28
+ #
29
+ # @param event [Karafka::Core::Monitoring::Event] event with statistics
30
+ def on_statistics_emitted(event)
31
+ @manager.notice(
32
+ event[:subscription_group_id],
33
+ event[:statistics]
34
+ )
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end