karafka 2.2.14 → 2.3.0.alpha1
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.github/workflows/ci.yml +38 -12
- data/.ruby-version +1 -1
- data/CHANGELOG.md +23 -0
- data/Gemfile.lock +12 -12
- data/README.md +0 -2
- data/SECURITY.md +23 -0
- data/config/locales/errors.yml +7 -1
- data/config/locales/pro_errors.yml +22 -0
- data/docker-compose.yml +1 -1
- data/karafka.gemspec +2 -2
- data/lib/karafka/admin/acl.rb +287 -0
- data/lib/karafka/admin.rb +9 -13
- data/lib/karafka/app.rb +5 -3
- data/lib/karafka/base_consumer.rb +9 -1
- data/lib/karafka/cli/base.rb +1 -1
- data/lib/karafka/connection/client.rb +83 -76
- data/lib/karafka/connection/conductor.rb +28 -0
- data/lib/karafka/connection/listener.rb +159 -42
- data/lib/karafka/connection/listeners_batch.rb +5 -11
- data/lib/karafka/connection/manager.rb +72 -0
- data/lib/karafka/connection/messages_buffer.rb +12 -0
- data/lib/karafka/connection/proxy.rb +17 -0
- data/lib/karafka/connection/status.rb +75 -0
- data/lib/karafka/contracts/config.rb +14 -10
- data/lib/karafka/contracts/consumer_group.rb +9 -1
- data/lib/karafka/contracts/topic.rb +3 -1
- data/lib/karafka/errors.rb +13 -0
- data/lib/karafka/instrumentation/logger_listener.rb +3 -0
- data/lib/karafka/instrumentation/notifications.rb +13 -5
- data/lib/karafka/instrumentation/vendors/appsignal/metrics_listener.rb +31 -28
- data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +20 -1
- data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +15 -12
- data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +39 -36
- data/lib/karafka/pro/base_consumer.rb +47 -0
- data/lib/karafka/pro/connection/manager.rb +300 -0
- data/lib/karafka/pro/connection/multiplexing/listener.rb +40 -0
- data/lib/karafka/pro/iterator/tpl_builder.rb +1 -1
- data/lib/karafka/pro/iterator.rb +1 -6
- data/lib/karafka/pro/loader.rb +14 -0
- data/lib/karafka/pro/processing/coordinator.rb +2 -1
- data/lib/karafka/pro/processing/executor.rb +37 -0
- data/lib/karafka/pro/processing/expansions_selector.rb +32 -0
- data/lib/karafka/pro/processing/jobs/periodic.rb +41 -0
- data/lib/karafka/pro/processing/jobs/periodic_non_blocking.rb +32 -0
- data/lib/karafka/pro/processing/jobs_builder.rb +14 -3
- data/lib/karafka/pro/processing/offset_metadata/consumer.rb +44 -0
- data/lib/karafka/pro/processing/offset_metadata/fetcher.rb +131 -0
- data/lib/karafka/pro/processing/offset_metadata/listener.rb +46 -0
- data/lib/karafka/pro/processing/schedulers/base.rb +39 -23
- data/lib/karafka/pro/processing/schedulers/default.rb +12 -14
- data/lib/karafka/pro/processing/strategies/default.rb +134 -1
- data/lib/karafka/pro/processing/strategies/dlq/default.rb +35 -0
- data/lib/karafka/pro/processing/strategies/vp/default.rb +59 -25
- data/lib/karafka/pro/processing/virtual_offset_manager.rb +41 -11
- data/lib/karafka/pro/routing/features/long_running_job/topic.rb +2 -0
- data/lib/karafka/pro/routing/features/multiplexing/config.rb +38 -0
- data/lib/karafka/pro/routing/features/multiplexing/contracts/topic.rb +114 -0
- data/lib/karafka/pro/routing/features/multiplexing/patches/contracts/consumer_group.rb +42 -0
- data/lib/karafka/pro/routing/features/multiplexing/proxy.rb +38 -0
- data/lib/karafka/pro/routing/features/multiplexing/subscription_group.rb +42 -0
- data/lib/karafka/pro/routing/features/multiplexing/subscription_groups_builder.rb +40 -0
- data/lib/karafka/pro/routing/features/multiplexing.rb +59 -0
- data/lib/karafka/pro/routing/features/non_blocking_job/topic.rb +32 -0
- data/lib/karafka/pro/routing/features/non_blocking_job.rb +37 -0
- data/lib/karafka/pro/routing/features/offset_metadata/config.rb +33 -0
- data/lib/karafka/pro/routing/features/offset_metadata/contracts/topic.rb +42 -0
- data/lib/karafka/pro/routing/features/offset_metadata/topic.rb +65 -0
- data/lib/karafka/pro/routing/features/offset_metadata.rb +40 -0
- data/lib/karafka/pro/routing/features/patterns/contracts/consumer_group.rb +4 -0
- data/lib/karafka/pro/routing/features/patterns/detector.rb +18 -10
- data/lib/karafka/pro/routing/features/periodic_job/config.rb +37 -0
- data/lib/karafka/pro/routing/features/periodic_job/contracts/topic.rb +44 -0
- data/lib/karafka/pro/routing/features/periodic_job/topic.rb +94 -0
- data/lib/karafka/pro/routing/features/periodic_job.rb +27 -0
- data/lib/karafka/pro/routing/features/virtual_partitions/config.rb +1 -0
- data/lib/karafka/pro/routing/features/virtual_partitions/contracts/topic.rb +1 -0
- data/lib/karafka/pro/routing/features/virtual_partitions/topic.rb +7 -2
- data/lib/karafka/process.rb +5 -3
- data/lib/karafka/processing/coordinator.rb +5 -1
- data/lib/karafka/processing/executor.rb +16 -10
- data/lib/karafka/processing/executors_buffer.rb +19 -4
- data/lib/karafka/processing/schedulers/default.rb +3 -2
- data/lib/karafka/processing/strategies/default.rb +6 -0
- data/lib/karafka/processing/strategies/dlq.rb +36 -0
- data/lib/karafka/routing/builder.rb +12 -2
- data/lib/karafka/routing/consumer_group.rb +5 -5
- data/lib/karafka/routing/features/base.rb +44 -8
- data/lib/karafka/routing/features/dead_letter_queue/config.rb +6 -1
- data/lib/karafka/routing/features/dead_letter_queue/contracts/topic.rb +1 -0
- data/lib/karafka/routing/features/dead_letter_queue/topic.rb +9 -2
- data/lib/karafka/routing/subscription_group.rb +2 -2
- data/lib/karafka/routing/subscription_groups_builder.rb +11 -2
- data/lib/karafka/routing/topic.rb +8 -10
- data/lib/karafka/runner.rb +13 -3
- data/lib/karafka/server.rb +5 -9
- data/lib/karafka/setup/config.rb +17 -0
- data/lib/karafka/status.rb +23 -14
- data/lib/karafka/templates/karafka.rb.erb +7 -0
- data/lib/karafka/time_trackers/partition_usage.rb +56 -0
- data/lib/karafka/version.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +42 -10
- metadata.gz.sig +0 -0
- data/lib/karafka/connection/consumer_group_coordinator.rb +0 -48
data/lib/karafka/instrumentation/notifications.rb

@@ -36,22 +36,25 @@ module Karafka
        connection.listener.before_fetch_loop
        connection.listener.fetch_loop
        connection.listener.fetch_loop.received
-
-        rebalance.partitions_assign
-        rebalance.partitions_assigned
-        rebalance.partitions_revoke
-        rebalance.partitions_revoked
+        connection.listener.after_fetch_loop

        consumer.before_schedule_consume
        consumer.consume
        consumer.consumed
        consumer.consuming.pause
        consumer.consuming.retry
+
        consumer.before_schedule_idle
        consumer.idle
+
        consumer.before_schedule_revoked
        consumer.revoke
        consumer.revoked
+
+        consumer.before_schedule_tick
+        consumer.tick
+        consumer.ticked
+
        consumer.before_schedule_shutdown
        consumer.shutting_down
        consumer.shutdown

@@ -63,6 +66,11 @@ module Karafka

        process.notice_signal

+        rebalance.partitions_assign
+        rebalance.partitions_assigned
+        rebalance.partitions_revoke
+        rebalance.partitions_revoked
+
        statistics.emitted

        worker.process
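The events list gains the tick lifecycle (consumer.before_schedule_tick, consumer.tick, consumer.ticked), an after-fetch-loop event, and moves the rebalance events further down. A minimal sketch of hooking into one of the new events, assuming the standard Karafka.monitor.subscribe block API; the :caller payload key is the same one the vendor listeners below rely on:

Karafka.monitor.subscribe('consumer.ticked') do |event|
  consumer = event.payload[:caller]

  Karafka.logger.info("Tick handled by #{consumer.class} for #{consumer.topic.name}")
end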
data/lib/karafka/instrumentation/vendors/appsignal/metrics_listener.rb

@@ -42,6 +42,16 @@ module Karafka

          configure

+          # Types of errors originating from user code in the consumer flow
+          USER_CONSUMER_ERROR_TYPES = %w[
+            consumer.consume.error
+            consumer.revoked.error
+            consumer.shutdown.error
+            consumer.tick.error
+          ].freeze
+
+          private_constant :USER_CONSUMER_ERROR_TYPES
+
          # Before each consumption process, lets start a transaction associated with it
          # We also set some basic metadata about the given consumption that can be useful for
          # debugging

@@ -94,34 +104,27 @@ module Karafka
            client.register_probe(:karafka, -> { minute_probe })
          end

-
-
-
-
-
-
-
-
-
-
-
-
-
-          end
-
-          # Keeps track of revocation user code execution
-          #
-          # @param event [Karafka::Core::Monitoring::Event]
-          def on_consumer_shutting_down(event)
-            consumer = event.payload[:caller]
-            start_transaction(consumer, 'shutdown')
-          end
+          [
+            %i[revoke revoked revoked],
+            %i[shutting_down shutdown shutdown],
+            %i[tick ticked tick]
+          ].each do |before, after, name|
+            class_eval <<~RUBY, __FILE__, __LINE__ + 1
+              # Keeps track of user code execution
+              #
+              # @param event [Karafka::Core::Monitoring::Event]
+              def on_consumer_#{before}(event)
+                consumer = event.payload[:caller]
+                start_transaction(consumer, '#{name}')
+              end

-
-
-
-
-
+              # Finishes the transaction
+              #
+              # @param _event [Karafka::Core::Monitoring::Event]
+              def on_consumer_#{after}(_event)
+                stop_transaction
+              end
+            RUBY
          end

          # Counts DLQ dispatches

@@ -141,7 +144,7 @@ module Karafka
          # @param event [Karafka::Core::Monitoring::Event] error event details
          def on_error_occurred(event)
            # If this is a user consumption related error, we bump the counters for metrics
-            if event[:type]
+            if USER_CONSUMER_ERROR_TYPES.include?(event[:type])
              consumer = event.payload[:caller]

              with_multiple_resolutions(consumer) do |tags|
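The listener replaces hand-written per-event methods with start/stop pairs generated from a list via class_eval. A standalone illustration of that Ruby pattern; the class and method names here are made up for the example and are not part of Karafka:

class TransactionTracker
  # Generate an on_<name> handler per action; #{name} is interpolated when the
  # code string is built, so each generated method carries its own action label,
  # while the escaped \#{consumer} interpolates at call time.
  %w[revoked shutdown tick].each do |name|
    class_eval <<~RUBY, __FILE__, __LINE__ + 1
      def on_#{name}(consumer)
        puts "starting '#{name}' transaction for \#{consumer}"
      end
    RUBY
  end
end

TransactionTracker.new.on_tick('EventsConsumer') # => starting 'tick' transaction for EventsConsumer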
data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb

@@ -55,7 +55,24 @@ module Karafka
            consumer = job.executor.topic.consumer
            topic = job.executor.topic.name

-
+            action = case job_type
+                     when 'Periodic'
+                       'tick'
+                     when 'PeriodicNonBlocking'
+                       'tick'
+                     when 'Shutdown'
+                       'shutdown'
+                     when 'Revoked'
+                       'revoked'
+                     when 'RevokedNonBlocking'
+                       'revoked'
+                     when 'Idle'
+                       'idle'
+                     else
+                       'consume'
+                     end
+
+            current_span.resource = "#{consumer}##{action}"
            info "[#{job.id}] #{job_type} job for #{consumer} on #{topic} started"

            pop_tags

@@ -102,6 +119,8 @@ module Karafka
              error "Consumer after consume failed due to an error: #{error}"
            when 'consumer.shutdown.error'
              error "Consumer on shutdown failed due to an error: #{error}"
+            when 'consumer.tick.error'
+              error "Consumer tick failed due to an error: #{error}"
            when 'worker.process.error'
              fatal "Worker processing failed due to an error: #{error}"
            when 'connection.listener.fetch_loop.error'
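The first hunk derives the traced action from the worker job type so the Datadog span resource reads "Consumer#action" instead of always "Consumer#consume". A plain-Ruby restatement of that mapping, just to show the resulting resource names; the helper name is illustrative, not part of the listener:

def action_for(job_type)
  case job_type
  when 'Periodic', 'PeriodicNonBlocking' then 'tick'
  when 'Shutdown'                        then 'shutdown'
  when 'Revoked', 'RevokedNonBlocking'   then 'revoked'
  when 'Idle'                            then 'idle'
  else 'consume'
  end
end

puts "EventsConsumer##{action_for('Periodic')}" # => EventsConsumer#tick
puts "EventsConsumer##{action_for('Consume')}"  # => EventsConsumer#consume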
data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb

@@ -128,18 +128,21 @@ module Karafka
            histogram('consumer.consumption_lag', metadata.consumption_lag, tags: tags)
          end

-
-
-
-
-
-
-
-
-
-
-
-
+          {
+            revoked: :revoked,
+            shutdown: :shutdown,
+            ticked: :tick
+          }.each do |after, name|
+            class_eval <<~RUBY, __FILE__, __LINE__ + 1
+              # Keeps track of user code execution
+              #
+              # @param event [Karafka::Core::Monitoring::Event]
+              def on_consumer_#{after}(event)
+                tags = default_tags + consumer_tags(event.payload[:caller])
+
+                count('consumer.#{name}', 1, tags: tags)
+              end
+            RUBY
          end

          # Worker related metrics
data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb

@@ -15,6 +15,14 @@ module Karafka
        # data would be processed, but process itself would still be active. This listener allows
        # for defining of a ttl that gets bumped on each poll loop and before and after processing
        # of a given messages batch.
+        #
+        # @note This listener will bind itself only when Karafka will actually attempt to start
+        #   and moves from initializing to running. Before that, the TCP server will NOT be active.
+        #   This is done on purpose to mitigate a case where users would subscribe this listener
+        #   in `karafka.rb` without checking the recommendations of conditional assignment.
+        #
+        # @note In case of usage within an embedding with Puma, you need to select different port
+        #   then the one used by Puma itself.
        class LivenessListener
          include ::Karafka::Core::Helpers::Time

@@ -40,12 +48,18 @@ module Karafka
            consuming_ttl: 5 * 60 * 1_000,
            polling_ttl: 5 * 60 * 1_000
          )
-            @
+            @hostname = hostname
+            @port = port
            @polling_ttl = polling_ttl
            @consuming_ttl = consuming_ttl
            @mutex = Mutex.new
            @pollings = {}
            @consumptions = {}
+          end
+
+          # @param _event [Karafka::Core::Monitoring::Event]
+          def on_app_running(_event)
+            @server = TCPServer.new(*[@hostname, @port].compact)

            Thread.new do
              loop do

@@ -54,42 +68,37 @@ module Karafka
              end
            end

-          #
-          # @param _event [Karafka::Core::Monitoring::Event]
-          def on_connection_listener_fetch_loop(_event)
-            mark_polling_tick
-          end
-
-          # Tick on starting work
-          # @param _event [Karafka::Core::Monitoring::Event]
-          def on_consumer_consume(_event)
-            mark_consumption_tick
-          end
-
-          # Tick on finished work
-          # @param _event [Karafka::Core::Monitoring::Event]
-          def on_consumer_consumed(_event)
-            clear_consumption_tick
-          end
-
+          # Stop the http server when we stop the process
          # @param _event [Karafka::Core::Monitoring::Event]
-          def
-
+          def on_app_stopped(_event)
+            @server.close
          end

+          # Tick on each fetch
          # @param _event [Karafka::Core::Monitoring::Event]
-          def
-
+          def on_connection_listener_fetch_loop(_event)
+            mark_polling_tick
          end

-
-
-
-
+          {
+            consume: :consumed,
+            revoke: :revoked,
+            shutting_down: :shutdown,
+            tick: :ticked
+          }.each do |before, after|
+            class_eval <<~RUBY, __FILE__, __LINE__ + 1
+              # Tick on starting work
+              # @param _event [Karafka::Core::Monitoring::Event]
+              def on_consumer_#{before}(_event)
+                mark_consumption_tick
+              end

-
-
-
+              # Tick on finished work
+              # @param _event [Karafka::Core::Monitoring::Event]
+              def on_consumer_#{after}(_event)
+                clear_consumption_tick
+              end
+            RUBY
          end

          # @param _event [Karafka::Core::Monitoring::Event]

@@ -98,12 +107,6 @@ module Karafka
            clear_polling_tick
          end

-          # Stop the http server when we stop the process
-          # @param _event [Karafka::Core::Monitoring::Event]
-          def on_app_stopped(_event)
-            @server.close
-          end
-
          private

          # Wraps the logic with a mutex
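With these changes the liveness listener only binds its TCP server once the app moves to running (on_app_running) and it now tracks revocation, shutdown and tick work in addition to consumption. A hedged wiring sketch for karafka.rb based on the constructor keywords visible above; verify the require path and defaults against the installed gem version:

require 'karafka/instrumentation/vendors/kubernetes/liveness_listener'

listener = ::Karafka::Instrumentation::Vendors::Kubernetes::LivenessListener.new(
  hostname: '0.0.0.0',
  port: 9_010,                     # pick a port not used by Puma when embedding
  consuming_ttl: 5 * 60 * 1_000,   # same defaults as in the diff (milliseconds)
  polling_ttl: 5 * 60 * 1_000
)

Karafka.monitor.subscribe(listener)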
data/lib/karafka/pro/base_consumer.rb (new file)

@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    # Extra methods always used in the base consumer in the pro mode
+    #
+    # We do not define those methods as part of the strategies flows, because they are injected
+    # (strategies) on singletons and often used only in one of the strategy variants
+    #
+    # Methods here are suppose to be always available or are expected to be redefined
+    module BaseConsumer
+      # Runs the on-schedule tick periodic operations
+      # This method is an alias but is part of the naming convention used for other flows, this
+      # is why we do not reference the `handle_before_schedule_tick` directly
+      def on_before_schedule_tick
+        handle_before_schedule_tick
+      end
+
+      # Used by the executor to trigger consumer tick
+      # @private
+      def on_tick
+        handle_tick
+      rescue StandardError => e
+        Karafka.monitor.instrument(
+          'error.occurred',
+          error: e,
+          caller: self,
+          type: 'consumer.tick.error'
+        )
+      end
+
+      # By default we do nothing when ticking
+      def tick; end
+    end
+  end
+end
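on_tick wraps a user-defined tick and reports failures as consumer.tick.error through the error.occurred channel, while the shipped no-op tick means consumers only override it when they need periodic work. A sketch of such an override; the application base class and the routing setup that enables periodic (tick) jobs are assumed here, not shown:

class MaintenanceConsumer < ApplicationConsumer
  def consume
    messages.each { |message| puts message.payload }
  end

  # Invoked via #on_tick by periodic jobs; any error raised here surfaces as a
  # 'consumer.tick.error' instrumentation event instead of crashing the worker.
  def tick
    puts "No fresh data for #{topic.name}/#{partition} - running housekeeping"
  end
end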
data/lib/karafka/pro/connection/manager.rb (new file)

@@ -0,0 +1,300 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Connection
+      # Manager that can handle working with multiplexed connections.
+      #
+      # This manager takes into consideration the number of partitions assigned to the topics and
+      # does its best to balance. Additional connections may not always be utilized because
+      # alongside of them, other processes may "hijack" the assignment. In such cases those extra
+      # empty connections will be turned off after a while.
+      #
+      # @note Manager operations relate to consumer groups and not subscription groups. Since
+      #   cluster operations can cause consumer group wide effects, we always apply only one
+      #   change on a consumer group.
+      #
+      # @note Since we collect statistical data from listeners and this happens in a background
+      #   thread, we need to make sure we lock not to have race conditions with expired data
+      #   eviction.
+      class Manager < Karafka::Connection::Manager
+        include Core::Helpers::Time
+
+        # How long should we keep stale stats before evicting them completely
+        EVICTION_DELAY = 5 * 60 * 1_000
+
+        private_constant :EVICTION_DELAY
+
+        # How long should we wait after a rebalance before doing anything on a consumer group
+        #
+        # @param scale_delay [Integer] How long should we wait before making any changes. Any
+        #   change related to this consumer group will postpone the scaling operations. This is
+        #   done that way to prevent too many friction in the cluster. It is 1 minute by default
+        def initialize(scale_delay = 60 * 1_000)
+          super()
+          @scale_delay = scale_delay
+          @mutex = Mutex.new
+          @changes = Hash.new do |h, k|
+            h[k] = {
+              state: '',
+              join_state: '',
+              state_age: 0,
+              state_age_sync: monotonic_now,
+              changed_at: monotonic_now
+            }
+          end
+        end
+
+        # Registers listeners and starts the scaling procedures
+        #
+        # When using dynamic multiplexing, it will start the absolute minimum of connections for
+        # subscription group available.
+        #
+        # @param listeners [Connection::ListenersBatch]
+        def register(listeners)
+          @listeners = listeners
+
+          in_sg_families do |first_subscription_group, sg_listeners|
+            multiplexing = first_subscription_group.multiplexing
+
+            if multiplexing.active? && multiplexing.dynamic?
+              # Start as many boot listeners as user wants. If not configured, starts half of max.
+              sg_listeners.first(multiplexing.boot).each(&:start!)
+            else
+              sg_listeners.each(&:start!)
+            end
+          end
+        end
+
+        # Collects data from the statistics about given subscription group. This is used to ensure
+        # that we do not rescale short after rebalances, deployments, etc.
+        # @param subscription_group_id [String] id of the subscription group for which statistics
+        #   were emitted
+        # @param statistics [Hash] emitted statistics
+        #
+        # @note Please note that while we collect here per subscription group, we use those metrics
+        #   collectively on a whole consumer group. This reduces the friction.
+        def notice(subscription_group_id, statistics)
+          @mutex.synchronize do
+            times = []
+            # stateage is in microseconds
+            # We monitor broker changes to make sure we do not introduce extra friction
+            times << statistics['brokers'].values.map { |stats| stats['stateage'] }.min / 1_000
+            times << statistics['cgrp']['rebalance_age']
+            times << statistics['cgrp']['stateage']
+
+            # Keep the previous change age for changes that were triggered by us
+            previous_changed_at = @changes[subscription_group_id][:changed_at]
+
+            @changes[subscription_group_id] = {
+              state_age: times.min,
+              changed_at: previous_changed_at,
+              join_state: statistics['cgrp']['join_state'],
+              state: statistics['cgrp']['state'],
+              state_age_sync: monotonic_now
+            }
+          end
+        end
+
+        # Shuts down all the listeners when it is time (including moving to quiet) or rescales
+        # when it is needed
+        def control
+          Karafka::App.done? ? shutdown : rescale
+        end
+
+        private
+
+        # Handles the shutdown and quiet flows
+        def shutdown
+          active_listeners = @listeners.active
+
+          # When we are done processing immediately quiet all the listeners so they do not pick up
+          # new work to do
+          once(:quiet!) { active_listeners.each(&:quiet!) }
+
+          # If we are in the process of moving to quiet state, we need to check it.
+          if Karafka::App.quieting? && active_listeners.all?(&:quiet?)
+            once(:quieted!) { Karafka::App.quieted! }
+          end
+
+          return if Karafka::App.quiet?
+
+          # Since separate subscription groups are subscribed to different topics, there is no risk
+          # in shutting them down independently even if they operate in the same subscription group
+          in_sg_families do |first_subscription_group, sg_listeners|
+            active_sg_listeners = sg_listeners.select(&:active?)
+
+            # Do nothing until all listeners from the same consumer group are quiet. Otherwise we
+            # could have problems with in-flight rebalances during shutdown
+            next unless active_sg_listeners.all?(&:quiet?)
+
+            # Do not stop the same family twice
+            once(:stop!, first_subscription_group.name) { active_sg_listeners.each(&:stop!) }
+          end
+
+          return unless @listeners.active.all?(&:stopped?)
+
+          # All listeners including pending need to be moved at the end to stopped state for
+          # the whole server to stop
+          once(:stop!) { @listeners.each(&:stopped!) }
+        end
+
+        # Handles two scenarios:
+        #   - Selects subscriptions that could benefit from having more parallel connections
+        #     to kafka and then upscales them
+        #   - Selects subscriptions that are idle (have nothing subscribed to them) and then shuts
+        #     them down
+        #
+        # We always run scaling down and up because it may be applicable to different CGs
+        def rescale
+          evict
+
+          scale_down
+          scale_up
+        end
+
+        # Checks for connections without any assignments and scales them down.
+        # Does that only for dynamically multiplexed subscription groups
+        def scale_down
+          sgs_in_use = Karafka::App.assignments.keys.map(&:subscription_group).uniq
+
+          # Select connections for scaling down
+          in_sg_families do |first_subscription_group, sg_listeners|
+            next unless stable?(sg_listeners)
+
+            multiplexing = first_subscription_group.multiplexing
+
+            next unless multiplexing.active?
+            next unless multiplexing.dynamic?
+
+            # If we cannot downscale, do not
+            next if sg_listeners.count(&:active?) <= multiplexing.min
+
+            sg_listeners.each do |sg_listener|
+              # Do not stop connections with subscriptions
+              next if sgs_in_use.include?(sg_listener.subscription_group)
+              # Skip listeners that are already in standby
+              next unless sg_listener.active?
+
+              touch(sg_listener.subscription_group.id)
+
+              # Shut down not used connection
+              sg_listener.stop!
+
+              break
+            end
+          end
+        end
+
+        # Checks if we have space to scale and if there are any assignments with multiple topics
+        # partitions assigned in sgs that can be scaled. If that is the case, we scale up.
+        def scale_up
+          multi_part_sgs_families = Karafka::App
+                                    .assignments
+                                    .select { |_, partitions| partitions.size > 1 }
+                                    .keys
+                                    .map(&:subscription_group)
+                                    .map(&:name)
+                                    .uniq
+
+          # Select connections for scaling up
+          in_sg_families do |first_subscription_group, sg_listeners|
+            next unless stable?(sg_listeners)
+
+            multiplexing = first_subscription_group.multiplexing
+
+            next unless multiplexing.active?
+            next unless multiplexing.dynamic?
+            # If we cannot downscale, do not
+            next if sg_listeners.count(&:active?) >= multiplexing.max
+
+            sg_listeners.each do |sg_listener|
+              next unless multi_part_sgs_families.include?(sg_listener.subscription_group.name)
+              # Skip already active connections
+              next unless sg_listener.pending? || sg_listener.stopped?
+
+              touch(sg_listener.subscription_group.id)
+              sg_listener.start!
+
+              break
+            end
+          end
+        end
+
+        # Removes states that are no longer being reported for stopped/pending listeners
+        def evict
+          @mutex.synchronize do
+            @changes.delete_if do |_, details|
+              monotonic_now - details[:state_age_sync] >= EVICTION_DELAY
+            end
+          end
+        end
+
+        # Indicates, that something has changed on a subscription group. We consider every single
+        # change we make as a change to the setup as well.
+        # @param subscription_group_id [String]
+        def touch(subscription_group_id)
+          @mutex.synchronize do
+            @changes[subscription_group_id][:changed_at] = 0
+            @changes[subscription_group_id][:state_age_sync] = monotonic_now
+          end
+        end
+
+        # @param sg_listeners [Array<Listener>] listeners from one multiplexed sg
+        # @return [Boolean] is given subscription group listeners set stable. It is considered
+        #   stable when it had no changes happening on it recently and all relevant states in it
+        #   are also stable. This is a strong indicator that no rebalances or other operations are
+        #   happening at a given moment.
+        def stable?(sg_listeners)
+          # If none of listeners has changes reported it means we did not yet start collecting
+          # metrics about any of them and at least one must be present. We do not consider it
+          # stable in such case as we still are waiting for metrics.
+          return false if sg_listeners.none? do |sg_listener|
+            @changes.key?(sg_listener.subscription_group.id)
+          end
+
+          sg_listeners.all? do |sg_listener|
+            # Not all SGs may be started initially or may be stopped, we ignore them here as they
+            # are irrelevant from the point of view of establishing stability
+            next true unless @changes.key?(sg_listener.subscription_group.id)
+
+            state = @changes[sg_listener.subscription_group.id]
+
+            state[:state_age] >= @scale_delay &&
+              (monotonic_now - state[:changed_at]) >= @scale_delay &&
+              state[:state] == 'up' &&
+              state[:join_state] == 'steady'
+          end
+        end
+
+        # Yields listeners in groups based on their subscription groups
+        # @yieldparam [Karafka::Routing::SubscriptionGroup] first subscription group out of the
+        #   family
+        # @yieldparam [Array<Listener>] listeners of a single subscription group
+        def in_sg_families
+          grouped = @listeners.group_by { |listener| listener.subscription_group.name }
+
+          grouped.each_value do |listeners|
+            listener = listeners.first
+
+            yield(
+              listener.subscription_group,
+              listeners
+            )
+          end
+        end
+      end
+    end
+  end
+end
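The manager scales a consumer group between multiplexing.min and multiplexing.max connections, starting multiplexing.boot of them up front, and only acts once stable? confirms the group has been steady and unchanged for at least scale_delay. A routing sketch that such a dynamic setup could correspond to; the multiplexing(min:/max:/boot:) keywords come from the new multiplexing proxy added in this release and should be treated as an assumption to verify against the 2.3 documentation:

class KarafkaApp < Karafka::App
  routes.draw do
    subscription_group :events do
      # Keep between 1 and 5 connections, boot with 2 and let the Pro manager
      # scale within that range based on assignment and stability data.
      multiplexing(min: 1, max: 5, boot: 2)

      topic :user_events do
        consumer UserEventsConsumer
      end
    end
  end
end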