karafka 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +22 -22
- data/README.md +2 -2
- data/bin/integrations +2 -1
- data/bin/rspecs +6 -2
- data/config/locales/errors.yml +30 -8
- data/config/locales/pro_errors.yml +2 -0
- data/docker-compose.yml +1 -1
- data/lib/karafka/app.rb +14 -0
- data/lib/karafka/cli/base.rb +19 -0
- data/lib/karafka/cli/server.rb +62 -76
- data/lib/karafka/cli/swarm.rb +30 -0
- data/lib/karafka/constraints.rb +3 -3
- data/lib/karafka/contracts/config.rb +19 -0
- data/lib/karafka/errors.rb +12 -0
- data/lib/karafka/helpers/async.rb +13 -3
- data/lib/karafka/helpers/config_importer.rb +30 -0
- data/lib/karafka/instrumentation/logger_listener.rb +31 -0
- data/lib/karafka/instrumentation/notifications.rb +9 -0
- data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +2 -0
- data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +72 -0
- data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +11 -40
- data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +54 -0
- data/lib/karafka/pro/active_job/job_options_contract.rb +1 -1
- data/lib/karafka/pro/base_consumer.rb +16 -0
- data/lib/karafka/pro/connection/manager.rb +6 -1
- data/lib/karafka/pro/processing/coordinator.rb +13 -3
- data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +74 -0
- data/lib/karafka/pro/processing/coordinators/filters_applier.rb +107 -0
- data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +180 -0
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom.rb +8 -10
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom_vp.rb +8 -16
- data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_mom.rb +8 -10
- data/lib/karafka/pro/processing/strategies/aj/dlq_mom_vp.rb +7 -9
- data/lib/karafka/pro/processing/strategies/dlq/default.rb +36 -10
- data/lib/karafka/pro/processing/strategies/dlq/ftr.rb +3 -7
- data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +4 -8
- data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +6 -9
- data/lib/karafka/pro/processing/strategies/dlq/ftr_mom.rb +5 -15
- data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +4 -8
- data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +6 -9
- data/lib/karafka/pro/processing/strategies/dlq/mom.rb +10 -20
- data/lib/karafka/pro/processing/strategies/vp/default.rb +7 -0
- data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb +6 -0
- data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +39 -0
- data/lib/karafka/pro/swarm/liveness_listener.rb +171 -0
- data/lib/karafka/process.rb +27 -1
- data/lib/karafka/routing/features/dead_letter_queue/config.rb +2 -0
- data/lib/karafka/routing/subscription_group.rb +31 -9
- data/lib/karafka/runner.rb +4 -0
- data/lib/karafka/server.rb +13 -16
- data/lib/karafka/setup/config.rb +41 -2
- data/lib/karafka/status.rb +4 -2
- data/lib/karafka/swarm/liveness_listener.rb +55 -0
- data/lib/karafka/swarm/manager.rb +217 -0
- data/lib/karafka/swarm/node.rb +179 -0
- data/lib/karafka/swarm/pidfd.rb +131 -0
- data/lib/karafka/swarm/supervisor.rb +184 -0
- data/lib/karafka/swarm.rb +27 -0
- data/lib/karafka/templates/karafka.rb.erb +0 -2
- data/lib/karafka/version.rb +1 -1
- data/lib/karafka.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +17 -4
- metadata.gz.sig +0 -0
- data/lib/karafka/pro/processing/filters_applier.rb +0 -105
- data/lib/karafka/pro/processing/virtual_offset_manager.rb +0 -177
data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb
CHANGED
@@ -28,6 +28,12 @@ module Karafka
         ).fetch('en').fetch('validations').fetch('topic')
       end
 
+      nested(:dead_letter_queue) do
+        # We use strategy based DLQ for every case in Pro
+        # For default (when no strategy) a default `max_retries` based strategy is used
+        required(:strategy) { |val| val.respond_to?(:call) }
+      end
+
       # Make sure that when we use virtual partitions with DLQ, at least one retry is set
       # We cannot use VP with DLQ without retries as we in order to provide ordering
       # warranties on errors with VP, we need to collapse the VPs concurrency and retry
data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb
ADDED
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Routing
+      module Features
+        class DeadLetterQueue < Base
+          # Expansions to the topic API in DLQ
+          module Topic
+            # @param strategy [#call, nil] Strategy we want to use or nil if a default strategy
+            #   (same as in OSS) should be applied
+            # @param args [Hash] OSS DLQ arguments
+            def dead_letter_queue(strategy: nil, **args)
+              return @dead_letter_queue if @dead_letter_queue
+
+              super(**args).tap do |config|
+                # If explicit strategy is not provided, use the default approach from OSS
+                config.strategy = strategy || lambda do |_errors_tracker, attempt|
+                  attempt > config.max_retries ? :dispatch : :retry
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
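
Taken together, these two Pro changes let a topic route its DLQ decisions through any callable strategy. A hedged sketch of how this could look in routing (the topic, DLQ topic, and consumer names are illustrative, not from this diff):

```ruby
class KarafkaApp < Karafka::App
  routes.draw do
    topic :orders_states do
      consumer OrdersStatesConsumer

      dead_letter_queue(
        topic: 'dead_messages',
        max_retries: 2,
        # Anything responding to #call satisfies the new contract. It receives the errors
        # tracker and the attempt count and decides between outcomes such as :retry and
        # :dispatch, mirroring the default lambda above.
        strategy: ->(_errors_tracker, attempt) { attempt > 4 ? :dispatch : :retry }
      )
    end
  end
end
```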
data/lib/karafka/pro/swarm/liveness_listener.rb
ADDED
@@ -0,0 +1,171 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    # Pro Swarm components namespace
+    module Swarm
+      # Pro listener that monitors RSS usage and other heartbeat metrics (if configured) to ensure
+      # that everything operates.
+      #
+      # It can:
+      # - monitor poll frequency to make sure polling happens often enough
+      # - monitor consumption to make sure we do not process data for too long
+      # - monitor RSS to make sure that we do not use too much memory
+      #
+      # By default it does **not** monitor memory; consuming and polling are configured in such
+      # a way as to align with `max.poll.interval.ms` and other defaults.
+      #
+      # Failure statuses reported are as follows:
+      # - 1 - polling ttl exceeded
+      # - 2 - consuming ttl exceeded
+      # - 3 - memory limit exceeded
+      #
+      # @note This listener should not break anything if subscribed in the supervisor prior to
+      #   forking as it relies on server events for operations.
+      class LivenessListener < Karafka::Swarm::LivenessListener
+        # @param memory_limit [Integer] max memory in MB for this process to be considered healthy
+        # @param consuming_ttl [Integer] time in ms after which we consider consumption hanging.
+        #   It allows us to define max consumption time after which supervisor should consider
+        #   given process as hanging
+        # @param polling_ttl [Integer] max time in ms for polling. If polling (any) does not
+        #   happen that often, process should be considered dead.
+        # @note The default TTL matches the default `max.poll.interval.ms`
+        def initialize(
+          memory_limit: Float::INFINITY,
+          consuming_ttl: 5 * 60 * 1_000,
+          polling_ttl: 5 * 60 * 1_000
+        )
+          @polling_ttl = polling_ttl
+          @consuming_ttl = consuming_ttl
+          # We cast it just in case someone would provide '10MB' or something similar
+          @memory_limit = memory_limit.is_a?(String) ? memory_limit.to_i : memory_limit
+          @pollings = {}
+          @consumptions = {}
+
+          super()
+        end
+
+        # Tick on each fetch
+        #
+        # @param _event [Karafka::Core::Monitoring::Event]
+        def on_connection_listener_fetch_loop(_event)
+          mark_polling_tick
+        end
+
+        {
+          consume: :consumed,
+          revoke: :revoked,
+          shutting_down: :shutdown,
+          tick: :ticked
+        }.each do |before, after|
+          class_eval <<~RUBY, __FILE__, __LINE__ + 1
+            # Tick on starting work
+            # @param _event [Karafka::Core::Monitoring::Event]
+            def on_consumer_#{before}(_event)
+              mark_consumption_tick
+            end
+
+            # Tick on finished work
+            # @param _event [Karafka::Core::Monitoring::Event]
+            def on_consumer_#{after}(_event)
+              clear_consumption_tick
+            end
+          RUBY
+        end
+
+        # @param _event [Karafka::Core::Monitoring::Event]
+        def on_error_occurred(_event)
+          clear_consumption_tick
+          clear_polling_tick
+        end
+
+        # Reports the current status once in a while
+        #
+        # @param _event [Karafka::Core::Monitoring::Event]
+        def on_statistics_emitted(_event)
+          periodically do
+            return unless node
+
+            current_status = status
+
+            current_status.positive? ? node.unhealthy(current_status) : node.healthy
+          end
+        end
+
+        private
+
+        # @return [Integer] object id of the current thread
+        def thread_id
+          Thread.current.object_id
+        end
+
+        # Update the polling tick time for current thread
+        def mark_polling_tick
+          synchronize do
+            @pollings[thread_id] = monotonic_now
+          end
+        end
+
+        # Clear current thread polling time tracker
+        def clear_polling_tick
+          synchronize do
+            @pollings.delete(thread_id)
+          end
+        end
+
+        # Update the processing tick time
+        def mark_consumption_tick
+          synchronize do
+            @consumptions[thread_id] = monotonic_now
+          end
+        end
+
+        # Clear current thread consumption time tracker
+        def clear_consumption_tick
+          synchronize do
+            @consumptions.delete(thread_id)
+          end
+        end
+
+        # Did we exceed any of the ttls
+        # @return [Integer] 0 if all good, failure status code (1, 2 or 3) otherwise
+        def status
+          time = monotonic_now
+
+          return 1 if @pollings.values.any? { |tick| (time - tick) > @polling_ttl }
+          return 2 if @consumptions.values.any? { |tick| (time - tick) > @consuming_ttl }
+          return 3 if rss_mb > @memory_limit
+
+          0
+        end
+
+        # @return [Integer] RSS in MB for the current process
+        # @note Since swarm is linux only, we do not have to worry about getting RSS on other OSes
+        def rss_mb
+          kb_rss = 0
+
+          IO.readlines("/proc/#{node.pid}/status").each do |line|
+            next unless line.start_with?('VmRSS:')
+
+            kb_rss = line.split[1].to_i
+
+            break
+          end
+
+          (kb_rss / 1_024.to_i).round
+        end
+      end
+    end
+  end
+end
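
A minimal usage sketch based on the constructor above; the listener would be subscribed in `karafka.rb` before the server starts (in swarm mode, prior to forking), and the concrete limits here are examples:

```ruby
Karafka.monitor.subscribe(
  Karafka::Pro::Swarm::LivenessListener.new(
    # Consider a node unhealthy above 2048 MB RSS (memory is not monitored by default)
    memory_limit: 2048,
    # Consider a node hanging when a single consumption exceeds 2 minutes...
    consuming_ttl: 2 * 60 * 1_000,
    # ...or when no polling happened for 2 minutes
    polling_ttl: 2 * 60 * 1_000
  )
)
```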
data/lib/karafka/process.rb
CHANGED
@@ -14,6 +14,8 @@ module Karafka
       SIGTERM
       SIGTTIN
       SIGTSTP
+      SIGCHLD
+      SIGUSR1
     ].freeze
 
     HANDLED_SIGNALS.each do |signal|
@@ -32,16 +34,40 @@ module Karafka
       RUBY
     end
 
+    # Assigns a callback that will run on any supported signal that has at least one callback
+    # registered already.
+    # @param block [Proc] code we want to run
+    # @note This will only bind to signals that already have at least one callback defined
+    def on_any_active(&block)
+      HANDLED_SIGNALS.each do |signal|
+        next unless @callbacks.key?(signal)
+
+        public_send(:"on_#{signal.to_s.downcase}", &block)
+      end
+    end
+
     # Creates an instance of process and creates empty hash for callbacks
     def initialize
      @callbacks = Hash.new { |hsh, key| hsh[key] = [] }
      @supervised = false
    end
 
+    # Clears all the defined callbacks. Useful for post-fork cleanup when parent already defined
+    # some signals
+    def clear
+      @callbacks.clear
+    end
+
     # Method catches all HANDLED_SIGNALS and performs appropriate callbacks (if defined)
     # @note If there are no callbacks, this method will just ignore a given signal that was sent
     def supervise
-      HANDLED_SIGNALS.each { |signal| trap_signal(signal) }
+      HANDLED_SIGNALS.each do |signal|
+        # Supervise only signals for which we have defined callbacks
+        next unless @callbacks.key?(signal)
+
+        trap_signal(signal)
+      end
+
       @supervised = true
     end
 
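
A short hedged sketch of how the new process API composes (handler bodies are illustrative; the process object lives at `Karafka::App.config.internal.process`, per the config diff below):

```ruby
process = Karafka::App.config.internal.process

process.on_sigint  { puts 'SIGINT received' }
process.on_sigterm { puts 'SIGTERM received' }

# Attaches to SIGINT and SIGTERM only, since those already have callbacks registered
process.on_any_active { puts 'a handled shutdown signal was observed' }

# Traps only the signals that have callbacks; the rest stay untouched
process.supervise

# In a freshly forked swarm node we can drop handlers inherited from the parent
process.clear
```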
data/lib/karafka/routing/subscription_group.rb
CHANGED
@@ -8,6 +8,12 @@ module Karafka
     # @note One subscription group will always belong to one consumer group, but one consumer
     #   group can have multiple subscription groups.
     class SubscriptionGroup
+      include Helpers::ConfigImporter.new(
+        activity_manager: %i[internal routing activity_manager],
+        client_id: %i[client_id],
+        node: %i[swarm node]
+      )
+
       attr_reader :id, :name, :topics, :kafka, :consumer_group
 
       # Lock for generating new ids safely
@@ -67,7 +73,7 @@ module Karafka
 
       # @return [Boolean] is this subscription group one of the active ones
       def active?
-        Karafka::App.config.internal.routing.activity_manager.active?(:subscription_groups, name)
+        activity_manager.active?(:subscription_groups, name)
       end
 
       # @return [Array<String>] names of topics to which we should subscribe.
@@ -93,15 +99,9 @@ module Karafka
       def build_kafka
         kafka = Setup::AttributesMap.consumer(@topics.first.kafka.dup)
 
-        # If we use static group memberships, there can be a case, where same instance id would
-        # be set on many subscription groups as the group instance id from Karafka perspective is
-        # set per config. Each instance even if they are subscribed to different topics needs to
-        # have it fully unique. To make sure of that, we just add extra postfix at the end that
-        # increments.
-        group_instance_id = kafka.fetch(:'group.instance.id', false)
+        inject_group_instance_id(kafka)
 
-        kafka[:'group.instance.id'] = "#{group_instance_id}_#{@position}" if group_instance_id
-        kafka[:'client.id'] ||= Karafka::App.config.client_id
+        kafka[:'client.id'] ||= client_id
         kafka[:'group.id'] ||= @consumer_group.id
         kafka[:'auto.offset.reset'] ||= @topics.first.initial_offset
         # Karafka manages the offsets based on the processing state, thus we do not rely on the
@@ -110,6 +110,28 @@ module Karafka
         kafka.freeze
         kafka
       end
+
+      # If we use static group memberships, there can be a case, where same instance id would
+      # be set on many subscription groups as the group instance id from Karafka perspective is
+      # set per config. Each instance even if they are subscribed to different topics needs to
+      # have it fully unique. To make sure of that, we just add extra postfix at the end that
+      # increments.
+      #
+      # We also handle a swarm case, where the same setup would run from many forked nodes, hence
+      # affecting the instance id and causing conflicts
+      # @param kafka [Hash] kafka level config
+      def inject_group_instance_id(kafka)
+        group_instance_prefix = kafka.fetch(:'group.instance.id', false)
+
+        # If group instance id was not even configured, do nothing
+        return unless group_instance_prefix
+
+        # If there is a node, we need to take its id and inject it as well so multiple forks can
+        # have different instances ids but they are reproducible
+        components = [group_instance_prefix, node ? node.id : nil, @position]
+
+        kafka[:'group.instance.id'] = components.compact.join('_')
+      end
     end
   end
 end
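
The net effect of `inject_group_instance_id` is easiest to see with concrete values (the prefix, node id, and position here are made up for illustration):

```ruby
# Static membership prefix 'payments', swarm node id 2, subscription group position 0:
['payments', 2, 0].compact.join('_')   # => "payments_2_0"

# Outside of swarm mode `node` is false, so the node segment is simply dropped:
['payments', nil, 0].compact.join('_') # => "payments_0"
```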
data/lib/karafka/runner.rb
CHANGED
@@ -18,6 +18,10 @@ module Karafka
       workers = Processing::WorkersBatch.new(jobs_queue)
       listeners = Connection::ListenersBatch.new(jobs_queue)
 
+      # We mark it prior to delegating to the manager as manager will have to start at least one
+      # connection to Kafka, hence running
+      Karafka::App.run!
+
       # Register all the listeners so they can be started and managed
       @manager.register(listeners)
 
data/lib/karafka/server.rb
CHANGED
@@ -3,16 +3,6 @@
 module Karafka
   # Karafka consuming server class
   class Server
-    # How long should we sleep between checks on shutting down consumers
-    SUPERVISION_SLEEP = 0.1
-    # What system exit code should we use when we terminated forcefully
-    FORCEFUL_EXIT_CODE = 2
-    # This factor allows us to calculate how many times we have to sleep before
-    # a forceful shutdown
-    SUPERVISION_CHECK_FACTOR = (1 / SUPERVISION_SLEEP)
-
-    private_constant :SUPERVISION_SLEEP, :FORCEFUL_EXIT_CODE, :SUPERVISION_CHECK_FACTOR
-
     class << self
       # Set of consuming threads. Each consumer thread contains a single consumer
       attr_accessor :listeners
@@ -36,12 +26,20 @@ module Karafka
           config.internal.routing.activity_manager.to_h
         )
 
+        # We clear as we do not want parent handlers in case of working from fork
+        process.clear
         process.on_sigint { stop }
         process.on_sigquit { stop }
         process.on_sigterm { stop }
         process.on_sigtstp { quiet }
+        # Needed for instrumentation
+        process.on_sigttin {}
         process.supervise
 
+        # This will only run when not in a swarm mode. In swarm mode the server runs post-fork, so
+        # warmup will do nothing
+        Karafka::App.warmup
+
         # Start is blocking until stop is called and when we stop, it will wait until
         # all of the things are ready to stop
         start
@@ -61,10 +59,9 @@ module Karafka
       end
 
       # Starts Karafka with a supervision
-      # @note We don't need to sleep because Karafka::Runner is locking and waiting to
-      #   finish loop (and it won't happen until we explicitly want to stop)
+      # @note We don't need to sleep because Karafka::Runner is locking and waiting to finish loop
+      #   (and it won't happen until we explicitly want to stop)
       def start
-        Karafka::App.run!
         Karafka::Runner.new.call
       end
 
@@ -87,13 +84,13 @@ module Karafka
         # We check from time to time (for the timeout period) if all the threads finished
         # their work and if so, we can just return and normal shutdown process will take place
         # We divide it by 1000 because we use time in ms.
-        ((timeout / 1_000) * SUPERVISION_CHECK_FACTOR).to_i.times do
+        ((timeout / 1_000) * (1 / config.internal.supervision_sleep)).to_i.times do
           all_listeners_stopped = listeners.all?(&:stopped?)
           all_workers_stopped = workers.none?(&:alive?)
 
           return if all_listeners_stopped && all_workers_stopped
 
-          sleep(SUPERVISION_SLEEP)
+          sleep(config.internal.supervision_sleep)
         end
 
         raise Errors::ForcefulShutdownError
@@ -117,7 +114,7 @@ module Karafka
         return unless process.supervised?
 
         # exit! is not within the instrumentation as it would not trigger due to exit
-        Kernel.exit!(FORCEFUL_EXIT_CODE)
+        Kernel.exit!(config.internal.forceful_exit_code)
       ensure
         # We need to check if it wasn't an early exit to make sure that only on stop invocation
         # can change the status after everything is closed
data/lib/karafka/setup/config.rb
CHANGED
@@ -105,6 +105,17 @@ module Karafka
       # @see https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
       setting :kafka, default: {}
 
+      # Public configuration for swarm operations
+      setting :swarm do
+        # option [Integer] how many processes do we want to run in a swarm mode
+        # Keep in mind this is only applicable when running in a swarm mode
+        setting :nodes, default: 3
+        # This is set automatically when we fork. Used to hold reference that may be needed
+        # for static group membership, supervision and more. If set to `false`, it means this
+        # process is not a fork
+        setting :node, default: false
+      end
+
       # Admin specific settings.
       #
       # Since admin operations are often specific, they may require specific librdkafka settings
@@ -151,7 +162,6 @@ module Karafka
         # @note In the future, we need to have a single process representation for all the karafka
         #   instances
         setting :process, default: Process.new
-
         # Interval of "ticking". This is used to define the maximum time between consecutive
         # polling of the main rdkafka queue. It should match also the `statistics.interval.ms`
         # smallest value defined in any of the per-kafka settings, so metrics are published with
@@ -162,6 +172,36 @@ module Karafka
         # not to have enough time to run. This (not directly) defines also a single poll
         # max timeout as to allow for frequent enough events polling
         setting :tick_interval, default: 5_000
+        # How long should we sleep between checks on shutting down consumers
+        setting :supervision_sleep, default: 0.1
+        # What system exit code should we use when we terminated forcefully
+        setting :forceful_exit_code, default: 2
+
+        setting :swarm do
+          # Manager for swarm nodes control
+          setting :manager, default: Swarm::Manager.new
+          # Exit code we exit an orphaned child with to indicate something went wrong
+          setting :orphaned_exit_code, default: 3
+          # syscall number for https://man7.org/linux/man-pages/man2/pidfd_open.2.html
+          setting :pidfd_open_syscall, default: 434
+          # syscall number for https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html
+          setting :pidfd_signal_syscall, default: 424
+          # How often (in ms) should we control our nodes
+          # This is maximum time after which we will check. This can happen more often in case of
+          # system events.
+          setting :supervision_interval, default: 30_000
+          # How often should each node report its status
+          setting :liveness_interval, default: 10_000
+          # Listener used to report nodes state to the supervisor
+          setting :liveness_listener, default: Swarm::LivenessListener.new
+          # How long should we wait for any info from the node before we consider it hanging and
+          # stop it
+          setting :node_report_timeout, default: 30_000
+          # How long should we wait before restarting a node. This can prevent us from having a
+          # case where for some external reason our spawned process would die immediately and we
+          # would immediately try to start it back in an endless loop
+          setting :node_restart_timeout, default: 5_000
+        end
 
       # Namespace for CLI related settings
       setting :cli do
@@ -176,7 +216,6 @@ module Karafka
         # option subscription_groups_builder [Routing::SubscriptionGroupsBuilder] subscription
         #   group builder
         setting :subscription_groups_builder, default: Routing::SubscriptionGroupsBuilder.new
-
         # Internally assigned list of limits on routings active for the current process
         # This can be altered by the CLI command
         setting :activity_manager, default: Routing::ActivityManager.new
data/lib/karafka/status.rb
CHANGED
@@ -7,6 +7,7 @@ module Karafka
     STATES = {
       initializing: :initialize!,
       initialized: :initialized!,
+      supervising: :supervise!,
       running: :run!,
       # will no longer pickup any work, but current work will be finished
       quieting: :quiet!,
@@ -49,8 +50,8 @@ module Karafka
 
       def #{transition}
         MUTEX.synchronize do
-          # Do not allow reverse state transitions (we always go one way) or transition to the
-          # state as currently
+          # Do not allow reverse state transitions (we always go one way) or transition to the
+          # same state as currently
           return if @status && STATES.keys.index(:#{state}) <= STATES.keys.index(@status)
 
           @status = :#{state}
@@ -78,6 +79,7 @@ module Karafka
       def done?
         # Short-track for the most common case not to invoke all others on normal execution
         return false if running?
+        return false if supervising?
 
         stopping? || stopped? || quieting? || quiet? || terminated?
       end
data/lib/karafka/swarm/liveness_listener.rb
ADDED
@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+
+module Karafka
+  module Swarm
+    # Simple listener for swarm nodes that:
+    # - reports once in a while to make sure that supervisor is aware we do not hang
+    # - makes sure we did not become an orphan and if so, exits
+    class LivenessListener
+      include Karafka::Core::Helpers::Time
+      include Helpers::ConfigImporter.new(
+        node: %i[swarm node],
+        liveness_interval: %i[internal swarm liveness_interval],
+        orphaned_exit_code: %i[internal swarm orphaned_exit_code]
+      )
+
+      def initialize
+        @last_checked_at = 0
+        @mutex = Mutex.new
+      end
+
+      # Since there may be many statistics emitted from multiple listeners, we do not want to write
+      # statuses that often. Instead we do it only once in a while which should be enough
+      #
+      # While this may provide a small lag in the orphaned detection, it does not really matter
+      # as it will be picked up fast enough.
+      # @param _event [Karafka::Core::Monitoring::Event]
+      def on_statistics_emitted(_event)
+        periodically do
+          Kernel.exit!(orphaned_exit_code) if node.orphaned?
+
+          node.healthy
+        end
+      end
+
+      private
+
+      # Wraps the logic with a mutex
+      # @param block [Proc] code we want to run in mutex
+      def synchronize(&block)
+        @mutex.synchronize(&block)
+      end
+
+      # Runs requested code once in a while
+      def periodically
+        return if monotonic_now - @last_checked_at < liveness_interval
+
+        synchronize do
+          @last_checked_at = monotonic_now
+
+          yield
+        end
+      end
+    end
+  end
+end
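
The `periodically` helper is a small but reusable idea: run a block at most once per interval, no matter how often the triggering event fires. A standalone sketch of the same pattern (the `Throttler` class is hypothetical, not part of Karafka; `monotonic_now` comes from karafka-core and returns a monotonic clock in milliseconds):

```ruby
class Throttler
  include Karafka::Core::Helpers::Time

  def initialize(interval_ms)
    @interval_ms = interval_ms
    @last_run_at = 0
  end

  # Yields only when at least @interval_ms passed since the previous run
  def periodically
    return if monotonic_now - @last_run_at < @interval_ms

    @last_run_at = monotonic_now
    yield
  end
end

throttler = Throttler.new(10_000)
throttler.periodically { puts 'reported' } # runs
throttler.periodically { puts 'reported' } # skipped (less than 10s since last run)
```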