karafka 2.3.0 → 2.3.2
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +22 -22
- data/README.md +2 -2
- data/bin/integrations +2 -1
- data/bin/rspecs +6 -2
- data/config/locales/errors.yml +30 -8
- data/config/locales/pro_errors.yml +2 -0
- data/docker-compose.yml +1 -1
- data/lib/karafka/app.rb +14 -0
- data/lib/karafka/cli/base.rb +19 -0
- data/lib/karafka/cli/server.rb +62 -76
- data/lib/karafka/cli/swarm.rb +30 -0
- data/lib/karafka/constraints.rb +3 -3
- data/lib/karafka/contracts/config.rb +19 -0
- data/lib/karafka/errors.rb +12 -0
- data/lib/karafka/helpers/async.rb +13 -3
- data/lib/karafka/helpers/config_importer.rb +30 -0
- data/lib/karafka/instrumentation/logger_listener.rb +31 -0
- data/lib/karafka/instrumentation/notifications.rb +9 -0
- data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +2 -0
- data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +72 -0
- data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +11 -40
- data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +54 -0
- data/lib/karafka/pro/active_job/job_options_contract.rb +1 -1
- data/lib/karafka/pro/base_consumer.rb +16 -0
- data/lib/karafka/pro/connection/manager.rb +6 -1
- data/lib/karafka/pro/processing/coordinator.rb +13 -3
- data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +74 -0
- data/lib/karafka/pro/processing/coordinators/filters_applier.rb +107 -0
- data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +180 -0
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom.rb +8 -10
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom_vp.rb +8 -16
- data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_mom.rb +8 -10
- data/lib/karafka/pro/processing/strategies/aj/dlq_mom_vp.rb +7 -9
- data/lib/karafka/pro/processing/strategies/dlq/default.rb +36 -10
- data/lib/karafka/pro/processing/strategies/dlq/ftr.rb +3 -7
- data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +4 -8
- data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +6 -9
- data/lib/karafka/pro/processing/strategies/dlq/ftr_mom.rb +5 -15
- data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +4 -8
- data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +6 -9
- data/lib/karafka/pro/processing/strategies/dlq/mom.rb +10 -20
- data/lib/karafka/pro/processing/strategies/vp/default.rb +7 -0
- data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb +6 -0
- data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +39 -0
- data/lib/karafka/pro/swarm/liveness_listener.rb +171 -0
- data/lib/karafka/process.rb +27 -1
- data/lib/karafka/routing/features/dead_letter_queue/config.rb +2 -0
- data/lib/karafka/routing/subscription_group.rb +31 -9
- data/lib/karafka/runner.rb +4 -0
- data/lib/karafka/server.rb +13 -16
- data/lib/karafka/setup/config.rb +41 -2
- data/lib/karafka/status.rb +4 -2
- data/lib/karafka/swarm/liveness_listener.rb +55 -0
- data/lib/karafka/swarm/manager.rb +217 -0
- data/lib/karafka/swarm/node.rb +179 -0
- data/lib/karafka/swarm/pidfd.rb +131 -0
- data/lib/karafka/swarm/supervisor.rb +184 -0
- data/lib/karafka/swarm.rb +27 -0
- data/lib/karafka/templates/karafka.rb.erb +0 -2
- data/lib/karafka/version.rb +1 -1
- data/lib/karafka.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +17 -4
- metadata.gz.sig +0 -0
- data/lib/karafka/pro/processing/filters_applier.rb +0 -105
- data/lib/karafka/pro/processing/virtual_offset_manager.rb +0 -177
data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb
CHANGED
@@ -28,6 +28,12 @@ module Karafka
                ).fetch('en').fetch('validations').fetch('topic')
              end
 
+             nested(:dead_letter_queue) do
+               # We use strategy based DLQ for every case in Pro
+               # For default (when no strategy) a default `max_retries` based strategy is used
+               required(:strategy) { |val| val.respond_to?(:call) }
+             end
+
              # Make sure that when we use virtual partitions with DLQ, at least one retry is set
              # We cannot use VP with DLQ without retries as we in order to provide ordering
              # warranties on errors with VP, we need to collapse the VPs concurrency and retry
data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Routing
+      module Features
+        class DeadLetterQueue < Base
+          # Expansions to the topic API in DLQ
+          module Topic
+            # @param strategy [#call, nil] Strategy we want to use or nil if a default strategy
+            #   (same as in OSS) should be applied
+            # @param args [Hash] OSS DLQ arguments
+            def dead_letter_queue(strategy: nil, **args)
+              return @dead_letter_queue if @dead_letter_queue
+
+              super(**args).tap do |config|
+                # If explicit strategy is not provided, use the default approach from OSS
+                config.strategy = strategy || lambda do |_errors_tracker, attempt|
+                  attempt > config.max_retries ? :dispatch : :retry
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
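For context on the new Pro option above: `strategy:` accepts anything that responds to `call`; it is handed the errors tracker and the attempt number and returns `:dispatch` or `:retry` (the built-in fallback in the diff simply compares the attempt against `max_retries`). A hedged routing sketch follows — the topic names, the consumer class and the `errors_tracker.last` error-class check are illustrative assumptions, not part of this release:

```ruby
class KarafkaApp < Karafka::App
  routes.draw do
    topic :orders do
      consumer OrdersConsumer # hypothetical consumer class

      dead_letter_queue(
        topic: 'orders_dlq',
        max_retries: 2,
        # Assumed custom strategy: dispatch immediately on a parsing error,
        # otherwise retry up to two times before moving the message to the DLQ
        strategy: lambda do |errors_tracker, attempt|
          return :dispatch if errors_tracker.last.is_a?(JSON::ParserError)

          attempt > 2 ? :dispatch : :retry
        end
      )
    end
  end
end
```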
data/lib/karafka/pro/swarm/liveness_listener.rb
@@ -0,0 +1,171 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    # Pro Swarm components namespace
+    module Swarm
+      # Pro listener that monitors RSS usage and other heartbeat metrics (if configured) to ensure
+      # that everything operates.
+      #
+      # It can:
+      # - monitor poll frequency to make sure things are not polled not often enough
+      # - monitor consumption to make sure we do not process data for too long
+      # - monitor RSS to make sure that we do not use too much memory
+      #
+      # By default it does **not** monitor memory and consuming and polling is configured in such
+      # a way to align with `max.poll.interval.ms` and other defaults.
+      #
+      # Failure statuses reported are as follows:
+      # - 1 - polling ttl exceeded
+      # - 2 - consuming ttl exceeded
+      # - 3 - memory limit exceeded
+      #
+      # @note This listener should not break anything if subscribed in the supervisor prior to
+      #   forking as it relies on server events for operations.
+      class LivenessListener < Karafka::Swarm::LivenessListener
+        # @param memory_limit [Integer] max memory in MB for this process to be considered healthy
+        # @param consuming_ttl [Integer] time in ms after which we consider consumption hanging.
+        #   It allows us to define max consumption time after which supervisor should consider
+        #   given process as hanging
+        # @param polling_ttl [Integer] max time in ms for polling. If polling (any) does not
+        #   happen that often, process should be considered dead.
+        # @note The default TTL matches the default `max.poll.interval.ms`
+        def initialize(
+          memory_limit: Float::INFINITY,
+          consuming_ttl: 5 * 60 * 1_000,
+          polling_ttl: 5 * 60 * 1_000
+        )
+          @polling_ttl = polling_ttl
+          @consuming_ttl = consuming_ttl
+          # We cast it just in case someone would provide '10MB' or something similar
+          @memory_limit = memory_limit.is_a?(String) ? memory_limit.to_i : memory_limit
+          @pollings = {}
+          @consumptions = {}
+
+          super()
+        end
+
+        # Tick on each fetch
+        #
+        # @param _event [Karafka::Core::Monitoring::Event]
+        def on_connection_listener_fetch_loop(_event)
+          mark_polling_tick
+        end
+
+        {
+          consume: :consumed,
+          revoke: :revoked,
+          shutting_down: :shutdown,
+          tick: :ticked
+        }.each do |before, after|
+          class_eval <<~RUBY, __FILE__, __LINE__ + 1
+            # Tick on starting work
+            # @param _event [Karafka::Core::Monitoring::Event]
+            def on_consumer_#{before}(_event)
+              mark_consumption_tick
+            end
+
+            # Tick on finished work
+            # @param _event [Karafka::Core::Monitoring::Event]
+            def on_consumer_#{after}(_event)
+              clear_consumption_tick
+            end
+          RUBY
+        end
+
+        # @param _event [Karafka::Core::Monitoring::Event]
+        def on_error_occurred(_event)
+          clear_consumption_tick
+          clear_polling_tick
+        end
+
+        # Reports the current status once in a while
+        #
+        # @param _event [Karafka::Core::Monitoring::Event]
+        def on_statistics_emitted(_event)
+          periodically do
+            return unless node
+
+            current_status = status
+
+            current_status.positive? ? node.unhealthy(current_status) : node.healthy
+          end
+        end
+
+        private
+
+        # @return [Integer] object id of the current thread
+        def thread_id
+          Thread.current.object_id
+        end
+
+        # Update the polling tick time for current thread
+        def mark_polling_tick
+          synchronize do
+            @pollings[thread_id] = monotonic_now
+          end
+        end
+
+        # Clear current thread polling time tracker
+        def clear_polling_tick
+          synchronize do
+            @pollings.delete(thread_id)
+          end
+        end
+
+        # Update the processing tick time
+        def mark_consumption_tick
+          synchronize do
+            @consumptions[thread_id] = monotonic_now
+          end
+        end
+
+        # Clear current thread consumption time tracker
+        def clear_consumption_tick
+          synchronize do
+            @consumptions.delete(thread_id)
+          end
+        end
+
+        # Did we exceed any of the ttls
+        # @return [String] 204 string if ok, 500 otherwise
+        def status
+          time = monotonic_now
+
+          return 1 if @pollings.values.any? { |tick| (time - tick) > @polling_ttl }
+          return 2 if @consumptions.values.any? { |tick| (time - tick) > @consuming_ttl }
+          return 3 if rss_mb > @memory_limit
+
+          0
+        end
+
+        # @return [Integer] RSS in MB for the current process
+        # @note Since swarm is linux only, we do not have to worry about getting RSS on other OSes
+        def rss_mb
+          kb_rss = 0
+
+          IO.readlines("/proc/#{node.pid}/status").each do |line|
+            next unless line.start_with?('VmRSS:')
+
+            kb_rss = line.split[1].to_i
+
+            break
+          end
+
+          (kb_rss / 1_024.to_i).round
+        end
+      end
+    end
+  end
+end
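Given the constructor above, subscribing the Pro listener goes through the standard Karafka monitor, typically in `karafka.rb` before the swarm forks (per the `@note`). The limits below are example values only:

```ruby
# karafka.rb
Karafka.monitor.subscribe(
  Karafka::Pro::Swarm::LivenessListener.new(
    memory_limit: 3 * 1_024,        # MB; example cap of roughly 3 GB of RSS
    polling_ttl: 5 * 60 * 1_000,    # ms; matches the default shown above
    consuming_ttl: 5 * 60 * 1_000   # ms
  )
)
```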
data/lib/karafka/process.rb
CHANGED
@@ -14,6 +14,8 @@ module Karafka
       SIGTERM
       SIGTTIN
       SIGTSTP
+      SIGCHLD
+      SIGUSER1
     ].freeze
 
     HANDLED_SIGNALS.each do |signal|
@@ -32,16 +34,40 @@ module Karafka
       RUBY
     end
 
+    # Assigns a callback that will run on any supported signal that has at least one callback
+    # registered already.
+    # @param block [Proc] code we want to run
+    # @note This will only bind to signals that already have at least one callback defined
+    def on_any_active(&block)
+      HANDLED_SIGNALS.each do |signal|
+        next unless @callbacks.key?(signal)
+
+        public_send(:"on_#{signal.to_s.downcase}", &block)
+      end
+    end
+
     # Creates an instance of process and creates empty hash for callbacks
     def initialize
       @callbacks = Hash.new { |hsh, key| hsh[key] = [] }
       @supervised = false
     end
 
+    # Clears all the defined callbacks. Useful for post-fork cleanup when parent already defined
+    # some signals
+    def clear
+      @callbacks.clear
+    end
+
     # Method catches all HANDLED_SIGNALS and performs appropriate callbacks (if defined)
     # @note If there are no callbacks, this method will just ignore a given signal that was sent
     def supervise
-      HANDLED_SIGNALS.each
+      HANDLED_SIGNALS.each do |signal|
+        # Supervise only signals for which we have defined callbacks
+        next unless @callbacks.key?(signal)
+
+        trap_signal(signal)
+      end
+
       @supervised = true
     end
 
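A small illustrative sketch (not taken from the gem) of how the extended `Karafka::Process` API composes — the `on_sig*` methods are generated from `HANDLED_SIGNALS`, `on_any_active` attaches only to signals that already have callbacks, and `clear` drops callbacks inherited from a parent after forking:

```ruby
process = Karafka::Process.new

process.on_sigint  { puts 'stop requested' }
process.on_sigterm { puts 'stop requested' }

# Runs for SIGINT and SIGTERM (both already have callbacks), but not for e.g. SIGTSTP
process.on_any_active { puts 'shutting down soon' }

# Traps only the signals that have callbacks and marks the process as supervised
process.supervise

# In a freshly forked child, the parent's callbacks could be dropped with:
process.clear
```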
data/lib/karafka/routing/subscription_group.rb
CHANGED
@@ -8,6 +8,12 @@ module Karafka
     # @note One subscription group will always belong to one consumer group, but one consumer
     #   group can have multiple subscription groups.
     class SubscriptionGroup
+      include Helpers::ConfigImporter.new(
+        activity_manager: %i[internal routing activity_manager],
+        client_id: %i[client_id],
+        node: %i[swarm node]
+      )
+
       attr_reader :id, :name, :topics, :kafka, :consumer_group
 
       # Lock for generating new ids safely
@@ -67,7 +73,7 @@ module Karafka
 
       # @return [Boolean] is this subscription group one of active once
       def active?
-        Karafka::App.config.internal.routing.activity_manager.active?(:subscription_groups, name)
+        activity_manager.active?(:subscription_groups, name)
       end
 
       # @return [Array<String>] names of topics to which we should subscribe.
@@ -93,15 +99,9 @@ module Karafka
       def build_kafka
         kafka = Setup::AttributesMap.consumer(@topics.first.kafka.dup)
 
-        # If we use static group memberships, there can be a case, where same instance id would
-        # be set on many subscription groups as the group instance id from Karafka perspective is
-        # set per config. Each instance even if they are subscribed to different topics needs to
-        # have it fully unique. To make sure of that, we just add extra postfix at the end that
-        # increments.
-        group_instance_id = kafka.fetch(:'group.instance.id', false)
+        inject_group_instance_id(kafka)
 
-        kafka[:'
-        kafka[:'client.id'] ||= Karafka::App.config.client_id
+        kafka[:'client.id'] ||= client_id
         kafka[:'group.id'] ||= @consumer_group.id
         kafka[:'auto.offset.reset'] ||= @topics.first.initial_offset
         # Karafka manages the offsets based on the processing state, thus we do not rely on the
@@ -110,6 +110,28 @@ module Karafka
         kafka.freeze
         kafka
       end
+
+      # If we use static group memberships, there can be a case, where same instance id would
+      # be set on many subscription groups as the group instance id from Karafka perspective is
+      # set per config. Each instance even if they are subscribed to different topics needs to
+      # have it fully unique. To make sure of that, we just add extra postfix at the end that
+      # increments.
+      #
+      # We also handle a swarm case, where the same setup would run from many forked nodes, hence
+      # affecting the instance id and causing conflicts
+      # @param kafka [Hash] kafka level config
+      def inject_group_instance_id(kafka)
+        group_instance_prefix = kafka.fetch(:'group.instance.id', false)
+
+        # If group instance id was not even configured, do nothing
+        return unless group_instance_prefix
+
+        # If there is a node, we need to take its id and inject it as well so multiple forks can
+        # have different instances ids but they are reproducible
+        components = [group_instance_prefix, node ? node.id : nil, @position]
+
+        kafka[:'group.instance.id'] = components.compact.join('_')
+      end
     end
   end
 end
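The injected `group.instance.id` above is just the configured prefix joined with the swarm node id (when running from a fork) and the subscription group position. Illustratively, assuming a `'my-app'` prefix, node id `1` and position `0`:

```ruby
# Values are assumed examples; this mirrors what inject_group_instance_id assigns
['my-app', 1, 0].compact.join('_')   # => "my-app_1_0"

# Outside of swarm mode there is no node, so only the prefix and position remain
['my-app', nil, 0].compact.join('_') # => "my-app_0"
```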
data/lib/karafka/runner.rb
CHANGED
@@ -18,6 +18,10 @@ module Karafka
       workers = Processing::WorkersBatch.new(jobs_queue)
       listeners = Connection::ListenersBatch.new(jobs_queue)
 
+      # We mark it prior to delegating to the manager as manager will have to start at least one
+      # connection to Kafka, hence running
+      Karafka::App.run!
+
       # Register all the listeners so they can be started and managed
       @manager.register(listeners)
 
data/lib/karafka/server.rb
CHANGED
@@ -3,16 +3,6 @@
 module Karafka
   # Karafka consuming server class
   class Server
-    # How long should we sleep between checks on shutting down consumers
-    SUPERVISION_SLEEP = 0.1
-    # What system exit code should we use when we terminated forcefully
-    FORCEFUL_EXIT_CODE = 2
-    # This factor allows us to calculate how many times we have to sleep before
-    # a forceful shutdown
-    SUPERVISION_CHECK_FACTOR = (1 / SUPERVISION_SLEEP)
-
-    private_constant :SUPERVISION_SLEEP, :FORCEFUL_EXIT_CODE, :SUPERVISION_CHECK_FACTOR
-
     class << self
       # Set of consuming threads. Each consumer thread contains a single consumer
       attr_accessor :listeners
@@ -36,12 +26,20 @@ module Karafka
          config.internal.routing.activity_manager.to_h
        )
 
+        # We clear as we do not want parent handlers in case of working from fork
+        process.clear
        process.on_sigint { stop }
        process.on_sigquit { stop }
        process.on_sigterm { stop }
        process.on_sigtstp { quiet }
+        # Needed for instrumentation
+        process.on_sigttin {}
        process.supervise
 
+        # This will only run when not in a swarm mode. In swarm mode the server runs post-fork, so
+        # warmup will do nothing
+        Karafka::App.warmup
+
        # Start is blocking until stop is called and when we stop, it will wait until
        # all of the things are ready to stop
        start
@@ -61,10 +59,9 @@ module Karafka
      end
 
      # Starts Karafka with a supervision
-      # @note We don't need to sleep because Karafka::
-      #
+      # @note We don't need to sleep because Karafka::Runner is locking and waiting to finish loop
+      #   (and it won't happen until we explicitly want to stop)
      def start
-        Karafka::App.run!
        Karafka::Runner.new.call
      end
 
@@ -87,13 +84,13 @@ module Karafka
        # We check from time to time (for the timeout period) if all the threads finished
        # their work and if so, we can just return and normal shutdown process will take place
        # We divide it by 1000 because we use time in ms.
-        ((timeout / 1_000) *
+        ((timeout / 1_000) * (1 / config.internal.supervision_sleep)).to_i.times do
          all_listeners_stopped = listeners.all?(&:stopped?)
          all_workers_stopped = workers.none?(&:alive?)
 
          return if all_listeners_stopped && all_workers_stopped
 
-          sleep
+          sleep(config.internal.supervision_sleep)
        end
 
        raise Errors::ForcefulShutdownError
@@ -117,7 +114,7 @@ module Karafka
        return unless process.supervised?
 
        # exit! is not within the instrumentation as it would not trigger due to exit
-        Kernel.exit!(
+        Kernel.exit!(config.internal.forceful_exit_code)
      ensure
        # We need to check if it wasn't an early exit to make sure that only on stop invocation
        # can change the status after everything is closed
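With the former constants now read from `config.internal`, the number of shutdown checks follows directly from the configured timeout and `supervision_sleep`. For example, assuming the default 60 second `shutdown_timeout` and the 0.1s sleep introduced in `setup/config.rb` below:

```ruby
timeout = 60_000        # ms; assumed default shutdown_timeout
supervision_sleep = 0.1 # seconds; default from config.internal

((timeout / 1_000) * (1 / supervision_sleep)).to_i
# => 600 checks of listeners/workers before Errors::ForcefulShutdownError is raised
```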
data/lib/karafka/setup/config.rb
CHANGED
@@ -105,6 +105,17 @@ module Karafka
      # @see https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
      setting :kafka, default: {}
 
+      # Public configuration for swarm operations
+      setting :swarm do
+        # option [Integer] how many processes do we want to run in a swarm mode
+        # Keep in mind this is only applicable when running in a swarm mode
+        setting :nodes, default: 3
+        # This is set automatically when we fork. Used to hold reference that may be needed
+        # for static group membership, supervision and more. If set to `false`, it means this
+        # process is not a fork
+        setting :node, default: false
+      end
+
      # Admin specific settings.
      #
      # Since admin operations are often specific, they may require specific librdkafka settings
@@ -151,7 +162,6 @@ module Karafka
        # @note In the future, we need to have a single process representation for all the karafka
        #   instances
        setting :process, default: Process.new
-
        # Interval of "ticking". This is used to define the maximum time between consecutive
        # polling of the main rdkafka queue. It should match also the `statistics.interval.ms`
        # smallest value defined in any of the per-kafka settings, so metrics are published with
@@ -162,6 +172,36 @@ module Karafka
        # not to have enough time to run. This (not directly) defines also a single poll
        # max timeout as to allow for frequent enough events polling
        setting :tick_interval, default: 5_000
+        # How long should we sleep between checks on shutting down consumers
+        setting :supervision_sleep, default: 0.1
+        # What system exit code should we use when we terminated forcefully
+        setting :forceful_exit_code, default: 2
+
+        setting :swarm do
+          # Manager for swarm nodes control
+          setting :manager, default: Swarm::Manager.new
+          # Exit code we exit an orphaned child with to indicate something went wrong
+          setting :orphaned_exit_code, default: 3
+          # syscall number for https://man7.org/linux/man-pages/man2/pidfd_open.2.html
+          setting :pidfd_open_syscall, default: 434
+          # syscall number for https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html
+          setting :pidfd_signal_syscall, default: 424
+          # How often (in ms) should we control our nodes
+          # This is maximum time after which we will check. This can happen more often in case of
+          # system events.
+          setting :supervision_interval, default: 30_000
+          # How often should each node report its status
+          setting :liveness_interval, default: 10_000
+          # Listener used to report nodes state to the supervisor
+          setting :liveness_listener, default: Swarm::LivenessListener.new
+          # How long should we wait for any info from the node before we consider it hanging at
+          #   stop it
+          setting :node_report_timeout, default: 30_000
+          # How long should we wait before restarting a node. This can prevent us from having a
+          #   case where for some external reason our spawned process would die immediately and we
+          #   would immediately try to start it back in an endless loop
+          setting :node_restart_timeout, default: 5_000
+        end
 
        # Namespace for CLI related settings
        setting :cli do
@@ -176,7 +216,6 @@ module Karafka
          # option subscription_groups_builder [Routing::SubscriptionGroupsBuilder] subscription
          #   group builder
          setting :subscription_groups_builder, default: Routing::SubscriptionGroupsBuilder.new
-
          # Internally assigned list of limits on routings active for the current process
          # This can be altered by the CLI command
          setting :activity_manager, default: Routing::ActivityManager.new
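The new public `swarm` root can be tuned like any other setting in the app setup block, while the `internal.swarm` values (pidfd syscalls, timeouts, listeners) are normally left alone. A sketch with an arbitrary node count — broker address and client id are placeholders:

```ruby
class KarafkaApp < Karafka::App
  setup do |config|
    config.client_id = 'example_app'
    config.kafka = { 'bootstrap.servers': '127.0.0.1:9092' }

    # Run five forked consumer processes when started in swarm mode
    config.swarm.nodes = 5
  end
end
```

Swarm mode itself is started through the new CLI command added in this release (`data/lib/karafka/cli/swarm.rb`), presumably via `bundle exec karafka swarm`.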
data/lib/karafka/status.rb
CHANGED
@@ -7,6 +7,7 @@ module Karafka
     STATES = {
       initializing: :initialize!,
       initialized: :initialized!,
+      supervising: :supervise!,
       running: :run!,
       # will no longer pickup any work, but current work will be finished
       quieting: :quiet!,
@@ -49,8 +50,8 @@ module Karafka
 
        def #{transition}
          MUTEX.synchronize do
-            # Do not allow reverse state transitions (we always go one way) or transition to the
-            # state as currently
+            # Do not allow reverse state transitions (we always go one way) or transition to the
+            # same state as currently
            return if @status && STATES.keys.index(:#{state}) <= STATES.keys.index(@status)
 
            @status = :#{state}
@@ -78,6 +79,7 @@ module Karafka
     def done?
       # Short-track for the most common case not to invoke all others on normal execution
       return false if running?
+      return false if supervising?
 
       stopping? || stopped? || quieting? || quiet? || terminated?
     end
data/lib/karafka/swarm/liveness_listener.rb
@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+
+module Karafka
+  module Swarm
+    # Simple listener for swarm nodes that:
+    # - reports once in a while to make sure that supervisor is aware we do not hang
+    # - makes sure we did not become an orphan and if so, exits
+    class LivenessListener
+      include Karafka::Core::Helpers::Time
+      include Helpers::ConfigImporter.new(
+        node: %i[swarm node],
+        liveness_interval: %i[internal swarm liveness_interval],
+        orphaned_exit_code: %i[internal swarm orphaned_exit_code]
+      )
+
+      def initialize
+        @last_checked_at = 0
+        @mutex = Mutex.new
+      end
+
+      # Since there may be many statistics emitted from multiple listeners, we do not want to write
+      # statuses that often. Instead we do it only once in a while which should be enough
+      #
+      # While this may provide a small lag in the orphaned detection, it does not really matter
+      # as it will be picked up fast enough.
+      # @param _event [Karafka::Core::Monitoring::Event]
+      def on_statistics_emitted(_event)
+        periodically do
+          Kernel.exit!(orphaned_exit_code) if node.orphaned?
+
+          node.healthy
+        end
+      end
+
+      private
+
+      # Wraps the logic with a mutex
+      # @param block [Proc] code we want to run in mutex
+      def synchronize(&block)
+        @mutex.synchronize(&block)
+      end
+
+      # Runs requested code once in a while
+      def periodically
+        return if monotonic_now - @last_checked_at < liveness_interval
+
+        synchronize do
+          @last_checked_at = monotonic_now
+
+          yield
+        end
+      end
+    end
+  end
+end