karafka 2.3.1 → 2.3.2

Files changed (70)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.rspec +2 -0
  4. data/CHANGELOG.md +12 -0
  5. data/Gemfile.lock +6 -6
  6. data/bin/integrations +2 -1
  7. data/bin/rspecs +6 -2
  8. data/config/locales/errors.yml +30 -8
  9. data/config/locales/pro_errors.yml +2 -0
  10. data/docker-compose.yml +1 -1
  11. data/lib/karafka/app.rb +14 -0
  12. data/lib/karafka/cli/base.rb +19 -0
  13. data/lib/karafka/cli/server.rb +62 -76
  14. data/lib/karafka/cli/swarm.rb +30 -0
  15. data/lib/karafka/constraints.rb +3 -3
  16. data/lib/karafka/contracts/config.rb +19 -0
  17. data/lib/karafka/errors.rb +12 -0
  18. data/lib/karafka/helpers/config_importer.rb +30 -0
  19. data/lib/karafka/instrumentation/logger_listener.rb +31 -0
  20. data/lib/karafka/instrumentation/notifications.rb +9 -0
  21. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +2 -0
  22. data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +72 -0
  23. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +11 -40
  24. data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +54 -0
  25. data/lib/karafka/pro/active_job/job_options_contract.rb +1 -1
  26. data/lib/karafka/pro/base_consumer.rb +16 -0
  27. data/lib/karafka/pro/connection/manager.rb +6 -1
  28. data/lib/karafka/pro/processing/coordinator.rb +13 -3
  29. data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +74 -0
  30. data/lib/karafka/pro/processing/coordinators/filters_applier.rb +107 -0
  31. data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +180 -0
  32. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +5 -7
  33. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +5 -7
  34. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom.rb +8 -10
  35. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom_vp.rb +8 -16
  36. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +5 -7
  37. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +5 -7
  38. data/lib/karafka/pro/processing/strategies/aj/dlq_mom.rb +8 -10
  39. data/lib/karafka/pro/processing/strategies/aj/dlq_mom_vp.rb +7 -9
  40. data/lib/karafka/pro/processing/strategies/dlq/default.rb +36 -10
  41. data/lib/karafka/pro/processing/strategies/dlq/ftr.rb +3 -7
  42. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +4 -8
  43. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +6 -9
  44. data/lib/karafka/pro/processing/strategies/dlq/ftr_mom.rb +5 -15
  45. data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +4 -8
  46. data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +6 -9
  47. data/lib/karafka/pro/processing/strategies/dlq/mom.rb +10 -20
  48. data/lib/karafka/pro/processing/strategies/vp/default.rb +7 -0
  49. data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb +6 -0
  50. data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +39 -0
  51. data/lib/karafka/pro/swarm/liveness_listener.rb +171 -0
  52. data/lib/karafka/process.rb +27 -1
  53. data/lib/karafka/routing/features/dead_letter_queue/config.rb +2 -0
  54. data/lib/karafka/routing/subscription_group.rb +31 -9
  55. data/lib/karafka/server.rb +11 -13
  56. data/lib/karafka/setup/config.rb +41 -2
  57. data/lib/karafka/status.rb +4 -2
  58. data/lib/karafka/swarm/liveness_listener.rb +55 -0
  59. data/lib/karafka/swarm/manager.rb +217 -0
  60. data/lib/karafka/swarm/node.rb +179 -0
  61. data/lib/karafka/swarm/pidfd.rb +131 -0
  62. data/lib/karafka/swarm/supervisor.rb +184 -0
  63. data/lib/karafka/swarm.rb +27 -0
  64. data/lib/karafka/version.rb +1 -1
  65. data/lib/karafka.rb +1 -1
  66. data.tar.gz.sig +0 -0
  67. metadata +17 -4
  68. metadata.gz.sig +0 -0
  69. data/lib/karafka/pro/processing/filters_applier.rb +0 -105
  70. data/lib/karafka/pro/processing/virtual_offset_manager.rb +0 -177
data/lib/karafka/swarm/manager.rb
@@ -0,0 +1,217 @@
+ # frozen_string_literal: true
+
+ module Karafka
+   module Swarm
+     # Manager similar to the one for threads but managing processing nodes
+     # It starts nodes and keeps an eye on them.
+     #
+     # If any of the nodes is misbehaving (based on the liveness listener) it will be restarted.
+     # Initially gracefully, but if it won't stop itself, it will be forced to.
+     #
+     # @note This is intended to run in the supervisor under mutexes (when needed)
+     class Manager
+       include Karafka::Core::Helpers::Time
+       include Helpers::ConfigImporter.new(
+         monitor: %i[monitor],
+         nodes_count: %i[swarm nodes],
+         shutdown_timeout: %i[shutdown_timeout],
+         node_report_timeout: %i[internal swarm node_report_timeout],
+         node_restart_timeout: %i[internal swarm node_restart_timeout]
+       )
+
+       # @return [Array<Node>] All nodes that manager manages
+       attr_reader :nodes
+
+       def initialize
+         @nodes = []
+         @statuses = Hash.new { |h, k| h[k] = {} }
+       end
+
+       # Starts all the expected nodes for the first time
+       def start
+         pidfd = Pidfd.new(::Process.pid)
+
+         @nodes = Array.new(nodes_count) do |i|
+           start_one Node.new(i, pidfd)
+         end
+       end
+
+       # Attempts to quiet all the nodes
+       def quiet
+         @nodes.each(&:quiet)
+       end
+
+       # Attempts to stop all the nodes
+       def stop
+         @nodes.each(&:stop)
+       end
+
+       # Terminates all the nodes
+       def terminate
+         @nodes.each(&:terminate)
+       end
+
+       # Collects all processes statuses
+       def cleanup
+         @nodes.each(&:cleanup)
+       end
+
+       # Sends given signal to all nodes
+       # @param signal [String] signal name
+       def signal(signal)
+         @nodes.each { |node| node.signal(signal) }
+       end
+
+       # @return [Boolean] true if none of the nodes is running
+       def stopped?
+         @nodes.none?(&:alive?)
+       end
+
+       # Checks on nodes if they are ok, one after another
+       def control
+         monitor.instrument('swarm.manager.control', caller: self) do
+           @nodes.each do |node|
+             statuses = @statuses[node]
+
+             if node.alive?
+               next if terminate_if_hanging(statuses, node)
+               next if stop_if_not_healthy(statuses, node)
+               next if stop_if_not_responding(statuses, node)
+             else
+               next if cleanup_one(statuses, node)
+               next if restart_after_timeout(statuses, node)
+             end
+           end
+         end
+       end
+
+       private
+
+       # If we've issued a stop to this process and it does not want to stop in the period, kills it
+       #
+       # @param statuses [Hash] hash with statuses transitions with times
+       # @param [Swarm::Node] node we're checking
+       # @return [Boolean] should it be the last action taken on this node in this run
+       def terminate_if_hanging(statuses, node)
+         return false unless statuses.key?(:stop)
+         # If we already sent the termination request, we should not do it again
+         return true if statuses.key?(:terminate)
+         # Do not run any other checks on this node if it is in the process of stopping but still has time
+         return true unless over?(statuses[:stop], shutdown_timeout)
+
+         monitor.instrument('swarm.manager.terminating', caller: self, node: node) do
+           node.terminate
+           statuses[:terminate] = monotonic_now
+         end
+
+         true
+       end
+
+       # Checks if there is any new liveness report from the given node and, if yes, issues a stop
+       # if it reported it is not healthy.
+       #
+       # @param statuses [Hash] hash with statuses transitions with times
+       # @param [Swarm::Node] node we're checking
+       # @return [Boolean] should it be the last action taken on this node in this run
+       def stop_if_not_healthy(statuses, node)
+         status = node.status
+
+         case status
+         # If no new state reported, we should just move on with other checks
+         when -1
+           false
+         when 0
+           # Exists and reports as healthy, so no other checks should happen on it in this go
+           statuses[:control] = monotonic_now
+           true
+         else
+           # A single invalid report will cause it to stop. We do not support intermediate failures
+           # that would recover. Such states should be implemented in the listener.
+           monitor.instrument('swarm.manager.stopping', caller: self, node: node, status: status) do
+             node.stop
+             statuses[:stop] = monotonic_now
+           end
+
+           true
+         end
+       end
+
+       # If node stopped responding, starts the stopping procedure.
+       #
+       # @param statuses [Hash] hash with statuses transitions with times
+       # @param [Swarm::Node] node we're checking
+       # @return [Boolean] should it be the last action taken on this node in this run
+       def stop_if_not_responding(statuses, node)
+         # Do nothing if already stopping
+         return true if statuses.key?(:stop)
+         # Do nothing if we've received a status update recently enough
+         return true unless over?(statuses[:control], node_report_timeout)
+
+         # Start the stopping procedure if the node stopped reporting frequently enough
+         monitor.instrument('swarm.manager.stopping', caller: self, node: node) do
+           node.stop
+           statuses[:stop] = monotonic_now
+         end
+
+         true
+       end
+
+       # Cleans up a dead process and remembers time of death for restart after a period.
+       #
+       # @param statuses [Hash] hash with statuses transitions with times
+       # @param [Swarm::Node] node we're checking
+       # @return [Boolean] should it be the last action taken on this node in this run
+       def cleanup_one(statuses, node)
+         return false if statuses.key?(:dead_since)
+
+         node.cleanup
+         statuses[:dead_since] = monotonic_now
+
+         true
+       end
+
+       # Restarts the node if there was enough of a backoff.
+       #
+       # We always wait a bit to make sure we do not overload the system in case forks were
+       # killed for some external reason.
+       #
+       # @param statuses [Hash] hash with statuses transitions with times
+       # @param [Swarm::Node] node we're checking
+       # @return [Boolean] should it be the last action taken on this node in this run
+       def restart_after_timeout(statuses, node)
+         return false unless over?(statuses[:dead_since], node_restart_timeout)
+
+         start_one(node)
+
+         true
+       end
+
+       # Starts a new node (or restarts a dead one)
+       #
+       # @param [Swarm::Node] node we're starting
+       def start_one(node)
+         instr_args = { caller: self, node: node }
+
+         statuses = @statuses[node]
+
+         statuses.clear
+         statuses[:control] = monotonic_now
+
+         monitor.instrument('swarm.manager.before_fork', instr_args)
+         node.start
+         monitor.instrument('swarm.manager.after_fork', instr_args)
+
+         node
+       end
+
+       # Are we over a certain time from an event happening
+       #
+       # @param event_time [Float] when something happened
+       # @param delay [Float] how long should we wait
+       # @return [Boolean] true if we're past the delay
+       def over?(event_time, delay)
+         monotonic_now - event_time >= delay
+       end
+     end
+   end
+ end
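
For orientation, below is a minimal, illustrative sketch of how this Manager API could be driven from a supervising process. It is not how Karafka wires it beyond what the diff shows; the actual driver is the Swarm::Supervisor added further down, which adds signal handling, locking and configurable timeouts.

# Illustrative sketch only: assumes a booted Karafka app so the ConfigImporter
# settings (swarm nodes count, timeouts) are available.
manager = Karafka::Swarm::Manager.new
manager.start

# Periodically check nodes; unhealthy or unresponsive ones are stopped,
# hanging ones terminated, dead ones reaped and restarted after a backoff
10.times do
  manager.control
  sleep(5)
end

# Graceful shutdown: request stop, wait for the forks to exit, reap them
manager.stop
sleep(1) until manager.stopped?
manager.cleanup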
data/lib/karafka/swarm/node.rb
@@ -0,0 +1,179 @@
+ # frozen_string_literal: true
+
+ module Karafka
+   module Swarm
+     # Represents a single forked process node in a swarm
+     # Provides simple API to control forks and check their status
+     #
+     # @note Some of these APIs are for the parent process only
+     #
+     # @note Keep in mind this can be used in both forks and supervisor and has a slightly different
+     #   role in each. In case of the supervisor it is used to get information about the child and
+     #   make certain requests to it. In case of the child, it is used to provide zombie-fencing and
+     #   report liveness
+     class Node
+       include Helpers::ConfigImporter.new(
+         monitor: %i[monitor],
+         config: %i[itself],
+         kafka: %i[kafka],
+         swarm: %i[swarm],
+         process: %i[process],
+         liveness_listener: %i[internal swarm liveness_listener]
+       )
+
+       # @return [Integer] id of the node. Useful for client.group.id assignment
+       attr_reader :id
+
+       # @return [Integer] pid of the node
+       attr_reader :pid
+
+       # @param id [Integer] number of the fork. Used for uniqueness setup for group client ids and
+       #   other stuff where we need to know a unique reference of the fork in regards to the rest
+       #   of them.
+       # @param parent_pidfd [Pidfd] parent pidfd for zombie fencing
+       def initialize(id, parent_pidfd)
+         @id = id
+         @parent_pidfd = parent_pidfd
+       end
+
+       # Starts a new fork and:
+       # - stores pid and parent reference
+       # - makes sure reader pipe is closed
+       # - sets up liveness listener
+       # - recreates producer and web producer
+       # @note Parent API
+       def start
+         @reader, @writer = IO.pipe
+
+         # :nocov:
+         @pid = ::Process.fork do
+           # Close the old producer so it is not subject to GC
+           # While it was not opened in the parent, without explicit closing, there still could be
+           # an attempt to close it when finalized, meaning it would be kept in memory.
+           config.producer.close
+
+           # Supervisor producer is closed, hence we need a new one here
+           config.producer = ::WaterDrop::Producer.new do |p_config|
+             p_config.kafka = Setup::AttributesMap.producer(kafka.dup)
+             p_config.logger = config.logger
+           end
+
+           @pid = ::Process.pid
+           @reader.close
+
+           # Indicate we are alive right after start
+           healthy
+
+           swarm.node = self
+           monitor.subscribe(liveness_listener)
+           monitor.instrument('swarm.node.after_fork', caller: self)
+
+           Server.run
+
+           @writer.close
+         end
+         # :nocov:
+
+         @writer.close
+         @pidfd = Pidfd.new(@pid)
+       end
+
+       # Indicates that this node is doing well
+       # @note Child API
+       def healthy
+         write('0')
+       end
+
+       # Indicates that this node has failed
+       # @param reason_code [Integer, String] numeric code we want to use to indicate that we are
+       #   not healthy. Anything bigger than 0 will be considered not healthy. Useful if we want to
+       #   have complex health-checking with reporting.
+       # @note Child API
+       # @note We convert this to string to normalize the API
+       def unhealthy(reason_code = '1')
+         write(reason_code.to_s)
+       end
+
+       # @return [Integer] This returns the following status code depending on the data:
+       #   - -1 if node did not report anything new
+       #   - 0 if all good,
+       #   - positive number if there was a problem (indicates error code)
+       #
+       # @note Parent API
+       # @note If several issues were reported, it will pick the one with the highest number
+       def status
+         result = read
+
+         return -1 if result.nil?
+         return -1 if result == false
+
+         result.split("\n").map(&:to_i).max
+       end
+
+       # @return [Boolean] true if node is alive or false if it died
+       # @note Parent API
+       # @note Keep in mind that the fact that a process is alive does not mean it is healthy
+       def alive?
+         @pidfd.alive?
+       end
+
+       # @return [Boolean] true if node is orphaned or false otherwise. Used for orphans detection.
+       # @note Child API
+       def orphaned?
+         !@parent_pidfd.alive?
+       end
+
+       # Sends sigterm to the node
+       # @note Parent API
+       def stop
+         signal('TERM')
+       end
+
+       # Sends sigtstp to the node
+       # @note Parent API
+       def quiet
+         signal('TSTP')
+       end
+
+       # Terminates node
+       # @note Parent API
+       def terminate
+         signal('KILL')
+       end
+
+       # Sends provided signal to the node
+       # @param signal [String]
+       def signal(signal)
+         @pidfd.signal(signal)
+       end
+
+       # Removes the dead process from the processes table
+       def cleanup
+         @pidfd.cleanup
+       end
+
+       private
+
+       # Reads the reported content in a non-blocking way
+       # @return [String, false] Content from the pipe or false if nothing or something went wrong
+       # @note Parent API
+       def read
+         @reader.read_nonblock(1024)
+       rescue IO::WaitReadable, Errno::EPIPE, IOError
+         false
+       end
+
+       # Writes the provided content into the pipe in a non-blocking way
+       # @param content [Integer, String] anything we want to write to the parent
+       # @return [Boolean] true if ok, otherwise false
+       # @note Child API
+       def write(content)
+         @writer.write_nonblock "#{content}\n"
+
+         true
+       rescue IO::WaitWritable, Errno::EPIPE, IOError
+         false
+       end
+     end
+   end
+ end
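
The liveness channel between a node and the supervisor is a plain pipe: the child writes newline-separated numeric codes ("0" for healthy, anything greater for a failure), and the parent reads them non-blockingly and treats the highest code seen as the current status (-1 when nothing new arrived). A standalone sketch of that protocol, independent of the Node class and shown here only for illustration:

# Standalone illustration of the pipe protocol behind Node#healthy/#unhealthy
# and Node#status. Not Karafka code; plain Ruby for clarity.
reader, writer = IO.pipe

child = fork do
  reader.close
  writer.write_nonblock("0\n") # healthy right after start
  writer.write_nonblock("2\n") # later, report a failure with code 2
  writer.close
end

writer.close
sleep(0.1) # give the child a moment to report (illustrative only)

begin
  raw = reader.read_nonblock(1024)
  status = raw.split("\n").map(&:to_i).max
rescue IO::WaitReadable, IOError
  status = -1 # nothing new reported
end

puts status # => 2, the worst reported code wins
Process.wait(child)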
data/lib/karafka/swarm/pidfd.rb
@@ -0,0 +1,131 @@
+ # frozen_string_literal: true
+
+ module Karafka
+   module Swarm
+     # Pidfd Linux representation wrapped with Ruby for communication within Swarm
+     # It is more stable and cheaper than using `#pid` and `#ppid` + signals
+     class Pidfd
+       include Helpers::ConfigImporter.new(
+         pidfd_open_syscall: %i[internal swarm pidfd_open_syscall],
+         pidfd_signal_syscall: %i[internal swarm pidfd_signal_syscall],
+         waitid_syscall: %i[internal swarm waitid_syscall]
+       )
+
+       extend FFI::Library
+
+       begin
+         ffi_lib FFI::Library::LIBC
+
+         # direct usage of this is only available since glibc 2.36, hence we use bindings and call
+         # it directly via syscalls
+         attach_function :fdpid_open, :syscall, %i[long int uint], :int
+         attach_function :fdpid_signal, :syscall, %i[long int int pointer uint], :int
+         attach_function :waitid, %i[int int pointer uint], :int
+
+         API_SUPPORTED = true
+       # LoadError is a parent to FFI::NotFoundError
+       rescue LoadError
+         API_SUPPORTED = false
+       ensure
+         private_constant :API_SUPPORTED
+       end
+
+       # https://github.com/torvalds/linux/blob/7e90b5c295/include/uapi/linux/wait.h#L20
+       P_PIDFD = 3
+
+       # Wait for child processes that have exited
+       WEXITED = 4
+
+       private_constant :P_PIDFD, :WEXITED
+
+       class << self
+         # @return [Boolean] true if syscall is supported via FFI
+         def supported?
+           # If we were not even able to load the FFI C lib, it won't be supported
+           return false unless API_SUPPORTED
+           # Won't work on macOS because it does not support pidfd
+           return false if RUBY_DESCRIPTION.include?('darwin')
+           # Won't work on Windows for the same reason as on macOS
+           return false if RUBY_DESCRIPTION.match?(/mswin|ming|cygwin/)
+
+           # There are some OSes like BSD that will have C lib for FFI bindings but will not support
+           # the needed syscalls. In such cases, we can just try and fail, which will indicate it
+           # won't work. The same applies to using new glibc on an old kernel.
+           new(::Process.pid)
+
+           true
+         rescue Errors::PidfdOpenFailedError
+           false
+         end
+       end
+
+       # @param pid [Integer] pid of the node we want to work with
+       def initialize(pid)
+         @mutex = Mutex.new
+
+         @pid = pid
+         @pidfd = open(pid)
+         @pidfd_io = IO.new(@pidfd)
+       end
+
+       # @return [Boolean] true if the given process is alive, false if no longer
+       def alive?
+         @pidfd_select ||= [@pidfd_io]
+
+         IO.select(@pidfd_select, nil, nil, 0).nil?
+       end
+
+       # Cleans the zombie process
+       # @note This should run **only** on processes that exited, otherwise it will wait
+       def cleanup
+         return if @cleaned
+
+         waitid(P_PIDFD, @pidfd, nil, WEXITED)
+
+         @cleaned = true
+       end
+
+       # Sends the given signal to the process using its pidfd
+       # @param sig_name [String] signal name
+       # @return [Boolean] true if signal was sent, otherwise false or error raised. `false`
+       #   is returned when we attempt to send a signal to a dead process
+       # @note It will not send signals to dead processes
+       def signal(sig_name)
+         @mutex.synchronize do
+           return false if @cleaned
+           # Never signal processes that are dead
+           return false unless alive?
+
+           result = fdpid_signal(
+             pidfd_signal_syscall,
+             @pidfd,
+             Signal.list.fetch(sig_name),
+             nil,
+             0
+           )
+
+           return true if result.zero?
+
+           raise Errors::PidfdSignalFailedError, result
+         end
+       end
+
+       private
+
+       # Opens a pidfd for the provided pid
+       # @param pid [Integer]
+       # @return [Integer] pidfd
+       def open(pid)
+         pidfd = fdpid_open(
+           pidfd_open_syscall,
+           pid,
+           0
+         )
+
+         return pidfd if pidfd != -1
+
+         raise Errors::PidfdOpenFailedError, pidfd
+       end
+     end
+   end
+ end
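
Pidfd wraps the Linux-only pidfd_open / pidfd_send_signal / waitid syscalls (their numbers come from the new internal swarm settings), which avoids the PID-reuse races inherent to plain kill-based signaling. A hedged usage sketch, assuming a Linux host and a booted Karafka app so the syscall numbers are configured:

# Illustrative only; the APIs shown (supported?, new, alive?, signal, cleanup)
# are the ones introduced in this diff.
if Karafka::Swarm::Pidfd.supported?
  pid = fork { sleep }

  pidfd = Karafka::Swarm::Pidfd.new(pid)

  pidfd.alive?         # => true, checked via the pidfd, not the reusable PID
  pidfd.signal('TERM') # => true, delivered through the pidfd signal syscall

  sleep(0.1) while pidfd.alive?

  pidfd.cleanup # reaps the zombie entry via waitid(P_PIDFD, ...)
end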
data/lib/karafka/swarm/supervisor.rb
@@ -0,0 +1,184 @@
+ # frozen_string_literal: true
+
+ module Karafka
+   module Swarm
+     # Supervisor that starts forks and uses monitor to monitor them. Also handles shutdown of
+     # all the processes including itself.
+     #
+     # In case any node dies, it will be restarted.
+     #
+     # @note Technically speaking supervisor is never in the running state because we do not want
+     #   to have any sockets or anything else on it that could break under forking.
+     #   It has its own "supervising" state from which it can go to the final shutdown.
+     class Supervisor
+       include Karafka::Core::Helpers::Time
+       include Helpers::ConfigImporter.new(
+         monitor: %i[monitor],
+         swarm: %i[internal swarm],
+         manager: %i[internal swarm manager],
+         supervision_interval: %i[internal swarm supervision_interval],
+         shutdown_timeout: %i[shutdown_timeout],
+         supervision_sleep: %i[internal supervision_sleep],
+         forceful_exit_code: %i[internal forceful_exit_code],
+         process: %i[internal process]
+       )
+
+       def initialize
+         @mutex = Mutex.new
+         @queue = Processing::TimedQueue.new
+       end
+
+       # Creates the needed number of forks, installs signals and starts supervision
+       def run
+         Karafka::App.warmup
+
+         manager.start
+
+         # Close producer just in case. While it should not be used, we do not want even a
+         # theoretical case since librdkafka is not thread-safe.
+         Karafka.producer.close
+
+         process.on_sigint { stop }
+         process.on_sigquit { stop }
+         process.on_sigterm { stop }
+         process.on_sigtstp { quiet }
+         process.on_sigttin { signal('TTIN') }
+         # Needed to be registered as we want to unlock on child changes
+         process.on_sigchld {}
+         process.on_any_active { unlock }
+         process.supervise
+
+         Karafka::App.supervise!
+
+         loop do
+           return if Karafka::App.terminated?
+
+           lock
+           control
+         end
+       # If anything went wrong, signal this and die
+       # Supervisor is meant to be thin and not cause any issues. If you encounter this case,
+       # please report it as it should be considered critical
+       rescue StandardError => e
+         monitor.instrument(
+           'error.occurred',
+           caller: self,
+           error: e,
+           manager: manager,
+           type: 'swarm.supervisor.error'
+         )
+
+         @nodes.terminate
+       end
+
+       private
+
+       # Keeps the lock on the queue so we control nodes only when it is needed
+       # @note We convert to seconds since the queue timeout requires seconds
+       def lock
+         @queue.pop(timeout: supervision_interval / 1_000.0)
+       end
+
+       # Frees the lock on events that could require nodes control
+       def unlock
+         @queue << true
+       end
+
+       # Stops all the nodes and the supervisor once all nodes are dead.
+       # It will forcefully stop all nodes if they exceed the shutdown timeout. While in theory each
+       # of the nodes anyhow has its own supervisor, this is a last resort to stop everything.
+       def stop
+         # Ensure that the stopping procedure is initialized only once
+         @mutex.synchronize do
+           return if @stopping
+
+           @stopping = true
+         end
+
+         initialized = true
+         Karafka::App.stop!
+
+         manager.stop
+
+         # We check from time to time (for the timeout period) if all the threads finished
+         # their work and if so, we can just return and the normal shutdown process will take place
+         # We divide it by 1000 because we use time in ms.
+         ((shutdown_timeout / 1_000) * (1 / supervision_sleep)).to_i.times do
+           if manager.stopped?
+             manager.cleanup
+             return
+           end
+
+           sleep(supervision_sleep)
+         end
+
+         raise Errors::ForcefulShutdownError
+       rescue Errors::ForcefulShutdownError => e
+         monitor.instrument(
+           'error.occurred',
+           caller: self,
+           error: e,
+           manager: manager,
+           type: 'app.stopping.error'
+         )
+
+         # Run forceful kill
+         manager.terminate
+         # And wait until linux kills them
+         # This prevents us from exiting forcefully with any dead child process still existing
+         # Since we have sent the `KILL` signal, it must die, so we can wait until all are dead
+         sleep(supervision_sleep) until manager.stopped?
+
+         # Cleanup the process table
+         manager.cleanup
+
+         # exit! is not within the instrumentation as it would not trigger due to exit
+         Kernel.exit!(forceful_exit_code)
+       ensure
+         if initialized
+           Karafka::App.stopped!
+           Karafka::App.terminate!
+         end
+       end
+
+       # Moves all the nodes and itself to the quiet state
+       def quiet
+         @mutex.synchronize do
+           return if @quieting
+
+           @quieting = true
+
+           Karafka::App.quiet!
+           manager.quiet
+           Karafka::App.quieted!
+         end
+       end
+
+       # Checks on the children nodes and takes appropriate actions.
+       # - If node is dead, will cleanup
+       # - If node is no longer reporting as healthy, will start a graceful shutdown
+       # - If node does not want to close itself gracefully, will kill it
+       # - If node was dead, a new node will be started as a recovery means
+       def control
+         @mutex.synchronize do
+           # If we are in quieting or stopping we should no longer control children
+           # Those states aim to finally shutdown nodes and we should not forcefully do anything
+           # to them. This especially applies to the quieting mode where any complex lifecycle
+           # reporting listeners may no longer report correctly
+           return if @quieting
+           return if @stopping
+
+           manager.control
+         end
+       end
+
+       # Sends the desired signal to each node
+       # @param signal [String]
+       def signal(signal)
+         @mutex.synchronize do
+           manager.signal(signal)
+         end
+       end
+     end
+   end
+ end
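
The supervisor is the parent-process side of the new swarm mode: it forks the nodes through the Manager, reacts to signals and periodically runs control. Based on the settings imported above (swarm nodes, shutdown_timeout) and the new CLI command added in data/lib/karafka/cli/swarm.rb, enabling swarm mode in an application appears to boil down to configuration; a hedged sketch of such a karafka.rb:

# Sketch of enabling swarm mode; exact defaults for the internal swarm
# settings live in the updated lib/karafka/setup/config.rb. EventsConsumer
# is a hypothetical consumer class used only for illustration.
class KarafkaApp < Karafka::App
  setup do |config|
    config.kafka = { 'bootstrap.servers': '127.0.0.1:9092' }
    config.client_id = 'example_app'
    # Number of forked processing nodes supervised by Karafka::Swarm::Supervisor
    # (read by the Manager as the `swarm nodes` setting)
    config.swarm.nodes = 3
  end

  routes.draw do
    topic :events do
      consumer EventsConsumer
    end
  end
end

With this in place, the process started through the new swarm CLI command supervises the forked nodes and restarts any that die or stop reporting healthy within node_report_timeout.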