karafka 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.rspec +2 -0
  4. data/CHANGELOG.md +15 -0
  5. data/Gemfile +1 -1
  6. data/Gemfile.lock +22 -22
  7. data/README.md +2 -2
  8. data/bin/integrations +2 -1
  9. data/bin/rspecs +6 -2
  10. data/config/locales/errors.yml +30 -8
  11. data/config/locales/pro_errors.yml +2 -0
  12. data/docker-compose.yml +1 -1
  13. data/lib/karafka/app.rb +14 -0
  14. data/lib/karafka/cli/base.rb +19 -0
  15. data/lib/karafka/cli/server.rb +62 -76
  16. data/lib/karafka/cli/swarm.rb +30 -0
  17. data/lib/karafka/constraints.rb +3 -3
  18. data/lib/karafka/contracts/config.rb +19 -0
  19. data/lib/karafka/errors.rb +12 -0
  20. data/lib/karafka/helpers/async.rb +13 -3
  21. data/lib/karafka/helpers/config_importer.rb +30 -0
  22. data/lib/karafka/instrumentation/logger_listener.rb +31 -0
  23. data/lib/karafka/instrumentation/notifications.rb +9 -0
  24. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +2 -0
  25. data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +72 -0
  26. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +11 -40
  27. data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +54 -0
  28. data/lib/karafka/pro/active_job/job_options_contract.rb +1 -1
  29. data/lib/karafka/pro/base_consumer.rb +16 -0
  30. data/lib/karafka/pro/connection/manager.rb +6 -1
  31. data/lib/karafka/pro/processing/coordinator.rb +13 -3
  32. data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +74 -0
  33. data/lib/karafka/pro/processing/coordinators/filters_applier.rb +107 -0
  34. data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +180 -0
  35. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +5 -7
  36. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +5 -7
  37. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom.rb +8 -10
  38. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom_vp.rb +8 -16
  39. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +5 -7
  40. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +5 -7
  41. data/lib/karafka/pro/processing/strategies/aj/dlq_mom.rb +8 -10
  42. data/lib/karafka/pro/processing/strategies/aj/dlq_mom_vp.rb +7 -9
  43. data/lib/karafka/pro/processing/strategies/dlq/default.rb +36 -10
  44. data/lib/karafka/pro/processing/strategies/dlq/ftr.rb +3 -7
  45. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +4 -8
  46. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +6 -9
  47. data/lib/karafka/pro/processing/strategies/dlq/ftr_mom.rb +5 -15
  48. data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +4 -8
  49. data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +6 -9
  50. data/lib/karafka/pro/processing/strategies/dlq/mom.rb +10 -20
  51. data/lib/karafka/pro/processing/strategies/vp/default.rb +7 -0
  52. data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb +6 -0
  53. data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +39 -0
  54. data/lib/karafka/pro/swarm/liveness_listener.rb +171 -0
  55. data/lib/karafka/process.rb +27 -1
  56. data/lib/karafka/routing/features/dead_letter_queue/config.rb +2 -0
  57. data/lib/karafka/routing/subscription_group.rb +31 -9
  58. data/lib/karafka/runner.rb +4 -0
  59. data/lib/karafka/server.rb +13 -16
  60. data/lib/karafka/setup/config.rb +41 -2
  61. data/lib/karafka/status.rb +4 -2
  62. data/lib/karafka/swarm/liveness_listener.rb +55 -0
  63. data/lib/karafka/swarm/manager.rb +217 -0
  64. data/lib/karafka/swarm/node.rb +179 -0
  65. data/lib/karafka/swarm/pidfd.rb +131 -0
  66. data/lib/karafka/swarm/supervisor.rb +184 -0
  67. data/lib/karafka/swarm.rb +27 -0
  68. data/lib/karafka/templates/karafka.rb.erb +0 -2
  69. data/lib/karafka/version.rb +1 -1
  70. data/lib/karafka.rb +1 -1
  71. data.tar.gz.sig +0 -0
  72. metadata +17 -4
  73. metadata.gz.sig +0 -0
  74. data/lib/karafka/pro/processing/filters_applier.rb +0 -105
  75. data/lib/karafka/pro/processing/virtual_offset_manager.rb +0 -177
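The bulk of this diff is the new swarm mode: a supervisor process forks several processing nodes (data/lib/karafka/swarm/*.rb) and restarts any that die or stop reporting as healthy. Below is a minimal, hypothetical sketch of how it might be enabled, assuming the `swarm.nodes` setting added in data/lib/karafka/setup/config.rb and the `swarm` CLI subcommand added in data/lib/karafka/cli/swarm.rb; the exact option and command names are inferred from this diff, not taken from the documentation.

# karafka.rb (sketch)
class KarafkaApp < Karafka::App
  setup do |config|
    config.kafka = { 'bootstrap.servers': '127.0.0.1:9092' }
    # Number of forked processing nodes the supervisor should maintain
    config.swarm.nodes = 2
  end

  routes.draw do
    topic :events do
      # EventsConsumer is a placeholder application class
      consumer EventsConsumer
    end
  end
end

The swarm would then presumably be started with `bundle exec karafka swarm` instead of `bundle exec karafka server`.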
data/lib/karafka/swarm/manager.rb
@@ -0,0 +1,217 @@
+ # frozen_string_literal: true
+
+ module Karafka
+   module Swarm
+     # Manager similar to the one for threads but managing processing nodes
+     # It starts nodes and keeps an eye on them.
+     #
+     # If any of the nodes is misbehaving (based on the liveness listener), it will be restarted.
+     # Initially gracefully, but if it won't stop itself, it will be forced to.
+     #
+     # @note This is intended to run in the supervisor under mutexes (when needed)
+     class Manager
+       include Karafka::Core::Helpers::Time
+       include Helpers::ConfigImporter.new(
+         monitor: %i[monitor],
+         nodes_count: %i[swarm nodes],
+         shutdown_timeout: %i[shutdown_timeout],
+         node_report_timeout: %i[internal swarm node_report_timeout],
+         node_restart_timeout: %i[internal swarm node_restart_timeout]
+       )
+
+       # @return [Array<Node>] all nodes that this manager manages
+       attr_reader :nodes
+
+       def initialize
+         @nodes = []
+         @statuses = Hash.new { |h, k| h[k] = {} }
+       end
+
+       # Starts all the expected nodes for the first time
+       def start
+         pidfd = Pidfd.new(::Process.pid)
+
+         @nodes = Array.new(nodes_count) do |i|
+           start_one Node.new(i, pidfd)
+         end
+       end
+
+       # Attempts to quiet all the nodes
+       def quiet
+         @nodes.each(&:quiet)
+       end
+
+       # Attempts to stop all the nodes
+       def stop
+         @nodes.each(&:stop)
+       end
+
+       # Terminates all the nodes
+       def terminate
+         @nodes.each(&:terminate)
+       end
+
+       # Collects all processes' statuses
+       def cleanup
+         @nodes.each(&:cleanup)
+       end
+
+       # Sends given signal to all nodes
+       # @param signal [String] signal name
+       def signal(signal)
+         @nodes.each { |node| node.signal(signal) }
+       end
+
+       # @return [Boolean] true if none of the nodes is running
+       def stopped?
+         @nodes.none?(&:alive?)
+       end
+
+       # Checks the nodes one after another to make sure they are ok
+       def control
+         monitor.instrument('swarm.manager.control', caller: self) do
+           @nodes.each do |node|
+             statuses = @statuses[node]
+
+             if node.alive?
+               next if terminate_if_hanging(statuses, node)
+               next if stop_if_not_healthy(statuses, node)
+               next if stop_if_not_responding(statuses, node)
+             else
+               next if cleanup_one(statuses, node)
+               next if restart_after_timeout(statuses, node)
+             end
+           end
+         end
+       end
+
+       private
+
+       # If we've issued a stop to this process and it does not want to stop in the period, kills it
+       #
+       # @param statuses [Hash] hash with status transitions and their times
+       # @param node [Swarm::Node] node we're checking
+       # @return [Boolean] true if this should be the last action taken on this node in this run
+       def terminate_if_hanging(statuses, node)
+         return false unless statuses.key?(:stop)
+         # If we already sent the termination request, we should not do it again
+         return true if statuses.key?(:terminate)
+         # Do not run any other checks on this node if it is in the middle of stopping but still has time
+         return true unless over?(statuses[:stop], shutdown_timeout)
+
+         monitor.instrument('swarm.manager.terminating', caller: self, node: node) do
+           node.terminate
+           statuses[:terminate] = monotonic_now
+         end
+
+         true
+       end
+
+       # Checks if there is any new liveness report from the given node and, if so, issues a stop
+       # if the node reported that it is not healthy.
+       #
+       # @param statuses [Hash] hash with status transitions and their times
+       # @param node [Swarm::Node] node we're checking
+       # @return [Boolean] true if this should be the last action taken on this node in this run
+       def stop_if_not_healthy(statuses, node)
+         status = node.status
+
+         case status
+         # If no new state was reported, we should just move on with the other checks
+         when -1
+           false
+         when 0
+           # Exists and reports as healthy, so no other checks should happen on it in this go
+           statuses[:control] = monotonic_now
+           true
+         else
+           # A single invalid report will cause it to stop. We do not support intermediate failures
+           # that would recover. Such states should be implemented in the listener.
+           monitor.instrument('swarm.manager.stopping', caller: self, node: node, status: status) do
+             node.stop
+             statuses[:stop] = monotonic_now
+           end
+
+           true
+         end
+       end
+
+       # If the node stopped responding, starts the stopping procedure.
+       #
+       # @param statuses [Hash] hash with status transitions and their times
+       # @param node [Swarm::Node] node we're checking
+       # @return [Boolean] true if this should be the last action taken on this node in this run
+       def stop_if_not_responding(statuses, node)
+         # Do nothing if already stopping
+         return true if statuses.key?(:stop)
+         # Do nothing if we've received a status update recently enough
+         return true unless over?(statuses[:control], node_report_timeout)
+
+         # Start the stopping procedure if the node stopped reporting frequently enough
+         monitor.instrument('swarm.manager.stopping', caller: self, node: node) do
+           node.stop
+           statuses[:stop] = monotonic_now
+         end
+
+         true
+       end
+
+       # Cleans up a dead process and remembers the time of death for restart after a period.
+       #
+       # @param statuses [Hash] hash with status transitions and their times
+       # @param node [Swarm::Node] node we're checking
+       # @return [Boolean] true if this should be the last action taken on this node in this run
+       def cleanup_one(statuses, node)
+         return false if statuses.key?(:dead_since)
+
+         node.cleanup
+         statuses[:dead_since] = monotonic_now
+
+         true
+       end
+
+       # Restarts the node if there was enough of a backoff.
+       #
+       # We always wait a bit to make sure we do not overload the system in case forks are being
+       # killed for some external reason.
+       #
+       # @param statuses [Hash] hash with status transitions and their times
+       # @param node [Swarm::Node] node we're checking
+       # @return [Boolean] true if this should be the last action taken on this node in this run
+       def restart_after_timeout(statuses, node)
+         return false unless over?(statuses[:dead_since], node_restart_timeout)
+
+         start_one(node)
+
+         true
+       end
+
+       # Starts a new node (or restarts a dead one)
+       #
+       # @param node [Swarm::Node] node we're starting
+       def start_one(node)
+         instr_args = { caller: self, node: node }
+
+         statuses = @statuses[node]
+
+         statuses.clear
+         statuses[:control] = monotonic_now
+
+         monitor.instrument('swarm.manager.before_fork', instr_args)
+         node.start
+         monitor.instrument('swarm.manager.after_fork', instr_args)
+
+         node
+       end
+
+       # Are we past a certain amount of time since an event happened
+       #
+       # @param event_time [Float] when something happened
+       # @param delay [Float] how long should we wait
+       # @return [Boolean] true if we're past the delay
+       def over?(event_time, delay)
+         monotonic_now - event_time >= delay
+       end
+     end
+   end
+ end
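Manager announces its decisions through the notifications bus (`swarm.manager.control`, `swarm.manager.stopping`, `swarm.manager.terminating`, `swarm.manager.before_fork`, `swarm.manager.after_fork`). A minimal observability sketch, assuming these event names are registered in the updated notifications list (data/lib/karafka/instrumentation/notifications.rb) and using the standard subscription API:

Karafka.monitor.subscribe('swarm.manager.stopping') do |event|
  # Fired whenever a node is asked to stop, either because it reported an unhealthy
  # status code or because it stopped reporting within node_report_timeout
  Karafka.logger.warn(
    "Stopping swarm node #{event[:node].id}, status: #{event.payload[:status].inspect}"
  )
end

Karafka.monitor.subscribe('swarm.manager.terminating') do |event|
  # Fired when a node ignored the stop request for longer than shutdown_timeout
  Karafka.logger.error("Terminating unresponsive swarm node #{event[:node].id}")
end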
data/lib/karafka/swarm/node.rb
@@ -0,0 +1,179 @@
+ # frozen_string_literal: true
+
+ module Karafka
+   module Swarm
+     # Represents a single forked process node in a swarm
+     # Provides a simple API to control forks and check their status
+     #
+     # @note Some of these APIs are for the parent process only
+     #
+     # @note Keep in mind this can be used in both the forks and the supervisor and has a slightly
+     #   different role in each. In case of the supervisor it is used to get information about the
+     #   child and make certain requests to it. In case of the child, it is used to provide
+     #   zombie-fencing and report liveness
+     class Node
+       include Helpers::ConfigImporter.new(
+         monitor: %i[monitor],
+         config: %i[itself],
+         kafka: %i[kafka],
+         swarm: %i[swarm],
+         process: %i[process],
+         liveness_listener: %i[internal swarm liveness_listener]
+       )
+
+       # @return [Integer] id of the node. Useful for client.group.id assignment
+       attr_reader :id
+
+       # @return [Integer] pid of the node
+       attr_reader :pid
+
+       # @param id [Integer] number of the fork. Used for uniqueness setup of group client ids and
+       #   other things where we need a unique reference to the fork with regard to the rest of
+       #   them.
+       # @param parent_pidfd [Pidfd] parent pidfd for zombie fencing
+       def initialize(id, parent_pidfd)
+         @id = id
+         @parent_pidfd = parent_pidfd
+       end
+
+       # Starts a new fork and:
+       #   - stores pid and parent reference
+       #   - makes sure the reader pipe is closed
+       #   - sets up the liveness listener
+       #   - recreates the producer and web producer
+       # @note Parent API
+       def start
+         @reader, @writer = IO.pipe
+
+         # :nocov:
+         @pid = ::Process.fork do
+           # Close the old producer so it is not subject to GC
+           # While it was not opened in the parent, without explicit closing there still could be
+           # an attempt to close it when finalized, meaning it would be kept in memory.
+           config.producer.close
+
+           # The supervisor producer is closed, hence we need a new one here
+           config.producer = ::WaterDrop::Producer.new do |p_config|
+             p_config.kafka = Setup::AttributesMap.producer(kafka.dup)
+             p_config.logger = config.logger
+           end
+
+           @pid = ::Process.pid
+           @reader.close
+
+           # Indicate we are alive right after start
+           healthy
+
+           swarm.node = self
+           monitor.subscribe(liveness_listener)
+           monitor.instrument('swarm.node.after_fork', caller: self)
+
+           Server.run
+
+           @writer.close
+         end
+         # :nocov:
+
+         @writer.close
+         @pidfd = Pidfd.new(@pid)
+       end
+
+       # Indicates that this node is doing well
+       # @note Child API
+       def healthy
+         write('0')
+       end
+
+       # Indicates that this node has failed
+       # @param reason_code [Integer, String] numeric code we want to use to indicate that we are
+       #   not healthy. Anything bigger than 0 will be considered not healthy. Useful if we want
+       #   to have complex health-checking with reporting.
+       # @note Child API
+       # @note We convert this to a string to normalize the API
+       def unhealthy(reason_code = '1')
+         write(reason_code.to_s)
+       end
+
+       # @return [Integer] the following status code depending on the data:
+       #   - -1 if the node did not report anything new
+       #   - 0 if all good,
+       #   - a positive number if there was a problem (indicates the error code)
+       #
+       # @note Parent API
+       # @note If a few issues were reported, it will pick the one with the highest number
+       def status
+         result = read
+
+         return -1 if result.nil?
+         return -1 if result == false
+
+         result.split("\n").map(&:to_i).max
+       end
+
+       # @return [Boolean] true if the node is alive or false if it died
+       # @note Parent API
+       # @note Keep in mind that the fact that the process is alive does not mean it is healthy
+       def alive?
+         @pidfd.alive?
+       end
+
+       # @return [Boolean] true if the node is orphaned, false otherwise. Used for orphan detection.
+       # @note Child API
+       def orphaned?
+         !@parent_pidfd.alive?
+       end
+
+       # Sends sigterm to the node
+       # @note Parent API
+       def stop
+         signal('TERM')
+       end
+
+       # Sends sigtstp to the node
+       # @note Parent API
+       def quiet
+         signal('TSTP')
+       end
+
+       # Terminates the node
+       # @note Parent API
+       def terminate
+         signal('KILL')
+       end
+
+       # Sends the provided signal to the node
+       # @param signal [String]
+       def signal(signal)
+         @pidfd.signal(signal)
+       end
+
+       # Removes the dead process from the processes table
+       def cleanup
+         @pidfd.cleanup
+       end
+
+       private
+
+       # Reads content from the pipe in a non-blocking way
+       # @return [String, false] content from the pipe or false if nothing was there or something
+       #   went wrong
+       # @note Parent API
+       def read
+         @reader.read_nonblock(1024)
+       rescue IO::WaitReadable, Errno::EPIPE, IOError
+         false
+       end
+
+       # Writes the provided content into the pipe in a non-blocking way
+       # @param content [Integer, String] anything we want to write to the parent
+       # @return [Boolean] true if ok, otherwise false
+       # @note Child API
+       def write(content)
+         @writer.write_nonblock "#{content}\n"
+
+         true
+       rescue IO::WaitWritable, Errno::EPIPE, IOError
+         false
+       end
+     end
+   end
+ end
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ module Swarm
5
+ # Pidfd Linux representation wrapped with Ruby for communication within Swarm
6
+ # It is more stable than using `#pid` and `#ppid` + signals and cheaper
7
+ class Pidfd
8
+ include Helpers::ConfigImporter.new(
9
+ pidfd_open_syscall: %i[internal swarm pidfd_open_syscall],
10
+ pidfd_signal_syscall: %i[internal swarm pidfd_signal_syscall],
11
+ waitid_syscall: %i[internal swarm waitid_syscall]
12
+ )
13
+
14
+ extend FFI::Library
15
+
16
+ begin
17
+ ffi_lib FFI::Library::LIBC
18
+
19
+ # direct usage of this is only available since glibc 2.36, hence we use bindings and call
20
+ # it directly via syscalls
21
+ attach_function :fdpid_open, :syscall, %i[long int uint], :int
22
+ attach_function :fdpid_signal, :syscall, %i[long int int pointer uint], :int
23
+ attach_function :waitid, %i[int int pointer uint], :int
24
+
25
+ API_SUPPORTED = true
26
+ # LoadError is a parent to FFI::NotFoundError
27
+ rescue LoadError
28
+ API_SUPPORTED = false
29
+ ensure
30
+ private_constant :API_SUPPORTED
31
+ end
32
+
33
+ # https://github.com/torvalds/linux/blob/7e90b5c295/include/uapi/linux/wait.h#L20
34
+ P_PIDFD = 3
35
+
36
+ # Wait for child processes that have exited
37
+ WEXITED = 4
38
+
39
+ private_constant :P_PIDFD, :WEXITED
40
+
41
+ class << self
42
+ # @return [Boolean] true if syscall is supported via FFI
43
+ def supported?
44
+ # If we were not even able to load the FFI C lib, it won't be supported
45
+ return false unless API_SUPPORTED
46
+ # Won't work on macOS because it does not support pidfd
47
+ return false if RUBY_DESCRIPTION.include?('darwin')
48
+ # Won't work on Windows for the same reason as on macOS
49
+ return false if RUBY_DESCRIPTION.match?(/mswin|ming|cygwin/)
50
+
51
+ # There are some OSes like BSD that will have C lib for FFI bindings but will not support
52
+ # the needed syscalls. In such cases, we can just try and fail, which will indicate it
53
+ # won't work. The same applies to using new glibc on an old kernel.
54
+ new(::Process.pid)
55
+
56
+ true
57
+ rescue Errors::PidfdOpenFailedError
58
+ false
59
+ end
60
+ end
61
+
62
+ # @param pid [Integer] pid of the node we want to work with
63
+ def initialize(pid)
64
+ @mutex = Mutex.new
65
+
66
+ @pid = pid
67
+ @pidfd = open(pid)
68
+ @pidfd_io = IO.new(@pidfd)
69
+ end
70
+
71
+ # @return [Boolean] true if given process is alive, false if no longer
72
+ def alive?
73
+ @pidfd_select ||= [@pidfd_io]
74
+
75
+ IO.select(@pidfd_select, nil, nil, 0).nil?
76
+ end
77
+
78
+ # Cleans the zombie process
79
+ # @note This should run **only** on processes that exited, otherwise will wait
80
+ def cleanup
81
+ return if @cleaned
82
+
83
+ waitid(P_PIDFD, @pidfd, nil, WEXITED)
84
+
85
+ @cleaned = true
86
+ end
87
+
88
+ # Sends given signal to the process using its pidfd
89
+ # @param sig_name [String] signal name
90
+ # @return [Boolean] true if signal was sent, otherwise false or error raised. `false`
91
+ # returned when we attempt to send a signal to a dead process
92
+ # @note It will not send signals to dead processes
93
+ def signal(sig_name)
94
+ @mutex.synchronize do
95
+ return false if @cleaned
96
+ # Never signal processes that are dead
97
+ return false unless alive?
98
+
99
+ result = fdpid_signal(
100
+ pidfd_signal_syscall,
101
+ @pidfd,
102
+ Signal.list.fetch(sig_name),
103
+ nil,
104
+ 0
105
+ )
106
+
107
+ return true if result.zero?
108
+
109
+ raise Errors::PidfdSignalFailedError, result
110
+ end
111
+ end
112
+
113
+ private
114
+
115
+ # Opens a pidfd for the provided pid
116
+ # @param pid [Integer]
117
+ # @return [Integer] pidfd
118
+ def open(pid)
119
+ pidfd = fdpid_open(
120
+ pidfd_open_syscall,
121
+ pid,
122
+ 0
123
+ )
124
+
125
+ return pidfd if pidfd != -1
126
+
127
+ raise Errors::PidfdOpenFailedError, pidfd
128
+ end
129
+ end
130
+ end
131
+ end
@@ -0,0 +1,184 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ module Swarm
5
+ # Supervisor that starts forks and uses monitor to monitor them. Also handles shutdown of
6
+ # all the processes including itself.
7
+ #
8
+ # In case any node dies, it will be restarted.
9
+ #
10
+ # @note Technically speaking supervisor is never in the running state because we do not want
11
+ # to have any sockets or anything else on it that could break under forking.
12
+ # It has its own "supervising" state from which it can go to the final shutdown.
13
+ class Supervisor
14
+ include Karafka::Core::Helpers::Time
15
+ include Helpers::ConfigImporter.new(
16
+ monitor: %i[monitor],
17
+ swarm: %i[internal swarm],
18
+ manager: %i[internal swarm manager],
19
+ supervision_interval: %i[internal swarm supervision_interval],
20
+ shutdown_timeout: %i[shutdown_timeout],
21
+ supervision_sleep: %i[internal supervision_sleep],
22
+ forceful_exit_code: %i[internal forceful_exit_code],
23
+ process: %i[internal process]
24
+ )
25
+
26
+ def initialize
27
+ @mutex = Mutex.new
28
+ @queue = Processing::TimedQueue.new
29
+ end
30
+
31
+ # Creates needed number of forks, installs signals and starts supervision
32
+ def run
33
+ Karafka::App.warmup
34
+
35
+ manager.start
36
+
37
+ # Close producer just in case. While it should not be used, we do not want even a
38
+ # theoretical case since librdkafka is not thread-safe.
39
+ Karafka.producer.close
40
+
41
+ process.on_sigint { stop }
42
+ process.on_sigquit { stop }
43
+ process.on_sigterm { stop }
44
+ process.on_sigtstp { quiet }
45
+ process.on_sigttin { signal('TTIN') }
46
+ # Needed to be registered as we want to unlock on child changes
47
+ process.on_sigchld {}
48
+ process.on_any_active { unlock }
49
+ process.supervise
50
+
51
+ Karafka::App.supervise!
52
+
53
+ loop do
54
+ return if Karafka::App.terminated?
55
+
56
+ lock
57
+ control
58
+ end
59
+ # If anything went wrong, signal this and die
60
+ # Supervisor is meant to be thin and not cause any issues. If you encounter this case
61
+ # please report it as it should be considered critical
62
+ rescue StandardError => e
63
+ monitor.instrument(
64
+ 'error.occurred',
65
+ caller: self,
66
+ error: e,
67
+ manager: manager,
68
+ type: 'swarm.supervisor.error'
69
+ )
70
+
71
+ @nodes.terminate
72
+ end
73
+
74
+ private
75
+
76
+ # Keeps the lock on the queue so we control nodes only when it is needed
77
+ # @note We convert to seconds since the queue timeout requires seconds
78
+ def lock
79
+ @queue.pop(timeout: supervision_interval / 1_000.0)
80
+ end
81
+
82
+ # Frees the lock on events that could require nodes control
83
+ def unlock
84
+ @queue << true
85
+ end
86
+
87
+ # Stops all the nodes and supervisor once all nodes are dead.
88
+ # It will forcefully stop all nodes if they exit the shutdown timeout. While in theory each
89
+ # of the nodes anyhow has its own supervisor, this is a last resort to stop everything.
90
+ def stop
91
+ # Ensure that the stopping procedure is initialized only once
92
+ @mutex.synchronize do
93
+ return if @stopping
94
+
95
+ @stopping = true
96
+ end
97
+
98
+ initialized = true
99
+ Karafka::App.stop!
100
+
101
+ manager.stop
102
+
103
+ # We check from time to time (for the timeout period) if all the threads finished
104
+ # their work and if so, we can just return and normal shutdown process will take place
105
+ # We divide it by 1000 because we use time in ms.
106
+ ((shutdown_timeout / 1_000) * (1 / supervision_sleep)).to_i.times do
107
+ if manager.stopped?
108
+ manager.cleanup
109
+ return
110
+ end
111
+
112
+ sleep(supervision_sleep)
113
+ end
114
+
115
+ raise Errors::ForcefulShutdownError
116
+ rescue Errors::ForcefulShutdownError => e
117
+ monitor.instrument(
118
+ 'error.occurred',
119
+ caller: self,
120
+ error: e,
121
+ manager: manager,
122
+ type: 'app.stopping.error'
123
+ )
124
+
125
+ # Run forceful kill
126
+ manager.terminate
127
+ # And wait until linux kills them
128
+ # This prevents us from existing forcefully with any dead child process still existing
129
+ # Since we have sent the `KILL` signal, it must die, so we can wait until all dead
130
+ sleep(supervision_sleep) until manager.stopped?
131
+
132
+ # Cleanup the process table
133
+ manager.cleanup
134
+
135
+ # exit! is not within the instrumentation as it would not trigger due to exit
136
+ Kernel.exit!(forceful_exit_code)
137
+ ensure
138
+ if initialized
139
+ Karafka::App.stopped!
140
+ Karafka::App.terminate!
141
+ end
142
+ end
143
+
144
+ # Moves all the nodes and itself to the quiet state
145
+ def quiet
146
+ @mutex.synchronize do
147
+ return if @quieting
148
+
149
+ @quieting = true
150
+
151
+ Karafka::App.quiet!
152
+ manager.quiet
153
+ Karafka::App.quieted!
154
+ end
155
+ end
156
+
157
+ # Checks on the children nodes and takes appropriate actions.
158
+ # - If node is dead, will cleanup
159
+ # - If node is no longer reporting as healthy will start a graceful shutdown
160
+ # - If node does not want to close itself gracefully, will kill it
161
+ # - If node was dead, new node will be started as a recovery means
162
+ def control
163
+ @mutex.synchronize do
164
+ # If we are in quieting or stopping we should no longer control children
165
+ # Those states aim to finally shutdown nodes and we should not forcefully do anything
166
+ # to them. This especially applies to the quieting mode where any complex lifecycle
167
+ # reporting listeners may no longer report correctly
168
+ return if @quieting
169
+ return if @stopping
170
+
171
+ manager.control
172
+ end
173
+ end
174
+
175
+ # Sends desired signal to each node
176
+ # @param signal [String]
177
+ def signal(signal)
178
+ @mutex.synchronize do
179
+ manager.signal(signal)
180
+ end
181
+ end
182
+ end
183
+ end
184
+ end
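The polling bound in Supervisor#stop is worth unpacking: shutdown_timeout is configured in milliseconds while supervision_sleep is in seconds, so the loop runs (shutdown_timeout / 1_000) * (1 / supervision_sleep) times. A worked example with illustrative values (assumptions for the sake of the arithmetic, not necessarily the gem's defaults):

shutdown_timeout  = 60_000 # ms
supervision_sleep = 0.1    # seconds

((shutdown_timeout / 1_000) * (1 / supervision_sleep)).to_i
# => 600, i.e. one manager.stopped? check every 100 ms for up to 60 seconds
# If any node is still alive after that, ForcefulShutdownError is raised, manager.terminate
# sends SIGKILL and the supervisor exits with forceful_exit_code.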