karafka 2.3.0 → 2.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.rspec +2 -0
  4. data/CHANGELOG.md +15 -0
  5. data/Gemfile +1 -1
  6. data/Gemfile.lock +22 -22
  7. data/README.md +2 -2
  8. data/bin/integrations +2 -1
  9. data/bin/rspecs +6 -2
  10. data/config/locales/errors.yml +30 -8
  11. data/config/locales/pro_errors.yml +2 -0
  12. data/docker-compose.yml +1 -1
  13. data/lib/karafka/app.rb +14 -0
  14. data/lib/karafka/cli/base.rb +19 -0
  15. data/lib/karafka/cli/server.rb +62 -76
  16. data/lib/karafka/cli/swarm.rb +30 -0
  17. data/lib/karafka/constraints.rb +3 -3
  18. data/lib/karafka/contracts/config.rb +19 -0
  19. data/lib/karafka/errors.rb +12 -0
  20. data/lib/karafka/helpers/async.rb +13 -3
  21. data/lib/karafka/helpers/config_importer.rb +30 -0
  22. data/lib/karafka/instrumentation/logger_listener.rb +31 -0
  23. data/lib/karafka/instrumentation/notifications.rb +9 -0
  24. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +2 -0
  25. data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +72 -0
  26. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +11 -40
  27. data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +54 -0
  28. data/lib/karafka/pro/active_job/job_options_contract.rb +1 -1
  29. data/lib/karafka/pro/base_consumer.rb +16 -0
  30. data/lib/karafka/pro/connection/manager.rb +6 -1
  31. data/lib/karafka/pro/processing/coordinator.rb +13 -3
  32. data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +74 -0
  33. data/lib/karafka/pro/processing/coordinators/filters_applier.rb +107 -0
  34. data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +180 -0
  35. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +5 -7
  36. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +5 -7
  37. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom.rb +8 -10
  38. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom_vp.rb +8 -16
  39. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +5 -7
  40. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +5 -7
  41. data/lib/karafka/pro/processing/strategies/aj/dlq_mom.rb +8 -10
  42. data/lib/karafka/pro/processing/strategies/aj/dlq_mom_vp.rb +7 -9
  43. data/lib/karafka/pro/processing/strategies/dlq/default.rb +36 -10
  44. data/lib/karafka/pro/processing/strategies/dlq/ftr.rb +3 -7
  45. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +4 -8
  46. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +6 -9
  47. data/lib/karafka/pro/processing/strategies/dlq/ftr_mom.rb +5 -15
  48. data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +4 -8
  49. data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +6 -9
  50. data/lib/karafka/pro/processing/strategies/dlq/mom.rb +10 -20
  51. data/lib/karafka/pro/processing/strategies/vp/default.rb +7 -0
  52. data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb +6 -0
  53. data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +39 -0
  54. data/lib/karafka/pro/swarm/liveness_listener.rb +171 -0
  55. data/lib/karafka/process.rb +27 -1
  56. data/lib/karafka/routing/features/dead_letter_queue/config.rb +2 -0
  57. data/lib/karafka/routing/subscription_group.rb +31 -9
  58. data/lib/karafka/runner.rb +4 -0
  59. data/lib/karafka/server.rb +13 -16
  60. data/lib/karafka/setup/config.rb +41 -2
  61. data/lib/karafka/status.rb +4 -2
  62. data/lib/karafka/swarm/liveness_listener.rb +55 -0
  63. data/lib/karafka/swarm/manager.rb +217 -0
  64. data/lib/karafka/swarm/node.rb +179 -0
  65. data/lib/karafka/swarm/pidfd.rb +131 -0
  66. data/lib/karafka/swarm/supervisor.rb +184 -0
  67. data/lib/karafka/swarm.rb +27 -0
  68. data/lib/karafka/templates/karafka.rb.erb +0 -2
  69. data/lib/karafka/version.rb +1 -1
  70. data/lib/karafka.rb +1 -1
  71. data.tar.gz.sig +0 -0
  72. metadata +17 -4
  73. metadata.gz.sig +0 -0
  74. data/lib/karafka/pro/processing/filters_applier.rb +0 -105
  75. data/lib/karafka/pro/processing/virtual_offset_manager.rb +0 -177
@@ -28,6 +28,12 @@ module Karafka
28
28
  ).fetch('en').fetch('validations').fetch('topic')
29
29
  end
30
30
 
31
+ nested(:dead_letter_queue) do
32
+ # We use strategy based DLQ for every case in Pro
33
+ # For default (when no strategy) a default `max_retries` based strategy is used
34
+ required(:strategy) { |val| val.respond_to?(:call) }
35
+ end
36
+
31
37
  # Make sure that when we use virtual partitions with DLQ, at least one retry is set
32
38
  # We cannot use VP with DLQ without retries as we in order to provide ordering
33
39
  # warranties on errors with VP, we need to collapse the VPs concurrency and retry
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
+ module Karafka
15
+ module Pro
16
+ module Routing
17
+ module Features
18
+ class DeadLetterQueue < Base
19
+ # Expansions to the topic API in DLQ
20
+ module Topic
21
+ # @param strategy [#call, nil] Strategy we want to use or nil if a default strategy
22
+ # (same as in OSS) should be applied
23
+ # @param args [Hash] OSS DLQ arguments
24
+ def dead_letter_queue(strategy: nil, **args)
25
+ return @dead_letter_queue if @dead_letter_queue
26
+
27
+ super(**args).tap do |config|
28
+ # If explicit strategy is not provided, use the default approach from OSS
29
+ config.strategy = strategy || lambda do |_errors_tracker, attempt|
30
+ attempt > config.max_retries ? :dispatch : :retry
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
+ module Karafka
15
+ module Pro
16
+ # Pro Swarm components namespace
17
+ module Swarm
18
+ # Pro listener that monitors RSS usage and other heartbeat metrics (if configured) to ensure
19
+ # that everything operates.
20
+ #
21
+ # It can:
22
+ # - monitor poll frequency to make sure things are polled often enough
23
+ # - monitor consumption to make sure we do not process data for too long
24
+ # - monitor RSS to make sure that we do not use too much memory
25
+ #
26
+ # By default it does **not** monitor memory; consuming and polling checks are configured in such
27
+ # a way to align with `max.poll.interval.ms` and other defaults.
28
+ #
29
+ # Failure statuses reported are as follows:
30
+ # - 1 - polling ttl exceeded
31
+ # - 2 - consuming ttl exceeded
32
+ # - 3 - memory limit exceeded
33
+ #
34
+ # @note This listener should not break anything if subscribed in the supervisor prior to
35
+ # forking as it relies on server events for operations.
36
+ class LivenessListener < Karafka::Swarm::LivenessListener
37
+ # @param memory_limit [Integer] max memory in MB for this process to be considered healthy
38
+ # @param consuming_ttl [Integer] time in ms after which we consider consumption hanging.
39
+ # It allows us to define max consumption time after which supervisor should consider
40
+ # given process as hanging
41
+ # @param polling_ttl [Integer] max time in ms for polling. If polling (any) does not
42
+ # happen that often, process should be considered dead.
43
+ # @note The default TTL matches the default `max.poll.interval.ms`
44
+ def initialize(
45
+ memory_limit: Float::INFINITY,
46
+ consuming_ttl: 5 * 60 * 1_000,
47
+ polling_ttl: 5 * 60 * 1_000
48
+ )
49
+ @polling_ttl = polling_ttl
50
+ @consuming_ttl = consuming_ttl
51
+ # We cast it just in case someone would provide '10MB' or something similar
52
+ @memory_limit = memory_limit.is_a?(String) ? memory_limit.to_i : memory_limit
53
+ @pollings = {}
54
+ @consumptions = {}
55
+
56
+ super()
57
+ end
58
+
59
+ # Tick on each fetch
60
+ #
61
+ # @param _event [Karafka::Core::Monitoring::Event]
62
+ def on_connection_listener_fetch_loop(_event)
63
+ mark_polling_tick
64
+ end
65
+
66
+ {
67
+ consume: :consumed,
68
+ revoke: :revoked,
69
+ shutting_down: :shutdown,
70
+ tick: :ticked
71
+ }.each do |before, after|
72
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
73
+ # Tick on starting work
74
+ # @param _event [Karafka::Core::Monitoring::Event]
75
+ def on_consumer_#{before}(_event)
76
+ mark_consumption_tick
77
+ end
78
+
79
+ # Tick on finished work
80
+ # @param _event [Karafka::Core::Monitoring::Event]
81
+ def on_consumer_#{after}(_event)
82
+ clear_consumption_tick
83
+ end
84
+ RUBY
85
+ end
86
+
87
+ # @param _event [Karafka::Core::Monitoring::Event]
88
+ def on_error_occurred(_event)
89
+ clear_consumption_tick
90
+ clear_polling_tick
91
+ end
92
+
93
+ # Reports the current status once in a while
94
+ #
95
+ # @param _event [Karafka::Core::Monitoring::Event]
96
+ def on_statistics_emitted(_event)
97
+ periodically do
98
+ return unless node
99
+
100
+ current_status = status
101
+
102
+ current_status.positive? ? node.unhealthy(current_status) : node.healthy
103
+ end
104
+ end
105
+
106
+ private
107
+
108
+ # @return [Integer] object id of the current thread
109
+ def thread_id
110
+ Thread.current.object_id
111
+ end
112
+
113
+ # Update the polling tick time for current thread
114
+ def mark_polling_tick
115
+ synchronize do
116
+ @pollings[thread_id] = monotonic_now
117
+ end
118
+ end
119
+
120
+ # Clear current thread polling time tracker
121
+ def clear_polling_tick
122
+ synchronize do
123
+ @pollings.delete(thread_id)
124
+ end
125
+ end
126
+
127
+ # Update the processing tick time
128
+ def mark_consumption_tick
129
+ synchronize do
130
+ @consumptions[thread_id] = monotonic_now
131
+ end
132
+ end
133
+
134
+ # Clear current thread consumption time tracker
135
+ def clear_consumption_tick
136
+ synchronize do
137
+ @consumptions.delete(thread_id)
138
+ end
139
+ end
140
+
141
+ # Did we exceed any of the ttls
142
+ # @return [Integer] 0 when all is ok, failure code (1, 2 or 3) otherwise
143
+ def status
144
+ time = monotonic_now
145
+
146
+ return 1 if @pollings.values.any? { |tick| (time - tick) > @polling_ttl }
147
+ return 2 if @consumptions.values.any? { |tick| (time - tick) > @consuming_ttl }
148
+ return 3 if rss_mb > @memory_limit
149
+
150
+ 0
151
+ end
152
+
153
+ # @return [Integer] RSS in MB for the current process
154
+ # @note Since swarm is linux only, we do not have to worry about getting RSS on other OSes
155
+ def rss_mb
156
+ kb_rss = 0
157
+
158
+ IO.readlines("/proc/#{node.pid}/status").each do |line|
159
+ next unless line.start_with?('VmRSS:')
160
+
161
+ kb_rss = line.split[1].to_i
162
+
163
+ break
164
+ end
165
+
166
+ (kb_rss / 1_024.to_i).round
167
+ end
168
+ end
169
+ end
170
+ end
171
+ end
@@ -14,6 +14,8 @@ module Karafka
14
14
  SIGTERM
15
15
  SIGTTIN
16
16
  SIGTSTP
17
+ SIGCHLD
18
+ SIGUSR1
17
19
  ].freeze
18
20
 
19
21
  HANDLED_SIGNALS.each do |signal|
@@ -32,16 +34,40 @@ module Karafka
32
34
  RUBY
33
35
  end
34
36
 
37
+ # Assigns a callback that will run on any supported signal that has at least one callback
38
+ # registered already.
39
+ # @param block [Proc] code we want to run
40
+ # @note This will only bind to signals that already have at least one callback defined
41
+ def on_any_active(&block)
42
+ HANDLED_SIGNALS.each do |signal|
43
+ next unless @callbacks.key?(signal)
44
+
45
+ public_send(:"on_#{signal.to_s.downcase}", &block)
46
+ end
47
+ end
48
+
35
49
  # Creates an instance of process and creates empty hash for callbacks
36
50
  def initialize
37
51
  @callbacks = Hash.new { |hsh, key| hsh[key] = [] }
38
52
  @supervised = false
39
53
  end
40
54
 
55
+ # Clears all the defined callbacks. Useful for post-fork cleanup when parent already defined
56
+ # some signals
57
+ def clear
58
+ @callbacks.clear
59
+ end
60
+
41
61
  # Method catches all HANDLED_SIGNALS and performs appropriate callbacks (if defined)
42
62
  # @note If there are no callbacks, this method will just ignore a given signal that was sent
43
63
  def supervise
44
- HANDLED_SIGNALS.each { |signal| trap_signal(signal) }
64
+ HANDLED_SIGNALS.each do |signal|
65
+ # Supervise only signals for which we have defined callbacks
66
+ next unless @callbacks.key?(signal)
67
+
68
+ trap_signal(signal)
69
+ end
70
+
45
71
  @supervised = true
46
72
  end
47
73
 
@@ -15,6 +15,8 @@ module Karafka
15
15
  :independent,
16
16
  # Move to DLQ and mark as consumed in transactional mode (if applicable)
17
17
  :transactional,
18
+ # Strategy to apply (if strategies supported)
19
+ :strategy,
18
20
  keyword_init: true
19
21
  ) do
20
22
  alias_method :active?, :active
@@ -8,6 +8,12 @@ module Karafka
8
8
  # @note One subscription group will always belong to one consumer group, but one consumer
9
9
  # group can have multiple subscription groups.
10
10
  class SubscriptionGroup
11
+ include Helpers::ConfigImporter.new(
12
+ activity_manager: %i[internal routing activity_manager],
13
+ client_id: %i[client_id],
14
+ node: %i[swarm node]
15
+ )
16
+
11
17
  attr_reader :id, :name, :topics, :kafka, :consumer_group
12
18
 
13
19
  # Lock for generating new ids safely
@@ -67,7 +73,7 @@ module Karafka
67
73
 
68
74
  # @return [Boolean] is this subscription group one of the active ones
69
75
  def active?
70
- Karafka::App.config.internal.routing.activity_manager.active?(:subscription_groups, name)
76
+ activity_manager.active?(:subscription_groups, name)
71
77
  end
72
78
 
73
79
  # @return [Array<String>] names of topics to which we should subscribe.
@@ -93,15 +99,9 @@ module Karafka
93
99
  def build_kafka
94
100
  kafka = Setup::AttributesMap.consumer(@topics.first.kafka.dup)
95
101
 
96
- # If we use static group memberships, there can be a case, where same instance id would
97
- # be set on many subscription groups as the group instance id from Karafka perspective is
98
- # set per config. Each instance even if they are subscribed to different topics needs to
99
- # have it fully unique. To make sure of that, we just add extra postfix at the end that
100
- # increments.
101
- group_instance_id = kafka.fetch(:'group.instance.id', false)
102
+ inject_group_instance_id(kafka)
102
103
 
103
- kafka[:'group.instance.id'] = "#{group_instance_id}_#{@position}" if group_instance_id
104
- kafka[:'client.id'] ||= Karafka::App.config.client_id
104
+ kafka[:'client.id'] ||= client_id
105
105
  kafka[:'group.id'] ||= @consumer_group.id
106
106
  kafka[:'auto.offset.reset'] ||= @topics.first.initial_offset
107
107
  # Karafka manages the offsets based on the processing state, thus we do not rely on the
@@ -110,6 +110,28 @@ module Karafka
110
110
  kafka.freeze
111
111
  kafka
112
112
  end
113
+
114
+ # If we use static group memberships, there can be a case, where same instance id would
115
+ # be set on many subscription groups as the group instance id from Karafka perspective is
116
+ # set per config. Each instance even if they are subscribed to different topics needs to
117
+ # have it fully unique. To make sure of that, we just add extra postfix at the end that
118
+ # increments.
119
+ #
120
+ # We also handle a swarm case, where the same setup would run from many forked nodes, hence
121
+ # affecting the instance id and causing conflicts
122
+ # @param kafka [Hash] kafka level config
123
+ def inject_group_instance_id(kafka)
124
+ group_instance_prefix = kafka.fetch(:'group.instance.id', false)
125
+
126
+ # If group instance id was not even configured, do nothing
127
+ return unless group_instance_prefix
128
+
129
+ # If there is a node, we need to take its id and inject it as well so multiple forks can
130
+ # have different instances ids but they are reproducible
131
+ components = [group_instance_prefix, node ? node.id : nil, @position]
132
+
133
+ kafka[:'group.instance.id'] = components.compact.join('_')
134
+ end
113
135
  end
114
136
  end
115
137
  end
@@ -18,6 +18,10 @@ module Karafka
18
18
  workers = Processing::WorkersBatch.new(jobs_queue)
19
19
  listeners = Connection::ListenersBatch.new(jobs_queue)
20
20
 
21
+ # We mark it prior to delegating to the manager as manager will have to start at least one
22
+ # connection to Kafka, hence running
23
+ Karafka::App.run!
24
+
21
25
  # Register all the listeners so they can be started and managed
22
26
  @manager.register(listeners)
23
27
 
@@ -3,16 +3,6 @@
3
3
  module Karafka
4
4
  # Karafka consuming server class
5
5
  class Server
6
- # How long should we sleep between checks on shutting down consumers
7
- SUPERVISION_SLEEP = 0.1
8
- # What system exit code should we use when we terminated forcefully
9
- FORCEFUL_EXIT_CODE = 2
10
- # This factor allows us to calculate how many times we have to sleep before
11
- # a forceful shutdown
12
- SUPERVISION_CHECK_FACTOR = (1 / SUPERVISION_SLEEP)
13
-
14
- private_constant :SUPERVISION_SLEEP, :FORCEFUL_EXIT_CODE, :SUPERVISION_CHECK_FACTOR
15
-
16
6
  class << self
17
7
  # Set of consuming threads. Each consumer thread contains a single consumer
18
8
  attr_accessor :listeners
@@ -36,12 +26,20 @@ module Karafka
36
26
  config.internal.routing.activity_manager.to_h
37
27
  )
38
28
 
29
+ # We clear as we do not want parent handlers in case of working from fork
30
+ process.clear
39
31
  process.on_sigint { stop }
40
32
  process.on_sigquit { stop }
41
33
  process.on_sigterm { stop }
42
34
  process.on_sigtstp { quiet }
35
+ # Needed for instrumentation
36
+ process.on_sigttin {}
43
37
  process.supervise
44
38
 
39
+ # This will only run when not in a swarm mode. In swarm mode the server runs post-fork, so
40
+ # warmup will do nothing
41
+ Karafka::App.warmup
42
+
45
43
  # Start is blocking until stop is called and when we stop, it will wait until
46
44
  # all of the things are ready to stop
47
45
  start
@@ -61,10 +59,9 @@ module Karafka
61
59
  end
62
60
 
63
61
  # Starts Karafka with a supervision
64
- # @note We don't need to sleep because Karafka::Fetcher is locking and waiting to
65
- # finish loop (and it won't happen until we explicitly want to stop)
62
+ # @note We don't need to sleep because Karafka::Runner is locking and waiting to finish loop
63
+ # (and it won't happen until we explicitly want to stop)
66
64
  def start
67
- Karafka::App.run!
68
65
  Karafka::Runner.new.call
69
66
  end
70
67
 
@@ -87,13 +84,13 @@ module Karafka
87
84
  # We check from time to time (for the timeout period) if all the threads finished
88
85
  # their work and if so, we can just return and normal shutdown process will take place
89
86
  # We divide it by 1000 because we use time in ms.
90
- ((timeout / 1_000) * SUPERVISION_CHECK_FACTOR).to_i.times do
87
+ ((timeout / 1_000) * (1 / config.internal.supervision_sleep)).to_i.times do
91
88
  all_listeners_stopped = listeners.all?(&:stopped?)
92
89
  all_workers_stopped = workers.none?(&:alive?)
93
90
 
94
91
  return if all_listeners_stopped && all_workers_stopped
95
92
 
96
- sleep SUPERVISION_SLEEP
93
+ sleep(config.internal.supervision_sleep)
97
94
  end
98
95
 
99
96
  raise Errors::ForcefulShutdownError
@@ -117,7 +114,7 @@ module Karafka
117
114
  return unless process.supervised?
118
115
 
119
116
  # exit! is not within the instrumentation as it would not trigger due to exit
120
- Kernel.exit!(FORCEFUL_EXIT_CODE)
117
+ Kernel.exit!(config.internal.forceful_exit_code)
121
118
  ensure
122
119
  # We need to check if it wasn't an early exit to make sure that only on stop invocation
123
120
  # can change the status after everything is closed
@@ -105,6 +105,17 @@ module Karafka
105
105
  # @see https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
106
106
  setting :kafka, default: {}
107
107
 
108
+ # Public configuration for swarm operations
109
+ setting :swarm do
110
+ # option [Integer] how many processes do we want to run in a swarm mode
111
+ # Keep in mind this is only applicable when running in a swarm mode
112
+ setting :nodes, default: 3
113
+ # This is set automatically when we fork. Used to hold reference that may be needed
114
+ # for static group membership, supervision and more. If set to `false`, it means this
115
+ # process is not a fork
116
+ setting :node, default: false
117
+ end
118
+
108
119
  # Admin specific settings.
109
120
  #
110
121
  # Since admin operations are often specific, they may require specific librdkafka settings
@@ -151,7 +162,6 @@ module Karafka
151
162
  # @note In the future, we need to have a single process representation for all the karafka
152
163
  # instances
153
164
  setting :process, default: Process.new
154
-
155
165
  # Interval of "ticking". This is used to define the maximum time between consecutive
156
166
  # polling of the main rdkafka queue. It should match also the `statistics.interval.ms`
157
167
  # smallest value defined in any of the per-kafka settings, so metrics are published with
@@ -162,6 +172,36 @@ module Karafka
162
172
  # not to have enough time to run. This (not directly) defines also a single poll
163
173
  # max timeout as to allow for frequent enough events polling
164
174
  setting :tick_interval, default: 5_000
175
+ # How long should we sleep between checks on shutting down consumers
176
+ setting :supervision_sleep, default: 0.1
177
+ # What system exit code should we use when we terminated forcefully
178
+ setting :forceful_exit_code, default: 2
179
+
180
+ setting :swarm do
181
+ # Manager for swarm nodes control
182
+ setting :manager, default: Swarm::Manager.new
183
+ # Exit code we exit an orphaned child with to indicate something went wrong
184
+ setting :orphaned_exit_code, default: 3
185
+ # syscall number for https://man7.org/linux/man-pages/man2/pidfd_open.2.html
186
+ setting :pidfd_open_syscall, default: 434
187
+ # syscall number for https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html
188
+ setting :pidfd_signal_syscall, default: 424
189
+ # How often (in ms) should we control our nodes
190
+ # This is maximum time after which we will check. This can happen more often in case of
191
+ # system events.
192
+ setting :supervision_interval, default: 30_000
193
+ # How often should each node report its status
194
+ setting :liveness_interval, default: 10_000
195
+ # Listener used to report nodes state to the supervisor
196
+ setting :liveness_listener, default: Swarm::LivenessListener.new
197
+ # How long should we wait for any info from the node before we consider it hanging at
198
+ # stop it
199
+ setting :node_report_timeout, default: 30_000
200
+ # How long should we wait before restarting a node. This can prevent us from having a
201
+ # case where for some external reason our spawned process would die immediately and we
202
+ # would immediately try to start it back in an endless loop
203
+ setting :node_restart_timeout, default: 5_000
204
+ end
165
205
 
166
206
  # Namespace for CLI related settings
167
207
  setting :cli do
@@ -176,7 +216,6 @@ module Karafka
176
216
  # option subscription_groups_builder [Routing::SubscriptionGroupsBuilder] subscription
177
217
  # group builder
178
218
  setting :subscription_groups_builder, default: Routing::SubscriptionGroupsBuilder.new
179
-
180
219
  # Internally assigned list of limits on routings active for the current process
181
220
  # This can be altered by the CLI command
182
221
  setting :activity_manager, default: Routing::ActivityManager.new
@@ -7,6 +7,7 @@ module Karafka
7
7
  STATES = {
8
8
  initializing: :initialize!,
9
9
  initialized: :initialized!,
10
+ supervising: :supervise!,
10
11
  running: :run!,
11
12
  # will no longer pickup any work, but current work will be finished
12
13
  quieting: :quiet!,
@@ -49,8 +50,8 @@ module Karafka
49
50
 
50
51
  def #{transition}
51
52
  MUTEX.synchronize do
52
- # Do not allow reverse state transitions (we always go one way) or transition to the same
53
- # state as currently
53
+ # Do not allow reverse state transitions (we always go one way) or transition to the
54
+ # same state as currently
54
55
  return if @status && STATES.keys.index(:#{state}) <= STATES.keys.index(@status)
55
56
 
56
57
  @status = :#{state}
@@ -78,6 +79,7 @@ module Karafka
78
79
  def done?
79
80
  # Short-track for the most common case not to invoke all others on normal execution
80
81
  return false if running?
82
+ return false if supervising?
81
83
 
82
84
  stopping? || stopped? || quieting? || quiet? || terminated?
83
85
  end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ module Swarm
5
+ # Simple listener for swarm nodes that:
6
+ # - reports once in a while to make sure that supervisor is aware we do not hang
7
+ # - makes sure we did not become an orphan and if so, exits
8
+ class LivenessListener
9
+ include Karafka::Core::Helpers::Time
10
+ include Helpers::ConfigImporter.new(
11
+ node: %i[swarm node],
12
+ liveness_interval: %i[internal swarm liveness_interval],
13
+ orphaned_exit_code: %i[internal swarm orphaned_exit_code]
14
+ )
15
+
16
+ def initialize
17
+ @last_checked_at = 0
18
+ @mutex = Mutex.new
19
+ end
20
+
21
+ # Since there may be many statistics emitted from multiple listeners, we do not want to write
22
+ # statuses that often. Instead we do it only once in a while which should be enough
23
+ #
24
+ # While this may provide a small lag in the orphaned detection, it does not really matter
25
+ # as it will be picked up fast enough.
26
+ # @param _event [Karafka::Core::Monitoring::Event]
27
+ def on_statistics_emitted(_event)
28
+ periodically do
29
+ Kernel.exit!(orphaned_exit_code) if node.orphaned?
30
+
31
+ node.healthy
32
+ end
33
+ end
34
+
35
+ private
36
+
37
+ # Wraps the logic with a mutex
38
+ # @param block [Proc] code we want to run in mutex
39
+ def synchronize(&block)
40
+ @mutex.synchronize(&block)
41
+ end
42
+
43
+ # Runs requested code once in a while
44
+ def periodically
45
+ return if monotonic_now - @last_checked_at < liveness_interval
46
+
47
+ synchronize do
48
+ @last_checked_at = monotonic_now
49
+
50
+ yield
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end