karafka 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.rspec +2 -0
  4. data/CHANGELOG.md +15 -0
  5. data/Gemfile +1 -1
  6. data/Gemfile.lock +22 -22
  7. data/README.md +2 -2
  8. data/bin/integrations +2 -1
  9. data/bin/rspecs +6 -2
  10. data/config/locales/errors.yml +30 -8
  11. data/config/locales/pro_errors.yml +2 -0
  12. data/docker-compose.yml +1 -1
  13. data/lib/karafka/app.rb +14 -0
  14. data/lib/karafka/cli/base.rb +19 -0
  15. data/lib/karafka/cli/server.rb +62 -76
  16. data/lib/karafka/cli/swarm.rb +30 -0
  17. data/lib/karafka/constraints.rb +3 -3
  18. data/lib/karafka/contracts/config.rb +19 -0
  19. data/lib/karafka/errors.rb +12 -0
  20. data/lib/karafka/helpers/async.rb +13 -3
  21. data/lib/karafka/helpers/config_importer.rb +30 -0
  22. data/lib/karafka/instrumentation/logger_listener.rb +31 -0
  23. data/lib/karafka/instrumentation/notifications.rb +9 -0
  24. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +2 -0
  25. data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +72 -0
  26. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +11 -40
  27. data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +54 -0
  28. data/lib/karafka/pro/active_job/job_options_contract.rb +1 -1
  29. data/lib/karafka/pro/base_consumer.rb +16 -0
  30. data/lib/karafka/pro/connection/manager.rb +6 -1
  31. data/lib/karafka/pro/processing/coordinator.rb +13 -3
  32. data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +74 -0
  33. data/lib/karafka/pro/processing/coordinators/filters_applier.rb +107 -0
  34. data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +180 -0
  35. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +5 -7
  36. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +5 -7
  37. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom.rb +8 -10
  38. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom_vp.rb +8 -16
  39. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +5 -7
  40. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +5 -7
  41. data/lib/karafka/pro/processing/strategies/aj/dlq_mom.rb +8 -10
  42. data/lib/karafka/pro/processing/strategies/aj/dlq_mom_vp.rb +7 -9
  43. data/lib/karafka/pro/processing/strategies/dlq/default.rb +36 -10
  44. data/lib/karafka/pro/processing/strategies/dlq/ftr.rb +3 -7
  45. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +4 -8
  46. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +6 -9
  47. data/lib/karafka/pro/processing/strategies/dlq/ftr_mom.rb +5 -15
  48. data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +4 -8
  49. data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +6 -9
  50. data/lib/karafka/pro/processing/strategies/dlq/mom.rb +10 -20
  51. data/lib/karafka/pro/processing/strategies/vp/default.rb +7 -0
  52. data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb +6 -0
  53. data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +39 -0
  54. data/lib/karafka/pro/swarm/liveness_listener.rb +171 -0
  55. data/lib/karafka/process.rb +27 -1
  56. data/lib/karafka/routing/features/dead_letter_queue/config.rb +2 -0
  57. data/lib/karafka/routing/subscription_group.rb +31 -9
  58. data/lib/karafka/runner.rb +4 -0
  59. data/lib/karafka/server.rb +13 -16
  60. data/lib/karafka/setup/config.rb +41 -2
  61. data/lib/karafka/status.rb +4 -2
  62. data/lib/karafka/swarm/liveness_listener.rb +55 -0
  63. data/lib/karafka/swarm/manager.rb +217 -0
  64. data/lib/karafka/swarm/node.rb +179 -0
  65. data/lib/karafka/swarm/pidfd.rb +131 -0
  66. data/lib/karafka/swarm/supervisor.rb +184 -0
  67. data/lib/karafka/swarm.rb +27 -0
  68. data/lib/karafka/templates/karafka.rb.erb +0 -2
  69. data/lib/karafka/version.rb +1 -1
  70. data/lib/karafka.rb +1 -1
  71. data.tar.gz.sig +0 -0
  72. metadata +17 -4
  73. metadata.gz.sig +0 -0
  74. data/lib/karafka/pro/processing/filters_applier.rb +0 -105
  75. data/lib/karafka/pro/processing/virtual_offset_manager.rb +0 -177
@@ -28,6 +28,12 @@ module Karafka
28
28
  ).fetch('en').fetch('validations').fetch('topic')
29
29
  end
30
30
 
31
+ nested(:dead_letter_queue) do
32
+ # We use strategy based DLQ for every case in Pro
33
+ # For default (when no strategy) a default `max_retries` based strategy is used
34
+ required(:strategy) { |val| val.respond_to?(:call) }
35
+ end
36
+
31
37
  # Make sure that when we use virtual partitions with DLQ, at least one retry is set
32
38
  # We cannot use VP with DLQ without retries because, in order to provide ordering
33
39
  # guarantees on errors with VP, we need to collapse the VPs concurrency and retry
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
+ module Karafka
15
+ module Pro
16
+ module Routing
17
+ module Features
18
+ class DeadLetterQueue < Base
19
+ # Expansions to the topic API in DLQ
20
+ module Topic
21
+ # @param strategy [#call, nil] Strategy we want to use or nil if a default strategy
22
+ # (same as in OSS) should be applied
23
+ # @param args [Hash] OSS DLQ arguments
24
+ def dead_letter_queue(strategy: nil, **args)
25
+ return @dead_letter_queue if @dead_letter_queue
26
+
27
+ super(**args).tap do |config|
28
+ # If explicit strategy is not provided, use the default approach from OSS
29
+ config.strategy = strategy || lambda do |_errors_tracker, attempt|
30
+ attempt > config.max_retries ? :dispatch : :retry
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
+ module Karafka
15
+ module Pro
16
+ # Pro Swarm components namespace
17
+ module Swarm
18
+ # Pro listener that monitors RSS usage and other heartbeat metrics (if configured) to ensure
19
+ # that everything operates.
20
+ #
21
+ # It can:
22
+ # - monitor poll frequency to make sure polling happens often enough
23
+ # - monitor consumption to make sure we do not process data for too long
24
+ # - monitor RSS to make sure that we do not use too much memory
25
+ #
26
+ # By default it does **not** monitor memory. Consuming and polling are configured in such
27
+ # a way to align with `max.poll.interval.ms` and other defaults.
28
+ #
29
+ # Failure statuses reported are as follows:
30
+ # - 1 - polling ttl exceeded
31
+ # - 2 - consuming ttl exceeded
32
+ # - 3 - memory limit exceeded
33
+ #
34
+ # @note This listener should not break anything if subscribed in the supervisor prior to
35
+ # forking as it relies on server events for operations.
36
+ class LivenessListener < Karafka::Swarm::LivenessListener
37
+ # @param memory_limit [Integer] max memory in MB for this process to be considered healthy
38
+ # @param consuming_ttl [Integer] time in ms after which we consider consumption hanging.
39
+ # It allows us to define max consumption time after which supervisor should consider
40
+ # given process as hanging
41
+ # @param polling_ttl [Integer] max time in ms for polling. If polling (any) does not
42
+ # happen that often, process should be considered dead.
43
+ # @note The default TTL matches the default `max.poll.interval.ms`
44
+ def initialize(
45
+ memory_limit: Float::INFINITY,
46
+ consuming_ttl: 5 * 60 * 1_000,
47
+ polling_ttl: 5 * 60 * 1_000
48
+ )
49
+ @polling_ttl = polling_ttl
50
+ @consuming_ttl = consuming_ttl
51
+ # We cast it just in case someone would provide '10MB' or something similar
52
+ @memory_limit = memory_limit.is_a?(String) ? memory_limit.to_i : memory_limit
53
+ @pollings = {}
54
+ @consumptions = {}
55
+
56
+ super()
57
+ end
58
+
59
+ # Tick on each fetch
60
+ #
61
+ # @param _event [Karafka::Core::Monitoring::Event]
62
+ def on_connection_listener_fetch_loop(_event)
63
+ mark_polling_tick
64
+ end
65
+
66
+ {
67
+ consume: :consumed,
68
+ revoke: :revoked,
69
+ shutting_down: :shutdown,
70
+ tick: :ticked
71
+ }.each do |before, after|
72
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
73
+ # Tick on starting work
74
+ # @param _event [Karafka::Core::Monitoring::Event]
75
+ def on_consumer_#{before}(_event)
76
+ mark_consumption_tick
77
+ end
78
+
79
+ # Tick on finished work
80
+ # @param _event [Karafka::Core::Monitoring::Event]
81
+ def on_consumer_#{after}(_event)
82
+ clear_consumption_tick
83
+ end
84
+ RUBY
85
+ end
86
+
87
+ # @param _event [Karafka::Core::Monitoring::Event]
88
+ def on_error_occurred(_event)
89
+ clear_consumption_tick
90
+ clear_polling_tick
91
+ end
92
+
93
+ # Reports the current status once in a while
94
+ #
95
+ # @param _event [Karafka::Core::Monitoring::Event]
96
+ def on_statistics_emitted(_event)
97
+ periodically do
98
+ return unless node
99
+
100
+ current_status = status
101
+
102
+ current_status.positive? ? node.unhealthy(current_status) : node.healthy
103
+ end
104
+ end
105
+
106
+ private
107
+
108
+ # @return [Integer] object id of the current thread
109
+ def thread_id
110
+ Thread.current.object_id
111
+ end
112
+
113
+ # Update the polling tick time for current thread
114
+ def mark_polling_tick
115
+ synchronize do
116
+ @pollings[thread_id] = monotonic_now
117
+ end
118
+ end
119
+
120
+ # Clear current thread polling time tracker
121
+ def clear_polling_tick
122
+ synchronize do
123
+ @pollings.delete(thread_id)
124
+ end
125
+ end
126
+
127
+ # Update the processing tick time
128
+ def mark_consumption_tick
129
+ synchronize do
130
+ @consumptions[thread_id] = monotonic_now
131
+ end
132
+ end
133
+
134
+ # Clear current thread consumption time tracker
135
+ def clear_consumption_tick
136
+ synchronize do
137
+ @consumptions.delete(thread_id)
138
+ end
139
+ end
140
+
141
+ # Did we exceed any of the ttls
142
+ # @return [Integer] 0 when all checks pass, otherwise the failure code: 1 (polling ttl exceeded), 2 (consuming ttl exceeded) or 3 (memory limit exceeded)
143
+ def status
144
+ time = monotonic_now
145
+
146
+ return 1 if @pollings.values.any? { |tick| (time - tick) > @polling_ttl }
147
+ return 2 if @consumptions.values.any? { |tick| (time - tick) > @consuming_ttl }
148
+ return 3 if rss_mb > @memory_limit
149
+
150
+ 0
151
+ end
152
+
153
+ # @return [Integer] RSS in MB for the current process
154
+ # @note Since swarm is linux only, we do not have to worry about getting RSS on other OSes
155
+ def rss_mb
156
+ kb_rss = 0
157
+
158
+ IO.readlines("/proc/#{node.pid}/status").each do |line|
159
+ next unless line.start_with?('VmRSS:')
160
+
161
+ kb_rss = line.split[1].to_i
162
+
163
+ break
164
+ end
165
+
166
+ (kb_rss / 1_024.to_i).round
167
+ end
168
+ end
169
+ end
170
+ end
171
+ end
@@ -14,6 +14,8 @@ module Karafka
14
14
  SIGTERM
15
15
  SIGTTIN
16
16
  SIGTSTP
17
+ SIGCHLD
18
+ SIGUSR1
17
19
  ].freeze
18
20
 
19
21
  HANDLED_SIGNALS.each do |signal|
@@ -32,16 +34,40 @@ module Karafka
32
34
  RUBY
33
35
  end
34
36
 
37
+ # Assigns a callback that will run on any supported signal that has at least one callback
38
+ # registered already.
39
+ # @param block [Proc] code we want to run
40
+ # @note This will only bind to signals that already have at least one callback defined
41
+ def on_any_active(&block)
42
+ HANDLED_SIGNALS.each do |signal|
43
+ next unless @callbacks.key?(signal)
44
+
45
+ public_send(:"on_#{signal.to_s.downcase}", &block)
46
+ end
47
+ end
48
+
35
49
  # Creates an instance of process and creates empty hash for callbacks
36
50
  def initialize
37
51
  @callbacks = Hash.new { |hsh, key| hsh[key] = [] }
38
52
  @supervised = false
39
53
  end
40
54
 
55
+ # Clears all the defined callbacks. Useful for post-fork cleanup when parent already defined
56
+ # some signals
57
+ def clear
58
+ @callbacks.clear
59
+ end
60
+
41
61
  # Method catches all HANDLED_SIGNALS and performs appropriate callbacks (if defined)
42
62
  # @note If there are no callbacks, this method will just ignore a given signal that was sent
43
63
  def supervise
44
- HANDLED_SIGNALS.each { |signal| trap_signal(signal) }
64
+ HANDLED_SIGNALS.each do |signal|
65
+ # Supervise only signals for which we have defined callbacks
66
+ next unless @callbacks.key?(signal)
67
+
68
+ trap_signal(signal)
69
+ end
70
+
45
71
  @supervised = true
46
72
  end
47
73
 
@@ -15,6 +15,8 @@ module Karafka
15
15
  :independent,
16
16
  # Move to DLQ and mark as consumed in transactional mode (if applicable)
17
17
  :transactional,
18
+ # Strategy to apply (if strategies supported)
19
+ :strategy,
18
20
  keyword_init: true
19
21
  ) do
20
22
  alias_method :active?, :active
@@ -8,6 +8,12 @@ module Karafka
8
8
  # @note One subscription group will always belong to one consumer group, but one consumer
9
9
  # group can have multiple subscription groups.
10
10
  class SubscriptionGroup
11
+ include Helpers::ConfigImporter.new(
12
+ activity_manager: %i[internal routing activity_manager],
13
+ client_id: %i[client_id],
14
+ node: %i[swarm node]
15
+ )
16
+
11
17
  attr_reader :id, :name, :topics, :kafka, :consumer_group
12
18
 
13
19
  # Lock for generating new ids safely
@@ -67,7 +73,7 @@ module Karafka
67
73
 
68
74
  # @return [Boolean] is this subscription group one of the active ones
69
75
  def active?
70
- Karafka::App.config.internal.routing.activity_manager.active?(:subscription_groups, name)
76
+ activity_manager.active?(:subscription_groups, name)
71
77
  end
72
78
 
73
79
  # @return [Array<String>] names of topics to which we should subscribe.
@@ -93,15 +99,9 @@ module Karafka
93
99
  def build_kafka
94
100
  kafka = Setup::AttributesMap.consumer(@topics.first.kafka.dup)
95
101
 
96
- # If we use static group memberships, there can be a case, where same instance id would
97
- # be set on many subscription groups as the group instance id from Karafka perspective is
98
- # set per config. Each instance even if they are subscribed to different topics needs to
99
- # have it fully unique. To make sure of that, we just add extra postfix at the end that
100
- # increments.
101
- group_instance_id = kafka.fetch(:'group.instance.id', false)
102
+ inject_group_instance_id(kafka)
102
103
 
103
- kafka[:'group.instance.id'] = "#{group_instance_id}_#{@position}" if group_instance_id
104
- kafka[:'client.id'] ||= Karafka::App.config.client_id
104
+ kafka[:'client.id'] ||= client_id
105
105
  kafka[:'group.id'] ||= @consumer_group.id
106
106
  kafka[:'auto.offset.reset'] ||= @topics.first.initial_offset
107
107
  # Karafka manages the offsets based on the processing state, thus we do not rely on the
@@ -110,6 +110,28 @@ module Karafka
110
110
  kafka.freeze
111
111
  kafka
112
112
  end
113
+
114
+ # If we use static group memberships, there can be a case, where same instance id would
115
+ # be set on many subscription groups as the group instance id from Karafka perspective is
116
+ # set per config. Each instance even if they are subscribed to different topics needs to
117
+ # have it fully unique. To make sure of that, we just add extra postfix at the end that
118
+ # increments.
119
+ #
120
+ # We also handle a swarm case, where the same setup would run from many forked nodes, hence
121
+ # affecting the instance id and causing conflicts
122
+ # @param kafka [Hash] kafka level config
123
+ def inject_group_instance_id(kafka)
124
+ group_instance_prefix = kafka.fetch(:'group.instance.id', false)
125
+
126
+ # If group instance id was not even configured, do nothing
127
+ return unless group_instance_prefix
128
+
129
+ # If there is a node, we need to take its id and inject it as well so multiple forks can
130
+ # have different instances ids but they are reproducible
131
+ components = [group_instance_prefix, node ? node.id : nil, @position]
132
+
133
+ kafka[:'group.instance.id'] = components.compact.join('_')
134
+ end
113
135
  end
114
136
  end
115
137
  end
@@ -18,6 +18,10 @@ module Karafka
18
18
  workers = Processing::WorkersBatch.new(jobs_queue)
19
19
  listeners = Connection::ListenersBatch.new(jobs_queue)
20
20
 
21
+ # We mark it prior to delegating to the manager as manager will have to start at least one
22
+ # connection to Kafka, hence running
23
+ Karafka::App.run!
24
+
21
25
  # Register all the listeners so they can be started and managed
22
26
  @manager.register(listeners)
23
27
 
@@ -3,16 +3,6 @@
3
3
  module Karafka
4
4
  # Karafka consuming server class
5
5
  class Server
6
- # How long should we sleep between checks on shutting down consumers
7
- SUPERVISION_SLEEP = 0.1
8
- # What system exit code should we use when we terminated forcefully
9
- FORCEFUL_EXIT_CODE = 2
10
- # This factor allows us to calculate how many times we have to sleep before
11
- # a forceful shutdown
12
- SUPERVISION_CHECK_FACTOR = (1 / SUPERVISION_SLEEP)
13
-
14
- private_constant :SUPERVISION_SLEEP, :FORCEFUL_EXIT_CODE, :SUPERVISION_CHECK_FACTOR
15
-
16
6
  class << self
17
7
  # Set of consuming threads. Each consumer thread contains a single consumer
18
8
  attr_accessor :listeners
@@ -36,12 +26,20 @@ module Karafka
36
26
  config.internal.routing.activity_manager.to_h
37
27
  )
38
28
 
29
+ # We clear as we do not want parent handlers in case of working from fork
30
+ process.clear
39
31
  process.on_sigint { stop }
40
32
  process.on_sigquit { stop }
41
33
  process.on_sigterm { stop }
42
34
  process.on_sigtstp { quiet }
35
+ # Needed for instrumentation
36
+ process.on_sigttin {}
43
37
  process.supervise
44
38
 
39
+ # This will only run when not in a swarm mode. In swarm mode the server runs post-fork, so
40
+ # warmup will do nothing
41
+ Karafka::App.warmup
42
+
45
43
  # Start is blocking until stop is called and when we stop, it will wait until
46
44
  # all of the things are ready to stop
47
45
  start
@@ -61,10 +59,9 @@ module Karafka
61
59
  end
62
60
 
63
61
  # Starts Karafka with a supervision
64
- # @note We don't need to sleep because Karafka::Fetcher is locking and waiting to
65
- # finish loop (and it won't happen until we explicitly want to stop)
62
+ # @note We don't need to sleep because Karafka::Runner is locking and waiting to finish loop
63
+ # (and it won't happen until we explicitly want to stop)
66
64
  def start
67
- Karafka::App.run!
68
65
  Karafka::Runner.new.call
69
66
  end
70
67
 
@@ -87,13 +84,13 @@ module Karafka
87
84
  # We check from time to time (for the timeout period) if all the threads finished
88
85
  # their work and if so, we can just return and normal shutdown process will take place
89
86
  # We divide it by 1000 because we use time in ms.
90
- ((timeout / 1_000) * SUPERVISION_CHECK_FACTOR).to_i.times do
87
+ ((timeout / 1_000) * (1 / config.internal.supervision_sleep)).to_i.times do
91
88
  all_listeners_stopped = listeners.all?(&:stopped?)
92
89
  all_workers_stopped = workers.none?(&:alive?)
93
90
 
94
91
  return if all_listeners_stopped && all_workers_stopped
95
92
 
96
- sleep SUPERVISION_SLEEP
93
+ sleep(config.internal.supervision_sleep)
97
94
  end
98
95
 
99
96
  raise Errors::ForcefulShutdownError
@@ -117,7 +114,7 @@ module Karafka
117
114
  return unless process.supervised?
118
115
 
119
116
  # exit! is not within the instrumentation as it would not trigger due to exit
120
- Kernel.exit!(FORCEFUL_EXIT_CODE)
117
+ Kernel.exit!(config.internal.forceful_exit_code)
121
118
  ensure
122
119
  # We need to check if it wasn't an early exit to make sure that only on stop invocation
123
120
  # can change the status after everything is closed
@@ -105,6 +105,17 @@ module Karafka
105
105
  # @see https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
106
106
  setting :kafka, default: {}
107
107
 
108
+ # Public configuration for swarm operations
109
+ setting :swarm do
110
+ # option [Integer] how many processes do we want to run in a swarm mode
111
+ # Keep in mind this is only applicable when running in a swarm mode
112
+ setting :nodes, default: 3
113
+ # This is set automatically when we fork. Used to hold reference that may be needed
114
+ # for static group membership, supervision and more. If set to `false`, it means this
115
+ # process is not a fork
116
+ setting :node, default: false
117
+ end
118
+
108
119
  # Admin specific settings.
109
120
  #
110
121
  # Since admin operations are often specific, they may require specific librdkafka settings
@@ -151,7 +162,6 @@ module Karafka
151
162
  # @note In the future, we need to have a single process representation for all the karafka
152
163
  # instances
153
164
  setting :process, default: Process.new
154
-
155
165
  # Interval of "ticking". This is used to define the maximum time between consecutive
156
166
  # polling of the main rdkafka queue. It should match also the `statistics.interval.ms`
157
167
  # smallest value defined in any of the per-kafka settings, so metrics are published with
@@ -162,6 +172,36 @@ module Karafka
162
172
  # not to have enough time to run. This (not directly) defines also a single poll
163
173
  # max timeout as to allow for frequent enough events polling
164
174
  setting :tick_interval, default: 5_000
175
+ # How long should we sleep between checks on shutting down consumers
176
+ setting :supervision_sleep, default: 0.1
177
+ # What system exit code should we use when we terminated forcefully
178
+ setting :forceful_exit_code, default: 2
179
+
180
+ setting :swarm do
181
+ # Manager for swarm nodes control
182
+ setting :manager, default: Swarm::Manager.new
183
+ # Exit code we exit an orphaned child with to indicate something went wrong
184
+ setting :orphaned_exit_code, default: 3
185
+ # syscall number for https://man7.org/linux/man-pages/man2/pidfd_open.2.html
186
+ setting :pidfd_open_syscall, default: 434
187
+ # syscall number for https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html
188
+ setting :pidfd_signal_syscall, default: 424
189
+ # How often (in ms) should we control our nodes
190
+ # This is maximum time after which we will check. This can happen more often in case of
191
+ # system events.
192
+ setting :supervision_interval, default: 30_000
193
+ # How often should each node report its status
194
+ setting :liveness_interval, default: 10_000
195
+ # Listener used to report nodes state to the supervisor
196
+ setting :liveness_listener, default: Swarm::LivenessListener.new
197
+ # How long should we wait for any info from the node before we consider it hanging at
198
+ # stop it
199
+ setting :node_report_timeout, default: 30_000
200
+ # How long should we wait before restarting a node. This can prevent us from having a
201
+ # case where for some external reason our spawned process would die immediately and we
202
+ # would immediately try to start it back in an endless loop
203
+ setting :node_restart_timeout, default: 5_000
204
+ end
165
205
 
166
206
  # Namespace for CLI related settings
167
207
  setting :cli do
@@ -176,7 +216,6 @@ module Karafka
176
216
  # option subscription_groups_builder [Routing::SubscriptionGroupsBuilder] subscription
177
217
  # group builder
178
218
  setting :subscription_groups_builder, default: Routing::SubscriptionGroupsBuilder.new
179
-
180
219
  # Internally assigned list of limits on routings active for the current process
181
220
  # This can be altered by the CLI command
182
221
  setting :activity_manager, default: Routing::ActivityManager.new
@@ -7,6 +7,7 @@ module Karafka
7
7
  STATES = {
8
8
  initializing: :initialize!,
9
9
  initialized: :initialized!,
10
+ supervising: :supervise!,
10
11
  running: :run!,
11
12
  # will no longer pickup any work, but current work will be finished
12
13
  quieting: :quiet!,
@@ -49,8 +50,8 @@ module Karafka
49
50
 
50
51
  def #{transition}
51
52
  MUTEX.synchronize do
52
- # Do not allow reverse state transitions (we always go one way) or transition to the same
53
- # state as currently
53
+ # Do not allow reverse state transitions (we always go one way) or transition to the
54
+ # same state as currently
54
55
  return if @status && STATES.keys.index(:#{state}) <= STATES.keys.index(@status)
55
56
 
56
57
  @status = :#{state}
@@ -78,6 +79,7 @@ module Karafka
78
79
  def done?
79
80
  # Short-track for the most common case not to invoke all others on normal execution
80
81
  return false if running?
82
+ return false if supervising?
81
83
 
82
84
  stopping? || stopped? || quieting? || quiet? || terminated?
83
85
  end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ module Swarm
5
+ # Simple listener for swarm nodes that:
6
+ # - reports once in a while to make sure that supervisor is aware we do not hang
7
+ # - makes sure we did not become an orphan and if so, exits
8
+ class LivenessListener
9
+ include Karafka::Core::Helpers::Time
10
+ include Helpers::ConfigImporter.new(
11
+ node: %i[swarm node],
12
+ liveness_interval: %i[internal swarm liveness_interval],
13
+ orphaned_exit_code: %i[internal swarm orphaned_exit_code]
14
+ )
15
+
16
+ def initialize
17
+ @last_checked_at = 0
18
+ @mutex = Mutex.new
19
+ end
20
+
21
+ # Since there may be many statistics emitted from multiple listeners, we do not want to write
22
+ # statuses that often. Instead we do it only once in a while which should be enough
23
+ #
24
+ # While this may provide a small lag in the orphaned detection, it does not really matter
25
+ # as it will be picked up fast enough.
26
+ # @param _event [Karafka::Core::Monitoring::Event]
27
+ def on_statistics_emitted(_event)
28
+ periodically do
29
+ Kernel.exit!(orphaned_exit_code) if node.orphaned?
30
+
31
+ node.healthy
32
+ end
33
+ end
34
+
35
+ private
36
+
37
+ # Wraps the logic with a mutex
38
+ # @param block [Proc] code we want to run in mutex
39
+ def synchronize(&block)
40
+ @mutex.synchronize(&block)
41
+ end
42
+
43
+ # Runs requested code once in a while
44
+ def periodically
45
+ return if monotonic_now - @last_checked_at < liveness_interval
46
+
47
+ synchronize do
48
+ @last_checked_at = monotonic_now
49
+
50
+ yield
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end