karafka 2.3.0 → 2.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.rspec +2 -0
  4. data/CHANGELOG.md +15 -0
  5. data/Gemfile +1 -1
  6. data/Gemfile.lock +22 -22
  7. data/README.md +2 -2
  8. data/bin/integrations +2 -1
  9. data/bin/rspecs +6 -2
  10. data/config/locales/errors.yml +30 -8
  11. data/config/locales/pro_errors.yml +2 -0
  12. data/docker-compose.yml +1 -1
  13. data/lib/karafka/app.rb +14 -0
  14. data/lib/karafka/cli/base.rb +19 -0
  15. data/lib/karafka/cli/server.rb +62 -76
  16. data/lib/karafka/cli/swarm.rb +30 -0
  17. data/lib/karafka/constraints.rb +3 -3
  18. data/lib/karafka/contracts/config.rb +19 -0
  19. data/lib/karafka/errors.rb +12 -0
  20. data/lib/karafka/helpers/async.rb +13 -3
  21. data/lib/karafka/helpers/config_importer.rb +30 -0
  22. data/lib/karafka/instrumentation/logger_listener.rb +31 -0
  23. data/lib/karafka/instrumentation/notifications.rb +9 -0
  24. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +2 -0
  25. data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +72 -0
  26. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +11 -40
  27. data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +54 -0
  28. data/lib/karafka/pro/active_job/job_options_contract.rb +1 -1
  29. data/lib/karafka/pro/base_consumer.rb +16 -0
  30. data/lib/karafka/pro/connection/manager.rb +6 -1
  31. data/lib/karafka/pro/processing/coordinator.rb +13 -3
  32. data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +74 -0
  33. data/lib/karafka/pro/processing/coordinators/filters_applier.rb +107 -0
  34. data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +180 -0
  35. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +5 -7
  36. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +5 -7
  37. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom.rb +8 -10
  38. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom_vp.rb +8 -16
  39. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +5 -7
  40. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +5 -7
  41. data/lib/karafka/pro/processing/strategies/aj/dlq_mom.rb +8 -10
  42. data/lib/karafka/pro/processing/strategies/aj/dlq_mom_vp.rb +7 -9
  43. data/lib/karafka/pro/processing/strategies/dlq/default.rb +36 -10
  44. data/lib/karafka/pro/processing/strategies/dlq/ftr.rb +3 -7
  45. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +4 -8
  46. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +6 -9
  47. data/lib/karafka/pro/processing/strategies/dlq/ftr_mom.rb +5 -15
  48. data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +4 -8
  49. data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +6 -9
  50. data/lib/karafka/pro/processing/strategies/dlq/mom.rb +10 -20
  51. data/lib/karafka/pro/processing/strategies/vp/default.rb +7 -0
  52. data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb +6 -0
  53. data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +39 -0
  54. data/lib/karafka/pro/swarm/liveness_listener.rb +171 -0
  55. data/lib/karafka/process.rb +27 -1
  56. data/lib/karafka/routing/features/dead_letter_queue/config.rb +2 -0
  57. data/lib/karafka/routing/subscription_group.rb +31 -9
  58. data/lib/karafka/runner.rb +4 -0
  59. data/lib/karafka/server.rb +13 -16
  60. data/lib/karafka/setup/config.rb +41 -2
  61. data/lib/karafka/status.rb +4 -2
  62. data/lib/karafka/swarm/liveness_listener.rb +55 -0
  63. data/lib/karafka/swarm/manager.rb +217 -0
  64. data/lib/karafka/swarm/node.rb +179 -0
  65. data/lib/karafka/swarm/pidfd.rb +131 -0
  66. data/lib/karafka/swarm/supervisor.rb +184 -0
  67. data/lib/karafka/swarm.rb +27 -0
  68. data/lib/karafka/templates/karafka.rb.erb +0 -2
  69. data/lib/karafka/version.rb +1 -1
  70. data/lib/karafka.rb +1 -1
  71. data.tar.gz.sig +0 -0
  72. metadata +17 -4
  73. metadata.gz.sig +0 -0
  74. data/lib/karafka/pro/processing/filters_applier.rb +0 -105
  75. data/lib/karafka/pro/processing/virtual_offset_manager.rb +0 -177
@@ -28,6 +28,12 @@ module Karafka
28
28
  ).fetch('en').fetch('validations').fetch('topic')
29
29
  end
30
30
 
31
+ nested(:dead_letter_queue) do
32
+ # We use strategy based DLQ for every case in Pro
33
+ # For default (when no strategy) a default `max_retries` based strategy is used
34
+ required(:strategy) { |val| val.respond_to?(:call) }
35
+ end
36
+
31
37
  # Make sure that when we use virtual partitions with DLQ, at least one retry is set
32
38
  # We cannot use VP with DLQ without retries as we in order to provide ordering
33
39
  # warranties on errors with VP, we need to collapse the VPs concurrency and retry
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
+ module Karafka
15
+ module Pro
16
+ module Routing
17
+ module Features
18
+ class DeadLetterQueue < Base
19
+ # Expansions to the topic API in DLQ
20
+ module Topic
21
+ # @param strategy [#call, nil] Strategy we want to use or nil if a default strategy
22
+ # (same as in OSS) should be applied
23
+ # @param args [Hash] OSS DLQ arguments
24
+ def dead_letter_queue(strategy: nil, **args)
25
+ return @dead_letter_queue if @dead_letter_queue
26
+
27
+ super(**args).tap do |config|
28
+ # If explicit strategy is not provided, use the default approach from OSS
29
+ config.strategy = strategy || lambda do |_errors_tracker, attempt|
30
+ attempt > config.max_retries ? :dispatch : :retry
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component under a commercial license.
4
+ # This Karafka component is NOT licensed under LGPL.
5
+ #
6
+ # All of the commercial components are present in the lib/karafka/pro directory of this
7
+ # repository and their usage requires commercial license agreement.
8
+ #
9
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
10
+ #
11
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
12
+ # your code to Maciej Mensfeld.
13
+
14
+ module Karafka
15
+ module Pro
16
+ # Pro Swarm components namespace
17
+ module Swarm
18
+ # Pro listener that monitors RSS usage and other heartbeat metrics (if configured) to ensure
19
+ # that everything operates.
20
+ #
21
+ # It can:
22
+ # - monitor poll frequency to make sure things are polled often enough
23
+ # - monitor consumption to make sure we do not process data for too long
24
+ # - monitor RSS to make sure that we do not use too much memory
25
+ #
26
+ # By default it does **not** monitor memory; consuming and polling checks are configured in such
27
+ # a way to align with `max.poll.interval.ms` and other defaults.
28
+ #
29
+ # Failure statuses reported are as follows:
30
+ # - 1 - polling ttl exceeded
31
+ # - 2 - consuming ttl exceeded
32
+ # - 3 - memory limit exceeded
33
+ #
34
+ # @note This listener should not break anything if subscribed in the supervisor prior to
35
+ # forking as it relies on server events for operations.
36
+ class LivenessListener < Karafka::Swarm::LivenessListener
37
+ # @param memory_limit [Integer] max memory in MB for this process to be considered healthy
38
+ # @param consuming_ttl [Integer] time in ms after which we consider consumption hanging.
39
+ # It allows us to define max consumption time after which supervisor should consider
40
+ # given process as hanging
41
+ # @param polling_ttl [Integer] max time in ms for polling. If polling (any) does not
42
+ # happen that often, process should be considered dead.
43
+ # @note The default TTL matches the default `max.poll.interval.ms`
44
+ def initialize(
45
+ memory_limit: Float::INFINITY,
46
+ consuming_ttl: 5 * 60 * 1_000,
47
+ polling_ttl: 5 * 60 * 1_000
48
+ )
49
+ @polling_ttl = polling_ttl
50
+ @consuming_ttl = consuming_ttl
51
+ # We cast it just in case someone would provide '10MB' or something similar
52
+ @memory_limit = memory_limit.is_a?(String) ? memory_limit.to_i : memory_limit
53
+ @pollings = {}
54
+ @consumptions = {}
55
+
56
+ super()
57
+ end
58
+
59
+ # Tick on each fetch
60
+ #
61
+ # @param _event [Karafka::Core::Monitoring::Event]
62
+ def on_connection_listener_fetch_loop(_event)
63
+ mark_polling_tick
64
+ end
65
+
66
+ {
67
+ consume: :consumed,
68
+ revoke: :revoked,
69
+ shutting_down: :shutdown,
70
+ tick: :ticked
71
+ }.each do |before, after|
72
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
73
+ # Tick on starting work
74
+ # @param _event [Karafka::Core::Monitoring::Event]
75
+ def on_consumer_#{before}(_event)
76
+ mark_consumption_tick
77
+ end
78
+
79
+ # Tick on finished work
80
+ # @param _event [Karafka::Core::Monitoring::Event]
81
+ def on_consumer_#{after}(_event)
82
+ clear_consumption_tick
83
+ end
84
+ RUBY
85
+ end
86
+
87
+ # @param _event [Karafka::Core::Monitoring::Event]
88
+ def on_error_occurred(_event)
89
+ clear_consumption_tick
90
+ clear_polling_tick
91
+ end
92
+
93
+ # Reports the current status once in a while
94
+ #
95
+ # @param _event [Karafka::Core::Monitoring::Event]
96
+ def on_statistics_emitted(_event)
97
+ periodically do
98
+ return unless node
99
+
100
+ current_status = status
101
+
102
+ current_status.positive? ? node.unhealthy(current_status) : node.healthy
103
+ end
104
+ end
105
+
106
+ private
107
+
108
+ # @return [Integer] object id of the current thread
109
+ def thread_id
110
+ Thread.current.object_id
111
+ end
112
+
113
+ # Update the polling tick time for current thread
114
+ def mark_polling_tick
115
+ synchronize do
116
+ @pollings[thread_id] = monotonic_now
117
+ end
118
+ end
119
+
120
+ # Clear current thread polling time tracker
121
+ def clear_polling_tick
122
+ synchronize do
123
+ @pollings.delete(thread_id)
124
+ end
125
+ end
126
+
127
+ # Update the processing tick time
128
+ def mark_consumption_tick
129
+ synchronize do
130
+ @consumptions[thread_id] = monotonic_now
131
+ end
132
+ end
133
+
134
+ # Clear current thread consumption time tracker
135
+ def clear_consumption_tick
136
+ synchronize do
137
+ @consumptions.delete(thread_id)
138
+ end
139
+ end
140
+
141
+ # Did we exceed any of the ttls
142
+ # @return [Integer] 0 when all is ok, failure code (1, 2 or 3) otherwise
143
+ def status
144
+ time = monotonic_now
145
+
146
+ return 1 if @pollings.values.any? { |tick| (time - tick) > @polling_ttl }
147
+ return 2 if @consumptions.values.any? { |tick| (time - tick) > @consuming_ttl }
148
+ return 3 if rss_mb > @memory_limit
149
+
150
+ 0
151
+ end
152
+
153
+ # @return [Integer] RSS in MB for the current process
154
+ # @note Since swarm is linux only, we do not have to worry about getting RSS on other OSes
155
+ def rss_mb
156
+ kb_rss = 0
157
+
158
+ IO.readlines("/proc/#{node.pid}/status").each do |line|
159
+ next unless line.start_with?('VmRSS:')
160
+
161
+ kb_rss = line.split[1].to_i
162
+
163
+ break
164
+ end
165
+
166
+ (kb_rss / 1_024.to_i).round
167
+ end
168
+ end
169
+ end
170
+ end
171
+ end
@@ -14,6 +14,8 @@ module Karafka
14
14
  SIGTERM
15
15
  SIGTTIN
16
16
  SIGTSTP
17
+ SIGCHLD
18
+ SIGUSR1
17
19
  ].freeze
18
20
 
19
21
  HANDLED_SIGNALS.each do |signal|
@@ -32,16 +34,40 @@ module Karafka
32
34
  RUBY
33
35
  end
34
36
 
37
+ # Assigns a callback that will run on any supported signal that has at least one callback
38
+ # registered already.
39
+ # @param block [Proc] code we want to run
40
+ # @note This will only bind to signals that already have at least one callback defined
41
+ def on_any_active(&block)
42
+ HANDLED_SIGNALS.each do |signal|
43
+ next unless @callbacks.key?(signal)
44
+
45
+ public_send(:"on_#{signal.to_s.downcase}", &block)
46
+ end
47
+ end
48
+
35
49
  # Creates an instance of process and creates empty hash for callbacks
36
50
  def initialize
37
51
  @callbacks = Hash.new { |hsh, key| hsh[key] = [] }
38
52
  @supervised = false
39
53
  end
40
54
 
55
+ # Clears all the defined callbacks. Useful for post-fork cleanup when parent already defined
56
+ # some signals
57
+ def clear
58
+ @callbacks.clear
59
+ end
60
+
41
61
  # Method catches all HANDLED_SIGNALS and performs appropriate callbacks (if defined)
42
62
  # @note If there are no callbacks, this method will just ignore a given signal that was sent
43
63
  def supervise
44
- HANDLED_SIGNALS.each { |signal| trap_signal(signal) }
64
+ HANDLED_SIGNALS.each do |signal|
65
+ # Supervise only signals for which we have defined callbacks
66
+ next unless @callbacks.key?(signal)
67
+
68
+ trap_signal(signal)
69
+ end
70
+
45
71
  @supervised = true
46
72
  end
47
73
 
@@ -15,6 +15,8 @@ module Karafka
15
15
  :independent,
16
16
  # Move to DLQ and mark as consumed in transactional mode (if applicable)
17
17
  :transactional,
18
+ # Strategy to apply (if strategies supported)
19
+ :strategy,
18
20
  keyword_init: true
19
21
  ) do
20
22
  alias_method :active?, :active
@@ -8,6 +8,12 @@ module Karafka
8
8
  # @note One subscription group will always belong to one consumer group, but one consumer
9
9
  # group can have multiple subscription groups.
10
10
  class SubscriptionGroup
11
+ include Helpers::ConfigImporter.new(
12
+ activity_manager: %i[internal routing activity_manager],
13
+ client_id: %i[client_id],
14
+ node: %i[swarm node]
15
+ )
16
+
11
17
  attr_reader :id, :name, :topics, :kafka, :consumer_group
12
18
 
13
19
  # Lock for generating new ids safely
@@ -67,7 +73,7 @@ module Karafka
67
73
 
68
74
  # @return [Boolean] is this subscription group one of the active ones
69
75
  def active?
70
- Karafka::App.config.internal.routing.activity_manager.active?(:subscription_groups, name)
76
+ activity_manager.active?(:subscription_groups, name)
71
77
  end
72
78
 
73
79
  # @return [Array<String>] names of topics to which we should subscribe.
@@ -93,15 +99,9 @@ module Karafka
93
99
  def build_kafka
94
100
  kafka = Setup::AttributesMap.consumer(@topics.first.kafka.dup)
95
101
 
96
- # If we use static group memberships, there can be a case, where same instance id would
97
- # be set on many subscription groups as the group instance id from Karafka perspective is
98
- # set per config. Each instance even if they are subscribed to different topics needs to
99
- # have it fully unique. To make sure of that, we just add extra postfix at the end that
100
- # increments.
101
- group_instance_id = kafka.fetch(:'group.instance.id', false)
102
+ inject_group_instance_id(kafka)
102
103
 
103
- kafka[:'group.instance.id'] = "#{group_instance_id}_#{@position}" if group_instance_id
104
- kafka[:'client.id'] ||= Karafka::App.config.client_id
104
+ kafka[:'client.id'] ||= client_id
105
105
  kafka[:'group.id'] ||= @consumer_group.id
106
106
  kafka[:'auto.offset.reset'] ||= @topics.first.initial_offset
107
107
  # Karafka manages the offsets based on the processing state, thus we do not rely on the
@@ -110,6 +110,28 @@ module Karafka
110
110
  kafka.freeze
111
111
  kafka
112
112
  end
113
+
114
+ # If we use static group memberships, there can be a case, where same instance id would
115
+ # be set on many subscription groups as the group instance id from Karafka perspective is
116
+ # set per config. Each instance even if they are subscribed to different topics needs to
117
+ # have it fully unique. To make sure of that, we just add extra postfix at the end that
118
+ # increments.
119
+ #
120
+ # We also handle a swarm case, where the same setup would run from many forked nodes, hence
121
+ # affecting the instance id and causing conflicts
122
+ # @param kafka [Hash] kafka level config
123
+ def inject_group_instance_id(kafka)
124
+ group_instance_prefix = kafka.fetch(:'group.instance.id', false)
125
+
126
+ # If group instance id was not even configured, do nothing
127
+ return unless group_instance_prefix
128
+
129
+ # If there is a node, we need to take its id and inject it as well so multiple forks can
130
+ # have different instances ids but they are reproducible
131
+ components = [group_instance_prefix, node ? node.id : nil, @position]
132
+
133
+ kafka[:'group.instance.id'] = components.compact.join('_')
134
+ end
113
135
  end
114
136
  end
115
137
  end
@@ -18,6 +18,10 @@ module Karafka
18
18
  workers = Processing::WorkersBatch.new(jobs_queue)
19
19
  listeners = Connection::ListenersBatch.new(jobs_queue)
20
20
 
21
+ # We mark it prior to delegating to the manager as manager will have to start at least one
22
+ # connection to Kafka, hence running
23
+ Karafka::App.run!
24
+
21
25
  # Register all the listeners so they can be started and managed
22
26
  @manager.register(listeners)
23
27
 
@@ -3,16 +3,6 @@
3
3
  module Karafka
4
4
  # Karafka consuming server class
5
5
  class Server
6
- # How long should we sleep between checks on shutting down consumers
7
- SUPERVISION_SLEEP = 0.1
8
- # What system exit code should we use when we terminated forcefully
9
- FORCEFUL_EXIT_CODE = 2
10
- # This factor allows us to calculate how many times we have to sleep before
11
- # a forceful shutdown
12
- SUPERVISION_CHECK_FACTOR = (1 / SUPERVISION_SLEEP)
13
-
14
- private_constant :SUPERVISION_SLEEP, :FORCEFUL_EXIT_CODE, :SUPERVISION_CHECK_FACTOR
15
-
16
6
  class << self
17
7
  # Set of consuming threads. Each consumer thread contains a single consumer
18
8
  attr_accessor :listeners
@@ -36,12 +26,20 @@ module Karafka
36
26
  config.internal.routing.activity_manager.to_h
37
27
  )
38
28
 
29
+ # We clear as we do not want parent handlers in case of working from fork
30
+ process.clear
39
31
  process.on_sigint { stop }
40
32
  process.on_sigquit { stop }
41
33
  process.on_sigterm { stop }
42
34
  process.on_sigtstp { quiet }
35
+ # Needed for instrumentation
36
+ process.on_sigttin {}
43
37
  process.supervise
44
38
 
39
+ # This will only run when not in a swarm mode. In swarm mode the server runs post-fork, so
40
+ # warmup will do nothing
41
+ Karafka::App.warmup
42
+
45
43
  # Start is blocking until stop is called and when we stop, it will wait until
46
44
  # all of the things are ready to stop
47
45
  start
@@ -61,10 +59,9 @@ module Karafka
61
59
  end
62
60
 
63
61
  # Starts Karafka with a supervision
64
- # @note We don't need to sleep because Karafka::Fetcher is locking and waiting to
65
- # finish loop (and it won't happen until we explicitly want to stop)
62
+ # @note We don't need to sleep because Karafka::Runner is locking and waiting to finish loop
63
+ # (and it won't happen until we explicitly want to stop)
66
64
  def start
67
- Karafka::App.run!
68
65
  Karafka::Runner.new.call
69
66
  end
70
67
 
@@ -87,13 +84,13 @@ module Karafka
87
84
  # We check from time to time (for the timeout period) if all the threads finished
88
85
  # their work and if so, we can just return and normal shutdown process will take place
89
86
  # We divide it by 1000 because we use time in ms.
90
- ((timeout / 1_000) * SUPERVISION_CHECK_FACTOR).to_i.times do
87
+ ((timeout / 1_000) * (1 / config.internal.supervision_sleep)).to_i.times do
91
88
  all_listeners_stopped = listeners.all?(&:stopped?)
92
89
  all_workers_stopped = workers.none?(&:alive?)
93
90
 
94
91
  return if all_listeners_stopped && all_workers_stopped
95
92
 
96
- sleep SUPERVISION_SLEEP
93
+ sleep(config.internal.supervision_sleep)
97
94
  end
98
95
 
99
96
  raise Errors::ForcefulShutdownError
@@ -117,7 +114,7 @@ module Karafka
117
114
  return unless process.supervised?
118
115
 
119
116
  # exit! is not within the instrumentation as it would not trigger due to exit
120
- Kernel.exit!(FORCEFUL_EXIT_CODE)
117
+ Kernel.exit!(config.internal.forceful_exit_code)
121
118
  ensure
122
119
  # We need to check if it wasn't an early exit to make sure that only on stop invocation
123
120
  # can change the status after everything is closed
@@ -105,6 +105,17 @@ module Karafka
105
105
  # @see https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
106
106
  setting :kafka, default: {}
107
107
 
108
+ # Public configuration for swarm operations
109
+ setting :swarm do
110
+ # option [Integer] how many processes do we want to run in a swarm mode
111
+ # Keep in mind this is only applicable when running in a swarm mode
112
+ setting :nodes, default: 3
113
+ # This is set automatically when we fork. Used to hold reference that may be needed
114
+ # for static group membership, supervision and more. If set to `false`, it means this
115
+ # process is not a fork
116
+ setting :node, default: false
117
+ end
118
+
108
119
  # Admin specific settings.
109
120
  #
110
121
  # Since admin operations are often specific, they may require specific librdkafka settings
@@ -151,7 +162,6 @@ module Karafka
151
162
  # @note In the future, we need to have a single process representation for all the karafka
152
163
  # instances
153
164
  setting :process, default: Process.new
154
-
155
165
  # Interval of "ticking". This is used to define the maximum time between consecutive
156
166
  # polling of the main rdkafka queue. It should match also the `statistics.interval.ms`
157
167
  # smallest value defined in any of the per-kafka settings, so metrics are published with
@@ -162,6 +172,36 @@ module Karafka
162
172
  # not to have enough time to run. This (not directly) defines also a single poll
163
173
  # max timeout as to allow for frequent enough events polling
164
174
  setting :tick_interval, default: 5_000
175
+ # How long should we sleep between checks on shutting down consumers
176
+ setting :supervision_sleep, default: 0.1
177
+ # What system exit code should we use when we terminated forcefully
178
+ setting :forceful_exit_code, default: 2
179
+
180
+ setting :swarm do
181
+ # Manager for swarm nodes control
182
+ setting :manager, default: Swarm::Manager.new
183
+ # Exit code we exit an orphaned child with to indicate something went wrong
184
+ setting :orphaned_exit_code, default: 3
185
+ # syscall number for https://man7.org/linux/man-pages/man2/pidfd_open.2.html
186
+ setting :pidfd_open_syscall, default: 434
187
+ # syscall number for https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html
188
+ setting :pidfd_signal_syscall, default: 424
189
+ # How often (in ms) should we control our nodes
190
+ # This is maximum time after which we will check. This can happen more often in case of
191
+ # system events.
192
+ setting :supervision_interval, default: 30_000
193
+ # How often should each node report its status
194
+ setting :liveness_interval, default: 10_000
195
+ # Listener used to report nodes state to the supervisor
196
+ setting :liveness_listener, default: Swarm::LivenessListener.new
197
+ # How long should we wait for any info from the node before we consider it hanging at
198
+ # stop it
199
+ setting :node_report_timeout, default: 30_000
200
+ # How long should we wait before restarting a node. This can prevent us from having a
201
+ # case where for some external reason our spawned process would die immediately and we
202
+ # would immediately try to start it back in an endless loop
203
+ setting :node_restart_timeout, default: 5_000
204
+ end
165
205
 
166
206
  # Namespace for CLI related settings
167
207
  setting :cli do
@@ -176,7 +216,6 @@ module Karafka
176
216
  # option subscription_groups_builder [Routing::SubscriptionGroupsBuilder] subscription
177
217
  # group builder
178
218
  setting :subscription_groups_builder, default: Routing::SubscriptionGroupsBuilder.new
179
-
180
219
  # Internally assigned list of limits on routings active for the current process
181
220
  # This can be altered by the CLI command
182
221
  setting :activity_manager, default: Routing::ActivityManager.new
@@ -7,6 +7,7 @@ module Karafka
7
7
  STATES = {
8
8
  initializing: :initialize!,
9
9
  initialized: :initialized!,
10
+ supervising: :supervise!,
10
11
  running: :run!,
11
12
  # will no longer pickup any work, but current work will be finished
12
13
  quieting: :quiet!,
@@ -49,8 +50,8 @@ module Karafka
49
50
 
50
51
  def #{transition}
51
52
  MUTEX.synchronize do
52
- # Do not allow reverse state transitions (we always go one way) or transition to the same
53
- # state as currently
53
+ # Do not allow reverse state transitions (we always go one way) or transition to the
54
+ # same state as currently
54
55
  return if @status && STATES.keys.index(:#{state}) <= STATES.keys.index(@status)
55
56
 
56
57
  @status = :#{state}
@@ -78,6 +79,7 @@ module Karafka
78
79
  def done?
79
80
  # Short-track for the most common case not to invoke all others on normal execution
80
81
  return false if running?
82
+ return false if supervising?
81
83
 
82
84
  stopping? || stopped? || quieting? || quiet? || terminated?
83
85
  end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ module Swarm
5
+ # Simple listener for swarm nodes that:
6
+ # - reports once in a while to make sure that supervisor is aware we do not hang
7
+ # - makes sure we did not become an orphan and if so, exits
8
+ class LivenessListener
9
+ include Karafka::Core::Helpers::Time
10
+ include Helpers::ConfigImporter.new(
11
+ node: %i[swarm node],
12
+ liveness_interval: %i[internal swarm liveness_interval],
13
+ orphaned_exit_code: %i[internal swarm orphaned_exit_code]
14
+ )
15
+
16
+ def initialize
17
+ @last_checked_at = 0
18
+ @mutex = Mutex.new
19
+ end
20
+
21
+ # Since there may be many statistics emitted from multiple listeners, we do not want to write
22
+ # statuses that often. Instead we do it only once in a while which should be enough
23
+ #
24
+ # While this may provide a small lag in the orphaned detection, it does not really matter
25
+ # as it will be picked up fast enough.
26
+ # @param _event [Karafka::Core::Monitoring::Event]
27
+ def on_statistics_emitted(_event)
28
+ periodically do
29
+ Kernel.exit!(orphaned_exit_code) if node.orphaned?
30
+
31
+ node.healthy
32
+ end
33
+ end
34
+
35
+ private
36
+
37
+ # Wraps the logic with a mutex
38
+ # @param block [Proc] code we want to run in mutex
39
+ def synchronize(&block)
40
+ @mutex.synchronize(&block)
41
+ end
42
+
43
+ # Runs requested code once in a while
44
+ def periodically
45
+ return if monotonic_now - @last_checked_at < liveness_interval
46
+
47
+ synchronize do
48
+ @last_checked_at = monotonic_now
49
+
50
+ yield
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end