karafka 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +22 -22
- data/README.md +2 -2
- data/bin/integrations +2 -1
- data/bin/rspecs +6 -2
- data/config/locales/errors.yml +30 -8
- data/config/locales/pro_errors.yml +2 -0
- data/docker-compose.yml +1 -1
- data/lib/karafka/app.rb +14 -0
- data/lib/karafka/cli/base.rb +19 -0
- data/lib/karafka/cli/server.rb +62 -76
- data/lib/karafka/cli/swarm.rb +30 -0
- data/lib/karafka/constraints.rb +3 -3
- data/lib/karafka/contracts/config.rb +19 -0
- data/lib/karafka/errors.rb +12 -0
- data/lib/karafka/helpers/async.rb +13 -3
- data/lib/karafka/helpers/config_importer.rb +30 -0
- data/lib/karafka/instrumentation/logger_listener.rb +31 -0
- data/lib/karafka/instrumentation/notifications.rb +9 -0
- data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +2 -0
- data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +72 -0
- data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +11 -40
- data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +54 -0
- data/lib/karafka/pro/active_job/job_options_contract.rb +1 -1
- data/lib/karafka/pro/base_consumer.rb +16 -0
- data/lib/karafka/pro/connection/manager.rb +6 -1
- data/lib/karafka/pro/processing/coordinator.rb +13 -3
- data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +74 -0
- data/lib/karafka/pro/processing/coordinators/filters_applier.rb +107 -0
- data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +180 -0
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom.rb +8 -10
- data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom_vp.rb +8 -16
- data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +5 -7
- data/lib/karafka/pro/processing/strategies/aj/dlq_mom.rb +8 -10
- data/lib/karafka/pro/processing/strategies/aj/dlq_mom_vp.rb +7 -9
- data/lib/karafka/pro/processing/strategies/dlq/default.rb +36 -10
- data/lib/karafka/pro/processing/strategies/dlq/ftr.rb +3 -7
- data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +4 -8
- data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +6 -9
- data/lib/karafka/pro/processing/strategies/dlq/ftr_mom.rb +5 -15
- data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +4 -8
- data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +6 -9
- data/lib/karafka/pro/processing/strategies/dlq/mom.rb +10 -20
- data/lib/karafka/pro/processing/strategies/vp/default.rb +7 -0
- data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb +6 -0
- data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +39 -0
- data/lib/karafka/pro/swarm/liveness_listener.rb +171 -0
- data/lib/karafka/process.rb +27 -1
- data/lib/karafka/routing/features/dead_letter_queue/config.rb +2 -0
- data/lib/karafka/routing/subscription_group.rb +31 -9
- data/lib/karafka/runner.rb +4 -0
- data/lib/karafka/server.rb +13 -16
- data/lib/karafka/setup/config.rb +41 -2
- data/lib/karafka/status.rb +4 -2
- data/lib/karafka/swarm/liveness_listener.rb +55 -0
- data/lib/karafka/swarm/manager.rb +217 -0
- data/lib/karafka/swarm/node.rb +179 -0
- data/lib/karafka/swarm/pidfd.rb +131 -0
- data/lib/karafka/swarm/supervisor.rb +184 -0
- data/lib/karafka/swarm.rb +27 -0
- data/lib/karafka/templates/karafka.rb.erb +0 -2
- data/lib/karafka/version.rb +1 -1
- data/lib/karafka.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +17 -4
- metadata.gz.sig +0 -0
- data/lib/karafka/pro/processing/filters_applier.rb +0 -105
- data/lib/karafka/pro/processing/virtual_offset_manager.rb +0 -177
data/lib/karafka/instrumentation/logger_listener.rb
@@ -226,6 +226,34 @@ module Karafka
        MSG
      end

+      # @param event [Karafka::Core::Monitoring::Event] event details including payload
+      def on_swarm_manager_stopping(event)
+        node = event[:node]
+        error "Swarm manager detected unhealthy node #{node.pid}. Sending TERM signal..."
+      end
+
+      # @param event [Karafka::Core::Monitoring::Event] event details including payload
+      def on_swarm_manager_terminating(event)
+        node = event[:node]
+        error "Swarm manager detected unresponsive node #{node.pid}. Sending KILL signal..."
+      end
+
+      # @param event [Karafka::Core::Monitoring::Event] event details including payload
+      def on_swarm_manager_before_fork(event)
+        debug "Swarm manager starting node with id: #{event[:node].id}"
+      end
+
+      # @param _event [Karafka::Core::Monitoring::Event] event details including payload
+      def on_swarm_node_after_fork(_event)
+        info "Swarm node #{::Process.pid} forked from #{::Process.ppid}"
+      end
+
+      # @param event [Karafka::Core::Monitoring::Event] event details including payload
+      def on_swarm_manager_control(event)
+        pids = event[:caller].nodes.map(&:pid).join(', ')
+        debug "Swarm manager checking nodes: #{pids}"
+      end
+
      # There are many types of errors that can occur in many places, but we provide a single
      # handler for all of them to simplify error instrumentation.
      # @param event [Karafka::Core::Monitoring::Event] event details including payload
@@ -259,6 +287,9 @@ module Karafka
        when 'connection.listener.fetch_loop.error'
          error "Listener fetch loop error: #{error}"
          error details
+        when 'swarm.supervisor.error'
+          fatal "Swarm supervisor crashed due to an error: #{error}"
+          fatal details
        when 'runner.call.error'
          fatal "Runner crashed due to an error: #{error}"
          fatal details
data/lib/karafka/instrumentation/notifications.rb
@@ -22,6 +22,8 @@ module Karafka

        app.initializing
        app.initialized
+        app.before_warmup
+        app.supervising
        app.running
        app.quieting
        app.quiet
@@ -73,6 +75,13 @@ module Karafka

        statistics.emitted

+        swarm.node.after_fork
+        swarm.manager.before_fork
+        swarm.manager.after_fork
+        swarm.manager.control
+        swarm.manager.stopping
+        swarm.manager.terminating
+
        worker.process
        worker.processed
        worker.completed
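Any of the newly registered swarm events can be hooked into with the standard monitor API. A minimal sketch of subscribing to one of them in `karafka.rb` (the logging body is illustrative, not part of the gem):

```ruby
# Hypothetical application code hooking into one of the new swarm events
Karafka.monitor.subscribe('swarm.manager.control') do |event|
  # As in the LoggerListener handler above, event[:caller] is the swarm manager
  pids = event[:caller].nodes.map(&:pid)

  Karafka.logger.debug("Swarm control tick, supervised node pids: #{pids.join(', ')}")
end
```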
data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb
@@ -129,6 +129,8 @@ module Karafka
            fatal "Runner crashed due to an error: #{error}"
          when 'app.stopping.error'
            error 'Forceful Karafka server stop'
+          when 'swarm.supervisor.error'
+            fatal "Swarm supervisor crashed due to an error: #{error}"
          when 'librdkafka.error'
            error "librdkafka internal error occurred: #{error}"
            # Those will only occur when retries in the client fail and when they did not stop
data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb (new file)
@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+require 'socket'
+
+module Karafka
+  module Instrumentation
+    module Vendors
+      # Namespace for instrumentation related with Kubernetes
+      module Kubernetes
+        # Base Kubernetes Listener providing basic HTTP server capabilities to respond with health
+        class BaseListener
+          include ::Karafka::Core::Helpers::Time
+
+          # All good with Karafka
+          OK_CODE = '204 No Content'
+
+          # Some timeouts, fail
+          FAIL_CODE = '500 Internal Server Error'
+
+          private_constant :OK_CODE, :FAIL_CODE
+
+          # @param hostname [String, nil] hostname or nil to bind on all
+          # @param port [Integer] TCP port on which we want to run our HTTP status server
+          def initialize(
+            hostname: nil,
+            port: 3000
+          )
+            @hostname = hostname
+            @port = port
+          end
+
+          private
+
+          # @return [Boolean] true if all good, false if we should tell k8s to kill this process
+          def healthy?
+            raise NotImplementedError, 'Implement in a subclass'
+          end
+
+          # Responds to a HTTP request with the process liveness status
+          def respond
+            client = @server.accept
+            client.gets
+            client.print "HTTP/1.1 #{healthy? ? OK_CODE : FAIL_CODE}\r\n"
+            client.print "Content-Type: text/plain\r\n"
+            client.print "\r\n"
+            client.close
+
+            true
+          rescue Errno::ECONNRESET, Errno::EPIPE, IOError
+            !@server.closed?
+          end
+
+          # Starts background thread with micro-http monitoring
+          def start
+            @server = TCPServer.new(*[@hostname, @port].compact)
+
+            Thread.new do
+              loop do
+                break unless respond
+              end
+            end
+          end
+
+          # Stops the server
+          def stop
+            @server.close
+          end
+        end
+      end
+    end
+  end
+end
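Because `BaseListener` only expects subclasses to provide a `healthy?` predicate (plus calls to the private `start`/`stop` helpers), custom HTTP probes can be layered on top of it. A rough sketch, assuming a hypothetical staleness rule and an app that subscribes the instance to `Karafka.monitor`:

```ruby
require 'karafka/instrumentation/vendors/kubernetes/base_listener'

# Hypothetical probe that fails once no batch has been consumed for a configurable period
class StalenessListener < Karafka::Instrumentation::Vendors::Kubernetes::BaseListener
  # @param max_age [Integer] ms after which the process is considered stale
  def initialize(max_age: 10 * 60 * 1_000, **args)
    @max_age = max_age
    @last_consumption = monotonic_now
    super(**args)
  end

  # Boot and shut down the HTTP endpoint together with the Karafka process
  def on_app_running(_event)
    start
  end

  def on_app_stopped(_event)
    stop
  end

  # Refresh the timestamp every time a batch finishes
  def on_consumer_consumed(_event)
    @last_consumption = monotonic_now
  end

  private

  # The only method BaseListener requires from a subclass
  def healthy?
    (monotonic_now - @last_consumption) < @max_age
  end
end

Karafka.monitor.subscribe(StalenessListener.new(port: 9001))
```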
data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb
@@ -1,6 +1,6 @@
 # frozen_string_literal: true

-require 'socket'
+require 'karafka/instrumentation/vendors/kubernetes/base_listener'

 module Karafka
   module Instrumentation
@@ -23,17 +23,9 @@ module Karafka
        #
        # @note In case of usage within an embedding with Puma, you need to select different port
        # then the one used by Puma itself.
-        class LivenessListener
-          include ::Karafka::Core::Helpers::Time
-
-          # All good with Karafka
-          OK_CODE = '204 No Content'
-
-          # Some timeouts, fail
-          FAIL_CODE = '500 Internal Server Error'
-
-          private_constant :OK_CODE, :FAIL_CODE
-
+        #
+        # @note Please use `Kubernetes::SwarmLivenessListener` when operating in the swarm mode
+        class LivenessListener < BaseListener
          # @param hostname [String, nil] hostname or nil to bind on all
          # @param port [Integer] TCP port on which we want to run our HTTP status server
          # @param consuming_ttl [Integer] time in ms after which we consider consumption hanging.
@@ -48,30 +40,23 @@ module Karafka
            consuming_ttl: 5 * 60 * 1_000,
            polling_ttl: 5 * 60 * 1_000
          )
-            @hostname = hostname
-            @port = port
            @polling_ttl = polling_ttl
            @consuming_ttl = consuming_ttl
            @mutex = Mutex.new
            @pollings = {}
            @consumptions = {}
+            super(hostname: hostname, port: port)
          end

          # @param _event [Karafka::Core::Monitoring::Event]
          def on_app_running(_event)
-            @server = TCPServer.new(*[@hostname, @port].compact)
-
-            Thread.new do
-              loop do
-                break unless respond
-              end
-            end
+            start
          end

          # Stop the http server when we stop the process
          # @param _event [Karafka::Core::Monitoring::Event]
          def on_app_stopped(_event)
-            @server.close
+            stop
          end

          # Tick on each fetch
@@ -148,29 +133,15 @@ module Karafka
            end
          end

-          # Responds to a HTTP request with the process liveness status
-          def respond
-            client = @server.accept
-            client.gets
-            client.print "HTTP/1.1 #{status}\r\n"
-            client.print "Content-Type: text/plain\r\n"
-            client.print "\r\n"
-            client.close
-
-            true
-          rescue Errno::ECONNRESET, Errno::EPIPE, IOError
-            !@server.closed?
-          end
-
          # Did we exceed any of the ttls
          # @return [String] 204 string if ok, 500 otherwise
-          def status
+          def healthy?
            time = monotonic_now

-            return FAIL_CODE if @pollings.values.any? { |tick| (time - tick) > @polling_ttl }
-            return FAIL_CODE if @consumptions.values.any? { |tick| (time - tick) > @consuming_ttl }
+            return false if @pollings.values.any? { |tick| (time - tick) > @polling_ttl }
+            return false if @consumptions.values.any? { |tick| (time - tick) > @consuming_ttl }

-            OK_CODE
+            true
          end
        end
      end
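Wiring the listener up is unchanged by this refactor; only the HTTP plumbing moved into `BaseListener`. A minimal sketch for `karafka.rb`, with illustrative port and TTL values:

```ruby
require 'karafka/instrumentation/vendors/kubernetes/liveness_listener'

listener = ::Karafka::Instrumentation::Vendors::Kubernetes::LivenessListener.new(
  hostname: '0.0.0.0',
  port: 9000,
  # Report 500 once polling or consumption has stalled for over 5 minutes
  polling_ttl: 5 * 60 * 1_000,
  consuming_ttl: 5 * 60 * 1_000
)

Karafka.monitor.subscribe(listener)
```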
data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb (new file)
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+require 'karafka/instrumentation/vendors/kubernetes/base_listener'
+
+module Karafka
+  module Instrumentation
+    module Vendors
+      module Kubernetes
+        # Kubernetes HTTP listener designed to operate with Karafka running in the swarm mode
+        # In the Swarm mode we supervise only the supervisor as other nodes are suppose to be
+        # managed by the swarm supervisor
+        class SwarmLivenessListener < BaseListener
+          # @param hostname [String, nil] hostname or nil to bind on all
+          # @param port [Integer] TCP port on which we want to run our HTTP status server
+          # @param controlling_ttl [Integer] time in ms after which we consider the supervising
+          #   thread dead because it is not controlling nodes. When configuring this, please take
+          #   into consideration, that during shutdown of the swarm, there is no controlling
+          #   happening.
+          def initialize(
+            hostname: nil,
+            port: 3000,
+            controlling_ttl: 60 * 1_000
+          )
+            @hostname = hostname
+            @port = port
+            @controlling_ttl = controlling_ttl
+            @controlling = monotonic_now
+            super(port: port, hostname: hostname)
+          end
+
+          # Starts reporting in the supervisor only when it runs
+          # @param _event [Karafka::Core::Monitoring::Event]
+          def on_app_supervising(_event)
+            start
+          end
+
+          # Tick on each control
+          # @param _event [Karafka::Core::Monitoring::Event]
+          def on_swarm_manager_control(_event)
+            @controlling = monotonic_now
+          end
+
+          private
+
+          # Did we exceed any of the ttls
+          # @return [String] 204 string if ok, 500 otherwise
+          def healthy?
+            (monotonic_now - @controlling) < @controlling_ttl
+          end
+        end
+      end
+    end
+  end
+end
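In swarm mode the probe is attached to the supervisor process instead; a corresponding sketch (values illustrative):

```ruby
require 'karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener'

Karafka.monitor.subscribe(
  ::Karafka::Instrumentation::Vendors::Kubernetes::SwarmLivenessListener.new(
    hostname: '0.0.0.0',
    port: 9001,
    # Fail the probe when no swarm control loop has run for over 2 minutes
    controlling_ttl: 2 * 60 * 1_000
  )
)
```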
data/lib/karafka/pro/active_job/job_options_contract.rb
@@ -26,7 +26,7 @@ module Karafka
        end

        optional(:partitioner) { |val| val.respond_to?(:call) }
-        optional(:partition_key_type) { |val| %i[key partition_key].include?(val) }
+        optional(:partition_key_type) { |val| %i[key partition_key partition].include?(val) }
        optional(:dispatch_method) do |val|
          %i[
            produce_async
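With `:partition` now accepted, a Pro ActiveJob job appears to be able to pin its dispatches to a concrete partition number rather than a partition key. A hedged sketch (the job class and the partitioning rule are made up):

```ruby
# Hypothetical job routing each account to a fixed partition
class AccountSyncJob < ActiveJob::Base
  queue_as :account_sync

  karafka_options(
    dispatch_method: :produce_async,
    # The partitioner result is used directly as the target partition
    partition_key_type: :partition,
    partitioner: ->(job) { job.arguments.first.to_i % 6 }
  )

  def perform(account_id)
    # ... sync logic ...
  end
end
```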
data/lib/karafka/pro/base_consumer.rb
@@ -42,6 +42,22 @@ module Karafka

      # By default we do nothing when ticking
      def tick; end
+
+      # @return [Karafka::Pro::Processing::Coordinators::ErrorsTracker] tracker for errors that
+      #   occurred during processing until another successful processing
+      #
+      # @note This will always contain **only** details of errors that occurred during `#consume`
+      #   because only those are retryable.
+      #
+      # @note This may contain more than one error because:
+      #   - this can collect various errors that might have happened during virtual partitions
+      #     execution
+      #   - errors can pile up during retries and until a clean run, they will be collected with
+      #     a limit of last 100. We do not store more because a consumer with an endless error loop
+      #     would cause memory leaks without such a limit.
+      def errors_tracker
+        coordinator.errors_tracker
+      end
    end
  end
end
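On a retry, the tracker still holds the errors from previous attempts, so a Pro consumer can branch on what has been failing. A rough sketch (consumer name, error class and fallback methods are all hypothetical; `ApplicationConsumer` stands for the app's base consumer):

```ruby
class EventsConsumer < ApplicationConsumer
  def consume
    # After repeated timeouts against an external API, fall back to local processing
    use_fallback = errors_tracker.size >= 3 &&
                   errors_tracker.all? { |error| error.is_a?(ExternalApi::TimeoutError) }

    messages.each do |message|
      use_fallback ? process_locally(message) : process_via_api(message)
    end
  end
end
```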
data/lib/karafka/pro/connection/manager.rb
@@ -114,10 +114,15 @@ module Karafka
          once(:quiet!) { active_listeners.each(&:quiet!) }

          # If we are in the process of moving to quiet state, we need to check it.
-          if Karafka::App.quieting? && active_listeners.all?(&:quiet?)
+          if Karafka::App.quieting?
+            # If we are quieting but not all active listeners are quiet we need to wait for all of
+            # them to reach the quiet state
+            return unless active_listeners.all?(&:quiet?)
+
            once(:quieted!) { Karafka::App.quieted! }
          end

+          # Do nothing if we moved to quiet state and want to be in it
          return if Karafka::App.quiet?

          # Since separate subscription groups are subscribed to different topics, there is no risk
data/lib/karafka/pro/processing/coordinator.rb
@@ -21,13 +21,14 @@ module Karafka

        def_delegators :@collapser, :collapsed?, :collapse_until!

-        attr_reader :filter, :virtual_offset_manager, :shared_mutex
+        attr_reader :filter, :virtual_offset_manager, :shared_mutex, :errors_tracker

        # @param args [Object] anything the base coordinator accepts
        def initialize(*args)
          super

          @executed = []
+          @errors_tracker = Coordinators::ErrorsTracker.new
          @flow_mutex = Mutex.new
          # Lock for user code synchronization
          # We do not want to mix coordinator lock with the user lock not to create cases where
|
|
36
37
|
# framework and can be used for user-facing locking
|
37
38
|
@shared_mutex = Mutex.new
|
38
39
|
@collapser = Collapser.new
|
39
|
-
@filter = FiltersApplier.new(self)
|
40
|
+
@filter = Coordinators::FiltersApplier.new(self)
|
40
41
|
|
41
42
|
return unless topic.virtual_partitions?
|
42
43
|
|
43
|
-
@virtual_offset_manager = VirtualOffsetManager.new(
|
44
|
+
@virtual_offset_manager = Coordinators::VirtualOffsetManager.new(
|
44
45
|
topic.name,
|
45
46
|
partition,
|
46
47
|
topic.virtual_partitions.offset_metadata_strategy
|
@@ -64,6 +65,14 @@ module Karafka

          @filter.apply!(messages)

+          # Do not clear coordinator errors storage when we are retrying, so we can reference the
+          # errors that have happened during recovery. This can be useful for implementing custom
+          # flows. There can be more errors than one when running with virtual partitions so we
+          # need to make sure we collect them all. Under collapse when we reference a given
+          # consumer we should be able to get all the errors and not just first/last.
+          #
+          # @note We use zero as the attempt mark because we are not "yet" in the attempt 1
+          @errors_tracker.clear if attempt.zero?
          @executed.clear

          # We keep the old processed offsets until the collapsing is done and regular processing
@@ -79,6 +88,7 @@ module Karafka
        # @param error [StandardError] error from the failure
        def failure!(consumer, error)
          super
+          @errors_tracker << error
          collapse_until!(@last_message.offset + 1)
        end

data/lib/karafka/pro/processing/coordinators/errors_tracker.rb (new file)
@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Processing
+      # Namespace for Pro coordinator related sub-components
+      module Coordinators
+        # Object used to track errors in between executions to be able to build error-type based
+        # recovery flows.
+        class ErrorsTracker
+          include Enumerable
+
+          # Max errors we keep in memory.
+          # We do not want to keep more because for DLQ-less this would cause memory-leaks.
+          STORAGE_LIMIT = 100
+
+          private_constant :STORAGE_LIMIT
+
+          def initialize
+            @errors = []
+          end
+
+          # Clears all the errors
+          def clear
+            @errors.clear
+          end
+
+          # @param error [StandardError] adds the error to the tracker
+          def <<(error)
+            @errors.shift if @errors.size >= STORAGE_LIMIT
+            @errors << error
+          end
+
+          # @return [Boolean] is the error tracker empty
+          def empty?
+            @errors.empty?
+          end
+
+          # @return [Integer] number of elements
+          def size
+            count
+          end
+
+          # @return [StandardError, nil] last error that occurred or nil if no errors
+          def last
+            @errors.last
+          end
+
+          # Iterates over errors
+          # @param block [Proc] code we want to run on each error
+          def each(&block)
+            @errors.each(&block)
+          end
+
+          # @return [Array<StandardError>] array with all the errors that occurred
+          def all
+            @errors
+          end
+        end
+      end
+    end
+  end
+end
data/lib/karafka/pro/processing/coordinators/filters_applier.rb (new file)
@@ -0,0 +1,107 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Processing
+      module Coordinators
+        # Applier for all filters we want to have. Whether related to limiting messages based
+        # on the payload or any other things.
+        #
+        # From the outside world perspective, this encapsulates all the filters.
+        # This means that this is the API we expose as a single filter, allowing us to control
+        # the filtering via many filters easily.
+        class FiltersApplier
+          # @return [Array] registered filters array. Useful if we want to inject internal context
+          #   aware filters.
+          attr_reader :filters
+
+          # @param coordinator [Pro::Coordinator] pro coordinator
+          def initialize(coordinator)
+            # Builds filters out of their factories
+            # We build it that way (providing topic and partition) because there may be a case
+            # where someone wants to have a specific logic that is per topic or partition. Like for
+            # example a case where there is a cache bypassing revocations for topic partition.
+            #
+            # We provide full Karafka routing topic here and not the name only, in case the filter
+            # would be customized based on other topic settings (like VPs, etc)
+            #
+            # This setup allows for biggest flexibility also because topic object holds the
+            # reference to the subscription group and consumer group
+            @filters = coordinator.topic.filtering.factories.map do |factory|
+              factory.call(coordinator.topic, coordinator.partition)
+            end
+          end
+
+          # @param messages [Array<Karafka::Messages::Message>] array with messages from the
+          #   partition
+          def apply!(messages)
+            return unless active?
+
+            @filters.each { |filter| filter.apply!(messages) }
+          end
+
+          # @return [Boolean] did we filter out any messages during filtering run
+          def applied?
+            return false unless active?
+
+            !applied.empty?
+          end
+
+          # @return [Symbol] consumer post-filtering action that should be taken
+          def action
+            return :skip unless applied?
+
+            # The highest priority is on a potential backoff from any of the filters because it is
+            # the less risky (delay and continue later)
+            return :pause if applied.any? { |filter| filter.action == :pause }
+
+            # If none of the filters wanted to pause, we can check for any that would want to seek
+            # and if there is any, we can go with this strategy
+            return :seek if applied.any? { |filter| filter.action == :seek }
+
+            :skip
+          end
+
+          # @return [Integer] minimum timeout we need to pause. This is the minimum for all the
+          #   filters to satisfy all of them.
+          def timeout
+            applied.map(&:timeout).compact.min || 0
+          end
+
+          # The first message we do need to get next time we poll. We use the minimum not to jump
+          # accidentally by over any.
+          # @return [Karafka::Messages::Message, nil] cursor message or nil if none
+          # @note Cursor message can also return the offset in the time format
+          def cursor
+            return nil unless active?
+
+            applied.map(&:cursor).compact.min_by(&:offset)
+          end
+
+          private
+
+          # @return [Boolean] is filtering active
+          def active?
+            !@filters.empty?
+          end
+
+          # @return [Array<Object>] filters that applied any sort of messages limiting
+          def applied
+            @filters.select(&:applied?)
+          end
+        end
+      end
+    end
+  end
+end
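The applier only relies on the small per-filter interface visible above (`apply!`, `applied?`, `action`, `timeout`, `cursor`), so a custom filter can be a plain object implementing it. A hedged sketch of such a filter and of registering its factory through the Pro filtering routing API (filter class, topic name and the one-hour cutoff are illustrative):

```ruby
# Hypothetical filter that silently drops messages older than one hour
class StaleMessagesFilter
  def initialize
    @applied = false
  end

  def apply!(messages)
    @applied = false
    cutoff = Time.now - 60 * 60

    messages.delete_if do |message|
      stale = message.timestamp < cutoff
      @applied = true if stale
      stale
    end
  end

  def applied?
    @applied
  end

  # We only drop data, so no pause or seek is ever requested
  def action
    :skip
  end

  def timeout
    0
  end

  def cursor
    nil
  end
end

class KarafkaApp < Karafka::App
  routes.draw do
    topic :events do
      consumer EventsConsumer
      # The factory receives the routing topic and partition, exactly as used by the applier
      filter ->(_topic, _partition) { StaleMessagesFilter.new }
    end
  end
end
```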