RubyGems - ruby-kafka - Versions diffs - 0.7.8 → 1.1.0 - Mend

ruby-kafka 0.7.8 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/.circleci/config.yml +135 -3
data/.github/workflows/stale.yml +19 -0
data/CHANGELOG.md +26 -0
data/README.md +26 -0
data/lib/kafka/async_producer.rb +3 -0
data/lib/kafka/client.rb +49 -1
data/lib/kafka/cluster.rb +52 -0
data/lib/kafka/connection.rb +3 -0
data/lib/kafka/consumer.rb +56 -11
data/lib/kafka/consumer_group.rb +10 -1
data/lib/kafka/datadog.rb +18 -11
data/lib/kafka/fetched_batch_generator.rb +1 -1
data/lib/kafka/fetcher.rb +5 -2
data/lib/kafka/offset_manager.rb +12 -1
data/lib/kafka/producer.rb +4 -1
data/lib/kafka/prometheus.rb +316 -0
data/lib/kafka/protocol/join_group_request.rb +8 -2
data/lib/kafka/protocol/metadata_response.rb +1 -1
data/lib/kafka/protocol/offset_fetch_request.rb +3 -1
data/lib/kafka/protocol/record_batch.rb +5 -4
data/lib/kafka/sasl/scram.rb +15 -12
data/lib/kafka/ssl_context.rb +4 -2
data/lib/kafka/tagged_logger.rb +25 -20
data/lib/kafka/version.rb +1 -1
data/ruby-kafka.gemspec +4 -3
metadata +29 -7

data/lib/kafka/connection.rb CHANGED

@@ -58,6 +58,9 @@ module Kafka
       @connect_timeout = connect_timeout || CONNECT_TIMEOUT
       @socket_timeout = socket_timeout || SOCKET_TIMEOUT
       @ssl_context = ssl_context
+      @socket = nil
+      @last_request = nil
     end
     def to_s

data/lib/kafka/consumer.rb CHANGED

@@ -44,7 +44,7 @@ module Kafka
   #
   class Consumer
-    def initialize(cluster:, logger:, instrumenter:, group:, fetcher:, offset_manager:, session_timeout:, heartbeat:)
+    def initialize(cluster:, logger:, instrumenter:, group:, fetcher:, offset_manager:, session_timeout:, heartbeat:, refresh_topic_interval: 0)
       @cluster = cluster
       @logger = TaggedLogger.new(logger)
       @instrumenter = instrumenter
@@ -53,6 +53,7 @@ module Kafka
       @session_timeout = session_timeout
       @fetcher = fetcher
       @heartbeat = heartbeat
+      @refresh_topic_interval = refresh_topic_interval
       @pauses = Hash.new {|h, k|
         h[k] = Hash.new {|h2, k2|
@@ -73,6 +74,15 @@ module Kafka
       #   when user commits message other than last in a batch, this would make ruby-kafka refetch
       #   some already consumed messages
       @current_offsets = Hash.new { |h, k| h[k] = {} }
+      # Map storing subscribed topics with their configuration
+      @subscribed_topics = Concurrent::Map.new
+      # Set storing topics that matched topics in @subscribed_topics
+      @matched_topics = Set.new
+      # Whether join_group must be executed again because new topics are added
+      @join_group_for_new_topics = false
     end
     # Subscribes the consumer to a topic.
@@ -97,13 +107,12 @@ module Kafka
     def subscribe(topic_or_regex, default_offset: nil, start_from_beginning: true, max_bytes_per_partition: 1048576)
       default_offset ||= start_from_beginning ? :earliest : :latest
-      if topic_or_regex.is_a?(Regexp)
-        cluster_topics.select { |topic| topic =~ topic_or_regex }.each do |topic|
-          subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
-        end
-      else
-        subscribe_to_topic(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
-      end
+      @subscribed_topics[topic_or_regex] = {
+        default_offset: default_offset,
+        start_from_beginning: start_from_beginning,
+        max_bytes_per_partition: max_bytes_per_partition
+      }
+      scan_for_subscribing
       nil
     end
@@ -116,7 +125,6 @@ module Kafka
     def stop
       @running = false
       @fetcher.stop
-      @cluster.disconnect
     end
     # Pause processing of a specific topic partition.
@@ -308,6 +316,7 @@ module Kafka
               topic: batch.topic,
               partition: batch.partition,
               last_offset: batch.last_offset,
+              last_create_time: batch.messages.last.try(:create_time),
               offset_lag: batch.offset_lag,
               highwater_mark_offset: batch.highwater_mark_offset,
               message_count: batch.messages.count,
@@ -401,6 +410,7 @@ module Kafka
       while running?
         begin
           @instrumenter.instrument("loop.consumer") do
+            refresh_topic_list_if_enabled
             yield
           end
         rescue HeartbeatError
@@ -432,6 +442,7 @@ module Kafka
       # important that members explicitly tell Kafka when they're leaving.
       make_final_offsets_commit!
       @group.leave rescue nil
+      @cluster.disconnect
       @running = false
       @logger.pop_tags
     end
@@ -452,6 +463,8 @@ module Kafka
     end
     def join_group
+      @join_group_for_new_topics = false
       old_generation_id = @group.generation_id
       @group.join
@@ -513,11 +526,19 @@ module Kafka
       end
     end
+    def refresh_topic_list_if_enabled
+      return if @refresh_topic_interval <= 0
+      return if @refreshed_at && @refreshed_at + @refresh_topic_interval > Time.now
+      scan_for_subscribing
+      @refreshed_at = Time.now
+    end
     def fetch_batches
       # Return early if the consumer has been stopped.
       return [] if shutting_down?
-      join_group unless @group.member?
+      join_group if !@group.member? || @join_group_for_new_topics
       trigger_heartbeat
@@ -525,7 +546,7 @@ module Kafka
       if !@fetcher.data?
         @logger.debug "No batches to process"
-        sleep 2
+        sleep(@fetcher.max_wait_time || 2)
         []
       else
         tag, message = @fetcher.poll
@@ -571,10 +592,34 @@ module Kafka
       end
     end
+    def scan_for_subscribing
+      @subscribed_topics.each do |topic_or_regex, config|
+        default_offset = config.fetch(:default_offset)
+        start_from_beginning = config.fetch(:start_from_beginning)
+        max_bytes_per_partition = config.fetch(:max_bytes_per_partition)
+        if topic_or_regex.is_a?(Regexp)
+          subscribe_to_regex(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
+        else
+          subscribe_to_topic(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
+        end
+      end
+    end
+    def subscribe_to_regex(topic_regex, default_offset, start_from_beginning, max_bytes_per_partition)
+      cluster_topics.select { |topic| topic =~ topic_regex }.each do |topic|
+        subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
+      end
+    end
     def subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
+      return if @matched_topics.include?(topic)
+      @matched_topics.add(topic)
+      @join_group_for_new_topics = true
       @group.subscribe(topic)
       @offset_manager.set_default_offset(topic, default_offset)
       @fetcher.subscribe(topic, max_bytes_per_partition: max_bytes_per_partition)
+      @cluster.mark_as_stale!
     end
     def cluster_topics

data/lib/kafka/consumer_group.rb CHANGED

@@ -7,11 +7,12 @@ module Kafka
   class ConsumerGroup
     attr_reader :assigned_partitions, :generation_id, :group_id
-    def initialize(cluster:, logger:, group_id:, session_timeout:, retention_time:, instrumenter:)
+    def initialize(cluster:, logger:, group_id:, session_timeout:, rebalance_timeout:, retention_time:, instrumenter:)
       @cluster = cluster
       @logger = TaggedLogger.new(logger)
       @group_id = group_id
       @session_timeout = session_timeout
+      @rebalance_timeout = rebalance_timeout
       @instrumenter = instrumenter
       @member_id = ""
       @generation_id = nil
@@ -140,7 +141,9 @@ module Kafka
         response = coordinator.join_group(
           group_id: @group_id,
           session_timeout: @session_timeout,
+          rebalance_timeout: @rebalance_timeout,
           member_id: @member_id,
+          topics: @topics,
         )
         Protocol.handle_error(response.error_code)
@@ -158,6 +161,12 @@ module Kafka
       @member_id = ""
       sleep 1
+      retry
+    rescue CoordinatorLoadInProgress
+      @logger.error "Coordinator broker still loading, retrying in 1s..."
+      sleep 1
       retry
     end

data/lib/kafka/datadog.rb CHANGED

@@ -31,7 +31,7 @@ module Kafka
     class << self
       def statsd
-        @statsd ||= ::Datadog::Statsd.new(host, port, namespace: namespace, tags: tags)
+        @statsd ||= ::Datadog::Statsd.new(host, port, namespace: namespace, tags: tags, socket_path: socket_path)
       end
       def statsd=(statsd)
@@ -40,7 +40,7 @@ module Kafka
       end
       def host
-        @host ||= default_host
+        @host
       end
       def host=(host)
@@ -49,7 +49,7 @@ module Kafka
       end
       def port
-        @port ||= default_port
+        @port
       end
       def port=(port)
@@ -57,6 +57,15 @@ module Kafka
         clear
       end
+      def socket_path
+        @socket_path
+      end
+      def socket_path=(socket_path)
+        @socket_path = socket_path
+        clear
+      end
       def namespace
         @namespace ||= STATSD_NAMESPACE
       end
@@ -77,14 +86,6 @@ module Kafka
       private
-      def default_host
-        ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_HOST : ::Datadog::Statsd::DEFAULT_HOST
-      end
-      def default_port
-        ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_PORT : ::Datadog::Statsd::DEFAULT_PORT
-      end
       def clear
         @statsd && @statsd.close
         @statsd = nil
@@ -168,6 +169,8 @@ module Kafka
       def process_batch(event)
         offset = event.payload.fetch(:last_offset)
         messages = event.payload.fetch(:message_count)
+        create_time = event.payload.fetch(:last_create_time)
+        time_lag = create_time && ((Time.now - create_time) * 1000).to_i
         tags = {
           client: event.payload.fetch(:client_id),
@@ -184,6 +187,10 @@ module Kafka
         end
         gauge("consumer.offset", offset, tags: tags)
+        if time_lag
+          gauge("consumer.time_lag", time_lag, tags: tags)
+        end
       end
       def fetch_batch(event)

data/lib/kafka/fetched_batch_generator.rb CHANGED

@@ -48,7 +48,7 @@ module Kafka
               partition: @fetched_partition.partition
             )
           end
-        end
+        end.compact
       end
       FetchedBatch.new(
         topic: @topic,

data/lib/kafka/fetcher.rb CHANGED

@@ -4,7 +4,7 @@ require "kafka/fetch_operation"
 module Kafka
   class Fetcher
-    attr_reader :queue
+    attr_reader :queue, :max_wait_time
     def initialize(cluster:, logger:, instrumenter:, max_queue_size:, group:)
       @cluster = cluster
@@ -17,6 +17,9 @@ module Kafka
       @commands = Queue.new
       @next_offsets = Hash.new { |h, k| h[k] = {} }
+      # We are only running when someone calls start.
+      @running = false
       # Long poll until at least this many bytes can be fetched.
       @min_bytes = 1
@@ -110,7 +113,7 @@ module Kafka
       elsif @queue.size < @max_queue_size
         step
       else
-        @logger.warn "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
+        @logger.info "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
         sleep 1
       end
     ensure

data/lib/kafka/offset_manager.rb CHANGED

@@ -50,9 +50,20 @@ module Kafka
     # @param offset [Integer] the offset of the message that should be marked as processed.
     # @return [nil]
     def mark_as_processed(topic, partition, offset)
-      @uncommitted_offsets += 1
+      unless @group.assigned_to?(topic, partition)
+        @logger.debug "Not marking #{topic}/#{partition}:#{offset} as processed for partition not assigned to this consumer."
+        return
+      end
       @processed_offsets[topic] ||= {}
+      last_processed_offset = @processed_offsets[topic][partition] || -1
+      if last_processed_offset > offset + 1
+        @logger.debug "Not overwriting newer offset #{topic}/#{partition}:#{last_processed_offset - 1} with older #{offset}"
+        return
+      end
+      @uncommitted_offsets += 1
       # The committed offset should always be the offset of the next message that the
       # application will read, thus adding one to the last message processed.
       @processed_offsets[topic][partition] = offset + 1

data/lib/kafka/producer.rb CHANGED

@@ -188,11 +188,14 @@ module Kafka
     # @raise [BufferOverflow] if the maximum buffer size has been reached.
     # @return [nil]
     def produce(value, key: nil, headers: {}, topic:, partition: nil, partition_key: nil, create_time: Time.now)
+      # We want to fail fast if `topic` isn't a String
+      topic = topic.to_str
       message = PendingMessage.new(
         value: value && value.to_s,
         key: key && key.to_s,
         headers: headers,
-        topic: topic.to_s,
+        topic: topic,
         partition: partition && Integer(partition),
         partition_key: partition_key && partition_key.to_s,
         create_time: create_time

data/lib/kafka/prometheus.rb ADDED

@@ -0,0 +1,316 @@
+# frozen_string_literal: true
+#
+#  Subscriber to ruby_kafka to report metrics to prometheus
+#
+#  Usage:
+#     require "kafka/prometheus"
+#
+#  Once the file has been required, no further configuration is needed, all operational
+#  metrics are automatically emitted (Unless PROMETHEUS_NO_AUTO_START is set).
+#
+#  By Peter Mustel, T2 Data AB
+#
+begin
+  require 'prometheus/client'
+rescue LoadError
+  warn 'In order to report Kafka client metrics to Prometheus you need to install the `prometheus-client` gem.'
+  raise
+end
+require 'active_support/subscriber'
+module Kafka
+  module Prometheus
+    SIZE_BUCKETS = [1, 10, 100, 1000, 10_000, 100_000, 1_000_000].freeze
+    LATENCY_BUCKETS = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000].freeze
+    DELAY_BUCKETS = [1, 3, 10, 30, 100, 300, 1000, 3000, 10_000, 30_000].freeze
+    class << self
+      attr_accessor :registry
+      def start(registry = ::Prometheus::Client.registry)
+        @registry = registry
+        ConnectionSubscriber.attach_to 'connection.kafka'
+        ConsumerSubscriber.attach_to 'consumer.kafka'
+        ProducerSubscriber.attach_to 'producer.kafka'
+        AsyncProducerSubscriber.attach_to 'async_producer.kafka'
+        FetcherSubscriber.attach_to 'fetcher.kafka'
+      end
+    end
+    class ConnectionSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @api_calls = Prometheus.registry.counter(:api_calls, docstring: 'Total calls', labels: [:client, :api, :broker])
+        @api_latency = Prometheus.registry.histogram(:api_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :api, :broker])
+        @api_request_size = Prometheus.registry.histogram(:api_request_size, docstring: 'Request size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_response_size = Prometheus.registry.histogram(:api_response_size, docstring: 'Response size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_errors = Prometheus.registry.counter(:api_errors, docstring: 'Errors', labels: [:client, :api, :broker])
+      end
+      def request(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          api: event.payload.fetch(:api, 'unknown'),
+          broker: event.payload.fetch(:broker_host)
+        }
+        request_size = event.payload.fetch(:request_size, 0)
+        response_size = event.payload.fetch(:response_size, 0)
+        @api_calls.increment(labels: key)
+        @api_latency.observe(event.duration, labels: key)
+        @api_request_size.observe(request_size, labels: key)
+        @api_response_size.observe(response_size, labels: key)
+        @api_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+    end
+    class ConsumerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @process_messages = Prometheus.registry.counter(:consumer_process_messages, docstring: 'Total messages', labels: [:client, :group_id, :topic, :partition])
+        @process_message_errors = Prometheus.registry.counter(:consumer_process_message_errors, docstring: 'Total errors', labels: [:client, :group_id, :topic, :partition])
+        @process_message_latency =
+          Prometheus.registry.histogram(:consumer_process_message_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @offset_lag = Prometheus.registry.gauge(:consumer_offset_lag, docstring: 'Offset lag', labels: [:client, :group_id, :topic, :partition])
+        @time_lag = Prometheus.registry.gauge(:consumer_time_lag, docstring: 'Time lag of message', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_errors = Prometheus.registry.counter(:consumer_process_batch_errors, docstring: 'Total errors in batch', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_latency =
+          Prometheus.registry.histogram(:consumer_process_batch_latency, docstring: 'Latency in batch', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @batch_size = Prometheus.registry.histogram(:consumer_batch_size, docstring: 'Size of batch', buckets: SIZE_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @join_group = Prometheus.registry.histogram(:consumer_join_group, docstring: 'Time to join group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @join_group_errors = Prometheus.registry.counter(:consumer_join_group_errors, docstring: 'Total error in joining group', labels: [:client, :group_id])
+        @sync_group = Prometheus.registry.histogram(:consumer_sync_group, docstring: 'Time to sync group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @sync_group_errors = Prometheus.registry.counter(:consumer_sync_group_errors, docstring: 'Total error in syncing group', labels: [:client, :group_id])
+        @leave_group = Prometheus.registry.histogram(:consumer_leave_group, docstring: 'Time to leave group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @leave_group_errors = Prometheus.registry.counter(:consumer_leave_group_errors, docstring: 'Total error in leaving group', labels: [:client, :group_id])
+        @pause_duration = Prometheus.registry.gauge(:consumer_pause_duration, docstring: 'Pause duration', labels: [:client, :group_id, :topic, :partition])
+      end
+      def process_message(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        offset_lag = event.payload.fetch(:offset_lag)
+        create_time = event.payload.fetch(:create_time)
+        time_lag = create_time && ((Time.now - create_time) * 1000).to_i
+        if event.payload.key?(:exception)
+          @process_message_errors.increment(labels: key)
+        else
+          @process_message_latency.observe(event.duration, labels: key)
+          @process_messages.increment(labels: key)
+        end
+        @offset_lag.set(offset_lag, labels: key)
+        # Not all messages have timestamps.
+        return unless time_lag
+        @time_lag.set(time_lag, labels: key)
+      end
+      def process_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        message_count = event.payload.fetch(:message_count)
+        if event.payload.key?(:exception)
+          @process_batch_errors.increment(labels: key)
+        else
+          @process_batch_latency.observe(event.duration, labels: key)
+          @process_messages.increment(by: message_count, labels: key)
+        end
+      end
+      def fetch_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        offset_lag = event.payload.fetch(:offset_lag)
+        batch_size = event.payload.fetch(:message_count)
+        @batch_size.observe(batch_size, labels: key)
+        @offset_lag.set(offset_lag, labels: key)
+      end
+      def join_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @join_group.observe(event.duration, labels: key)
+        @join_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+      def sync_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @sync_group.observe(event.duration, labels: key)
+        @sync_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+      def leave_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @leave_group.observe(event.duration, labels: key)
+        @leave_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+      def pause_status(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        duration = event.payload.fetch(:duration)
+        @pause_duration.set(duration, labels: key)
+      end
+    end
+    class ProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @produce_messages = Prometheus.registry.counter(:producer_produced_messages, docstring: 'Produced messages total', labels: [:client, :topic])
+        @produce_message_size =
+          Prometheus.registry.histogram(:producer_message_size, docstring: 'Message size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @buffer_size = Prometheus.registry.histogram(:producer_buffer_size, docstring: 'Buffer size', buckets: SIZE_BUCKETS, labels: [:client])
+        @buffer_fill_ratio = Prometheus.registry.histogram(:producer_buffer_fill_ratio, docstring: 'Buffer fill ratio', labels: [:client])
+        @buffer_fill_percentage = Prometheus.registry.histogram(:producer_buffer_fill_percentage, docstring: 'Buffer fill percentage', labels: [:client])
+        @produce_errors = Prometheus.registry.counter(:producer_produce_errors, docstring: 'Produce errors', labels: [:client, :topic])
+        @deliver_errors = Prometheus.registry.counter(:producer_deliver_errors, docstring: 'Deliver error', labels: [:client])
+        @deliver_latency =
+          Prometheus.registry.histogram(:producer_deliver_latency, docstring: 'Delivery latency', buckets: LATENCY_BUCKETS, labels: [:client])
+        @deliver_messages = Prometheus.registry.counter(:producer_deliver_messages, docstring: 'Total count of delivered messages', labels: [:client])
+        @deliver_attempts = Prometheus.registry.histogram(:producer_deliver_attempts, docstring: 'Delivery attempts', labels: [:client])
+        @ack_messages = Prometheus.registry.counter(:producer_ack_messages, docstring: 'Ack', labels: [:client, :topic])
+        @ack_delay = Prometheus.registry.histogram(:producer_ack_delay, docstring: 'Ack delay', buckets: LATENCY_BUCKETS, labels: [:client, :topic])
+        @ack_errors = Prometheus.registry.counter(:producer_ack_errors, docstring: 'Ack errors', labels: [:client, :topic])
+      end
+      def produce_message(event)
+        client = event.payload.fetch(:client_id)
+        key = { client: client, topic: event.payload.fetch(:topic) }
+        message_size = event.payload.fetch(:message_size)
+        buffer_size = event.payload.fetch(:buffer_size)
+        max_buffer_size = event.payload.fetch(:max_buffer_size)
+        buffer_fill_ratio = buffer_size.to_f / max_buffer_size.to_f
+        buffer_fill_percentage = buffer_fill_ratio * 100.0
+        # This gets us the write rate.
+        @produce_messages.increment(labels: key)
+        @produce_message_size.observe(message_size, labels: key)
+        # This gets us the avg/max buffer size per producer.
+        @buffer_size.observe(buffer_size, labels: { client: client })
+        # This gets us the avg/max buffer fill ratio per producer.
+        @buffer_fill_ratio.observe(buffer_fill_ratio, labels: { client: client })
+        @buffer_fill_percentage.observe(buffer_fill_percentage, labels: { client: client })
+      end
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+      def deliver_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:delivered_message_count)
+        attempts = event.payload.fetch(:attempts)
+        @deliver_errors.increment(labels: key) if event.payload.key?(:exception)
+        @deliver_latency.observe(event.duration, labels: key)
+        # Messages delivered to Kafka:
+        @deliver_messages.increment(by: message_count, labels: key)
+        # Number of attempts to deliver messages:
+        @deliver_attempts.observe(attempts, labels: key)
+      end
+      def ack_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        # Number of messages ACK'd for the topic.
+        @ack_messages.increment(labels: key)
+        # Histogram of delay between a message being produced and it being ACK'd.
+        @ack_delay.observe(event.payload.fetch(:delay), labels: key)
+      end
+      def topic_error(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @ack_errors.increment(labels: key)
+      end
+    end
+    class AsyncProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.histogram(:async_producer_queue_size, docstring: 'Queue size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @queue_fill_ratio = Prometheus.registry.histogram(:async_producer_queue_fill_ratio, docstring: 'Queue fill ratio', labels: [:client, :topic])
+        @produce_errors = Prometheus.registry.counter(:async_producer_produce_errors, docstring: 'Producer errors', labels: [:client, :topic])
+        @dropped_messages = Prometheus.registry.counter(:async_producer_dropped_messages, docstring: 'Dropped messages', labels: [:client])
+      end
+      def enqueue_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        queue_size = event.payload.fetch(:queue_size)
+        max_queue_size = event.payload.fetch(:max_queue_size)
+        queue_fill_ratio = queue_size.to_f / max_queue_size.to_f
+        # This gets us the avg/max queue size per producer.
+        @queue_size.observe(queue_size, labels: key)
+        # This gets us the avg/max queue fill ratio per producer.
+        @queue_fill_ratio.observe(queue_fill_ratio, labels: key)
+      end
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+      def drop_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:message_count)
+        @dropped_messages.increment(by: message_count, labels: key)
+      end
+    end
+    class FetcherSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.gauge(:fetcher_queue_size, docstring: 'Queue size', labels: [:client, :group_id])
+      end
+      def loop(event)
+        queue_size = event.payload.fetch(:queue_size)
+        client = event.payload.fetch(:client_id)
+        group_id = event.payload.fetch(:group_id)
+        @queue_size.set(queue_size, labels: { client: client, group_id: group_id })
+      end
+    end
+  end
+end
+# To enable testability, it is possible to skip the start until test time
+Kafka::Prometheus.start unless defined?(PROMETHEUS_NO_AUTO_START)