ruby-kafka 0.7.8 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -58,6 +58,9 @@ module Kafka
58
58
  @connect_timeout = connect_timeout || CONNECT_TIMEOUT
59
59
  @socket_timeout = socket_timeout || SOCKET_TIMEOUT
60
60
  @ssl_context = ssl_context
61
+
62
+ @socket = nil
63
+ @last_request = nil
61
64
  end
62
65
 
63
66
  def to_s
@@ -44,7 +44,7 @@ module Kafka
44
44
  #
45
45
  class Consumer
46
46
 
47
- def initialize(cluster:, logger:, instrumenter:, group:, fetcher:, offset_manager:, session_timeout:, heartbeat:)
47
+ def initialize(cluster:, logger:, instrumenter:, group:, fetcher:, offset_manager:, session_timeout:, heartbeat:, refresh_topic_interval: 0)
48
48
  @cluster = cluster
49
49
  @logger = TaggedLogger.new(logger)
50
50
  @instrumenter = instrumenter
@@ -53,6 +53,7 @@ module Kafka
53
53
  @session_timeout = session_timeout
54
54
  @fetcher = fetcher
55
55
  @heartbeat = heartbeat
56
+ @refresh_topic_interval = refresh_topic_interval
56
57
 
57
58
  @pauses = Hash.new {|h, k|
58
59
  h[k] = Hash.new {|h2, k2|
@@ -73,6 +74,15 @@ module Kafka
73
74
  # when user commits message other than last in a batch, this would make ruby-kafka refetch
74
75
  # some already consumed messages
75
76
  @current_offsets = Hash.new { |h, k| h[k] = {} }
77
+
78
+ # Map storing subscribed topics with their configuration
79
+ @subscribed_topics = Concurrent::Map.new
80
+
81
+ # Set storing topics that matched topics in @subscribed_topics
82
+ @matched_topics = Set.new
83
+
84
+ # Whether join_group must be executed again because new topics are added
85
+ @join_group_for_new_topics = false
76
86
  end
77
87
 
78
88
  # Subscribes the consumer to a topic.
@@ -97,13 +107,12 @@ module Kafka
97
107
  def subscribe(topic_or_regex, default_offset: nil, start_from_beginning: true, max_bytes_per_partition: 1048576)
98
108
  default_offset ||= start_from_beginning ? :earliest : :latest
99
109
 
100
- if topic_or_regex.is_a?(Regexp)
101
- cluster_topics.select { |topic| topic =~ topic_or_regex }.each do |topic|
102
- subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
103
- end
104
- else
105
- subscribe_to_topic(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
106
- end
110
+ @subscribed_topics[topic_or_regex] = {
111
+ default_offset: default_offset,
112
+ start_from_beginning: start_from_beginning,
113
+ max_bytes_per_partition: max_bytes_per_partition
114
+ }
115
+ scan_for_subscribing
107
116
 
108
117
  nil
109
118
  end
@@ -116,7 +125,6 @@ module Kafka
116
125
  def stop
117
126
  @running = false
118
127
  @fetcher.stop
119
- @cluster.disconnect
120
128
  end
121
129
 
122
130
  # Pause processing of a specific topic partition.
@@ -308,6 +316,7 @@ module Kafka
308
316
  topic: batch.topic,
309
317
  partition: batch.partition,
310
318
  last_offset: batch.last_offset,
319
+ last_create_time: batch.messages.last.try(:create_time),
311
320
  offset_lag: batch.offset_lag,
312
321
  highwater_mark_offset: batch.highwater_mark_offset,
313
322
  message_count: batch.messages.count,
@@ -401,6 +410,7 @@ module Kafka
401
410
  while running?
402
411
  begin
403
412
  @instrumenter.instrument("loop.consumer") do
413
+ refresh_topic_list_if_enabled
404
414
  yield
405
415
  end
406
416
  rescue HeartbeatError
@@ -432,6 +442,7 @@ module Kafka
432
442
  # important that members explicitly tell Kafka when they're leaving.
433
443
  make_final_offsets_commit!
434
444
  @group.leave rescue nil
445
+ @cluster.disconnect
435
446
  @running = false
436
447
  @logger.pop_tags
437
448
  end
@@ -452,6 +463,8 @@ module Kafka
452
463
  end
453
464
 
454
465
  def join_group
466
+ @join_group_for_new_topics = false
467
+
455
468
  old_generation_id = @group.generation_id
456
469
 
457
470
  @group.join
@@ -513,11 +526,19 @@ module Kafka
513
526
  end
514
527
  end
515
528
 
529
+ def refresh_topic_list_if_enabled
530
+ return if @refresh_topic_interval <= 0
531
+ return if @refreshed_at && @refreshed_at + @refresh_topic_interval > Time.now
532
+
533
+ scan_for_subscribing
534
+ @refreshed_at = Time.now
535
+ end
536
+
516
537
  def fetch_batches
517
538
  # Return early if the consumer has been stopped.
518
539
  return [] if shutting_down?
519
540
 
520
- join_group unless @group.member?
541
+ join_group if !@group.member? || @join_group_for_new_topics
521
542
 
522
543
  trigger_heartbeat
523
544
 
@@ -525,7 +546,7 @@ module Kafka
525
546
 
526
547
  if !@fetcher.data?
527
548
  @logger.debug "No batches to process"
528
- sleep 2
549
+ sleep(@fetcher.max_wait_time || 2)
529
550
  []
530
551
  else
531
552
  tag, message = @fetcher.poll
@@ -571,10 +592,34 @@ module Kafka
571
592
  end
572
593
  end
573
594
 
595
+ def scan_for_subscribing
596
+ @subscribed_topics.each do |topic_or_regex, config|
597
+ default_offset = config.fetch(:default_offset)
598
+ start_from_beginning = config.fetch(:start_from_beginning)
599
+ max_bytes_per_partition = config.fetch(:max_bytes_per_partition)
600
+ if topic_or_regex.is_a?(Regexp)
601
+ subscribe_to_regex(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
602
+ else
603
+ subscribe_to_topic(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
604
+ end
605
+ end
606
+ end
607
+
608
+ def subscribe_to_regex(topic_regex, default_offset, start_from_beginning, max_bytes_per_partition)
609
+ cluster_topics.select { |topic| topic =~ topic_regex }.each do |topic|
610
+ subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
611
+ end
612
+ end
613
+
574
614
  def subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
615
+ return if @matched_topics.include?(topic)
616
+ @matched_topics.add(topic)
617
+ @join_group_for_new_topics = true
618
+
575
619
  @group.subscribe(topic)
576
620
  @offset_manager.set_default_offset(topic, default_offset)
577
621
  @fetcher.subscribe(topic, max_bytes_per_partition: max_bytes_per_partition)
622
+ @cluster.mark_as_stale!
578
623
  end
579
624
 
580
625
  def cluster_topics
@@ -7,11 +7,12 @@ module Kafka
7
7
  class ConsumerGroup
8
8
  attr_reader :assigned_partitions, :generation_id, :group_id
9
9
 
10
- def initialize(cluster:, logger:, group_id:, session_timeout:, retention_time:, instrumenter:)
10
+ def initialize(cluster:, logger:, group_id:, session_timeout:, rebalance_timeout:, retention_time:, instrumenter:)
11
11
  @cluster = cluster
12
12
  @logger = TaggedLogger.new(logger)
13
13
  @group_id = group_id
14
14
  @session_timeout = session_timeout
15
+ @rebalance_timeout = rebalance_timeout
15
16
  @instrumenter = instrumenter
16
17
  @member_id = ""
17
18
  @generation_id = nil
@@ -140,7 +141,9 @@ module Kafka
140
141
  response = coordinator.join_group(
141
142
  group_id: @group_id,
142
143
  session_timeout: @session_timeout,
144
+ rebalance_timeout: @rebalance_timeout,
143
145
  member_id: @member_id,
146
+ topics: @topics,
144
147
  )
145
148
 
146
149
  Protocol.handle_error(response.error_code)
@@ -158,6 +161,12 @@ module Kafka
158
161
  @member_id = ""
159
162
  sleep 1
160
163
 
164
+ retry
165
+ rescue CoordinatorLoadInProgress
166
+ @logger.error "Coordinator broker still loading, retrying in 1s..."
167
+
168
+ sleep 1
169
+
161
170
  retry
162
171
  end
163
172
 
@@ -31,7 +31,7 @@ module Kafka
31
31
 
32
32
  class << self
33
33
  def statsd
34
- @statsd ||= ::Datadog::Statsd.new(host, port, namespace: namespace, tags: tags)
34
+ @statsd ||= ::Datadog::Statsd.new(host, port, namespace: namespace, tags: tags, socket_path: socket_path)
35
35
  end
36
36
 
37
37
  def statsd=(statsd)
@@ -40,7 +40,7 @@ module Kafka
40
40
  end
41
41
 
42
42
  def host
43
- @host ||= default_host
43
+ @host
44
44
  end
45
45
 
46
46
  def host=(host)
@@ -49,7 +49,7 @@ module Kafka
49
49
  end
50
50
 
51
51
  def port
52
- @port ||= default_port
52
+ @port
53
53
  end
54
54
 
55
55
  def port=(port)
@@ -57,6 +57,15 @@ module Kafka
57
57
  clear
58
58
  end
59
59
 
60
+ def socket_path
61
+ @socket_path
62
+ end
63
+
64
+ def socket_path=(socket_path)
65
+ @socket_path = socket_path
66
+ clear
67
+ end
68
+
60
69
  def namespace
61
70
  @namespace ||= STATSD_NAMESPACE
62
71
  end
@@ -77,14 +86,6 @@ module Kafka
77
86
 
78
87
  private
79
88
 
80
- def default_host
81
- ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_HOST : ::Datadog::Statsd::DEFAULT_HOST
82
- end
83
-
84
- def default_port
85
- ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_PORT : ::Datadog::Statsd::DEFAULT_PORT
86
- end
87
-
88
89
  def clear
89
90
  @statsd && @statsd.close
90
91
  @statsd = nil
@@ -168,6 +169,8 @@ module Kafka
168
169
  def process_batch(event)
169
170
  offset = event.payload.fetch(:last_offset)
170
171
  messages = event.payload.fetch(:message_count)
172
+ create_time = event.payload.fetch(:last_create_time)
173
+ time_lag = create_time && ((Time.now - create_time) * 1000).to_i
171
174
 
172
175
  tags = {
173
176
  client: event.payload.fetch(:client_id),
@@ -184,6 +187,10 @@ module Kafka
184
187
  end
185
188
 
186
189
  gauge("consumer.offset", offset, tags: tags)
190
+
191
+ if time_lag
192
+ gauge("consumer.time_lag", time_lag, tags: tags)
193
+ end
187
194
  end
188
195
 
189
196
  def fetch_batch(event)
@@ -48,7 +48,7 @@ module Kafka
48
48
  partition: @fetched_partition.partition
49
49
  )
50
50
  end
51
- end
51
+ end.compact
52
52
  end
53
53
  FetchedBatch.new(
54
54
  topic: @topic,
@@ -4,7 +4,7 @@ require "kafka/fetch_operation"
4
4
 
5
5
  module Kafka
6
6
  class Fetcher
7
- attr_reader :queue
7
+ attr_reader :queue, :max_wait_time
8
8
 
9
9
  def initialize(cluster:, logger:, instrumenter:, max_queue_size:, group:)
10
10
  @cluster = cluster
@@ -17,6 +17,9 @@ module Kafka
17
17
  @commands = Queue.new
18
18
  @next_offsets = Hash.new { |h, k| h[k] = {} }
19
19
 
20
+ # We are only running when someone calls start.
21
+ @running = false
22
+
20
23
  # Long poll until at least this many bytes can be fetched.
21
24
  @min_bytes = 1
22
25
 
@@ -110,7 +113,7 @@ module Kafka
110
113
  elsif @queue.size < @max_queue_size
111
114
  step
112
115
  else
113
- @logger.warn "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
116
+ @logger.info "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
114
117
  sleep 1
115
118
  end
116
119
  ensure
@@ -50,9 +50,20 @@ module Kafka
50
50
  # @param offset [Integer] the offset of the message that should be marked as processed.
51
51
  # @return [nil]
52
52
  def mark_as_processed(topic, partition, offset)
53
- @uncommitted_offsets += 1
53
+ unless @group.assigned_to?(topic, partition)
54
+ @logger.debug "Not marking #{topic}/#{partition}:#{offset} as processed for partition not assigned to this consumer."
55
+ return
56
+ end
54
57
  @processed_offsets[topic] ||= {}
55
58
 
59
+ last_processed_offset = @processed_offsets[topic][partition] || -1
60
+ if last_processed_offset > offset + 1
61
+ @logger.debug "Not overwriting newer offset #{topic}/#{partition}:#{last_processed_offset - 1} with older #{offset}"
62
+ return
63
+ end
64
+
65
+ @uncommitted_offsets += 1
66
+
56
67
  # The committed offset should always be the offset of the next message that the
57
68
  # application will read, thus adding one to the last message processed.
58
69
  @processed_offsets[topic][partition] = offset + 1
@@ -188,11 +188,14 @@ module Kafka
188
188
  # @raise [BufferOverflow] if the maximum buffer size has been reached.
189
189
  # @return [nil]
190
190
  def produce(value, key: nil, headers: {}, topic:, partition: nil, partition_key: nil, create_time: Time.now)
191
+ # We want to fail fast if `topic` isn't a String
192
+ topic = topic.to_str
193
+
191
194
  message = PendingMessage.new(
192
195
  value: value && value.to_s,
193
196
  key: key && key.to_s,
194
197
  headers: headers,
195
- topic: topic.to_s,
198
+ topic: topic,
196
199
  partition: partition && Integer(partition),
197
200
  partition_key: partition_key && partition_key.to_s,
198
201
  create_time: create_time
@@ -0,0 +1,316 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # Subscriber to ruby_kafka to report metrics to prometheus
5
+ #
6
+ # Usage:
7
+ # require "kafka/prometheus"
8
+ #
9
+ # Once the file has been required, no further configuration is needed, all operational
10
+ # metrics are automatically emitted (Unless PROMETHEUS_NO_AUTO_START is set).
11
+ #
12
+ # By Peter Mustel, T2 Data AB
13
+ #
14
+ begin
15
+ require 'prometheus/client'
16
+ rescue LoadError
17
+ warn 'In order to report Kafka client metrics to Prometheus you need to install the `prometheus-client` gem.'
18
+ raise
19
+ end
20
+
21
+ require 'active_support/subscriber'
22
+
23
+ module Kafka
24
+ module Prometheus
25
+ SIZE_BUCKETS = [1, 10, 100, 1000, 10_000, 100_000, 1_000_000].freeze
26
+ LATENCY_BUCKETS = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000].freeze
27
+ DELAY_BUCKETS = [1, 3, 10, 30, 100, 300, 1000, 3000, 10_000, 30_000].freeze
28
+
29
+ class << self
30
+ attr_accessor :registry
31
+
32
+ def start(registry = ::Prometheus::Client.registry)
33
+ @registry = registry
34
+ ConnectionSubscriber.attach_to 'connection.kafka'
35
+ ConsumerSubscriber.attach_to 'consumer.kafka'
36
+ ProducerSubscriber.attach_to 'producer.kafka'
37
+ AsyncProducerSubscriber.attach_to 'async_producer.kafka'
38
+ FetcherSubscriber.attach_to 'fetcher.kafka'
39
+ end
40
+ end
41
+
42
+ class ConnectionSubscriber < ActiveSupport::Subscriber
43
+ def initialize
44
+ super
45
+ @api_calls = Prometheus.registry.counter(:api_calls, docstring: 'Total calls', labels: [:client, :api, :broker])
46
+ @api_latency = Prometheus.registry.histogram(:api_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :api, :broker])
47
+ @api_request_size = Prometheus.registry.histogram(:api_request_size, docstring: 'Request size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
48
+ @api_response_size = Prometheus.registry.histogram(:api_response_size, docstring: 'Response size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
49
+ @api_errors = Prometheus.registry.counter(:api_errors, docstring: 'Errors', labels: [:client, :api, :broker])
50
+ end
51
+
52
+ def request(event)
53
+ key = {
54
+ client: event.payload.fetch(:client_id),
55
+ api: event.payload.fetch(:api, 'unknown'),
56
+ broker: event.payload.fetch(:broker_host)
57
+ }
58
+ request_size = event.payload.fetch(:request_size, 0)
59
+ response_size = event.payload.fetch(:response_size, 0)
60
+
61
+ @api_calls.increment(labels: key)
62
+ @api_latency.observe(event.duration, labels: key)
63
+ @api_request_size.observe(request_size, labels: key)
64
+ @api_response_size.observe(response_size, labels: key)
65
+ @api_errors.increment(labels: key) if event.payload.key?(:exception)
66
+ end
67
+ end
68
+
69
+ class ConsumerSubscriber < ActiveSupport::Subscriber
70
+ def initialize
71
+ super
72
+ @process_messages = Prometheus.registry.counter(:consumer_process_messages, docstring: 'Total messages', labels: [:client, :group_id, :topic, :partition])
73
+ @process_message_errors = Prometheus.registry.counter(:consumer_process_message_errors, docstring: 'Total errors', labels: [:client, :group_id, :topic, :partition])
74
+ @process_message_latency =
75
+ Prometheus.registry.histogram(:consumer_process_message_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
76
+ @offset_lag = Prometheus.registry.gauge(:consumer_offset_lag, docstring: 'Offset lag', labels: [:client, :group_id, :topic, :partition])
77
+ @time_lag = Prometheus.registry.gauge(:consumer_time_lag, docstring: 'Time lag of message', labels: [:client, :group_id, :topic, :partition])
78
+ @process_batch_errors = Prometheus.registry.counter(:consumer_process_batch_errors, docstring: 'Total errors in batch', labels: [:client, :group_id, :topic, :partition])
79
+ @process_batch_latency =
80
+ Prometheus.registry.histogram(:consumer_process_batch_latency, docstring: 'Latency in batch', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
81
+ @batch_size = Prometheus.registry.histogram(:consumer_batch_size, docstring: 'Size of batch', buckets: SIZE_BUCKETS, labels: [:client, :group_id, :topic, :partition])
82
+ @join_group = Prometheus.registry.histogram(:consumer_join_group, docstring: 'Time to join group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
83
+ @join_group_errors = Prometheus.registry.counter(:consumer_join_group_errors, docstring: 'Total error in joining group', labels: [:client, :group_id])
84
+ @sync_group = Prometheus.registry.histogram(:consumer_sync_group, docstring: 'Time to sync group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
85
+ @sync_group_errors = Prometheus.registry.counter(:consumer_sync_group_errors, docstring: 'Total error in syncing group', labels: [:client, :group_id])
86
+ @leave_group = Prometheus.registry.histogram(:consumer_leave_group, docstring: 'Time to leave group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
87
+ @leave_group_errors = Prometheus.registry.counter(:consumer_leave_group_errors, docstring: 'Total error in leaving group', labels: [:client, :group_id])
88
+ @pause_duration = Prometheus.registry.gauge(:consumer_pause_duration, docstring: 'Pause duration', labels: [:client, :group_id, :topic, :partition])
89
+ end
90
+
91
+ def process_message(event)
92
+ key = {
93
+ client: event.payload.fetch(:client_id),
94
+ group_id: event.payload.fetch(:group_id),
95
+ topic: event.payload.fetch(:topic),
96
+ partition: event.payload.fetch(:partition)
97
+ }
98
+
99
+ offset_lag = event.payload.fetch(:offset_lag)
100
+ create_time = event.payload.fetch(:create_time)
101
+
102
+ time_lag = create_time && ((Time.now - create_time) * 1000).to_i
103
+
104
+ if event.payload.key?(:exception)
105
+ @process_message_errors.increment(labels: key)
106
+ else
107
+ @process_message_latency.observe(event.duration, labels: key)
108
+ @process_messages.increment(labels: key)
109
+ end
110
+
111
+ @offset_lag.set(offset_lag, labels: key)
112
+
113
+ # Not all messages have timestamps.
114
+ return unless time_lag
115
+
116
+ @time_lag.set(time_lag, labels: key)
117
+ end
118
+
119
+ def process_batch(event)
120
+ key = {
121
+ client: event.payload.fetch(:client_id),
122
+ group_id: event.payload.fetch(:group_id),
123
+ topic: event.payload.fetch(:topic),
124
+ partition: event.payload.fetch(:partition)
125
+ }
126
+ message_count = event.payload.fetch(:message_count)
127
+
128
+ if event.payload.key?(:exception)
129
+ @process_batch_errors.increment(labels: key)
130
+ else
131
+ @process_batch_latency.observe(event.duration, labels: key)
132
+ @process_messages.increment(by: message_count, labels: key)
133
+ end
134
+ end
135
+
136
+ def fetch_batch(event)
137
+ key = {
138
+ client: event.payload.fetch(:client_id),
139
+ group_id: event.payload.fetch(:group_id),
140
+ topic: event.payload.fetch(:topic),
141
+ partition: event.payload.fetch(:partition)
142
+ }
143
+ offset_lag = event.payload.fetch(:offset_lag)
144
+ batch_size = event.payload.fetch(:message_count)
145
+
146
+ @batch_size.observe(batch_size, labels: key)
147
+ @offset_lag.set(offset_lag, labels: key)
148
+ end
149
+
150
+ def join_group(event)
151
+ key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
152
+ @join_group.observe(event.duration, labels: key)
153
+
154
+ @join_group_errors.increment(labels: key) if event.payload.key?(:exception)
155
+ end
156
+
157
+ def sync_group(event)
158
+ key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
159
+ @sync_group.observe(event.duration, labels: key)
160
+
161
+ @sync_group_errors.increment(labels: key) if event.payload.key?(:exception)
162
+ end
163
+
164
+ def leave_group(event)
165
+ key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
166
+ @leave_group.observe(event.duration, labels: key)
167
+
168
+ @leave_group_errors.increment(labels: key) if event.payload.key?(:exception)
169
+ end
170
+
171
+ def pause_status(event)
172
+ key = {
173
+ client: event.payload.fetch(:client_id),
174
+ group_id: event.payload.fetch(:group_id),
175
+ topic: event.payload.fetch(:topic),
176
+ partition: event.payload.fetch(:partition)
177
+ }
178
+
179
+ duration = event.payload.fetch(:duration)
180
+ @pause_duration.set(duration, labels: key)
181
+ end
182
+ end
183
+
184
+ class ProducerSubscriber < ActiveSupport::Subscriber
185
+ def initialize
186
+ super
187
+ @produce_messages = Prometheus.registry.counter(:producer_produced_messages, docstring: 'Produced messages total', labels: [:client, :topic])
188
+ @produce_message_size =
189
+ Prometheus.registry.histogram(:producer_message_size, docstring: 'Message size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
190
+ @buffer_size = Prometheus.registry.histogram(:producer_buffer_size, docstring: 'Buffer size', buckets: SIZE_BUCKETS, labels: [:client])
191
+ @buffer_fill_ratio = Prometheus.registry.histogram(:producer_buffer_fill_ratio, docstring: 'Buffer fill ratio', labels: [:client])
192
+ @buffer_fill_percentage = Prometheus.registry.histogram(:producer_buffer_fill_percentage, docstring: 'Buffer fill percentage', labels: [:client])
193
+ @produce_errors = Prometheus.registry.counter(:producer_produce_errors, docstring: 'Produce errors', labels: [:client, :topic])
194
+ @deliver_errors = Prometheus.registry.counter(:producer_deliver_errors, docstring: 'Deliver error', labels: [:client])
195
+ @deliver_latency =
196
+ Prometheus.registry.histogram(:producer_deliver_latency, docstring: 'Delivery latency', buckets: LATENCY_BUCKETS, labels: [:client])
197
+ @deliver_messages = Prometheus.registry.counter(:producer_deliver_messages, docstring: 'Total count of delivered messages', labels: [:client])
198
+ @deliver_attempts = Prometheus.registry.histogram(:producer_deliver_attempts, docstring: 'Delivery attempts', labels: [:client])
199
+ @ack_messages = Prometheus.registry.counter(:producer_ack_messages, docstring: 'Ack', labels: [:client, :topic])
200
+ @ack_delay = Prometheus.registry.histogram(:producer_ack_delay, docstring: 'Ack delay', buckets: LATENCY_BUCKETS, labels: [:client, :topic])
201
+ @ack_errors = Prometheus.registry.counter(:producer_ack_errors, docstring: 'Ack errors', labels: [:client, :topic])
202
+ end
203
+
204
+ def produce_message(event)
205
+ client = event.payload.fetch(:client_id)
206
+ key = { client: client, topic: event.payload.fetch(:topic) }
207
+
208
+ message_size = event.payload.fetch(:message_size)
209
+ buffer_size = event.payload.fetch(:buffer_size)
210
+ max_buffer_size = event.payload.fetch(:max_buffer_size)
211
+ buffer_fill_ratio = buffer_size.to_f / max_buffer_size.to_f
212
+ buffer_fill_percentage = buffer_fill_ratio * 100.0
213
+
214
+ # This gets us the write rate.
215
+ @produce_messages.increment(labels: key)
216
+ @produce_message_size.observe(message_size, labels: key)
217
+
218
+ # This gets us the avg/max buffer size per producer.
219
+ @buffer_size.observe(buffer_size, labels: { client: client })
220
+
221
+ # This gets us the avg/max buffer fill ratio per producer.
222
+ @buffer_fill_ratio.observe(buffer_fill_ratio, labels: { client: client })
223
+ @buffer_fill_percentage.observe(buffer_fill_percentage, labels: { client: client })
224
+ end
225
+
226
+ def buffer_overflow(event)
227
+ key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
228
+ @produce_errors.increment(labels: key)
229
+ end
230
+
231
+ def deliver_messages(event)
232
+ key = { client: event.payload.fetch(:client_id) }
233
+ message_count = event.payload.fetch(:delivered_message_count)
234
+ attempts = event.payload.fetch(:attempts)
235
+
236
+ @deliver_errors.increment(labels: key) if event.payload.key?(:exception)
237
+ @deliver_latency.observe(event.duration, labels: key)
238
+
239
+ # Messages delivered to Kafka:
240
+ @deliver_messages.increment(by: message_count, labels: key)
241
+
242
+ # Number of attempts to deliver messages:
243
+ @deliver_attempts.observe(attempts, labels: key)
244
+ end
245
+
246
+ def ack_message(event)
247
+ key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
248
+
249
+ # Number of messages ACK'd for the topic.
250
+ @ack_messages.increment(labels: key)
251
+
252
+ # Histogram of delay between a message being produced and it being ACK'd.
253
+ @ack_delay.observe(event.payload.fetch(:delay), labels: key)
254
+ end
255
+
256
+ def topic_error(event)
257
+ key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
258
+
259
+ @ack_errors.increment(labels: key)
260
+ end
261
+ end
262
+
263
+ class AsyncProducerSubscriber < ActiveSupport::Subscriber
264
+ def initialize
265
+ super
266
+ @queue_size = Prometheus.registry.histogram(:async_producer_queue_size, docstring: 'Queue size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
267
+ @queue_fill_ratio = Prometheus.registry.histogram(:async_producer_queue_fill_ratio, docstring: 'Queue fill ratio', labels: [:client, :topic])
268
+ @produce_errors = Prometheus.registry.counter(:async_producer_produce_errors, docstring: 'Producer errors', labels: [:client, :topic])
269
+ @dropped_messages = Prometheus.registry.counter(:async_producer_dropped_messages, docstring: 'Dropped messages', labels: [:client])
270
+ end
271
+
272
+ def enqueue_message(event)
273
+ key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
274
+
275
+ queue_size = event.payload.fetch(:queue_size)
276
+ max_queue_size = event.payload.fetch(:max_queue_size)
277
+ queue_fill_ratio = queue_size.to_f / max_queue_size.to_f
278
+
279
+ # This gets us the avg/max queue size per producer.
280
+ @queue_size.observe(queue_size, labels: key)
281
+
282
+ # This gets us the avg/max queue fill ratio per producer.
283
+ @queue_fill_ratio.observe(queue_fill_ratio, labels: key)
284
+ end
285
+
286
+ def buffer_overflow(event)
287
+ key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
288
+ @produce_errors.increment(labels: key)
289
+ end
290
+
291
+ def drop_messages(event)
292
+ key = { client: event.payload.fetch(:client_id) }
293
+ message_count = event.payload.fetch(:message_count)
294
+ @dropped_messages.increment(by: message_count, labels: key)
295
+ end
296
+ end
297
+
298
+ class FetcherSubscriber < ActiveSupport::Subscriber
299
+ def initialize
300
+ super
301
+ @queue_size = Prometheus.registry.gauge(:fetcher_queue_size, docstring: 'Queue size', labels: [:client, :group_id])
302
+ end
303
+
304
+ def loop(event)
305
+ queue_size = event.payload.fetch(:queue_size)
306
+ client = event.payload.fetch(:client_id)
307
+ group_id = event.payload.fetch(:group_id)
308
+
309
+ @queue_size.set(queue_size, labels: { client: client, group_id: group_id })
310
+ end
311
+ end
312
+ end
313
+ end
314
+
315
+ # To enable testability, it is possible to skip the start until test time
316
+ Kafka::Prometheus.start unless defined?(PROMETHEUS_NO_AUTO_START)