ruby-kafka 0.7.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +102 -3
  3. data/.github/workflows/stale.yml +19 -0
  4. data/CHANGELOG.md +24 -0
  5. data/README.md +18 -0
  6. data/lib/kafka/async_producer.rb +3 -0
  7. data/lib/kafka/broker.rb +12 -0
  8. data/lib/kafka/client.rb +35 -3
  9. data/lib/kafka/cluster.rb +52 -0
  10. data/lib/kafka/compression.rb +13 -11
  11. data/lib/kafka/compressor.rb +1 -0
  12. data/lib/kafka/connection.rb +3 -0
  13. data/lib/kafka/consumer_group.rb +4 -1
  14. data/lib/kafka/datadog.rb +2 -10
  15. data/lib/kafka/fetched_batch.rb +5 -1
  16. data/lib/kafka/fetched_batch_generator.rb +4 -1
  17. data/lib/kafka/fetched_message.rb +1 -0
  18. data/lib/kafka/fetcher.rb +4 -1
  19. data/lib/kafka/gzip_codec.rb +4 -0
  20. data/lib/kafka/lz4_codec.rb +4 -0
  21. data/lib/kafka/producer.rb +20 -1
  22. data/lib/kafka/prometheus.rb +316 -0
  23. data/lib/kafka/protocol.rb +8 -0
  24. data/lib/kafka/protocol/add_offsets_to_txn_request.rb +29 -0
  25. data/lib/kafka/protocol/add_offsets_to_txn_response.rb +19 -0
  26. data/lib/kafka/protocol/join_group_request.rb +8 -2
  27. data/lib/kafka/protocol/offset_fetch_request.rb +3 -1
  28. data/lib/kafka/protocol/produce_request.rb +3 -1
  29. data/lib/kafka/protocol/record_batch.rb +5 -4
  30. data/lib/kafka/protocol/txn_offset_commit_request.rb +46 -0
  31. data/lib/kafka/protocol/txn_offset_commit_response.rb +18 -0
  32. data/lib/kafka/sasl/scram.rb +15 -12
  33. data/lib/kafka/snappy_codec.rb +4 -0
  34. data/lib/kafka/ssl_context.rb +4 -1
  35. data/lib/kafka/ssl_socket_with_timeout.rb +1 -0
  36. data/lib/kafka/tagged_logger.rb +25 -20
  37. data/lib/kafka/transaction_manager.rb +25 -0
  38. data/lib/kafka/version.rb +1 -1
  39. data/lib/kafka/zstd_codec.rb +27 -0
  40. data/ruby-kafka.gemspec +4 -2
  41. metadata +47 -6

data/lib/kafka/compressor.rb
@@ -18,6 +18,7 @@ module Kafka
   # * `compressed_bytesize` – the byte size of the compressed data.
   #
   class Compressor
+    attr_reader :codec
 
     # @param codec_name [Symbol, nil]
     # @param threshold [Integer] the minimum number of messages in a message set

data/lib/kafka/connection.rb
@@ -58,6 +58,9 @@ module Kafka
       @connect_timeout = connect_timeout || CONNECT_TIMEOUT
       @socket_timeout = socket_timeout || SOCKET_TIMEOUT
       @ssl_context = ssl_context
+
+      @socket = nil
+      @last_request = nil
     end
 
     def to_s

data/lib/kafka/consumer_group.rb
@@ -7,11 +7,12 @@ module Kafka
   class ConsumerGroup
     attr_reader :assigned_partitions, :generation_id, :group_id
 
-    def initialize(cluster:, logger:, group_id:, session_timeout:, retention_time:, instrumenter:)
+    def initialize(cluster:, logger:, group_id:, session_timeout:, rebalance_timeout:, retention_time:, instrumenter:)
       @cluster = cluster
       @logger = TaggedLogger.new(logger)
       @group_id = group_id
       @session_timeout = session_timeout
+      @rebalance_timeout = rebalance_timeout
       @instrumenter = instrumenter
       @member_id = ""
       @generation_id = nil
@@ -140,7 +141,9 @@ module Kafka
       response = coordinator.join_group(
         group_id: @group_id,
         session_timeout: @session_timeout,
+        rebalance_timeout: @rebalance_timeout,
         member_id: @member_id,
+        topics: @topics,
       )
 
       Protocol.handle_error(response.error_code)
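
The consumer group now reports a dedicated rebalance timeout to the group coordinator, separate from the session timeout. A minimal sketch of setting it from application code, assuming `Kafka::Client#consumer` forwards the new option (the `client.rb` changes in this release suggest it does); the broker address, client id and group id are placeholders:

  require "kafka"

  kafka = Kafka.new(["kafka1:9092"], client_id: "my-app")

  consumer = kafka.consumer(
    group_id: "my-group",
    session_timeout: 30,    # seconds of missed heartbeats before the member is evicted
    rebalance_timeout: 60   # seconds a member may take to rejoin during a rebalance
  )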

data/lib/kafka/datadog.rb
@@ -40,7 +40,7 @@ module Kafka
       end
 
       def host
-        @host ||= default_host
+        @host
       end
 
       def host=(host)
@@ -49,7 +49,7 @@ module Kafka
       end
 
       def port
-        @port ||= default_port
+        @port
       end
 
       def port=(port)
@@ -77,14 +77,6 @@ module Kafka
 
       private
 
-      def default_host
-        ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_HOST : ::Datadog::Statsd::DEFAULT_HOST
-      end
-
-      def default_port
-        ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_PORT : ::Datadog::Statsd::DEFAULT_PORT
-      end
-
       def clear
         @statsd && @statsd.close
         @statsd = nil
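
With the `default_host`/`default_port` helpers removed, `host` and `port` simply return whatever was configured (or `nil`), leaving the defaults to the dogstatsd client itself. A minimal sketch of explicit configuration, where the agent address below is a placeholder:

  require "kafka/datadog"

  # Only needed when the agent is not on the statsd client's default address.
  Kafka::Datadog.host = "127.0.0.1"
  Kafka::Datadog.port = 8125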

data/lib/kafka/fetched_batch.rb
@@ -13,18 +13,22 @@ module Kafka
     # @return [Integer]
     attr_reader :last_offset
 
+    # @return [Integer]
+    attr_reader :leader_epoch
+
     # @return [Integer] the offset of the most recent message in the partition.
     attr_reader :highwater_mark_offset
 
     # @return [Array<Kafka::FetchedMessage>]
     attr_accessor :messages
 
-    def initialize(topic:, partition:, highwater_mark_offset:, messages:, last_offset: nil)
+    def initialize(topic:, partition:, highwater_mark_offset:, messages:, last_offset: nil, leader_epoch: nil)
       @topic = topic
       @partition = partition
       @highwater_mark_offset = highwater_mark_offset
       @messages = messages
       @last_offset = last_offset
+      @leader_epoch = leader_epoch
     end
 
     def empty?

data/lib/kafka/fetched_batch_generator.rb
@@ -48,7 +48,7 @@ module Kafka
              partition: @fetched_partition.partition
            )
          end
-        end
+        end.compact
       end
       FetchedBatch.new(
         topic: @topic,
@@ -62,11 +62,13 @@ module Kafka
     def extract_records
       records = []
       last_offset = nil
+      leader_epoch = nil
       aborted_transactions = @fetched_partition.aborted_transactions.sort_by(&:first_offset)
       aborted_producer_ids = {}
 
       @fetched_partition.messages.each do |record_batch|
         last_offset = record_batch.last_offset if last_offset.nil? || last_offset < record_batch.last_offset
+        leader_epoch = record_batch.partition_leader_epoch if leader_epoch.nil? || leader_epoch < record_batch.partition_leader_epoch
         # Find the list of aborted producer IDs less than current offset
         unless aborted_transactions.empty?
           if aborted_transactions.first.first_offset <= record_batch.last_offset
@@ -99,6 +101,7 @@ module Kafka
         topic: @topic,
         partition: @fetched_partition.partition,
         last_offset: last_offset,
+        leader_epoch: leader_epoch,
         highwater_mark_offset: @fetched_partition.highwater_mark_offset,
         messages: records
       )

data/lib/kafka/fetched_message.rb
@@ -43,5 +43,6 @@ module Kafka
     def is_control_record
       @message.is_control_record
     end
+
   end
 end

data/lib/kafka/fetcher.rb
@@ -17,6 +17,9 @@ module Kafka
       @commands = Queue.new
       @next_offsets = Hash.new { |h, k| h[k] = {} }
 
+      # We are only running when someone calls start.
+      @running = false
+
       # Long poll until at least this many bytes can be fetched.
       @min_bytes = 1
 
@@ -110,7 +113,7 @@ module Kafka
       elsif @queue.size < @max_queue_size
         step
       else
-        @logger.warn "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
+        @logger.info "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
         sleep 1
       end
     ensure

data/lib/kafka/gzip_codec.rb
@@ -6,6 +6,10 @@ module Kafka
       1
     end
 
+    def produce_api_min_version
+      0
+    end
+
     def load
       require "zlib"
     end

data/lib/kafka/lz4_codec.rb
@@ -6,6 +6,10 @@ module Kafka
       3
     end
 
+    def produce_api_min_version
+      0
+    end
+
     def load
       require "extlz4"
     rescue LoadError

data/lib/kafka/producer.rb
@@ -68,6 +68,8 @@ module Kafka
   #
   # * `:snappy` for [Snappy](http://google.github.io/snappy/) compression.
   # * `:gzip` for [gzip](https://en.wikipedia.org/wiki/Gzip) compression.
+  # * `:lz4` for [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression.
+  # * `:zstd` for [zstd](https://facebook.github.io/zstd/) compression.
   #
   # By default, all message sets will be compressed if you specify a compression
   # codec. To increase the compression threshold, set `compression_threshold` to
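
The producer documentation now lists all four supported codecs. A minimal sketch of opting into one of them, assuming the matching compression gem (e.g. a zstd binding for `:zstd`, `extlz4` for `:lz4`) is installed; the broker address and topic are placeholders:

  require "kafka"

  kafka = Kafka.new(["kafka1:9092"], client_id: "my-app")

  producer = kafka.producer(
    compression_codec: :zstd,    # or :gzip, :snappy, :lz4
    compression_threshold: 10    # only compress message sets with 10+ messages
  )

  producer.produce("hello", topic: "greetings")
  producer.deliver_messages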

@@ -186,11 +188,14 @@ module Kafka
     # @raise [BufferOverflow] if the maximum buffer size has been reached.
     # @return [nil]
     def produce(value, key: nil, headers: {}, topic:, partition: nil, partition_key: nil, create_time: Time.now)
+      # We want to fail fast if `topic` isn't a String
+      topic = topic.to_str
+
       message = PendingMessage.new(
         value: value && value.to_s,
         key: key && key.to_s,
         headers: headers,
-        topic: topic.to_s,
+        topic: topic,
         partition: partition && Integer(partition),
         partition_key: partition_key && partition_key.to_s,
         create_time: create_time
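
Because `#to_str` is only implemented by String-like objects, this change turns a silently coerced topic into an immediate error. Illustrative only:

  producer.produce("hello", topic: "greetings")  # fine
  producer.produce("hello", topic: :greetings)   # raises NoMethodError, since Symbol has no #to_str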

@@ -328,6 +333,20 @@ module Kafka
       @transaction_manager.abort_transaction
     end
 
+    # Sends batch last offset to the consumer group coordinator, and also marks
+    # this offset as part of the current transaction. This offset will be considered
+    # committed only if the transaction is committed successfully.
+    #
+    # This method should be used when you need to batch consumed and produced messages
+    # together, typically in a consume-transform-produce pattern. Thus, the specified
+    # group_id should be the same as config parameter group_id of the used
+    # consumer.
+    #
+    # @return [nil]
+    def send_offsets_to_transaction(batch:, group_id:)
+      @transaction_manager.send_offsets_to_txn(offsets: { batch.topic => { batch.partition => { offset: batch.last_offset + 1, leader_epoch: batch.leader_epoch } } }, group_id: group_id)
+    end
+
     # Syntactic sugar to enable easier transaction usage. Do the following steps
     #
     # - Start the transaction (with Producer#begin_transaction)
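
A minimal consume-transform-produce sketch using the new method. All names (broker address, topics, ids) are placeholders, and the surrounding consumer/producer options are assumptions based on the existing transactional producer API rather than part of this change:

  require "kafka"

  kafka = Kafka.new(["kafka1:9092"], client_id: "copier")

  consumer = kafka.consumer(group_id: "copier-group")
  consumer.subscribe("source-topic")

  producer = kafka.producer(transactional: true, transactional_id: "copier-1")
  producer.init_transactions

  consumer.each_batch(automatically_mark_as_processed: false) do |batch|
    producer.transaction do
      batch.messages.each do |message|
        producer.produce(message.value.upcase, topic: "target-topic")
      end
      producer.deliver_messages

      # Commit the consumed offsets as part of the same transaction:
      producer.send_offsets_to_transaction(batch: batch, group_id: "copier-group")
    end
  end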

data/lib/kafka/prometheus.rb
@@ -0,0 +1,316 @@
+# frozen_string_literal: true
+
+#
+# Subscriber to ruby_kafka to report metrics to prometheus
+#
+# Usage:
+#   require "kafka/prometheus"
+#
+# Once the file has been required, no further configuration is needed, all operational
+# metrics are automatically emitted (Unless PROMETHEUS_NO_AUTO_START is set).
+#
+# By Peter Mustel, T2 Data AB
+#
+begin
+  require 'prometheus/client'
+rescue LoadError
+  warn 'In order to report Kafka client metrics to Prometheus you need to install the `prometheus-client` gem.'
+  raise
+end
+
+require 'active_support/subscriber'
+
+module Kafka
+  module Prometheus
+    SIZE_BUCKETS = [1, 10, 100, 1000, 10_000, 100_000, 1_000_000].freeze
+    LATENCY_BUCKETS = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000].freeze
+    DELAY_BUCKETS = [1, 3, 10, 30, 100, 300, 1000, 3000, 10_000, 30_000].freeze
+
+    class << self
+      attr_accessor :registry
+
+      def start(registry = ::Prometheus::Client.registry)
+        @registry = registry
+        ConnectionSubscriber.attach_to 'connection.kafka'
+        ConsumerSubscriber.attach_to 'consumer.kafka'
+        ProducerSubscriber.attach_to 'producer.kafka'
+        AsyncProducerSubscriber.attach_to 'async_producer.kafka'
+        FetcherSubscriber.attach_to 'fetcher.kafka'
+      end
+    end
+
+    class ConnectionSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @api_calls = Prometheus.registry.counter(:api_calls, docstring: 'Total calls', labels: [:client, :api, :broker])
+        @api_latency = Prometheus.registry.histogram(:api_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :api, :broker])
+        @api_request_size = Prometheus.registry.histogram(:api_request_size, docstring: 'Request size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_response_size = Prometheus.registry.histogram(:api_response_size, docstring: 'Response size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_errors = Prometheus.registry.counter(:api_errors, docstring: 'Errors', labels: [:client, :api, :broker])
+      end
+
+      def request(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          api: event.payload.fetch(:api, 'unknown'),
+          broker: event.payload.fetch(:broker_host)
+        }
+        request_size = event.payload.fetch(:request_size, 0)
+        response_size = event.payload.fetch(:response_size, 0)
+
+        @api_calls.increment(labels: key)
+        @api_latency.observe(event.duration, labels: key)
+        @api_request_size.observe(request_size, labels: key)
+        @api_response_size.observe(response_size, labels: key)
+        @api_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+    end
+
+    class ConsumerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @process_messages = Prometheus.registry.counter(:consumer_process_messages, docstring: 'Total messages', labels: [:client, :group_id, :topic, :partition])
+        @process_message_errors = Prometheus.registry.counter(:consumer_process_message_errors, docstring: 'Total errors', labels: [:client, :group_id, :topic, :partition])
+        @process_message_latency =
+          Prometheus.registry.histogram(:consumer_process_message_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @offset_lag = Prometheus.registry.gauge(:consumer_offset_lag, docstring: 'Offset lag', labels: [:client, :group_id, :topic, :partition])
+        @time_lag = Prometheus.registry.gauge(:consumer_time_lag, docstring: 'Time lag of message', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_errors = Prometheus.registry.counter(:consumer_process_batch_errors, docstring: 'Total errors in batch', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_latency =
+          Prometheus.registry.histogram(:consumer_process_batch_latency, docstring: 'Latency in batch', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @batch_size = Prometheus.registry.histogram(:consumer_batch_size, docstring: 'Size of batch', buckets: SIZE_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @join_group = Prometheus.registry.histogram(:consumer_join_group, docstring: 'Time to join group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @join_group_errors = Prometheus.registry.counter(:consumer_join_group_errors, docstring: 'Total error in joining group', labels: [:client, :group_id])
+        @sync_group = Prometheus.registry.histogram(:consumer_sync_group, docstring: 'Time to sync group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @sync_group_errors = Prometheus.registry.counter(:consumer_sync_group_errors, docstring: 'Total error in syncing group', labels: [:client, :group_id])
+        @leave_group = Prometheus.registry.histogram(:consumer_leave_group, docstring: 'Time to leave group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @leave_group_errors = Prometheus.registry.counter(:consumer_leave_group_errors, docstring: 'Total error in leaving group', labels: [:client, :group_id])
+        @pause_duration = Prometheus.registry.gauge(:consumer_pause_duration, docstring: 'Pause duration', labels: [:client, :group_id, :topic, :partition])
+      end
+
+      def process_message(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        offset_lag = event.payload.fetch(:offset_lag)
+        create_time = event.payload.fetch(:create_time)
+
+        time_lag = create_time && ((Time.now - create_time) * 1000).to_i
+
+        if event.payload.key?(:exception)
+          @process_message_errors.increment(labels: key)
+        else
+          @process_message_latency.observe(event.duration, labels: key)
+          @process_messages.increment(labels: key)
+        end
+
+        @offset_lag.set(offset_lag, labels: key)
+
+        # Not all messages have timestamps.
+        return unless time_lag
+
+        @time_lag.set(time_lag, labels: key)
+      end
+
+      def process_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        message_count = event.payload.fetch(:message_count)
+
+        if event.payload.key?(:exception)
+          @process_batch_errors.increment(labels: key)
+        else
+          @process_batch_latency.observe(event.duration, labels: key)
+          @process_messages.increment(by: message_count, labels: key)
+        end
+      end
+
+      def fetch_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        offset_lag = event.payload.fetch(:offset_lag)
+        batch_size = event.payload.fetch(:message_count)
+
+        @batch_size.observe(batch_size, labels: key)
+        @offset_lag.set(offset_lag, labels: key)
+      end
+
+      def join_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @join_group.observe(event.duration, labels: key)
+
+        @join_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def sync_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @sync_group.observe(event.duration, labels: key)
+
+        @sync_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def leave_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @leave_group.observe(event.duration, labels: key)
+
+        @leave_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def pause_status(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        duration = event.payload.fetch(:duration)
+        @pause_duration.set(duration, labels: key)
+      end
+    end
+
+    class ProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @produce_messages = Prometheus.registry.counter(:producer_produced_messages, docstring: 'Produced messages total', labels: [:client, :topic])
+        @produce_message_size =
+          Prometheus.registry.histogram(:producer_message_size, docstring: 'Message size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @buffer_size = Prometheus.registry.histogram(:producer_buffer_size, docstring: 'Buffer size', buckets: SIZE_BUCKETS, labels: [:client])
+        @buffer_fill_ratio = Prometheus.registry.histogram(:producer_buffer_fill_ratio, docstring: 'Buffer fill ratio', labels: [:client])
+        @buffer_fill_percentage = Prometheus.registry.histogram(:producer_buffer_fill_percentage, docstring: 'Buffer fill percentage', labels: [:client])
+        @produce_errors = Prometheus.registry.counter(:producer_produce_errors, docstring: 'Produce errors', labels: [:client, :topic])
+        @deliver_errors = Prometheus.registry.counter(:producer_deliver_errors, docstring: 'Deliver error', labels: [:client])
+        @deliver_latency =
+          Prometheus.registry.histogram(:producer_deliver_latency, docstring: 'Delivery latency', buckets: LATENCY_BUCKETS, labels: [:client])
+        @deliver_messages = Prometheus.registry.counter(:producer_deliver_messages, docstring: 'Total count of delivered messages', labels: [:client])
+        @deliver_attempts = Prometheus.registry.histogram(:producer_deliver_attempts, docstring: 'Delivery attempts', labels: [:client])
+        @ack_messages = Prometheus.registry.counter(:producer_ack_messages, docstring: 'Ack', labels: [:client, :topic])
+        @ack_delay = Prometheus.registry.histogram(:producer_ack_delay, docstring: 'Ack delay', buckets: LATENCY_BUCKETS, labels: [:client, :topic])
+        @ack_errors = Prometheus.registry.counter(:producer_ack_errors, docstring: 'Ack errors', labels: [:client, :topic])
+      end
+
+      def produce_message(event)
+        client = event.payload.fetch(:client_id)
+        key = { client: client, topic: event.payload.fetch(:topic) }
+
+        message_size = event.payload.fetch(:message_size)
+        buffer_size = event.payload.fetch(:buffer_size)
+        max_buffer_size = event.payload.fetch(:max_buffer_size)
+        buffer_fill_ratio = buffer_size.to_f / max_buffer_size.to_f
+        buffer_fill_percentage = buffer_fill_ratio * 100.0
+
+        # This gets us the write rate.
+        @produce_messages.increment(labels: key)
+        @produce_message_size.observe(message_size, labels: key)
+
+        # This gets us the avg/max buffer size per producer.
+        @buffer_size.observe(buffer_size, labels: { client: client })
+
+        # This gets us the avg/max buffer fill ratio per producer.
+        @buffer_fill_ratio.observe(buffer_fill_ratio, labels: { client: client })
+        @buffer_fill_percentage.observe(buffer_fill_percentage, labels: { client: client })
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def deliver_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:delivered_message_count)
+        attempts = event.payload.fetch(:attempts)
+
+        @deliver_errors.increment(labels: key) if event.payload.key?(:exception)
+        @deliver_latency.observe(event.duration, labels: key)
+
+        # Messages delivered to Kafka:
+        @deliver_messages.increment(by: message_count, labels: key)
+
+        # Number of attempts to deliver messages:
+        @deliver_attempts.observe(attempts, labels: key)
+      end
+
+      def ack_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        # Number of messages ACK'd for the topic.
+        @ack_messages.increment(labels: key)
+
+        # Histogram of delay between a message being produced and it being ACK'd.
+        @ack_delay.observe(event.payload.fetch(:delay), labels: key)
+      end
+
+      def topic_error(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        @ack_errors.increment(labels: key)
+      end
+    end
+
+    class AsyncProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.histogram(:async_producer_queue_size, docstring: 'Queue size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @queue_fill_ratio = Prometheus.registry.histogram(:async_producer_queue_fill_ratio, docstring: 'Queue fill ratio', labels: [:client, :topic])
+        @produce_errors = Prometheus.registry.counter(:async_producer_produce_errors, docstring: 'Producer errors', labels: [:client, :topic])
+        @dropped_messages = Prometheus.registry.counter(:async_producer_dropped_messages, docstring: 'Dropped messages', labels: [:client])
+      end
+
+      def enqueue_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        queue_size = event.payload.fetch(:queue_size)
+        max_queue_size = event.payload.fetch(:max_queue_size)
+        queue_fill_ratio = queue_size.to_f / max_queue_size.to_f
+
+        # This gets us the avg/max queue size per producer.
+        @queue_size.observe(queue_size, labels: key)
+
+        # This gets us the avg/max queue fill ratio per producer.
+        @queue_fill_ratio.observe(queue_fill_ratio, labels: key)
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def drop_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:message_count)
+        @dropped_messages.increment(by: message_count, labels: key)
+      end
+    end
+
+    class FetcherSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.gauge(:fetcher_queue_size, docstring: 'Queue size', labels: [:client, :group_id])
+      end
+
+      def loop(event)
+        queue_size = event.payload.fetch(:queue_size)
+        client = event.payload.fetch(:client_id)
+        group_id = event.payload.fetch(:group_id)
+
+        @queue_size.set(queue_size, labels: { client: client, group_id: group_id })
+      end
+    end
+  end
+end
+
+# To enable testability, it is possible to skip the start until test time
+Kafka::Prometheus.start unless defined?(PROMETHEUS_NO_AUTO_START)
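
A minimal sketch of wiring this up in an application, assuming the prometheus-client gem's standard Rack exporter (Prometheus::Middleware::Exporter); everything else here is a placeholder:

  # config.ru
  require "kafka"
  require "kafka/prometheus"  # attaches the subscribers and starts with the default registry
  require "prometheus/middleware/exporter"

  use Prometheus::Middleware::Exporter  # exposes the collected Kafka metrics at /metrics
  run ->(_env) { [200, { "Content-Type" => "text/plain" }, ["ok"]] }

To start manually against a custom registry instead, define a PROMETHEUS_NO_AUTO_START constant before requiring the file and call Kafka::Prometheus.start(my_registry) yourself, as the guard on the last line of the file allows.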