ruby-kafka 0.7.4 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +168 -3
  3. data/.github/workflows/stale.yml +19 -0
  4. data/CHANGELOG.md +48 -0
  5. data/README.md +59 -0
  6. data/lib/kafka/async_producer.rb +30 -9
  7. data/lib/kafka/broker.rb +13 -1
  8. data/lib/kafka/broker_pool.rb +1 -1
  9. data/lib/kafka/client.rb +63 -6
  10. data/lib/kafka/cluster.rb +53 -1
  11. data/lib/kafka/compression.rb +13 -11
  12. data/lib/kafka/compressor.rb +1 -0
  13. data/lib/kafka/connection.rb +7 -1
  14. data/lib/kafka/connection_builder.rb +1 -1
  15. data/lib/kafka/consumer.rb +98 -17
  16. data/lib/kafka/consumer_group.rb +20 -2
  17. data/lib/kafka/datadog.rb +32 -12
  18. data/lib/kafka/fetch_operation.rb +1 -1
  19. data/lib/kafka/fetched_batch.rb +5 -1
  20. data/lib/kafka/fetched_batch_generator.rb +5 -2
  21. data/lib/kafka/fetched_message.rb +1 -0
  22. data/lib/kafka/fetched_offset_resolver.rb +1 -1
  23. data/lib/kafka/fetcher.rb +13 -6
  24. data/lib/kafka/gzip_codec.rb +4 -0
  25. data/lib/kafka/heartbeat.rb +8 -3
  26. data/lib/kafka/lz4_codec.rb +4 -0
  27. data/lib/kafka/offset_manager.rb +13 -2
  28. data/lib/kafka/produce_operation.rb +1 -1
  29. data/lib/kafka/producer.rb +33 -8
  30. data/lib/kafka/prometheus.rb +316 -0
  31. data/lib/kafka/protocol/add_offsets_to_txn_request.rb +29 -0
  32. data/lib/kafka/protocol/add_offsets_to_txn_response.rb +19 -0
  33. data/lib/kafka/protocol/join_group_request.rb +8 -2
  34. data/lib/kafka/protocol/metadata_response.rb +1 -1
  35. data/lib/kafka/protocol/offset_fetch_request.rb +3 -1
  36. data/lib/kafka/protocol/produce_request.rb +3 -1
  37. data/lib/kafka/protocol/record_batch.rb +7 -4
  38. data/lib/kafka/protocol/sasl_handshake_request.rb +1 -1
  39. data/lib/kafka/protocol/txn_offset_commit_request.rb +46 -0
  40. data/lib/kafka/protocol/txn_offset_commit_response.rb +18 -0
  41. data/lib/kafka/protocol.rb +8 -0
  42. data/lib/kafka/round_robin_assignment_strategy.rb +10 -7
  43. data/lib/kafka/sasl/gssapi.rb +1 -1
  44. data/lib/kafka/sasl/oauth.rb +64 -0
  45. data/lib/kafka/sasl/plain.rb +1 -1
  46. data/lib/kafka/sasl/scram.rb +16 -13
  47. data/lib/kafka/sasl_authenticator.rb +10 -3
  48. data/lib/kafka/snappy_codec.rb +4 -0
  49. data/lib/kafka/ssl_context.rb +5 -1
  50. data/lib/kafka/ssl_socket_with_timeout.rb +1 -0
  51. data/lib/kafka/statsd.rb +10 -1
  52. data/lib/kafka/tagged_logger.rb +77 -0
  53. data/lib/kafka/transaction_manager.rb +26 -1
  54. data/lib/kafka/transaction_state_machine.rb +1 -1
  55. data/lib/kafka/version.rb +1 -1
  56. data/lib/kafka/zstd_codec.rb +27 -0
  57. data/lib/kafka.rb +4 -0
  58. data/ruby-kafka.gemspec +5 -3
  59. metadata +50 -7
data/lib/kafka/fetcher.rb CHANGED
@@ -4,11 +4,11 @@ require "kafka/fetch_operation"
 
 module Kafka
   class Fetcher
-    attr_reader :queue
+    attr_reader :queue, :max_wait_time
 
     def initialize(cluster:, logger:, instrumenter:, max_queue_size:, group:)
       @cluster = cluster
-      @logger = logger
+      @logger = TaggedLogger.new(logger)
       @instrumenter = instrumenter
       @max_queue_size = max_queue_size
       @group = group
@@ -17,6 +17,9 @@ module Kafka
       @commands = Queue.new
       @next_offsets = Hash.new { |h, k| h[k] = {} }
 
+      # We are only running when someone calls start.
+      @running = false
+
       # Long poll until at least this many bytes can be fetched.
       @min_bytes = 1
 
@@ -49,20 +52,21 @@ module Kafka
     def start
       return if @running
 
+      @running = true
+
       @thread = Thread.new do
         while @running
           loop
         end
-        @logger.info "Fetcher thread exited."
+        @logger.info "#{@group} Fetcher thread exited."
       end
       @thread.abort_on_exception = true
-
-      @running = true
     end
 
     def stop
       return unless @running
       @commands << [:stop, []]
+      @thread.join
     end
 
     def reset
@@ -93,6 +97,7 @@ module Kafka
     attr_reader :current_reset_counter
 
     def loop
+      @logger.push_tags(@group.to_s)
       @instrumenter.instrument("loop.fetcher", {
         queue_size: @queue.size,
       })
@@ -108,9 +113,11 @@ module Kafka
       elsif @queue.size < @max_queue_size
         step
       else
-        @logger.warn "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
+        @logger.info "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
         sleep 1
       end
+    ensure
+      @logger.pop_tags
     end
 
     def handle_configure(min_bytes, max_bytes, max_wait_time)
data/lib/kafka/gzip_codec.rb CHANGED
@@ -6,6 +6,10 @@ module Kafka
       1
     end
 
+    def produce_api_min_version
+      0
+    end
+
     def load
       require "zlib"
     end
data/lib/kafka/heartbeat.rb CHANGED
@@ -2,15 +2,20 @@
 
 module Kafka
   class Heartbeat
-    def initialize(group:, interval:)
+    def initialize(group:, interval:, instrumenter:)
       @group = group
       @interval = interval
       @last_heartbeat = Time.now
+      @instrumenter = instrumenter
     end
 
     def trigger!
-      @group.heartbeat
-      @last_heartbeat = Time.now
+      @instrumenter.instrument('heartbeat.consumer',
+                               group_id: @group.group_id,
+                               topic_partitions: @group.assigned_partitions) do
+        @group.heartbeat
+        @last_heartbeat = Time.now
+      end
     end
 
     def trigger
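
Each heartbeat is now wrapped in an instrumentation event carrying the group id and the assigned partitions. A minimal sketch of consuming that event, assuming the client's instrumenter publishes under the ".kafka" namespace (the Prometheus subscribers added in this release attach to "consumer.kafka" on the same assumption):

  require "active_support/notifications"

  ActiveSupport::Notifications.subscribe("heartbeat.consumer.kafka") do |*args|
    event = ActiveSupport::Notifications::Event.new(*args)
    # Payload keys come from the instrument call above.
    group_id   = event.payload[:group_id]
    partitions = event.payload[:topic_partitions]
    puts "Heartbeat for #{group_id}: #{partitions.inspect}"
  end
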
data/lib/kafka/lz4_codec.rb CHANGED
@@ -6,6 +6,10 @@ module Kafka
       3
     end
 
+    def produce_api_min_version
+      0
+    end
+
     def load
       require "extlz4"
     rescue LoadError
data/lib/kafka/offset_manager.rb CHANGED
@@ -13,7 +13,7 @@ module Kafka
       @cluster = cluster
       @group = group
       @fetcher = fetcher
-      @logger = logger
+      @logger = TaggedLogger.new(logger)
       @commit_interval = commit_interval
       @commit_threshold = commit_threshold
 
@@ -50,9 +50,20 @@ module Kafka
     # @param offset [Integer] the offset of the message that should be marked as processed.
     # @return [nil]
     def mark_as_processed(topic, partition, offset)
-      @uncommitted_offsets += 1
+      unless @group.assigned_to?(topic, partition)
+        @logger.debug "Not marking #{topic}/#{partition}:#{offset} as processed for partition not assigned to this consumer."
+        return
+      end
       @processed_offsets[topic] ||= {}
 
+      last_processed_offset = @processed_offsets[topic][partition] || -1
+      if last_processed_offset > offset + 1
+        @logger.debug "Not overwriting newer offset #{topic}/#{partition}:#{last_processed_offset - 1} with older #{offset}"
+        return
+      end
+
+      @uncommitted_offsets += 1
+
      # The committed offset should always be the offset of the next message that the
      # application will read, thus adding one to the last message processed.
      @processed_offsets[topic][partition] = offset + 1
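
From the consumer API, these new guards matter when offsets are marked manually. A sketch of the path that ends in OffsetManager#mark_as_processed; the topic, group id and handle method are placeholders:

  consumer = kafka.consumer(group_id: "my-group")
  consumer.subscribe("events")

  consumer.each_message(automatically_mark_as_processed: false) do |message|
    handle(message)                             # hypothetical application logic
    consumer.mark_message_as_processed(message) # ignored if the partition is no longer assigned
  end
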
data/lib/kafka/produce_operation.rb CHANGED
@@ -37,7 +37,7 @@ module Kafka
       @required_acks = required_acks
       @ack_timeout = ack_timeout
       @compressor = compressor
-      @logger = logger
+      @logger = TaggedLogger.new(logger)
       @instrumenter = instrumenter
     end
 
data/lib/kafka/producer.rb CHANGED
@@ -68,6 +68,8 @@ module Kafka
   #
   # * `:snappy` for [Snappy](http://google.github.io/snappy/) compression.
   # * `:gzip` for [gzip](https://en.wikipedia.org/wiki/Gzip) compression.
+  # * `:lz4` for [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression.
+  # * `:zstd` for [zstd](https://facebook.github.io/zstd/) compression.
   #
   # By default, all message sets will be compressed if you specify a compression
   # codec. To increase the compression threshold, set `compression_threshold` to
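
For reference, a codec is selected when the producer is built. A minimal sketch, with the broker address, topic and threshold as placeholders; :zstd, like :snappy and :lz4, needs its codec gem installed:

  require "kafka"

  kafka = Kafka.new(["kafka1:9092"], client_id: "my-app")

  # Compress message sets once they contain at least 10 messages.
  producer = kafka.producer(compression_codec: :zstd, compression_threshold: 10)
  producer.produce("hello", topic: "greetings")
  producer.deliver_messages
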
@@ -130,7 +132,7 @@ module Kafka
     def initialize(cluster:, transaction_manager:, logger:, instrumenter:, compressor:, ack_timeout:, required_acks:, max_retries:, retry_backoff:, max_buffer_size:, max_buffer_bytesize:)
       @cluster = cluster
       @transaction_manager = transaction_manager
-      @logger = logger
+      @logger = TaggedLogger.new(logger)
       @instrumenter = instrumenter
       @required_acks = required_acks == :all ? -1 : required_acks
       @ack_timeout = ack_timeout
@@ -150,6 +152,10 @@ module Kafka
       @pending_message_queue = PendingMessageQueue.new
     end
 
+    def to_s
+      "Producer #{@target_topics.to_a.join(', ')}"
+    end
+
     # Produces a message to the specified topic. Note that messages are buffered in
     # the producer until {#deliver_messages} is called.
     #
@@ -182,11 +188,14 @@ module Kafka
     # @raise [BufferOverflow] if the maximum buffer size has been reached.
     # @return [nil]
     def produce(value, key: nil, headers: {}, topic:, partition: nil, partition_key: nil, create_time: Time.now)
+      # We want to fail fast if `topic` isn't a String
+      topic = topic.to_str
+
       message = PendingMessage.new(
         value: value && value.to_s,
         key: key && key.to_s,
         headers: headers,
-        topic: topic.to_s,
+        topic: topic,
         partition: partition && Integer(partition),
         partition_key: partition_key && partition_key.to_s,
         create_time: create_time
@@ -205,7 +214,7 @@ module Kafka
       # If the producer is in transactional mode, all the message production
       # must be used when the producer is currently in transaction
       if @transaction_manager.transactional? && !@transaction_manager.in_transaction?
-        raise 'You must trigger begin_transaction before producing messages'
+        raise "Cannot produce to #{topic}: You must trigger begin_transaction before producing messages"
       end
 
       @target_topics.add(topic)
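
The switch from to_s to to_str means a topic that is not already a String (and does not define implicit string conversion) now raises instead of being silently coerced, for example:

  producer.produce("payload", topic: "greetings")  # buffered as before
  producer.produce("payload", topic: :greetings)   # now raises NoMethodError (Symbol has no #to_str)
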
@@ -324,6 +333,20 @@ module Kafka
       @transaction_manager.abort_transaction
     end
 
+    # Sends batch last offset to the consumer group coordinator, and also marks
+    # this offset as part of the current transaction. This offset will be considered
+    # committed only if the transaction is committed successfully.
+    #
+    # This method should be used when you need to batch consumed and produced messages
+    # together, typically in a consume-transform-produce pattern. Thus, the specified
+    # group_id should be the same as config parameter group_id of the used
+    # consumer.
+    #
+    # @return [nil]
+    def send_offsets_to_transaction(batch:, group_id:)
+      @transaction_manager.send_offsets_to_txn(offsets: { batch.topic => { batch.partition => { offset: batch.last_offset + 1, leader_epoch: batch.leader_epoch } } }, group_id: group_id)
+    end
+
     # Syntactic sugar to enable easier transaction usage. Do the following steps
     #
     # - Start the transaction (with Producer#begin_transaction)
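
A sketch of the consume-transform-produce pattern the comment above describes; topic names, the group id and the transactional settings are placeholders, and the producer is assumed to have been created in transactional mode:

  producer = kafka.producer(transactional: true, transactional_id: "my-transactional-id")
  consumer = kafka.consumer(group_id: "my-group")
  consumer.subscribe("source-topic")

  consumer.each_batch(automatically_mark_as_processed: false) do |batch|
    producer.transaction do
      batch.messages.each do |message|
        producer.produce(message.value.upcase, topic: "target-topic")
      end
      # Commit the consumed offsets atomically with the produced messages.
      producer.send_offsets_to_transaction(batch: batch, group_id: "my-group")
    end
  end
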
@@ -391,11 +414,11 @@ module Kafka
         if buffer_size.zero?
           break
         elsif attempt <= @max_retries
-          @logger.warn "Failed to send all messages; attempting retry #{attempt} of #{@max_retries} after #{@retry_backoff}s"
+          @logger.warn "Failed to send all messages to #{pretty_partitions}; attempting retry #{attempt} of #{@max_retries} after #{@retry_backoff}s"
 
           sleep @retry_backoff
         else
-          @logger.error "Failed to send all messages; keeping remaining messages in buffer"
+          @logger.error "Failed to send all messages to #{pretty_partitions}; keeping remaining messages in buffer"
           break
         end
       end
@@ -407,12 +430,14 @@ module Kafka
       end
 
       unless @buffer.empty?
-        partitions = @buffer.map {|topic, partition, _| "#{topic}/#{partition}" }.join(", ")
-
-        raise DeliveryFailed.new("Failed to send messages to #{partitions}", buffer_messages)
+        raise DeliveryFailed.new("Failed to send messages to #{pretty_partitions}", buffer_messages)
       end
     end
 
+    def pretty_partitions
+      @buffer.map {|topic, partition, _| "#{topic}/#{partition}" }.join(", ")
+    end
+
     def assign_partitions!
       failed_messages = []
       topics_with_failures = Set.new
data/lib/kafka/prometheus.rb ADDED
@@ -0,0 +1,316 @@
+# frozen_string_literal: true
+
+#
+# Subscriber to ruby_kafka to report metrics to prometheus
+#
+# Usage:
+#   require "kafka/prometheus"
+#
+# Once the file has been required, no further configuration is needed, all operational
+# metrics are automatically emitted (Unless PROMETHEUS_NO_AUTO_START is set).
+#
+# By Peter Mustel, T2 Data AB
+#
+begin
+  require 'prometheus/client'
+rescue LoadError
+  warn 'In order to report Kafka client metrics to Prometheus you need to install the `prometheus-client` gem.'
+  raise
+end
+
+require 'active_support/subscriber'
+
+module Kafka
+  module Prometheus
+    SIZE_BUCKETS = [1, 10, 100, 1000, 10_000, 100_000, 1_000_000].freeze
+    LATENCY_BUCKETS = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000].freeze
+    DELAY_BUCKETS = [1, 3, 10, 30, 100, 300, 1000, 3000, 10_000, 30_000].freeze
+
+    class << self
+      attr_accessor :registry
+
+      def start(registry = ::Prometheus::Client.registry)
+        @registry = registry
+        ConnectionSubscriber.attach_to 'connection.kafka'
+        ConsumerSubscriber.attach_to 'consumer.kafka'
+        ProducerSubscriber.attach_to 'producer.kafka'
+        AsyncProducerSubscriber.attach_to 'async_producer.kafka'
+        FetcherSubscriber.attach_to 'fetcher.kafka'
+      end
+    end
+
+    class ConnectionSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @api_calls = Prometheus.registry.counter(:api_calls, docstring: 'Total calls', labels: [:client, :api, :broker])
+        @api_latency = Prometheus.registry.histogram(:api_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :api, :broker])
+        @api_request_size = Prometheus.registry.histogram(:api_request_size, docstring: 'Request size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_response_size = Prometheus.registry.histogram(:api_response_size, docstring: 'Response size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_errors = Prometheus.registry.counter(:api_errors, docstring: 'Errors', labels: [:client, :api, :broker])
+      end
+
+      def request(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          api: event.payload.fetch(:api, 'unknown'),
+          broker: event.payload.fetch(:broker_host)
+        }
+        request_size = event.payload.fetch(:request_size, 0)
+        response_size = event.payload.fetch(:response_size, 0)
+
+        @api_calls.increment(labels: key)
+        @api_latency.observe(event.duration, labels: key)
+        @api_request_size.observe(request_size, labels: key)
+        @api_response_size.observe(response_size, labels: key)
+        @api_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+    end
+
+    class ConsumerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @process_messages = Prometheus.registry.counter(:consumer_process_messages, docstring: 'Total messages', labels: [:client, :group_id, :topic, :partition])
+        @process_message_errors = Prometheus.registry.counter(:consumer_process_message_errors, docstring: 'Total errors', labels: [:client, :group_id, :topic, :partition])
+        @process_message_latency =
+          Prometheus.registry.histogram(:consumer_process_message_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @offset_lag = Prometheus.registry.gauge(:consumer_offset_lag, docstring: 'Offset lag', labels: [:client, :group_id, :topic, :partition])
+        @time_lag = Prometheus.registry.gauge(:consumer_time_lag, docstring: 'Time lag of message', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_errors = Prometheus.registry.counter(:consumer_process_batch_errors, docstring: 'Total errors in batch', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_latency =
+          Prometheus.registry.histogram(:consumer_process_batch_latency, docstring: 'Latency in batch', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @batch_size = Prometheus.registry.histogram(:consumer_batch_size, docstring: 'Size of batch', buckets: SIZE_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @join_group = Prometheus.registry.histogram(:consumer_join_group, docstring: 'Time to join group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @join_group_errors = Prometheus.registry.counter(:consumer_join_group_errors, docstring: 'Total error in joining group', labels: [:client, :group_id])
+        @sync_group = Prometheus.registry.histogram(:consumer_sync_group, docstring: 'Time to sync group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @sync_group_errors = Prometheus.registry.counter(:consumer_sync_group_errors, docstring: 'Total error in syncing group', labels: [:client, :group_id])
+        @leave_group = Prometheus.registry.histogram(:consumer_leave_group, docstring: 'Time to leave group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @leave_group_errors = Prometheus.registry.counter(:consumer_leave_group_errors, docstring: 'Total error in leaving group', labels: [:client, :group_id])
+        @pause_duration = Prometheus.registry.gauge(:consumer_pause_duration, docstring: 'Pause duration', labels: [:client, :group_id, :topic, :partition])
+      end
+
+      def process_message(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        offset_lag = event.payload.fetch(:offset_lag)
+        create_time = event.payload.fetch(:create_time)
+
+        time_lag = create_time && ((Time.now - create_time) * 1000).to_i
+
+        if event.payload.key?(:exception)
+          @process_message_errors.increment(labels: key)
+        else
+          @process_message_latency.observe(event.duration, labels: key)
+          @process_messages.increment(labels: key)
+        end
+
+        @offset_lag.set(offset_lag, labels: key)
+
+        # Not all messages have timestamps.
+        return unless time_lag
+
+        @time_lag.set(time_lag, labels: key)
+      end
+
+      def process_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        message_count = event.payload.fetch(:message_count)
+
+        if event.payload.key?(:exception)
+          @process_batch_errors.increment(labels: key)
+        else
+          @process_batch_latency.observe(event.duration, labels: key)
+          @process_messages.increment(by: message_count, labels: key)
+        end
+      end
+
+      def fetch_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        offset_lag = event.payload.fetch(:offset_lag)
+        batch_size = event.payload.fetch(:message_count)
+
+        @batch_size.observe(batch_size, labels: key)
+        @offset_lag.set(offset_lag, labels: key)
+      end
+
+      def join_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @join_group.observe(event.duration, labels: key)
+
+        @join_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def sync_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @sync_group.observe(event.duration, labels: key)
+
+        @sync_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def leave_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @leave_group.observe(event.duration, labels: key)
+
+        @leave_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def pause_status(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        duration = event.payload.fetch(:duration)
+        @pause_duration.set(duration, labels: key)
+      end
+    end
+
+    class ProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @produce_messages = Prometheus.registry.counter(:producer_produced_messages, docstring: 'Produced messages total', labels: [:client, :topic])
+        @produce_message_size =
+          Prometheus.registry.histogram(:producer_message_size, docstring: 'Message size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @buffer_size = Prometheus.registry.histogram(:producer_buffer_size, docstring: 'Buffer size', buckets: SIZE_BUCKETS, labels: [:client])
+        @buffer_fill_ratio = Prometheus.registry.histogram(:producer_buffer_fill_ratio, docstring: 'Buffer fill ratio', labels: [:client])
+        @buffer_fill_percentage = Prometheus.registry.histogram(:producer_buffer_fill_percentage, docstring: 'Buffer fill percentage', labels: [:client])
+        @produce_errors = Prometheus.registry.counter(:producer_produce_errors, docstring: 'Produce errors', labels: [:client, :topic])
+        @deliver_errors = Prometheus.registry.counter(:producer_deliver_errors, docstring: 'Deliver error', labels: [:client])
+        @deliver_latency =
+          Prometheus.registry.histogram(:producer_deliver_latency, docstring: 'Delivery latency', buckets: LATENCY_BUCKETS, labels: [:client])
+        @deliver_messages = Prometheus.registry.counter(:producer_deliver_messages, docstring: 'Total count of delivered messages', labels: [:client])
+        @deliver_attempts = Prometheus.registry.histogram(:producer_deliver_attempts, docstring: 'Delivery attempts', labels: [:client])
+        @ack_messages = Prometheus.registry.counter(:producer_ack_messages, docstring: 'Ack', labels: [:client, :topic])
+        @ack_delay = Prometheus.registry.histogram(:producer_ack_delay, docstring: 'Ack delay', buckets: LATENCY_BUCKETS, labels: [:client, :topic])
+        @ack_errors = Prometheus.registry.counter(:producer_ack_errors, docstring: 'Ack errors', labels: [:client, :topic])
+      end
+
+      def produce_message(event)
+        client = event.payload.fetch(:client_id)
+        key = { client: client, topic: event.payload.fetch(:topic) }
+
+        message_size = event.payload.fetch(:message_size)
+        buffer_size = event.payload.fetch(:buffer_size)
+        max_buffer_size = event.payload.fetch(:max_buffer_size)
+        buffer_fill_ratio = buffer_size.to_f / max_buffer_size.to_f
+        buffer_fill_percentage = buffer_fill_ratio * 100.0
+
+        # This gets us the write rate.
+        @produce_messages.increment(labels: key)
+        @produce_message_size.observe(message_size, labels: key)
+
+        # This gets us the avg/max buffer size per producer.
+        @buffer_size.observe(buffer_size, labels: { client: client })
+
+        # This gets us the avg/max buffer fill ratio per producer.
+        @buffer_fill_ratio.observe(buffer_fill_ratio, labels: { client: client })
+        @buffer_fill_percentage.observe(buffer_fill_percentage, labels: { client: client })
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def deliver_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:delivered_message_count)
+        attempts = event.payload.fetch(:attempts)
+
+        @deliver_errors.increment(labels: key) if event.payload.key?(:exception)
+        @deliver_latency.observe(event.duration, labels: key)
+
+        # Messages delivered to Kafka:
+        @deliver_messages.increment(by: message_count, labels: key)
+
+        # Number of attempts to deliver messages:
+        @deliver_attempts.observe(attempts, labels: key)
+      end
+
+      def ack_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        # Number of messages ACK'd for the topic.
+        @ack_messages.increment(labels: key)
+
+        # Histogram of delay between a message being produced and it being ACK'd.
+        @ack_delay.observe(event.payload.fetch(:delay), labels: key)
+      end
+
+      def topic_error(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        @ack_errors.increment(labels: key)
+      end
+    end
+
+    class AsyncProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.histogram(:async_producer_queue_size, docstring: 'Queue size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @queue_fill_ratio = Prometheus.registry.histogram(:async_producer_queue_fill_ratio, docstring: 'Queue fill ratio', labels: [:client, :topic])
+        @produce_errors = Prometheus.registry.counter(:async_producer_produce_errors, docstring: 'Producer errors', labels: [:client, :topic])
+        @dropped_messages = Prometheus.registry.counter(:async_producer_dropped_messages, docstring: 'Dropped messages', labels: [:client])
+      end
+
+      def enqueue_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        queue_size = event.payload.fetch(:queue_size)
+        max_queue_size = event.payload.fetch(:max_queue_size)
+        queue_fill_ratio = queue_size.to_f / max_queue_size.to_f
+
+        # This gets us the avg/max queue size per producer.
+        @queue_size.observe(queue_size, labels: key)
+
+        # This gets us the avg/max queue fill ratio per producer.
+        @queue_fill_ratio.observe(queue_fill_ratio, labels: key)
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def drop_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:message_count)
+        @dropped_messages.increment(by: message_count, labels: key)
+      end
+    end
+
+    class FetcherSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.gauge(:fetcher_queue_size, docstring: 'Queue size', labels: [:client, :group_id])
+      end
+
+      def loop(event)
+        queue_size = event.payload.fetch(:queue_size)
+        client = event.payload.fetch(:client_id)
+        group_id = event.payload.fetch(:group_id)
+
+        @queue_size.set(queue_size, labels: { client: client, group_id: group_id })
+      end
+    end
+  end
+end
+
+# To enable testability, it is possible to skip the start until test time
+Kafka::Prometheus.start unless defined?(PROMETHEUS_NO_AUTO_START)
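
Per the comment at the top of the file, metrics start flowing as soon as kafka/prometheus is required. A small sketch of deferring that (for tests, or to attach to a non-default registry), assuming the prometheus-client gem is available:

  # Define the constant before the require to skip the automatic start.
  PROMETHEUS_NO_AUTO_START = true
  require "kafka/prometheus"

  # Later, attach the subscribers against an explicit registry.
  registry = Prometheus::Client::Registry.new
  Kafka::Prometheus.start(registry)
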
data/lib/kafka/protocol/add_offsets_to_txn_request.rb ADDED
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module Kafka
+  module Protocol
+    class AddOffsetsToTxnRequest
+      def initialize(transactional_id: nil, producer_id:, producer_epoch:, group_id:)
+        @transactional_id = transactional_id
+        @producer_id = producer_id
+        @producer_epoch = producer_epoch
+        @group_id = group_id
+      end
+
+      def api_key
+        ADD_OFFSETS_TO_TXN_API
+      end
+
+      def response_class
+        AddOffsetsToTxnResponse
+      end
+
+      def encode(encoder)
+        encoder.write_string(@transactional_id.to_s)
+        encoder.write_int64(@producer_id)
+        encoder.write_int16(@producer_epoch)
+        encoder.write_string(@group_id)
+      end
+    end
+  end
+end
data/lib/kafka/protocol/add_offsets_to_txn_response.rb ADDED
@@ -0,0 +1,19 @@
+module Kafka
+  module Protocol
+    class AddOffsetsToTxnResponse
+
+      attr_reader :error_code
+
+      def initialize(error_code:)
+        @error_code = error_code
+      end
+
+      def self.decode(decoder)
+        _throttle_time_ms = decoder.int32
+        error_code = decoder.int16
+        new(error_code: error_code)
+      end
+
+    end
+  end
+end
data/lib/kafka/protocol/join_group_request.rb CHANGED
@@ -7,13 +7,14 @@ module Kafka
     class JoinGroupRequest
       PROTOCOL_TYPE = "consumer"
 
-      def initialize(group_id:, session_timeout:, member_id:, topics: [])
+      def initialize(group_id:, session_timeout:, rebalance_timeout:, member_id:, topics: [])
         @group_id = group_id
         @session_timeout = session_timeout * 1000 # Kafka wants ms.
+        @rebalance_timeout = rebalance_timeout * 1000 # Kafka wants ms.
         @member_id = member_id || ""
         @protocol_type = PROTOCOL_TYPE
         @group_protocols = {
-          "standard" => ConsumerGroupProtocol.new(topics: ["test-messages"]),
+          "roundrobin" => ConsumerGroupProtocol.new(topics: topics),
         }
       end
 
@@ -21,6 +22,10 @@ module Kafka
         JOIN_GROUP_API
       end
 
+      def api_version
+        1
+      end
+
       def response_class
         JoinGroupResponse
       end
@@ -28,6 +33,7 @@ module Kafka
       def encode(encoder)
        encoder.write_string(@group_id)
        encoder.write_int32(@session_timeout)
+       encoder.write_int32(@rebalance_timeout)
        encoder.write_string(@member_id)
        encoder.write_string(@protocol_type)
 
data/lib/kafka/protocol/metadata_response.rb CHANGED
@@ -34,7 +34,7 @@ module Kafka
     #
     class MetadataResponse
       class PartitionMetadata
-        attr_reader :partition_id, :leader
+        attr_reader :partition_id, :leader, :replicas
 
         attr_reader :partition_error_code