ruby-kafka 0.7.4 → 1.1.0
This diff reflects the changes between publicly released versions of this package as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.circleci/config.yml +168 -3
- data/.github/workflows/stale.yml +19 -0
- data/CHANGELOG.md +48 -0
- data/README.md +59 -0
- data/lib/kafka/async_producer.rb +30 -9
- data/lib/kafka/broker.rb +13 -1
- data/lib/kafka/broker_pool.rb +1 -1
- data/lib/kafka/client.rb +63 -6
- data/lib/kafka/cluster.rb +53 -1
- data/lib/kafka/compression.rb +13 -11
- data/lib/kafka/compressor.rb +1 -0
- data/lib/kafka/connection.rb +7 -1
- data/lib/kafka/connection_builder.rb +1 -1
- data/lib/kafka/consumer.rb +98 -17
- data/lib/kafka/consumer_group.rb +20 -2
- data/lib/kafka/datadog.rb +32 -12
- data/lib/kafka/fetch_operation.rb +1 -1
- data/lib/kafka/fetched_batch.rb +5 -1
- data/lib/kafka/fetched_batch_generator.rb +5 -2
- data/lib/kafka/fetched_message.rb +1 -0
- data/lib/kafka/fetched_offset_resolver.rb +1 -1
- data/lib/kafka/fetcher.rb +13 -6
- data/lib/kafka/gzip_codec.rb +4 -0
- data/lib/kafka/heartbeat.rb +8 -3
- data/lib/kafka/lz4_codec.rb +4 -0
- data/lib/kafka/offset_manager.rb +13 -2
- data/lib/kafka/produce_operation.rb +1 -1
- data/lib/kafka/producer.rb +33 -8
- data/lib/kafka/prometheus.rb +316 -0
- data/lib/kafka/protocol/add_offsets_to_txn_request.rb +29 -0
- data/lib/kafka/protocol/add_offsets_to_txn_response.rb +19 -0
- data/lib/kafka/protocol/join_group_request.rb +8 -2
- data/lib/kafka/protocol/metadata_response.rb +1 -1
- data/lib/kafka/protocol/offset_fetch_request.rb +3 -1
- data/lib/kafka/protocol/produce_request.rb +3 -1
- data/lib/kafka/protocol/record_batch.rb +7 -4
- data/lib/kafka/protocol/sasl_handshake_request.rb +1 -1
- data/lib/kafka/protocol/txn_offset_commit_request.rb +46 -0
- data/lib/kafka/protocol/txn_offset_commit_response.rb +18 -0
- data/lib/kafka/protocol.rb +8 -0
- data/lib/kafka/round_robin_assignment_strategy.rb +10 -7
- data/lib/kafka/sasl/gssapi.rb +1 -1
- data/lib/kafka/sasl/oauth.rb +64 -0
- data/lib/kafka/sasl/plain.rb +1 -1
- data/lib/kafka/sasl/scram.rb +16 -13
- data/lib/kafka/sasl_authenticator.rb +10 -3
- data/lib/kafka/snappy_codec.rb +4 -0
- data/lib/kafka/ssl_context.rb +5 -1
- data/lib/kafka/ssl_socket_with_timeout.rb +1 -0
- data/lib/kafka/statsd.rb +10 -1
- data/lib/kafka/tagged_logger.rb +77 -0
- data/lib/kafka/transaction_manager.rb +26 -1
- data/lib/kafka/transaction_state_machine.rb +1 -1
- data/lib/kafka/version.rb +1 -1
- data/lib/kafka/zstd_codec.rb +27 -0
- data/lib/kafka.rb +4 -0
- data/ruby-kafka.gemspec +5 -3
- metadata +50 -7
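Among the user-facing changes in this range is support for the `:zstd` compression codec (new `data/lib/kafka/zstd_codec.rb`), documented alongside the existing `:gzip`, `:snappy`, and `:lz4` codecs in the `Producer` docs below. A minimal sketch of opting in, assuming a broker at `localhost:9092` and that the zstd binding gem required by the codec is installed:

```ruby
require "kafka"

kafka = Kafka.new(["localhost:9092"], client_id: "my-app")

producer = kafka.producer(
  compression_codec: :zstd,    # :gzip, :snappy and :lz4 remain available
  compression_threshold: 10    # only compress message sets of at least 10 messages
)

producer.produce("hello", topic: "greetings")
producer.deliver_messages
producer.shutdown
```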
data/lib/kafka/fetcher.rb
CHANGED
```diff
@@ -4,11 +4,11 @@ require "kafka/fetch_operation"
 
 module Kafka
   class Fetcher
-    attr_reader :queue
+    attr_reader :queue, :max_wait_time
 
     def initialize(cluster:, logger:, instrumenter:, max_queue_size:, group:)
       @cluster = cluster
-      @logger = logger
+      @logger = TaggedLogger.new(logger)
       @instrumenter = instrumenter
       @max_queue_size = max_queue_size
       @group = group
@@ -17,6 +17,9 @@ module Kafka
       @commands = Queue.new
       @next_offsets = Hash.new { |h, k| h[k] = {} }
 
+      # We are only running when someone calls start.
+      @running = false
+
       # Long poll until at least this many bytes can be fetched.
       @min_bytes = 1
 
@@ -49,20 +52,21 @@ module Kafka
     def start
       return if @running
 
+      @running = true
+
       @thread = Thread.new do
         while @running
           loop
         end
-        @logger.info "Fetcher thread exited."
+        @logger.info "#{@group} Fetcher thread exited."
       end
       @thread.abort_on_exception = true
-
-      @running = true
     end
 
     def stop
       return unless @running
       @commands << [:stop, []]
+      @thread.join
     end
 
     def reset
@@ -93,6 +97,7 @@ module Kafka
     attr_reader :current_reset_counter
 
     def loop
+      @logger.push_tags(@group.to_s)
       @instrumenter.instrument("loop.fetcher", {
        queue_size: @queue.size,
      })
@@ -108,9 +113,11 @@ module Kafka
      elsif @queue.size < @max_queue_size
        step
      else
-        @logger.
+        @logger.info "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
        sleep 1
      end
+    ensure
+      @logger.pop_tags
    end
 
    def handle_configure(min_bytes, max_bytes, max_wait_time)
```
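The `TaggedLogger` wrapper introduced here (new file `data/lib/kafka/tagged_logger.rb`) lets internal components prefix their log lines with a tag such as the consumer group. A rough sketch of the behaviour the fetcher relies on, based only on the calls visible in this diff; the exact output format is an assumption:

```ruby
require "logger"
require "kafka"

logger = Kafka::TaggedLogger.new(Logger.new($stdout))

logger.push_tags("my-consumer-group")
logger.info "Fetcher thread exited."   # logged with the group tag, e.g. "[my-consumer-group] ..."
logger.pop_tags
```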
data/lib/kafka/gzip_codec.rb
CHANGED
data/lib/kafka/heartbeat.rb
CHANGED
```diff
@@ -2,15 +2,20 @@
 
 module Kafka
   class Heartbeat
-    def initialize(group:, interval:)
+    def initialize(group:, interval:, instrumenter:)
       @group = group
       @interval = interval
       @last_heartbeat = Time.now
+      @instrumenter = instrumenter
     end
 
     def trigger!
-      @
-
+      @instrumenter.instrument('heartbeat.consumer',
+                               group_id: @group.group_id,
+                               topic_partitions: @group.assigned_partitions) do
+        @group.heartbeat
+        @last_heartbeat = Time.now
+      end
     end
 
     def trigger
```
data/lib/kafka/lz4_codec.rb
CHANGED
data/lib/kafka/offset_manager.rb
CHANGED
```diff
@@ -13,7 +13,7 @@ module Kafka
       @cluster = cluster
       @group = group
       @fetcher = fetcher
-      @logger = logger
+      @logger = TaggedLogger.new(logger)
       @commit_interval = commit_interval
       @commit_threshold = commit_threshold
 
@@ -50,9 +50,20 @@ module Kafka
     # @param offset [Integer] the offset of the message that should be marked as processed.
     # @return [nil]
     def mark_as_processed(topic, partition, offset)
-      @
+      unless @group.assigned_to?(topic, partition)
+        @logger.debug "Not marking #{topic}/#{partition}:#{offset} as processed for partition not assigned to this consumer."
+        return
+      end
       @processed_offsets[topic] ||= {}
 
+      last_processed_offset = @processed_offsets[topic][partition] || -1
+      if last_processed_offset > offset + 1
+        @logger.debug "Not overwriting newer offset #{topic}/#{partition}:#{last_processed_offset - 1} with older #{offset}"
+        return
+      end
+
+      @uncommitted_offsets += 1
+
       # The committed offset should always be the offset of the next message that the
       # application will read, thus adding one to the last message processed.
       @processed_offsets[topic][partition] = offset + 1
```
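`mark_as_processed` now ignores partitions that are no longer assigned to this consumer and refuses to overwrite a newer processed offset with an older one. Application code usually reaches it through the consumer API; a sketch of manual offset marking, with placeholder broker address, topic, and a hypothetical `handle` method:

```ruby
require "kafka"

kafka = Kafka.new(["localhost:9092"], client_id: "my-app")
consumer = kafka.consumer(group_id: "my-group")
consumer.subscribe("events")

consumer.each_message(automatically_mark_as_processed: false) do |message|
  handle(message)  # hypothetical application method
  # Safe to call even after a rebalance: offsets for partitions no longer
  # assigned to this consumer are now skipped instead of committed.
  consumer.mark_message_as_processed(message)
end
```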
data/lib/kafka/producer.rb
CHANGED
```diff
@@ -68,6 +68,8 @@ module Kafka
   #
   # * `:snappy` for [Snappy](http://google.github.io/snappy/) compression.
   # * `:gzip` for [gzip](https://en.wikipedia.org/wiki/Gzip) compression.
+  # * `:lz4` for [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression.
+  # * `:zstd` for [zstd](https://facebook.github.io/zstd/) compression.
   #
   # By default, all message sets will be compressed if you specify a compression
   # codec. To increase the compression threshold, set `compression_threshold` to
@@ -130,7 +132,7 @@ module Kafka
     def initialize(cluster:, transaction_manager:, logger:, instrumenter:, compressor:, ack_timeout:, required_acks:, max_retries:, retry_backoff:, max_buffer_size:, max_buffer_bytesize:)
       @cluster = cluster
       @transaction_manager = transaction_manager
-      @logger = logger
+      @logger = TaggedLogger.new(logger)
       @instrumenter = instrumenter
       @required_acks = required_acks == :all ? -1 : required_acks
       @ack_timeout = ack_timeout
@@ -150,6 +152,10 @@ module Kafka
       @pending_message_queue = PendingMessageQueue.new
     end
 
+    def to_s
+      "Producer #{@target_topics.to_a.join(', ')}"
+    end
+
     # Produces a message to the specified topic. Note that messages are buffered in
     # the producer until {#deliver_messages} is called.
     #
@@ -182,11 +188,14 @@ module Kafka
     # @raise [BufferOverflow] if the maximum buffer size has been reached.
     # @return [nil]
     def produce(value, key: nil, headers: {}, topic:, partition: nil, partition_key: nil, create_time: Time.now)
+      # We want to fail fast if `topic` isn't a String
+      topic = topic.to_str
+
       message = PendingMessage.new(
         value: value && value.to_s,
         key: key && key.to_s,
         headers: headers,
-        topic: topic
+        topic: topic,
         partition: partition && Integer(partition),
         partition_key: partition_key && partition_key.to_s,
         create_time: create_time
@@ -205,7 +214,7 @@ module Kafka
       # If the producer is in transactional mode, all the message production
       # must be used when the producer is currently in transaction
       if @transaction_manager.transactional? && !@transaction_manager.in_transaction?
-        raise
+        raise "Cannot produce to #{topic}: You must trigger begin_transaction before producing messages"
       end
 
       @target_topics.add(topic)
@@ -324,6 +333,20 @@ module Kafka
       @transaction_manager.abort_transaction
     end
 
+    # Sends batch last offset to the consumer group coordinator, and also marks
+    # this offset as part of the current transaction. This offset will be considered
+    # committed only if the transaction is committed successfully.
+    #
+    # This method should be used when you need to batch consumed and produced messages
+    # together, typically in a consume-transform-produce pattern. Thus, the specified
+    # group_id should be the same as config parameter group_id of the used
+    # consumer.
+    #
+    # @return [nil]
+    def send_offsets_to_transaction(batch:, group_id:)
+      @transaction_manager.send_offsets_to_txn(offsets: { batch.topic => { batch.partition => { offset: batch.last_offset + 1, leader_epoch: batch.leader_epoch } } }, group_id: group_id)
+    end
+
     # Syntactic sugar to enable easier transaction usage. Do the following steps
     #
     # - Start the transaction (with Producer#begin_transaction)
@@ -391,11 +414,11 @@ module Kafka
         if buffer_size.zero?
           break
         elsif attempt <= @max_retries
-          @logger.warn "Failed to send all messages; attempting retry #{attempt} of #{@max_retries} after #{@retry_backoff}s"
+          @logger.warn "Failed to send all messages to #{pretty_partitions}; attempting retry #{attempt} of #{@max_retries} after #{@retry_backoff}s"
 
           sleep @retry_backoff
         else
-          @logger.error "Failed to send all messages; keeping remaining messages in buffer"
+          @logger.error "Failed to send all messages to #{pretty_partitions}; keeping remaining messages in buffer"
           break
         end
       end
@@ -407,12 +430,14 @@ module Kafka
       end
 
       unless @buffer.empty?
-
-
-        raise DeliveryFailed.new("Failed to send messages to #{partitions}", buffer_messages)
+        raise DeliveryFailed.new("Failed to send messages to #{pretty_partitions}", buffer_messages)
       end
     end
 
+    def pretty_partitions
+      @buffer.map {|topic, partition, _| "#{topic}/#{partition}" }.join(", ")
+    end
+
    def assign_partitions!
      failed_messages = []
      topics_with_failures = Set.new
```
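`send_offsets_to_transaction` rounds out the transactional API for the consume-transform-produce pattern: consumed offsets are committed atomically with the produced messages. A sketch of how it combines with the existing `transaction` helper; broker address, ids, topics, and the `transform` step are placeholders:

```ruby
require "kafka"

kafka = Kafka.new(["localhost:9092"], client_id: "my-app")

producer = kafka.producer(transactional: true, transactional_id: "my-transactional-id")
consumer = kafka.consumer(group_id: "my-group")
consumer.subscribe("source-topic")

consumer.each_batch(automatically_mark_as_processed: false) do |batch|
  producer.transaction do
    batch.messages.each do |message|
      producer.produce(transform(message.value), topic: "sink-topic") # transform is hypothetical
    end
    producer.deliver_messages
    # Commit the batch's offsets as part of the same transaction.
    producer.send_offsets_to_transaction(batch: batch, group_id: "my-group")
  end
end
```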
data/lib/kafka/prometheus.rb
ADDED
```diff
@@ -0,0 +1,316 @@
+# frozen_string_literal: true
+
+#
+# Subscriber to ruby_kafka to report metrics to prometheus
+#
+# Usage:
+#   require "kafka/prometheus"
+#
+# Once the file has been required, no further configuration is needed, all operational
+# metrics are automatically emitted (Unless PROMETHEUS_NO_AUTO_START is set).
+#
+# By Peter Mustel, T2 Data AB
+#
+begin
+  require 'prometheus/client'
+rescue LoadError
+  warn 'In order to report Kafka client metrics to Prometheus you need to install the `prometheus-client` gem.'
+  raise
+end
+
+require 'active_support/subscriber'
+
+module Kafka
+  module Prometheus
+    SIZE_BUCKETS = [1, 10, 100, 1000, 10_000, 100_000, 1_000_000].freeze
+    LATENCY_BUCKETS = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000].freeze
+    DELAY_BUCKETS = [1, 3, 10, 30, 100, 300, 1000, 3000, 10_000, 30_000].freeze
+
+    class << self
+      attr_accessor :registry
+
+      def start(registry = ::Prometheus::Client.registry)
+        @registry = registry
+        ConnectionSubscriber.attach_to 'connection.kafka'
+        ConsumerSubscriber.attach_to 'consumer.kafka'
+        ProducerSubscriber.attach_to 'producer.kafka'
+        AsyncProducerSubscriber.attach_to 'async_producer.kafka'
+        FetcherSubscriber.attach_to 'fetcher.kafka'
+      end
+    end
+
+    class ConnectionSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @api_calls = Prometheus.registry.counter(:api_calls, docstring: 'Total calls', labels: [:client, :api, :broker])
+        @api_latency = Prometheus.registry.histogram(:api_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :api, :broker])
+        @api_request_size = Prometheus.registry.histogram(:api_request_size, docstring: 'Request size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_response_size = Prometheus.registry.histogram(:api_response_size, docstring: 'Response size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_errors = Prometheus.registry.counter(:api_errors, docstring: 'Errors', labels: [:client, :api, :broker])
+      end
+
+      def request(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          api: event.payload.fetch(:api, 'unknown'),
+          broker: event.payload.fetch(:broker_host)
+        }
+        request_size = event.payload.fetch(:request_size, 0)
+        response_size = event.payload.fetch(:response_size, 0)
+
+        @api_calls.increment(labels: key)
+        @api_latency.observe(event.duration, labels: key)
+        @api_request_size.observe(request_size, labels: key)
+        @api_response_size.observe(response_size, labels: key)
+        @api_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+    end
+
+    class ConsumerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @process_messages = Prometheus.registry.counter(:consumer_process_messages, docstring: 'Total messages', labels: [:client, :group_id, :topic, :partition])
+        @process_message_errors = Prometheus.registry.counter(:consumer_process_message_errors, docstring: 'Total errors', labels: [:client, :group_id, :topic, :partition])
+        @process_message_latency =
+          Prometheus.registry.histogram(:consumer_process_message_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @offset_lag = Prometheus.registry.gauge(:consumer_offset_lag, docstring: 'Offset lag', labels: [:client, :group_id, :topic, :partition])
+        @time_lag = Prometheus.registry.gauge(:consumer_time_lag, docstring: 'Time lag of message', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_errors = Prometheus.registry.counter(:consumer_process_batch_errors, docstring: 'Total errors in batch', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_latency =
+          Prometheus.registry.histogram(:consumer_process_batch_latency, docstring: 'Latency in batch', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @batch_size = Prometheus.registry.histogram(:consumer_batch_size, docstring: 'Size of batch', buckets: SIZE_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @join_group = Prometheus.registry.histogram(:consumer_join_group, docstring: 'Time to join group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @join_group_errors = Prometheus.registry.counter(:consumer_join_group_errors, docstring: 'Total error in joining group', labels: [:client, :group_id])
+        @sync_group = Prometheus.registry.histogram(:consumer_sync_group, docstring: 'Time to sync group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @sync_group_errors = Prometheus.registry.counter(:consumer_sync_group_errors, docstring: 'Total error in syncing group', labels: [:client, :group_id])
+        @leave_group = Prometheus.registry.histogram(:consumer_leave_group, docstring: 'Time to leave group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @leave_group_errors = Prometheus.registry.counter(:consumer_leave_group_errors, docstring: 'Total error in leaving group', labels: [:client, :group_id])
+        @pause_duration = Prometheus.registry.gauge(:consumer_pause_duration, docstring: 'Pause duration', labels: [:client, :group_id, :topic, :partition])
+      end
+
+      def process_message(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        offset_lag = event.payload.fetch(:offset_lag)
+        create_time = event.payload.fetch(:create_time)
+
+        time_lag = create_time && ((Time.now - create_time) * 1000).to_i
+
+        if event.payload.key?(:exception)
+          @process_message_errors.increment(labels: key)
+        else
+          @process_message_latency.observe(event.duration, labels: key)
+          @process_messages.increment(labels: key)
+        end
+
+        @offset_lag.set(offset_lag, labels: key)
+
+        # Not all messages have timestamps.
+        return unless time_lag
+
+        @time_lag.set(time_lag, labels: key)
+      end
+
+      def process_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        message_count = event.payload.fetch(:message_count)
+
+        if event.payload.key?(:exception)
+          @process_batch_errors.increment(labels: key)
+        else
+          @process_batch_latency.observe(event.duration, labels: key)
+          @process_messages.increment(by: message_count, labels: key)
+        end
+      end
+
+      def fetch_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        offset_lag = event.payload.fetch(:offset_lag)
+        batch_size = event.payload.fetch(:message_count)
+
+        @batch_size.observe(batch_size, labels: key)
+        @offset_lag.set(offset_lag, labels: key)
+      end
+
+      def join_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @join_group.observe(event.duration, labels: key)
+
+        @join_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def sync_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @sync_group.observe(event.duration, labels: key)
+
+        @sync_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def leave_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @leave_group.observe(event.duration, labels: key)
+
+        @leave_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def pause_status(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        duration = event.payload.fetch(:duration)
+        @pause_duration.set(duration, labels: key)
+      end
+    end
+
+    class ProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @produce_messages = Prometheus.registry.counter(:producer_produced_messages, docstring: 'Produced messages total', labels: [:client, :topic])
+        @produce_message_size =
+          Prometheus.registry.histogram(:producer_message_size, docstring: 'Message size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @buffer_size = Prometheus.registry.histogram(:producer_buffer_size, docstring: 'Buffer size', buckets: SIZE_BUCKETS, labels: [:client])
+        @buffer_fill_ratio = Prometheus.registry.histogram(:producer_buffer_fill_ratio, docstring: 'Buffer fill ratio', labels: [:client])
+        @buffer_fill_percentage = Prometheus.registry.histogram(:producer_buffer_fill_percentage, docstring: 'Buffer fill percentage', labels: [:client])
+        @produce_errors = Prometheus.registry.counter(:producer_produce_errors, docstring: 'Produce errors', labels: [:client, :topic])
+        @deliver_errors = Prometheus.registry.counter(:producer_deliver_errors, docstring: 'Deliver error', labels: [:client])
+        @deliver_latency =
+          Prometheus.registry.histogram(:producer_deliver_latency, docstring: 'Delivery latency', buckets: LATENCY_BUCKETS, labels: [:client])
+        @deliver_messages = Prometheus.registry.counter(:producer_deliver_messages, docstring: 'Total count of delivered messages', labels: [:client])
+        @deliver_attempts = Prometheus.registry.histogram(:producer_deliver_attempts, docstring: 'Delivery attempts', labels: [:client])
+        @ack_messages = Prometheus.registry.counter(:producer_ack_messages, docstring: 'Ack', labels: [:client, :topic])
+        @ack_delay = Prometheus.registry.histogram(:producer_ack_delay, docstring: 'Ack delay', buckets: LATENCY_BUCKETS, labels: [:client, :topic])
+        @ack_errors = Prometheus.registry.counter(:producer_ack_errors, docstring: 'Ack errors', labels: [:client, :topic])
+      end
+
+      def produce_message(event)
+        client = event.payload.fetch(:client_id)
+        key = { client: client, topic: event.payload.fetch(:topic) }
+
+        message_size = event.payload.fetch(:message_size)
+        buffer_size = event.payload.fetch(:buffer_size)
+        max_buffer_size = event.payload.fetch(:max_buffer_size)
+        buffer_fill_ratio = buffer_size.to_f / max_buffer_size.to_f
+        buffer_fill_percentage = buffer_fill_ratio * 100.0
+
+        # This gets us the write rate.
+        @produce_messages.increment(labels: key)
+        @produce_message_size.observe(message_size, labels: key)
+
+        # This gets us the avg/max buffer size per producer.
+        @buffer_size.observe(buffer_size, labels: { client: client })
+
+        # This gets us the avg/max buffer fill ratio per producer.
+        @buffer_fill_ratio.observe(buffer_fill_ratio, labels: { client: client })
+        @buffer_fill_percentage.observe(buffer_fill_percentage, labels: { client: client })
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def deliver_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:delivered_message_count)
+        attempts = event.payload.fetch(:attempts)
+
+        @deliver_errors.increment(labels: key) if event.payload.key?(:exception)
+        @deliver_latency.observe(event.duration, labels: key)
+
+        # Messages delivered to Kafka:
+        @deliver_messages.increment(by: message_count, labels: key)
+
+        # Number of attempts to deliver messages:
+        @deliver_attempts.observe(attempts, labels: key)
+      end
+
+      def ack_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        # Number of messages ACK'd for the topic.
+        @ack_messages.increment(labels: key)
+
+        # Histogram of delay between a message being produced and it being ACK'd.
+        @ack_delay.observe(event.payload.fetch(:delay), labels: key)
+      end
+
+      def topic_error(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        @ack_errors.increment(labels: key)
+      end
+    end
+
+    class AsyncProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.histogram(:async_producer_queue_size, docstring: 'Queue size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @queue_fill_ratio = Prometheus.registry.histogram(:async_producer_queue_fill_ratio, docstring: 'Queue fill ratio', labels: [:client, :topic])
+        @produce_errors = Prometheus.registry.counter(:async_producer_produce_errors, docstring: 'Producer errors', labels: [:client, :topic])
+        @dropped_messages = Prometheus.registry.counter(:async_producer_dropped_messages, docstring: 'Dropped messages', labels: [:client])
+      end
+
+      def enqueue_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        queue_size = event.payload.fetch(:queue_size)
+        max_queue_size = event.payload.fetch(:max_queue_size)
+        queue_fill_ratio = queue_size.to_f / max_queue_size.to_f
+
+        # This gets us the avg/max queue size per producer.
+        @queue_size.observe(queue_size, labels: key)
+
+        # This gets us the avg/max queue fill ratio per producer.
+        @queue_fill_ratio.observe(queue_fill_ratio, labels: key)
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def drop_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:message_count)
+        @dropped_messages.increment(by: message_count, labels: key)
+      end
+    end
+
+    class FetcherSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.gauge(:fetcher_queue_size, docstring: 'Queue size', labels: [:client, :group_id])
+      end
+
+      def loop(event)
+        queue_size = event.payload.fetch(:queue_size)
+        client = event.payload.fetch(:client_id)
+        group_id = event.payload.fetch(:group_id)
+
+        @queue_size.set(queue_size, labels: { client: client, group_id: group_id })
+      end
+    end
+  end
+end
+
+# To enable testability, it is possible to skip the start until test time
+Kafka::Prometheus.start unless defined?(PROMETHEUS_NO_AUTO_START)
```
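As the header comment notes, requiring the file is enough to start reporting to the default Prometheus registry; defining `PROMETHEUS_NO_AUTO_START` defers that, which is useful in tests or when supplying a custom registry. A small sketch based on the code above:

```ruby
# Skip the auto-start, then attach the subscribers to a custom registry.
PROMETHEUS_NO_AUTO_START = true

require "kafka/prometheus"
require "prometheus/client"

registry = Prometheus::Client::Registry.new
Kafka::Prometheus.start(registry)

# Metrics such as :api_calls, :consumer_offset_lag and :producer_ack_delay are
# now registered on `registry` and can be exposed via prometheus-client's exporters.
```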
data/lib/kafka/protocol/add_offsets_to_txn_request.rb
ADDED
```diff
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module Kafka
+  module Protocol
+    class AddOffsetsToTxnRequest
+      def initialize(transactional_id: nil, producer_id:, producer_epoch:, group_id:)
+        @transactional_id = transactional_id
+        @producer_id = producer_id
+        @producer_epoch = producer_epoch
+        @group_id = group_id
+      end
+
+      def api_key
+        ADD_OFFSETS_TO_TXN_API
+      end
+
+      def response_class
+        AddOffsetsToTxnResponse
+      end
+
+      def encode(encoder)
+        encoder.write_string(@transactional_id.to_s)
+        encoder.write_int64(@producer_id)
+        encoder.write_int16(@producer_epoch)
+        encoder.write_string(@group_id)
+      end
+    end
+  end
+end
```
data/lib/kafka/protocol/add_offsets_to_txn_response.rb
ADDED
```diff
@@ -0,0 +1,19 @@
+module Kafka
+  module Protocol
+    class AddOffsetsToTxnResponse
+
+      attr_reader :error_code
+
+      def initialize(error_code:)
+        @error_code = error_code
+      end
+
+      def self.decode(decoder)
+        _throttle_time_ms = decoder.int32
+        error_code = decoder.int16
+        new(error_code: error_code)
+      end
+
+    end
+  end
+end
```
data/lib/kafka/protocol/join_group_request.rb
CHANGED
```diff
@@ -7,13 +7,14 @@ module Kafka
     class JoinGroupRequest
       PROTOCOL_TYPE = "consumer"
 
-      def initialize(group_id:, session_timeout:, member_id:, topics: [])
+      def initialize(group_id:, session_timeout:, rebalance_timeout:, member_id:, topics: [])
         @group_id = group_id
         @session_timeout = session_timeout * 1000 # Kafka wants ms.
+        @rebalance_timeout = rebalance_timeout * 1000 # Kafka wants ms.
         @member_id = member_id || ""
         @protocol_type = PROTOCOL_TYPE
         @group_protocols = {
-          "
+          "roundrobin" => ConsumerGroupProtocol.new(topics: topics),
         }
       end
 
@@ -21,6 +22,10 @@ module Kafka
         JOIN_GROUP_API
       end
 
+      def api_version
+        1
+      end
+
       def response_class
         JoinGroupResponse
       end
@@ -28,6 +33,7 @@ module Kafka
       def encode(encoder)
         encoder.write_string(@group_id)
         encoder.write_int32(@session_timeout)
+        encoder.write_int32(@rebalance_timeout)
         encoder.write_string(@member_id)
         encoder.write_string(@protocol_type)
 
```