ruby-kafka 0.7.6 → 1.0.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +102 -3
- data/.github/workflows/stale.yml +19 -0
- data/CHANGELOG.md +24 -0
- data/README.md +18 -0
- data/lib/kafka/async_producer.rb +3 -0
- data/lib/kafka/broker.rb +12 -0
- data/lib/kafka/client.rb +35 -3
- data/lib/kafka/cluster.rb +52 -0
- data/lib/kafka/compression.rb +13 -11
- data/lib/kafka/compressor.rb +1 -0
- data/lib/kafka/connection.rb +3 -0
- data/lib/kafka/consumer_group.rb +4 -1
- data/lib/kafka/datadog.rb +2 -10
- data/lib/kafka/fetched_batch.rb +5 -1
- data/lib/kafka/fetched_batch_generator.rb +4 -1
- data/lib/kafka/fetched_message.rb +1 -0
- data/lib/kafka/fetcher.rb +4 -1
- data/lib/kafka/gzip_codec.rb +4 -0
- data/lib/kafka/lz4_codec.rb +4 -0
- data/lib/kafka/producer.rb +20 -1
- data/lib/kafka/prometheus.rb +316 -0
- data/lib/kafka/protocol.rb +8 -0
- data/lib/kafka/protocol/add_offsets_to_txn_request.rb +29 -0
- data/lib/kafka/protocol/add_offsets_to_txn_response.rb +19 -0
- data/lib/kafka/protocol/join_group_request.rb +8 -2
- data/lib/kafka/protocol/offset_fetch_request.rb +3 -1
- data/lib/kafka/protocol/produce_request.rb +3 -1
- data/lib/kafka/protocol/record_batch.rb +5 -4
- data/lib/kafka/protocol/txn_offset_commit_request.rb +46 -0
- data/lib/kafka/protocol/txn_offset_commit_response.rb +18 -0
- data/lib/kafka/sasl/scram.rb +15 -12
- data/lib/kafka/snappy_codec.rb +4 -0
- data/lib/kafka/ssl_context.rb +4 -1
- data/lib/kafka/ssl_socket_with_timeout.rb +1 -0
- data/lib/kafka/tagged_logger.rb +25 -20
- data/lib/kafka/transaction_manager.rb +25 -0
- data/lib/kafka/version.rb +1 -1
- data/lib/kafka/zstd_codec.rb +27 -0
- data/ruby-kafka.gemspec +4 -2
- metadata +47 -6
data/lib/kafka/compressor.rb
CHANGED
data/lib/kafka/connection.rb
CHANGED
data/lib/kafka/consumer_group.rb
CHANGED
@@ -7,11 +7,12 @@ module Kafka
   class ConsumerGroup
     attr_reader :assigned_partitions, :generation_id, :group_id

-    def initialize(cluster:, logger:, group_id:, session_timeout:, retention_time:, instrumenter:)
+    def initialize(cluster:, logger:, group_id:, session_timeout:, rebalance_timeout:, retention_time:, instrumenter:)
       @cluster = cluster
       @logger = TaggedLogger.new(logger)
       @group_id = group_id
       @session_timeout = session_timeout
+      @rebalance_timeout = rebalance_timeout
       @instrumenter = instrumenter
       @member_id = ""
       @generation_id = nil
@@ -140,7 +141,9 @@ module Kafka
       response = coordinator.join_group(
         group_id: @group_id,
         session_timeout: @session_timeout,
+        rebalance_timeout: @rebalance_timeout,
         member_id: @member_id,
+        topics: @topics,
       )

       Protocol.handle_error(response.error_code)
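The extra `rebalance_timeout:` argument decouples how long a member may take to rejoin during a rebalance from the liveness `session_timeout`, and the subscribed `topics:` are now sent with the JoinGroup request. A minimal sketch of how the new knob might be used from the client API; the exact `Kafka::Client#consumer` signature is an assumption here, only the `ConsumerGroup` change above is shown by this diff:

require "kafka"

kafka = Kafka.new(["kafka1:9092"], client_id: "my-app")

# Assumed wiring: the consumer API forwards rebalance_timeout down to
# ConsumerGroup#initialize as added in this diff.
consumer = kafka.consumer(
  group_id: "my-group",
  session_timeout: 30,   # seconds of silence before the broker evicts a member
  rebalance_timeout: 60  # seconds the broker waits for members to rejoin a rebalance
)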
data/lib/kafka/datadog.rb
CHANGED
@@ -40,7 +40,7 @@ module Kafka
       end

       def host
-        @host ||= default_host
+        @host
       end

       def host=(host)
@@ -49,7 +49,7 @@ module Kafka
       end

       def port
-        @port ||= default_port
+        @port
       end

       def port=(port)
@@ -77,14 +77,6 @@ module Kafka

       private

-      def default_host
-        ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_HOST : ::Datadog::Statsd::DEFAULT_HOST
-      end
-
-      def default_port
-        ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_PORT : ::Datadog::Statsd::DEFAULT_PORT
-      end
-
       def clear
         @statsd && @statsd.close
         @statsd = nil
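With the `default_host`/`default_port` fallbacks removed, unset values now pass through to `Datadog::Statsd`, which is assumed to apply its own defaults. To pin the agent address explicitly, use the setters that remain (a sketch):

require "kafka/datadog"

# Explicit agent address; if left unset, the underlying Datadog::Statsd
# client falls back to its own defaults rather than ruby-kafka's.
Kafka::Datadog.host = "127.0.0.1"
Kafka::Datadog.port = 8125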
data/lib/kafka/fetched_batch.rb
CHANGED
@@ -13,18 +13,22 @@ module Kafka
     # @return [Integer]
     attr_reader :last_offset

+    # @return [Integer]
+    attr_reader :leader_epoch
+
     # @return [Integer] the offset of the most recent message in the partition.
     attr_reader :highwater_mark_offset

     # @return [Array<Kafka::FetchedMessage>]
     attr_accessor :messages

-    def initialize(topic:, partition:, highwater_mark_offset:, messages:, last_offset: nil)
+    def initialize(topic:, partition:, highwater_mark_offset:, messages:, last_offset: nil, leader_epoch: nil)
       @topic = topic
       @partition = partition
       @highwater_mark_offset = highwater_mark_offset
       @messages = messages
       @last_offset = last_offset
+      @leader_epoch = leader_epoch
     end

     def empty?
data/lib/kafka/fetched_batch_generator.rb
CHANGED
@@ -48,7 +48,7 @@ module Kafka
             partition: @fetched_partition.partition
           )
         end
-      end
+      end.compact
     end
     FetchedBatch.new(
       topic: @topic,
@@ -62,11 +62,13 @@ module Kafka
     def extract_records
       records = []
       last_offset = nil
+      leader_epoch = nil
       aborted_transactions = @fetched_partition.aborted_transactions.sort_by(&:first_offset)
       aborted_producer_ids = {}

       @fetched_partition.messages.each do |record_batch|
         last_offset = record_batch.last_offset if last_offset.nil? || last_offset < record_batch.last_offset
+        leader_epoch = record_batch.partition_leader_epoch if leader_epoch.nil? || leader_epoch < record_batch.partition_leader_epoch
         # Find the list of aborted producer IDs less than current offset
         unless aborted_transactions.empty?
           if aborted_transactions.first.first_offset <= record_batch.last_offset
@@ -99,6 +101,7 @@ module Kafka
         topic: @topic,
         partition: @fetched_partition.partition,
         last_offset: last_offset,
+        leader_epoch: leader_epoch,
         highwater_mark_offset: @fetched_partition.highwater_mark_offset,
         messages: records
       )
data/lib/kafka/fetcher.rb
CHANGED
@@ -17,6 +17,9 @@ module Kafka
       @commands = Queue.new
       @next_offsets = Hash.new { |h, k| h[k] = {} }

+      # We are only running when someone calls start.
+      @running = false
+
       # Long poll until at least this many bytes can be fetched.
       @min_bytes = 1

@@ -110,7 +113,7 @@ module Kafka
       elsif @queue.size < @max_queue_size
         step
       else
-        @logger.warn "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
+        @logger.info "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
         sleep 1
       end
     ensure
data/lib/kafka/gzip_codec.rb
CHANGED
data/lib/kafka/lz4_codec.rb
CHANGED
data/lib/kafka/producer.rb
CHANGED
@@ -68,6 +68,8 @@ module Kafka
   #
   # * `:snappy` for [Snappy](http://google.github.io/snappy/) compression.
   # * `:gzip` for [gzip](https://en.wikipedia.org/wiki/Gzip) compression.
+  # * `:lz4` for [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression.
+  # * `:zstd` for [zstd](https://facebook.github.io/zstd/) compression.
   #
   # By default, all message sets will be compressed if you specify a compression
   # codec. To increase the compression threshold, set `compression_threshold` to
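With `:lz4` and `:zstd` now documented alongside `:snappy` and `:gzip`, enabling a codec looks like this (a sketch; `kafka` is a `Kafka.new(...)` client, and the zstd codec is assumed to need its backing compression gem installed, as with the other codecs):

producer = kafka.producer(
  compression_codec: :zstd,
  compression_threshold: 10  # only compress message sets of 10+ messages
)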
@@ -186,11 +188,14 @@ module Kafka
   # @raise [BufferOverflow] if the maximum buffer size has been reached.
   # @return [nil]
   def produce(value, key: nil, headers: {}, topic:, partition: nil, partition_key: nil, create_time: Time.now)
+    # We want to fail fast if `topic` isn't a String
+    topic = topic.to_str
+
     message = PendingMessage.new(
       value: value && value.to_s,
       key: key && key.to_s,
       headers: headers,
-      topic: topic
+      topic: topic,
       partition: partition && Integer(partition),
       partition_key: partition_key && partition_key.to_s,
       create_time: create_time
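The new `topic.to_str` call makes `produce` fail fast on non-String topics instead of letting the bad value travel further down the pipeline. For example:

producer.produce("hello", topic: "greetings")  # fine: String responds to #to_str
producer.produce("hello", topic: :greetings)   # raises NoMethodError: Symbol has no #to_str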
@@ -328,6 +333,20 @@ module Kafka
     @transaction_manager.abort_transaction
   end

+  # Sends batch last offset to the consumer group coordinator, and also marks
+  # this offset as part of the current transaction. This offset will be considered
+  # committed only if the transaction is committed successfully.
+  #
+  # This method should be used when you need to batch consumed and produced messages
+  # together, typically in a consume-transform-produce pattern. Thus, the specified
+  # group_id should be the same as config parameter group_id of the used
+  # consumer.
+  #
+  # @return [nil]
+  def send_offsets_to_transaction(batch:, group_id:)
+    @transaction_manager.send_offsets_to_txn(offsets: { batch.topic => { batch.partition => { offset: batch.last_offset + 1, leader_epoch: batch.leader_epoch } } }, group_id: group_id)
+  end
+
   # Syntactic sugar to enable easier transaction usage. Do the following steps
   #
   # - Start the transaction (with Producer#begin_transaction)
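Combined with the `leader_epoch` now carried by `FetchedBatch`, this enables a consume-transform-produce loop whose input offsets commit atomically with its output. A sketch, assuming a producer created with a `transactional_id` and a consumer subscribed in the same `group_id` ("input", "output" and "my-group" are illustrative names):

consumer.subscribe("input")

consumer.each_batch do |batch|
  producer.transaction do
    batch.messages.each do |message|
      producer.produce(message.value, topic: "output")
    end
    producer.deliver_messages
    # Ties the batch's offsets to the transaction; they only become
    # committed if the transaction itself commits.
    producer.send_offsets_to_transaction(batch: batch, group_id: "my-group")
  end
end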
data/lib/kafka/prometheus.rb
ADDED
@@ -0,0 +1,316 @@
+# frozen_string_literal: true
+
+#
+# Subscriber to ruby_kafka to report metrics to prometheus
+#
+# Usage:
+#   require "kafka/prometheus"
+#
+# Once the file has been required, no further configuration is needed, all operational
+# metrics are automatically emitted (Unless PROMETHEUS_NO_AUTO_START is set).
+#
+# By Peter Mustel, T2 Data AB
+#
+begin
+  require 'prometheus/client'
+rescue LoadError
+  warn 'In order to report Kafka client metrics to Prometheus you need to install the `prometheus-client` gem.'
+  raise
+end
+
+require 'active_support/subscriber'
+
+module Kafka
+  module Prometheus
+    SIZE_BUCKETS = [1, 10, 100, 1000, 10_000, 100_000, 1_000_000].freeze
+    LATENCY_BUCKETS = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000].freeze
+    DELAY_BUCKETS = [1, 3, 10, 30, 100, 300, 1000, 3000, 10_000, 30_000].freeze
+
+    class << self
+      attr_accessor :registry
+
+      def start(registry = ::Prometheus::Client.registry)
+        @registry = registry
+        ConnectionSubscriber.attach_to 'connection.kafka'
+        ConsumerSubscriber.attach_to 'consumer.kafka'
+        ProducerSubscriber.attach_to 'producer.kafka'
+        AsyncProducerSubscriber.attach_to 'async_producer.kafka'
+        FetcherSubscriber.attach_to 'fetcher.kafka'
+      end
+    end
+
+    class ConnectionSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @api_calls = Prometheus.registry.counter(:api_calls, docstring: 'Total calls', labels: [:client, :api, :broker])
+        @api_latency = Prometheus.registry.histogram(:api_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :api, :broker])
+        @api_request_size = Prometheus.registry.histogram(:api_request_size, docstring: 'Request size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_response_size = Prometheus.registry.histogram(:api_response_size, docstring: 'Response size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_errors = Prometheus.registry.counter(:api_errors, docstring: 'Errors', labels: [:client, :api, :broker])
+      end
+
+      def request(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          api: event.payload.fetch(:api, 'unknown'),
+          broker: event.payload.fetch(:broker_host)
+        }
+        request_size = event.payload.fetch(:request_size, 0)
+        response_size = event.payload.fetch(:response_size, 0)
+
+        @api_calls.increment(labels: key)
+        @api_latency.observe(event.duration, labels: key)
+        @api_request_size.observe(request_size, labels: key)
+        @api_response_size.observe(response_size, labels: key)
+        @api_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+    end
+
+    class ConsumerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @process_messages = Prometheus.registry.counter(:consumer_process_messages, docstring: 'Total messages', labels: [:client, :group_id, :topic, :partition])
+        @process_message_errors = Prometheus.registry.counter(:consumer_process_message_errors, docstring: 'Total errors', labels: [:client, :group_id, :topic, :partition])
+        @process_message_latency =
+          Prometheus.registry.histogram(:consumer_process_message_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @offset_lag = Prometheus.registry.gauge(:consumer_offset_lag, docstring: 'Offset lag', labels: [:client, :group_id, :topic, :partition])
+        @time_lag = Prometheus.registry.gauge(:consumer_time_lag, docstring: 'Time lag of message', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_errors = Prometheus.registry.counter(:consumer_process_batch_errors, docstring: 'Total errors in batch', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_latency =
+          Prometheus.registry.histogram(:consumer_process_batch_latency, docstring: 'Latency in batch', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @batch_size = Prometheus.registry.histogram(:consumer_batch_size, docstring: 'Size of batch', buckets: SIZE_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @join_group = Prometheus.registry.histogram(:consumer_join_group, docstring: 'Time to join group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @join_group_errors = Prometheus.registry.counter(:consumer_join_group_errors, docstring: 'Total error in joining group', labels: [:client, :group_id])
+        @sync_group = Prometheus.registry.histogram(:consumer_sync_group, docstring: 'Time to sync group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @sync_group_errors = Prometheus.registry.counter(:consumer_sync_group_errors, docstring: 'Total error in syncing group', labels: [:client, :group_id])
+        @leave_group = Prometheus.registry.histogram(:consumer_leave_group, docstring: 'Time to leave group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @leave_group_errors = Prometheus.registry.counter(:consumer_leave_group_errors, docstring: 'Total error in leaving group', labels: [:client, :group_id])
+        @pause_duration = Prometheus.registry.gauge(:consumer_pause_duration, docstring: 'Pause duration', labels: [:client, :group_id, :topic, :partition])
+      end
+
+      def process_message(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        offset_lag = event.payload.fetch(:offset_lag)
+        create_time = event.payload.fetch(:create_time)
+
+        time_lag = create_time && ((Time.now - create_time) * 1000).to_i
+
+        if event.payload.key?(:exception)
+          @process_message_errors.increment(labels: key)
+        else
+          @process_message_latency.observe(event.duration, labels: key)
+          @process_messages.increment(labels: key)
+        end
+
+        @offset_lag.set(offset_lag, labels: key)
+
+        # Not all messages have timestamps.
+        return unless time_lag
+
+        @time_lag.set(time_lag, labels: key)
+      end
+
+      def process_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        message_count = event.payload.fetch(:message_count)
+
+        if event.payload.key?(:exception)
+          @process_batch_errors.increment(labels: key)
+        else
+          @process_batch_latency.observe(event.duration, labels: key)
+          @process_messages.increment(by: message_count, labels: key)
+        end
+      end
+
+      def fetch_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        offset_lag = event.payload.fetch(:offset_lag)
+        batch_size = event.payload.fetch(:message_count)
+
+        @batch_size.observe(batch_size, labels: key)
+        @offset_lag.set(offset_lag, labels: key)
+      end
+
+      def join_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @join_group.observe(event.duration, labels: key)
+
+        @join_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def sync_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @sync_group.observe(event.duration, labels: key)
+
+        @sync_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def leave_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @leave_group.observe(event.duration, labels: key)
+
+        @leave_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def pause_status(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        duration = event.payload.fetch(:duration)
+        @pause_duration.set(duration, labels: key)
+      end
+    end
+
+    class ProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @produce_messages = Prometheus.registry.counter(:producer_produced_messages, docstring: 'Produced messages total', labels: [:client, :topic])
+        @produce_message_size =
+          Prometheus.registry.histogram(:producer_message_size, docstring: 'Message size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @buffer_size = Prometheus.registry.histogram(:producer_buffer_size, docstring: 'Buffer size', buckets: SIZE_BUCKETS, labels: [:client])
+        @buffer_fill_ratio = Prometheus.registry.histogram(:producer_buffer_fill_ratio, docstring: 'Buffer fill ratio', labels: [:client])
+        @buffer_fill_percentage = Prometheus.registry.histogram(:producer_buffer_fill_percentage, docstring: 'Buffer fill percentage', labels: [:client])
+        @produce_errors = Prometheus.registry.counter(:producer_produce_errors, docstring: 'Produce errors', labels: [:client, :topic])
+        @deliver_errors = Prometheus.registry.counter(:producer_deliver_errors, docstring: 'Deliver error', labels: [:client])
+        @deliver_latency =
+          Prometheus.registry.histogram(:producer_deliver_latency, docstring: 'Delivery latency', buckets: LATENCY_BUCKETS, labels: [:client])
+        @deliver_messages = Prometheus.registry.counter(:producer_deliver_messages, docstring: 'Total count of delivered messages', labels: [:client])
+        @deliver_attempts = Prometheus.registry.histogram(:producer_deliver_attempts, docstring: 'Delivery attempts', labels: [:client])
+        @ack_messages = Prometheus.registry.counter(:producer_ack_messages, docstring: 'Ack', labels: [:client, :topic])
+        @ack_delay = Prometheus.registry.histogram(:producer_ack_delay, docstring: 'Ack delay', buckets: LATENCY_BUCKETS, labels: [:client, :topic])
+        @ack_errors = Prometheus.registry.counter(:producer_ack_errors, docstring: 'Ack errors', labels: [:client, :topic])
+      end
+
+      def produce_message(event)
+        client = event.payload.fetch(:client_id)
+        key = { client: client, topic: event.payload.fetch(:topic) }
+
+        message_size = event.payload.fetch(:message_size)
+        buffer_size = event.payload.fetch(:buffer_size)
+        max_buffer_size = event.payload.fetch(:max_buffer_size)
+        buffer_fill_ratio = buffer_size.to_f / max_buffer_size.to_f
+        buffer_fill_percentage = buffer_fill_ratio * 100.0
+
+        # This gets us the write rate.
+        @produce_messages.increment(labels: key)
+        @produce_message_size.observe(message_size, labels: key)
+
+        # This gets us the avg/max buffer size per producer.
+        @buffer_size.observe(buffer_size, labels: { client: client })
+
+        # This gets us the avg/max buffer fill ratio per producer.
+        @buffer_fill_ratio.observe(buffer_fill_ratio, labels: { client: client })
+        @buffer_fill_percentage.observe(buffer_fill_percentage, labels: { client: client })
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def deliver_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:delivered_message_count)
+        attempts = event.payload.fetch(:attempts)
+
+        @deliver_errors.increment(labels: key) if event.payload.key?(:exception)
+        @deliver_latency.observe(event.duration, labels: key)
+
+        # Messages delivered to Kafka:
+        @deliver_messages.increment(by: message_count, labels: key)
+
+        # Number of attempts to deliver messages:
+        @deliver_attempts.observe(attempts, labels: key)
+      end
+
+      def ack_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        # Number of messages ACK'd for the topic.
+        @ack_messages.increment(labels: key)
+
+        # Histogram of delay between a message being produced and it being ACK'd.
+        @ack_delay.observe(event.payload.fetch(:delay), labels: key)
+      end
+
+      def topic_error(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        @ack_errors.increment(labels: key)
+      end
+    end
+
+    class AsyncProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.histogram(:async_producer_queue_size, docstring: 'Queue size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @queue_fill_ratio = Prometheus.registry.histogram(:async_producer_queue_fill_ratio, docstring: 'Queue fill ratio', labels: [:client, :topic])
+        @produce_errors = Prometheus.registry.counter(:async_producer_produce_errors, docstring: 'Producer errors', labels: [:client, :topic])
+        @dropped_messages = Prometheus.registry.counter(:async_producer_dropped_messages, docstring: 'Dropped messages', labels: [:client])
+      end
+
+      def enqueue_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        queue_size = event.payload.fetch(:queue_size)
+        max_queue_size = event.payload.fetch(:max_queue_size)
+        queue_fill_ratio = queue_size.to_f / max_queue_size.to_f
+
+        # This gets us the avg/max queue size per producer.
+        @queue_size.observe(queue_size, labels: key)
+
+        # This gets us the avg/max queue fill ratio per producer.
+        @queue_fill_ratio.observe(queue_fill_ratio, labels: key)
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def drop_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:message_count)
+        @dropped_messages.increment(by: message_count, labels: key)
+      end
+    end
+
+    class FetcherSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.gauge(:fetcher_queue_size, docstring: 'Queue size', labels: [:client, :group_id])
+      end
+
+      def loop(event)
+        queue_size = event.payload.fetch(:queue_size)
+        client = event.payload.fetch(:client_id)
+        group_id = event.payload.fetch(:group_id)
+
+        @queue_size.set(queue_size, labels: { client: client, group_id: group_id })
+      end
+    end
+  end
+end
+
+# To enable testability, it is possible to skip the start until test time
+Kafka::Prometheus.start unless defined?(PROMETHEUS_NO_AUTO_START)
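Since the auto-start can be suppressed, tests or apps managing their own registry can opt in explicitly. A sketch based on the `start` method above:

# Define the constant before the require to skip the automatic start.
PROMETHEUS_NO_AUTO_START = true
require "kafka/prometheus"

# Later, attach all subscribers against a dedicated registry.
registry = Prometheus::Client::Registry.new
Kafka::Prometheus.start(registry)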