ruby-kafka 0.7.8 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.circleci/config.yml +135 -3
- data/.github/workflows/stale.yml +19 -0
- data/CHANGELOG.md +26 -0
- data/README.md +26 -0
- data/lib/kafka/async_producer.rb +3 -0
- data/lib/kafka/client.rb +49 -1
- data/lib/kafka/cluster.rb +52 -0
- data/lib/kafka/connection.rb +3 -0
- data/lib/kafka/consumer.rb +56 -11
- data/lib/kafka/consumer_group.rb +10 -1
- data/lib/kafka/datadog.rb +18 -11
- data/lib/kafka/fetched_batch_generator.rb +1 -1
- data/lib/kafka/fetcher.rb +5 -2
- data/lib/kafka/offset_manager.rb +12 -1
- data/lib/kafka/producer.rb +4 -1
- data/lib/kafka/prometheus.rb +316 -0
- data/lib/kafka/protocol/join_group_request.rb +8 -2
- data/lib/kafka/protocol/metadata_response.rb +1 -1
- data/lib/kafka/protocol/offset_fetch_request.rb +3 -1
- data/lib/kafka/protocol/record_batch.rb +5 -4
- data/lib/kafka/sasl/scram.rb +15 -12
- data/lib/kafka/ssl_context.rb +4 -2
- data/lib/kafka/tagged_logger.rb +25 -20
- data/lib/kafka/version.rb +1 -1
- data/ruby-kafka.gemspec +4 -3
- metadata +29 -7
data/lib/kafka/connection.rb
CHANGED
data/lib/kafka/consumer.rb
CHANGED
@@ -44,7 +44,7 @@ module Kafka
|
|
44
44
|
#
|
45
45
|
class Consumer
|
46
46
|
|
47
|
-
def initialize(cluster:, logger:, instrumenter:, group:, fetcher:, offset_manager:, session_timeout:, heartbeat:)
|
47
|
+
def initialize(cluster:, logger:, instrumenter:, group:, fetcher:, offset_manager:, session_timeout:, heartbeat:, refresh_topic_interval: 0)
|
48
48
|
@cluster = cluster
|
49
49
|
@logger = TaggedLogger.new(logger)
|
50
50
|
@instrumenter = instrumenter
|
@@ -53,6 +53,7 @@ module Kafka
|
|
53
53
|
@session_timeout = session_timeout
|
54
54
|
@fetcher = fetcher
|
55
55
|
@heartbeat = heartbeat
|
56
|
+
@refresh_topic_interval = refresh_topic_interval
|
56
57
|
|
57
58
|
@pauses = Hash.new {|h, k|
|
58
59
|
h[k] = Hash.new {|h2, k2|
|
@@ -73,6 +74,15 @@ module Kafka
|
|
73
74
|
# when user commits message other than last in a batch, this would make ruby-kafka refetch
|
74
75
|
# some already consumed messages
|
75
76
|
@current_offsets = Hash.new { |h, k| h[k] = {} }
|
77
|
+
|
78
|
+
# Map storing subscribed topics with their configuration
|
79
|
+
@subscribed_topics = Concurrent::Map.new
|
80
|
+
|
81
|
+
# Set storing topics that matched topics in @subscribed_topics
|
82
|
+
@matched_topics = Set.new
|
83
|
+
|
84
|
+
# Whether join_group must be executed again because new topics are added
|
85
|
+
@join_group_for_new_topics = false
|
76
86
|
end
|
77
87
|
|
78
88
|
# Subscribes the consumer to a topic.
|
@@ -97,13 +107,12 @@ module Kafka
|
|
97
107
|
def subscribe(topic_or_regex, default_offset: nil, start_from_beginning: true, max_bytes_per_partition: 1048576)
|
98
108
|
default_offset ||= start_from_beginning ? :earliest : :latest
|
99
109
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
end
|
110
|
+
@subscribed_topics[topic_or_regex] = {
|
111
|
+
default_offset: default_offset,
|
112
|
+
start_from_beginning: start_from_beginning,
|
113
|
+
max_bytes_per_partition: max_bytes_per_partition
|
114
|
+
}
|
115
|
+
scan_for_subscribing
|
107
116
|
|
108
117
|
nil
|
109
118
|
end
|
@@ -116,7 +125,6 @@ module Kafka
|
|
116
125
|
def stop
|
117
126
|
@running = false
|
118
127
|
@fetcher.stop
|
119
|
-
@cluster.disconnect
|
120
128
|
end
|
121
129
|
|
122
130
|
# Pause processing of a specific topic partition.
|
@@ -308,6 +316,7 @@ module Kafka
|
|
308
316
|
topic: batch.topic,
|
309
317
|
partition: batch.partition,
|
310
318
|
last_offset: batch.last_offset,
|
319
|
+
last_create_time: batch.messages.last.try(:create_time),
|
311
320
|
offset_lag: batch.offset_lag,
|
312
321
|
highwater_mark_offset: batch.highwater_mark_offset,
|
313
322
|
message_count: batch.messages.count,
|
@@ -401,6 +410,7 @@ module Kafka
|
|
401
410
|
while running?
|
402
411
|
begin
|
403
412
|
@instrumenter.instrument("loop.consumer") do
|
413
|
+
refresh_topic_list_if_enabled
|
404
414
|
yield
|
405
415
|
end
|
406
416
|
rescue HeartbeatError
|
@@ -432,6 +442,7 @@ module Kafka
|
|
432
442
|
# important that members explicitly tell Kafka when they're leaving.
|
433
443
|
make_final_offsets_commit!
|
434
444
|
@group.leave rescue nil
|
445
|
+
@cluster.disconnect
|
435
446
|
@running = false
|
436
447
|
@logger.pop_tags
|
437
448
|
end
|
@@ -452,6 +463,8 @@ module Kafka
|
|
452
463
|
end
|
453
464
|
|
454
465
|
def join_group
|
466
|
+
@join_group_for_new_topics = false
|
467
|
+
|
455
468
|
old_generation_id = @group.generation_id
|
456
469
|
|
457
470
|
@group.join
|
@@ -513,11 +526,19 @@ module Kafka
|
|
513
526
|
end
|
514
527
|
end
|
515
528
|
|
529
|
+
def refresh_topic_list_if_enabled
|
530
|
+
return if @refresh_topic_interval <= 0
|
531
|
+
return if @refreshed_at && @refreshed_at + @refresh_topic_interval > Time.now
|
532
|
+
|
533
|
+
scan_for_subscribing
|
534
|
+
@refreshed_at = Time.now
|
535
|
+
end
|
536
|
+
|
516
537
|
def fetch_batches
|
517
538
|
# Return early if the consumer has been stopped.
|
518
539
|
return [] if shutting_down?
|
519
540
|
|
520
|
-
join_group
|
541
|
+
join_group if !@group.member? || @join_group_for_new_topics
|
521
542
|
|
522
543
|
trigger_heartbeat
|
523
544
|
|
@@ -525,7 +546,7 @@ module Kafka
|
|
525
546
|
|
526
547
|
if !@fetcher.data?
|
527
548
|
@logger.debug "No batches to process"
|
528
|
-
sleep 2
|
549
|
+
sleep(@fetcher.max_wait_time || 2)
|
529
550
|
[]
|
530
551
|
else
|
531
552
|
tag, message = @fetcher.poll
|
@@ -571,10 +592,34 @@ module Kafka
|
|
571
592
|
end
|
572
593
|
end
|
573
594
|
|
595
|
+
def scan_for_subscribing
|
596
|
+
@subscribed_topics.each do |topic_or_regex, config|
|
597
|
+
default_offset = config.fetch(:default_offset)
|
598
|
+
start_from_beginning = config.fetch(:start_from_beginning)
|
599
|
+
max_bytes_per_partition = config.fetch(:max_bytes_per_partition)
|
600
|
+
if topic_or_regex.is_a?(Regexp)
|
601
|
+
subscribe_to_regex(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
|
602
|
+
else
|
603
|
+
subscribe_to_topic(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
|
604
|
+
end
|
605
|
+
end
|
606
|
+
end
|
607
|
+
|
608
|
+
def subscribe_to_regex(topic_regex, default_offset, start_from_beginning, max_bytes_per_partition)
|
609
|
+
cluster_topics.select { |topic| topic =~ topic_regex }.each do |topic|
|
610
|
+
subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
|
611
|
+
end
|
612
|
+
end
|
613
|
+
|
574
614
|
def subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
|
615
|
+
return if @matched_topics.include?(topic)
|
616
|
+
@matched_topics.add(topic)
|
617
|
+
@join_group_for_new_topics = true
|
618
|
+
|
575
619
|
@group.subscribe(topic)
|
576
620
|
@offset_manager.set_default_offset(topic, default_offset)
|
577
621
|
@fetcher.subscribe(topic, max_bytes_per_partition: max_bytes_per_partition)
|
622
|
+
@cluster.mark_as_stale!
|
578
623
|
end
|
579
624
|
|
580
625
|
def cluster_topics
|
data/lib/kafka/consumer_group.rb
CHANGED
@@ -7,11 +7,12 @@ module Kafka
|
|
7
7
|
class ConsumerGroup
|
8
8
|
attr_reader :assigned_partitions, :generation_id, :group_id
|
9
9
|
|
10
|
-
def initialize(cluster:, logger:, group_id:, session_timeout:, retention_time:, instrumenter:)
|
10
|
+
def initialize(cluster:, logger:, group_id:, session_timeout:, rebalance_timeout:, retention_time:, instrumenter:)
|
11
11
|
@cluster = cluster
|
12
12
|
@logger = TaggedLogger.new(logger)
|
13
13
|
@group_id = group_id
|
14
14
|
@session_timeout = session_timeout
|
15
|
+
@rebalance_timeout = rebalance_timeout
|
15
16
|
@instrumenter = instrumenter
|
16
17
|
@member_id = ""
|
17
18
|
@generation_id = nil
|
@@ -140,7 +141,9 @@ module Kafka
|
|
140
141
|
response = coordinator.join_group(
|
141
142
|
group_id: @group_id,
|
142
143
|
session_timeout: @session_timeout,
|
144
|
+
rebalance_timeout: @rebalance_timeout,
|
143
145
|
member_id: @member_id,
|
146
|
+
topics: @topics,
|
144
147
|
)
|
145
148
|
|
146
149
|
Protocol.handle_error(response.error_code)
|
@@ -158,6 +161,12 @@ module Kafka
|
|
158
161
|
@member_id = ""
|
159
162
|
sleep 1
|
160
163
|
|
164
|
+
retry
|
165
|
+
rescue CoordinatorLoadInProgress
|
166
|
+
@logger.error "Coordinator broker still loading, retrying in 1s..."
|
167
|
+
|
168
|
+
sleep 1
|
169
|
+
|
161
170
|
retry
|
162
171
|
end
|
163
172
|
|
data/lib/kafka/datadog.rb
CHANGED
@@ -31,7 +31,7 @@ module Kafka
|
|
31
31
|
|
32
32
|
class << self
|
33
33
|
def statsd
|
34
|
-
@statsd ||= ::Datadog::Statsd.new(host, port, namespace: namespace, tags: tags)
|
34
|
+
@statsd ||= ::Datadog::Statsd.new(host, port, namespace: namespace, tags: tags, socket_path: socket_path)
|
35
35
|
end
|
36
36
|
|
37
37
|
def statsd=(statsd)
|
@@ -40,7 +40,7 @@ module Kafka
|
|
40
40
|
end
|
41
41
|
|
42
42
|
def host
|
43
|
-
@host
|
43
|
+
@host
|
44
44
|
end
|
45
45
|
|
46
46
|
def host=(host)
|
@@ -49,7 +49,7 @@ module Kafka
|
|
49
49
|
end
|
50
50
|
|
51
51
|
def port
|
52
|
-
@port
|
52
|
+
@port
|
53
53
|
end
|
54
54
|
|
55
55
|
def port=(port)
|
@@ -57,6 +57,15 @@ module Kafka
|
|
57
57
|
clear
|
58
58
|
end
|
59
59
|
|
60
|
+
def socket_path
|
61
|
+
@socket_path
|
62
|
+
end
|
63
|
+
|
64
|
+
def socket_path=(socket_path)
|
65
|
+
@socket_path = socket_path
|
66
|
+
clear
|
67
|
+
end
|
68
|
+
|
60
69
|
def namespace
|
61
70
|
@namespace ||= STATSD_NAMESPACE
|
62
71
|
end
|
@@ -77,14 +86,6 @@ module Kafka
|
|
77
86
|
|
78
87
|
private
|
79
88
|
|
80
|
-
def default_host
|
81
|
-
::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_HOST : ::Datadog::Statsd::DEFAULT_HOST
|
82
|
-
end
|
83
|
-
|
84
|
-
def default_port
|
85
|
-
::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_PORT : ::Datadog::Statsd::DEFAULT_PORT
|
86
|
-
end
|
87
|
-
|
88
89
|
def clear
|
89
90
|
@statsd && @statsd.close
|
90
91
|
@statsd = nil
|
@@ -168,6 +169,8 @@ module Kafka
|
|
168
169
|
def process_batch(event)
|
169
170
|
offset = event.payload.fetch(:last_offset)
|
170
171
|
messages = event.payload.fetch(:message_count)
|
172
|
+
create_time = event.payload.fetch(:last_create_time)
|
173
|
+
time_lag = create_time && ((Time.now - create_time) * 1000).to_i
|
171
174
|
|
172
175
|
tags = {
|
173
176
|
client: event.payload.fetch(:client_id),
|
@@ -184,6 +187,10 @@ module Kafka
|
|
184
187
|
end
|
185
188
|
|
186
189
|
gauge("consumer.offset", offset, tags: tags)
|
190
|
+
|
191
|
+
if time_lag
|
192
|
+
gauge("consumer.time_lag", time_lag, tags: tags)
|
193
|
+
end
|
187
194
|
end
|
188
195
|
|
189
196
|
def fetch_batch(event)
|
data/lib/kafka/fetcher.rb
CHANGED
@@ -4,7 +4,7 @@ require "kafka/fetch_operation"
|
|
4
4
|
|
5
5
|
module Kafka
|
6
6
|
class Fetcher
|
7
|
-
attr_reader :queue
|
7
|
+
attr_reader :queue, :max_wait_time
|
8
8
|
|
9
9
|
def initialize(cluster:, logger:, instrumenter:, max_queue_size:, group:)
|
10
10
|
@cluster = cluster
|
@@ -17,6 +17,9 @@ module Kafka
|
|
17
17
|
@commands = Queue.new
|
18
18
|
@next_offsets = Hash.new { |h, k| h[k] = {} }
|
19
19
|
|
20
|
+
# We are only running when someone calls start.
|
21
|
+
@running = false
|
22
|
+
|
20
23
|
# Long poll until at least this many bytes can be fetched.
|
21
24
|
@min_bytes = 1
|
22
25
|
|
@@ -110,7 +113,7 @@ module Kafka
|
|
110
113
|
elsif @queue.size < @max_queue_size
|
111
114
|
step
|
112
115
|
else
|
113
|
-
@logger.
|
116
|
+
@logger.info "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
|
114
117
|
sleep 1
|
115
118
|
end
|
116
119
|
ensure
|
data/lib/kafka/offset_manager.rb
CHANGED
@@ -50,9 +50,20 @@ module Kafka
|
|
50
50
|
# @param offset [Integer] the offset of the message that should be marked as processed.
|
51
51
|
# @return [nil]
|
52
52
|
def mark_as_processed(topic, partition, offset)
|
53
|
-
@
|
53
|
+
unless @group.assigned_to?(topic, partition)
|
54
|
+
@logger.debug "Not marking #{topic}/#{partition}:#{offset} as processed for partition not assigned to this consumer."
|
55
|
+
return
|
56
|
+
end
|
54
57
|
@processed_offsets[topic] ||= {}
|
55
58
|
|
59
|
+
last_processed_offset = @processed_offsets[topic][partition] || -1
|
60
|
+
if last_processed_offset > offset + 1
|
61
|
+
@logger.debug "Not overwriting newer offset #{topic}/#{partition}:#{last_processed_offset - 1} with older #{offset}"
|
62
|
+
return
|
63
|
+
end
|
64
|
+
|
65
|
+
@uncommitted_offsets += 1
|
66
|
+
|
56
67
|
# The committed offset should always be the offset of the next message that the
|
57
68
|
# application will read, thus adding one to the last message processed.
|
58
69
|
@processed_offsets[topic][partition] = offset + 1
|
data/lib/kafka/producer.rb
CHANGED
@@ -188,11 +188,14 @@ module Kafka
|
|
188
188
|
# @raise [BufferOverflow] if the maximum buffer size has been reached.
|
189
189
|
# @return [nil]
|
190
190
|
def produce(value, key: nil, headers: {}, topic:, partition: nil, partition_key: nil, create_time: Time.now)
|
191
|
+
# We want to fail fast if `topic` isn't a String
|
192
|
+
topic = topic.to_str
|
193
|
+
|
191
194
|
message = PendingMessage.new(
|
192
195
|
value: value && value.to_s,
|
193
196
|
key: key && key.to_s,
|
194
197
|
headers: headers,
|
195
|
-
topic: topic
|
198
|
+
topic: topic,
|
196
199
|
partition: partition && Integer(partition),
|
197
200
|
partition_key: partition_key && partition_key.to_s,
|
198
201
|
create_time: create_time
|
@@ -0,0 +1,316 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# Subscriber to ruby_kafka to report metrics to prometheus
|
5
|
+
#
|
6
|
+
# Usage:
|
7
|
+
# require "kafka/prometheus"
|
8
|
+
#
|
9
|
+
# Once the file has been required, no further configuration is needed, all operational
|
10
|
+
# metrics are automatically emitted (Unless PROMETHEUS_NO_AUTO_START is set).
|
11
|
+
#
|
12
|
+
# By Peter Mustel, T2 Data AB
|
13
|
+
#
|
14
|
+
begin
|
15
|
+
require 'prometheus/client'
|
16
|
+
rescue LoadError
|
17
|
+
warn 'In order to report Kafka client metrics to Prometheus you need to install the `prometheus-client` gem.'
|
18
|
+
raise
|
19
|
+
end
|
20
|
+
|
21
|
+
require 'active_support/subscriber'
|
22
|
+
|
23
|
+
module Kafka
|
24
|
+
module Prometheus
|
25
|
+
SIZE_BUCKETS = [1, 10, 100, 1000, 10_000, 100_000, 1_000_000].freeze
|
26
|
+
LATENCY_BUCKETS = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000].freeze
|
27
|
+
DELAY_BUCKETS = [1, 3, 10, 30, 100, 300, 1000, 3000, 10_000, 30_000].freeze
|
28
|
+
|
29
|
+
class << self
|
30
|
+
attr_accessor :registry
|
31
|
+
|
32
|
+
def start(registry = ::Prometheus::Client.registry)
|
33
|
+
@registry = registry
|
34
|
+
ConnectionSubscriber.attach_to 'connection.kafka'
|
35
|
+
ConsumerSubscriber.attach_to 'consumer.kafka'
|
36
|
+
ProducerSubscriber.attach_to 'producer.kafka'
|
37
|
+
AsyncProducerSubscriber.attach_to 'async_producer.kafka'
|
38
|
+
FetcherSubscriber.attach_to 'fetcher.kafka'
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
class ConnectionSubscriber < ActiveSupport::Subscriber
|
43
|
+
def initialize
|
44
|
+
super
|
45
|
+
@api_calls = Prometheus.registry.counter(:api_calls, docstring: 'Total calls', labels: [:client, :api, :broker])
|
46
|
+
@api_latency = Prometheus.registry.histogram(:api_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :api, :broker])
|
47
|
+
@api_request_size = Prometheus.registry.histogram(:api_request_size, docstring: 'Request size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
|
48
|
+
@api_response_size = Prometheus.registry.histogram(:api_response_size, docstring: 'Response size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
|
49
|
+
@api_errors = Prometheus.registry.counter(:api_errors, docstring: 'Errors', labels: [:client, :api, :broker])
|
50
|
+
end
|
51
|
+
|
52
|
+
def request(event)
|
53
|
+
key = {
|
54
|
+
client: event.payload.fetch(:client_id),
|
55
|
+
api: event.payload.fetch(:api, 'unknown'),
|
56
|
+
broker: event.payload.fetch(:broker_host)
|
57
|
+
}
|
58
|
+
request_size = event.payload.fetch(:request_size, 0)
|
59
|
+
response_size = event.payload.fetch(:response_size, 0)
|
60
|
+
|
61
|
+
@api_calls.increment(labels: key)
|
62
|
+
@api_latency.observe(event.duration, labels: key)
|
63
|
+
@api_request_size.observe(request_size, labels: key)
|
64
|
+
@api_response_size.observe(response_size, labels: key)
|
65
|
+
@api_errors.increment(labels: key) if event.payload.key?(:exception)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
class ConsumerSubscriber < ActiveSupport::Subscriber
|
70
|
+
def initialize
|
71
|
+
super
|
72
|
+
@process_messages = Prometheus.registry.counter(:consumer_process_messages, docstring: 'Total messages', labels: [:client, :group_id, :topic, :partition])
|
73
|
+
@process_message_errors = Prometheus.registry.counter(:consumer_process_message_errors, docstring: 'Total errors', labels: [:client, :group_id, :topic, :partition])
|
74
|
+
@process_message_latency =
|
75
|
+
Prometheus.registry.histogram(:consumer_process_message_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
|
76
|
+
@offset_lag = Prometheus.registry.gauge(:consumer_offset_lag, docstring: 'Offset lag', labels: [:client, :group_id, :topic, :partition])
|
77
|
+
@time_lag = Prometheus.registry.gauge(:consumer_time_lag, docstring: 'Time lag of message', labels: [:client, :group_id, :topic, :partition])
|
78
|
+
@process_batch_errors = Prometheus.registry.counter(:consumer_process_batch_errors, docstring: 'Total errors in batch', labels: [:client, :group_id, :topic, :partition])
|
79
|
+
@process_batch_latency =
|
80
|
+
Prometheus.registry.histogram(:consumer_process_batch_latency, docstring: 'Latency in batch', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
|
81
|
+
@batch_size = Prometheus.registry.histogram(:consumer_batch_size, docstring: 'Size of batch', buckets: SIZE_BUCKETS, labels: [:client, :group_id, :topic, :partition])
|
82
|
+
@join_group = Prometheus.registry.histogram(:consumer_join_group, docstring: 'Time to join group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
|
83
|
+
@join_group_errors = Prometheus.registry.counter(:consumer_join_group_errors, docstring: 'Total error in joining group', labels: [:client, :group_id])
|
84
|
+
@sync_group = Prometheus.registry.histogram(:consumer_sync_group, docstring: 'Time to sync group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
|
85
|
+
@sync_group_errors = Prometheus.registry.counter(:consumer_sync_group_errors, docstring: 'Total error in syncing group', labels: [:client, :group_id])
|
86
|
+
@leave_group = Prometheus.registry.histogram(:consumer_leave_group, docstring: 'Time to leave group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
|
87
|
+
@leave_group_errors = Prometheus.registry.counter(:consumer_leave_group_errors, docstring: 'Total error in leaving group', labels: [:client, :group_id])
|
88
|
+
@pause_duration = Prometheus.registry.gauge(:consumer_pause_duration, docstring: 'Pause duration', labels: [:client, :group_id, :topic, :partition])
|
89
|
+
end
|
90
|
+
|
91
|
+
def process_message(event)
|
92
|
+
key = {
|
93
|
+
client: event.payload.fetch(:client_id),
|
94
|
+
group_id: event.payload.fetch(:group_id),
|
95
|
+
topic: event.payload.fetch(:topic),
|
96
|
+
partition: event.payload.fetch(:partition)
|
97
|
+
}
|
98
|
+
|
99
|
+
offset_lag = event.payload.fetch(:offset_lag)
|
100
|
+
create_time = event.payload.fetch(:create_time)
|
101
|
+
|
102
|
+
time_lag = create_time && ((Time.now - create_time) * 1000).to_i
|
103
|
+
|
104
|
+
if event.payload.key?(:exception)
|
105
|
+
@process_message_errors.increment(labels: key)
|
106
|
+
else
|
107
|
+
@process_message_latency.observe(event.duration, labels: key)
|
108
|
+
@process_messages.increment(labels: key)
|
109
|
+
end
|
110
|
+
|
111
|
+
@offset_lag.set(offset_lag, labels: key)
|
112
|
+
|
113
|
+
# Not all messages have timestamps.
|
114
|
+
return unless time_lag
|
115
|
+
|
116
|
+
@time_lag.set(time_lag, labels: key)
|
117
|
+
end
|
118
|
+
|
119
|
+
def process_batch(event)
|
120
|
+
key = {
|
121
|
+
client: event.payload.fetch(:client_id),
|
122
|
+
group_id: event.payload.fetch(:group_id),
|
123
|
+
topic: event.payload.fetch(:topic),
|
124
|
+
partition: event.payload.fetch(:partition)
|
125
|
+
}
|
126
|
+
message_count = event.payload.fetch(:message_count)
|
127
|
+
|
128
|
+
if event.payload.key?(:exception)
|
129
|
+
@process_batch_errors.increment(labels: key)
|
130
|
+
else
|
131
|
+
@process_batch_latency.observe(event.duration, labels: key)
|
132
|
+
@process_messages.increment(by: message_count, labels: key)
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
def fetch_batch(event)
|
137
|
+
key = {
|
138
|
+
client: event.payload.fetch(:client_id),
|
139
|
+
group_id: event.payload.fetch(:group_id),
|
140
|
+
topic: event.payload.fetch(:topic),
|
141
|
+
partition: event.payload.fetch(:partition)
|
142
|
+
}
|
143
|
+
offset_lag = event.payload.fetch(:offset_lag)
|
144
|
+
batch_size = event.payload.fetch(:message_count)
|
145
|
+
|
146
|
+
@batch_size.observe(batch_size, labels: key)
|
147
|
+
@offset_lag.set(offset_lag, labels: key)
|
148
|
+
end
|
149
|
+
|
150
|
+
def join_group(event)
|
151
|
+
key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
|
152
|
+
@join_group.observe(event.duration, labels: key)
|
153
|
+
|
154
|
+
@join_group_errors.increment(labels: key) if event.payload.key?(:exception)
|
155
|
+
end
|
156
|
+
|
157
|
+
def sync_group(event)
|
158
|
+
key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
|
159
|
+
@sync_group.observe(event.duration, labels: key)
|
160
|
+
|
161
|
+
@sync_group_errors.increment(labels: key) if event.payload.key?(:exception)
|
162
|
+
end
|
163
|
+
|
164
|
+
def leave_group(event)
|
165
|
+
key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
|
166
|
+
@leave_group.observe(event.duration, labels: key)
|
167
|
+
|
168
|
+
@leave_group_errors.increment(labels: key) if event.payload.key?(:exception)
|
169
|
+
end
|
170
|
+
|
171
|
+
def pause_status(event)
|
172
|
+
key = {
|
173
|
+
client: event.payload.fetch(:client_id),
|
174
|
+
group_id: event.payload.fetch(:group_id),
|
175
|
+
topic: event.payload.fetch(:topic),
|
176
|
+
partition: event.payload.fetch(:partition)
|
177
|
+
}
|
178
|
+
|
179
|
+
duration = event.payload.fetch(:duration)
|
180
|
+
@pause_duration.set(duration, labels: key)
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
class ProducerSubscriber < ActiveSupport::Subscriber
|
185
|
+
def initialize
|
186
|
+
super
|
187
|
+
@produce_messages = Prometheus.registry.counter(:producer_produced_messages, docstring: 'Produced messages total', labels: [:client, :topic])
|
188
|
+
@produce_message_size =
|
189
|
+
Prometheus.registry.histogram(:producer_message_size, docstring: 'Message size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
|
190
|
+
@buffer_size = Prometheus.registry.histogram(:producer_buffer_size, docstring: 'Buffer size', buckets: SIZE_BUCKETS, labels: [:client])
|
191
|
+
@buffer_fill_ratio = Prometheus.registry.histogram(:producer_buffer_fill_ratio, docstring: 'Buffer fill ratio', labels: [:client])
|
192
|
+
@buffer_fill_percentage = Prometheus.registry.histogram(:producer_buffer_fill_percentage, docstring: 'Buffer fill percentage', labels: [:client])
|
193
|
+
@produce_errors = Prometheus.registry.counter(:producer_produce_errors, docstring: 'Produce errors', labels: [:client, :topic])
|
194
|
+
@deliver_errors = Prometheus.registry.counter(:producer_deliver_errors, docstring: 'Deliver error', labels: [:client])
|
195
|
+
@deliver_latency =
|
196
|
+
Prometheus.registry.histogram(:producer_deliver_latency, docstring: 'Delivery latency', buckets: LATENCY_BUCKETS, labels: [:client])
|
197
|
+
@deliver_messages = Prometheus.registry.counter(:producer_deliver_messages, docstring: 'Total count of delivered messages', labels: [:client])
|
198
|
+
@deliver_attempts = Prometheus.registry.histogram(:producer_deliver_attempts, docstring: 'Delivery attempts', labels: [:client])
|
199
|
+
@ack_messages = Prometheus.registry.counter(:producer_ack_messages, docstring: 'Ack', labels: [:client, :topic])
|
200
|
+
@ack_delay = Prometheus.registry.histogram(:producer_ack_delay, docstring: 'Ack delay', buckets: LATENCY_BUCKETS, labels: [:client, :topic])
|
201
|
+
@ack_errors = Prometheus.registry.counter(:producer_ack_errors, docstring: 'Ack errors', labels: [:client, :topic])
|
202
|
+
end
|
203
|
+
|
204
|
+
def produce_message(event)
|
205
|
+
client = event.payload.fetch(:client_id)
|
206
|
+
key = { client: client, topic: event.payload.fetch(:topic) }
|
207
|
+
|
208
|
+
message_size = event.payload.fetch(:message_size)
|
209
|
+
buffer_size = event.payload.fetch(:buffer_size)
|
210
|
+
max_buffer_size = event.payload.fetch(:max_buffer_size)
|
211
|
+
buffer_fill_ratio = buffer_size.to_f / max_buffer_size.to_f
|
212
|
+
buffer_fill_percentage = buffer_fill_ratio * 100.0
|
213
|
+
|
214
|
+
# This gets us the write rate.
|
215
|
+
@produce_messages.increment(labels: key)
|
216
|
+
@produce_message_size.observe(message_size, labels: key)
|
217
|
+
|
218
|
+
# This gets us the avg/max buffer size per producer.
|
219
|
+
@buffer_size.observe(buffer_size, labels: { client: client })
|
220
|
+
|
221
|
+
# This gets us the avg/max buffer fill ratio per producer.
|
222
|
+
@buffer_fill_ratio.observe(buffer_fill_ratio, labels: { client: client })
|
223
|
+
@buffer_fill_percentage.observe(buffer_fill_percentage, labels: { client: client })
|
224
|
+
end
|
225
|
+
|
226
|
+
def buffer_overflow(event)
|
227
|
+
key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
|
228
|
+
@produce_errors.increment(labels: key)
|
229
|
+
end
|
230
|
+
|
231
|
+
def deliver_messages(event)
|
232
|
+
key = { client: event.payload.fetch(:client_id) }
|
233
|
+
message_count = event.payload.fetch(:delivered_message_count)
|
234
|
+
attempts = event.payload.fetch(:attempts)
|
235
|
+
|
236
|
+
@deliver_errors.increment(labels: key) if event.payload.key?(:exception)
|
237
|
+
@deliver_latency.observe(event.duration, labels: key)
|
238
|
+
|
239
|
+
# Messages delivered to Kafka:
|
240
|
+
@deliver_messages.increment(by: message_count, labels: key)
|
241
|
+
|
242
|
+
# Number of attempts to deliver messages:
|
243
|
+
@deliver_attempts.observe(attempts, labels: key)
|
244
|
+
end
|
245
|
+
|
246
|
+
def ack_message(event)
|
247
|
+
key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
|
248
|
+
|
249
|
+
# Number of messages ACK'd for the topic.
|
250
|
+
@ack_messages.increment(labels: key)
|
251
|
+
|
252
|
+
# Histogram of delay between a message being produced and it being ACK'd.
|
253
|
+
@ack_delay.observe(event.payload.fetch(:delay), labels: key)
|
254
|
+
end
|
255
|
+
|
256
|
+
def topic_error(event)
|
257
|
+
key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
|
258
|
+
|
259
|
+
@ack_errors.increment(labels: key)
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
class AsyncProducerSubscriber < ActiveSupport::Subscriber
|
264
|
+
def initialize
|
265
|
+
super
|
266
|
+
@queue_size = Prometheus.registry.histogram(:async_producer_queue_size, docstring: 'Queue size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
|
267
|
+
@queue_fill_ratio = Prometheus.registry.histogram(:async_producer_queue_fill_ratio, docstring: 'Queue fill ratio', labels: [:client, :topic])
|
268
|
+
@produce_errors = Prometheus.registry.counter(:async_producer_produce_errors, docstring: 'Producer errors', labels: [:client, :topic])
|
269
|
+
@dropped_messages = Prometheus.registry.counter(:async_producer_dropped_messages, docstring: 'Dropped messages', labels: [:client])
|
270
|
+
end
|
271
|
+
|
272
|
+
def enqueue_message(event)
|
273
|
+
key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
|
274
|
+
|
275
|
+
queue_size = event.payload.fetch(:queue_size)
|
276
|
+
max_queue_size = event.payload.fetch(:max_queue_size)
|
277
|
+
queue_fill_ratio = queue_size.to_f / max_queue_size.to_f
|
278
|
+
|
279
|
+
# This gets us the avg/max queue size per producer.
|
280
|
+
@queue_size.observe(queue_size, labels: key)
|
281
|
+
|
282
|
+
# This gets us the avg/max queue fill ratio per producer.
|
283
|
+
@queue_fill_ratio.observe(queue_fill_ratio, labels: key)
|
284
|
+
end
|
285
|
+
|
286
|
+
def buffer_overflow(event)
|
287
|
+
key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
|
288
|
+
@produce_errors.increment(labels: key)
|
289
|
+
end
|
290
|
+
|
291
|
+
def drop_messages(event)
|
292
|
+
key = { client: event.payload.fetch(:client_id) }
|
293
|
+
message_count = event.payload.fetch(:message_count)
|
294
|
+
@dropped_messages.increment(by: message_count, labels: key)
|
295
|
+
end
|
296
|
+
end
|
297
|
+
|
298
|
+
class FetcherSubscriber < ActiveSupport::Subscriber
|
299
|
+
def initialize
|
300
|
+
super
|
301
|
+
@queue_size = Prometheus.registry.gauge(:fetcher_queue_size, docstring: 'Queue size', labels: [:client, :group_id])
|
302
|
+
end
|
303
|
+
|
304
|
+
def loop(event)
|
305
|
+
queue_size = event.payload.fetch(:queue_size)
|
306
|
+
client = event.payload.fetch(:client_id)
|
307
|
+
group_id = event.payload.fetch(:group_id)
|
308
|
+
|
309
|
+
@queue_size.set(queue_size, labels: { client: client, group_id: group_id })
|
310
|
+
end
|
311
|
+
end
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
# To enable testability, it is possible to skip the start until test time
|
316
|
+
Kafka::Prometheus.start unless defined?(PROMETHEUS_NO_AUTO_START)
|