ruby-kafka 0.7.8 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +135 -3
- data/.github/workflows/stale.yml +19 -0
- data/CHANGELOG.md +26 -0
- data/README.md +26 -0
- data/lib/kafka/async_producer.rb +3 -0
- data/lib/kafka/client.rb +49 -1
- data/lib/kafka/cluster.rb +52 -0
- data/lib/kafka/connection.rb +3 -0
- data/lib/kafka/consumer.rb +56 -11
- data/lib/kafka/consumer_group.rb +10 -1
- data/lib/kafka/datadog.rb +18 -11
- data/lib/kafka/fetched_batch_generator.rb +1 -1
- data/lib/kafka/fetcher.rb +5 -2
- data/lib/kafka/offset_manager.rb +12 -1
- data/lib/kafka/producer.rb +4 -1
- data/lib/kafka/prometheus.rb +316 -0
- data/lib/kafka/protocol/join_group_request.rb +8 -2
- data/lib/kafka/protocol/metadata_response.rb +1 -1
- data/lib/kafka/protocol/offset_fetch_request.rb +3 -1
- data/lib/kafka/protocol/record_batch.rb +5 -4
- data/lib/kafka/sasl/scram.rb +15 -12
- data/lib/kafka/ssl_context.rb +4 -2
- data/lib/kafka/tagged_logger.rb +25 -20
- data/lib/kafka/version.rb +1 -1
- data/ruby-kafka.gemspec +4 -3
- metadata +29 -7
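For reference, a minimal Gemfile sketch for picking up this release (the constraint below is illustrative and not part of the diff):

```ruby
# Gemfile -- pull in the 1.x series of ruby-kafka (illustrative constraint)
source "https://rubygems.org"

gem "ruby-kafka", "~> 1.1"
```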
data/lib/kafka/connection.rb
CHANGED
data/lib/kafka/consumer.rb
CHANGED
@@ -44,7 +44,7 @@ module Kafka
   #
   class Consumer

-    def initialize(cluster:, logger:, instrumenter:, group:, fetcher:, offset_manager:, session_timeout:, heartbeat:)
+    def initialize(cluster:, logger:, instrumenter:, group:, fetcher:, offset_manager:, session_timeout:, heartbeat:, refresh_topic_interval: 0)
       @cluster = cluster
       @logger = TaggedLogger.new(logger)
       @instrumenter = instrumenter
@@ -53,6 +53,7 @@ module Kafka
       @session_timeout = session_timeout
       @fetcher = fetcher
       @heartbeat = heartbeat
+      @refresh_topic_interval = refresh_topic_interval

       @pauses = Hash.new {|h, k|
         h[k] = Hash.new {|h2, k2|
@@ -73,6 +74,15 @@ module Kafka
       # when user commits message other than last in a batch, this would make ruby-kafka refetch
       # some already consumed messages
       @current_offsets = Hash.new { |h, k| h[k] = {} }
+
+      # Map storing subscribed topics with their configuration
+      @subscribed_topics = Concurrent::Map.new
+
+      # Set storing topics that matched topics in @subscribed_topics
+      @matched_topics = Set.new
+
+      # Whether join_group must be executed again because new topics are added
+      @join_group_for_new_topics = false
     end

     # Subscribes the consumer to a topic.
@@ -97,13 +107,12 @@ module Kafka
     def subscribe(topic_or_regex, default_offset: nil, start_from_beginning: true, max_bytes_per_partition: 1048576)
       default_offset ||= start_from_beginning ? :earliest : :latest

-      if topic_or_regex.is_a?(Regexp)
-        cluster_topics.select { |topic| topic =~ topic_or_regex }.each do |topic|
-          subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
-        end
-      else
-        subscribe_to_topic(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
-      end
+      @subscribed_topics[topic_or_regex] = {
+        default_offset: default_offset,
+        start_from_beginning: start_from_beginning,
+        max_bytes_per_partition: max_bytes_per_partition
+      }
+      scan_for_subscribing

       nil
     end
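A rough usage sketch of the new bookkeeping above, assuming a reachable broker at localhost:9092 (everything outside the `subscribe` calls is illustrative): `subscribe` now only records the subscription, and `scan_for_subscribing` does the actual wiring.

```ruby
require "kafka"

kafka = Kafka.new(["localhost:9092"], client_id: "example-app")
consumer = kafka.consumer(group_id: "example-group")

# Each call records the subscription in @subscribed_topics...
consumer.subscribe("orders", start_from_beginning: false)
consumer.subscribe(/^audit\./, max_bytes_per_partition: 524_288)

# ...leaving it roughly equal to:
#   "orders"   => { default_offset: :latest,   start_from_beginning: false, max_bytes_per_partition: 1048576 }
#   /^audit\./ => { default_offset: :earliest, start_from_beginning: true,  max_bytes_per_partition: 524288 }
# scan_for_subscribing then resolves the regex against cluster_topics and wires
# each concrete topic into the group, offset manager and fetcher.
```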
@@ -116,7 +125,6 @@ module Kafka
     def stop
       @running = false
       @fetcher.stop
-      @cluster.disconnect
     end

     # Pause processing of a specific topic partition.
@@ -308,6 +316,7 @@ module Kafka
             topic: batch.topic,
             partition: batch.partition,
             last_offset: batch.last_offset,
+            last_create_time: batch.messages.last.try(:create_time),
             offset_lag: batch.offset_lag,
             highwater_mark_offset: batch.highwater_mark_offset,
             message_count: batch.messages.count,
@@ -401,6 +410,7 @@ module Kafka
       while running?
         begin
           @instrumenter.instrument("loop.consumer") do
+            refresh_topic_list_if_enabled
             yield
           end
         rescue HeartbeatError
@@ -432,6 +442,7 @@ module Kafka
       # important that members explicitly tell Kafka when they're leaving.
       make_final_offsets_commit!
       @group.leave rescue nil
+      @cluster.disconnect
       @running = false
       @logger.pop_tags
     end
@@ -452,6 +463,8 @@ module Kafka
     end

     def join_group
+      @join_group_for_new_topics = false
+
       old_generation_id = @group.generation_id

       @group.join
@@ -513,11 +526,19 @@ module Kafka
       end
     end

+    def refresh_topic_list_if_enabled
+      return if @refresh_topic_interval <= 0
+      return if @refreshed_at && @refreshed_at + @refresh_topic_interval > Time.now
+
+      scan_for_subscribing
+      @refreshed_at = Time.now
+    end
+
     def fetch_batches
       # Return early if the consumer has been stopped.
       return [] if shutting_down?

-      join_group
+      join_group if !@group.member? || @join_group_for_new_topics

       trigger_heartbeat

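A minimal sketch of the periodic refresh, assuming the `refresh_topic_interval:` option is also plumbed through `Kafka::Client#consumer` (consistent with the client.rb and README entries in the file list, which are not shown in this excerpt); the broker address and topic pattern are illustrative.

```ruby
require "kafka"

kafka = Kafka.new(["localhost:9092"], client_id: "example-app")

# Re-scan subscriptions every 30 seconds so a regex subscription can pick up
# topics created after the consumer started; 0 (the default) disables this.
consumer = kafka.consumer(group_id: "example-group", refresh_topic_interval: 30)
consumer.subscribe(/^events\./)

consumer.each_message do |message|
  puts "#{message.topic}/#{message.partition}@#{message.offset}"
end
```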
@@ -525,7 +546,7 @@ module Kafka

       if !@fetcher.data?
         @logger.debug "No batches to process"
-        sleep 2
+        sleep(@fetcher.max_wait_time || 2)
         []
       else
         tag, message = @fetcher.poll
@@ -571,10 +592,34 @@ module Kafka
       end
     end

+    def scan_for_subscribing
+      @subscribed_topics.each do |topic_or_regex, config|
+        default_offset = config.fetch(:default_offset)
+        start_from_beginning = config.fetch(:start_from_beginning)
+        max_bytes_per_partition = config.fetch(:max_bytes_per_partition)
+        if topic_or_regex.is_a?(Regexp)
+          subscribe_to_regex(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
+        else
+          subscribe_to_topic(topic_or_regex, default_offset, start_from_beginning, max_bytes_per_partition)
+        end
+      end
+    end
+
+    def subscribe_to_regex(topic_regex, default_offset, start_from_beginning, max_bytes_per_partition)
+      cluster_topics.select { |topic| topic =~ topic_regex }.each do |topic|
+        subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
+      end
+    end
+
     def subscribe_to_topic(topic, default_offset, start_from_beginning, max_bytes_per_partition)
+      return if @matched_topics.include?(topic)
+      @matched_topics.add(topic)
+      @join_group_for_new_topics = true
+
       @group.subscribe(topic)
       @offset_manager.set_default_offset(topic, default_offset)
       @fetcher.subscribe(topic, max_bytes_per_partition: max_bytes_per_partition)
+      @cluster.mark_as_stale!
     end

     def cluster_topics
data/lib/kafka/consumer_group.rb
CHANGED
@@ -7,11 +7,12 @@ module Kafka
   class ConsumerGroup
     attr_reader :assigned_partitions, :generation_id, :group_id

-    def initialize(cluster:, logger:, group_id:, session_timeout:, retention_time:, instrumenter:)
+    def initialize(cluster:, logger:, group_id:, session_timeout:, rebalance_timeout:, retention_time:, instrumenter:)
       @cluster = cluster
       @logger = TaggedLogger.new(logger)
       @group_id = group_id
       @session_timeout = session_timeout
+      @rebalance_timeout = rebalance_timeout
       @instrumenter = instrumenter
       @member_id = ""
       @generation_id = nil
@@ -140,7 +141,9 @@ module Kafka
       response = coordinator.join_group(
         group_id: @group_id,
         session_timeout: @session_timeout,
+        rebalance_timeout: @rebalance_timeout,
         member_id: @member_id,
+        topics: @topics,
       )

       Protocol.handle_error(response.error_code)
@@ -158,6 +161,12 @@ module Kafka
       @member_id = ""
       sleep 1

+      retry
+    rescue CoordinatorLoadInProgress
+      @logger.error "Coordinator broker still loading, retrying in 1s..."
+
+      sleep 1
+
       retry
     end

data/lib/kafka/datadog.rb
CHANGED
@@ -31,7 +31,7 @@ module Kafka

     class << self
       def statsd
-        @statsd ||= ::Datadog::Statsd.new(host, port, namespace: namespace, tags: tags)
+        @statsd ||= ::Datadog::Statsd.new(host, port, namespace: namespace, tags: tags, socket_path: socket_path)
       end

       def statsd=(statsd)
@@ -40,7 +40,7 @@ module Kafka
       end

       def host
-        @host ||= default_host
+        @host
       end

       def host=(host)
@@ -49,7 +49,7 @@ module Kafka
       end

       def port
-        @port ||= default_port
+        @port
       end

       def port=(port)
@@ -57,6 +57,15 @@ module Kafka
         clear
       end

+      def socket_path
+        @socket_path
+      end
+
+      def socket_path=(socket_path)
+        @socket_path = socket_path
+        clear
+      end
+
       def namespace
         @namespace ||= STATSD_NAMESPACE
       end
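A configuration sketch for the new socket option (the socket path below is an assumption; use whatever your Datadog agent exposes):

```ruby
require "kafka/datadog"

# Report dogstatsd metrics over a Unix domain socket instead of UDP host/port.
Kafka::Datadog.socket_path = "/var/run/datadog/dsd.socket"

# UDP still works; with default_host/default_port removed, unset host/port now
# fall back to the dogstatsd-ruby gem's own defaults.
# Kafka::Datadog.host = "127.0.0.1"
# Kafka::Datadog.port = 8125
```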
@@ -77,14 +86,6 @@ module Kafka

       private

-      def default_host
-        ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_HOST : ::Datadog::Statsd::DEFAULT_HOST
-      end
-
-      def default_port
-        ::Datadog::Statsd.const_defined?(:Connection) ? ::Datadog::Statsd::Connection::DEFAULT_PORT : ::Datadog::Statsd::DEFAULT_PORT
-      end
-
       def clear
         @statsd && @statsd.close
         @statsd = nil
@@ -168,6 +169,8 @@ module Kafka
       def process_batch(event)
         offset = event.payload.fetch(:last_offset)
         messages = event.payload.fetch(:message_count)
+        create_time = event.payload.fetch(:last_create_time)
+        time_lag = create_time && ((Time.now - create_time) * 1000).to_i

         tags = {
           client: event.payload.fetch(:client_id),
@@ -184,6 +187,10 @@ module Kafka
         end

         gauge("consumer.offset", offset, tags: tags)
+
+        if time_lag
+          gauge("consumer.time_lag", time_lag, tags: tags)
+        end
       end

       def fetch_batch(event)
data/lib/kafka/fetcher.rb
CHANGED
@@ -4,7 +4,7 @@ require "kafka/fetch_operation"

 module Kafka
   class Fetcher
-    attr_reader :queue
+    attr_reader :queue, :max_wait_time

     def initialize(cluster:, logger:, instrumenter:, max_queue_size:, group:)
       @cluster = cluster
@@ -17,6 +17,9 @@ module Kafka
       @commands = Queue.new
       @next_offsets = Hash.new { |h, k| h[k] = {} }

+      # We are only running when someone calls start.
+      @running = false
+
       # Long poll until at least this many bytes can be fetched.
       @min_bytes = 1

@@ -110,7 +113,7 @@ module Kafka
         elsif @queue.size < @max_queue_size
           step
         else
-          @logger.warn "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
+          @logger.info "Reached max fetcher queue size (#{@max_queue_size}), sleeping 1s"
           sleep 1
         end
       ensure
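For context, the idle sleep shown in the consumer hunk above follows the fetcher's configured `max_wait_time`, which in practice comes from the consumer loop options; a small sketch (broker, topic and the 5-second value are illustrative):

```ruby
require "kafka"

kafka = Kafka.new(["localhost:9092"], client_id: "example-app")
consumer = kafka.consumer(group_id: "example-group")
consumer.subscribe("orders")

# max_wait_time configures the fetcher; with this change an idle consumer now
# sleeps for the same interval instead of a hard-coded 2 seconds.
consumer.each_message(max_wait_time: 5) do |message|
  puts message.value
end
```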
data/lib/kafka/offset_manager.rb
CHANGED
@@ -50,9 +50,20 @@ module Kafka
     # @param offset [Integer] the offset of the message that should be marked as processed.
     # @return [nil]
     def mark_as_processed(topic, partition, offset)
-      @uncommitted_offsets += 1
+      unless @group.assigned_to?(topic, partition)
+        @logger.debug "Not marking #{topic}/#{partition}:#{offset} as processed for partition not assigned to this consumer."
+        return
+      end
       @processed_offsets[topic] ||= {}

+      last_processed_offset = @processed_offsets[topic][partition] || -1
+      if last_processed_offset > offset + 1
+        @logger.debug "Not overwriting newer offset #{topic}/#{partition}:#{last_processed_offset - 1} with older #{offset}"
+        return
+      end
+
+      @uncommitted_offsets += 1
+
       # The committed offset should always be the offset of the next message that the
       # application will read, thus adding one to the last message processed.
       @processed_offsets[topic][partition] = offset + 1
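A hedged sketch of the manual-marking flow these checks guard (broker, topic and the `process` helper are hypothetical):

```ruby
require "kafka"

kafka = Kafka.new(["localhost:9092"], client_id: "example-app")
consumer = kafka.consumer(group_id: "example-group")
consumer.subscribe("orders")

# With the checks above, marks for partitions this member no longer owns are
# dropped, and an older offset no longer rewinds an already-recorded position.
consumer.each_batch(automatically_mark_as_processed: false) do |batch|
  batch.messages.each do |message|
    process(message) # hypothetical application method
    consumer.mark_message_as_processed(message)
  end
end
```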
data/lib/kafka/producer.rb
CHANGED
@@ -188,11 +188,14 @@ module Kafka
     # @raise [BufferOverflow] if the maximum buffer size has been reached.
     # @return [nil]
     def produce(value, key: nil, headers: {}, topic:, partition: nil, partition_key: nil, create_time: Time.now)
+      # We want to fail fast if `topic` isn't a String
+      topic = topic.to_str
+
       message = PendingMessage.new(
         value: value && value.to_s,
         key: key && key.to_s,
         headers: headers,
-        topic: topic
+        topic: topic,
         partition: partition && Integer(partition),
         partition_key: partition_key && partition_key.to_s,
         create_time: create_time
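A quick sketch of the fail-fast behaviour (broker address illustrative): anything that does not respond to `#to_str` now raises inside `produce` instead of surfacing later during delivery.

```ruby
require "kafka"

kafka = Kafka.new(["localhost:9092"], client_id: "example-app")
producer = kafka.producer

producer.produce("hello", topic: "greetings")   # fine: String responds to #to_str
# producer.produce("hello", topic: :greetings)  # would raise NoMethodError (Symbol has no #to_str)

producer.deliver_messages
```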
data/lib/kafka/prometheus.rb
ADDED
@@ -0,0 +1,316 @@
+# frozen_string_literal: true
+
+#
+# Subscriber to ruby_kafka to report metrics to prometheus
+#
+# Usage:
+#   require "kafka/prometheus"
+#
+# Once the file has been required, no further configuration is needed, all operational
+# metrics are automatically emitted (Unless PROMETHEUS_NO_AUTO_START is set).
+#
+# By Peter Mustel, T2 Data AB
+#
+begin
+  require 'prometheus/client'
+rescue LoadError
+  warn 'In order to report Kafka client metrics to Prometheus you need to install the `prometheus-client` gem.'
+  raise
+end
+
+require 'active_support/subscriber'
+
+module Kafka
+  module Prometheus
+    SIZE_BUCKETS = [1, 10, 100, 1000, 10_000, 100_000, 1_000_000].freeze
+    LATENCY_BUCKETS = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000].freeze
+    DELAY_BUCKETS = [1, 3, 10, 30, 100, 300, 1000, 3000, 10_000, 30_000].freeze
+
+    class << self
+      attr_accessor :registry
+
+      def start(registry = ::Prometheus::Client.registry)
+        @registry = registry
+        ConnectionSubscriber.attach_to 'connection.kafka'
+        ConsumerSubscriber.attach_to 'consumer.kafka'
+        ProducerSubscriber.attach_to 'producer.kafka'
+        AsyncProducerSubscriber.attach_to 'async_producer.kafka'
+        FetcherSubscriber.attach_to 'fetcher.kafka'
+      end
+    end
+
+    class ConnectionSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @api_calls = Prometheus.registry.counter(:api_calls, docstring: 'Total calls', labels: [:client, :api, :broker])
+        @api_latency = Prometheus.registry.histogram(:api_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :api, :broker])
+        @api_request_size = Prometheus.registry.histogram(:api_request_size, docstring: 'Request size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_response_size = Prometheus.registry.histogram(:api_response_size, docstring: 'Response size', buckets: SIZE_BUCKETS, labels: [:client, :api, :broker])
+        @api_errors = Prometheus.registry.counter(:api_errors, docstring: 'Errors', labels: [:client, :api, :broker])
+      end
+
+      def request(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          api: event.payload.fetch(:api, 'unknown'),
+          broker: event.payload.fetch(:broker_host)
+        }
+        request_size = event.payload.fetch(:request_size, 0)
+        response_size = event.payload.fetch(:response_size, 0)
+
+        @api_calls.increment(labels: key)
+        @api_latency.observe(event.duration, labels: key)
+        @api_request_size.observe(request_size, labels: key)
+        @api_response_size.observe(response_size, labels: key)
+        @api_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+    end
+
+    class ConsumerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @process_messages = Prometheus.registry.counter(:consumer_process_messages, docstring: 'Total messages', labels: [:client, :group_id, :topic, :partition])
+        @process_message_errors = Prometheus.registry.counter(:consumer_process_message_errors, docstring: 'Total errors', labels: [:client, :group_id, :topic, :partition])
+        @process_message_latency =
+          Prometheus.registry.histogram(:consumer_process_message_latency, docstring: 'Latency', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @offset_lag = Prometheus.registry.gauge(:consumer_offset_lag, docstring: 'Offset lag', labels: [:client, :group_id, :topic, :partition])
+        @time_lag = Prometheus.registry.gauge(:consumer_time_lag, docstring: 'Time lag of message', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_errors = Prometheus.registry.counter(:consumer_process_batch_errors, docstring: 'Total errors in batch', labels: [:client, :group_id, :topic, :partition])
+        @process_batch_latency =
+          Prometheus.registry.histogram(:consumer_process_batch_latency, docstring: 'Latency in batch', buckets: LATENCY_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @batch_size = Prometheus.registry.histogram(:consumer_batch_size, docstring: 'Size of batch', buckets: SIZE_BUCKETS, labels: [:client, :group_id, :topic, :partition])
+        @join_group = Prometheus.registry.histogram(:consumer_join_group, docstring: 'Time to join group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @join_group_errors = Prometheus.registry.counter(:consumer_join_group_errors, docstring: 'Total error in joining group', labels: [:client, :group_id])
+        @sync_group = Prometheus.registry.histogram(:consumer_sync_group, docstring: 'Time to sync group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @sync_group_errors = Prometheus.registry.counter(:consumer_sync_group_errors, docstring: 'Total error in syncing group', labels: [:client, :group_id])
+        @leave_group = Prometheus.registry.histogram(:consumer_leave_group, docstring: 'Time to leave group', buckets: DELAY_BUCKETS, labels: [:client, :group_id])
+        @leave_group_errors = Prometheus.registry.counter(:consumer_leave_group_errors, docstring: 'Total error in leaving group', labels: [:client, :group_id])
+        @pause_duration = Prometheus.registry.gauge(:consumer_pause_duration, docstring: 'Pause duration', labels: [:client, :group_id, :topic, :partition])
+      end
+
+      def process_message(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        offset_lag = event.payload.fetch(:offset_lag)
+        create_time = event.payload.fetch(:create_time)
+
+        time_lag = create_time && ((Time.now - create_time) * 1000).to_i
+
+        if event.payload.key?(:exception)
+          @process_message_errors.increment(labels: key)
+        else
+          @process_message_latency.observe(event.duration, labels: key)
+          @process_messages.increment(labels: key)
+        end
+
+        @offset_lag.set(offset_lag, labels: key)
+
+        # Not all messages have timestamps.
+        return unless time_lag
+
+        @time_lag.set(time_lag, labels: key)
+      end
+
+      def process_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        message_count = event.payload.fetch(:message_count)
+
+        if event.payload.key?(:exception)
+          @process_batch_errors.increment(labels: key)
+        else
+          @process_batch_latency.observe(event.duration, labels: key)
+          @process_messages.increment(by: message_count, labels: key)
+        end
+      end
+
+      def fetch_batch(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+        offset_lag = event.payload.fetch(:offset_lag)
+        batch_size = event.payload.fetch(:message_count)
+
+        @batch_size.observe(batch_size, labels: key)
+        @offset_lag.set(offset_lag, labels: key)
+      end
+
+      def join_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @join_group.observe(event.duration, labels: key)
+
+        @join_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def sync_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @sync_group.observe(event.duration, labels: key)
+
+        @sync_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def leave_group(event)
+        key = { client: event.payload.fetch(:client_id), group_id: event.payload.fetch(:group_id) }
+        @leave_group.observe(event.duration, labels: key)
+
+        @leave_group_errors.increment(labels: key) if event.payload.key?(:exception)
+      end
+
+      def pause_status(event)
+        key = {
+          client: event.payload.fetch(:client_id),
+          group_id: event.payload.fetch(:group_id),
+          topic: event.payload.fetch(:topic),
+          partition: event.payload.fetch(:partition)
+        }
+
+        duration = event.payload.fetch(:duration)
+        @pause_duration.set(duration, labels: key)
+      end
+    end
+
+    class ProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @produce_messages = Prometheus.registry.counter(:producer_produced_messages, docstring: 'Produced messages total', labels: [:client, :topic])
+        @produce_message_size =
+          Prometheus.registry.histogram(:producer_message_size, docstring: 'Message size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @buffer_size = Prometheus.registry.histogram(:producer_buffer_size, docstring: 'Buffer size', buckets: SIZE_BUCKETS, labels: [:client])
+        @buffer_fill_ratio = Prometheus.registry.histogram(:producer_buffer_fill_ratio, docstring: 'Buffer fill ratio', labels: [:client])
+        @buffer_fill_percentage = Prometheus.registry.histogram(:producer_buffer_fill_percentage, docstring: 'Buffer fill percentage', labels: [:client])
+        @produce_errors = Prometheus.registry.counter(:producer_produce_errors, docstring: 'Produce errors', labels: [:client, :topic])
+        @deliver_errors = Prometheus.registry.counter(:producer_deliver_errors, docstring: 'Deliver error', labels: [:client])
+        @deliver_latency =
+          Prometheus.registry.histogram(:producer_deliver_latency, docstring: 'Delivery latency', buckets: LATENCY_BUCKETS, labels: [:client])
+        @deliver_messages = Prometheus.registry.counter(:producer_deliver_messages, docstring: 'Total count of delivered messages', labels: [:client])
+        @deliver_attempts = Prometheus.registry.histogram(:producer_deliver_attempts, docstring: 'Delivery attempts', labels: [:client])
+        @ack_messages = Prometheus.registry.counter(:producer_ack_messages, docstring: 'Ack', labels: [:client, :topic])
+        @ack_delay = Prometheus.registry.histogram(:producer_ack_delay, docstring: 'Ack delay', buckets: LATENCY_BUCKETS, labels: [:client, :topic])
+        @ack_errors = Prometheus.registry.counter(:producer_ack_errors, docstring: 'Ack errors', labels: [:client, :topic])
+      end
+
+      def produce_message(event)
+        client = event.payload.fetch(:client_id)
+        key = { client: client, topic: event.payload.fetch(:topic) }
+
+        message_size = event.payload.fetch(:message_size)
+        buffer_size = event.payload.fetch(:buffer_size)
+        max_buffer_size = event.payload.fetch(:max_buffer_size)
+        buffer_fill_ratio = buffer_size.to_f / max_buffer_size.to_f
+        buffer_fill_percentage = buffer_fill_ratio * 100.0
+
+        # This gets us the write rate.
+        @produce_messages.increment(labels: key)
+        @produce_message_size.observe(message_size, labels: key)
+
+        # This gets us the avg/max buffer size per producer.
+        @buffer_size.observe(buffer_size, labels: { client: client })
+
+        # This gets us the avg/max buffer fill ratio per producer.
+        @buffer_fill_ratio.observe(buffer_fill_ratio, labels: { client: client })
+        @buffer_fill_percentage.observe(buffer_fill_percentage, labels: { client: client })
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def deliver_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:delivered_message_count)
+        attempts = event.payload.fetch(:attempts)
+
+        @deliver_errors.increment(labels: key) if event.payload.key?(:exception)
+        @deliver_latency.observe(event.duration, labels: key)
+
+        # Messages delivered to Kafka:
+        @deliver_messages.increment(by: message_count, labels: key)
+
+        # Number of attempts to deliver messages:
+        @deliver_attempts.observe(attempts, labels: key)
+      end
+
+      def ack_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        # Number of messages ACK'd for the topic.
+        @ack_messages.increment(labels: key)
+
+        # Histogram of delay between a message being produced and it being ACK'd.
+        @ack_delay.observe(event.payload.fetch(:delay), labels: key)
+      end
+
+      def topic_error(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        @ack_errors.increment(labels: key)
+      end
+    end
+
+    class AsyncProducerSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.histogram(:async_producer_queue_size, docstring: 'Queue size', buckets: SIZE_BUCKETS, labels: [:client, :topic])
+        @queue_fill_ratio = Prometheus.registry.histogram(:async_producer_queue_fill_ratio, docstring: 'Queue fill ratio', labels: [:client, :topic])
+        @produce_errors = Prometheus.registry.counter(:async_producer_produce_errors, docstring: 'Producer errors', labels: [:client, :topic])
+        @dropped_messages = Prometheus.registry.counter(:async_producer_dropped_messages, docstring: 'Dropped messages', labels: [:client])
+      end
+
+      def enqueue_message(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+
+        queue_size = event.payload.fetch(:queue_size)
+        max_queue_size = event.payload.fetch(:max_queue_size)
+        queue_fill_ratio = queue_size.to_f / max_queue_size.to_f
+
+        # This gets us the avg/max queue size per producer.
+        @queue_size.observe(queue_size, labels: key)
+
+        # This gets us the avg/max queue fill ratio per producer.
+        @queue_fill_ratio.observe(queue_fill_ratio, labels: key)
+      end
+
+      def buffer_overflow(event)
+        key = { client: event.payload.fetch(:client_id), topic: event.payload.fetch(:topic) }
+        @produce_errors.increment(labels: key)
+      end
+
+      def drop_messages(event)
+        key = { client: event.payload.fetch(:client_id) }
+        message_count = event.payload.fetch(:message_count)
+        @dropped_messages.increment(by: message_count, labels: key)
+      end
+    end
+
+    class FetcherSubscriber < ActiveSupport::Subscriber
+      def initialize
+        super
+        @queue_size = Prometheus.registry.gauge(:fetcher_queue_size, docstring: 'Queue size', labels: [:client, :group_id])
+      end
+
+      def loop(event)
+        queue_size = event.payload.fetch(:queue_size)
+        client = event.payload.fetch(:client_id)
+        group_id = event.payload.fetch(:group_id)
+
+        @queue_size.set(queue_size, labels: { client: client, group_id: group_id })
+      end
+    end
+  end
+end
+
+# To enable testability, it is possible to skip the start until test time
+Kafka::Prometheus.start unless defined?(PROMETHEUS_NO_AUTO_START)
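A minimal sketch of wiring the new integration into an app and exposing the metrics over HTTP; the Rack exporter comes from the prometheus-client gem, and the config.ru layout is an assumption:

```ruby
# config.ru -- expose ruby-kafka metrics at /metrics (sketch)
require "kafka"
require "kafka/prometheus" # attaches the subscribers; auto-starts unless PROMETHEUS_NO_AUTO_START is defined

require "prometheus/middleware/exporter"

# Serves the default Prometheus::Client.registry, which Kafka::Prometheus.start uses.
use Prometheus::Middleware::Exporter

run ->(_env) { [200, { "Content-Type" => "text/plain" }, ["ok"]] }
```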