karafka 1.4.13 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +3 -3
- data/.github/workflows/ci.yml +85 -30
- data/.ruby-version +1 -1
- data/CHANGELOG.md +268 -7
- data/CONTRIBUTING.md +10 -19
- data/Gemfile +6 -0
- data/Gemfile.lock +44 -87
- data/LICENSE +17 -0
- data/LICENSE-COMM +89 -0
- data/LICENSE-LGPL +165 -0
- data/README.md +44 -48
- data/bin/benchmarks +85 -0
- data/bin/create_token +22 -0
- data/bin/integrations +237 -0
- data/bin/karafka +4 -0
- data/bin/scenario +29 -0
- data/bin/stress_many +13 -0
- data/bin/stress_one +13 -0
- data/bin/wait_for_kafka +20 -0
- data/certs/karafka-pro.pem +11 -0
- data/config/errors.yml +55 -40
- data/docker-compose.yml +39 -3
- data/karafka.gemspec +11 -17
- data/lib/active_job/karafka.rb +21 -0
- data/lib/active_job/queue_adapters/karafka_adapter.rb +26 -0
- data/lib/karafka/active_job/consumer.rb +26 -0
- data/lib/karafka/active_job/dispatcher.rb +38 -0
- data/lib/karafka/active_job/job_extensions.rb +34 -0
- data/lib/karafka/active_job/job_options_contract.rb +21 -0
- data/lib/karafka/active_job/routing/extensions.rb +31 -0
- data/lib/karafka/app.rb +15 -20
- data/lib/karafka/base_consumer.rb +181 -31
- data/lib/karafka/cli/base.rb +4 -4
- data/lib/karafka/cli/info.rb +43 -9
- data/lib/karafka/cli/install.rb +19 -10
- data/lib/karafka/cli/server.rb +17 -42
- data/lib/karafka/cli.rb +4 -11
- data/lib/karafka/connection/client.rb +385 -90
- data/lib/karafka/connection/listener.rb +246 -38
- data/lib/karafka/connection/listeners_batch.rb +24 -0
- data/lib/karafka/connection/messages_buffer.rb +84 -0
- data/lib/karafka/connection/pauses_manager.rb +46 -0
- data/lib/karafka/connection/raw_messages_buffer.rb +101 -0
- data/lib/karafka/connection/rebalance_manager.rb +78 -0
- data/lib/karafka/contracts/base.rb +17 -0
- data/lib/karafka/contracts/config.rb +88 -11
- data/lib/karafka/contracts/consumer_group.rb +21 -189
- data/lib/karafka/contracts/consumer_group_topic.rb +34 -11
- data/lib/karafka/contracts/server_cli_options.rb +19 -18
- data/lib/karafka/contracts.rb +1 -1
- data/lib/karafka/env.rb +46 -0
- data/lib/karafka/errors.rb +21 -21
- data/lib/karafka/helpers/async.rb +33 -0
- data/lib/karafka/helpers/colorize.rb +20 -0
- data/lib/karafka/helpers/multi_delegator.rb +2 -2
- data/lib/karafka/instrumentation/callbacks/error.rb +40 -0
- data/lib/karafka/instrumentation/callbacks/statistics.rb +41 -0
- data/lib/karafka/instrumentation/logger_listener.rb +164 -0
- data/lib/karafka/instrumentation/monitor.rb +13 -61
- data/lib/karafka/instrumentation/notifications.rb +52 -0
- data/lib/karafka/instrumentation/proctitle_listener.rb +3 -3
- data/lib/karafka/instrumentation/vendors/datadog/dashboard.json +1 -0
- data/lib/karafka/instrumentation/vendors/datadog/listener.rb +232 -0
- data/lib/karafka/instrumentation.rb +21 -0
- data/lib/karafka/licenser.rb +75 -0
- data/lib/karafka/messages/batch_metadata.rb +45 -0
- data/lib/karafka/messages/builders/batch_metadata.rb +40 -0
- data/lib/karafka/messages/builders/message.rb +39 -0
- data/lib/karafka/messages/builders/messages.rb +32 -0
- data/lib/karafka/{params/params.rb → messages/message.rb} +7 -12
- data/lib/karafka/messages/messages.rb +64 -0
- data/lib/karafka/{params → messages}/metadata.rb +4 -6
- data/lib/karafka/messages/seek.rb +9 -0
- data/lib/karafka/patches/rdkafka/consumer.rb +22 -0
- data/lib/karafka/pro/active_job/consumer.rb +46 -0
- data/lib/karafka/pro/active_job/dispatcher.rb +61 -0
- data/lib/karafka/pro/active_job/job_options_contract.rb +32 -0
- data/lib/karafka/pro/base_consumer.rb +82 -0
- data/lib/karafka/pro/contracts/base.rb +21 -0
- data/lib/karafka/pro/contracts/consumer_group.rb +34 -0
- data/lib/karafka/pro/contracts/consumer_group_topic.rb +33 -0
- data/lib/karafka/pro/loader.rb +76 -0
- data/lib/karafka/pro/performance_tracker.rb +80 -0
- data/lib/karafka/pro/processing/coordinator.rb +72 -0
- data/lib/karafka/pro/processing/jobs/consume_non_blocking.rb +37 -0
- data/lib/karafka/pro/processing/jobs_builder.rb +32 -0
- data/lib/karafka/pro/processing/partitioner.rb +60 -0
- data/lib/karafka/pro/processing/scheduler.rb +56 -0
- data/lib/karafka/pro/routing/builder_extensions.rb +30 -0
- data/lib/karafka/pro/routing/topic_extensions.rb +38 -0
- data/lib/karafka/pro.rb +13 -0
- data/lib/karafka/process.rb +1 -0
- data/lib/karafka/processing/coordinator.rb +88 -0
- data/lib/karafka/processing/coordinators_buffer.rb +54 -0
- data/lib/karafka/processing/executor.rb +118 -0
- data/lib/karafka/processing/executors_buffer.rb +88 -0
- data/lib/karafka/processing/jobs/base.rb +51 -0
- data/lib/karafka/processing/jobs/consume.rb +42 -0
- data/lib/karafka/processing/jobs/revoked.rb +22 -0
- data/lib/karafka/processing/jobs/shutdown.rb +23 -0
- data/lib/karafka/processing/jobs_builder.rb +29 -0
- data/lib/karafka/processing/jobs_queue.rb +144 -0
- data/lib/karafka/processing/partitioner.rb +22 -0
- data/lib/karafka/processing/result.rb +29 -0
- data/lib/karafka/processing/scheduler.rb +22 -0
- data/lib/karafka/processing/worker.rb +88 -0
- data/lib/karafka/processing/workers_batch.rb +27 -0
- data/lib/karafka/railtie.rb +113 -0
- data/lib/karafka/routing/builder.rb +15 -24
- data/lib/karafka/routing/consumer_group.rb +11 -19
- data/lib/karafka/routing/consumer_mapper.rb +1 -2
- data/lib/karafka/routing/router.rb +1 -1
- data/lib/karafka/routing/subscription_group.rb +53 -0
- data/lib/karafka/routing/subscription_groups_builder.rb +53 -0
- data/lib/karafka/routing/topic.rb +61 -24
- data/lib/karafka/routing/topics.rb +38 -0
- data/lib/karafka/runner.rb +51 -0
- data/lib/karafka/serialization/json/deserializer.rb +6 -15
- data/lib/karafka/server.rb +67 -26
- data/lib/karafka/setup/config.rb +147 -175
- data/lib/karafka/status.rb +14 -5
- data/lib/karafka/templates/example_consumer.rb.erb +16 -0
- data/lib/karafka/templates/karafka.rb.erb +15 -51
- data/lib/karafka/time_trackers/base.rb +19 -0
- data/lib/karafka/time_trackers/pause.rb +92 -0
- data/lib/karafka/time_trackers/poll.rb +65 -0
- data/lib/karafka/version.rb +1 -1
- data/lib/karafka.rb +38 -17
- data.tar.gz.sig +0 -0
- metadata +118 -120
- metadata.gz.sig +0 -0
- data/MIT-LICENCE +0 -18
- data/lib/karafka/assignment_strategies/round_robin.rb +0 -13
- data/lib/karafka/attributes_map.rb +0 -63
- data/lib/karafka/backends/inline.rb +0 -16
- data/lib/karafka/base_responder.rb +0 -226
- data/lib/karafka/cli/flow.rb +0 -48
- data/lib/karafka/cli/missingno.rb +0 -19
- data/lib/karafka/code_reloader.rb +0 -67
- data/lib/karafka/connection/api_adapter.rb +0 -158
- data/lib/karafka/connection/batch_delegator.rb +0 -55
- data/lib/karafka/connection/builder.rb +0 -23
- data/lib/karafka/connection/message_delegator.rb +0 -36
- data/lib/karafka/consumers/batch_metadata.rb +0 -10
- data/lib/karafka/consumers/callbacks.rb +0 -71
- data/lib/karafka/consumers/includer.rb +0 -64
- data/lib/karafka/consumers/responders.rb +0 -24
- data/lib/karafka/consumers/single_params.rb +0 -15
- data/lib/karafka/contracts/responder_usage.rb +0 -54
- data/lib/karafka/fetcher.rb +0 -42
- data/lib/karafka/helpers/class_matcher.rb +0 -88
- data/lib/karafka/helpers/config_retriever.rb +0 -46
- data/lib/karafka/helpers/inflector.rb +0 -26
- data/lib/karafka/instrumentation/stdout_listener.rb +0 -140
- data/lib/karafka/params/batch_metadata.rb +0 -26
- data/lib/karafka/params/builders/batch_metadata.rb +0 -30
- data/lib/karafka/params/builders/params.rb +0 -38
- data/lib/karafka/params/builders/params_batch.rb +0 -25
- data/lib/karafka/params/params_batch.rb +0 -60
- data/lib/karafka/patches/ruby_kafka.rb +0 -47
- data/lib/karafka/persistence/client.rb +0 -29
- data/lib/karafka/persistence/consumers.rb +0 -45
- data/lib/karafka/persistence/topics.rb +0 -48
- data/lib/karafka/responders/builder.rb +0 -36
- data/lib/karafka/responders/topic.rb +0 -55
- data/lib/karafka/routing/topic_mapper.rb +0 -53
- data/lib/karafka/serialization/json/serializer.rb +0 -31
- data/lib/karafka/setup/configurators/water_drop.rb +0 -36
- data/lib/karafka/templates/application_responder.rb.erb +0 -11
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Karafka
|
4
|
+
module Instrumentation
|
5
|
+
module Callbacks
|
6
|
+
# Statistics callback handler
|
7
|
+
# @see `WaterDrop::Instrumentation::Callbacks::Statistics` for details on why we decorate
|
8
|
+
# those statistics
|
9
|
+
class Statistics
|
10
|
+
# @param subscription_group_id [String] id of the current subscription group
|
11
|
+
# @param consumer_group_id [String] id of the current consumer group
|
12
|
+
# @param client_name [String] rdkafka client name
|
13
|
+
# @param monitor [WaterDrop::Instrumentation::Monitor] monitor we are using
|
14
|
+
def initialize(subscription_group_id, consumer_group_id, client_name, monitor)
|
15
|
+
@subscription_group_id = subscription_group_id
|
16
|
+
@consumer_group_id = consumer_group_id
|
17
|
+
@client_name = client_name
|
18
|
+
@monitor = monitor
|
19
|
+
@statistics_decorator = ::Karafka::Core::Monitoring::StatisticsDecorator.new
|
20
|
+
end
|
21
|
+
|
22
|
+
# Emits decorated statistics to the monitor
|
23
|
+
# @param statistics [Hash] rdkafka statistics
|
24
|
+
def call(statistics)
|
25
|
+
# Emit only statistics related to our client
|
26
|
+
# rdkafka does not have a per-instance statistics hook, thus we need to make sure that we
|
27
|
+
# emit only stats that are related to the current client. Otherwise we would emit all of
|
28
|
+
# them all the time.
|
29
|
+
return unless @client_name == statistics['name']
|
30
|
+
|
31
|
+
@monitor.instrument(
|
32
|
+
'statistics.emitted',
|
33
|
+
subscription_group_id: @subscription_group_id,
|
34
|
+
consumer_group_id: @consumer_group_id,
|
35
|
+
statistics: @statistics_decorator.call(statistics)
|
36
|
+
)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,164 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Karafka
|
4
|
+
module Instrumentation
|
5
|
+
# Default listener that hooks up to our instrumentation and uses its events for logging
|
6
|
+
# It can be removed/replaced or anything without any harm to the Karafka app flow.
|
7
|
+
class LoggerListener
|
8
|
+
# Log levels that we use in this particular listener
|
9
|
+
USED_LOG_LEVELS = %i[
|
10
|
+
debug
|
11
|
+
info
|
12
|
+
warn
|
13
|
+
error
|
14
|
+
fatal
|
15
|
+
].freeze
|
16
|
+
|
17
|
+
# Logs each message fetching attempt
|
18
|
+
#
|
19
|
+
# @param event [Dry::Events::Event] event details including payload
|
20
|
+
def on_connection_listener_fetch_loop(event)
|
21
|
+
listener = event[:caller]
|
22
|
+
debug "[#{listener.id}] Polling messages..."
|
23
|
+
end
|
24
|
+
|
25
|
+
# Logs about messages that we've received from Kafka
|
26
|
+
#
|
27
|
+
# @param event [Dry::Events::Event] event details including payload
|
28
|
+
def on_connection_listener_fetch_loop_received(event)
|
29
|
+
listener = event[:caller]
|
30
|
+
time = event[:time]
|
31
|
+
messages_count = event[:messages_buffer].size
|
32
|
+
|
33
|
+
message = "[#{listener.id}] Polled #{messages_count} messages in #{time}ms"
|
34
|
+
|
35
|
+
# We don't want the "polled 0" in dev as it would spam the log
|
36
|
+
# Instead we log at the info level only when anything was polled and fall back to the
|
37
|
+
# debug level for the "polled 0" notifications
|
38
|
+
messages_count.zero? ? debug(message) : info(message)
|
39
|
+
end
|
40
|
+
|
41
|
+
# Prints info about the fact that a given job has started
|
42
|
+
#
|
43
|
+
# @param event [Dry::Events::Event] event details including payload
|
44
|
+
def on_worker_process(event)
|
45
|
+
job = event[:job]
|
46
|
+
job_type = job.class.to_s.split('::').last
|
47
|
+
consumer = job.executor.topic.consumer
|
48
|
+
topic = job.executor.topic.name
|
49
|
+
info "[#{job.id}] #{job_type} job for #{consumer} on #{topic} started"
|
50
|
+
end
|
51
|
+
|
52
|
+
# Prints info about the fact that a given job has finished
|
53
|
+
#
|
54
|
+
# @param event [Dry::Events::Event] event details including payload
|
55
|
+
def on_worker_processed(event)
|
56
|
+
job = event[:job]
|
57
|
+
time = event[:time]
|
58
|
+
job_type = job.class.to_s.split('::').last
|
59
|
+
consumer = job.executor.topic.consumer
|
60
|
+
topic = job.executor.topic.name
|
61
|
+
info "[#{job.id}] #{job_type} job for #{consumer} on #{topic} finished in #{time}ms"
|
62
|
+
end
|
63
|
+
|
64
|
+
# Logs info about system signals that Karafka received and prints backtrace for threads in
|
65
|
+
# case of ttin
|
66
|
+
#
|
67
|
+
# @param event [Dry::Events::Event] event details including payload
|
68
|
+
def on_process_notice_signal(event)
|
69
|
+
info "Received #{event[:signal]} system signal"
|
70
|
+
|
71
|
+
# We print backtrace only for ttin
|
72
|
+
return unless event[:signal] == :SIGTTIN
|
73
|
+
|
74
|
+
# Inspired by Sidekiq
|
75
|
+
Thread.list.each do |thread|
|
76
|
+
tid = (thread.object_id ^ ::Process.pid).to_s(36)
|
77
|
+
|
78
|
+
warn "Thread TID-#{tid} #{thread['label']}"
|
79
|
+
|
80
|
+
if thread.backtrace
|
81
|
+
warn thread.backtrace.join("\n")
|
82
|
+
else
|
83
|
+
warn '<no backtrace available>'
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
# Logs info that we're initializing Karafka app.
|
89
|
+
#
|
90
|
+
# @param _event [Dry::Events::Event] event details including payload
|
91
|
+
def on_app_initializing(_event)
|
92
|
+
info 'Initializing Karafka framework'
|
93
|
+
end
|
94
|
+
|
95
|
+
# Logs info that we're running Karafka app.
|
96
|
+
#
|
97
|
+
# @param _event [Dry::Events::Event] event details including payload
|
98
|
+
def on_app_running(_event)
|
99
|
+
info 'Running Karafka server'
|
100
|
+
end
|
101
|
+
|
102
|
+
# Logs info that we're going to stop the Karafka server.
|
103
|
+
#
|
104
|
+
# @param _event [Dry::Events::Event] event details including payload
|
105
|
+
def on_app_stopping(_event)
|
106
|
+
info 'Stopping Karafka server'
|
107
|
+
end
|
108
|
+
|
109
|
+
# Logs info that we stopped the Karafka server.
|
110
|
+
#
|
111
|
+
# @param _event [Dry::Events::Event] event details including payload
|
112
|
+
def on_app_stopped(_event)
|
113
|
+
info 'Stopped Karafka server'
|
114
|
+
end
|
115
|
+
|
116
|
+
# There are many types of errors that can occur in many places, but we provide a single
|
117
|
+
# handler for all of them to simplify error instrumentation.
|
118
|
+
# @param event [Dry::Events::Event] event details including payload
|
119
|
+
def on_error_occurred(event)
|
120
|
+
type = event[:type]
|
121
|
+
error = event[:error]
|
122
|
+
details = (error.backtrace || []).join("\n")
|
123
|
+
|
124
|
+
case type
|
125
|
+
when 'consumer.consume.error'
|
126
|
+
error "Consumer consuming error: #{error}"
|
127
|
+
error details
|
128
|
+
when 'consumer.revoked.error'
|
129
|
+
error "Consumer on revoked failed due to an error: #{error}"
|
130
|
+
error details
|
131
|
+
when 'consumer.shutdown.error'
|
132
|
+
error "Consumer on shutdown failed due to an error: #{error}"
|
133
|
+
error details
|
134
|
+
when 'worker.process.error'
|
135
|
+
fatal "Worker processing failed due to an error: #{error}"
|
136
|
+
fatal details
|
137
|
+
when 'connection.listener.fetch_loop.error'
|
138
|
+
error "Listener fetch loop error: #{error}"
|
139
|
+
error details
|
140
|
+
when 'licenser.expired'
|
141
|
+
error error
|
142
|
+
error details
|
143
|
+
when 'runner.call.error'
|
144
|
+
fatal "Runner crashed due to an error: #{error}"
|
145
|
+
fatal details
|
146
|
+
when 'app.stopping.error'
|
147
|
+
error 'Forceful Karafka server stop'
|
148
|
+
when 'librdkafka.error'
|
149
|
+
error "librdkafka internal error occurred: #{error}"
|
150
|
+
error details
|
151
|
+
else
|
152
|
+
# This should never happen. Please contact the maintainers
|
153
|
+
raise Errors::UnsupportedCaseError, event
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
USED_LOG_LEVELS.each do |log_level|
|
158
|
+
define_method log_level do |*args|
|
159
|
+
Karafka.logger.send(log_level, *args)
|
160
|
+
end
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
@@ -1,69 +1,21 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Karafka
|
4
|
-
# Namespace for all the things related with Karafka instrumentation process
|
5
4
|
module Instrumentation
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
|
10
|
-
|
11
|
-
# @note This class acts as a singleton because we are only permitted to have single monitor
|
12
|
-
# per running process (just as logger)
|
13
|
-
class Monitor < Dry::Monitor::Notifications
|
14
|
-
# List of events that we support in the system and to which a monitor client can hook up
|
15
|
-
# @note The non-error once support timestamp benchmarking
|
16
|
-
# @note Depending on Karafka extensions and additional engines, this might not be the
|
17
|
-
# complete list of all the events. Please use the #available_events on fully loaded
|
18
|
-
# Karafka system to determine all of the events you can use.
|
19
|
-
# Last 4 events are from WaterDrop but for convenience we use the same monitor for the
|
20
|
-
# whole karafka ecosystem
|
21
|
-
BASE_EVENTS = %w[
|
22
|
-
params.params.deserialize
|
23
|
-
params.params.deserialize.error
|
24
|
-
connection.listener.before_fetch_loop
|
25
|
-
connection.listener.fetch_loop
|
26
|
-
connection.listener.fetch_loop.error
|
27
|
-
connection.client.fetch_loop.error
|
28
|
-
connection.batch_delegator.call
|
29
|
-
connection.message_delegator.call
|
30
|
-
fetcher.call.error
|
31
|
-
backends.inline.process
|
32
|
-
process.notice_signal
|
33
|
-
consumers.responders.respond_with
|
34
|
-
async_producer.call.error
|
35
|
-
async_producer.call.retry
|
36
|
-
sync_producer.call.error
|
37
|
-
sync_producer.call.retry
|
38
|
-
app.initializing
|
39
|
-
app.initialized
|
40
|
-
app.running
|
41
|
-
app.stopping
|
42
|
-
app.stopping.error
|
43
|
-
app.stopped
|
44
|
-
].freeze
|
5
|
+
# Karafka instrumentation monitor that we use to publish events
|
6
|
+
# By default uses our internal notifications bus but can be used with
|
7
|
+
# `ActiveSupport::Notifications` as well
|
8
|
+
class Monitor < ::Karafka::Core::Monitoring::Monitor
|
9
|
+
attr_reader :notifications_bus
|
45
10
|
|
46
|
-
|
47
|
-
|
48
|
-
# @
|
49
|
-
def initialize
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
# Allows us to subscribe to events with a code that will be yielded upon events
|
55
|
-
# @param event_name_or_listener [String, Object] name of the event we want to subscribe to
|
56
|
-
# or a listener if we decide to go with object listener
|
57
|
-
def subscribe(event_name_or_listener)
|
58
|
-
return super unless event_name_or_listener.is_a?(String)
|
59
|
-
return super if available_events.include?(event_name_or_listener)
|
60
|
-
|
61
|
-
raise Errors::UnregisteredMonitorEventError, event_name_or_listener
|
62
|
-
end
|
63
|
-
|
64
|
-
# @return [Array<String>] names of available events to which we can subscribe
|
65
|
-
def available_events
|
66
|
-
__bus__.events.keys
|
11
|
+
# @param notifications_bus [Object] either our internal notifications bus or
|
12
|
+
# `ActiveSupport::Notifications`
|
13
|
+
# @param namespace [String, nil] namespace for events or nil if no namespace
|
14
|
+
def initialize(
|
15
|
+
notifications_bus = ::Karafka::Instrumentation::Notifications.new,
|
16
|
+
namespace = nil
|
17
|
+
)
|
18
|
+
super(notifications_bus, namespace)
|
67
19
|
end
|
68
20
|
end
|
69
21
|
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Karafka
|
4
|
+
# Namespace for all the things related with Karafka instrumentation process
|
5
|
+
module Instrumentation
|
6
|
+
# Monitor is used to hookup external monitoring services to monitor how Karafka works
|
7
|
+
# It provides a standardized API for checking incoming messages/enqueueing etc
|
8
|
+
# Since it is a pub-sub based on dry-monitor, you can use as many subscribers/loggers at the
|
9
|
+
# same time, which means that you might have for example file logging and NewRelic at the same
|
10
|
+
# time
|
11
|
+
# @note This class acts as a singleton because we are only permitted to have single monitor
|
12
|
+
# per running process (just as logger)
|
13
|
+
class Notifications < Karafka::Core::Monitoring::Notifications
|
14
|
+
# List of events that we support in the system and to which a monitor client can hook up
|
15
|
+
# @note The non-error ones support timestamp benchmarking
|
16
|
+
# @note Depending on Karafka extensions and additional engines, this might not be the
|
17
|
+
# complete list of all the events. Please use the #available_events on fully loaded
|
18
|
+
# Karafka system to determine all of the events you can use.
|
19
|
+
EVENTS = %w[
|
20
|
+
app.initialized
|
21
|
+
app.running
|
22
|
+
app.stopping
|
23
|
+
app.stopped
|
24
|
+
|
25
|
+
consumer.consumed
|
26
|
+
consumer.revoked
|
27
|
+
consumer.shutdown
|
28
|
+
|
29
|
+
process.notice_signal
|
30
|
+
|
31
|
+
connection.listener.before_fetch_loop
|
32
|
+
connection.listener.fetch_loop
|
33
|
+
connection.listener.fetch_loop.received
|
34
|
+
|
35
|
+
worker.process
|
36
|
+
worker.processed
|
37
|
+
|
38
|
+
statistics.emitted
|
39
|
+
|
40
|
+
error.occurred
|
41
|
+
].freeze
|
42
|
+
|
43
|
+
private_constant :EVENTS
|
44
|
+
|
45
|
+
# @return [Karafka::Instrumentation::Notifications] notifications bus with all the events registered
|
46
|
+
def initialize
|
47
|
+
super
|
48
|
+
EVENTS.each { |event| register_event(event) }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -5,19 +5,19 @@ module Karafka
|
|
5
5
|
# Listener that sets a proc title with a nice descriptive value
|
6
6
|
class ProctitleListener
|
7
7
|
# Updates proc title to an initializing one
|
8
|
-
# @param _event [
|
8
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
9
9
|
def on_app_initializing(_event)
|
10
10
|
setproctitle('initializing')
|
11
11
|
end
|
12
12
|
|
13
13
|
# Updates proc title to a running one
|
14
|
-
# @param _event [
|
14
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
15
15
|
def on_app_running(_event)
|
16
16
|
setproctitle('running')
|
17
17
|
end
|
18
18
|
|
19
19
|
# Updates proc title to a stopping one
|
20
|
-
# @param _event [
|
20
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
21
21
|
def on_app_stopping(_event)
|
22
22
|
setproctitle('stopping')
|
23
23
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
{"title":"Karafka monitoring dashboard","description":"","widgets":[{"id":7444969424381053,"definition":{"title":"Stability & errors","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8304008422587936,"definition":{"title":"Client connects and disconnects","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Connects","formula":"query1"},{"alias":"Disconnects","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.connection.connects{*} by {host}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.connection.disconnects{*} by {host}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3722865443336921,"definition":{"title":"Errors encountered (any)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"in-karafka errors","formula":"query1"},{"alias":"librdkafka consume errors","formula":"query2"},{"alias":"librdkafka receive errors","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{*} by {type}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consume.errors{*}.as_count()","data_source":"metrics","name":"query2"},{"query":"sum:karafka.receive.errors{*}.as_count()","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5477381252952760,"definition":{"title":"Processing 
errors","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2357301680769076,"definition":{"title":"Processing errors rate per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% error rate per topic","formula":"(query1 / (query1 + query2)) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {topic,partition}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {topic,partition}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":3902930069982135,"definition":{"title":"Batches successful vs failures","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Successfully processed batch","formula":"query1"},{"alias":"Batch processing with error","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"avg:karafka.error_occurred{type:consumer.consume.error} by 
{partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":718749162159145,"definition":{"title":"Consumer instances revocations and shutdowns","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumer instances revokations","formula":"query1"},{"alias":"Consumer instances shutdowns","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.revoked{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.shutdown{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":5}},{"id":5988438511387100,"definition":{"title":"Workers poll","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8769294644934352,"definition":{"title":"Enqueued jobs","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Enqueued jobs","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.enqueued_jobs.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2714502141463873,"definition":{"title":"Workers usage","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Busy workers 
(p95)","formula":"query1"},{"alias":"Total workers","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5370086629441984,"definition":{"title":"Workers % utilization","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% workers utilization","formula":"(query1 / query2) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}}]},"layout":{"x":0,"y":5,"width":12,"height":3}},{"id":8544040083223278,"definition":{"title":"Throughput ","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":3740207481939733,"definition":{"title":"Offset lag changes","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"derivative(query1)"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.offset{*} by {topic,partition}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":6319110548544878,"definition":{"title":"Batches processed per 
topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6232784865331443,"definition":{"title":"Messages consumed per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2321394598982770,"definition":{"title":"Consumption lag (in seconds)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumption lag in s (max)","formula":"query2 / 1000"},{"alias":"Consumption lag in s (avg)","formula":"query3 / 1000"},{"alias":"Consumption lag in s (p95)","formula":"query1 / 
1000"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumption_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumption_lag.avg{*}","data_source":"metrics","name":"query3"},{"query":"max:karafka.consumer.consumption_lag.95percentile{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":1062074781483741,"definition":{"title":"Processing lag (in ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Processing lag in ms (p95)","formula":"query1"},{"alias":"Processing lag in ms (max)","formula":"query2"},{"alias":"Processing lag in ms (avg)","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.processing_lag.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.processing_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.processing_lag.avg{*}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":7497794728674267,"definition":{"title":"Batch processing time","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"},{"formula":"query2"},{"formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumed.time_taken.95percentile{*} by {topic,partition}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.consumed.time_taken.max{*} by 
{topic,partition}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumed.time_taken.avg{*} by {topic,partition}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":4192833027984161,"definition":{"title":"Batch size per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Batch size p95","formula":"query1"},{"alias":"Batch size avg","formula":"query2"},{"alias":"Batch size max","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batch_size.95percentile{*} by {partition,topic}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batch_size.avg{*} by {partition,topic}","data_source":"metrics","name":"query2"},{"query":"sum:karafka.consumer.batch_size.max{*} by {partition,topic}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":4,"width":4,"height":2}},{"id":4741598444771147,"definition":{"title":"Messages consumed overall","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / 
query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":4,"width":4,"height":2}},{"id":4502534794102513,"definition":{"title":"Polling times (ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"time":{},"type":"timeseries","requests":[{"formulas":[{"alias":"p95 ms polling time","formula":"query1"},{"alias":"max ms polling time","formula":"query2"},{"alias":"average ms polling time","formula":"query3"}],"queries":[{"name":"query1","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.95percentile{*}"},{"name":"query2","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.max{*}"},{"name":"query3","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.avg{*}"}],"response_format":"timeseries","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":4,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":7,"is_column_break":true}}],"template_variables":[],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"s3u-z47-i6u"}
|
@@ -0,0 +1,232 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Karafka
|
4
|
+
module Instrumentation
|
5
|
+
# Namespace for vendor specific instrumentation
|
6
|
+
module Vendors
|
7
|
+
# Datadog specific instrumentation
|
8
|
+
module Datadog
|
9
|
+
# Listener that can be used to subscribe to Karafka to receive stats via StatsD
|
10
|
+
# and/or Datadog
|
11
|
+
#
|
12
|
+
# @note You need to set up the `dogstatsd-ruby` client and assign it to the `client` setting
|
13
|
+
class Listener
|
14
|
+
extend Forwardable
include ::Karafka::Core::Configurable

# Expose the configured values directly on the listener instance
def_delegators :config, :client, :rd_kafka_metrics, :namespace, :default_tags

# Describes a single rdkafka statistic to publish: the reporting method to use (`type`),
# the statistics section it is read from (`scope`), the published metric `name` and the
# key (or key path) of the value within that section (`key_location`)
RdKafkaMetric = Struct.new(:type, :scope, :name, :key_location)

# Prefix put in front of every published metric name
setting :namespace, default: 'karafka'

# Datadog client used for publishing the metrics (there is no default)
setting :client

# Tags attached to the published metrics (for example the hostname)
# Expected format (hostname example): `["host:#{Socket.gethostname}"]`
setting :default_tags, default: []

# rdkafka statistics that get published each time statistics are emitted
#
# The default list is fairly broad, so it can be tuned when needed
# Note that the ones with the `_d` suffix come from Karafka, not rdkafka or Kafka
setting :rd_kafka_metrics, default: [
  # Client metrics
  RdKafkaMetric.new(:count, :root, 'messages.consumed', 'rxmsgs_d'),
  RdKafkaMetric.new(:count, :root, 'messages.consumed.bytes', 'rxmsg_bytes'),

  # Broker metrics
  RdKafkaMetric.new(:count, :brokers, 'consume.attempts', 'txretries_d'),
  RdKafkaMetric.new(:count, :brokers, 'consume.errors', 'txerrs_d'),
  RdKafkaMetric.new(:count, :brokers, 'receive.errors', 'rxerrs_d'),
  RdKafkaMetric.new(:count, :brokers, 'connection.connects', 'connects_d'),
  RdKafkaMetric.new(:count, :brokers, 'connection.disconnects', 'disconnects_d'),
  RdKafkaMetric.new(:gauge, :brokers, 'network.latency.avg', %w[rtt avg]),
  RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p95', %w[rtt p95]),
  RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p99', %w[rtt p99])
].freeze

configure
|
53
|
+
|
54
|
+
# @param block [Proc] configuration block
|
55
|
+
def initialize(&block)
  # Always run the block-less configuration first; the user block (if any) is applied
  # on top of it afterwards
  configure

  return unless block

  setup(&block)
end
|
59
|
+
|
60
|
+
# @param block [Proc] configuration block
|
61
|
+
# @note We define this alias to be consistent with `WaterDrop#setup`
|
62
|
+
def setup(&block)
  # Plain pass-through: the block is handed over to `configure` as is
  configure(&block)
end
|
65
|
+
|
66
|
+
# Hooks up to WaterDrop instrumentation for emitted statistics
|
67
|
+
#
|
68
|
+
# @param event [Karafka::Core::Monitoring::Event]
|
69
|
+
def on_statistics_emitted(event)
  emitted = event[:statistics]

  # Every configured metric is resolved against the same statistics payload
  rd_kafka_metrics.each { |rd_metric| report_metric(rd_metric, emitted) }
end
|
76
|
+
|
77
|
+
# Increases the errors count by 1
|
78
|
+
#
|
79
|
+
# @param event [Karafka::Core::Monitoring::Event]
|
80
|
+
def on_error_occurred(event)
  error_tags = ["type:#{event[:type]}"]
  origin = event.payload[:caller]

  # Topic and partition can only be tagged when the error source exposes messages
  if origin.respond_to?(:messages)
    details = origin.messages.metadata

    error_tags.push("topic:#{details.topic}", "partition:#{details.partition}")
  end

  count('error_occurred', 1, tags: default_tags + error_tags)
end
|
94
|
+
|
95
|
+
# Reports how many messages we've polled and how much time we spent on it
|
96
|
+
#
|
97
|
+
# @param event [Karafka::Core::Monitoring::Event]
|
98
|
+
def on_connection_listener_fetch_loop_received(event)
  # Both values are read up front, before anything gets published
  elapsed = event[:time]
  polled = event[:messages_buffer].size

  histogram('listener.polling.time_taken', elapsed, tags: default_tags)
  histogram('listener.polling.messages', polled, tags: default_tags)
end
|
105
|
+
|
106
|
+
# Here we report the majority of things related to processing as we have access to the
|
107
|
+
# consumer
|
108
|
+
# @param event [Karafka::Core::Monitoring::Event]
|
109
|
+
def on_consumer_consumed(event)
  batch = event.payload[:caller].messages
  info = batch.metadata

  # All metrics below share the same topic and partition tags
  tags = default_tags + ["topic:#{info.topic}", "partition:#{info.partition}"]

  count('consumer.messages', batch.count, tags: tags)
  count('consumer.batches', 1, tags: tags)
  gauge('consumer.offset', info.last_offset, tags: tags)
  histogram('consumer.consumed.time_taken', event[:time], tags: tags)
  histogram('consumer.batch_size', batch.count, tags: tags)
  histogram('consumer.processing_lag', info.processing_lag, tags: tags)
  histogram('consumer.consumption_lag', info.consumption_lag, tags: tags)
end
|
126
|
+
|
127
|
+
# @param event [Karafka::Core::Monitoring::Event]
|
128
|
+
def on_consumer_revoked(event)
  info = event.payload[:caller].messages.metadata

  # One revocation is counted per event, tagged with its topic and partition
  count(
    'consumer.revoked',
    1,
    tags: default_tags + ["topic:#{info.topic}", "partition:#{info.partition}"]
  )
end
|
139
|
+
|
140
|
+
# @param event [Karafka::Core::Monitoring::Event]
|
141
|
+
def on_consumer_shutdown(event)
  info = event.payload[:caller].messages.metadata

  # One shutdown is counted per event, tagged with its topic and partition
  count(
    'consumer.shutdown',
    1,
    tags: default_tags + ["topic:#{info.topic}", "partition:#{info.partition}"]
  )
end
|
152
|
+
|
153
|
+
# Worker related metrics
|
154
|
+
# @param event [Karafka::Core::Monitoring::Event]
|
155
|
+
def on_worker_process(event)
  queue_stats = event[:jobs_queue].statistics

  # The thread total comes from the application config, the rest from the jobs queue
  gauge('worker.total_threads', Karafka::App.config.concurrency, tags: default_tags)
  histogram('worker.processing', queue_stats[:processing], tags: default_tags)
  histogram('worker.enqueued_jobs', queue_stats[:enqueued], tags: default_tags)
end
|
162
|
+
|
163
|
+
# We report this metric before and after processing for higher accuracy
|
164
|
+
# Without this, the utilization would not be fully reflected
|
165
|
+
# @param event [Karafka::Core::Monitoring::Event]
|
166
|
+
def on_worker_processed(event)
  busy = event[:jobs_queue].statistics[:processing]

  histogram('worker.processing', busy, tags: default_tags)
end
|
171
|
+
|
172
|
+
private
|
173
|
+
|
174
|
+
# Defines one forwarding method per supported client metric type. Each of them prefixes
# the metric key with the namespace and passes all remaining arguments to the client
%i[count gauge histogram increment decrement].each do |metric_type|
  class_eval <<~RUBY, __FILE__, __LINE__ + 1
    def #{metric_type}(key, *args)
      client.#{metric_type}(namespaced_metric(key), *args)
    end
  RUBY
end
|
190
|
+
|
191
|
+
# Wraps metric name in listener's namespace
|
192
|
+
# @param metric_name [String] RdKafkaMetric name
|
193
|
+
# @return [String]
|
194
|
+
def namespaced_metric(metric_name)
  prefix = namespace

  "#{prefix}.#{metric_name}"
end
|
197
|
+
|
198
|
+
# Reports a given metric statistics to Datadog
|
199
|
+
# @param metric [RdKafkaMetric] metric value object
|
200
|
+
# @param statistics [Hash] hash with all the statistics emitted
|
201
|
+
def report_metric(metric, statistics)
  case metric.scope
  when :root
    key_path = Array(metric.key_location)

    # An empty path used to fail with an ArgumentError (`fetch` without arguments),
    # so keep failing the same way instead of publishing the whole statistics hash
    raise ArgumentError, "missing key_location for #{metric.name}" if key_path.empty?

    # Walk the path one level at a time. Splatting a multi-element path into a single
    # `fetch` call would treat the second element as the fallback value instead of as
    # a nested key. A missing key still raises KeyError, exactly as for a single key
    value = key_path.reduce(statistics) { |section, key| section.fetch(key) }

    public_send(metric.type, metric.name, value, tags: default_tags)
  when :brokers
    statistics.fetch('brokers').each_value do |broker_statistics|
      # Skip bootstrap nodes
      # Bootstrap nodes have nodeid -1, other nodes have positive node ids
      next if broker_statistics['nodeid'] == -1

      public_send(
        metric.type,
        metric.name,
        broker_statistics.dig(*metric.key_location),
        tags: default_tags + ["broker:#{broker_statistics['nodename']}"]
      )
    end
  else
    # Only the :root and :brokers scopes are supported
    raise ArgumentError, metric.scope
  end
end
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
231
|
+
end
|
232
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Karafka
  # Gives access to the shared statistics and error callbacks managers
  #
  # @note Since we can only have one statistics callbacks manager and one error callbacks
  #   manager, we use the WaterDrop ones that are already configured.
  module Instrumentation
    # @return [::WaterDrop::CallbacksManager] manager for statistics callbacks
    def self.statistics_callbacks
      ::WaterDrop::Instrumentation.statistics_callbacks
    end

    # @return [::WaterDrop::CallbacksManager] manager for error callbacks
    def self.error_callbacks
      ::WaterDrop::Instrumentation.error_callbacks
    end
  end
end
|