karafka 1.4.13 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +3 -3
- data/.github/workflows/ci.yml +85 -30
- data/.ruby-version +1 -1
- data/CHANGELOG.md +268 -7
- data/CONTRIBUTING.md +10 -19
- data/Gemfile +6 -0
- data/Gemfile.lock +44 -87
- data/LICENSE +17 -0
- data/LICENSE-COMM +89 -0
- data/LICENSE-LGPL +165 -0
- data/README.md +44 -48
- data/bin/benchmarks +85 -0
- data/bin/create_token +22 -0
- data/bin/integrations +237 -0
- data/bin/karafka +4 -0
- data/bin/scenario +29 -0
- data/bin/stress_many +13 -0
- data/bin/stress_one +13 -0
- data/bin/wait_for_kafka +20 -0
- data/certs/karafka-pro.pem +11 -0
- data/config/errors.yml +55 -40
- data/docker-compose.yml +39 -3
- data/karafka.gemspec +11 -17
- data/lib/active_job/karafka.rb +21 -0
- data/lib/active_job/queue_adapters/karafka_adapter.rb +26 -0
- data/lib/karafka/active_job/consumer.rb +26 -0
- data/lib/karafka/active_job/dispatcher.rb +38 -0
- data/lib/karafka/active_job/job_extensions.rb +34 -0
- data/lib/karafka/active_job/job_options_contract.rb +21 -0
- data/lib/karafka/active_job/routing/extensions.rb +31 -0
- data/lib/karafka/app.rb +15 -20
- data/lib/karafka/base_consumer.rb +181 -31
- data/lib/karafka/cli/base.rb +4 -4
- data/lib/karafka/cli/info.rb +43 -9
- data/lib/karafka/cli/install.rb +19 -10
- data/lib/karafka/cli/server.rb +17 -42
- data/lib/karafka/cli.rb +4 -11
- data/lib/karafka/connection/client.rb +385 -90
- data/lib/karafka/connection/listener.rb +246 -38
- data/lib/karafka/connection/listeners_batch.rb +24 -0
- data/lib/karafka/connection/messages_buffer.rb +84 -0
- data/lib/karafka/connection/pauses_manager.rb +46 -0
- data/lib/karafka/connection/raw_messages_buffer.rb +101 -0
- data/lib/karafka/connection/rebalance_manager.rb +78 -0
- data/lib/karafka/contracts/base.rb +17 -0
- data/lib/karafka/contracts/config.rb +88 -11
- data/lib/karafka/contracts/consumer_group.rb +21 -189
- data/lib/karafka/contracts/consumer_group_topic.rb +34 -11
- data/lib/karafka/contracts/server_cli_options.rb +19 -18
- data/lib/karafka/contracts.rb +1 -1
- data/lib/karafka/env.rb +46 -0
- data/lib/karafka/errors.rb +21 -21
- data/lib/karafka/helpers/async.rb +33 -0
- data/lib/karafka/helpers/colorize.rb +20 -0
- data/lib/karafka/helpers/multi_delegator.rb +2 -2
- data/lib/karafka/instrumentation/callbacks/error.rb +40 -0
- data/lib/karafka/instrumentation/callbacks/statistics.rb +41 -0
- data/lib/karafka/instrumentation/logger_listener.rb +164 -0
- data/lib/karafka/instrumentation/monitor.rb +13 -61
- data/lib/karafka/instrumentation/notifications.rb +52 -0
- data/lib/karafka/instrumentation/proctitle_listener.rb +3 -3
- data/lib/karafka/instrumentation/vendors/datadog/dashboard.json +1 -0
- data/lib/karafka/instrumentation/vendors/datadog/listener.rb +232 -0
- data/lib/karafka/instrumentation.rb +21 -0
- data/lib/karafka/licenser.rb +75 -0
- data/lib/karafka/messages/batch_metadata.rb +45 -0
- data/lib/karafka/messages/builders/batch_metadata.rb +40 -0
- data/lib/karafka/messages/builders/message.rb +39 -0
- data/lib/karafka/messages/builders/messages.rb +32 -0
- data/lib/karafka/{params/params.rb → messages/message.rb} +7 -12
- data/lib/karafka/messages/messages.rb +64 -0
- data/lib/karafka/{params → messages}/metadata.rb +4 -6
- data/lib/karafka/messages/seek.rb +9 -0
- data/lib/karafka/patches/rdkafka/consumer.rb +22 -0
- data/lib/karafka/pro/active_job/consumer.rb +46 -0
- data/lib/karafka/pro/active_job/dispatcher.rb +61 -0
- data/lib/karafka/pro/active_job/job_options_contract.rb +32 -0
- data/lib/karafka/pro/base_consumer.rb +82 -0
- data/lib/karafka/pro/contracts/base.rb +21 -0
- data/lib/karafka/pro/contracts/consumer_group.rb +34 -0
- data/lib/karafka/pro/contracts/consumer_group_topic.rb +33 -0
- data/lib/karafka/pro/loader.rb +76 -0
- data/lib/karafka/pro/performance_tracker.rb +80 -0
- data/lib/karafka/pro/processing/coordinator.rb +72 -0
- data/lib/karafka/pro/processing/jobs/consume_non_blocking.rb +37 -0
- data/lib/karafka/pro/processing/jobs_builder.rb +32 -0
- data/lib/karafka/pro/processing/partitioner.rb +60 -0
- data/lib/karafka/pro/processing/scheduler.rb +56 -0
- data/lib/karafka/pro/routing/builder_extensions.rb +30 -0
- data/lib/karafka/pro/routing/topic_extensions.rb +38 -0
- data/lib/karafka/pro.rb +13 -0
- data/lib/karafka/process.rb +1 -0
- data/lib/karafka/processing/coordinator.rb +88 -0
- data/lib/karafka/processing/coordinators_buffer.rb +54 -0
- data/lib/karafka/processing/executor.rb +118 -0
- data/lib/karafka/processing/executors_buffer.rb +88 -0
- data/lib/karafka/processing/jobs/base.rb +51 -0
- data/lib/karafka/processing/jobs/consume.rb +42 -0
- data/lib/karafka/processing/jobs/revoked.rb +22 -0
- data/lib/karafka/processing/jobs/shutdown.rb +23 -0
- data/lib/karafka/processing/jobs_builder.rb +29 -0
- data/lib/karafka/processing/jobs_queue.rb +144 -0
- data/lib/karafka/processing/partitioner.rb +22 -0
- data/lib/karafka/processing/result.rb +29 -0
- data/lib/karafka/processing/scheduler.rb +22 -0
- data/lib/karafka/processing/worker.rb +88 -0
- data/lib/karafka/processing/workers_batch.rb +27 -0
- data/lib/karafka/railtie.rb +113 -0
- data/lib/karafka/routing/builder.rb +15 -24
- data/lib/karafka/routing/consumer_group.rb +11 -19
- data/lib/karafka/routing/consumer_mapper.rb +1 -2
- data/lib/karafka/routing/router.rb +1 -1
- data/lib/karafka/routing/subscription_group.rb +53 -0
- data/lib/karafka/routing/subscription_groups_builder.rb +53 -0
- data/lib/karafka/routing/topic.rb +61 -24
- data/lib/karafka/routing/topics.rb +38 -0
- data/lib/karafka/runner.rb +51 -0
- data/lib/karafka/serialization/json/deserializer.rb +6 -15
- data/lib/karafka/server.rb +67 -26
- data/lib/karafka/setup/config.rb +147 -175
- data/lib/karafka/status.rb +14 -5
- data/lib/karafka/templates/example_consumer.rb.erb +16 -0
- data/lib/karafka/templates/karafka.rb.erb +15 -51
- data/lib/karafka/time_trackers/base.rb +19 -0
- data/lib/karafka/time_trackers/pause.rb +92 -0
- data/lib/karafka/time_trackers/poll.rb +65 -0
- data/lib/karafka/version.rb +1 -1
- data/lib/karafka.rb +38 -17
- data.tar.gz.sig +0 -0
- metadata +118 -120
- metadata.gz.sig +0 -0
- data/MIT-LICENCE +0 -18
- data/lib/karafka/assignment_strategies/round_robin.rb +0 -13
- data/lib/karafka/attributes_map.rb +0 -63
- data/lib/karafka/backends/inline.rb +0 -16
- data/lib/karafka/base_responder.rb +0 -226
- data/lib/karafka/cli/flow.rb +0 -48
- data/lib/karafka/cli/missingno.rb +0 -19
- data/lib/karafka/code_reloader.rb +0 -67
- data/lib/karafka/connection/api_adapter.rb +0 -158
- data/lib/karafka/connection/batch_delegator.rb +0 -55
- data/lib/karafka/connection/builder.rb +0 -23
- data/lib/karafka/connection/message_delegator.rb +0 -36
- data/lib/karafka/consumers/batch_metadata.rb +0 -10
- data/lib/karafka/consumers/callbacks.rb +0 -71
- data/lib/karafka/consumers/includer.rb +0 -64
- data/lib/karafka/consumers/responders.rb +0 -24
- data/lib/karafka/consumers/single_params.rb +0 -15
- data/lib/karafka/contracts/responder_usage.rb +0 -54
- data/lib/karafka/fetcher.rb +0 -42
- data/lib/karafka/helpers/class_matcher.rb +0 -88
- data/lib/karafka/helpers/config_retriever.rb +0 -46
- data/lib/karafka/helpers/inflector.rb +0 -26
- data/lib/karafka/instrumentation/stdout_listener.rb +0 -140
- data/lib/karafka/params/batch_metadata.rb +0 -26
- data/lib/karafka/params/builders/batch_metadata.rb +0 -30
- data/lib/karafka/params/builders/params.rb +0 -38
- data/lib/karafka/params/builders/params_batch.rb +0 -25
- data/lib/karafka/params/params_batch.rb +0 -60
- data/lib/karafka/patches/ruby_kafka.rb +0 -47
- data/lib/karafka/persistence/client.rb +0 -29
- data/lib/karafka/persistence/consumers.rb +0 -45
- data/lib/karafka/persistence/topics.rb +0 -48
- data/lib/karafka/responders/builder.rb +0 -36
- data/lib/karafka/responders/topic.rb +0 -55
- data/lib/karafka/routing/topic_mapper.rb +0 -53
- data/lib/karafka/serialization/json/serializer.rb +0 -31
- data/lib/karafka/setup/configurators/water_drop.rb +0 -36
- data/lib/karafka/templates/application_responder.rb.erb +0 -11
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Karafka
|
|
4
|
+
module Instrumentation
|
|
5
|
+
module Callbacks
|
|
6
|
+
# Statistics callback handler
|
|
7
|
+
# @see `WaterDrop::Instrumentation::Callbacks::Statistics` for details on why we decorate
|
|
8
|
+
# those statistics
|
|
9
|
+
class Statistics
|
|
10
|
+
# @param subscription_group_id [String] id of the current subscription group
|
|
11
|
+
# @param consumer_group_id [String] id of the current consumer group
|
|
12
|
+
# @param client_name [String] rdkafka client name
|
|
13
|
+
# @param monitor [WaterDrop::Instrumentation::Monitor] monitor we are using
|
|
14
|
+
def initialize(subscription_group_id, consumer_group_id, client_name, monitor)
|
|
15
|
+
@subscription_group_id = subscription_group_id
|
|
16
|
+
@consumer_group_id = consumer_group_id
|
|
17
|
+
@client_name = client_name
|
|
18
|
+
@monitor = monitor
|
|
19
|
+
@statistics_decorator = ::Karafka::Core::Monitoring::StatisticsDecorator.new
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
# Emits decorated statistics to the monitor
|
|
23
|
+
# @param statistics [Hash] rdkafka statistics
|
|
24
|
+
def call(statistics)
|
|
25
|
+
# Emit only statistics related to our client
|
|
26
|
+
# rdkafka does not have per-instance statistics hook, thus we need to make sure that we
|
|
27
|
+
# emit only stats that are related to the current client. Otherwise we would emit all of
|
|
28
|
+
# them all the time.
|
|
29
|
+
return unless @client_name == statistics['name']
|
|
30
|
+
|
|
31
|
+
@monitor.instrument(
|
|
32
|
+
'statistics.emitted',
|
|
33
|
+
subscription_group_id: @subscription_group_id,
|
|
34
|
+
consumer_group_id: @consumer_group_id,
|
|
35
|
+
statistics: @statistics_decorator.call(statistics)
|
|
36
|
+
)
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Karafka
|
|
4
|
+
module Instrumentation
|
|
5
|
+
# Default listener that hooks up to our instrumentation and uses its events for logging
|
|
6
|
+
# It can be removed/replaced or anything without any harm to the Karafka app flow.
|
|
7
|
+
class LoggerListener
|
|
8
|
+
# Log levels that we use in this particular listener
|
|
9
|
+
USED_LOG_LEVELS = %i[
|
|
10
|
+
debug
|
|
11
|
+
info
|
|
12
|
+
warn
|
|
13
|
+
error
|
|
14
|
+
fatal
|
|
15
|
+
].freeze
|
|
16
|
+
|
|
17
|
+
# Logs each message-fetching attempt
|
|
18
|
+
#
|
|
19
|
+
# @param event [Karafka::Core::Monitoring::Event] event details including payload
|
|
20
|
+
def on_connection_listener_fetch_loop(event)
|
|
21
|
+
listener = event[:caller]
|
|
22
|
+
debug "[#{listener.id}] Polling messages..."
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Logs about messages that we've received from Kafka
|
|
26
|
+
#
|
|
27
|
+
# @param event [Karafka::Core::Monitoring::Event] event details including payload
|
|
28
|
+
def on_connection_listener_fetch_loop_received(event)
|
|
29
|
+
listener = event[:caller]
|
|
30
|
+
time = event[:time]
|
|
31
|
+
messages_count = event[:messages_buffer].size
|
|
32
|
+
|
|
33
|
+
message = "[#{listener.id}] Polled #{messages_count} messages in #{time}ms"
|
|
34
|
+
|
|
35
|
+
# We don't want the "polled 0" in dev as it would spam the log
|
|
36
|
+
# Instead we publish only info when there was anything we could poll and fail over to the
|
|
37
|
+
# zero notifications when in debug mode
|
|
38
|
+
messages_count.zero? ? debug(message) : info(message)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
# Prints info about the fact that a given job has started
|
|
42
|
+
#
|
|
43
|
+
# @param event [Karafka::Core::Monitoring::Event] event details including payload
|
|
44
|
+
def on_worker_process(event)
|
|
45
|
+
job = event[:job]
|
|
46
|
+
job_type = job.class.to_s.split('::').last
|
|
47
|
+
consumer = job.executor.topic.consumer
|
|
48
|
+
topic = job.executor.topic.name
|
|
49
|
+
info "[#{job.id}] #{job_type} job for #{consumer} on #{topic} started"
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Prints info about the fact that a given job has finished
|
|
53
|
+
#
|
|
54
|
+
# @param event [Karafka::Core::Monitoring::Event] event details including payload
|
|
55
|
+
def on_worker_processed(event)
|
|
56
|
+
job = event[:job]
|
|
57
|
+
time = event[:time]
|
|
58
|
+
job_type = job.class.to_s.split('::').last
|
|
59
|
+
consumer = job.executor.topic.consumer
|
|
60
|
+
topic = job.executor.topic.name
|
|
61
|
+
info "[#{job.id}] #{job_type} job for #{consumer} on #{topic} finished in #{time}ms"
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Logs info about system signals that Karafka received and prints backtrace for threads in
|
|
65
|
+
# case of ttin
|
|
66
|
+
#
|
|
67
|
+
# @param event [Karafka::Core::Monitoring::Event] event details including payload
|
|
68
|
+
def on_process_notice_signal(event)
|
|
69
|
+
info "Received #{event[:signal]} system signal"
|
|
70
|
+
|
|
71
|
+
# We print backtrace only for ttin
|
|
72
|
+
return unless event[:signal] == :SIGTTIN
|
|
73
|
+
|
|
74
|
+
# Inspired by Sidekiq
|
|
75
|
+
Thread.list.each do |thread|
|
|
76
|
+
tid = (thread.object_id ^ ::Process.pid).to_s(36)
|
|
77
|
+
|
|
78
|
+
warn "Thread TID-#{tid} #{thread['label']}"
|
|
79
|
+
|
|
80
|
+
if thread.backtrace
|
|
81
|
+
warn thread.backtrace.join("\n")
|
|
82
|
+
else
|
|
83
|
+
warn '<no backtrace available>'
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Logs info that we're initializing Karafka app.
|
|
89
|
+
#
|
|
90
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
|
91
|
+
def on_app_initializing(_event)
|
|
92
|
+
info 'Initializing Karafka framework'
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
# Logs info that we're running Karafka app.
|
|
96
|
+
#
|
|
97
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
|
98
|
+
def on_app_running(_event)
|
|
99
|
+
info 'Running Karafka server'
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
# Logs info that we're going to stop the Karafka server.
|
|
103
|
+
#
|
|
104
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
|
105
|
+
def on_app_stopping(_event)
|
|
106
|
+
info 'Stopping Karafka server'
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
# Logs info that we stopped the Karafka server.
|
|
110
|
+
#
|
|
111
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
|
112
|
+
def on_app_stopped(_event)
|
|
113
|
+
info 'Stopped Karafka server'
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# There are many types of errors that can occur in many places, but we provide a single
|
|
117
|
+
# handler for all of them to simplify error instrumentation.
|
|
118
|
+
# @param event [Karafka::Core::Monitoring::Event] event details including payload
|
|
119
|
+
def on_error_occurred(event)
|
|
120
|
+
type = event[:type]
|
|
121
|
+
error = event[:error]
|
|
122
|
+
details = (error.backtrace || []).join("\n")
|
|
123
|
+
|
|
124
|
+
case type
|
|
125
|
+
when 'consumer.consume.error'
|
|
126
|
+
error "Consumer consuming error: #{error}"
|
|
127
|
+
error details
|
|
128
|
+
when 'consumer.revoked.error'
|
|
129
|
+
error "Consumer on revoked failed due to an error: #{error}"
|
|
130
|
+
error details
|
|
131
|
+
when 'consumer.shutdown.error'
|
|
132
|
+
error "Consumer on shutdown failed due to an error: #{error}"
|
|
133
|
+
error details
|
|
134
|
+
when 'worker.process.error'
|
|
135
|
+
fatal "Worker processing failed due to an error: #{error}"
|
|
136
|
+
fatal details
|
|
137
|
+
when 'connection.listener.fetch_loop.error'
|
|
138
|
+
error "Listener fetch loop error: #{error}"
|
|
139
|
+
error details
|
|
140
|
+
when 'licenser.expired'
|
|
141
|
+
error error
|
|
142
|
+
error details
|
|
143
|
+
when 'runner.call.error'
|
|
144
|
+
fatal "Runner crashed due to an error: #{error}"
|
|
145
|
+
fatal details
|
|
146
|
+
when 'app.stopping.error'
|
|
147
|
+
error 'Forceful Karafka server stop'
|
|
148
|
+
when 'librdkafka.error'
|
|
149
|
+
error "librdkafka internal error occurred: #{error}"
|
|
150
|
+
error details
|
|
151
|
+
else
|
|
152
|
+
# This should never happen. Please contact the maintainers
|
|
153
|
+
raise Errors::UnsupportedCaseError, event
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
USED_LOG_LEVELS.each do |log_level|
|
|
158
|
+
define_method log_level do |*args|
|
|
159
|
+
Karafka.logger.send(log_level, *args)
|
|
160
|
+
end
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
end
|
|
164
|
+
end
|
|
@@ -1,69 +1,21 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Karafka
|
|
4
|
-
# Namespace for all the things related with Karafka instrumentation process
|
|
5
4
|
module Instrumentation
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
# @note This class acts as a singleton because we are only permitted to have single monitor
|
|
12
|
-
# per running process (just as logger)
|
|
13
|
-
class Monitor < Dry::Monitor::Notifications
|
|
14
|
-
# List of events that we support in the system and to which a monitor client can hook up
|
|
15
|
-
# @note The non-error once support timestamp benchmarking
|
|
16
|
-
# @note Depending on Karafka extensions and additional engines, this might not be the
|
|
17
|
-
# complete list of all the events. Please use the #available_events on fully loaded
|
|
18
|
-
# Karafka system to determine all of the events you can use.
|
|
19
|
-
# Last 4 events are from WaterDrop but for convenience we use the same monitor for the
|
|
20
|
-
# whole karafka ecosystem
|
|
21
|
-
BASE_EVENTS = %w[
|
|
22
|
-
params.params.deserialize
|
|
23
|
-
params.params.deserialize.error
|
|
24
|
-
connection.listener.before_fetch_loop
|
|
25
|
-
connection.listener.fetch_loop
|
|
26
|
-
connection.listener.fetch_loop.error
|
|
27
|
-
connection.client.fetch_loop.error
|
|
28
|
-
connection.batch_delegator.call
|
|
29
|
-
connection.message_delegator.call
|
|
30
|
-
fetcher.call.error
|
|
31
|
-
backends.inline.process
|
|
32
|
-
process.notice_signal
|
|
33
|
-
consumers.responders.respond_with
|
|
34
|
-
async_producer.call.error
|
|
35
|
-
async_producer.call.retry
|
|
36
|
-
sync_producer.call.error
|
|
37
|
-
sync_producer.call.retry
|
|
38
|
-
app.initializing
|
|
39
|
-
app.initialized
|
|
40
|
-
app.running
|
|
41
|
-
app.stopping
|
|
42
|
-
app.stopping.error
|
|
43
|
-
app.stopped
|
|
44
|
-
].freeze
|
|
5
|
+
# Karafka instrumentation monitor that we use to publish events
|
|
6
|
+
# By default uses our internal notifications bus but can be used with
|
|
7
|
+
# `ActiveSupport::Notifications` as well
|
|
8
|
+
class Monitor < ::Karafka::Core::Monitoring::Monitor
|
|
9
|
+
attr_reader :notifications_bus
|
|
45
10
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
# @
|
|
49
|
-
def initialize
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
# Allows us to subscribe to events with a code that will be yielded upon events
|
|
55
|
-
# @param event_name_or_listener [String, Object] name of the event we want to subscribe to
|
|
56
|
-
# or a listener if we decide to go with object listener
|
|
57
|
-
def subscribe(event_name_or_listener)
|
|
58
|
-
return super unless event_name_or_listener.is_a?(String)
|
|
59
|
-
return super if available_events.include?(event_name_or_listener)
|
|
60
|
-
|
|
61
|
-
raise Errors::UnregisteredMonitorEventError, event_name_or_listener
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
# @return [Array<String>] names of available events to which we can subscribe
|
|
65
|
-
def available_events
|
|
66
|
-
__bus__.events.keys
|
|
11
|
+
# @param notifications_bus [Object] either our internal notifications bus or
|
|
12
|
+
# `ActiveSupport::Notifications`
|
|
13
|
+
# @param namespace [String, nil] namespace for events or nil if no namespace
|
|
14
|
+
def initialize(
|
|
15
|
+
notifications_bus = ::Karafka::Instrumentation::Notifications.new,
|
|
16
|
+
namespace = nil
|
|
17
|
+
)
|
|
18
|
+
super(notifications_bus, namespace)
|
|
67
19
|
end
|
|
68
20
|
end
|
|
69
21
|
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Karafka
|
|
4
|
+
# Namespace for all the things related with Karafka instrumentation process
|
|
5
|
+
module Instrumentation
|
|
6
|
+
# Monitor is used to hookup external monitoring services to monitor how Karafka works
|
|
7
|
+
# It provides a standardized API for checking incoming messages/enqueueing etc
|
|
8
|
+
# Since it is a pub-sub notifications bus, you can use as many subscribers/loggers at the
|
|
9
|
+
# same time, which means that you might have for example file logging and NewRelic at the same
|
|
10
|
+
# time
|
|
11
|
+
# @note This class acts as a singleton because we are only permitted to have single monitor
|
|
12
|
+
# per running process (just as logger)
|
|
13
|
+
class Notifications < Karafka::Core::Monitoring::Notifications
|
|
14
|
+
# List of events that we support in the system and to which a monitor client can hook up
|
|
15
|
+
# @note The non-error ones support timestamp benchmarking
|
|
16
|
+
# @note Depending on Karafka extensions and additional engines, this might not be the
|
|
17
|
+
# complete list of all the events. Please use the #available_events on fully loaded
|
|
18
|
+
# Karafka system to determine all of the events you can use.
|
|
19
|
+
EVENTS = %w[
|
|
20
|
+
app.initialized
|
|
21
|
+
app.running
|
|
22
|
+
app.stopping
|
|
23
|
+
app.stopped
|
|
24
|
+
|
|
25
|
+
consumer.consumed
|
|
26
|
+
consumer.revoked
|
|
27
|
+
consumer.shutdown
|
|
28
|
+
|
|
29
|
+
process.notice_signal
|
|
30
|
+
|
|
31
|
+
connection.listener.before_fetch_loop
|
|
32
|
+
connection.listener.fetch_loop
|
|
33
|
+
connection.listener.fetch_loop.received
|
|
34
|
+
|
|
35
|
+
worker.process
|
|
36
|
+
worker.processed
|
|
37
|
+
|
|
38
|
+
statistics.emitted
|
|
39
|
+
|
|
40
|
+
error.occurred
|
|
41
|
+
].freeze
|
|
42
|
+
|
|
43
|
+
private_constant :EVENTS
|
|
44
|
+
|
|
45
|
+
# @return [Karafka::Instrumentation::Notifications] notifications bus with all events registered
|
|
46
|
+
def initialize
|
|
47
|
+
super
|
|
48
|
+
EVENTS.each { |event| register_event(event) }
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
@@ -5,19 +5,19 @@ module Karafka
|
|
|
5
5
|
# Listener that sets a proc title with a nice descriptive value
|
|
6
6
|
class ProctitleListener
|
|
7
7
|
# Updates proc title to an initializing one
|
|
8
|
-
# @param _event [
|
|
8
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
|
9
9
|
def on_app_initializing(_event)
|
|
10
10
|
setproctitle('initializing')
|
|
11
11
|
end
|
|
12
12
|
|
|
13
13
|
# Updates proc title to a running one
|
|
14
|
-
# @param _event [
|
|
14
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
|
15
15
|
def on_app_running(_event)
|
|
16
16
|
setproctitle('running')
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
# Updates proc title to a stopping one
|
|
20
|
-
# @param _event [
|
|
20
|
+
# @param _event [Karafka::Core::Monitoring::Event] event details including payload
|
|
21
21
|
def on_app_stopping(_event)
|
|
22
22
|
setproctitle('stopping')
|
|
23
23
|
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"title":"Karafka monitoring dashboard","description":"","widgets":[{"id":7444969424381053,"definition":{"title":"Stability & errors","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8304008422587936,"definition":{"title":"Client connects and disconnects","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Connects","formula":"query1"},{"alias":"Disconnects","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.connection.connects{*} by {host}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.connection.disconnects{*} by {host}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3722865443336921,"definition":{"title":"Errors encountered (any)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"in-karafka errors","formula":"query1"},{"alias":"librdkafka consume errors","formula":"query2"},{"alias":"librdkafka receive errors","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{*} by {type}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consume.errors{*}.as_count()","data_source":"metrics","name":"query2"},{"query":"sum:karafka.receive.errors{*}.as_count()","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5477381252952760,"definition":{"title":"Processing 
errors","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2357301680769076,"definition":{"title":"Processing errors rate per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% error rate per topic","formula":"(query1 / (query1 + query2)) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {topic,partition}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {topic,partition}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":3902930069982135,"definition":{"title":"Batches successful vs failures","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Successfully processed batch","formula":"query1"},{"alias":"Batch processing with error","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"avg:karafka.error_occurred{type:consumer.consume.error} by 
{partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":718749162159145,"definition":{"title":"Consumer instances revocations and shutdowns","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumer instances revokations","formula":"query1"},{"alias":"Consumer instances shutdowns","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.revoked{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.shutdown{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":5}},{"id":5988438511387100,"definition":{"title":"Workers poll","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8769294644934352,"definition":{"title":"Enqueued jobs","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Enqueued jobs","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.enqueued_jobs.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2714502141463873,"definition":{"title":"Workers usage","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Busy workers 
(p95)","formula":"query1"},{"alias":"Total workers","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5370086629441984,"definition":{"title":"Workers % utilization","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% workers utilization","formula":"(query1 / query2) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}}]},"layout":{"x":0,"y":5,"width":12,"height":3}},{"id":8544040083223278,"definition":{"title":"Throughput ","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":3740207481939733,"definition":{"title":"Offset lag changes","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"derivative(query1)"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.offset{*} by {topic,partition}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":6319110548544878,"definition":{"title":"Batches processed per 
topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6232784865331443,"definition":{"title":"Messages consumed per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2321394598982770,"definition":{"title":"Consumption lag (in seconds)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumption lag in s (max)","formula":"query2 / 1000"},{"alias":"Consumption lag in s (avg)","formula":"query3 / 1000"},{"alias":"Consumption lag in s (p95)","formula":"query1 / 
1000"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumption_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumption_lag.avg{*}","data_source":"metrics","name":"query3"},{"query":"max:karafka.consumer.consumption_lag.95percentile{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":1062074781483741,"definition":{"title":"Processing lag (in ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Processing lag in ms (p95)","formula":"query1"},{"alias":"Processing lag in ms (max)","formula":"query2"},{"alias":"Processing lag in ms (avg)","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.processing_lag.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.processing_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.processing_lag.avg{*}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":7497794728674267,"definition":{"title":"Batch processing time","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"},{"formula":"query2"},{"formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumed.time_taken.95percentile{*} by {topic,partition}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.consumed.time_taken.max{*} by 
{topic,partition}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumed.time_taken.avg{*} by {topic,partition}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":4192833027984161,"definition":{"title":"Batch size per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Batch size p95","formula":"query1"},{"alias":"Batch size avg","formula":"query2"},{"alias":"Batch size max","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batch_size.95percentile{*} by {partition,topic}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batch_size.avg{*} by {partition,topic}","data_source":"metrics","name":"query2"},{"query":"sum:karafka.consumer.batch_size.max{*} by {partition,topic}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":4,"width":4,"height":2}},{"id":4741598444771147,"definition":{"title":"Messages consumed overall","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / 
query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":4,"width":4,"height":2}},{"id":4502534794102513,"definition":{"title":"Polling times (ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"time":{},"type":"timeseries","requests":[{"formulas":[{"alias":"p95 ms polling time","formula":"query1"},{"alias":"max ms polling time","formula":"query2"},{"alias":"average ms polling time","formula":"query3"}],"queries":[{"name":"query1","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.95percentile{*}"},{"name":"query2","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.max{*}"},{"name":"query3","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.avg{*}"}],"response_format":"timeseries","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":4,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":7,"is_column_break":true}}],"template_variables":[],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"s3u-z47-i6u"}
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Karafka
  module Instrumentation
    # Namespace for vendor specific instrumentation
    module Vendors
      # Datadog specific instrumentation
      module Datadog
        # Listener that can be used to subscribe to Karafka to receive stats via StatsD
        # and/or Datadog
        #
        # @note You need to setup the `dogstatsd-ruby` client and assign it to the `client`
        #   setting, as every metric is published through it
        class Listener
          include ::Karafka::Core::Configurable
          extend Forwardable

          def_delegators :config, :client, :rd_kafka_metrics, :namespace, :default_tags

          # Value object for storing a single rdkafka metric publishing details
          RdKafkaMetric = Struct.new(:type, :scope, :name, :key_location)

          # Namespace under which the DD metrics should be published
          setting :namespace, default: 'karafka'

          # Datadog client that we should use to publish the metrics
          setting :client

          # Default tags we want to publish (for example hostname)
          # Format as followed (example for hostname): `["host:#{Socket.gethostname}"]`
          setting :default_tags, default: []

          # All the rdkafka metrics we want to publish
          #
          # By default we publish quite a lot so this can be tuned
          # Note, that the ones with `_d` come from Karafka, not rdkafka or Kafka
          setting :rd_kafka_metrics, default: [
            # Client metrics
            RdKafkaMetric.new(:count, :root, 'messages.consumed', 'rxmsgs_d'),
            RdKafkaMetric.new(:count, :root, 'messages.consumed.bytes', 'rxmsg_bytes'),

            # Broker metrics
            RdKafkaMetric.new(:count, :brokers, 'consume.attempts', 'txretries_d'),
            RdKafkaMetric.new(:count, :brokers, 'consume.errors', 'txerrs_d'),
            RdKafkaMetric.new(:count, :brokers, 'receive.errors', 'rxerrs_d'),
            RdKafkaMetric.new(:count, :brokers, 'connection.connects', 'connects_d'),
            RdKafkaMetric.new(:count, :brokers, 'connection.disconnects', 'disconnects_d'),
            RdKafkaMetric.new(:gauge, :brokers, 'network.latency.avg', %w[rtt avg]),
            RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p95', %w[rtt p95]),
            RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p99', %w[rtt p99])
          ].freeze

          configure

          # @param block [Proc] configuration block
          def initialize(&block)
            configure
            setup(&block) if block
          end

          # @param block [Proc] configuration block
          # @note We define this alias to be consistent with `WaterDrop#setup`
          def setup(&block)
            configure(&block)
          end

          # Publishes every configured rdkafka metric out of the emitted statistics
          #
          # @param event [Karafka::Core::Monitoring::Event]
          def on_statistics_emitted(event)
            statistics = event[:statistics]

            rd_kafka_metrics.each do |metric|
              report_metric(metric, statistics)
            end
          end

          # Increases the errors count by 1
          #
          # @param event [Karafka::Core::Monitoring::Event]
          def on_error_occurred(event)
            extra_tags = ["type:#{event[:type]}"]
            source = event.payload[:caller]

            # Topic and partition are only known when the caller exposes `messages`
            extra_tags += consumer_tags(source) if source.respond_to?(:messages)

            count('error_occurred', 1, tags: default_tags + extra_tags)
          end

          # Reports how many messages we've polled and how much time did we spend on it
          #
          # @param event [Karafka::Core::Monitoring::Event]
          def on_connection_listener_fetch_loop_received(event)
            time_taken = event[:time]
            messages_count = event[:messages_buffer].size

            histogram('listener.polling.time_taken', time_taken, tags: default_tags)
            histogram('listener.polling.messages', messages_count, tags: default_tags)
          end

          # Here we report majority of things related to processing as we have access to the
          # consumer
          # @param event [Karafka::Core::Monitoring::Event]
          def on_consumer_consumed(event)
            consumer = event.payload[:caller]
            messages = consumer.messages
            metadata = messages.metadata
            tags = default_tags + consumer_tags(consumer)

            count('consumer.messages', messages.count, tags: tags)
            count('consumer.batches', 1, tags: tags)
            gauge('consumer.offset', metadata.last_offset, tags: tags)
            histogram('consumer.consumed.time_taken', event[:time], tags: tags)
            histogram('consumer.batch_size', messages.count, tags: tags)
            histogram('consumer.processing_lag', metadata.processing_lag, tags: tags)
            histogram('consumer.consumption_lag', metadata.consumption_lag, tags: tags)
          end

          # Increases the revocations count by 1 for the topic partition of the caller
          #
          # @param event [Karafka::Core::Monitoring::Event]
          def on_consumer_revoked(event)
            tags = default_tags + consumer_tags(event.payload[:caller])

            count('consumer.revoked', 1, tags: tags)
          end

          # Increases the shutdowns count by 1 for the topic partition of the caller
          #
          # @param event [Karafka::Core::Monitoring::Event]
          def on_consumer_shutdown(event)
            tags = default_tags + consumer_tags(event.payload[:caller])

            count('consumer.shutdown', 1, tags: tags)
          end

          # Worker related metrics
          # @param event [Karafka::Core::Monitoring::Event]
          def on_worker_process(event)
            jq_stats = event[:jobs_queue].statistics

            gauge('worker.total_threads', Karafka::App.config.concurrency, tags: default_tags)
            histogram('worker.processing', jq_stats[:processing], tags: default_tags)
            histogram('worker.enqueued_jobs', jq_stats[:enqueued], tags: default_tags)
          end

          # We report this metric before and after processing for higher accuracy
          # Without this, the utilization would not be fully reflected
          # @param event [Karafka::Core::Monitoring::Event]
          def on_worker_processed(event)
            jq_stats = event[:jobs_queue].statistics

            histogram('worker.processing', jq_stats[:processing], tags: default_tags)
          end

          # Thin wrappers around the client methods of the same name that prefix the metric
          # key with the listener's namespace.
          #
          # They live in the public section on purpose: `#report_metric` dispatches to them
          # via `public_send`, so they must not be private.
          %i[
            count
            gauge
            histogram
            increment
            decrement
          ].each do |metric_type|
            class_eval <<~METHODS, __FILE__, __LINE__ + 1
              def #{metric_type}(key, *args)
                client.#{metric_type}(
                  namespaced_metric(key),
                  *args
                )
              end
            METHODS
          end

          private

          # Wraps metric name in listener's namespace
          # @param metric_name [String] RdKafkaMetric name
          # @return [String]
          def namespaced_metric(metric_name)
            "#{namespace}.#{metric_name}"
          end

          # Builds the topic and partition tags out of the metadata of the caller's messages
          # @param consumer [#messages] object exposing `messages` with `metadata`
          # @return [Array<String>] topic and partition tags
          def consumer_tags(consumer)
            metadata = consumer.messages.metadata

            [
              "topic:#{metadata.topic}",
              "partition:#{metadata.partition}"
            ]
          end

          # Reports a given metric statistics to Datadog
          # @param metric [RdKafkaMetric] metric value object
          # @param statistics [Hash] hash with all the statistics emitted
          # @raise [KeyError] when a root scoped metric key is missing in the statistics
          # @raise [ArgumentError] when the metric scope is not supported
          def report_metric(metric, statistics)
            case metric.scope
            when :root
              # The key location may be a single key or a path of keys. Each step is fetched
              # strictly, so a path is followed into nested hashes instead of its second
              # element being taken as the `fetch` default value
              value = Array(metric.key_location).reduce(statistics) do |scope, key|
                scope.fetch(key)
              end

              public_send(
                metric.type,
                metric.name,
                value,
                tags: default_tags
              )
            when :brokers
              statistics.fetch('brokers').each_value do |broker_statistics|
                # Skip bootstrap nodes
                # Bootstrap nodes have nodeid -1, other nodes have positive
                # node ids
                next if broker_statistics['nodeid'] == -1

                public_send(
                  metric.type,
                  metric.name,
                  broker_statistics.dig(*metric.key_location),
                  tags: default_tags + ["broker:#{broker_statistics['nodename']}"]
                )
              end
            else
              raise ArgumentError, metric.scope
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Karafka
  # Access points to the callbacks managers used for instrumentation
  #
  # @note Since we can only have one statistics callbacks manager and one error callbacks
  #   manager we use WaterDrop's one that is already configured.
  module Instrumentation
    # Hands over the statistics callbacks manager that WaterDrop exposes
    # @return [::WaterDrop::CallbacksManager]
    def self.statistics_callbacks
      ::WaterDrop::Instrumentation.statistics_callbacks
    end

    # Hands over the error callbacks manager that WaterDrop exposes
    # @return [::WaterDrop::CallbacksManager]
    def self.error_callbacks
      ::WaterDrop::Instrumentation.error_callbacks
    end
  end
end
|