waterdrop 2.0.0 → 2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.github/workflows/ci.yml +33 -6
- data/.ruby-version +1 -1
- data/CHANGELOG.md +80 -0
- data/Gemfile +0 -2
- data/Gemfile.lock +36 -87
- data/MIT-LICENSE +18 -0
- data/README.md +180 -46
- data/certs/mensfeld.pem +21 -21
- data/config/errors.yml +29 -5
- data/docker-compose.yml +2 -1
- data/lib/{water_drop → waterdrop}/config.rb +47 -19
- data/lib/waterdrop/contracts/config.rb +40 -0
- data/lib/waterdrop/contracts/message.rb +60 -0
- data/lib/waterdrop/instrumentation/callbacks/delivery.rb +30 -0
- data/lib/waterdrop/instrumentation/callbacks/error.rb +36 -0
- data/lib/waterdrop/instrumentation/callbacks/statistics.rb +41 -0
- data/lib/waterdrop/instrumentation/callbacks/statistics_decorator.rb +77 -0
- data/lib/waterdrop/instrumentation/callbacks_manager.rb +39 -0
- data/lib/{water_drop/instrumentation/stdout_listener.rb → waterdrop/instrumentation/logger_listener.rb} +17 -26
- data/lib/waterdrop/instrumentation/monitor.rb +20 -0
- data/lib/{water_drop/instrumentation/monitor.rb → waterdrop/instrumentation/notifications.rb} +12 -13
- data/lib/waterdrop/instrumentation/vendors/datadog/dashboard.json +1 -0
- data/lib/waterdrop/instrumentation/vendors/datadog/listener.rb +210 -0
- data/lib/waterdrop/instrumentation.rb +20 -0
- data/lib/waterdrop/patches/rdkafka/bindings.rb +42 -0
- data/lib/waterdrop/patches/rdkafka/producer.rb +28 -0
- data/lib/{water_drop → waterdrop}/producer/async.rb +2 -2
- data/lib/{water_drop → waterdrop}/producer/buffer.rb +15 -8
- data/lib/waterdrop/producer/builder.rb +28 -0
- data/lib/{water_drop → waterdrop}/producer/sync.rb +2 -2
- data/lib/{water_drop → waterdrop}/producer.rb +29 -15
- data/lib/{water_drop → waterdrop}/version.rb +1 -1
- data/lib/waterdrop.rb +33 -2
- data/waterdrop.gemspec +12 -10
- data.tar.gz.sig +0 -0
- metadata +64 -97
- metadata.gz.sig +0 -0
- data/.github/FUNDING.yml +0 -1
- data/LICENSE +0 -165
- data/lib/water_drop/contracts/config.rb +0 -26
- data/lib/water_drop/contracts/message.rb +0 -41
- data/lib/water_drop/instrumentation.rb +0 -7
- data/lib/water_drop/producer/builder.rb +0 -63
- data/lib/water_drop/producer/statistics_decorator.rb +0 -71
- data/lib/water_drop.rb +0 -30
- /data/lib/{water_drop → waterdrop}/contracts.rb +0 -0
- /data/lib/{water_drop → waterdrop}/errors.rb +0 -0
- /data/lib/{water_drop → waterdrop}/producer/dummy_client.rb +0 -0
- /data/lib/{water_drop → waterdrop}/producer/status.rb +0 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
|
4
|
+
module Instrumentation
|
5
|
+
module Callbacks
|
6
|
+
# Callback that kicks in when error occurs and is published in a background thread
|
7
|
+
class Error
|
8
|
+
# @param producer_id [String] id of the current producer
|
9
|
+
# @param client_name [String] rdkafka client name
|
10
|
+
# @param monitor [WaterDrop::Instrumentation::Monitor] monitor we are using
|
11
|
+
def initialize(producer_id, client_name, monitor)
|
12
|
+
@producer_id = producer_id
|
13
|
+
@client_name = client_name
|
14
|
+
@monitor = monitor
|
15
|
+
end
|
16
|
+
|
17
|
+
# Runs the instrumentation monitor with error
|
18
|
+
# @param client_name [String] rdkafka client name
|
19
|
+
# @param error [Rdkafka::Error] error that occurred
|
20
|
+
# @note It will only instrument on errors of the client of our producer
|
21
|
+
def call(client_name, error)
|
22
|
+
# Emit only errors related to our client
|
23
|
+
# Same as with statistics (more explanation there)
|
24
|
+
return unless @client_name == client_name
|
25
|
+
|
26
|
+
@monitor.instrument(
|
27
|
+
'error.occurred',
|
28
|
+
error: error,
|
29
|
+
producer_id: @producer_id,
|
30
|
+
type: 'librdkafka.error'
|
31
|
+
)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
|
4
|
+
module Instrumentation
|
5
|
+
# Namespace for handlers of callbacks emitted by the kafka client lib
|
6
|
+
module Callbacks
|
7
|
+
# Statistics callback handler
|
8
|
+
# @note We decorate the statistics with our own decorator because some of the metrics from
|
9
|
+
# rdkafka are absolute. For example number of sent messages increases not in reference to
|
10
|
+
# previous statistics emit but from the beginning of the process. We decorate it with diff
|
11
|
+
# of all the numeric values against the data from the previous callback emit
|
12
|
+
class Statistics
|
13
|
+
# @param producer_id [String] id of the current producer
|
14
|
+
# @param client_name [String] rdkafka client name
|
15
|
+
# @param monitor [WaterDrop::Instrumentation::Monitor] monitor we are using
|
16
|
+
def initialize(producer_id, client_name, monitor)
|
17
|
+
@producer_id = producer_id
|
18
|
+
@client_name = client_name
|
19
|
+
@monitor = monitor
|
20
|
+
@statistics_decorator = StatisticsDecorator.new
|
21
|
+
end
|
22
|
+
|
23
|
+
# Emits decorated statistics to the monitor
|
24
|
+
# @param statistics [Hash] rdkafka statistics
|
25
|
+
def call(statistics)
|
26
|
+
# Emit only statistics related to our client
|
27
|
+
# rdkafka does not have per-instance statistics hook, thus we need to make sure that we
|
28
|
+
# emit only stats that are related to current producer. Otherwise we would emit all of
|
29
|
+
# them all the time.
|
30
|
+
return unless @client_name == statistics['name']
|
31
|
+
|
32
|
+
@monitor.instrument(
|
33
|
+
'statistics.emitted',
|
34
|
+
producer_id: @producer_id,
|
35
|
+
statistics: @statistics_decorator.call(statistics)
|
36
|
+
)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
|
4
|
+
module Instrumentation
|
5
|
+
module Callbacks
|
6
|
+
# Many of the librdkafka statistics are absolute values instead of a gauge.
|
7
|
+
# This means, that for example number of messages sent is an absolute growing value
|
8
|
+
# instead of being a value of messages sent from the last statistics report.
|
9
|
+
# This decorator calculates the diff against previously emitted stats, so we also get
|
10
|
+
# the diff together with the original values
|
11
|
+
class StatisticsDecorator
|
12
|
+
def initialize
|
13
|
+
@previous = {}.freeze
|
14
|
+
end
|
15
|
+
|
16
|
+
# @param emited_stats [Hash] original emitted statistics
|
17
|
+
# @return [Hash] emitted statistics extended with the diff data
|
18
|
+
# @note We modify the emitted statistics, instead of creating new. Since we don't expose
|
19
|
+
# any API to get raw data, users can just assume that the result of this decoration is
|
20
|
+
# the proper raw stats that they can use
|
21
|
+
def call(emited_stats)
|
22
|
+
diff(
|
23
|
+
@previous,
|
24
|
+
emited_stats
|
25
|
+
)
|
26
|
+
|
27
|
+
@previous = emited_stats
|
28
|
+
|
29
|
+
emited_stats.freeze
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
# Calculates the diff of the provided values and modifies in place the emitted statistics
|
35
|
+
#
|
36
|
+
# @param previous [Object] previous value from the given scope in which
|
37
|
+
# we are
|
38
|
+
# @param current [Object] current scope from emitted statistics
|
39
|
+
# @return [Object] the diff if the values were numerics or the current scope
|
40
|
+
def diff(previous, current)
|
41
|
+
if current.is_a?(Hash)
|
42
|
+
# @note We cannot use #each_key as we modify the content of the current scope
|
43
|
+
# in place (in case it's a hash)
|
44
|
+
current.keys.each do |key|
|
45
|
+
append(
|
46
|
+
current,
|
47
|
+
key,
|
48
|
+
diff((previous || {})[key], (current || {})[key])
|
49
|
+
)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# Diff can be computed only for numerics
|
54
|
+
return current unless current.is_a?(Numeric)
|
55
|
+
# If there was no previous value, delta is always zero
|
56
|
+
return 0 unless previous
|
57
|
+
# Should never happen but just in case, a type changed in between stats
|
58
|
+
return current unless previous.is_a?(Numeric)
|
59
|
+
|
60
|
+
current - previous
|
61
|
+
end
|
62
|
+
|
63
|
+
# Appends the result of the diff to a given key as long as the result is numeric
|
64
|
+
#
|
65
|
+
# @param current [Hash] current scope
|
66
|
+
# @param key [Symbol] key based on which we were diffing
|
67
|
+
# @param result [Object] diff result
|
68
|
+
def append(current, key, result)
|
69
|
+
return unless result.is_a?(Numeric)
|
70
|
+
return if current.frozen?
|
71
|
+
|
72
|
+
current["#{key}_d"] = result
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
|
4
|
+
module Instrumentation
|
5
|
+
# This manager allows us to register multiple callbacks into a hook that is supposed to support
|
6
|
+
# a single callback
|
7
|
+
class CallbacksManager
|
8
|
+
# @return [::WaterDrop::Instrumentation::CallbacksManager]
|
9
|
+
def initialize
|
10
|
+
@callbacks = Concurrent::Hash.new
|
11
|
+
end
|
12
|
+
|
13
|
+
# Invokes all the callbacks registered one after another
|
14
|
+
#
|
15
|
+
# @param args [Object] any args that should go to the callbacks
|
16
|
+
# @note We do not use `#each_value` here on purpose. With it being used, we cannot dispatch
|
17
|
+
# callbacks and add new at the same time. Since we don't know when and in what thread
|
18
|
+
# things are going to be added to the manager, we need to extract values into an array and
|
19
|
+
# run it. That way we can add new things at the same time.
|
20
|
+
def call(*args)
|
21
|
+
@callbacks.values.each { |callback| callback.call(*args) }
|
22
|
+
end
|
23
|
+
|
24
|
+
# Adds a callback to the manager
|
25
|
+
#
|
26
|
+
# @param id [String] id of the callback (used when deleting it)
|
27
|
+
# @param callable [#call] object that responds to a `#call` method
|
28
|
+
def add(id, callable)
|
29
|
+
@callbacks[id] = callable
|
30
|
+
end
|
31
|
+
|
32
|
+
# Removes the callback from the manager
|
33
|
+
# @param id [String] id of the callback we want to remove
|
34
|
+
def delete(id)
|
35
|
+
@callbacks.delete(id)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -6,8 +6,8 @@ module WaterDrop
|
|
6
6
|
# It can be removed/replaced or anything without any harm to the Waterdrop flow
|
7
7
|
# @note It is a module as we can use it then as a part of the Karafka framework listener
|
8
8
|
# as well as we can use it standalone
|
9
|
-
class
|
10
|
-
# @param logger [Object]
|
9
|
+
class LoggerListener
|
10
|
+
# @param logger [Object] logger we want to use
|
11
11
|
def initialize(logger)
|
12
12
|
@logger = logger
|
13
13
|
end
|
@@ -51,7 +51,7 @@ module WaterDrop
|
|
51
51
|
message = event[:message]
|
52
52
|
|
53
53
|
info(event, "Buffering of a message to '#{message[:topic]}' topic")
|
54
|
-
debug(event, [message
|
54
|
+
debug(event, [message])
|
55
55
|
end
|
56
56
|
|
57
57
|
# @param event [Dry::Events::Event] event that happened with the details
|
@@ -59,7 +59,7 @@ module WaterDrop
|
|
59
59
|
messages = event[:messages]
|
60
60
|
|
61
61
|
info(event, "Buffering of #{messages.size} messages")
|
62
|
-
debug(event, [messages,
|
62
|
+
debug(event, [messages, messages.size])
|
63
63
|
end
|
64
64
|
|
65
65
|
# @param event [Dry::Events::Event] event that happened with the details
|
@@ -70,15 +70,6 @@ module WaterDrop
|
|
70
70
|
debug(event, messages)
|
71
71
|
end
|
72
72
|
|
73
|
-
# @param event [Dry::Events::Event] event that happened with the details
|
74
|
-
def on_buffer_flushed_async_error(event)
|
75
|
-
messages = event[:messages]
|
76
|
-
error = event[:error]
|
77
|
-
|
78
|
-
error(event, "Async flushing of #{messages.size} failed due to: #{error}")
|
79
|
-
debug(event, messages)
|
80
|
-
end
|
81
|
-
|
82
73
|
# @param event [Dry::Events::Event] event that happened with the details
|
83
74
|
def on_buffer_flushed_sync(event)
|
84
75
|
messages = event[:messages]
|
@@ -87,19 +78,19 @@ module WaterDrop
|
|
87
78
|
debug(event, messages)
|
88
79
|
end
|
89
80
|
|
90
|
-
# @param event [Dry::Events::Event] event that happened with the details
|
91
|
-
def on_buffer_flushed_sync_error(event)
|
92
|
-
messages = event[:dispatched]
|
93
|
-
error = event[:error]
|
94
|
-
|
95
|
-
error(event, "Sync flushing of #{messages.size} failed due to: #{error}")
|
96
|
-
debug(event, messages)
|
97
|
-
end
|
98
|
-
|
99
81
|
# @param event [Dry::Events::Event] event that happened with the details
|
100
82
|
def on_producer_closed(event)
|
101
83
|
info event, 'Closing producer'
|
102
|
-
debug event,
|
84
|
+
debug event, ''
|
85
|
+
end
|
86
|
+
|
87
|
+
# @param event [Dry::Events::Event] event that happened with the error details
|
88
|
+
def on_error_occurred(event)
|
89
|
+
error = event[:error]
|
90
|
+
type = event[:type]
|
91
|
+
|
92
|
+
error(event, "Error occurred: #{error} - #{type}")
|
93
|
+
debug(event, '')
|
103
94
|
end
|
104
95
|
|
105
96
|
private
|
@@ -107,19 +98,19 @@ module WaterDrop
|
|
107
98
|
# @param event [Dry::Events::Event] event that happened with the details
|
108
99
|
# @param log_message [String] message we want to publish
|
109
100
|
def debug(event, log_message)
|
110
|
-
@logger.debug("[#{event[:
|
101
|
+
@logger.debug("[#{event[:producer_id]}] #{log_message}")
|
111
102
|
end
|
112
103
|
|
113
104
|
# @param event [Dry::Events::Event] event that happened with the details
|
114
105
|
# @param log_message [String] message we want to publish
|
115
106
|
def info(event, log_message)
|
116
|
-
@logger.info("[#{event[:
|
107
|
+
@logger.info("[#{event[:producer_id]}] #{log_message} took #{event[:time]} ms")
|
117
108
|
end
|
118
109
|
|
119
110
|
# @param event [Dry::Events::Event] event that happened with the details
|
120
111
|
# @param log_message [String] message we want to publish
|
121
112
|
def error(event, log_message)
|
122
|
-
@logger.error("[#{event[:
|
113
|
+
@logger.error("[#{event[:producer_id]}] #{log_message}")
|
123
114
|
end
|
124
115
|
end
|
125
116
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
|
4
|
+
module Instrumentation
|
5
|
+
# WaterDrop instrumentation monitor that we use to publish events
|
6
|
+
# By default uses our internal notifications bus but can be used with
|
7
|
+
# `ActiveSupport::Notifications` as well
|
8
|
+
class Monitor < ::Karafka::Core::Monitoring::Monitor
|
9
|
+
# @param notifications_bus [Object] either our internal notifications bus or
|
10
|
+
# `ActiveSupport::Notifications`
|
11
|
+
# @param namespace [String, nil] namespace for events or nil if no namespace
|
12
|
+
def initialize(
|
13
|
+
notifications_bus = WaterDrop::Instrumentation::Notifications.new,
|
14
|
+
namespace = nil
|
15
|
+
)
|
16
|
+
super(notifications_bus, namespace)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
data/lib/{water_drop/instrumentation/monitor.rb → waterdrop/instrumentation/notifications.rb}
RENAMED
@@ -2,37 +2,36 @@
|
|
2
2
|
|
3
3
|
module WaterDrop
|
4
4
|
module Instrumentation
|
5
|
-
#
|
6
|
-
|
7
|
-
# same time, which means that you might have for example file logging and NewRelic at the same
|
8
|
-
# time
|
9
|
-
# @note This class acts as a singleton because we are only permitted to have single monitor
|
10
|
-
# per running process (just as logger)
|
11
|
-
class Monitor < Dry::Monitor::Notifications
|
5
|
+
# Instrumentation is used to hook up external monitoring services to monitor how WaterDrop works
|
6
|
+
class Notifications < ::Karafka::Core::Monitoring::Notifications
|
12
7
|
# List of events that we support in the system and to which a monitor client can hook up
|
13
8
|
# @note The non-error ones support timestamp benchmarking
|
14
9
|
EVENTS = %w[
|
15
10
|
producer.closed
|
11
|
+
|
16
12
|
message.produced_async
|
17
13
|
message.produced_sync
|
14
|
+
message.acknowledged
|
15
|
+
message.buffered
|
16
|
+
|
18
17
|
messages.produced_async
|
19
18
|
messages.produced_sync
|
20
|
-
message.buffered
|
21
19
|
messages.buffered
|
22
|
-
|
20
|
+
|
23
21
|
buffer.flushed_async
|
24
|
-
buffer.flushed_async.error
|
25
22
|
buffer.flushed_sync
|
26
|
-
|
23
|
+
|
27
24
|
statistics.emitted
|
25
|
+
|
26
|
+
error.occurred
|
28
27
|
].freeze
|
29
28
|
|
30
29
|
private_constant :EVENTS
|
31
30
|
|
32
31
|
# @return [WaterDrop::Instrumentation::Notifications] notifications bus instance for system instrumentation
|
33
32
|
def initialize
|
34
|
-
super
|
35
|
-
EVENTS.each(
|
33
|
+
super
|
34
|
+
EVENTS.each { |event| register_event(event) }
|
36
35
|
end
|
37
36
|
end
|
38
37
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
{"title":"WaterDrop producer example dashboard","description":"This dashboard include example setup for monitoring activity of your WaterDrop producer","widgets":[{"id":243951318,"definition":{"title":"Messages produced","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"produced sync","formula":"query1"},{"alias":"produced async","formula":"query2"},{"alias":"flushed sync","formula":"query3"},{"alias":"flushed async","formula":"query4"},{"alias":"acknowledged","formula":"query5"}],"response_format":"timeseries","queries":[{"query":"sum:waterdrop.produced_sync{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:waterdrop.produced_async{*}.as_count()","data_source":"metrics","name":"query2"},{"query":"sum:waterdrop.flushed_sync{*}.as_count()","data_source":"metrics","name":"query3"},{"query":"sum:waterdrop.flushed_async{*}.as_count()","data_source":"metrics","name":"query4"},{"query":"sum:waterdrop.acknowledged{*}.as_count()","data_source":"metrics","name":"query5"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":1979626566852990,"definition":{"title":"Messages buffer size","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"max","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.buffer.size.max{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]}},{"id":243951221,"definition":{"title":"Kafka broker API 
calls","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"API calls","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:waterdrop.calls{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":243951952,"definition":{"title":"Producer queue size","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Queue size average","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"max:waterdrop.queue.size.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Queue size max","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"max:waterdrop.queue.size.max{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":243951263,"definition":{"title":"Producer queue latency","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Average latency","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.queue.latency.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Latency 
p95","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.queue.latency.p95{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Latency p99","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.queue.latency.p99{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":243951276,"definition":{"title":"Producer network latency","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Average latency","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.request_size.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Latency p95","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.network.latency.p95{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Latency p99","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.network.latency.p99{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":243954928,"definition":{"title":"Producer 
errors","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:waterdrop.error_occurred{*}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}}],"template_variables":[],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"auto","id":"rnr-kgh-dna"}
|
@@ -0,0 +1,210 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
|
4
|
+
module Instrumentation
|
5
|
+
# Namespace for vendor specific instrumentation
|
6
|
+
module Vendors
|
7
|
+
# Datadog specific instrumentation
|
8
|
+
module Datadog
|
9
|
+
# Listener that can be used to subscribe to WaterDrop producer to receive stats via StatsD
|
10
|
+
# and/or Datadog
|
11
|
+
#
|
12
|
+
# @note You need to setup the `dogstatsd-ruby` client and assign it
|
13
|
+
class Listener
|
14
|
+
include ::Karafka::Core::Configurable
|
15
|
+
extend Forwardable
|
16
|
+
|
17
|
+
def_delegators :config, :client, :rd_kafka_metrics, :namespace, :default_tags
|
18
|
+
|
19
|
+
# Value object for storing a single rdkafka metric publishing details
|
20
|
+
RdKafkaMetric = Struct.new(:type, :scope, :name, :key_location)
|
21
|
+
|
22
|
+
# Namespace under which the DD metrics should be published
|
23
|
+
setting :namespace, default: 'waterdrop'
|
24
|
+
|
25
|
+
# Datadog client that we should use to publish the metrics
|
26
|
+
setting :client
|
27
|
+
|
28
|
+
# Default tags we want to publish (for example hostname)
|
29
|
+
# Format as follows (example for hostname): `["host:#{Socket.gethostname}"]`
|
30
|
+
setting :default_tags, default: []
|
31
|
+
|
32
|
+
# All the rdkafka metrics we want to publish
|
33
|
+
#
|
34
|
+
# By default we publish quite a lot so this can be tuned
|
35
|
+
# Note that the ones with `_d` come from WaterDrop, not rdkafka or Kafka
|
36
|
+
setting :rd_kafka_metrics, default: [
|
37
|
+
# Client metrics
|
38
|
+
RdKafkaMetric.new(:count, :root, 'calls', 'tx_d'),
|
39
|
+
RdKafkaMetric.new(:histogram, :root, 'queue.size', 'msg_cnt_d'),
|
40
|
+
|
41
|
+
# Broker metrics
|
42
|
+
RdKafkaMetric.new(:count, :brokers, 'deliver.attempts', 'txretries_d'),
|
43
|
+
RdKafkaMetric.new(:count, :brokers, 'deliver.errors', 'txerrs_d'),
|
44
|
+
RdKafkaMetric.new(:count, :brokers, 'receive.errors', 'rxerrs_d'),
|
45
|
+
RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.avg', %w[outbuf_latency avg]),
|
46
|
+
RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.p95', %w[outbuf_latency p95]),
|
47
|
+
RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.p99', %w[outbuf_latency p99]),
|
48
|
+
RdKafkaMetric.new(:gauge, :brokers, 'network.latency.avg', %w[rtt avg]),
|
49
|
+
RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p95', %w[rtt p95]),
|
50
|
+
RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p99', %w[rtt p99])
|
51
|
+
].freeze
|
52
|
+
|
53
|
+
configure
|
54
|
+
|
55
|
+
# @param block [Proc] configuration block
|
56
|
+
def initialize(&block)
|
57
|
+
configure
|
58
|
+
setup(&block) if block
|
59
|
+
end
|
60
|
+
|
61
|
+
# @param block [Proc] configuration block
|
62
|
+
# @note We define this alias to be consistent with `WaterDrop#setup`
|
63
|
+
def setup(&block)
|
64
|
+
configure(&block)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Hooks up to WaterDrop instrumentation for emitted statistics
|
68
|
+
#
|
69
|
+
# @param event [WaterDrop::Monitor::Event]
|
70
|
+
def on_statistics_emitted(event)
|
71
|
+
statistics = event[:statistics]
|
72
|
+
|
73
|
+
rd_kafka_metrics.each do |metric|
|
74
|
+
report_metric(metric, statistics)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
# Increases the errors count by 1
|
79
|
+
#
|
80
|
+
# @param _event [WaterDrop::Monitor::Event]
|
81
|
+
def on_error_occurred(_event)
|
82
|
+
count('error_occurred', 1, tags: default_tags)
|
83
|
+
end
|
84
|
+
|
85
|
+
# Increases acknowledged messages counter
|
86
|
+
# @param _event [WaterDrop::Monitor::Event]
|
87
|
+
def on_message_acknowledged(_event)
|
88
|
+
increment('acknowledged', tags: default_tags)
|
89
|
+
end
|
90
|
+
|
91
|
+
%i[
|
92
|
+
produced_sync
|
93
|
+
produced_async
|
94
|
+
].each do |event_scope|
|
95
|
+
class_eval <<~METHODS, __FILE__, __LINE__ + 1
|
96
|
+
# @param event [WaterDrop::Monitor::Event]
|
97
|
+
def on_message_#{event_scope}(event)
|
98
|
+
report_message(event[:message][:topic], :#{event_scope})
|
99
|
+
end
|
100
|
+
|
101
|
+
# @param event [WaterDrop::Monitor::Event]
|
102
|
+
def on_messages_#{event_scope}(event)
|
103
|
+
event[:messages].each do |message|
|
104
|
+
report_message(message[:topic], :#{event_scope})
|
105
|
+
end
|
106
|
+
end
|
107
|
+
METHODS
|
108
|
+
end
|
109
|
+
|
110
|
+
# Reports the buffer usage when anything is added to the buffer
|
111
|
+
%i[
|
112
|
+
message_buffered
|
113
|
+
messages_buffered
|
114
|
+
].each do |event_scope|
|
115
|
+
class_eval <<~METHODS, __FILE__, __LINE__ + 1
|
116
|
+
# @param event [WaterDrop::Monitor::Event]
|
117
|
+
def on_#{event_scope}(event)
|
118
|
+
histogram(
|
119
|
+
'buffer.size',
|
120
|
+
event[:buffer].size,
|
121
|
+
tags: default_tags
|
122
|
+
)
|
123
|
+
end
|
124
|
+
METHODS
|
125
|
+
end
|
126
|
+
|
127
|
+
# Events that support many messages only
|
128
|
+
# Reports data flushing operation (production from the buffer)
|
129
|
+
%i[
|
130
|
+
flushed_sync
|
131
|
+
flushed_async
|
132
|
+
].each do |event_scope|
|
133
|
+
class_eval <<~METHODS, __FILE__, __LINE__ + 1
|
134
|
+
# @param event [WaterDrop::Monitor::Event]
|
135
|
+
def on_buffer_#{event_scope}(event)
|
136
|
+
event[:messages].each do |message|
|
137
|
+
report_message(message[:topic], :#{event_scope})
|
138
|
+
end
|
139
|
+
end
|
140
|
+
METHODS
|
141
|
+
end
|
142
|
+
|
143
|
+
private
|
144
|
+
|
145
|
+
%i[
|
146
|
+
count
|
147
|
+
gauge
|
148
|
+
histogram
|
149
|
+
increment
|
150
|
+
decrement
|
151
|
+
].each do |metric_type|
|
152
|
+
class_eval <<~METHODS, __FILE__, __LINE__ + 1
|
153
|
+
def #{metric_type}(key, *args)
|
154
|
+
client.#{metric_type}(
|
155
|
+
namespaced_metric(key),
|
156
|
+
*args
|
157
|
+
)
|
158
|
+
end
|
159
|
+
METHODS
|
160
|
+
end
|
161
|
+
|
162
|
+
# Report that a message has been produced to a topic.
|
163
|
+
# @param topic [String] Kafka topic
|
164
|
+
# @param method_name [Symbol] method from which this message operation comes
|
165
|
+
def report_message(topic, method_name)
|
166
|
+
increment(method_name, tags: default_tags + ["topic:#{topic}"])
|
167
|
+
end
|
168
|
+
|
169
|
+
# Wraps metric name in listener's namespace
|
170
|
+
# @param metric_name [String] RdKafkaMetric name
|
171
|
+
# @return [String]
|
172
|
+
def namespaced_metric(metric_name)
|
173
|
+
"#{namespace}.#{metric_name}"
|
174
|
+
end
|
175
|
+
|
176
|
+
# Reports a given metric statistics to Datadog
|
177
|
+
# @param metric [RdKafkaMetric] metric value object
|
178
|
+
# @param statistics [Hash] hash with all the statistics emitted
|
179
|
+
def report_metric(metric, statistics)
|
180
|
+
case metric.scope
|
181
|
+
when :root
|
182
|
+
public_send(
|
183
|
+
metric.type,
|
184
|
+
metric.name,
|
185
|
+
statistics.fetch(*metric.key_location),
|
186
|
+
tags: default_tags
|
187
|
+
)
|
188
|
+
when :brokers
|
189
|
+
statistics.fetch('brokers').each_value do |broker_statistics|
|
190
|
+
# Skip bootstrap nodes
|
191
|
+
# Bootstrap nodes have nodeid -1, other nodes have positive
|
192
|
+
# node ids
|
193
|
+
next if broker_statistics['nodeid'] == -1
|
194
|
+
|
195
|
+
public_send(
|
196
|
+
metric.type,
|
197
|
+
metric.name,
|
198
|
+
broker_statistics.dig(*metric.key_location),
|
199
|
+
tags: default_tags + ["broker:#{broker_statistics['nodename']}"]
|
200
|
+
)
|
201
|
+
end
|
202
|
+
else
|
203
|
+
raise ArgumentError, metric.scope
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
207
|
+
end
|
208
|
+
end
|
209
|
+
end
|
210
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
|
4
|
+
# Namespace for all the things related with WaterDrop instrumentation process
|
5
|
+
module Instrumentation
|
6
|
+
class << self
|
7
|
+
# Builds a manager for statistics callbacks
|
8
|
+
# @return [WaterDrop::Instrumentation::CallbacksManager]
|
9
|
+
def statistics_callbacks
|
10
|
+
@statistics_callbacks ||= CallbacksManager.new
|
11
|
+
end
|
12
|
+
|
13
|
+
# Builds a manager for error callbacks
|
14
|
+
# @return [WaterDrop::Instrumentation::CallbacksManager]
|
15
|
+
def error_callbacks
|
16
|
+
@error_callbacks ||= CallbacksManager.new
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
|
4
|
+
module Patches
|
5
|
+
module Rdkafka
|
6
|
+
# Extends `Rdkafka::Bindings` with some extra methods and updates callbacks that we intend
|
7
|
+
# to work with in a bit different way than rdkafka itself
|
8
|
+
module Bindings
|
9
|
+
class << self
|
10
|
+
# Add extra methods that we need
|
11
|
+
# @param mod [::Rdkafka::Bindings] rdkafka bindings module
|
12
|
+
def included(mod)
|
13
|
+
mod.attach_function :rd_kafka_name, [:pointer], :string
|
14
|
+
|
15
|
+
# Default rdkafka setup for errors does not propagate client details, thus it always
|
16
|
+
# publishes all the stuff for all rdkafka instances. We change that by providing
|
17
|
+
# function that fetches the instance name, allowing us to have better notifications
|
18
|
+
mod.send(:remove_const, :ErrorCallback)
|
19
|
+
mod.const_set(:ErrorCallback, build_error_callback)
|
20
|
+
end
|
21
|
+
|
22
|
+
# @return [FFI::Function] overwritten callback function
|
23
|
+
def build_error_callback
|
24
|
+
FFI::Function.new(
|
25
|
+
:void, %i[pointer int string pointer]
|
26
|
+
) do |client_prr, err_code, reason, _opaque|
|
27
|
+
return nil unless ::Rdkafka::Config.error_callback
|
28
|
+
|
29
|
+
name = ::Rdkafka::Bindings.rd_kafka_name(client_prr)
|
30
|
+
|
31
|
+
error = ::Rdkafka::RdkafkaError.new(err_code, broker_message: reason)
|
32
|
+
|
33
|
+
::Rdkafka::Config.error_callback.call(name, error)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
::Rdkafka::Bindings.include(::WaterDrop::Patches::Rdkafka::Bindings)
|