waterdrop 2.0.0 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.github/workflows/ci.yml +33 -6
- data/.ruby-version +1 -1
- data/CHANGELOG.md +80 -0
- data/Gemfile +0 -2
- data/Gemfile.lock +36 -87
- data/MIT-LICENSE +18 -0
- data/README.md +180 -46
- data/certs/mensfeld.pem +21 -21
- data/config/errors.yml +29 -5
- data/docker-compose.yml +2 -1
- data/lib/{water_drop → waterdrop}/config.rb +47 -19
- data/lib/waterdrop/contracts/config.rb +40 -0
- data/lib/waterdrop/contracts/message.rb +60 -0
- data/lib/waterdrop/instrumentation/callbacks/delivery.rb +30 -0
- data/lib/waterdrop/instrumentation/callbacks/error.rb +36 -0
- data/lib/waterdrop/instrumentation/callbacks/statistics.rb +41 -0
- data/lib/waterdrop/instrumentation/callbacks/statistics_decorator.rb +77 -0
- data/lib/waterdrop/instrumentation/callbacks_manager.rb +39 -0
- data/lib/{water_drop/instrumentation/stdout_listener.rb → waterdrop/instrumentation/logger_listener.rb} +17 -26
- data/lib/waterdrop/instrumentation/monitor.rb +20 -0
- data/lib/{water_drop/instrumentation/monitor.rb → waterdrop/instrumentation/notifications.rb} +12 -13
- data/lib/waterdrop/instrumentation/vendors/datadog/dashboard.json +1 -0
- data/lib/waterdrop/instrumentation/vendors/datadog/listener.rb +210 -0
- data/lib/waterdrop/instrumentation.rb +20 -0
- data/lib/waterdrop/patches/rdkafka/bindings.rb +42 -0
- data/lib/waterdrop/patches/rdkafka/producer.rb +28 -0
- data/lib/{water_drop → waterdrop}/producer/async.rb +2 -2
- data/lib/{water_drop → waterdrop}/producer/buffer.rb +15 -8
- data/lib/waterdrop/producer/builder.rb +28 -0
- data/lib/{water_drop → waterdrop}/producer/sync.rb +2 -2
- data/lib/{water_drop → waterdrop}/producer.rb +29 -15
- data/lib/{water_drop → waterdrop}/version.rb +1 -1
- data/lib/waterdrop.rb +33 -2
- data/waterdrop.gemspec +12 -10
- data.tar.gz.sig +0 -0
- metadata +64 -97
- metadata.gz.sig +0 -0
- data/.github/FUNDING.yml +0 -1
- data/LICENSE +0 -165
- data/lib/water_drop/contracts/config.rb +0 -26
- data/lib/water_drop/contracts/message.rb +0 -41
- data/lib/water_drop/instrumentation.rb +0 -7
- data/lib/water_drop/producer/builder.rb +0 -63
- data/lib/water_drop/producer/statistics_decorator.rb +0 -71
- data/lib/water_drop.rb +0 -30
- /data/lib/{water_drop → waterdrop}/contracts.rb +0 -0
- /data/lib/{water_drop → waterdrop}/errors.rb +0 -0
- /data/lib/{water_drop → waterdrop}/producer/dummy_client.rb +0 -0
- /data/lib/{water_drop → waterdrop}/producer/status.rb +0 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
  module Instrumentation
    module Callbacks
      # Handler for errors reported by the kafka client library in a background thread.
      # An instance is bound to a single producer and republishes, via the monitor, only the
      # errors that were reported for that producer's client.
      class Error
        # @param producer_id [String] id of the current producer
        # @param client_name [String] rdkafka client name
        # @param monitor [WaterDrop::Instrumentation::Monitor] monitor we are using
        def initialize(producer_id, client_name, monitor)
          @monitor = monitor
          @client_name = client_name
          @producer_id = producer_id
        end

        # Publishes the error as an `error.occurred` event
        #
        # @param client_name [String] rdkafka client name for which the error was reported
        # @param error [Rdkafka::Error] error that occurred
        # @return [Object, nil] result of the monitor instrumentation or nil when the error
        #   was reported for a different client
        # @note Errors reported for other clients are ignored, so every producer instruments
        #   only its own errors (same approach as with the statistics callback)
        def call(client_name, error)
          if @client_name == client_name
            @monitor.instrument(
              'error.occurred',
              error: error, producer_id: @producer_id, type: 'librdkafka.error'
            )
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
  module Instrumentation
    # Namespace for handlers of callbacks emitted by the kafka client lib
    module Callbacks
      # Handler of the statistics callback.
      #
      # @note Some of the rdkafka metrics are absolute: for example the number of sent messages
      #   grows from the beginning of the process and not from the previous statistics emit.
      #   That is why the statistics are passed through our decorator, which adds the diff of
      #   all the numeric values against the data from the previous emit.
      class Statistics
        # @param producer_id [String] id of the current producer
        # @param client_name [String] rdkafka client name
        # @param monitor [WaterDrop::Instrumentation::Monitor] monitor we are using
        def initialize(producer_id, client_name, monitor)
          @statistics_decorator = StatisticsDecorator.new
          @monitor = monitor
          @client_name = client_name
          @producer_id = producer_id
        end

        # Publishes the decorated statistics as a `statistics.emitted` event
        #
        # @param statistics [Hash] rdkafka statistics
        # @return [Object, nil] result of the monitor instrumentation or nil when the
        #   statistics belong to a different client
        def call(statistics)
          # rdkafka does not have a per-instance statistics hook, so this handler receives the
          # statistics of every client. Only the ones whose name matches our client are
          # published, otherwise each producer would emit the statistics of all of them.
          if @client_name == statistics['name']
            @monitor.instrument(
              'statistics.emitted',
              producer_id: @producer_id,
              statistics: @statistics_decorator.call(statistics)
            )
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
  module Instrumentation
    module Callbacks
      # Many of the librdkafka statistics are absolute values instead of a gauge.
      # This means that, for example, the number of messages sent is an absolute growing value
      # instead of being the number of messages sent since the last statistics report.
      # This decorator calculates the diff against the previously emitted stats, so we also get
      # the diff together with the original values.
      #
      # For every numeric value stored under `key`, the delta is stored next to it under
      # `"#{key}_d"`.
      class StatisticsDecorator
        def initialize
          @previous = {}.freeze
        end

        # @param emited_stats [Hash] original emitted statistics
        # @return [Hash] the same (now frozen) hash, extended with the diff data
        # @note We modify the emitted statistics instead of creating a new hash. Since we don't
        #   expose any API to get the raw data, users can just assume that the result of this
        #   decoration is the proper raw stats that they can use
        def call(emited_stats)
          diff(@previous, emited_stats)

          # Becomes the reference point for the next emit
          @previous = emited_stats

          emited_stats.freeze
        end

        private

        # Calculates the diff of the provided values and modifies the emitted statistics in
        # place
        #
        # @param previous [Object] value from the previous emit for the scope in which we are
        # @param current [Object] current scope from the emitted statistics
        # @return [Object] the diff if the values were numerics or the current scope
        def diff(previous, current)
          if current.is_a?(Hash)
            # There may be no matching previous scope: nothing was emitted before or the type
            # of the value changed in between the emits. Indexing into a non-hash previous
            # value would either raise or return unrelated data, so we fall back to an empty
            # scope, which makes all the nested deltas zero.
            previous_scope = previous.is_a?(Hash) ? previous : {}

            # @note We cannot use #each_key as we modify the content of the current scope
            #   in place (in case it's a hash)
            current.keys.each do |key|
              append(current, key, diff(previous_scope[key], current[key]))
            end
          end

          # Diff can be computed only for numerics
          return current unless current.is_a?(Numeric)
          # If there was no previous value, delta is always zero
          return 0 unless previous
          # Should never happen but just in case, a type changed in between stats
          return current unless previous.is_a?(Numeric)

          current - previous
        end

        # Appends the result of the diff under the `<key>_d` key as long as the result is
        # numeric
        #
        # @param current [Hash] current scope
        # @param key [String] key based on which we were diffing
        # @param result [Object] diff result
        def append(current, key, result)
          return unless result.is_a?(Numeric)
          # A frozen scope cannot be extended, so it is left untouched
          return if current.frozen?

          current["#{key}_d"] = result
        end
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
  module Instrumentation
    # Allows registering many callbacks under a hook that is supposed to support only a
    # single callback. The manager itself is the callable that gets attached to the hook.
    class CallbacksManager
      # @return [::WaterDrop::Instrumentation::CallbacksManager]
      def initialize
        @callbacks = Concurrent::Hash.new
      end

      # Invokes all the registered callbacks one after another
      #
      # @param args [Object] any args that should go to the callbacks
      # @note `#each_value` is not used here on purpose: with it we could not dispatch the
      #   callbacks and register new ones at the same time. Since we don't know when and from
      #   which thread things are going to be added to the manager, the values are first
      #   extracted into an array and that array is iterated.
      def call(*args)
        registered = @callbacks.values

        registered.each do |callable|
          callable.call(*args)
        end
      end

      # Registers a callback in the manager
      #
      # @param id [String] id of the callback (used when deleting it)
      # @param callable [#call] object that responds to a `#call` method
      def add(id, callable)
        @callbacks[id] = callable
      end

      # Unregisters the callback from the manager
      #
      # @param id [String] id of the callback we want to remove
      def delete(id)
        @callbacks.delete(id)
      end
    end
  end
end
|
@@ -6,8 +6,8 @@ module WaterDrop
|
|
6
6
|
# It can be removed/replaced or anything without any harm to the Waterdrop flow
|
7
7
|
# @note It is a module as we can use it then as a part of the Karafka framework listener
|
8
8
|
# as well as we can use it standalone
|
9
|
-
class
|
10
|
-
# @param logger [Object]
|
9
|
+
class LoggerListener
|
10
|
+
# @param logger [Object] logger we want to use
|
11
11
|
def initialize(logger)
|
12
12
|
@logger = logger
|
13
13
|
end
|
@@ -51,7 +51,7 @@ module WaterDrop
|
|
51
51
|
message = event[:message]
|
52
52
|
|
53
53
|
info(event, "Buffering of a message to '#{message[:topic]}' topic")
|
54
|
-
debug(event, [message
|
54
|
+
debug(event, [message])
|
55
55
|
end
|
56
56
|
|
57
57
|
# @param event [Dry::Events::Event] event that happened with the details
|
@@ -59,7 +59,7 @@ module WaterDrop
|
|
59
59
|
messages = event[:messages]
|
60
60
|
|
61
61
|
info(event, "Buffering of #{messages.size} messages")
|
62
|
-
debug(event, [messages,
|
62
|
+
debug(event, [messages, messages.size])
|
63
63
|
end
|
64
64
|
|
65
65
|
# @param event [Dry::Events::Event] event that happened with the details
|
@@ -70,15 +70,6 @@ module WaterDrop
|
|
70
70
|
debug(event, messages)
|
71
71
|
end
|
72
72
|
|
73
|
-
# @param event [Dry::Events::Event] event that happened with the details
|
74
|
-
def on_buffer_flushed_async_error(event)
|
75
|
-
messages = event[:messages]
|
76
|
-
error = event[:error]
|
77
|
-
|
78
|
-
error(event, "Async flushing of #{messages.size} failed due to: #{error}")
|
79
|
-
debug(event, messages)
|
80
|
-
end
|
81
|
-
|
82
73
|
# @param event [Dry::Events::Event] event that happened with the details
|
83
74
|
def on_buffer_flushed_sync(event)
|
84
75
|
messages = event[:messages]
|
@@ -87,19 +78,19 @@ module WaterDrop
|
|
87
78
|
debug(event, messages)
|
88
79
|
end
|
89
80
|
|
90
|
-
# @param event [Dry::Events::Event] event that happened with the details
|
91
|
-
def on_buffer_flushed_sync_error(event)
|
92
|
-
messages = event[:dispatched]
|
93
|
-
error = event[:error]
|
94
|
-
|
95
|
-
error(event, "Sync flushing of #{messages.size} failed due to: #{error}")
|
96
|
-
debug(event, messages)
|
97
|
-
end
|
98
|
-
|
99
81
|
# @param event [Dry::Events::Event] event that happened with the details
|
100
82
|
def on_producer_closed(event)
|
101
83
|
info event, 'Closing producer'
|
102
|
-
debug event,
|
84
|
+
debug event, ''
|
85
|
+
end
|
86
|
+
|
87
|
+
# @param event [Dry::Events::Event] event that happened with the error details
|
88
|
+
def on_error_occurred(event)
|
89
|
+
error = event[:error]
|
90
|
+
type = event[:type]
|
91
|
+
|
92
|
+
error(event, "Error occurred: #{error} - #{type}")
|
93
|
+
debug(event, '')
|
103
94
|
end
|
104
95
|
|
105
96
|
private
|
@@ -107,19 +98,19 @@ module WaterDrop
|
|
107
98
|
# @param event [Dry::Events::Event] event that happened with the details
|
108
99
|
# @param log_message [String] message we want to publish
|
109
100
|
def debug(event, log_message)
|
110
|
-
@logger.debug("[#{event[:
|
101
|
+
@logger.debug("[#{event[:producer_id]}] #{log_message}")
|
111
102
|
end
|
112
103
|
|
113
104
|
# @param event [Dry::Events::Event] event that happened with the details
|
114
105
|
# @param log_message [String] message we want to publish
|
115
106
|
def info(event, log_message)
|
116
|
-
@logger.info("[#{event[:
|
107
|
+
@logger.info("[#{event[:producer_id]}] #{log_message} took #{event[:time]} ms")
|
117
108
|
end
|
118
109
|
|
119
110
|
# @param event [Dry::Events::Event] event that happened with the details
|
120
111
|
# @param log_message [String] message we want to publish
|
121
112
|
def error(event, log_message)
|
122
|
-
@logger.error("[#{event[:
|
113
|
+
@logger.error("[#{event[:producer_id]}] #{log_message}")
|
123
114
|
end
|
124
115
|
end
|
125
116
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
  module Instrumentation
    # WaterDrop instrumentation monitor used to publish events.
    # Unless told otherwise it publishes to our internal notifications bus, but it can be used
    # with `ActiveSupport::Notifications` as well.
    class Monitor < ::Karafka::Core::Monitoring::Monitor
      # @param notifications_bus [Object] either our internal notifications bus or
      #   `ActiveSupport::Notifications`
      # @param namespace [String, nil] namespace for events or nil if no namespace
      def initialize(
        notifications_bus = WaterDrop::Instrumentation::Notifications.new,
        namespace = nil
      )
        # Argument-less `super` forwards both arguments (including the defaults above)
        super
      end
    end
  end
end
|
data/lib/{water_drop/instrumentation/monitor.rb → waterdrop/instrumentation/notifications.rb}
RENAMED
@@ -2,37 +2,36 @@
|
|
2
2
|
|
3
3
|
module WaterDrop
|
4
4
|
module Instrumentation
|
5
|
-
#
|
6
|
-
|
7
|
-
# same time, which means that you might have for example file logging and NewRelic at the same
|
8
|
-
# time
|
9
|
-
# @note This class acts as a singleton because we are only permitted to have single monitor
|
10
|
-
# per running process (just as logger)
|
11
|
-
class Monitor < Dry::Monitor::Notifications
|
5
|
+
# Instrumentation is used to hook up external monitoring services to monitor how WaterDrop works
|
6
|
+
class Notifications < ::Karafka::Core::Monitoring::Notifications
|
12
7
|
# List of events that we support in the system and to which a monitor client can hook up
|
13
8
|
# @note The non-error ones support timestamp benchmarking
|
14
9
|
EVENTS = %w[
|
15
10
|
producer.closed
|
11
|
+
|
16
12
|
message.produced_async
|
17
13
|
message.produced_sync
|
14
|
+
message.acknowledged
|
15
|
+
message.buffered
|
16
|
+
|
18
17
|
messages.produced_async
|
19
18
|
messages.produced_sync
|
20
|
-
message.buffered
|
21
19
|
messages.buffered
|
22
|
-
|
20
|
+
|
23
21
|
buffer.flushed_async
|
24
|
-
buffer.flushed_async.error
|
25
22
|
buffer.flushed_sync
|
26
|
-
|
23
|
+
|
27
24
|
statistics.emitted
|
25
|
+
|
26
|
+
error.occurred
|
28
27
|
].freeze
|
29
28
|
|
30
29
|
private_constant :EVENTS
|
31
30
|
|
32
31
|
# @return [WaterDrop::Instrumentation::Notifications] notifications bus for system instrumentation
|
33
32
|
def initialize
|
34
|
-
super
|
35
|
-
EVENTS.each(
|
33
|
+
super
|
34
|
+
EVENTS.each { |event| register_event(event) }
|
36
35
|
end
|
37
36
|
end
|
38
37
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
{"title":"WaterDrop producer example dashboard","description":"This dashboard include example setup for monitoring activity of your WaterDrop producer","widgets":[{"id":243951318,"definition":{"title":"Messages produced","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"produced sync","formula":"query1"},{"alias":"produced async","formula":"query2"},{"alias":"flushed sync","formula":"query3"},{"alias":"flushed async","formula":"query4"},{"alias":"acknowledged","formula":"query5"}],"response_format":"timeseries","queries":[{"query":"sum:waterdrop.produced_sync{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:waterdrop.produced_async{*}.as_count()","data_source":"metrics","name":"query2"},{"query":"sum:waterdrop.flushed_sync{*}.as_count()","data_source":"metrics","name":"query3"},{"query":"sum:waterdrop.flushed_async{*}.as_count()","data_source":"metrics","name":"query4"},{"query":"sum:waterdrop.acknowledged{*}.as_count()","data_source":"metrics","name":"query5"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":1979626566852990,"definition":{"title":"Messages buffer size","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"max","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.buffer.size.max{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]}},{"id":243951221,"definition":{"title":"Kafka broker API 
calls","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"API calls","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:waterdrop.calls{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":243951952,"definition":{"title":"Producer queue size","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Queue size average","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"max:waterdrop.queue.size.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Queue size max","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"max:waterdrop.queue.size.max{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":243951263,"definition":{"title":"Producer queue latency","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Average latency","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.queue.latency.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Latency 
p95","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.queue.latency.p95{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Latency p99","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.queue.latency.p99{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":243951276,"definition":{"title":"Producer network latency","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Average latency","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.network.latency.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Latency p95","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.network.latency.p95{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"},{"formulas":[{"alias":"Latency p99","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"avg:waterdrop.network.latency.p99{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}},{"id":243954928,"definition":{"title":"Producer 
errors","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:waterdrop.error_occurred{*}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"include_zero":true,"scale":"linear","label":"","min":"auto","max":"auto"}}}],"template_variables":[],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"auto","id":"rnr-kgh-dna"}
|
@@ -0,0 +1,210 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
  module Instrumentation
    # Namespace for vendor specific instrumentation
    module Vendors
      # Datadog specific instrumentation
      module Datadog
        # Listener that can be subscribed to a WaterDrop producer in order to publish its
        # events and statistics via StatsD and/or Datadog
        #
        # @note The `client` setting has no default: you need to setup the `dogstatsd-ruby`
        #   client and assign it before any event is handled
        class Listener
          include ::Karafka::Core::Configurable
          extend Forwardable

          def_delegators :config, :client, :rd_kafka_metrics, :namespace, :default_tags

          # Value object describing how a single rdkafka metric is published: the client
          # method used to publish it (`type`), where to look for it (`scope`), the name of
          # the metric and the key (or path of keys) under which the value is stored
          RdKafkaMetric = Struct.new(:type, :scope, :name, :key_location)

          # Namespace under which the DD metrics should be published
          setting :namespace, default: 'waterdrop'

          # Datadog client that we should use to publish the metrics
          setting :client

          # Default tags we want to publish (for example hostname)
          # Format as follows (example for hostname): `["host:#{Socket.gethostname}"]`
          setting :default_tags, default: []

          # All the rdkafka metrics we want to publish
          #
          # By default we publish quite a lot so this can be tuned
          # Note that the ones with `_d` come from WaterDrop, not rdkafka or Kafka
          setting :rd_kafka_metrics, default: [
            # Client metrics
            RdKafkaMetric.new(:count, :root, 'calls', 'tx_d'),
            RdKafkaMetric.new(:histogram, :root, 'queue.size', 'msg_cnt_d'),

            # Broker metrics
            RdKafkaMetric.new(:count, :brokers, 'deliver.attempts', 'txretries_d'),
            RdKafkaMetric.new(:count, :brokers, 'deliver.errors', 'txerrs_d'),
            RdKafkaMetric.new(:count, :brokers, 'receive.errors', 'rxerrs_d'),
            RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.avg', %w[outbuf_latency avg]),
            RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.p95', %w[outbuf_latency p95]),
            RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.p99', %w[outbuf_latency p99]),
            RdKafkaMetric.new(:gauge, :brokers, 'network.latency.avg', %w[rtt avg]),
            RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p95', %w[rtt p95]),
            RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p99', %w[rtt p99])
          ].freeze

          configure

          # @param block [Proc] configuration block
          def initialize(&block)
            configure
            return unless block

            setup(&block)
          end

          # @param block [Proc] configuration block
          # @note We define this alias to be consistent with `WaterDrop#setup`
          def setup(&block)
            configure(&block)
          end

          # Hooks up to WaterDrop instrumentation for emitted statistics and reports every
          # configured rdkafka metric
          #
          # @param event [WaterDrop::Monitor::Event]
          def on_statistics_emitted(event)
            emitted = event[:statistics]

            rd_kafka_metrics.each { |metric| report_metric(metric, emitted) }
          end

          # Increases the errors count by 1
          #
          # @param _event [WaterDrop::Monitor::Event]
          def on_error_occurred(_event)
            count('error_occurred', 1, tags: default_tags)
          end

          # Increases acknowledged messages counter
          #
          # @param _event [WaterDrop::Monitor::Event]
          def on_message_acknowledged(_event)
            increment('acknowledged', tags: default_tags)
          end

          # Generates `on_message_*` and `on_messages_*` handlers that report each produced
          # message for both the sync and the async production
          %i[produced_sync produced_async].each do |event_scope|
            class_eval <<~METHODS, __FILE__, __LINE__ + 1
              # @param event [WaterDrop::Monitor::Event]
              def on_message_#{event_scope}(event)
                report_message(event[:message][:topic], :#{event_scope})
              end

              # @param event [WaterDrop::Monitor::Event]
              def on_messages_#{event_scope}(event)
                event[:messages].each do |message|
                  report_message(message[:topic], :#{event_scope})
                end
              end
            METHODS
          end

          # Reports the buffer usage when anything is added to the buffer
          %i[message_buffered messages_buffered].each do |event_scope|
            class_eval <<~METHODS, __FILE__, __LINE__ + 1
              # @param event [WaterDrop::Monitor::Event]
              def on_#{event_scope}(event)
                histogram(
                  'buffer.size',
                  event[:buffer].size,
                  tags: default_tags
                )
              end
            METHODS
          end

          # Events that support many messages only
          # Reports data flushing operation (production from the buffer)
          %i[flushed_sync flushed_async].each do |event_scope|
            class_eval <<~METHODS, __FILE__, __LINE__ + 1
              # @param event [WaterDrop::Monitor::Event]
              def on_buffer_#{event_scope}(event)
                event[:messages].each do |message|
                  report_message(message[:topic], :#{event_scope})
                end
              end
            METHODS
          end

          private

          # Generates one publishing method per client metric type. Each of them prefixes the
          # key with the namespace and passes the rest of the arguments to the client as they
          # are.
          %i[count gauge histogram increment decrement].each do |metric_type|
            class_eval <<~METHODS, __FILE__, __LINE__ + 1
              def #{metric_type}(key, *args)
                client.#{metric_type}(
                  namespaced_metric(key),
                  *args
                )
              end
            METHODS
          end

          # Reports that a message has been produced to a topic
          #
          # @param topic [String] Kafka topic
          # @param method_name [Symbol] method from which this message operation comes
          def report_message(topic, method_name)
            topic_tags = default_tags + ["topic:#{topic}"]

            increment(method_name, tags: topic_tags)
          end

          # Wraps metric name in listener's namespace
          #
          # @param metric_name [String] RdKafkaMetric name
          # @return [String]
          def namespaced_metric(metric_name)
            "#{namespace}.#{metric_name}"
          end

          # Reports a given metric statistics to Datadog
          #
          # @param metric [RdKafkaMetric] metric value object
          # @param statistics [Hash] hash with all the statistics emitted
          # @raise [ArgumentError] when the scope of the metric is not supported
          def report_metric(metric, statistics)
            case metric.scope
            when :root
              value = statistics.fetch(*metric.key_location)

              public_send(metric.type, metric.name, value, tags: default_tags)
            when :brokers
              statistics.fetch('brokers').each_value do |broker_statistics|
                # Skip bootstrap nodes
                # Bootstrap nodes have nodeid -1, other nodes have positive node ids
                next if broker_statistics['nodeid'] == -1

                broker_tags = default_tags + ["broker:#{broker_statistics['nodename']}"]
                value = broker_statistics.dig(*metric.key_location)

                public_send(metric.type, metric.name, value, tags: broker_tags)
              end
            else
              raise ArgumentError, metric.scope
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
  # Namespace for all the things related with WaterDrop instrumentation process
  module Instrumentation
    # Manager for the statistics callbacks. It is built on the first use and the same
    # instance is returned on every following call.
    #
    # @return [WaterDrop::Instrumentation::CallbacksManager]
    def self.statistics_callbacks
      @statistics_callbacks ||= CallbacksManager.new
    end

    # Manager for the error callbacks. It is built on the first use and the same instance is
    # returned on every following call.
    #
    # @return [WaterDrop::Instrumentation::CallbacksManager]
    def self.error_callbacks
      @error_callbacks ||= CallbacksManager.new
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module WaterDrop
  module Patches
    module Rdkafka
      # Extends `Rdkafka::Bindings` with some extra methods and updates callbacks that we intend
      # to work with in a bit different way than rdkafka itself
      module Bindings
        class << self
          # Adds the extra methods that we need and swaps the error callback
          #
          # @param mod [::Rdkafka::Bindings] rdkafka bindings module
          def included(mod)
            mod.attach_function :rd_kafka_name, [:pointer], :string

            # Default rdkafka setup for errors does not propagate client details, thus it always
            # publishes all the stuff for all rdkafka instances. We change that by providing
            # function that fetches the instance name, allowing us to have better notifications
            mod.send(:remove_const, :ErrorCallback)
            mod.const_set(:ErrorCallback, build_error_callback)
          end

          # Builds the error callback that passes the client name together with the error to
          # the configured `::Rdkafka::Config.error_callback`
          #
          # @return [FFI::Function] overwritten callback function
          def build_error_callback
            FFI::Function.new(
              :void, %i[pointer int string pointer]
            ) do |client_ptr, err_code, reason, _opaque|
              callback = ::Rdkafka::Config.error_callback

              # `next` and not `return`: this block runs long after `build_error_callback`
              # has returned, and a `return` inside of it would raise a `LocalJumpError`
              # instead of just skipping the error when no callback is configured
              next unless callback

              name = ::Rdkafka::Bindings.rd_kafka_name(client_ptr)

              error = ::Rdkafka::RdkafkaError.new(err_code, broker_message: reason)

              callback.call(name, error)
            end
          end
        end
      end
    end
  end
end
|
41
|
+
|
42
|
+
::Rdkafka::Bindings.include(::WaterDrop::Patches::Rdkafka::Bindings)
|