puma-plugin-telemetry 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/docs/examples.md ADDED
@@ -0,0 +1,163 @@
1
+ ## Keeping track of waiting requests
2
+
3
+ There are requests waiting in 2 places:
4
+ - socket
5
+ - queue
6
+
7
+ Their sum is the total number of accepted requests waiting.
8
+
9
+ Puma configuration
10
+
11
+ ```ruby
12
+ plugin :telemetry
13
+
14
+ Puma::Plugin::Telemetry.configure do |config|
15
+ config.enabled = true
16
+ config.initial_delay = 10
17
+
18
+ config.puma_telemetry = %w[queue.backlog]
19
+
20
+ config.socket_telemetry!
21
+
22
+ config.add_target :dogstatsd, client: Datadog::Statsd.new(tags: %w[your tags], namespace: "ruby.puma")
23
+ end
24
+ ```
25
+
26
+ Example Datadog widget and its configuration. Depending on what you prefer to see, you might replace `rollup(max)` with `rollup(sum)` whenever you want to see the maximum value or the sum across the aggregated time frame.
27
+
28
+ | :point_up: | Remember to update tags after initial setup! |
29
+ |---------------|:---------------------------------------------|
30
+
31
+ ![Datadog Widget, barchart showcasing sockets & queue backlog sizes stacked up](example-datadog_backlog_size.png "Datadog Widget")
32
+
33
+ ```json
34
+ {
35
+ "viz": "timeseries",
36
+ "requests": [
37
+ {
38
+ "style": {
39
+ "palette": "dog_classic",
40
+ "type": "solid",
41
+ "width": "normal"
42
+ },
43
+ "type": "bars",
44
+ "formulas": [
45
+ {
46
+ "alias": "queue",
47
+ "formula": "query1"
48
+ },
49
+ {
50
+ "alias": "socket",
51
+ "formula": "query2"
52
+ }
53
+ ],
54
+ "response_format": "timeseries",
55
+ "on_right_yaxis": false,
56
+ "queries": [
57
+ {
58
+ "query": "max:ruby.puma.queue.backlog{}.rollup(max)",
59
+ "data_source": "metrics",
60
+ "name": "query1"
61
+ },
62
+ {
63
+ "query": "max:ruby.puma.sockets.backlog{}.rollup(max)",
64
+ "data_source": "metrics",
65
+ "name": "query2"
66
+ }
67
+ ]
68
+ }
69
+ ],
70
+ "yaxis": {
71
+ "include_zero": true,
72
+ "max": "auto",
73
+ "scale": "linear",
74
+ "min": "auto",
75
+ "label": ""
76
+ },
77
+ "markers": []
78
+ }
79
+ ```
80
+
81
+ ## Keeping track of request queue time
82
+
83
+ The time a request spent waiting to be processed — from when it's accepted by the Load Balancer until it starts going through the Rack Middleware in your application. The holy grail of autoscaling.
84
+
85
+ Example configuration of the middleware, e.g. in the case of Rails it could be placed under `config/initializers/request_queue_time.rb`
86
+
87
+ ```ruby
88
+ Rails.application.config.middleware.insert_after(
89
+ 0,
90
+ RequestQueueTimeMiddleware,
91
+ statsd: Datadog::Statsd.new(namespace: "ruby.puma", tags: %w[your tags])
92
+ )
93
+ ```
94
+
95
+ If you are utilizing tags in your logs, you might also want to add this measurement as follows:
96
+
97
+ ```ruby
98
+ Rails.application.config.log_tags ||= {}
99
+ Rails.application.config.log_tags[:queue_time] = ->(req) { req.env[::RequestQueueTimeMiddleware::ENV_KEY] }
100
+ ```
101
+
102
+ Example Datadog widget with configuration.
103
+
104
+ | :point_up: | Remember to update tags after initial setup! |
105
+ |---------------|:---------------------------------------------|
106
+
107
+ ![Datadog Widget, timeseries showcasing request queue time max, p95 and median](example-datadog_queue_time.png "Datadog Widget")
108
+
109
+ ```json
110
+ {
111
+ "viz": "timeseries",
112
+ "requests": [
113
+ {
114
+ "style": {
115
+ "palette": "dog_classic",
116
+ "type": "solid",
117
+ "width": "normal"
118
+ },
119
+ "type": "line",
120
+ "response_format": "timeseries",
121
+ "queries": [
122
+ {
123
+ "query": "max:ruby.puma.queue.time.max{}",
124
+ "data_source": "metrics",
125
+ "name": "query1"
126
+ },
127
+ {
128
+ "query": "max:ruby.puma.queue.time.95percentile{}",
129
+ "data_source": "metrics",
130
+ "name": "query2"
131
+ },
132
+ {
133
+ "query": "max:ruby.puma.queue.time.median{}",
134
+ "data_source": "metrics",
135
+ "name": "query3"
136
+ }
137
+ ],
138
+ "formulas": [
139
+ {
140
+ "alias": "max",
141
+ "formula": "query1"
142
+ },
143
+ {
144
+ "alias": "p95",
145
+ "formula": "query2"
146
+ },
147
+ {
148
+ "alias": "median",
149
+ "formula": "query3"
150
+ }
151
+ ]
152
+ }
153
+ ],
154
+ "yaxis": {
155
+ "include_zero": true,
156
+ "max": "auto",
157
+ "scale": "linear",
158
+ "min": "auto",
159
+ "label": ""
160
+ },
161
+ "markers": []
162
+ }
163
+ ```
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Puma
4
+ class Plugin
5
+ module Telemetry
6
+ # Configuration object for plugin
7
+ class Config
8
+ DEFAULT_PUMA_TELEMETRY = [
9
+ # Total booted workers.
10
+ 'workers.booted',
11
+
12
+ # Total number of workers configured.
13
+ 'workers.total',
14
+
15
+ # Current number of threads spawned.
16
+ 'workers.spawned_threads',
17
+
18
+ # Maximum number of threads that can run.
19
+ 'workers.max_threads',
20
+
21
+ # Number of requests performed so far.
22
+ 'workers.requests_count',
23
+
24
+ # Number of requests waiting to be processed.
25
+ 'queue.backlog',
26
+
27
+ # Free capacity that could be utilized, i.e. if backlog
28
+ # is growing, and we still have capacity available, it
29
+ # could mean that load balancing is not performing well.
30
+ 'queue.capacity'
31
+ ].freeze
32
+
33
+ TARGETS = {
34
+ dogstatsd: Telemetry::Targets::DatadogStatsdTarget,
35
+ io: Telemetry::Targets::IOTarget
36
+ }.freeze
37
+
38
+ # Whether telemetry should run with Puma
39
+ # - default: false
40
+ attr_accessor :enabled
41
+
42
+ # Number of seconds to delay first telemetry
43
+ # - default: 5
44
+ attr_accessor :initial_delay
45
+
46
+ # Seconds between publishing telemetry
47
+ # - default: 5
48
+ attr_accessor :frequency
49
+
50
+ # List of targets which are meant to publish telemetry.
51
+ # Target should implement `#call` method accepting
52
+ # a single argument - so it can be even a simple proc.
53
+ # - default: []
54
+ attr_accessor :targets
55
+
56
+ # Which metrics to publish from puma stats. You can select
57
+ # a subset from default ones that interest you the most.
58
+ # - default: DEFAULT_PUMA_TELEMETRY
59
+ attr_accessor :puma_telemetry
60
+
61
+ # Whether to publish socket telemetry.
62
+ # - default: false
63
+ attr_accessor :socket_telemetry
64
+
65
+ # Symbol representing method to parse the `Socket::Option`, or
66
+ # the whole implementation as a lambda. Available options:
67
+ # - `:inspect`, based on the `Socket::Option#inspect` method,
68
+ # it's the safest and slowest way to extract the info. `inspect`
69
+ # output might not be available, i.e. on AWS Fargate
70
+ # - `:unpack`, parse binary data given by `Socket::Option`. Fastest
71
+ # way (12x compared to `inspect`) but depends on kernel headers
72
+ # and fields ordering within the struct. It should almost always
73
+ # match though. DEFAULT
74
+ # - proc/lambda, `Socket::Option` will be given as an argument, it
75
+ # should return the value of `unacked` field as an integer.
76
+ #
77
+ attr_accessor :socket_parser
78
+
79
+ def initialize
80
+ @enabled = false
81
+ @initial_delay = 5
82
+ @frequency = 5
83
+ @targets = []
84
+ @puma_telemetry = DEFAULT_PUMA_TELEMETRY
85
+ @socket_telemetry = false
86
+ @socket_parser = :unpack
87
+ end
88
+
89
+ def enabled?
90
+ !!@enabled
91
+ end
92
+
93
+ def socket_telemetry!
94
+ @socket_telemetry = true
95
+ end
96
+
97
+ def socket_telemetry?
98
+ @socket_telemetry
99
+ end
100
+
101
+ def add_target(name_or_target, **args)
102
+ return @targets.push(name_or_target) unless name_or_target.is_a?(Symbol)
103
+
104
+ target = TARGETS.fetch(name_or_target) do
105
+ raise Telemetry::Error, "Unknown Target: #{name_or_target.inspect}, #{args.inspect}"
106
+ end
107
+
108
+ @targets.push(target.new(**args))
109
+ end
110
+ end
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Puma
4
+ class Plugin
5
+ module Telemetry
6
+ # Helper for working with Puma stats
7
+ module CommonData
8
+ TELEMETRY_TO_METHODS = {
9
+ 'workers.booted' => :workers_booted,
10
+ 'workers.total' => :workers_total,
11
+ 'workers.spawned_threads' => :workers_spawned_threads,
12
+ 'workers.max_threads' => :workers_max_threads,
13
+ 'workers.requests_count' => :workers_requests_count,
14
+ 'queue.backlog' => :queue_backlog,
15
+ 'queue.capacity' => :queue_capacity
16
+ }.freeze
17
+
18
+ def initialize(stats)
19
+ @stats = stats
20
+ end
21
+
22
+ def workers_booted
23
+ @stats.fetch(:booted_workers, 1)
24
+ end
25
+
26
+ def workers_total
27
+ @stats.fetch(:workers, 1)
28
+ end
29
+
30
+ def metrics(selected)
31
+ selected.each_with_object({}) do |metric, obj|
32
+ next unless TELEMETRY_TO_METHODS.key?(metric)
33
+
34
+ obj[metric] = public_send(TELEMETRY_TO_METHODS[metric])
35
+ end
36
+ end
37
+ end
38
+
39
+ # Handles the case of non clustered mode, where `workers` isn't configured
40
+ class WorkerData
41
+ include CommonData
42
+
43
+ def workers_max_threads
44
+ @stats.fetch(:max_threads, 0)
45
+ end
46
+
47
+ def workers_requests_count
48
+ @stats.fetch(:requests_count, 0)
49
+ end
50
+
51
+ def workers_spawned_threads
52
+ @stats.fetch(:running, 0)
53
+ end
54
+
55
+ def queue_backlog
56
+ @stats.fetch(:backlog, 0)
57
+ end
58
+
59
+ def queue_capacity
60
+ @stats.fetch(:pool_capacity, 0)
61
+ end
62
+ end
63
+
64
+ # Handles the case of clustered mode, where we have statistics
65
+ # for all the workers. This class takes care of summing all
66
+ # relevant data.
67
+ class ClusteredData
68
+ include CommonData
69
+
70
+ def workers_max_threads
71
+ sum_stat(:max_threads)
72
+ end
73
+
74
+ def workers_requests_count
75
+ sum_stat(:requests_count)
76
+ end
77
+
78
+ def workers_spawned_threads
79
+ sum_stat(:running)
80
+ end
81
+
82
+ def queue_backlog
83
+ sum_stat(:backlog)
84
+ end
85
+
86
+ def queue_capacity
87
+ sum_stat(:pool_capacity)
88
+ end
89
+
90
+ private
91
+
92
+ def sum_stat(stat)
93
+ @stats[:worker_status].reduce(0) do |sum, data|
94
+ (data.dig(:last_status, stat) || 0) + sum
95
+ end
96
+ end
97
+ end
98
+
99
+ # Pulls TCP INFO data from socket
100
+ class SocketData
101
+ UNACKED_REGEXP = /\ unacked=(?<unacked>\d+)\ /.freeze
102
+
103
+ def initialize(ios, parser)
104
+ @sockets = ios.select { |io| io.respond_to?(:getsockopt) && io.is_a?(TCPSocket) }
105
+ @parser =
106
+ case parser
107
+ when :inspect then method(:parse_with_inspect)
108
+ when :unpack then method(:parse_with_unpack)
109
+ when Proc then parser
110
+ end
111
+ end
112
+
113
+ # Number of unacknowledged connections in the sockets, which
114
+ # we know as socket backlog.
115
+ #
116
+ def unacked
117
+ @sockets.sum do |socket|
118
+ @parser.call(socket.getsockopt(Socket::SOL_TCP,
119
+ Socket::TCP_INFO))
120
+ end
121
+ end
122
+
123
+ def metrics
124
+ {
125
+ 'sockets.backlog' => unacked
126
+ }
127
+ end
128
+
129
+ private
130
+
131
+ # The Socket::Option returned by `getsockopt` doesn't provide
132
+ # any kind of accessors for data inside. It decodes it on demand
133
+ # for `inspect` as strings in C implementation. It looks like
134
+ #
135
+ # #<Socket::Option: INET TCP INFO state=LISTEN
136
+ # ca_state=Open
137
+ # retransmits=0
138
+ # probes=0
139
+ # backoff=0
140
+ # options=0
141
+ # rto=0.000000s
142
+ # ato=0.000000s
143
+ # snd_mss=0
144
+ # rcv_mss=0
145
+ # unacked=0
146
+ # sacked=5
147
+ # lost=0
148
+ # retrans=0
149
+ # fackets=0
150
+ # last_data_sent=0.000s
151
+ # last_ack_sent=0.000s
152
+ # last_data_recv=0.000s
153
+ # last_ack_recv=0.000s
154
+ # pmtu=0
155
+ # rcv_ssthresh=0
156
+ # rtt=0.000000s
157
+ # rttvar=0.000000s
158
+ # snd_ssthresh=0
159
+ # snd_cwnd=10
160
+ # advmss=0
161
+ # reordering=3
162
+ # rcv_rtt=0.000000s
163
+ # rcv_space=0
164
+ # total_retrans=0
165
+ # (128 bytes too long)>
166
+ #
167
+ # That's why pulling the `unacked` field by parsing
168
+ # `inspect` output is one of the ways to retrieve it.
169
+ #
170
+ def parse_with_inspect(tcp_info)
171
+ tcp_match = tcp_info.inspect.match(UNACKED_REGEXP)
172
+
173
+ return 0 if tcp_match.nil?
174
+
175
+ tcp_match[:unacked].to_i
176
+ end
177
+
178
+ # The above inspect data might not be available everywhere (looking at you
179
+ # AWS Fargate Host running on kernel 4.14!), but we might still recover it
180
+ # by manually unpacking the binary data based on linux headers. For example
181
+ # below is tcp info struct from `linux/tcp.h` header file, from problematic
182
+ # host rocking kernel 4.14.
183
+ #
184
+ # struct tcp_info {
185
+ # __u8 tcpi_state;
186
+ # __u8 tcpi_ca_state;
187
+ # __u8 tcpi_retransmits;
188
+ # __u8 tcpi_probes;
189
+ # __u8 tcpi_backoff;
190
+ # __u8 tcpi_options;
191
+ # __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
192
+ # __u8 tcpi_delivery_rate_app_limited:1;
193
+ #
194
+ # __u32 tcpi_rto;
195
+ # __u32 tcpi_ato;
196
+ # __u32 tcpi_snd_mss;
197
+ # __u32 tcpi_rcv_mss;
198
+ #
199
+ # __u32 tcpi_unacked;
200
+ # __u32 tcpi_sacked;
201
+ # __u32 tcpi_lost;
202
+ # __u32 tcpi_retrans;
203
+ # __u32 tcpi_fackets;
204
+ #
205
+ # /* Times. */
206
+ # __u32 tcpi_last_data_sent;
207
+ # __u32 tcpi_last_ack_sent; /* Not remembered, sorry. */
208
+ # __u32 tcpi_last_data_recv;
209
+ # __u32 tcpi_last_ack_recv;
210
+ #
211
+ # /* Metrics. */
212
+ # __u32 tcpi_pmtu;
213
+ # __u32 tcpi_rcv_ssthresh;
214
+ # __u32 tcpi_rtt;
215
+ # __u32 tcpi_rttvar;
216
+ # __u32 tcpi_snd_ssthresh;
217
+ # __u32 tcpi_snd_cwnd;
218
+ # __u32 tcpi_advmss;
219
+ # __u32 tcpi_reordering;
220
+ #
221
+ # __u32 tcpi_rcv_rtt;
222
+ # __u32 tcpi_rcv_space;
223
+ #
224
+ # __u32 tcpi_total_retrans;
225
+ #
226
+ # __u64 tcpi_pacing_rate;
227
+ # __u64 tcpi_max_pacing_rate;
228
+ # __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
229
+ # __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
230
+ # __u32 tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
231
+ # __u32 tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
232
+ #
233
+ # __u32 tcpi_notsent_bytes;
234
+ # __u32 tcpi_min_rtt;
235
+ # __u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
236
+ # __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
237
+ #
238
+ # __u64 tcpi_delivery_rate;
239
+ #
240
+ # __u64 tcpi_busy_time; /* Time (usec) busy sending data */
241
+ # __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */
242
+ # __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
243
+ # };
244
+ #
245
+ # Now, knowing the types and order of fields, we can easily parse binary data
246
+ # by using
247
+ # - `C` flag for `__u8` type - 8-bit unsigned (unsigned char)
248
+ # - `L` flag for `__u32` type - 32-bit unsigned, native endian (uint32_t)
249
+ # - `Q` flag for `__u64` type - 64-bit unsigned, native endian (uint64_t)
250
+ #
251
+ # Complete `unpack` would look like `C8 L24 Q4 L6 Q4`, but we are only
252
+ # interested in `unacked` field at the moment, that's why we only parse
253
+ # till this field by unpacking with `C8 L5`.
254
+ #
255
+ # If you find that it's not giving correct results, then please fall back
256
+ # to inspect, or update this code to accept unpack sequence. But in the
257
+ # end unpack is preferable, as it's 12x faster than inspect.
258
+ #
259
+ # Tested against:
260
+ # - Amazon Linux 2 with kernel 4.14 & 5.10
261
+ # - Ubuntu 20.04 with kernel 5.13
262
+ #
263
+ def parse_with_unpack(tcp_info)
264
+ tcp_info.unpack('C8L5').last
265
+ end
266
+ end
267
+ end
268
+ end
269
+ end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Puma
4
+ class Plugin
5
+ module Telemetry
6
+ module Targets
7
+ # Target wrapping Datadog Statsd client. You can configure
8
+ # all details like _metrics prefix_ and _tags_ in the client
9
+ # itself.
10
+ #
11
+ # ## Example
12
+ #
13
+ # require "datadog/statsd"
14
+ #
15
+ # client = Datadog::Statsd.new(namespace: "ruby.puma",
16
+ # tags: {
17
+ # service: "my-webapp",
18
+ # env: ENV["RAILS_ENV"],
19
+ # version: ENV["CODE_VERSION"]
20
+ # })
21
+ #
22
+ # DatadogStatsdTarget.new(client: client)
23
+ #
24
+ class DatadogStatsdTarget
25
+ def initialize(client:)
26
+ @client = client
27
+ end
28
+
29
+ # We are using `gauge` metric type, which means that only the last
30
+ # value will get send to datadog. DD Statsd client is using extra
31
+ # thread since v5 for aggregating metrics before it sends them.
32
+ #
33
+ # This means that we could publish metrics from here several times
34
+ # before they get flushed from the aggregation thread, and when they
35
+ # do, only the last values will get sent.
36
+ #
37
+ # That's why we are explicitly calling flush here, in order to persist
38
+ # all metrics, and not only the most recent ones.
39
+ #
40
+ def call(telemetry)
41
+ telemetry.each do |metric, value|
42
+ @client.gauge(metric, value)
43
+ end
44
+
45
+ @client.flush(sync: true)
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module Puma
6
+ class Plugin
7
+ module Telemetry
8
+ module Targets
9
+ # Simple IO Target, publishing metrics to STDOUT or logs
10
+ #
11
+ class IOTarget
12
+ # JSON formatter for IO, expects `call` method accepting telemetry hash
13
+ #
14
+ class JSONFormatter
15
+ # NOTE: Replace dots with dashes for better support of AWS CloudWatch
16
+ # Log Metric filters, as they don't support dots in key names.
17
+ def self.call(telemetry)
18
+ log = telemetry.transform_keys { |k| k.tr('.', '-') }
19
+
20
+ log['name'] = 'Puma::Plugin::Telemetry'
21
+ log['message'] = 'Publish telemetry'
22
+
23
+ ::JSON.dump(log)
24
+ end
25
+ end
26
+
27
+ def initialize(io: $stdout, formatter: :json)
28
+ @io = io
29
+ @formatter = case formatter
30
+ when :json then JSONFormatter
31
+ else formatter
32
+ end
33
+ end
34
+
35
+ def call(telemetry)
36
+ @io.puts(@formatter.call(telemetry))
37
+ end
38
+ end
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Puma
4
+ class Plugin
5
+ module Telemetry
6
+ VERSION = '1.1.0'
7
+ end
8
+ end
9
+ end