puma-plugin-telemetry 1.1.0

data/docs/examples.md ADDED
@@ -0,0 +1,163 @@
+ ## Keeping track of waiting requests
+
+ Requests can be waiting in two places:
+ - socket
+ - queue
+
+ Their sum is the total number of accepted requests that are waiting.
+
+ Puma configuration:
+
+ ```ruby
+ plugin :telemetry
+
+ Puma::Plugin::Telemetry.configure do |config|
+   config.enabled = true
+   config.initial_delay = 10
+
+   config.puma_telemetry = %w[queue.backlog]
+
+   config.socket_telemetry!
+
+   config.add_target :dogstatsd, client: Datadog::Statsd.new(tags: %w[your tags], namespace: "ruby.puma")
+ end
+ ```
+
+ Example Datadog widget and its configuration. Depending on what you prefer to see, use `rollup(max)` to show the maximum value or `rollup(sum)` to show the sum across the aggregated time frame.
+
+ | :point_up: | Remember to update tags after initial setup! |
+ |---------------|:---------------------------------------------|
+
+ ![Datadog Widget, barchart showcasing sockets & queue backlog sizes stacked up](example-datadog_backlog_size.png "Datadog Widget")
+
+ ```json
+ {
+   "viz": "timeseries",
+   "requests": [
+     {
+       "style": {
+         "palette": "dog_classic",
+         "type": "solid",
+         "width": "normal"
+       },
+       "type": "bars",
+       "formulas": [
+         {
+           "alias": "queue",
+           "formula": "query1"
+         },
+         {
+           "alias": "socket",
+           "formula": "query2"
+         }
+       ],
+       "response_format": "timeseries",
+       "on_right_yaxis": false,
+       "queries": [
+         {
+           "query": "max:ruby.puma.queue.backlog{}.rollup(max)",
+           "data_source": "metrics",
+           "name": "query1"
+         },
+         {
+           "query": "max:ruby.puma.sockets.backlog{}.rollup(max)",
+           "data_source": "metrics",
+           "name": "query2"
+         }
+       ]
+     }
+   ],
+   "yaxis": {
+     "include_zero": true,
+     "max": "auto",
+     "scale": "linear",
+     "min": "auto",
+     "label": ""
+   },
+   "markers": []
+ }
+ ```
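+
+ Telemetry can also be published to STDOUT or logs with the built-in IO target (`Targets::IOTarget`, shown later in this release). A minimal sketch using the default JSON formatter:
+
+ ```ruby
+ Puma::Plugin::Telemetry.configure do |config|
+   config.enabled = true
+
+   # Defaults to $stdout with the JSON formatter
+   config.add_target :io
+ end
+ ```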
+
+ ## Keeping track of request queue time
+
+ The time a request spends waiting to be processed, from the moment it is accepted by the load balancer until it starts going through the Rack middleware in your application. The holy grail of autoscaling.
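+
+ The snippets below wire up a `RequestQueueTimeMiddleware` that measures this time and reports it as the `queue.time` histogram. A minimal sketch of such a middleware, illustrative only (the constant value and the `X-Request-Start` header format are assumptions; adapt them to what your load balancer actually sends), could look like this:
+
+ ```ruby
+ class RequestQueueTimeMiddleware
+   # Rack env key under which the measurement is exposed (name chosen for illustration).
+   ENV_KEY = "request_queue_time"
+
+   def initialize(app, statsd:)
+     @app = app
+     @statsd = statsd
+   end
+
+   def call(env)
+     if (header = env["HTTP_X_REQUEST_START"])
+       # Strip any prefix such as "t=" and compute milliseconds spent waiting,
+       # assuming the header carries milliseconds since the epoch.
+       started_at = header.delete("^0-9.").to_f
+       queue_time = [(Time.now.to_f * 1_000) - started_at, 0].max
+
+       env[ENV_KEY] = queue_time
+       @statsd.histogram("queue.time", queue_time)
+     end
+
+     @app.call(env)
+   end
+ end
+ ```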
+
+ Example middleware configuration; in the case of Rails it could be placed under `config/initializers/request_queue_time.rb`:
+
+ ```ruby
+ Rails.application.config.middleware.insert_after(
+   0,
+   RequestQueueTimeMiddleware,
+   statsd: Datadog::Statsd.new(namespace: "ruby.puma", tags: %w[your tags])
+ )
+ ```
+
+ If you are utilizing tags in your logs, you might also want to add this measurement as follows:
+
+ ```ruby
+ Rails.application.config.log_tags ||= {}
+ Rails.application.config.log_tags[:queue_time] = ->(req) { req.env[::RequestQueueTimeMiddleware::ENV_KEY] }
+ ```
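+
+ Note that stock Rails tagged logging expects `config.log_tags` to be an Array; the Hash form above assumes a logging setup that accepts named tags, such as `rails_semantic_logger`.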
+
+ Example Datadog widget and its configuration.
+
+ | :point_up: | Remember to update tags after initial setup! |
+ |---------------|:---------------------------------------------|
+
+ ![Datadog Widget, timeseries showing max, p95 and median request queue time](example-datadog_queue_time.png "Datadog Widget")
+
+ ```json
+ {
+   "viz": "timeseries",
+   "requests": [
+     {
+       "style": {
+         "palette": "dog_classic",
+         "type": "solid",
+         "width": "normal"
+       },
+       "type": "line",
+       "response_format": "timeseries",
+       "queries": [
+         {
+           "query": "max:ruby.puma.queue.time.max{}",
+           "data_source": "metrics",
+           "name": "query1"
+         },
+         {
+           "query": "max:ruby.puma.queue.time.95percentile{}",
+           "data_source": "metrics",
+           "name": "query2"
+         },
+         {
+           "query": "max:ruby.puma.queue.time.median{}",
+           "data_source": "metrics",
+           "name": "query3"
+         }
+       ],
+       "formulas": [
+         {
+           "alias": "max",
+           "formula": "query1"
+         },
+         {
+           "alias": "p95",
+           "formula": "query2"
+         },
+         {
+           "alias": "median",
+           "formula": "query3"
+         }
+       ]
+     }
+   ],
+   "yaxis": {
+     "include_zero": true,
+     "max": "auto",
+     "scale": "linear",
+     "min": "auto",
+     "label": ""
+   },
+   "markers": []
+ }
+ ```
@@ -0,0 +1,113 @@
+ # frozen_string_literal: true
+
+ module Puma
+   class Plugin
+     module Telemetry
+       # Configuration object for the plugin
+       class Config
+         DEFAULT_PUMA_TELEMETRY = [
+           # Total booted workers.
+           'workers.booted',
+
+           # Total number of workers configured.
+           'workers.total',
+
+           # Current number of threads spawned.
+           'workers.spawned_threads',
+
+           # Maximum number of threads that can run.
+           'workers.max_threads',
+
+           # Number of requests performed so far.
+           'workers.requests_count',
+
+           # Number of requests waiting to be processed.
+           'queue.backlog',
+
+           # Free capacity that could be utilized, i.e. if the backlog
+           # is growing and we still have capacity available, it
+           # could mean that load balancing is not performing well.
+           'queue.capacity'
+         ].freeze
+
+         TARGETS = {
+           dogstatsd: Telemetry::Targets::DatadogStatsdTarget,
+           io: Telemetry::Targets::IOTarget
+         }.freeze
+
+         # Whether telemetry should run with Puma
+         # - default: false
+         attr_accessor :enabled
+
+         # Number of seconds to delay the first telemetry publication
+         # - default: 5
+         attr_accessor :initial_delay
+
+         # Seconds between publishing telemetry
+         # - default: 5
+         attr_accessor :frequency
+
+         # List of targets which are meant to publish telemetry.
+         # A target should implement a `#call` method accepting
+         # a single argument - so it can even be a simple proc.
+         # - default: []
+         attr_accessor :targets
+
+         # Which metrics to publish from Puma stats. You can select
+         # the subset of the default ones that interests you the most.
+         # - default: DEFAULT_PUMA_TELEMETRY
+         attr_accessor :puma_telemetry
+
+         # Whether to publish socket telemetry.
+         # - default: false
+         attr_accessor :socket_telemetry
+
+         # Symbol representing the method to parse the `Socket::Option`, or
+         # the whole implementation as a lambda. Available options:
+         # - `:inspect`, based on the `Socket::Option#inspect` method;
+         #   it's the safest and slowest way to extract the info. `inspect`
+         #   output might not be available, e.g. on AWS Fargate
+         # - `:unpack`, parses the binary data given by `Socket::Option`. The
+         #   fastest way (about 12x faster than `inspect`), but it depends on
+         #   kernel headers and field ordering within the struct. It should
+         #   almost always match though. This is the DEFAULT
+         # - proc/lambda, which is given the `Socket::Option` as an argument
+         #   and should return the value of the `unacked` field as an integer.
+         #
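+         # Example (equivalent to the built-in `:unpack` parser):
+         #
+         #   config.socket_parser = ->(opt) { opt.unpack('C8L5').last }
+         #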
+         attr_accessor :socket_parser
+
+         def initialize
+           @enabled = false
+           @initial_delay = 5
+           @frequency = 5
+           @targets = []
+           @puma_telemetry = DEFAULT_PUMA_TELEMETRY
+           @socket_telemetry = false
+           @socket_parser = :unpack
+         end
+
+         def enabled?
+           !!@enabled
+         end
+
+         def socket_telemetry!
+           @socket_telemetry = true
+         end
+
+         def socket_telemetry?
+           @socket_telemetry
+         end
+
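+         # Register a telemetry target. Accepts either a Symbol naming one of
+         # the built-in TARGETS, for example:
+         #
+         #   config.add_target :io
+         #   config.add_target :dogstatsd, client: Datadog::Statsd.new
+         #
+         # or any object responding to `#call(telemetry)`, including a proc:
+         #
+         #   config.add_target ->(telemetry) { puts telemetry.inspect }
+         #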
+         def add_target(name_or_target, **args)
+           return @targets.push(name_or_target) unless name_or_target.is_a?(Symbol)
+
+           target = TARGETS.fetch(name_or_target) do
+             raise Telemetry::Error, "Unknown Target: #{name_or_target.inspect}, #{args.inspect}"
+           end
+
+           @targets.push(target.new(**args))
+         end
+       end
+     end
+   end
+ end
@@ -0,0 +1,269 @@
+ # frozen_string_literal: true
+
+ module Puma
+   class Plugin
+     module Telemetry
+       # Helper for working with Puma stats
+       module CommonData
+         TELEMETRY_TO_METHODS = {
+           'workers.booted' => :workers_booted,
+           'workers.total' => :workers_total,
+           'workers.spawned_threads' => :workers_spawned_threads,
+           'workers.max_threads' => :workers_max_threads,
+           'workers.requests_count' => :workers_requests_count,
+           'queue.backlog' => :queue_backlog,
+           'queue.capacity' => :queue_capacity
+         }.freeze
+
+         def initialize(stats)
+           @stats = stats
+         end
+
+         def workers_booted
+           @stats.fetch(:booted_workers, 1)
+         end
+
+         def workers_total
+           @stats.fetch(:workers, 1)
+         end
+
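+         # Builds a Hash of the selected metrics, for example (values illustrative):
+         #
+         #   metrics(['queue.backlog', 'queue.capacity'])
+         #   # => { 'queue.backlog' => 0, 'queue.capacity' => 16 }
+         #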
+         def metrics(selected)
+           selected.each_with_object({}) do |metric, obj|
+             next unless TELEMETRY_TO_METHODS.key?(metric)
+
+             obj[metric] = public_send(TELEMETRY_TO_METHODS[metric])
+           end
+         end
+       end
+
+       # Handles the case of non-clustered mode, where `workers` isn't configured
+       class WorkerData
+         include CommonData
+
+         def workers_max_threads
+           @stats.fetch(:max_threads, 0)
+         end
+
+         def workers_requests_count
+           @stats.fetch(:requests_count, 0)
+         end
+
+         def workers_spawned_threads
+           @stats.fetch(:running, 0)
+         end
+
+         def queue_backlog
+           @stats.fetch(:backlog, 0)
+         end
+
+         def queue_capacity
+           @stats.fetch(:pool_capacity, 0)
+         end
+       end
+
+       # Handles the case of clustered mode, where we have statistics
+       # for all the workers. This class takes care of summing all
+       # relevant data.
+       class ClusteredData
+         include CommonData
+
+         def workers_max_threads
+           sum_stat(:max_threads)
+         end
+
+         def workers_requests_count
+           sum_stat(:requests_count)
+         end
+
+         def workers_spawned_threads
+           sum_stat(:running)
+         end
+
+         def queue_backlog
+           sum_stat(:backlog)
+         end
+
+         def queue_capacity
+           sum_stat(:pool_capacity)
+         end
+
+         private
+
+         def sum_stat(stat)
+           @stats[:worker_status].reduce(0) do |sum, data|
+             (data.dig(:last_status, stat) || 0) + sum
+           end
+         end
+       end
+
+       # Pulls TCP_INFO data from the socket
+       class SocketData
+         UNACKED_REGEXP = /\ unacked=(?<unacked>\d+)\ /.freeze
+
+         def initialize(ios, parser)
+           @sockets = ios.select { |io| io.respond_to?(:getsockopt) && io.is_a?(TCPSocket) }
+           @parser =
+             case parser
+             when :inspect then method(:parse_with_inspect)
+             when :unpack then method(:parse_with_unpack)
+             when Proc then parser
+             end
+         end
+
+         # Number of unacknowledged connections on the sockets, which
+         # we know as the socket backlog.
+         #
+         def unacked
+           @sockets.sum do |socket|
+             @parser.call(socket.getsockopt(Socket::SOL_TCP,
+                                            Socket::TCP_INFO))
+           end
+         end
+
+         def metrics
+           {
+             'sockets.backlog' => unacked
+           }
+         end
+
+         private
+
+         # The Socket::Option returned by `getsockopt` doesn't provide
+         # any kind of accessors for the data inside. The C implementation
+         # decodes it on demand, as strings, for `inspect`. It looks like:
+         #
+         #   #<Socket::Option: INET TCP INFO state=LISTEN
+         #     ca_state=Open
+         #     retransmits=0
+         #     probes=0
+         #     backoff=0
+         #     options=0
+         #     rto=0.000000s
+         #     ato=0.000000s
+         #     snd_mss=0
+         #     rcv_mss=0
+         #     unacked=0
+         #     sacked=5
+         #     lost=0
+         #     retrans=0
+         #     fackets=0
+         #     last_data_sent=0.000s
+         #     last_ack_sent=0.000s
+         #     last_data_recv=0.000s
+         #     last_ack_recv=0.000s
+         #     pmtu=0
+         #     rcv_ssthresh=0
+         #     rtt=0.000000s
+         #     rttvar=0.000000s
+         #     snd_ssthresh=0
+         #     snd_cwnd=10
+         #     advmss=0
+         #     reordering=3
+         #     rcv_rtt=0.000000s
+         #     rcv_space=0
+         #     total_retrans=0
+         #     (128 bytes too long)>
+         #
+         # That's why pulling the `unacked` field by parsing the
+         # `inspect` output is one of the ways to retrieve it.
+         #
+         def parse_with_inspect(tcp_info)
+           tcp_match = tcp_info.inspect.match(UNACKED_REGEXP)
+
+           return 0 if tcp_match.nil?
+
+           tcp_match[:unacked].to_i
+         end
+
+         # The above inspect data might not be available everywhere (looking at you,
+         # AWS Fargate host running on kernel 4.14!), but we might still recover it
+         # by manually unpacking the binary data based on the Linux headers. For
+         # example, below is the tcp_info struct from the `linux/tcp.h` header file,
+         # taken from a problematic host running kernel 4.14.
+         #
+         #   struct tcp_info {
+         #     __u8 tcpi_state;
+         #     __u8 tcpi_ca_state;
+         #     __u8 tcpi_retransmits;
+         #     __u8 tcpi_probes;
+         #     __u8 tcpi_backoff;
+         #     __u8 tcpi_options;
+         #     __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
+         #     __u8 tcpi_delivery_rate_app_limited:1;
+         #
+         #     __u32 tcpi_rto;
+         #     __u32 tcpi_ato;
+         #     __u32 tcpi_snd_mss;
+         #     __u32 tcpi_rcv_mss;
+         #
+         #     __u32 tcpi_unacked;
+         #     __u32 tcpi_sacked;
+         #     __u32 tcpi_lost;
+         #     __u32 tcpi_retrans;
+         #     __u32 tcpi_fackets;
+         #
+         #     /* Times. */
+         #     __u32 tcpi_last_data_sent;
+         #     __u32 tcpi_last_ack_sent;  /* Not remembered, sorry. */
+         #     __u32 tcpi_last_data_recv;
+         #     __u32 tcpi_last_ack_recv;
+         #
+         #     /* Metrics. */
+         #     __u32 tcpi_pmtu;
+         #     __u32 tcpi_rcv_ssthresh;
+         #     __u32 tcpi_rtt;
+         #     __u32 tcpi_rttvar;
+         #     __u32 tcpi_snd_ssthresh;
+         #     __u32 tcpi_snd_cwnd;
+         #     __u32 tcpi_advmss;
+         #     __u32 tcpi_reordering;
+         #
+         #     __u32 tcpi_rcv_rtt;
+         #     __u32 tcpi_rcv_space;
+         #
+         #     __u32 tcpi_total_retrans;
+         #
+         #     __u64 tcpi_pacing_rate;
+         #     __u64 tcpi_max_pacing_rate;
+         #     __u64 tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+         #     __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
+         #     __u32 tcpi_segs_out;       /* RFC4898 tcpEStatsPerfSegsOut */
+         #     __u32 tcpi_segs_in;        /* RFC4898 tcpEStatsPerfSegsIn */
+         #
+         #     __u32 tcpi_notsent_bytes;
+         #     __u32 tcpi_min_rtt;
+         #     __u32 tcpi_data_segs_in;   /* RFC4898 tcpEStatsDataSegsIn */
+         #     __u32 tcpi_data_segs_out;  /* RFC4898 tcpEStatsDataSegsOut */
+         #
+         #     __u64 tcpi_delivery_rate;
+         #
+         #     __u64 tcpi_busy_time;      /* Time (usec) busy sending data */
+         #     __u64 tcpi_rwnd_limited;   /* Time (usec) limited by receive window */
+         #     __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
+         #   };
+         #
+         # Now, knowing the types and order of the fields, we can easily parse the
+         # binary data using:
+         # - the `C` flag for the `__u8` type - 8-bit unsigned (unsigned char)
+         # - the `L` flag for the `__u32` type - 32-bit unsigned, native endian (uint32_t)
+         # - the `Q` flag for the `__u64` type - 64-bit unsigned, native endian (uint64_t)
+         #
+         # The complete `unpack` template would look like `C8 L24 Q4 L6 Q4`, but we are
+         # only interested in the `unacked` field at the moment, so we only parse up to
+         # this field by unpacking with `C8 L5`.
+         #
+         # If you find that it's not giving correct results, please fall back to
+         # inspect, or update this code to accept an unpack sequence. In the end,
+         # though, unpack is preferable, as it's about 12x faster than inspect.
+         #
+         # Tested against:
+         # - Amazon Linux 2 with kernel 4.14 & 5.10
+         # - Ubuntu 20.04 with kernel 5.13
+         #
+         def parse_with_unpack(tcp_info)
+           tcp_info.unpack('C8L5').last
+         end
+       end
+     end
+   end
+ end
@@ -0,0 +1,51 @@
+ # frozen_string_literal: true
+
+ module Puma
+   class Plugin
+     module Telemetry
+       module Targets
+         # Target wrapping the Datadog Statsd client. You can configure
+         # all details like the _metrics prefix_ and _tags_ in the client
+         # itself.
+         #
+         # ## Example
+         #
+         #   require "datadog/statsd"
+         #
+         #   client = Datadog::Statsd.new(namespace: "ruby.puma",
+         #                                tags: {
+         #                                  service: "my-webapp",
+         #                                  env: ENV["RAILS_ENV"],
+         #                                  version: ENV["CODE_VERSION"]
+         #                                })
+         #
+         #   DatadogStatsdTarget.new(client: client)
+         #
+         class DatadogStatsdTarget
+           def initialize(client:)
+             @client = client
+           end
+
+           # We are using the `gauge` metric type, which means that only the last
+           # value will get sent to Datadog. Since v5, the DD Statsd client uses an
+           # extra thread for aggregating metrics before it sends them.
+           #
+           # This means that we could publish metrics from here several times
+           # before they get flushed from the aggregation thread, and when they
+           # do, only the last values will get sent.
+           #
+           # That's why we are explicitly calling flush here, in order to persist
+           # all metrics, and not only the most recent ones.
+           #
+           def call(telemetry)
+             telemetry.each do |metric, value|
+               @client.gauge(metric, value)
+             end
+
+             @client.flush(sync: true)
+           end
+         end
+       end
+     end
+   end
+ end
@@ -0,0 +1,42 @@
+ # frozen_string_literal: true
+
+ require 'json'
+
+ module Puma
+   class Plugin
+     module Telemetry
+       module Targets
+         # Simple IO Target, publishing metrics to STDOUT or logs
+         #
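+         # ## Example
+         #
+         #   IOTarget.new(io: $stdout, formatter: :json)
+         #
+         # emits log lines such as (values illustrative):
+         #
+         #   {"queue-backlog":0,"queue-capacity":16,"name":"Puma::Plugin::Telemetry","message":"Publish telemetry"}
+         #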
+         class IOTarget
+           # JSON formatter for IO; formatters are expected to expose a `call` method accepting the telemetry hash
+           #
+           class JSONFormatter
+             # NOTE: Replaces dots with dashes for better support of AWS CloudWatch
+             # Log Metric filters, as they don't support dots in key names.
+             def self.call(telemetry)
+               log = telemetry.transform_keys { |k| k.tr('.', '-') }
+
+               log['name'] = 'Puma::Plugin::Telemetry'
+               log['message'] = 'Publish telemetry'
+
+               ::JSON.dump(log)
+             end
+           end
+
+           def initialize(io: $stdout, formatter: :json)
+             @io = io
+             @formatter = case formatter
+                          when :json then JSONFormatter
+                          else formatter
+                          end
+           end
+
+           def call(telemetry)
+             @io.puts(@formatter.call(telemetry))
+           end
+         end
+       end
+     end
+   end
+ end
@@ -0,0 +1,9 @@
+ # frozen_string_literal: true
+
+ module Puma
+   class Plugin
+     module Telemetry
+       VERSION = '1.1.0'
+     end
+   end
+ end