mod-wsgi-telemetry 1.0.0.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mod_wsgi/__init__.py +2 -0
- mod_wsgi/telemetry/__init__.py +1 -0
- mod_wsgi/telemetry/cli.py +55 -0
- mod_wsgi/telemetry/contention.py +229 -0
- mod_wsgi/telemetry/dump.py +102 -0
- mod_wsgi/telemetry/ingest.py +773 -0
- mod_wsgi/telemetry/server.py +270 -0
- mod_wsgi/telemetry/simulate.py +649 -0
- mod_wsgi/telemetry/static/index.html +8368 -0
- mod_wsgi/telemetry/tui.py +1536 -0
- mod_wsgi/telemetry/wire.py +480 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/METADATA +79 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/RECORD +16 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/WHEEL +4 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/entry_points.txt +2 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,773 @@
|
|
|
1
|
+
"""Async datagram receiver + rolling per-process window.
|
|
2
|
+
|
|
3
|
+
Listens on a UNIX SOCK_DGRAM socket for TLV samples, decodes them, and
|
|
4
|
+
keeps a bounded history per PID so connecting UI clients can fetch
|
|
5
|
+
recent state immediately without waiting for the next tick.
|
|
6
|
+
|
|
7
|
+
Remote (IPv4 UDP) listeners are not supported — telemetry is intended
|
|
8
|
+
for a co-located ingester so MTU / IP-fragmentation / packet-loss are
|
|
9
|
+
non-concerns. The reporter is allowed to emit datagrams that exceed
|
|
10
|
+
the Ethernet MTU as a result.
|
|
11
|
+
|
|
12
|
+
Emits each decoded sample on an asyncio broadcast queue for WebSocket
|
|
13
|
+
clients to pick up.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import grp
|
|
20
|
+
import logging
|
|
21
|
+
import os
|
|
22
|
+
import socket
|
|
23
|
+
import time
|
|
24
|
+
from collections import deque
|
|
25
|
+
from dataclasses import dataclass, field
|
|
26
|
+
from typing import Iterable
|
|
27
|
+
|
|
28
|
+
from .wire import Sample, decode
|
|
29
|
+
|
|
30
|
+
log = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
# How long to retain GC event records per process. Sized to match the
|
|
33
|
+
# longest UI window (5 minutes) so the event timeline can always cover
|
|
34
|
+
# the selected window regardless of event rate. Trimmed by event
|
|
35
|
+
# timestamp rather than entry count so memory use scales with rate
|
|
36
|
+
# rather than being capped to a fixed (possibly too small) entry count.
|
|
37
|
+
GC_EVENT_RETENTION_SEC = 300.0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def parse_listen(spec: str) -> tuple[int, str]:
    """Return (family, bind_path) for a UNIX SOCK_DGRAM target.

    ``spec`` must have the form ``unix:/path``. Everything after the
    first ``unix:`` prefix is taken verbatim as the bind path, so paths
    that themselves contain ``:`` are preserved.

    Raises:
        ValueError: if ``spec`` does not start with ``unix:`` or if the
            path part is empty. An empty path cannot be chmod'ed or
            unlinked afterwards, so it is rejected here rather than
            failing later with a less obvious error.
    """
    scheme, sep, path = spec.partition(":")
    if scheme != "unix" or not sep:
        raise ValueError(
            f"unknown scheme {spec!r}: expected 'unix:/path' "
            f"(remote 'udp:host:port' targets are no longer supported)"
        )
    if not path:
        raise ValueError(
            f"empty socket path in {spec!r}: expected 'unix:/path'"
        )
    return socket.AF_UNIX, path
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def open_socket(spec: str, *, mode: int = 0o660,
                group: str | int | None = None) -> socket.socket:
    """Open and bind the listening socket with explicit permissions.

    Senders only need write permission on the socket file, so the
    default 0660 mode plus a shared group (set via ``group``) is the
    standard pattern: the ingester user owns the socket, the shared
    group covers every WSGI-process identity that needs to connect,
    and nobody else can sendto() the socket.

    During bind() the umask is temporarily tightened to 0077 so the
    socket file is briefly 0600 before the explicit chmod widens it
    to ``mode``; this closes the small window during which an
    inherited umask might leave the file world-writable.

    Args:
        spec: listen target in ``unix:/path`` form (see parse_listen).
        mode: permission bits applied to the socket file after bind.
        group: group name or numeric gid to chown the socket file to;
            ``None`` leaves the group untouched.

    Returns:
        A bound, non-blocking AF_UNIX SOCK_DGRAM socket.

    Raises:
        ValueError: if ``spec`` is not a valid listen target.
        KeyError: if ``group`` is a name that does not exist.
        OSError: if unlink / bind / chown / chmod fails.
    """
    family, addr = parse_listen(spec)

    # Resolve the group before touching the filesystem so an unknown
    # group name fails fast instead of leaving a bound socket file behind.
    gid = None
    if group is not None:
        gid = group if isinstance(group, int) else grp.getgrnam(group).gr_gid

    sock = socket.socket(family, socket.SOCK_DGRAM)
    try:
        # Remove a leftover socket file from a previous run. Attempting
        # the unlink directly avoids the exists()/unlink() race and also
        # clears a dangling symlink, which exists() would report as absent.
        try:
            os.unlink(addr)
        except FileNotFoundError:
            pass

        saved_umask = os.umask(0o077)
        try:
            sock.bind(addr)
        finally:
            os.umask(saved_umask)

        if gid is not None:
            os.chown(addr, -1, gid)
        os.chmod(addr, mode)

        sock.setblocking(False)
        try:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 2 * 1024 * 1024)
        except OSError:
            # Best effort only: a larger receive buffer is an optimisation,
            # the socket is still usable with the default size.
            pass
    except BaseException:
        # Do not leak the descriptor when setup fails part-way through.
        sock.close()
        raise
    return sock
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class ProcessState:
    """Rolling window of samples for one PID.

    One instance is kept per reporting process in ``Ingester.processes``.
    It combines latched identity strings, sequence/drop accounting for
    the periodic stream, and bounded histories (periodic samples, GC
    snapshots, GC events) that are replayed to newly connected clients
    via ``Ingester.snapshot()``.
    """
    # Process id this state belongs to; also the key in Ingester.processes.
    pid: int
    # Identity strings latched from the "hostname" / "process_group"
    # fields of incoming samples whenever those fields are present.
    hostname: str = ""
    process_group: str = ""
    # Build/runtime identity. Emitted by the daemon once at process
    # start (or on every sample; the ingester doesn't care) and
    # latched here so a mid-stream client reconnect still sees the
    # "who is this" banner via the snapshot payload, even if the
    # rolling sample window no longer carries the identity TLVs.
    mod_wsgi_version: str = ""
    python_version: str = ""
    apache_version: str = ""
    mpm_name: str = ""
    # Apache parent pid, latched from process_started. Lets the UI
    # group sibling daemon processes under their parent.
    process_parent_pid: int = 0
    # Telemetry reporter's tick interval in seconds, as reported by the
    # process itself on each KIND_REQUEST sample. Used to size the
    # slow-request TTLs — a reporter ticking every 10 s needs a longer
    # TTL than the default 5 s floor so heartbeats aren't aged out
    # between ticks.
    sample_period: float = 1.0
    # Periodic samples, oldest first; the deque keeps at most the 600
    # most recent entries and silently discards older ones.
    samples: deque = field(default_factory=lambda: deque(maxlen=600))
    # Sequence number of the most recent periodic sample. 0 means "none
    # seen yet", so gap detection is skipped for the first sample.
    last_seq: int = 0
    # Running count of periodic samples presumed lost, derived from
    # forward gaps in the sequence numbers.
    drops: int = 0
    # time.monotonic() value at the last datagram that touched this
    # state; compared against Ingester.STALE_SECONDS for age-out.
    last_seen: float = 0.0
    # Per-interpreter rolling buffer of recent KIND_GC_SNAPSHOT
    # samples, keyed by interpreter_name (empty string for the
    # main / sole interpreter, also the single-interpreter common
    # case where the daemon omits the interpreter_name field on
    # the wire). Each entry in the deque is {stamp, fields} so
    # client-side rendering can compute rates and plot time-series
    # without a follow-up poll.
    gc_snapshots: dict = field(default_factory=dict)
    # Rolling buffer of recent KIND_GC_EVENT records (each is a
    # pause event). Trimmed by event timestamp on each append so the
    # retained span always covers the longest UI window regardless of
    # event rate. See GC_EVENT_RETENTION_SEC.
    gc_events: deque = field(default_factory=deque)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@dataclass
class SlowEntry:
    """A single slow request in the ingester's current view.

    For a given key, each active heartbeat supersedes the previous active
    record, and a completed record makes the entry final (its duration
    is then the end-of-request total). ``last_seen`` holds the monotonic
    arrival time and drives TTL age-out, so a request whose process died
    mid-flight does not stay "active" indefinitely.

    I/O counters are final once the request has completed. On an active
    record they are the partial values captured at scan time; the
    adapter may still read or write more before the request finishes.
    """
    pid: int
    thread_id: int
    # Pid of the Apache child worker that accepted the request. Embedded
    # mode: identical to pid, because the Apache child runs the WSGI
    # app itself. Daemon mode: pid is the daemon process, server_pid is
    # the Apache child that proxied the request to it.
    server_pid: int
    log_id: str
    method: str
    scheme: str
    hostname: str
    script_name: str
    path_info: str
    start_stamp: float       # epoch seconds
    duration: float          # in seconds
    state: int               # active = 0, completed = 1
    # Network identity. peer_ip is taken after trusted-proxy resolution
    # and therefore shows the real client when X-Forwarded-For handling
    # is configured. protocol is "HTTP/1.1" / "HTTP/2.0". user_agent
    # stays empty unless the operator opted in with
    # WSGITelemetryOptions +CaptureUserAgent.
    peer_ip: str = ""
    protocol: str = ""
    user_agent: str = ""
    input_bytes: int = 0
    input_reads: int = 0
    output_bytes: int = 0
    output_writes: int = 0
    cpu_user_time: float = 0.0
    cpu_system_time: float = 0.0
    # Timing split by phase, in seconds. server: Apache request arrival
    # until hand-off to the daemon (or until application_start in
    # embedded mode). queue: daemon connect until worker pickup.
    # daemon: worker pickup until the WSGI callable is invoked.
    # application: time elapsed in the WSGI callable. In embedded mode
    # queue and daemon are 0. An active record still inside the callable
    # carries a partial application value; an active record that has not
    # reached the application yet reports 0 for application and a
    # partial daemon (or server) value, so the user can see where the
    # time is going.
    server_time: float = 0.0
    queue_time: float = 0.0
    daemon_time: float = 0.0
    application_time: float = 0.0
    # Indicator of GIL-wait pressure: the summed waits at every
    # instrumented re-acquire site hit during this request, plus the
    # initial sub-interp GIL acquire. It overlaps the phases rather than
    # adding to them. Waits inside the application's own C extensions
    # are invisible to it, so treat it as a partial indicator.
    gil_wait_time: float = 0.0
    gil_wait_count: int = 0
    # I/O time indicators for this request, also overlapping the phases
    # rather than adding to them. input_read_time: total time inside
    # wsgi.input.read*. output_write_time: total time in the adapter's
    # output path (start_response / write / yield-to-Apache); this is
    # "adapter handoff" time rather than client-receive time, since
    # Apache may buffer and async-flush beyond mod_wsgi's view. The
    # wire.py field comment has the full caveat.
    input_read_time: float = 0.0
    output_write_time: float = 0.0
    # Concurrency context: wsgi_active_requests, counting this request,
    # at slot claim and at completion. For active records
    # active_at_completion is 0 by definition, as the request has not
    # finished. Combined with the per-process request_threads_maximum
    # from the periodic stream to render an "n / max" saturation
    # indicator on the slow-record detail panel.
    active_at_start: int = 0
    active_at_completion: int = 0
    status: int = 0          # final WSGI status; 0 while still unknown
    last_seen: float = 0.0

    def to_dict(self) -> dict:
        """Return the client-facing fields as a plain dict.

        Keys are emitted in a fixed order. ``last_seen`` is left out on
        purpose: it is an ingester-side monotonic timestamp used for
        age-out only.
        """
        exported = (
            "pid", "thread_id", "server_pid", "log_id",
            "method", "scheme", "hostname", "script_name", "path_info",
            "peer_ip", "protocol", "user_agent",
            "start_stamp", "duration", "state",
            "input_bytes", "input_reads", "output_bytes", "output_writes",
            "cpu_user_time", "cpu_system_time",
            "server_time", "queue_time", "daemon_time", "application_time",
            "gil_wait_time", "gil_wait_count",
            "input_read_time", "output_write_time",
            "active_at_start", "active_at_completion",
            "status",
        )
        return {name: getattr(self, name) for name in exported}
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
@dataclass
class LifecycleEvent:
    """A single process_started / process_stopping / process_stopped record.

    Kept in a bounded deque on the ingester side, so a reconnecting
    client gets recent restart history straight away instead of waiting
    for the next event. The frontend draws STOPPING events as chart
    markers; STARTED and STOPPED feed the (future) process-lifetime
    panel and the forensics-style restart event log.
    """
    kind: str    # one of "process_started" | "process_stopping" | "process_stopped"
    pid: int
    stamp: float                            # epoch seconds
    hostname: str = ""
    process_group: str = ""
    process_parent_pid: int = 0             # only on STARTED
    shutdown_reason: str = ""               # on STOPPING / STOPPED
    process_uptime: float = 0.0             # only on STOPPED; seconds
    lifetime_request_count: int = 0         # only on STOPPED
    active_requests_at_decision: int = 0    # only on STOPPING
    active_requests_at_exit: int = 0        # only on STOPPED
    graceful_drain: int = 0                 # only on STOPPED; 1 when the drain completed cleanly

    def to_dict(self) -> dict:
        """Return every field as a plain dict, in declaration order."""
        exported = (
            "kind", "pid", "stamp",
            "hostname", "process_group", "process_parent_pid",
            "shutdown_reason", "process_uptime", "lifetime_request_count",
            "active_requests_at_decision", "active_requests_at_exit",
            "graceful_drain",
        )
        return {name: getattr(self, name) for name in exported}
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
class Ingester:
    """Owns the listening socket and all per-process state.

    run() receives datagrams and passes each to _handle(), which decodes
    it and routes it by sample kind:

    * ``slow_request``      -> ``slow_requests`` (keyed dict, TTL aged)
    * ``process_*``         -> ``lifecycle_events`` (bounded ring)
    * ``gc_snapshot`` / ``gc_event`` -> per-process GC buffers
    * anything else         -> the per-process rolling sample window

    Every accepted record is also pushed to each subscriber queue as a
    plain dict. Text fields in those dicts are decoded from bytes to str
    (UTF-8, undecodable bytes replaced).
    """

    STALE_SECONDS = 300  # drop processes we haven't heard from in 5 min

    # Floor values for slow-request *storage* TTLs. Effective TTL per
    # entry scales with the reporting process's telemetry interval
    # (see _slow_ttl): active = max(FLOOR, 3 * sample_period), completed
    # = max(FLOOR, 5 * sample_period), so a 10 s reporter interval
    # doesn't drop entries between heartbeats.
    #
    # Active records still age out fast so a worker that died mid-
    # request doesn't leave a ghost row pinned forever.
    #
    # Completed records are kept long enough to support drill-down
    # from the Capacity heatmap (whose visible window can outlive the
    # 15 s display TTL the UI table uses). Initially set to match the
    # client's SAMPLE_RETENTION_SEC (10 minutes) — kept as a separate
    # constant so it can be adjusted independently from sample
    # retention if the trade-off ever changes.
    SLOW_ACTIVE_TTL_SECONDS = 5.0
    SLOW_COMPLETED_TTL_SECONDS = 600.0

    # Lifecycle events kept in a bounded ring so a reconnecting client
    # sees recent restart history without waiting for the next event.
    # Sized to comfortably outlive the chart's default rolling window
    # (10 minutes) even on a process group that restarts aggressively.
    LIFECYCLE_RING_SIZE = 500

    # Build/runtime identity strings latched onto ProcessState.
    _IDENTITY_FIELDS = (
        "mod_wsgi_version", "python_version", "apache_version", "mpm_name",
    )

    def __init__(self, listen_spec: str, *, max_subscribers: int = 64,
                 socket_mode: int = 0o660,
                 socket_group: str | int | None = None) -> None:
        """Store configuration and create empty state; no I/O happens here.

        Args:
            listen_spec: listen target passed to open_socket() by run().
            max_subscribers: upper bound on concurrent subscriber queues.
            socket_mode: permission bits for the socket file.
            socket_group: group name or gid for the socket file, or None.
        """
        self.listen_spec = listen_spec
        self.socket_mode = socket_mode
        self.socket_group = socket_group
        self.sock: socket.socket | None = None
        self.processes: dict[int, ProcessState] = {}
        self.slow_requests: dict[tuple, SlowEntry] = {}
        self.lifecycle_events: deque[LifecycleEvent] = deque(
            maxlen=self.LIFECYCLE_RING_SIZE
        )
        self.subscribers: set[asyncio.Queue] = set()
        self.max_subscribers = max_subscribers
        self.decode_errors = 0
        self.total_received = 0

    async def run(self) -> None:
        """Bind the socket and process datagrams until cancelled.

        Cancellation is treated as a clean shutdown: CancelledError is
        absorbed and the socket is closed. An exception raised while
        processing one datagram is logged and the loop carries on, so a
        single unexpected payload cannot stop the ingester.
        """
        self.sock = open_socket(self.listen_spec,
                                mode=self.socket_mode,
                                group=self.socket_group)
        loop = asyncio.get_running_loop()
        log.info("listening on %s", self.listen_spec)
        try:
            while True:
                data = await loop.sock_recv(self.sock, 65536)
                try:
                    self._handle(data)
                except Exception:
                    # Top-level boundary: field conversions in the handlers
                    # (int()/float() on decoded values) can raise on an
                    # unexpected payload. Log with traceback, keep receiving.
                    log.exception(
                        "error handling datagram (len=%d)", len(data))
        except asyncio.CancelledError:
            pass
        finally:
            if self.sock:
                self.sock.close()

    # --- Small shared helpers -------------------------------------------------

    @staticmethod
    def _field_str(fields, name: str) -> str:
        """Return field ``name`` decoded as UTF-8, or "" if absent / not bytes."""
        value = fields.get(name)
        if isinstance(value, bytes):
            return value.decode("utf-8", errors="replace")
        return ""

    @staticmethod
    def _decode_fields(fields) -> dict:
        """Copy ``fields`` with every bytes value decoded to str.

        Non-bytes values are passed through unchanged.
        """
        return {
            k: (v.decode("utf-8", errors="replace") if isinstance(v, bytes) else v)
            for k, v in fields.items()
        }

    def _state_for(self, pid: int) -> ProcessState:
        """Return the ProcessState for ``pid``, creating it on first sight."""
        state = self.processes.get(pid)
        if state is None:
            state = ProcessState(pid=pid)
            self.processes[pid] = state
        return state

    def _slow_ttl(self, entry: SlowEntry) -> float:
        """Return the storage TTL in seconds for one slow-request entry.

        Completed entries (state == 1) use the completed floor and a 5x
        multiple of the owning process's sample period; active entries
        use the active floor and a 3x multiple. A missing process or a
        non-positive period falls back to 1.0 s.
        """
        proc = self.processes.get(entry.pid)
        sp = proc.sample_period if proc and proc.sample_period > 0 else 1.0
        if entry.state == 1:
            return max(self.SLOW_COMPLETED_TTL_SECONDS, 5.0 * sp)
        return max(self.SLOW_ACTIVE_TTL_SECONDS, 3.0 * sp)

    # --- Datagram handling ----------------------------------------------------

    def _handle(self, data: bytes) -> None:
        """Decode one datagram and route it by sample kind.

        Decode failures are counted in ``decode_errors`` and logged; the
        datagram is then discarded.
        """
        self.total_received += 1
        try:
            sample = decode(data)
        except Exception as e:
            self.decode_errors += 1
            log.warning("decode error: %s (len=%d)", e, len(data))
            return

        # Slow-request records are a separate stream. They don't share the
        # per-process rolling sample window — they feed into slow_requests.
        if sample.kind_name == "slow_request":
            self._handle_slow(sample)
            self._gc_slow()
            self._gc_stale()
            return

        # Lifecycle events feed a separate ring buffer; the periodic
        # sample window doesn't carry them.
        if sample.kind_name in (
            "process_started", "process_stopping", "process_stopped"
        ):
            self._handle_lifecycle(sample)
            self._gc_stale()
            return

        # GC telemetry rides on its own kinds (one KIND_GC_SNAPSHOT
        # per interpreter per tick plus one KIND_GC_EVENT per
        # cyclic-GC pause). Routed out of the periodic-sample path
        # so the rolling aggregator does not see a partial payload
        # on every tick and zero out fields like memory_rss.
        if sample.kind_name in ("gc_snapshot", "gc_event"):
            self._handle_gc(sample)
            self._gc_stale()
            return

        state = self._state_for(sample.pid)

        # A forward gap in the sequence numbers counts as lost samples.
        # last_seq == 0 means nothing was seen yet, so no gap is counted.
        if state.last_seq and sample.seq > state.last_seq + 1:
            state.drops += sample.seq - state.last_seq - 1

        state.last_seq = sample.seq
        state.last_seen = time.monotonic()
        state.samples.append(sample)

        # Latch identity strings whenever the sample carries them. A
        # present-but-empty bytes value still overwrites, hence the
        # explicit isinstance check rather than a truthiness test.
        for name in ("hostname", "process_group") + self._IDENTITY_FIELDS:
            raw = sample.fields.get(name)
            if isinstance(raw, bytes):
                setattr(state, name, raw.decode("utf-8", errors="replace"))

        ppid = sample.fields.get("process_parent_pid")
        if isinstance(ppid, int) and ppid > 0:
            state.process_parent_pid = ppid

        sp = sample.fields.get("sample_period")
        if isinstance(sp, (int, float)) and sp > 0:
            state.sample_period = float(sp)

        self._broadcast(sample)
        self._gc_slow()
        self._gc_stale()

    def _handle_lifecycle(self, sample: Sample) -> None:
        """Record a STARTED / STOPPING / STOPPED event.

        STARTED also seeds / refreshes the per-process identity so a
        late-joining client can render the process even if the periodic
        stream hasn't begun yet for this pid. STOPPING and STOPPED only
        carry the trimmed identity (hostname, group) since the consumer
        already knows the process from STARTED + the periodic stream.
        """
        f = sample.fields
        ev = LifecycleEvent(
            kind=sample.kind_name,
            pid=sample.pid,
            stamp=sample.stamp,
            hostname=self._field_str(f, "hostname"),
            process_group=self._field_str(f, "process_group"),
            process_parent_pid=int(f.get("process_parent_pid") or 0),
            shutdown_reason=self._field_str(f, "shutdown_reason"),
            process_uptime=float(f.get("process_uptime") or 0.0),
            lifetime_request_count=int(f.get("lifetime_request_count") or 0),
            active_requests_at_decision=int(
                f.get("active_requests_at_decision") or 0),
            active_requests_at_exit=int(
                f.get("active_requests_at_exit") or 0),
            graceful_drain=int(f.get("graceful_drain") or 0),
        )
        self.lifecycle_events.append(ev)

        # STARTED is the canonical place to latch the static identity
        # banner and the parent pid. Create the ProcessState if the
        # periodic stream hasn't arrived yet so the sidebar shows the
        # process the moment it announces itself.
        if sample.kind_name == "process_started":
            state = self._state_for(sample.pid)
            state.last_seen = time.monotonic()
            if ev.hostname:
                state.hostname = ev.hostname
            if ev.process_group:
                state.process_group = ev.process_group
            if ev.process_parent_pid:
                state.process_parent_pid = ev.process_parent_pid
            for name in self._IDENTITY_FIELDS:
                v = self._field_str(f, name)
                if v:
                    setattr(state, name, v)

        self._enqueue_all({
            "type": "lifecycle",
            "event": ev.to_dict(),
        })

    def _handle_gc(self, sample: Sample) -> None:
        """Stash a KIND_GC_SNAPSHOT or KIND_GC_EVENT for one pid.

        Snapshots are appended to a bounded per-interpreter ring; events
        append to a rolling buffer of recent pause records trimmed by
        timestamp. Both are emitted to subscribers so the UI can render
        the GC tab without an additional poll path.

        Field values that arrive as bytes (e.g. interpreter_name) are
        decoded to str before being stored or emitted, matching what
        _sample_to_dict does for periodic samples.

        Routed out of the periodic-sample window so the rolling
        per-process aggregator (memory_rss, request_threads,
        per-phase times) is not zeroed on every GC tick when a
        snapshot datagram arrives without those fields.
        """
        state = self._state_for(sample.pid)
        state.last_seen = time.monotonic()

        # Empty string stands for the main / sole interpreter.
        interp_name = self._field_str(sample.fields, "interpreter_name")
        fields = self._decode_fields(sample.fields)

        if sample.kind_name == "gc_snapshot":
            ring = state.gc_snapshots.get(interp_name)
            if ring is None:
                ring = deque(maxlen=600)
                state.gc_snapshots[interp_name] = ring
            ring.append({
                "stamp": sample.stamp,
                "fields": fields,
            })
            self._enqueue_all({
                "type": "gc_snapshot",
                "pid": sample.pid,
                "interpreter": interp_name,
                "stamp": sample.stamp,
                "fields": fields,
            })
        else:
            state.gc_events.append({
                "stamp": sample.stamp,
                "interpreter": interp_name,
                "fields": fields,
            })
            # Trim from the old end relative to the newest event's stamp.
            cutoff = sample.stamp - GC_EVENT_RETENTION_SEC
            while state.gc_events and state.gc_events[0]["stamp"] < cutoff:
                state.gc_events.popleft()
            self._enqueue_all({
                "type": "gc_event",
                "pid": sample.pid,
                "interpreter": interp_name,
                "stamp": sample.stamp,
                "fields": fields,
            })

    def _handle_slow(self, sample: Sample) -> None:
        """Insert or replace one slow-request entry and publish it.

        A later record for the same key overwrites the earlier one, so
        active heartbeats and the final completed record collapse into
        a single entry.
        """
        f = sample.fields
        _s = self._field_str

        log_id = _s(f, "slow_log_id")
        thread_id = int(f.get("slow_thread_id") or 0)
        start_stamp = float(f.get("slow_start_stamp") or 0.0)

        # Prefer Apache's per-request log_id as correlation key; fall back
        # to a (pid, thread, start) tuple when mod_unique_id isn't loaded.
        if log_id:
            key: tuple = (sample.pid, log_id)
        else:
            key = (sample.pid, thread_id, start_stamp)

        entry = SlowEntry(
            pid=sample.pid,
            thread_id=thread_id,
            server_pid=int(f.get("slow_server_pid") or 0),
            log_id=log_id,
            method=_s(f, "slow_method"),
            scheme=_s(f, "slow_scheme"),
            hostname=_s(f, "slow_hostname"),
            script_name=_s(f, "slow_script_name"),
            path_info=_s(f, "slow_path_info"),
            peer_ip=_s(f, "slow_peer_ip"),
            protocol=_s(f, "slow_protocol"),
            user_agent=_s(f, "slow_user_agent"),
            start_stamp=start_stamp,
            duration=float(f.get("slow_duration") or 0.0),
            state=int(f.get("slow_record_state") or 0),
            input_bytes=int(f.get("slow_input_bytes") or 0),
            input_reads=int(f.get("slow_input_reads") or 0),
            output_bytes=int(f.get("slow_output_bytes") or 0),
            output_writes=int(f.get("slow_output_writes") or 0),
            cpu_user_time=float(f.get("slow_cpu_user_time") or 0.0),
            cpu_system_time=float(f.get("slow_cpu_system_time") or 0.0),
            server_time=float(f.get("slow_server_time") or 0.0),
            queue_time=float(f.get("slow_queue_time") or 0.0),
            daemon_time=float(f.get("slow_daemon_time") or 0.0),
            application_time=float(f.get("slow_application_time") or 0.0),
            gil_wait_time=float(f.get("slow_gil_wait_time") or 0.0),
            gil_wait_count=int(f.get("slow_gil_wait_count") or 0),
            input_read_time=float(f.get("slow_input_read_time") or 0.0),
            output_write_time=float(f.get("slow_output_write_time") or 0.0),
            active_at_start=int(f.get("slow_active_at_start") or 0),
            active_at_completion=int(f.get("slow_active_at_completion") or 0),
            status=int(f.get("slow_status") or 0),
            last_seen=time.monotonic(),
        )
        self.slow_requests[key] = entry

        self._enqueue_all({
            "type": "slow_request",
            "key": list(key),
            "entry": entry.to_dict(),
            "stamp": sample.stamp,
        })

    # --- Fan-out and garbage collection ---------------------------------------

    def _broadcast(self, sample: Sample) -> None:
        """Publish one periodic sample to every subscriber."""
        self._enqueue_all(self._sample_to_dict(sample))

    def _enqueue_all(self, payload: dict) -> None:
        """Put ``payload`` on every subscriber queue without blocking.

        When a queue is full its oldest item is discarded to make room,
        so a slow consumer loses old messages rather than stalling the
        ingester or growing without bound.
        """
        for q in list(self.subscribers):
            try:
                q.put_nowait(payload)
            except asyncio.QueueFull:
                # Slow consumer — drop the oldest to stay bounded.
                try:
                    q.get_nowait()
                    q.put_nowait(payload)
                except (asyncio.QueueEmpty, asyncio.QueueFull):
                    pass

    def _gc_stale(self) -> None:
        """Forget processes not heard from for more than STALE_SECONDS."""
        now = time.monotonic()
        stale = [
            pid for pid, st in self.processes.items()
            if now - st.last_seen > self.STALE_SECONDS
        ]
        for pid in stale:
            log.info("gc: dropping stale pid=%d", pid)
            self.processes.pop(pid, None)

    def clear_slow_requests(self) -> None:
        """Drop completed history plus any active record GC would also drop.

        Triggered by the Slow requests tab's Clear button. Drops every
        completed entry outright and every active entry whose pid has died
        or whose last_seen is already past the active TTL — so the table
        snaps to "only requests the daemon is still actively heart-beating
        about". Live in-flight rows are preserved. The surviving entries
        are published to subscribers in a ``slow_clear`` message.
        """
        now = time.monotonic()
        kept: dict[tuple, SlowEntry] = {}
        for key, entry in self.slow_requests.items():
            if entry.state == 1:
                continue
            if entry.pid not in self.processes:
                continue
            if now - entry.last_seen > self._slow_ttl(entry):
                continue
            kept[key] = entry
        self.slow_requests = kept
        self._enqueue_all({
            "type": "slow_clear",
            "kept": [
                {"key": list(k), "entry": e.to_dict()}
                for k, e in kept.items()
            ],
        })

    def _gc_slow(self) -> None:
        """Age out slow-request entries the reporter has stopped updating.

        Active entries disappear quickly so a worker that was killed mid-
        request doesn't leave a ghost row. Completed entries linger so a
        user can still see recently-finished slow requests when they open
        the UI. Both TTLs scale with the reporting process's telemetry
        interval: a reporter ticking every 10 s only emits heartbeats
        every 10 s, so a 5 s floor would flicker rows in and out — we
        bump TTL to 3x the sample period in that case. Also drops all
        entries for processes that have aged out of self.processes so
        the list stays in sync with the sidebar.
        """
        if not self.slow_requests:
            return
        now = time.monotonic()
        drop = [
            key for key, entry in self.slow_requests.items()
            if entry.pid not in self.processes
            or now - entry.last_seen > self._slow_ttl(entry)
        ]
        for key in drop:
            self.slow_requests.pop(key, None)

    # --- WebSocket client API -------------------------------------------------

    def subscribe(self) -> asyncio.Queue:
        """Register and return a new bounded subscriber queue.

        Raises:
            RuntimeError: if ``max_subscribers`` queues already exist.
        """
        if len(self.subscribers) >= self.max_subscribers:
            raise RuntimeError("too many subscribers")
        q: asyncio.Queue = asyncio.Queue(maxsize=256)
        self.subscribers.add(q)
        return q

    def unsubscribe(self, q: asyncio.Queue) -> None:
        """Remove a subscriber queue; unknown queues are ignored."""
        self.subscribers.discard(q)

    def snapshot(self) -> dict:
        """Return the full current rolling state for a newly-connected client."""
        return {
            "type": "snapshot",
            "processes": [
                {
                    "pid": st.pid,
                    "hostname": st.hostname,
                    "process_group": st.process_group,
                    "mod_wsgi_version": st.mod_wsgi_version,
                    "python_version": st.python_version,
                    "apache_version": st.apache_version,
                    "mpm_name": st.mpm_name,
                    "process_parent_pid": st.process_parent_pid,
                    "last_seq": st.last_seq,
                    "drops": st.drops,
                    "samples": [self._sample_to_dict(s) for s in st.samples],
                    "gc_snapshots": {
                        interp: [
                            {"stamp": e["stamp"], "fields": e["fields"]}
                            for e in ring
                        ]
                        for interp, ring in st.gc_snapshots.items()
                    },
                    "gc_events": [
                        {
                            "stamp": e["stamp"],
                            "interpreter": e["interpreter"],
                            "fields": e["fields"],
                        }
                        for e in st.gc_events
                    ],
                }
                for st in self.processes.values()
            ],
            "slow_requests": [
                {"key": list(k), "entry": e.to_dict()}
                for k, e in self.slow_requests.items()
            ],
            "lifecycle_events": [ev.to_dict() for ev in self.lifecycle_events],
            "total_received": self.total_received,
            "decode_errors": self.decode_errors,
        }

    @staticmethod
    def _sample_to_dict(sample: Sample) -> dict:
        """Convert a periodic sample to its client-facing dict form.

        Bytes field values are decoded to str; other values pass through.
        """
        return {
            "type": "sample",
            "kind": sample.kind_name,
            "pid": sample.pid,
            "seq": sample.seq,
            "stamp": sample.stamp,
            "fields": Ingester._decode_fields(sample.fields),
        }
|