mod-wsgi-telemetry 1.0.0.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,773 @@
1
+ """Async datagram receiver + rolling per-process window.
2
+
3
+ Listens on a UNIX SOCK_DGRAM socket for TLV samples, decodes them, and
4
+ keeps a bounded history per PID so connecting UI clients can fetch
5
+ recent state immediately without waiting for the next tick.
6
+
7
+ Remote (IPv4 UDP) listeners are not supported — telemetry is intended
8
+ for a co-located ingester so MTU / IP-fragmentation / packet-loss are
9
+ non-concerns. The reporter is allowed to emit datagrams that exceed
10
+ the Ethernet MTU as a result.
11
+
12
+ Emits each decoded sample on an asyncio broadcast queue for WebSocket
13
+ clients to pick up.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import grp
20
+ import logging
21
+ import os
22
+ import socket
23
+ import time
24
+ from collections import deque
25
+ from dataclasses import dataclass, field
26
+ from typing import Iterable
27
+
28
+ from .wire import Sample, decode
29
+
30
+ log = logging.getLogger(__name__)
31
+
32
+ # How long to retain GC event records per process. Sized to match the
33
+ # longest UI window (5 minutes) so the event timeline can always cover
34
+ # the selected window regardless of event rate. Trimmed by event
35
+ # timestamp rather than entry count so memory use scales with rate
36
+ # rather than being capped to a fixed (possibly too small) entry count.
37
+ GC_EVENT_RETENTION_SEC = 300.0
38
+
39
+
40
def parse_listen(spec: str) -> tuple[int, str]:
    """Parse a listen spec into ``(family, bind_path)``.

    Only ``unix:/path`` targets (UNIX ``SOCK_DGRAM``) are accepted.

    Args:
        spec: Listen specification, e.g. ``"unix:/run/telemetry.sock"``.

    Returns:
        ``(socket.AF_UNIX, path)`` where ``path`` is everything after the
        ``unix:`` prefix.

    Raises:
        ValueError: if ``spec`` does not start with ``unix:``, or if the
            path after the prefix is empty.
    """
    prefix = "unix:"
    if not spec.startswith(prefix):
        raise ValueError(
            f"unknown scheme {spec!r}: expected 'unix:/path' "
            f"(remote 'udp:host:port' targets are no longer supported)"
        )

    path = spec[len(prefix):]
    if not path:
        # An empty path cannot name a socket file, so the chmod/chown the
        # caller performs after bind() could never succeed. Fail here
        # with a message that points at the actual mistake.
        raise ValueError(
            f"empty socket path in {spec!r}: expected 'unix:/path'"
        )
    return socket.AF_UNIX, path
48
+
49
+
50
def open_socket(spec: str, *, mode: int = 0o660,
                group: str | int | None = None) -> socket.socket:
    """Open and bind the listening socket with explicit permissions.

    Senders only need write permission on the socket file, so the
    default 0660 mode plus a shared group (set via ``group``) is the
    standard pattern: the ingester user owns the socket, the shared
    group covers every WSGI-process identity that needs to connect,
    and nobody else can sendto() the socket.

    During bind() the umask is temporarily tightened to 0077 so the
    socket file is briefly 0600 before the explicit chmod widens it
    to ``mode``; this closes the small window during which an
    inherited umask might leave the file world-writable.

    Args:
        spec: Listen specification accepted by ``parse_listen``.
        mode: Permission bits applied to the socket file after bind.
        group: Group name or numeric gid to chown the socket file to;
            ``None`` leaves the group unchanged.

    Returns:
        A bound, non-blocking ``SOCK_DGRAM`` socket.

    Raises:
        ValueError: if ``spec`` is rejected by ``parse_listen``.
        KeyError: if ``group`` is a name ``grp.getgrnam`` cannot resolve.
        OSError: if unlink, bind, chown or chmod fails. The socket object
            is closed before the exception propagates.
    """
    family, addr = parse_listen(spec)

    # Resolve the group before touching the filesystem so an unknown
    # group name fails without leaving a freshly bound socket file behind.
    gid = None
    if group is not None:
        gid = group if isinstance(group, int) else grp.getgrnam(group).gr_gid

    sock = socket.socket(family, socket.SOCK_DGRAM)
    try:
        # Remove whatever a previous run left at the path. Unlinking
        # directly (rather than exists()-then-unlink) has no check/use
        # gap and also clears a dangling symlink.
        try:
            os.unlink(addr)
        except FileNotFoundError:
            pass

        saved_umask = os.umask(0o077)
        try:
            sock.bind(addr)
        finally:
            os.umask(saved_umask)

        if gid is not None:
            os.chown(addr, -1, gid)
        os.chmod(addr, mode)

        sock.setblocking(False)
        try:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF,
                            2 * 1024 * 1024)
        except OSError:
            # Best effort: a refused buffer size is not fatal.
            pass
    except BaseException:
        # Do not leak the descriptor when setup fails part-way.
        sock.close()
        raise
    return sock
88
+
89
+
90
@dataclass
class ProcessState:
    """Per-PID rolling state held by the ingester.

    Combines the bounded window of periodic samples with the identity
    values latched from the stream and the GC telemetry buffers.
    """
    pid: int
    hostname: str = ""
    process_group: str = ""

    # Build/runtime identity strings. The daemon may send them once at
    # process start or on every sample; either way the latest value is
    # latched here so the snapshot payload still carries the "who is
    # this" banner after the identity TLVs have rolled out of the
    # sample window.
    mod_wsgi_version: str = ""
    python_version: str = ""
    apache_version: str = ""
    mpm_name: str = ""

    # Pid of the Apache parent, latched from process_started, so the UI
    # can group sibling daemon processes under it.
    process_parent_pid: int = 0

    # Reporter tick interval in seconds as reported by the process
    # itself. Drives the slow-request TTL scaling: a reporter ticking
    # every 10 s needs a longer TTL than the 5 s floor, otherwise
    # heartbeats would age out between ticks.
    sample_period: float = 1.0

    # Bounded window of the most recent periodic samples.
    samples: deque = field(default_factory=lambda: deque(maxlen=600))
    last_seq: int = 0
    drops: int = 0
    last_seen: float = 0.0

    # interpreter_name -> rolling buffer of recent KIND_GC_SNAPSHOT
    # entries. The empty-string key is the main / sole interpreter,
    # which is also what is used when the daemon omits the
    # interpreter_name field on the wire. Entries are
    # {stamp, fields} dicts so clients can compute rates and plot
    # time-series without a follow-up poll.
    gc_snapshots: dict = field(default_factory=dict)

    # Recent KIND_GC_EVENT records (one per pause). Unbounded by count;
    # trimmed by event timestamp on append so the retained span covers
    # the longest UI window whatever the event rate. See
    # GC_EVENT_RETENTION_SEC.
    gc_events: deque = field(default_factory=deque)
131
+
132
+
133
@dataclass
class SlowEntry:
    """Latest known state of one slow request.

    A newer active heartbeat for the same key replaces the earlier one;
    a completed record is final and its duration is the end-of-request
    total. ``last_seen`` is the monotonic arrival time used for TTL
    age-out, so requests from processes that died mid-flight do not
    stay "active" forever.

    I/O counters are final on a completed record. On an active record
    they are the partial values captured at scan time; the adapter may
    still read or write more before the request completes.
    """
    pid: int
    thread_id: int
    # Pid of the Apache child worker that accepted the request. Equal to
    # ``pid`` in embedded mode (the child runs the WSGI app itself); in
    # daemon mode ``pid`` is the daemon process and ``server_pid`` is
    # the child that proxied the request.
    server_pid: int
    log_id: str
    method: str
    scheme: str
    hostname: str
    script_name: str
    path_info: str
    start_stamp: float            # seconds since epoch
    duration: float               # seconds
    state: int                    # 0 = active, 1 = completed

    # Network identity. peer_ip is taken after trusted-proxy resolution,
    # so it is the real client when X-Forwarded-For handling is
    # configured. protocol is "HTTP/1.1" / "HTTP/2.0". user_agent stays
    # empty unless the operator enabled
    # WSGITelemetryOptions +CaptureUserAgent.
    peer_ip: str = ""
    protocol: str = ""
    user_agent: str = ""

    input_bytes: int = 0
    input_reads: int = 0
    output_bytes: int = 0
    output_writes: int = 0
    cpu_user_time: float = 0.0
    cpu_system_time: float = 0.0

    # Per-phase timings in seconds:
    #   server      - Apache request arrival until hand-off to the daemon
    #                 (or until application start in embedded mode)
    #   queue       - daemon connect until worker pickup
    #   daemon      - worker pickup until the WSGI callable is invoked
    #   application - time inside the WSGI callable
    # queue and daemon are 0 in embedded mode. For an active record
    # still inside the callable, application is partial; an active
    # record that has not reached the callable reports application 0
    # and a partial daemon (or server) so the user can see where the
    # time is going.
    server_time: float = 0.0
    queue_time: float = 0.0
    daemon_time: float = 0.0
    application_time: float = 0.0

    # GIL-wait pressure: summed waits at every instrumented re-acquire
    # site hit during the request plus the initial sub-interpreter GIL
    # acquire. It overlaps the phases above rather than adding to them,
    # and cannot see waits inside the application's own C extensions,
    # so it is only a partial indicator.
    gil_wait_time: float = 0.0
    gil_wait_count: int = 0

    # I/O time overlap indicators (also not phase addends).
    # input_read_time is total time inside wsgi.input.read*;
    # output_write_time is total time in the adapter's output path
    # (start_response / write / yield-to-Apache). The latter is
    # adapter hand-off time, not client-receive time: Apache may buffer
    # and flush asynchronously beyond mod_wsgi's view. The wire.py
    # field comment has the full caveat.
    input_read_time: float = 0.0
    output_write_time: float = 0.0

    # Concurrency context: wsgi_active_requests (this request included)
    # at slot claim and at completion. active_at_completion is 0 on an
    # active record because the request has not finished. Combined with
    # the per-process request_threads_maximum from the periodic stream
    # to render the "n / max" saturation indicator in the slow-record
    # detail panel.
    active_at_start: int = 0
    active_at_completion: int = 0
    status: int = 0               # 0 = not yet known, else final WSGI status
    last_seen: float = 0.0

    def to_dict(self) -> dict:
        """Return the client-facing dict for this entry.

        ``last_seen`` is ingester bookkeeping and is left out.
        """
        # Key order is the emitted order; keep it stable.
        exported = (
            "pid", "thread_id", "server_pid", "log_id",
            "method", "scheme", "hostname", "script_name", "path_info",
            "peer_ip", "protocol", "user_agent",
            "start_stamp", "duration", "state",
            "input_bytes", "input_reads", "output_bytes", "output_writes",
            "cpu_user_time", "cpu_system_time",
            "server_time", "queue_time", "daemon_time", "application_time",
            "gil_wait_time", "gil_wait_count",
            "input_read_time", "output_write_time",
            "active_at_start", "active_at_completion",
            "status",
        )
        return {name: getattr(self, name) for name in exported}
253
+
254
+
255
@dataclass
class LifecycleEvent:
    """A single process_started / process_stopping / process_stopped record.

    Events are kept in a bounded ingester-side deque so a reconnecting
    client sees recent restart history straight away. The frontend
    draws STOPPING events as chart markers; STARTED and STOPPED feed
    the (future) process-lifetime panel and the forensics-style restart
    event log.
    """
    # "process_started" | "process_stopping" | "process_stopped"
    kind: str
    pid: int
    # Seconds since epoch.
    stamp: float
    hostname: str = ""
    process_group: str = ""
    # STARTED only.
    process_parent_pid: int = 0
    # STOPPING / STOPPED.
    shutdown_reason: str = ""
    # STOPPED only; seconds.
    process_uptime: float = 0.0
    # STOPPED only.
    lifetime_request_count: int = 0
    # STOPPING only.
    active_requests_at_decision: int = 0
    # STOPPED only.
    active_requests_at_exit: int = 0
    # STOPPED only; 1 if the drain completed cleanly.
    graceful_drain: int = 0

    def to_dict(self) -> dict:
        """Return every field as a plain dict, in declaration order."""
        exported = (
            "kind", "pid", "stamp", "hostname", "process_group",
            "process_parent_pid", "shutdown_reason", "process_uptime",
            "lifetime_request_count", "active_requests_at_decision",
            "active_requests_at_exit", "graceful_drain",
        )
        return {name: getattr(self, name) for name in exported}
293
+
294
+
295
+ class Ingester:
296
+ """Owns the listening socket and all per-process state."""
297
+
298
+ STALE_SECONDS = 300 # drop processes we haven't heard from in 5 min
299
+
300
+ # Floor values for slow-request *storage* TTLs. Effective TTL per
301
+ # entry scales with the reporting process's telemetry interval
302
+ # (see _gc_slow): active = max(FLOOR, 3 * sample_period), completed
303
+ # = max(FLOOR, 5 * sample_period), so a 10 s reporter interval
304
+ # doesn't drop entries between heartbeats.
305
+ #
306
+ # Active records still age out fast so a worker that died mid-
307
+ # request doesn't leave a ghost row pinned forever.
308
+ #
309
+ # Completed records are kept long enough to support drill-down
310
+ # from the Capacity heatmap (whose visible window can outlive the
311
+ # 15 s display TTL the UI table uses). Initially set to match the
312
+ # client's SAMPLE_RETENTION_SEC (10 minutes) — kept as a separate
313
+ # constant so it can be adjusted independently from sample
314
+ # retention if the trade-off ever changes.
315
+ SLOW_ACTIVE_TTL_SECONDS = 5.0
316
+ SLOW_COMPLETED_TTL_SECONDS = 600.0
317
+
318
+ # Lifecycle events kept in a bounded ring so a reconnecting client
319
+ # sees recent restart history without waiting for the next event.
320
+ # Sized to comfortably outlive the chart's default rolling window
321
+ # (10 minutes) even on a process group that restarts aggressively.
322
+ LIFECYCLE_RING_SIZE = 500
323
+
324
+ def __init__(self, listen_spec: str, *, max_subscribers: int = 64,
325
+ socket_mode: int = 0o660,
326
+ socket_group: str | int | None = None) -> None:
327
+ self.listen_spec = listen_spec
328
+ self.socket_mode = socket_mode
329
+ self.socket_group = socket_group
330
+ self.sock: socket.socket | None = None
331
+ self.processes: dict[int, ProcessState] = {}
332
+ self.slow_requests: dict[tuple, SlowEntry] = {}
333
+ self.lifecycle_events: deque[LifecycleEvent] = deque(
334
+ maxlen=self.LIFECYCLE_RING_SIZE
335
+ )
336
+ self.subscribers: set[asyncio.Queue] = set()
337
+ self.max_subscribers = max_subscribers
338
+ self.decode_errors = 0
339
+ self.total_received = 0
340
+
341
+ async def run(self) -> None:
342
+ self.sock = open_socket(self.listen_spec,
343
+ mode=self.socket_mode,
344
+ group=self.socket_group)
345
+ loop = asyncio.get_running_loop()
346
+ log.info("listening on %s", self.listen_spec)
347
+ try:
348
+ while True:
349
+ data = await loop.sock_recv(self.sock, 65536)
350
+ self._handle(data)
351
+ except asyncio.CancelledError:
352
+ pass
353
+ finally:
354
+ if self.sock:
355
+ self.sock.close()
356
+
357
+ def _handle(self, data: bytes) -> None:
358
+ self.total_received += 1
359
+ try:
360
+ sample = decode(data)
361
+ except Exception as e:
362
+ self.decode_errors += 1
363
+ log.warning("decode error: %s (len=%d)", e, len(data))
364
+ return
365
+
366
+ # Slow-request records are a separate stream. They don't share the
367
+ # per-process rolling sample window — they feed into slow_requests.
368
+ if sample.kind_name == "slow_request":
369
+ self._handle_slow(sample)
370
+ self._gc_slow()
371
+ self._gc_stale()
372
+ return
373
+
374
+ # Lifecycle events feed a separate ring buffer; the periodic
375
+ # sample window doesn't carry them.
376
+ if sample.kind_name in (
377
+ "process_started", "process_stopping", "process_stopped"
378
+ ):
379
+ self._handle_lifecycle(sample)
380
+ self._gc_stale()
381
+ return
382
+
383
+ # GC telemetry rides on its own kinds (one KIND_GC_SNAPSHOT
384
+ # per interpreter per tick plus one KIND_GC_EVENT per
385
+ # cyclic-GC pause). Routed out of the periodic-sample path
386
+ # so the rolling aggregator does not see a partial payload
387
+ # on every tick and zero out fields like memory_rss.
388
+ if sample.kind_name in ("gc_snapshot", "gc_event"):
389
+ self._handle_gc(sample)
390
+ self._gc_stale()
391
+ return
392
+
393
+ state = self.processes.get(sample.pid)
394
+ if state is None:
395
+ state = ProcessState(pid=sample.pid)
396
+ self.processes[sample.pid] = state
397
+
398
+ if state.last_seq and sample.seq > state.last_seq + 1:
399
+ state.drops += sample.seq - state.last_seq - 1
400
+
401
+ state.last_seq = sample.seq
402
+ state.last_seen = time.monotonic()
403
+ state.samples.append(sample)
404
+
405
+ def _latch_str(field_name: str, attr: str) -> None:
406
+ v = sample.fields.get(field_name)
407
+ if isinstance(v, bytes):
408
+ setattr(state, attr, v.decode("utf-8", errors="replace"))
409
+
410
+ _latch_str("hostname", "hostname")
411
+ _latch_str("process_group", "process_group")
412
+ _latch_str("mod_wsgi_version", "mod_wsgi_version")
413
+ _latch_str("python_version", "python_version")
414
+ _latch_str("apache_version", "apache_version")
415
+ _latch_str("mpm_name", "mpm_name")
416
+
417
+ ppid = sample.fields.get("process_parent_pid")
418
+ if isinstance(ppid, int) and ppid > 0:
419
+ state.process_parent_pid = ppid
420
+
421
+ sp = sample.fields.get("sample_period")
422
+ if isinstance(sp, (int, float)) and sp > 0:
423
+ state.sample_period = float(sp)
424
+
425
+ self._broadcast(sample)
426
+ self._gc_slow()
427
+ self._gc_stale()
428
+
429
+ def _handle_lifecycle(self, sample: Sample) -> None:
430
+ """Record a STARTED / STOPPING / STOPPED event.
431
+
432
+ STARTED also seeds / refreshes the per-process identity so a
433
+ late-joining client can render the process even if the periodic
434
+ stream hasn't begun yet for this pid. STOPPING and STOPPED only
435
+ carry the trimmed identity (hostname, group) since the consumer
436
+ already knows the process from STARTED + the periodic stream.
437
+ """
438
+ f = sample.fields
439
+
440
+ def _s(name: str) -> str:
441
+ v = f.get(name)
442
+ if isinstance(v, bytes):
443
+ return v.decode("utf-8", errors="replace")
444
+ return ""
445
+
446
+ ev = LifecycleEvent(
447
+ kind=sample.kind_name,
448
+ pid=sample.pid,
449
+ stamp=sample.stamp,
450
+ hostname=_s("hostname"),
451
+ process_group=_s("process_group"),
452
+ process_parent_pid=int(f.get("process_parent_pid") or 0),
453
+ shutdown_reason=_s("shutdown_reason"),
454
+ process_uptime=float(f.get("process_uptime") or 0.0),
455
+ lifetime_request_count=int(f.get("lifetime_request_count") or 0),
456
+ active_requests_at_decision=int(
457
+ f.get("active_requests_at_decision") or 0),
458
+ active_requests_at_exit=int(
459
+ f.get("active_requests_at_exit") or 0),
460
+ graceful_drain=int(f.get("graceful_drain") or 0),
461
+ )
462
+ self.lifecycle_events.append(ev)
463
+
464
+ # STARTED is the canonical place to latch the static identity
465
+ # banner and the parent pid. Create the ProcessState if the
466
+ # periodic stream hasn't arrived yet so the sidebar shows the
467
+ # process the moment it announces itself.
468
+ if sample.kind_name == "process_started":
469
+ state = self.processes.get(sample.pid)
470
+ if state is None:
471
+ state = ProcessState(pid=sample.pid)
472
+ self.processes[sample.pid] = state
473
+ state.last_seen = time.monotonic()
474
+ if ev.hostname:
475
+ state.hostname = ev.hostname
476
+ if ev.process_group:
477
+ state.process_group = ev.process_group
478
+ if ev.process_parent_pid:
479
+ state.process_parent_pid = ev.process_parent_pid
480
+ for name in ("mod_wsgi_version", "python_version",
481
+ "apache_version", "mpm_name"):
482
+ v = _s(name)
483
+ if v:
484
+ setattr(state, name, v)
485
+
486
+ self._enqueue_all({
487
+ "type": "lifecycle",
488
+ "event": ev.to_dict(),
489
+ })
490
+
491
+ def _handle_gc(self, sample: Sample) -> None:
492
+ """Stash a KIND_GC_SNAPSHOT or KIND_GC_EVENT for one pid.
493
+
494
+ Snapshots latch the most recent tier-1 counters per
495
+ interpreter; events append to a rolling buffer of recent
496
+ pause records. Both are emitted to subscribers as-is so the
497
+ UI can render the GC tab without an additional poll path.
498
+
499
+ Routed out of the periodic-sample window so the rolling
500
+ per-process aggregator (memory_rss, request_threads,
501
+ per-phase times) is not zeroed on every GC tick when a
502
+ snapshot datagram arrives without those fields.
503
+ """
504
+ state = self.processes.get(sample.pid)
505
+ if state is None:
506
+ state = ProcessState(pid=sample.pid)
507
+ self.processes[sample.pid] = state
508
+ state.last_seen = time.monotonic()
509
+
510
+ interp_name = sample.fields.get("interpreter_name")
511
+ if isinstance(interp_name, bytes):
512
+ interp_name = interp_name.decode("utf-8", errors="replace")
513
+ else:
514
+ interp_name = ""
515
+
516
+ if sample.kind_name == "gc_snapshot":
517
+ ring = state.gc_snapshots.get(interp_name)
518
+ if ring is None:
519
+ ring = deque(maxlen=600)
520
+ state.gc_snapshots[interp_name] = ring
521
+ entry = {
522
+ "stamp": sample.stamp,
523
+ "fields": dict(sample.fields),
524
+ }
525
+ ring.append(entry)
526
+ self._enqueue_all({
527
+ "type": "gc_snapshot",
528
+ "pid": sample.pid,
529
+ "interpreter": interp_name,
530
+ "stamp": sample.stamp,
531
+ "fields": entry["fields"],
532
+ })
533
+ else:
534
+ entry = {
535
+ "stamp": sample.stamp,
536
+ "interpreter": interp_name,
537
+ "fields": dict(sample.fields),
538
+ }
539
+ state.gc_events.append(entry)
540
+ cutoff = sample.stamp - GC_EVENT_RETENTION_SEC
541
+ while state.gc_events and state.gc_events[0]["stamp"] < cutoff:
542
+ state.gc_events.popleft()
543
+ self._enqueue_all({
544
+ "type": "gc_event",
545
+ "pid": sample.pid,
546
+ "interpreter": interp_name,
547
+ "stamp": sample.stamp,
548
+ "fields": entry["fields"],
549
+ })
550
+
551
+ def _handle_slow(self, sample: Sample) -> None:
552
+ f = sample.fields
553
+
554
+ def _s(name: str) -> str:
555
+ v = f.get(name)
556
+ if isinstance(v, bytes):
557
+ return v.decode("utf-8", errors="replace")
558
+ return ""
559
+
560
+ log_id = _s("slow_log_id")
561
+ thread_id = int(f.get("slow_thread_id") or 0)
562
+ start_stamp = float(f.get("slow_start_stamp") or 0.0)
563
+
564
+ # Prefer Apache's per-request log_id as correlation key; fall back
565
+ # to a (pid, thread, start) tuple when mod_unique_id isn't loaded.
566
+ if log_id:
567
+ key: tuple = (sample.pid, log_id)
568
+ else:
569
+ key = (sample.pid, thread_id, start_stamp)
570
+
571
+ entry = SlowEntry(
572
+ pid=sample.pid,
573
+ thread_id=thread_id,
574
+ server_pid=int(f.get("slow_server_pid") or 0),
575
+ log_id=log_id,
576
+ method=_s("slow_method"),
577
+ scheme=_s("slow_scheme"),
578
+ hostname=_s("slow_hostname"),
579
+ script_name=_s("slow_script_name"),
580
+ path_info=_s("slow_path_info"),
581
+ peer_ip=_s("slow_peer_ip"),
582
+ protocol=_s("slow_protocol"),
583
+ user_agent=_s("slow_user_agent"),
584
+ start_stamp=start_stamp,
585
+ duration=float(f.get("slow_duration") or 0.0),
586
+ state=int(f.get("slow_record_state") or 0),
587
+ input_bytes=int(f.get("slow_input_bytes") or 0),
588
+ input_reads=int(f.get("slow_input_reads") or 0),
589
+ output_bytes=int(f.get("slow_output_bytes") or 0),
590
+ output_writes=int(f.get("slow_output_writes") or 0),
591
+ cpu_user_time=float(f.get("slow_cpu_user_time") or 0.0),
592
+ cpu_system_time=float(f.get("slow_cpu_system_time") or 0.0),
593
+ server_time=float(f.get("slow_server_time") or 0.0),
594
+ queue_time=float(f.get("slow_queue_time") or 0.0),
595
+ daemon_time=float(f.get("slow_daemon_time") or 0.0),
596
+ application_time=float(f.get("slow_application_time") or 0.0),
597
+ gil_wait_time=float(f.get("slow_gil_wait_time") or 0.0),
598
+ gil_wait_count=int(f.get("slow_gil_wait_count") or 0),
599
+ input_read_time=float(f.get("slow_input_read_time") or 0.0),
600
+ output_write_time=float(f.get("slow_output_write_time") or 0.0),
601
+ active_at_start=int(f.get("slow_active_at_start") or 0),
602
+ active_at_completion=int(f.get("slow_active_at_completion") or 0),
603
+ status=int(f.get("slow_status") or 0),
604
+ last_seen=time.monotonic(),
605
+ )
606
+ self.slow_requests[key] = entry
607
+
608
+ self._enqueue_all({
609
+ "type": "slow_request",
610
+ "key": list(key),
611
+ "entry": entry.to_dict(),
612
+ "stamp": sample.stamp,
613
+ })
614
+
615
+ def _broadcast(self, sample: Sample) -> None:
616
+ self._enqueue_all(self._sample_to_dict(sample))
617
+
618
+ def _enqueue_all(self, payload: dict) -> None:
619
+ for q in list(self.subscribers):
620
+ try:
621
+ q.put_nowait(payload)
622
+ except asyncio.QueueFull:
623
+ # Slow consumer — drop the oldest to stay bounded.
624
+ try:
625
+ q.get_nowait()
626
+ q.put_nowait(payload)
627
+ except Exception:
628
+ pass
629
+
630
+ def _gc_stale(self) -> None:
631
+ now = time.monotonic()
632
+ stale = [
633
+ pid for pid, st in self.processes.items()
634
+ if now - st.last_seen > self.STALE_SECONDS
635
+ ]
636
+ for pid in stale:
637
+ log.info("gc: dropping stale pid=%d", pid)
638
+ self.processes.pop(pid, None)
639
+
640
+ def clear_slow_requests(self) -> None:
641
+ """Drop completed history plus any active record GC would also drop.
642
+
643
+ Triggered by the Slow requests tab's Clear button. Drops every
644
+ completed entry outright and every active entry whose pid has died
645
+ or whose last_seen is already past the active TTL — so the table
646
+ snaps to "only requests the daemon is still actively heart-beating
647
+ about". Live in-flight rows are preserved.
648
+ """
649
+ now = time.monotonic()
650
+ live_pids = set(self.processes)
651
+ kept: dict[tuple, SlowEntry] = {}
652
+ for key, entry in self.slow_requests.items():
653
+ if entry.state == 1:
654
+ continue
655
+ if entry.pid not in live_pids:
656
+ continue
657
+ proc = self.processes.get(entry.pid)
658
+ sp = proc.sample_period if proc and proc.sample_period > 0 else 1.0
659
+ ttl = max(self.SLOW_ACTIVE_TTL_SECONDS, 3.0 * sp)
660
+ if now - entry.last_seen > ttl:
661
+ continue
662
+ kept[key] = entry
663
+ self.slow_requests = kept
664
+ self._enqueue_all({
665
+ "type": "slow_clear",
666
+ "kept": [
667
+ {"key": list(k), "entry": e.to_dict()}
668
+ for k, e in kept.items()
669
+ ],
670
+ })
671
+
672
+ def _gc_slow(self) -> None:
673
+ """Age out slow-request entries the reporter has stopped updating.
674
+
675
+ Active entries disappear quickly so a worker that was killed mid-
676
+ request doesn't leave a ghost row. Completed entries linger so a
677
+ user can still see recently-finished slow requests when they open
678
+ the UI. Both TTLs scale with the reporting process's telemetry
679
+ interval: a reporter ticking every 10 s only emits heartbeats
680
+ every 10 s, so a 5 s floor would flicker rows in and out — we
681
+ bump TTL to 3x the sample period in that case. Also drops all
682
+ entries for processes that have aged out of self.processes so
683
+ the list stays in sync with the sidebar.
684
+ """
685
+ if not self.slow_requests:
686
+ return
687
+ now = time.monotonic()
688
+ drop = []
689
+ live_pids = set(self.processes)
690
+ for key, entry in self.slow_requests.items():
691
+ if entry.pid not in live_pids:
692
+ drop.append(key)
693
+ continue
694
+ proc = self.processes.get(entry.pid)
695
+ sp = proc.sample_period if proc and proc.sample_period > 0 else 1.0
696
+ if entry.state == 1:
697
+ ttl = max(self.SLOW_COMPLETED_TTL_SECONDS, 5.0 * sp)
698
+ else:
699
+ ttl = max(self.SLOW_ACTIVE_TTL_SECONDS, 3.0 * sp)
700
+ if now - entry.last_seen > ttl:
701
+ drop.append(key)
702
+ for key in drop:
703
+ self.slow_requests.pop(key, None)
704
+
705
+ # --- WebSocket client API -------------------------------------------------
706
+
707
+ def subscribe(self) -> asyncio.Queue:
708
+ if len(self.subscribers) >= self.max_subscribers:
709
+ raise RuntimeError("too many subscribers")
710
+ q: asyncio.Queue = asyncio.Queue(maxsize=256)
711
+ self.subscribers.add(q)
712
+ return q
713
+
714
+ def unsubscribe(self, q: asyncio.Queue) -> None:
715
+ self.subscribers.discard(q)
716
+
717
+ def snapshot(self) -> dict:
718
+ """Return the full current rolling state for a newly-connected client."""
719
+ return {
720
+ "type": "snapshot",
721
+ "processes": [
722
+ {
723
+ "pid": st.pid,
724
+ "hostname": st.hostname,
725
+ "process_group": st.process_group,
726
+ "mod_wsgi_version": st.mod_wsgi_version,
727
+ "python_version": st.python_version,
728
+ "apache_version": st.apache_version,
729
+ "mpm_name": st.mpm_name,
730
+ "process_parent_pid": st.process_parent_pid,
731
+ "last_seq": st.last_seq,
732
+ "drops": st.drops,
733
+ "samples": [self._sample_to_dict(s) for s in st.samples],
734
+ "gc_snapshots": {
735
+ interp: [
736
+ {"stamp": e["stamp"], "fields": e["fields"]}
737
+ for e in ring
738
+ ]
739
+ for interp, ring in st.gc_snapshots.items()
740
+ },
741
+ "gc_events": [
742
+ {
743
+ "stamp": e["stamp"],
744
+ "interpreter": e["interpreter"],
745
+ "fields": e["fields"],
746
+ }
747
+ for e in st.gc_events
748
+ ],
749
+ }
750
+ for st in self.processes.values()
751
+ ],
752
+ "slow_requests": [
753
+ {"key": list(k), "entry": e.to_dict()}
754
+ for k, e in self.slow_requests.items()
755
+ ],
756
+ "lifecycle_events": [ev.to_dict() for ev in self.lifecycle_events],
757
+ "total_received": self.total_received,
758
+ "decode_errors": self.decode_errors,
759
+ }
760
+
761
+ @staticmethod
762
+ def _sample_to_dict(sample: Sample) -> dict:
763
+ return {
764
+ "type": "sample",
765
+ "kind": sample.kind_name,
766
+ "pid": sample.pid,
767
+ "seq": sample.seq,
768
+ "stamp": sample.stamp,
769
+ "fields": {
770
+ k: (v.decode("utf-8", errors="replace") if isinstance(v, bytes) else v)
771
+ for k, v in sample.fields.items()
772
+ },
773
+ }