mod-wsgi-telemetry 1.0.0.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mod_wsgi/__init__.py +2 -0
- mod_wsgi/telemetry/__init__.py +1 -0
- mod_wsgi/telemetry/cli.py +55 -0
- mod_wsgi/telemetry/contention.py +229 -0
- mod_wsgi/telemetry/dump.py +102 -0
- mod_wsgi/telemetry/ingest.py +773 -0
- mod_wsgi/telemetry/server.py +270 -0
- mod_wsgi/telemetry/simulate.py +649 -0
- mod_wsgi/telemetry/static/index.html +8368 -0
- mod_wsgi/telemetry/tui.py +1536 -0
- mod_wsgi/telemetry/wire.py +480 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/METADATA +79 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/RECORD +16 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/WHEEL +4 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/entry_points.txt +2 -0
- mod_wsgi_telemetry-1.0.0.dev2.dist-info/licenses/LICENSE +202 -0
mod_wsgi/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string. PEP 440 normalises the "1.0.0dev2" spelling to
# "1.0.0.dev2".
__version__ = "1.0.0dev2"
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Top-level dispatcher for mod_wsgi-telemetry subcommands.
|
|
2
|
+
|
|
3
|
+
Recognises ``serve``, ``top``, ``dump``, ``simulate`` and forwards the
|
|
4
|
+
remaining argv to the subcommand's own ``main(argv)``. Bare invocation
|
|
5
|
+
(no arguments) runs ``serve``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from importlib import import_module
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Subcommand registry: name -> (dotted module path, one-line description).
# The module is imported lazily by ``main`` and must expose ``main(argv)``;
# ``_print_usage`` lists the entries in this insertion order.
_SUBCOMMANDS = {
    "serve": (
        "mod_wsgi.telemetry.server",
        "Run the ingestor and web UI (default).",
    ),
    "top": (
        "mod_wsgi.telemetry.tui",
        "Curses terminal monitor.",
    ),
    "dump": (
        "mod_wsgi.telemetry.dump",
        "Bind the listen socket and print decoded samples.",
    ),
    "simulate": (
        "mod_wsgi.telemetry.simulate",
        "Emit synthetic samples for UI development.",
    ),
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _print_usage(stream) -> None:
    """Write the top-level usage summary to ``stream``.

    The summary consists of a usage line, one line per entry of
    ``_SUBCOMMANDS`` (name padded to nine columns, followed by its
    description), and a pointer to the per-command ``--help``.
    """
    heading = [
        "usage: mod_wsgi-telemetry <command> [options]",
        "",
        "commands:",
    ]
    command_lines = [
        f" {name:9s} {desc}" for name, (_, desc) in _SUBCOMMANDS.items()
    ]
    trailer = [
        "",
        "Run 'mod_wsgi-telemetry <command> --help' for command-specific options.",
    ]
    # One print() per line, exactly as many writes as lines of output.
    for line in (*heading, *command_lines, *trailer):
        print(line, file=stream)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def main(argv: list[str] | None = None) -> int:
    """Dispatch to the subcommand named by the first argument.

    ``argv`` defaults to ``sys.argv[1:]``. With no arguments the ``serve``
    subcommand runs with an empty argument list. ``-h`` / ``--help`` as the
    first argument prints the usage summary to stdout and returns 0. An
    unrecognised first argument prints an error plus the usage summary to
    stderr and returns 2. Otherwise the subcommand's module is imported and
    the value of its ``main(rest)`` is returned.
    """
    args = list(sys.argv[1:] if argv is None else argv)

    if args and args[0] in ("-h", "--help"):
        _print_usage(sys.stdout)
        return 0

    if args and args[0] not in _SUBCOMMANDS:
        print(f"mod_wsgi-telemetry: unknown subcommand or option {args[0]!r}",
              file=sys.stderr)
        _print_usage(sys.stderr)
        return 2

    # Bare invocation falls back to "serve" with no further arguments.
    cmd, rest = (args[0], args[1:]) if args else ("serve", [])

    module_name = _SUBCOMMANDS[cmd][0]
    # Rewrite argv[0] before handing over to the subcommand's main().
    sys.argv[0] = f"mod_wsgi-telemetry {cmd}"
    target = import_module(module_name)
    return target.main(rest)


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""GIL contention coefficient computation.
|
|
2
|
+
|
|
3
|
+
When a Python thread blocks waiting for the GIL, its wait time clusters
|
|
4
|
+
at multiples of ``sys.setswitchinterval`` (default 5 ms): a head bucket
|
|
5
|
+
below 1 ms for immediate handoffs, then bumps near s, 2*s, 3*s, ... — one
|
|
6
|
+
extra switch-interval cycle per missed handoff. Under fair contention
|
|
7
|
+
the per-cycle handoff success probability `q` is roughly constant and
|
|
8
|
+
the bump heights follow a geometric distribution::
|
|
9
|
+
|
|
10
|
+
P(k cycles) = (1 - q) ** k * q
|
|
11
|
+
|
|
12
|
+
Fitting `q` from aggregated HDR-bucket counts yields a single contention
|
|
13
|
+
coefficient that is more interpretable than p95 / p99 of GIL-wait time:
|
|
14
|
+
high `q` means waits typically resolve in one handoff, low `q` means
|
|
15
|
+
threads consistently lose multiple cycles (convoy).
|
|
16
|
+
|
|
17
|
+
The wire format reports the active switch-interval value as field 13;
|
|
18
|
+
the consumer bands the aggregated GIL-wait HDR buckets at multiples of
|
|
19
|
+
that interval and fits the geometric on `c1..c3`. The k=0 (immediate)
|
|
20
|
+
band is intentionally excluded from the fit because it is contaminated
|
|
21
|
+
by voluntary GIL releases (I/O drops where the holder happens to release
|
|
22
|
+
before its check fires); these have nothing to do with contention but
|
|
23
|
+
inflate the head bucket.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import math
|
|
29
|
+
from typing import Iterable
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def cycle_band_counts(
    buckets: list[int],
    bucket_bounds: list[tuple[float, float]],
    switch_interval_s: float,
) -> list[int]:
    """Aggregate HDR bucket counts into switch-interval cycle bands.

    With ``s = switch_interval_s`` the bands, indexed by cycles waited, are::

        k=0      [0,     0.8*s)   immediate / voluntary release
        k=1      [0.8*s, 1.8*s)   one missed handoff
        k=2      [1.8*s, 2.8*s)   two missed handoffs
        k=3      [2.8*s, 3.8*s)
        k=4plus  [3.8*s, inf)     tail

    Each bucket with a positive count is credited to the band containing
    its midpoint. Two special cases apply when computing the midpoint:

    * the first bucket (index 0) always uses a lower edge of zero,
      whatever lower bound was passed in;
    * a bucket whose upper bound is infinite uses its lower edge as the
      midpoint.

    ``buckets`` and ``bucket_bounds`` are paired positionally; pairing
    stops at the shorter of the two.

    Returns a 5-element list ``[c0, c1, c2, c3, c4plus]``.
    """
    interval = switch_interval_s
    cuts = [
        0.0,
        0.8 * interval,
        1.8 * interval,
        2.8 * interval,
        3.8 * interval,
        math.inf,
    ]
    # Consecutive cut points form the half-open [start, stop) band ranges.
    band_ranges = list(zip(cuts[:-1], cuts[1:]))

    totals = [0, 0, 0, 0, 0]
    paired = zip(buckets, bucket_bounds)
    for index, (count, (lower, upper)) in enumerate(paired):
        if count <= 0:
            continue
        if index == 0:
            # The head bucket logically starts at zero for band assignment.
            lower = 0.0
        midpoint = lower if math.isinf(upper) else 0.5 * (lower + upper)
        band = next(
            (
                k
                for k, (start, stop) in enumerate(band_ranges)
                if start <= midpoint < stop
            ),
            None,
        )
        # A midpoint outside every band (e.g. negative) is not counted.
        if band is not None:
            totals[band] += count
    return totals
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _fit_geometric_decay(
    points: list[tuple[float, float, float]],
) -> dict | None:
    """Weighted log-linear fit of log(c[k]) vs k.

    ``points`` is a list of ``(x, log(count), weight)`` triples. A weighted
    least-squares line ``y = intercept + slope * x`` is fitted and the
    per-cycle success probability is derived as ``q = 1 - exp(slope)``.

    Returns ``{"q": float, "r2": float}`` when the fit meets the
    geometric-decay assumption (negative slope, valid q in (0, 1),
    R² ≥ 0.5). Returns ``None`` otherwise, including when there are fewer
    than two points, when the total weight is not positive, or when the
    x or y values have no spread.
    """
    if len(points) < 2:
        return None

    sum_w = sum(w for _, _, w in points)
    # Weighted means are undefined without positive total weight; report
    # "no fit" instead of raising ZeroDivisionError.
    if sum_w <= 0.0:
        return None

    mean_x = sum(w * x for x, _, w in points) / sum_w
    mean_y = sum(w * y for _, y, w in points) / sum_w
    var_x = sum(w * (x - mean_x) ** 2 for x, _, w in points) / sum_w
    cov_xy = sum(
        w * (x - mean_x) * (y - mean_y) for x, y, w in points
    ) / sum_w

    # All weight on a single x value: the slope is undefined.
    if var_x <= 0.0:
        return None

    slope = cov_xy / var_x
    # Geometric decay requires counts to fall as k grows.
    if slope >= 0.0:
        return None

    q = 1.0 - math.exp(slope)
    # exp() can underflow to 0.0 for very steep slopes, giving q == 1.0.
    if not (0.0 < q < 1.0):
        return None

    intercept = mean_y - slope * mean_x
    # ss_res and var_y are both left un-normalised by sum_w; the factor
    # cancels in the ratio below.
    ss_res = sum(
        w * (y - (intercept + slope * x)) ** 2 for x, y, w in points
    )
    var_y = sum(w * (y - mean_y) ** 2 for _, y, w in points)
    if var_y <= 0.0:
        return None
    r2 = 1.0 - ss_res / var_y

    if r2 < 0.5:
        return None

    return {"q": q, "r2": r2}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def contention_coefficient(
    buckets: list[int],
    bucket_bounds: list[tuple[float, float]],
    switch_interval_s: float,
) -> dict | None:
    """Compute the GIL contention coefficient from aggregated HDR buckets.

    The buckets are first summed into cycle bands ``[c0, c1, c2, c3,
    c4plus]`` via ``cycle_band_counts``. Up to two geometric-decay fits
    are then attempted, in order:

    1. **primary** over ``c1..c3``, tried only when ``c1 >= c2``.
       ``c1 < c2`` is treated as the signature of ``c1`` being
       contaminated by the HDR head bucket.
    2. **fallback** over ``c2..c4plus``, tried when the primary fit is
       skipped, has fewer than 100 events, or is rejected by
       ``_fit_geometric_decay``. ``c4plus`` lumps the k=4 band together
       with everything beyond it, so this fit is noisier.

    Non-empty bands are passed to the fit with x counted from the first
    band of the attempt and weighted by their count.

    Returns ``None`` when ``switch_interval_s`` is not positive or when no
    attempt has at least 100 events and an accepted fit. Otherwise
    returns a dict::

        {
            "q": float,            # per-cycle handoff success probability
            "r": float,            # 1 - q, convoy persistence
            "band_counts": [c0, c1, c2, c3, c4plus],
            "fit_r2": float,       # R^2 of the log-linear fit
            "n_total": int,        # total events across all bands
            "n_fit": int,          # events in the bands used for the fit
            "fit_kind": str,       # "primary" or "fallback_c2_c4plus"
        }
    """
    if switch_interval_s <= 0.0:
        return None

    bands = cycle_band_counts(buckets, bucket_bounds, switch_interval_s)
    n_total = sum(bands)

    # Ordered list of (fit_kind, band indices) to try.
    attempts = []
    if bands[1] >= bands[2]:
        attempts.append(("primary", (1, 2, 3)))
    attempts.append(("fallback_c2_c4plus", (2, 3, 4)))

    for fit_kind, ks in attempts:
        n_fit = sum(bands[k] for k in ks)
        if n_fit < 100:
            continue

        first = ks[0]
        # Empty bands are left out: log(0) is undefined.
        points = [
            (float(k - first), math.log(bands[k]), float(bands[k]))
            for k in ks
            if bands[k] > 0
        ]
        fit = _fit_geometric_decay(points)
        if fit is None:
            continue

        q = fit["q"]
        return {
            "q": q,
            "r": 1.0 - q,
            "band_counts": bands,
            "fit_r2": fit["r2"],
            "n_total": n_total,
            "n_fit": n_fit,
            "fit_kind": fit_kind,
        }

    return None
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def decay_label(q: float) -> str:
    """Map ``q`` to a verbal tier describing convoy-decay shape.

    The tier describes shape, not severity: ``q`` is the per-cycle handoff
    success probability from the geometric fit, so a high value means a
    missed cycle rarely chains into more, and a low value means convoys
    persist. Judging severity also needs the wall-clock cost per cycle
    (the switch interval) and the mean GIL wait time.

    Tiers: ``"transient"`` for ``q >= 0.7``, ``"compounding"`` for
    ``q >= 0.4``, ``"convoy"`` for anything else.
    """
    tiers = (
        (0.7, "transient"),
        (0.4, "compounding"),
    )
    for floor, label in tiers:
        if q >= floor:
            return label
    return "convoy"
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""CLI pretty-printer for the telemetry wire format.
|
|
2
|
+
|
|
3
|
+
Binds the listening socket itself (so don't run this at the same time as
|
|
4
|
+
the ingester), receives datagrams, decodes, prints.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
mod_wsgi-telemetry dump --listen unix:/tmp/mod_wsgi-telemetry.sock
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import socket
|
|
16
|
+
import sys
|
|
17
|
+
from datetime import datetime, timezone
|
|
18
|
+
|
|
19
|
+
from .ingest import open_socket
|
|
20
|
+
from .wire import decode
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _fmt_value(v):
    """Return a printable form of a decoded field value.

    ``bytes`` values are decoded as UTF-8, falling back to a hex string
    when they are not valid UTF-8. Every other value is returned as is.
    """
    if not isinstance(v, bytes):
        return v
    try:
        text = v.decode("utf-8")
    except UnicodeDecodeError:
        return v.hex()
    return text
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _parse_octal_mode(s: str) -> int:
    """argparse ``type=`` converter for an octal permission mode.

    Accepts the forms ``int(s, 8)`` accepts (e.g. ``660``, ``0660``,
    ``0o660``) and returns the mode as an int.

    Raises ``argparse.ArgumentTypeError`` when ``s`` is not octal, or when
    the parsed value is outside ``0..0o7777`` (the permission bits plus
    setuid/setgid/sticky). A negative or oversized value is not a
    meaningful mode.
    """
    try:
        mode = int(s, 8)
    except ValueError:
        # The ValueError adds nothing to the message; suppress the chain.
        raise argparse.ArgumentTypeError(
            f"socket-mode must be octal (e.g. 0660 or 660), got {s!r}"
        ) from None
    if not 0 <= mode <= 0o7777:
        raise argparse.ArgumentTypeError(
            f"socket-mode must be between 0 and 7777 (octal), got {s!r}")
    return mode
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def main(argv: list[str] | None = None) -> int:
    """Bind the listen socket and print every decoded datagram.

    Parses ``argv`` (argparse falls back to ``sys.argv[1:]`` when it is
    ``None``), opens the socket via ``open_socket`` and then loops:
    receive a datagram, decode it, and print it either as one JSON object
    per line (``--format json``) or as a header line followed by one
    ``name = value`` line per field (``--format text``, the default).

    Datagrams that fail to decode are reported on stderr and do not count
    towards ``--count``. The loop ends after ``--count`` printed samples
    (0 means run until interrupted) or on Ctrl-C; the socket is closed on
    the way out. Always returns 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--listen", default="unix:/tmp/mod_wsgi-telemetry.sock")
    parser.add_argument(
        "--socket-mode",
        type=_parse_octal_mode,
        default=0o660,
        metavar="MODE",
        help="Octal permission mode for the UNIX socket "
             "(default: 0660).",
    )
    parser.add_argument(
        "--socket-group",
        default=None,
        metavar="GROUP",
        help="Group name or numeric GID to chown the UNIX "
             "socket to.",
    )
    parser.add_argument("--format", choices=["text", "json"], default="text")
    parser.add_argument(
        "--count",
        type=int,
        default=0,
        help="stop after N samples (0 = forever)",
    )
    opts = parser.parse_args(argv)

    # An all-digit group is passed on as a numeric GID rather than a name.
    group: str | int | None = opts.socket_group
    if isinstance(group, str) and group.isdigit():
        group = int(group)

    sock = open_socket(opts.listen, mode=opts.socket_mode, group=group)
    sock.setblocking(True)

    want_json = opts.format == "json"
    limit = opts.count
    emitted = 0
    try:
        while True:
            datagram, _ = sock.recvfrom(65536)
            try:
                sample = decode(datagram)
            except Exception as exc:
                # Best effort: report the bad datagram and keep listening.
                print(f"decode error: {exc} (len={len(datagram)})", file=sys.stderr)
                continue

            if want_json:
                record = {
                    "kind": sample.kind_name,
                    "pid": sample.pid,
                    "seq": sample.seq,
                    "stamp": sample.stamp,
                    "fields": {
                        name: _fmt_value(value)
                        for name, value in sample.fields.items()
                    },
                }
                print(json.dumps(record))
            else:
                when = datetime.fromtimestamp(sample.stamp, tz=timezone.utc)
                ts = when.isoformat(timespec="milliseconds")
                print(
                    f"\n[{ts}] pid={sample.pid} seq={sample.seq} "
                    f"kind={sample.kind_name} v{sample.version}"
                )
                for name, value in sample.fields.items():
                    print(f" {name:30s} = {_fmt_value(value)}")

            emitted += 1
            # The limit is checked only after a sample has been printed.
            if limit and emitted >= limit:
                break
    except KeyboardInterrupt:
        pass
    finally:
        sock.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())
|