overlaat 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
overlaat/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Overlaat: fair-queueing + usage accounting sidecar for self-hosted LLM gateways."""
2
+
3
+ __version__ = "0.0.1"
@@ -0,0 +1,437 @@
1
+ #!/usr/bin/env python3
2
+ """Overlaat host sampler (OPTIONAL, macOS-only).
3
+
4
+ Samples CPU/GPU/RAM every INTERVAL_S seconds and writes one row to the
5
+ `host_samples` table (Postgres — the same DB the gateway logs to). Each row
6
+ carries host totals (mem/cpu/gpu) plus a per-backend RSS breakdown
7
+ (`backends_json`) — the memory holders that actually fill the machine's RAM.
8
+
9
+ This component is OPTIONAL. Overlaat's core value (fair queueing + one honest
10
+ lifecycle event per request) does not depend on it; the host sampler only adds
11
+ host-level context (who is holding memory, host GPU%) alongside the request
12
+ events. It is macOS-specific: it shells out to `powermetrics`, `vm_stat`, and
13
+ `ps`. On other platforms you would replace this module with an equivalent
14
+ sampler (e.g. nvidia-smi + /proc on Linux) writing the same `host_samples` rows.
15
+
16
+ Why RSS and not per-process GPU: on macOS, per-process GPU is not measurable for
17
+ Metal/MLX workloads (powermetrics --show-process-gpu reports 0 ms/s). RSS IS
18
+ measurable per process, and RSS is what causes the Metal OOM on a memory-bound
19
+ box. So we attribute MEMORY by RSS and leave GPU% host-wide.
20
+
21
+ To read powermetrics it needs root. Run it as root (e.g. via a system service /
22
+ launch daemon) or grant the invoking user passwordless sudo for powermetrics.
23
+ Stdlib-only (no venv) -> it writes to Postgres via the `psql` CLI. The DB is
24
+ expected to be up; there is no on-disk fallback.
25
+
26
+ Environment overrides:
27
+ METRICS_DB_URL Postgres connection URL (else read DATABASE_URL from the env
28
+ file at OVERLAAT_ENV, default ./overlaat.env).
29
+ OVERLAAT_ENV path to an env file holding DATABASE_URL (default
30
+ ./overlaat.env).
31
+ PSQL path to the psql binary (default: "psql" on PATH).
32
+ SLOT_RUNNING_URL optional URL of a model-swap server's /running endpoint, used
33
+ only for cold-load tracking (default off).
+ TOTAL_MEM_GB host RAM size in GB, recorded as the sample's total_gb (default 256.0).
34
+ """
35
+
36
+ from __future__ import annotations # support older Python 3.9 (PEP 604 unions)
37
+
38
+ import argparse
39
+ import json
40
+ import os
41
+ import re
42
+ import signal
43
+ import subprocess
44
+ import sys
45
+ import time
46
+ import urllib.request
47
+ from datetime import UTC, datetime
48
+ from pathlib import Path
49
+
50
INTERVAL_S = 5  # target seconds between samples (small, but ~12x denser than 60s)
POWERMETRICS_DUR_MS = 1000  # powermetrics sampling window in ms (one 1s sample per loop)
PAGE_SIZE = 16384  # Apple Silicon page size in bytes; converts vm_stat page counts
# Host RAM size in GB, read from the TOTAL_MEM_GB environment variable (default
# 256.0). It is not auto-detected; a non-numeric value raises ValueError at import.
TOTAL_MEM_GB = float(os.environ.get("TOTAL_MEM_GB", "256.0"))
RSS_MIN_GB = 1.0  # only record processes holding >= this much RAM
RSS_TOP_N = 20  # cap the per-sample backend list

# psql binary: default to whatever is on PATH; override with PSQL.
PSQL = os.environ.get("PSQL", "psql")

# Env file holding DATABASE_URL (kept out of this repo). Override with OVERLAAT_ENV.
OVERLAAT_ENV = Path(os.environ.get("OVERLAAT_ENV", "./overlaat.env"))

# Executable-name prefixes of the backends whose RSS we care to attribute. macOS
# `ps` exposes the full command line, so derive_name() prefix-matches these against
# the executable basename. Adapt this list to the backends you actually run
# (inference servers, runtimes, the gateway, the database). It is only used to
# recognize port-suffixed names; any process over RSS_MIN_GB is still recorded
# regardless.
BACKEND_EXE_PREFIXES = (
    "python",  # e.g. an mlx/transformers server launched via python
    "ollama",  # a local model server + its runner subprocesses
    "model-server",  # a custom inference engine binary (rename to yours)
    "mlx_lm",  # an MLX language-model server
)

# Optional: a model-swap server's /running endpoint for cold-load tracking.
# Empty (the default) disables cold-load tracking entirely.
SLOT_RUNNING_URL = os.environ.get("SLOT_RUNNING_URL", "")
78
+
79
+
80
def database_url() -> str:
    """Return the Postgres connection URL the sampler writes to.

    Resolution order:
      1. the METRICS_DB_URL environment variable, when set and non-empty;
      2. the first DATABASE_URL assignment found in the env file at OVERLAAT_ENV.

    Env-file lines may be indented, may carry a leading ``export``, and may wrap
    the value in one pair of matching single or double quotes (the usual
    ``.env`` spellings); the quotes are not part of the returned URL.

    Returns:
        The URL, or "" when neither source provides one.
    """
    override = os.environ.get("METRICS_DB_URL")
    if override:
        return override
    if not OVERLAAT_ENV.exists():
        return ""
    for raw in OVERLAAT_ENV.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if not line.startswith("DATABASE_URL="):
            continue
        value = line.split("=", 1)[1].strip()
        # Strip one pair of surrounding quotes so psql is not handed a URL that
        # literally starts with a quote character.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        return value
    return ""
90
+
91
+
92
+ DB_URL = database_url()
93
+
94
+
95
def vm_stat() -> dict:
    """Return host memory usage in GB, parsed from `/usr/bin/vm_stat`.

    vm_stat prints page counts. They are converted to GB with the page size
    vm_stat itself announces in its header line ("page size of N bytes"), so
    the figures stay correct on hosts whose page size is not 16 KiB; PAGE_SIZE
    is used only when that header cannot be parsed.

    Returns:
        A dict with ``total_gb`` (the configured TOTAL_MEM_GB, not measured)
        and ``free_gb``, ``active_gb``, ``inactive_gb``, ``wired_gb``,
        ``compressed_gb``, ``speculative_gb`` (each rounded to 2 decimals;
        0.0 when vm_stat did not print that counter).

    Raises:
        subprocess.CalledProcessError: vm_stat exited non-zero.
        OSError: vm_stat could not be executed.
    """
    out = subprocess.check_output(["/usr/bin/vm_stat"], text=True)

    # Header looks like: "Mach Virtual Memory Statistics: (page size of 16384 bytes)".
    header = re.search(r"page size of (\d+) bytes", out)
    page_size = int(header.group(1)) if header else PAGE_SIZE
    if page_size <= 0:
        page_size = PAGE_SIZE

    # Counter lines look like "Pages wired down:   123456." -> {"pages_wired_down": 123456}.
    pages = {}
    for line in out.splitlines():
        m = re.match(r"^([^:]+):\s+(\d+)\.?$", line.strip())
        if m:
            pages[m.group(1).lower().replace(" ", "_")] = int(m.group(2))

    def gb(key: str) -> float:
        return round(pages.get(key, 0) * page_size / 1024**3, 2)

    return {
        "total_gb": TOTAL_MEM_GB,
        "free_gb": gb("pages_free"),
        "active_gb": gb("pages_active"),
        "inactive_gb": gb("pages_inactive"),
        "wired_gb": gb("pages_wired_down"),
        "compressed_gb": gb("pages_occupied_by_compressor"),
        "speculative_gb": gb("pages_speculative"),
    }
115
+
116
+
117
def loadavg() -> dict:
    """Return the 1/5/15-minute system load averages.

    Uses os.getloadavg() instead of spawning `/usr/bin/uptime` and
    regex-parsing its human-oriented output: no subprocess per sample and no
    dependence on how uptime formats the numbers.

    Returns:
        {"load1": float, "load5": float, "load15": float}, each rounded to two
        decimals (the precision uptime displays), or {} when the load average
        cannot be obtained.
    """
    try:
        one, five, fifteen = os.getloadavg()
    except (OSError, AttributeError):
        # OSError: the load average is unobtainable; AttributeError: the
        # platform has no getloadavg at all.
        return {}
    return {
        "load1": round(one, 2),
        "load5": round(five, 2),
        "load15": round(fifteen, 2),
    }
123
+
124
+
125
def powermetrics_sample() -> tuple[dict, list[dict]]:
    """Take one powermetrics sample and return (gpu_summary, per_process_gpu).

    gpu_summary is {"active_pct": float|None, "freq_mhz": int|None}, read from
    the "GPU HW active residency" / "GPU HW active frequency" lines.
    per_process_gpu lists {"name", "pid", "gpu_pct"} for task-table rows whose
    GPU ms/s is above zero; it is kept only to annotate the backend list and is
    ~always empty for Metal/MLX workloads (see module doc).

    Any failure to run powermetrics (missing binary, sudo refusing, timeout,
    non-zero exit) yields ({"active_pct": None, "freq_mhz": None}, []).
    """
    argv = [
        "/usr/bin/powermetrics",
        "--samplers",
        "gpu_power,tasks",
        "--show-process-gpu",
        "-i",
        str(POWERMETRICS_DUR_MS),
        "-n",
        "1",
    ]
    if os.geteuid() != 0:
        # Not root: go through sudo -n, which needs passwordless sudo for
        # powermetrics (e.g. a NOPASSWD sudoers entry).
        argv = ["/usr/bin/sudo", "-n"] + argv
    try:
        report = subprocess.check_output(
            argv, text=True, stderr=subprocess.DEVNULL, timeout=15
        )
    except Exception:
        return {"active_pct": None, "freq_mhz": None}, []

    report_lines = report.splitlines()

    # Pass 1: host-wide GPU residency and frequency.
    active_pct = None
    freq_mhz = None
    for text in report_lines:
        if "GPU HW active residency" in text:
            hit = re.search(r"(\d+\.\d+)%", text)
            if hit:
                active_pct = float(hit.group(1))
        elif "GPU HW active frequency" in text:
            hit = re.search(r"(\d+)\s*MHz", text)
            if hit:
                freq_mhz = int(hit.group(1))

    # Pass 2: the per-task table, which starts after a "Name ID CPU ..." header
    # and ends at a blank line, a "***" banner, or the "ALL_..." totals row.
    gpu_users = []
    inside_table = False
    for text in report_lines:
        if re.match(r"^Name\s+ID\s+CPU", text):
            inside_table = True
        elif inside_table:
            if not text.strip() or text.startswith(("***", "ALL_")):
                inside_table = False
                continue
            cols = text.split()
            if len(cols) < 9:
                continue
            try:
                pid = int(cols[1])
                ms_per_s = float(cols[-1])  # ms/s -> % via /10
            except (ValueError, IndexError):
                continue
            if ms_per_s > 0:
                gpu_users.append(
                    {"name": cols[0], "pid": pid, "gpu_pct": round(ms_per_s / 10, 1)}
                )
    return {"active_pct": active_pct, "freq_mhz": freq_mhz}, gpu_users
182
+
183
+
184
def derive_name(cmd: str) -> str:
    """Build a short, readable process name from a full command line.

    Rules, applied in order:
      * empty or whitespace-only command       -> "?"
      * "postgres: <word> ..." process title   -> "postgres-<word>"
      * anything starting with "sshd-session"  -> "sshd-session"
      * "ollama runner [--model X]"            -> "ollama-runner[-X]"
      * "ollama serve"                         -> "ollama-serve"
      * executable matching BACKEND_EXE_PREFIXES and given a port
                                               -> "<exe>-<port>"; for python
        executables, the first path component ending in -server / -api of an
        absolute-path argument replaces <exe> when there is one
      * everything else                        -> the executable's basename

    The port is read from the first "--port N" or "--port=N" argument.

    Examples:
      /System/Library/.../WindowServer -daemon              -> WindowServer
      /opt/model-server --host ... --port 8086              -> model-server-8086
      /Applications/Ollama.app/.../ollama runner --model X  -> ollama-runner-X
      /.../python3 /srv/chat-api/main.py --port 8083        -> chat-api-8083
      /.../python3 -m uvicorn server:app --port 8083        -> python3-8083
    """
    if not cmd:
        return "?"
    if cmd.startswith("postgres:"):
        rest = cmd[len("postgres:"):].strip().split()
        return f"postgres-{rest[0]}" if rest else "postgres"
    if cmd.startswith("sshd-session"):
        return "sshd-session"

    tokens = cmd.split()
    if not tokens:
        # Whitespace-only command line: there is no executable to name.
        return "?"
    exe = os.path.basename(tokens[0]) or "?"

    port = None
    for i, tok in enumerate(tokens):
        if tok == "--port" and i + 1 < len(tokens):
            port = tokens[i + 1]
            break
        if tok.startswith("--port="):
            port = tok.split("=", 1)[1]
            break

    if exe == "ollama" and len(tokens) > 1:
        if tokens[1] == "runner":
            for i, tok in enumerate(tokens):
                if tok == "--model" and i + 1 < len(tokens):
                    return f"ollama-runner-{tokens[i + 1]}"
            return "ollama-runner"
        if tokens[1] == "serve":
            return "ollama-serve"

    if port and exe.startswith(BACKEND_EXE_PREFIXES):
        if exe.startswith("python"):
            # A python-hosted server: try to recover a meaningful name from an
            # absolute path argument with a component ending in -server / -api.
            for tok in tokens[1:]:
                if tok.startswith("/"):
                    for part in tok.split("/"):
                        if part.endswith(("-server", "-api")):
                            return f"{part}-{port}"
        return f"{exe}-{port}"
    return exe
226
+
227
+
228
def ps_snapshot() -> dict[int, dict]:
    """Snapshot every process as pid -> {pid, name, cpu_pct, rss_gb, gpu_pct}.

    Runs `/bin/ps -axo pid,%cpu,rss,command`. The full `command` column is
    used because macOS `comm` is truncated to 16 chars; derive_name() turns it
    into a readable name. RSS is converted from ps's KB figure to GB (2
    decimals). gpu_pct starts at 0.0 for every process. Rows that do not parse
    are skipped.
    """
    listing = subprocess.check_output(
        ["/bin/ps", "-axo", "pid,%cpu,rss,command"], text=True
    )
    snapshot: dict[int, dict] = {}
    rows = listing.splitlines()[1:]  # drop the column-header row
    for row in rows:
        fields = row.split(None, 3)
        if len(fields) < 4:
            continue
        pid_text, cpu_text, rss_text, command = fields
        try:
            pid, cpu_pct, rss_kb = int(pid_text), float(cpu_text), int(rss_text)
        except ValueError:
            continue
        snapshot[pid] = dict(
            pid=pid,
            name=derive_name(command),
            cpu_pct=cpu_pct,
            rss_gb=round(rss_kb / 1024**2, 2),
            gpu_pct=0.0,
        )
    return snapshot
252
+
253
+
254
def backend_breakdown(processes: dict[int, dict], gpu_procs: list[dict]) -> list[dict]:
    """Return the top memory holders, largest RSS first.

    First copies each powermetrics GPU% onto the matching entry of
    `processes` (by pid, mutating that entry in place; usually nothing
    matches for Metal/MLX). Then keeps the processes whose rss_gb is at least
    RSS_MIN_GB, orders them by descending rss_gb, and returns at most
    RSS_TOP_N of them.
    """
    for gpu_entry in gpu_procs:
        target = processes.get(gpu_entry["pid"])
        if target:
            target["gpu_pct"] = gpu_entry["gpu_pct"]
    heavy = sorted(
        (proc for proc in processes.values() if proc["rss_gb"] >= RSS_MIN_GB),
        key=lambda proc: -proc["rss_gb"],
    )
    return heavy[:RSS_TOP_N]
264
+
265
+
266
def sample() -> dict:
    """Collect one complete host sample.

    Gathers, in this order: wall-clock time, memory (vm_stat), load averages
    (loadavg), GPU (powermetrics_sample) and the process table (ps_snapshot),
    then reduces the process table to the top memory holders.

    Returns a dict with keys ts (UTC, "YYYY-MM-DDTHH:MM:SSZ"), ts_epoch
    (seconds, 3 decimals), interval_s, cpu, mem, gpu and backends.
    """
    now_epoch = time.time()
    # NOTE(review): `datetime.UTC` exists only on Python 3.11+, while the module
    # header mentions supporting 3.9 -- confirm the minimum supported version.
    stamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
    memory = vm_stat()
    load = loadavg()
    gpu_summary, gpu_users = powermetrics_sample()
    record = {
        "ts": stamp,
        "ts_epoch": round(now_epoch, 3),
        "interval_s": INTERVAL_S,
        "cpu": load,
        "mem": memory,
        "gpu": gpu_summary,
    }
    record["backends"] = backend_breakdown(ps_snapshot(), gpu_users)
    return record
283
+
284
+
285
def _num(x) -> str:
    """Render `x` as the SQL literal for a numeric column.

    None becomes NULL; anything else is converted with float() and rendered
    with repr(), so 5 -> "5.0".
    """
    if x is None:
        return "NULL"
    return repr(float(x))
288
+
289
+
290
def write_sample_pg(rec: dict) -> None:
    """INSERT one host sample into `host_samples` through the psql CLI.

    `rec` is a record as produced by sample(). The backend list is stored as
    jsonb; single quotes inside the JSON text are doubled so it can sit inside
    a SQL string literal. A row whose ts already exists is left untouched
    (ON CONFLICT DO NOTHING).

    Raises:
        RuntimeError: no database URL is configured.
        subprocess.CalledProcessError / subprocess.TimeoutExpired: psql failed
            or took longer than 15s. The caller logs these.
    """
    if not DB_URL:
        raise RuntimeError("no DATABASE_URL")
    mem, gpu, cpu = rec["mem"], rec["gpu"], rec["cpu"]
    backends_literal = json.dumps(rec["backends"], ensure_ascii=False).replace("'", "''")

    columns = (
        "ts",
        "gpu_pct",
        "gpu_freq_mhz",
        "ram_total_gb",
        "ram_wired_gb",
        "ram_active_gb",
        "ram_inactive_gb",
        "ram_compressed_gb",
        "ram_free_gb",
        "cpu_load1",
        "cpu_load5",
        "backends_json",
    )
    # One value per column above, in the same order.
    values = (
        f"{rec['ts_epoch']:.3f}",
        _num(gpu.get("active_pct")),
        _num(gpu.get("freq_mhz")),
        _num(mem.get("total_gb")),
        _num(mem.get("wired_gb")),
        _num(mem.get("active_gb")),
        _num(mem.get("inactive_gb")),
        _num(mem.get("compressed_gb")),
        _num(mem.get("free_gb")),
        _num(cpu.get("load1")),
        _num(cpu.get("load5")),
        f"'{backends_literal}'::jsonb",
    )
    statement = (
        f"INSERT INTO host_samples ({','.join(columns)}) "
        f"VALUES ({','.join(values)}) "
        "ON CONFLICT (ts) DO NOTHING;"
    )
    psql_argv = [PSQL, DB_URL, "-v", "ON_ERROR_STOP=1", "-q", "-c", statement]
    subprocess.run(psql_argv, check=True, capture_output=True, text=True, timeout=15)
316
+
317
+
318
# -- cold-load tracking (optional) ----------------------------------------------
# If you run a model-swap server that loads one large model at a time, its
# /running endpoint reports each member's state (starting -> ready -> [evicted]).
# We poll it once per loop iteration and emit an explicit load row when a model
# becomes ready after a non-ready period, so cold-load time stops hiding inside
# the first request's TTFT. State is held in module memory across loop
# iterations (it does not survive a restart). Disabled when SLOT_RUNNING_URL is
# empty.
_load_state: dict[str, dict] = {}  # model -> {"state": str, "t_start": float|None}
326
+
327
+
328
def poll_slot_server() -> dict[str, str]:
    """Fetch the swap server's running members as {model: state}.

    A missing or null state is reported as "". Returns {} when cold-load
    tracking is disabled (SLOT_RUNNING_URL unset), when the endpoint cannot be
    reached within 2s, or when the response is not the expected JSON shape.
    """
    if not SLOT_RUNNING_URL:
        return {}
    try:
        with urllib.request.urlopen(SLOT_RUNNING_URL, timeout=2) as response:
            payload = json.loads(response.read().decode())
        states = {}
        for member in payload.get("running", []):
            states[member["model"]] = member.get("state") or ""
        return states
    except Exception:
        # Best-effort: any network / decoding / shape problem means "nothing known".
        return {}
339
+
340
+
341
def write_model_load_pg(model: str, t_start: float, t_ready: float | None, detail: str) -> None:
    """INSERT one cold-load row into `model_loads` through the psql CLI.

    load_s is t_ready - t_start rounded to 2 decimals, or NULL when t_ready is
    None. `model` and `detail` are embedded as SQL string literals with single
    quotes doubled.

    Raises:
        RuntimeError: no database URL is configured.
        subprocess.CalledProcessError / subprocess.TimeoutExpired: psql failed
            or took longer than 15s. The caller logs these.
    """
    if not DB_URL:
        raise RuntimeError("no DATABASE_URL")

    duration = round(t_ready - t_start, 2) if t_ready is not None else None

    def quoted(text: str) -> str:
        return "'" + text.replace("'", "''") + "'"

    row = ",".join(
        [quoted(model), f"{t_start:.3f}", _num(t_ready), _num(duration), quoted(detail)]
    )
    statement = (
        "INSERT INTO model_loads (model,t_start,t_ready,load_s,detail) VALUES (" + row + ");"
    )
    psql_argv = [PSQL, DB_URL, "-v", "ON_ERROR_STOP=1", "-q", "-c", statement]
    subprocess.run(psql_argv, check=True, capture_output=True, text=True, timeout=15)
359
+
360
+
361
def track_model_loads(now_epoch: float) -> None:
    """Poll the swap server and record a load row when a model becomes ready.

    Per model, `_load_state` remembers the last seen state and, while a load
    is in progress, the time it began (t_start):
      * first sighting: remembered; t_start is armed only if it is not ready yet;
      * unchanged state: nothing to do;
      * change to a non-ready state with no load armed: arm t_start = now;
      * change to "ready" with a load armed: write the row, then disarm.
    Models missing from the poll result are forgotten so a later reload
    re-arms cleanly.

    An empty poll result (tracking disabled, server unreachable, or nothing
    running) returns immediately, before any of the above -- including the
    forgetting step. A failed write is logged to stderr and otherwise ignored.
    """
    running = poll_slot_server()
    if not running:
        return

    for name, new_state in running.items():
        entry = _load_state.get(name)
        if entry is None:
            # First sighting: only arm a load if it is mid-startup right now.
            armed_at = now_epoch if new_state != "ready" else None
            _load_state[name] = {"state": new_state, "t_start": armed_at}
            continue

        old_state = entry["state"]
        if new_state == old_state:
            continue

        became_ready = new_state == "ready"
        load_armed = entry["t_start"] is not None
        if not became_ready and not load_armed:
            entry["t_start"] = now_epoch  # load began
        elif became_ready and load_armed:
            try:
                write_model_load_pg(name, entry["t_start"], now_epoch, f"{old_state}->ready")
            except Exception as exc:  # noqa: BLE001
                sys.stderr.write(f"model-load write failed: {type(exc).__name__}: {exc}\n")
                sys.stderr.flush()
            entry["t_start"] = None  # load done
        entry["state"] = new_state

    # Forget models that are no longer running (evicted).
    for name in list(_load_state):
        if name not in running:
            del _load_state[name]
392
+
393
+
394
# Shutdown flag: set by the signal handler below and checked by main()'s loop
# condition, so the loop exits at the start of its next iteration.
_stop = False


def _on_term(_signum, _frame):
    """Signal handler: request a clean stop of the sampling loop.

    Both arguments (signal number, current stack frame) are ignored.
    """
    global _stop
    _stop = True
400
+
401
+
402
def main():
    """Command-line entry point.

    With --once: take a single sample, print it as indented JSON to stdout and
    return (nothing is written to the database).

    Otherwise: install SIGTERM/SIGINT handlers and loop until one arrives.
    Each iteration writes one host sample, then runs cold-load tracking; a
    failure in either step is logged to stderr and the loop carries on. The
    loop then sleeps for the rest of INTERVAL_S, but never less than 1s.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--once", action="store_true", help="one sample -> pretty JSON to stdout (preview mode)"
    )
    options = parser.parse_args()

    if options.once:
        print(json.dumps(sample(), ensure_ascii=False, indent=2))
        return

    def report(message: str) -> None:
        sys.stderr.write(message)
        sys.stderr.flush()

    for signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signum, _on_term)

    db_flag = "yes" if DB_URL else "NO"
    report(f"host-logger started, interval={INTERVAL_S}s, db={db_flag}\n")

    while not _stop:
        started = time.monotonic()
        try:
            write_sample_pg(sample())
        except Exception as exc:  # noqa: BLE001
            report(f"sample failed: {type(exc).__name__}: {exc}\n")
        try:
            track_model_loads(time.time())
        except Exception as exc:  # noqa: BLE001
            report(f"model-load track failed: {type(exc).__name__}: {exc}\n")
        remaining = INTERVAL_S - (time.monotonic() - started)
        # Always pause at least 1s, even when sampling overran the interval.
        time.sleep(remaining if remaining > 1.0 else 1.0)
    sys.stderr.write("host-logger stopping cleanly\n")


if __name__ == "__main__":
    main()