overlaat 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- overlaat-0.0.1/.github/workflows/ci.yml +96 -0
- overlaat-0.0.1/.gitignore +19 -0
- overlaat-0.0.1/CHANGELOG.md +59 -0
- overlaat-0.0.1/LICENSE +21 -0
- overlaat-0.0.1/PKG-INFO +270 -0
- overlaat-0.0.1/README.md +232 -0
- overlaat-0.0.1/docs/ARCHITECTURE.md +401 -0
- overlaat-0.0.1/docs/COST-SCHEDULER.md +308 -0
- overlaat-0.0.1/docs/OBSERVABILITY.md +229 -0
- overlaat-0.0.1/docs/overlaat-llm-stack.excalidraw.svg +4 -0
- overlaat-0.0.1/examples/litellm-config.example.yaml +110 -0
- overlaat-0.0.1/examples/overlaat.env.example +39 -0
- overlaat-0.0.1/examples/run-queue-proxy.sh +38 -0
- overlaat-0.0.1/examples/run-usage-api.sh +37 -0
- overlaat-0.0.1/overlaat/__init__.py +3 -0
- overlaat-0.0.1/overlaat/host_logger.py +437 -0
- overlaat-0.0.1/overlaat/metrics_db.py +525 -0
- overlaat-0.0.1/overlaat/queue_proxy.py +577 -0
- overlaat-0.0.1/overlaat/usage_api.py +568 -0
- overlaat-0.0.1/pyproject.toml +75 -0
- overlaat-0.0.1/schema.sql +73 -0
- overlaat-0.0.1/tests/conftest.py +10 -0
- overlaat-0.0.1/tests/test_metrics_helpers.py +48 -0
- overlaat-0.0.1/tests/test_metrics_views.py +84 -0
- overlaat-0.0.1/tests/test_queue_proxy_api.py +88 -0
- overlaat-0.0.1/tests/test_queue_proxy_helpers.py +51 -0
- overlaat-0.0.1/tests/test_usage_api.py +106 -0
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
tags: ['v*']
|
|
7
|
+
pull_request:
|
|
8
|
+
workflow_dispatch:
|
|
9
|
+
|
|
10
|
+
permissions:
|
|
11
|
+
contents: read
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
lint:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
- name: Install linter
|
|
22
|
+
run: pip install ruff
|
|
23
|
+
- name: Lint with ruff
|
|
24
|
+
run: ruff check .
|
|
25
|
+
|
|
26
|
+
test:
|
|
27
|
+
needs: [lint]
|
|
28
|
+
runs-on: ubuntu-latest
|
|
29
|
+
strategy:
|
|
30
|
+
matrix:
|
|
31
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
32
|
+
steps:
|
|
33
|
+
- uses: actions/checkout@v4
|
|
34
|
+
- uses: actions/setup-python@v5
|
|
35
|
+
with:
|
|
36
|
+
python-version: ${{ matrix.python-version }}
|
|
37
|
+
- name: Install package + dev deps
|
|
38
|
+
run: |
|
|
39
|
+
python -m pip install --upgrade pip
|
|
40
|
+
pip install -e ".[dev]"
|
|
41
|
+
- name: Run tests
|
|
42
|
+
run: pytest
|
|
43
|
+
|
|
44
|
+
build:
|
|
45
|
+
needs: [lint, test]
|
|
46
|
+
runs-on: ubuntu-latest
|
|
47
|
+
steps:
|
|
48
|
+
- uses: actions/checkout@v4
|
|
49
|
+
- uses: actions/setup-python@v5
|
|
50
|
+
with:
|
|
51
|
+
python-version: "3.12"
|
|
52
|
+
- name: Verify tag matches package version
|
|
53
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
54
|
+
run: |
|
|
55
|
+
PKG=$(python -c "import re; print(re.search(r'__version__ = \"([^\"]+)\"', open('overlaat/__init__.py').read()).group(1))")
|
|
56
|
+
TAG="${GITHUB_REF_NAME#v}"
|
|
57
|
+
echo "tag=$TAG package=$PKG"
|
|
58
|
+
test "$PKG" = "$TAG"
|
|
59
|
+
- name: Build distributions
|
|
60
|
+
run: |
|
|
61
|
+
python -m pip install --upgrade pip build twine
|
|
62
|
+
python -m build
|
|
63
|
+
twine check dist/*
|
|
64
|
+
- uses: actions/upload-artifact@v4
|
|
65
|
+
with:
|
|
66
|
+
name: dist
|
|
67
|
+
path: dist/*
|
|
68
|
+
|
|
69
|
+
release:
|
|
70
|
+
name: Release
|
|
71
|
+
if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch'
|
|
72
|
+
needs: [lint, test, build]
|
|
73
|
+
runs-on: ubuntu-latest
|
|
74
|
+
environment: pypi
|
|
75
|
+
permissions:
|
|
76
|
+
id-token: write # trusted publishing to PyPI (OIDC)
|
|
77
|
+
contents: write # create/update the GitHub Release
|
|
78
|
+
attestations: write # build provenance attestation
|
|
79
|
+
steps:
|
|
80
|
+
- uses: actions/download-artifact@v4
|
|
81
|
+
with:
|
|
82
|
+
name: dist
|
|
83
|
+
path: dist
|
|
84
|
+
- name: Generate artifact attestation
|
|
85
|
+
uses: actions/attest-build-provenance@v2
|
|
86
|
+
with:
|
|
87
|
+
subject-path: 'dist/*'
|
|
88
|
+
- name: Publish to PyPI
|
|
89
|
+
if: startsWith(github.ref, 'refs/tags/')
|
|
90
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
91
|
+
- name: Upload to GitHub Release
|
|
92
|
+
uses: softprops/action-gh-release@v2
|
|
93
|
+
with:
|
|
94
|
+
files: dist/*
|
|
95
|
+
fail_on_unmatched_files: false
|
|
96
|
+
prerelease: ${{ contains(github.ref, 'alpha') || contains(github.ref, 'beta') }}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
.venv/
|
|
4
|
+
venv/
|
|
5
|
+
*.env
|
|
6
|
+
!*.env.example
|
|
7
|
+
tmp/
|
|
8
|
+
.DS_Store
|
|
9
|
+
*.log
|
|
10
|
+
dist/
|
|
11
|
+
build/
|
|
12
|
+
*.egg-info/
|
|
13
|
+
CLAUDE.md
|
|
14
|
+
PUBLISH.md
|
|
15
|
+
# Diagram build inputs — we publish only the .excalidraw.svg (it embeds the
|
|
16
|
+
# editable scene). The PNG screenshot and the .excalidraw source stay local.
|
|
17
|
+
docs/overlaat-llm-stack.png
|
|
18
|
+
docs/dashboard.png
|
|
19
|
+
docs/*.excalidraw
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format is based on
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project aims to
|
|
5
|
+
follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html) — although, while the
|
|
6
|
+
status is **experimental**, the APIs and the Postgres schema may change between
|
|
7
|
+
versions without a compatibility guarantee.
|
|
8
|
+
|
|
9
|
+
## [0.0.1] — 2026-06-19
|
|
10
|
+
|
|
11
|
+
First public release. Overlaat is a sidecar that sits in *front* of a self-hosted,
|
|
12
|
+
multi-backend LiteLLM gateway and adds the two things the gateway does not do: a fair
|
|
13
|
+
waiting-queue instead of a 429-cliff, and one honest usage event per request. This
|
|
14
|
+
version implements:
|
|
15
|
+
|
|
16
|
+
### queue-proxy (`:4000`) — the single network entry point
|
|
17
|
+
- **Per-model FIFO wait-queue.** Every `/v1/chat/completions`, `/v1/completions`,
|
|
18
|
+
`/v1/embeddings`, and `/rerank` call passes through a per-model `asyncio.Semaphore`;
|
|
19
|
+
overflow **waits in FIFO order** rather than being rejected with HTTP 429.
|
|
20
|
+
- **Caps derived from the gateway config**, not tuned separately — the slot size for
|
|
21
|
+
each model is read from `max_parallel_requests` in `litellm-config.yaml`. Models with
|
|
22
|
+
no cap (and all non-LLM paths) pass through without a queue.
|
|
23
|
+
- **Streaming-compatible** (SSE and plain JSON), forwarding the body unbuffered and
|
|
24
|
+
headers 1:1 except hop-by-hop. Injects `stream_options.include_usage=true` on
|
|
25
|
+
streaming chat so token counts arrive reliably.
|
|
26
|
+
- **One lifecycle event per request** written to Postgres `request_events`, *including
|
|
27
|
+
queued and client-abandoned calls* that insert-on-completion logging structurally
|
|
28
|
+
misses. Captures `t_enqueue` / `t_acquire` / `t_first_token` / `t_done`, outcome,
|
|
29
|
+
model, key fingerprint, and token counts (NULL, never zero, when unreported). The
|
|
30
|
+
writer is non-blocking (bounded queue + background batch insert); on overflow or DB
|
|
31
|
+
error the event is dropped and counted — the hot path is never slowed.
|
|
32
|
+
- **Control + status endpoints:** `/__queue/health`, `/__queue/status`,
|
|
33
|
+
`/__queue/cancel/{req_id}`, `/__queue/cancel-all`. Cancellation affects **queued
|
|
34
|
+
requests only** (in-flight calls are deliberately not cancellable).
|
|
35
|
+
|
|
36
|
+
### usage-api (`:4100`) — read-only dashboard
|
|
37
|
+
- FastAPI service that **only ever reads** the event/host tables, serving an HTML
|
|
38
|
+
dashboard plus `/now`, `/timeline`, `/models`, `/perf`, `/consumers`, `/healthz`.
|
|
39
|
+
- Derives the three honest concurrency curves (**offered / active / queued**),
|
|
40
|
+
throughput bucketed by time-weighted measured concurrency (min-sample guarded), and a
|
|
41
|
+
**solo decode tok/s** backend-health signal that isolates engine degradation from load.
|
|
42
|
+
|
|
43
|
+
### host sampler (optional, macOS)
|
|
44
|
+
- `host_logger` samples GPU% / RAM and per-backend RSS into `host_samples` every few
|
|
45
|
+
seconds, and logs swap-slot cold loads into `model_loads`. Memory is attributed
|
|
46
|
+
per-backend by RSS; GPU% is kept host-wide (per-process GPU is not measurable for
|
|
47
|
+
Metal/MLX).
|
|
48
|
+
|
|
49
|
+
### schema & packaging
|
|
50
|
+
- `schema.sql` — idempotent DDL for the three tables (`request_events`, `host_samples`,
|
|
51
|
+
`model_loads`); all timestamps are epoch seconds (UTC).
|
|
52
|
+
- Pure-Python package (`hatchling`, Python ≥ 3.11) with example config, env, and run
|
|
53
|
+
scripts under `examples/`.
|
|
54
|
+
|
|
55
|
+
### Not yet implemented
|
|
56
|
+
- **Cost-weighted admission** is design-only (see `docs/COST-SCHEDULER.md`); the queue
|
|
57
|
+
is plain per-model FIFO in this version.
|
|
58
|
+
|
|
59
|
+
[0.0.1]: https://github.com/tdamsma/overlaat/releases/tag/v0.0.1
|
overlaat-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Thijs Damsma
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
overlaat-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: overlaat
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Fair waiting-queue + honest usage accounting for a Mac Studio shared as a small, trusted team's personal LLM compute server — a sidecar in front of a multi-backend LiteLLM gateway.
|
|
5
|
+
Project-URL: Homepage, https://github.com/tdamsma/overlaat
|
|
6
|
+
Project-URL: Repository, https://github.com/tdamsma/overlaat
|
|
7
|
+
Project-URL: Issues, https://github.com/tdamsma/overlaat/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/tdamsma/overlaat/blob/main/CHANGELOG.md
|
|
9
|
+
Author: Thijs Damsma
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: apple-silicon,gateway,litellm,llm,mlx,observability,queue,self-hosted
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: System Administrators
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Internet :: Proxy Servers
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: System :: Monitoring
|
|
25
|
+
Requires-Python: >=3.11
|
|
26
|
+
Requires-Dist: fastapi
|
|
27
|
+
Requires-Dist: httpx
|
|
28
|
+
Requires-Dist: psycopg[binary]
|
|
29
|
+
Requires-Dist: pyyaml
|
|
30
|
+
Requires-Dist: uvicorn[standard]
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: build; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
35
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
36
|
+
Requires-Dist: twine; extra == 'dev'
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
|
|
39
|
+
# Overlaat
|
|
40
|
+
|
|
41
|
+
Overlaat puts a **fair waiting-queue** and **honest usage accounting** in front of a
|
|
42
|
+
self-hosted, multi-backend LLM gateway ([LiteLLM](https://github.com/BerriAI/litellm)).
|
|
43
|
+
It is two small services and one Postgres schema. That is the whole thing.
|
|
44
|
+
|
|
45
|
+
It is built for one specific situation, and we wrote it for ours: a **Mac Studio (or a
|
|
46
|
+
similar big Apple-Silicon box) running as a small, trusted team's personal compute
|
|
47
|
+
server** — a handful of models behind one gateway, a mix of interactive chat and bursty
|
|
48
|
+
batch jobs, on one trusted network where API keys are *attribution*, not secrets.
|
|
49
|
+
|
|
50
|
+
It is deliberately narrow. It is **not** a load balancer, **not** multi-tenant billing,
|
|
51
|
+
**not** an enterprise analytics stack, and **not** trying to be. It does two things and
|
|
52
|
+
tries to do them without lying to you.
|
|
53
|
+
|
|
54
|
+
*Overlaat* is Dutch for a controlled spillway in a dike: it sheds overflow by design
|
|
55
|
+
instead of breaching. That is the posture toward load — when requests outrun your
|
|
56
|
+
backends, the excess pools in a fair FIFO queue and drains in order, rather than a
|
|
57
|
+
`429`-cascade tearing through every caller's retry loop.
|
|
58
|
+
|
|
59
|
+

|
|
60
|
+
|
|
61
|
+
## Why it exists
|
|
62
|
+
|
|
63
|
+
Share one box between a few people and a few agents and two things start to hurt.
|
|
64
|
+
|
|
65
|
+
1. **Overflow is a cliff, not a queue.** LiteLLM's `max_parallel_requests` (and any
|
|
66
|
+
swap-layer concurrency limit) *reject* on overflow — they return `429` rather than
|
|
67
|
+
making the caller wait. A burst of parallel jobs against a single-slot model becomes
|
|
68
|
+
a `429`-cascade, and every caller has to grow its own backoff logic. Nobody wants to
|
|
69
|
+
write that backoff loop. Several people writing it independently is worse.
|
|
70
|
+
2. **Usage accounting lies by omission.** Insert-on-completion spend logging only ever
|
|
71
|
+
writes a row for a call that *ran to completion*. Calls that sat queued, calls the
|
|
72
|
+
client abandoned mid-stream, long-running calls still in flight — all invisible. You
|
|
73
|
+
cannot answer "what was actually happening on the box at 14:03" from rows that only
|
|
74
|
+
appear after the fact.
|
|
75
|
+
|
|
76
|
+
Overlaat sits in the gap between "Ollama on my laptop" (no queueing, no accounting, fine
|
|
77
|
+
for one person) and "enterprise gateway with a full analytics stack" (more machinery
|
|
78
|
+
than a personal compute server should have to run). Fair queueing and truthful usage
|
|
79
|
+
attribution, with as little machinery as we could get away with.
|
|
80
|
+
|
|
81
|
+
## What it is
|
|
82
|
+
|
|
83
|
+
A **sidecar in *front* of LiteLLM**, plus a **read-only dashboard**:
|
|
84
|
+
|
|
85
|
+
- **queue-proxy** (`:4000`) — the single network entry point. Every request flows
|
|
86
|
+
through here and is FIFO-queued behind a **per-model semaphore**; the slot size for
|
|
87
|
+
each model is *derived* from your backend config, not tuned separately. Because it is
|
|
88
|
+
the one component on the full call path, it is also the one instrumentation site: it
|
|
89
|
+
emits **exactly one lifecycle event per request** to Postgres — *including* queued and
|
|
90
|
+
client-abandoned calls that insert-on-completion logging structurally misses.
|
|
91
|
+
- **usage-api** (`:4100`) — a read-only FastAPI dashboard over those events. It never
|
|
92
|
+
writes; it only reads. Restart it whenever you like, independently of the proxy.
|
|
93
|
+
|
|
94
|
+
The one principle the whole thing is built on: **instrument the call path once, derive
|
|
95
|
+
everything else.** The proxy writes one honest row per request; the host sampler writes
|
|
96
|
+
host facts every few seconds; the dashboard is pure query. No second source of truth to
|
|
97
|
+
reconcile, no survivor bias, no two endpoints that compute "latency" in different ways.
|
|
98
|
+
|
|
99
|
+
## Architecture
|
|
100
|
+
|
|
101
|
+
```mermaid
|
|
102
|
+
flowchart TD
|
|
103
|
+
clients["Clients<br/>chat · batch jobs · agents"]
|
|
104
|
+
proxy["overlaat queue-proxy :4000<br/>per-model FIFO semaphore<br/>wait, not 429"]
|
|
105
|
+
gw["LiteLLM gateway :4002<br/>loopback-only · routes to every backend"]
|
|
106
|
+
|
|
107
|
+
subgraph be ["GPU backends — behind LiteLLM"]
|
|
108
|
+
direction LR
|
|
109
|
+
b1["DeepSeek<br/>llama.cpp"]
|
|
110
|
+
b2["Qwen 3<br/>MLX"]
|
|
111
|
+
b3["Whisper<br/>MLX · STT"]
|
|
112
|
+
b4["Ollama<br/>embeddings"]
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
pg[("Postgres<br/>request_events<br/>host_samples · model_loads")]
|
|
116
|
+
usage["overlaat usage-api :4100<br/>read-only dashboard<br/>/ /now /timeline /models /consumers"]
|
|
117
|
+
host["host sampler<br/>GPU% · RAM · per-backend RSS"]
|
|
118
|
+
|
|
119
|
+
clients -->|single network entry| proxy
|
|
120
|
+
proxy -->|forwards admitted calls · loopback| gw
|
|
121
|
+
gw --> b1 & b2 & b3 & b4
|
|
122
|
+
proxy -. one lifecycle event per request<br/>including queued and abandoned .-> pg
|
|
123
|
+
host -. host facts every few seconds .-> pg
|
|
124
|
+
pg -->|read only — never writes| usage
|
|
125
|
+
|
|
126
|
+
classDef overlaat fill:#3b82f6,stroke:#1e3a5f,color:#ffffff;
|
|
127
|
+
class proxy,usage overlaat;
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Keep LiteLLM bound to loopback. That is the trick that makes the proxy the *only* entry
|
|
131
|
+
point — and therefore the single, complete instrumentation site. If there is a second
|
|
132
|
+
door into the gateway, your accounting has a hole in it.
|
|
133
|
+
|
|
134
|
+
## Quickstart
|
|
135
|
+
|
|
136
|
+
You need a reachable Postgres (the same one LiteLLM uses is fine) and a configured
|
|
137
|
+
LiteLLM gateway on loopback.
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
# 1. Install
|
|
141
|
+
pip install -e . # or: uv pip install -e .
|
|
142
|
+
|
|
143
|
+
# 2. Apply the schema (idempotent)
|
|
144
|
+
psql "$DATABASE_URL" -f schema.sql
|
|
145
|
+
|
|
146
|
+
# 3. Configure
|
|
147
|
+
cp examples/overlaat.env.example overlaat.env # fill in DATABASE_URL etc.
|
|
148
|
+
chmod 600 overlaat.env # it holds DB credentials
|
|
149
|
+
cp examples/litellm-config.example.yaml litellm-config.yaml # your model list
|
|
150
|
+
cp examples/run-queue-proxy.sh examples/run-usage-api.sh . # the two run scripts
|
|
151
|
+
|
|
152
|
+
# 4. Run the two services (behind a supervisor of your choice)
|
|
153
|
+
OVERLAAT_ENV=./overlaat.env ./run-queue-proxy.sh # :4000 entry, in front of LiteLLM
|
|
154
|
+
OVERLAAT_ENV=./overlaat.env ./run-usage-api.sh # :4100 read-only dashboard
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Point your clients at `:4000` instead of LiteLLM directly. Open
|
|
158
|
+
`http://your-host:4100/` for the dashboard. The queue-proxy derives one semaphore per
|
|
159
|
+
model from `litellm-config.yaml`, so that file is the single source of truth for
|
|
160
|
+
concurrency.
|
|
161
|
+
|
|
162
|
+
> The proxy runs a **single uvicorn worker on purpose**: the in-memory per-model
|
|
163
|
+
> semaphores and the instrumentation live in that one process, so FIFO ordering and
|
|
164
|
+
> event emission must not be sharded across workers. This is a feature, not a TODO.
|
|
165
|
+
|
|
166
|
+
## Honest concurrency: three curves
|
|
167
|
+
|
|
168
|
+
The dashboard never invents a concurrency number. From `request_events` it derives, at
|
|
169
|
+
any time *t* and per model, exactly three time series: **offered** (`t_enqueue ≤ t <
|
|
170
|
+
t_done` — everything in the system, including still-queued), **active** (`t_acquire ≤ t
|
|
171
|
+
< t_done` — actually occupying a backend slot, bounded by the cap *by definition*), and
|
|
172
|
+
**queued** = offered − active. Throughput-vs-concurrency buckets each completed call on
|
|
173
|
+
the time-weighted average `active(t)` over its own `[acquire, done]` interval, and cells
|
|
174
|
+
with too few samples are marked insufficient and never shown as a trend. If we don't have
|
|
175
|
+
the data, the dashboard says so instead of drawing a confident line.
|
|
176
|
+
|
|
177
|
+
See [`docs/OBSERVABILITY.md`](docs/OBSERVABILITY.md) for the curves and their caveats,
|
|
178
|
+
and [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for the call-path and instrumentation
|
|
179
|
+
design.
|
|
180
|
+
|
|
181
|
+
## Roadmap
|
|
182
|
+
|
|
183
|
+
- **Capacity-aware priority scheduler** — *not yet implemented.* Today's code runs
|
|
184
|
+
independent **per-model FIFO semaphores**: each model admits up to its own cap, and
|
|
185
|
+
the caps sum freely. That is a fine v1, but it lets two models be individually
|
|
186
|
+
"under cap" while collectively oversubscribing the single GPU.
|
|
187
|
+
|
|
188
|
+
The planned next step replaces those independent semaphores with **one global
|
|
189
|
+
priority queue + cost-weighted admission against a single shared GPU budget**
|
|
190
|
+
(`B = 1.0`). Each run costs its fraction of the GPU (`cost = 1 / cap`, so a `cap=4`
|
|
191
|
+
model costs `0.25`); the scheduler admits the highest-priority request that *fits*
|
|
192
|
+
the remaining budget and releases that cost on completion — so multiple models run
|
|
193
|
+
**in parallel up to real capacity** instead of up to the sum of their caps. Packing
|
|
194
|
+
is **work-conserving** (leftover budget keeps serving cheap jobs) with a
|
|
195
|
+
**reservation + aging** guard so a drip of cheap high-priority jobs can't starve an
|
|
196
|
+
expensive one. Backend hard caps still bind (`model_in_flight < cap` **and**
|
|
197
|
+
`used + cost ≤ B`), and **large-model switching** falls out for free: a swap-slot
|
|
198
|
+
("fat-slot") group where only one big model is resident at a time is modeled as
|
|
199
|
+
`cost = 1.0`, so admitting one fills the budget and blocks the rest until it
|
|
200
|
+
completes. Optional **per-key priority** is un-gameable (`effective_priority =
|
|
201
|
+
min(requested, key_max)`, batch keys capped low). There is **no preemption** —
|
|
202
|
+
Metal can't reorder dispatched GPU kernels, so the only lever is *admission*.
|
|
203
|
+
|
|
204
|
+
The trade is deliberate: a shared budget is **lower peak concurrency** than summed
|
|
205
|
+
caps but **honest about the one GPU and free of thrash**, and it keeps the same
|
|
206
|
+
"wait, don't reject" spillway posture. Full design (packing policy, starvation
|
|
207
|
+
proof, the scalar-cost VRAM-vs-compute caveat):
|
|
208
|
+
[`docs/COST-SCHEDULER.md`](docs/COST-SCHEDULER.md).
|
|
209
|
+
|
|
210
|
+
> Note: today, large-model switching is performed by the underlying swap layer
|
|
211
|
+
> (e.g. llama-swap); Overlaat only **observes and logs** it (`model_loads`). The
|
|
212
|
+
> scheduler above folds that switching into its own budget arithmetic.
|
|
213
|
+
|
|
214
|
+
- **Storage-backend agnostic (at least Postgres + SQLite)** — *not yet implemented.*
|
|
215
|
+
Today both writers and the dashboard talk to Postgres directly (`psycopg`). The plan
|
|
216
|
+
is a thin storage abstraction over the three tables so a single-box deployment can
|
|
217
|
+
run on **SQLite** with zero extra services, while a shared/multi-host setup keeps
|
|
218
|
+
**Postgres**. The event schema is intentionally simple (epoch-second timestamps, no
|
|
219
|
+
DB-specific types), so this is mostly an insert/query adapter plus dialect-aware DDL.
|
|
220
|
+
|
|
221
|
+
## Built with LLMs, said openly
|
|
222
|
+
|
|
223
|
+
This software was developed with **strong assistance from large language models** —
|
|
224
|
+
Claude, and the very local models it queues — with humans leading the ideas, the
|
|
225
|
+
architecture, the testing, and the debugging. We say this openly because it shaped how
|
|
226
|
+
the project was built: a lot of the code, the docs, and this README were drafted by a
|
|
227
|
+
model and then dogfooded against the real gateway it sits in front of. If you are not
|
|
228
|
+
happy with AI-assisted code, this software is not for you.
|
|
229
|
+
|
|
230
|
+
The flip side of saying it openly: the design decisions are human-owned and it runs in
|
|
231
|
+
real use, but it is experimental — read the code before you rely on it.
|
|
232
|
+
|
|
233
|
+
## Acknowledgements
|
|
234
|
+
|
|
235
|
+
Overlaat is a thin layer, and it would not exist without the work it sits on top of:
|
|
236
|
+
|
|
237
|
+
- [LiteLLM](https://github.com/BerriAI/litellm) — the gateway it stands in front of.
|
|
238
|
+
- [FastAPI](https://fastapi.tiangolo.com/) / [Starlette](https://www.starlette.io/) /
|
|
239
|
+
[uvicorn](https://www.uvicorn.org/) — the two services.
|
|
240
|
+
- [httpx](https://www.python-httpx.org/) — the streaming pass-through.
|
|
241
|
+
- [psycopg](https://www.psycopg.org/) and [PostgreSQL](https://www.postgresql.org/) —
|
|
242
|
+
the one honest event store.
|
|
243
|
+
- and the local-inference ecosystem it exists to queue:
|
|
244
|
+
[MLX](https://github.com/ml-explore/mlx),
|
|
245
|
+
[llama.cpp](https://github.com/ggml-org/llama.cpp),
|
|
246
|
+
[Ollama](https://github.com/ollama/ollama),
|
|
247
|
+
[vLLM](https://github.com/vllm-project/vllm),
|
|
248
|
+
[llama-swap](https://github.com/mostlygeek/llama-swap).
|
|
249
|
+
|
|
250
|
+
## Status and caveats
|
|
251
|
+
|
|
252
|
+
- **Experimental.** Shared as-is. **No support promise**, no compatibility guarantee
|
|
253
|
+
between versions.
|
|
254
|
+
- **MIT licensed.** See [`LICENSE`](LICENSE).
|
|
255
|
+
- Built and dogfooded on an **Apple-Silicon multi-backend** setup, but it is
|
|
256
|
+
**backend-agnostic**: all it needs is an OpenAI-compatible LiteLLM gateway in front of
|
|
257
|
+
whatever engines you run, and a Postgres to write events to.
|
|
258
|
+
|
|
259
|
+
Known caveats, stated up front because that is the whole point of the project:
|
|
260
|
+
|
|
261
|
+
- **Per-process GPU usage is not reliably measurable** on every platform — notably Metal/MLX
|
|
262
|
+
workloads on macOS report 0. GPU% is therefore kept host-wide; **memory is attributed
|
|
263
|
+
per-backend via RSS**.
|
|
264
|
+
- **Token counts are NULL** when a backend reports no `usage`. The proxy injects
|
|
265
|
+
`stream_options.include_usage=true` on streaming chat to minimize this; NULL is never
|
|
266
|
+
counted as zero.
|
|
267
|
+
- **Engine tail after client-abandon.** On disconnect the slot releases at `t_done`,
|
|
268
|
+
but a single-stream engine may keep decoding briefly. The "active" curve measures
|
|
269
|
+
*slot occupancy*, not literal GPU-busy after release. In-flight requests are therefore
|
|
270
|
+
*not* safely cancellable; only still-queued requests are.
|