anvil-serving 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. anvil_serving-0.4.0/LICENSE +21 -0
  2. anvil_serving-0.4.0/PKG-INFO +474 -0
  3. anvil_serving-0.4.0/README.md +425 -0
  4. anvil_serving-0.4.0/anvil_serving/__init__.py +2 -0
  5. anvil_serving-0.4.0/anvil_serving/_aggregate_usage.py +193 -0
  6. anvil_serving-0.4.0/anvil_serving/_role_split.py +91 -0
  7. anvil_serving-0.4.0/anvil_serving/_sync.py +359 -0
  8. anvil_serving-0.4.0/anvil_serving/benchmark.py +224 -0
  9. anvil_serving-0.4.0/anvil_serving/cache_prune.py +608 -0
  10. anvil_serving-0.4.0/anvil_serving/cli.py +31 -0
  11. anvil_serving-0.4.0/anvil_serving/config.py +40 -0
  12. anvil_serving-0.4.0/anvil_serving/deploy.py +51 -0
  13. anvil_serving-0.4.0/anvil_serving/eval.py +167 -0
  14. anvil_serving-0.4.0/anvil_serving/models.py +22 -0
  15. anvil_serving-0.4.0/anvil_serving/multiplexer.py +773 -0
  16. anvil_serving-0.4.0/anvil_serving/preflight.py +144 -0
  17. anvil_serving-0.4.0/anvil_serving/profile.py +22 -0
  18. anvil_serving-0.4.0/anvil_serving/py.typed +0 -0
  19. anvil_serving-0.4.0/anvil_serving/router/__init__.py +102 -0
  20. anvil_serving-0.4.0/anvil_serving/router/__main__.py +45 -0
  21. anvil_serving-0.4.0/anvil_serving/router/backends/__init__.py +32 -0
  22. anvil_serving-0.4.0/anvil_serving/router/backends/cloud.py +470 -0
  23. anvil_serving-0.4.0/anvil_serving/router/backends/local.py +62 -0
  24. anvil_serving-0.4.0/anvil_serving/router/backends/relay.py +72 -0
  25. anvil_serving-0.4.0/anvil_serving/router/calibrate.py +402 -0
  26. anvil_serving-0.4.0/anvil_serving/router/classify.py +225 -0
  27. anvil_serving-0.4.0/anvil_serving/router/commit_window.py +330 -0
  28. anvil_serving-0.4.0/anvil_serving/router/config.py +338 -0
  29. anvil_serving-0.4.0/anvil_serving/router/decision_log.py +236 -0
  30. anvil_serving-0.4.0/anvil_serving/router/dialects/__init__.py +52 -0
  31. anvil_serving-0.4.0/anvil_serving/router/dialects/anthropic.py +264 -0
  32. anvil_serving-0.4.0/anvil_serving/router/dialects/openai.py +216 -0
  33. anvil_serving-0.4.0/anvil_serving/router/discovery.py +60 -0
  34. anvil_serving-0.4.0/anvil_serving/router/fallback.py +689 -0
  35. anvil_serving-0.4.0/anvil_serving/router/fingerprint.py +133 -0
  36. anvil_serving-0.4.0/anvil_serving/router/front_door.py +683 -0
  37. anvil_serving-0.4.0/anvil_serving/router/intent.py +263 -0
  38. anvil_serving-0.4.0/anvil_serving/router/internal.py +185 -0
  39. anvil_serving-0.4.0/anvil_serving/router/metrics.py +428 -0
  40. anvil_serving-0.4.0/anvil_serving/router/policy.py +308 -0
  41. anvil_serving-0.4.0/anvil_serving/router/prices.py +92 -0
  42. anvil_serving-0.4.0/anvil_serving/router/profile_bootstrap.py +528 -0
  43. anvil_serving-0.4.0/anvil_serving/router/profile_store.py +374 -0
  44. anvil_serving-0.4.0/anvil_serving/router/registry.py +356 -0
  45. anvil_serving-0.4.0/anvil_serving/router/seams.py +267 -0
  46. anvil_serving-0.4.0/anvil_serving/router/secrets.py +233 -0
  47. anvil_serving-0.4.0/anvil_serving/router/serve.py +592 -0
  48. anvil_serving-0.4.0/anvil_serving/router/tier0_keywords.json +7 -0
  49. anvil_serving-0.4.0/anvil_serving/router/verify.py +733 -0
  50. anvil_serving-0.4.0/anvil_serving/score.py +621 -0
  51. anvil_serving-0.4.0/anvil_serving/serves.py +240 -0
  52. anvil_serving-0.4.0/anvil_serving.egg-info/PKG-INFO +474 -0
  53. anvil_serving-0.4.0/anvil_serving.egg-info/SOURCES.txt +60 -0
  54. anvil_serving-0.4.0/anvil_serving.egg-info/dependency_links.txt +1 -0
  55. anvil_serving-0.4.0/anvil_serving.egg-info/entry_points.txt +2 -0
  56. anvil_serving-0.4.0/anvil_serving.egg-info/requires.txt +3 -0
  57. anvil_serving-0.4.0/anvil_serving.egg-info/top_level.txt +1 -0
  58. anvil_serving-0.4.0/pyproject.toml +62 -0
  59. anvil_serving-0.4.0/setup.cfg +4 -0
  60. anvil_serving-0.4.0/tests/test_benchmark.py +117 -0
  61. anvil_serving-0.4.0/tests/test_eval.py +187 -0
  62. anvil_serving-0.4.0/tests/test_serves.py +208 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sekou Doumbouya
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,474 @@
1
+ Metadata-Version: 2.4
2
+ Name: anvil-serving
3
+ Version: 0.4.0
4
+ Summary: A quality-gated local-model router for coding harnesses — local where it's proven, cloud where it isn't, verified with automatic fallback.
5
+ Author: Sekou Doumbouya
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Sekou Doumbouya
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/fakoli/anvil-serving
29
+ Project-URL: Repository, https://github.com/fakoli/anvil-serving
30
+ Project-URL: Issues, https://github.com/fakoli/anvil-serving/issues
31
+ Project-URL: Changelog, https://github.com/fakoli/anvil-serving/blob/main/CHANGELOG.md
32
+ Keywords: llm,router,local-llm,claude-code,openai,anthropic,quality-gate,verify,fallback,inference,coding-agent,sglang,vllm
33
+ Classifier: Development Status :: 4 - Beta
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Programming Language :: Python :: 3.13
40
+ Classifier: Operating System :: OS Independent
41
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
42
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
43
+ Requires-Python: >=3.11
44
+ Description-Content-Type: text/markdown
45
+ License-File: LICENSE
46
+ Provides-Extra: dev
47
+ Requires-Dist: pytest; extra == "dev"
48
+ Dynamic: license-file
49
+
50
+ <div align="center">
51
+
52
+ ![anvil-serving — the quality-gated local-model router for coding harnesses](assets/banner.png)
53
+
54
+ # anvil-serving
55
+
56
+ > **The quality-gated local-model router for coding harnesses.**
57
+
58
+ > *Local where it's been proven, cloud where it hasn't — verified, with automatic fallback.*
59
+
60
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
61
+ [![Version](https://img.shields.io/badge/version-0.4.0-blue.svg)](CHANGELOG.md)
62
+ [![Docs](https://img.shields.io/badge/docs-fakoli.github.io%2Fanvil--serving-blue.svg)](https://fakoli.github.io/anvil-serving/)
63
+ [![Marketplace](https://img.shields.io/badge/marketplace-fakoli-purple.svg)](https://github.com/fakoli/anvil-serving)
64
+ [![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)](tests)
65
+
66
+ </div>
67
+
68
+ Point your coding harness (Claude Code, Codex, Aider, Cline, Continue — OpenClaw as the
69
+ near-first-class beachhead) at **one** anvil-serving endpoint. Per request, the router resolves an
70
+ **intent** to a **tier** — fast-local, heavy-local, or cloud — using a **measured per-(model,
71
+ work-class) quality profile**, cheaply **verifies** the output, and **falls back** to the next
72
+ tier (ultimately cloud) when the local answer fails. The harness sees one reliable endpoint and
73
+ never silently eats a local-quality failure mid-run.
74
+
75
+ The router is built on a serving substrate that already exists in this repo — usage profiling,
76
+ model cataloging, tuned deployment, a correctness gate, capacity benchmarking, and a
77
+ single-GPU model multiplexer. Those tools (documented below) right-size and stand up the local
78
+ tiers; the router decides what each one is *trusted* to serve.
79
+
80
+ ---
81
+
82
+ ## Why a router, and not just another proxy
83
+
84
+ Transport is a commodity — LiteLLM, claude-code-router, Ollama, OpenRouter all move tokens.
85
+ None of them know **whether local can actually do *this* work.** They route by static rules
86
+ (model name, cost, regex). On anvil's real PRD→tasks planning prompt, the gap was measured
87
+ directly:
88
+
89
+ - Local output is **structurally valid ≥92%** of the time (5 of 6 outputs scored 100% — they parse
90
+ cleanly under the strict schema, no cycles, no dangling edges) — structural validity is **not**
91
+ the differentiator.
92
+ - But on **dependency/ordering reasoning** local collapses: blind-judge totals were
93
+ **frontier 24.75/25, fast 16.0, heavy 13.25** (local ≈ 55–65% of frontier), with the gap
94
+ squarely in dependency correctness (frontier **5.0/5** vs local **~2/5**).
95
+
96
+ A dumb proxy sends that planning request to local and silently corrupts a long agent run. The
97
+ defensible asset is therefore **not** the transport — it's the **quality profile** (per model ×
98
+ work-class, measured on the operator's own workload) plus the **verify-and-fallback loop.**
99
+ Competitors can copy transport in a weekend; they can't copy "we measured that `gpt-oss-20b` is
100
+ safe for bounded edits but unsafe for dependency planning on *your* repos."
101
+
102
+ Two more decisions fell out of the research and shaped the design:
103
+
104
+ - **The integration point is the harness/runtime, not a ledger.** An audit of the `anvil`
105
+ state engine confirmed it is *not* an LLM gateway — it exposes a single `custom_base_url` for
106
+ optional planning augmentation, with no router and no two-tier endpoint routing.
107
+ So the router lives where the agent traffic actually flows: in front of the harness.
108
+ - **The `model` field is the one routing channel unmodified harnesses expose.** It is required
109
+ in both wire schemas, forwarded verbatim, and free-form — so "named presets in the model
110
+ field" is the right wire surface.
111
+
112
+ ---
113
+
114
+ ## How it works
115
+
116
+ ### Intent presets in the `model` field
117
+
118
+ Callers declare an **intent** — a closed enum of named presets — instead of a model name. The
119
+ router owns `intent → (model, tier, params)`:
120
+
121
+ ```
122
+ planning quick-edit review chat long-context
123
+ ```
124
+
125
+ Presets are accepted bare (`planning`) or namespaced (`anvil/planning`). Each preset resolves internally to
126
+ **hard constraints** (context length, privacy=local-only, tool/structured-output support, cost
127
+ ceiling) that *filter* the candidate pool, plus a quality intent that *ranks* the survivors via
128
+ the profile. **Filter, then rank.** A `model:`-pin escape hatch stays available for repro and
129
+ debugging. `/v1/models` advertises the preset vocabulary so presets surface in harness model
130
+ pickers, and responses stay **transparent** — the response reports the *real* tier that served.
131
+
132
+ ### The graceful-degradation tier ladder
133
+
134
+ Intent resolution degrades to the highest tier the originating harness can reach. The
135
+ classifier is the **universal floor**, because most requests arrive on a single session model
136
+ string with no declared intent:
137
+
138
+ | Tier | Mechanism | Available on |
139
+ |---|---|---|
140
+ | **0 — Infer** | classify work-class from the raw payload (token count, `thinking` flag, tool types, image content, system-prompt fingerprint) — per-request intent with no caller cooperation | every harness that reaches the endpoint |
141
+ | **1 — Named presets in `model`** | caller/config sets a preset token; router maps preset → tier | Claude Code, Codex, Aider, Cline, Continue — **not** Cursor/Amp/Devin |
142
+ | **2 — extra_body / header dimensions** | optional structured hints (budget, latency, verifier policy) | Codex, Continue; Aider (config) |
143
+ | **3 — Native intent field** | a first-class per-request intent field | none today (needs a standard/harness change) |
144
+
145
+ **Work-class taxonomy v0:** `chat/Q&A`, `bounded-edit`, `multi-file-refactor`,
146
+ `planning/decomposition`, `review/critique`, `long-context-retrieval`. Ambiguous
147
+ classifications bias toward the safer/cloud tier and are logged for calibration.
148
+
149
+ ### Verify-and-fallback
150
+
151
+ ```mermaid
152
+ flowchart LR
153
+ REQ([request]) --> RES["resolve intent<br/>(preset or Tier-0 infer)"]
154
+ RES --> POL{"quality profile<br/>model x work-class"}
155
+ POL -->|allow| LOCAL["local tier<br/>(fast or heavy)"]
156
+ POL -->|"allow-with-verify"| LV[local tier]
157
+ LV --> VER{"structural verify"}
158
+ VER -->|pass| RET([200 to harness])
159
+ LOCAL --> RET
160
+ VER -->|fail| MISS[all candidates exhausted]
161
+ POL -->|deny| MISS
162
+ MISS --> GATE{"cloud tier<br/>opt-in configured?"}
163
+ GATE -->|"no, keyless default"| SOS["503 exhaustion — no local tokens<br/>gateway failover to native provider<br/>(flat-rate, zero metered by anvil)"]
164
+ GATE -->|"yes, opt-in keyed"| CLOUD["cloud tier<br/>(metered, explicit opt-in only)"]
165
+ CLOUD --> RET
166
+ classDef gate fill:#0b3b40,stroke:#23b6c4,color:#7fe9f0;
167
+ classDef cloud fill:#16365e,stroke:#58a6ff,color:#cfe6ff;
168
+ classDef local fill:#0b2b0b,stroke:#23c430,color:#7ff09a;
169
+ classDef escape fill:#3b1800,stroke:#c47823,color:#f0c887;
170
+ class POL,VER,GATE gate;
171
+ class CLOUD cloud;
172
+ class LOCAL,LV local;
173
+ class SOS escape;
174
+ ```
175
+
176
+ Most "quality control" is **routing done ahead of time** (never send a `deny` work-class to
177
+ local). Verification is a cheap safety net, tiered:
178
+
179
+ 1. **Prevent** — the profile's `deny` decisions keep risky classes (e.g. dependency planning)
180
+ off local entirely. Free.
181
+ 2. **Cheap structural verify (inline)** — near-zero-cost checks that caught real failures in the
182
+ eval: empty/truncated content (thinking-budget starvation), tool-call JSON that doesn't
183
+ validate, code that doesn't parse, a diff that doesn't apply.
184
+ 3. **Confidence signals** — logprob/entropy thresholds, refusal markers where available.
185
+ 4. **Async LLM-judge (off the hot path)** — sampled cloud grading that feeds the profile, never
186
+ a blocking gate.
187
+
188
+ On verify-fail / error / timeout / low-confidence the router retries **up the tier chain**
189
+ (fast → heavy → cloud), with retry caps, circuit breakers, and a per-session cost budget. For
190
+ fail-prone classes on the streaming path, a non-streamed **commit window** buffers and verifies
191
+ before the first byte reaches the harness, so a local miss never delivers partial tokens. Every
192
+ fallback is logged as a profile signal.
193
+
194
+ ### Default: local-only / $0 metered cloud
195
+
196
+ **The default config (`configs/example.toml`) routes local-only — anvil holds no cloud API key
197
+ and incurs $0 metered API billing.** The "cloud tier" in the diagram above operates in two modes:
198
+
199
+ - **Keyless (default):** no cloud tier is configured. On a local verify-failure, all candidates
200
+ are exhausted and anvil returns an **`exhaustion_status`** (503 by default, configurable) with
201
+ nothing local committed. A gateway like OpenClaw treats this as a transport failure and re-routes
202
+ the request to its native subscription provider — **flat-rate, not metered.**
203
+ - **Opt-in keyed:** when an operator explicitly adds a cloud tier via
204
+ `configs/example-with-cloud.toml`, verify-failures escalate internally to the cloud tier and
205
+ return **200**. This is metered — every cloud-routed request bills per token against the API key.
206
+
207
+ > **Billing warning:** the opt-in cloud tier routes through a metered API key
208
+ > (`ANTHROPIC_API_KEY`, etc.), not your flat-rate subscription. Per-token charges apply to every
209
+ > request the cloud tier serves. The per-intent `[router].metered_cloud` map controls which
210
+ > work-classes are ever eligible for the cloud tier — nothing is metered unless you explicitly
211
+ > list it there. See [`configs/example-with-cloud.toml`](configs/example-with-cloud.toml) and
212
+ > [ADR-0001](docs/adr/0001-cloud-cost-and-subscription-auth.md) for the full opt-in config and
213
+ > rationale.
214
+
215
+ **`POST /v1/route` — the routing-brain endpoint.** The decision is also queryable standalone,
216
+ without serving the request: send a completions-shaped body, get back
217
+ `{ tier, model, provider, work_class, reason, confidence, session_id }`. The OpenClaw plugin
218
+ uses this to route `deny`-class requests directly to the gateway's native provider — bypassing
219
+ anvil entirely — and to send `allow`-class requests through anvil. Status codes: 200 (decision made, even
220
+ if `cloud`), 400 (malformed), 503 (no suitable tier).
221
+
222
+ ### The quality profile (the moat)
223
+
224
+ ![The quality gate — work proven on a tier stays local ($0 metered); unproven work is deferred to the harness's own opt-in cloud subscription; measured per (model, work-class)](assets/explainer-quality-gate.png)
225
+
226
+ A table keyed `(model, work-class) → {quality_score, sample_n, last_measured, decision}` where
227
+ `decision ∈ {allow, allow-with-verify, deny}`. Hand-seeded for the MVP; later bootstrapped from
228
+ the shadow-eval harness (the planning eval generalized to arbitrary work-classes), right-sized
229
+ from measured usage via `profile`, and continuously calibrated from sampled production traffic
230
+ graded off the hot path. Keyed on a **serve fingerprint** (model + quant + engine + serve flags)
231
+ so a quant/engine swap marks affected rows stale and triggers re-measure.
232
+
233
+ ### Architecture
234
+
235
+ ![anvil-serving routing pipeline — local-first across fast-local / heavy-local ($0 metered); cloud is the harness's own, opt-in](assets/architecture.png)
236
+
237
+ ```
238
+ ┌──────────────────── anvil-serving router ───────────────────┐
239
+ harness → │ front door → resolve intent → route → [verify] → return │ → harness
240
+ (CC/Codex)│ (Anthropic (preset in (filter by │ on fail ↘ │
241
+ │ +OpenAI model field, else constraints, │ fall back to │
242
+ │ dialects) infer work-class) rank by │ next local tier │
243
+ │ quality profile) │
244
+ └───────────────────────────────┬──────────────┴────────────────────┘
245
+
246
+ fast-local :30001 heavy-local :30000 · cloud is opt-in ·
247
+ (multiplexer-managed, free GPU) on a local miss anvil returns 503;
248
+ the harness fails over to its OWN
249
+ cloud subscription (anvil holds no key)
250
+ ```
251
+
252
+ The core stays **protocol-standard** (Anthropic Messages + OpenAI Chat Completions) with **zero
253
+ OpenClaw coupling.** OpenClaw is the near-first-class beachhead because its `before_model_resolve`
254
+ hook (fires once per run — plausibly once per user message; the exact cadence is a documented gap to validate first)
255
+ unlocks client-side per-request intent that the closed harnesses cannot provide
256
+ — but it ships as a **thin, swappable adapter plugin**, not a dependency. Focus, not couple.
257
+ Full design: [`docs/QUALITY-GATED-ROUTER.md`](docs/QUALITY-GATED-ROUTER.md). OpenClaw spec
258
+ (verdict: go-with-caveats, medium risk / API churn):
259
+ [`docs/OPENCLAW-INTEGRATION-SPEC.md`](docs/OPENCLAW-INTEGRATION-SPEC.md).
260
+
261
+ ---
262
+
263
+ ## Install
264
+
265
+ ```bash
266
+ pip install anvil-serving # stdlib-only; no required deps
267
+ anvil-serving --help
268
+ ```
269
+
270
+ > `pip install anvil-serving` works once the package is published to PyPI. Until then (or for
271
+ > development), install from a clone: `pip install -e .`.
272
+
273
+ ### 30-second quickstart
274
+
275
+ ```bash
276
+ # 1) install
277
+ pip install anvil-serving # or: pip install -e . (from a clone)
278
+
279
+ # 2) start the router front door on 127.0.0.1:8000
280
+ anvil-serving serve --config configs/example.toml
281
+
282
+ # 3a) Claude Code: point it at the router (use 127.0.0.1, never localhost)
283
+ export ANTHROPIC_BASE_URL="http://127.0.0.1:8000"
284
+ export ANTHROPIC_MODEL="planning" # an intent preset, sent verbatim in the model field
285
+
286
+ # 3b) or any OpenAI-compatible client: point its base URL at the router
287
+ export OPENAI_API_BASE="http://127.0.0.1:8000/v1"
288
+ # then use a preset token as the model id, e.g. "planning" / "quick-edit" / "chat"
289
+ ```
290
+
291
+ Both the router front door (`anvil-serving serve`) and the serving substrate commands ship today.
292
+
293
+ ### Pointing a harness at the router (config recipes)
294
+
295
+ Use `127.0.0.1` in local URLs — on Windows, `localhost` can stall on an IPv6 lookup. Secrets are
296
+ referenced by **env-var name** only; never paste a key into config.
297
+
298
+ **Claude Code** — intent preset per session slot:
299
+ ```bash
300
+ export ANTHROPIC_BASE_URL="http://127.0.0.1:8000"
301
+ export ANTHROPIC_AUTH_TOKEN="$ANVIL_ROUTER_TOKEN" # -> Authorization header
302
+ export ANTHROPIC_MODEL="planning" # main-loop intent, sent verbatim
303
+ export ANTHROPIC_DEFAULT_HAIKU_MODEL="quick-edit" # background/utility intent
304
+ export CLAUDE_CODE_SUBAGENT_MODEL="review" # subagent-class intent
305
+ export CLAUDE_CODE_ENABLE_GATEWAY_MODEL_DISCOVERY=1 # optional: enumerate router presets
306
+ ```
307
+
308
+ **Aider** — the preset rides in the model string (`openai/` forces compat routing):
309
+ ```bash
310
+ export OPENAI_API_BASE="http://127.0.0.1:8000/v1"
311
+ export OPENAI_API_KEY="$ANVIL_ROUTER_TOKEN"
312
+ aider --model openai/planning --editor-model openai/quick-edit --weak-model openai/chat
313
+ ```
314
+
315
+ **Cline / Continue.dev** — select "OpenAI Compatible", Base URL `http://127.0.0.1:8000/v1`,
316
+ Model (ID) = a preset token. **Codex CLI** — set `base_url` + `model = "planning"` in
317
+ `~/.codex/config.toml`. **Cursor / Amp / Devin are out of scope** — backend-mediated or
318
+ backend-locked, so they can't be repointed at a self-hosted endpoint. Full recipes:
319
+ [`docs/QUALITY-GATED-ROUTER.md`](docs/QUALITY-GATED-ROUTER.md) (Appendix).
320
+
321
+ ---
322
+
323
+ ## Security & exposure
324
+
325
+ The front door binds **`127.0.0.1`** by default and has **no built-in authentication**. That is
326
+ the right default — keep it loopback-only unless you have a reason not to.
327
+
328
+ If you bind it publicly (`--host 0.0.0.0`) or otherwise put it on a reachable address, **you are
329
+ responsible for authentication and network controls** (a reverse proxy with auth, firewall rules,
330
+ a private network). An open endpoint lets **any** caller drive routing and, if you have configured an opt-in metered
331
+ cloud tier, **consume your cloud credentials** — so an unauthenticated public bind is both a
332
+ quality and a billing exposure. (The default local-only config carries no cloud key, but the risk
333
+ still applies if you later add one.) Cloud credentials are referenced by env-var name and redacted
334
+ from logs; see [`SECURITY.md`](SECURITY.md) for the full threat model and how to report a
335
+ vulnerability.
336
+
337
+ ---
338
+
339
+ ## The serving substrate (the foundation the router builds on)
340
+
341
+ These commands ship today and are how you right-size, stand up, and validate the local tiers the
342
+ router routes across. Stdlib-only; the hard-won gotchas (below) are baked in as defaults.
343
+
344
+ ```
345
+ anvil-serving profile # parse ~/.claude logs -> usage baseline (context/gen/concurrency percentiles, role split)
346
+ anvil-serving models sync # scan your HF caches -> card catalog + INDEX (GGUF vs safetensors, ctx, quant, license, thinking)
347
+ anvil-serving deploy # render a tuned SGLang docker-compose for YOUR gpu + model
348
+ anvil-serving preflight # correctness gate against any OpenAI-compatible endpoint (sm_120-aware)
349
+ anvil-serving benchmark # replay YOUR measured request distribution (TTFT, throughput, prefix-cache hit)
350
+ anvil-serving multiplexer # single-resident model swap on one GPU (multi-engine: SGLang + vLLM)
351
+ ```
352
+
353
+ ### Substrate quickstart
354
+
355
+ ```bash
356
+ # 1) understand your usage (-> the routing profile is right-sized from this)
357
+ anvil-serving profile --out-dir .
358
+
359
+ # 2) see what models you have and which a server can actually load
360
+ anvil-serving models sync --out ./model-library
361
+ # -> ./model-library/INDEX.md (the (yes/no) "SGLang-loadable" column is the one that saves you)
362
+
363
+ # 3) generate a deployment for a local model on GPU 1
364
+ anvil-serving deploy --model /path/to/model-dir --gpu 1 --context 131072 --served-name local-specialist
365
+ docker compose -f docker-compose.yml up -d
366
+
367
+ # 4) validate + benchmark the tier (use 127.0.0.1, never localhost, on Windows)
368
+ anvil-serving preflight --base-url http://127.0.0.1:30000/v1 --model local-specialist --needle-ctx 60000
369
+ anvil-serving benchmark --base-url http://127.0.0.1:30000/v1 --model local-specialist --burst 20
370
+ ```
371
+
372
+ `preflight` is the router's **correctness gate** in microcosm — it's the same verify-before-trust
373
+ philosophy applied to a single endpoint. `multiplexer` is the **Backend seam** that already
374
+ exists: it manages the single-resident fast/heavy swap pair on one GPU behind one interface.
375
+ (`score` and `cache-prune` are additional substrate helpers.)
376
+
377
+ ### What's baked in (the knowledge, not just code)
378
+
379
+ - **Load-time OOM fix:** loading a model without mmap pulls it fully into RAM — on WSL2 the
380
+ default ~50%-of-host cap OOM-kills the scheduler (SIGKILL/-9). Raise the WSL VM memory; the
381
+ deploy uses `--weight-loader-disable-mmap` (fast sequential reads vs catastrophic
382
+ mmap-over-virtiofs).
383
+ - **GGUF != SGLang.** GGUF is llama.cpp-only; SGLang/vLLM need safetensors. The catalog flags
384
+ this up front.
385
+ - **Thinking-by-default models** (Qwen3.5 etc.) return *empty* content with a small `max_tokens`
386
+ — they spend the budget reasoning. Disable per request with
387
+ `chat_template_kwargs:{enable_thinking:false}`, or give >= 4096 tokens. See
388
+ [`docs/MODEL-SETTINGS-EXAMPLE.md`](docs/MODEL-SETTINGS-EXAMPLE.md).
389
+ - **GPU pinning** (`device_ids`) so one card serves while another stays free (gaming / second job).
390
+ - **Blackwell sm_120 caveats:** some FP8 MoE paths hang on load; AWQ/compressed-tensors via
391
+ Marlin works; NVFP4 large-prefill is still rough. Pre-flight before you trust throughput.
392
+
393
+ ### Worked example — `fakoli-dark`
394
+
395
+ [`examples/fakoli-dark/`](examples/fakoli-dark/) is a real two-tier instance and the bake-off
396
+ context for the router:
397
+
398
+ - **heavy** `:30000` — `qwen3-coder-30b` on **SGLang**, RTX PRO 6000 96GB.
399
+ - **fast** `:30001` — `gpt-oss-20b` on **vLLM**, RTX 5090 32GB.
400
+ - **gateway** — **Fakoli Mini** runs **OpenClaw** (already installed), the beachhead harness; the
401
+ router sits between it and the serves.
402
+
403
+ It carries the actual compose files, the `.wslconfig` fix snapshot, the model index, the setup
404
+ story ([`SETUP-STORY.md`](examples/fakoli-dark/SETUP-STORY.md)), the decisions log, and the
405
+ [`BAKE-OFF-RUNBOOK.md`](examples/fakoli-dark/BAKE-OFF-RUNBOOK.md) (where local-planning failover
406
+ now lives, after the router promotion).
407
+
408
+ ---
409
+
410
+ ## Status
411
+
412
+ **v0.4.0 is shipped.** The `harness-router` PRD is **complete — all 18 tasks built
413
+ (milestones M0–M3), 378 tests green**. v0.4.0 ships advise-and-defer (local-only default,
414
+ opt-in metered cloud) and the launch-hardening pass on top of the v0.3.0 harness-router. Both
415
+ the router front door (`anvil-serving serve`) and the serving substrate (profile / models sync /
416
+ deploy / preflight / benchmark / multiplexer) ship. See the [CHANGELOG](CHANGELOG.md) for the
417
+ full release notes.
418
+
419
+ What shipped, by milestone:
420
+
421
+ - **M0 — front door + config:** Anthropic + OpenAI dialects, SSE streaming, pass-through to one
422
+ backend, tier-topology config schema. Drop-in for Claude Code.
423
+ - **M1 — intent + policy:** Tier-0 classifier (the floor) **and** preset parsing from the `model`
424
+ field, residency-aware routing policy over the multiplexer, `/v1/models` preset discovery,
425
+ cloud-tier credentials on the Backend seam.
426
+ - **M2 — the wedge:** cheap inline structural verify + verify-gated fallback (streaming commit
427
+ window; cloud escalation is the opt-in keyed mode — the keyless default returns an
428
+ exhaustion-503 for gateway handoff), transparent responses + decision log, the typed extension
429
+ seams, the `anvil-serving serve --config ...` CLI verb, and the OpenClaw reference adapter plugin.
430
+ - **M3 — the moat:** quality-profile bootstrap from the generalized shadow-eval, async opt-in
431
+ calibration + serve-fingerprint staleness, validation on routed traffic, and the per-work-class
432
+ promotion decision (planning/critic stay cloud-default, failover-only).
433
+
434
+ ### Known limitations
435
+
436
+ - **OpenClaw live validation is manual.** Validating against a real OpenClaw install (firing
437
+ cadence + outbound wire `model` form) is a human step on the gateway box —
438
+ [`examples/openclaw/README.md`](examples/openclaw/README.md). The committed
439
+ `hook-fire-log.jsonl` is a representative fixture, not a live capture.
440
+ - **Most promotion verdicts are seed/expected.** The shipped per-work-class promotion decisions
441
+ are hand-seeded, pending real-traffic calibration; only `planning` rests on hard eval data
442
+ (blind-judge scores: frontier 24.75/25, fast 16.0, heavy 13.25 — full data in the companion notes repo).
443
+ - **The T017 traffic fixture is synthetic** — traffic-metrics behavior is exercised against a
444
+ synthetic fixture, not yet against real routed production traffic.
445
+
446
+ ### Reuse map
447
+
448
+ | Capability | Module | Status |
449
+ |---|---|---|
450
+ | Right-size from real usage | `profile` | exists |
451
+ | Per-model serving facts / sane defaults | `models sync`, `analyze` | exists / designed |
452
+ | Bring up + on-demand model swap on one GPU | `multiplexer` | exists |
453
+ | Correctness gate | `preflight` | exists |
454
+ | Throughput / capacity measurement | `benchmark` | exists |
455
+ | Per-work-class quality measurement | shadow-eval harness | built + generalized |
456
+ | Front door + intent-resolve + route + verify + fallback | `router` module | **shipped** |
457
+
458
+ ### Docs
459
+
460
+ - **Product design:** [`docs/QUALITY-GATED-ROUTER.md`](docs/QUALITY-GATED-ROUTER.md) (full router
461
+ design — intent presets, tier ladder, verify-fallback, quality profile, plugin seams, appendix
462
+ config recipes)
463
+ - **Cloud cost & auth:** [ADR-0001](docs/adr/0001-cloud-cost-and-subscription-auth.md)
464
+ (why anvil does not relay cloud by default) ·
465
+ [`docs/PLAN-advise-and-defer.md`](docs/PLAN-advise-and-defer.md) (implementation plan)
466
+ - **OpenClaw integration:** [`docs/OPENCLAW-INTEGRATION-SPEC.md`](docs/OPENCLAW-INTEGRATION-SPEC.md)
467
+ (source-verified buildable spec, go-with-caveats verdict) ·
468
+ [`docs/OPENCLAW-LIVE-VALIDATION.md`](docs/OPENCLAW-LIVE-VALIDATION.md) (live-validation runbook)
469
+ - **Serving reference:** [`docs/MODEL-SETTINGS-EXAMPLE.md`](docs/MODEL-SETTINGS-EXAMPLE.md)
470
+ (thinking-by-default model config and sampling) ·
471
+ [`docs/SERVES-AND-EVAL.md`](docs/SERVES-AND-EVAL.md) (serve lifecycle + eval entry point)
472
+ - **Architecture decisions:** [`docs/adr/`](docs/adr/) — one ADR per significant design choice
473
+
474
+ MIT licensed.