opencode-llmstack 0.6.0__py3-none-any.whl

llmstack/app.py ADDED
@@ -0,0 +1,644 @@
+ """
+ FastAPI auto-router proxy in front of llama-swap (and AWS Bedrock).
+
+ Public endpoint: ``http://127.0.0.1:10101``
+ Upstream:        ``http://127.0.0.1:10102`` (llama-swap)
+
+ Behaviour:
+
+ * ``GET /v1/models`` -> proxied verbatim, plus an ``auto`` entry and
+   any hosted (e.g. bedrock) tiers declared in ``models.ini``.
+ * ``GET /models.ini`` -> raw text of the router's ``models.ini``.
+   Thin clients (``llmstack install --external``) fetch this on every
+   install and use it to regenerate ``opencode.json`` without keeping
+   a local copy of the file. Returning a 200 + valid INI doubles as
+   the canonical health check for external clients -- there is no
+   separate ``/health`` route on the router (the catch-all proxies
+   any such request through to llama-swap's own ``/health`` for
+   backwards-compat curl users).
+ * ``POST /v1/chat/completions``, ``POST /v1/completions``
+   - if request body ``model == "auto"`` (or unset), classify the request
+     and rewrite ``model`` -> one of: ``code-fast``, ``code-smart``,
+     ``code-ultra`` (when wired), ``plan``, ``plan-uncensored``.
+   - otherwise pass through unchanged.
+   - tiers with ``backend = bedrock`` in ``models.ini`` are dispatched
+     to AWS Bedrock via :mod:`llmstack.backends.bedrock` instead of
+     being proxied to llama-swap.
+ * Streaming (SSE) responses are forwarded chunk-by-chunk.
+ * Anything else is reverse-proxied.
+
+ Routing philosophy: **start at the top of the fidelity ladder and
+ step DOWN as context grows**. This inverts the classic
+ "escalate-on-size" pattern, and it's deliberate:
+
+ * Top-tier hosted models (Claude Opus/Sonnet on Bedrock) are
+   fastest *and* most accurate on short prompts, but their
+   per-request latency and $cost scale with input tokens, and
+   long-context performance degrades faster than headline
+   benchmarks suggest.
+ * The local heavy coder (``code-smart``, Qwen3-Coder 80B-A3B) has
+   a 64k window -- it does its best work in the middle of that
+   range, and saturates near the top.
+ * The always-resident fast coder (``code-fast``, Qwen2.5-Coder 3B
+   with YaRN x4) has a **128k** window, costs nothing, and benefits
+   from more context: small models lean on retrieval / explicit
+   examples to disambiguate, where bigger models would just guess
+   from priors.
+
+ So as the conversation accumulates context, we step *down*: ultra
+ -> smart -> fast. Triggers and the plan track sit alongside this
+ ladder.
+
+ Routing decision tree (first match wins):
+
+ 1. Explicit "uncensored" trigger in the last user message
+    (``[nofilter]``, ``[uncensored]``, ``[heretic]``, or a line
+    starting with ``uncensored:`` / ``nofilter:``) -> plan-uncensored
+ 2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
+    ``ultra:``, ``opus:``) AND ultra tier configured -> code-ultra
+ 3. PLAN signal words AND no code-block / agent verbs / tools
+    (design discussion, no implementation pending) -> plan
+ 4. Estimated input tokens <= HIGH_FIDELITY_CEILING
+    ("reasonable context still being built") -> code-ultra
+    (else code-smart)
+ 5. Estimated input tokens <= MID_FIDELITY_CEILING -> code-smart
+ 6. Otherwise (long context, top-tier becomes expensive/slow, fast
+    tier's 128k window is the best fit and it's free) -> code-fast
+    (floored at code-smart when ``tools[]`` is set or n_turns >=
+    MULTI_TURN_THRESHOLD, since 3B models tool-call unreliably)
+
+ Ultra-tier routing is gated on availability: rule (2) and the
+ "high-fidelity" rung of (4) first check that the tier is loaded
+ from ``models.ini`` (i.e. present in :data:`TIER_BY_ALIAS`). When
+ it isn't, the router silently falls back to ``code-smart`` --
+ otherwise rewriting ``model`` to a tier name that isn't wired up
+ surfaces as a 404 from llama-swap or a tier-not-found error from
+ the bedrock dispatcher, which is just a confusing way to fail.
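+
+ For illustration, two hypothetical request bodies (token counts use
+ the chars/4 estimate)::
+
+     {"model": "auto",
+      "messages": [{"role": "user", "content": "[opus] why the deadlock?"}]}
+         -> rule (2): code-ultra if the tier is wired, else code-smart
+
+     {"model": "auto", "tools": [...], "messages": [...]}   # ~200k chars
+         -> ~50k tokens puts this past MID_FIDELITY_CEILING (rule 6),
+            but the tools[] floor keeps it on code-smart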
+
+ Run with::
+
+     python -m llmstack.app
+     # or
+     uvicorn llmstack.app:app --host 127.0.0.1 --port 10101
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ import os
+ import re
+ from contextlib import asynccontextmanager
+ from typing import Any
+
+ import httpx
+ from fastapi import FastAPI, Request, Response
+ from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse
+
+ from llmstack.paths import models_ini_path
+ from llmstack.tiers import Tier, load_tiers
+
+ UPSTREAM = os.getenv("LLAMA_SWAP_URL", "http://127.0.0.1:10102").rstrip("/")
+
+ FAST_MODEL = os.getenv("ROUTER_FAST_MODEL", "code-fast")
+ AGENT_MODEL = os.getenv("ROUTER_AGENT_MODEL", "code-smart")
+ ULTRA_MODEL = os.getenv("ROUTER_ULTRA_MODEL", "code-ultra")
+ PLAN_MODEL = os.getenv("ROUTER_PLAN_MODEL", "plan")
+ UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
+
+ # Step-DOWN ladder (see module docstring). Both ceilings are *upper
+ # bounds* of a tier's sweet-spot range, expressed in estimated input
+ # tokens (chars/4):
+ #
+ #   est <= HIGH_FIDELITY_CEILING -> top tier (ultra, else smart)
+ #   est <= MID_FIDELITY_CEILING  -> code-smart
+ #   est >  MID_FIDELITY_CEILING  -> code-fast (or smart with tools/loop)
+ #
+ # Defaults:
+ #   HIGH 8000  - "reasonable context built": a couple of files loaded,
+ #                instructions clear, top-tier still cheap+fast here.
+ #   MID 32000  - half of code-smart's 65k window; past this, hosted
+ #                top-tier latency/$cost balloons and code-smart starts
+ #                getting cramped, while code-fast's 128k YaRN window
+ #                still has comfortable headroom.
+ HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "8000"))
+ MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
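+ # Worked example of the chars/4 estimate (illustrative numbers): a
+ # ~150,000-char conversation counts as ~37,500 tokens, clearing
+ # MID_FIDELITY_CEILING (32,000) onto the long-context rung, while a
+ # ~20,000-char one (~5,000 tokens) still rides the top rung.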
+ # Floor the long-context rung at code-smart whenever a tool-call
+ # protocol is in play, or once the conversation reaches this many
+ # turns -- 3B models tool-call unreliably regardless of how big
+ # their context window is.
+ MULTI_TURN_THRESHOLD = int(os.getenv("ROUTER_MULTI_TURN", "6"))
+ AUTO_ALIASES = {"auto", "", None}
+
+ UNCENSORED_TRIGGERS = re.compile(
+     r"(\[(uncensored|nofilter|no-?filter|heretic)\]"
+     r"|^[ \t]*(uncensored|nofilter|no-?filter)\s*:)",
+     re.IGNORECASE | re.MULTILINE,
+ )
+
+ ULTRA_TRIGGERS = re.compile(
+     r"(\[(ultra|opus)\]|^[ \t]*(ultra|opus)\s*:)",
+     re.IGNORECASE | re.MULTILINE,
+ )
+
+ PLAN_SIGNALS = re.compile(
+     r"\b(plan|design|architect(ure)?|approach|trade-?off|"
+     r"should\s+we|how\s+would\s+(you|we)|what\s+would\s+you|"
+     r"explain\s+why|reason\s+about|think\s+(through|step|hard|carefully)|"
+     r"compare\s+(options|approaches)|review\s+(the|this|my)\s+"
+     r"(architecture|design|approach|plan)|brainstorm|outline|"
+     r"summari[sz]e|root\s*cause|migrate|port\s+to)\b",
+     re.IGNORECASE,
+ )
+
+ AGENT_SIGNALS = re.compile(
+     r"\b(implement|fix\s+(this|the|a|my)?\s*(bug|issue|error|test)|"
+     r"write\s+(a|the|some)?\s*(function|class|test|script|module|method)|"
+     r"add\s+(a|the)?\s*(function|class|method|test|file|endpoint)|"
+     r"create\s+(a|the)?\s*(function|class|file|component|endpoint)|"
+     r"refactor|edit|patch|generate\s+code|debug|trace|"
+     r"run\s+tests?|build\s+(it|this)|compile)\b",
+     re.IGNORECASE,
+ )
+
+ CODE_BLOCK = re.compile(r"```|`[^`\n]{30,}`")
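+ # CODE_BLOCK treats a fenced ``` block, or a 30+ char inline backtick
+ # span, as evidence that code is already in play.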
+
+ logging.basicConfig(
+     level=os.getenv("LOG_LEVEL", "INFO"),
+     format="%(asctime)s %(levelname)s router %(message)s",
+ )
+ log = logging.getLogger("router")
+
+ @asynccontextmanager
+ async def _lifespan(app: FastAPI):
+     global client
+     timeout = httpx.Timeout(connect=10.0, read=None, write=None, pool=None)
+     client = httpx.AsyncClient(base_url=UPSTREAM, timeout=timeout)
+     bedrock_tiers = sorted(t.name for t in TIERS.values() if t.is_bedrock)
+     log.info(
+         "router up upstream=%s ladder=[ultra<=%d -> agent<=%d -> fast] "
+         "fast=%s agent=%s ultra=%s plan=%s uncensored=%s bedrock=%s",
+         UPSTREAM, HIGH_FIDELITY_CEILING, MID_FIDELITY_CEILING,
+         FAST_MODEL, AGENT_MODEL,
+         f"{ULTRA_MODEL} (active)" if _ultra_available()
+         else f"{ULTRA_MODEL} (unwired -- high-fidelity rung falls back to {AGENT_MODEL})",
+         PLAN_MODEL, UNCENSORED_MODEL,
+         ",".join(bedrock_tiers) or "(none)",
+     )
+     yield
+     if client:
+         await client.aclose()
+
+
+ app = FastAPI(title="llmstack-auto-router", version="3.0", lifespan=_lifespan)
+ client: httpx.AsyncClient | None = None
+ TIERS: dict[str, Tier] = {}
+ TIER_BY_ALIAS: dict[str, Tier] = {}
+
+
+ def _index_tiers() -> None:
+     """Load ``models.ini`` and index by name + alias for fast lookup."""
+     global TIERS, TIER_BY_ALIAS
+     try:
+         TIERS = load_tiers()
+     except SystemExit as exc:
+         # No models.ini -- run as a pure pass-through proxy and let
+         # downstream errors describe the problem.
+         log.warning("models.ini not loaded (%s); bedrock dispatch disabled", exc)
+         TIERS = {}
+     TIER_BY_ALIAS = {}
+     for tier in TIERS.values():
+         TIER_BY_ALIAS[tier.name] = tier
+         for alias in tier.aliases:
+             TIER_BY_ALIAS.setdefault(alias, tier)
+
+
+ _index_tiers()
+
+
+ # ----------------------------- routing logic -------------------------------
+
+ def _iter_message_text(messages: list[dict[str, Any]] | None):
+     if not messages:
+         return
+     for m in messages:
+         content = m.get("content")
+         if isinstance(content, str):
+             yield content
+         elif isinstance(content, list):
+             for part in content:
+                 if isinstance(part, dict):
+                     t = part.get("text")
+                     if isinstance(t, str):
+                         yield t
+
+
+ def _last_user_text(messages: list[dict[str, Any]] | None) -> str:
+     if not messages:
+         return ""
+     for m in reversed(messages):
+         if m.get("role") != "user":
+             continue
+         content = m.get("content")
+         if isinstance(content, str):
+             return content
+         if isinstance(content, list):
+             return "\n".join(
+                 p.get("text", "")
+                 for p in content
+                 if isinstance(p, dict) and isinstance(p.get("text"), str)
+             )
+     return ""
+
+
+ def _estimate_tokens(messages: list[dict[str, Any]] | None, prompt: str | None) -> int:
+     chars = len(prompt) if prompt else 0
+     for t in _iter_message_text(messages):
+         chars += len(t)
+     return chars // 4
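+ # e.g. _estimate_tokens(None, "x" * 8000) == 2000. Crude, but it only
+ # needs to be right about which ceiling a request clears.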
+
+
+ def _matches(pattern: re.Pattern[str], messages: list[dict[str, Any]] | None, prompt: str | None) -> bool:
+     if prompt and pattern.search(prompt):
+         return True
+     return any(pattern.search(t) for t in _iter_message_text(messages))
+
+
+ def _ultra_available() -> bool:
+     """True iff the ultra tier is loaded from ``models.ini``.
+
+     Every auto-route to :data:`ULTRA_MODEL` is gated on this. Without
+     the guard, an explicit ``[ultra]`` trigger or the high-fidelity
+     rung of the step-down ladder on a vanilla install (no
+     ``code-ultra`` section) would rewrite ``model`` to a tier that
+     doesn't exist downstream -- llama-swap returns 404, the bedrock
+     dispatcher raises -- so the request would fail even though
+     falling back to ``code-smart`` would have served it just fine.
+     The check is a cheap dict lookup so we run it on every classify
+     invocation; that also means re-indexing tiers at runtime (e.g.
+     SIGHUP -> ``_index_tiers()``) flips routing behaviour live
+     without restarting the router.
+     """
+     return ULTRA_MODEL in TIER_BY_ALIAS
+
+
+ def classify(body: dict[str, Any]) -> tuple[str, str]:
+     """Return (chosen_model, reason).
+
+     Step-DOWN ladder: top fidelity for short context, fall to mid for
+     medium, drop to fast for long. See module docstring for rationale.
+     """
+     messages = body.get("messages") if isinstance(body.get("messages"), list) else None
+     prompt = body.get("prompt") if isinstance(body.get("prompt"), str) else None
+
+     last_user = _last_user_text(messages)
+     sys_prompts = [
+         m.get("content", "")
+         for m in (messages or [])
+         if m.get("role") == "system" and isinstance(m.get("content"), str)
+     ]
+     if any(UNCENSORED_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
+         return UNCENSORED_MODEL, "uncensored-trigger"
+
+     if any(ULTRA_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
+         if _ultra_available():
+             return ULTRA_MODEL, "ultra-trigger"
+         # Explicit user opt-in but the tier isn't wired up. Don't 404 --
+         # serve the request from the heaviest tier we *do* have and let
+         # the user notice in logs that their trigger was a no-op.
+         log.warning("ultra-trigger ignored: %s not in models.ini; falling back to %s",
+                     ULTRA_MODEL, AGENT_MODEL)
+         return AGENT_MODEL, f"ultra-trigger->agent ({ULTRA_MODEL} unavailable)"
+
+     has_tools = bool(body.get("tools"))
+     n_turns = len(messages) if messages else 0
+     has_code_signal = (
+         _matches(CODE_BLOCK, messages, prompt)
+         or _matches(AGENT_SIGNALS, messages, prompt)
+     )
+
+     # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
+     # chat-tuned model meant for design / "should we" discussions. Only
+     # take it when nothing about the request says "I'm about to write
+     # code" (no triple-backticks, no agent verbs, no tool calls).
+     if (
+         not has_tools
+         and not has_code_signal
+         and _matches(PLAN_SIGNALS, messages, prompt)
+     ):
+         return PLAN_MODEL, "plan-signal"
+
+     est = _estimate_tokens(messages, prompt)
+
+     # Rung 1: short context -- start at the top.
+     if est <= HIGH_FIDELITY_CEILING:
+         if _ultra_available():
+             return ULTRA_MODEL, f"high-fidelity tokens~{est}<={HIGH_FIDELITY_CEILING}"
+         return AGENT_MODEL, (
+             f"high-fidelity tokens~{est}<={HIGH_FIDELITY_CEILING} "
+             f"({ULTRA_MODEL} unavailable)"
+         )
+
+     # Rung 2: mid context -- local heavy coder is at its sweet spot.
+     if est <= MID_FIDELITY_CEILING:
+         return AGENT_MODEL, f"mid-fidelity tokens~{est}<={MID_FIDELITY_CEILING}"
+
+     # Rung 3: long context -- step down to fast (128k YaRN, free,
+     # always-resident). Floor at smart when tools/agent loop is in
+     # play; the 3B coder doesn't tool-call reliably.
+     if has_tools or n_turns >= MULTI_TURN_THRESHOLD:
+         why = "tools" if has_tools else f"turns={n_turns}"
+         return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} ({why} floor)"
+     return FAST_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING}"
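+
+
+ # Smoke-test sketch for the ladder (hypothetical bodies):
+ #
+ #   classify({"messages": [{"role": "user",
+ #             "content": "how would we migrate to postgres?"}]})
+ #   -> (PLAN_MODEL, "plan-signal")   # design words, no code signal
+ #
+ #   classify({"prompt": "x" * 200_000})
+ #   -> (FAST_MODEL, ...)             # ~50k tokens, rung 3, no floor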
+
+
+ # ----------------------------- proxy plumbing ------------------------------
+
+ HOP_BY_HOP = {
+     "connection", "keep-alive", "proxy-authenticate", "proxy-authorization",
+     "te", "trailers", "transfer-encoding", "upgrade", "host", "content-length",
+ }
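+ # host and content-length aren't hop-by-hop in the RFC sense; they're
+ # dropped here because the router may rewrite the JSON body (httpx
+ # recomputes the length) and the upstream should see its own Host.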
+
+
+ def _filter_request_headers(req: Request) -> dict[str, str]:
+     return {k: v for k, v in req.headers.items() if k.lower() not in HOP_BY_HOP}
+
+
+ def _filter_response_headers(resp: httpx.Response) -> dict[str, str]:
+     return {k: v for k, v in resp.headers.items() if k.lower() not in HOP_BY_HOP}
+
+
+ async def _stream_proxy(method: str, path: str, body: bytes, headers: dict[str, str]) -> StreamingResponse:
+     assert client is not None
+     upstream_req = client.build_request(method, path, content=body, headers=headers)
+     upstream = await client.send(upstream_req, stream=True)
+
+     async def gen():
+         try:
+             # aiter_raw() forwards bytes undecoded, so compressed
+             # upstream bodies pass through with their headers intact.
+             async for chunk in upstream.aiter_raw():
+                 yield chunk
+         finally:
+             await upstream.aclose()
+
+     return StreamingResponse(
+         gen(),
+         status_code=upstream.status_code,
+         headers=_filter_response_headers(upstream),
+         media_type=upstream.headers.get("content-type"),
+     )
+
+
+ # --------------------------------- routes ----------------------------------
+
+ @app.get("/models.ini")
+ async def serve_models_ini() -> Response:
+     """Return the router's live ``models.ini`` as text.
+
+     Read fresh on every request rather than from the cached
+     :data:`TIERS` snapshot -- a thin client running
+     ``llmstack install --external`` against this router should see
+     whatever the operator has most recently written to disk, even if
+     the router hasn't been restarted to pick up a re-parse. (Stale
+     ``TIERS`` only affects in-flight routing decisions; the file on
+     disk is the source of truth for downstream config generation.)
+
+     Returning the file is also how external clients health-check the
+     router: a 200 with a non-empty INI body proves both that the
+     router process is up and that the operator has a usable config
+     here -- which is exactly what the client needs to render its
+     own ``opencode.json``. There is no separate ``/health`` route.
+     """
+     path = models_ini_path()
+     if not path.is_file():
+         # Router is up but the operator hasn't pointed it at a
+         # models.ini yet (or the file went missing). Fail loud so the
+         # thin-client install surfaces a real error message instead of
+         # rendering an empty opencode.json.
+         return PlainTextResponse(
+             f"models.ini not found at {path} on the router host.\n"
+             "Set $LLMSTACK_MODELS_INI on the router or run "
+             "`llmstack install` there to seed the default.\n",
+             status_code=404,
+             media_type="text/plain",
+         )
+     try:
+         text = path.read_text(encoding="utf-8")
+     except OSError as e:
+         log.warning("failed to read %s for /models.ini: %s", path, e)
+         return PlainTextResponse(
+             f"failed to read {path}: {e}\n",
+             status_code=500,
+             media_type="text/plain",
+         )
+     return PlainTextResponse(text, media_type="text/plain; charset=utf-8")
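+ # Thin-client sketch -- one fetch doubles as health check and config
+ # pull:
+ #
+ #   curl -fsS http://127.0.0.1:10101/models.ini -o models.ini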
+
+
+ @app.get("/v1/models")
+ async def list_models() -> JSONResponse:
+     assert client is not None
+     try:
+         r = await client.get("/v1/models")
+         data = r.json()
+         status = r.status_code
+     except Exception as exc:
+         log.warning("upstream /v1/models failed: %s", exc)
+         data = {"object": "list", "data": []}
+         status = 200
+
+     if not isinstance(data, dict) or not isinstance(data.get("data"), list):
+         data = {"object": "list", "data": []}
+
+     # Hosted (bedrock) tiers aren't known to llama-swap; fold them in.
+     seen = {entry.get("id") for entry in data["data"] if isinstance(entry, dict)}
+     from llmstack.backends import bedrock as bedrock_backend
+     for tier in TIERS.values():
+         if not tier.is_bedrock:
+             continue
+         if tier.name in seen:
+             continue
+         data["data"].append(bedrock_backend.model_descriptor(tier))
+         seen.add(tier.name)
+         for alias in tier.aliases:
+             if alias not in seen:
+                 desc = bedrock_backend.model_descriptor(tier)
+                 desc["id"] = alias
+                 desc["name"] = f"{tier.description} (alias of {tier.name})"
+                 data["data"].append(desc)
+                 seen.add(alias)
+
+     if _ultra_available():
+         top_blurb = (
+             f"Step-down ladder (top->bottom as context grows): "
+             f"'{ULTRA_MODEL}' up to ~{HIGH_FIDELITY_CEILING} tokens, "
+             f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING}, "
+             f"'{FAST_MODEL}' beyond that."
+         )
+         name = "Auto (step-down router: ultra/agent/fast + plan/uncensored)"
+     else:
+         top_blurb = (
+             f"Step-down ladder (top->bottom as context grows): "
+             f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING} tokens, "
+             f"'{FAST_MODEL}' beyond that."
+         )
+         name = "Auto (step-down router: agent/fast + plan/uncensored)"
+     data["data"].insert(0, {
+         "id": "auto",
+         "object": "model",
+         "created": 0,
+         "owned_by": "router",
+         "name": name,
+         "description": (
+             f"{top_blurb} "
+             f"'{PLAN_MODEL}' for design/planning (orthogonal to ladder); "
+             f"'{UNCENSORED_MODEL}' for explicit [nofilter] triggers; "
+             f"'[ultra]'/'[opus]' triggers force '{ULTRA_MODEL}' regardless of size."
+         ),
+         "tier": "auto",
+     })
+     return JSONResponse(content=data, status_code=status)
+
+
+ def _resolve_tier(name: str | None) -> Tier | None:
+     if not name:
+         return None
+     return TIER_BY_ALIAS.get(name)
+
+
+ # Map the short sampler keys used in models.ini to the OpenAI-compatible
+ # request-body fields that downstream backends understand. llama.cpp
+ # accepts `top_k`, `min_p`, and `repetition_penalty` as extensions; the
+ # Bedrock backend ignores fields it can't translate to Converse.
+ _SAMPLER_BODY_FIELD = {
+     "temp": "temperature",
+     "top_p": "top_p",
+     "top_k": "top_k",
+     "min_p": "min_p",
+     "rep_pen": "repetition_penalty",
+ }
+
+
+ def _inject_sampler(body: dict[str, Any], tier: Tier) -> bool:
+     """Layer this tier's `sampler = ...` defaults onto the request body.
+
+     **Bedrock-only.** For gguf tiers, sampling defaults are baked into
+     the llama-server startup command line by
+     :mod:`llmstack.generators.llama_swap`, so llama-server already
+     applies them for any request whose body lacks an explicit value.
+     Bedrock has no equivalent server-side mechanism -- the only place to
+     apply per-tier sampling for hosted models is the outbound request
+     body, which is what this function does.
+
+     Caller-supplied values always win -- if the client already set
+     `temperature`, the tier default does not overwrite it. This makes
+     models.ini the source of truth for "what sampler does each tier
+     use", while still letting power users override per call.
+
+     Returns ``True`` iff anything was added (the caller re-encodes the
+     raw body bytes only when the dict actually changed).
+
+     A Bedrock tier with an empty sampler dict (no `sampler =` line, or
+     all keys stripped) is a no-op -- the canonical pattern for Bedrock
+     families like Claude Opus 4.7 that reject every sampler param.
+     """
+     if not tier.is_bedrock or not tier.sampler:
+         return False
+     mutated = False
+     for src, dst in _SAMPLER_BODY_FIELD.items():
+         if src in tier.sampler and dst not in body:
+             body[dst] = tier.sampler[src]
+             mutated = True
+     return mutated
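+ # For illustration (hypothetical tier config): with a bedrock tier whose
+ # sampler is {"temp": 0.3, "top_k": 40}, a body lacking those fields
+ # gains temperature=0.3 and top_k=40; a body that already set
+ # "temperature" keeps the caller's value.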
+
+
+ async def _handle_completion(req: Request, path: str) -> Response:
+     raw = await req.body()
+     headers = _filter_request_headers(req)
+
+     try:
+         body = json.loads(raw) if raw else {}
+     except json.JSONDecodeError:
+         return await _stream_proxy(req.method, path, raw, headers)
+     if not isinstance(body, dict):
+         # Valid JSON but not an object -- nothing to classify; pass
+         # it through untouched.
+         return await _stream_proxy(req.method, path, raw, headers)
+
+     mutated = False
+     requested = body.get("model")
+     if requested in AUTO_ALIASES:
+         chosen, reason = classify(body)
+         body["model"] = chosen
+         log.info("auto -> %s (%s) [path=%s]", chosen, reason, path)
+         mutated = True
+
+     chosen_name = body.get("model")
+     tier = _resolve_tier(chosen_name)
+     if tier is not None and _inject_sampler(body, tier):
+         mutated = True
+
+     if mutated:
+         raw = json.dumps(body).encode()
+
+     if tier is not None and tier.is_bedrock:
+         from llmstack.backends import bedrock as bedrock_backend
+         return await bedrock_backend.dispatch(req, tier, body)
+
+     return await _stream_proxy(req.method, path, raw, headers)
+
+
+ @app.post("/v1/chat/completions")
+ async def chat_completions(req: Request) -> Response:
+     return await _handle_completion(req, "/v1/chat/completions")
+
+
+ @app.post("/v1/completions")
+ async def completions(req: Request) -> Response:
+     return await _handle_completion(req, "/v1/completions")
+
+
+ # --------------------------- catch-all reverse proxy -----------------------
+
+ @app.api_route(
+     "/{path:path}",
+     methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"],
+ )
+ async def passthrough(path: str, req: Request) -> Response:
+     raw = await req.body()
+     headers = _filter_request_headers(req)
+     return await _stream_proxy(req.method, "/" + path, raw, headers)
+
+
+ def main() -> None:
+     """Run the router with uvicorn. Used by ``python -m llmstack.app``."""
+     import asyncio
+
+     import uvicorn
+
+     log_level = os.getenv("LOG_LEVEL", "info").lower()
+     host = os.getenv("ROUTER_HOST", "127.0.0.1")
+     port = int(os.getenv("ROUTER_PORT", "10101"))
+
+     cfg = uvicorn.Config(app, host=host, port=port, log_level=log_level)
+     asyncio.run(uvicorn.Server(cfg).serve())
+
+
+ if __name__ == "__main__":
+     main()
llmstack/backends/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """Pluggable request backends.
+
+ The router (:mod:`llmstack.app`) classifies a request and picks a tier
+ name. Each tier's :attr:`Tier.backend` selects how the request actually
+ gets fulfilled:
+
+ ``gguf``     reverse-proxy to the local llama-swap (the default; no
+              module needed -- :mod:`llmstack.app` does the proxying
+              itself).
+ ``bedrock``  hand off to :mod:`llmstack.backends.bedrock`, which
+              translates OpenAI chat/completions to AWS Bedrock
+              Converse and streams the response back as OpenAI SSE.
+
+ Each backend module is loaded lazily so the optional cloud SDKs are
+ only imported when the operator has actually configured a tier that
+ needs them (and only when they're invoked).
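+
+ For orientation, a hosted tier is declared in ``models.ini`` with a
+ ``backend = bedrock`` line. A sketch (section name and layout are
+ illustrative; the real key set is whatever :mod:`llmstack.tiers`
+ parses)::
+
+     [code-ultra]
+     backend = bedrock
+     ; empty sampler on purpose: for families that reject sampler params
+     sampler =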
+ """
+
+ from __future__ import annotations