opencode_llmstack-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmstack/AGENTS.md +13 -0
- llmstack/__init__.py +20 -0
- llmstack/__main__.py +10 -0
- llmstack/_platform.py +420 -0
- llmstack/app.py +644 -0
- llmstack/backends/__init__.py +19 -0
- llmstack/backends/bedrock.py +790 -0
- llmstack/check_models.py +119 -0
- llmstack/cli.py +264 -0
- llmstack/commands/__init__.py +10 -0
- llmstack/commands/_helpers.py +91 -0
- llmstack/commands/activate.py +71 -0
- llmstack/commands/check.py +13 -0
- llmstack/commands/download.py +27 -0
- llmstack/commands/install.py +365 -0
- llmstack/commands/install_llama_swap.py +36 -0
- llmstack/commands/reload.py +59 -0
- llmstack/commands/restart.py +12 -0
- llmstack/commands/setup.py +146 -0
- llmstack/commands/start.py +360 -0
- llmstack/commands/status.py +260 -0
- llmstack/commands/stop.py +73 -0
- llmstack/download/__init__.py +21 -0
- llmstack/download/binary.py +234 -0
- llmstack/download/ggufs.py +164 -0
- llmstack/generators/__init__.py +37 -0
- llmstack/generators/llama_swap.py +421 -0
- llmstack/generators/opencode.py +291 -0
- llmstack/models.ini +304 -0
- llmstack/paths.py +318 -0
- llmstack/shell_env.py +927 -0
- llmstack/tiers.py +394 -0
- opencode_llmstack-0.6.0.dist-info/METADATA +693 -0
- opencode_llmstack-0.6.0.dist-info/RECORD +37 -0
- opencode_llmstack-0.6.0.dist-info/WHEEL +5 -0
- opencode_llmstack-0.6.0.dist-info/entry_points.txt +2 -0
- opencode_llmstack-0.6.0.dist-info/top_level.txt +1 -0
llmstack/app.py
ADDED
@@ -0,0 +1,644 @@
"""
FastAPI auto-router proxy in front of llama-swap (and AWS Bedrock).

Public endpoint: ``http://127.0.0.1:10101``
Upstream: ``http://127.0.0.1:10102`` (llama-swap)

Behaviour:

* ``GET /v1/models`` -> proxied verbatim, plus an ``auto`` entry and any
  hosted (e.g. bedrock) tiers declared in ``models.ini``.
* ``GET /models.ini`` -> raw text of the router's ``models.ini``. Thin
  clients (``llmstack install --external``) fetch this on every install
  and use it to regenerate ``opencode.json`` without keeping a local
  copy of the file. Returning a 200 + valid INI doubles as the canonical
  health check for external clients -- there is no separate ``/health``
  route on the router (the catch-all proxies any such request through
  to llama-swap's own ``/health`` for backwards-compat curl users).
* ``POST /v1/chat/completions``, ``POST /v1/completions``
  - if request body ``model == "auto"`` (or unset), classify the request
    and rewrite ``model`` -> one of: ``code-fast``, ``code-smart``,
    ``code-ultra`` (when wired), ``plan``, ``plan-uncensored``.
  - otherwise pass through unchanged.
  - tiers with ``backend = bedrock`` in ``models.ini`` are dispatched
    to AWS Bedrock via :mod:`llmstack.backends.bedrock` instead of
    proxied to llama-swap.
* Streaming (SSE) responses are forwarded chunk-by-chunk.
* Anything else is reverse-proxied.

Routing philosophy: **start at the top of the fidelity ladder and
step DOWN as context grows**. This inverts the classic
"escalate-on-size" pattern, and it's deliberate:

* Top-tier hosted models (Claude Opus/Sonnet on Bedrock) are
  fastest *and* most accurate on short prompts, but their
  per-request latency and $cost scale with input tokens, and
  long-context performance degrades faster than headline
  benchmarks suggest.
* The local heavy coder (``code-smart``, Qwen3-Coder 80B-A3B) has
  a 64k window -- it does its best work in the middle of that
  range, and saturates near the top.
* The always-resident fast coder (``code-fast``, Qwen2.5-Coder 3B
  with YaRN x4) has a **128k** window, costs nothing, and benefits
  from more context: small models lean on retrieval / explicit
  examples to disambiguate, where bigger models would just guess
  from priors.

So as the conversation accumulates context, we step *down*: ultra
-> smart -> fast. Triggers and the plan track sit alongside this
ladder.

Routing decision tree (first match wins):

1. Explicit "uncensored" trigger in the last user message
   (``[nofilter]``, ``[uncensored]``, ``[heretic]``, or a line
   starting with ``uncensored:`` / ``nofilter:``) -> plan-uncensored
2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
   ``ultra:``, ``opus:``) AND ultra tier configured -> code-ultra
3. PLAN signal words AND no code-block / agent verbs / tools
   (design discussion, no implementation pending) -> plan
4. Estimated input tokens <= HIGH_FIDELITY_CEILING
   ("reasonable context still being built") -> code-ultra (else code-smart)
5. Estimated input tokens <= MID_FIDELITY_CEILING -> code-smart
6. Otherwise (long context, top-tier becomes expensive/slow, fast
   tier's 128k window is the best fit and it's free) -> code-fast
   (floored at code-smart when ``tools[]`` is set or n_turns >=
   MULTI_TURN_THRESHOLD, since 3B models tool-call unreliably)

Ultra-tier routing is gated on availability: rule (2) and the
"high-fidelity" rung of (4) first check that the tier is loaded
from ``models.ini`` (i.e. present in :data:`TIER_BY_ALIAS`). When
it isn't, the router silently falls back to ``code-smart`` --
otherwise rewriting ``model`` to a tier name that isn't wired up
surfaces as a 404 from llama-swap or a tier-not-found error from
the bedrock dispatcher, which is just a confusing way to fail.

Run with::

    python -m llmstack.app
    # or
    uvicorn llmstack.app:app --host 127.0.0.1 --port 10101
"""

from __future__ import annotations

import json
import logging
import os
import re
from contextlib import asynccontextmanager
from typing import Any

import httpx
from fastapi import FastAPI, Request, Response
from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse

from llmstack.paths import models_ini_path
from llmstack.tiers import Tier, load_tiers

UPSTREAM = os.getenv("LLAMA_SWAP_URL", "http://127.0.0.1:10102").rstrip("/")

FAST_MODEL = os.getenv("ROUTER_FAST_MODEL", "code-fast")
AGENT_MODEL = os.getenv("ROUTER_AGENT_MODEL", "code-smart")
ULTRA_MODEL = os.getenv("ROUTER_ULTRA_MODEL", "code-ultra")
PLAN_MODEL = os.getenv("ROUTER_PLAN_MODEL", "plan")
UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")

# Step-DOWN ladder (see module docstring). Both ceilings are *upper
# bounds* of a tier's sweet-spot range, expressed in estimated input
# tokens (chars/4):
#
#   est <= HIGH_FIDELITY_CEILING -> top tier (ultra, else smart)
#   est <= MID_FIDELITY_CEILING  -> code-smart
#   est >  MID_FIDELITY_CEILING  -> code-fast (or smart with tools/loop)
#
# Defaults:
#   HIGH  8000 - "reasonable context built": a couple of files loaded,
#                instructions clear, top-tier still cheap+fast here.
#   MID  32000 - half of code-smart's 65k window; past this, hosted
#                top-tier latency/$cost balloons and code-smart starts
#                getting cramped, while code-fast's 128k YaRN window
#                still has comfortable headroom.
HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "8000"))
MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
# Floor the long-context rung at code-smart whenever a tool-call
# protocol is in play -- 3B models tool-call unreliably regardless of
# how big their context window is.
MULTI_TURN_THRESHOLD = int(os.getenv("ROUTER_MULTI_TURN", "6"))
AUTO_ALIASES = {"auto", "", None}

UNCENSORED_TRIGGERS = re.compile(
    r"(\[(uncensored|nofilter|no-?filter|heretic)\]"
    r"|^[ \t]*(uncensored|nofilter|no-?filter)\s*:)",
    re.IGNORECASE | re.MULTILINE,
)

ULTRA_TRIGGERS = re.compile(
    r"(\[(ultra|opus)\]|^[ \t]*(ultra|opus)\s*:)",
    re.IGNORECASE | re.MULTILINE,
)

PLAN_SIGNALS = re.compile(
    r"\b(plan|design|architect(ure)?|approach|trade-?off|"
    r"should\s+we|how\s+would\s+(you|we)|what\s+would\s+you|"
    r"explain\s+why|reason\s+about|think\s+(through|step|hard|carefully)|"
    r"compare\s+(options|approaches)|review\s+(the|this|my)\s+"
    r"(architecture|design|approach|plan)|brainstorm|outline|"
    r"summari[sz]e|root\s*cause|migrate|port\s+to)\b",
    re.IGNORECASE,
)

AGENT_SIGNALS = re.compile(
    r"\b(implement|fix\s+(this|the|a|my)?\s*(bug|issue|error|test)|"
    r"write\s+(a|the|some)?\s*(function|class|test|script|module|method)|"
    r"add\s+(a|the)?\s*(function|class|method|test|file|endpoint)|"
    r"create\s+(a|the)?\s*(function|class|file|component|endpoint)|"
    r"refactor|edit|patch|generate\s+code|debug|trace|"
    r"run\s+tests?|build\s+(it|this)|compile)\b",
    re.IGNORECASE,
)

CODE_BLOCK = re.compile(r"```|`[^`\n]{30,}`")

logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO"),
    format="%(asctime)s %(levelname)s router %(message)s",
)
log = logging.getLogger("router")

@asynccontextmanager
async def _lifespan(app: FastAPI):
    global client
    timeout = httpx.Timeout(connect=10.0, read=None, write=None, pool=None)
    client = httpx.AsyncClient(base_url=UPSTREAM, timeout=timeout)
    bedrock_tiers = sorted(t.name for t in TIERS.values() if t.is_bedrock)
    log.info(
        "router up upstream=%s ladder=[ultra<=%d -> agent<=%d -> fast] "
        "fast=%s agent=%s ultra=%s plan=%s uncensored=%s bedrock=%s",
        UPSTREAM, HIGH_FIDELITY_CEILING, MID_FIDELITY_CEILING,
        FAST_MODEL, AGENT_MODEL,
        f"{ULTRA_MODEL} (active)" if _ultra_available()
        else f"{ULTRA_MODEL} (unwired -- high-fidelity rung falls back to {AGENT_MODEL})",
        PLAN_MODEL, UNCENSORED_MODEL,
        ",".join(bedrock_tiers) or "(none)",
    )
    yield
    if client:
        await client.aclose()


app = FastAPI(title="llmstack-auto-router", version="3.0", lifespan=_lifespan)
client: httpx.AsyncClient | None = None
TIERS: dict[str, Tier] = {}
TIER_BY_ALIAS: dict[str, Tier] = {}


def _index_tiers() -> None:
    """Load ``models.ini`` and index by name + alias for fast lookup."""
    global TIERS, TIER_BY_ALIAS
    try:
        TIERS = load_tiers()
    except SystemExit as exc:
        # No models.ini -- run as a pure pass-through proxy and let
        # downstream errors describe the problem.
        log.warning("models.ini not loaded (%s); bedrock dispatch disabled", exc)
        TIERS = {}
    TIER_BY_ALIAS = {}
    for tier in TIERS.values():
        TIER_BY_ALIAS[tier.name] = tier
        for alias in tier.aliases:
            TIER_BY_ALIAS.setdefault(alias, tier)


_index_tiers()


# ----------------------------- routing logic -------------------------------

def _iter_message_text(messages: list[dict[str, Any]] | None):
    if not messages:
        return
    for m in messages:
        content = m.get("content")
        if isinstance(content, str):
            yield content
        elif isinstance(content, list):
            for part in content:
                if isinstance(part, dict):
                    t = part.get("text")
                    if isinstance(t, str):
                        yield t


def _last_user_text(messages: list[dict[str, Any]] | None) -> str:
    if not messages:
        return ""
    for m in reversed(messages):
        if m.get("role") != "user":
            continue
        content = m.get("content")
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            return "\n".join(
                p.get("text", "")
                for p in content
                if isinstance(p, dict) and isinstance(p.get("text"), str)
            )
    return ""


def _estimate_tokens(messages: list[dict[str, Any]] | None, prompt: str | None) -> int:
    chars = len(prompt) if prompt else 0
    for t in _iter_message_text(messages):
        chars += len(t)
    return chars // 4


def _matches(pattern: re.Pattern[str], messages: list[dict[str, Any]] | None, prompt: str | None) -> bool:
    if prompt and pattern.search(prompt):
        return True
    return any(pattern.search(t) for t in _iter_message_text(messages))


def _ultra_available() -> bool:
    """True iff the ultra tier is loaded from ``models.ini``.

    Every auto-route to :data:`ULTRA_MODEL` is gated on this. Without
    the guard, an explicit ``[ultra]`` trigger or the high-fidelity
    rung of the step-down ladder on a vanilla install (no
    ``code-ultra`` section) would rewrite ``model`` to a tier that
    doesn't exist downstream -- llama-swap returns 404, the bedrock
    dispatcher raises -- so the request would fail even though
    falling back to ``code-smart`` would have served it just fine.
    The check is a cheap dict lookup so we run it on every classify
    invocation; that also means re-indexing tiers at runtime (e.g.
    SIGHUP -> ``_index_tiers()``) flips routing behaviour live
    without restarting the router.
    """
    return ULTRA_MODEL in TIER_BY_ALIAS


def classify(body: dict[str, Any]) -> tuple[str, str]:
    """Return (chosen_model, reason).

    Step-DOWN ladder: top fidelity for short context, fall to mid for
    medium, drop to fast for long. See module docstring for rationale.
    """
    messages = body.get("messages") if isinstance(body.get("messages"), list) else None
    prompt = body.get("prompt") if isinstance(body.get("prompt"), str) else None

    last_user = _last_user_text(messages)
    sys_prompts = [
        m.get("content", "")
        for m in (messages or [])
        if m.get("role") == "system" and isinstance(m.get("content"), str)
    ]
    if any(UNCENSORED_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
        return UNCENSORED_MODEL, "uncensored-trigger"

    if any(ULTRA_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
        if _ultra_available():
            return ULTRA_MODEL, "ultra-trigger"
        # Explicit user opt-in but the tier isn't wired up. Don't 404 --
        # serve the request from the heaviest tier we *do* have and let
        # the user notice in logs that their trigger was a no-op.
        log.warning("ultra-trigger ignored: %s not in models.ini; falling back to %s",
                    ULTRA_MODEL, AGENT_MODEL)
        return AGENT_MODEL, f"ultra-trigger->agent ({ULTRA_MODEL} unavailable)"

    has_tools = bool(body.get("tools"))
    n_turns = len(messages) if messages else 0
    has_code_signal = (
        _matches(CODE_BLOCK, messages, prompt)
        or _matches(AGENT_SIGNALS, messages, prompt)
    )

    # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
    # chat-tuned model meant for design / "should we" discussions. Only
    # take it when nothing about the request says "I'm about to write
    # code" (no triple-backticks, no agent verbs, no tool calls).
    if (
        not has_tools
        and not has_code_signal
        and _matches(PLAN_SIGNALS, messages, prompt)
    ):
        return PLAN_MODEL, "plan-signal"

    est = _estimate_tokens(messages, prompt)

    # Rung 1: short context -- start at the top.
    if est <= HIGH_FIDELITY_CEILING:
        if _ultra_available():
            return ULTRA_MODEL, f"high-fidelity tokens~{est}<={HIGH_FIDELITY_CEILING}"
        return AGENT_MODEL, (
            f"high-fidelity tokens~{est}<={HIGH_FIDELITY_CEILING} "
            f"({ULTRA_MODEL} unavailable)"
        )

    # Rung 2: mid context -- local heavy coder is at its sweet spot.
    if est <= MID_FIDELITY_CEILING:
        return AGENT_MODEL, f"mid-fidelity tokens~{est}<={MID_FIDELITY_CEILING}"

    # Rung 3: long context -- step down to fast (128k YaRN, free,
    # always-resident). Floor at smart when tools/agent loop is in
    # play; the 3B coder doesn't tool-call reliably.
    if has_tools or n_turns >= MULTI_TURN_THRESHOLD:
        why = "tools" if has_tools else f"turns={n_turns}"
        return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} ({why} floor)"
    return FAST_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING}"


# ----------------------------- proxy plumbing ------------------------------

HOP_BY_HOP = {
    "connection", "keep-alive", "proxy-authenticate", "proxy-authorization",
    "te", "trailers", "transfer-encoding", "upgrade", "host", "content-length",
}


def _filter_request_headers(req: Request) -> dict[str, str]:
    return {k: v for k, v in req.headers.items() if k.lower() not in HOP_BY_HOP}


def _filter_response_headers(resp: httpx.Response) -> dict[str, str]:
    return {k: v for k, v in resp.headers.items() if k.lower() not in HOP_BY_HOP}


async def _stream_proxy(method: str, path: str, body: bytes, headers: dict[str, str]) -> StreamingResponse:
    assert client is not None
    upstream_req = client.build_request(method, path, content=body, headers=headers)
    upstream = await client.send(upstream_req, stream=True)

    async def gen():
        try:
            async for chunk in upstream.aiter_raw():
                yield chunk
        finally:
            await upstream.aclose()

    return StreamingResponse(
        gen(),
        status_code=upstream.status_code,
        headers=_filter_response_headers(upstream),
        media_type=upstream.headers.get("content-type"),
    )


# --------------------------------- routes ----------------------------------

@app.get("/models.ini")
async def serve_models_ini() -> Response:
    """Return the router's live ``models.ini`` as text.

    Read fresh on every request rather than from the cached
    :data:`TIERS` snapshot -- a thin client running
    ``llmstack install --external`` against this router should see
    whatever the operator has most recently written to disk, even if
    the router hasn't been restarted to pick up a re-parse. (Stale
    ``TIERS`` only affects in-flight routing decisions; the file on
    disk is the source of truth for downstream config generation.)

    Returning the file is also how external clients health-check the
    router: a 200 with a non-empty INI body proves both that the
    router process is up and that the operator has a usable config
    here -- which is exactly what the client needs to render its
    own ``opencode.json``. There is no separate ``/health`` route.
    """
    path = models_ini_path()
    if not path.is_file():
        # Router is up but the operator hasn't pointed it at a
        # models.ini yet (or the file went missing). Fail loud so the
        # thin-client install surfaces a real error message instead of
        # rendering an empty opencode.json.
        return PlainTextResponse(
            f"models.ini not found at {path} on the router host.\n"
            "Set $LLMSTACK_MODELS_INI on the router or run "
            "`llmstack install` there to seed the default.\n",
            status_code=404,
            media_type="text/plain",
        )
    try:
        text = path.read_text(encoding="utf-8")
    except OSError as e:
        log.warning("failed to read %s for /models.ini: %s", path, e)
        return PlainTextResponse(
            f"failed to read {path}: {e}\n",
            status_code=500,
            media_type="text/plain",
        )
    return PlainTextResponse(text, media_type="text/plain; charset=utf-8")


@app.get("/v1/models")
async def list_models() -> JSONResponse:
    assert client is not None
    try:
        r = await client.get("/v1/models")
        data = r.json()
        status = r.status_code
    except Exception as exc:
        log.warning("upstream /v1/models failed: %s", exc)
        data = {"object": "list", "data": []}
        status = 200

    if not isinstance(data, dict) or not isinstance(data.get("data"), list):
        data = {"object": "list", "data": []}

    # Hosted (bedrock) tiers aren't known to llama-swap; fold them in.
    seen = {entry.get("id") for entry in data["data"] if isinstance(entry, dict)}
    from llmstack.backends import bedrock as bedrock_backend
    for tier in TIERS.values():
        if not tier.is_bedrock:
            continue
        if tier.name in seen:
            continue
        data["data"].append(bedrock_backend.model_descriptor(tier))
        seen.add(tier.name)
        for alias in tier.aliases:
            if alias not in seen:
                desc = bedrock_backend.model_descriptor(tier)
                desc["id"] = alias
                desc["name"] = f"{tier.description} (alias of {tier.name})"
                data["data"].append(desc)
                seen.add(alias)

    if _ultra_available():
        top_blurb = (
            f"Step-down ladder (top->bottom as context grows): "
            f"'{ULTRA_MODEL}' up to ~{HIGH_FIDELITY_CEILING} tokens, "
            f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING}, "
            f"'{FAST_MODEL}' beyond that."
        )
        name = "Auto (step-down router: ultra/agent/fast + plan/uncensored)"
    else:
        top_blurb = (
            f"Step-down ladder (top->bottom as context grows): "
            f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING} tokens, "
            f"'{FAST_MODEL}' beyond that."
        )
        name = "Auto (step-down router: agent/fast + plan/uncensored)"
    data["data"].insert(0, {
        "id": "auto",
        "object": "model",
        "created": 0,
        "owned_by": "router",
        "name": name,
        "description": (
            f"{top_blurb} "
            f"'{PLAN_MODEL}' for design/planning (orthogonal to ladder); "
            f"'{UNCENSORED_MODEL}' for explicit [nofilter] triggers; "
            f"'[ultra]'/'[opus]' triggers force '{ULTRA_MODEL}' regardless of size."
        ),
        "tier": "auto",
    })
    return JSONResponse(content=data, status_code=status)


def _resolve_tier(name: str | None) -> Tier | None:
    if not name:
        return None
    return TIER_BY_ALIAS.get(name)


# Map the short sampler keys used in models.ini to the OpenAI-compatible
# request-body fields that downstream backends understand. llama.cpp
# accepts `top_k`, `min_p`, and `repetition_penalty` as extensions; the
# Bedrock backend ignores fields it can't translate to Converse.
_SAMPLER_BODY_FIELD = {
    "temp": "temperature",
    "top_p": "top_p",
    "top_k": "top_k",
    "min_p": "min_p",
    "rep_pen": "repetition_penalty",
}


def _inject_sampler(body: dict[str, Any], tier: Tier) -> bool:
    """Layer this tier's `sampler = ...` defaults onto the request body.

    **Bedrock-only.** For gguf tiers, sampling defaults are baked into
    the llama-server startup command line by
    :mod:`llmstack.generators.llama_swap`, so llama-server already
    applies them for any request whose body lacks an explicit value.
    Bedrock has no equivalent server-side mechanism -- the only place to
    apply per-tier sampling for hosted models is the outbound request
    body, which is what this function does.

    Caller-supplied values always win -- if the client already set
    `temperature`, the tier default does not overwrite it. This makes
    models.ini the source of truth for "what sampler does each tier
    use", while still letting power users override per call.

    Returns ``True`` iff anything was added (the caller re-encodes the
    raw body bytes only when the dict actually changed).

    A Bedrock tier with an empty sampler dict (no `sampler =` line, or
    all keys stripped) is a no-op -- the canonical pattern for Bedrock
    families like Claude Opus 4.7 that reject every sampler param.
    """
    if not tier.is_bedrock or not tier.sampler:
        return False
    mutated = False
    for src, dst in _SAMPLER_BODY_FIELD.items():
        if src in tier.sampler and dst not in body:
            body[dst] = tier.sampler[src]
            mutated = True
    return mutated


async def _handle_completion(req: Request, path: str) -> Response:
    raw = await req.body()
    headers = _filter_request_headers(req)

    try:
        body = json.loads(raw) if raw else {}
    except json.JSONDecodeError:
        return await _stream_proxy(req.method, path, raw, headers)

    mutated = False
    requested = body.get("model")
    if requested in AUTO_ALIASES or requested == "auto":
        chosen, reason = classify(body)
        body["model"] = chosen
        log.info("auto -> %s (%s) [path=%s]", chosen, reason, path)
        mutated = True

    chosen_name = body.get("model")
    tier = _resolve_tier(chosen_name)
    if tier is not None and _inject_sampler(body, tier):
        mutated = True

    if mutated:
        raw = json.dumps(body).encode()

    if tier is not None and tier.is_bedrock:
        from llmstack.backends import bedrock as bedrock_backend
        return await bedrock_backend.dispatch(req, tier, body)

    return await _stream_proxy(req.method, path, raw, headers)


@app.post("/v1/chat/completions")
async def chat_completions(req: Request) -> Response:
    return await _handle_completion(req, "/v1/chat/completions")


@app.post("/v1/completions")
async def completions(req: Request) -> Response:
    return await _handle_completion(req, "/v1/completions")


# --------------------------- catch-all reverse proxy -----------------------

@app.api_route(
    "/{path:path}",
    methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"],
)
async def passthrough(path: str, req: Request) -> Response:
    raw = await req.body()
    headers = _filter_request_headers(req)
    return await _stream_proxy(req.method, "/" + path, raw, headers)


def main() -> None:
    """Run the router with uvicorn. Used by ``python -m llmstack.app``."""
    import asyncio

    import uvicorn

    log_level = os.getenv("LOG_LEVEL", "info").lower()
    host = os.getenv("ROUTER_HOST", "127.0.0.1")
    port = int(os.getenv("ROUTER_PORT", "10101"))

    cfg = uvicorn.Config(app, host=host, port=port, log_level=log_level)
    asyncio.run(uvicorn.Server(cfg).serve())


if __name__ == "__main__":
    main()
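To make the routing decision tree above concrete, here is a minimal sketch that calls the router's classifier directly. ``classify()`` only inspects the request body, so neither llama-swap nor Bedrock needs to be running; the sketch does assume the wheel and its FastAPI/httpx dependencies are installed and that the default ``ROUTER_*`` tier names are in effect. The expected tuples follow from the rules shown above; the last call lands on ``code-ultra`` only when that tier is wired up in ``models.ini``, otherwise ``code-smart``.

    from llmstack.app import classify

    # Rule 3: design-only question, no code block / agent verbs / tools -> plan.
    print(classify({
        "messages": [{"role": "user",
                      "content": "How would we architect the migration? Compare approaches."}],
    }))
    # ('plan', 'plan-signal')

    # Rule 1: an explicit [nofilter] trigger beats every other rule.
    print(classify({
        "messages": [{"role": "user", "content": "[nofilter] summarise this thread"}],
    }))
    # ('plan-uncensored', 'uncensored-trigger')

    # Rules 4-6: short implementation request, so start at the top of the
    # step-down ladder (code-ultra when available, else code-smart).
    print(classify({
        "messages": [{"role": "user", "content": "implement a retry decorator in utils.py"}],
    }))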
llmstack/backends/__init__.py
ADDED
@@ -0,0 +1,19 @@
"""Pluggable request backends.

The router (:mod:`llmstack.app`) classifies a request and picks a tier
name. Each tier's :attr:`Tier.backend` selects how the request actually
gets fulfilled:

``gguf``     reverse-proxy to the local llama-swap (the default; no
             module needed -- :mod:`llmstack.app` does the proxying
             itself).
``bedrock``  hand off to :mod:`llmstack.backends.bedrock` which
             translates OpenAI chat/completions to AWS Bedrock
             Converse and streams the response back as OpenAI SSE.

Each backend module is loaded lazily so the optional cloud SDKs are
only imported when the operator has actually configured a tier that
needs them (and only when they're invoked).
"""

from __future__ import annotations
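For the client side, a minimal sketch of the request path the two docstrings describe. It assumes the router is already running locally (``python -m llmstack.app``) with at least one tier configured; the port, the endpoint paths, and the ``auto`` model id come from ``llmstack/app.py`` above, while the prompt and the OpenAI-style response shape are illustrative.

    import httpx

    ROUTER = "http://127.0.0.1:10101"  # public endpoint from the app.py docstring

    with httpx.Client(base_url=ROUTER, timeout=120.0) as client:
        # The router folds a synthetic "auto" entry (plus any Bedrock-backed
        # tiers from models.ini) into llama-swap's /v1/models listing.
        models = client.get("/v1/models").json()
        print([m["id"] for m in models["data"]])

        # "model": "auto" asks the router to classify the request; gguf tiers
        # are reverse-proxied to llama-swap, bedrock tiers are handed to the
        # Converse-translating backend -- either way an OpenAI-compatible
        # chat completion comes back.
        reply = client.post(
            "/v1/chat/completions",
            json={
                "model": "auto",
                "messages": [{"role": "user", "content": "implement a retry decorator"}],
            },
        )
        print(reply.json()["choices"][0]["message"]["content"])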