opencode-llmstack 0.9.4__tar.gz → 0.9.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {opencode_llmstack-0.9.4/opencode_llmstack.egg-info → opencode_llmstack-0.9.7}/PKG-INFO +26 -39
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/README.md +25 -38
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/UPGRADING.md +12 -12
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/__init__.py +1 -1
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/app.py +78 -112
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/backends/bedrock.py +3 -1
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/generators/opencode.py +2 -2
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/models.ini +11 -17
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7/opencode_llmstack.egg-info}/PKG-INFO +26 -39
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/pyproject.toml +1 -1
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/CHANGELOG.md +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/LICENSE +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/AGENTS.md +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/__main__.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/_platform.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/backends/__init__.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/check_models.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/cli.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/__init__.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/_helpers.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/activate.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/check.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/download.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/install.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/install_llama_swap.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/reload.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/restart.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/setup.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/start.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/status.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/stop.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/download/__init__.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/download/binary.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/download/ggufs.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/generators/__init__.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/generators/llama_swap.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/paths.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/shell_env.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/tiers.py +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/SOURCES.txt +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/dependency_links.txt +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/entry_points.txt +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/requires.txt +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/top_level.txt +0 -0
- {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/setup.cfg +0 -0
{opencode_llmstack-0.9.4/opencode_llmstack.egg-info → opencode_llmstack-0.9.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opencode-llmstack
-Version: 0.9.4
+Version: 0.9.7
 Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
 Author: llmstack
 License: MIT License
@@ -78,14 +78,14 @@ client (opencode / curl / Cursor / etc.)
   │
   ▼
 http://127.0.0.1:10101   <-- FastAPI router (llmstack.app)
-  │ • model="auto" → classify → rewrite to one of
+  │ • model="auto" → classify → rewrite to one of 3 coder tiers
   │ • everything else → pass-through
   ▼
 http://127.0.0.1:10102   <-- llama-swap (binary, manages model lifecycle)
   │ • loads/unloads llama-server processes per model
   │ • matrix solver allows {code-fast + one heavy model} co-resident
   ▼
-llama-server <code-fast | code-smart |
+llama-server <code-fast | code-smart | code-ultra>
   │
   ▼
 GGUF in ~/.cache/huggingface/hub/...
@@ -101,7 +101,7 @@ A 64 GB unified memory M4 Max can comfortably hold **one always-on tiny coder +
 
 - **Agent work** (multi-file edits, tool use, refactors) → coder models, which are trained on tool-call protocols and code edits.
 - **Planning** (design discussions, architecture, "what's the best approach") → chat-tuned models, which are better at high-level reasoning and don't try to start writing code in response to every message.
-- **Uncensored planning** is a separate plan-tier model, opted in
+- **Uncensored planning** is a separate plan-tier model, opted in by explicit agent selection (`/agent plan-nofilter` in opencode).
 
 Routing decisions cost ~zero — they're a few regex checks in the FastAPI router, not an LLM call.
 
@@ -135,20 +135,18 @@ matches how these models actually behave on this stack:
 than priors, so they tend to *improve* relative to top-tier as the
 conversation grows.
 
-First match wins:
+First match wins (auto-routing only; `plan` and `plan-uncensored` are not auto-routed):
 
 | # | Condition | → Model | Reason |
 |---|---|---|---|
-| 1 |
-| 2 |
-| 3 |
-| 4 |
-| 5 |
-| 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
-| 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
+| 1 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
+| 2 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier — context still being built, latency/$ are best here |
+| 3 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
+| 4 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
+| 5 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
 
 Token estimates are `chars / 4` over all message text + `prompt`. The
-`code-ultra` rungs (
+`code-ultra` rungs (1 and 2) are gated on availability: when no
 `[code-ultra]` section is loaded from `models.ini`, both silently fall
 back to `code-smart` so vanilla installs don't 404.
 
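Restated as a minimal Python sketch (illustrative only; the shipped classifier in `llmstack/app.py` also handles the rung-1 trigger regex and the availability gate):

```python
# A sketch of rungs 2-5 above (rung 1, the [ultra] trigger, is regex-based
# and omitted here). Thresholds mirror the documented defaults.
HIGH_CEILING = 12_000
MID_CEILING = 32_000
MULTI_TURN = 10

def estimate_tokens(messages: list[dict], prompt: str = "") -> int:
    # chars / 4 over all message text + prompt, as described above
    chars = len(prompt) + sum(
        len(m["content"]) for m in messages
        if isinstance(m.get("content"), str)
    )
    return chars // 4

def pick_tier(messages: list[dict], prompt: str = "",
              ultra_available: bool = True) -> str:
    est = estimate_tokens(messages, prompt)
    user_turns = sum(1 for m in messages if m.get("role") == "user")
    if est <= HIGH_CEILING:              # rung 2: short context, top tier
        return "code-ultra" if ultra_available else "code-smart"
    if est <= MID_CEILING:               # rung 3: heavy local coder
        return "code-smart"
    if user_turns >= MULTI_TURN:         # rung 4: deep agentic loop floor
        return "code-smart"
    return "code-fast"                   # rung 5: long context, free tier
```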
@@ -198,7 +196,8 @@ your global setup unchanged.
 | **`agent.plan-nofilter`** (custom uncensored planner) | `llama.cpp/plan-uncensored` |
 
 Inside opencode you can switch agents with `/agent` or by `@plan-nofilter`-mentioning
-a custom one.
+a custom one. The `plan` and `plan-uncensored` tiers are **not auto-routed** from the build agent —
+they're only accessible via explicit agent selection (`/agent plan` or `/agent plan-nofilter`).
 
 Want a second terminal into the same stack? Install the activate hook
 once (`eval "$(llmstack activate zsh)"`) and any new shell that `cd`s
@@ -266,8 +265,9 @@ Per-project state (gitignored) is created lazily under `<work-dir>/.llmstack/`:
 ```
 
 The `llama-swap` binary lives outside any project at
-`$XDG_DATA_HOME/llmstack/bin/llama-swap` (override with
-`LLMSTACK_BIN_DIR`)
+`$XDG_DATA_HOME/llmstack/bin/llama-swap` on macOS/Linux (override with
+`LLMSTACK_BIN_DIR`), or `%LOCALAPPDATA%\llmstack\bin\llama-swap.exe` on Windows.
+One download is reused across all projects.
 
 ## Quick start
 
@@ -358,8 +358,9 @@ Notes:
   or a package like `winget install ggml.llama-cpp` and put it on
   `PATH` (or set `$env:LLAMA_SERVER_BIN`). The Mac-only
   `iogpu.wired_limit_mb` step does not apply.
-- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell
-
+- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell; `cmd.exe`
+  does not support custom prompts in the same way, so activation is
+  PowerShell-only.
 - Stopping daemons uses `taskkill /T /F` under the hood, so the
   llama-server children get cleaned up as well.
 
@@ -465,7 +466,7 @@ llmstack restart --next   # cycle into the next channel
 
 ### Try each routing path
 
-All of these go to `/v1/chat/completions` on `:10101`.
+All of these go to `/v1/chat/completions` on `:10101`. The `auto` router classifies based on token count and context:
 
 ```bash
 # trivial chat -> code-fast
@@ -473,22 +474,14 @@ curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: applicatio
 -d '{"model":"auto","stream":false,
 "messages":[{"role":"user","content":"capital of France?"}]}' | jq .model
 
-# planning -> plan
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
--d '{"model":"auto","stream":false,
-"messages":[{"role":"user","content":"how would you design a rate limiter for our API?"}]}' | jq .model
-
 # agent work -> code-smart
 curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
 -d '{"model":"auto","stream":false,
 "messages":[{"role":"user","content":"refactor this function for clarity:\n```python\ndef f(x): return x*2\n```"}]}' | jq .model
-
-# uncensored plan -> plan-uncensored
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
--d '{"model":"auto","stream":false,
-"messages":[{"role":"user","content":"[nofilter] outline a red-team plan for our auth flow"}]}' | jq .model
 ```
 
+To access `plan` or `plan-uncensored` tiers, use explicit agent selection in opencode (`/agent plan` or `/agent plan-nofilter`) rather than `model=auto`.
+
 ## Endpoints
 
 | Port | Service | Purpose |
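For scripted checks, the same probe works from Python with only the standard library (a sketch; assumes the stack is running and the router is on `:10101`):

```python
import json
import urllib.request

def routed_model(content: str) -> str:
    # POST a model="auto" request and read back the concrete tier the
    # router rewrote it to.
    body = json.dumps({
        "model": "auto",
        "stream": False,
        "messages": [{"role": "user", "content": content}],
    }).encode()
    req = urllib.request.Request(
        "http://127.0.0.1:10101/v1/chat/completions",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["model"]

print(routed_model("capital of France?"))  # short context -> top rung
```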
@@ -565,8 +558,6 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_FAST_MODEL` | `code-fast` | long-context (>= mid ceiling) → here |
 | `ROUTER_AGENT_MODEL` | `code-smart` | mid-context + tools/loop floor → here |
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
-| `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
-| `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
 | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
 | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
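Because the knobs are plain environment variables, a launcher can shift the ceilings before starting the stack (a sketch; the values here are examples, not recommendations):

```python
import os
import subprocess

# Widen the code-smart band and floor deep loops sooner, then start the
# stack so `llmstack start` picks the values up from the environment.
env = os.environ.copy()
env["ROUTER_MID_FIDELITY_CEILING"] = "48000"
env["ROUTER_MULTI_TURN"] = "6"
subprocess.run(["llmstack", "start"], env=env, check=True)
```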
@@ -577,14 +568,10 @@ To force a request to never auto-route, set `model` to a concrete alias (`code-f
 
 ## Triggering uncensored mode
 
-
-
-1. **Explicit agent in opencode:** `/agent plan-nofilter` (or mention it).
-2. **Inline trigger in any auto-routed message** — anywhere in the most recent user turn:
-   - `[nofilter]`, `[uncensored]`, `[heretic]`
-   - or a line starting with `uncensored:` / `nofilter:` / `no-filter:`
+The `plan-uncensored` tier is accessible via explicit agent selection only:
 
-
+1. **In opencode:** `/agent plan-nofilter` (or mention `@plan-nofilter`).
+2. **Via opencode config:** set `agent.plan-nofilter` as your active agent.
 
 ## Troubleshooting
 
@@ -594,7 +581,7 @@ Triggers are *only* checked on the latest user message and the system prompt, so
 
 **OOM / unexplained slowdown** → run `top -o mem -stats pid,rsize,command` to see what's resident. The matrix should prevent two heavy models loading together; if it somehow happens, `llmstack restart`.
 
-**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`
+**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`ULTRA_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
 
 **Want a pure pass-through (no auto routing)** → change opencode's `baseURL` to `http://127.0.0.1:10102/v1` (llama-swap directly) and only use concrete model names. (Note: this skips the bedrock dispatcher; only GGUF tiers will be reachable.)
 
{opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/README.md

@@ -19,14 +19,14 @@ client (opencode / curl / Cursor / etc.)
   │
   ▼
 http://127.0.0.1:10101   <-- FastAPI router (llmstack.app)
-  │ • model="auto" → classify → rewrite to one of
+  │ • model="auto" → classify → rewrite to one of 3 coder tiers
   │ • everything else → pass-through
   ▼
 http://127.0.0.1:10102   <-- llama-swap (binary, manages model lifecycle)
   │ • loads/unloads llama-server processes per model
   │ • matrix solver allows {code-fast + one heavy model} co-resident
   ▼
-llama-server <code-fast | code-smart |
+llama-server <code-fast | code-smart | code-ultra>
   │
   ▼
 GGUF in ~/.cache/huggingface/hub/...
@@ -42,7 +42,7 @@ A 64 GB unified memory M4 Max can comfortably hold **one always-on tiny coder +
 
 - **Agent work** (multi-file edits, tool use, refactors) → coder models, which are trained on tool-call protocols and code edits.
 - **Planning** (design discussions, architecture, "what's the best approach") → chat-tuned models, which are better at high-level reasoning and don't try to start writing code in response to every message.
-- **Uncensored planning** is a separate plan-tier model, opted in
+- **Uncensored planning** is a separate plan-tier model, opted in by explicit agent selection (`/agent plan-nofilter` in opencode).
 
 Routing decisions cost ~zero — they're a few regex checks in the FastAPI router, not an LLM call.
 
@@ -76,20 +76,18 @@ matches how these models actually behave on this stack:
 than priors, so they tend to *improve* relative to top-tier as the
 conversation grows.
 
-First match wins:
+First match wins (auto-routing only; `plan` and `plan-uncensored` are not auto-routed):
 
 | # | Condition | → Model | Reason |
 |---|---|---|---|
-| 1 |
-| 2 |
-| 3 |
-| 4 |
-| 5 |
-| 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
-| 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
+| 1 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
+| 2 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier — context still being built, latency/$ are best here |
+| 3 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
+| 4 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
+| 5 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
 
 Token estimates are `chars / 4` over all message text + `prompt`. The
-`code-ultra` rungs (
+`code-ultra` rungs (1 and 2) are gated on availability: when no
 `[code-ultra]` section is loaded from `models.ini`, both silently fall
 back to `code-smart` so vanilla installs don't 404.
 
@@ -139,7 +137,8 @@ your global setup unchanged.
 | **`agent.plan-nofilter`** (custom uncensored planner) | `llama.cpp/plan-uncensored` |
 
 Inside opencode you can switch agents with `/agent` or by `@plan-nofilter`-mentioning
-a custom one.
+a custom one. The `plan` and `plan-uncensored` tiers are **not auto-routed** from the build agent —
+they're only accessible via explicit agent selection (`/agent plan` or `/agent plan-nofilter`).
 
 Want a second terminal into the same stack? Install the activate hook
 once (`eval "$(llmstack activate zsh)"`) and any new shell that `cd`s
@@ -207,8 +206,9 @@ Per-project state (gitignored) is created lazily under `<work-dir>/.llmstack/`:
 ```
 
 The `llama-swap` binary lives outside any project at
-`$XDG_DATA_HOME/llmstack/bin/llama-swap` (override with
-`LLMSTACK_BIN_DIR`)
+`$XDG_DATA_HOME/llmstack/bin/llama-swap` on macOS/Linux (override with
+`LLMSTACK_BIN_DIR`), or `%LOCALAPPDATA%\llmstack\bin\llama-swap.exe` on Windows.
+One download is reused across all projects.
 
 ## Quick start
 
@@ -299,8 +299,9 @@ Notes:
   or a package like `winget install ggml.llama-cpp` and put it on
   `PATH` (or set `$env:LLAMA_SERVER_BIN`). The Mac-only
   `iogpu.wired_limit_mb` step does not apply.
-- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell
-
+- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell; `cmd.exe`
+  does not support custom prompts in the same way, so activation is
+  PowerShell-only.
 - Stopping daemons uses `taskkill /T /F` under the hood, so the
   llama-server children get cleaned up as well.
 
@@ -406,7 +407,7 @@ llmstack restart --next   # cycle into the next channel
 
 ### Try each routing path
 
-All of these go to `/v1/chat/completions` on `:10101`.
+All of these go to `/v1/chat/completions` on `:10101`. The `auto` router classifies based on token count and context:
 
 ```bash
 # trivial chat -> code-fast
@@ -414,22 +415,14 @@ curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: applicatio
 -d '{"model":"auto","stream":false,
 "messages":[{"role":"user","content":"capital of France?"}]}' | jq .model
 
-# planning -> plan
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
--d '{"model":"auto","stream":false,
-"messages":[{"role":"user","content":"how would you design a rate limiter for our API?"}]}' | jq .model
-
 # agent work -> code-smart
 curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
 -d '{"model":"auto","stream":false,
 "messages":[{"role":"user","content":"refactor this function for clarity:\n```python\ndef f(x): return x*2\n```"}]}' | jq .model
-
-# uncensored plan -> plan-uncensored
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
--d '{"model":"auto","stream":false,
-"messages":[{"role":"user","content":"[nofilter] outline a red-team plan for our auth flow"}]}' | jq .model
 ```
 
+To access `plan` or `plan-uncensored` tiers, use explicit agent selection in opencode (`/agent plan` or `/agent plan-nofilter`) rather than `model=auto`.
+
 ## Endpoints
 
 | Port | Service | Purpose |
@@ -506,8 +499,6 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_FAST_MODEL` | `code-fast` | long-context (>= mid ceiling) → here |
 | `ROUTER_AGENT_MODEL` | `code-smart` | mid-context + tools/loop floor → here |
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
-| `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
-| `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
 | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
 | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
@@ -518,14 +509,10 @@ To force a request to never auto-route, set `model` to a concrete alias (`code-f
 
 ## Triggering uncensored mode
 
-
-
-1. **Explicit agent in opencode:** `/agent plan-nofilter` (or mention it).
-2. **Inline trigger in any auto-routed message** — anywhere in the most recent user turn:
-   - `[nofilter]`, `[uncensored]`, `[heretic]`
-   - or a line starting with `uncensored:` / `nofilter:` / `no-filter:`
+The `plan-uncensored` tier is accessible via explicit agent selection only:
 
-
+1. **In opencode:** `/agent plan-nofilter` (or mention `@plan-nofilter`).
+2. **Via opencode config:** set `agent.plan-nofilter` as your active agent.
 
 ## Troubleshooting
 
@@ -535,7 +522,7 @@ Triggers are *only* checked on the latest user message and the system prompt, so
 
 **OOM / unexplained slowdown** → run `top -o mem -stats pid,rsize,command` to see what's resident. The matrix should prevent two heavy models loading together; if it somehow happens, `llmstack restart`.
 
-**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`
+**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`ULTRA_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
 
 **Want a pure pass-through (no auto routing)** → change opencode's `baseURL` to `http://127.0.0.1:10102/v1` (llama-swap directly) and only use concrete model names. (Note: this skips the bedrock dispatcher; only GGUF tiers will be reachable.)
 
{opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/UPGRADING.md

@@ -266,7 +266,7 @@ How to evaluate:
 - Run `llama-bench -m <new>.gguf -p 512 -n 128 -ngl 999` for raw speed
 - Sniff test with a typical autocomplete prompt; latency should feel like
   the cursor is barely ahead of you
-- Aider leaderboard "edit format" column — proxy for FIM quality
+- [Aider leaderboard](https://aider.chat/docs/leaderboards/) "edit format" column — proxy for FIM quality
 
 Size budget: **~2–6 GB** weights (we want this resident permanently while
 sharing memory with the heavy tier).
@@ -287,10 +287,10 @@ What matters:
 - **Speed at full context** (MoE models win here on Apple Silicon)
 
 How to evaluate:
-- Aider's
+- [Aider's LLM Leaderboard](https://aider.chat/docs/leaderboards/) — most
   honest signal for agentic coding
-- LiveCodeBench scores
-- SWE-Bench Verified (the "real PRs" benchmark)
+- [LiveCodeBench](https://livecodebench.github.io/leaderboard.html) scores
+- [SWE-Bench Verified](https://www.swebench.com/) (the "real PRs" benchmark)
 - Run an actual opencode session in `build` mode against your repo
 
 Size budget: **~30–55 GB** weights (must fit alongside `code-fast` ≈ 5 GB
@@ -311,8 +311,8 @@ What matters:
 - **Refusals on edge cases** — fine to refuse weird stuff in plain plan mode
 
 How to evaluate:
-- Open LLM Leaderboard (filter to chat/instruct, your size class)
-- Chatbot Arena — vibes-based but useful proxy
+- [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) (filter to chat/instruct, your size class)
+- [Chatbot Arena](https://lmarena.ai/) — vibes-based but useful proxy
 - Hand-roll a "design this rate limiter" prompt and compare outputs
 
 Size budget: **~7–25 GB** weights — this tier shouldn't dominate memory.
@@ -360,12 +360,12 @@ Same size budget as `plan`.
 
 | Tier | Leaderboard |
 |---|---|
-| `code-fast` / `code-smart` | https://aider.chat/docs/leaderboards/ |
-| | https://livecodebench.github.io/leaderboard.html |
-| | https://www.swebench.com/
-| `plan` / `plan-uncensored` | https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard |
-| | https://lmarena.ai/ |
-| | https://livebench.ai/ |
+| `code-fast` / `code-smart` | [Aider LLM Leaderboard](https://aider.chat/docs/leaderboards/) |
+| | [LiveCodeBench](https://livecodebench.github.io/leaderboard.html) |
+| | [SWE-Bench Verified](https://www.swebench.com/) |
+| `plan` / `plan-uncensored` | [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) |
+| | [Chatbot Arena](https://lmarena.ai/) |
+| | [LiveBench](https://livebench.ai/) |
 
 **Community signal** (qualitative but valuable):
 
{opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/app.py

@@ -36,7 +36,7 @@ Behaviour:
 ``POST /v1/completions``
 - if request body ``model == "auto"`` (or unset), classify the request
   and rewrite ``model`` -> one of: ``code-fast``, ``code-smart``,
-  ``code-ultra`` (when wired)
+  ``code-ultra`` (when wired).
 - otherwise pass through unchanged.
 - tiers with ``backend = bedrock`` in ``models.ini`` are dispatched
   to AWS Bedrock via :mod:`llmstack.backends.bedrock` instead of
@@ -63,41 +63,28 @@ step DOWN as context grows**. This inverts the classic
 from priors.
 
 So as the conversation accumulates context, we step *down*: ultra
--> smart -> fast.
-ladder.
+-> smart -> fast.
 
 Routing decision tree (first match wins):
 
-1. Explicit "
-   (``[nofilter]``, ``[uncensored]``, ``[heretic]``, or a line
-   starting with ``uncensored:`` / ``nofilter:``) -> plan-uncensored
-2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
+1. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
    ``ultra:``, ``opus:``) AND ultra tier configured -> code-ultra
-
-   AND estimated tokens <= ``[plan]`` tier's ctx_size
-   (pure design discussion that fits the planner's
-   window) -> plan
-   (if the planner's
-   ctx_size is breached
-   we fall through to
-   the coding ladder
-   rather than send a
-   request that won't
-   fit -- the coding
-   tiers cover larger
-   windows by design)
-4. Estimated input tokens <= HIGH_FIDELITY_CEILING
+2. Estimated input tokens <= HIGH_FIDELITY_CEILING
    ("reasonable context still being built") -> code-ultra
    (else code-smart)
-
-
-
-
+3. Estimated input tokens <= MID_FIDELITY_CEILING -> code-smart
+4. Otherwise (long context, top-tier becomes
+   expensive/slow, fast tier's 128k window is the
+   best fit and it's free) -> code-fast
   (floored at
   code-smart when
   n_turns >=
   MULTI_TURN_THRESHOLD)
 
+Plan and uncensored tiers are accessible via their dedicated agent
+modes (``agent.plan``, ``agent.plan-nofilter``) and slash commands;
+they are not auto-routed through ``model = auto``.
+
 The auto router's effective max context window is
 ``[code-fast].ctx_size`` -- fast is the bottom of the step-down
 ladder, so any context that would overflow the tiers above lands on
@@ -140,8 +127,6 @@ UPSTREAM = os.getenv("LLAMA_SWAP_URL", "http://127.0.0.1:10102").rstrip("/")
 FAST_MODEL = os.getenv("ROUTER_FAST_MODEL", "code-fast")
 AGENT_MODEL = os.getenv("ROUTER_AGENT_MODEL", "code-smart")
 ULTRA_MODEL = os.getenv("ROUTER_ULTRA_MODEL", "code-ultra")
-PLAN_MODEL = os.getenv("ROUTER_PLAN_MODEL", "plan")
-UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
 
 # Step-DOWN ladder (see module docstring). Both ceilings are *upper
 # bounds* of a tier's sweet-spot range, expressed in estimated input
@@ -167,45 +152,14 @@ UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
 # still has comfortable headroom.
 HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
 MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
-# Floor the long-context rung at code-smart whenever a tool-call
-# protocol is in play -- 3B models tool-call unreliably regardless of
-# how big their context window is.
 MULTI_TURN_THRESHOLD = int(os.getenv("ROUTER_MULTI_TURN", "10"))
 AUTO_ALIASES = {"auto", "", None}
 
-UNCENSORED_TRIGGERS = re.compile(
-    r"(\[(uncensored|nofilter|no-?filter|heretic)\]"
-    r"|^[ \t]*(uncensored|nofilter|no-?filter)\s*:)",
-    re.IGNORECASE | re.MULTILINE,
-)
-
 ULTRA_TRIGGERS = re.compile(
     r"(\[(ultra|opus)\]|^[ \t]*(ultra|opus)\s*:)",
     re.IGNORECASE | re.MULTILINE,
 )
 
-PLAN_SIGNALS = re.compile(
-    r"\b(plan|design|architect(ure)?|approach|trade-?off|"
-    r"should\s+we|how\s+would\s+(you|we)|what\s+would\s+you|"
-    r"explain\s+why|reason\s+about|think\s+(through|step|hard|carefully)|"
-    r"compare\s+(options|approaches)|review\s+(the|this|my)\s+"
-    r"(architecture|design|approach|plan)|brainstorm|outline|"
-    r"summari[sz]e|root\s*cause|migrate|port\s+to)\b",
-    re.IGNORECASE,
-)
-
-AGENT_SIGNALS = re.compile(
-    r"\b(implement|fix\s+(this|the|a|my)?\s*(bug|issue|error|test)|"
-    r"write\s+(a|the|some)?\s*(function|class|test|script|module|method)|"
-    r"add\s+(a|the)?\s*(function|class|method|test|file|endpoint)|"
-    r"create\s+(a|the)?\s*(function|class|file|component|endpoint)|"
-    r"refactor|edit|patch|generate\s+code|debug|trace|"
-    r"run\s+tests?|build\s+(it|this)|compile)\b",
-    re.IGNORECASE,
-)
-
-CODE_BLOCK = re.compile(r"```|`[^`\n]{30,}`")
-
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO"),
     format="%(asctime)s %(levelname)s router %(message)s",
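The surviving trigger pattern is easy to sanity-check in isolation (same regex as above; the last assert shows that bare words do not match):

```python
import re

ULTRA_TRIGGERS = re.compile(
    r"(\[(ultra|opus)\]|^[ \t]*(ultra|opus)\s*:)",
    re.IGNORECASE | re.MULTILINE,
)

assert ULTRA_TRIGGERS.search("[ultra] refactor the parser")   # bracketed form
assert ULTRA_TRIGGERS.search("Opus: design the schema")       # line-start form
assert not ULTRA_TRIGGERS.search("ultrasound the codebase")   # bare word, no match
```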
@@ -221,12 +175,11 @@ async def _lifespan(app: FastAPI):
     bedrock_tiers = sorted(t.name for t in TIERS.values() if t.is_bedrock)
     log.info(
         "router up upstream=%s ladder=[ultra<=%d -> agent<=%d -> fast] "
-        "fast=%s agent=%s ultra=%s
+        "fast=%s agent=%s ultra=%s bedrock=%s",
         UPSTREAM, HIGH_FIDELITY_CEILING, MID_FIDELITY_CEILING,
         FAST_MODEL, AGENT_MODEL,
         f"{ULTRA_MODEL} (active)" if _ultra_available()
         else f"{ULTRA_MODEL} (unwired -- high-fidelity rung falls back to {AGENT_MODEL})",
-        PLAN_MODEL, UNCENSORED_MODEL,
         ",".join(bedrock_tiers) or "(none)",
     )
     yield
@@ -302,12 +255,6 @@ def _estimate_tokens(messages: list[dict[str, Any]] | None, prompt: str | None)
     return chars // 4
 
 
-def _matches(pattern: re.Pattern[str], messages: list[dict[str, Any]] | None, prompt: str | None) -> bool:
-    if prompt and pattern.search(prompt):
-        return True
-    return any(pattern.search(t) for t in _iter_message_text(messages))
-
-
 def _ultra_available() -> bool:
     """True iff the ultra tier is loaded from ``models.ini``.
 
@@ -331,6 +278,11 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
 
     Step-DOWN ladder: top fidelity for short context, fall to mid for
     medium, drop to fast for long. See module docstring for rationale.
+
+    Only the fast / agent / ultra rungs are implemented here. Plan and
+    uncensored tiers are accessible via their dedicated agent modes
+    (``agent.plan``, ``agent.plan-nofilter``) and slash commands; they
+    are not auto-routed from the build agent.
     """
     messages = body.get("messages") if isinstance(body.get("messages"), list) else None
     prompt = body.get("prompt") if isinstance(body.get("prompt"), str) else None
@@ -341,51 +293,17 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
         for m in (messages or [])
         if m.get("role") == "system" and isinstance(m.get("content"), str)
     ]
-    if any(UNCENSORED_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
-        return UNCENSORED_MODEL, "uncensored-trigger"
 
     if any(ULTRA_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
         if _ultra_available():
             return ULTRA_MODEL, "ultra-trigger"
-        # Explicit user opt-in but the tier isn't wired up. Don't 404 --
-        # serve the request from the heaviest tier we *do* have and let
-        # the user notice in logs that their trigger was a no-op.
         log.warning("ultra-trigger ignored: %s not in models.ini; falling back to %s",
                     ULTRA_MODEL, AGENT_MODEL)
         return AGENT_MODEL, f"ultra-trigger->agent ({ULTRA_MODEL} unavailable)"
 
     n_turns = sum(1 for m in (messages or []) if m.get("role") == "user")
-    _last_msgs = [{"role": "user", "content": last_user}] if last_user else None
-    has_code_signal = (
-        _matches(CODE_BLOCK, _last_msgs, prompt)
-        or _matches(AGENT_SIGNALS, _last_msgs, prompt)
-    )
-
     est = _estimate_tokens(messages, prompt)
 
-    # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
-    # chat-tuned model meant for design / "should we" discussions. Only
-    # take it when nothing about the request says "I'm about to write
-    # code" (no triple-backticks, no agent verbs). Tools are stripped
-    # from the request body before dispatch (see ``_handle_completion``),
-    # so their presence here does not block plan routing.
-    # Only route to plan if the input fits in the planner's ctx_size --
-    # past that we fall through to the coding ladder which has tiers
-    # (smart, fast) explicitly sized for larger contexts.
-    if (
-        not has_code_signal
-        and _matches(PLAN_SIGNALS, messages, prompt)
-    ):
-        plan_tier = TIER_BY_ALIAS.get(PLAN_MODEL)
-        plan_ctx = plan_tier.ctx_size if plan_tier else 0
-        if not plan_ctx or est <= plan_ctx:
-            return PLAN_MODEL, "plan-signal"
-        log.info(
-            "plan-signal but tokens~%d > %s.ctx_size %d; "
-            "falling through to coding ladder",
-            est, PLAN_MODEL, plan_ctx,
-        )
-
     # Rung 1: short context -- start at the top.
     if est <= HIGH_FIDELITY_CEILING:
         if _ultra_available():
@@ -400,9 +318,7 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
         return AGENT_MODEL, f"mid-fidelity tokens~{est}<={MID_FIDELITY_CEILING}"
 
     # Rung 3: long context -- step down to fast. Floor at smart only
-    # when the multi-turn threshold is hit
-    # prevent the step-down (plan tiers strip tools before dispatch,
-    # and code-fast is a hosted model that tool-calls reliably).
+    # when the multi-turn threshold is hit.
     if n_turns >= MULTI_TURN_THRESHOLD:
         return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} (user-turns={n_turns}>={MULTI_TURN_THRESHOLD} floor)"
     return FAST_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING}"
@@ -531,14 +447,14 @@ async def list_models() -> JSONResponse:
             f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING}, "
             f"'{FAST_MODEL}' beyond that."
         )
-        name = "Auto (step-down router: ultra/agent/fast
+        name = "Auto (step-down router: ultra/agent/fast)"
     else:
         top_blurb = (
             f"Step-down ladder (top->bottom as context grows): "
             f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING} tokens, "
             f"'{FAST_MODEL}' beyond that."
         )
-        name = "Auto (step-down router: agent/fast
+        name = "Auto (step-down router: agent/fast)"
     data["data"].insert(0, {
         "id": "auto",
         "object": "model",
@@ -547,8 +463,6 @@ async def list_models() -> JSONResponse:
         "name": name,
         "description": (
             f"{top_blurb} "
-            f"'{PLAN_MODEL}' for design/planning (orthogonal to ladder); "
-            f"'{UNCENSORED_MODEL}' for explicit [nofilter] triggers; "
             f"'[ultra]'/'[opus]' triggers force '{ULTRA_MODEL}' regardless of size."
         ),
         "tier": "auto",
@@ -608,6 +522,41 @@ def _inject_sampler(body: dict[str, Any], tier: Tier) -> bool:
     return mutated
 
 
+def _inject_name_json(raw: bytes, tier_name: str) -> bytes:
+    try:
+        data = json.loads(raw)
+    except (json.JSONDecodeError, ValueError):
+        return raw
+    try:
+        msg = data["choices"][0]["message"]
+        if msg.get("content"):
+            msg["name"] = tier_name
+    except (KeyError, IndexError, TypeError):
+        pass
+    return json.dumps(data).encode()
+
+
+def _inject_name_sse(chunk: bytes, tier_name: str, injected: list[bool]) -> bytes:
+    if injected[0]:
+        return chunk
+    line = chunk.decode(errors="replace")
+    if not line.startswith("data: "):
+        return chunk
+    payload_str = line[len("data: "):].strip()
+    if payload_str in ("[DONE]", ""):
+        return chunk
+    try:
+        payload = json.loads(payload_str)
+        delta = payload["choices"][0]["delta"]
+        if "role" in delta:
+            delta["name"] = tier_name
+            injected[0] = True
+            return f"data: {json.dumps(payload, separators=(',', ':'))}\n\n".encode()
+    except (KeyError, IndexError, TypeError, json.JSONDecodeError):
+        pass
+    return chunk
+
+
 async def _handle_completion(req: Request, path: str) -> Response:
     raw = await req.body()
     headers = _filter_request_headers(req)
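Feeding a synthetic first frame through the new helper shows the rewrite (assuming `_inject_name_sse` from the hunk above is in scope; output computed from that code):

```python
chunk = b'data: {"choices":[{"delta":{"role":"assistant"}}]}\n\n'
injected = [False]
print(_inject_name_sse(chunk, "code-smart", injected))
# b'data: {"choices":[{"delta":{"role":"assistant","name":"code-smart"}}]}\n\n'
print(injected)  # [True] -- every later chunk passes through untouched
```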
@@ -631,11 +580,6 @@ async def _handle_completion(req: Request, path: str) -> Response:
         mutated = True
 
     chosen_name = body.get("model")
-    if chosen_name in {PLAN_MODEL, UNCENSORED_MODEL} and body.get("tools"):
-        log.info("plan tier %s: stripping tools from request", chosen_name)
-        body.pop("tools")
-        body.pop("tool_choice", None)
-        mutated = True
     tier = _resolve_tier(chosen_name)
     if tier is not None and _inject_sampler(body, tier):
         mutated = True
@@ -646,6 +590,28 @@ async def _handle_completion(req: Request, path: str) -> Response:
     if tier is not None and tier.is_bedrock:
         from llmstack.backends import bedrock as bedrock_backend
         resp = await bedrock_backend.dispatch(req, tier, body)
+    elif tier is not None and body.get("stream"):
+        proxy = await _stream_proxy(req.method, path, raw, headers)
+        injected: list[bool] = [False]
+        tier_name = tier.name
+        original_gen = proxy.body_iterator
+
+        async def _named_gen():
+            async for chunk in original_gen:
+                yield _inject_name_sse(chunk, tier_name, injected)
+
+        proxy.body_iterator = _named_gen()
+        resp = proxy
+    elif tier is not None:
+        proxy = await _stream_proxy(req.method, path, raw, headers)
+        raw_resp = b"".join([chunk async for chunk in proxy.body_iterator])
+        patched = _inject_name_json(raw_resp, tier.name)
+        resp = Response(
+            content=patched,
+            status_code=proxy.status_code,
+            headers=dict(proxy.headers),
+            media_type=proxy.media_type,
+        )
     else:
         resp = await _stream_proxy(req.method, path, raw, headers)
 
{opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/backends/bedrock.py

@@ -588,6 +588,8 @@ async def _complete_response(client: Any, tier: Tier, converse_kwargs: dict[str,
         return JSONResponse(status_code=502, content={"error": _error_payload(exc)})
 
     message, finish = _openai_message_from_converse(resp)
+    if message.get("content"):
+        message["name"] = tier.name
     usage_in = (resp.get("usage") or {})
     payload = {
         "id": _completion_id(),
@@ -665,7 +667,7 @@ async def _stream_response(client: Any, tier: Tier, converse_kwargs: dict[str, A
 
     # First chunk: announce the assistant role so OpenAI clients can
     # initialise their accumulator.
-    yield _sse(_frame({"role": "assistant"}))
+    yield _sse(_frame({"role": "assistant", "name": model_label}))
 
     # Per-content-block state: index -> "text" | "tool_use"
     block_kinds: dict[int, str] = {}
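The net effect on the wire is a first SSE frame whose role delta carries the tier name (a synthetic illustration; `_sse` and `_frame` are outside this diff, so the exact framing is assumed from the OpenAI chat-stream schema):

```python
import json

# model_label ends up as a nonstandard "name" field on the role delta
delta = {"role": "assistant", "name": "plan"}
frame = {"choices": [{"index": 0, "delta": delta}]}
print(f"data: {json.dumps(frame)}\n")
```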
{opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/generators/opencode.py

@@ -69,7 +69,7 @@ COMMANDS = {
         "agent": "plan",
     },
     "nofilter": {
-        "template": "
+        "template": "",
         "description": "Route to the uncensored planning model.",
         "agent": "plan-nofilter",
     },
@@ -194,7 +194,7 @@ def build_config(
 
     models: dict[str, dict] = {
         "auto": {
-            "name": "Auto (router selects: fast / agent /
+            "name": "Auto (router selects: fast / agent / ultra)",
            "limit": {"context": auto_ctx, "output": 16384},
            "tool_call": True,
            "cost": ZERO_COST,
{opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/models.ini

@@ -178,7 +178,7 @@ description = Qwopus GLM 18B - planning, design discussions, architecture
 ; aws_region = eu-central-1
 ; aws_profile = bedrock-prod
 ; ctx_size = 200000
-; sampler = temp=0.7
+; sampler = temp=0.7 ; creative; Opus 4.6
 ; description = Claude Opus 4.6 on Bedrock - planning, design discussions, architecture
 
 [plan-uncensored]
@@ -258,21 +258,18 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 ;
 ; First-match-wins decision tree applied by llmstack/app.py when model="auto":
 ;
-; 1. "[
-; 2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
+; 1. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
 ;    tier configured -> code-ultra
-;
-;    AND tokens <= [plan].ctx_size (pure design discussion that
-;    still fits the planner's window) -> plan
-;    ...if the plan tier's ctx_size is breached, the request
-;    falls through to the coding ladder below rather than being
-;    sent to a planner whose window can't hold the input.
-; 4. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
+; 2. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
 ;    tokens <= high_fidelity_ceiling AND no code-ultra -> code-smart
-;
-;
-; - if
-; - else
+; 3. tokens <= mid_fidelity_ceiling -> code-smart
+; 4. otherwise (long context):
+;    - if turns >= multi_turn (floor at smart) -> code-smart
+;    - else -> code-fast
+;
+; Plan and uncensored tiers are accessible via their dedicated agent
+; modes (agent.plan, agent.plan-nofilter) and slash commands; they are
+; NOT auto-routed through model=auto.
 ;
 ; AUTO ROUTER MAX CONTEXT = [code-fast].ctx_size. The fast tier sits at
 ; the bottom of the step-down ladder, so any context too big for the
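The availability gate in this tree can be reproduced with a plain `configparser` check for the section (a sketch; the package's own tier loading is not shown in this diff):

```python
import configparser

# models.ini uses ";" comments, including inline ones after values
cfg = configparser.ConfigParser(inline_comment_prefixes=(";",))
cfg.read("models.ini")

ultra_wired = cfg.has_section("code-ultra")
print("high-fidelity rung ->", "code-ultra" if ultra_wired else "code-smart")
```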
@@ -303,9 +300,6 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 high_fidelity_ceiling = 12000 ; tokens; below this, top-tier model is still cheap+fast (and ultra ctx_size = 2 * this)
 mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast (smart ctx_size = 2 * this)
 multi_turn = 10 ; turn count that floors the long-context rung at code-smart
-agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
-plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
-uncensored_triggers = [nofilter], [uncensored], [heretic], "uncensored:", "nofilter:" (line start)
 ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)
 
 ;------------------------------------------------------------------------------
{opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7/opencode_llmstack.egg-info}/PKG-INFO

(same content as the PKG-INFO diff above)
{opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "opencode-llmstack"
-version = "0.9.4"
+version = "0.9.7"
 description = "Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring."
 readme = "README.md"
 requires-python = ">=3.11"