opencode-llmstack 0.9.4__tar.gz → 0.9.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {opencode_llmstack-0.9.4/opencode_llmstack.egg-info → opencode_llmstack-0.9.7}/PKG-INFO +26 -39
  2. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/README.md +25 -38
  3. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/UPGRADING.md +12 -12
  4. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/__init__.py +1 -1
  5. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/app.py +78 -112
  6. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/backends/bedrock.py +3 -1
  7. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/generators/opencode.py +2 -2
  8. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/models.ini +11 -17
  9. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7/opencode_llmstack.egg-info}/PKG-INFO +26 -39
  10. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/pyproject.toml +1 -1
  11. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/CHANGELOG.md +0 -0
  12. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/LICENSE +0 -0
  13. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/AGENTS.md +0 -0
  14. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/__main__.py +0 -0
  15. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/_platform.py +0 -0
  16. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/backends/__init__.py +0 -0
  17. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/check_models.py +0 -0
  18. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/cli.py +0 -0
  19. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/__init__.py +0 -0
  20. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/_helpers.py +0 -0
  21. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/activate.py +0 -0
  22. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/check.py +0 -0
  23. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/download.py +0 -0
  24. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/install.py +0 -0
  25. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/install_llama_swap.py +0 -0
  26. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/reload.py +0 -0
  27. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/restart.py +0 -0
  28. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/setup.py +0 -0
  29. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/start.py +0 -0
  30. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/status.py +0 -0
  31. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/commands/stop.py +0 -0
  32. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/download/__init__.py +0 -0
  33. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/download/binary.py +0 -0
  34. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/download/ggufs.py +0 -0
  35. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/generators/__init__.py +0 -0
  36. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/generators/llama_swap.py +0 -0
  37. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/paths.py +0 -0
  38. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/shell_env.py +0 -0
  39. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/llmstack/tiers.py +0 -0
  40. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/SOURCES.txt +0 -0
  41. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/dependency_links.txt +0 -0
  42. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/entry_points.txt +0 -0
  43. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/requires.txt +0 -0
  44. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/top_level.txt +0 -0
  45. {opencode_llmstack-0.9.4 → opencode_llmstack-0.9.7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opencode-llmstack
- Version: 0.9.4
+ Version: 0.9.7
  Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
  Author: llmstack
  License: MIT License
@@ -78,14 +78,14 @@ client (opencode / curl / Cursor / etc.)
 
 
  http://127.0.0.1:10101   <-- FastAPI router (llmstack.app)
- │ • model="auto" → classify → rewrite to one of 4 tiers
+ │ • model="auto" → classify → rewrite to one of 3 coder tiers
  │ • everything else → pass-through
 
  http://127.0.0.1:10102   <-- llama-swap (binary, manages model lifecycle)
  │ • loads/unloads llama-server processes per model
  │ • matrix solver allows {code-fast + one heavy model} co-resident
 
- llama-server <code-fast | code-smart | plan | plan-uncensored>
+ llama-server <code-fast | code-smart | code-ultra>
 
 
  GGUF in ~/.cache/huggingface/hub/...
@@ -101,7 +101,7 @@ A 64 GB unified memory M4 Max can comfortably hold **one always-on tiny coder +
 
  - **Agent work** (multi-file edits, tool use, refactors) → coder models, which are trained on tool-call protocols and code edits.
  - **Planning** (design discussions, architecture, "what's the best approach") → chat-tuned models, which are better at high-level reasoning and don't try to start writing code in response to every message.
- - **Uncensored planning** is a separate plan-tier model, opted in either by request (`agent.plan-nofilter` in opencode) or by an inline `[nofilter]` trigger in the prompt.
+ - **Uncensored planning** is a separate plan-tier model, opted in by explicit agent selection (`/agent plan-nofilter` in opencode).
 
  Routing decisions cost ~zero — they're a few regex checks in the FastAPI router, not an LLM call.
 
@@ -135,20 +135,18 @@ matches how these models actually behave on this stack:
  than priors, so they tend to *improve* relative to top-tier as the
  conversation grows.
 
- First match wins:
+ First match wins (auto-routing only; `plan` and `plan-uncensored` are not auto-routed):
 
  | # | Condition | → Model | Reason |
  |---|---|---|---|
- | 1 | last user msg contains `[nofilter]`, `[uncensored]`, `[heretic]`, or starts with `uncensored:` / `nofilter:` | `plan-uncensored` | explicit opt-in |
- | 2 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
- | 3 | plan verbs (*design, architect, approach, trade-off, should we, explain why, …*) AND no code blocks / agent verbs / tools | `plan` | pure design discussion (orthogonal track) |
- | 4 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier context still being built, latency/$ are best here |
- | 5 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
- | 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
- | 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
+ | 1 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
+ | 2 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier context still being built, latency/$ are best here |
+ | 3 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
+ | 4 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
+ | 5 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
 
  Token estimates are `chars / 4` over all message text + `prompt`. The
- `code-ultra` rungs (2 and 4) are gated on availability: when no
+ `code-ultra` rungs (1 and 2) are gated on availability: when no
  `[code-ultra]` section is loaded from `models.ini`, both silently fall
  back to `code-smart` so vanilla installs don't 404.
 
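The rewritten five-rung table boils down to a few integer comparisons. A minimal sketch of the documented ladder (not the package's exact code; `ultra_available` stands in for the `models.ini` availability check, and the explicit `[ultra]` trigger rung is left out):

```python
# Sketch of the 0.9.7 step-down ladder (ceilings and tier names taken from
# the table above; `ultra_available` stands in for the models.ini check).
HIGH_FIDELITY_CEILING = 12_000
MID_FIDELITY_CEILING = 32_000
MULTI_TURN_THRESHOLD = 10

def estimate_tokens(messages: list[dict], prompt: str | None) -> int:
    chars = sum(len(m["content"]) for m in messages
                if isinstance(m.get("content"), str))
    return (chars + len(prompt or "")) // 4   # the documented chars/4 heuristic

def route(messages: list[dict], prompt: str | None = None,
          ultra_available: bool = False) -> str:
    est = estimate_tokens(messages, prompt)
    if est <= HIGH_FIDELITY_CEILING:           # rung 2: short context, top tier
        return "code-ultra" if ultra_available else "code-smart"
    if est <= MID_FIDELITY_CEILING:            # rung 3: mid context
        return "code-smart"
    user_turns = sum(1 for m in messages if m.get("role") == "user")
    if user_turns >= MULTI_TURN_THRESHOLD:     # rung 4: deep agentic loop floor
        return "code-smart"
    return "code-fast"                         # rung 5: long context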
@@ -198,7 +196,8 @@ your global setup unchanged.
  | **`agent.plan-nofilter`** (custom uncensored planner) | `llama.cpp/plan-uncensored` |
 
  Inside opencode you can switch agents with `/agent` or by `@plan-nofilter`-mentioning
- a custom one. Slash-commands `/review`, `/nofilter` are also available.
+ a custom one. The `plan` and `plan-uncensored` tiers are **not auto-routed** from the build agent —
+ they're only accessible via explicit agent selection (`/agent plan` or `/agent plan-nofilter`).
 
  Want a second terminal into the same stack? Install the activate hook
  once (`eval "$(llmstack activate zsh)"`) and any new shell that `cd`s
@@ -266,8 +265,9 @@ Per-project state (gitignored) is created lazily under `<work-dir>/.llmstack/`:
  ```
 
  The `llama-swap` binary lives outside any project at
- `$XDG_DATA_HOME/llmstack/bin/llama-swap` (override with
- `LLMSTACK_BIN_DIR`). One download is reused across all projects.
+ `$XDG_DATA_HOME/llmstack/bin/llama-swap` on macOS/Linux (override with
+ `LLMSTACK_BIN_DIR`), or `%LOCALAPPDATA%\llmstack\bin\llama-swap.exe` on Windows.
+ One download is reused across all projects.
 
  ## Quick start
 
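The documented locations collapse into a small lookup. A hypothetical sketch (this is not the package's `llmstack/paths.py`; it only encodes the paths and the `LLMSTACK_BIN_DIR` override named in this hunk):

```python
# Hypothetical sketch of the documented binary location (not the real
# llmstack/paths.py): LLMSTACK_BIN_DIR wins, else the platform default.
import os
import sys
from pathlib import Path

def llama_swap_bin() -> Path:
    win = sys.platform == "win32"
    if override := os.getenv("LLMSTACK_BIN_DIR"):
        base = Path(override)
    elif win:
        base = Path(os.environ["LOCALAPPDATA"]) / "llmstack" / "bin"
    else:
        data = os.getenv("XDG_DATA_HOME", str(Path.home() / ".local" / "share"))
        base = Path(data) / "llmstack" / "bin"
    return base / ("llama-swap.exe" if win else "llama-swap")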
@@ -358,8 +358,9 @@ Notes:
    or a package like `winget install ggml.llama-cpp` and put it on
    `PATH` (or set `$env:LLAMA_SERVER_BIN`). The Mac-only
    `iogpu.wired_limit_mb` step does not apply.
- - The `[llmstack:<channel>]` prompt prefix shows up in PowerShell too;
-   `cmd.exe` gets a simpler `[llmstack:<channel>]` prompt via `doskey`.
+ - The `[llmstack:<channel>]` prompt prefix shows up in PowerShell; `cmd.exe`
+   does not support custom prompts in the same way, so activation is
+   PowerShell-only.
  - Stopping daemons uses `taskkill /T /F` under the hood, so the
    llama-server children get cleaned up as well.
 
@@ -465,7 +466,7 @@ llmstack restart --next # cycle into the next channel
 
  ### Try each routing path
 
- All of these go to `/v1/chat/completions` on `:10101`. Each should pick a different upstream model:
+ All of these go to `/v1/chat/completions` on `:10101`. The `auto` router classifies based on token count and context:
 
  ```bash
  # trivial chat -> code-fast
@@ -473,22 +474,14 @@ curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: applicatio
    -d '{"model":"auto","stream":false,
        "messages":[{"role":"user","content":"capital of France?"}]}' | jq .model
 
- # planning -> plan
- curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
-   -d '{"model":"auto","stream":false,
-       "messages":[{"role":"user","content":"how would you design a rate limiter for our API?"}]}' | jq .model
-
  # agent work -> code-smart
  curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
    -d '{"model":"auto","stream":false,
        "messages":[{"role":"user","content":"refactor this function for clarity:\n```python\ndef f(x): return x*2\n```"}]}' | jq .model
-
- # uncensored plan -> plan-uncensored
- curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
-   -d '{"model":"auto","stream":false,
-       "messages":[{"role":"user","content":"[nofilter] outline a red-team plan for our auth flow"}]}' | jq .model
  ```
 
+ To access `plan` or `plan-uncensored` tiers, use explicit agent selection in opencode (`/agent plan` or `/agent plan-nofilter`) rather than `model=auto`.
+
  ## Endpoints
 
  | Port | Service | Purpose |
@@ -565,8 +558,6 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
  | `ROUTER_FAST_MODEL` | `code-fast` | long-context (>= mid ceiling) → here |
  | `ROUTER_AGENT_MODEL` | `code-smart` | mid-context + tools/loop floor → here |
  | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
- | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
- | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
  | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
  | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
  | `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
@@ -577,14 +568,10 @@ To force a request to never auto-route, set `model` to a concrete alias (`code-f
 
  ## Triggering uncensored mode
 
- Two ways:
-
- 1. **Explicit agent in opencode:** `/agent plan-nofilter` (or mention it).
- 2. **Inline trigger in any auto-routed message** — anywhere in the most recent user turn:
-    - `[nofilter]`, `[uncensored]`, `[heretic]`
-    - or a line starting with `uncensored:` / `nofilter:` / `no-filter:`
+ The `plan-uncensored` tier is accessible via explicit agent selection only:
 
- Triggers are *only* checked on the latest user message and the system prompt, so an old `[nofilter]` further up the conversation won't pin the whole session.
+ 1. **In opencode:** `/agent plan-nofilter` (or mention `@plan-nofilter`).
+ 2. **Via opencode config:** set `agent.plan-nofilter` as your active agent.
 
  ## Troubleshooting
 
@@ -594,7 +581,7 @@ Triggers are *only* checked on the latest user message and the system prompt, so
 
  **OOM / unexplained slowdown** → run `top -o mem -stats pid,rsize,command` to see what's resident. The matrix should prevent two heavy models loading together; if it somehow happens, `llmstack restart`.
 
- **Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`AGENT_SIGNALS` / `PLAN_SIGNALS` / `UNCENSORED_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
+ **Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`ULTRA_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
 
  **Want a pure pass-through (no auto routing)** → change opencode's `baseURL` to `http://127.0.0.1:10102/v1` (llama-swap directly) and only use concrete model names. (Note: this skips the bedrock dispatcher; only GGUF tiers will be reachable.)
 
@@ -19,14 +19,14 @@ client (opencode / curl / Cursor / etc.)
 
 
  http://127.0.0.1:10101   <-- FastAPI router (llmstack.app)
- │ • model="auto" → classify → rewrite to one of 4 tiers
+ │ • model="auto" → classify → rewrite to one of 3 coder tiers
  │ • everything else → pass-through
 
  http://127.0.0.1:10102   <-- llama-swap (binary, manages model lifecycle)
  │ • loads/unloads llama-server processes per model
  │ • matrix solver allows {code-fast + one heavy model} co-resident
 
- llama-server <code-fast | code-smart | plan | plan-uncensored>
+ llama-server <code-fast | code-smart | code-ultra>
 
 
  GGUF in ~/.cache/huggingface/hub/...
@@ -42,7 +42,7 @@ A 64 GB unified memory M4 Max can comfortably hold **one always-on tiny coder +
 
  - **Agent work** (multi-file edits, tool use, refactors) → coder models, which are trained on tool-call protocols and code edits.
  - **Planning** (design discussions, architecture, "what's the best approach") → chat-tuned models, which are better at high-level reasoning and don't try to start writing code in response to every message.
- - **Uncensored planning** is a separate plan-tier model, opted in either by request (`agent.plan-nofilter` in opencode) or by an inline `[nofilter]` trigger in the prompt.
+ - **Uncensored planning** is a separate plan-tier model, opted in by explicit agent selection (`/agent plan-nofilter` in opencode).
 
  Routing decisions cost ~zero — they're a few regex checks in the FastAPI router, not an LLM call.
 
@@ -76,20 +76,18 @@ matches how these models actually behave on this stack:
  than priors, so they tend to *improve* relative to top-tier as the
  conversation grows.
 
- First match wins:
+ First match wins (auto-routing only; `plan` and `plan-uncensored` are not auto-routed):
 
  | # | Condition | → Model | Reason |
  |---|---|---|---|
- | 1 | last user msg contains `[nofilter]`, `[uncensored]`, `[heretic]`, or starts with `uncensored:` / `nofilter:` | `plan-uncensored` | explicit opt-in |
- | 2 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
- | 3 | plan verbs (*design, architect, approach, trade-off, should we, explain why, …*) AND no code blocks / agent verbs / tools | `plan` | pure design discussion (orthogonal track) |
- | 4 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier context still being built, latency/$ are best here |
- | 5 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
- | 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
- | 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
+ | 1 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
+ | 2 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier context still being built, latency/$ are best here |
+ | 3 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
+ | 4 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
+ | 5 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
 
  Token estimates are `chars / 4` over all message text + `prompt`. The
- `code-ultra` rungs (2 and 4) are gated on availability: when no
+ `code-ultra` rungs (1 and 2) are gated on availability: when no
  `[code-ultra]` section is loaded from `models.ini`, both silently fall
  back to `code-smart` so vanilla installs don't 404.
 
@@ -139,7 +137,8 @@ your global setup unchanged.
  | **`agent.plan-nofilter`** (custom uncensored planner) | `llama.cpp/plan-uncensored` |
 
  Inside opencode you can switch agents with `/agent` or by `@plan-nofilter`-mentioning
- a custom one. Slash-commands `/review`, `/nofilter` are also available.
+ a custom one. The `plan` and `plan-uncensored` tiers are **not auto-routed** from the build agent —
+ they're only accessible via explicit agent selection (`/agent plan` or `/agent plan-nofilter`).
 
  Want a second terminal into the same stack? Install the activate hook
  once (`eval "$(llmstack activate zsh)"`) and any new shell that `cd`s
@@ -207,8 +206,9 @@ Per-project state (gitignored) is created lazily under `<work-dir>/.llmstack/`:
  ```
 
  The `llama-swap` binary lives outside any project at
- `$XDG_DATA_HOME/llmstack/bin/llama-swap` (override with
- `LLMSTACK_BIN_DIR`). One download is reused across all projects.
+ `$XDG_DATA_HOME/llmstack/bin/llama-swap` on macOS/Linux (override with
+ `LLMSTACK_BIN_DIR`), or `%LOCALAPPDATA%\llmstack\bin\llama-swap.exe` on Windows.
+ One download is reused across all projects.
 
  ## Quick start
 
@@ -299,8 +299,9 @@ Notes:
    or a package like `winget install ggml.llama-cpp` and put it on
    `PATH` (or set `$env:LLAMA_SERVER_BIN`). The Mac-only
    `iogpu.wired_limit_mb` step does not apply.
- - The `[llmstack:<channel>]` prompt prefix shows up in PowerShell too;
-   `cmd.exe` gets a simpler `[llmstack:<channel>]` prompt via `doskey`.
+ - The `[llmstack:<channel>]` prompt prefix shows up in PowerShell; `cmd.exe`
+   does not support custom prompts in the same way, so activation is
+   PowerShell-only.
  - Stopping daemons uses `taskkill /T /F` under the hood, so the
    llama-server children get cleaned up as well.
 
@@ -406,7 +407,7 @@ llmstack restart --next # cycle into the next channel
 
  ### Try each routing path
 
- All of these go to `/v1/chat/completions` on `:10101`. Each should pick a different upstream model:
+ All of these go to `/v1/chat/completions` on `:10101`. The `auto` router classifies based on token count and context:
 
  ```bash
  # trivial chat -> code-fast
@@ -414,22 +415,14 @@ curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: applicatio
    -d '{"model":"auto","stream":false,
        "messages":[{"role":"user","content":"capital of France?"}]}' | jq .model
 
- # planning -> plan
- curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
-   -d '{"model":"auto","stream":false,
-       "messages":[{"role":"user","content":"how would you design a rate limiter for our API?"}]}' | jq .model
-
  # agent work -> code-smart
  curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
    -d '{"model":"auto","stream":false,
       "messages":[{"role":"user","content":"refactor this function for clarity:\n```python\ndef f(x): return x*2\n```"}]}' | jq .model
-
- # uncensored plan -> plan-uncensored
- curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
-   -d '{"model":"auto","stream":false,
-       "messages":[{"role":"user","content":"[nofilter] outline a red-team plan for our auth flow"}]}' | jq .model
  ```
 
+ To access `plan` or `plan-uncensored` tiers, use explicit agent selection in opencode (`/agent plan` or `/agent plan-nofilter`) rather than `model=auto`.
+
  ## Endpoints
 
  | Port | Service | Purpose |
@@ -506,8 +499,6 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
  | `ROUTER_FAST_MODEL` | `code-fast` | long-context (>= mid ceiling) → here |
  | `ROUTER_AGENT_MODEL` | `code-smart` | mid-context + tools/loop floor → here |
  | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
- | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
- | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
  | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
  | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
  | `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
@@ -518,14 +509,10 @@ To force a request to never auto-route, set `model` to a concrete alias (`code-f
 
  ## Triggering uncensored mode
 
- Two ways:
-
- 1. **Explicit agent in opencode:** `/agent plan-nofilter` (or mention it).
- 2. **Inline trigger in any auto-routed message** — anywhere in the most recent user turn:
-    - `[nofilter]`, `[uncensored]`, `[heretic]`
-    - or a line starting with `uncensored:` / `nofilter:` / `no-filter:`
+ The `plan-uncensored` tier is accessible via explicit agent selection only:
 
- Triggers are *only* checked on the latest user message and the system prompt, so an old `[nofilter]` further up the conversation won't pin the whole session.
+ 1. **In opencode:** `/agent plan-nofilter` (or mention `@plan-nofilter`).
+ 2. **Via opencode config:** set `agent.plan-nofilter` as your active agent.
 
  ## Troubleshooting
 
@@ -535,7 +522,7 @@ Triggers are *only* checked on the latest user message and the system prompt, so
 
  **OOM / unexplained slowdown** → run `top -o mem -stats pid,rsize,command` to see what's resident. The matrix should prevent two heavy models loading together; if it somehow happens, `llmstack restart`.
 
- **Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`AGENT_SIGNALS` / `PLAN_SIGNALS` / `UNCENSORED_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
+ **Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`ULTRA_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
 
  **Want a pure pass-through (no auto routing)** → change opencode's `baseURL` to `http://127.0.0.1:10102/v1` (llama-swap directly) and only use concrete model names. (Note: this skips the bedrock dispatcher; only GGUF tiers will be reachable.)
 
@@ -266,7 +266,7 @@ How to evaluate:
  - Run `llama-bench -m <new>.gguf -p 512 -n 128 -ngl 999` for raw speed
  - Sniff test with a typical autocomplete prompt; latency should feel like
    the cursor is barely ahead of you
- - Aider leaderboard "edit format" column — proxy for FIM quality
+ - [Aider leaderboard](https://aider.chat/docs/leaderboards/) "edit format" column — proxy for FIM quality
 
  Size budget: **~2–6 GB** weights (we want this resident permanently while
  sharing memory with the heavy tier).
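The `llama-bench` invocation in this hunk is easy to loop over candidates. A hypothetical helper (it assumes `llama-bench` from llama.cpp is on `PATH`; the GGUF glob is only illustrative):

```python
# Hypothetical helper: run the documented llama-bench invocation over a set
# of candidate GGUFs and print each result block for side-by-side reading.
import subprocess
from pathlib import Path

def bench(ggufs: list[Path]) -> None:
    for gguf in ggufs:
        print(f"=== {gguf.name} ===")
        subprocess.run(
            ["llama-bench", "-m", str(gguf),
             "-p", "512", "-n", "128", "-ngl", "999"],
            check=True,
        )

# Illustrative: bench everything in the HF cache mentioned in the README.
bench(sorted(Path.home().glob(".cache/huggingface/hub/**/*.gguf")))
```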
@@ -287,10 +287,10 @@ What matters:
  - **Speed at full context** (MoE models win here on Apple Silicon)
 
  How to evaluate:
- - Aider's [LLM Leaderboard](https://aider.chat/docs/leaderboards/) — most
+ - [Aider's LLM Leaderboard](https://aider.chat/docs/leaderboards/) — most
    honest signal for agentic coding
- - LiveCodeBench scores
- - SWE-Bench Verified (the "real PRs" benchmark)
+ - [LiveCodeBench](https://livecodebench.github.io/leaderboard.html) scores
+ - [SWE-Bench Verified](https://www.swebench.com/) (the "real PRs" benchmark)
  - Run an actual opencode session in `build` mode against your repo
 
  Size budget: **~30–55 GB** weights (must fit alongside `code-fast` ≈ 5 GB
@@ -311,8 +311,8 @@ What matters:
  - **Refusals on edge cases** — fine to refuse weird stuff in plain plan mode
 
  How to evaluate:
- - Open LLM Leaderboard (filter to chat/instruct, your size class)
- - Chatbot Arena — vibes-based but useful proxy
+ - [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) (filter to chat/instruct, your size class)
+ - [Chatbot Arena](https://lmarena.ai/) — vibes-based but useful proxy
  - Hand-roll a "design this rate limiter" prompt and compare outputs
 
  Size budget: **~7–25 GB** weights — this tier shouldn't dominate memory.
@@ -360,12 +360,12 @@ Same size budget as `plan`.
 
  | Tier | Leaderboard |
  |---|---|
- | `code-fast` / `code-smart` | https://aider.chat/docs/leaderboards/ |
- | | https://livecodebench.github.io/leaderboard.html |
- | | https://www.swebench.com/ (Verified split) |
- | `plan` / `plan-uncensored` | https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard |
- | | https://lmarena.ai/ |
- | | https://livebench.ai/ |
+ | `code-fast` / `code-smart` | [Aider LLM Leaderboard](https://aider.chat/docs/leaderboards/) |
+ | | [LiveCodeBench](https://livecodebench.github.io/leaderboard.html) |
+ | | [SWE-Bench Verified](https://www.swebench.com/) |
+ | `plan` / `plan-uncensored` | [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) |
+ | | [Chatbot Arena](https://lmarena.ai/) |
+ | | [LiveBench](https://livebench.ai/) |
 
  **Community signal** (qualitative but valuable):
 
@@ -16,5 +16,5 @@ organised by concern:
 
  from __future__ import annotations
 
- __version__ = "0.9.4"
+ __version__ = "0.9.6"
  __all__ = ["__version__"]
@@ -36,7 +36,7 @@ Behaviour:
  ``POST /v1/completions``
  - if request body ``model == "auto"`` (or unset), classify the request
    and rewrite ``model`` -> one of: ``code-fast``, ``code-smart``,
-   ``code-ultra`` (when wired), ``plan``, ``plan-uncensored``.
+   ``code-ultra`` (when wired).
  - otherwise pass through unchanged.
  - tiers with ``backend = bedrock`` in ``models.ini`` are dispatched
    to AWS Bedrock via :mod:`llmstack.backends.bedrock` instead of
@@ -63,41 +63,28 @@ step DOWN as context grows**. This inverts the classic
  from priors.
 
  So as the conversation accumulates context, we step *down*: ultra
- -> smart -> fast. Triggers and the plan track sit alongside this
- ladder.
+ -> smart -> fast.
 
  Routing decision tree (first match wins):
 
- 1. Explicit "uncensored" trigger in the last user message
-    (``[nofilter]``, ``[uncensored]``, ``[heretic]``, or a line
-    starting with ``uncensored:`` / ``nofilter:``)        -> plan-uncensored
- 2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
+ 1. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
     ``ultra:``, ``opus:``) AND ultra tier configured      -> code-ultra
- 3. PLAN signal words AND no code-block / agent verbs / tools
-    AND estimated tokens <= ``[plan]`` tier's ctx_size
-    (pure design discussion that fits the planner's
-    window)                                               -> plan
-                                                             (if the planner's
-                                                             ctx_size is breached
-                                                             we fall through to
-                                                             the coding ladder
-                                                             rather than send a
-                                                             request that won't
-                                                             fit -- the coding
-                                                             tiers cover larger
-                                                             windows by design)
- 4. Estimated input tokens <= HIGH_FIDELITY_CEILING
+ 2. Estimated input tokens <= HIGH_FIDELITY_CEILING
     ("reasonable context still being built")              -> code-ultra
                                                              (else code-smart)
- 5. Estimated input tokens <= MID_FIDELITY_CEILING        -> code-smart
- 6. Otherwise (long context, top-tier becomes
-    expensive/slow, fast tier's 128k window is the
-    best fit and it's free)                               -> code-fast
+ 3. Estimated input tokens <= MID_FIDELITY_CEILING        -> code-smart
+ 4. Otherwise (long context, top-tier becomes
+    expensive/slow, fast tier's 128k window is the
+    best fit and it's free)                               -> code-fast
                                                              (floored at
                                                              code-smart when
                                                              n_turns >=
                                                              MULTI_TURN_THRESHOLD)
 
+ Plan and uncensored tiers are accessible via their dedicated agent
+ modes (``agent.plan``, ``agent.plan-nofilter``) and slash commands;
+ they are not auto-routed through ``model = auto``.
+
  The auto router's effective max context window is
  ``[code-fast].ctx_size`` -- fast is the bottom of the step-down
  ladder, so any context that would overflow the tiers above lands on
@@ -140,8 +127,6 @@ UPSTREAM = os.getenv("LLAMA_SWAP_URL", "http://127.0.0.1:10102").rstrip("/")
  FAST_MODEL = os.getenv("ROUTER_FAST_MODEL", "code-fast")
  AGENT_MODEL = os.getenv("ROUTER_AGENT_MODEL", "code-smart")
  ULTRA_MODEL = os.getenv("ROUTER_ULTRA_MODEL", "code-ultra")
- PLAN_MODEL = os.getenv("ROUTER_PLAN_MODEL", "plan")
- UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
 
  # Step-DOWN ladder (see module docstring). Both ceilings are *upper
  # bounds* of a tier's sweet-spot range, expressed in estimated input
@@ -167,45 +152,14 @@ UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
  # still has comfortable headroom.
  HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
  MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
- # Floor the long-context rung at code-smart whenever a tool-call
- # protocol is in play -- 3B models tool-call unreliably regardless of
- # how big their context window is.
  MULTI_TURN_THRESHOLD = int(os.getenv("ROUTER_MULTI_TURN", "10"))
  AUTO_ALIASES = {"auto", "", None}
 
- UNCENSORED_TRIGGERS = re.compile(
-     r"(\[(uncensored|nofilter|no-?filter|heretic)\]"
-     r"|^[ \t]*(uncensored|nofilter|no-?filter)\s*:)",
-     re.IGNORECASE | re.MULTILINE,
- )
-
  ULTRA_TRIGGERS = re.compile(
      r"(\[(ultra|opus)\]|^[ \t]*(ultra|opus)\s*:)",
      re.IGNORECASE | re.MULTILINE,
  )
 
- PLAN_SIGNALS = re.compile(
-     r"\b(plan|design|architect(ure)?|approach|trade-?off|"
-     r"should\s+we|how\s+would\s+(you|we)|what\s+would\s+you|"
-     r"explain\s+why|reason\s+about|think\s+(through|step|hard|carefully)|"
-     r"compare\s+(options|approaches)|review\s+(the|this|my)\s+"
-     r"(architecture|design|approach|plan)|brainstorm|outline|"
-     r"summari[sz]e|root\s*cause|migrate|port\s+to)\b",
-     re.IGNORECASE,
- )
-
- AGENT_SIGNALS = re.compile(
-     r"\b(implement|fix\s+(this|the|a|my)?\s*(bug|issue|error|test)|"
-     r"write\s+(a|the|some)?\s*(function|class|test|script|module|method)|"
-     r"add\s+(a|the)?\s*(function|class|method|test|file|endpoint)|"
-     r"create\s+(a|the)?\s*(function|class|file|component|endpoint)|"
-     r"refactor|edit|patch|generate\s+code|debug|trace|"
-     r"run\s+tests?|build\s+(it|this)|compile)\b",
-     re.IGNORECASE,
- )
-
- CODE_BLOCK = re.compile(r"```|`[^`\n]{30,}`")
-
  logging.basicConfig(
      level=os.getenv("LOG_LEVEL", "INFO"),
      format="%(asctime)s %(levelname)s router %(message)s",
@@ -221,12 +175,11 @@ async def _lifespan(app: FastAPI):
      bedrock_tiers = sorted(t.name for t in TIERS.values() if t.is_bedrock)
      log.info(
          "router up upstream=%s ladder=[ultra<=%d -> agent<=%d -> fast] "
-         "fast=%s agent=%s ultra=%s plan=%s uncensored=%s bedrock=%s",
+         "fast=%s agent=%s ultra=%s bedrock=%s",
          UPSTREAM, HIGH_FIDELITY_CEILING, MID_FIDELITY_CEILING,
          FAST_MODEL, AGENT_MODEL,
          f"{ULTRA_MODEL} (active)" if _ultra_available()
          else f"{ULTRA_MODEL} (unwired -- high-fidelity rung falls back to {AGENT_MODEL})",
-         PLAN_MODEL, UNCENSORED_MODEL,
          ",".join(bedrock_tiers) or "(none)",
      )
      yield
@@ -302,12 +255,6 @@ def _estimate_tokens(messages: list[dict[str, Any]] | None, prompt: str | None)
      return chars // 4
 
 
- def _matches(pattern: re.Pattern[str], messages: list[dict[str, Any]] | None, prompt: str | None) -> bool:
-     if prompt and pattern.search(prompt):
-         return True
-     return any(pattern.search(t) for t in _iter_message_text(messages))
-
-
  def _ultra_available() -> bool:
      """True iff the ultra tier is loaded from ``models.ini``.
 
@@ -331,6 +278,11 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
 
      Step-DOWN ladder: top fidelity for short context, fall to mid for
      medium, drop to fast for long. See module docstring for rationale.
+
+     Only the fast / agent / ultra rungs are implemented here. Plan and
+     uncensored tiers are accessible via their dedicated agent modes
+     (``agent.plan``, ``agent.plan-nofilter``) and slash commands; they
+     are not auto-routed from the build agent.
      """
      messages = body.get("messages") if isinstance(body.get("messages"), list) else None
      prompt = body.get("prompt") if isinstance(body.get("prompt"), str) else None
@@ -341,51 +293,17 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
          for m in (messages or [])
          if m.get("role") == "system" and isinstance(m.get("content"), str)
      ]
-     if any(UNCENSORED_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
-         return UNCENSORED_MODEL, "uncensored-trigger"
 
      if any(ULTRA_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
          if _ultra_available():
              return ULTRA_MODEL, "ultra-trigger"
-         # Explicit user opt-in but the tier isn't wired up. Don't 404 --
-         # serve the request from the heaviest tier we *do* have and let
-         # the user notice in logs that their trigger was a no-op.
          log.warning("ultra-trigger ignored: %s not in models.ini; falling back to %s",
                      ULTRA_MODEL, AGENT_MODEL)
          return AGENT_MODEL, f"ultra-trigger->agent ({ULTRA_MODEL} unavailable)"
 
      n_turns = sum(1 for m in (messages or []) if m.get("role") == "user")
-     _last_msgs = [{"role": "user", "content": last_user}] if last_user else None
-     has_code_signal = (
-         _matches(CODE_BLOCK, _last_msgs, prompt)
-         or _matches(AGENT_SIGNALS, _last_msgs, prompt)
-     )
-
      est = _estimate_tokens(messages, prompt)
 
-     # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
-     # chat-tuned model meant for design / "should we" discussions. Only
-     # take it when nothing about the request says "I'm about to write
-     # code" (no triple-backticks, no agent verbs). Tools are stripped
-     # from the request body before dispatch (see ``_handle_completion``),
-     # so their presence here does not block plan routing.
-     # Only route to plan if the input fits in the planner's ctx_size --
-     # past that we fall through to the coding ladder which has tiers
-     # (smart, fast) explicitly sized for larger contexts.
-     if (
-         not has_code_signal
-         and _matches(PLAN_SIGNALS, messages, prompt)
-     ):
-         plan_tier = TIER_BY_ALIAS.get(PLAN_MODEL)
-         plan_ctx = plan_tier.ctx_size if plan_tier else 0
-         if not plan_ctx or est <= plan_ctx:
-             return PLAN_MODEL, "plan-signal"
-         log.info(
-             "plan-signal but tokens~%d > %s.ctx_size %d; "
-             "falling through to coding ladder",
-             est, PLAN_MODEL, plan_ctx,
-         )
-
      # Rung 1: short context -- start at the top.
      if est <= HIGH_FIDELITY_CEILING:
          if _ultra_available():
@@ -400,9 +318,7 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
          return AGENT_MODEL, f"mid-fidelity tokens~{est}<={MID_FIDELITY_CEILING}"
 
      # Rung 3: long context -- step down to fast. Floor at smart only
-     # when the multi-turn threshold is hit; tools alone no longer
-     # prevent the step-down (plan tiers strip tools before dispatch,
-     # and code-fast is a hosted model that tool-calls reliably).
+     # when the multi-turn threshold is hit.
      if n_turns >= MULTI_TURN_THRESHOLD:
          return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} (user-turns={n_turns}>={MULTI_TURN_THRESHOLD} floor)"
      return FAST_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING}"
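Taken together, a hedged illustration of what the slimmed-down `classify` now returns (it assumes no `[code-ultra]` section is loaded, so the high-fidelity rung falls back to `code-smart`; only the model element is asserted, because the reason strings embed token estimates):

```python
# Illustration only: exercising llmstack.app.classify after this change.
from llmstack.app import classify

model, reason = classify(
    {"model": "auto",
     "messages": [{"role": "user", "content": "capital of France?"}]}
)
assert model == "code-smart"      # short context, ultra unwired -> smart

model, reason = classify(
    {"model": "auto",
     "messages": [{"role": "user", "content": "x" * 200_000}]}  # ~50k tokens
)
assert model == "code-fast"       # long context, single user turn -> fast
```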
@@ -531,14 +447,14 @@ async def list_models() -> JSONResponse:
              f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING}, "
              f"'{FAST_MODEL}' beyond that."
          )
-         name = "Auto (step-down router: ultra/agent/fast + plan/uncensored)"
+         name = "Auto (step-down router: ultra/agent/fast)"
      else:
          top_blurb = (
              f"Step-down ladder (top->bottom as context grows): "
              f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING} tokens, "
              f"'{FAST_MODEL}' beyond that."
          )
-         name = "Auto (step-down router: agent/fast + plan/uncensored)"
+         name = "Auto (step-down router: agent/fast)"
      data["data"].insert(0, {
          "id": "auto",
          "object": "model",
@@ -547,8 +463,6 @@ async def list_models() -> JSONResponse:
          "name": name,
          "description": (
              f"{top_blurb} "
-             f"'{PLAN_MODEL}' for design/planning (orthogonal to ladder); "
-             f"'{UNCENSORED_MODEL}' for explicit [nofilter] triggers; "
              f"'[ultra]'/'[opus]' triggers force '{ULTRA_MODEL}' regardless of size."
          ),
          "tier": "auto",
@@ -608,6 +522,41 @@ def _inject_sampler(body: dict[str, Any], tier: Tier) -> bool:
      return mutated
 
 
+ def _inject_name_json(raw: bytes, tier_name: str) -> bytes:
+     try:
+         data = json.loads(raw)
+     except (json.JSONDecodeError, ValueError):
+         return raw
+     try:
+         msg = data["choices"][0]["message"]
+         if msg.get("content"):
+             msg["name"] = tier_name
+     except (KeyError, IndexError, TypeError):
+         pass
+     return json.dumps(data).encode()
+
+
+ def _inject_name_sse(chunk: bytes, tier_name: str, injected: list[bool]) -> bytes:
+     if injected[0]:
+         return chunk
+     line = chunk.decode(errors="replace")
+     if not line.startswith("data: "):
+         return chunk
+     payload_str = line[len("data: "):].strip()
+     if payload_str in ("[DONE]", ""):
+         return chunk
+     try:
+         payload = json.loads(payload_str)
+         delta = payload["choices"][0]["delta"]
+         if "role" in delta:
+             delta["name"] = tier_name
+             injected[0] = True
+             return f"data: {json.dumps(payload, separators=(',', ':'))}\n\n".encode()
+     except (KeyError, IndexError, TypeError, json.JSONDecodeError):
+         pass
+     return chunk
+
+
  async def _handle_completion(req: Request, path: str) -> Response:
      raw = await req.body()
      headers = _filter_request_headers(req)
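The one-shot contract of `_inject_name_sse` can be checked with a single synthetic chunk (a sketch, not a shipped test; it assumes the 0.9.7 package is importable):

```python
# Sketch: the first role-announcing SSE chunk gets the tier name injected
# into its delta; the shared flag then suppresses any later rewrites.
import json
from llmstack.app import _inject_name_sse  # assumes 0.9.7 installed

chunk = b'data: {"choices":[{"delta":{"role":"assistant"},"index":0}]}\n\n'
injected = [False]          # same one-shot flag shape as in _handle_completion
out = _inject_name_sse(chunk, "code-smart", injected)

payload = json.loads(out.decode().removeprefix("data: "))
assert payload["choices"][0]["delta"]["name"] == "code-smart"
# Second call is a no-op because the flag is already set.
assert injected[0] and _inject_name_sse(chunk, "code-smart", injected) == chunk
```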
@@ -631,11 +580,6 @@
          mutated = True
 
      chosen_name = body.get("model")
-     if chosen_name in {PLAN_MODEL, UNCENSORED_MODEL} and body.get("tools"):
-         log.info("plan tier %s: stripping tools from request", chosen_name)
-         body.pop("tools")
-         body.pop("tool_choice", None)
-         mutated = True
      tier = _resolve_tier(chosen_name)
      if tier is not None and _inject_sampler(body, tier):
          mutated = True
@@ -646,6 +590,28 @@
      if tier is not None and tier.is_bedrock:
          from llmstack.backends import bedrock as bedrock_backend
          resp = await bedrock_backend.dispatch(req, tier, body)
+     elif tier is not None and body.get("stream"):
+         proxy = await _stream_proxy(req.method, path, raw, headers)
+         injected: list[bool] = [False]
+         tier_name = tier.name
+         original_gen = proxy.body_iterator
+
+         async def _named_gen():
+             async for chunk in original_gen:
+                 yield _inject_name_sse(chunk, tier_name, injected)
+
+         proxy.body_iterator = _named_gen()
+         resp = proxy
+     elif tier is not None:
+         proxy = await _stream_proxy(req.method, path, raw, headers)
+         raw_resp = b"".join([chunk async for chunk in proxy.body_iterator])
+         patched = _inject_name_json(raw_resp, tier.name)
+         resp = Response(
+             content=patched,
+             status_code=proxy.status_code,
+             headers=dict(proxy.headers),
+             media_type=proxy.media_type,
+         )
      else:
          resp = await _stream_proxy(req.method, path, raw, headers)
 
@@ -588,6 +588,8 @@ async def _complete_response(client: Any, tier: Tier, converse_kwargs: dict[str,
          return JSONResponse(status_code=502, content={"error": _error_payload(exc)})
 
      message, finish = _openai_message_from_converse(resp)
+     if message.get("content"):
+         message["name"] = tier.name
      usage_in = (resp.get("usage") or {})
      payload = {
          "id": _completion_id(),
@@ -665,7 +667,7 @@ async def _stream_response(client: Any, tier: Tier, converse_kwargs: dict[str, A
 
      # First chunk: announce the assistant role so OpenAI clients can
      # initialise their accumulator.
-     yield _sse(_frame({"role": "assistant"}))
+     yield _sse(_frame({"role": "assistant", "name": model_label}))
 
      # Per-content-block state: index -> "text" | "tool_use"
      block_kinds: dict[int, str] = {}
@@ -69,7 +69,7 @@ COMMANDS = {
          "agent": "plan",
      },
      "nofilter": {
-         "template": "[nofilter]",
+         "template": "",
          "description": "Route to the uncensored planning model.",
          "agent": "plan-nofilter",
      },
@@ -194,7 +194,7 @@ def build_config(
 
      models: dict[str, dict] = {
          "auto": {
-             "name": "Auto (router selects: fast / agent / plan / uncensored)",
+             "name": "Auto (router selects: fast / agent / ultra)",
              "limit": {"context": auto_ctx, "output": 16384},
              "tool_call": True,
              "cost": ZERO_COST,
@@ -178,7 +178,7 @@ description = Qwopus GLM 18B - planning, design discussions, architecture
  ; aws_region = eu-central-1
  ; aws_profile = bedrock-prod
  ; ctx_size = 200000
- ; sampler = temp=0.7, top_p=0.9   ; creative; Opus 4.6 accepts both
+ ; sampler = temp=0.7              ; creative; Opus 4.6
  ; description = Claude Opus 4.6 on Bedrock - planning, design discussions, architecture
 
  [plan-uncensored]
@@ -258,21 +258,18 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  ;
  ; First-match-wins decision tree applied by llmstack/app.py when model="auto":
  ;
- ;   1. "[nofilter]" / "uncensored:" trigger                       -> plan-uncensored
- ;   2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
+ ;   1. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
  ;      tier configured                                            -> code-ultra
- ;   3. PLAN signal words AND no code-block / agent verbs / tools
- ;      AND tokens <= [plan].ctx_size (pure design discussion that
- ;      still fits the planner's window)                           -> plan
- ;      ...if the plan tier's ctx_size is breached, the request
- ;      falls through to the coding ladder below rather than being
- ;      sent to a planner whose window can't hold the input.
- ;   4. tokens <= high_fidelity_ceiling AND code-ultra configured  -> code-ultra
+ ;   2. tokens <= high_fidelity_ceiling AND code-ultra configured  -> code-ultra
  ;      tokens <= high_fidelity_ceiling AND no code-ultra          -> code-smart
- ;   5. tokens <= mid_fidelity_ceiling                             -> code-smart
- ;   6. otherwise (long context):
- ;      - if tools[] OR turns >= multi_turn (3B tool-calls badly)  -> code-smart
- ;      - else                                                     -> code-fast
+ ;   3. tokens <= mid_fidelity_ceiling                             -> code-smart
+ ;   4. otherwise (long context):
+ ;      - if turns >= multi_turn (floor at smart)                  -> code-smart
+ ;      - else                                                     -> code-fast
+ ;
+ ; Plan and uncensored tiers are accessible via their dedicated agent
+ ; modes (agent.plan, agent.plan-nofilter) and slash commands; they are
+ ; NOT auto-routed through model=auto.
  ;
  ; AUTO ROUTER MAX CONTEXT = [code-fast].ctx_size. The fast tier sits at
  ; the bottom of the step-down ladder, so any context too big for the
@@ -303,9 +300,6 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  high_fidelity_ceiling = 12000   ; tokens; below this, top-tier model is still cheap+fast (and ultra ctx_size = 2 * this)
  mid_fidelity_ceiling = 32000    ; tokens; smart's sweet spot up to here, then step down to fast (smart ctx_size = 2 * this)
  multi_turn = 10                 ; turn count that floors the long-context rung at code-smart
- agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
- plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
- uncensored_triggers = [nofilter], [uncensored], [heretic], "uncensored:", "nofilter:" (line start)
  ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)
 
  ;------------------------------------------------------------------------------
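The knob lines above keep inline `;` comments, which stock `configparser` strips only when asked. A hedged sketch of reading them (the section header is not visible in this hunk, so `router` below is an assumption):

```python
# Sketch: reading the router knobs from models.ini; the "router" section
# name is an assumption, since the hunk does not show the header.
import configparser

cp = configparser.ConfigParser(inline_comment_prefixes=(";",))
cp.read("models.ini")
high = cp.getint("router", "high_fidelity_ceiling", fallback=12000)
mid = cp.getint("router", "mid_fidelity_ceiling", fallback=32000)
turns = cp.getint("router", "multi_turn", fallback=10)
```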
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "opencode-llmstack"
- version = "0.9.4"
+ version = "0.9.7"
  description = "Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring."
  readme = "README.md"
  requires-python = ">=3.11"