opencode-llmstack 0.9.6__tar.gz → 0.9.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {opencode_llmstack-0.9.6/opencode_llmstack.egg-info → opencode_llmstack-0.9.7}/PKG-INFO +26 -39
  2. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/README.md +25 -38
  3. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/UPGRADING.md +12 -12
  4. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/app.py +0 -2
  5. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/generators/opencode.py +1 -1
  6. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7/opencode_llmstack.egg-info}/PKG-INFO +26 -39
  7. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/pyproject.toml +1 -1
  8. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/CHANGELOG.md +0 -0
  9. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/LICENSE +0 -0
  10. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/AGENTS.md +0 -0
  11. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/__init__.py +0 -0
  12. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/__main__.py +0 -0
  13. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/_platform.py +0 -0
  14. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/backends/__init__.py +0 -0
  15. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/backends/bedrock.py +0 -0
  16. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/check_models.py +0 -0
  17. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/cli.py +0 -0
  18. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/__init__.py +0 -0
  19. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/_helpers.py +0 -0
  20. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/activate.py +0 -0
  21. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/check.py +0 -0
  22. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/download.py +0 -0
  23. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/install.py +0 -0
  24. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/install_llama_swap.py +0 -0
  25. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/reload.py +0 -0
  26. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/restart.py +0 -0
  27. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/setup.py +0 -0
  28. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/start.py +0 -0
  29. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/status.py +0 -0
  30. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/commands/stop.py +0 -0
  31. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/download/__init__.py +0 -0
  32. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/download/binary.py +0 -0
  33. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/download/ggufs.py +0 -0
  34. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/generators/__init__.py +0 -0
  35. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/generators/llama_swap.py +0 -0
  36. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/models.ini +0 -0
  37. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/paths.py +0 -0
  38. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/shell_env.py +0 -0
  39. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/tiers.py +0 -0
  40. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/SOURCES.txt +0 -0
  41. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/dependency_links.txt +0 -0
  42. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/entry_points.txt +0 -0
  43. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/requires.txt +0 -0
  44. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/opencode_llmstack.egg-info/top_level.txt +0 -0
  45. {opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/setup.cfg +0 -0

{opencode_llmstack-0.9.6/opencode_llmstack.egg-info → opencode_llmstack-0.9.7}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opencode-llmstack
-Version: 0.9.6
+Version: 0.9.7
 Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
 Author: llmstack
 License: MIT License
@@ -78,14 +78,14 @@ client (opencode / curl / Cursor / etc.)
 
 
 http://127.0.0.1:10101 <-- FastAPI router (llmstack.app)
-│ • model="auto" → classify → rewrite to one of 4 tiers
+│ • model="auto" → classify → rewrite to one of 3 coder tiers
 │ • everything else → pass-through
 
 http://127.0.0.1:10102 <-- llama-swap (binary, manages model lifecycle)
 │ • loads/unloads llama-server processes per model
 │ • matrix solver allows {code-fast + one heavy model} co-resident
 
-llama-server <code-fast | code-smart | plan | plan-uncensored>
+llama-server <code-fast | code-smart | code-ultra>
 
 
 GGUF in ~/.cache/huggingface/hub/...
@@ -101,7 +101,7 @@ A 64 GB unified memory M4 Max can comfortably hold **one always-on tiny coder +
 
 - **Agent work** (multi-file edits, tool use, refactors) → coder models, which are trained on tool-call protocols and code edits.
 - **Planning** (design discussions, architecture, "what's the best approach") → chat-tuned models, which are better at high-level reasoning and don't try to start writing code in response to every message.
-- **Uncensored planning** is a separate plan-tier model, opted in either by request (`agent.plan-nofilter` in opencode) or by an inline `[nofilter]` trigger in the prompt.
+- **Uncensored planning** is a separate plan-tier model, opted in by explicit agent selection (`/agent plan-nofilter` in opencode).
 
 Routing decisions cost ~zero — they're a few regex checks in the FastAPI router, not an LLM call.
 
@@ -135,20 +135,18 @@ matches how these models actually behave on this stack:
 than priors, so they tend to *improve* relative to top-tier as the
 conversation grows.
 
-First match wins:
+First match wins (auto-routing only; `plan` and `plan-uncensored` are not auto-routed):
 
 | # | Condition | → Model | Reason |
 |---|---|---|---|
-| 1 | last user msg contains `[nofilter]`, `[uncensored]`, `[heretic]`, or starts with `uncensored:` / `nofilter:` | `plan-uncensored` | explicit opt-in |
-| 2 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
-| 3 | plan verbs (*design, architect, approach, trade-off, should we, explain why, …*) AND no code blocks / agent verbs / tools | `plan` | pure design discussion (orthogonal track) |
-| 4 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier, context still being built, latency/$ are best here |
-| 5 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
-| 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
-| 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
+| 1 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
+| 2 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier, context still being built, latency/$ are best here |
+| 3 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
+| 4 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
+| 5 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
 
 Token estimates are `chars / 4` over all message text + `prompt`. The
-`code-ultra` rungs (2 and 4) are gated on availability: when no
+`code-ultra` rungs (1 and 2) are gated on availability: when no
 `[code-ultra]` section is loaded from `models.ini`, both silently fall
 back to `code-smart` so vanilla installs don't 404.
 
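As an aside for readers tracking this change: the new ladder is small enough to sketch in plain Python. The snippet below is an illustrative reimplementation of the table above, not the actual `llmstack/app.py` code; the function names and the exact shape of the `ULTRA_TRIGGERS` regex are assumptions.

```python
import re

# Assumed shape of the trigger regex; the real pattern lives in llmstack/app.py.
ULTRA_TRIGGERS = re.compile(r"\[(ultra|opus)\]|^ultra:", re.IGNORECASE | re.MULTILINE)

HIGH_CEILING = 12_000  # ROUTER_HIGH_FIDELITY_CEILING default
MID_CEILING = 32_000   # ROUTER_MID_FIDELITY_CEILING default
MULTI_TURN = 10        # ROUTER_MULTI_TURN default


def estimate_tokens(messages: list[dict], prompt: str = "") -> int:
    """chars / 4 over all message text + prompt, per the README."""
    chars = sum(len(m.get("content") or "") for m in messages) + len(prompt)
    return chars // 4


def choose_model(messages: list[dict], prompt: str = "",
                 ultra_available: bool = False) -> str:
    last_user = next((m.get("content") or "" for m in reversed(messages)
                      if m.get("role") == "user"), "")
    tokens = estimate_tokens(messages, prompt)
    user_turns = sum(1 for m in messages if m.get("role") == "user")

    if ULTRA_TRIGGERS.search(last_user) and ultra_available:
        return "code-ultra"                      # rung 1: explicit opt-in
    if tokens <= HIGH_CEILING:                   # rung 2: short context
        return "code-ultra" if ultra_available else "code-smart"
    if tokens <= MID_CEILING:                    # rung 3: mid context
        return "code-smart"
    if user_turns >= MULTI_TURN:                 # rung 4: deep agentic loop
        return "code-smart"
    return "code-fast"                           # rung 5: long context
```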
@@ -198,7 +196,8 @@ your global setup unchanged.
 | **`agent.plan-nofilter`** (custom uncensored planner) | `llama.cpp/plan-uncensored` |
 
 Inside opencode you can switch agents with `/agent` or by `@plan-nofilter`-mentioning
-a custom one. Slash-commands `/review`, `/nofilter` are also available.
+a custom one. The `plan` and `plan-uncensored` tiers are **not auto-routed** from the build agent —
+they're only accessible via explicit agent selection (`/agent plan` or `/agent plan-nofilter`).
 
 Want a second terminal into the same stack? Install the activate hook
 once (`eval "$(llmstack activate zsh)"`) and any new shell that `cd`s
@@ -266,8 +265,9 @@ Per-project state (gitignored) is created lazily under `<work-dir>/.llmstack/`:
 ```
 
 The `llama-swap` binary lives outside any project at
-`$XDG_DATA_HOME/llmstack/bin/llama-swap` (override with
-`LLMSTACK_BIN_DIR`). One download is reused across all projects.
+`$XDG_DATA_HOME/llmstack/bin/llama-swap` on macOS/Linux (override with
+`LLMSTACK_BIN_DIR`), or `%LOCALAPPDATA%\llmstack\bin\llama-swap.exe` on Windows.
+One download is reused across all projects.
 
 ## Quick start
 
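The platform split is easy to express in code. Below is a minimal sketch of the lookup rule, assuming the `LLMSTACK_BIN_DIR` override applies on every platform; the helper name is hypothetical, and the real resolution (presumably in `llmstack/paths.py`) may differ in detail.

```python
import os
import sys
from pathlib import Path


def llama_swap_binary() -> Path:
    """Resolve the shared llama-swap binary path described above."""
    override = os.getenv("LLMSTACK_BIN_DIR")
    if override:
        base = Path(override)
    elif sys.platform == "win32":
        base = Path(os.environ["LOCALAPPDATA"]) / "llmstack" / "bin"
    else:
        xdg = os.getenv("XDG_DATA_HOME", str(Path.home() / ".local" / "share"))
        base = Path(xdg) / "llmstack" / "bin"
    name = "llama-swap.exe" if sys.platform == "win32" else "llama-swap"
    return base / name
```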
@@ -358,8 +358,9 @@ Notes:
 or a package like `winget install ggml.llama-cpp` and put it on
 `PATH` (or set `$env:LLAMA_SERVER_BIN`). The Mac-only
 `iogpu.wired_limit_mb` step does not apply.
-- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell too;
-`cmd.exe` gets a simpler `[llmstack:<channel>]` prompt via `doskey`.
+- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell; `cmd.exe`
+does not support custom prompts in the same way, so activation is
+PowerShell-only.
 - Stopping daemons uses `taskkill /T /F` under the hood, so the
 llama-server children get cleaned up as well.
 
@@ -465,7 +466,7 @@ llmstack restart --next # cycle into the next channel
 
 ### Try each routing path
 
-All of these go to `/v1/chat/completions` on `:10101`. Each should pick a different upstream model:
+All of these go to `/v1/chat/completions` on `:10101`. The `auto` router classifies based on token count and context:
 
 ```bash
 # trivial chat -> code-fast
@@ -473,22 +474,14 @@ curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: applicatio
 -d '{"model":"auto","stream":false,
 "messages":[{"role":"user","content":"capital of France?"}]}' | jq .model
 
-# planning -> plan
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
--d '{"model":"auto","stream":false,
-"messages":[{"role":"user","content":"how would you design a rate limiter for our API?"}]}' | jq .model
-
 # agent work -> code-smart
 curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
 -d '{"model":"auto","stream":false,
 "messages":[{"role":"user","content":"refactor this function for clarity:\n```python\ndef f(x): return x*2\n```"}]}' | jq .model
-
-# uncensored plan -> plan-uncensored
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
--d '{"model":"auto","stream":false,
-"messages":[{"role":"user","content":"[nofilter] outline a red-team plan for our auth flow"}]}' | jq .model
 ```
 
+To access `plan` or `plan-uncensored` tiers, use explicit agent selection in opencode (`/agent plan` or `/agent plan-nofilter`) rather than `model=auto`.
+
 ## Endpoints
 
 | Port | Service | Purpose |
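The same probes can be scripted without curl/jq. Here is a stdlib-only Python version; which tier each probe lands on follows the ladder rungs above, so it depends on whether `code-ultra` is wired.

```python
import json
import urllib.request


def route_probe(content: str) -> str:
    """POST a model="auto" request and return the tier the router picked."""
    req = urllib.request.Request(
        "http://127.0.0.1:10101/v1/chat/completions",
        data=json.dumps({
            "model": "auto",
            "stream": False,
            "messages": [{"role": "user", "content": content}],
        }).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["model"]


print(route_probe("capital of France?"))            # short-context rung
print(route_probe("[ultra] refactor this module"))  # rung 1 if code-ultra is wired
```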
@@ -565,8 +558,6 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_FAST_MODEL` | `code-fast` | long-context (>= mid ceiling) → here |
 | `ROUTER_AGENT_MODEL` | `code-smart` | mid-context + tools/loop floor → here |
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
-| `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
-| `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
 | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
 | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
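For example, the ladder ceilings can be shifted per invocation by setting the knobs before starting the stack. A hypothetical launcher sketch (the values are arbitrary; only the variable names come from the table above):

```python
import os
import subprocess

env = os.environ | {
    "ROUTER_HIGH_FIDELITY_CEILING": "8000",   # reserve the top tier for shorter prompts
    "ROUTER_MID_FIDELITY_CEILING": "48000",   # let code-smart keep longer contexts
}
subprocess.run(["llmstack", "start"], env=env, check=True)
```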
@@ -577,14 +568,10 @@ To force a request to never auto-route, set `model` to a concrete alias (`code-f
 
 ## Triggering uncensored mode
 
-Two ways:
-
-1. **Explicit agent in opencode:** `/agent plan-nofilter` (or mention it).
-2. **Inline trigger in any auto-routed message** — anywhere in the most recent user turn:
-   - `[nofilter]`, `[uncensored]`, `[heretic]`
-   - or a line starting with `uncensored:` / `nofilter:` / `no-filter:`
+The `plan-uncensored` tier is accessible via explicit agent selection only:
 
-Triggers are *only* checked on the latest user message and the system prompt, so an old `[nofilter]` further up the conversation won't pin the whole session.
+1. **In opencode:** `/agent plan-nofilter` (or mention `@plan-nofilter`).
+2. **Via opencode config:** set `agent.plan-nofilter` as your active agent.
 
 ## Troubleshooting
 
@@ -594,7 +581,7 @@ Triggers are *only* checked on the latest user message and the system prompt, so
 
 **OOM / unexplained slowdown** → run `top -o mem -stats pid,rsize,command` to see what's resident. The matrix should prevent two heavy models loading together; if it somehow happens, `llmstack restart`.
 
-**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`AGENT_SIGNALS` / `PLAN_SIGNALS` / `UNCENSORED_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
+**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`ULTRA_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
 
 **Want a pure pass-through (no auto routing)** → change opencode's `baseURL` to `http://127.0.0.1:10102/v1` (llama-swap directly) and only use concrete model names. (Note: this skips the bedrock dispatcher; only GGUF tiers will be reachable.)
 
{opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/README.md
@@ -19,14 +19,14 @@ client (opencode / curl / Cursor / etc.)
 
 
 http://127.0.0.1:10101 <-- FastAPI router (llmstack.app)
-│ • model="auto" → classify → rewrite to one of 4 tiers
+│ • model="auto" → classify → rewrite to one of 3 coder tiers
 │ • everything else → pass-through
 
 http://127.0.0.1:10102 <-- llama-swap (binary, manages model lifecycle)
 │ • loads/unloads llama-server processes per model
 │ • matrix solver allows {code-fast + one heavy model} co-resident
 
-llama-server <code-fast | code-smart | plan | plan-uncensored>
+llama-server <code-fast | code-smart | code-ultra>
 
 
 GGUF in ~/.cache/huggingface/hub/...
@@ -42,7 +42,7 @@ A 64 GB unified memory M4 Max can comfortably hold **one always-on tiny coder +
 
 - **Agent work** (multi-file edits, tool use, refactors) → coder models, which are trained on tool-call protocols and code edits.
 - **Planning** (design discussions, architecture, "what's the best approach") → chat-tuned models, which are better at high-level reasoning and don't try to start writing code in response to every message.
-- **Uncensored planning** is a separate plan-tier model, opted in either by request (`agent.plan-nofilter` in opencode) or by an inline `[nofilter]` trigger in the prompt.
+- **Uncensored planning** is a separate plan-tier model, opted in by explicit agent selection (`/agent plan-nofilter` in opencode).
 
 Routing decisions cost ~zero — they're a few regex checks in the FastAPI router, not an LLM call.
 
@@ -76,20 +76,18 @@ matches how these models actually behave on this stack:
 than priors, so they tend to *improve* relative to top-tier as the
 conversation grows.
 
-First match wins:
+First match wins (auto-routing only; `plan` and `plan-uncensored` are not auto-routed):
 
 | # | Condition | → Model | Reason |
 |---|---|---|---|
-| 1 | last user msg contains `[nofilter]`, `[uncensored]`, `[heretic]`, or starts with `uncensored:` / `nofilter:` | `plan-uncensored` | explicit opt-in |
-| 2 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
-| 3 | plan verbs (*design, architect, approach, trade-off, should we, explain why, …*) AND no code blocks / agent verbs / tools | `plan` | pure design discussion (orthogonal track) |
-| 4 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier, context still being built, latency/$ are best here |
-| 5 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
-| 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
-| 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
+| 1 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
+| 2 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier, context still being built, latency/$ are best here |
+| 3 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
+| 4 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
+| 5 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
 
 Token estimates are `chars / 4` over all message text + `prompt`. The
-`code-ultra` rungs (2 and 4) are gated on availability: when no
+`code-ultra` rungs (1 and 2) are gated on availability: when no
 `[code-ultra]` section is loaded from `models.ini`, both silently fall
 back to `code-smart` so vanilla installs don't 404.
 
@@ -139,7 +137,8 @@ your global setup unchanged.
 | **`agent.plan-nofilter`** (custom uncensored planner) | `llama.cpp/plan-uncensored` |
 
 Inside opencode you can switch agents with `/agent` or by `@plan-nofilter`-mentioning
-a custom one. Slash-commands `/review`, `/nofilter` are also available.
+a custom one. The `plan` and `plan-uncensored` tiers are **not auto-routed** from the build agent —
+they're only accessible via explicit agent selection (`/agent plan` or `/agent plan-nofilter`).
 
 Want a second terminal into the same stack? Install the activate hook
 once (`eval "$(llmstack activate zsh)"`) and any new shell that `cd`s
@@ -207,8 +206,9 @@ Per-project state (gitignored) is created lazily under `<work-dir>/.llmstack/`:
 ```
 
 The `llama-swap` binary lives outside any project at
-`$XDG_DATA_HOME/llmstack/bin/llama-swap` (override with
-`LLMSTACK_BIN_DIR`). One download is reused across all projects.
+`$XDG_DATA_HOME/llmstack/bin/llama-swap` on macOS/Linux (override with
+`LLMSTACK_BIN_DIR`), or `%LOCALAPPDATA%\llmstack\bin\llama-swap.exe` on Windows.
+One download is reused across all projects.
 
 ## Quick start
 
@@ -299,8 +299,9 @@ Notes:
 or a package like `winget install ggml.llama-cpp` and put it on
 `PATH` (or set `$env:LLAMA_SERVER_BIN`). The Mac-only
 `iogpu.wired_limit_mb` step does not apply.
-- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell too;
-`cmd.exe` gets a simpler `[llmstack:<channel>]` prompt via `doskey`.
+- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell; `cmd.exe`
+does not support custom prompts in the same way, so activation is
+PowerShell-only.
 - Stopping daemons uses `taskkill /T /F` under the hood, so the
 llama-server children get cleaned up as well.
 
@@ -406,7 +407,7 @@ llmstack restart --next # cycle into the next channel
 
 ### Try each routing path
 
-All of these go to `/v1/chat/completions` on `:10101`. Each should pick a different upstream model:
+All of these go to `/v1/chat/completions` on `:10101`. The `auto` router classifies based on token count and context:
 
 ```bash
 # trivial chat -> code-fast
@@ -414,22 +415,14 @@ curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: applicatio
 -d '{"model":"auto","stream":false,
 "messages":[{"role":"user","content":"capital of France?"}]}' | jq .model
 
-# planning -> plan
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
--d '{"model":"auto","stream":false,
-"messages":[{"role":"user","content":"how would you design a rate limiter for our API?"}]}' | jq .model
-
 # agent work -> code-smart
 curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
 -d '{"model":"auto","stream":false,
 "messages":[{"role":"user","content":"refactor this function for clarity:\n```python\ndef f(x): return x*2\n```"}]}' | jq .model
-
-# uncensored plan -> plan-uncensored
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
--d '{"model":"auto","stream":false,
-"messages":[{"role":"user","content":"[nofilter] outline a red-team plan for our auth flow"}]}' | jq .model
 ```
 
+To access `plan` or `plan-uncensored` tiers, use explicit agent selection in opencode (`/agent plan` or `/agent plan-nofilter`) rather than `model=auto`.
+
 ## Endpoints
 
 | Port | Service | Purpose |
@@ -506,8 +499,6 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_FAST_MODEL` | `code-fast` | long-context (>= mid ceiling) → here |
 | `ROUTER_AGENT_MODEL` | `code-smart` | mid-context + tools/loop floor → here |
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
-| `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
-| `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
 | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
 | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
@@ -518,14 +509,10 @@ To force a request to never auto-route, set `model` to a concrete alias (`code-f
 
 ## Triggering uncensored mode
 
-Two ways:
-
-1. **Explicit agent in opencode:** `/agent plan-nofilter` (or mention it).
-2. **Inline trigger in any auto-routed message** — anywhere in the most recent user turn:
-   - `[nofilter]`, `[uncensored]`, `[heretic]`
-   - or a line starting with `uncensored:` / `nofilter:` / `no-filter:`
+The `plan-uncensored` tier is accessible via explicit agent selection only:
 
-Triggers are *only* checked on the latest user message and the system prompt, so an old `[nofilter]` further up the conversation won't pin the whole session.
+1. **In opencode:** `/agent plan-nofilter` (or mention `@plan-nofilter`).
+2. **Via opencode config:** set `agent.plan-nofilter` as your active agent.
 
 ## Troubleshooting
 
@@ -535,7 +522,7 @@ Triggers are *only* checked on the latest user message and the system prompt, so
 
 **OOM / unexplained slowdown** → run `top -o mem -stats pid,rsize,command` to see what's resident. The matrix should prevent two heavy models loading together; if it somehow happens, `llmstack restart`.
 
-**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`AGENT_SIGNALS` / `PLAN_SIGNALS` / `UNCENSORED_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
+**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`ULTRA_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
 
 **Want a pure pass-through (no auto routing)** → change opencode's `baseURL` to `http://127.0.0.1:10102/v1` (llama-swap directly) and only use concrete model names. (Note: this skips the bedrock dispatcher; only GGUF tiers will be reachable.)
 
{opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/UPGRADING.md
@@ -266,7 +266,7 @@ How to evaluate:
 - Run `llama-bench -m <new>.gguf -p 512 -n 128 -ngl 999` for raw speed
 - Sniff test with a typical autocomplete prompt; latency should feel like
 the cursor is barely ahead of you
-- Aider leaderboard "edit format" column — proxy for FIM quality
+- [Aider leaderboard](https://aider.chat/docs/leaderboards/) "edit format" column — proxy for FIM quality
 
 Size budget: **~2–6 GB** weights (we want this resident permanently while
 sharing memory with the heavy tier).
@@ -287,10 +287,10 @@ What matters:
 - **Speed at full context** (MoE models win here on Apple Silicon)
 
 How to evaluate:
-- Aider's [LLM Leaderboard](https://aider.chat/docs/leaderboards/) — most
+- [Aider's LLM Leaderboard](https://aider.chat/docs/leaderboards/) — most
 honest signal for agentic coding
-- LiveCodeBench scores
-- SWE-Bench Verified (the "real PRs" benchmark)
+- [LiveCodeBench](https://livecodebench.github.io/leaderboard.html) scores
+- [SWE-Bench Verified](https://www.swebench.com/) (the "real PRs" benchmark)
 - Run an actual opencode session in `build` mode against your repo
 
 Size budget: **~30–55 GB** weights (must fit alongside `code-fast` ≈ 5 GB
@@ -311,8 +311,8 @@ What matters:
 - **Refusals on edge cases** — fine to refuse weird stuff in plain plan mode
 
 How to evaluate:
-- Open LLM Leaderboard (filter to chat/instruct, your size class)
-- Chatbot Arena — vibes-based but useful proxy
+- [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) (filter to chat/instruct, your size class)
+- [Chatbot Arena](https://lmarena.ai/) — vibes-based but useful proxy
 - Hand-roll a "design this rate limiter" prompt and compare outputs
 
 Size budget: **~7–25 GB** weights — this tier shouldn't dominate memory.
@@ -360,12 +360,12 @@ Same size budget as `plan`.
 
 | Tier | Leaderboard |
 |---|---|
-| `code-fast` / `code-smart` | https://aider.chat/docs/leaderboards/ |
-| | https://livecodebench.github.io/leaderboard.html |
-| | https://www.swebench.com/ (Verified split) |
-| `plan` / `plan-uncensored` | https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard |
-| | https://lmarena.ai/ |
-| | https://livebench.ai/ |
+| `code-fast` / `code-smart` | [Aider LLM Leaderboard](https://aider.chat/docs/leaderboards/) |
+| | [LiveCodeBench](https://livecodebench.github.io/leaderboard.html) |
+| | [SWE-Bench Verified](https://www.swebench.com/) |
+| `plan` / `plan-uncensored` | [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) |
+| | [Chatbot Arena](https://lmarena.ai/) |
+| | [LiveBench](https://livebench.ai/) |
 
 **Community signal** (qualitative but valuable):
 
{opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/app.py
@@ -127,8 +127,6 @@ UPSTREAM = os.getenv("LLAMA_SWAP_URL", "http://127.0.0.1:10102").rstrip("/")
 FAST_MODEL = os.getenv("ROUTER_FAST_MODEL", "code-fast")
 AGENT_MODEL = os.getenv("ROUTER_AGENT_MODEL", "code-smart")
 ULTRA_MODEL = os.getenv("ROUTER_ULTRA_MODEL", "code-ultra")
-PLAN_MODEL = os.getenv("ROUTER_PLAN_MODEL", "plan")
-UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
 
 # Step-DOWN ladder (see module docstring). Both ceilings are *upper
 # bounds* of a tier's sweet-spot range, expressed in estimated input
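The availability gate the README describes (the `code-ultra` rungs silently fall back to `code-smart` when `models.ini` defines no `[code-ultra]` section) could look something like the sketch below. This is an illustration of the documented behavior, not the actual app.py internals.

```python
import configparser
from pathlib import Path


def ultra_available(models_ini: Path) -> bool:
    """True when models.ini defines a [code-ultra] section."""
    cfg = configparser.ConfigParser()
    cfg.read(models_ini)  # silently yields no sections if the file is missing
    return cfg.has_section("code-ultra")


# Gated resolution: vanilla installs without [code-ultra] route those
# rungs to code-smart instead of 404ing on a missing model.
def ultra_or_fallback(models_ini: Path) -> str:
    return "code-ultra" if ultra_available(models_ini) else "code-smart"
```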
{opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/llmstack/generators/opencode.py
@@ -69,7 +69,7 @@ COMMANDS = {
         "agent": "plan",
     },
     "nofilter": {
-        "template": "[nofilter]",
+        "template": "",
         "description": "Route to the uncensored planning model.",
         "agent": "plan-nofilter",
     },
{opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7/opencode_llmstack.egg-info}/PKG-INFO

Same hunks as the PKG-INFO diff at the top of this page (+26 −39); the egg-info copy mirrors the packaged metadata, so the two diffs are identical.
{opencode_llmstack-0.9.6 → opencode_llmstack-0.9.7}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "opencode-llmstack"
-version = "0.9.6"
+version = "0.9.7"
 description = "Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring."
 readme = "README.md"
 requires-python = ">=3.11"