opencode-llmstack 0.9.6__py3-none-any.whl → 0.9.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmstack/app.py +0 -2
- llmstack/generators/opencode.py +1 -1
- {opencode_llmstack-0.9.6.data → opencode_llmstack-0.9.7.data}/data/UPGRADING.md +12 -12
- {opencode_llmstack-0.9.6.dist-info → opencode_llmstack-0.9.7.dist-info}/METADATA +26 -39
- {opencode_llmstack-0.9.6.dist-info → opencode_llmstack-0.9.7.dist-info}/RECORD +11 -11
- {opencode_llmstack-0.9.6.data → opencode_llmstack-0.9.7.data}/data/CHANGELOG.md +0 -0
- {opencode_llmstack-0.9.6.data → opencode_llmstack-0.9.7.data}/data/LICENSE +0 -0
- {opencode_llmstack-0.9.6.dist-info → opencode_llmstack-0.9.7.dist-info}/WHEEL +0 -0
- {opencode_llmstack-0.9.6.dist-info → opencode_llmstack-0.9.7.dist-info}/entry_points.txt +0 -0
- {opencode_llmstack-0.9.6.dist-info → opencode_llmstack-0.9.7.dist-info}/licenses/LICENSE +0 -0
- {opencode_llmstack-0.9.6.dist-info → opencode_llmstack-0.9.7.dist-info}/top_level.txt +0 -0
llmstack/app.py
CHANGED

@@ -127,8 +127,6 @@ UPSTREAM = os.getenv("LLAMA_SWAP_URL", "http://127.0.0.1:10102").rstrip("/")
 FAST_MODEL = os.getenv("ROUTER_FAST_MODEL", "code-fast")
 AGENT_MODEL = os.getenv("ROUTER_AGENT_MODEL", "code-smart")
 ULTRA_MODEL = os.getenv("ROUTER_ULTRA_MODEL", "code-ultra")
-PLAN_MODEL = os.getenv("ROUTER_PLAN_MODEL", "plan")
-UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
 
 # Step-DOWN ladder (see module docstring). Both ceilings are *upper
 # bounds* of a tier's sweet-spot range, expressed in estimated input
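With `PLAN_MODEL` and `UNCENSORED_MODEL` gone, the router no longer rewrites `model="auto"` to the plan tiers; a client that wants one must name it. A minimal stdlib sketch of such a request (port and payload shape follow the README's curl examples further down; the `chat` helper is illustrative, not part of the package):

```python
# Illustrative client: name a plan-tier model explicitly, since 0.9.7's
# router only auto-routes among the coder tiers.
import json
import urllib.request

ROUTER = "http://127.0.0.1:10101/v1/chat/completions"  # FastAPI router port

def chat(model: str, content: str) -> str:
    payload = {
        "model": model,  # a concrete alias like "plan" bypasses auto-routing
        "stream": False,
        "messages": [{"role": "user", "content": content}],
    }
    req = urllib.request.Request(
        ROUTER,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["model"]

print(chat("plan", "how would you design a rate limiter for our API?"))
```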
llmstack/generators/opencode.py
CHANGED

@@ -266,7 +266,7 @@ How to evaluate:
 - Run `llama-bench -m <new>.gguf -p 512 -n 128 -ngl 999` for raw speed
 - Sniff test with a typical autocomplete prompt; latency should feel like
   the cursor is barely ahead of you
-- Aider leaderboard "edit format" column — proxy for FIM quality
+- [Aider leaderboard](https://aider.chat/docs/leaderboards/) "edit format" column — proxy for FIM quality
 
 Size budget: **~2–6 GB** weights (we want this resident permanently while
 sharing memory with the heavy tier).
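The checklist above is easy to script. A throwaway harness, assuming llama.cpp's `llama-bench` is on `PATH` (the flags are exactly the ones quoted in the checklist):

```python
# Run the documented llama-bench command over candidate GGUF files.
import subprocess
import sys

for gguf in sys.argv[1:]:
    cmd = ["llama-bench", "-m", gguf, "-p", "512", "-n", "128", "-ngl", "999"]
    print("$", " ".join(cmd))
    subprocess.run(cmd, check=True)  # llama-bench prints its results table to stdout
```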
@@ -287,10 +287,10 @@ What matters:
 - **Speed at full context** (MoE models win here on Apple Silicon)
 
 How to evaluate:
-- Aider's
+- [Aider's LLM Leaderboard](https://aider.chat/docs/leaderboards/) — most
   honest signal for agentic coding
-- LiveCodeBench scores
-- SWE-Bench Verified (the "real PRs" benchmark)
+- [LiveCodeBench](https://livecodebench.github.io/leaderboard.html) scores
+- [SWE-Bench Verified](https://www.swebench.com/) (the "real PRs" benchmark)
 - Run an actual opencode session in `build` mode against your repo
 
 Size budget: **~30–55 GB** weights (must fit alongside `code-fast` ≈ 5 GB
@@ -311,8 +311,8 @@ What matters:
 - **Refusals on edge cases** — fine to refuse weird stuff in plain plan mode
 
 How to evaluate:
-- Open LLM Leaderboard (filter to chat/instruct, your size class)
-- Chatbot Arena — vibes-based but useful proxy
+- [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) (filter to chat/instruct, your size class)
+- [Chatbot Arena](https://lmarena.ai/) — vibes-based but useful proxy
 - Hand-roll a "design this rate limiter" prompt and compare outputs
 
 Size budget: **~7–25 GB** weights — this tier shouldn't dominate memory.
@@ -360,12 +360,12 @@ Same size budget as `plan`.
 
 | Tier | Leaderboard |
 |---|---|
-| `code-fast` / `code-smart` | https://aider.chat/docs/leaderboards/ |
-| | https://livecodebench.github.io/leaderboard.html |
-| | https://www.swebench.com/
-| `plan` / `plan-uncensored` | https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard |
-| | https://lmarena.ai/ |
-| | https://livebench.ai/ |
+| `code-fast` / `code-smart` | [Aider LLM Leaderboard](https://aider.chat/docs/leaderboards/) |
+| | [LiveCodeBench](https://livecodebench.github.io/leaderboard.html) |
+| | [SWE-Bench Verified](https://www.swebench.com/) |
+| `plan` / `plan-uncensored` | [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) |
+| | [Chatbot Arena](https://lmarena.ai/) |
+| | [LiveBench](https://livebench.ai/) |
 
 **Community signal** (qualitative but valuable):
 
{opencode_llmstack-0.9.6.dist-info → opencode_llmstack-0.9.7.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opencode-llmstack
-Version: 0.9.6
+Version: 0.9.7
 Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
 Author: llmstack
 License: MIT License
@@ -78,14 +78,14 @@ client (opencode / curl / Cursor / etc.)
   │
   ▼
 http://127.0.0.1:10101   <-- FastAPI router (llmstack.app)
-  │ • model="auto" → classify → rewrite to one of
+  │ • model="auto" → classify → rewrite to one of 3 coder tiers
   │ • everything else → pass-through
   ▼
 http://127.0.0.1:10102   <-- llama-swap (binary, manages model lifecycle)
   │ • loads/unloads llama-server processes per model
   │ • matrix solver allows {code-fast + one heavy model} co-resident
   ▼
-llama-server <code-fast | code-smart |
+llama-server <code-fast | code-smart | code-ultra>
   │
   ▼
 GGUF in ~/.cache/huggingface/hub/...
@@ -101,7 +101,7 @@ A 64 GB unified memory M4 Max can comfortably hold **one always-on tiny coder +
 
 - **Agent work** (multi-file edits, tool use, refactors) → coder models, which are trained on tool-call protocols and code edits.
 - **Planning** (design discussions, architecture, "what's the best approach") → chat-tuned models, which are better at high-level reasoning and don't try to start writing code in response to every message.
-- **Uncensored planning** is a separate plan-tier model, opted in
+- **Uncensored planning** is a separate plan-tier model, opted in by explicit agent selection (`/agent plan-nofilter` in opencode).
 
 Routing decisions cost ~zero — they're a few regex checks in the FastAPI router, not an LLM call.
 
@@ -135,20 +135,18 @@ matches how these models actually behave on this stack:
 than priors, so they tend to *improve* relative to top-tier as the
 conversation grows.
 
-First match wins:
+First match wins (auto-routing only; `plan` and `plan-uncensored` are not auto-routed):
 
 | # | Condition | → Model | Reason |
 |---|---|---|---|
-| 1 |
-| 2 |
-| 3 |
-| 4 |
-| 5 |
-| 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
-| 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
+| 1 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
+| 2 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier — context still being built, latency/$ are best here |
+| 3 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
+| 4 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
+| 5 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
 
 Token estimates are `chars / 4` over all message text + `prompt`. The
-`code-ultra` rungs (
+`code-ultra` rungs (1 and 2) are gated on availability: when no
 `[code-ultra]` section is loaded from `models.ini`, both silently fall
 back to `code-smart` so vanilla installs don't 404.
 
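The restructured ladder is small enough to sketch. A minimal illustration of the five rungs under the documented knobs (`ROUTER_*` ceilings, the `chars / 4` estimate, the availability gate); `route` and `ULTRA_TRIGGER` are illustrative names, not the identifiers in `llmstack/app.py`:

```python
# Illustrative sketch of the five-rung ladder above, not the code in llmstack/app.py.
import os
import re

HIGH_CEIL = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
MID_CEIL = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
MULTI_TURN = int(os.getenv("ROUTER_MULTI_TURN", "10"))
# Assumed trigger pattern for rung 1: [ultra] / [opus] / a line starting "ultra:".
ULTRA_TRIGGER = re.compile(r"\[(?:ultra|opus)\]|^ultra:", re.IGNORECASE | re.MULTILINE)

def route(messages: list[dict], prompt: str = "", ultra_available: bool = True) -> str:
    text = prompt + "".join(str(m.get("content", "")) for m in messages)
    est_tokens = len(text) // 4                              # documented chars/4 estimate
    user_turns = sum(m.get("role") == "user" for m in messages)
    top = "code-ultra" if ultra_available else "code-smart"  # rungs 1-2 availability gate

    last_user = next((m for m in reversed(messages) if m.get("role") == "user"), {})
    if ultra_available and ULTRA_TRIGGER.search(str(last_user.get("content", ""))):
        return "code-ultra"        # rung 1: explicit top-tier opt-in
    if est_tokens <= HIGH_CEIL:
        return top                 # rung 2: context still being built
    if est_tokens <= MID_CEIL:
        return "code-smart"        # rung 3: mid-context sweet spot
    if user_turns >= MULTI_TURN:
        return "code-smart"        # rung 4: deep agentic loop floor
    return "code-fast"             # rung 5: long context, always-resident tier
```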
@@ -198,7 +196,8 @@ your global setup unchanged.
 | **`agent.plan-nofilter`** (custom uncensored planner) | `llama.cpp/plan-uncensored` |
 
 Inside opencode you can switch agents with `/agent` or by `@plan-nofilter`-mentioning
-a custom one.
+a custom one. The `plan` and `plan-uncensored` tiers are **not auto-routed** from the build agent —
+they're only accessible via explicit agent selection (`/agent plan` or `/agent plan-nofilter`).
 
 Want a second terminal into the same stack? Install the activate hook
 once (`eval "$(llmstack activate zsh)"`) and any new shell that `cd`s
@@ -266,8 +265,9 @@ Per-project state (gitignored) is created lazily under `<work-dir>/.llmstack/`:
 ```
 
 The `llama-swap` binary lives outside any project at
-`$XDG_DATA_HOME/llmstack/bin/llama-swap` (override with
-`LLMSTACK_BIN_DIR`)
+`$XDG_DATA_HOME/llmstack/bin/llama-swap` on macOS/Linux (override with
+`LLMSTACK_BIN_DIR`), or `%LOCALAPPDATA%\llmstack\bin\llama-swap.exe` on Windows.
+One download is reused across all projects.
 
 ## Quick start
 
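The lookup order this hunk describes, as a sketch (the helper name is ours; the `~/.local/share` fallback assumes the usual XDG default):

```python
# Sketch of the documented llama-swap binary location rules.
import os
import sys
from pathlib import Path

def llama_swap_bin_dir() -> Path:
    if override := os.getenv("LLMSTACK_BIN_DIR"):   # documented override, all platforms
        return Path(override)
    if sys.platform == "win32":                     # %LOCALAPPDATA%\llmstack\bin
        return Path(os.environ["LOCALAPPDATA"]) / "llmstack" / "bin"
    data_home = os.getenv("XDG_DATA_HOME") or os.path.expanduser("~/.local/share")
    return Path(data_home) / "llmstack" / "bin"     # $XDG_DATA_HOME/llmstack/bin
```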
@@ -358,8 +358,9 @@ Notes:
   or a package like `winget install ggml.llama-cpp` and put it on
   `PATH` (or set `$env:LLAMA_SERVER_BIN`). The Mac-only
   `iogpu.wired_limit_mb` step does not apply.
-- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell
-
+- The `[llmstack:<channel>]` prompt prefix shows up in PowerShell; `cmd.exe`
+  does not support custom prompts in the same way, so activation is
+  PowerShell-only.
 - Stopping daemons uses `taskkill /T /F` under the hood, so the
   llama-server children get cleaned up as well.
 
@@ -465,7 +466,7 @@ llmstack restart --next   # cycle into the next channel
 
 ### Try each routing path
 
-All of these go to `/v1/chat/completions` on `:10101`.
+All of these go to `/v1/chat/completions` on `:10101`. The `auto` router classifies based on token count and context:
 
 ```bash
 # trivial chat -> code-fast
@@ -473,22 +474,14 @@ curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: applicatio
   -d '{"model":"auto","stream":false,
       "messages":[{"role":"user","content":"capital of France?"}]}' | jq .model
 
-# planning -> plan
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
-  -d '{"model":"auto","stream":false,
-      "messages":[{"role":"user","content":"how would you design a rate limiter for our API?"}]}' | jq .model
-
 # agent work -> code-smart
 curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
   -d '{"model":"auto","stream":false,
       "messages":[{"role":"user","content":"refactor this function for clarity:\n```python\ndef f(x): return x*2\n```"}]}' | jq .model
-
-# uncensored plan -> plan-uncensored
-curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
-  -d '{"model":"auto","stream":false,
-      "messages":[{"role":"user","content":"[nofilter] outline a red-team plan for our auth flow"}]}' | jq .model
 ```
 
+To access `plan` or `plan-uncensored` tiers, use explicit agent selection in opencode (`/agent plan` or `/agent plan-nofilter`) rather than `model=auto`.
+
 ## Endpoints
 
 | Port | Service | Purpose |
@@ -565,8 +558,6 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_FAST_MODEL` | `code-fast` | long-context (>= mid ceiling) → here |
 | `ROUTER_AGENT_MODEL` | `code-smart` | mid-context + tools/loop floor → here |
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
-| `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
-| `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
 | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
 | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
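Each ceiling in the table is half of the paired tier's `ctx_size` (the "(2x)" notes), presumably to leave the other half of the window for output and tool results. Spelled out as a check (the function name is ours):

```python
# The documented pairing: routing ceiling = tier ctx_size / 2 (the "2x" in the table).
def ceiling_for(ctx_size: int) -> int:
    return ctx_size // 2

assert ceiling_for(24000) == 12000   # code-ultra  -> ROUTER_HIGH_FIDELITY_CEILING
assert ceiling_for(64000) == 32000   # code-smart  -> ROUTER_MID_FIDELITY_CEILING
```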
@@ -577,14 +568,10 @@ To force a request to never auto-route, set `model` to a concrete alias (`code-f
 
 ## Triggering uncensored mode
 
-
-
-1. **Explicit agent in opencode:** `/agent plan-nofilter` (or mention it).
-2. **Inline trigger in any auto-routed message** — anywhere in the most recent user turn:
-   - `[nofilter]`, `[uncensored]`, `[heretic]`
-   - or a line starting with `uncensored:` / `nofilter:` / `no-filter:`
+The `plan-uncensored` tier is accessible via explicit agent selection only:
 
-
+1. **In opencode:** `/agent plan-nofilter` (or mention `@plan-nofilter`).
+2. **Via opencode config:** set `agent.plan-nofilter` as your active agent.
 
 ## Troubleshooting
 
@@ -594,7 +581,7 @@ Triggers are *only* checked on the latest user message and the system prompt, so
 
 **OOM / unexplained slowdown** → run `top -o mem -stats pid,rsize,command` to see what's resident. The matrix should prevent two heavy models loading together; if it somehow happens, `llmstack restart`.
 
-**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`
+**Auto picks the wrong model** → adjust the regex in `llmstack/app.py` (`ULTRA_TRIGGERS`) or move the ladder ceilings via `ROUTER_HIGH_FIDELITY_CEILING` / `ROUTER_MID_FIDELITY_CEILING`. To force a request to never auto-route, pass an explicit `model` (e.g. `code-smart`) instead of `auto`.
 
 **Want a pure pass-through (no auto routing)** → change opencode's `baseURL` to `http://127.0.0.1:10102/v1` (llama-swap directly) and only use concrete model names. (Note: this skips the bedrock dispatcher; only GGUF tiers will be reachable.)
 
{opencode_llmstack-0.9.6.dist-info → opencode_llmstack-0.9.7.dist-info}/RECORD
CHANGED

@@ -2,7 +2,7 @@ llmstack/AGENTS.md,sha256=4DVUkqJ1-EP-cDNRCpznzghOOX6dAMbVWdcwyfFCALw,528
 llmstack/__init__.py,sha256=Ie-86h7q7pEsE9zTlWfjBEYDIoR4mC8ZutdC5Nx-x8k,855
 llmstack/__main__.py,sha256=wXHd5-BmCCHUfNEmy2rbilBSyVhi4KD1dSIO_4NlxuE,199
 llmstack/_platform.py,sha256=eDY3T9krkaBigG5xXxqzIbH3MhdZqX3BWe7bozOsAso,13099
-llmstack/app.py,sha256=
+llmstack/app.py,sha256=NjF9WohKu9qXREbufWKzLO2CU_MNCGiXws2cE1m3cpg,25825
 llmstack/check_models.py,sha256=WvTS2Td4acp-Q0-yWXUgXAgAgFOmpxiaeSDuAoivirw,4559
 llmstack/cli.py,sha256=Om70PzHrmU81y2Mw1sB6eeUs1fRHP0PnsCEVNC0UNvI,11341
 llmstack/models.ini,sha256=7ObeGrScRm0pGjyjAencr5lg8gEsMpjNvvF4o4Fxhps,19860

@@ -29,13 +29,13 @@ llmstack/download/binary.py,sha256=xpv15wF4viv8uFC5UqfSIf36CIoPpmaNUaVtjF-vTWA,8
 llmstack/download/ggufs.py,sha256=2hCr-svUiPIV2I3ruwTbXo6lPn9m-VBOqa3DFbvdIcA,5435
 llmstack/generators/__init__.py,sha256=LfbcReuyYBCdVuT9J5RKo7-f8n585YBU3Hus6DsxqTs,1189
 llmstack/generators/llama_swap.py,sha256=KdYH9N6TJECotZvyxvAjaa3kRyzn4YOi2T6D2UdyVKw,14785
-llmstack/generators/opencode.py,sha256=
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
+llmstack/generators/opencode.py,sha256=aeaHCdiU1GwtQe8LGeH3JHaC2m3GWLzUlQbDlh0SKSw,11181
+opencode_llmstack-0.9.7.data/data/CHANGELOG.md,sha256=58feU0rA9bBYvecDoFaLcwwgezLPkD3MSt0vRUVjdF8,5837
+opencode_llmstack-0.9.7.data/data/LICENSE,sha256=6G-Otw6BHIM1WJSBlJ04P1rDVCqbDEzKpdOlSr5CqIY,1078
+opencode_llmstack-0.9.7.data/data/UPGRADING.md,sha256=v8An-KwNAiDF0Ez86tjFOesMdxZt6zDsNZXwd6Cw0Uw,25248
+opencode_llmstack-0.9.7.dist-info/licenses/LICENSE,sha256=6G-Otw6BHIM1WJSBlJ04P1rDVCqbDEzKpdOlSr5CqIY,1078
+opencode_llmstack-0.9.7.dist-info/METADATA,sha256=mFvTReZmyrrALK5oobW-xtW8JqN6K3ROXkqwAPeIaDg,35443
+opencode_llmstack-0.9.7.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+opencode_llmstack-0.9.7.dist-info/entry_points.txt,sha256=soomjpqvl4KzFScgpQbu96vgcLriOtkB9MbiSC0rvZ8,47
+opencode_llmstack-0.9.7.dist-info/top_level.txt,sha256=tMv9sDWp8RW_DNNY8cuM4Uy4sND-KwTLcsScl5gdcEQ,9
+opencode_llmstack-0.9.7.dist-info/RECORD,,