opencode-llmstack 0.9.1__tar.gz → 0.9.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opencode_llmstack-0.9.2/CHANGELOG.md +117 -0
- opencode_llmstack-0.9.2/LICENSE +21 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/PKG-INFO +33 -8
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/README.md +7 -6
- opencode_llmstack-0.9.2/UPGRADING.md +602 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/__init__.py +1 -1
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/app.py +16 -4
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/backends/bedrock.py +4 -2
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/start.py +0 -1
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/models.ini +2 -2
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/opencode_llmstack.egg-info/PKG-INFO +33 -8
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/opencode_llmstack.egg-info/SOURCES.txt +3 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/pyproject.toml +10 -2
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/AGENTS.md +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/__main__.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/_platform.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/backends/__init__.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/check_models.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/cli.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/__init__.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/_helpers.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/activate.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/check.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/download.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/install.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/install_llama_swap.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/reload.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/restart.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/setup.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/status.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/commands/stop.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/download/__init__.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/download/binary.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/download/ggufs.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/generators/__init__.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/generators/llama_swap.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/generators/opencode.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/paths.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/shell_env.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/llmstack/tiers.py +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/opencode_llmstack.egg-info/dependency_links.txt +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/opencode_llmstack.egg-info/entry_points.txt +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/opencode_llmstack.egg-info/requires.txt +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/opencode_llmstack.egg-info/top_level.txt +0 -0
- {opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/setup.cfg +0 -0
@@ -0,0 +1,117 @@
+# Changelog
+
+All notable changes to `opencode-llmstack` are documented here.
+
+---
+
+## [0.9.2] — 2026-05-11
+
+### Fixed
+- `classify()` now counts only **user-role messages** when evaluating the
+  multi-turn floor (`n_turns`). Previously `len(messages)` counted system
+  prompts, assistant turns, and tool-result messages, causing the floor to
+  fire after just a few real exchanges and permanently blocking `code-fast`
+  routing for the rest of any session.
+- Multi-turn floor threshold raised from **6 → 10** user turns. `code-fast`
+  is now a hosted Bedrock model (Haiku 4.5) that tool-calls reliably, so
+  the old 3B-model rationale no longer applies. Sessions with fewer than 10
+  user turns will now correctly step down to `code-fast` past 32k tokens.
+- Log label corrected: `(tools floor)` → `(user-turns=N>=10 floor)`.
+- `__version__` corrected from `"0.1.0"` to `"0.9.2"`.
+- CI release pipeline now runs lint (`ruff`) + `pytest` across Python
+  3.11/3.12/3.13 before building the wheel. Previously the `test` job was
+  documented in comments but never implemented.
+- Added `LICENSE` (MIT) file to the repository root.
+- README routing table updated: high-fidelity ceiling corrected (8k → 12k),
+  tools-floor condition updated to reflect user-turn counting, `ROUTER_MULTI_TURN`
+  default corrected (6 → 10).
+- UPGRADING.md corrected: `llmstack install` does **not** regenerate
+  `llama-swap.yaml` — that is `llmstack restart`'s job. Three places in the
+  doc had this wrong.
+- README layout tree: repo root label corrected (`opencode/` → `llmstack/`),
+  `models.ini` moved to its correct location inside the package, `shell.py`
+  (deleted) removed, `reload.py` and `LICENSE` added.
+- `iter_downloads` reference in UPGRADING.md corrected to `iter_download_targets`.
+- Bundled `llmstack/models.ini` header comment paths updated from legacy
+  locations to current `.llmstack/` state-dir layout.
+- `assert` statements in production code (`app.py`, `bedrock.py`) replaced
+  with explicit `RuntimeError` / `TypeError` raises so `-O` optimisation
+  does not silently swallow them.
+- `UPGRADING.md` and `LICENSE` added to the sdist via `pyproject.toml`
+  `package-data` / `tool.setuptools` config.
+- `[tool.pytest.ini_options]` added to `pyproject.toml` with `testpaths`
+  and `addopts`.
+- Python 3.14 classifier added to `pyproject.toml`.
+
+### Added
+- `classify()` end-to-end test coverage: step-down ladder (short/mid/long
+  context), multi-turn floor, plan-signal routing, ultra-trigger routing,
+  uncensored-trigger routing, plan ctx-size overflow fall-through.
+- Generator tests: `build_config()` coverage for gguf tiers, bedrock tiers,
+  `use_next`, `small_model` wiring, agent wiring, `auto` ctx derivation.
+- `X-LLMStack-Tokens` response header on every `/v1/chat/completions` and
+  `/v1/completions` response so opencode (and curl) can see the estimated
+  token count the router used to make its routing decision.
+
+---
+
+## [0.9.1] — 2026-05-11
+
+### Fixed
+- `classify()` multi-turn floor: count only `role == "user"` messages
+  (not all messages). This was the primary fix preventing `code-fast`
+  from ever being reached in long sessions.
+- Multi-turn threshold raised 6 → 10 (see 0.9.2 for full rationale).
+- Log label `(tools floor)` corrected to `(user-turns=N>=10 floor)`.
+
+---
+
+## [0.9.0] — 2026-05-08
+
+### Changed
+- Plan tiers now strip `tools` from the request body before dispatch.
+  Previously a plan-routed request carrying a `tools` array would fail
+  on Bedrock (Converse rejects tool configs on non-agent models).
+- Long-context fall-through to `code-fast` is now allowed even when
+  `tools[]` is present in the request body. The tools-presence check
+  was removed from the floor condition; only turn count matters now.
+- `plan` tier ctx-size overflow: when estimated tokens exceed the
+  planner's `ctx_size`, the request falls through to the coding ladder
+  instead of being sent to a planner whose window can't hold it.
+- `HIGH_FIDELITY_CEILING` raised to 12 000 (was 8 000).
+
+---
+
+## [0.8.0] — 2026-05-07
+
+### Changed
+- Fidelity-ceiling overhaul: each ceiling is now exactly half of the
+  corresponding tier's `ctx_size` (the "comfortable headroom" invariant).
+- `code-ultra.ctx_size` set to 24 000 (2× high ceiling of 12 000).
+- `code-smart.ctx_size` set to 64 000 (2× mid ceiling of 32 000).
+- `code-fast.ctx_size` set to 128 000 (YaRN ×4 from native 32k).
+- `HIGH_FIDELITY_CEILING` env var added; overrides the 12 000 default.
+- `MID_FIDELITY_CEILING` env var added; overrides the 32 000 default.
+
+---
+
+## [0.7.3] — 2026-05-06
+
+### Added
+- Per-tier Bedrock alternatives in `models.ini`: every tier now ships a
+  commented-out Bedrock block directly beneath its GGUF block.
+- All Bedrock tiers anchored to `eu-west-3`; `plan-uncensored` pinned to
+  `us-west-2` (Llama 405B has no EU deployment).
+- `aws_model_id_next` / `aws_region_next` support for Bedrock upgrade
+  pre-staging (mirrors gguf `hf_file_next`).
+
+### Fixed
+- `models.ini` comment cleanup: removed stale references to old model names.
+
+---
+
+## [0.7.2] — earlier
+
+### Fixed
+- Soft-fail when `llama-server` binary is missing at startup.
+- PowerShell activation hook: fixed `Invoke-Expression` quoting.
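Editor's note (not part of the diff): the 0.9.1/0.9.2 `classify()` fix above amounts to counting only user-role messages before applying the long-context floor. The sketch below is a hedged reconstruction under that description, assuming an OpenAI-style `messages` list; the helper name and env-var parsing are illustrative, and the real `classify()` in `llmstack/app.py` is not shown in this diff.

```python
# Editor's sketch of the user-turn floor described in the 0.9.x changelog entries.
# Only the role-counting rule and the 10-turn / 32 000-token defaults come from the diff;
# everything else here is an assumption for illustration.
import os

MULTI_TURN_FLOOR = int(os.getenv("ROUTER_MULTI_TURN", "10"))                  # was 6 before 0.9.1
MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))


def pick_long_context_tier(messages: list[dict], est_tokens: int) -> str:
    """Long-context rung only: past the mid ceiling, step down to code-fast
    unless the session already has >= MULTI_TURN_FLOOR *user* turns."""
    if est_tokens <= MID_FIDELITY_CEILING:
        return "code-smart"
    # The fix: count only user-role messages, not len(messages), so system
    # prompts, assistant turns, and tool results no longer trip the floor.
    user_turns = sum(1 for m in messages if m.get("role") == "user")
    return "code-smart" if user_turns >= MULTI_TURN_FLOOR else "code-fast"
```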
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 llmstack contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -1,9 +1,30 @@
 Metadata-Version: 2.4
 Name: opencode-llmstack
-Version: 0.9.1
+Version: 0.9.2
 Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
 Author: llmstack
-License: MIT
+License: MIT License
+
+Copyright (c) 2024 llmstack contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
 Project-URL: Homepage, https://github.com/rohitgarg19/llmstack
 Project-URL: Issues, https://github.com/rohitgarg19/llmstack/issues
 Keywords: llm,llama-cpp,llama-swap,opencode,router,local-ai
@@ -17,9 +38,11 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Software Development
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
+License-File: LICENSE
 Requires-Dist: fastapi<1.0,>=0.110
 Requires-Dist: httpx<1.0,>=0.27
 Requires-Dist: uvicorn[standard]<1.0,>=0.30
@@ -32,6 +55,7 @@ Requires-Dist: pytest>=7; extra == "dev"
 Provides-Extra: bedrock
 Requires-Dist: boto3>=1.35; extra == "bedrock"
 Requires-Dist: botocore>=1.35; extra == "bedrock"
+Dynamic: license-file

 # llmstack — multi-tier local LLM stack for Mac M4 Max / 64 GB

@@ -118,9 +142,9 @@ First match wins:
 | 1 | last user msg contains `[nofilter]`, `[uncensored]`, `[heretic]`, or starts with `uncensored:` / `nofilter:` | `plan-uncensored` | explicit opt-in |
 | 2 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
 | 3 | plan verbs (*design, architect, approach, trade-off, should we, explain why, …*) AND no code blocks / agent verbs / tools | `plan` | pure design discussion (orthogonal track) |
-| 4 | estimated input ≤
+| 4 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier — context still being built, latency/$ are best here |
 | 5 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
-| 6 | otherwise (long context) AND
+| 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
 | 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |

 Token estimates are `chars / 4` over all message text + `prompt`. The
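Editor's note (not part of the diff): the "`chars / 4`" estimate plus the corrected ceilings in the routing table above imply an estimator and step-down ladder roughly like the sketch below. This is a hedged reconstruction: `_estimate_tokens()` appears by name in a later app.py hunk, but its exact implementation is not in this diff, so the function names and request shape here are assumptions.

```python
# Editor's sketch of the chars/4 token estimate and the fidelity ladder shown above.
# Only the chars/4 rule and the 12 000 / 32 000 ceilings come from this diff.
def estimate_tokens(messages: list[dict] | None, prompt: str | None) -> int:
    chars = len(prompt or "")
    for m in messages or []:
        content = m.get("content")
        if isinstance(content, str):
            chars += len(content)
    return chars // 4


def ladder_tier(est: int, high: int = 12_000, mid: int = 32_000) -> str:
    if est <= high:
        return "code-ultra"   # falls back to code-smart if the ultra tier is unwired
    if est <= mid:
        return "code-smart"
    return "code-fast"        # long context; still subject to the >= 10 user-turn floor
```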
@@ -185,12 +209,12 @@ from any directory you previously ran `install` in.
 ## Layout

 ```
-
+llmstack/ # repo root
 ├── pyproject.toml # package metadata + `llmstack` console script
 ├── README.md # this file
 ├── UPGRADING.md # how to swap any tier for a newer/better model
 │ + how to upgrade the Python toolchain itself
-├──
+├── LICENSE # MIT
 └── llmstack/ # the python package (importable, installable)
 ├── __init__.py
 ├── __main__.py # `python -m llmstack`
@@ -199,6 +223,7 @@ opencode/ # repo root
 ├── shell_env.py # spawn the env-prepared subshell + activate hooks
 ├── app.py # FastAPI auto-router (~280 lines)
 ├── tiers.py # parse models.ini -> Tier dataclasses
+├── models.ini # SINGLE SOURCE OF TRUTH for tiers + sampler (bundled template)
 ├── check_models.py # snapshot tool (HF metadata + drift check)
 ├── AGENTS.md # opencode agent template (shipped as package data)
 ├── generators/
@@ -213,9 +238,9 @@ opencode/ # repo root
 ├── install_llama_swap.py
 ├── download.py
 ├── start.py
-├── shell.py
 ├── stop.py
 ├── restart.py
+├── reload.py
 ├── status.py
 ├── check.py
 └── activate.py
@@ -544,7 +569,7 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
 | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
 | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
-| `ROUTER_MULTI_TURN` | `
+| `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
 | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
 | `LOG_LEVEL` | `info` | router log level |

@@ -83,9 +83,9 @@ First match wins:
 | 1 | last user msg contains `[nofilter]`, `[uncensored]`, `[heretic]`, or starts with `uncensored:` / `nofilter:` | `plan-uncensored` | explicit opt-in |
 | 2 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
 | 3 | plan verbs (*design, architect, approach, trade-off, should we, explain why, …*) AND no code blocks / agent verbs / tools | `plan` | pure design discussion (orthogonal track) |
-| 4 | estimated input ≤
+| 4 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier — context still being built, latency/$ are best here |
 | 5 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
-| 6 | otherwise (long context) AND
+| 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
 | 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |

 Token estimates are `chars / 4` over all message text + `prompt`. The
@@ -150,12 +150,12 @@ from any directory you previously ran `install` in.
 ## Layout

 ```
-
+llmstack/ # repo root
 ├── pyproject.toml # package metadata + `llmstack` console script
 ├── README.md # this file
 ├── UPGRADING.md # how to swap any tier for a newer/better model
 │ + how to upgrade the Python toolchain itself
-├──
+├── LICENSE # MIT
 └── llmstack/ # the python package (importable, installable)
 ├── __init__.py
 ├── __main__.py # `python -m llmstack`
@@ -164,6 +164,7 @@ opencode/ # repo root
 ├── shell_env.py # spawn the env-prepared subshell + activate hooks
 ├── app.py # FastAPI auto-router (~280 lines)
 ├── tiers.py # parse models.ini -> Tier dataclasses
+├── models.ini # SINGLE SOURCE OF TRUTH for tiers + sampler (bundled template)
 ├── check_models.py # snapshot tool (HF metadata + drift check)
 ├── AGENTS.md # opencode agent template (shipped as package data)
 ├── generators/
@@ -178,9 +179,9 @@ opencode/ # repo root
 ├── install_llama_swap.py
 ├── download.py
 ├── start.py
-├── shell.py
 ├── stop.py
 ├── restart.py
+├── reload.py
 ├── status.py
 ├── check.py
 └── activate.py
@@ -509,7 +510,7 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
 | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
 | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
-| `ROUTER_MULTI_TURN` | `
+| `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
 | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
 | `LOG_LEVEL` | `info` | router log level |

@@ -0,0 +1,602 @@
+# UPGRADING — replacing models with newer / better ones
+
+This stack is built around **swapping individual model files in/out**. There is no
+lock-in to provider, model family, or name — you can replace any tier with any
+other GGUF that fits the role and your hardware budget.
+
+This doc explains:
+
+1. [Why GGUF specifically](#why-gguf)
+2. [Where each model is referenced (single source of truth + IDE wiring)](#where-the-models-live)
+3. [The upgrade workflow](#upgrade-workflow)
+4. [How to judge "holistically better" per tier](#judging-better-per-tier)
+5. [Where to look for candidates](#where-to-find-candidates)
+6. [Worked example](#worked-example-replacing-the-plan-tier)
+7. [After-upgrade housekeeping](#after-upgrade-housekeeping)
+8. [Upgrading the Python toolchain](#upgrading-the-python-toolchain)
+
+---
+
+## Why GGUF
+
+Every model the stack runs **must** be a GGUF (`.gguf`) file because:
+
+- **`llama.cpp` only loads GGUF natively.** The whole stack — `llama-server`,
+  `llama-swap`, the Metal backend on Apple Silicon — is built on llama.cpp.
+- **Self-contained format.** A single `.gguf` includes weights, tokenizer,
+  chat template (Jinja), and metadata. No `transformers`, no separate
+  tokenizer files, no Python deps at runtime.
+- **First-class quantisation.** GGUF is the format quantised models ship in
+  (Q4_K_M, Q5_K_M, Q8_0, i1, UD, …). Quants are how a 24 B model fits in
+  13 GB of RAM with little quality loss.
+- **Instant `-hf` resolution.** `llama-server -hf <repo> -hff <file>.gguf`
+  downloads on demand into the standard HF cache and starts serving — no
+  conversion step, no extra tooling.
+
+If a model you want is **not** published as a GGUF, you have two options:
+
+1. Wait for one of the trusted converters (`bartowski`, `unsloth`,
+   `mradermacher`, `lmstudio-community`, `MaziyarPanahi`, …) to publish one —
+   usually within hours of the original release.
+2. Convert it yourself with `llama.cpp/convert_hf_to_gguf.py` then quantise
+   with `llama-quantize`. Doable but slow and adds maintenance burden — only
+   worth it for niche models.
+
+**Rule of thumb:** if you can't find `<model-name>-GGUF` on HF or via the
+maintainers above, the model is not ready for this stack. Pick another.
+
+## Cache management
+
+There is **one** writer to the model cache: `llama-cli`. Everything reads from
+the same place: `~/.cache/huggingface/hub/`.
+
+```
+~/.cache/huggingface/hub/
+└── models--<owner>--<repo>/
+    ├── blobs/
+    │   ├── <sha>                        ← completed file
+    │   └── <sha>.downloadInProgress     ← llama.cpp's resumable partial
+    ├── refs/main                        ← commit hash
+    └── snapshots/<commit>/
+        └── <filename>.gguf              ← symlink to the blob
+```
+
+Key points:
+
+- **`llama-cli -cl`** lists everything currently complete in the cache.
+- **`llmstack download` uses `llama-cli`** so the cache stays coherent. It
+  accepts `HF_TOKEN` from the environment if you have one.
+- **Do NOT use `huggingface-cli download` or `pip install huggingface_hub`'s
+  `hf_hub_download()`** to write to the same cache — they use a different
+  partial-file convention (`.incomplete` vs `.downloadInProgress`) and
+  cannot resume each other's partials. Completed files are interchangeable;
+  in-flight downloads are not.
+- **Cache-dir override:** all llama.cpp tools accept `-cd <dir>` to point at
+  a different cache. We don't use it; the default is correct.
+- **Cache cleanup:** `llama-cli -cr <repo>:<quant>` removes a single entry,
+  or `rm -rf ~/.cache/huggingface/hub/models--<owner>--<repo>/` removes a
+  whole repo's blobs.
+
+## `-hf` / `-hff` syntax (gotcha)
+
+```
+-hf <user>/<repo>            repo on HuggingFace
+-hf <user>/<repo>:<quant>    repo + quant TAG (e.g. :Q4_K_M, :Q6_K) — auto-resolves a file
+-hff <filename>.gguf         explicit file inside the repo
+```
+
+`-hf <repo>:<full-filename>.gguf` **does not work** — the `:suffix` part is
+parsed as a quant tag, so a full filename ending in `.gguf` is rejected with
+`get_hf_plan: no GGUF files found in repository`. Always pair `-hf <repo>` with
+`-hff <filename>.gguf` for explicit selection. This is what the generated
+`llama-swap.yaml` uses everywhere.
+
+---
+
+## Where the models live
+
+There is exactly **one source of truth** that determines what runs in a
+given project: `.llmstack/models.ini` in the work-dir. Everything else is
+generated from it. The first time you run `llmstack install` in a fresh
+project, this file is auto-seeded from the bundled template at
+`llmstack/models.ini` inside the package; from then on it's per-project.
+
+| File | What it does | Required when changing a model? |
+|---|---|---|
+| **`<work-dir>/.llmstack/models.ini`** | The single source of truth: tier definitions (repo, file, ctx, sampler, role, …). All other artefacts derive from it. Auto-seeded from the bundled template on first `install`; gitignored after that — yours to edit per project. | **Yes — primary edit.** |
+| `llmstack/models.ini` (in the package) | Bundled template, version-controlled, ships with `pip install -e .`. Only consulted as the seed source when a project has no `.llmstack/models.ini` yet. Edit if you want to change the *factory defaults* every new project starts from. | Only when changing factory defaults. |
+| `<work-dir>/.llmstack/llama-swap.yaml` | AUTO-GENERATED by `llmstack.generators.llama_swap`. Determines what `llama-server` actually loads. Regenerated fresh by `llmstack start` (or `llmstack restart`) for the channel you're booting into. Do not regenerate with `llmstack install` — `install` only writes `opencode.json` + `AGENTS.md`. | Auto — don't hand-edit. |
+| `<work-dir>/.llmstack/opencode.json` | AUTO-GENERATED by `llmstack.generators.opencode`. `<work-dir>` is whatever directory the CLI was invoked from — `cd` into any project to get a project-local config. Wires opencode's `model`, `small_model`, and `agent.*` to the right tiers. Loaded by opencode via the `OPENCODE_CONFIG` env var that the activate hook (or `llmstack start`'s fallback subshell) exports — your global `~/.config/opencode/opencode.json` is intentionally left alone. Regenerate via `llmstack install`. | Auto — don't hand-edit. |
+| `<work-dir>/.llmstack/AGENTS.md` | Copy of `llmstack/AGENTS.md` (the bundled template), made by `install`. Loaded by opencode via the `instructions` field of `opencode.json`. Edit the source template to change defaults; edit the `.llmstack/` copy for one-off project tweaks (will be overwritten on next `install`). | Auto — copy of template. |
+| `llmstack download` | Enumerates download targets at runtime via `llmstack.tiers.iter_download_targets` (which reads `models.ini`). | No edits — it reflects `models.ini` automatically. |
+
+`llmstack check` flags any DRIFT between `models.ini` (the recommended file
+per tier) and `llama-swap.yaml` (the file actually configured). If you ever
+see DRIFT, run `llmstack restart` to regenerate the yaml and cycle the stack.
+
+### Where the download list comes from
+
+`llmstack download` does **not** hardcode the (repo, file) tuples. It
+enumerates them at runtime by reading `.llmstack/models.ini` (via
+`llmstack.tiers`), yielding one row per `(tier, label)` where `label` is
+`current` (the active `hf_file`) or `next` (the queued `hf_file_next`).
+
+To add a new file to the download set, edit the matching tier in
+`models.ini`. To stop pre-fetching an upgrade target, blank out its
+`hf_file_next` line. No edits to Python code needed.
+
+### Trying the queued upgrade without committing to it
+
+Once an `hf_file_next` finishes downloading you can run the whole stack
+against the **next** file for every tier that has one, without touching
+`models.ini` permanently:
+
+```bash
+llmstack stop
+llmstack start --next
+llmstack status        # shows "channel: next" + per-tier swap list
+```
+
+Under the hood, `start --next` calls
+`llmstack.generators.llama_swap.render(use_next=True)` and writes the
+result to the same `<work-dir>/.llmstack/llama-swap.yaml` (with a header
+banner reminding you it's the next-channel build), then points llama-swap
+at it. Tiers with no `hf_file_next` are unchanged. To revert, `llmstack
+stop && llmstack start` (no `--next`) regenerates the canonical yaml.
+To make the upgrade permanent, swap `hf_file_next` into `hf_file` in
+`models.ini` and re-run `llmstack install`.
+
+### Anchors to find on each upgrade
+
+Inside `<work-dir>/.llmstack/llama-swap.yaml`, each tier has a clearly-marked
+model block:
+
+```yaml
+models:
+  code-fast:        # ← tier alias (do not rename without updating opencode.json)
+    cmd: |
+      ${llama_server} ${metal_defaults}
+      # >>> UPGRADE-POINT (code-fast): swap the -hf/-hff pair below to change this tier.
+      -hf bartowski/Qwen2.5-Coder-3B-Instruct-GGUF     # <-- REPLACE: HF repo
+      -hff Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf       # <-- REPLACE: filename in that repo
+      --alias code-fast
+      -c 131072
+      ...
+```
+
+But don't edit the YAML — it's regenerated. Find the same lines in
+`.llmstack/models.ini` (search for the tier name, e.g. `[code-fast]`) and
+edit `hf_repo` / `hf_file` there. Then re-run `llmstack install`.
+
+The download list comes from the same ini file: the four tiers map 1:1 to
+four download jobs, each with `(repo, current-file, next-file?)`.
+
+---
+
+## Upgrade workflow
+
+Apply this **per tier**, not all at once. Test one model at a time so you
+can roll back cleanly.
+
+```
+0. (Optional, fast)  llmstack check
+   → prints current model + repo + last-modified + HF URL for each tier
+
+1. Identify a candidate.
+   - Hugging Face search: https://huggingface.co/models?search=<keywords>+GGUF&sort=trending
+   - Filter: GGUF format, last 30 days, downloads > 1k
+   - Cross-check: a leaderboard appropriate to the tier (see below)
+
+2. Verify the candidate is suitable for the tier.
+   - GGUF available?          must exist as a .gguf file
+   - Size in budget?          see "tier sizes" table below
+   - Chat template embedded?  open the GGUF page → "Use this model" → llama.cpp tab.
+                              If --jinja is mentioned or chat_template metadata
+                              is shown, you're fine.
+   - Tool calls supported?    only matters for code-smart. Check that the model
+                              page lists "function calling" or its chat template
+                              handles tool-call blocks.
+   - Native context length?   must be ≥ what you set with -c, or have YaRN config.
+
+3. Smoke-test it before wiring it in. First, prefetch + validate via llama-cli:
+
+   llama-cli -hf <new-repo> -hff <new-file>.gguf \
+       --no-warmup -ngl 0 -c 256 -p ok -n 1
+
+   # then bring up llama-server on a throwaway port to test inference:
+   llama-server -hf <new-repo> -hff <new-file>.gguf \
+       --port 9999 -ngl 999 -fa on --jinja \
+       --cache-type-k q8_0 --cache-type-v q8_0 -c 32768
+
+   # in another terminal:
+   curl -sN http://127.0.0.1:9999/v1/chat/completions \
+     -H 'Content-Type: application/json' \
+     -d '{"model":"x","messages":[{"role":"user","content":"hello"}]}'
+
+   Confirm:
+   - it loads without errors (look for "main: model loaded")
+   - it streams a sensible response
+   - it accepts your typical request shape (tools, long context, …)
+
+4. Edit ONE file: `.llmstack/models.ini`. In the matching `[tier]` section,
+   update:
+
+   hf_repo  = <new-repo>
+   hf_file  = <new-file>.gguf
+   ctx_size = <new-context>              ; only if changing
+   sampler  = temp=..., top_p=..., ...   ; only if changing
+   size_gb, quant, status                ; for documentation accuracy
+
+   No other files need editing — `llama-swap.yaml`, `opencode.json`, and
+   the download list are all auto-derived from this.
+
+5. Regenerate everything from the ini:
+
+   llmstack install    # rewrites opencode.json + AGENTS.md
+   llmstack download   # picks up the new (repo, file) pair from models.ini
+
+6. Cycle the stack:
+
+   llmstack stop
+   llmstack start
+   llmstack status
+   curl -s http://127.0.0.1:10101/v1/models | jq '.data[].id'
+
+7. Sanity-check via opencode (or curl) — fire a few characteristic prompts at
+   the upgraded tier. If the new model misbehaves, the rollback is just a
+   one-line revert in models.ini + `llmstack install` + `llmstack restart`.
+```
+
+---
+
+## Judging "better" per tier
+
+A model that scores +5 % on MMLU is not automatically better for *your*
+workflow. Score against the role the tier plays.
+
+### `code-fast` — autocomplete / FIM / quick Q&A
+
+What matters:
+- **Tokens/sec on M4 Max** (must clear ~60 tok/s for tab-feel responsiveness)
+- **FIM (fill-in-the-middle) support** — the chat template must include a FIM
+  format, and the model must be trained for it
+- **Tool calling** — *not* required, this tier never calls tools
+
+How to evaluate:
+- Run `llama-bench -m <new>.gguf -p 512 -n 128 -ngl 999` for raw speed
+- Sniff test with a typical autocomplete prompt; latency should feel like
+  the cursor is barely ahead of you
+- Aider leaderboard "edit format" column — proxy for FIM quality
+
+Size budget: **~2–6 GB** weights (we want this resident permanently while
+sharing memory with the heavy tier).
+
+Good candidates to track:
+- `bartowski/Qwen2.5-Coder-*-Instruct-GGUF` (any of 3B/7B)
+- `bartowski/DeepSeek-Coder-V2-Lite-Instruct-GGUF`
+- `unsloth/Qwen3-Coder-*-GGUF` (smaller variants)
+- Any new "tab" / "FIM" coder ≤ 7B that drops
+
+### `code-smart` — agent: tool calls, multi-file edits, refactors
+
+What matters:
+- **Tool / function-calling correctness** (the model must reliably emit
+  well-formed tool-call blocks)
+- **Long-context recall** (≥ 64 k usable, ideally 128 k+)
+- **Code editing benchmarks** (aider edit success, SWE-Bench Verified)
+- **Speed at full context** (MoE models win here on Apple Silicon)
+
+How to evaluate:
+- Aider's [LLM Leaderboard](https://aider.chat/docs/leaderboards/) — most
+  honest signal for agentic coding
+- LiveCodeBench scores
+- SWE-Bench Verified (the "real PRs" benchmark)
+- Run an actual opencode session in `build` mode against your repo
+
+Size budget: **~30–55 GB** weights (must fit alongside `code-fast` ≈ 5 GB
+in your wired-mem cap).
+
+Good candidates to track:
+- `unsloth/Qwen3-Coder-*-GGUF` (incl. the Next 80B-A3B you have)
+- `bartowski/DeepSeek-Coder-V*-Instruct-GGUF`
+- `bartowski/Codestral-*-GGUF` future versions
+- New "Coder" or "Devstral" releases as they appear
+
+### `plan` — design discussions, architecture, trade-offs
+
+What matters:
+- **Reasoning quality** (MMLU-Pro, GPQA-Diamond)
+- **Instruction-following on multi-step prompts** (IFEval)
+- **Discussion style** — should propose alternatives, not jump to code
+- **Refusals on edge cases** — fine to refuse weird stuff in plain plan mode
+
+How to evaluate:
+- Open LLM Leaderboard (filter to chat/instruct, your size class)
+- Chatbot Arena — vibes-based but useful proxy
+- Hand-roll a "design this rate limiter" prompt and compare outputs
+
+Size budget: **~7–25 GB** weights — this tier shouldn't dominate memory.
+
+Good candidates to track:
+- `bartowski/Qwen3-*-Instruct-GGUF`
+- `bartowski/Mistral-Small-*-Instruct-GGUF` (non-uncensored)
+- `bartowski/gemma-*-it-GGUF`
+- `bartowski/glm-*-chat-GGUF`
+- Reasoning-tuned variants: QwQ, DeepSeek-R1-Distill, Qwen3-Thinking
+
+### `plan-uncensored` — no-filter planning
+
+What matters:
+- Same metrics as `plan`
+- **Plus** demonstrably reduced refusal rate (look for "abliterated",
+  "uncensored", "heretic", "dolphin", "neural-chat" branding)
+
+Good candidates to track:
+- `mradermacher/<base-model>-uncensored-*-GGUF`
+- `mradermacher/<base-model>-heretic-i1-GGUF`
+- `bartowski/<base-model>-abliterated-*-GGUF`
+- `cognitivecomputations/dolphin-*` (then look for GGUF re-uploads)
+
+Same size budget as `plan`.
+
+---
+
+## Where to find candidates
+
+**HuggingFace search** (fastest):
+
+- All recent GGUFs sorted by trending:
+  https://huggingface.co/models?library=gguf&sort=trending
+- New coder GGUFs in the last 30 days:
+  https://huggingface.co/models?other=code&library=gguf&sort=created
+- Specific maintainer feeds (subscribe / bookmark):
+  - https://huggingface.co/bartowski (broad coverage, fast turnaround)
+  - https://huggingface.co/unsloth (Qwen, Llama, with Dynamic UD quants)
+  - https://huggingface.co/mradermacher (i1 / abliterated / heretic variants)
+  - https://huggingface.co/lmstudio-community (curated, conservative quants)
+  - https://huggingface.co/MaziyarPanahi (broad chat + coder)
+
+**Leaderboards** (signal):
+
+| Tier | Leaderboard |
+|---|---|
+| `code-fast` / `code-smart` | https://aider.chat/docs/leaderboards/ |
+| | https://livecodebench.github.io/leaderboard.html |
+| | https://www.swebench.com/ (Verified split) |
+| `plan` / `plan-uncensored` | https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard |
+| | https://lmarena.ai/ |
+| | https://livebench.ai/ |
+
+**Community signal** (qualitative but valuable):
+
+- r/LocalLLaMA — daily threads on what's good
+- HF model card discussions — real-world report-backs
+- Each maintainer's repo READMEs often link to benchmarks they ran themselves
+
+---
+
+## Worked example: replacing the `plan` tier
+
+Suppose tomorrow `bartowski` ships `Qwen3-Next-32B-Thinking-GGUF` and you
+think it'd plan better than your current Qwopus GLM 18B.
+
+```bash
+# 0. Snapshot the current configuration
+cd ~/Projects/opencode
+llmstack check > /tmp/before.txt
+
+# 1. Pre-pull via llama-cli (writes to the standard cache)
+llama-cli -hf bartowski/Qwen3-Next-32B-Thinking-GGUF \
+          -hff Qwen3-Next-32B-Thinking-Q5_K_M.gguf \
+          --no-warmup -ngl 0 -c 256 -p ok -n 1
+
+# 2. Smoke-test inference on a throwaway port
+llama-server -hf bartowski/Qwen3-Next-32B-Thinking-GGUF \
+             -hff Qwen3-Next-32B-Thinking-Q5_K_M.gguf \
+             --port 9999 -ngl 999 -fa on --jinja \
+             --cache-type-k q8_0 --cache-type-v q8_0 -c 65536 &
+
+curl -sN http://127.0.0.1:9999/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{"model":"x","messages":[{"role":"user","content":"How would you architect a rate limiter for our API?"}]}'
+
+kill %1   # stop the test server
+
+# 3. Edit ONE file: .llmstack/models.ini -> the [plan] section.
+#    OLD:
+#      hf_repo = Jackrong/Qwopus-GLM-18B-Merged-GGUF
+#      hf_file = Qwopus-GLM-18B-Healed-Q4_K_M.gguf
+#    NEW:
+#      hf_repo = bartowski/Qwen3-Next-32B-Thinking-GGUF
+#      hf_file = Qwen3-Next-32B-Thinking-Q5_K_M.gguf
+#    Also bump size_gb / quant for documentation accuracy.
+
+# 4. Apply: regenerate llama-swap.yaml + opencode.json, restart, verify.
+llmstack install
+llmstack restart
+llmstack status
+llmstack check    # confirms no DRIFT vs models.ini
+```
+
+Roll back? Revert the two lines in `.llmstack/models.ini` and re-run
+`llmstack install && llmstack restart`. The old GGUF is still in the HF
+cache so loading it costs nothing extra.
+
+---
+
+## After-upgrade housekeeping
+
+When you're confident in the new model, reclaim disk space:
+
+```bash
+# What does llama.cpp see in its cache?
+llama-cli -cl
+
+# Sizes on disk (sorted)
+du -h ~/.cache/huggingface/hub/* | sort -h
+
+# Drop a single quant
+llama-cli -cr <user>/<repo>:<quant>   # e.g. -cr unsloth/Qwen3-Coder-Next-GGUF:Q4_K_M
+
+# Drop a whole repo
+rm -rf ~/.cache/huggingface/hub/models--<owner>--<repo>/
+```
+
+The old `Q4_K_M` for a tier you've upgraded is also a candidate for deletion
+once the new quant is verified working.
+
+---
+
+## Upgrading the Python toolchain
+
+The stack is a single Python package (`llmstack`) installed via
+`pyproject.toml`. The runtime needs:
+
+- `llmstack/app.py` — `fastapi`, `uvicorn[standard]`, `httpx` (router)
+- `llmstack/check_models.py` — `huggingface_hub`, `PyYAML`
+- `llmstack/tiers.py`, `llmstack/generators/*.py` — stdlib only
+
+`hf_transfer` is a declared dep and gets used automatically when
+`HF_HUB_ENABLE_HF_TRANSFER=1` is set, for faster multi-GB GGUF pulls.
+
+Dependency versions live in `pyproject.toml`'s `[project.dependencies]`,
+not in a `requirements.txt`. There is no checked-in venv either.
+
+### First-time setup
+
+The cleanest install puts the CLI on your PATH in an isolated env:
+
+```bash
+cd ~/Projects/opencode
+pipx install -e .    # editable install + isolated venv
+llmstack --version
+```
+
+Or, if you prefer a managed venv (no pipx):
+
+```bash
+cd ~/Projects/opencode
+python3 -m venv .venv
+.venv/bin/pip install -e .
+.venv/bin/llmstack --version
+```
+
+Pin a specific Python (e.g. 3.13) by passing `python3.13 -m venv .venv` or
+`pipx install --python python3.13 -e .`. Anything ≥ 3.11 works (we use
+3.11+ syntax: PEP 604 `X | None`, `list[Tier]`, etc.).
+
+### Routine upgrade (latest patch versions of declared deps)
+
+```bash
+cd ~/Projects/opencode
+pipx upgrade llmstack             # if installed via pipx
+# or, in your venv:
+.venv/bin/pip install -U -e .
+
+llmstack stop && llmstack start   # bounce the router
+llmstack check                    # smoke-test PyYAML + huggingface_hub
+```
+
+### Bumping a major version (e.g. fastapi 0 → 1)
+
+1. Edit the version constraint in `pyproject.toml`'s
+   `[project.dependencies]` (e.g. `"fastapi>=1.0,<2.0"`).
+2. Reinstall: `pipx upgrade llmstack` or `.venv/bin/pip install -U -e .`.
+3. Test the router:
+
+   llmstack stop && llmstack start
+   curl -s http://127.0.0.1:10101/v1/models | jq '.data[].id'
+   curl -s http://127.0.0.1:10101/models.ini | head
+   curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
+     -d '{"model":"auto","messages":[{"role":"user","content":"hi"}]}'
+
+4. Read the upstream changelog for any breaking imports the router relies
+   on (`fastapi.FastAPI`, `Request`, `Response`, `JSONResponse`,
+   `StreamingResponse`, lifespan handlers, etc.). Update `llmstack/app.py`
+   to match if needed.
+
+### Rebuilding the install from scratch
+
+If anything gets weird (corrupt install, Python upgrade, dependency
+conflicts), nuke and reinstall — it's cheap:
+
+```bash
+# pipx install:
+pipx uninstall llmstack
+pipx install -e ~/Projects/opencode
+
+# venv install:
+cd ~/Projects/opencode
+rm -rf .venv
+python3 -m venv .venv
+.venv/bin/pip install -e .
+```
+
+Nothing in the venv / pipx env is editable — all source lives under
+`llmstack/` in the repo and is installed in editable mode (`-e`), so edits
+land instantly.
+
+### Upgrading the llama-swap binary
+
+The binary lives at `$LLMSTACK_BIN_DIR/llama-swap`, which defaults to
+`$XDG_DATA_HOME/llmstack/bin/llama-swap` (i.e.
+`~/.local/share/llmstack/bin/llama-swap` on macOS). It is **not** in
+version control — `llmstack setup` downloads it as part of the first-time
+walkthrough, and `llmstack install-llama-swap` lets you re-fetch it on
+demand.
+
+```bash
+llmstack install-llama-swap           # latest, idempotent (no-op if up-to-date)
+llmstack install-llama-swap --force   # redownload even if up-to-date
+LLAMA_SWAP_VERSION=v211 \
+  llmstack install-llama-swap         # pin a specific tag
+```
+
+It auto-detects OS (Darwin / Linux / FreeBSD) and arch (arm64 / amd64) and
+downloads the matching tarball from
+https://github.com/mostlygeek/llama-swap/releases.
+
+To check what's installed, look at the version line printed by
+`llmstack install-llama-swap` (or run the binary directly):
+
+```bash
+"$(llmstack status 2>/dev/null | awk '/llama-swap binary/ {print $NF}')" --version
+# or simply:
+~/.local/share/llmstack/bin/llama-swap --version
+```
+
+After bumping the binary, re-run `llmstack install` only if the new
+version of llama-swap requires a config-schema change (it usually doesn't —
+the YAML schema is stable).
+
+### Snapshotting the exact installed set
+
+For reproducibility (e.g. before a risky upgrade) capture the full lock:
+
+```bash
+# pipx:
+pipx runpip llmstack freeze > requirements.lock.txt
+
+# venv:
+.venv/bin/pip freeze > requirements.lock.txt
+
+# …upgrade…
+# if it goes wrong:
+.venv/bin/pip install -r requirements.lock.txt
+```
+
+We don't commit `requirements.lock.txt` by default — the floor pins in
+`pyproject.toml` are sufficient for this stack's blast radius.
+
+---
+
+## Quick reference — tier sizes & natural model classes
+
+| Tier | Weights budget | Typical params (dense) | Typical params (MoE) | Examples |
+|---|---|---|---|---|
+| `code-fast` | 2–6 GB | 1.5–7 B | — | Qwen2.5-Coder 3B/7B, DeepSeek-Coder-V2-Lite |
+| `code-smart` | 25–55 GB | 30–70 B | 30B-A3B / 80B-A3B | Qwen3-Coder-Next, DeepSeek-Coder-V2, Codestral |
+| `plan` | 7–25 GB | 14–32 B | — | Qwen3, Mistral-Small, Gemma-3, GLM |
+| `plan-uncensored` | 7–25 GB | 14–32 B | — | abliterated/heretic/dolphin variants of the above |
+
+Anything outside these brackets either won't fit, or wastes hardware.
@@ -212,6 +212,7 @@ logging.basicConfig(
 )
 log = logging.getLogger("router")

+
 @asynccontextmanager
 async def _lifespan(app: FastAPI):
     global client
@@ -423,7 +424,8 @@ def _filter_response_headers(resp: httpx.Response) -> dict[str, str]:


 async def _stream_proxy(method: str, path: str, body: bytes, headers: dict[str, str]) -> StreamingResponse:
-
+    if client is None:
+        raise RuntimeError("HTTP client not initialised — lifespan not started")
     upstream_req = client.build_request(method, path, content=body, headers=headers)
     upstream = await client.send(upstream_req, stream=True)

@@ -489,7 +491,8 @@ async def serve_models_ini() -> Response:

 @app.get("/v1/models")
 async def list_models() -> JSONResponse:
-
+    if client is None:
+        raise RuntimeError("HTTP client not initialised — lifespan not started")
     try:
         r = await client.get("/v1/models")
         data = r.json()
@@ -614,9 +617,14 @@ async def _handle_completion(req: Request, path: str) -> Response:
         return await _stream_proxy(req.method, path, raw, headers)

     mutated = False
+    est_tokens: int | None = None
     requested = body.get("model")
     if requested in AUTO_ALIASES or requested == "auto":
         chosen, reason = classify(body)
+        est_tokens = _estimate_tokens(
+            body.get("messages") if isinstance(body.get("messages"), list) else None,
+            body.get("prompt") if isinstance(body.get("prompt"), str) else None,
+        )
         body["model"] = chosen
         log.info("auto -> %s (%s) [path=%s]", chosen, reason, path)
         mutated = True
@@ -636,9 +644,13 @@ async def _handle_completion(req: Request, path: str) -> Response:

     if tier is not None and tier.is_bedrock:
         from llmstack.backends import bedrock as bedrock_backend
-
+        resp = await bedrock_backend.dispatch(req, tier, body)
+    else:
+        resp = await _stream_proxy(req.method, path, raw, headers)

-
+    if est_tokens is not None:
+        resp.headers["X-LLMStack-Tokens"] = str(est_tokens)
+    return resp


 @app.post("/v1/chat/completions")
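Editor's note (not part of the diff): the hunks above attach the new `X-LLMStack-Tokens` header to the response in both the Bedrock and the proxy branch, so a client can read it off any routed completion. A minimal sketch of checking it is shown below, assuming the documented default router address `127.0.0.1:10101`; `httpx` is already a declared dependency of the package, but this snippet itself is illustrative, not shipped code.

```python
# Editor's sketch: inspect the X-LLMStack-Tokens header the router now sets.
# Assumes the stack is running locally on the default port from the README.
import httpx

resp = httpx.post(
    "http://127.0.0.1:10101/v1/chat/completions",
    json={"model": "auto", "messages": [{"role": "user", "content": "hello"}]},
    timeout=120,
)
# Estimated input tokens the router used for its routing decision:
print(resp.headers.get("X-LLMStack-Tokens"))
print(resp.json()["choices"][0]["message"]["content"])
```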
@@ -446,7 +446,8 @@ def _build_converse_kwargs(tier: Tier, body: dict[str, Any], cfg: BedrockConfig)
     next), passed in so the caller controls the channel and we don't
     re-read the env mid-call.
     """
-
+    if tier.bedrock is None:
+        raise TypeError(f"_build_converse_kwargs called on non-bedrock tier {tier.name!r}")
     messages = body.get("messages")
     if not isinstance(messages, list):
         # /v1/completions style: synthesise a single user message
@@ -763,7 +764,8 @@ async def _stream_response(client: Any, tier: Tier, converse_kwargs: dict[str, A

 def model_descriptor(tier: Tier) -> dict[str, Any]:
     """Return an OpenAI-style ``/v1/models`` entry for a bedrock tier."""
-
+    if tier.bedrock is None:
+        raise TypeError(f"model_descriptor called on non-bedrock tier {tier.name!r}")
     use_next = _use_next()
     active = tier.bedrock.resolved(use_next=use_next)
     channel = "next" if (use_next and tier.bedrock.has_next) else "current"
@@ -54,7 +54,6 @@ from llmstack.commands._helpers import (
 from llmstack.generators import render_to
 from llmstack.generators.llama_swap import render as render_yaml
 from llmstack.generators.llama_swap import validate as validate_yaml
-from llmstack.tiers import load_tiers
 from llmstack.paths import (
     DEFAULT_REMOTE_URL,
     ROUTER_PORT,
@@ -1,8 +1,8 @@
 ; ----------------------------------------------------------------------------
 ; models.ini - inventory of models served by llmstack/.
 ;
-; Runtime config: llmstack/llama-swap.yaml.
-; opencode bindings:
+; Runtime config: .llmstack/llama-swap.yaml (written by `llmstack start`).
+; opencode bindings: .llmstack/opencode.json (written by `llmstack install`).
 ;
 ; SUPPORTED TIERS -- canonical names recognised by the router and the
 ; opencode / llama-swap generators. ONLY the names listed below should
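`models.ini` is the single source of truth the router and generators read, and as of this release it also ships inside the wheel as package data (see the layout and `pyproject.toml` hunks further down). A minimal sketch of reading the bundled template, assuming only that it is standard INI with `;` comments:

```python
from configparser import ConfigParser
from importlib import resources

# Load the template shipped as package data (llmstack/models.ini).
text = resources.files("llmstack").joinpath("models.ini").read_text(encoding="utf-8")

cfg = ConfigParser()
cfg.read_string(text)
print(cfg.sections())  # the tier names defined in the template
```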
@@ -1,9 +1,30 @@
 Metadata-Version: 2.4
 Name: opencode-llmstack
-Version: 0.9.1
+Version: 0.9.2
 Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
 Author: llmstack
-License: MIT
+License: MIT License
+
+Copyright (c) 2024 llmstack contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
 Project-URL: Homepage, https://github.com/rohitgarg19/llmstack
 Project-URL: Issues, https://github.com/rohitgarg19/llmstack/issues
 Keywords: llm,llama-cpp,llama-swap,opencode,router,local-ai
@@ -17,9 +38,11 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Software Development
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
+License-File: LICENSE
 Requires-Dist: fastapi<1.0,>=0.110
 Requires-Dist: httpx<1.0,>=0.27
 Requires-Dist: uvicorn[standard]<1.0,>=0.30
@@ -32,6 +55,7 @@ Requires-Dist: pytest>=7; extra == "dev"
 Provides-Extra: bedrock
 Requires-Dist: boto3>=1.35; extra == "bedrock"
 Requires-Dist: botocore>=1.35; extra == "bedrock"
+Dynamic: license-file
 
 # llmstack — multi-tier local LLM stack for Mac M4 Max / 64 GB
 
@@ -118,9 +142,9 @@ First match wins:
 | 1 | last user msg contains `[nofilter]`, `[uncensored]`, `[heretic]`, or starts with `uncensored:` / `nofilter:` | `plan-uncensored` | explicit opt-in |
 | 2 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
 | 3 | plan verbs (*design, architect, approach, trade-off, should we, explain why, …*) AND no code blocks / agent verbs / tools | `plan` | pure design discussion (orthogonal track) |
-| 4 | estimated input ≤
+| 4 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier — context still being built, latency/$ are best here |
 | 5 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
-| 6 | otherwise (long context) AND
+| 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
 | 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
 
 Token estimates are `chars / 4` over all message text + `prompt`. The
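The table is easier to follow as code. Below is a condensed sketch of the first-match-wins ladder, with trigger detection collapsed into boolean flags; the flags and the function name are illustrative, and the real `classify` inspects the request body directly rather than taking pre-computed arguments.

```python
def classify_sketch(
    est_tokens: int,
    user_turns: int,
    *,
    nofilter: bool,
    ultra_trigger: bool,
    ultra_configured: bool,
    plan_only: bool,
) -> str:
    """First-match-wins routing ladder from the table above (illustrative)."""
    if nofilter:
        return "plan-uncensored"                                   # rule 1: explicit opt-in
    if ultra_trigger and ultra_configured:
        return "code-ultra"                                        # rule 2: explicit top-tier opt-in
    if plan_only:
        return "plan"                                              # rule 3: pure design discussion
    if est_tokens <= 12_000:
        return "code-ultra" if ultra_configured else "code-smart"  # rule 4: small context, top tier
    if est_tokens <= 32_000:
        return "code-smart"                                        # rule 5: mid-context sweet spot
    if user_turns >= 10:
        return "code-smart"                                        # rule 6: deep agentic loop floor
    return "code-fast"                                             # rule 7: long-context fallback
```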
@@ -185,12 +209,12 @@ from any directory you previously ran `install` in.
 ## Layout
 
 ```
-
+llmstack/ # repo root
 ├── pyproject.toml # package metadata + `llmstack` console script
 ├── README.md # this file
 ├── UPGRADING.md # how to swap any tier for a newer/better model
 │ # + how to upgrade the Python toolchain itself
-├──
+├── LICENSE # MIT
 └── llmstack/ # the python package (importable, installable)
 ├── __init__.py
 ├── __main__.py # `python -m llmstack`
@@ -199,6 +223,7 @@ opencode/ # repo root
 ├── shell_env.py # spawn the env-prepared subshell + activate hooks
 ├── app.py # FastAPI auto-router (~280 lines)
 ├── tiers.py # parse models.ini -> Tier dataclasses
+├── models.ini # SINGLE SOURCE OF TRUTH for tiers + sampler (bundled template)
 ├── check_models.py # snapshot tool (HF metadata + drift check)
 ├── AGENTS.md # opencode agent template (shipped as package data)
 ├── generators/
@@ -213,9 +238,9 @@ opencode/ # repo root
 ├── install_llama_swap.py
 ├── download.py
 ├── start.py
-├── shell.py
 ├── stop.py
 ├── restart.py
+├── reload.py
 ├── status.py
 ├── check.py
 └── activate.py
@@ -544,7 +569,7 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
 | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
 | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
-| `ROUTER_MULTI_TURN` | `
+| `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
 | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
 | `LOG_LEVEL` | `info` | router log level |
 
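These knobs are plain environment variables, so they can be read with simple `os.environ` lookups. A sketch using the documented defaults; the exact parsing inside `app.py` may differ.

```python
import os

# Defaults mirror the table above; the parsing here is illustrative only.
HIGH_FIDELITY_CEILING = int(os.environ.get("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
MID_FIDELITY_CEILING = int(os.environ.get("ROUTER_MID_FIDELITY_CEILING", "32000"))
MULTI_TURN_FLOOR = int(os.environ.get("ROUTER_MULTI_TURN", "10"))
UNCENSORED_MODEL = os.environ.get("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
ROUTER_HOST = os.environ.get("ROUTER_HOST", "127.0.0.1")
ROUTER_PORT = int(os.environ.get("ROUTER_PORT", "10101"))
LOG_LEVEL = os.environ.get("LOG_LEVEL", "info")
```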
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "opencode-llmstack"
-version = "0.9.1"
+version = "0.9.2"
 description = "Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring."
 readme = "README.md"
 requires-python = ">=3.11"
-license = {
+license = { file = "LICENSE" }
 authors = [{ name = "llmstack" }]
 keywords = ["llm", "llama-cpp", "llama-swap", "opencode", "router", "local-ai"]
 classifiers = [
@@ -22,6 +22,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
     "Topic :: Software Development",
 ]
 
@@ -58,6 +59,13 @@ exclude = ["llmstack.bin*", "llmstack.tests*"]
 [tool.setuptools.package-data]
 llmstack = ["AGENTS.md", "models.ini"]
 
+[tool.setuptools.data-files]
+"" = ["LICENSE", "CHANGELOG.md", "UPGRADING.md"]
+
+[tool.pytest.ini_options]
+testpaths = ["llmstack/tests"]
+addopts = "-v"
+
 [tool.ruff]
 line-length = 100
 target-version = "py311"
{opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/opencode_llmstack.egg-info/dependency_links.txt RENAMED (file without changes)
{opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/opencode_llmstack.egg-info/entry_points.txt RENAMED (file without changes)
{opencode_llmstack-0.9.1 → opencode_llmstack-0.9.2}/opencode_llmstack.egg-info/top_level.txt RENAMED (file without changes)
All remaining files are renamed from the 0.9.1 tree to the 0.9.2 tree without content changes.