opencode-llmstack 0.9.1__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmstack/__init__.py CHANGED
@@ -16,5 +16,5 @@ organised by concern:
16
16
 
17
17
  from __future__ import annotations
18
18
 
19
- __version__ = "0.1.0"
19
+ __version__ = "0.9.2"
20
20
  __all__ = ["__version__"]
llmstack/app.py CHANGED
@@ -212,6 +212,7 @@ logging.basicConfig(
212
212
  )
213
213
  log = logging.getLogger("router")
214
214
 
215
+
215
216
  @asynccontextmanager
216
217
  async def _lifespan(app: FastAPI):
217
218
  global client
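The module-level `client` referenced here is the `httpx.AsyncClient` that `_lifespan` creates on startup and closes on shutdown, so the `client is None` guards introduced below only trip if a request arrives before the lifespan has run. A rough sketch of that lifecycle, with a placeholder upstream address (the real `_lifespan` takes its target and timeouts from the router's configuration):

```python
from contextlib import asynccontextmanager

import httpx
from fastapi import FastAPI

client: httpx.AsyncClient | None = None

@asynccontextmanager
async def _lifespan(app: FastAPI):
    global client
    # Placeholder upstream; the real router points this at llama-swap.
    client = httpx.AsyncClient(base_url="http://127.0.0.1:8080")
    try:
        yield
    finally:
        await client.aclose()
        client = None
```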
@@ -423,7 +424,8 @@ def _filter_response_headers(resp: httpx.Response) -> dict[str, str]:
423
424
 
424
425
 
425
426
  async def _stream_proxy(method: str, path: str, body: bytes, headers: dict[str, str]) -> StreamingResponse:
426
- assert client is not None
427
+ if client is None:
428
+ raise RuntimeError("HTTP client not initialised — lifespan not started")
427
429
  upstream_req = client.build_request(method, path, content=body, headers=headers)
428
430
  upstream = await client.send(upstream_req, stream=True)
429
431
 
@@ -489,7 +491,8 @@ async def serve_models_ini() -> Response:
489
491
 
490
492
  @app.get("/v1/models")
491
493
  async def list_models() -> JSONResponse:
492
- assert client is not None
494
+ if client is None:
495
+ raise RuntimeError("HTTP client not initialised — lifespan not started")
493
496
  try:
494
497
  r = await client.get("/v1/models")
495
498
  data = r.json()
@@ -614,9 +617,14 @@ async def _handle_completion(req: Request, path: str) -> Response:
614
617
  return await _stream_proxy(req.method, path, raw, headers)
615
618
 
616
619
  mutated = False
620
+ est_tokens: int | None = None
617
621
  requested = body.get("model")
618
622
  if requested in AUTO_ALIASES or requested == "auto":
619
623
  chosen, reason = classify(body)
624
+ est_tokens = _estimate_tokens(
625
+ body.get("messages") if isinstance(body.get("messages"), list) else None,
626
+ body.get("prompt") if isinstance(body.get("prompt"), str) else None,
627
+ )
620
628
  body["model"] = chosen
621
629
  log.info("auto -> %s (%s) [path=%s]", chosen, reason, path)
622
630
  mutated = True
@@ -636,9 +644,13 @@ async def _handle_completion(req: Request, path: str) -> Response:
636
644
 
637
645
  if tier is not None and tier.is_bedrock:
638
646
  from llmstack.backends import bedrock as bedrock_backend
639
- return await bedrock_backend.dispatch(req, tier, body)
647
+ resp = await bedrock_backend.dispatch(req, tier, body)
648
+ else:
649
+ resp = await _stream_proxy(req.method, path, raw, headers)
640
650
 
641
- return await _stream_proxy(req.method, path, raw, headers)
651
+ if est_tokens is not None:
652
+ resp.headers["X-LLMStack-Tokens"] = str(est_tokens)
653
+ return resp
642
654
 
643
655
 
644
656
  @app.post("/v1/chat/completions")
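The `est_tokens` recorded above, and attached later as the `X-LLMStack-Tokens` header, comes from the router's `chars / 4` heuristic over all message text plus `prompt`. A minimal sketch of that estimator, assuming plain-string message content (structured content parts are ignored here):

```python
def _estimate_tokens(messages: list[dict] | None, prompt: str | None) -> int:
    # chars / 4: crude but cheap, and only used to pick a routing rung.
    chars = 0
    for m in messages or []:
        content = m.get("content") if isinstance(m, dict) else None
        if isinstance(content, str):
            chars += len(content)
    if isinstance(prompt, str):
        chars += len(prompt)
    return chars // 4
```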
@@ -400,7 +400,7 @@ def _tool_config(tools: list[dict[str, Any]] | None) -> dict[str, Any] | None:
400
400
  return {"tools": specs}
401
401
 
402
402
 
403
- def _inference_config(body: dict[str, Any]) -> dict[str, Any]:
403
+ def _inference_config(body: dict[str, Any], max_output_tokens: int | None = None) -> dict[str, Any]:
404
404
  # We forward only what the Converse `inferenceConfig` schema accepts:
405
405
  # `temperature`, `topP`, `maxTokens`, `stopSequences`. Other sampler
406
406
  # knobs (`top_k`, `min_p`, `repetition_penalty`) have no Converse-
@@ -415,28 +415,34 @@ def _inference_config(body: dict[str, Any]) -> dict[str, Any]:
415
415
  # forward. Configure Bedrock tiers in models.ini accordingly: omit
416
416
  # the `sampler =` line for Opus 4.7+, and pick the one allowed knob
417
417
  # for Sonnet 4.5 / Haiku 4.5.
418
- cfg: dict[str, Any] = {}
418
+ #
419
+ # `max_output_tokens` is the per-tier cap from models.ini
420
+ # (`aws_max_output_tokens`). When set, the client-requested value is
421
+ # silently clamped to this ceiling -- useful for models like
422
+ # Llama 3.1 405B whose Bedrock deployment rejects values above 4096.
423
+ icfg: dict[str, Any] = {}
419
424
  if "temperature" in body:
420
425
  try:
421
- cfg["temperature"] = float(body["temperature"])
426
+ icfg["temperature"] = float(body["temperature"])
422
427
  except (TypeError, ValueError):
423
428
  pass
424
429
  if "top_p" in body:
425
430
  try:
426
- cfg["topP"] = float(body["top_p"])
431
+ icfg["topP"] = float(body["top_p"])
427
432
  except (TypeError, ValueError):
428
433
  pass
429
434
  if "max_tokens" in body or "max_completion_tokens" in body:
430
435
  try:
431
- cfg["maxTokens"] = int(body.get("max_tokens") or body.get("max_completion_tokens"))
436
+ requested = int(body.get("max_tokens") or body.get("max_completion_tokens"))
437
+ icfg["maxTokens"] = min(requested, max_output_tokens) if max_output_tokens else requested
432
438
  except (TypeError, ValueError):
433
439
  pass
434
440
  stop = body.get("stop")
435
441
  if isinstance(stop, str):
436
- cfg["stopSequences"] = [stop]
442
+ icfg["stopSequences"] = [stop]
437
443
  elif isinstance(stop, list):
438
- cfg["stopSequences"] = [s for s in stop if isinstance(s, str)]
439
- return cfg
444
+ icfg["stopSequences"] = [s for s in stop if isinstance(s, str)]
445
+ return icfg
440
446
 
441
447
 
442
448
  def _build_converse_kwargs(tier: Tier, body: dict[str, Any], cfg: BedrockConfig) -> dict[str, Any]:
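A quick usage sketch of the clamping behaviour, assuming a tier whose `models.ini` sets `aws_max_output_tokens = 4096`:

```python
body = {"temperature": 0.2, "max_tokens": 8192, "stop": ["</answer>"]}
icfg = _inference_config(body, max_output_tokens=4096)
# icfg == {"temperature": 0.2, "maxTokens": 4096, "stopSequences": ["</answer>"]}
# With max_output_tokens=None (no per-tier cap) maxTokens stays at the requested 8192.
```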
@@ -446,7 +452,8 @@ def _build_converse_kwargs(tier: Tier, body: dict[str, Any], cfg: BedrockConfig)
446
452
  next), passed in so the caller controls the channel and we don't
447
453
  re-read the env mid-call.
448
454
  """
449
- assert tier.bedrock is not None
455
+ if tier.bedrock is None:
456
+ raise TypeError(f"_build_converse_kwargs called on non-bedrock tier {tier.name!r}")
450
457
  messages = body.get("messages")
451
458
  if not isinstance(messages, list):
452
459
  # /v1/completions style: synthesise a single user message
@@ -462,7 +469,7 @@ def _build_converse_kwargs(tier: Tier, body: dict[str, Any], cfg: BedrockConfig)
462
469
  if sys_blocks:
463
470
  converse_kwargs["system"] = sys_blocks
464
471
 
465
- inference = _inference_config(body)
472
+ inference = _inference_config(body, max_output_tokens=cfg.max_output_tokens)
466
473
  if inference:
467
474
  converse_kwargs["inferenceConfig"] = inference
468
475
 
@@ -763,7 +770,8 @@ async def _stream_response(client: Any, tier: Tier, converse_kwargs: dict[str, A
763
770
 
764
771
  def model_descriptor(tier: Tier) -> dict[str, Any]:
765
772
  """Return an OpenAI-style ``/v1/models`` entry for a bedrock tier."""
766
- assert tier.bedrock is not None
773
+ if tier.bedrock is None:
774
+ raise TypeError(f"model_descriptor called on non-bedrock tier {tier.name!r}")
767
775
  use_next = _use_next()
768
776
  active = tier.bedrock.resolved(use_next=use_next)
769
777
  channel = "next" if (use_next and tier.bedrock.has_next) else "current"
@@ -54,7 +54,6 @@ from llmstack.commands._helpers import (
54
54
  from llmstack.generators import render_to
55
55
  from llmstack.generators.llama_swap import render as render_yaml
56
56
  from llmstack.generators.llama_swap import validate as validate_yaml
57
- from llmstack.tiers import load_tiers
58
57
  from llmstack.paths import (
59
58
  DEFAULT_REMOTE_URL,
60
59
  ROUTER_PORT,
llmstack/models.ini CHANGED
@@ -1,8 +1,8 @@
1
1
  ; ----------------------------------------------------------------------------
2
2
  ; models.ini - inventory of models served by llmstack/.
3
3
  ;
4
- ; Runtime config: llmstack/llama-swap.yaml.
5
- ; opencode bindings: ../opencode.json (model + agent.build/plan/plan-nofilter).
4
+ ; Runtime config: .llmstack/llama-swap.yaml (written by `llmstack start`).
5
+ ; opencode bindings: .llmstack/opencode.json (written by `llmstack install`).
6
6
  ;
7
7
  ; SUPPORTED TIERS -- canonical names recognised by the router and the
8
8
  ; opencode / llama-swap generators. ONLY the names listed below should
@@ -215,9 +215,10 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
215
215
  ; tier = chat
216
216
  ; role = plan-uncensored
217
217
  ; backend = bedrock
218
- ; aws_model_id = meta.llama3-1-405b-instruct-v1:0
219
- ; aws_region = us-west-2 ; Llama 405B has no EU deployment; keep on US
220
- ; aws_profile = bedrock-prod
218
+ ; aws_model_id = meta.llama3-1-405b-instruct-v1:0
219
+ ; aws_region = us-west-2 ; Llama 405B has no EU deployment; keep on US
220
+ ; aws_profile = bedrock-prod
221
+ ; aws_max_output_tokens = 4096 ; Llama 3.1 405B Bedrock hard cap
221
222
  ; ctx_size = 128000
222
223
  ; sampler = temp=0.85, top_p=0.95 ; max exploration
223
224
  ; description = Llama 3.1 405B on Bedrock - no-filter planning
@@ -229,10 +230,11 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
229
230
  ; tier = chat
230
231
  ; role = plan-uncensored
231
232
  ; backend = bedrock
232
- ; aws_model_id = meta.llama3-1-405b-instruct-v1:0
233
- ; aws_region = us-west-2 ; Llama 405B has no EU deployment
234
- ; aws_profile = bedrock-prod
235
- ; aws_endpoint_url = https://bedrock-runtime.us-west-2.vpce.amazonaws.com
233
+ ; aws_model_id = meta.llama3-1-405b-instruct-v1:0
234
+ ; aws_region = us-west-2 ; Llama 405B has no EU deployment
235
+ ; aws_profile = bedrock-prod
236
+ ; aws_endpoint_url = https://bedrock-runtime.us-west-2.vpce.amazonaws.com
237
+ ; aws_max_output_tokens = 4096 ; Llama 3.1 405B Bedrock hard cap
236
238
  ; ctx_size = 128000
237
239
  ; sampler = temp=0.85, top_p=0.95
238
240
  ; description = Llama 3.1 405B on Bedrock (VPC) - no-filter planning
llmstack/tiers.py CHANGED
@@ -139,6 +139,7 @@ class BedrockConfig:
139
139
  endpoint_url: str | None = None
140
140
  model_id_next: str | None = None
141
141
  region_next: str | None = None
142
+ max_output_tokens: int | None = None
142
143
 
143
144
  @property
144
145
  def has_next(self) -> bool:
@@ -298,6 +299,7 @@ def _build_bedrock(section) -> BedrockConfig:
298
299
  endpoint_url=_opt(section.get("aws_endpoint_url")),
299
300
  model_id_next=_opt(section.get("aws_model_id_next")),
300
301
  region_next=_opt(section.get("aws_region_next")),
302
+ max_output_tokens=_int(section.get("aws_max_output_tokens", "")) or None,
301
303
  )
302
304
 
303
305
 
@@ -0,0 +1,117 @@
1
+ # Changelog
2
+
3
+ All notable changes to `opencode-llmstack` are documented here.
4
+
5
+ ---
6
+
7
+ ## [0.9.2] — 2026-05-11
8
+
9
+ ### Fixed
10
+ - `classify()` now counts only **user-role messages** when evaluating the
11
+ multi-turn floor (`n_turns`). Previously `len(messages)` counted system
12
+ prompts, assistant turns, and tool-result messages, causing the floor to
13
+ fire after just a few real exchanges and permanently blocking `code-fast`
14
+ routing for the rest of any session.
15
+ - Multi-turn floor threshold raised from **6 → 10** user turns. `code-fast`
16
+ is now a hosted Bedrock model (Haiku 4.5) that tool-calls reliably, so
17
+ the old 3B-model rationale no longer applies. Sessions with fewer than 10
18
+ user turns will now correctly step down to `code-fast` past 32k tokens.
19
+ - Log label corrected: `(tools floor)` → `(user-turns=N>=10 floor)`.
20
+ - `__version__` corrected from `"0.1.0"` to `"0.9.2"`.
21
+ - CI release pipeline now runs lint (`ruff`) + `pytest` across Python
22
+ 3.11/3.12/3.13 before building the wheel. Previously the `test` job was
23
+ documented in comments but never implemented.
24
+ - Added `LICENSE` (MIT) file to the repository root.
25
+ - README routing table updated: high-fidelity ceiling corrected (8k → 12k),
26
+ tools-floor condition updated to reflect user-turn counting, `ROUTER_MULTI_TURN`
27
+ default corrected (6 → 10).
28
+ - UPGRADING.md corrected: `llmstack install` does **not** regenerate
29
+ `llama-swap.yaml` — that is `llmstack restart`'s job. Three places in the
30
+ doc had this wrong.
31
+ - README layout tree: repo root label corrected (`opencode/` → `llmstack/`),
32
+ `models.ini` moved to its correct location inside the package, `shell.py`
33
+ (deleted) removed, `reload.py` and `LICENSE` added.
34
+ - `iter_downloads` reference in UPGRADING.md corrected to `iter_download_targets`.
35
+ - Bundled `llmstack/models.ini` header comment paths updated from legacy
36
+ locations to current `.llmstack/` state-dir layout.
37
+ - `assert` statements in production code (`app.py`, `bedrock.py`) replaced
38
+ with explicit `RuntimeError` / `TypeError` raises so `-O` optimisation
39
+ does not silently swallow them.
40
+ - `UPGRADING.md` and `LICENSE` added to the sdist via `pyproject.toml`
41
+ `package-data` / `tool.setuptools` config.
42
+ - `[tool.pytest.ini_options]` added to `pyproject.toml` with `testpaths`
43
+ and `addopts`.
44
+ - Python 3.14 classifier added to `pyproject.toml`.
45
+
46
+ ### Added
47
+ - `classify()` end-to-end test coverage: step-down ladder (short/mid/long
48
+ context), multi-turn floor, plan-signal routing, ultra-trigger routing,
49
+ uncensored-trigger routing, plan ctx-size overflow fall-through.
50
+ - Generator tests: `build_config()` coverage for gguf tiers, bedrock tiers,
51
+ `use_next`, `small_model` wiring, agent wiring, `auto` ctx derivation.
52
+ - `X-LLMStack-Tokens` response header on every `/v1/chat/completions` and
53
+ `/v1/completions` response so opencode (and curl) can see the estimated
54
+ token count the router used to make its routing decision.
55
+
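For example, against the router's default `127.0.0.1:10101` with an `auto`-routed request:

```bash
curl -si http://127.0.0.1:10101/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model":"auto","messages":[{"role":"user","content":"hi"}]}' \
  | grep -i '^x-llmstack-tokens'
```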
56
+ ---
57
+
58
+ ## [0.9.1] — 2026-05-11
59
+
60
+ ### Fixed
61
+ - `classify()` multi-turn floor: count only `role == "user"` messages
62
+ (not all messages). This was the primary fix preventing `code-fast`
63
+ from ever being reached in long sessions.
64
+ - Multi-turn threshold raised 6 → 10 (see 0.9.2 for full rationale).
65
+ - Log label `(tools floor)` corrected to `(user-turns=N>=10 floor)`.
66
+
67
+ ---
68
+
69
+ ## [0.9.0] — 2026-05-08
70
+
71
+ ### Changed
72
+ - Plan tiers now strip `tools` from the request body before dispatch.
73
+ Previously a plan-routed request carrying a `tools` array would fail
74
+ on Bedrock (Converse rejects tool configs on non-agent models).
75
+ - Long-context fall-through to `code-fast` is now allowed even when
76
+ `tools[]` is present in the request body. The tools-presence check
77
+ was removed from the floor condition; only turn count matters now.
78
+ - `plan` tier ctx-size overflow: when estimated tokens exceed the
79
+ planner's `ctx_size`, the request falls through to the coding ladder
80
+ instead of being sent to a planner whose window can't hold it.
81
+ - `HIGH_FIDELITY_CEILING` raised to 12 000 (was 8 000).
82
+
83
+ ---
84
+
85
+ ## [0.8.0] — 2026-05-07
86
+
87
+ ### Changed
88
+ - Fidelity-ceiling overhaul: each ceiling is now exactly half of the
89
+ corresponding tier's `ctx_size` (the "comfortable headroom" invariant).
90
+ - `code-ultra.ctx_size` set to 24 000 (2× high ceiling of 12 000).
91
+ - `code-smart.ctx_size` set to 64 000 (2× mid ceiling of 32 000).
92
+ - `code-fast.ctx_size` set to 128 000 (YaRN ×4 from native 32k).
93
+ - `HIGH_FIDELITY_CEILING` env var added; overrides the 12 000 default.
94
+ - `MID_FIDELITY_CEILING` env var added; overrides the 32 000 default.
95
+
96
+ ---
97
+
98
+ ## [0.7.3] — 2026-05-06
99
+
100
+ ### Added
101
+ - Per-tier Bedrock alternatives in `models.ini`: every tier now ships a
102
+ commented-out Bedrock block directly beneath its GGUF block.
103
+ - All Bedrock tiers anchored to `eu-west-3`; `plan-uncensored` pinned to
104
+ `us-west-2` (Llama 405B has no EU deployment).
105
+ - `aws_model_id_next` / `aws_region_next` support for Bedrock upgrade
106
+ pre-staging (mirrors gguf `hf_file_next`).
107
+
108
+ ### Fixed
109
+ - `models.ini` comment cleanup: removed stale references to old model names.
110
+
111
+ ---
112
+
113
+ ## [0.7.2] — earlier
114
+
115
+ ### Fixed
116
+ - Soft-fail when `llama-server` binary is missing at startup.
117
+ - PowerShell activation hook: fixed `Invoke-Expression` quoting.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 llmstack contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,602 @@
1
+ # UPGRADING — replacing models with newer / better ones
2
+
3
+ This stack is built around **swapping individual model files in/out**. There is no
4
+ lock-in to provider, model family, or name — you can replace any tier with any
5
+ other GGUF that fits the role and your hardware budget.
6
+
7
+ This doc explains:
8
+
9
+ 1. [Why GGUF specifically](#why-gguf)
10
+ 2. [Where each model is referenced (single source of truth + IDE wiring)](#where-the-models-live)
11
+ 3. [The upgrade workflow](#upgrade-workflow)
12
+ 4. [How to judge "holistically better" per tier](#judging-better-per-tier)
13
+ 5. [Where to look for candidates](#where-to-find-candidates)
14
+ 6. [Worked example](#worked-example-replacing-the-plan-tier)
15
+ 7. [After-upgrade housekeeping](#after-upgrade-housekeeping)
16
+ 8. [Upgrading the Python toolchain](#upgrading-the-python-toolchain)
17
+
18
+ ---
19
+
20
+ ## Why GGUF
21
+
22
+ Every model the stack runs **must** be a GGUF (`.gguf`) file because:
23
+
24
+ - **`llama.cpp` only loads GGUF natively.** The whole stack — `llama-server`,
25
+ `llama-swap`, the Metal backend on Apple Silicon — is built on llama.cpp.
26
+ - **Self-contained format.** A single `.gguf` includes weights, tokenizer,
27
+ chat template (Jinja), and metadata. No `transformers`, no separate
28
+ tokenizer files, no Python deps at runtime.
29
+ - **First-class quantisation.** GGUF is the format quantised models ship in
30
+ (Q4_K_M, Q5_K_M, Q8_0, i1, UD, …). Quants are how a 24 B model fits in
31
+ 13 GB of RAM with little quality loss.
32
+ - **Instant `-hf` resolution.** `llama-server -hf <repo> -hff <file>.gguf`
33
+ downloads on demand into the standard HF cache and starts serving — no
34
+ conversion step, no extra tooling.
35
+
36
+ If a model you want is **not** published as a GGUF, you have two options:
37
+
38
+ 1. Wait for one of the trusted converters (`bartowski`, `unsloth`,
39
+ `mradermacher`, `lmstudio-community`, `MaziyarPanahi`, …) to publish one —
40
+ usually within hours of the original release.
41
+ 2. Convert it yourself with `llama.cpp/convert_hf_to_gguf.py` then quantise
42
+ with `llama-quantize`. Doable but slow and adds maintenance burden — only
43
+ worth it for niche models.
44
+
45
+ **Rule of thumb:** if you can't find `<model-name>-GGUF` on HF or via the
46
+ maintainers above, the model is not ready for this stack. Pick another.
47
+
48
+ ## Cache management
49
+
50
+ There is **one** writer to the model cache: `llama-cli`. Everything reads from
51
+ the same place: `~/.cache/huggingface/hub/`.
52
+
53
+ ```
54
+ ~/.cache/huggingface/hub/
55
+ └── models--<owner>--<repo>/
56
+ ├── blobs/
57
+ │ ├── <sha> ← completed file
58
+ │ └── <sha>.downloadInProgress ← llama.cpp's resumable partial
59
+ ├── refs/main ← commit hash
60
+ └── snapshots/<commit>/
61
+ └── <filename>.gguf ← symlink to the blob
62
+ ```
63
+
64
+ Key points:
65
+
66
+ - **`llama-cli -cl`** lists everything currently complete in the cache.
67
+ - **`llmstack download` uses `llama-cli`** so the cache stays coherent. It
68
+ accepts `HF_TOKEN` from the environment if you have one.
69
+ - **Do NOT use `huggingface-cli download` or `pip install huggingface_hub`'s
70
+ `hf_hub_download()`** to write to the same cache — they use a different
71
+ partial-file convention (`.incomplete` vs `.downloadInProgress`) and
72
+ cannot resume each other's partials. Completed files are interchangeable;
73
+ in-flight downloads are not.
74
+ - **Cache-dir override:** all llama.cpp tools accept `-cd <dir>` to point at
75
+ a different cache. We don't use it; the default is correct.
76
+ - **Cache cleanup:** `llama-cli -cr <repo>:<quant>` removes a single entry,
77
+ or `rm -rf ~/.cache/huggingface/hub/models--<owner>--<repo>/` removes a
78
+ whole repo's blobs.
79
+
80
+ ## `-hf` / `-hff` syntax (gotcha)
81
+
82
+ ```
83
+ -hf <user>/<repo> repo on HuggingFace
84
+ -hf <user>/<repo>:<quant> repo + quant TAG (e.g. :Q4_K_M, :Q6_K) — auto-resolves a file
85
+ -hff <filename>.gguf explicit file inside the repo
86
+ ```
87
+
88
+ `-hf <repo>:<full-filename>.gguf` **does not work** — the `:suffix` part is
89
+ parsed as a quant tag, so a full filename ending in `.gguf` is rejected with
90
+ `get_hf_plan: no GGUF files found in repository`. Always pair `-hf <repo>` with
91
+ `-hff <filename>.gguf` for explicit selection. This is what the generated
92
+ `llama-swap.yaml` uses everywhere.
93
+
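For instance, using the `code-fast` repo and file that appear in the layout example further down:

```bash
# OK: repo plus explicit file
llama-server -hf bartowski/Qwen2.5-Coder-3B-Instruct-GGUF \
             -hff Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf --port 9999

# OK: repo plus quant tag (auto-resolves a matching file)
llama-server -hf bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q5_K_M --port 9999

# Rejected: everything after ':' is parsed as a quant tag, not a filename
llama-server -hf bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf
```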
94
+ ---
95
+
96
+ ## Where the models live
97
+
98
+ There is exactly **one source of truth** that determines what runs in a
99
+ given project: `.llmstack/models.ini` in the work-dir. Everything else is
100
+ generated from it. The first time you run `llmstack install` in a fresh
101
+ project, this file is auto-seeded from the bundled template at
102
+ `llmstack/models.ini` inside the package; from then on it's per-project.
103
+
104
+ | File | What it does | Required when changing a model? |
105
+ |---|---|---|
106
+ | **`<work-dir>/.llmstack/models.ini`** | The single source of truth: tier definitions (repo, file, ctx, sampler, role, …). All other artefacts derive from it. Auto-seeded from the bundled template on first `install`; gitignored after that — yours to edit per project. | **Yes — primary edit.** |
107
+ | `llmstack/models.ini` (in the package) | Bundled template, version-controlled, ships with `pip install -e .`. Only consulted as the seed source when a project has no `.llmstack/models.ini` yet. Edit if you want to change the *factory defaults* every new project starts from. | Only when changing factory defaults. |
108
+ | `<work-dir>/.llmstack/llama-swap.yaml` | AUTO-GENERATED by `llmstack.generators.llama_swap`. Determines what `llama-server` actually loads. Regenerated fresh by `llmstack start` (or `llmstack restart`) for the channel you're booting into. Do not regenerate with `llmstack install` — `install` only writes `opencode.json` + `AGENTS.md`. | Auto — don't hand-edit. |
109
+ | `<work-dir>/.llmstack/opencode.json` | AUTO-GENERATED by `llmstack.generators.opencode`. `<work-dir>` is whatever directory the CLI was invoked from — `cd` into any project to get a project-local config. Wires opencode's `model`, `small_model`, and `agent.*` to the right tiers. Loaded by opencode via the `OPENCODE_CONFIG` env var that the activate hook (or `llmstack start`'s fallback subshell) exports — your global `~/.config/opencode/opencode.json` is intentionally left alone. Regenerate via `llmstack install`. | Auto — don't hand-edit. |
110
+ | `<work-dir>/.llmstack/AGENTS.md` | Copy of `llmstack/AGENTS.md` (the bundled template), made by `install`. Loaded by opencode via the `instructions` field of `opencode.json`. Edit the source template to change defaults; edit the `.llmstack/` copy for one-off project tweaks (will be overwritten on next `install`). | Auto — copy of template. |
111
+ | `llmstack download` | Enumerates download targets at runtime via `llmstack.tiers.iter_download_targets` (which reads `models.ini`). | No edits — it reflects `models.ini` automatically. |
112
+
113
+ `llmstack check` flags any DRIFT between `models.ini` (the recommended file
114
+ per tier) and `llama-swap.yaml` (the file actually configured). If you ever
115
+ see DRIFT, run `llmstack restart` to regenerate the yaml and cycle the stack.
116
+
117
+ ### Where the download list comes from
118
+
119
+ `llmstack download` does **not** hardcode the (repo, file) tuples. It
120
+ enumerates them at runtime by reading `.llmstack/models.ini` (via
121
+ `llmstack.tiers`), yielding one row per `(tier, label)` where `label` is
122
+ `current` (the active `hf_file`) or `next` (the queued `hf_file_next`).
123
+
124
+ To add a new file to the download set, edit the matching tier in
125
+ `models.ini`. To stop pre-fetching an upgrade target, blank out its
126
+ `hf_file_next` line. No edits to Python code needed.
127
+
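The relevant keys look like this in a tier section (the key names are real; the repo, file, and tier values here are illustrative):

```ini
[plan]
hf_repo      = bartowski/Qwen3-Next-32B-Thinking-GGUF
hf_file      = Qwen3-Next-32B-Thinking-Q5_K_M.gguf   ; "current" download target
hf_file_next = Qwen3-Next-32B-Thinking-Q6_K.gguf     ; queued "next" target; blank to stop pre-fetching
```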
128
+ ### Trying the queued upgrade without committing to it
129
+
130
+ Once an `hf_file_next` finishes downloading you can run the whole stack
131
+ against the **next** file for every tier that has one, without touching
132
+ `models.ini` permanently:
133
+
134
+ ```bash
135
+ llmstack stop
136
+ llmstack start --next
137
+ llmstack status # shows "channel: next" + per-tier swap list
138
+ ```
139
+
140
+ Under the hood, `start --next` calls
141
+ `llmstack.generators.llama_swap.render(use_next=True)` and writes the
142
+ result to the same `<work-dir>/.llmstack/llama-swap.yaml` (with a header
143
+ banner reminding you it's the next-channel build), then points llama-swap
144
+ at it. Tiers with no `hf_file_next` are unchanged. To revert, `llmstack
145
+ stop && llmstack start` (no `--next`) regenerates the canonical yaml.
146
+ To make the upgrade permanent, swap `hf_file_next` into `hf_file` in
147
+ `models.ini` and re-run `llmstack install`.
148
+
149
+ ### Anchors to find on each upgrade
150
+
151
+ Inside `<work-dir>/.llmstack/llama-swap.yaml`, each tier has a clearly-marked
152
+ model block:
153
+
154
+ ```yaml
155
+ models:
156
+ code-fast: # ← tier alias (do not rename without updating opencode.json)
157
+ cmd: |
158
+ ${llama_server} ${metal_defaults}
159
+ # >>> UPGRADE-POINT (code-fast): swap the -hf/-hff pair below to change this tier.
160
+ -hf bartowski/Qwen2.5-Coder-3B-Instruct-GGUF # <-- REPLACE: HF repo
161
+ -hff Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf # <-- REPLACE: filename in that repo
162
+ --alias code-fast
163
+ -c 131072
164
+ ...
165
+ ```
166
+
167
+ But don't edit the YAML — it's regenerated. Find the same lines in
168
+ `.llmstack/models.ini` (search for the tier name, e.g. `[code-fast]`) and
169
+ edit `hf_repo` / `hf_file` there. Then re-run `llmstack install`.
170
+
171
+ The download list comes from the same ini file: the four tiers map 1:1 to
172
+ four download jobs, each with `(repo, current-file, next-file?)`.
173
+
174
+ ---
175
+
176
+ ## Upgrade workflow
177
+
178
+ Apply this **per tier**, not all at once. Test one model at a time so you
179
+ can roll back cleanly.
180
+
181
+ ```
182
+ 0. (Optional, fast) llmstack check
183
+ → prints current model + repo + last-modified + HF URL for each tier
184
+
185
+ 1. Identify a candidate.
186
+ - Hugging Face search: https://huggingface.co/models?search=<keywords>+GGUF&sort=trending
187
+ - Filter: GGUF format, last 30 days, downloads > 1k
188
+ - Cross-check: a leaderboard appropriate to the tier (see below)
189
+
190
+ 2. Verify the candidate is suitable for the tier.
191
+ - GGUF available? must exist as a .gguf file
192
+ - Size in budget? see "tier sizes" table below
193
+ - Chat template embedded? open the GGUF page → "Use this model" → llama.cpp tab.
194
+ If --jinja is mentioned or chat_template metadata
195
+ is shown, you're fine.
196
+ - Tool calls supported? only matters for code-smart. Check that the model
197
+ page lists "function calling" or its chat template
198
+ handles tool-call blocks.
199
+ - Native context length? must be ≥ what you set with -c, or have YaRN config.
200
+
201
+ 3. Smoke-test it before wiring it in. First, prefetch + validate via llama-cli:
202
+
203
+ llama-cli -hf <new-repo> -hff <new-file>.gguf \
204
+ --no-warmup -ngl 0 -c 256 -p ok -n 1
205
+
206
+ # then bring up llama-server on a throwaway port to test inference:
207
+ llama-server -hf <new-repo> -hff <new-file>.gguf \
208
+ --port 9999 -ngl 999 -fa on --jinja \
209
+ --cache-type-k q8_0 --cache-type-v q8_0 -c 32768
210
+
211
+ # in another terminal:
212
+ curl -sN http://127.0.0.1:9999/v1/chat/completions \
213
+ -H 'Content-Type: application/json' \
214
+ -d '{"model":"x","messages":[{"role":"user","content":"hello"}]}'
215
+
216
+ Confirm:
217
+ - it loads without errors (look for "main: model loaded")
218
+ - it streams a sensible response
219
+ - it accepts your typical request shape (tools, long context, …)
220
+
221
+ 4. Edit ONE file: `.llmstack/models.ini`. In the matching `[tier]` section,
222
+ update:
223
+
224
+ hf_repo = <new-repo>
225
+ hf_file = <new-file>.gguf
226
+ ctx_size = <new-context> ; only if changing
227
+ sampler = temp=..., top_p=..., ... ; only if changing
228
+ size_gb, quant, status ; for documentation accuracy
229
+
230
+ No other files need editing — `llama-swap.yaml`, `opencode.json`, and
231
+ the download list are all auto-derived from this.
232
+
233
+ 5. Regenerate everything from the ini:
234
+
235
+ llmstack install # rewrites opencode.json + AGENTS.md
236
+ llmstack download # picks up the new (repo, file) pair from models.ini
237
+
238
+ 6. Cycle the stack:
239
+
240
+ llmstack stop
241
+ llmstack start
242
+ llmstack status
243
+ curl -s http://127.0.0.1:10101/v1/models | jq '.data[].id'
244
+
245
+ 7. Sanity-check via opencode (or curl) — fire a few characteristic prompts at
246
+ the upgraded tier. If the new model misbehaves, the rollback is just a
247
+ one-line revert in models.ini + `llmstack install` + `llmstack restart`.
248
+ ```
249
+
250
+ ---
251
+
252
+ ## Judging "better" per tier
253
+
254
+ A model that scores +5 % on MMLU is not automatically better for *your*
255
+ workflow. Score against the role the tier plays.
256
+
257
+ ### `code-fast` — autocomplete / FIM / quick Q&A
258
+
259
+ What matters:
260
+ - **Tokens/sec on M4 Max** (must clear ~60 tok/s for tab-feel responsiveness)
261
+ - **FIM (fill-in-the-middle) support** — the chat template must include a FIM
262
+ format, and the model must be trained for it
263
+ - **Tool calling** — *not* required; this tier never calls tools
264
+
265
+ How to evaluate:
266
+ - Run `llama-bench -m <new>.gguf -p 512 -n 128 -ngl 999` for raw speed
267
+ - Sniff test with a typical autocomplete prompt; latency should feel like
268
+ the cursor is barely ahead of you
269
+ - Aider leaderboard "edit format" column — proxy for FIM quality
270
+
271
+ Size budget: **~2–6 GB** weights (we want this resident permanently while
272
+ sharing memory with the heavy tier).
273
+
274
+ Good candidates to track:
275
+ - `bartowski/Qwen2.5-Coder-*-Instruct-GGUF` (any of 3B/7B)
276
+ - `bartowski/DeepSeek-Coder-V2-Lite-Instruct-GGUF`
277
+ - `unsloth/Qwen3-Coder-*-GGUF` (smaller variants)
278
+ - Any new "tab" / "FIM" coder ≤ 7B that drops
279
+
280
+ ### `code-smart` — agent: tool calls, multi-file edits, refactors
281
+
282
+ What matters:
283
+ - **Tool / function-calling correctness** (the model must reliably emit
284
+ well-formed tool-call blocks)
285
+ - **Long-context recall** (≥ 64 k usable, ideally 128 k+)
286
+ - **Code editing benchmarks** (aider edit success, SWE-Bench Verified)
287
+ - **Speed at full context** (MoE models win here on Apple Silicon)
288
+
289
+ How to evaluate:
290
+ - Aider's [LLM Leaderboard](https://aider.chat/docs/leaderboards/) — most
291
+ honest signal for agentic coding
292
+ - LiveCodeBench scores
293
+ - SWE-Bench Verified (the "real PRs" benchmark)
294
+ - Run an actual opencode session in `build` mode against your repo
295
+
296
+ Size budget: **~30–55 GB** weights (must fit alongside `code-fast` ≈ 5 GB
297
+ in your wired-mem cap).
298
+
299
+ Good candidates to track:
300
+ - `unsloth/Qwen3-Coder-*-GGUF` (incl. the Next 80B-A3B you have)
301
+ - `bartowski/DeepSeek-Coder-V*-Instruct-GGUF`
302
+ - `bartowski/Codestral-*-GGUF` future versions
303
+ - New "Coder" or "Devstral" releases as they appear
304
+
305
+ ### `plan` — design discussions, architecture, trade-offs
306
+
307
+ What matters:
308
+ - **Reasoning quality** (MMLU-Pro, GPQA-Diamond)
309
+ - **Instruction-following on multi-step prompts** (IFEval)
310
+ - **Discussion style** — should propose alternatives, not jump to code
311
+ - **Refusals on edge cases** — fine to refuse weird stuff in plain plan mode
312
+
313
+ How to evaluate:
314
+ - Open LLM Leaderboard (filter to chat/instruct, your size class)
315
+ - Chatbot Arena — vibes-based but useful proxy
316
+ - Hand-roll a "design this rate limiter" prompt and compare outputs
317
+
318
+ Size budget: **~7–25 GB** weights — this tier shouldn't dominate memory.
319
+
320
+ Good candidates to track:
321
+ - `bartowski/Qwen3-*-Instruct-GGUF`
322
+ - `bartowski/Mistral-Small-*-Instruct-GGUF` (non-uncensored)
323
+ - `bartowski/gemma-*-it-GGUF`
324
+ - `bartowski/glm-*-chat-GGUF`
325
+ - Reasoning-tuned variants: QwQ, DeepSeek-R1-Distill, Qwen3-Thinking
326
+
327
+ ### `plan-uncensored` — no-filter planning
328
+
329
+ What matters:
330
+ - Same metrics as `plan`
331
+ - **Plus** demonstrably reduced refusal rate (look for "abliterated",
332
+ "uncensored", "heretic", "dolphin", "neural-chat" branding)
333
+
334
+ Good candidates to track:
335
+ - `mradermacher/<base-model>-uncensored-*-GGUF`
336
+ - `mradermacher/<base-model>-heretic-i1-GGUF`
337
+ - `bartowski/<base-model>-abliterated-*-GGUF`
338
+ - `cognitivecomputations/dolphin-*` (then look for GGUF re-uploads)
339
+
340
+ Same size budget as `plan`.
341
+
342
+ ---
343
+
344
+ ## Where to find candidates
345
+
346
+ **HuggingFace search** (fastest):
347
+
348
+ - All recent GGUFs sorted by trending:
349
+ https://huggingface.co/models?library=gguf&sort=trending
350
+ - New coder GGUFs in the last 30 days:
351
+ https://huggingface.co/models?other=code&library=gguf&sort=created
352
+ - Specific maintainer feeds (subscribe / bookmark):
353
+ - https://huggingface.co/bartowski (broad coverage, fast turnaround)
354
+ - https://huggingface.co/unsloth (Qwen, Llama, with Dynamic UD quants)
355
+ - https://huggingface.co/mradermacher (i1 / abliterated / heretic variants)
356
+ - https://huggingface.co/lmstudio-community (curated, conservative quants)
357
+ - https://huggingface.co/MaziyarPanahi (broad chat + coder)
358
+
359
+ **Leaderboards** (signal):
360
+
361
+ | Tier | Leaderboard |
362
+ |---|---|
363
+ | `code-fast` / `code-smart` | https://aider.chat/docs/leaderboards/ |
364
+ | | https://livecodebench.github.io/leaderboard.html |
365
+ | | https://www.swebench.com/ (Verified split) |
366
+ | `plan` / `plan-uncensored` | https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard |
367
+ | | https://lmarena.ai/ |
368
+ | | https://livebench.ai/ |
369
+
370
+ **Community signal** (qualitative but valuable):
371
+
372
+ - r/LocalLLaMA — daily threads on what's good
373
+ - HF model card discussions — real-world report-backs
374
+ - Each maintainer's repo READMEs often link to benchmarks they ran themselves
375
+
376
+ ---
377
+
378
+ ## Worked example: replacing the `plan` tier
379
+
380
+ Suppose tomorrow `bartowski` ships `Qwen3-Next-32B-Thinking-GGUF` and you
381
+ think it'd plan better than your current Qwopus GLM 18B.
382
+
383
+ ```bash
384
+ # 0. Snapshot the current configuration
385
+ cd ~/Projects/opencode
386
+ llmstack check > /tmp/before.txt
387
+
388
+ # 1. Pre-pull via llama-cli (writes to the standard cache)
389
+ llama-cli -hf bartowski/Qwen3-Next-32B-Thinking-GGUF \
390
+ -hff Qwen3-Next-32B-Thinking-Q5_K_M.gguf \
391
+ --no-warmup -ngl 0 -c 256 -p ok -n 1
392
+
393
+ # 2. Smoke-test inference on a throwaway port
394
+ llama-server -hf bartowski/Qwen3-Next-32B-Thinking-GGUF \
395
+ -hff Qwen3-Next-32B-Thinking-Q5_K_M.gguf \
396
+ --port 9999 -ngl 999 -fa on --jinja \
397
+ --cache-type-k q8_0 --cache-type-v q8_0 -c 65536 &
398
+
399
+ curl -sN http://127.0.0.1:9999/v1/chat/completions \
400
+ -H 'Content-Type: application/json' \
401
+ -d '{"model":"x","messages":[{"role":"user","content":"How would you architect a rate limiter for our API?"}]}'
402
+
403
+ kill %1 # stop the test server
404
+
405
+ # 3. Edit ONE file: .llmstack/models.ini -> the [plan] section.
406
+ # OLD:
407
+ # hf_repo = Jackrong/Qwopus-GLM-18B-Merged-GGUF
408
+ # hf_file = Qwopus-GLM-18B-Healed-Q4_K_M.gguf
409
+ # NEW:
410
+ # hf_repo = bartowski/Qwen3-Next-32B-Thinking-GGUF
411
+ # hf_file = Qwen3-Next-32B-Thinking-Q5_K_M.gguf
412
+ # Also bump size_gb / quant for documentation accuracy.
413
+
414
+ # 4. Apply: regenerate llama-swap.yaml + opencode.json, restart, verify.
415
+ llmstack install
416
+ llmstack restart
417
+ llmstack status
418
+ llmstack check # confirms no DRIFT vs models.ini
419
+ ```
420
+
421
+ Roll back? Revert the two lines in `.llmstack/models.ini` and re-run
422
+ `llmstack install && llmstack restart`. The old GGUF is still in the HF
423
+ cache so loading it costs nothing extra.
424
+
425
+ ---
426
+
427
+ ## After-upgrade housekeeping
428
+
429
+ When you're confident in the new model, reclaim disk space:
430
+
431
+ ```bash
432
+ # What does llama.cpp see in its cache?
433
+ llama-cli -cl
434
+
435
+ # Sizes on disk (sorted)
436
+ du -h ~/.cache/huggingface/hub/* | sort -h
437
+
438
+ # Drop a single quant
439
+ llama-cli -cr <user>/<repo>:<quant> # e.g. -cr unsloth/Qwen3-Coder-Next-GGUF:Q4_K_M
440
+
441
+ # Drop a whole repo
442
+ rm -rf ~/.cache/huggingface/hub/models--<owner>--<repo>/
443
+ ```
444
+
445
+ The old `Q4_K_M` for a tier you've upgraded is also a candidate for deletion
446
+ once the new quant is verified working.
447
+
448
+ ---
449
+
450
+ ## Upgrading the Python toolchain
451
+
452
+ The stack is a single Python package (`llmstack`) installed via
453
+ `pyproject.toml`. The runtime needs:
454
+
455
+ - `llmstack/app.py` — `fastapi`, `uvicorn[standard]`, `httpx` (router)
456
+ - `llmstack/check_models.py` — `huggingface_hub`, `PyYAML`
457
+ - `llmstack/tiers.py`, `llmstack/generators/*.py` — stdlib only
458
+
459
+ `hf_transfer` is a declared dep and gets used automatically when
460
+ `HF_HUB_ENABLE_HF_TRANSFER=1` is set, for faster multi-GB GGUF pulls.
461
+
462
+ Dependency versions live in `pyproject.toml`'s `[project.dependencies]`,
463
+ not in a `requirements.txt`. There is no checked-in venv either.
464
+
465
+ ### First-time setup
466
+
467
+ The cleanest install puts the CLI on your PATH in an isolated env:
468
+
469
+ ```bash
470
+ cd ~/Projects/opencode
471
+ pipx install -e . # editable install + isolated venv
472
+ llmstack --version
473
+ ```
474
+
475
+ Or, if you prefer a managed venv (no pipx):
476
+
477
+ ```bash
478
+ cd ~/Projects/opencode
479
+ python3 -m venv .venv
480
+ .venv/bin/pip install -e .
481
+ .venv/bin/llmstack --version
482
+ ```
483
+
484
+ Pin a specific Python (e.g. 3.13) by passing `python3.13 -m venv .venv` or
485
+ `pipx install --python python3.13 -e .`. Anything ≥ 3.11 works (we use
486
+ 3.11+ syntax: PEP 604 `X | None`, `list[Tier]`, etc.).
487
+
488
+ ### Routine upgrade (latest patch versions of declared deps)
489
+
490
+ ```bash
491
+ cd ~/Projects/opencode
492
+ pipx upgrade llmstack # if installed via pipx
493
+ # or, in your venv:
494
+ .venv/bin/pip install -U -e .
495
+
496
+ llmstack stop && llmstack start # bounce the router
497
+ llmstack check # smoke-test PyYAML + huggingface_hub
498
+ ```
499
+
500
+ ### Bumping a major version (e.g. fastapi 0 → 1)
501
+
502
+ 1. Edit the version constraint in `pyproject.toml`'s
503
+ `[project.dependencies]` (e.g. `"fastapi>=1.0,<2.0"`).
504
+ 2. Reinstall: `pipx upgrade llmstack` or `.venv/bin/pip install -U -e .`.
505
+ 3. Test the router:
506
+
507
+ llmstack stop && llmstack start
508
+ curl -s http://127.0.0.1:10101/v1/models | jq '.data[].id'
509
+ curl -s http://127.0.0.1:10101/models.ini | head
510
+ curl -sN http://127.0.0.1:10101/v1/chat/completions -H 'Content-Type: application/json' \
511
+ -d '{"model":"auto","messages":[{"role":"user","content":"hi"}]}'
512
+
513
+ 4. Read the upstream changelog for any breaking imports the router relies
514
+ on (`fastapi.FastAPI`, `Request`, `Response`, `JSONResponse`,
515
+ `StreamingResponse`, lifespan handlers, etc.). Update `llmstack/app.py`
516
+ to match if needed.
517
+
518
+ ### Rebuilding the install from scratch
519
+
520
+ If anything gets weird (corrupt install, Python upgrade, dependency
521
+ conflicts), nuke and reinstall — it's cheap:
522
+
523
+ ```bash
524
+ # pipx install:
525
+ pipx uninstall llmstack
526
+ pipx install -e ~/Projects/opencode
527
+
528
+ # venv install:
529
+ cd ~/Projects/opencode
530
+ rm -rf .venv
531
+ python3 -m venv .venv
532
+ .venv/bin/pip install -e .
533
+ ```
534
+
535
+ Nothing inside the venv / pipx env needs hand-editing — all source lives under
536
+ `llmstack/` in the repo and is installed in editable mode (`-e`), so edits
537
+ land instantly.
538
+
539
+ ### Upgrading the llama-swap binary
540
+
541
+ The binary lives at `$LLMSTACK_BIN_DIR/llama-swap`, which defaults to
542
+ `$XDG_DATA_HOME/llmstack/bin/llama-swap` (i.e.
543
+ `~/.local/share/llmstack/bin/llama-swap` on macOS). It is **not** in
544
+ version control — `llmstack setup` downloads it as part of the first-time
545
+ walkthrough, and `llmstack install-llama-swap` lets you re-fetch it on
546
+ demand.
547
+
548
+ ```bash
549
+ llmstack install-llama-swap # latest, idempotent (no-op if up-to-date)
550
+ llmstack install-llama-swap --force # redownload even if up-to-date
551
+ LLAMA_SWAP_VERSION=v211 \
552
+ llmstack install-llama-swap # pin a specific tag
553
+ ```
554
+
555
+ It auto-detects OS (Darwin / Linux / FreeBSD) and arch (arm64 / amd64) and
556
+ downloads the matching tarball from
557
+ https://github.com/mostlygeek/llama-swap/releases.
558
+
559
+ To check what's installed, look at the version line printed by
560
+ `llmstack install-llama-swap` (or run the binary directly):
561
+
562
+ ```bash
563
+ "$(llmstack status 2>/dev/null | awk '/llama-swap binary/ {print $NF}')" --version
564
+ # or simply:
565
+ ~/.local/share/llmstack/bin/llama-swap --version
566
+ ```
567
+
568
+ After bumping the binary, `llmstack restart` is enough to pick it up; the
569
+ YAML config is regenerated on every restart, so even a config-schema change
570
+ in a new llama-swap version needs no extra step (the schema is stable anyway).
571
+
572
+ ### Snapshotting the exact installed set
573
+
574
+ For reproducibility (e.g. before a risky upgrade) capture the full lock:
575
+
576
+ ```bash
577
+ # pipx:
578
+ pipx runpip llmstack freeze > requirements.lock.txt
579
+
580
+ # venv:
581
+ .venv/bin/pip freeze > requirements.lock.txt
582
+
583
+ # …upgrade…
584
+ # if it goes wrong:
585
+ .venv/bin/pip install -r requirements.lock.txt
586
+ ```
587
+
588
+ We don't commit `requirements.lock.txt` by default — the floor pins in
589
+ `pyproject.toml` are sufficient for this stack's blast radius.
590
+
591
+ ---
592
+
593
+ ## Quick reference — tier sizes & natural model classes
594
+
595
+ | Tier | Weights budget | Typical params (dense) | Typical params (MoE) | Examples |
596
+ |---|---|---|---|---|
597
+ | `code-fast` | 2–6 GB | 1.5–7 B | — | Qwen2.5-Coder 3B/7B, DeepSeek-Coder-V2-Lite |
598
+ | `code-smart` | 25–55 GB | 30–70 B | 30B-A3B / 80B-A3B | Qwen3-Coder-Next, DeepSeek-Coder-V2, Codestral |
599
+ | `plan` | 7–25 GB | 14–32 B | — | Qwen3, Mistral-Small, Gemma-3, GLM |
600
+ | `plan-uncensored` | 7–25 GB | 14–32 B | — | abliterated/heretic/dolphin variants of the above |
601
+
602
+ Anything outside these brackets either won't fit, or wastes hardware.
@@ -1,9 +1,30 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: opencode-llmstack
3
- Version: 0.9.1
3
+ Version: 0.9.3
4
4
  Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
5
5
  Author: llmstack
6
- License: MIT
6
+ License: MIT License
7
+
8
+ Copyright (c) 2024 llmstack contributors
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
7
28
  Project-URL: Homepage, https://github.com/rohitgarg19/llmstack
8
29
  Project-URL: Issues, https://github.com/rohitgarg19/llmstack/issues
9
30
  Keywords: llm,llama-cpp,llama-swap,opencode,router,local-ai
@@ -17,9 +38,11 @@ Classifier: Programming Language :: Python :: 3
17
38
  Classifier: Programming Language :: Python :: 3.11
18
39
  Classifier: Programming Language :: Python :: 3.12
19
40
  Classifier: Programming Language :: Python :: 3.13
41
+ Classifier: Programming Language :: Python :: 3.14
20
42
  Classifier: Topic :: Software Development
21
43
  Requires-Python: >=3.11
22
44
  Description-Content-Type: text/markdown
45
+ License-File: LICENSE
23
46
  Requires-Dist: fastapi<1.0,>=0.110
24
47
  Requires-Dist: httpx<1.0,>=0.27
25
48
  Requires-Dist: uvicorn[standard]<1.0,>=0.30
@@ -32,6 +55,7 @@ Requires-Dist: pytest>=7; extra == "dev"
32
55
  Provides-Extra: bedrock
33
56
  Requires-Dist: boto3>=1.35; extra == "bedrock"
34
57
  Requires-Dist: botocore>=1.35; extra == "bedrock"
58
+ Dynamic: license-file
35
59
 
36
60
  # llmstack — multi-tier local LLM stack for Mac M4 Max / 64 GB
37
61
 
@@ -118,9 +142,9 @@ First match wins:
118
142
  | 1 | last user msg contains `[nofilter]`, `[uncensored]`, `[heretic]`, or starts with `uncensored:` / `nofilter:` | `plan-uncensored` | explicit opt-in |
119
143
  | 2 | `[ultra]` / `[opus]` / `ultra:` trigger AND `code-ultra` tier configured | `code-ultra` | explicit top-tier opt-in |
120
144
  | 3 | plan verbs (*design, architect, approach, trade-off, should we, explain why, …*) AND no code blocks / agent verbs / tools | `plan` | pure design discussion (orthogonal track) |
121
- | 4 | estimated input ≤ 8 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier — context still being built, latency/$ are best here |
145
+ | 4 | estimated input ≤ 12 000 tokens | `code-ultra` *(or `code-smart` if ultra unwired)* | top tier — context still being built, latency/$ are best here |
122
146
  | 5 | estimated input ≤ 32 000 tokens | `code-smart` | mid-context, local heavy coder is at its sweet spot |
123
- | 6 | otherwise (long context) AND (`tools[]` OR 6 turns) | `code-smart` | floor: 3B model tool-calls unreliably |
147
+ | 6 | otherwise (long context) AND ≥ 10 user turns | `code-smart` | floor: deep agentic loop, keep the heavy model |
124
148
  | 7 | otherwise (long context) | `code-fast` | 128k YaRN window + always-resident + free |
125
149
 
126
150
  Token estimates are `chars / 4` over all message text + `prompt`. The
@@ -185,12 +209,12 @@ from any directory you previously ran `install` in.
185
209
  ## Layout
186
210
 
187
211
  ```
188
- opencode/ # repo root
212
+ llmstack/ # repo root
189
213
  ├── pyproject.toml # package metadata + `llmstack` console script
190
214
  ├── README.md # this file
191
215
  ├── UPGRADING.md # how to swap any tier for a newer/better model
192
216
  │ + how to upgrade the Python toolchain itself
193
- ├── models.ini # SINGLE SOURCE OF TRUTH for tiers + sampler
217
+ ├── LICENSE # MIT
194
218
  └── llmstack/ # the python package (importable, installable)
195
219
  ├── __init__.py
196
220
  ├── __main__.py # `python -m llmstack`
@@ -199,6 +223,7 @@ opencode/ # repo root
199
223
  ├── shell_env.py # spawn the env-prepared subshell + activate hooks
200
224
  ├── app.py # FastAPI auto-router (~280 lines)
201
225
  ├── tiers.py # parse models.ini -> Tier dataclasses
226
+ ├── models.ini # SINGLE SOURCE OF TRUTH for tiers + sampler (bundled template)
202
227
  ├── check_models.py # snapshot tool (HF metadata + drift check)
203
228
  ├── AGENTS.md # opencode agent template (shipped as package data)
204
229
  ├── generators/
@@ -213,9 +238,9 @@ opencode/ # repo root
213
238
  ├── install_llama_swap.py
214
239
  ├── download.py
215
240
  ├── start.py
216
- ├── shell.py
217
241
  ├── stop.py
218
242
  ├── restart.py
243
+ ├── reload.py
219
244
  ├── status.py
220
245
  ├── check.py
221
246
  └── activate.py
@@ -544,7 +569,7 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
544
569
  | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
545
570
  | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
546
571
  | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
547
- | `ROUTER_MULTI_TURN` | `6` | turn count that floors the long-context rung at `code-smart` |
572
+ | `ROUTER_MULTI_TURN` | `10` | user-turn count that floors the long-context rung at `code-smart` |
548
573
  | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
549
574
  | `LOG_LEVEL` | `info` | router log level |
550
575
 
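For example, to override a couple of defaults for a single run (values here are illustrative):

```bash
ROUTER_MULTI_TURN=12 ROUTER_HIGH_FIDELITY_CEILING=16000 llmstack start
```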
@@ -1,16 +1,16 @@
1
1
  llmstack/AGENTS.md,sha256=4DVUkqJ1-EP-cDNRCpznzghOOX6dAMbVWdcwyfFCALw,528
2
- llmstack/__init__.py,sha256=EKHybZtPxLqFWkgkIoYBameu5_Tf9j4UewpANKm0fMU,855
2
+ llmstack/__init__.py,sha256=Qs9d58V8cJWmJvu4QLvO7_panKa8UkGRXAurHYgKDyU,855
3
3
  llmstack/__main__.py,sha256=wXHd5-BmCCHUfNEmy2rbilBSyVhi4KD1dSIO_4NlxuE,199
4
4
  llmstack/_platform.py,sha256=eDY3T9krkaBigG5xXxqzIbH3MhdZqX3BWe7bozOsAso,13099
5
- llmstack/app.py,sha256=WKViWk7-TUuqdo3em77JgxgUo1Jo745Ew3RUoY7th-s,27945
5
+ llmstack/app.py,sha256=3Qt_bveLS13rPEucyX0P6QDf9o9O68amnJIvwMqoTQQ,28469
6
6
  llmstack/check_models.py,sha256=WvTS2Td4acp-Q0-yWXUgXAgAgFOmpxiaeSDuAoivirw,4559
7
7
  llmstack/cli.py,sha256=Om70PzHrmU81y2Mw1sB6eeUs1fRHP0PnsCEVNC0UNvI,11341
8
- llmstack/models.ini,sha256=m595QVX5Nh5YEHcrKLl5ldvNIORiy27OwxNMDsSwlsE,20339
8
+ llmstack/models.ini,sha256=U38Z6wfGCpmgQTCNbnp1zu80rUrhNGWFTvI2nhVx1Mo,20556
9
9
  llmstack/paths.py,sha256=A8q4-tpwIt5UMGG5ZDESKSuViMGLbPIAL1VoONopJqU,11512
10
10
  llmstack/shell_env.py,sha256=MJSW0PP15q-fsppIZ98WZ7XoqYMZmDy4k8N0gzEA6wU,39362
11
- llmstack/tiers.py,sha256=et738dWftsc74ZElZ3Vt9eEF_SzgJCDuH9kBhzH-scI,14697
11
+ llmstack/tiers.py,sha256=yl5xEhECe-GHiVXBRvlNoFtH_9y4uNSASpfHlZ4Ja74,14820
12
12
  llmstack/backends/__init__.py,sha256=-85sQz0R94OdbM2bUHGyyA5WaMnI9bHywPOaELeQHX0,777
13
- llmstack/backends/bedrock.py,sha256=_UFBWR7R2Q4BPAsskXemjgPnu0dyJLSXel86smo9mSc,30015
13
+ llmstack/backends/bedrock.py,sha256=Nb9sV45aH0RQHie_AkQwcpX5pkio5EAqnsphZM5P_nQ,30638
14
14
  llmstack/commands/__init__.py,sha256=eVO-YUxh1fSfdq72KggC-NrTYMtN6zIykgjyRgOCAt4,406
15
15
  llmstack/commands/_helpers.py,sha256=UKADaNXrnuoDi_JG0W2Tph7rWFB0cXvQh8YknZBw56I,2660
16
16
  llmstack/commands/activate.py,sha256=zCdEmyVv5qZUdhfez6hZ5Y46N_yjPwfKbPTwCJXnA3o,3663
@@ -21,7 +21,7 @@ llmstack/commands/install_llama_swap.py,sha256=c6iedl-DjnOc7jMVzy_M0aIWSgygzAgYU
21
21
  llmstack/commands/reload.py,sha256=Z7ceZQX2fkHpZiWxov8YwidR72Xw0-qMFFV_RRXpkwI,2016
22
22
  llmstack/commands/restart.py,sha256=Bp6lSAnLhR2Nd7eA5BlD9J_TeGlzRfWS_Z3DdxP-eq4,294
23
23
  llmstack/commands/setup.py,sha256=ZBPXas7jswfYL6IwAJhReR0BVGn4LWaf-0ZhR8lQG6I,5381
24
- llmstack/commands/start.py,sha256=V9BDZeCQS_NL2bJmJANHVE2J1rqoYBUDYcjK9O_PNYM,15693
24
+ llmstack/commands/start.py,sha256=u6tLI5yTtOtIRwEJNDofZiqY2vv39Xhw6GPShE1wFIg,15655
25
25
  llmstack/commands/status.py,sha256=TOHoDSyu04lZtepJH4bFmIk694RyaUYeFMpUejyUPe0,10403
26
26
  llmstack/commands/stop.py,sha256=vntZ1n8wpY9zgix1xGHDNJqEacaUpw9haSKgOnMg73k,2474
27
27
  llmstack/download/__init__.py,sha256=lpGmxsE4zxSp0fQViNJZHzbCL_V8zy6IHn71MP31538,695
@@ -30,8 +30,12 @@ llmstack/download/ggufs.py,sha256=2hCr-svUiPIV2I3ruwTbXo6lPn9m-VBOqa3DFbvdIcA,54
30
30
  llmstack/generators/__init__.py,sha256=LfbcReuyYBCdVuT9J5RKo7-f8n585YBU3Hus6DsxqTs,1189
31
31
  llmstack/generators/llama_swap.py,sha256=KdYH9N6TJECotZvyxvAjaa3kRyzn4YOi2T6D2UdyVKw,14785
32
32
  llmstack/generators/opencode.py,sha256=s_FrLXUBnLzRGQovl1PcAEs7V_P52wT1vnvvxMcKfs4,11203
33
- opencode_llmstack-0.9.1.dist-info/METADATA,sha256=OAoNwEOF9ESVwKAQ-VXD5868sMSIhiVxq47cDqWcK_k,34914
34
- opencode_llmstack-0.9.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
35
- opencode_llmstack-0.9.1.dist-info/entry_points.txt,sha256=soomjpqvl4KzFScgpQbu96vgcLriOtkB9MbiSC0rvZ8,47
36
- opencode_llmstack-0.9.1.dist-info/top_level.txt,sha256=tMv9sDWp8RW_DNNY8cuM4Uy4sND-KwTLcsScl5gdcEQ,9
37
- opencode_llmstack-0.9.1.dist-info/RECORD,,
33
+ opencode_llmstack-0.9.3.data/data/CHANGELOG.md,sha256=2Ok5sn4aA_N5UMaUJ4jRbuTeWC1pt7gdFgYnxj2JdKU,5217
34
+ opencode_llmstack-0.9.3.data/data/LICENSE,sha256=6G-Otw6BHIM1WJSBlJ04P1rDVCqbDEzKpdOlSr5CqIY,1078
35
+ opencode_llmstack-0.9.3.data/data/UPGRADING.md,sha256=0XSNZ9trCviFLH5EL3Jz02fO2_8AfqB8_9aX0-o1bik,24927
36
+ opencode_llmstack-0.9.3.dist-info/licenses/LICENSE,sha256=6G-Otw6BHIM1WJSBlJ04P1rDVCqbDEzKpdOlSr5CqIY,1078
37
+ opencode_llmstack-0.9.3.dist-info/METADATA,sha256=QL_Za8nscUU57s41131pavDonGlSmSbHnNUZXWAMYqo,36323
38
+ opencode_llmstack-0.9.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
39
+ opencode_llmstack-0.9.3.dist-info/entry_points.txt,sha256=soomjpqvl4KzFScgpQbu96vgcLriOtkB9MbiSC0rvZ8,47
40
+ opencode_llmstack-0.9.3.dist-info/top_level.txt,sha256=tMv9sDWp8RW_DNNY8cuM4Uy4sND-KwTLcsScl5gdcEQ,9
41
+ opencode_llmstack-0.9.3.dist-info/RECORD,,
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 llmstack contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.