opencode-llmstack 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmstack/AGENTS.md +13 -0
- llmstack/__init__.py +20 -0
- llmstack/__main__.py +10 -0
- llmstack/_platform.py +420 -0
- llmstack/app.py +644 -0
- llmstack/backends/__init__.py +19 -0
- llmstack/backends/bedrock.py +790 -0
- llmstack/check_models.py +119 -0
- llmstack/cli.py +264 -0
- llmstack/commands/__init__.py +10 -0
- llmstack/commands/_helpers.py +91 -0
- llmstack/commands/activate.py +71 -0
- llmstack/commands/check.py +13 -0
- llmstack/commands/download.py +27 -0
- llmstack/commands/install.py +365 -0
- llmstack/commands/install_llama_swap.py +36 -0
- llmstack/commands/reload.py +59 -0
- llmstack/commands/restart.py +12 -0
- llmstack/commands/setup.py +146 -0
- llmstack/commands/start.py +360 -0
- llmstack/commands/status.py +260 -0
- llmstack/commands/stop.py +73 -0
- llmstack/download/__init__.py +21 -0
- llmstack/download/binary.py +234 -0
- llmstack/download/ggufs.py +164 -0
- llmstack/generators/__init__.py +37 -0
- llmstack/generators/llama_swap.py +421 -0
- llmstack/generators/opencode.py +291 -0
- llmstack/models.ini +304 -0
- llmstack/paths.py +318 -0
- llmstack/shell_env.py +927 -0
- llmstack/tiers.py +394 -0
- opencode_llmstack-0.6.0.dist-info/METADATA +693 -0
- opencode_llmstack-0.6.0.dist-info/RECORD +37 -0
- opencode_llmstack-0.6.0.dist-info/WHEEL +5 -0
- opencode_llmstack-0.6.0.dist-info/entry_points.txt +2 -0
- opencode_llmstack-0.6.0.dist-info/top_level.txt +1 -0
llmstack/generators/opencode.py
ADDED

@@ -0,0 +1,291 @@

"""Generate ``opencode.json`` from ``models.ini``.
|
|
2
|
+
|
|
3
|
+
Reads the models.ini located by :func:`llmstack.paths.models_ini_path` and
|
|
4
|
+
writes an opencode config to the path given as the first CLI argument
|
|
5
|
+
(or stdout if omitted).
|
|
6
|
+
|
|
7
|
+
What gets wired:
|
|
8
|
+
|
|
9
|
+
provider single llama.cpp-compatible provider at router_port
|
|
10
|
+
model ``auto`` (FastAPI router classifies and rewrites
|
|
11
|
+
model names)
|
|
12
|
+
small_model tier with ``role=fast`` (tab-complete, titles)
|
|
13
|
+
agent.build ``auto`` -- always routed
|
|
14
|
+
agent.plan ``role=plan`` -- read-only, no bash
|
|
15
|
+
agent.plan-nofilter ``role=plan-uncensored`` -- read-only, no bash
|
|
16
|
+
command./review,
|
|
17
|
+
command./nofilter shortcuts that the router can't auto-classify
|
|
18
|
+
|
|
19
|
+
What is **deliberately NOT wired**:
|
|
20
|
+
|
|
21
|
+
Sampler params (``temperature``, ``top_p``, etc.) are NEVER emitted on
|
|
22
|
+
any agent or model in opencode.json. Sampling is the *backend's*
|
|
23
|
+
responsibility, with ``models.ini`` as the single source of truth:
|
|
24
|
+
|
|
25
|
+
* gguf tiers -- :mod:`llmstack.generators.llama_swap` bakes the
|
|
26
|
+
tier's ``sampler = ...`` into the llama-server startup command line
|
|
27
|
+
(``--temp``/``--top-p``/...). llama-server applies them as defaults
|
|
28
|
+
for every request.
|
|
29
|
+
* Bedrock tiers -- :func:`llmstack.app._inject_sampler` adds the
|
|
30
|
+
tier's sampler keys to each outbound request body (Bedrock has no
|
|
31
|
+
server-side defaults mechanism). Bedrock models that reject sampler
|
|
32
|
+
params (e.g. Claude Opus 4.7) declare an empty ``sampler =`` and
|
|
33
|
+
the router passes requests through untouched.
|
|
34
|
+
|
|
35
|
+
Either way, opencode.json never carries sampler params -- one place to
|
|
36
|
+
edit (``models.ini``), no risk of opencode drifting from the actual
|
|
37
|
+
tier config, and any other router client (curl, a different IDE) gets
|
|
38
|
+
the same per-tier behaviour for free.
|
|
39
|
+
"""

from __future__ import annotations

import configparser
import json
import os
import re
import sys
from pathlib import Path

from llmstack.paths import AGENTS_TEMPLATE, models_ini_path, remote_url

PROVIDER_KEY = "llama.cpp"
API_KEY = "sk-no-key-required"

ROLE_MAP: dict[str, tuple[str, str | None]] = {
    "fast": ("small_model", None),
    "agent": ("agent", "build"),
    "plan": ("agent", "plan"),
    "plan-uncensored": ("agent", "plan-nofilter"),
}

READ_ONLY_AGENTS = {"plan", "plan-nofilter"}

# Slash-command shortcuts (no `/fast`: `auto` already routes trivial chat).
COMMANDS = {
    "review": {
        "template": "Review the following for trade-offs, risks, and follow-ups. Be concrete.",
        "description": "Architectural review via the planning model.",
        "agent": "plan",
    },
    "nofilter": {
        "template": "[nofilter]",
        "description": "Route to the uncensored planning model.",
        "agent": "plan-nofilter",
    },
}

SHARE = os.getenv("OPENCODE_SHARE", "disabled")
USERNAME = os.getenv("OPENCODE_USERNAME") or None

# Keep model picker scoped to the local stack even if hosted-API env vars leak in.
_REMOTE_PROVIDERS = (
    "anthropic,openai,google,openrouter,xai,groq,deepseek,"
    "mistral,cerebras,azure,perplexity,vercel,morph,bedrock"
)
DISABLED_PROVIDERS = [
    p.strip() for p in
    os.getenv("OPENCODE_DISABLED_PROVIDERS", _REMOTE_PROVIDERS).split(",")
    if p.strip()
]


def _instructions_paths() -> list[str]:
    """Resolve the ``instructions`` array in opencode.json.

    Honours ``OPENCODE_INSTRUCTIONS`` (colon-separated) for the per-project
    install path; falls back to the bundled template inside the package.
    """
    raw = os.getenv("OPENCODE_INSTRUCTIONS", str(AGENTS_TEMPLATE))
    return [p for p in raw.split(":") if p]


ZERO_COST = {"input": 0, "output": 0, "cache_read": 0, "cache_write": 0}
DIGITS = re.compile(r"\d+")


def _int(value: str, default: int) -> int:
    """Return the first run of digits in ``value``, else ``default``."""
    m = DIGITS.search(value or "")
    return int(m.group()) if m else default


def build_config(
    ini_path: Path | None = None,
    *,
    ini_text: str | None = None,
    remote: str | None = None,
) -> dict:
    """Build the opencode.json dict from ``models.ini``.

    Source of the INI is one of (mutually exclusive):

    * ``ini_text`` -- raw INI content as a string. Used by
      ``llmstack install --external``: it fetches ``models.ini``
      straight from the router (``GET /models.ini``) and renders the
      opencode config without writing the file to disk. Thin clients
      don't keep a local copy of ``models.ini`` -- they re-fetch it
      on each ``install``.
    * ``ini_path`` -- explicit path. Used by callers that have a
      ``Path`` in hand (``check_models``, tests).
    * neither -- read from :func:`models_ini_path` (canonical
      per-project location, the local-mode default).

    ``remote`` overrides the router base URL: when given, opencode is
    pointed at ``{remote}/v1`` (thin-client / external mode). When
    ``None``, fall back to :func:`llmstack.paths.remote_url` -- which
    reads the persisted channel marker first, env var second -- and
    finally to the local router host/port from ``models.ini``.

    Passing ``remote`` explicitly is what ``llmstack install --external
    [URL]`` does: it has just *decided* the URL from flags + env and
    needs the renderer to honour that decision rather than looking
    again at a possibly-stale marker.
    """
    if ini_text is not None and ini_path is not None:
        raise ValueError("build_config: pass ini_text OR ini_path, not both")

    cfg = configparser.ConfigParser(inline_comment_prefixes=(";",), interpolation=None)
    if ini_text is not None:
        cfg.read_string(ini_text)
    else:
        path = ini_path or models_ini_path()
        if not path.exists():
            raise SystemExit(f"models.ini not found at {path}")
        cfg.read(path)

    defaults = cfg["DEFAULT"]
    rurl = remote if remote is not None else remote_url()
    if rurl:
        # Client mode: send all traffic to the remote router. Keep the
        # tier / agent wiring derived from the local models.ini -- the
        # remote stack is expected to expose the same tier names; tier
        # ``ctx_size`` is a useful client-side hint (used by opencode
        # for prompt-packing) regardless of where the actual model
        # lives. Sampling is the *router's* responsibility (it injects
        # per-tier defaults from its own models.ini), so it never
        # appears in opencode.json.
        base_url = f"{rurl}/v1"
    else:
        host = (defaults.get("host") or "127.0.0.1").strip()
        port = (defaults.get("router_port") or "10101").strip()
        base_url = f"http://{host}:{port}/v1"

    tier_sections = [s for s in cfg.sections() if s != "ROUTING"]

    # `auto` context = MIN across all tiers so opencode never packs a prompt
    # that overflows the tier the router actually picks.
    auto_ctx = min(
        (_int(cfg[s].get("ctx_size", ""), 0) for s in tier_sections),
        default=8192,
    ) or 8192

    models: dict[str, dict] = {
        "auto": {
            "name": "Auto (router selects: fast / agent / plan / uncensored)",
            "limit": {"context": auto_ctx, "output": 16384},
            "tool_call": True,
            "cost": ZERO_COST,
        }
    }

    small_model: str | None = None
    agents: dict[str, dict] = {}

    for sec in tier_sections:
        s = cfg[sec]
        role = (s.get("role") or "").strip()
        ctx = _int(s.get("ctx_size", ""), 8192)
        desc = (s.get("description") or sec).strip()

        model_entry: dict = {
            "name": desc,
            "limit": {"context": ctx, "output": 32768 if role == "agent" else 8192},
            "tool_call": True,
            "cost": ZERO_COST,
        }
        if role in ("agent", "plan-uncensored"):
            model_entry["reasoning"] = True
        models[sec] = model_entry

        kind, agent_name = ROLE_MAP.get(role, (None, None))
        if kind is None:
            continue

        model_ref = f"{PROVIDER_KEY}/{sec}"
        if kind == "small_model":
            small_model = model_ref
            continue

        # `build` is always wired to the auto router so escalation to
        # code-ultra (or fallback to code-fast) happens transparently.
        if agent_name == "build":
            agent_model_ref = f"{PROVIDER_KEY}/auto"
        else:
            agent_model_ref = model_ref

        # Sampler params are intentionally absent here -- the router
        # injects per-tier defaults from models.ini at request time
        # (see :func:`llmstack.app._inject_sampler`). See the module
        # docstring for the rationale.
        agent: dict = {"model": agent_model_ref}
        if agent_name in READ_ONLY_AGENTS:
            agent["permission"] = {"edit": "deny", "write": "deny", "bash": "deny"}
        agents[agent_name] = agent  # type: ignore[index]

    out: dict = {
        "$schema": "https://opencode.ai/config.json",
        "share": SHARE,
        "autoupdate": "notify",
    }
    if USERNAME:
        out["username"] = USERNAME
    if DISABLED_PROVIDERS:
        out["disabled_providers"] = DISABLED_PROVIDERS

    instructions = _instructions_paths()
    if instructions:
        out["instructions"] = instructions

    out["provider"] = {
        PROVIDER_KEY: {
            "npm": "@ai-sdk/openai-compatible",
            "name": "llmstack (local llama-swap + auto router)",
            "options": {"baseURL": base_url, "apiKey": API_KEY},
            "models": models,
        }
    }
    out["model"] = f"{PROVIDER_KEY}/auto"
    if small_model:
        out["small_model"] = small_model
    if agents:
        out["agent"] = {k: agents[k] for k in ("build", "plan", "plan-nofilter") if k in agents}
    out["command"] = COMMANDS
    return out


def render(*, ini_text: str | None = None, remote: str | None = None) -> str:
    """Return the full opencode.json text (with trailing newline).

    ``ini_text`` and ``remote`` are forwarded to :func:`build_config`;
    see there for the resolution order.
    """
    return json.dumps(build_config(ini_text=ini_text, remote=remote), indent=2) + "\n"


def validate(path: Path) -> None:
    """Cheap structural sanity check: parses cleanly as JSON."""
    json.loads(path.read_text())


def main(argv: list[str]) -> int:
    target = argv[1] if len(argv) > 1 else "-"
    text = render()
    if target == "-":
        sys.stdout.write(text)
    else:
        Path(target).write_text(text)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

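Usage sketch for the module above (assuming the wheel is installed; the target
path is illustrative, and `-` or no argument means stdout):

    python -m llmstack.generators.opencode -
    python -m llmstack.generators.opencode ~/.config/opencode/opencode.json

    # or from Python, e.g. in a thin client that already fetched models.ini:
    from llmstack.generators.opencode import render
    text = render(ini_text=fetched_ini, remote="http://gpu-box:10101")
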
llmstack/models.ini
ADDED

@@ -0,0 +1,304 @@

; ----------------------------------------------------------------------------
; models.ini - inventory of models served by llmstack/.
;
; Runtime config: llmstack/llama-swap.yaml.
; opencode bindings: ../opencode.json (model + agent.build/plan/plan-nofilter).
;
; SUPPORTED TIERS -- canonical names recognised by the router and the
; opencode / llama-swap generators. ONLY the names listed below should
; appear as section headers in this file. Any other section name is
; parsed but ignored by `model: auto` routing and won't be wired into
; the generated opencode.json -- it would just be dead config. To grow
; this set, add the new name here AND update llmstack/app.py +
; llmstack/tiers.py so the router knows about it.
;
;   code-fast        always-resident tiny local coder
;   code-smart       AGENT mode: heavy local coder, tool calls, multi-file edits
;   code-ultra       AGENT mode: top-tier hosted coder (Claude Opus on Bedrock)
;                    - shipped commented-out in the bundled template;
;                      `llmstack install` auto-uncomments it when the
;                      `bedrock` extra (boto3) is installed.
;   plan             PLAN mode: chat-tuned, design discussions
;   plan-uncensored  PLAN mode (no filter): when the topic requires it
;
; Per-tier backends:
;   gguf     local llama-server (managed by llama-swap). Driven by
;            hf_repo + hf_file. This is the default; auto-detected
;            when hf_* keys are present.
;   bedrock  hosted AWS Bedrock model. Identity-only in this file:
;                aws_model_id       (required)
;                aws_model_id_next  (optional upgrade target)
;                aws_region         (optional; some models are
;                                    region-pinned)
;                aws_region_next    (optional)
;                aws_profile        (named profile in ~/.aws/config
;                                    or ~/.aws/credentials)
;                aws_endpoint_url   (optional; VPC endpoint, etc.)
;            Auto-detected from `aws_model_id`; set `backend = bedrock`
;            only when you want to override auto-detection.
;
;            CREDENTIALS NEVER LIVE HERE. This file is meant to be
;            committable. Anything boto3 can do via a named profile --
;            long-term keys, SSO, role chaining via `role_arn` +
;            `source_profile` in ~/.aws/config, MFA, IMDS -- is
;            supported transparently. Configure once with:
;                aws configure --profile <my-profile>
;            and reference the profile by name from the tier.
;            Without `aws_profile` set, boto3's default credential
;            chain applies (env vars, default profile, instance role).
; ----------------------------------------------------------------------------

[DEFAULT]
host = 127.0.0.1
router_port = 10101   ; FastAPI auto-router (what opencode hits)
swap_port = 10102     ; llama-swap manager UI + raw model endpoints
n_gpu_layers = 999    ; offload everything to Metal on Apple Silicon
flash_attn = on
jinja = true
threads = -1
cache_type_k = q8_0
cache_type_v = q8_0

;------------------------------------------------------------------------------
; CODER tier
;------------------------------------------------------------------------------

[code-fast]
tier = code
role = fast
hf_repo = bartowski/Qwen2.5-Coder-3B-Instruct-GGUF
hf_file = Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf
ctx_size = 131072   ; native 32k extended via YaRN (factor 4)
rope_scaling = yarn (scale=4, orig_ctx=32768)
size_gb = 2.5
quant = Q5_K_M
status = downloading   ; queued by `llmstack.sh download`
opencode_use = small_model + auto-fast tier
sampler = temp=0.2, top_p=0.95, top_k=40, min_p=0.05   ; deterministic
description = Qwen2.5-Coder 3B - autocomplete / FIM / quick Q&A

[code-smart]
tier = code
role = agent
hf_repo = unsloth/Qwen3-Coder-Next-GGUF
hf_file = Qwen3-Coder-Next-Q4_K_M.gguf
hf_file_next = Qwen3-Coder-Next-UD-Q4_K_XL.gguf
ctx_size = 65536
size_gb = 45
size_gb_next = 50
quant = Q4_K_M
quant_next = UD-Q4_K_XL
status = ready (Q4_K_M); UD-Q4_K_XL queued
opencode_use = agent.build + auto-agent tier
sampler = temp=0.5, top_p=0.85, top_k=20, min_p=0.05, rep_pen=1.05   ; balanced agent
description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops

; Top-tier hosted coder. Shipped disabled because it requires boto3 +
; AWS Bedrock access. `llmstack install` auto-uncomments the block
; below (by stripping the leading "; " from each line and dropping
; the BEGIN/END markers) on first seed iff the `bedrock` extra is
; installed, i.e. `import boto3` succeeds. On a vanilla install the
; whole block stays inert -- configparser treats `;`-prefixed lines
; as comments, so the section is invisible to the tier loader.
;
; >>> AUTO-ENABLE-WHEN-BEDROCK-AVAILABLE >>>
; [code-ultra]
; tier = code
; role = ultra
; backend = bedrock
; aws_model_id = global.anthropic.claude-opus-4-7   ; global.* cross-region inference profile
; aws_region = us-east-1   ; API anchor region; global.* auto-routes inference cross-region
; aws_profile = bedrock-prod   ; uncomment + set your own profile name; falls back to default cred chain otherwise
; ctx_size = 200000
; opencode_use = on-demand top-tier coder for hard agent tasks
; ; NB: no `sampler =` line. Claude Opus 4.7 explicitly rejects all
; ; sampler params (temperature, top_p, top_k) -- per the Bedrock
; ; model card, "the recommended migration path is to omit these
; ; parameters entirely from your requests". Leaving sampler off
; ; makes the router pass requests through untouched.
; description = Claude Opus 4.7 on Bedrock - top-tier coder for hard agent tasks
; <<< AUTO-ENABLE-WHEN-BEDROCK-AVAILABLE <<<
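;
; (Mechanically, the uncomment step amounts to roughly `re.sub(r"^; ?", "", line)`
; over the lines between the two markers -- a sketch only; the actual logic ships
; with the `llmstack install` command, not in this file.)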

;------------------------------------------------------------------------------
; CHAT tier
;------------------------------------------------------------------------------

[plan]
tier = chat
role = plan
hf_repo = Jackrong/Qwopus-GLM-18B-Merged-GGUF
hf_file = Qwopus-GLM-18B-Healed-Q4_K_M.gguf
hf_file_next = Qwopus-GLM-18B-Healed-Q6_K.gguf   ; recommended; Q8_0 also exists (16 GB)
ctx_size = 65536   ; conservative 2x doubling (it's a merge, no YaRN)
size_gb = 9.2
size_gb_next = 12.1
quant = Q4_K_M
quant_next = Q6_K
status = ready (Q4_K_M); Q6_K queued
opencode_use = agent.plan + auto-plan tier
sampler = temp=0.7, top_p=0.9, top_k=40, min_p=0.05   ; creative thinking
description = Qwopus GLM 18B - planning, design discussions, architecture

[plan-uncensored]
tier = chat
role = plan-uncensored
hf_repo = mradermacher/Mistral-Small-3.2-24B-Instruct-2506-ultra-uncensored-heretic-i1-GGUF
hf_file = Mistral-Small-3.2-24B-Instruct-2506-ultra-uncensored-heretic.i1-Q4_K_M.gguf
hf_file_next = Mistral-Small-3.2-24B-Instruct-2506-ultra-uncensored-heretic.i1-Q6_K.gguf
ctx_size = 131072   ; native 128k support, no YaRN required
size_gb = 13
size_gb_next = 20
quant = i1-Q4_K_M
quant_next = i1-Q6_K
status = ready (i1-Q4_K_M); i1-Q6_K queued
opencode_use = agent.plan-nofilter + auto via [nofilter] trigger
sampler = temp=0.85, top_p=0.95, top_k=50, min_p=0.05   ; max exploration
description = Mistral-Small 3.2 24B Heretic - no-filter planning

;------------------------------------------------------------------------------
[ROUTING]
; STEP-DOWN ladder: start at the top of the fidelity ladder for short
; conversations, then drop down as context grows. Inverts the classic
; "escalate-on-size" pattern. Rationale:
;
; * Top-tier hosted models (Claude Opus/Sonnet on Bedrock) are fastest
;   and most accurate on small inputs, but per-request latency and
;   $cost scale with input tokens, and long-context behaviour
;   degrades faster than headline benchmarks suggest.
; * code-smart (Qwen3-Coder 80B-A3B) has a 64k window -- it does its
;   best work in the middle of that range and saturates near the top.
; * code-fast (Qwen2.5-Coder 3B + YaRN x4) has a 128k window, is
;   always-resident (zero load latency), free, and benefits from more
;   explicit context (small models lean on retrieval rather than
;   priors).
;
; First-match-wins decision tree applied by llmstack/app.py when model="auto":
;
;   1. "[nofilter]" / "uncensored:" trigger -> plan-uncensored
;   2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
;      tier configured -> code-ultra
;   3. PLAN signal words AND no code-block / agent verbs / tools
;      (pure design discussion) -> plan
;   4. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
;      tokens <= high_fidelity_ceiling AND no code-ultra -> code-smart
;   5. tokens <= mid_fidelity_ceiling -> code-smart
;   6. otherwise (long context):
;      - if tools[] OR turns >= multi_turn (the 3B model tool-calls badly) -> code-smart
;      - else -> code-fast
;
; The "high-fidelity" rung is gated on availability: when the
; [code-ultra] section is absent (or fails to load), rules (2) and (4)
; silently fall back to code-smart instead of routing to a tier that
; doesn't exist downstream. Override target names with the
; ROUTER_*_MODEL env vars; numeric thresholds with the env vars
; ROUTER_HIGH_FIDELITY_CEILING / ROUTER_MID_FIDELITY_CEILING /
; ROUTER_MULTI_TURN. (A condensed sketch follows the settings below.)
;
high_fidelity_ceiling = 8000   ; tokens; below this, the top-tier model is still cheap+fast
mid_fidelity_ceiling = 32000   ; tokens; smart's sweet spot up to here, then step down to fast
multi_turn = 6   ; turn count that floors the long-context rung at code-smart
agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
uncensored_triggers = [nofilter], [uncensored], [heretic], "uncensored:", "nofilter:" (line start)
ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)
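;
; Condensed pseudo-Python sketch of the ladder above (illustrative only --
; the real implementation is llmstack/app.py and these helper names are
; made up):
;
;   def pick_tier(req) -> str:
;       if matches(req, uncensored_triggers):             return "plan-uncensored"
;       if matches(req, ultra_triggers) and have_ultra:   return "code-ultra"
;       if plan_signals(req) and not code_or_tools(req):  return "plan"
;       if tokens(req) <= high_fidelity_ceiling:
;           return "code-ultra" if have_ultra else "code-smart"
;       if tokens(req) <= mid_fidelity_ceiling:           return "code-smart"
;       if req.tools or turns(req) >= multi_turn:         return "code-smart"
;       return "code-fast"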

;------------------------------------------------------------------------------
; BEDROCK EXAMPLES (commented out -- copy / uncomment to adopt)
;------------------------------------------------------------------------------
; To swap one of the local GGUF tiers above for an AWS Bedrock model, COMMENT
; OUT the existing tier of the same name and uncomment one of these. The router
; auto-detects backend=bedrock from the presence of `aws_model_id` -- no other
; flag needed. llama-swap won't load it; the router calls Bedrock directly via
; boto3 (`pip install 'llmstack[bedrock]'`).
;
; Credentials: this file ONLY names a profile. The actual keys / SSO /
; role chaining live in the standard AWS config files. One-time setup:
;
;     aws configure --profile bedrock-prod
;     # for SSO: aws configure sso --profile bedrock-prod
;     # for role chaining, edit ~/.aws/config and add a profile with:
;     #   role_arn = arn:aws:iam::123456789012:role/llmstack-bedrock
;     #   source_profile = bedrock-prod
;
; Then reference the profile name from your tier with `aws_profile = ...`.
; If you omit `aws_profile`, boto3's default chain applies (env vars,
; default profile, instance role -- whatever boto3 normally finds).
;
; SAMPLER NOTE: the `sampler = temp=..., top_p=..., top_k=..., ...`
; line on each tier is the SINGLE SOURCE OF TRUTH for sampling, but how
; it gets applied depends on the backend:
;
; * gguf tiers -- the llama-swap generator bakes the sampler keys
;   into the llama-server startup command line as `--temp`,
;   `--top-p`, `--top-k`, `--min-p`, `--repeat-penalty` flags.
;   llama-server then uses those as its defaults for every request.
;   The router does NOT touch the request body for gguf tiers --
;   server-side defaults survive across requests cleanly.
;
; * bedrock tiers -- AWS Bedrock has no server-side defaults
;   mechanism, so the auto-router (llmstack.app) injects the
;   declared sampler keys into each outbound request body. Only
;   the Converse-supported subset gets through: `temp` ->
;   `temperature` and `top_p` -> `topP`. `top_k`, `min_p`,
;   `rep_pen` are llama.cpp extensions and are silently dropped by
;   the Bedrock backend; declare only what your Bedrock model
;   accepts. Caller-supplied values in the request body still win,
;   so per-call overrides work.
;
; opencode.json is sampler-free in both cases by design (the
; opencode.json generator never emits sampler params on agents).
;
; Per-Bedrock-family rules (as of 2026):
;
; * Claude Opus 4.7+ -- rejects all sampler params; OMIT `sampler =`
;   entirely (the router will then pass requests through untouched).
; * Claude Sonnet 4.5 / Haiku 4.5 -- accept `temp` OR `top_p`, never
;   both; pick one.
; * Claude Opus 4.x (4.1, 4.5, 4.6) -- accept `temp` and `top_p`.
; * Llama / Titan / Cohere / etc. -- accept `temp` + `top_p`; check
;   the model card if in doubt.
;
; Example A: top-tier coder on Bedrock (us-west-2), default cred chain.
; Optional `aws_model_id_next` (and optional `aws_region_next`) is the
; queued upgrade target -- mirrors gguf `hf_file_next`. The router uses
; it only when `--next` is in effect; permanent promotion is the same
; as gguf: edit `aws_model_id` and re-run `llmstack install`.
;
; [code-smart]
; tier = code
; role = agent
; backend = bedrock
; aws_model_id = anthropic.claude-sonnet-4-5-20250929-v1:0
; aws_region = us-west-2
; aws_model_id_next = anthropic.claude-sonnet-5-20260201-v1:0   ; queued
; aws_region_next = us-east-1   ; (optional) different region for the new model
; ctx_size = 200000
; sampler = temp=0.5   ; Sonnet 4.5 accepts ONE of temp / top_p; pick `temp` for agent work
; description = Claude Sonnet 4.5 on Bedrock - heavy coder for agent loops
;
; Example B: planner in a different AWS account, accessed via a named
; profile that itself uses role-chaining + SSO under ~/.aws/config.
; (Different tier => different profile name; different account/region.)
;
; [plan]
; tier = chat
; role = plan
; aws_model_id = us.anthropic.claude-opus-4-1-20250805-v1:0
; aws_region = us-east-1
; aws_profile = bedrock-planning
; ctx_size = 200000
; sampler = temp=0.7, top_p=0.9
; description = Claude Opus 4.1 on Bedrock - planning, design discussions
;
; Example C: large model behind a VPC endpoint.
;
; [plan-uncensored]
; tier = chat
; role = plan-uncensored
; aws_model_id = meta.llama3-1-405b-instruct-v1:0
; aws_region = us-west-2
; aws_profile = bedrock-prod
; aws_endpoint_url = https://bedrock-runtime.us-west-2.vpce.amazonaws.com
; ctx_size = 128000
; sampler = temp=0.85, top_p=0.95
; description = Llama 3.1 405B on Bedrock - max-exploration planning
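
To make the sampler split above concrete, here is a minimal Python sketch of
the two application paths for a tier's `sampler =` string (helper names are
hypothetical; the real code is the llama-swap generator and the router):

    def parse_sampler(raw: str) -> dict[str, str]:
        # "temp=0.5, top_p=0.85, top_k=20" -> {"temp": "0.5", "top_p": "0.85", "top_k": "20"}
        pairs = (part.partition("=") for part in raw.split(",") if part.strip())
        return {k.strip(): v.strip() for k, _, v in pairs}

    # gguf path: bake the keys into the llama-server command line as defaults.
    LLAMA_FLAGS = {"temp": "--temp", "top_p": "--top-p", "top_k": "--top-k",
                   "min_p": "--min-p", "rep_pen": "--repeat-penalty"}

    def llama_server_args(sampler: dict[str, str]) -> list[str]:
        return [tok for key, val in sampler.items()
                if key in LLAMA_FLAGS
                for tok in (LLAMA_FLAGS[key], val)]

    # bedrock path: inject only the Converse-supported subset per request;
    # caller-supplied values still win.
    CONVERSE_KEYS = {"temp": "temperature", "top_p": "topP"}

    def converse_inference_config(sampler: dict[str, str], body: dict) -> dict:
        cfg = {CONVERSE_KEYS[k]: float(v) for k, v in sampler.items() if k in CONVERSE_KEYS}
        cfg.update(body.get("inferenceConfig") or {})
        return cfg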