opencode_llmstack-0.6.0-py3-none-any.whl

@@ -0,0 +1,291 @@
1
+ """Generate ``opencode.json`` from ``models.ini``.
2
+
3
+ Reads the models.ini located by :func:`llmstack.paths.models_ini_path` and
4
+ writes an opencode config to the path given as the first CLI argument
5
+ (or stdout if omitted).
6
+
7
+ What gets wired:
8
+
9
+ provider              single llama.cpp-compatible provider at router_port
10
+ model                 ``auto`` (FastAPI router classifies and rewrites
11
+                       model names)
12
+ small_model           tier with ``role=fast`` (tab-complete, titles)
13
+ agent.build           ``auto`` -- always routed
14
+ agent.plan            ``role=plan`` -- read-only, no bash
15
+ agent.plan-nofilter   ``role=plan-uncensored`` -- read-only, no bash
16
+ command./review,
17
+ command./nofilter     shortcuts that the router can't auto-classify
18
+
19
+ What is **deliberately NOT wired**:
20
+
21
+ Sampler params (``temperature``, ``top_p``, etc.) are NEVER emitted on
22
+ any agent or model in opencode.json. Sampling is the *backend's*
23
+ responsibility, with ``models.ini`` as the single source of truth:
24
+
25
+ * gguf tiers -- :mod:`llmstack.generators.llama_swap` bakes the
26
+ tier's ``sampler = ...`` into the llama-server startup command line
27
+ (``--temp``/``--top-p``/...). llama-server applies them as defaults
28
+ for every request.
29
+ * Bedrock tiers -- :func:`llmstack.app._inject_sampler` adds the
30
+ tier's sampler keys to each outbound request body (Bedrock has no
31
+ server-side defaults mechanism). Bedrock models that reject sampler
32
+ params (e.g. Claude Opus 4.7) omit ``sampler =`` entirely (or declare
34
+ it empty) and the router passes requests through untouched.
34
+
35
+ Either way, opencode.json never carries sampler params -- one place to
36
+ edit (``models.ini``), no risk of opencode drifting from the actual
37
+ tier config, and any other router client (curl, a different IDE) gets
38
+ the same per-tier behaviour for free.
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ import configparser
44
+ import json
45
+ import os
46
+ import re
47
+ import sys
48
+ from pathlib import Path
49
+
50
+ from llmstack.paths import AGENTS_TEMPLATE, models_ini_path, remote_url
51
+
52
+ PROVIDER_KEY = "llama.cpp"
53
+ API_KEY = "sk-no-key-required"
54
+
55
+ ROLE_MAP: dict[str, tuple[str, str | None]] = {
56
+ "fast": ("small_model", None),
57
+ "agent": ("agent", "build"),
58
+ "plan": ("agent", "plan"),
59
+ "plan-uncensored": ("agent", "plan-nofilter"),
60
+ }
61
+
62
+ READ_ONLY_AGENTS = {"plan", "plan-nofilter"}
63
+
64
+ # Slash-command shortcuts (no `/fast`: `auto` already routes trivial chat).
65
+ COMMANDS = {
66
+ "review": {
67
+ "template": "Review the following for trade-offs, risks, and follow-ups. Be concrete.",
68
+ "description": "Architectural review via the planning model.",
69
+ "agent": "plan",
70
+ },
71
+ "nofilter": {
72
+ "template": "[nofilter]",
73
+ "description": "Route to the uncensored planning model.",
74
+ "agent": "plan-nofilter",
75
+ },
76
+ }
77
+
78
+ SHARE = os.getenv("OPENCODE_SHARE", "disabled")
79
+ USERNAME = os.getenv("OPENCODE_USERNAME") or None
80
+
81
+ # Keep model picker scoped to the local stack even if hosted-API env vars leak in.
82
+ _REMOTE_PROVIDERS = (
83
+ "anthropic,openai,google,openrouter,xai,groq,deepseek,"
84
+ "mistral,cerebras,azure,perplexity,vercel,morph,bedrock"
85
+ )
86
+ DISABLED_PROVIDERS = [
87
+ p.strip() for p in
88
+ os.getenv("OPENCODE_DISABLED_PROVIDERS", _REMOTE_PROVIDERS).split(",")
89
+ if p.strip()
90
+ ]
91
+
92
+
93
+ def _instructions_paths() -> list[str]:
94
+ """Resolve the ``instructions`` array in opencode.json.
95
+
96
+ Honours ``OPENCODE_INSTRUCTIONS`` (colon-separated) for the per-project
97
+ install path; falls back to the bundled template inside the package.
98
+ """
99
+ raw = os.getenv("OPENCODE_INSTRUCTIONS", str(AGENTS_TEMPLATE))
100
+ return [p for p in raw.split(":") if p]
101
+
102
+
103
+ ZERO_COST = {"input": 0, "output": 0, "cache_read": 0, "cache_write": 0}
104
+ DIGITS = re.compile(r"\d+")
105
+
106
+
107
+ def _int(value: str, default: int) -> int:
108
+ m = DIGITS.search(value or "")
109
+ return int(m.group()) if m else default
110
+
111
+
112
+ def build_config(
113
+ ini_path: Path | None = None,
114
+ *,
115
+ ini_text: str | None = None,
116
+ remote: str | None = None,
117
+ ) -> dict:
118
+ """Build the opencode.json dict from ``models.ini``.
119
+
120
+ Source of the INI is one of (mutually exclusive):
121
+
122
+ * ``ini_text`` -- raw INI content as a string. Used by
123
+ ``llmstack install --external``: it fetches ``models.ini``
124
+ straight from the router (``GET /models.ini``) and renders the
125
+ opencode config without writing the file to disk. Thin clients
126
+ don't keep a local copy of ``models.ini`` -- they re-fetch it
127
+ on each ``install``.
128
+ * ``ini_path`` -- explicit path. Used by callers that have a
129
+ ``Path`` in hand (``check_models``, tests).
130
+ * neither -- read from :func:`models_ini_path` (canonical
131
+ per-project location, the local-mode default).
132
+
133
+ ``remote`` overrides the router base URL: when given, opencode is
134
+ pointed at ``{remote}/v1`` (thin-client / external mode). When
135
+ ``None``, fall back to :func:`llmstack.paths.remote_url` -- which
136
+ reads the persisted channel marker first, env var second -- and
137
+ finally to the local router host/port from ``models.ini``.
138
+
139
+ Passing ``remote`` explicitly is what ``llmstack install --external
140
+ [URL]`` does: it has just *decided* the URL from flags + env and
141
+ needs the renderer to honour that decision rather than looking
142
+ again at a possibly-stale marker.
143
+ """
144
+ if ini_text is not None and ini_path is not None:
145
+ raise ValueError("build_config: pass ini_text OR ini_path, not both")
146
+
147
+ cfg = configparser.ConfigParser(inline_comment_prefixes=(";",), interpolation=None)
148
+ if ini_text is not None:
149
+ cfg.read_string(ini_text)
150
+ else:
151
+ path = ini_path or models_ini_path()
152
+ if not path.exists():
153
+ raise SystemExit(f"models.ini not found at {path}")
154
+ cfg.read(path)
155
+
156
+ defaults = cfg["DEFAULT"]
157
+ rurl = remote if remote is not None else remote_url()
158
+ if rurl:
159
+ # Client mode: send all traffic to the remote router. Keep the
160
+ # tier / agent wiring derived from the local models.ini -- the
161
+ # remote stack is expected to expose the same tier names; tier
162
+ # ``ctx_size`` is a useful client-side hint (used by opencode
163
+ # for prompt-packing) regardless of where the actual model
164
+ # lives. Sampling is the *router's* responsibility (it injects
165
+ # per-tier defaults from its own models.ini), so it never
166
+ # appears in opencode.json.
167
+ base_url = f"{rurl}/v1"
168
+ else:
169
+ host = (defaults.get("host") or "127.0.0.1").strip()
170
+ port = (defaults.get("router_port") or "10101").strip()
171
+ base_url = f"http://{host}:{port}/v1"
172
+
173
+ tier_sections = [s for s in cfg.sections() if s != "ROUTING"]
174
+
175
+ # `auto` context = MIN across all tiers so opencode never packs a prompt
176
+ # that overflows the tier the router actually picks.
177
+ auto_ctx = min(
178
+ (_int(cfg[s].get("ctx_size", ""), 0) for s in tier_sections),
179
+ default=8192,
180
+ ) or 8192
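With the ctx_size values in the bundled models.ini further down (131072, 65536, 65536 and 131072), this minimum works out to 65536, so ``auto`` prompts are packed for the smallest window the router might actually pick.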
181
+
182
+ models: dict[str, dict] = {
183
+ "auto": {
184
+ "name": "Auto (router selects: fast / agent / plan / uncensored)",
185
+ "limit": {"context": auto_ctx, "output": 16384},
186
+ "tool_call": True,
187
+ "cost": ZERO_COST,
188
+ }
189
+ }
190
+
191
+ small_model: str | None = None
192
+ agents: dict[str, dict] = {}
193
+
194
+ for sec in tier_sections:
195
+ s = cfg[sec]
196
+ role = (s.get("role") or "").strip()
197
+ ctx = _int(s.get("ctx_size", ""), 8192)
198
+ desc = (s.get("description") or sec).strip()
199
+
200
+ model_entry: dict = {
201
+ "name": desc,
202
+ "limit": {"context": ctx, "output": 32768 if role == "agent" else 8192},
203
+ "tool_call": True,
204
+ "cost": ZERO_COST,
205
+ }
206
+ if role in ("agent", "plan-uncensored"):
207
+ model_entry["reasoning"] = True
208
+ models[sec] = model_entry
209
+
210
+ kind, agent_name = ROLE_MAP.get(role, (None, None))
211
+ if kind is None:
212
+ continue
213
+
214
+ model_ref = f"{PROVIDER_KEY}/{sec}"
215
+ if kind == "small_model":
216
+ small_model = model_ref
217
+ continue
218
+
219
+ # `build` is always wired to the auto router so escalation to
220
+ # code-ultra (or fallback to code-fast) happens transparently.
221
+ if agent_name == "build":
222
+ agent_model_ref = f"{PROVIDER_KEY}/auto"
223
+ else:
224
+ agent_model_ref = model_ref
225
+
226
+ # Sampler params are intentionally absent here -- the router
227
+ # injects per-tier defaults from models.ini at request time
228
+ # (see :func:`llmstack.app._inject_sampler`). See the module
229
+ # docstring for the rationale.
230
+ agent: dict = {"model": agent_model_ref}
231
+ if agent_name in READ_ONLY_AGENTS:
232
+ agent["permission"] = {"edit": "deny", "write": "deny", "bash": "deny"}
233
+ agents[agent_name] = agent # type: ignore[index]
234
+
235
+ out: dict = {
236
+ "$schema": "https://opencode.ai/config.json",
237
+ "share": SHARE,
238
+ "autoupdate": "notify",
239
+ }
240
+ if USERNAME:
241
+ out["username"] = USERNAME
242
+ if DISABLED_PROVIDERS:
243
+ out["disabled_providers"] = DISABLED_PROVIDERS
244
+
245
+ instructions = _instructions_paths()
246
+ if instructions:
247
+ out["instructions"] = instructions
248
+
249
+ out["provider"] = {
250
+ PROVIDER_KEY: {
251
+ "npm": "@ai-sdk/openai-compatible",
252
+ "name": "llmstack (local llama-swap + auto router)",
253
+ "options": {"baseURL": base_url, "apiKey": API_KEY},
254
+ "models": models,
255
+ }
256
+ }
257
+ out["model"] = f"{PROVIDER_KEY}/auto"
258
+ if small_model:
259
+ out["small_model"] = small_model
260
+ if agents:
261
+ out["agent"] = {k: agents[k] for k in ("build", "plan", "plan-nofilter") if k in agents}
262
+ out["command"] = COMMANDS
263
+ return out
264
+
265
+
266
+ def render(*, ini_text: str | None = None, remote: str | None = None) -> str:
267
+ """Return the full opencode.json text (with trailing newline).
268
+
269
+ ``ini_text`` and ``remote`` are forwarded to :func:`build_config`;
270
+ see there for the resolution order.
271
+ """
272
+ return json.dumps(build_config(ini_text=ini_text, remote=remote), indent=2) + "\n"
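A hedged usage sketch of the three INI-source modes that ``build_config``
documents; the path and URL literals below are illustrative only:

    # Local mode: read models.ini from the canonical per-project location.
    cfg = build_config()

    # Explicit path: what check_models and the tests do.
    cfg = build_config(ini_path=Path("llmstack/models.ini"))

    # Thin-client mode: INI text fetched from the router, base URL forced.
    fetched = "[DEFAULT]\nrouter_port = 10101\n"   # normally GET {remote}/models.ini
    text = render(ini_text=fetched, remote="http://10.0.0.5:10101")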
273
+
274
+
275
+ def validate(path: Path) -> None:
276
+ """Cheap structural sanity check: parses cleanly as JSON."""
277
+ json.loads(path.read_text())
278
+
279
+
280
+ def main(argv: list[str]) -> int:
281
+ target = argv[1] if len(argv) > 1 else "-"
282
+ text = render()
283
+ if target == "-":
284
+ sys.stdout.write(text)
285
+ else:
286
+ Path(target).write_text(text)
287
+ return 0
288
+
289
+
290
+ if __name__ == "__main__":
291
+ sys.exit(main(sys.argv))
llmstack/models.ini ADDED
@@ -0,0 +1,304 @@
1
+ ; ----------------------------------------------------------------------------
2
+ ; models.ini - inventory of models served by llmstack/.
3
+ ;
4
+ ; Runtime config: llmstack/llama-swap.yaml.
5
+ ; opencode bindings: ../opencode.json (model + agent.build/plan/plan-nofilter).
6
+ ;
7
+ ; SUPPORTED TIERS -- canonical names recognised by the router and the
8
+ ; opencode / llama-swap generators. ONLY the names listed below should
9
+ ; appear as section headers in this file. Any other section name is
10
+ ; parsed but ignored by `model: auto` routing and gets no agent or small_model
11
+ ; wiring in the generated opencode.json -- it would just be dead config. To grow
12
+ ; this set, add the new name here AND update llmstack/app.py +
13
+ ; llmstack/tiers.py so the router knows about it.
14
+ ;
15
+ ; code-fast           always-resident tiny local coder
16
+ ; code-smart          AGENT mode: heavy local coder, tool calls, multi-file edits
17
+ ; code-ultra          AGENT mode: top-tier hosted coder (Claude Opus on Bedrock)
18
+ ;                     - shipped commented-out in the bundled template;
19
+ ;                       `llmstack install` auto-uncomments it when the
20
+ ;                       `bedrock` extra (boto3) is installed.
21
+ ; plan                PLAN mode: chat-tuned, design discussions
22
+ ; plan-uncensored     PLAN mode (no filter): when the topic requires it
23
+ ;
24
+ ; Per-tier backends:
25
+ ; gguf local llama-server (managed by llama-swap). Driven by
26
+ ; hf_repo + hf_file. This is the default; auto-detected
27
+ ; when hf_* keys are present.
28
+ ; bedrock hosted AWS Bedrock model. Identity-only in this file:
29
+ ; aws_model_id (required)
30
+ ; aws_model_id_next (optional upgrade target)
31
+ ; aws_region (optional; some models are
32
+ ; region-pinned)
33
+ ; aws_region_next (optional)
34
+ ; aws_profile (named profile in ~/.aws/config
35
+ ; or ~/.aws/credentials)
36
+ ; aws_endpoint_url (optional; VPC endpoint, etc.)
37
+ ; Auto-detected from `aws_model_id`; set `backend = bedrock`
38
+ ; only when you want to override auto-detection.
39
+ ;
40
+ ; CREDENTIALS NEVER LIVE HERE. This file is meant to be
41
+ ; committable. Anything boto3 can do via a named profile --
42
+ ; long-term keys, SSO, role chaining via `role_arn` +
43
+ ; `source_profile` in ~/.aws/config, MFA, IMDS -- is
44
+ ; supported transparently. Configure once with:
45
+ ; aws configure --profile <my-profile>
46
+ ; and reference the profile by name from the tier.
47
+ ; Without `aws_profile` set, boto3's default credential
48
+ ; chain applies (env vars, default profile, instance role).
49
+ ; ----------------------------------------------------------------------------
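The backend auto-detection described in the header reduces to a key check.
A hedged Python sketch (the shipped detection lives elsewhere in the package,
per the llmstack/app.py + llmstack/tiers.py note above, and may differ):

    def detect_backend(section: dict[str, str]) -> str:
        if section.get("backend"):          # explicit override always wins
            return section["backend"].strip()
        if section.get("aws_model_id"):     # identity key => hosted Bedrock tier
            return "bedrock"
        return "gguf"                       # hf_repo / hf_file => local llama-server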
50
+
51
+ [DEFAULT]
52
+ host = 127.0.0.1
53
+ router_port = 10101 ; FastAPI auto-router (what opencode hits)
54
+ swap_port = 10102 ; llama-swap manager UI + raw model endpoints
55
+ n_gpu_layers = 999 ; offload everything to Metal on Apple Silicon
56
+ flash_attn = on
57
+ jinja = true
58
+ threads = -1
59
+ cache_type_k = q8_0
60
+ cache_type_v = q8_0
61
+
62
+ ;------------------------------------------------------------------------------
63
+ ; CODER tier
64
+ ;------------------------------------------------------------------------------
65
+
66
+ [code-fast]
67
+ tier = code
68
+ role = fast
69
+ hf_repo = bartowski/Qwen2.5-Coder-3B-Instruct-GGUF
70
+ hf_file = Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf
71
+ ctx_size = 131072 ; native 32k extended via YaRN (factor 4)
72
+ rope_scaling = yarn (scale=4, orig_ctx=32768)
73
+ size_gb = 2.5
74
+ quant = Q5_K_M
75
+ status = downloading ; queued by `llmstack.sh download`
76
+ opencode_use = small_model + auto-fast tier
77
+ sampler = temp=0.2, top_p=0.95, top_k=40, min_p=0.05 ; deterministic
78
+ description = Qwen2.5-Coder 3B - autocomplete / FIM / quick Q&A
79
+
80
+ [code-smart]
81
+ tier = code
82
+ role = agent
83
+ hf_repo = unsloth/Qwen3-Coder-Next-GGUF
84
+ hf_file = Qwen3-Coder-Next-Q4_K_M.gguf
85
+ hf_file_next = Qwen3-Coder-Next-UD-Q4_K_XL.gguf
86
+ ctx_size = 65536
87
+ size_gb = 45
88
+ size_gb_next = 50
89
+ quant = Q4_K_M
90
+ quant_next = UD-Q4_K_XL
91
+ status = ready (Q4_K_M); UD-Q4_K_XL queued
92
+ opencode_use = agent.build + auto-agent tier
93
+ sampler = temp=0.5, top_p=0.85, top_k=20, min_p=0.05, rep_pen=1.05 ; balanced agent
94
+ description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops
95
+
96
+ ; Top-tier hosted coder. Shipped disabled because it requires boto3 +
97
+ ; AWS Bedrock access. `llmstack install` auto-uncomments the block
98
+ ; below (by stripping the leading "; " from each line and dropping
99
+ ; the BEGIN/END markers) on first seed iff the `bedrock` extra is
100
+ ; installed, i.e. `import boto3` succeeds. On a vanilla install the
101
+ ; whole block stays inert -- configparser treats `;`-prefixed lines
102
+ ; as comments, so the section is invisible to the tier loader.
103
+ ;
104
+ ; >>> AUTO-ENABLE-WHEN-BEDROCK-AVAILABLE >>>
105
+ ; [code-ultra]
106
+ ; tier = code
107
+ ; role = ultra
108
+ ; backend = bedrock
109
+ ; aws_model_id = global.anthropic.claude-opus-4-7 ; global.* cross-region inference profile
110
+ ; aws_region = us-east-1 ; API anchor region; global.* auto-routes inference cross-region
111
+ ; aws_profile = bedrock-prod ; uncomment + set your own profile name; falls back to default cred chain otherwise
112
+ ; ctx_size = 200000
113
+ ; opencode_use = on-demand top-tier coder for hard agent tasks
114
+ ; ; NB: no `sampler =` line. Claude Opus 4.7 explicitly rejects all
115
+ ; ; sampler params (temperature, top_p, top_k) -- per the Bedrock
116
+ ; ; model card, "the recommended migration path is to omit these
117
+ ; ; parameters entirely from your requests". Leaving sampler off
118
+ ; ; makes the router pass requests through untouched.
119
+ ; description = Claude Opus 4.7 on Bedrock - top-tier coder for hard agent tasks
120
+ ; <<< AUTO-ENABLE-WHEN-BEDROCK-AVAILABLE <<<
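A hedged sketch of that auto-enable step (the shipped `llmstack install` code
may differ): drop the marker lines and strip the leading "; " between them,
but only when `import boto3` succeeds:

    import re

    BEGIN = ">>> AUTO-ENABLE-WHEN-BEDROCK-AVAILABLE >>>"
    END = "<<< AUTO-ENABLE-WHEN-BEDROCK-AVAILABLE <<<"

    def enable_bedrock_block(ini_text: str) -> str:
        try:
            import boto3  # noqa: F401 -- presence check for the `bedrock` extra
        except ImportError:
            return ini_text
        out, inside = [], False
        for line in ini_text.splitlines():
            if BEGIN in line or END in line:
                inside = BEGIN in line       # both marker lines are dropped
                continue
            out.append(re.sub(r"^; ?", "", line) if inside else line)
        return "\n".join(out) + "\n"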
121
+
122
+ ;------------------------------------------------------------------------------
123
+ ; CHAT tier
124
+ ;------------------------------------------------------------------------------
125
+
126
+ [plan]
127
+ tier = chat
128
+ role = plan
129
+ hf_repo = Jackrong/Qwopus-GLM-18B-Merged-GGUF
130
+ hf_file = Qwopus-GLM-18B-Healed-Q4_K_M.gguf
131
+ hf_file_next = Qwopus-GLM-18B-Healed-Q6_K.gguf ; recommended; Q8_0 also exists (16 GB)
132
+ ctx_size = 65536 ; conservative 2x doubling (it's a merge, no YaRN)
133
+ size_gb = 9.2
134
+ size_gb_next = 12.1
135
+ quant = Q4_K_M
136
+ quant_next = Q6_K
137
+ status = ready (Q4_K_M); Q6_K queued
138
+ opencode_use = agent.plan + auto-plan tier
139
+ sampler = temp=0.7, top_p=0.9, top_k=40, min_p=0.05 ; creative thinking
140
+ description = Qwopus GLM 18B - planning, design discussions, architecture
141
+
142
+ [plan-uncensored]
143
+ tier = chat
144
+ role = plan-uncensored
145
+ hf_repo = mradermacher/Mistral-Small-3.2-24B-Instruct-2506-ultra-uncensored-heretic-i1-GGUF
146
+ hf_file = Mistral-Small-3.2-24B-Instruct-2506-ultra-uncensored-heretic.i1-Q4_K_M.gguf
147
+ hf_file_next = Mistral-Small-3.2-24B-Instruct-2506-ultra-uncensored-heretic.i1-Q6_K.gguf
148
+ ctx_size = 131072 ; native 128k support, no YaRN required
149
+ size_gb = 13
150
+ size_gb_next = 20
151
+ quant = i1-Q4_K_M
152
+ quant_next = i1-Q6_K
153
+ status = ready (i1-Q4_K_M); i1-Q6_K queued
154
+ opencode_use = agent.plan-nofilter + auto via [nofilter] trigger
155
+ sampler = temp=0.85, top_p=0.95, top_k=50, min_p=0.05 ; max exploration
156
+ description = Mistral-Small 3.2 24B Heretic - no-filter planning
157
+
158
+ ;------------------------------------------------------------------------------
159
+ [ROUTING]
160
+ ; STEP-DOWN ladder: start at the top of the fidelity ladder for short
161
+ ; conversations, then drop down as context grows. Inverts the classic
162
+ ; "escalate-on-size" pattern. Rationale:
163
+ ;
164
+ ; * Top-tier hosted models (Claude Opus/Sonnet on Bedrock) are fastest
165
+ ; and most accurate on small inputs, but per-request latency and
166
+ ; $cost scale with input tokens, and long-context behaviour
167
+ ; degrades faster than headline benchmarks suggest.
168
+ ; * code-smart (Qwen3-Coder 80B-A3B) has a 64k window -- best work in
169
+ ; the middle of that range, saturates near the top.
170
+ ; * code-fast (Qwen2.5-Coder 3B + YaRN x4) has a 128k window, is
171
+ ; always-resident (zero load latency), free, and benefits from more
172
+ ; explicit context (small models lean on retrieval rather than
173
+ ; priors).
174
+ ;
175
+ ; First-match-wins decision tree applied by llmstack/app.py when model="auto":
176
+ ;
177
+ ; 1. "[nofilter]" / "uncensored:" trigger -> plan-uncensored
178
+ ; 2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
179
+ ; tier configured -> code-ultra
180
+ ; 3. PLAN signal words AND no code-block / agent verbs / tools
181
+ ; (pure design discussion) -> plan
182
+ ; 4. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
183
+ ; tokens <= high_fidelity_ceiling AND no code-ultra -> code-smart
184
+ ; 5. tokens <= mid_fidelity_ceiling -> code-smart
185
+ ; 6. otherwise (long context):
186
+ ; - if tools[] OR turns >= multi_turn (3B tool-calls badly) -> code-smart
187
+ ; - else -> code-fast
188
+ ;
189
+ ; The "high-fidelity" rung is gated on availability: when the
190
+ ; [code-ultra] section is absent (or fails to load), rules (2) and (4)
191
+ ; silently fall back to code-smart instead of routing to a tier that
192
+ ; doesn't exist downstream. Override target names with the
193
+ ; ROUTER_*_MODEL env vars; numeric thresholds with the env vars
194
+ ; ROUTER_HIGH_FIDELITY_CEILING / ROUTER_MID_FIDELITY_CEILING /
195
+ ; ROUTER_MULTI_TURN.
196
+ ;
197
+ high_fidelity_ceiling = 8000 ; tokens; below this, top-tier model is still cheap+fast
198
+ mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast
199
+ multi_turn = 6 ; turn count that floors the long-context rung at code-smart
200
+ agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
201
+ plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
202
+ uncensored_triggers = [nofilter], [uncensored], [heretic], "uncensored:", "nofilter:" (line start)
203
+ ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)
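A hedged Python sketch of the first-match-wins ladder above, using this file's
default thresholds; the shipped router in llmstack/app.py also implements
rule 3's plan-signal heuristics and the ROUTER_* env overrides:

    def pick_tier(prompt: str, tokens: int, turns: int,
                  has_tools: bool, ultra_available: bool) -> str:
        text = prompt.lower()
        if (any(t in text for t in ("[nofilter]", "[uncensored]", "[heretic]"))
                or text.startswith(("uncensored:", "nofilter:"))):
            return "plan-uncensored"                                  # rule 1
        if ultra_available and ("[ultra]" in text or "[opus]" in text
                                or text.startswith(("ultra:", "opus:"))):
            return "code-ultra"                                       # rule 2
        # rule 3 (plan signal words, no code blocks / agent verbs / tools) omitted
        if tokens <= 8000:                                            # high_fidelity_ceiling
            return "code-ultra" if ultra_available else "code-smart"  # rule 4
        if tokens <= 32000:                                           # mid_fidelity_ceiling
            return "code-smart"                                       # rule 5
        if has_tools or turns >= 6:                                   # rule 6: multi_turn floor
            return "code-smart"
        return "code-fast"                                            # rule 6: long, simple context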
204
+
205
+ ;------------------------------------------------------------------------------
206
+ ; BEDROCK EXAMPLES (commented out -- copy / uncomment to adopt)
207
+ ;------------------------------------------------------------------------------
208
+ ; To swap one of the local GGUF tiers above for an AWS Bedrock model, COMMENT
209
+ ; OUT the existing tier of the same name and uncomment one of these. The router
210
+ ; auto-detects backend=bedrock from the presence of `aws_model_id` -- no other
211
+ ; flag needed. llama-swap won't load it; the router calls Bedrock directly via
212
+ ; boto3 (`pip install 'llmstack[bedrock]'`).
213
+ ;
214
+ ; Credentials: this file ONLY names a profile. The actual keys / SSO /
215
+ ; role chaining live in the standard AWS config files. One-time setup:
216
+ ;
217
+ ; aws configure --profile bedrock-prod
218
+ ; # for SSO: aws configure sso --profile bedrock-prod
219
+ ; # for role chaining, edit ~/.aws/config and add a profile with:
220
+ ; # role_arn = arn:aws:iam::123456789012:role/llmstack-bedrock
221
+ ; # source_profile = bedrock-prod
222
+ ;
223
+ ; Then reference the profile name from your tier with `aws_profile = ...`.
224
+ ; If you omit `aws_profile`, boto3's default chain applies (env vars,
225
+ ; default profile, instance role -- whatever boto3 normally finds).
226
+ ;
227
+ ; SAMPLER NOTE: the `sampler = temp=..., top_p=..., top_k=..., ...`
228
+ ; line on each tier is the SINGLE SOURCE OF TRUTH for sampling, but how
229
+ ; it gets applied depends on the backend:
230
+ ;
231
+ ; * gguf tiers -- the llama-swap generator bakes the sampler keys
232
+ ; into the llama-server startup command line as `--temp`,
233
+ ; `--top-p`, `--top-k`, `--min-p`, `--repeat-penalty` flags.
234
+ ; llama-server then uses those as its defaults for every request.
235
+ ; The router does NOT touch the request body for gguf tiers --
236
+ ; server-side defaults survive across requests cleanly.
237
+ ;
238
+ ; * bedrock tiers -- AWS Bedrock has no server-side defaults
239
+ ; mechanism, so the auto-router (llmstack.app) injects the
240
+ ; declared sampler keys into each outbound request body. Only
241
+ ; the Converse-supported subset gets through: `temp` ->
242
+ ; `temperature` and `top_p` -> `topP`. `top_k`, `min_p`,
243
+ ; `rep_pen` are llama.cpp extensions and are silently dropped by
244
+ ; the Bedrock backend; declare only what your Bedrock model
245
+ ; accepts. Caller-supplied values in the request body still win,
246
+ ; so per-call overrides work.
247
+ ;
248
+ ; opencode.json is sampler-free in both cases by design (the
249
+ ; opencode.json generator never emits sampler params on agents).
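A hedged sketch of that bedrock-side injection (helper and constant names are
illustrative, not the exact llmstack.app code): only the Converse subset is
mapped, and a tier default never overrides a caller-supplied value:

    _CONVERSE_KEYS = {"temp": "temperature", "top_p": "topP"}

    def inject_tier_defaults(body: dict, sampler: dict[str, float]) -> dict:
        merged = dict(body)
        for key, value in sampler.items():
            name = _CONVERSE_KEYS.get(key)       # top_k / min_p / rep_pen -> dropped
            if name is not None:
                merged.setdefault(name, value)   # tier default never beats a caller value
        return merged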
250
+ ;
251
+ ; Per-Bedrock-family rules (as of 2026):
252
+ ;
253
+ ; * Claude Opus 4.7+ -- rejects all sampler params; OMIT `sampler =`
254
+ ; entirely (the router will then pass requests through untouched).
255
+ ; * Claude Sonnet 4.5 / Haiku 4.5 -- accept `temp` OR `top_p`, never
256
+ ; both; pick one.
257
+ ; * Claude Opus 4.x (4.1, 4.5, 4.6) -- accept `temp` and `top_p`.
258
+ ; * Llama / Titan / Cohere / etc. -- accept `temp` + `top_p`; check
259
+ ; the model card if in doubt.
260
+ ;
261
+ ; Example A: top-tier coder on Bedrock (us-west-2), default cred chain.
262
+ ; Optional `aws_model_id_next` (and optional `aws_region_next`) is the
263
+ ; queued upgrade target -- mirrors gguf `hf_file_next`. The router uses
264
+ ; it only when `--next` is in effect; permanent promotion is the same
265
+ ; as gguf: edit `aws_model_id` and re-run `llmstack install`.
266
+ ;
267
+ ; [code-smart]
268
+ ; tier = code
269
+ ; role = agent
270
+ ; backend = bedrock
271
+ ; aws_model_id = anthropic.claude-sonnet-4-5-20250929-v1:0
272
+ ; aws_region = us-west-2
273
+ ; aws_model_id_next = anthropic.claude-sonnet-5-20260201-v1:0 ; queued
274
+ ; aws_region_next = us-east-1 ; (optional) different region for the new model
275
+ ; ctx_size = 200000
276
+ ; sampler = temp=0.5 ; Sonnet 4.5 accepts ONE of temp / top_p; pick `temp` for agent work
277
+ ; description = Claude Sonnet 4.5 on Bedrock - heavy coder for agent loops
278
+ ;
279
+ ; Example B: planner in a different AWS account, accessed via a named
280
+ ; profile that itself uses role-chaining + SSO under ~/.aws/config.
281
+ ; (Different tier => different profile name; different account/region.)
282
+ ;
283
+ ; [plan]
284
+ ; tier = chat
285
+ ; role = plan
286
+ ; aws_model_id = us.anthropic.claude-opus-4-1-20250805-v1:0
287
+ ; aws_region = us-east-1
288
+ ; aws_profile = bedrock-planning
289
+ ; ctx_size = 200000
290
+ ; sampler = temp=0.7, top_p=0.9
291
+ ; description = Claude Opus 4.1 on Bedrock - planning, design discussions
292
+ ;
293
+ ; Example C: large model behind a VPC endpoint.
294
+ ;
295
+ ; [plan-uncensored]
296
+ ; tier = chat
297
+ ; role = plan-uncensored
298
+ ; aws_model_id = meta.llama3-1-405b-instruct-v1:0
299
+ ; aws_region = us-west-2
300
+ ; aws_profile = bedrock-prod
301
+ ; aws_endpoint_url = https://bedrock-runtime.us-west-2.vpce.amazonaws.com
302
+ ; ctx_size = 128000
303
+ ; sampler = temp=0.85, top_p=0.95
304
+ ; description = Llama 3.1 405B on Bedrock - max-exploration planning