opencode-llmstack 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

llmstack/app.py CHANGED
@@ -74,7 +74,18 @@ Routing decision tree (first match wins):
  2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
     ``ultra:``, ``opus:``) AND ultra tier configured -> code-ultra
  3. PLAN signal words AND no code-block / agent verbs / tools
-    (design discussion, no implementation pending) -> plan
+    AND estimated tokens <= ``[plan]`` tier's ctx_size
+    (pure design discussion that fits the planner's window) -> plan
+    (if the planner's ctx_size is breached we fall through to the coding
+    ladder rather than send a request that won't fit -- the coding tiers
+    cover larger windows by design)
  4. Estimated input tokens <= HIGH_FIDELITY_CEILING
     ("reasonable context still being built") -> code-ultra
     (else code-smart)
@@ -90,6 +101,12 @@ Routing decision tree (first match wins):
     since 3B models
     tool-call unreliably)
 
+ The auto router's effective max context window is
+ ``[code-fast].ctx_size`` -- fast is the bottom of the step-down
+ ladder, so any context that would overflow the tiers above lands on
+ fast. Inputs longer than fast's window have no safe home and should
+ be considered out of scope for ``model = auto``.
+
  Ultra-tier routing is gated on availability: rule (2) and the
  "high-fidelity" rung of (4) first check that the tier is loaded
  from ``models.ini`` (i.e. present in :data:`TIER_BY_ALIAS`). When
@@ -137,14 +154,21 @@ UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
  # est <= MID_FIDELITY_CEILING -> code-smart
  # est > MID_FIDELITY_CEILING -> code-fast (or smart with tools/loop)
  #
+ # Each ceiling is half of the corresponding tier's ``ctx_size`` in
+ # models.ini -- the ceiling marks where the tier still has comfortable
+ # headroom, and double the ceiling is where the router has already
+ # stepped down to the next tier (so the upper tier never has to handle
+ # inputs at its own limit).
+ #
  # Defaults:
- # HIGH 8000  - "reasonable context built": a couple of files loaded,
+ # HIGH 12000 - "reasonable context built": a couple of files loaded,
  #              instructions clear, top-tier still cheap+fast here.
- # MID 32000  - half of code-smart's 65k window; past this, hosted
+ #              Pairs with a 24k ctx_size on code-ultra.
+ # MID 32000  - half of code-smart's 64k window; past this, hosted
  #              top-tier latency/$cost balloons and code-smart starts
  #              getting cramped, while code-fast's 128k YaRN window
  #              still has comfortable headroom.
- HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "8000"))
+ HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
  MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
  # Floor the long-context rung at code-smart whenever a tool-call
  # protocol is in play -- 3B models tool-call unreliably regardless of
@@ -339,18 +363,30 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
          or _matches(AGENT_SIGNALS, messages, prompt)
      )
 
+     est = _estimate_tokens(messages, prompt)
+
      # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
      # chat-tuned model meant for design / "should we" discussions. Only
      # take it when nothing about the request says "I'm about to write
-     # code" (no triple-backticks, no agent verbs, no tool calls).
+     # code" (no triple-backticks, no agent verbs, no tool calls). And
+     # only if the input fits in the planner's ctx_size -- past that we'd
+     # be sending a request the planner can't hold, so we fall through
+     # to the coding ladder, which has tiers (smart, fast) explicitly
+     # sized for larger contexts.
      if (
          not has_tools
          and not has_code_signal
          and _matches(PLAN_SIGNALS, messages, prompt)
      ):
-         return PLAN_MODEL, "plan-signal"
-
-     est = _estimate_tokens(messages, prompt)
+         plan_tier = TIER_BY_ALIAS.get(PLAN_MODEL)
+         plan_ctx = plan_tier.ctx_size if plan_tier else 0
+         if not plan_ctx or est <= plan_ctx:
+             return PLAN_MODEL, "plan-signal"
+         log.info(
+             "plan-signal but tokens~%d > %s.ctx_size %d; "
+             "falling through to coding ladder",
+             est, PLAN_MODEL, plan_ctx,
+         )
 
      # Rung 1: short context -- start at the top.
      if est <= HIGH_FIDELITY_CEILING:
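A minimal standalone sketch of the new plan gate, under stated assumptions: the `Tier` dataclass, `rough_tokens` (a crude chars/4 stand-in for `_estimate_tokens`), the 16384-token demo window, and the `"code-smart"` fall-through label are all illustrative, not the shipped defaults, and the real `classify()` only reaches this gate after the PLAN-signal / no-code-signal checks shown above.

    # Illustrative sketch, NOT the shipped code: only the ctx_size gate is modeled.
    from dataclasses import dataclass

    @dataclass
    class Tier:
        ctx_size: int

    TIER_BY_ALIAS = {"plan": Tier(ctx_size=16384)}   # demo value, not from models.ini
    PLAN_MODEL = "plan"

    def rough_tokens(text: str) -> int:
        return len(text) // 4                        # crude stand-in for _estimate_tokens

    def plan_gate(prompt: str) -> tuple[str, str]:
        est = rough_tokens(prompt)
        tier = TIER_BY_ALIAS.get(PLAN_MODEL)
        plan_ctx = tier.ctx_size if tier else 0
        if not plan_ctx or est <= plan_ctx:
            return PLAN_MODEL, "plan-signal"         # fits the planner's window
        return "code-smart", "plan-overflow"         # falls through to the coding ladder

    print(plan_gate("should we split the cache layer into its own service?"))  # -> plan
    print(plan_gate("design review of this dump: " + "x" * 200_000))           # -> code-smart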
@@ -172,9 +172,22 @@ def build_config(
 
      tier_sections = [s for s in cfg.sections() if s != "ROUTING"]
 
-     # `auto` context = MIN across all tiers so opencode never packs a prompt
-     # that overflows the tier the router actually picks.
-     auto_ctx = min(
+     # `auto` context = the fast tier's ctx_size. The router runs a
+     # step-DOWN ladder (ultra -> smart -> fast as context grows), so
+     # the largest window in the ladder is fast's, and that's the
+     # effective ceiling for `model = auto` -- anything bigger has no
+     # tier to land on. Using `min(...)` here would clip opencode to
+     # the smallest tier's window even though the router would never
+     # actually send a long prompt to that tier.
+     fast_ctx = next(
+         (
+             _int(cfg[s].get("ctx_size", ""), 0)
+             for s in tier_sections
+             if (cfg[s].get("role") or "").strip() == "fast"
+         ),
+         0,
+     )
+     auto_ctx = fast_ctx or max(
          (_int(cfg[s].get("ctx_size", ""), 0) for s in tier_sections),
          default=8192,
      ) or 8192
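A runnable illustration of the new `auto_ctx` selection, using the ctx_size values that appear in models.ini below (24000 / 64000 / 131072); the `_int` helper here is an assumed stand-in for the package's own string-to-int coercion.

    # Sketch of the fast-tier lookup; _int is a stand-in, values mirror models.ini.
    import configparser

    def _int(value: str, default: int) -> int:
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    cfg = configparser.ConfigParser()
    cfg.read_string("""
    [code-ultra]
    role = ultra
    ctx_size = 24000
    [code-smart]
    role = agent
    ctx_size = 64000
    [code-fast]
    role = fast
    ctx_size = 131072
    """)
    tier_sections = [s for s in cfg.sections() if s != "ROUTING"]
    fast_ctx = next(
        (_int(cfg[s].get("ctx_size", ""), 0) for s in tier_sections
         if (cfg[s].get("role") or "").strip() == "fast"),
        0,
    )
    auto_ctx = fast_ctx or max(
        (_int(cfg[s].get("ctx_size", ""), 0) for s in tier_sections), default=8192
    ) or 8192
    print(auto_ctx)   # 131072 -- fast's window; min() would have clipped this to 24000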
llmstack/models.ini CHANGED
@@ -51,7 +51,6 @@
  [DEFAULT]
  host = 127.0.0.1
  router_port = 10101 ; FastAPI auto-router (what opencode hits)
- swap_port = 10102 ; llama-swap manager UI + raw model endpoints
  n_gpu_layers = 999 ; offload everything to Metal on Apple Silicon
  flash_attn = on
  jinja = true
@@ -68,31 +67,60 @@ tier = code
  role = fast
  hf_repo = bartowski/Qwen2.5-Coder-3B-Instruct-GGUF
  hf_file = Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf
- ctx_size = 131072 ; native 32k extended via YaRN (factor 4)
+ ctx_size = 131072 ; native 32k extended via YaRN (factor 4); also defines the auto router's effective max -- contexts longer than this have nowhere to land, since fast is the bottom of the step-down ladder
  rope_scaling = yarn (scale=4, orig_ctx=32768)
  size_gb = 2.5
  quant = Q5_K_M
- status = downloading ; queued by `llmstack.sh download`
- opencode_use = small_model + auto-fast tier
  sampler = temp=0.2, top_p=0.95, top_k=40, min_p=0.05 ; deterministic
  description = Qwen2.5-Coder 3B - autocomplete / FIM / quick Q&A
 
+ ; Bedrock alternative for code-fast -- comment out the [code-fast] block above
+ ; and uncomment the block below to swap to a hosted fast tier (Claude Haiku
+ ; 4.5: cheapest + fastest Anthropic model with tool calling, sub-second TTFT).
+ ; See "BEDROCK NOTES" at the bottom of this file for profile / sampler /
+ ; access-form details.
+ ;
+ ; [code-fast]
+ ; tier = code
+ ; role = fast
+ ; backend = bedrock
+ ; aws_model_id = eu.anthropic.claude-haiku-4-5-20251001-v1:0
+ ; aws_region = eu-central-1
+ ; aws_profile = bedrock-prod
+ ; ctx_size = 200000
+ ; sampler = temp=0.2 ; deterministic; Haiku 4.5 accepts ONE of temp / top_p
+ ; description = Claude Haiku 4.5 on Bedrock - hosted fast tier for autocomplete / FIM / quick Q&A
+
  [code-smart]
  tier = code
  role = agent
  hf_repo = unsloth/Qwen3-Coder-Next-GGUF
  hf_file = Qwen3-Coder-Next-Q4_K_M.gguf
  hf_file_next = Qwen3-Coder-Next-UD-Q4_K_XL.gguf
- ctx_size = 65536
+ ctx_size = 64000 ; 2x mid_fidelity_ceiling -- router steps down to code-fast past this, so a larger window would never get used
  size_gb = 45
  size_gb_next = 50
  quant = Q4_K_M
  quant_next = UD-Q4_K_XL
- status = ready (Q4_K_M); UD-Q4_K_XL queued
- opencode_use = agent.build + auto-agent tier
  sampler = temp=0.5, top_p=0.85, top_k=20, min_p=0.05, rep_pen=1.05 ; balanced agent
  description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops
 
+ ; Bedrock alternative for code-smart -- comment out the [code-smart] block
+ ; above and uncomment the block below to swap to a hosted heavy coder
+ ; (Claude Sonnet 4.6: agent-loop workhorse, heavy tool calling, multi-file
+ ; edits). See "BEDROCK NOTES" at the bottom of this file.
+ ;
+ ; [code-smart]
+ ; tier = code
+ ; role = agent
+ ; backend = bedrock
+ ; aws_model_id = eu.anthropic.claude-sonnet-4-6
+ ; aws_region = eu-central-1
+ ; aws_profile = bedrock-prod
+ ; ctx_size = 64000 ; 2x mid_fidelity_ceiling -- intentionally well below Sonnet 4.6's 200k native window; router steps down to code-fast past this rather than paying Sonnet's long-context $cost/latency
+ ; sampler = temp=0.5 ; Sonnet 4.6 accepts ONE of temp / top_p; pick `temp` for agent work
+ ; description = Claude Sonnet 4.6 on Bedrock - heavy coder for agent loops
+
  ; Top-tier hosted coder. Shipped disabled because it requires boto3 +
  ; AWS Bedrock access. `llmstack install` auto-uncomments the block
  ; below (by stripping the leading "; " from each line and dropping
@@ -107,10 +135,9 @@ description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops
  ; role = ultra
  ; backend = bedrock
  ; aws_model_id = global.anthropic.claude-opus-4-7 ; global.* cross-region inference profile
- ; aws_region = us-east-1 ; API anchor region; global.* auto-routes inference cross-region
- ; aws_profile = bedrock-prod ; uncomment + set your own profile name; falls back to default cred chain otherwise
- ; ctx_size = 200000
- ; opencode_use = on-demand top-tier coder for hard agent tasks
+ ; aws_region = eu-central-1 ; API anchor region; global.* auto-routes inference cross-region (set EU as the anchor for residency)
+ ; aws_profile = bedrock-prod ; conventional profile name; configure once with `aws configure --profile bedrock-prod` (or change to your own and run `llmstack install`)
+ ; ctx_size = 24000 ; 2x high_fidelity_ceiling -- intentionally well below Opus 4.7's 200k native window; we only invoke ultra for short prompts, where it's still cheap+fast, then step down past this
  ; ; NB: no `sampler =` line. Claude Opus 4.7 explicitly rejects all
  ; ; sampler params (temperature, top_p, top_k) -- per the Bedrock
  ; ; model card, "the recommended migration path is to omit these
@@ -134,11 +161,26 @@ size_gb = 9.2
  size_gb_next = 12.1
  quant = Q4_K_M
  quant_next = Q6_K
- status = ready (Q4_K_M); Q6_K queued
- opencode_use = agent.plan + auto-plan tier
  sampler = temp=0.7, top_p=0.9, top_k=40, min_p=0.05 ; creative thinking
  description = Qwopus GLM 18B - planning, design discussions, architecture
 
+ ; Bedrock alternative for plan -- comment out the [plan] block above and
+ ; uncomment the block below to swap to a hosted planner (Claude Opus 4.6:
+ ; deep reasoning for design discussions and architecture). Opus 4.6 still
+ ; accepts both temperature and top_p (unlike 4.7), so the local sampler
+ ; maps over cleanly. See "BEDROCK NOTES" at the bottom of this file.
+ ;
+ ; [plan]
+ ; tier = chat
+ ; role = plan
+ ; backend = bedrock
+ ; aws_model_id = eu.anthropic.claude-opus-4-6-v1
+ ; aws_region = eu-central-1
+ ; aws_profile = bedrock-prod
+ ; ctx_size = 200000
+ ; sampler = temp=0.7, top_p=0.9 ; creative; Opus 4.6 accepts both
+ ; description = Claude Opus 4.6 on Bedrock - planning, design discussions, architecture
+
  [plan-uncensored]
  tier = chat
  role = plan-uncensored
@@ -150,11 +192,51 @@ size_gb = 13
  size_gb_next = 20
  quant = i1-Q4_K_M
  quant_next = i1-Q6_K
- status = ready (i1-Q4_K_M); i1-Q6_K queued
- opencode_use = agent.plan-nofilter + auto via [nofilter] trigger
  sampler = temp=0.85, top_p=0.95, top_k=50, min_p=0.05 ; max exploration
  description = Mistral-Small 3.2 24B Heretic - no-filter planning
 
+ ; Bedrock alternative for plan-uncensored -- comment out the [plan-uncensored]
+ ; block above and uncomment ONE of the blocks below. Anthropic models on
+ ; Bedrock are filtered, so for the uncensored slot we pick the largest
+ ; open-weights model on Bedrock: Llama 3.1 405B has minimal safety post-
+ ; training and matches the spirit of the local Heretic tier. NOTE: Meta
+ ; models do NOT require the AWS use-case form, so this swap unblocks
+ ; plan-uncensored on a fresh AWS account.
+ ;
+ ; REGION CAVEAT: unlike the other tiers above, Llama 3.1 405B has NO
+ ; cross-region inference profile (no eu.* / global.*) and is only
+ ; deployed in US regions. Pin to us-west-2 even when the rest of the
+ ; stack is anchored in eu-central-1. If EU residency is mandatory for
+ ; this tier, switch to one of the eu.anthropic.* IDs at the cost of
+ ; losing the "uncensored" property. See "BEDROCK NOTES" at the bottom
+ ; of this file.
+ ;
+ ; [plan-uncensored]
+ ; tier = chat
+ ; role = plan-uncensored
+ ; backend = bedrock
+ ; aws_model_id = meta.llama3-1-405b-instruct-v1:0
+ ; aws_region = us-west-2 ; Llama 405B has no EU deployment; keep on US
+ ; aws_profile = bedrock-prod
+ ; ctx_size = 128000
+ ; sampler = temp=0.85, top_p=0.95 ; max exploration
+ ; description = Llama 3.1 405B on Bedrock - no-filter planning
+ ;
+ ; ...or, if your org locks Bedrock access to a VPC endpoint, use this
+ ; variant instead (same model + sampler, with aws_endpoint_url set):
+ ;
+ ; [plan-uncensored]
+ ; tier = chat
+ ; role = plan-uncensored
+ ; backend = bedrock
+ ; aws_model_id = meta.llama3-1-405b-instruct-v1:0
+ ; aws_region = us-west-2 ; Llama 405B has no EU deployment
+ ; aws_profile = bedrock-prod
+ ; aws_endpoint_url = https://bedrock-runtime.us-west-2.vpce.amazonaws.com
+ ; ctx_size = 128000
+ ; sampler = temp=0.85, top_p=0.95
+ ; description = Llama 3.1 405B on Bedrock (VPC) - no-filter planning
+
  ;------------------------------------------------------------------------------
  [ROUTING]
  ; STEP-DOWN ladder: start at the top of the fidelity ladder for short
@@ -178,7 +260,11 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  ; 2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
  ;    tier configured -> code-ultra
  ; 3. PLAN signal words AND no code-block / agent verbs / tools
- ;    (pure design discussion) -> plan
+ ;    AND tokens <= [plan].ctx_size (pure design discussion that
+ ;    still fits the planner's window) -> plan
+ ;    ...if the plan tier's ctx_size is breached, the request
+ ;    falls through to the coding ladder below rather than being
+ ;    sent to a planner whose window can't hold the input.
  ; 4. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
  ;    tokens <= high_fidelity_ceiling AND no code-ultra -> code-smart
  ; 5. tokens <= mid_fidelity_ceiling -> code-smart
@@ -186,6 +272,24 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  ;    - if tools[] OR turns >= multi_turn (3B tool-calls badly) -> code-smart
  ;    - else -> code-fast
  ;
+ ; AUTO ROUTER MAX CONTEXT = [code-fast].ctx_size. The fast tier sits at
+ ; the bottom of the step-down ladder, so any context too big for the
+ ; tiers above lands on fast. Anything beyond fast's window has no
+ ; safe home and should be considered out of scope for `model = auto`
+ ; -- callers with such payloads should pass an explicit model name
+ ; instead of relying on the router.
+ ;
+ ; FIDELITY-CEILING <-> CTX-SIZE COUPLING. Each "fidelity" rung's
+ ; ceiling is half of the corresponding tier's ctx_size:
+ ;     high_fidelity_ceiling x 2 == [code-ultra].ctx_size
+ ;     mid_fidelity_ceiling  x 2 == [code-smart].ctx_size
+ ; This is deliberate: the ceiling marks where the tier still has
+ ; comfortable headroom; double the ceiling is where we'd be packing
+ ; the tier to its limit (and where the router has already stepped
+ ; down to the next tier). If you bump a ceiling, bump the matching
+ ; ctx_size in the tier section too -- otherwise the router will
+ ; route requests up to a tier whose window can't hold them.
+ ;
  ; The "high-fidelity" rung is gated on availability: when the
  ; [code-ultra] section is absent (or fails to load), rules (2) and (4)
  ; silently fall back to code-smart instead of routing to a tier that
@@ -194,8 +298,8 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  ; ROUTER_HIGH_FIDELITY_CEILING / ROUTER_MID_FIDELITY_CEILING /
  ; ROUTER_MULTI_TURN.
  ;
- high_fidelity_ceiling = 8000 ; tokens; below this, top-tier model is still cheap+fast
- mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast
+ high_fidelity_ceiling = 12000 ; tokens; below this, top-tier model is still cheap+fast (and ultra ctx_size = 2 * this)
+ mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast (smart ctx_size = 2 * this)
  multi_turn = 6 ; turn count that floors the long-context rung at code-smart
  agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
  plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
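The ceiling/ctx_size coupling documented above can be spot-checked mechanically. The helper below is hypothetical, not part of the package; it assumes models.ini is readable from the working directory and simply skips tiers that are not active (such as the commented-out [code-ultra]).

    # Hypothetical consistency check for "ceiling x 2 == ctx_size" (not shipped).
    import configparser

    cfg = configparser.ConfigParser(inline_comment_prefixes=(";",))
    cfg.read("llmstack/models.ini")

    pairs = {"high_fidelity_ceiling": "code-ultra", "mid_fidelity_ceiling": "code-smart"}
    for ceiling_key, tier in pairs.items():
        if tier not in cfg:
            continue                    # e.g. [code-ultra] ships commented out
        ceiling = cfg["ROUTING"].getint(ceiling_key)
        ctx = cfg[tier].getint("ctx_size")
        status = "OK" if 2 * ceiling == ctx else "MISMATCH"
        print(f"{ceiling_key}={ceiling}  {tier}.ctx_size={ctx}  {status}")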
@@ -203,30 +307,46 @@ uncensored_triggers = [nofilter], [uncensored], [heretic], "uncensored:", "nof
  ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)
 
  ;------------------------------------------------------------------------------
- ; BEDROCK EXAMPLES (commented out -- copy / uncomment to adopt)
+ ; BEDROCK NOTES (referenced by the commented-out alternatives above)
  ;------------------------------------------------------------------------------
- ; To swap one of the local GGUF tiers above for an AWS Bedrock model, COMMENT
- ; OUT the existing tier of the same name and uncomment one of these. The router
- ; auto-detects backend=bedrock from the presence of `aws_model_id` -- no other
- ; flag needed. llama-swap won't load it; the router calls Bedrock directly via
- ; boto3 (`pip install 'llmstack[bedrock]'`).
+ ; Each tier section above carries a "Bedrock alternative for <tier>" block
+ ; directly underneath it (commented out by default). To swap a tier:
+ ;
+ ;   1. comment out the active local section (GGUF by default);
+ ;   2. uncomment the Bedrock-alternative block beneath it;
+ ;   3. run `llmstack install` (and `llmstack restart` if the tier was
+ ;      already loaded -- bedrock creds aren't picked up live).
  ;
- ; Credentials: this file ONLY names a profile. The actual keys / SSO /
- ; role chaining live in the standard AWS config files. One-time setup:
+ ; The router auto-detects backend=bedrock from `aws_model_id`, but every
+ ; alternative block also sets `backend = bedrock` explicitly so the intent
+ ; is obvious. llama-swap won't load bedrock tiers; the router calls
+ ; Bedrock directly via boto3 (`pip install 'llmstack[bedrock]'`).
+ ;
+ ; PROFILE: every alternative uses `aws_profile = bedrock-prod`, the
+ ; conventional profile name for this stack. The actual keys / SSO /
+ ; role chaining live in the standard AWS config files (this file ONLY
+ ; names a profile -- never put credentials here). One-time setup:
  ;
  ;     aws configure --profile bedrock-prod
- ;     # for SSO: aws configure sso --profile bedrock-prod
- ;     # for role chaining, edit ~/.aws/config and add a profile with:
+ ;     # SSO: aws configure sso --profile bedrock-prod
+ ;     # role chaining: edit ~/.aws/config and add:
+ ;     #   [profile bedrock-prod]
  ;     #   role_arn = arn:aws:iam::123456789012:role/llmstack-bedrock
- ;     #   source_profile = bedrock-prod
+ ;     #   source_profile = bedrock-prod-base
+ ;
+ ; To use a different profile name, edit the `aws_profile` line. To fall
+ ; back on boto3's default chain (env vars, default profile, instance
+ ; role), remove the line entirely.
  ;
- ; Then reference the profile name from your tier with `aws_profile = ...`.
- ; If you omit `aws_profile`, boto3's default chain applies (env vars,
- ; default profile, instance role -- whatever boto3 normally finds).
+ ; UPGRADE PRE-STAGING: optional `aws_model_id_next` (+ `aws_region_next`)
+ ; is the queued upgrade target -- mirrors gguf `hf_file_next`. The router
+ ; uses it only when `llmstack start --next` is in effect; permanent
+ ; promotion is the same as gguf: edit `aws_model_id` and re-run
+ ; `llmstack install`.
  ;
- ; SAMPLER NOTE: the `sampler = temp=..., top_p=..., top_k=..., ...`
- ; line on each tier is the SINGLE SOURCE OF TRUTH for sampling, but how
- ; it gets applied depends on the backend:
+ ; SAMPLER: the `sampler = temp=..., top_p=..., top_k=..., ...` line on
+ ; each tier is the SINGLE SOURCE OF TRUTH for sampling, but how it gets
+ ; applied depends on the backend:
  ;
  ;   * gguf tiers -- the llama-swap generator bakes the sampler keys
  ;     into the llama-server startup command line as `--temp`,
@@ -248,57 +368,21 @@ ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)
  ;   opencode.json is sampler-free in both cases by design (the
  ;   opencode.json generator never emits sampler params on agents).
  ;
- ; Per-Bedrock-family rules (as of 2026):
+ ; Per-Bedrock-family sampler rules (as of 2026):
  ;
  ;   * Claude Opus 4.7+ -- rejects all sampler params; OMIT `sampler =`
  ;     entirely (the router will then pass requests through untouched).
- ;   * Claude Sonnet 4.5 / Haiku 4.5 -- accept `temp` OR `top_p`, never
- ;     both; pick one.
+ ;   * Claude Sonnet 4.5 / 4.6 / Haiku 4.5 -- accept `temp` OR `top_p`,
+ ;     never both; pick one.
  ;   * Claude Opus 4.x (4.1, 4.5, 4.6) -- accept `temp` and `top_p`.
- ;   * Llama / Titan / Cohere / etc. -- accept `temp` + `top_p`; check
- ;     the model card if in doubt.
- ;
- ; Example A: top-tier coder on Bedrock (us-west-2), default cred chain.
- ; Optional `aws_model_id_next` (and optional `aws_region_next`) is the
- ; queued upgrade target -- mirrors gguf `hf_file_next`. The router uses
- ; it only when `--next` is in effect; permanent promotion is the same
- ; as gguf: edit `aws_model_id` and re-run `llmstack install`.
- ;
- ; [code-smart]
- ; tier = code
- ; role = agent
- ; backend = bedrock
- ; aws_model_id = anthropic.claude-sonnet-4-5-20250929-v1:0
- ; aws_region = us-west-2
- ; aws_model_id_next = anthropic.claude-sonnet-5-20260201-v1:0 ; queued
- ; aws_region_next = us-east-1 ; (optional) different region for the new model
- ; ctx_size = 200000
- ; sampler = temp=0.5 ; Sonnet 4.5 accepts ONE of temp / top_p; pick `temp` for agent work
- ; description = Claude Sonnet 4.5 on Bedrock - heavy coder for agent loops
- ;
- ; Example B: planner in a different AWS account, accessed via a named
- ; profile that itself uses role-chaining + SSO under ~/.aws/config.
- ; (Different tier => different profile name; different account/region.)
+ ;   * Llama / Titan / Mistral / Cohere / Nova / etc. -- accept `temp`
+ ;     + `top_p`; check the model card if in doubt.
  ;
- ; [plan]
- ; tier = chat
- ; role = plan
- ; aws_model_id = us.anthropic.claude-opus-4-1-20250805-v1:0
- ; aws_region = us-east-1
- ; aws_profile = bedrock-planning
- ; ctx_size = 200000
- ; sampler = temp=0.7, top_p=0.9
- ; description = Claude Opus 4.1 on Bedrock - planning, design discussions
- ;
- ; Example C: large model behind a VPC endpoint.
- ;
- ; [plan-uncensored]
- ; tier = chat
- ; role = plan-uncensored
- ; aws_model_id = meta.llama3-1-405b-instruct-v1:0
- ; aws_region = us-west-2
- ; aws_profile = bedrock-prod
- ; aws_endpoint_url = https://bedrock-runtime.us-west-2.vpce.amazonaws.com
- ; ctx_size = 128000
- ; sampler = temp=0.85, top_p=0.95
- ; description = Llama 3.1 405B on Bedrock - max-exploration planning
+ ; ACCESS: Anthropic Claude on Bedrock requires a one-time use-case-form
+ ; approval per AWS account (Bedrock console -> Model catalog -> pick the
+ ; model -> fill the form). Approval is account-level and persists; once
+ ; granted, every Claude variant works (bare ID, us./eu./global. cross-
+ ; region profile, application inference profile ARN). To skip the form
+ ; entirely, use the Llama 3.1 405B variant under [plan-uncensored] (Meta
+ ; models don't require the form) or pick another non-Anthropic family
+ ; (Amazon Nova, Mistral, Cohere, Titan).
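To make the backend=bedrock path concrete, here is a minimal sketch of a direct call following the conventions above (profile bedrock-prod, EU anchor region, one of temp/top_p for Haiku 4.5). It is an assumption about shape only -- the router's actual Bedrock client code is not part of this diff.

    # Sketch only: a hand-rolled Converse call mirroring the [code-fast] alternative.
    import boto3

    session = boto3.Session(profile_name="bedrock-prod")        # keys/SSO stay in ~/.aws
    client = session.client("bedrock-runtime", region_name="eu-central-1")

    resp = client.converse(
        modelId="eu.anthropic.claude-haiku-4-5-20251001-v1:0",  # id from the alternative block
        messages=[{"role": "user", "content": [{"text": "one-line fizzbuzz in python"}]}],
        inferenceConfig={"temperature": 0.2},                    # Haiku 4.5: temp OR top_p, not both
    )
    print(resp["output"]["message"]["content"][0]["text"])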
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opencode-llmstack
- Version: 0.7.2
+ Version: 0.8.0
  Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
  Author: llmstack
  License: MIT
@@ -542,8 +542,8 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
  | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
  | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
  | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
- | `ROUTER_HIGH_FIDELITY_CEILING` | `8000` | tokens; at or below this, route to top tier (ultra → smart fallback) |
- | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast` |
+ | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
+ | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
  | `ROUTER_MULTI_TURN` | `6` | turn count that floors the long-context rung at `code-smart` |
  | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
  | `LOG_LEVEL` | `info` | router log level |
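These knobs are plain environment variables read at startup; the snippet below is an illustration only (the getenv defaults are the ones from app.py above, the override value is an example), showing what the router would pick up. Per the coupling note, raising ROUTER_HIGH_FIDELITY_CEILING implies raising code-ultra's ctx_size to 2x the new value.

    # Example only: what the router reads after an env override.
    import os

    os.environ["ROUTER_HIGH_FIDELITY_CEILING"] = "16000"   # example override

    high = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
    mid = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
    print(f"high_fidelity_ceiling={high} mid_fidelity_ceiling={mid}")   # 16000 / 32000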
@@ -2,10 +2,10 @@ llmstack/AGENTS.md,sha256=4DVUkqJ1-EP-cDNRCpznzghOOX6dAMbVWdcwyfFCALw,528
  llmstack/__init__.py,sha256=EKHybZtPxLqFWkgkIoYBameu5_Tf9j4UewpANKm0fMU,855
  llmstack/__main__.py,sha256=wXHd5-BmCCHUfNEmy2rbilBSyVhi4KD1dSIO_4NlxuE,199
  llmstack/_platform.py,sha256=eDY3T9krkaBigG5xXxqzIbH3MhdZqX3BWe7bozOsAso,13099
- llmstack/app.py,sha256=fPyjqJ_4td7qs-OKuDsE1JzBtvNzVV9XYKF2WXBzRas,25795
+ llmstack/app.py,sha256=YfglFlzrp58mh8K1srQA6KNqc9cF41w1xnWnUrLW0IE,27839
  llmstack/check_models.py,sha256=WvTS2Td4acp-Q0-yWXUgXAgAgFOmpxiaeSDuAoivirw,4559
  llmstack/cli.py,sha256=Om70PzHrmU81y2Mw1sB6eeUs1fRHP0PnsCEVNC0UNvI,11341
- llmstack/models.ini,sha256=seGda3LWEREWBHnyVCv8f07XBtjkWFK9iBbKhu5yAl0,15351
+ llmstack/models.ini,sha256=wWAmbfKUCacjLXpBpH7tcgasHgMyOrhF_AmDLsmzptI,20339
  llmstack/paths.py,sha256=A8q4-tpwIt5UMGG5ZDESKSuViMGLbPIAL1VoONopJqU,11512
  llmstack/shell_env.py,sha256=MJSW0PP15q-fsppIZ98WZ7XoqYMZmDy4k8N0gzEA6wU,39362
  llmstack/tiers.py,sha256=et738dWftsc74ZElZ3Vt9eEF_SzgJCDuH9kBhzH-scI,14697
@@ -29,9 +29,9 @@ llmstack/download/binary.py,sha256=xpv15wF4viv8uFC5UqfSIf36CIoPpmaNUaVtjF-vTWA,8
  llmstack/download/ggufs.py,sha256=2hCr-svUiPIV2I3ruwTbXo6lPn9m-VBOqa3DFbvdIcA,5435
  llmstack/generators/__init__.py,sha256=LfbcReuyYBCdVuT9J5RKo7-f8n585YBU3Hus6DsxqTs,1189
  llmstack/generators/llama_swap.py,sha256=KdYH9N6TJECotZvyxvAjaa3kRyzn4YOi2T6D2UdyVKw,14785
- llmstack/generators/opencode.py,sha256=If7opOQyMWSSbHTj7M9dndsA3BmskSTUsTggMKV0VWM,10669
- opencode_llmstack-0.7.2.dist-info/METADATA,sha256=Uxw4Ln5LWGpBnBuejQfZM0K18JCYuHiez0hN1J-NgkM,34815
- opencode_llmstack-0.7.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
- opencode_llmstack-0.7.2.dist-info/entry_points.txt,sha256=soomjpqvl4KzFScgpQbu96vgcLriOtkB9MbiSC0rvZ8,47
- opencode_llmstack-0.7.2.dist-info/top_level.txt,sha256=tMv9sDWp8RW_DNNY8cuM4Uy4sND-KwTLcsScl5gdcEQ,9
- opencode_llmstack-0.7.2.dist-info/RECORD,,
+ llmstack/generators/opencode.py,sha256=s_FrLXUBnLzRGQovl1PcAEs7V_P52wT1vnvvxMcKfs4,11203
+ opencode_llmstack-0.8.0.dist-info/METADATA,sha256=kskFW_TAESnhrsu3ims1bMeLgANnnfWK8YDaaSlbnGQ,34914
+ opencode_llmstack-0.8.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+ opencode_llmstack-0.8.0.dist-info/entry_points.txt,sha256=soomjpqvl4KzFScgpQbu96vgcLriOtkB9MbiSC0rvZ8,47
+ opencode_llmstack-0.8.0.dist-info/top_level.txt,sha256=tMv9sDWp8RW_DNNY8cuM4Uy4sND-KwTLcsScl5gdcEQ,9
+ opencode_llmstack-0.8.0.dist-info/RECORD,,