opencode-llmstack 0.7.3__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmstack/app.py CHANGED
@@ -74,21 +74,35 @@ Routing decision tree (first match wins):
  2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
  ``ultra:``, ``opus:``) AND ultra tier configured -> code-ultra
  3. PLAN signal words AND no code-block / agent verbs / tools
- (design discussion, no implementation pending) -> plan
+ AND estimated tokens <= ``[plan]`` tier's ctx_size
+ (pure design discussion that fits the planner's
+ window) -> plan
+ (if the planner's
+ ctx_size is breached
+ we fall through to
+ the coding ladder
+ rather than send a
+ request that won't
+ fit -- the coding
+ tiers cover larger
+ windows by design)
  4. Estimated input tokens <= HIGH_FIDELITY_CEILING
  ("reasonable context still being built") -> code-ultra
  (else code-smart)
  5. Estimated input tokens <= MID_FIDELITY_CEILING -> code-smart
- 6. Otherwise (long context, top-tier becomes
- expensive/slow, fast tier's 128k window is the
- best fit and it's free) -> code-fast
+ 6. Otherwise (long context, top-tier becomes
+ expensive/slow, fast tier's 128k window is the
+ best fit and it's free) -> code-fast
  (floored at
  code-smart when
- ``tools[]`` is set
- or n_turns >=
- MULTI_TURN_THRESHOLD,
- since 3B models
- tool-call unreliably)
+ n_turns >=
+ MULTI_TURN_THRESHOLD)
+
+ The auto router's effective max context window is
+ ``[code-fast].ctx_size`` -- fast is the bottom of the step-down
+ ladder, so any context that would overflow the tiers above lands on
+ fast. Inputs longer than fast's window have no safe home and should
+ be considered out of scope for ``model = auto``.

  Ultra-tier routing is gated on availability: rule (2) and the
  "high-fidelity" rung of (4) first check that the tier is loaded
@@ -137,14 +151,21 @@ UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
  # est <= MID_FIDELITY_CEILING -> code-smart
  # est > MID_FIDELITY_CEILING -> code-fast (or smart with tools/loop)
  #
+ # Each ceiling is half of the corresponding tier's ``ctx_size`` in
+ # models.ini -- the ceiling marks where the tier still has comfortable
+ # headroom, and double the ceiling is where the router has already
+ # stepped down to the next tier (so the upper tier never has to handle
+ # inputs at its own limit).
+ #
  # Defaults:
- # HIGH 8000 - "reasonable context built": a couple of files loaded,
+ # HIGH 12000 - "reasonable context built": a couple of files loaded,
  # instructions clear, top-tier still cheap+fast here.
- # MID 32000 - half of code-smart's 65k window; past this, hosted
+ # Pairs with a 24k ctx_size on code-ultra.
+ # MID 32000 - half of code-smart's 64k window; past this, hosted
  # top-tier latency/$cost balloons and code-smart starts
  # getting cramped, while code-fast's 128k YaRN window
  # still has comfortable headroom.
- HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "8000"))
+ HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
  MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
  # Floor the long-context rung at code-smart whenever a tool-call
  # protocol is in play -- 3B models tool-call unreliably regardless of
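
The half-window coupling is plain arithmetic under the shipped defaults (the tier numbers come from the models.ini hunks later in this diff):

```python
# Shipped defaults: each fidelity ceiling is half the paired tier's window.
HIGH_FIDELITY_CEILING = 12000  # pairs with [code-ultra].ctx_size = 24000
MID_FIDELITY_CEILING = 32000   # pairs with [code-smart].ctx_size = 64000

assert HIGH_FIDELITY_CEILING * 2 == 24000  # code-ultra's window
assert MID_FIDELITY_CEILING * 2 == 64000   # code-smart's window
```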
@@ -332,25 +353,36 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
  ULTRA_MODEL, AGENT_MODEL)
  return AGENT_MODEL, f"ultra-trigger->agent ({ULTRA_MODEL} unavailable)"

- has_tools = bool(body.get("tools"))
  n_turns = len(messages) if messages else 0
  has_code_signal = (
  _matches(CODE_BLOCK, messages, prompt)
  or _matches(AGENT_SIGNALS, messages, prompt)
  )

+ est = _estimate_tokens(messages, prompt)
+
  # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
  # chat-tuned model meant for design / "should we" discussions. Only
  # take it when nothing about the request says "I'm about to write
- # code" (no triple-backticks, no agent verbs, no tool calls).
+ # code" (no triple-backticks, no agent verbs). Tools are stripped
+ # from the request body before dispatch (see ``_handle_completion``),
+ # so their presence here does not block plan routing.
+ # Only route to plan if the input fits in the planner's ctx_size --
+ # past that we fall through to the coding ladder which has tiers
+ # (smart, fast) explicitly sized for larger contexts.
  if (
- not has_tools
- and not has_code_signal
+ not has_code_signal
  and _matches(PLAN_SIGNALS, messages, prompt)
  ):
- return PLAN_MODEL, "plan-signal"
-
- est = _estimate_tokens(messages, prompt)
+ plan_tier = TIER_BY_ALIAS.get(PLAN_MODEL)
+ plan_ctx = plan_tier.ctx_size if plan_tier else 0
+ if not plan_ctx or est <= plan_ctx:
+ return PLAN_MODEL, "plan-signal"
+ log.info(
+ "plan-signal but tokens~%d > %s.ctx_size %d; "
+ "falling through to coding ladder",
+ est, PLAN_MODEL, plan_ctx,
+ )

  # Rung 1: short context -- start at the top.
  if est <= HIGH_FIDELITY_CEILING:
@@ -365,12 +397,12 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
  if est <= MID_FIDELITY_CEILING:
  return AGENT_MODEL, f"mid-fidelity tokens~{est}<={MID_FIDELITY_CEILING}"

- # Rung 3: long context -- step down to fast (128k YaRN, free,
- # always-resident). Floor at smart when tools/agent loop is in
- # play; the 3B coder doesn't tool-call reliably.
- if has_tools or n_turns >= MULTI_TURN_THRESHOLD:
- why = "tools" if has_tools else f"turns={n_turns}"
- return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} ({why} floor)"
+ # Rung 3: long context -- step down to fast. Floor at smart only
+ # when the multi-turn threshold is hit; tools alone no longer
+ # prevent the step-down (plan tiers strip tools before dispatch,
+ # and code-fast is a hosted model that tool-calls reliably).
+ if n_turns >= MULTI_TURN_THRESHOLD:
+ return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} (turns={n_turns} floor)"
  return FAST_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING}"


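In isolation, the reworked rung 3 behaves as follows -- a sketch with the shipped defaults inlined, not the module's actual code:

```python
# Rung 3 sketch: past MID_FIDELITY_CEILING only the turn count can
# floor the step-down now (tools alone no longer do).
MID_FIDELITY_CEILING, MULTI_TURN_THRESHOLD = 32000, 6

def rung3(est: int, n_turns: int) -> str:
    assert est > MID_FIDELITY_CEILING  # rungs 1-2 already declined
    return "code-smart" if n_turns >= MULTI_TURN_THRESHOLD else "code-fast"

assert rung3(50_000, 2) == "code-fast"   # long single-shot -> fast
assert rung3(50_000, 8) == "code-smart"  # deep agent loop -> smart floor
```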
@@ -590,6 +622,11 @@ async def _handle_completion(req: Request, path: str) -> Response:
  mutated = True

  chosen_name = body.get("model")
+ if chosen_name in {PLAN_MODEL, UNCENSORED_MODEL} and body.get("tools"):
+ log.info("plan tier %s: stripping tools from request", chosen_name)
+ body.pop("tools")
+ body.pop("tool_choice", None)
+ mutated = True
  tier = _resolve_tier(chosen_name)
  if tier is not None and _inject_sampler(body, tier):
  mutated = True
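
The strip itself is a plain dict mutation; a standalone sketch of the same semantics (`PLAN_TIERS` is illustrative -- the real check compares against PLAN_MODEL / UNCENSORED_MODEL):

```python
# Plan tiers lose ``tools`` and ``tool_choice`` before dispatch.
PLAN_TIERS = {"plan", "plan-uncensored"}  # illustrative names

def strip_tools(body: dict) -> bool:
    """Return True when the body was mutated."""
    if body.get("model") in PLAN_TIERS and body.get("tools"):
        body.pop("tools")              # guaranteed present by the guard
        body.pop("tool_choice", None)  # may be absent
        return True
    return False

body = {"model": "plan", "tools": [{"type": "function"}], "tool_choice": "auto"}
assert strip_tools(body)
assert "tools" not in body and "tool_choice" not in body
```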
@@ -172,9 +172,22 @@ def build_config(

  tier_sections = [s for s in cfg.sections() if s != "ROUTING"]

- # `auto` context = MIN across all tiers so opencode never packs a prompt
- # that overflows the tier the router actually picks.
- auto_ctx = min(
+ # `auto` context = the fast tier's ctx_size. The router runs a
+ # step-DOWN ladder (ultra -> smart -> fast as context grows), so
+ # the largest window in the ladder is fast's, and that's the
+ # effective ceiling for `model = auto` -- anything bigger has no
+ # tier to land on. Using `min(...)` here would clip opencode to
+ # the smallest tier's window even though the router would never
+ # actually send a long prompt to that tier.
+ fast_ctx = next(
+ (
+ _int(cfg[s].get("ctx_size", ""), 0)
+ for s in tier_sections
+ if (cfg[s].get("role") or "").strip() == "fast"
+ ),
+ 0,
+ )
+ auto_ctx = fast_ctx or max(
  (_int(cfg[s].get("ctx_size", ""), 0) for s in tier_sections),
  default=8192,
  ) or 8192
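
The new selection is easy to rehearse against an in-memory INI with the stock configparser -- a sketch with illustrative section values and plain `int()` standing in for the module's `_int` helper:

```python
# Rehearsal of the auto_ctx selection; section names/values mirror
# models.ini but are illustrative.
import configparser

cfg = configparser.ConfigParser()
cfg.read_string("""
[code-ultra]
role = ultra
ctx_size = 24000
[code-smart]
role = agent
ctx_size = 64000
[code-fast]
role = fast
ctx_size = 131072
""")

tier_sections = [s for s in cfg.sections() if s != "ROUTING"]
fast_ctx = next(
    (int(cfg[s].get("ctx_size", "0")) for s in tier_sections
     if (cfg[s].get("role") or "").strip() == "fast"),
    0,
)
auto_ctx = fast_ctx or max(
    (int(cfg[s].get("ctx_size", "0")) for s in tier_sections), default=8192
) or 8192
assert auto_ctx == 131072  # fast's window, where min(...) would give 24000
```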
llmstack/models.ini CHANGED
@@ -67,7 +67,7 @@ tier = code
  role = fast
  hf_repo = bartowski/Qwen2.5-Coder-3B-Instruct-GGUF
  hf_file = Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf
- ctx_size = 131072 ; native 32k extended via YaRN (factor 4)
+ ctx_size = 131072 ; native 32k extended via YaRN (factor 4); also defines the auto router's effective max -- contexts longer than this have nowhere to land, since fast is the bottom of the step-down ladder
  rope_scaling = yarn (scale=4, orig_ctx=32768)
  size_gb = 2.5
  quant = Q5_K_M
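
The YaRN numbers in this entry are internally consistent:

```python
# factor-4 YaRN extension of the 32k-native window (values from the entry above)
orig_ctx, scale = 32768, 4
assert orig_ctx * scale == 131072  # matches ctx_size
```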
@@ -97,7 +97,7 @@ role = agent
  hf_repo = unsloth/Qwen3-Coder-Next-GGUF
  hf_file = Qwen3-Coder-Next-Q4_K_M.gguf
  hf_file_next = Qwen3-Coder-Next-UD-Q4_K_XL.gguf
- ctx_size = 65536
+ ctx_size = 64000 ; 2x mid_fidelity_ceiling -- router steps down to code-fast past this, so a larger window would never get used
  size_gb = 45
  size_gb_next = 50
  quant = Q4_K_M
@@ -117,7 +117,7 @@ description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops
  ; aws_model_id = eu.anthropic.claude-sonnet-4-6
  ; aws_region = eu-central-1
  ; aws_profile = bedrock-prod
- ; ctx_size = 200000
+ ; ctx_size = 64000 ; 2x mid_fidelity_ceiling -- intentionally well below Sonnet 4.6's 200k native window; router steps down to code-fast past this rather than paying Sonnet's long-context $cost/latency
  ; sampler = temp=0.5 ; Sonnet 4.6 accepts ONE of temp / top_p; pick `temp` for agent work
  ; description = Claude Sonnet 4.6 on Bedrock - heavy coder for agent loops

@@ -137,7 +137,7 @@ description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops
  ; aws_model_id = global.anthropic.claude-opus-4-7 ; global.* cross-region inference profile
  ; aws_region = eu-central-1 ; API anchor region; global.* auto-routes inference cross-region (set EU as the anchor for residency)
  ; aws_profile = bedrock-prod ; conventional profile name; configure once with `aws configure --profile bedrock-prod` (or change to your own and run `llmstack install`)
- ; ctx_size = 200000
+ ; ctx_size = 24000 ; 2x high_fidelity_ceiling -- intentionally well below Opus 4.7's 200k native window; we only invoke ultra for short prompts, where it's still cheap+fast, then step down past this
  ; ; NB: no `sampler =` line. Claude Opus 4.7 explicitly rejects all
  ; ; sampler params (temperature, top_p, top_k) -- per the Bedrock
  ; ; model card, "the recommended migration path is to omit these
@@ -260,7 +260,11 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  ; 2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
  ; tier configured -> code-ultra
  ; 3. PLAN signal words AND no code-block / agent verbs / tools
- ; (pure design discussion) -> plan
+ ; AND tokens <= [plan].ctx_size (pure design discussion that
+ ; still fits the planner's window) -> plan
+ ; ...if the plan tier's ctx_size is breached, the request
+ ; falls through to the coding ladder below rather than being
+ ; sent to a planner whose window can't hold the input.
  ; 4. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
  ; tokens <= high_fidelity_ceiling AND no code-ultra -> code-smart
  ; 5. tokens <= mid_fidelity_ceiling -> code-smart
@@ -268,6 +272,24 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  ; - if tools[] OR turns >= multi_turn (3B tool-calls badly) -> code-smart
  ; - else -> code-fast
  ;
+ ; AUTO ROUTER MAX CONTEXT = [code-fast].ctx_size. The fast tier sits at
+ ; the bottom of the step-down ladder, so any context too big for the
+ ; tiers above lands on fast. Anything beyond fast's window has no
+ ; safe home and should be considered out of scope for `model = auto`
+ ; -- callers with such payloads should pass an explicit model name
+ ; instead of relying on the router.
+ ;
+ ; FIDELITY-CEILING <-> CTX-SIZE COUPLING. Each "fidelity" rung's
+ ; ceiling is half of the corresponding tier's ctx_size:
+ ; high_fidelity_ceiling x 2 == [code-ultra].ctx_size
+ ; mid_fidelity_ceiling x 2 == [code-smart].ctx_size
+ ; This is deliberate: the ceiling marks where the tier still has
+ ; comfortable headroom; double the ceiling is where we'd be packing
+ ; the tier to its limit (and where the router has already stepped
+ ; down to the next tier). If you bump a ceiling, bump the matching
+ ; ctx_size in the tier section too -- otherwise the router will
+ ; route requests up to a tier whose window can't hold them.
+ ;
  ; The "high-fidelity" rung is gated on availability: when the
  ; [code-ultra] section is absent (or fails to load), rules (2) and (4)
  ; silently fall back to code-smart instead of routing to a tier that
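
The coupling comment implies an invariant a config check could enforce. A hedged sketch, with hypothetical dicts standing in for the parsed models.ini (this is not a lint the package ships):

```python
# Hypothetical lint for the ceiling <-> ctx_size coupling described above;
# `routing` / `tiers` are illustrative parsed views of models.ini.
routing = {"high_fidelity_ceiling": 12000, "mid_fidelity_ceiling": 32000}
tiers = {"code-ultra": 24000, "code-smart": 64000}

for ceiling_key, tier in (("high_fidelity_ceiling", "code-ultra"),
                          ("mid_fidelity_ceiling", "code-smart")):
    if tier in tiers and routing[ceiling_key] * 2 != tiers[tier]:
        raise ValueError(
            f"{ceiling_key}={routing[ceiling_key]} should be half of "
            f"[{tier}].ctx_size={tiers[tier]}"
        )
```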
@@ -276,8 +298,8 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  ; ROUTER_HIGH_FIDELITY_CEILING / ROUTER_MID_FIDELITY_CEILING /
  ; ROUTER_MULTI_TURN.
  ;
- high_fidelity_ceiling = 8000 ; tokens; below this, top-tier model is still cheap+fast
- mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast
+ high_fidelity_ceiling = 12000 ; tokens; below this, top-tier model is still cheap+fast (and ultra ctx_size = 2 * this)
+ mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast (smart ctx_size = 2 * this)
  multi_turn = 6 ; turn count that floors the long-context rung at code-smart
  agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
  plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opencode-llmstack
- Version: 0.7.3
+ Version: 0.9.0
  Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
  Author: llmstack
  License: MIT
@@ -542,8 +542,8 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
  | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
  | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
  | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
- | `ROUTER_HIGH_FIDELITY_CEILING` | `8000` | tokens; at or below this, route to top tier (ultra → smart fallback) |
- | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast` |
+ | `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
+ | `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
  | `ROUTER_MULTI_TURN` | `6` | turn count that floors the long-context rung at `code-smart` |
  | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
  | `LOG_LEVEL` | `info` | router log level |
@@ -2,10 +2,10 @@ llmstack/AGENTS.md,sha256=4DVUkqJ1-EP-cDNRCpznzghOOX6dAMbVWdcwyfFCALw,528
  llmstack/__init__.py,sha256=EKHybZtPxLqFWkgkIoYBameu5_Tf9j4UewpANKm0fMU,855
  llmstack/__main__.py,sha256=wXHd5-BmCCHUfNEmy2rbilBSyVhi4KD1dSIO_4NlxuE,199
  llmstack/_platform.py,sha256=eDY3T9krkaBigG5xXxqzIbH3MhdZqX3BWe7bozOsAso,13099
- llmstack/app.py,sha256=fPyjqJ_4td7qs-OKuDsE1JzBtvNzVV9XYKF2WXBzRas,25795
+ llmstack/app.py,sha256=Fha6Ivb-lsnoWVAK3ekzRlaLqQ1bIEavipgPP9W_TuQ,27888
  llmstack/check_models.py,sha256=WvTS2Td4acp-Q0-yWXUgXAgAgFOmpxiaeSDuAoivirw,4559
  llmstack/cli.py,sha256=Om70PzHrmU81y2Mw1sB6eeUs1fRHP0PnsCEVNC0UNvI,11341
- llmstack/models.ini,sha256=kmfX_9WHEqnjRfF7srT6zesfC_YIp-0MmW0YbfFkXD8,18381
+ llmstack/models.ini,sha256=wWAmbfKUCacjLXpBpH7tcgasHgMyOrhF_AmDLsmzptI,20339
  llmstack/paths.py,sha256=A8q4-tpwIt5UMGG5ZDESKSuViMGLbPIAL1VoONopJqU,11512
  llmstack/shell_env.py,sha256=MJSW0PP15q-fsppIZ98WZ7XoqYMZmDy4k8N0gzEA6wU,39362
  llmstack/tiers.py,sha256=et738dWftsc74ZElZ3Vt9eEF_SzgJCDuH9kBhzH-scI,14697
@@ -29,9 +29,9 @@ llmstack/download/binary.py,sha256=xpv15wF4viv8uFC5UqfSIf36CIoPpmaNUaVtjF-vTWA,8
  llmstack/download/ggufs.py,sha256=2hCr-svUiPIV2I3ruwTbXo6lPn9m-VBOqa3DFbvdIcA,5435
  llmstack/generators/__init__.py,sha256=LfbcReuyYBCdVuT9J5RKo7-f8n585YBU3Hus6DsxqTs,1189
  llmstack/generators/llama_swap.py,sha256=KdYH9N6TJECotZvyxvAjaa3kRyzn4YOi2T6D2UdyVKw,14785
- llmstack/generators/opencode.py,sha256=If7opOQyMWSSbHTj7M9dndsA3BmskSTUsTggMKV0VWM,10669
- opencode_llmstack-0.7.3.dist-info/METADATA,sha256=sobMO1qeP8dsGlofz-odTUKS2jNzKHPDneQcy_WyHz4,34815
- opencode_llmstack-0.7.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
- opencode_llmstack-0.7.3.dist-info/entry_points.txt,sha256=soomjpqvl4KzFScgpQbu96vgcLriOtkB9MbiSC0rvZ8,47
- opencode_llmstack-0.7.3.dist-info/top_level.txt,sha256=tMv9sDWp8RW_DNNY8cuM4Uy4sND-KwTLcsScl5gdcEQ,9
- opencode_llmstack-0.7.3.dist-info/RECORD,,
+ llmstack/generators/opencode.py,sha256=s_FrLXUBnLzRGQovl1PcAEs7V_P52wT1vnvvxMcKfs4,11203
+ opencode_llmstack-0.9.0.dist-info/METADATA,sha256=WSRM1_jNIIwH9zBhb41tvEiHDPSbdara_FoHqFLgWj4,34914
+ opencode_llmstack-0.9.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+ opencode_llmstack-0.9.0.dist-info/entry_points.txt,sha256=soomjpqvl4KzFScgpQbu96vgcLriOtkB9MbiSC0rvZ8,47
+ opencode_llmstack-0.9.0.dist-info/top_level.txt,sha256=tMv9sDWp8RW_DNNY8cuM4Uy4sND-KwTLcsScl5gdcEQ,9
+ opencode_llmstack-0.9.0.dist-info/RECORD,,