opencode-llmstack 0.7.3__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/PKG-INFO +3 -3
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/README.md +2 -2
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/app.py +62 -25
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/generators/opencode.py +16 -3
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/models.ini +29 -7
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/opencode_llmstack.egg-info/PKG-INFO +3 -3
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/pyproject.toml +1 -1
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/AGENTS.md +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/__init__.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/__main__.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/_platform.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/backends/__init__.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/backends/bedrock.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/check_models.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/cli.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/__init__.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/_helpers.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/activate.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/check.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/download.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/install.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/install_llama_swap.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/reload.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/restart.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/setup.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/start.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/status.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/commands/stop.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/download/__init__.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/download/binary.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/download/ggufs.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/generators/__init__.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/generators/llama_swap.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/paths.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/shell_env.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/tiers.py +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/opencode_llmstack.egg-info/SOURCES.txt +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/opencode_llmstack.egg-info/dependency_links.txt +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/opencode_llmstack.egg-info/entry_points.txt +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/opencode_llmstack.egg-info/requires.txt +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/opencode_llmstack.egg-info/top_level.txt +0 -0
- {opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/setup.cfg +0 -0
{opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opencode-llmstack
-Version: 0.7.3
+Version: 0.9.0
 Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
 Author: llmstack
 License: MIT
@@ -542,8 +542,8 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
 | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
 | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
-| `ROUTER_HIGH_FIDELITY_CEILING` | `
-| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast` |
+| `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
+| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `6` | turn count that floors the long-context rung at `code-smart` |
 | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
 | `LOG_LEVEL` | `info` | router log level |
{opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/README.md

@@ -507,8 +507,8 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
 | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
 | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
-| `ROUTER_HIGH_FIDELITY_CEILING` | `
-| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast` |
+| `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
+| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `6` | turn count that floors the long-context rung at `code-smart` |
 | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
 | `LOG_LEVEL` | `info` | router log level |
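Taken together, the two ceilings and their paired ctx_size values define a three-rung step-down ladder for `model = auto`. A minimal sketch of just the token rungs, using the defaults from the table above (the plan/ultra triggers and the multi-turn floor, handled by the router code further down this diff, are omitted; this is illustrative, not the package's classify()):

    HIGH_FIDELITY_CEILING = 12000   # paired with code-ultra.ctx_size = 24000
    MID_FIDELITY_CEILING = 32000    # paired with code-smart.ctx_size = 64000

    def rung(est_tokens: int) -> str:
        if est_tokens <= HIGH_FIDELITY_CEILING:
            return "code-ultra"     # falls back to code-smart if ultra isn't loaded
        if est_tokens <= MID_FIDELITY_CEILING:
            return "code-smart"
        return "code-fast"          # bottom of the ladder: the 128k YaRN window

    assert rung(8_000) == "code-ultra"
    assert rung(20_000) == "code-smart"
    assert rung(90_000) == "code-fast"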
{opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/app.py

@@ -74,21 +74,35 @@ Routing decision tree (first match wins):
 2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
    ``ultra:``, ``opus:``) AND ultra tier configured -> code-ultra
 3. PLAN signal words AND no code-block / agent verbs / tools
-
+   AND estimated tokens <= ``[plan]`` tier's ctx_size
+   (pure design discussion that fits the planner's
+   window) -> plan
+   (if the planner's
+   ctx_size is breached
+   we fall through to
+   the coding ladder
+   rather than send a
+   request that won't
+   fit -- the coding
+   tiers cover larger
+   windows by design)
 4. Estimated input tokens <= HIGH_FIDELITY_CEILING
    ("reasonable context still being built") -> code-ultra
    (else code-smart)
 5. Estimated input tokens <= MID_FIDELITY_CEILING -> code-smart
-
-
-
+6. Otherwise (long context, top-tier becomes
+   expensive/slow, fast tier's 128k window is the
+   best fit and it's free) -> code-fast
    (floored at
    code-smart when
-
-
-
-
-
+   n_turns >=
+   MULTI_TURN_THRESHOLD)
+
+The auto router's effective max context window is
+``[code-fast].ctx_size`` -- fast is the bottom of the step-down
+ladder, so any context that would overflow the tiers above lands on
+fast. Inputs longer than fast's window have no safe home and should
+be considered out of scope for ``model = auto``.
 
 Ultra-tier routing is gated on availability: rule (2) and the
 "high-fidelity" rung of (4) first check that the tier is loaded
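Condensed, the six rules above are an ordered chain of checks. The sketch below is illustrative rather than the module's actual code: the boolean parameters stand in for the router's trigger/regex matching, the numeric defaults come from the knob table, and the ultra availability fallback is elided.

    def route(est: int, n_turns: int, *, nofilter: bool, ultra_trigger: bool,
              plan_signal: bool, code_signal: bool, plan_ctx: int) -> str:
        if nofilter:                          # 1. [nofilter] trigger
            return "plan-uncensored"
        if ultra_trigger:                     # 2. [ultra]/[opus] trigger
            return "code-ultra"
        if plan_signal and not code_signal and est <= plan_ctx:
            return "plan"                     # 3. design talk that fits the planner
        if est <= 12000:                      # 4. HIGH_FIDELITY_CEILING
            return "code-ultra"
        if est <= 32000:                      # 5. MID_FIDELITY_CEILING
            return "code-smart"
        if n_turns >= 6:                      # 6. long context, multi-turn floor
            return "code-smart"
        return "code-fast"                    # 6. long context -> bottom of ladder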
@@ -137,14 +151,21 @@ UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
 #   est <= MID_FIDELITY_CEILING -> code-smart
 #   est >  MID_FIDELITY_CEILING -> code-fast (or smart with tools/loop)
 #
+# Each ceiling is half of the corresponding tier's ``ctx_size`` in
+# models.ini -- the ceiling marks where the tier still has comfortable
+# headroom, and double the ceiling is where the router has already
+# stepped down to the next tier (so the upper tier never has to handle
+# inputs at its own limit).
+#
 # Defaults:
-#   HIGH
+#   HIGH 12000 - "reasonable context built": a couple of files loaded,
 #                instructions clear, top-tier still cheap+fast here.
-#
+#                Pairs with a 24k ctx_size on code-ultra.
+#   MID 32000  - half of code-smart's 64k window; past this, hosted
 #                top-tier latency/$cost balloons and code-smart starts
 #                getting cramped, while code-fast's 128k YaRN window
 #                still has comfortable headroom.
-HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "
+HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
 MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
 # Floor the long-context rung at code-smart whenever a tool-call
 # protocol is in play -- 3B models tool-call unreliably regardless of
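The ceilings compare against an estimated token count, not a real tokenizer pass; `_estimate_tokens` is referenced below but its body is not part of this diff. For a sense of the scale involved, a common cheap heuristic (an assumption here, not necessarily what the package does) is roughly four characters per token:

    def estimate_tokens(messages: list[dict], prompt: str | None) -> int:
        # Hypothetical stand-in for _estimate_tokens: ~4 chars/token.
        text = prompt or ""
        for m in messages or []:
            content = m.get("content")
            if isinstance(content, str):
                text += content
        return len(text) // 4

Under that heuristic the 12000-token HIGH ceiling corresponds to roughly 48k characters of prompt plus history before the router steps off the top tier.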
@@ -332,25 +353,36 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
                  ULTRA_MODEL, AGENT_MODEL)
         return AGENT_MODEL, f"ultra-trigger->agent ({ULTRA_MODEL} unavailable)"
 
-    has_tools = bool(body.get("tools"))
     n_turns = len(messages) if messages else 0
     has_code_signal = (
         _matches(CODE_BLOCK, messages, prompt)
         or _matches(AGENT_SIGNALS, messages, prompt)
     )
 
+    est = _estimate_tokens(messages, prompt)
+
     # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
     # chat-tuned model meant for design / "should we" discussions. Only
     # take it when nothing about the request says "I'm about to write
-    # code" (no triple-backticks, no agent verbs
+    # code" (no triple-backticks, no agent verbs). Tools are stripped
+    # from the request body before dispatch (see ``_handle_completion``),
+    # so their presence here does not block plan routing.
+    # Only route to plan if the input fits in the planner's ctx_size --
+    # past that we fall through to the coding ladder which has tiers
+    # (smart, fast) explicitly sized for larger contexts.
     if (
-        not has_tools
-        and not has_code_signal
+        not has_code_signal
         and _matches(PLAN_SIGNALS, messages, prompt)
     ):
-
-
-
+        plan_tier = TIER_BY_ALIAS.get(PLAN_MODEL)
+        plan_ctx = plan_tier.ctx_size if plan_tier else 0
+        if not plan_ctx or est <= plan_ctx:
+            return PLAN_MODEL, "plan-signal"
+        log.info(
+            "plan-signal but tokens~%d > %s.ctx_size %d; "
+            "falling through to coding ladder",
+            est, PLAN_MODEL, plan_ctx,
+        )
 
     # Rung 1: short context -- start at the top.
     if est <= HIGH_FIDELITY_CEILING:
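The effect of the new gate: a plan-signal request reaches the planner only while the estimate fits the planner's window; past that it falls through instead of being truncated. With a toy ctx_size (the [plan] tier's real value lives in models.ini and is not part of this diff):

    plan_ctx = 16000  # assumed value, for illustration only
    for est in (4_000, 40_000):
        target = "plan" if est <= plan_ctx else "coding ladder (smart/fast)"
        print(f"tokens~{est} -> {target}")
    # tokens~4000 -> plan
    # tokens~40000 -> coding ladder (smart/fast)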
@@ -365,12 +397,12 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
     if est <= MID_FIDELITY_CEILING:
         return AGENT_MODEL, f"mid-fidelity tokens~{est}<={MID_FIDELITY_CEILING}"
 
-    # Rung 3: long context -- step down to fast
-    #
-    #
-
-
-    return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} ({
+    # Rung 3: long context -- step down to fast. Floor at smart only
+    # when the multi-turn threshold is hit; tools alone no longer
+    # prevent the step-down (plan tiers strip tools before dispatch,
+    # and code-fast is a hosted model that tool-calls reliably).
+    if n_turns >= MULTI_TURN_THRESHOLD:
+        return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} (turns={n_turns} floor)"
     return FAST_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING}"
 
 
@@ -590,6 +622,11 @@ async def _handle_completion(req: Request, path: str) -> Response:
         mutated = True
 
     chosen_name = body.get("model")
+    if chosen_name in {PLAN_MODEL, UNCENSORED_MODEL} and body.get("tools"):
+        log.info("plan tier %s: stripping tools from request", chosen_name)
+        body.pop("tools")
+        body.pop("tool_choice", None)
+        mutated = True
     tier = _resolve_tier(chosen_name)
     if tier is not None and _inject_sampler(body, tier):
         mutated = True
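On an OpenAI-style request body, the new stripping step amounts to the following self-contained sketch (default tier names; the real handler also logs and marks the body mutated, as shown above):

    body = {
        "model": "plan",
        "messages": [{"role": "user", "content": "Should we shard the DB?"}],
        "tools": [{"type": "function", "function": {"name": "bash"}}],
        "tool_choice": "auto",
    }
    if body.get("model") in {"plan", "plan-uncensored"} and body.get("tools"):
        body.pop("tools")
        body.pop("tool_choice", None)  # tolerate an absent key
    assert "tools" not in body and "tool_choice" not in body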
{opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/generators/opencode.py

@@ -172,9 +172,22 @@ def build_config(
 
     tier_sections = [s for s in cfg.sections() if s != "ROUTING"]
 
-    # `auto` context =
-    #
-
+    # `auto` context = the fast tier's ctx_size. The router runs a
+    # step-DOWN ladder (ultra -> smart -> fast as context grows), so
+    # the largest window in the ladder is fast's, and that's the
+    # effective ceiling for `model = auto` -- anything bigger has no
+    # tier to land on. Using `min(...)` here would clip opencode to
+    # the smallest tier's window even though the router would never
+    # actually send a long prompt to that tier.
+    fast_ctx = next(
+        (
+            _int(cfg[s].get("ctx_size", ""), 0)
+            for s in tier_sections
+            if (cfg[s].get("role") or "").strip() == "fast"
+        ),
+        0,
+    )
+    auto_ctx = fast_ctx or max(
         (_int(cfg[s].get("ctx_size", ""), 0) for s in tier_sections),
         default=8192,
     ) or 8192
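The lookup is plain configparser over the tier sections. A standalone sketch against an in-memory config (key names follow the models.ini conventions in this diff; passing `inline_comment_prefixes` to handle the `;` comments is an assumption about the parser setup, and plain `int()` stands in for the module's `_int` helper):

    import configparser
    import textwrap

    cfg = configparser.ConfigParser(inline_comment_prefixes=(";",))
    cfg.read_string(textwrap.dedent("""
        [code-smart]
        role = agent
        ctx_size = 64000 ; 2x mid_fidelity_ceiling
        [code-fast]
        role = fast
        ctx_size = 131072 ; native 32k extended via YaRN
        """))

    tier_sections = [s for s in cfg.sections() if s != "ROUTING"]
    fast_ctx = next(
        (int(cfg[s].get("ctx_size", "0")) for s in tier_sections
         if (cfg[s].get("role") or "").strip() == "fast"),
        0,
    )
    assert fast_ctx == 131072  # the window `model = auto` advertises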
{opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/llmstack/models.ini

@@ -67,7 +67,7 @@ tier = code
 role = fast
 hf_repo = bartowski/Qwen2.5-Coder-3B-Instruct-GGUF
 hf_file = Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf
-ctx_size = 131072 ; native 32k extended via YaRN (factor 4)
+ctx_size = 131072 ; native 32k extended via YaRN (factor 4); also defines the auto router's effective max -- contexts longer than this have nowhere to land, since fast is the bottom of the step-down ladder
 rope_scaling = yarn (scale=4, orig_ctx=32768)
 size_gb = 2.5
 quant = Q5_K_M
@@ -97,7 +97,7 @@ role = agent
 hf_repo = unsloth/Qwen3-Coder-Next-GGUF
 hf_file = Qwen3-Coder-Next-Q4_K_M.gguf
 hf_file_next = Qwen3-Coder-Next-UD-Q4_K_XL.gguf
-ctx_size =
+ctx_size = 64000 ; 2x mid_fidelity_ceiling -- router steps down to code-fast past this, so a larger window would never get used
 size_gb = 45
 size_gb_next = 50
 quant = Q4_K_M
@@ -117,7 +117,7 @@ description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops
 ; aws_model_id = eu.anthropic.claude-sonnet-4-6
 ; aws_region = eu-central-1
 ; aws_profile = bedrock-prod
-; ctx_size =
+; ctx_size = 64000 ; 2x mid_fidelity_ceiling -- intentionally well below Sonnet 4.6's 200k native window; router steps down to code-fast past this rather than paying Sonnet's long-context $cost/latency
 ; sampler = temp=0.5 ; Sonnet 4.6 accepts ONE of temp / top_p; pick `temp` for agent work
 ; description = Claude Sonnet 4.6 on Bedrock - heavy coder for agent loops
 
@@ -137,7 +137,7 @@ description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops
 ; aws_model_id = global.anthropic.claude-opus-4-7 ; global.* cross-region inference profile
 ; aws_region = eu-central-1 ; API anchor region; global.* auto-routes inference cross-region (set EU as the anchor for residency)
 ; aws_profile = bedrock-prod ; conventional profile name; configure once with `aws configure --profile bedrock-prod` (or change to your own and run `llmstack install`)
-; ctx_size =
+; ctx_size = 24000 ; 2x high_fidelity_ceiling -- intentionally well below Opus 4.7's 200k native window; we only invoke ultra for short prompts, where it's still cheap+fast, then step down past this
 ; ; NB: no `sampler =` line. Claude Opus 4.7 explicitly rejects all
 ; ; sampler params (temperature, top_p, top_k) -- per the Bedrock
 ; ; model card, "the recommended migration path is to omit these
@@ -260,7 +260,11 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 ; 2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
 ;    tier configured -> code-ultra
 ; 3. PLAN signal words AND no code-block / agent verbs / tools
-;    (pure design discussion
+;    AND tokens <= [plan].ctx_size (pure design discussion that
+;    still fits the planner's window) -> plan
+;    ...if the plan tier's ctx_size is breached, the request
+;    falls through to the coding ladder below rather than being
+;    sent to a planner whose window can't hold the input.
 ; 4. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
 ;    tokens <= high_fidelity_ceiling AND no code-ultra -> code-smart
 ; 5. tokens <= mid_fidelity_ceiling -> code-smart
@@ -268,6 +272,24 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 ;    - if tools[] OR turns >= multi_turn (3B tool-calls badly) -> code-smart
 ;    - else -> code-fast
 ;
+; AUTO ROUTER MAX CONTEXT = [code-fast].ctx_size. The fast tier sits at
+; the bottom of the step-down ladder, so any context too big for the
+; tiers above lands on fast. Anything beyond fast's window has no
+; safe home and should be considered out of scope for `model = auto`
+; -- callers with such payloads should pass an explicit model name
+; instead of relying on the router.
+;
+; FIDELITY-CEILING <-> CTX-SIZE COUPLING. Each "fidelity" rung's
+; ceiling is half of the corresponding tier's ctx_size:
+;   high_fidelity_ceiling x 2 == [code-ultra].ctx_size
+;   mid_fidelity_ceiling x 2 == [code-smart].ctx_size
+; This is deliberate: the ceiling marks where the tier still has
+; comfortable headroom; double the ceiling is where we'd be packing
+; the tier to its limit (and where the router has already stepped
+; down to the next tier). If you bump a ceiling, bump the matching
+; ctx_size in the tier section too -- otherwise the router will
+; route requests up to a tier whose window can't hold them.
+;
 ; The "high-fidelity" rung is gated on availability: when the
 ; [code-ultra] section is absent (or fails to load), rules (2) and (4)
 ; silently fall back to code-smart instead of routing to a tier that
@@ -276,8 +298,8 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 ; ROUTER_HIGH_FIDELITY_CEILING / ROUTER_MID_FIDELITY_CEILING /
 ; ROUTER_MULTI_TURN.
 ;
-high_fidelity_ceiling =
-mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast
+high_fidelity_ceiling = 12000 ; tokens; below this, top-tier model is still cheap+fast (and ultra ctx_size = 2 * this)
+mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast (smart ctx_size = 2 * this)
 multi_turn = 6 ; turn count that floors the long-context rung at code-smart
 agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
 plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
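The `2 * this` relationship called out in these comments is mechanically checkable. A hypothetical lint, not shipped with the package, assuming models.ini is readable from the working directory and parsed with `;` inline comments enabled:

    import configparser

    cfg = configparser.ConfigParser(inline_comment_prefixes=(";",))
    cfg.read("models.ini")  # assumed path

    coupling = {
        "high_fidelity_ceiling": "code-ultra",
        "mid_fidelity_ceiling": "code-smart",
    }
    for key, section in coupling.items():
        if section not in cfg:  # the ultra tier is optional
            continue
        ceiling = int(cfg["ROUTING"][key])
        ctx = int(cfg[section]["ctx_size"])
        if ceiling * 2 != ctx:
            print(f"{key} = {ceiling} but [{section}].ctx_size = {ctx}; expected {ceiling * 2}")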
{opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/opencode_llmstack.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opencode-llmstack
-Version: 0.7.3
+Version: 0.9.0
 Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
 Author: llmstack
 License: MIT
@@ -542,8 +542,8 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
 | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
 | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
-| `ROUTER_HIGH_FIDELITY_CEILING` | `
-| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast` |
+| `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
+| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `6` | turn count that floors the long-context rung at `code-smart` |
 | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
 | `LOG_LEVEL` | `info` | router log level |
{opencode_llmstack-0.7.3 → opencode_llmstack-0.9.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "opencode-llmstack"
-version = "0.7.3"
+version = "0.9.0"
 description = "Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring."
 readme = "README.md"
 requires-python = ">=3.11"

All remaining files listed above were renamed for the new version without content changes.