opencode-llmstack 0.7.2__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/PKG-INFO +3 -3
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/README.md +2 -2
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/app.py +44 -8
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/generators/opencode.py +16 -3
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/models.ini +168 -84
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/opencode_llmstack.egg-info/PKG-INFO +3 -3
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/pyproject.toml +1 -1
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/AGENTS.md +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/__init__.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/__main__.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/_platform.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/backends/__init__.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/backends/bedrock.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/check_models.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/cli.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/__init__.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/_helpers.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/activate.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/check.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/download.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/install.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/install_llama_swap.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/reload.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/restart.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/setup.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/start.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/status.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/commands/stop.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/download/__init__.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/download/binary.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/download/ggufs.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/generators/__init__.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/generators/llama_swap.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/paths.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/shell_env.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/tiers.py +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/opencode_llmstack.egg-info/SOURCES.txt +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/opencode_llmstack.egg-info/dependency_links.txt +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/opencode_llmstack.egg-info/entry_points.txt +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/opencode_llmstack.egg-info/requires.txt +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/opencode_llmstack.egg-info/top_level.txt +0 -0
- {opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/setup.cfg +0 -0
{opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opencode-llmstack
-Version: 0.7.2
+Version: 0.8.0
 Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
 Author: llmstack
 License: MIT
@@ -542,8 +542,8 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
 | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
 | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
-| `ROUTER_HIGH_FIDELITY_CEILING` | `
-| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast` |
+| `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
+| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `6` | turn count that floors the long-context rung at `code-smart` |
 | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
 | `LOG_LEVEL` | `info` | router log level |
{opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/README.md

@@ -507,8 +507,8 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
 | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
 | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
-| `ROUTER_HIGH_FIDELITY_CEILING` | `
-| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast` |
+| `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
+| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `6` | turn count that floors the long-context rung at `code-smart` |
 | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
 | `LOG_LEVEL` | `info` | router log level |
{opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/app.py

@@ -74,7 +74,18 @@ Routing decision tree (first match wins):
 2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
    ``ultra:``, ``opus:``) AND ultra tier configured -> code-ultra
 3. PLAN signal words AND no code-block / agent verbs / tools
-
+   AND estimated tokens <= ``[plan]`` tier's ctx_size
+   (pure design discussion that fits the planner's window) -> plan
+   (if the planner's ctx_size is breached we fall through to
+   the coding ladder rather than send a request that won't
+   fit -- the coding tiers cover larger windows by design)
 4. Estimated input tokens <= HIGH_FIDELITY_CEILING
    ("reasonable context still being built") -> code-ultra
    (else code-smart)
@@ -90,6 +101,12 @@ Routing decision tree (first match wins):
    since 3B models
    tool-call unreliably)
 
+The auto router's effective max context window is
+``[code-fast].ctx_size`` -- fast is the bottom of the step-down
+ladder, so any context that would overflow the tiers above lands on
+fast. Inputs longer than fast's window have no safe home and should
+be considered out of scope for ``model = auto``.
+
 Ultra-tier routing is gated on availability: rule (2) and the
 "high-fidelity" rung of (4) first check that the tier is loaded
 from ``models.ini`` (i.e. present in :data:`TIER_BY_ALIAS`). When
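
Since ``[code-fast].ctx_size`` is now documented as the hard ceiling for `model = auto`, a caller can guard against out-of-scope payloads before they ever reach the router. A minimal client-side sketch in Python -- the chars/4 estimate is a common rough heuristic, not necessarily the router's own `_estimate_tokens`, and 131072 is `[code-fast].ctx_size` from the models.ini hunk further down:

    FAST_CTX = 131072  # [code-fast].ctx_size -- the auto ladder's effective max

    def pick_model(messages: list[dict]) -> str:
        # Rough token estimate: ~4 chars per token; fine for a guard rail.
        est = sum(len(m.get("content", "")) for m in messages) // 4
        if est > FAST_CTX:
            # No tier in the step-down ladder can hold this: name a model
            # explicitly (or trim the context) instead of relying on auto.
            raise ValueError(f"~{est} tokens exceeds the {FAST_CTX}-token window")
        return "auto"
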
@@ -137,14 +154,21 @@ UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
 # est <= MID_FIDELITY_CEILING -> code-smart
 # est > MID_FIDELITY_CEILING -> code-fast (or smart with tools/loop)
 #
+# Each ceiling is half of the corresponding tier's ``ctx_size`` in
+# models.ini -- the ceiling marks where the tier still has comfortable
+# headroom, and double the ceiling is where the router has already
+# stepped down to the next tier (so the upper tier never has to handle
+# inputs at its own limit).
+#
 # Defaults:
-#   HIGH
+#   HIGH 12000 - "reasonable context built": a couple of files loaded,
 #                instructions clear, top-tier still cheap+fast here.
-#
+#                Pairs with a 24k ctx_size on code-ultra.
+#   MID  32000 - half of code-smart's 64k window; past this, hosted
 #                top-tier latency/$cost balloons and code-smart starts
 #                getting cramped, while code-fast's 128k YaRN window
 #                still has comfortable headroom.
-HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "
+HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
 MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
 # Floor the long-context rung at code-smart whenever a tool-call
 # protocol is in play -- 3B models tool-call unreliably regardless of
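
To make the ceiling arithmetic concrete, here is a toy restatement of the numeric rungs (rules 4-6 of the decision tree), ignoring the trigger/plan/tool gates; the numbers are the 0.8.0 defaults set just above:

    HIGH, MID = 12_000, 32_000  # the new ROUTER_*_CEILING defaults

    def rung(est_tokens: int, ultra_available: bool = True) -> str:
        if est_tokens <= HIGH:
            return "code-ultra" if ultra_available else "code-smart"
        if est_tokens <= MID:
            return "code-smart"
        return "code-fast"

    assert rung(8_000) == "code-ultra"                         # short: top tier
    assert rung(8_000, ultra_available=False) == "code-smart"  # ultra gated off
    assert rung(20_000) == "code-smart"                        # mid rung
    assert rung(50_000) == "code-fast"                         # ladder bottom
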
@@ -339,18 +363,30 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
         or _matches(AGENT_SIGNALS, messages, prompt)
     )
 
+    est = _estimate_tokens(messages, prompt)
+
     # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
     # chat-tuned model meant for design / "should we" discussions. Only
     # take it when nothing about the request says "I'm about to write
-    # code" (no triple-backticks, no agent verbs, no tool calls).
+    # code" (no triple-backticks, no agent verbs, no tool calls). And
+    # only if the input fits in the planner's ctx_size -- past that we'd
+    # be sending a request the planner can't hold, so we fall through
+    # to the coding ladder, which has tiers (smart, fast) explicitly
+    # sized for larger contexts.
     if (
         not has_tools
         and not has_code_signal
        and _matches(PLAN_SIGNALS, messages, prompt)
     ):
-
-
-
+        plan_tier = TIER_BY_ALIAS.get(PLAN_MODEL)
+        plan_ctx = plan_tier.ctx_size if plan_tier else 0
+        if not plan_ctx or est <= plan_ctx:
+            return PLAN_MODEL, "plan-signal"
+        log.info(
+            "plan-signal but tokens~%d > %s.ctx_size %d; "
+            "falling through to coding ladder",
+            est, PLAN_MODEL, plan_ctx,
+        )
 
     # Rung 1: short context -- start at the top.
     if est <= HIGH_FIDELITY_CEILING:
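
For a feel of the new fall-through in practice, a hypothetical session against the router module (it assumes `llmstack.app` imports cleanly and that the pasted blob below overflows the local `[plan]` tier's window -- that tier's exact ctx_size isn't shown in this diff):

    from llmstack.app import classify

    # Pure design question, no code signals: takes the plan track.
    body = {"messages": [{"role": "user", "content":
            "How would you architect the cache layer? Compare options."}]}
    print(classify(body))  # ('plan', 'plan-signal')

    # Same signal words plus a huge pasted context: once the estimate
    # exceeds [plan].ctx_size, 0.8.0 logs the overflow and falls through
    # to the coding ladder instead of returning 'plan'.
    body["messages"][0]["content"] += "\n" + "log line\n" * 200_000
    print(classify(body))  # e.g. ('code-fast', ...) -- a coding tier, not 'plan'
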
{opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/generators/opencode.py

@@ -172,9 +172,22 @@ def build_config(
 
     tier_sections = [s for s in cfg.sections() if s != "ROUTING"]
 
-    # `auto` context =
-    #
-
+    # `auto` context = the fast tier's ctx_size. The router runs a
+    # step-DOWN ladder (ultra -> smart -> fast as context grows), so
+    # the largest window in the ladder is fast's, and that's the
+    # effective ceiling for `model = auto` -- anything bigger has no
+    # tier to land on. Using `min(...)` here would clip opencode to
+    # the smallest tier's window even though the router would never
+    # actually send a long prompt to that tier.
+    fast_ctx = next(
+        (
+            _int(cfg[s].get("ctx_size", ""), 0)
+            for s in tier_sections
+            if (cfg[s].get("role") or "").strip() == "fast"
+        ),
+        0,
+    )
+    auto_ctx = fast_ctx or max(
         (_int(cfg[s].get("ctx_size", ""), 0) for s in tier_sections),
         default=8192,
     ) or 8192
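
The same "fast tier wins" lookup, runnable standalone against a stripped-down config (the `_int` stand-in below is an assumption for self-containment; the generator's real helper isn't shown in this diff):

    import configparser
    import textwrap

    def _int(raw: str, default: int) -> int:
        # Stand-in: tolerate blank values and trailing `;` comments.
        try:
            return int(raw.split(";")[0].strip())
        except ValueError:
            return default

    cfg = configparser.ConfigParser()
    cfg.read_string(textwrap.dedent("""\
        [code-fast]
        role = fast
        ctx_size = 131072
        [code-smart]
        role = agent
        ctx_size = 64000
    """))
    fast_ctx = next(
        (_int(cfg[s].get("ctx_size", ""), 0) for s in cfg.sections()
         if (cfg[s].get("role") or "").strip() == "fast"),
        0,
    )
    print(fast_ctx)  # 131072 -- fast's window, not min()/max() over all tiers
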
{opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/llmstack/models.ini

@@ -51,7 +51,6 @@
 [DEFAULT]
 host = 127.0.0.1
 router_port = 10101 ; FastAPI auto-router (what opencode hits)
-swap_port = 10102 ; llama-swap manager UI + raw model endpoints
 n_gpu_layers = 999 ; offload everything to Metal on Apple Silicon
 flash_attn = on
 jinja = true
@@ -68,31 +67,60 @@ tier = code
 role = fast
 hf_repo = bartowski/Qwen2.5-Coder-3B-Instruct-GGUF
 hf_file = Qwen2.5-Coder-3B-Instruct-Q5_K_M.gguf
-ctx_size = 131072 ; native 32k extended via YaRN (factor 4)
+ctx_size = 131072 ; native 32k extended via YaRN (factor 4); also defines the auto router's effective max -- contexts longer than this have nowhere to land, since fast is the bottom of the step-down ladder
 rope_scaling = yarn (scale=4, orig_ctx=32768)
 size_gb = 2.5
 quant = Q5_K_M
-status = downloading ; queued by `llmstack.sh download`
-opencode_use = small_model + auto-fast tier
 sampler = temp=0.2, top_p=0.95, top_k=40, min_p=0.05 ; deterministic
 description = Qwen2.5-Coder 3B - autocomplete / FIM / quick Q&A
 
+; Bedrock alternative for code-fast -- comment out the [code-fast] block above
+; and uncomment the block below to swap to a hosted fast tier (Claude Haiku
+; 4.5: cheapest + fastest Anthropic model with tool calling, sub-second TTFT).
+; See "BEDROCK NOTES" at the bottom of this file for profile / sampler /
+; access-form details.
+;
+; [code-fast]
+; tier = code
+; role = fast
+; backend = bedrock
+; aws_model_id = eu.anthropic.claude-haiku-4-5-20251001-v1:0
+; aws_region = eu-central-1
+; aws_profile = bedrock-prod
+; ctx_size = 200000
+; sampler = temp=0.2 ; deterministic; Haiku 4.5 accepts ONE of temp / top_p
+; description = Claude Haiku 4.5 on Bedrock - hosted fast tier for autocomplete / FIM / quick Q&A
+
 [code-smart]
 tier = code
 role = agent
 hf_repo = unsloth/Qwen3-Coder-Next-GGUF
 hf_file = Qwen3-Coder-Next-Q4_K_M.gguf
 hf_file_next = Qwen3-Coder-Next-UD-Q4_K_XL.gguf
-ctx_size =
+ctx_size = 64000 ; 2x mid_fidelity_ceiling -- router steps down to code-fast past this, so a larger window would never get used
 size_gb = 45
 size_gb_next = 50
 quant = Q4_K_M
 quant_next = UD-Q4_K_XL
-status = ready (Q4_K_M); UD-Q4_K_XL queued
-opencode_use = agent.build + auto-agent tier
 sampler = temp=0.5, top_p=0.85, top_k=20, min_p=0.05, rep_pen=1.05 ; balanced agent
 description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops
 
+; Bedrock alternative for code-smart -- comment out the [code-smart] block
+; above and uncomment the block below to swap to a hosted heavy coder
+; (Claude Sonnet 4.6: agent-loop workhorse, heavy tool calling, multi-file
+; edits). See "BEDROCK NOTES" at the bottom of this file.
+;
+; [code-smart]
+; tier = code
+; role = agent
+; backend = bedrock
+; aws_model_id = eu.anthropic.claude-sonnet-4-6
+; aws_region = eu-central-1
+; aws_profile = bedrock-prod
+; ctx_size = 64000 ; 2x mid_fidelity_ceiling -- intentionally well below Sonnet 4.6's 200k native window; router steps down to code-fast past this rather than paying Sonnet's long-context $cost/latency
+; sampler = temp=0.5 ; Sonnet 4.6 accepts ONE of temp / top_p; pick `temp` for agent work
+; description = Claude Sonnet 4.6 on Bedrock - heavy coder for agent loops
+
 ; Top-tier hosted coder. Shipped disabled because it requires boto3 +
 ; AWS Bedrock access. `llmstack install` auto-uncomments the block
 ; below (by stripping the leading "; " from each line and dropping
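
What a swapped-in Bedrock tier does at request time: per the BEDROCK NOTES below, the router calls Bedrock directly via boto3, and the natural entry point for that is the Converse API. A minimal sketch using the [code-fast] alternative's values above -- an illustration of the call shape, not the actual code in llmstack/backends/bedrock.py:

    import boto3

    session = boto3.Session(profile_name="bedrock-prod", region_name="eu-central-1")
    client = session.client("bedrock-runtime")

    resp = client.converse(
        modelId="eu.anthropic.claude-haiku-4-5-20251001-v1:0",
        messages=[{"role": "user", "content": [{"text": "Write fizzbuzz in Go."}]}],
        inferenceConfig={"temperature": 0.2},  # Haiku 4.5: ONE of temperature/topP
    )
    print(resp["output"]["message"]["content"][0]["text"])
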
@@ -107,10 +135,9 @@ description = Qwen3-Coder-Next 80B-A3B MoE - heavy coder for agent loops
 ; role = ultra
 ; backend = bedrock
 ; aws_model_id = global.anthropic.claude-opus-4-7 ; global.* cross-region inference profile
-; aws_region =
-; aws_profile
-; ctx_size =
-; opencode_use = on-demand top-tier coder for hard agent tasks
+; aws_region = eu-central-1 ; API anchor region; global.* auto-routes inference cross-region (set EU as the anchor for residency)
+; aws_profile = bedrock-prod ; conventional profile name; configure once with `aws configure --profile bedrock-prod` (or change to your own and run `llmstack install`)
+; ctx_size = 24000 ; 2x high_fidelity_ceiling -- intentionally well below Opus 4.7's 200k native window; we only invoke ultra for short prompts, where it's still cheap+fast, then step down past this
 ; ; NB: no `sampler =` line. Claude Opus 4.7 explicitly rejects all
 ; ; sampler params (temperature, top_p, top_k) -- per the Bedrock
 ; ; model card, "the recommended migration path is to omit these
@@ -134,11 +161,26 @@ size_gb = 9.2
 size_gb_next = 12.1
 quant = Q4_K_M
 quant_next = Q6_K
-status = ready (Q4_K_M); Q6_K queued
-opencode_use = agent.plan + auto-plan tier
 sampler = temp=0.7, top_p=0.9, top_k=40, min_p=0.05 ; creative thinking
 description = Qwopus GLM 18B - planning, design discussions, architecture
 
+; Bedrock alternative for plan -- comment out the [plan] block above and
+; uncomment the block below to swap to a hosted planner (Claude Opus 4.6:
+; deep reasoning for design discussions and architecture). Opus 4.6 still
+; accepts both temperature and top_p (unlike 4.7), so the local sampler
+; maps over cleanly. See "BEDROCK NOTES" at the bottom of this file.
+;
+; [plan]
+; tier = chat
+; role = plan
+; backend = bedrock
+; aws_model_id = eu.anthropic.claude-opus-4-6-v1
+; aws_region = eu-central-1
+; aws_profile = bedrock-prod
+; ctx_size = 200000
+; sampler = temp=0.7, top_p=0.9 ; creative; Opus 4.6 accepts both
+; description = Claude Opus 4.6 on Bedrock - planning, design discussions, architecture
+
 [plan-uncensored]
 tier = chat
 role = plan-uncensored
@@ -150,11 +192,51 @@ size_gb = 13
 size_gb_next = 20
 quant = i1-Q4_K_M
 quant_next = i1-Q6_K
-status = ready (i1-Q4_K_M); i1-Q6_K queued
-opencode_use = agent.plan-nofilter + auto via [nofilter] trigger
 sampler = temp=0.85, top_p=0.95, top_k=50, min_p=0.05 ; max exploration
 description = Mistral-Small 3.2 24B Heretic - no-filter planning
 
+; Bedrock alternative for plan-uncensored -- comment out the [plan-uncensored]
+; block above and uncomment ONE of the blocks below. Anthropic models on
+; Bedrock are filtered, so for the uncensored slot we pick the largest
+; open-weights model on Bedrock: Llama 3.1 405B has minimal safety post-
+; training and matches the spirit of the local Heretic tier. NOTE: Meta
+; models do NOT require the AWS use-case form, so this swap unblocks
+; plan-uncensored on a fresh AWS account.
+;
+; REGION CAVEAT: unlike the other tiers above, Llama 3.1 405B has NO
+; cross-region inference profile (no eu.* / global.*) and is only
+; deployed in US regions. Pin to us-west-2 even when the rest of the
+; stack is anchored in eu-central-1. If EU residency is mandatory for
+; this tier, switch to one of the eu.anthropic.* IDs at the cost of
+; losing the "uncensored" property. See "BEDROCK NOTES" at the bottom
+; of this file.
+;
+; [plan-uncensored]
+; tier = chat
+; role = plan-uncensored
+; backend = bedrock
+; aws_model_id = meta.llama3-1-405b-instruct-v1:0
+; aws_region = us-west-2 ; Llama 405B has no EU deployment; keep on US
+; aws_profile = bedrock-prod
+; ctx_size = 128000
+; sampler = temp=0.85, top_p=0.95 ; max exploration
+; description = Llama 3.1 405B on Bedrock - no-filter planning
+;
+; ...or, if your org locks Bedrock access to a VPC endpoint, use this
+; variant instead (same model + sampler, with aws_endpoint_url set):
+;
+; [plan-uncensored]
+; tier = chat
+; role = plan-uncensored
+; backend = bedrock
+; aws_model_id = meta.llama3-1-405b-instruct-v1:0
+; aws_region = us-west-2 ; Llama 405B has no EU deployment
+; aws_profile = bedrock-prod
+; aws_endpoint_url = https://bedrock-runtime.us-west-2.vpce.amazonaws.com
+; ctx_size = 128000
+; sampler = temp=0.85, top_p=0.95
+; description = Llama 3.1 405B on Bedrock (VPC) - no-filter planning
+
 ;------------------------------------------------------------------------------
 [ROUTING]
 ; STEP-DOWN ladder: start at the top of the fidelity ladder for short
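
For the VPC variant in the second commented block, the only client-side difference is pinning the runtime client to the interface endpoint -- in boto3 that is the endpoint_url argument:

    import boto3

    session = boto3.Session(profile_name="bedrock-prod", region_name="us-west-2")
    client = session.client(
        "bedrock-runtime",
        endpoint_url="https://bedrock-runtime.us-west-2.vpce.amazonaws.com",
    )
    # From here, converse() calls look identical to the non-VPC case.
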
@@ -178,7 +260,11 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 ; 2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
 ;    tier configured -> code-ultra
 ; 3. PLAN signal words AND no code-block / agent verbs / tools
-;    (pure design discussion
+;    AND tokens <= [plan].ctx_size (pure design discussion that
+;    still fits the planner's window) -> plan
+;    ...if the plan tier's ctx_size is breached, the request
+;    falls through to the coding ladder below rather than being
+;    sent to a planner whose window can't hold the input.
 ; 4. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
 ;    tokens <= high_fidelity_ceiling AND no code-ultra -> code-smart
 ; 5. tokens <= mid_fidelity_ceiling -> code-smart
@@ -186,6 +272,24 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 ;    - if tools[] OR turns >= multi_turn (3B tool-calls badly) -> code-smart
 ;    - else -> code-fast
 ;
+; AUTO ROUTER MAX CONTEXT = [code-fast].ctx_size. The fast tier sits at
+; the bottom of the step-down ladder, so any context too big for the
+; tiers above lands on fast. Anything beyond fast's window has no
+; safe home and should be considered out of scope for `model = auto`
+; -- callers with such payloads should pass an explicit model name
+; instead of relying on the router.
+;
+; FIDELITY-CEILING <-> CTX-SIZE COUPLING. Each "fidelity" rung's
+; ceiling is half of the corresponding tier's ctx_size:
+;     high_fidelity_ceiling x 2 == [code-ultra].ctx_size
+;     mid_fidelity_ceiling  x 2 == [code-smart].ctx_size
+; This is deliberate: the ceiling marks where the tier still has
+; comfortable headroom; double the ceiling is where we'd be packing
+; the tier to its limit (and where the router has already stepped
+; down to the next tier). If you bump a ceiling, bump the matching
+; ctx_size in the tier section too -- otherwise the router will
+; route requests up to a tier whose window can't hold them.
+;
 ; The "high-fidelity" rung is gated on availability: when the
 ; [code-ultra] section is absent (or fails to load), rules (2) and (4)
 ; silently fall back to code-smart instead of routing to a tier that
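
The coupling above is easy to lint mechanically. A sketch of such a check -- nothing in this diff says the package runs one itself, and the models.ini path is an assumption, so point it at your install:

    import configparser

    PAIRS = [("high_fidelity_ceiling", "code-ultra"),
             ("mid_fidelity_ceiling", "code-smart")]

    cfg = configparser.ConfigParser(interpolation=None,
                                    inline_comment_prefixes=(";",))
    cfg.read("llmstack/models.ini")  # adjust to your install

    for key, section in PAIRS:
        if not cfg.has_section(section):  # [code-ultra] ships commented out
            continue
        ceiling = cfg.getint("ROUTING", key)
        ctx = cfg.getint(section, "ctx_size")
        assert ceiling * 2 == ctx, f"{key}={ceiling} vs [{section}] ctx_size={ctx}"
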
@@ -194,8 +298,8 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 ; ROUTER_HIGH_FIDELITY_CEILING / ROUTER_MID_FIDELITY_CEILING /
 ; ROUTER_MULTI_TURN.
 ;
-high_fidelity_ceiling =
-mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast
+high_fidelity_ceiling = 12000 ; tokens; below this, top-tier model is still cheap+fast (and ultra ctx_size = 2 * this)
+mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast (smart ctx_size = 2 * this)
 multi_turn = 6 ; turn count that floors the long-context rung at code-smart
 agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
 plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
@@ -203,30 +307,46 @@ uncensored_triggers = [nofilter], [uncensored], [heretic], "uncensored:", "nof
 ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)
 
 ;------------------------------------------------------------------------------
-; BEDROCK
+; BEDROCK NOTES (referenced by the commented-out alternatives above)
 ;------------------------------------------------------------------------------
-;
-;
-;
-;
-;
+; Each tier section above carries a "Bedrock alternative for <tier>" block
+; directly underneath it (commented out by default). To swap a tier:
+;
+;   1. comment out the active local section (GGUF by default);
+;   2. uncomment the Bedrock-alternative block beneath it;
+;   3. run `llmstack install` (and `llmstack restart` if the tier was
+;      already loaded -- bedrock creds aren't picked up live).
 ;
-;
-;
+; The router auto-detects backend=bedrock from `aws_model_id`, but every
+; alternative block also sets `backend = bedrock` explicitly so the intent
+; is obvious. llama-swap won't load bedrock tiers; the router calls
+; Bedrock directly via boto3 (`pip install 'llmstack[bedrock]'`).
+;
+; PROFILE: every alternative uses `aws_profile = bedrock-prod`, the
+; conventional profile name for this stack. The actual keys / SSO /
+; role chaining live in the standard AWS config files (this file ONLY
+; names a profile -- never put credentials here). One-time setup:
 ;
 ;   aws configure --profile bedrock-prod
-;   #
-;   #
+;   # SSO: aws configure sso --profile bedrock-prod
+;   # role chaining: edit ~/.aws/config and add:
+;   #   [profile bedrock-prod]
 ;   #   role_arn = arn:aws:iam::123456789012:role/llmstack-bedrock
-;   #   source_profile = bedrock-prod
+;   #   source_profile = bedrock-prod-base
+;
+; To use a different profile name, edit the `aws_profile` line. To fall
+; back on boto3's default chain (env vars, default profile, instance
+; role), remove the line entirely.
 ;
-;
-;
-;
+; UPGRADE PRE-STAGING: optional `aws_model_id_next` (+ `aws_region_next`)
+; is the queued upgrade target -- mirrors gguf `hf_file_next`. The router
+; uses it only when `llmstack start --next` is in effect; permanent
+; promotion is the same as gguf: edit `aws_model_id` and re-run
+; `llmstack install`.
 ;
-; SAMPLER
-;
-;
+; SAMPLER: the `sampler = temp=..., top_p=..., top_k=..., ...` line on
+; each tier is the SINGLE SOURCE OF TRUTH for sampling, but how it gets
+; applied depends on the backend:
 ;
 ; * gguf tiers -- the llama-swap generator bakes the sampler keys
 ;   into the llama-server startup command line as `--temp`,
@@ -248,57 +368,21 @@ ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)
 ; opencode.json is sampler-free in both cases by design (the
 ; opencode.json generator never emits sampler params on agents).
 ;
-; Per-Bedrock-family rules (as of 2026):
+; Per-Bedrock-family sampler rules (as of 2026):
 ;
 ; * Claude Opus 4.7+ -- rejects all sampler params; OMIT `sampler =`
 ;   entirely (the router will then pass requests through untouched).
-; * Claude Sonnet 4.5 / Haiku 4.5 -- accept `temp` OR `top_p`,
-;   both; pick one.
+; * Claude Sonnet 4.5 / 4.6 / Haiku 4.5 -- accept `temp` OR `top_p`,
+;   never both; pick one.
 ; * Claude Opus 4.x (4.1, 4.5, 4.6) -- accept `temp` and `top_p`.
-; * Llama / Titan / Cohere / etc. -- accept `temp`
-;   the model card if in doubt.
-;
-; Example A: top-tier coder on Bedrock (us-west-2), default cred chain.
-; Optional `aws_model_id_next` (and optional `aws_region_next`) is the
-; queued upgrade target -- mirrors gguf `hf_file_next`. The router uses
-; it only when `--next` is in effect; permanent promotion is the same
-; as gguf: edit `aws_model_id` and re-run `llmstack install`.
-;
-; [code-smart]
-; tier = code
-; role = agent
-; backend = bedrock
-; aws_model_id = anthropic.claude-sonnet-4-5-20250929-v1:0
-; aws_region = us-west-2
-; aws_model_id_next = anthropic.claude-sonnet-5-20260201-v1:0 ; queued
-; aws_region_next = us-east-1 ; (optional) different region for the new model
-; ctx_size = 200000
-; sampler = temp=0.5 ; Sonnet 4.5 accepts ONE of temp / top_p; pick `temp` for agent work
-; description = Claude Sonnet 4.5 on Bedrock - heavy coder for agent loops
-;
-; Example B: planner in a different AWS account, accessed via a named
-; profile that itself uses role-chaining + SSO under ~/.aws/config.
-; (Different tier => different profile name; different account/region.)
+; * Llama / Titan / Mistral / Cohere / Nova / etc. -- accept `temp`
+;   + `top_p`; check the model card if in doubt.
 ;
-;
-;
-;
-;
-;
-;
-;
-;
-; description = Claude Opus 4.1 on Bedrock - planning, design discussions
-;
-; Example C: large model behind a VPC endpoint.
-;
-; [plan-uncensored]
-; tier = chat
-; role = plan-uncensored
-; aws_model_id = meta.llama3-1-405b-instruct-v1:0
-; aws_region = us-west-2
-; aws_profile = bedrock-prod
-; aws_endpoint_url = https://bedrock-runtime.us-west-2.vpce.amazonaws.com
-; ctx_size = 128000
-; sampler = temp=0.85, top_p=0.95
-; description = Llama 3.1 405B on Bedrock - max-exploration planning
+; ACCESS: Anthropic Claude on Bedrock requires a one-time use-case-form
+; approval per AWS account (Bedrock console -> Model catalog -> pick the
+; model -> fill the form). Approval is account-level and persists; once
+; granted, every Claude variant works (bare ID, us./eu./global. cross-
+; region profile, application inference profile ARN). To skip the form
+; entirely, use the Llama 3.1 405B variant under [plan-uncensored] (Meta
+; models don't require the form) or pick another non-Anthropic family
+; (Amazon Nova, Mistral, Cohere, Titan).
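
The family rules are mechanical enough to encode. A hypothetical translation helper -- its name and the substring-based family detection are illustrative, not the router's real mapping layer -- that turns a tier's `sampler =` line into a Converse inferenceConfig:

    def sampler_to_inference_config(model_id: str, sampler: str) -> dict:
        # "temp=0.5, top_p=0.85 ; comment" -> {"temp": "0.5", "top_p": "0.85"}
        pairs = dict(kv.strip().split("=", 1)
                     for kv in sampler.split(";")[0].split(",") if "=" in kv)
        if "opus-4-7" in model_id:
            return {}                    # Opus 4.7+: omit all sampler params
        cfg = {}
        if "temp" in pairs:
            cfg["temperature"] = float(pairs["temp"])
        if "top_p" in pairs:
            cfg["topP"] = float(pairs["top_p"])
        if ("sonnet-4" in model_id or "haiku-4" in model_id) and len(cfg) > 1:
            cfg.pop("topP")              # ONE of temperature/topP: keep temp
        # top_k / min_p / rep_pen have no inferenceConfig slot; they would
        # go via additionalModelRequestFields (omitted in this sketch).
        return cfg

    # e.g. the [plan-uncensored] line above:
    print(sampler_to_inference_config(
        "meta.llama3-1-405b-instruct-v1:0",
        "temp=0.85, top_p=0.95 ; max exploration"))
    # {'temperature': 0.85, 'topP': 0.95}
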
{opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/opencode_llmstack.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opencode-llmstack
-Version: 0.7.2
+Version: 0.8.0
 Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
 Author: llmstack
 License: MIT
@@ -542,8 +542,8 @@ All knobs are env vars; defaults are picked up by `llmstack start`.
 | `ROUTER_ULTRA_MODEL` | `code-ultra` | short-context top tier → here (gated on availability) |
 | `ROUTER_PLAN_MODEL` | `plan` | design/discussion verbs → here |
 | `ROUTER_UNCENSORED_MODEL` | `plan-uncensored` | `[nofilter]` triggers → here |
-| `ROUTER_HIGH_FIDELITY_CEILING` | `
-| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast` |
+| `ROUTER_HIGH_FIDELITY_CEILING` | `12000` | tokens; at or below this, route to top tier (ultra → smart fallback). Paired with `code-ultra.ctx_size = 24000` (2x). |
+| `ROUTER_MID_FIDELITY_CEILING` | `32000` | tokens; at or below this, route to `code-smart`; beyond, step down to `code-fast`. Paired with `code-smart.ctx_size = 64000` (2x). |
 | `ROUTER_MULTI_TURN` | `6` | turn count that floors the long-context rung at `code-smart` |
 | `ROUTER_HOST` / `ROUTER_PORT` | `127.0.0.1` / `10101` | listen address |
 | `LOG_LEVEL` | `info` | router log level |
{opencode_llmstack-0.7.2 → opencode_llmstack-0.8.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "opencode-llmstack"
-version = "0.7.2"
+version = "0.8.0"
 description = "Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring."
 readme = "README.md"
 requires-python = ">=3.11"