opencode-llmstack 0.9.3__py3-none-any.whl → 0.9.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmstack/__init__.py +1 -1
- llmstack/app.py +78 -109
- llmstack/backends/bedrock.py +3 -1
- llmstack/generators/opencode.py +1 -1
- llmstack/models.ini +11 -17
- {opencode_llmstack-0.9.3.data → opencode_llmstack-0.9.6.data}/data/CHANGELOG.md +15 -0
- {opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/METADATA +1 -1
- {opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/RECORD +14 -14
- {opencode_llmstack-0.9.3.data → opencode_llmstack-0.9.6.data}/data/LICENSE +0 -0
- {opencode_llmstack-0.9.3.data → opencode_llmstack-0.9.6.data}/data/UPGRADING.md +0 -0
- {opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/WHEEL +0 -0
- {opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/entry_points.txt +0 -0
- {opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/licenses/LICENSE +0 -0
- {opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/top_level.txt +0 -0
llmstack/__init__.py
CHANGED
llmstack/app.py
CHANGED
@@ -36,7 +36,7 @@ Behaviour:
     ``POST /v1/completions``
       - if request body ``model == "auto"`` (or unset), classify the request
        and rewrite ``model`` -> one of: ``code-fast``, ``code-smart``,
-       ``code-ultra`` (when wired)
+       ``code-ultra`` (when wired).
      - otherwise pass through unchanged.
    - tiers with ``backend = bedrock`` in ``models.ini`` are dispatched
      to AWS Bedrock via :mod:`llmstack.backends.bedrock` instead of
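For orientation, here is what the routed call looks like from the client side; a minimal sketch, assuming the router listens locally on port 8000 (host, port, and prompt are illustrative; the endpoint and the ``model == "auto"`` rewrite contract are from the docstring above):

```python
# Minimal client sketch -- host/port are assumptions; the endpoint and
# the model="auto" rewrite behaviour are described in the docstring.
import httpx

resp = httpx.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "auto",  # or omit entirely; both trigger classification
        "prompt": "[ultra] sketch a retry policy for the uploader",
        "max_tokens": 256,
    },
    timeout=120,
)
# The router rewrites "auto" to code-fast / code-smart / code-ultra
# before proxying; the response stays ordinary OpenAI-style JSON.
print(resp.json().get("model"))
```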
@@ -63,41 +63,28 @@ step DOWN as context grows**. This inverts the classic
     from priors.

     So as the conversation accumulates context, we step *down*: ultra
-    -> smart -> fast.
-    ladder.
+    -> smart -> fast.

     Routing decision tree (first match wins):

-      1. Explicit "
-         (``[nofilter]``, ``[uncensored]``, ``[heretic]``, or a line
-         starting with ``uncensored:`` / ``nofilter:``) -> plan-uncensored
-      2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
+      1. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
          ``ultra:``, ``opus:``) AND ultra tier configured -> code-ultra
-
-         AND estimated tokens <= ``[plan]`` tier's ctx_size
-         (pure design discussion that fits the planner's
-         window) -> plan
-         (if the planner's
-         ctx_size is breached
-         we fall through to
-         the coding ladder
-         rather than send a
-         request that won't
-         fit -- the coding
-         tiers cover larger
-         windows by design)
-      4. Estimated input tokens <= HIGH_FIDELITY_CEILING
+      2. Estimated input tokens <= HIGH_FIDELITY_CEILING
         ("reasonable context still being built") -> code-ultra
         (else code-smart)
-
-
-
-
+      3. Estimated input tokens <= MID_FIDELITY_CEILING -> code-smart
+      4. Otherwise (long context, top-tier becomes
+         expensive/slow, fast tier's 128k window is the
+         best fit and it's free) -> code-fast
         (floored at
         code-smart when
         n_turns >=
         MULTI_TURN_THRESHOLD)

+    Plan and uncensored tiers are accessible via their dedicated agent
+    modes (``agent.plan``, ``agent.plan-nofilter``) and slash commands;
+    they are not auto-routed through ``model = auto``.
+
     The auto router's effective max context window is
     ``[code-fast].ctx_size`` -- fast is the bottom of the step-down
     ladder, so any context that would overflow the tiers above lands on
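The tree above is small enough to restate in a few lines. This is an illustrative sketch of the first-match-wins ladder, not the module's actual `classify()`; the constants use the defaults quoted later in this diff, and `ultra_trigger` / `ultra_available` stand in for the real regex and tier checks:

```python
# Illustrative restatement of the step-down tree (not the real classify()).
HIGH_FIDELITY_CEILING = 12_000   # defaults from the router settings below
MID_FIDELITY_CEILING = 32_000
MULTI_TURN_THRESHOLD = 10

def route(est_tokens: int, n_turns: int, ultra_trigger: bool,
          ultra_available: bool) -> str:
    if ultra_trigger and ultra_available:       # 1. explicit [ultra]/[opus]
        return "code-ultra"
    if est_tokens <= HIGH_FIDELITY_CEILING:     # 2. short context: top rung
        return "code-ultra" if ultra_available else "code-smart"
    if est_tokens <= MID_FIDELITY_CEILING:      # 3. medium context
        return "code-smart"
    if n_turns >= MULTI_TURN_THRESHOLD:         # 4. long context, floored
        return "code-smart"
    return "code-fast"
```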
@@ -167,45 +154,14 @@ UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
 # still has comfortable headroom.
 HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
 MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
-# Floor the long-context rung at code-smart whenever a tool-call
-# protocol is in play -- 3B models tool-call unreliably regardless of
-# how big their context window is.
 MULTI_TURN_THRESHOLD = int(os.getenv("ROUTER_MULTI_TURN", "10"))
 AUTO_ALIASES = {"auto", "", None}

-UNCENSORED_TRIGGERS = re.compile(
-    r"(\[(uncensored|nofilter|no-?filter|heretic)\]"
-    r"|^[ \t]*(uncensored|nofilter|no-?filter)\s*:)",
-    re.IGNORECASE | re.MULTILINE,
-)
-
 ULTRA_TRIGGERS = re.compile(
     r"(\[(ultra|opus)\]|^[ \t]*(ultra|opus)\s*:)",
     re.IGNORECASE | re.MULTILINE,
 )

-PLAN_SIGNALS = re.compile(
-    r"\b(plan|design|architect(ure)?|approach|trade-?off|"
-    r"should\s+we|how\s+would\s+(you|we)|what\s+would\s+you|"
-    r"explain\s+why|reason\s+about|think\s+(through|step|hard|carefully)|"
-    r"compare\s+(options|approaches)|review\s+(the|this|my)\s+"
-    r"(architecture|design|approach|plan)|brainstorm|outline|"
-    r"summari[sz]e|root\s*cause|migrate|port\s+to)\b",
-    re.IGNORECASE,
-)
-
-AGENT_SIGNALS = re.compile(
-    r"\b(implement|fix\s+(this|the|a|my)?\s*(bug|issue|error|test)|"
-    r"write\s+(a|the|some)?\s*(function|class|test|script|module|method)|"
-    r"add\s+(a|the)?\s*(function|class|method|test|file|endpoint)|"
-    r"create\s+(a|the)?\s*(function|class|file|component|endpoint)|"
-    r"refactor|edit|patch|generate\s+code|debug|trace|"
-    r"run\s+tests?|build\s+(it|this)|compile)\b",
-    re.IGNORECASE,
-)
-
-CODE_BLOCK = re.compile(r"```|`[^`\n]{30,}`")
-
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO"),
     format="%(asctime)s %(levelname)s router %(message)s",
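The surviving `ULTRA_TRIGGERS` pattern accepts either a bracket tag anywhere in the text or an `ultra:`/`opus:` prefix at the start of a line. A quick check of its behaviour (the sample strings are made up):

```python
import re

ULTRA_TRIGGERS = re.compile(
    r"(\[(ultra|opus)\]|^[ \t]*(ultra|opus)\s*:)",
    re.IGNORECASE | re.MULTILINE,
)

assert ULTRA_TRIGGERS.search("[ultra] rewrite the scheduler")      # bracket tag
assert ULTRA_TRIGGERS.search("notes\nOpus: compare both designs")  # line-start prefix
assert not ULTRA_TRIGGERS.search("an opus of planning")            # bare word: no match
```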
@@ -221,12 +177,11 @@ async def _lifespan(app: FastAPI):
     bedrock_tiers = sorted(t.name for t in TIERS.values() if t.is_bedrock)
     log.info(
         "router up upstream=%s ladder=[ultra<=%d -> agent<=%d -> fast] "
-        "fast=%s agent=%s ultra=%s
+        "fast=%s agent=%s ultra=%s bedrock=%s",
         UPSTREAM, HIGH_FIDELITY_CEILING, MID_FIDELITY_CEILING,
         FAST_MODEL, AGENT_MODEL,
         f"{ULTRA_MODEL} (active)" if _ultra_available()
         else f"{ULTRA_MODEL} (unwired -- high-fidelity rung falls back to {AGENT_MODEL})",
-        PLAN_MODEL, UNCENSORED_MODEL,
         ",".join(bedrock_tiers) or "(none)",
     )
     yield
@@ -302,12 +257,6 @@ def _estimate_tokens(messages: list[dict[str, Any]] | None, prompt: str | None)
     return chars // 4


-def _matches(pattern: re.Pattern[str], messages: list[dict[str, Any]] | None, prompt: str | None) -> bool:
-    if prompt and pattern.search(prompt):
-        return True
-    return any(pattern.search(t) for t in _iter_message_text(messages))
-
-
 def _ultra_available() -> bool:
     """True iff the ultra tier is loaded from ``models.ini``.

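The `chars // 4` context line is the usual rough heuristic of about four characters per token for English-like text; a toy restatement (the helper name here is hypothetical, not the module's):

```python
def estimate_tokens(text: str) -> int:
    # ~4 characters per token, mirroring the `chars // 4` line above.
    return len(text) // 4

# So the 12,000-token HIGH_FIDELITY_CEILING corresponds to roughly
# 48,000 characters of accumulated prompt and message text.
assert estimate_tokens("x" * 48_000) == 12_000
```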
@@ -331,6 +280,11 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:

     Step-DOWN ladder: top fidelity for short context, fall to mid for
     medium, drop to fast for long. See module docstring for rationale.
+
+    Only the fast / agent / ultra rungs are implemented here. Plan and
+    uncensored tiers are accessible via their dedicated agent modes
+    (``agent.plan``, ``agent.plan-nofilter``) and slash commands; they
+    are not auto-routed from the build agent.
     """
     messages = body.get("messages") if isinstance(body.get("messages"), list) else None
     prompt = body.get("prompt") if isinstance(body.get("prompt"), str) else None
@@ -341,50 +295,17 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
         for m in (messages or [])
         if m.get("role") == "system" and isinstance(m.get("content"), str)
     ]
-    if any(UNCENSORED_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
-        return UNCENSORED_MODEL, "uncensored-trigger"

     if any(ULTRA_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
         if _ultra_available():
             return ULTRA_MODEL, "ultra-trigger"
-        # Explicit user opt-in but the tier isn't wired up. Don't 404 --
-        # serve the request from the heaviest tier we *do* have and let
-        # the user notice in logs that their trigger was a no-op.
         log.warning("ultra-trigger ignored: %s not in models.ini; falling back to %s",
                     ULTRA_MODEL, AGENT_MODEL)
         return AGENT_MODEL, f"ultra-trigger->agent ({ULTRA_MODEL} unavailable)"

     n_turns = sum(1 for m in (messages or []) if m.get("role") == "user")
-    has_code_signal = (
-        _matches(CODE_BLOCK, messages, prompt)
-        or _matches(AGENT_SIGNALS, messages, prompt)
-    )
-
     est = _estimate_tokens(messages, prompt)

-    # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
-    # chat-tuned model meant for design / "should we" discussions. Only
-    # take it when nothing about the request says "I'm about to write
-    # code" (no triple-backticks, no agent verbs). Tools are stripped
-    # from the request body before dispatch (see ``_handle_completion``),
-    # so their presence here does not block plan routing.
-    # Only route to plan if the input fits in the planner's ctx_size --
-    # past that we fall through to the coding ladder which has tiers
-    # (smart, fast) explicitly sized for larger contexts.
-    if (
-        not has_code_signal
-        and _matches(PLAN_SIGNALS, messages, prompt)
-    ):
-        plan_tier = TIER_BY_ALIAS.get(PLAN_MODEL)
-        plan_ctx = plan_tier.ctx_size if plan_tier else 0
-        if not plan_ctx or est <= plan_ctx:
-            return PLAN_MODEL, "plan-signal"
-        log.info(
-            "plan-signal but tokens~%d > %s.ctx_size %d; "
-            "falling through to coding ladder",
-            est, PLAN_MODEL, plan_ctx,
-        )
-
     # Rung 1: short context -- start at the top.
     if est <= HIGH_FIDELITY_CEILING:
         if _ultra_available():
@@ -399,9 +320,7 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
         return AGENT_MODEL, f"mid-fidelity tokens~{est}<={MID_FIDELITY_CEILING}"

     # Rung 3: long context -- step down to fast. Floor at smart only
-    # when the multi-turn threshold is hit
-    # prevent the step-down (plan tiers strip tools before dispatch,
-    # and code-fast is a hosted model that tool-calls reliably).
+    # when the multi-turn threshold is hit.
     if n_turns >= MULTI_TURN_THRESHOLD:
         return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} (user-turns={n_turns}>={MULTI_TURN_THRESHOLD} floor)"
     return FAST_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING}"
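Taken together, `classify()` returns a `(model, reason)` pair. An illustrative call under the default ceilings, assuming default tier names; the mid-fidelity reason format is copied from the hunk above, and the token estimate assumes the flat chars-over-four heuristic:

```python
# Hypothetical invocation, assuming defaults and no explicit triggers.
body = {
    "model": "auto",
    "messages": [{"role": "user", "content": "x" * 80_000}],  # ~20k tokens
}
model, reason = classify(body)
# -> ("code-smart", "mid-fidelity tokens~20000<=32000"): past the 12k
#    high-fidelity ceiling, inside smart's 32k sweet spot.
```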
@@ -530,14 +449,14 @@ async def list_models() -> JSONResponse:
             f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING}, "
             f"'{FAST_MODEL}' beyond that."
         )
-        name = "Auto (step-down router: ultra/agent/fast
+        name = "Auto (step-down router: ultra/agent/fast)"
     else:
         top_blurb = (
             f"Step-down ladder (top->bottom as context grows): "
             f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING} tokens, "
             f"'{FAST_MODEL}' beyond that."
         )
-        name = "Auto (step-down router: agent/fast
+        name = "Auto (step-down router: agent/fast)"
     data["data"].insert(0, {
         "id": "auto",
         "object": "model",
@@ -546,8 +465,6 @@ async def list_models() -> JSONResponse:
         "name": name,
         "description": (
             f"{top_blurb} "
-            f"'{PLAN_MODEL}' for design/planning (orthogonal to ladder); "
-            f"'{UNCENSORED_MODEL}' for explicit [nofilter] triggers; "
             f"'[ultra]'/'[opus]' triggers force '{ULTRA_MODEL}' regardless of size."
         ),
         "tier": "auto",
@@ -607,6 +524,41 @@ def _inject_sampler(body: dict[str, Any], tier: Tier) -> bool:
     return mutated


+def _inject_name_json(raw: bytes, tier_name: str) -> bytes:
+    try:
+        data = json.loads(raw)
+    except (json.JSONDecodeError, ValueError):
+        return raw
+    try:
+        msg = data["choices"][0]["message"]
+        if msg.get("content"):
+            msg["name"] = tier_name
+    except (KeyError, IndexError, TypeError):
+        pass
+    return json.dumps(data).encode()
+
+
+def _inject_name_sse(chunk: bytes, tier_name: str, injected: list[bool]) -> bytes:
+    if injected[0]:
+        return chunk
+    line = chunk.decode(errors="replace")
+    if not line.startswith("data: "):
+        return chunk
+    payload_str = line[len("data: "):].strip()
+    if payload_str in ("[DONE]", ""):
+        return chunk
+    try:
+        payload = json.loads(payload_str)
+        delta = payload["choices"][0]["delta"]
+        if "role" in delta:
+            delta["name"] = tier_name
+            injected[0] = True
+            return f"data: {json.dumps(payload, separators=(',', ':'))}\n\n".encode()
+    except (KeyError, IndexError, TypeError, json.JSONDecodeError):
+        pass
+    return chunk
+
+
 async def _handle_completion(req: Request, path: str) -> Response:
     raw = await req.body()
     headers = _filter_request_headers(req)
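On the wire, `_inject_name_sse` touches only the first role-announcing frame; every later chunk passes through untouched. A before/after sketch with a made-up upstream payload:

```python
# First streamed chunk as it arrives from upstream (fabricated example):
before = b'data: {"choices":[{"delta":{"role":"assistant"}}]}\n\n'

# _inject_name_sse(before, "code-smart", [False]) re-serialises it as:
after = b'data: {"choices":[{"delta":{"role":"assistant","name":"code-smart"}}]}\n\n'
```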
@@ -630,11 +582,6 @@ async def _handle_completion(req: Request, path: str) -> Response:
         mutated = True

     chosen_name = body.get("model")
-    if chosen_name in {PLAN_MODEL, UNCENSORED_MODEL} and body.get("tools"):
-        log.info("plan tier %s: stripping tools from request", chosen_name)
-        body.pop("tools")
-        body.pop("tool_choice", None)
-        mutated = True
     tier = _resolve_tier(chosen_name)
     if tier is not None and _inject_sampler(body, tier):
         mutated = True
@@ -645,6 +592,28 @@
     if tier is not None and tier.is_bedrock:
         from llmstack.backends import bedrock as bedrock_backend
         resp = await bedrock_backend.dispatch(req, tier, body)
+    elif tier is not None and body.get("stream"):
+        proxy = await _stream_proxy(req.method, path, raw, headers)
+        injected: list[bool] = [False]
+        tier_name = tier.name
+        original_gen = proxy.body_iterator
+
+        async def _named_gen():
+            async for chunk in original_gen:
+                yield _inject_name_sse(chunk, tier_name, injected)
+
+        proxy.body_iterator = _named_gen()
+        resp = proxy
+    elif tier is not None:
+        proxy = await _stream_proxy(req.method, path, raw, headers)
+        raw_resp = b"".join([chunk async for chunk in proxy.body_iterator])
+        patched = _inject_name_json(raw_resp, tier.name)
+        resp = Response(
+            content=patched,
+            status_code=proxy.status_code,
+            headers=dict(proxy.headers),
+            media_type=proxy.media_type,
+        )
     else:
         resp = await _stream_proxy(req.method, path, raw, headers)

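The streaming branch relies on `body_iterator` being a plain attribute on Starlette's `StreamingResponse`, so it can be swapped for a wrapping generator. A self-contained sketch of the same trick outside the router (all names here are illustrative):

```python
import asyncio
from starlette.responses import StreamingResponse

async def _chunks():
    for piece in (b"alpha ", b"beta ", b"gamma"):
        yield piece

resp = StreamingResponse(_chunks())
original = resp.body_iterator

async def _tagged():
    # Transform each chunk in flight, as _named_gen does above.
    async for chunk in original:
        yield chunk.upper()

resp.body_iterator = _tagged()

async def main() -> None:
    print(b"".join([c async for c in resp.body_iterator]))  # b'ALPHA BETA GAMMA'

asyncio.run(main())
```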
llmstack/backends/bedrock.py
CHANGED
@@ -588,6 +588,8 @@ async def _complete_response(client: Any, tier: Tier, converse_kwargs: dict[str,
         return JSONResponse(status_code=502, content={"error": _error_payload(exc)})

     message, finish = _openai_message_from_converse(resp)
+    if message.get("content"):
+        message["name"] = tier.name
     usage_in = (resp.get("usage") or {})
     payload = {
         "id": _completion_id(),
@@ -665,7 +667,7 @@ async def _stream_response(client: Any, tier: Tier, converse_kwargs: dict[str, A

     # First chunk: announce the assistant role so OpenAI clients can
     # initialise their accumulator.
-    yield _sse(_frame({"role": "assistant"}))
+    yield _sse(_frame({"role": "assistant", "name": model_label}))

     # Per-content-block state: index -> "text" | "tool_use"
     block_kinds: dict[int, str] = {}
llmstack/generators/opencode.py
CHANGED
@@ -194,7 +194,7 @@ def build_config(

     models: dict[str, dict] = {
         "auto": {
-            "name": "Auto (router selects: fast / agent /
+            "name": "Auto (router selects: fast / agent / ultra)",
             "limit": {"context": auto_ctx, "output": 16384},
             "tool_call": True,
             "cost": ZERO_COST,
llmstack/models.ini
CHANGED
@@ -178,7 +178,7 @@ description = Qwopus GLM 18B - planning, design discussions, architecture
 ; aws_region = eu-central-1
 ; aws_profile = bedrock-prod
 ; ctx_size = 200000
-; sampler = temp=0.7
+; sampler = temp=0.7 ; creative; Opus 4.6
 ; description = Claude Opus 4.6 on Bedrock - planning, design discussions, architecture

 [plan-uncensored]
@@ -258,21 +258,18 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 ;
 ; First-match-wins decision tree applied by llmstack/app.py when model="auto":
 ;
-; 1. "[
-; 2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
+; 1. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
 ;    tier configured -> code-ultra
-;
-;    AND tokens <= [plan].ctx_size (pure design discussion that
-;    still fits the planner's window) -> plan
-;    ...if the plan tier's ctx_size is breached, the request
-;    falls through to the coding ladder below rather than being
-;    sent to a planner whose window can't hold the input.
-; 4. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
+; 2. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
 ;    tokens <= high_fidelity_ceiling AND no code-ultra -> code-smart
-;
-;
-;    - if
-;    - else
+; 3. tokens <= mid_fidelity_ceiling -> code-smart
+; 4. otherwise (long context):
+;    - if turns >= multi_turn (floor at smart) -> code-smart
+;    - else -> code-fast
+;
+; Plan and uncensored tiers are accessible via their dedicated agent
+; modes (agent.plan, agent.plan-nofilter) and slash commands; they are
+; NOT auto-routed through model=auto.
 ;
 ; AUTO ROUTER MAX CONTEXT = [code-fast].ctx_size. The fast tier sits at
 ; the bottom of the step-down ladder, so any context too big for the
@@ -303,9 +300,6 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
 high_fidelity_ceiling = 12000 ; tokens; below this, top-tier model is still cheap+fast (and ultra ctx_size = 2 * this)
 mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast (smart ctx_size = 2 * this)
 multi_turn = 10 ; turn count that floors the long-context rung at code-smart
-agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
-plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
-uncensored_triggers = [nofilter], [uncensored], [heretic], "uncensored:", "nofilter:" (line start)
 ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)

 ;------------------------------------------------------------------------------
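These keys mirror the `ROUTER_*` environment overrides defined near the top of `app.py`. A minimal sketch of parsing them with the stdlib, assuming they live in a `[router]`-style section (the section name is an assumption; this diff never shows it):

```python
import configparser

cp = configparser.ConfigParser(inline_comment_prefixes=(";",))
cp.read_string("""
[router]
high_fidelity_ceiling = 12000 ; tokens
mid_fidelity_ceiling = 32000 ; tokens
multi_turn = 10 ; turns
""")
assert cp.getint("router", "high_fidelity_ceiling") == 12000
assert cp.getint("router", "multi_turn") == 10
```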
{opencode_llmstack-0.9.3.data → opencode_llmstack-0.9.6.data}/data/CHANGELOG.md
CHANGED
@@ -4,6 +4,21 @@ All notable changes to `opencode-llmstack` are documented here.

 ---

+## [0.9.4] — 2026-05-11
+
+### Fixed
+- `classify()` now scopes `has_code_signal` to the **last user message only**
+  (was scanning the full conversation history). Previously, any prior coding
+  exchange in the session (code blocks, agent verbs) would permanently block
+  plan routing for the rest of the conversation — e.g. "explain why these
+  changes are important?" after a refactor request would never reach `plan`.
+- Added regression test:
+  `test_plan_signal_after_prior_coding_exchange_routes_to_plan`.
+- `__version__` corrected from `"0.9.2"` to `"0.9.4"` (was skewed vs
+  `pyproject.toml` since 0.9.3).
+
+---
+
 ## [0.9.2] — 2026-05-11

 ### Fixed
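For context, a sketch of what the regression test named above plausibly asserts. This is reconstructed from the changelog wording, not copied from the package's test suite, and it exercises the 0.9.4 router, which still auto-routed the plan tier (removed again by 0.9.6):

```python
# Hypothetical reconstruction of the 0.9.4 regression test.
def test_plan_signal_after_prior_coding_exchange_routes_to_plan():
    body = {
        "model": "auto",
        "messages": [
            # Earlier coding exchange: must no longer poison the session.
            {"role": "user", "content": "refactor this:\n```python\ndef f(): ...\n```"},
            {"role": "assistant", "content": "Done."},
            # Last user message carries only a plan signal.
            {"role": "user", "content": "explain why these changes are important?"},
        ],
    }
    model, reason = classify(body)
    assert reason == "plan-signal"
```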
{opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
 llmstack/AGENTS.md,sha256=4DVUkqJ1-EP-cDNRCpznzghOOX6dAMbVWdcwyfFCALw,528
-llmstack/__init__.py,sha256=
+llmstack/__init__.py,sha256=Ie-86h7q7pEsE9zTlWfjBEYDIoR4mC8ZutdC5Nx-x8k,855
 llmstack/__main__.py,sha256=wXHd5-BmCCHUfNEmy2rbilBSyVhi4KD1dSIO_4NlxuE,199
 llmstack/_platform.py,sha256=eDY3T9krkaBigG5xXxqzIbH3MhdZqX3BWe7bozOsAso,13099
-llmstack/app.py,sha256=
+llmstack/app.py,sha256=CiqIGpEbcjuSGDFNH951LiWZlE7Ju2_w5FHL5ieiAog,25952
 llmstack/check_models.py,sha256=WvTS2Td4acp-Q0-yWXUgXAgAgFOmpxiaeSDuAoivirw,4559
 llmstack/cli.py,sha256=Om70PzHrmU81y2Mw1sB6eeUs1fRHP0PnsCEVNC0UNvI,11341
-llmstack/models.ini,sha256=
+llmstack/models.ini,sha256=7ObeGrScRm0pGjyjAencr5lg8gEsMpjNvvF4o4Fxhps,19860
 llmstack/paths.py,sha256=A8q4-tpwIt5UMGG5ZDESKSuViMGLbPIAL1VoONopJqU,11512
 llmstack/shell_env.py,sha256=MJSW0PP15q-fsppIZ98WZ7XoqYMZmDy4k8N0gzEA6wU,39362
 llmstack/tiers.py,sha256=yl5xEhECe-GHiVXBRvlNoFtH_9y4uNSASpfHlZ4Ja74,14820
 llmstack/backends/__init__.py,sha256=-85sQz0R94OdbM2bUHGyyA5WaMnI9bHywPOaELeQHX0,777
-llmstack/backends/bedrock.py,sha256=
+llmstack/backends/bedrock.py,sha256=1o-s5C4CWorhqnse6dTY7E6y-98CoTckwP6qfq_H9Lw,30726
 llmstack/commands/__init__.py,sha256=eVO-YUxh1fSfdq72KggC-NrTYMtN6zIykgjyRgOCAt4,406
 llmstack/commands/_helpers.py,sha256=UKADaNXrnuoDi_JG0W2Tph7rWFB0cXvQh8YknZBw56I,2660
 llmstack/commands/activate.py,sha256=zCdEmyVv5qZUdhfez6hZ5Y46N_yjPwfKbPTwCJXnA3o,3663
@@ -29,13 +29,13 @@ llmstack/download/binary.py,sha256=xpv15wF4viv8uFC5UqfSIf36CIoPpmaNUaVtjF-vTWA,8
 llmstack/download/ggufs.py,sha256=2hCr-svUiPIV2I3ruwTbXo6lPn9m-VBOqa3DFbvdIcA,5435
 llmstack/generators/__init__.py,sha256=LfbcReuyYBCdVuT9J5RKo7-f8n585YBU3Hus6DsxqTs,1189
 llmstack/generators/llama_swap.py,sha256=KdYH9N6TJECotZvyxvAjaa3kRyzn4YOi2T6D2UdyVKw,14785
-llmstack/generators/opencode.py,sha256=
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
-opencode_llmstack-0.9.
+llmstack/generators/opencode.py,sha256=8-eVGD3ZTgta3DHQ5gu6I9CkYZt7x-EdMw2qNzgyoQ4,11191
+opencode_llmstack-0.9.6.data/data/CHANGELOG.md,sha256=58feU0rA9bBYvecDoFaLcwwgezLPkD3MSt0vRUVjdF8,5837
+opencode_llmstack-0.9.6.data/data/LICENSE,sha256=6G-Otw6BHIM1WJSBlJ04P1rDVCqbDEzKpdOlSr5CqIY,1078
+opencode_llmstack-0.9.6.data/data/UPGRADING.md,sha256=0XSNZ9trCviFLH5EL3Jz02fO2_8AfqB8_9aX0-o1bik,24927
+opencode_llmstack-0.9.6.dist-info/licenses/LICENSE,sha256=6G-Otw6BHIM1WJSBlJ04P1rDVCqbDEzKpdOlSr5CqIY,1078
+opencode_llmstack-0.9.6.dist-info/METADATA,sha256=VHLq0f7YxR0BAPnGYC7uLS4jqNGB0Ki5GPZq0ER1nW8,36323
+opencode_llmstack-0.9.6.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+opencode_llmstack-0.9.6.dist-info/entry_points.txt,sha256=soomjpqvl4KzFScgpQbu96vgcLriOtkB9MbiSC0rvZ8,47
+opencode_llmstack-0.9.6.dist-info/top_level.txt,sha256=tMv9sDWp8RW_DNNY8cuM4Uy4sND-KwTLcsScl5gdcEQ,9
+opencode_llmstack-0.9.6.dist-info/RECORD,,
{opencode_llmstack-0.9.3.data → opencode_llmstack-0.9.6.data}/data/LICENSE
File without changes
{opencode_llmstack-0.9.3.data → opencode_llmstack-0.9.6.data}/data/UPGRADING.md
File without changes
{opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/WHEEL
File without changes
{opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/entry_points.txt
File without changes
{opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/licenses/LICENSE
File without changes
{opencode_llmstack-0.9.3.dist-info → opencode_llmstack-0.9.6.dist-info}/top_level.txt
File without changes