opencode-llmstack 0.9.3__tar.gz → 0.9.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/CHANGELOG.md +15 -0
  2. {opencode_llmstack-0.9.3/opencode_llmstack.egg-info → opencode_llmstack-0.9.6}/PKG-INFO +1 -1
  3. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/__init__.py +1 -1
  4. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/app.py +78 -109
  5. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/backends/bedrock.py +3 -1
  6. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/generators/opencode.py +1 -1
  7. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/models.ini +11 -17
  8. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6/opencode_llmstack.egg-info}/PKG-INFO +1 -1
  9. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/pyproject.toml +1 -1
  10. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/LICENSE +0 -0
  11. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/README.md +0 -0
  12. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/UPGRADING.md +0 -0
  13. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/AGENTS.md +0 -0
  14. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/__main__.py +0 -0
  15. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/_platform.py +0 -0
  16. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/backends/__init__.py +0 -0
  17. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/check_models.py +0 -0
  18. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/cli.py +0 -0
  19. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/__init__.py +0 -0
  20. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/_helpers.py +0 -0
  21. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/activate.py +0 -0
  22. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/check.py +0 -0
  23. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/download.py +0 -0
  24. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/install.py +0 -0
  25. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/install_llama_swap.py +0 -0
  26. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/reload.py +0 -0
  27. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/restart.py +0 -0
  28. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/setup.py +0 -0
  29. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/start.py +0 -0
  30. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/status.py +0 -0
  31. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/commands/stop.py +0 -0
  32. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/download/__init__.py +0 -0
  33. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/download/binary.py +0 -0
  34. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/download/ggufs.py +0 -0
  35. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/generators/__init__.py +0 -0
  36. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/generators/llama_swap.py +0 -0
  37. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/paths.py +0 -0
  38. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/shell_env.py +0 -0
  39. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/tiers.py +0 -0
  40. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/opencode_llmstack.egg-info/SOURCES.txt +0 -0
  41. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/opencode_llmstack.egg-info/dependency_links.txt +0 -0
  42. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/opencode_llmstack.egg-info/entry_points.txt +0 -0
  43. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/opencode_llmstack.egg-info/requires.txt +0 -0
  44. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/opencode_llmstack.egg-info/top_level.txt +0 -0
  45. {opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/setup.cfg +0 -0
{opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/CHANGELOG.md
@@ -4,6 +4,21 @@ All notable changes to `opencode-llmstack` are documented here.
 
  ---
 
+ ## [0.9.4] — 2026-05-11
+
+ ### Fixed
+ - `classify()` now scopes `has_code_signal` to the **last user message only**
+   (was scanning the full conversation history). Previously, any prior coding
+   exchange in the session (code blocks, agent verbs) would permanently block
+   plan routing for the rest of the conversation — e.g. "explain why these
+   changes are important?" after a refactor request would never reach `plan`.
+ - Added regression test:
+   `test_plan_signal_after_prior_coding_exchange_routes_to_plan`.
+ - `__version__` corrected from `"0.9.2"` to `"0.9.4"` (was skewed vs
+   `pyproject.toml` since 0.9.3).
+
+ ---
+
  ## [0.9.2] — 2026-05-11
 
  ### Fixed
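
For orientation: the scoping change the 0.9.4 entry describes amounts to running the code-signal regexes against the most recent user message rather than the whole history. Below is a minimal sketch, with a hypothetical `last_user_text` helper and the `CODE_BLOCK` pattern as it appeared in `llmstack/app.py` before 0.9.6 (the app.py hunks later in this diff remove plan auto-routing altogether):

```python
import re

# The code-signal pattern from llmstack/app.py prior to its removal in 0.9.6.
CODE_BLOCK = re.compile(r"```|`[^`\n]{30,}`")

def last_user_text(messages: list[dict]) -> str:
    """Hypothetical helper: content of the most recent user message."""
    for m in reversed(messages):
        if m.get("role") == "user" and isinstance(m.get("content"), str):
            return m["content"]
    return ""

messages = [
    {"role": "user", "content": "refactor this: ```python ... ```"},
    {"role": "assistant", "content": "Done."},
    {"role": "user", "content": "explain why these changes are important?"},
]

# Pre-0.9.4: scanning the full history finds the earlier code block, so the
# follow-up design question could never route to `plan`.
assert any(CODE_BLOCK.search(m["content"]) for m in messages)
# Post-0.9.4: only the last user message is scanned, so no code signal here.
assert not CODE_BLOCK.search(last_user_text(messages))
```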
{opencode_llmstack-0.9.3/opencode_llmstack.egg-info → opencode_llmstack-0.9.6}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opencode-llmstack
- Version: 0.9.3
+ Version: 0.9.6
  Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
  Author: llmstack
  License: MIT License
{opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/__init__.py
@@ -16,5 +16,5 @@ organised by concern:
 
  from __future__ import annotations
 
- __version__ = "0.9.2"
+ __version__ = "0.9.6"
  __all__ = ["__version__"]
{opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/app.py
@@ -36,7 +36,7 @@ Behaviour:
  ``POST /v1/completions``
    - if request body ``model == "auto"`` (or unset), classify the request
      and rewrite ``model`` -> one of: ``code-fast``, ``code-smart``,
-      ``code-ultra`` (when wired), ``plan``, ``plan-uncensored``.
+      ``code-ultra`` (when wired).
    - otherwise pass through unchanged.
    - tiers with ``backend = bedrock`` in ``models.ini`` are dispatched
      to AWS Bedrock via :mod:`llmstack.backends.bedrock` instead of
@@ -63,41 +63,28 @@ step DOWN as context grows**. This inverts the classic
    from priors.
 
  So as the conversation accumulates context, we step *down*: ultra
- -> smart -> fast. Triggers and the plan track sit alongside this
- ladder.
+ -> smart -> fast.
 
  Routing decision tree (first match wins):
 
- 1. Explicit "uncensored" trigger in the last user message
-    (``[nofilter]``, ``[uncensored]``, ``[heretic]``, or a line
-    starting with ``uncensored:`` / ``nofilter:``) -> plan-uncensored
- 2. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
+ 1. Explicit "ultra" trigger (``[ultra]``, ``[opus]``,
     ``ultra:``, ``opus:``) AND ultra tier configured -> code-ultra
- 3. PLAN signal words AND no code-block / agent verbs / tools
-    AND estimated tokens <= ``[plan]`` tier's ctx_size
-    (pure design discussion that fits the planner's
-    window) -> plan
-    (if the planner's
-    ctx_size is breached
-    we fall through to
-    the coding ladder
-    rather than send a
-    request that won't
-    fit -- the coding
-    tiers cover larger
-    windows by design)
- 4. Estimated input tokens <= HIGH_FIDELITY_CEILING
+ 2. Estimated input tokens <= HIGH_FIDELITY_CEILING
     ("reasonable context still being built") -> code-ultra
     (else code-smart)
- 5. Estimated input tokens <= MID_FIDELITY_CEILING -> code-smart
- 6. Otherwise (long context, top-tier becomes
-    expensive/slow, fast tier's 128k window is the
-    best fit and it's free) -> code-fast
+ 3. Estimated input tokens <= MID_FIDELITY_CEILING -> code-smart
+ 4. Otherwise (long context, top-tier becomes
+    expensive/slow, fast tier's 128k window is the
+    best fit and it's free) -> code-fast
     (floored at
     code-smart when
     n_turns >=
     MULTI_TURN_THRESHOLD)
 
+ Plan and uncensored tiers are accessible via their dedicated agent
+ modes (``agent.plan``, ``agent.plan-nofilter``) and slash commands;
+ they are not auto-routed through ``model = auto``.
+
  The auto router's effective max context window is
  ``[code-fast].ctx_size`` -- fast is the bottom of the step-down
  ladder, so any context that would overflow the tiers above lands on
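
The resulting ladder is compact enough to restate as straight-line code. A condensed sketch of the post-change tree (not the shipped `classify()`; constants are the module defaults, and the explicit `[ultra]` trigger step plus the plan/uncensored agent modes are omitted):

```python
# Condensed model of the step-down ladder documented above.
HIGH_FIDELITY_CEILING = 12_000   # ROUTER_HIGH_FIDELITY_CEILING default
MID_FIDELITY_CEILING = 32_000    # ROUTER_MID_FIDELITY_CEILING default
MULTI_TURN_THRESHOLD = 10        # ROUTER_MULTI_TURN default

def route(est_tokens: int, n_turns: int, ultra_wired: bool) -> str:
    if est_tokens <= HIGH_FIDELITY_CEILING:      # rung 1: short context
        return "code-ultra" if ultra_wired else "code-smart"
    if est_tokens <= MID_FIDELITY_CEILING:       # rung 2: mid context
        return "code-smart"
    if n_turns >= MULTI_TURN_THRESHOLD:          # long-context floor
        return "code-smart"
    return "code-fast"                           # rung 3: step down

assert route(8_000, 1, ultra_wired=True) == "code-ultra"
assert route(8_000, 1, ultra_wired=False) == "code-smart"
assert route(20_000, 1, ultra_wired=True) == "code-smart"
assert route(50_000, 12, ultra_wired=True) == "code-smart"  # turn floor
assert route(50_000, 3, ultra_wired=True) == "code-fast"
```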
@@ -167,45 +154,14 @@ UNCENSORED_MODEL = os.getenv("ROUTER_UNCENSORED_MODEL", "plan-uncensored")
  # still has comfortable headroom.
  HIGH_FIDELITY_CEILING = int(os.getenv("ROUTER_HIGH_FIDELITY_CEILING", "12000"))
  MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
- # Floor the long-context rung at code-smart whenever a tool-call
- # protocol is in play -- 3B models tool-call unreliably regardless of
- # how big their context window is.
  MULTI_TURN_THRESHOLD = int(os.getenv("ROUTER_MULTI_TURN", "10"))
  AUTO_ALIASES = {"auto", "", None}
 
- UNCENSORED_TRIGGERS = re.compile(
-     r"(\[(uncensored|nofilter|no-?filter|heretic)\]"
-     r"|^[ \t]*(uncensored|nofilter|no-?filter)\s*:)",
-     re.IGNORECASE | re.MULTILINE,
- )
-
  ULTRA_TRIGGERS = re.compile(
      r"(\[(ultra|opus)\]|^[ \t]*(ultra|opus)\s*:)",
      re.IGNORECASE | re.MULTILINE,
  )
 
- PLAN_SIGNALS = re.compile(
-     r"\b(plan|design|architect(ure)?|approach|trade-?off|"
-     r"should\s+we|how\s+would\s+(you|we)|what\s+would\s+you|"
-     r"explain\s+why|reason\s+about|think\s+(through|step|hard|carefully)|"
-     r"compare\s+(options|approaches)|review\s+(the|this|my)\s+"
-     r"(architecture|design|approach|plan)|brainstorm|outline|"
-     r"summari[sz]e|root\s*cause|migrate|port\s+to)\b",
-     re.IGNORECASE,
- )
-
- AGENT_SIGNALS = re.compile(
-     r"\b(implement|fix\s+(this|the|a|my)?\s*(bug|issue|error|test)|"
-     r"write\s+(a|the|some)?\s*(function|class|test|script|module|method)|"
-     r"add\s+(a|the)?\s*(function|class|method|test|file|endpoint)|"
-     r"create\s+(a|the)?\s*(function|class|file|component|endpoint)|"
-     r"refactor|edit|patch|generate\s+code|debug|trace|"
-     r"run\s+tests?|build\s+(it|this)|compile)\b",
-     re.IGNORECASE,
- )
-
- CODE_BLOCK = re.compile(r"```|`[^`\n]{30,}`")
-
  logging.basicConfig(
      level=os.getenv("LOG_LEVEL", "INFO"),
      format="%(asctime)s %(levelname)s router %(message)s",
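
Since `ULTRA_TRIGGERS` is now the only trigger regex left, it is worth seeing what it actually matches. The pattern below is copied verbatim from the hunk above:

```python
import re

ULTRA_TRIGGERS = re.compile(
    r"(\[(ultra|opus)\]|^[ \t]*(ultra|opus)\s*:)",
    re.IGNORECASE | re.MULTILINE,
)

assert ULTRA_TRIGGERS.search("please [ultra] think this through")
assert ULTRA_TRIGGERS.search("notes\nOpus: design the schema")   # line-start form
assert not ULTRA_TRIGGERS.search("ultrafast parser")             # bare word: no match
assert not ULTRA_TRIGGERS.search("about ultra: things")          # not at line start
```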
@@ -221,12 +177,11 @@ async def _lifespan(app: FastAPI):
      bedrock_tiers = sorted(t.name for t in TIERS.values() if t.is_bedrock)
      log.info(
          "router up upstream=%s ladder=[ultra<=%d -> agent<=%d -> fast] "
-         "fast=%s agent=%s ultra=%s plan=%s uncensored=%s bedrock=%s",
+         "fast=%s agent=%s ultra=%s bedrock=%s",
          UPSTREAM, HIGH_FIDELITY_CEILING, MID_FIDELITY_CEILING,
          FAST_MODEL, AGENT_MODEL,
          f"{ULTRA_MODEL} (active)" if _ultra_available()
          else f"{ULTRA_MODEL} (unwired -- high-fidelity rung falls back to {AGENT_MODEL})",
-         PLAN_MODEL, UNCENSORED_MODEL,
          ",".join(bedrock_tiers) or "(none)",
      )
      yield
@@ -302,12 +257,6 @@ def _estimate_tokens(messages: list[dict[str, Any]] | None, prompt: str | None)
      return chars // 4
 
 
- def _matches(pattern: re.Pattern[str], messages: list[dict[str, Any]] | None, prompt: str | None) -> bool:
-     if prompt and pattern.search(prompt):
-         return True
-     return any(pattern.search(t) for t in _iter_message_text(messages))
-
-
  def _ultra_available() -> bool:
      """True iff the ultra tier is loaded from ``models.ini``.
 
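
The `return chars // 4` context line above is the whole token-estimation heuristic: roughly four characters per token. A simplified stand-in with the same shape (the real `_estimate_tokens` also walks structured, multi-part message content, so treat this as an assumption-flagged sketch):

```python
from typing import Any

def estimate_tokens_sketch(messages: list[dict[str, Any]] | None,
                           prompt: str | None) -> int:
    """Stand-in for _estimate_tokens: ~4 chars per token."""
    chars = len(prompt or "")
    for m in messages or []:
        content = m.get("content")
        if isinstance(content, str):   # real code also handles list parts
            chars += len(content)
    return chars // 4

# 48k characters land exactly on the rung-1 boundary (12k tokens).
assert estimate_tokens_sketch(None, "x" * 48_000) == 12_000
```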
@@ -331,6 +280,11 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
 
      Step-DOWN ladder: top fidelity for short context, fall to mid for
      medium, drop to fast for long. See module docstring for rationale.
+
+     Only the fast / agent / ultra rungs are implemented here. Plan and
+     uncensored tiers are accessible via their dedicated agent modes
+     (``agent.plan``, ``agent.plan-nofilter``) and slash commands; they
+     are not auto-routed from the build agent.
      """
      messages = body.get("messages") if isinstance(body.get("messages"), list) else None
      prompt = body.get("prompt") if isinstance(body.get("prompt"), str) else None
@@ -341,50 +295,17 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
          for m in (messages or [])
          if m.get("role") == "system" and isinstance(m.get("content"), str)
      ]
-     if any(UNCENSORED_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
-         return UNCENSORED_MODEL, "uncensored-trigger"
 
      if any(ULTRA_TRIGGERS.search(s) for s in (last_user, *sys_prompts) if s):
          if _ultra_available():
              return ULTRA_MODEL, "ultra-trigger"
-         # Explicit user opt-in but the tier isn't wired up. Don't 404 --
-         # serve the request from the heaviest tier we *do* have and let
-         # the user notice in logs that their trigger was a no-op.
          log.warning("ultra-trigger ignored: %s not in models.ini; falling back to %s",
                      ULTRA_MODEL, AGENT_MODEL)
          return AGENT_MODEL, f"ultra-trigger->agent ({ULTRA_MODEL} unavailable)"
 
      n_turns = sum(1 for m in (messages or []) if m.get("role") == "user")
-     has_code_signal = (
-         _matches(CODE_BLOCK, messages, prompt)
-         or _matches(AGENT_SIGNALS, messages, prompt)
-     )
-
      est = _estimate_tokens(messages, prompt)
 
-     # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
-     # chat-tuned model meant for design / "should we" discussions. Only
-     # take it when nothing about the request says "I'm about to write
-     # code" (no triple-backticks, no agent verbs). Tools are stripped
-     # from the request body before dispatch (see ``_handle_completion``),
-     # so their presence here does not block plan routing.
-     # Only route to plan if the input fits in the planner's ctx_size --
-     # past that we fall through to the coding ladder which has tiers
-     # (smart, fast) explicitly sized for larger contexts.
-     if (
-         not has_code_signal
-         and _matches(PLAN_SIGNALS, messages, prompt)
-     ):
-         plan_tier = TIER_BY_ALIAS.get(PLAN_MODEL)
-         plan_ctx = plan_tier.ctx_size if plan_tier else 0
-         if not plan_ctx or est <= plan_ctx:
-             return PLAN_MODEL, "plan-signal"
-         log.info(
-             "plan-signal but tokens~%d > %s.ctx_size %d; "
-             "falling through to coding ladder",
-             est, PLAN_MODEL, plan_ctx,
-         )
-
      # Rung 1: short context -- start at the top.
      if est <= HIGH_FIDELITY_CEILING:
          if _ultra_available():
@@ -399,9 +320,7 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
          return AGENT_MODEL, f"mid-fidelity tokens~{est}<={MID_FIDELITY_CEILING}"
 
      # Rung 3: long context -- step down to fast. Floor at smart only
-     # when the multi-turn threshold is hit; tools alone no longer
-     # prevent the step-down (plan tiers strip tools before dispatch,
-     # and code-fast is a hosted model that tool-calls reliably).
+     # when the multi-turn threshold is hit.
      if n_turns >= MULTI_TURN_THRESHOLD:
          return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} (user-turns={n_turns}>={MULTI_TURN_THRESHOLD} floor)"
      return FAST_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING}"
@@ -530,14 +449,14 @@ async def list_models() -> JSONResponse:
              f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING}, "
              f"'{FAST_MODEL}' beyond that."
          )
-         name = "Auto (step-down router: ultra/agent/fast + plan/uncensored)"
+         name = "Auto (step-down router: ultra/agent/fast)"
      else:
          top_blurb = (
              f"Step-down ladder (top->bottom as context grows): "
              f"'{AGENT_MODEL}' up to ~{MID_FIDELITY_CEILING} tokens, "
              f"'{FAST_MODEL}' beyond that."
          )
-         name = "Auto (step-down router: agent/fast + plan/uncensored)"
+         name = "Auto (step-down router: agent/fast)"
      data["data"].insert(0, {
          "id": "auto",
          "object": "model",
@@ -546,8 +465,6 @@ async def list_models() -> JSONResponse:
          "name": name,
          "description": (
              f"{top_blurb} "
-             f"'{PLAN_MODEL}' for design/planning (orthogonal to ladder); "
-             f"'{UNCENSORED_MODEL}' for explicit [nofilter] triggers; "
              f"'[ultra]'/'[opus]' triggers force '{ULTRA_MODEL}' regardless of size."
          ),
          "tier": "auto",
@@ -607,6 +524,41 @@ def _inject_sampler(body: dict[str, Any], tier: Tier) -> bool:
      return mutated
 
 
+ def _inject_name_json(raw: bytes, tier_name: str) -> bytes:
+     try:
+         data = json.loads(raw)
+     except (json.JSONDecodeError, ValueError):
+         return raw
+     try:
+         msg = data["choices"][0]["message"]
+         if msg.get("content"):
+             msg["name"] = tier_name
+     except (KeyError, IndexError, TypeError):
+         pass
+     return json.dumps(data).encode()
+
+
+ def _inject_name_sse(chunk: bytes, tier_name: str, injected: list[bool]) -> bytes:
+     if injected[0]:
+         return chunk
+     line = chunk.decode(errors="replace")
+     if not line.startswith("data: "):
+         return chunk
+     payload_str = line[len("data: "):].strip()
+     if payload_str in ("[DONE]", ""):
+         return chunk
+     try:
+         payload = json.loads(payload_str)
+         delta = payload["choices"][0]["delta"]
+         if "role" in delta:
+             delta["name"] = tier_name
+             injected[0] = True
+             return f"data: {json.dumps(payload, separators=(',', ':'))}\n\n".encode()
+     except (KeyError, IndexError, TypeError, json.JSONDecodeError):
+         pass
+     return chunk
+
+
  async def _handle_completion(req: Request, path: str) -> Response:
      raw = await req.body()
      headers = _filter_request_headers(req)
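
With the two helpers above in scope, their effect on a non-streaming body and on the first SSE chunk looks like this (illustrative values):

```python
# Non-streaming: tag the assistant message once the body is buffered.
raw = b'{"choices":[{"message":{"role":"assistant","content":"hi"}}]}'
patched = _inject_name_json(raw, "code-smart")
# -> ...{"role": "assistant", "content": "hi", "name": "code-smart"}...

# Streaming: only the role-announcing first chunk is rewritten; the
# shared one-element list records that injection already happened.
injected = [False]
chunk = b'data: {"choices":[{"delta":{"role":"assistant"}}]}\n\n'
out = _inject_name_sse(chunk, "code-smart", injected)
# -> b'data: {"choices":[{"delta":{"role":"assistant","name":"code-smart"}}]}\n\n'
assert injected[0] is True
```

Note that both helpers return the input unchanged on unparseable or unexpectedly shaped payloads, so malformed upstream chunks pass through rather than breaking the proxy, and the SSE helper assumes each chunk carries one complete `data:` line.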
@@ -630,11 +582,6 @@ async def _handle_completion(req: Request, path: str) -> Response:
          mutated = True
 
      chosen_name = body.get("model")
-     if chosen_name in {PLAN_MODEL, UNCENSORED_MODEL} and body.get("tools"):
-         log.info("plan tier %s: stripping tools from request", chosen_name)
-         body.pop("tools")
-         body.pop("tool_choice", None)
-         mutated = True
      tier = _resolve_tier(chosen_name)
      if tier is not None and _inject_sampler(body, tier):
          mutated = True
@@ -645,6 +592,28 @@ async def _handle_completion(req: Request, path: str) -> Response:
      if tier is not None and tier.is_bedrock:
          from llmstack.backends import bedrock as bedrock_backend
          resp = await bedrock_backend.dispatch(req, tier, body)
+     elif tier is not None and body.get("stream"):
+         proxy = await _stream_proxy(req.method, path, raw, headers)
+         injected: list[bool] = [False]
+         tier_name = tier.name
+         original_gen = proxy.body_iterator
+
+         async def _named_gen():
+             async for chunk in original_gen:
+                 yield _inject_name_sse(chunk, tier_name, injected)
+
+         proxy.body_iterator = _named_gen()
+         resp = proxy
+     elif tier is not None:
+         proxy = await _stream_proxy(req.method, path, raw, headers)
+         raw_resp = b"".join([chunk async for chunk in proxy.body_iterator])
+         patched = _inject_name_json(raw_resp, tier.name)
+         resp = Response(
+             content=patched,
+             status_code=proxy.status_code,
+             headers=dict(proxy.headers),
+             media_type=proxy.media_type,
+         )
      else:
          resp = await _stream_proxy(req.method, path, raw, headers)
 
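
The streaming branch above is an instance of a generic wrap-the-iterator pattern: a `StreamingResponse`'s `body_iterator` can be swapped for an async generator that transforms chunks in flight (and `injected` is a one-element list so the flag is a mutable cell shared between the handler and the closure). In isolation, assuming Starlette's `StreamingResponse` and a caller-supplied transform:

```python
from collections.abc import Callable
from starlette.responses import StreamingResponse

def wrap_body(resp: StreamingResponse,
              transform: Callable[[bytes], bytes]) -> StreamingResponse:
    """Replace resp.body_iterator with a chunk-transforming generator."""
    original = resp.body_iterator   # keep a reference before swapping

    async def _gen():
        async for chunk in original:
            yield transform(chunk)

    resp.body_iterator = _gen()
    return resp
```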
{opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/backends/bedrock.py
@@ -588,6 +588,8 @@ async def _complete_response(client: Any, tier: Tier, converse_kwargs: dict[str,
          return JSONResponse(status_code=502, content={"error": _error_payload(exc)})
 
      message, finish = _openai_message_from_converse(resp)
+     if message.get("content"):
+         message["name"] = tier.name
      usage_in = (resp.get("usage") or {})
      payload = {
          "id": _completion_id(),
@@ -665,7 +667,7 @@ async def _stream_response(client: Any, tier: Tier, converse_kwargs: dict[str, A
 
      # First chunk: announce the assistant role so OpenAI clients can
      # initialise their accumulator.
-     yield _sse(_frame({"role": "assistant"}))
+     yield _sse(_frame({"role": "assistant", "name": model_label}))
 
      # Per-content-block state: index -> "text" | "tool_use"
      block_kinds: dict[int, str] = {}
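
Assuming `_frame` wraps a delta in an OpenAI-style chunk envelope and `_sse` serialises it as a `data:` line (both functions below are stand-ins, not the module's actual helpers), the opening frame now carries the tier label next to the role:

```python
import json

def frame_sketch(delta: dict) -> dict:   # stand-in for _frame
    return {"choices": [{"index": 0, "delta": delta}]}

def sse_sketch(frame: dict) -> bytes:    # stand-in for _sse
    return f"data: {json.dumps(frame, separators=(',', ':'))}\n\n".encode()

print(sse_sketch(frame_sketch({"role": "assistant", "name": "code-ultra"})))
# b'data: {"choices":[{"index":0,"delta":{"role":"assistant","name":"code-ultra"}}]}\n\n'
```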
{opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/generators/opencode.py
@@ -194,7 +194,7 @@ def build_config(
 
      models: dict[str, dict] = {
          "auto": {
-             "name": "Auto (router selects: fast / agent / plan / uncensored)",
+             "name": "Auto (router selects: fast / agent / ultra)",
              "limit": {"context": auto_ctx, "output": 16384},
              "tool_call": True,
              "cost": ZERO_COST,
{opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/llmstack/models.ini
@@ -178,7 +178,7 @@ description = Qwopus GLM 18B - planning, design discussions, architecture
  ; aws_region = eu-central-1
  ; aws_profile = bedrock-prod
  ; ctx_size = 200000
- ; sampler = temp=0.7, top_p=0.9 ; creative; Opus 4.6 accepts both
+ ; sampler = temp=0.7 ; creative; Opus 4.6
  ; description = Claude Opus 4.6 on Bedrock - planning, design discussions, architecture
 
  [plan-uncensored]
@@ -258,21 +258,18 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  ;
  ; First-match-wins decision tree applied by llmstack/app.py when model="auto":
  ;
- ; 1. "[nofilter]" / "uncensored:" trigger -> plan-uncensored
- ; 2. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
+ ; 1. "[ultra]" / "[opus]" / "ultra:" trigger AND code-ultra
  ;    tier configured -> code-ultra
- ; 3. PLAN signal words AND no code-block / agent verbs / tools
- ;    AND tokens <= [plan].ctx_size (pure design discussion that
- ;    still fits the planner's window) -> plan
- ;    ...if the plan tier's ctx_size is breached, the request
- ;    falls through to the coding ladder below rather than being
- ;    sent to a planner whose window can't hold the input.
- ; 4. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
+ ; 2. tokens <= high_fidelity_ceiling AND code-ultra configured -> code-ultra
  ;    tokens <= high_fidelity_ceiling AND no code-ultra -> code-smart
- ; 5. tokens <= mid_fidelity_ceiling -> code-smart
- ; 6. otherwise (long context):
- ;    - if tools[] OR turns >= multi_turn (3B tool-calls badly) -> code-smart
- ;    - else -> code-fast
+ ; 3. tokens <= mid_fidelity_ceiling -> code-smart
+ ; 4. otherwise (long context):
+ ;    - if turns >= multi_turn (floor at smart) -> code-smart
+ ;    - else -> code-fast
+ ;
+ ; Plan and uncensored tiers are accessible via their dedicated agent
+ ; modes (agent.plan, agent.plan-nofilter) and slash commands; they are
+ ; NOT auto-routed through model=auto.
  ;
  ; AUTO ROUTER MAX CONTEXT = [code-fast].ctx_size. The fast tier sits at
  ; the bottom of the step-down ladder, so any context too big for the
@@ -303,9 +300,6 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  high_fidelity_ceiling = 12000 ; tokens; below this, top-tier model is still cheap+fast (and ultra ctx_size = 2 * this)
  mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast (smart ctx_size = 2 * this)
  multi_turn = 10 ; turn count that floors the long-context rung at code-smart
- agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
- plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
- uncensored_triggers = [nofilter], [uncensored], [heretic], "uncensored:", "nofilter:" (line start)
  ultra_triggers = [ultra], [opus], "ultra:", "opus:" (line start)
 
  ;------------------------------------------------------------------------------
{opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6/opencode_llmstack.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opencode-llmstack
- Version: 0.9.3
+ Version: 0.9.6
  Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
  Author: llmstack
  License: MIT License
{opencode_llmstack-0.9.3 → opencode_llmstack-0.9.6}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "opencode-llmstack"
- version = "0.9.3"
+ version = "0.9.6"
  description = "Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring."
  readme = "README.md"
  requires-python = ">=3.11"