opencode-llmstack 0.8.0__tar.gz → 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/PKG-INFO +1 -1
  2. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/app.py +25 -24
  3. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/models.ini +1 -1
  4. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/opencode_llmstack.egg-info/PKG-INFO +1 -1
  5. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/pyproject.toml +1 -1
  6. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/README.md +0 -0
  7. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/AGENTS.md +0 -0
  8. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/__init__.py +0 -0
  9. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/__main__.py +0 -0
  10. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/_platform.py +0 -0
  11. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/backends/__init__.py +0 -0
  12. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/backends/bedrock.py +0 -0
  13. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/check_models.py +0 -0
  14. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/cli.py +0 -0
  15. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/__init__.py +0 -0
  16. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/_helpers.py +0 -0
  17. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/activate.py +0 -0
  18. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/check.py +0 -0
  19. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/download.py +0 -0
  20. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/install.py +0 -0
  21. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/install_llama_swap.py +0 -0
  22. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/reload.py +0 -0
  23. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/restart.py +0 -0
  24. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/setup.py +0 -0
  25. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/start.py +0 -0
  26. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/status.py +0 -0
  27. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/commands/stop.py +0 -0
  28. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/download/__init__.py +0 -0
  29. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/download/binary.py +0 -0
  30. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/download/ggufs.py +0 -0
  31. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/generators/__init__.py +0 -0
  32. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/generators/llama_swap.py +0 -0
  33. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/generators/opencode.py +0 -0
  34. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/paths.py +0 -0
  35. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/shell_env.py +0 -0
  36. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/tiers.py +0 -0
  37. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/opencode_llmstack.egg-info/SOURCES.txt +0 -0
  38. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/opencode_llmstack.egg-info/dependency_links.txt +0 -0
  39. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/opencode_llmstack.egg-info/entry_points.txt +0 -0
  40. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/opencode_llmstack.egg-info/requires.txt +0 -0
  41. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/opencode_llmstack.egg-info/top_level.txt +0 -0
  42. {opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/setup.cfg +0 -0
{opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opencode-llmstack
- Version: 0.8.0
+ Version: 0.9.1
  Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
  Author: llmstack
  License: MIT
{opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/app.py
@@ -90,16 +90,13 @@ Routing decision tree (first match wins):
  ("reasonable context still being built") -> code-ultra
  (else code-smart)
  5. Estimated input tokens <= MID_FIDELITY_CEILING -> code-smart
- 6. Otherwise (long context, top-tier becomes
- expensive/slow, fast tier's 128k window is the
- best fit and it's free) -> code-fast
+ 6. Otherwise (long context, top-tier becomes
+ expensive/slow, fast tier's 128k window is the
+ best fit and it's free) -> code-fast
  (floored at
  code-smart when
- ``tools[]`` is set
- or n_turns >=
- MULTI_TURN_THRESHOLD,
- since 3B models
- tool-call unreliably)
+ n_turns >=
+ MULTI_TURN_THRESHOLD)

  The auto router's effective max context window is
  ``[code-fast].ctx_size`` -- fast is the bottom of the step-down
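For orientation, the rewritten rungs condense to roughly the sketch below. It is an illustrative reduction, not the package's code: route_long_context is a made-up name, and the bare integers stand in for the MID_FIDELITY_CEILING and MULTI_TURN_THRESHOLD constants that app.py actually reads from the environment.

    def route_long_context(est_tokens: int, n_user_turns: int) -> str:
        # Rungs 2-3 of the 0.9.1 ladder, condensed (hypothetical helper).
        if est_tokens <= 32000:        # MID_FIDELITY_CEILING default
            return "code-smart"        # mid-fidelity sweet spot
        if n_user_turns >= 10:         # MULTI_TURN_THRESHOLD default (was 6)
            return "code-smart"        # floor; tools[] alone no longer floors
        return "code-fast"             # long context: 128k window, free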
@@ -173,7 +170,7 @@ MID_FIDELITY_CEILING = int(os.getenv("ROUTER_MID_FIDELITY_CEILING", "32000"))
  # Floor the long-context rung at code-smart whenever a tool-call
  # protocol is in play -- 3B models tool-call unreliably regardless of
  # how big their context window is.
- MULTI_TURN_THRESHOLD = int(os.getenv("ROUTER_MULTI_TURN", "6"))
+ MULTI_TURN_THRESHOLD = int(os.getenv("ROUTER_MULTI_TURN", "10"))
  AUTO_ALIASES = {"auto", "", None}

  UNCENSORED_TRIGGERS = re.compile(
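The threshold stays overridable through the same environment variable, so the 0.8.0 behaviour can be pinned if wanted. A minimal sketch, assuming the variable is set before app.py is imported (the constant is read at import time); the variable name comes from the diff, the value is just the old default:

    import os

    os.environ["ROUTER_MULTI_TURN"] = "6"  # restore the 0.8.0 floor threshold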
@@ -356,8 +353,7 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
  ULTRA_MODEL, AGENT_MODEL)
  return AGENT_MODEL, f"ultra-trigger->agent ({ULTRA_MODEL} unavailable)"

- has_tools = bool(body.get("tools"))
- n_turns = len(messages) if messages else 0
+ n_turns = sum(1 for m in (messages or []) if m.get("role") == "user")
  has_code_signal = (
  _matches(CODE_BLOCK, messages, prompt)
  or _matches(AGENT_SIGNALS, messages, prompt)
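The n_turns rewrite fixes an overcount: an agent transcript interleaves assistant and tool messages, so len(messages) grows several times faster than the number of actual user turns. A quick illustration with a made-up transcript:

    messages = [
        {"role": "system", "content": "..."},
        {"role": "user", "content": "fix the bug"},
        {"role": "assistant", "content": "", "tool_calls": []},
        {"role": "tool", "content": "pytest output ..."},
        {"role": "assistant", "content": "done"},
        {"role": "user", "content": "now add a regression test"},
    ]
    old_turns = len(messages) if messages else 0                             # 6
    new_turns = sum(1 for m in (messages or []) if m.get("role") == "user")  # 2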
@@ -368,14 +364,14 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
  # Plan track is orthogonal to the code fidelity ladder: ``plan`` is a
  # chat-tuned model meant for design / "should we" discussions. Only
  # take it when nothing about the request says "I'm about to write
- # code" (no triple-backticks, no agent verbs, no tool calls). And
- # only if the input fits in the planner's ctx_size -- past that we'd
- # be sending a request the planner can't hold, so we fall through
- # to the coding ladder, which has tiers (smart, fast) explicitly
- # sized for larger contexts.
+ # code" (no triple-backticks, no agent verbs). Tools are stripped
+ # from the request body before dispatch (see ``_handle_completion``),
+ # so their presence here does not block plan routing.
+ # Only route to plan if the input fits in the planner's ctx_size --
+ # past that we fall through to the coding ladder which has tiers
+ # (smart, fast) explicitly sized for larger contexts.
  if (
- not has_tools
- and not has_code_signal
+ not has_code_signal
  and _matches(PLAN_SIGNALS, messages, prompt)
  ):
  plan_tier = TIER_BY_ALIAS.get(PLAN_MODEL)
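Condensed, the relaxed plan gate is roughly the predicate below; plan_gate and its parameters are hypothetical stand-ins (the real code matches PLAN_SIGNALS against the messages and looks up the planner tier's ctx_size):

    def plan_gate(has_code_signal: bool, has_plan_signal: bool,
                  est_tokens: int, planner_ctx: int) -> bool:
        # Tools no longer veto plan routing -- they are stripped later in
        # _handle_completion -- so only code signals and the planner's
        # context budget gate the plan track.
        return (not has_code_signal
                and has_plan_signal
                and est_tokens <= planner_ctx)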
@@ -401,12 +397,12 @@ def classify(body: dict[str, Any]) -> tuple[str, str]:
  if est <= MID_FIDELITY_CEILING:
  return AGENT_MODEL, f"mid-fidelity tokens~{est}<={MID_FIDELITY_CEILING}"

- # Rung 3: long context -- step down to fast (128k YaRN, free,
- # always-resident). Floor at smart when tools/agent loop is in
- # play; the 3B coder doesn't tool-call reliably.
- if has_tools or n_turns >= MULTI_TURN_THRESHOLD:
- why = "tools" if has_tools else f"turns={n_turns}"
- return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} ({why} floor)"
+ # Rung 3: long context -- step down to fast. Floor at smart only
+ # when the multi-turn threshold is hit; tools alone no longer
+ # prevent the step-down (plan tiers strip tools before dispatch,
+ # and code-fast is a hosted model that tool-calls reliably).
+ if n_turns >= MULTI_TURN_THRESHOLD:
+ return AGENT_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING} (user-turns={n_turns}>={MULTI_TURN_THRESHOLD} floor)"
  return FAST_MODEL, f"long-context tokens~{est}>{MID_FIDELITY_CEILING}"

@@ -626,6 +622,11 @@ async def _handle_completion(req: Request, path: str) -> Response:
  mutated = True

  chosen_name = body.get("model")
+ if chosen_name in {PLAN_MODEL, UNCENSORED_MODEL} and body.get("tools"):
+ log.info("plan tier %s: stripping tools from request", chosen_name)
+ body.pop("tools")
+ body.pop("tool_choice", None)
+ mutated = True
  tier = _resolve_tier(chosen_name)
  if tier is not None and _inject_sampler(body, tier):
  mutated = True
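The effect of the new strip on a plan-tier request, sketched with an illustrative payload (the real PLAN_MODEL alias comes from models.ini):

    body = {
        "model": "plan",
        "messages": [{"role": "user", "content": "compare options"}],
        "tools": [{"type": "function", "function": {"name": "edit_file"}}],
        "tool_choice": "auto",
    }
    if body.get("tools"):
        body.pop("tools")
        body.pop("tool_choice", None)
    # body now carries only model + messages, so the chat-tuned planner
    # never sees a tool-call protocol it handles poorly.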
{opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/llmstack/models.ini
@@ -300,7 +300,7 @@ description = Mistral-Small 3.2 24B Heretic - no-filter planning
  ;
  high_fidelity_ceiling = 12000 ; tokens; below this, top-tier model is still cheap+fast (and ultra ctx_size = 2 * this)
  mid_fidelity_ceiling = 32000 ; tokens; smart's sweet spot up to here, then step down to fast (smart ctx_size = 2 * this)
- multi_turn = 6 ; turn count that floors the long-context rung at code-smart
+ multi_turn = 10 ; turn count that floors the long-context rung at code-smart
  agent_signal_words = implement, fix bug, write a function, refactor, edit, patch, debug, run tests, build it
  plan_signal_words = design, architect, approach, trade-off, should we, how would you, explain why, think through, compare options, brainstorm, root cause
  uncensored_triggers = [nofilter], [uncensored], [heretic], "uncensored:", "nofilter:" (line start)
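For reference, values like this parse with the standard library as long as inline ';' comments are enabled; the [router] section name below is a placeholder, since the hunk does not show which models.ini section these keys live in:

    import configparser

    cfg = configparser.ConfigParser(inline_comment_prefixes=(";",))
    cfg.read_string(
        "[router]\n"
        "multi_turn = 10 ; turn count that floors the long-context rung at code-smart\n"
    )
    print(cfg.getint("router", "multi_turn"))  # -> 10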
{opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/opencode_llmstack.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opencode-llmstack
- Version: 0.8.0
+ Version: 0.9.1
  Summary: Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring.
  Author: llmstack
  License: MIT
{opencode_llmstack-0.8.0 → opencode_llmstack-0.9.1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "opencode-llmstack"
- version = "0.8.0"
+ version = "0.9.1"
  description = "Multi-tier local LLM stack: llama-swap + FastAPI auto-router + opencode wiring."
  readme = "README.md"
  requires-python = ">=3.11"