npm - @miller-tech/uap - Versions diffs - 1.20.34 → 1.20.36 - Mend

@miller-tech/uap 1.20.34 → 1.20.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/README.md +1 -0
package/docs/deployment/QWEN35_LLAMA_CPP.md +15 -6
package/package.json +1 -1
package/tools/agents/config/qwen3.5-enhanced.jinja +187 -0
package/tools/agents/scripts/anthropic_proxy.py +1097 -59
package/tools/agents/scripts/tool-choice-proxy.cjs +12 -0
package/tools/agents/tests/test_anthropic_proxy_streaming.py +193 -8

package/tools/agents/scripts/anthropic_proxy.py CHANGED Viewed

@@ -134,6 +134,11 @@ PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
 }
 PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
 PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "6"))
+# Fix K (2026-04-22): minimum consecutive cycle-repeat count required to flip
+# phase from act -> review. The old behaviour accepted cycle_repeat=2, which
+# is normal in a working session (re-reading the same file across edits).
+# Set higher to tolerate legitimate re-reads; set 1 to restore old behaviour.
+PROXY_CYCLE_TRIGGER_REPEAT = int(os.environ.get("PROXY_CYCLE_TRIGGER_REPEAT", "3"))
 PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
 PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "3"))
 PROXY_CONTEXT_RELEASE_THRESHOLD = float(
@@ -247,6 +252,19 @@ PROXY_DISABLE_THINKING_ON_TOOL_TURNS = os.environ.get(
     "off",
     "no",
 }
+# Disable thinking on EVERY turn (not just tool turns). For models like Gemma 4
+# that emit ~100 thinking tokens for trivial replies, this halves output cost.
+PROXY_DISABLE_THINKING_ALWAYS = os.environ.get(
+    "PROXY_DISABLE_THINKING_ALWAYS", "off"
+).lower() not in {"0", "false", "off", "no"}
+# Force tool_choice='required' on the first turn of a fresh session. Originally
+# Qwen-tuned to break out of cold-start "tries to chat instead of calling a tool"
+# behaviour. Gemma 4 doesn't need this — it routes 'auto' correctly and the
+# force triggers malformed-JSON emissions when it would rather speak. Default
+# off; set 'on' to restore the legacy Qwen-style behaviour.
+PROXY_FORCE_TOOL_CHOICE_ON_COLD_START = os.environ.get(
+    "PROXY_FORCE_TOOL_CHOICE_ON_COLD_START", "off"
+).lower() not in {"0", "false", "off", "no"}
 PROXY_DISABLE_SPEC_ON_TOOL_TURNS = os.environ.get(
     "PROXY_DISABLE_SPEC_ON_TOOL_TURNS", "off"
 ).lower() not in {
@@ -576,6 +594,44 @@ def _is_grammar_tools_incompatibility(status_code: int, error_text: str) -> bool
     return "custom grammar constraints" in lowered and "with tools" in lowered
+def _is_gemma4_peg_parse_failure(status_code: int, error_text: str) -> bool:
+    """Report whether an upstream error looks like a Gemma 4 PEG parse failure.
+    llama-server answers HTTP 500 with text such as
+    `Failed to parse input at pos N: <|tool_call>call:...` when the model
+    emits an incomplete tool call (missing required schema fields) under
+    tool_choice='required'; the strict PEG grammar rejects the partial
+    output. Only a 500 whose body contains one of the two markers checked
+    below counts as this failure. Callers should retry with
+    tool_choice='auto' so the model can emit prose or a complete call
+    without tripping grammar enforcement.
+    """
+    if status_code == 500:
+        # A missing/empty error body can never carry either marker.
+        body = error_text or ""
+        markers = ("Failed to parse input at pos", "<|tool_call>call:")
+        return any(marker in body for marker in markers)
+    return False
+def _relax_tool_choice_for_gemma4_peg_retry(request_body: dict, source: str) -> bool:
+    """Loosen a forced tool_choice after a Gemma 4 PEG parse failure.
+    If the request carries tools and tool_choice is 'required' (or the
+    {"type": "any"} form), rewrite it in place to 'auto' so the retry runs
+    under a permissive grammar. `source` only labels the log line.
+    Returns True when request_body was modified (caller should retry the
+    POST), False when there was nothing to relax.
+    """
+    has_tools = bool(request_body.get("tools"))
+    choice = request_body.get("tool_choice")
+    is_forced = choice == "required" or choice == {"type": "any"}
+    if not (has_tools and is_forced):
+        return False
+    request_body["tool_choice"] = "auto"
+    logger.warning(
+        "GEMMA4 PEG RETRY (%s): relaxed tool_choice='required' -> 'auto' "
+        "to bypass strict-grammar parse failure on incomplete model output",
+        source,
+    )
+    return True
 def _maybe_disable_grammar_for_tools_error(
     request_body: dict,
     status_code: int,
@@ -1409,6 +1465,66 @@ def prune_conversation(
 # Granular timeouts: short connect, long read for streaming LLM output.
 http_client: httpx.AsyncClient | None = None
+# ---------------------------------------------------------------------------
+# Concurrency Control
+# ---------------------------------------------------------------------------
+# Semaphore to serialize upstream requests. llama.cpp is configured with
+# --parallel 1 (LLAMA_PARALLEL=1), so it can only process one inference at
+# a time. Without this gate, concurrent client requests (Shannon sub-agents,
+# multiple Claude Code sessions) would all hit llama.cpp at once and the
+# server would serialize them while the proxy holds N httpx connections
+# open — potentially exhausting the proxy's connection pool while requests
+# queue inside llama.cpp opaquely.
+#
+# With the semaphore: requests queue inside the proxy (cheap, just asyncio
+# tasks waiting) and only PROXY_CONCURRENCY_LIMIT at a time reaches
+# llama.cpp. Each httpx connection is held only for the actual inference
+# duration, not the queue wait.
+#
+# Default: 1 (matches LLAMA_PARALLEL=1). Increase if you raise --parallel.
+PROXY_CONCURRENCY_LIMIT = int(os.environ.get("PROXY_CONCURRENCY_LIMIT", "1"))
+# Max time to wait for a slot before returning 503. Generous because real
+# inference can take 30-600s and queued requests must wait through that.
+# 0 = wait indefinitely.
+PROXY_CONCURRENCY_QUEUE_TIMEOUT = float(
+    os.environ.get("PROXY_CONCURRENCY_QUEUE_TIMEOUT", "900")
+)
+upstream_semaphore: asyncio.Semaphore | None = None
+async def _acquire_upstream_slot() -> bool:
+    """Wait for a free upstream slot on the module-level semaphore.
+    Returns True once a slot is held (or when the semaphore has not been
+    created yet, in which case no limiting is applied), and False if the
+    wait exceeded PROXY_CONCURRENCY_QUEUE_TIMEOUT. A timeout <= 0 means
+    wait indefinitely.
+    asyncio.Semaphore.acquire() preserves wait order via futures, so this
+    gives a natural FIFO queue.
+    """
+    gate = upstream_semaphore
+    if gate is None:
+        return True  # Not yet initialized; proceed without limiting
+    wait_limit = PROXY_CONCURRENCY_QUEUE_TIMEOUT
+    if wait_limit > 0:
+        try:
+            await asyncio.wait_for(gate.acquire(), timeout=wait_limit)
+        except asyncio.TimeoutError:
+            return False
+        return True
+    # Non-positive timeout: block until a slot frees up.
+    await gate.acquire()
+    return True
+def _release_upstream_slot() -> None:
+    """Hand a slot back to the upstream semaphore.
+    MUST be called exactly once per successful acquire. Does nothing when
+    the semaphore has not been created.
+    Note: asyncio.Semaphore.release() always increments the counter — we
+    do NOT gate on locked() because that returns True only when the counter
+    is 0 (no slots left). Gating would cause a slot leak when limit > 1 and
+    multiple holders release simultaneously.
+    """
+    gate = upstream_semaphore
+    if gate is None:
+        return
+    gate.release()
 def _is_loading_model_503(resp: httpx.Response) -> bool:
     """Check if response is a 503 'Loading model' from llama.cpp."""
@@ -1452,6 +1568,36 @@ async def _post_with_retry(
     url: str,
     payload: dict,
     headers: dict,
+) -> httpx.Response:
+    """Post with upstream-retry + concurrency-slot acquire.
+    Acquires a slot from upstream_semaphore before making the request, so
+    concurrent client requests queue in the proxy (cheap asyncio waits)
+    rather than all hammering llama.cpp at once. Slot is released in a
+    finally block so it's always returned to the pool even on error.
+    """
+    acquired = await _acquire_upstream_slot()
+    if not acquired:
+        logger.warning(
+            "CONCURRENCY: queue timeout (%ds) exceeded waiting for upstream slot",
+            int(PROXY_CONCURRENCY_QUEUE_TIMEOUT),
+        )
+        raise httpx.RemoteProtocolError(
+            f"Upstream concurrency queue timed out after {int(PROXY_CONCURRENCY_QUEUE_TIMEOUT)}s "
+            f"(limit={PROXY_CONCURRENCY_LIMIT})",
+            request=None,
+        )
+    try:
+        return await _post_with_retry_inner(client, url, payload, headers)
+    finally:
+        _release_upstream_slot()
+async def _post_with_retry_inner(
+    client: httpx.AsyncClient,
+    url: str,
+    payload: dict,
+    headers: dict,
 ) -> httpx.Response:
     last_exc: Exception | None = None
     for attempt in range(PROXY_UPSTREAM_RETRY_MAX):
@@ -1497,6 +1643,7 @@ async def _post_with_generation_timeout(
     headers: dict,
 ) -> httpx.Response:
     """Wrap _post_with_retry with an explicit asyncio generation timeout.
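+    The concurrency slot itself is acquired inside _post_with_retry, so time
+    spent queued for a slot presumably counts against this generation
+    timeout as well (NOTE(review): confirm against the timeout wrapper).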
     The httpx read timeout may not fire for hung connections where the server
     keeps the socket open but produces no data (observed with llama.cpp server
@@ -1561,6 +1708,13 @@ async def lifespan(app: FastAPI):
     """Manage the httpx client lifecycle with the FastAPI app."""
     global http_client
     global default_context_window
+    global upstream_semaphore
+    upstream_semaphore = asyncio.Semaphore(PROXY_CONCURRENCY_LIMIT)
+    logger.info(
+        "CONCURRENCY: upstream semaphore initialized limit=%d queue_timeout=%.0fs",
+        PROXY_CONCURRENCY_LIMIT,
+        PROXY_CONCURRENCY_QUEUE_TIMEOUT,
+    )
     http_client = httpx.AsyncClient(
         timeout=httpx.Timeout(
             connect=10.0,  # 10s to establish connection
@@ -1643,6 +1797,8 @@ async def lifespan(app: FastAPI):
     yield
     await http_client.aclose()
     http_client = None
+    if upstream_semaphore is not None:
+        upstream_semaphore = None
     logger.info("Proxy shut down")
@@ -1653,6 +1809,16 @@ app = FastAPI(
     lifespan=lifespan,
 )
+# NOTE: Concurrency control is enforced by _acquire_upstream_slot() inside
+# _post_with_retry (the single point where we hit llama.cpp). An earlier
+# implementation also added an HTTP middleware that acquired the same
+# semaphore — this caused a self-deadlock (middleware holds slot, inner
+# call waits for slot, both on the same task). The middleware approach
+# also called non-existent asyncio.Semaphore methods (try_acquire /
+# acquire_nowait) and ran an async primitive in a thread executor.
+# Removed 2026-05-13.
 # ===========================================================================
 # Request Translation: Anthropic -> OpenAI
@@ -1686,6 +1852,31 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
         role = msg["role"]
         content = msg.get("content")
+        # Strip <think>...</think> blocks from PRIOR assistant turns. Qwen is
+        # heavily few-shot influenced by its own conversation history — if
+        # earlier assistant turns contain reasoning blocks, the next turn
+        # will pattern-match and emit <think> tags even when the system
+        # prompt forbids them. Stripping breaks the copy cycle.
+        if role == "assistant":
+            if isinstance(content, str) and "<think>" in content:
+                content = _THINKING_BLOCK_RE.sub("", content).lstrip()
+            elif isinstance(content, list):
+                stripped = []
+                for b in content:
+                    if isinstance(b, dict) and b.get("type") == "text":
+                        t = b.get("text", "")
+                        if "<think>" in t:
+                            t = _THINKING_BLOCK_RE.sub("", t).lstrip()
+                        if t:
+                            stripped.append({**b, "text": t})
+                    elif isinstance(b, dict) and b.get("type") == "thinking":
+                        # Anthropic-style thinking block — drop entirely
+                        # (don't replay it back to the model).
+                        continue
+                    else:
+                        stripped.append(b)
+                content = stripped
         if isinstance(content, str):
             messages.append({"role": role, "content": content})
         elif isinstance(content, list):
@@ -1695,6 +1886,10 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
                     parts.append(block)
                 elif block.get("type") == "text":
                     parts.append(block.get("text", ""))
+                elif block.get("type") == "thinking":
+                    # Drop thinking blocks from user/assistant content when
+                    # echoed back into history — model shouldn't see them.
+                    continue
                 elif block.get("type") == "tool_use":
                     messages.append(
                         {
@@ -1703,7 +1898,7 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
                             "tool_calls": [
                                 {
                                     "id": block.get(
-                                        "id", f"call_{uuid.uuid4().hex[:8]}"
+                                        "id", f"toolu_{uuid.uuid4().hex[:24]}"
                                     ),
                                     "type": "function",
                                     "function": {
@@ -1716,10 +1911,17 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
                     )
                     continue
                 elif block.get("type") == "tool_result":
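+                    # Strip Anthropic-spec toolu_ prefix so the upstream
+                    # tool_call_id matches what llama-server originally
+                    # emitted (we stamped the prefix on outbound; reverse it
+                    # here so the loop closes correctly).
+                    # NOTE(review): the tool_use branch above forwards
+                    # block["id"] without stripping the prefix, so the
+                    # assistant tool_calls id and this tool_call_id may no
+                    # longer match in the upstream request — confirm.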
+                    tu_id = block.get("tool_use_id", "")
+                    if isinstance(tu_id, str) and tu_id.startswith("toolu_"):
+                        tu_id = tu_id[len("toolu_"):]
                     messages.append(
                         {
                             "role": "tool",
-                            "tool_call_id": block.get("tool_use_id", ""),
+                            "tool_call_id": tu_id,
                             "content": _extract_text(block.get("content", "")),
                         }
                     )
@@ -1899,6 +2101,18 @@ _AGENTIC_SYSTEM_SUPPLEMENT_MINIMAL = (
     "\n\nUse tools for all actions. Respond with tool calls, not descriptions of what to do."
 )
+# Directive appended when the upstream model (Qwen) is configured with
+# enable_thinking=False but consistently emits <think>...</think> blocks
+# anyway, consuming the max_tokens budget before any tool_use is generated.
+# Empirically required for Shannon-style workflows where max_tokens=512
+# leaves no room for both internal reasoning AND a tool call.
+_NO_THINKING_DIRECTIVE = (
+    "\n\nCRITICAL: Do NOT output <think>...</think> tags or any internal "
+    "reasoning. Begin your response IMMEDIATELY with the appropriate "
+    "tool_call. If you have no tool to call, reply with plain text only — "
+    "never include reasoning blocks."
+)
 if PROXY_AGENTIC_SUPPLEMENT_MODE == "legacy":
     _AGENTIC_SYSTEM_SUPPLEMENT = _AGENTIC_SYSTEM_SUPPLEMENT_LEGACY
 elif PROXY_AGENTIC_SUPPLEMENT_MODE == "minimal":
@@ -2264,7 +2478,7 @@ def anthropic_to_openai_response(anthropic_resp: dict) -> dict:
         elif btype == "tool_use":
             tool_calls.append(
                 {
-                    "id": block.get("id", f"call_{uuid.uuid4().hex[:12]}"),
+                    "id": block.get("id", f"toolu_{uuid.uuid4().hex[:24]}"),
                     "type": "function",
                     "function": {
                         "name": block.get("name", ""),
@@ -2347,6 +2561,72 @@ def _latest_user_text(anthropic_body: dict) -> str:
     return ""
+# 2026-05-12: Detect "no-task" user turns to gate the state machine's
+# force-required path. When the last actual human query is a short ack
+# ("ok", "3", "test"), an acknowledgement phrase ("standing by", "awaiting
+# next instruction"), or a status report ending in an ack ("scan complete.
+# awaiting next instruction"), there is no genuine work for the model to
+# do. Forcing tool_choice='required' in this state causes the model to
+# ruminate in <think> blocks, and the meta-tool talk inside those blocks
+# trips the malformed-pseudo-tool detector. Conservative patterns only.
+_NO_TASK_SHORT_ACKS = frozenset({
+    "ok", "okay", "k", "kk", "y", "n", "yes", "no", "nope", "yep", "yeah",
+    "thanks", "thank", "thx", "ty", "ack", "noted", "received", "understood",
+    "test", "ping", "hi", "hello",
+})
+# Every pattern is used with .search(), so each one is anchored on a word
+# boundary; otherwise "standing by" also fires inside "understanding by ...",
+# which is a real task, not an acknowledgement.
+_NO_TASK_ACK_PATTERNS = (
+    re.compile(r"\bawaiting\s+(?:next|further|your)\s+(?:instruction|input|command|task|directive)", re.I),
+    # The former optional "for your next ..." tail never changed whether
+    # search() matched, so the bare phrase is all that is needed.
+    re.compile(r"\bstanding\s+by\b", re.I),
+    re.compile(r"\b(?:ready|waiting|holding)\s+for\s+(?:your\s+)?(?:next|further|new)\s+(?:task|instruction|command|input|directive)", re.I),
+    # Status report ending in ack: "X complete. {awaiting/standing/ready/done}"
+    re.compile(r"\bcomplet(?:e|ed)\b[\s.,;:!\-]+(?:awaiting|standing\s+by|ready|done|finished|over\s+to\s+you)", re.I),
+)
+def _is_no_task_user_text(text: str) -> bool:
+    """Return True when `text` is an acknowledgement rather than a task.
+    A message counts as "no task" when, after stripping punctuation, it is
+    one of _NO_TASK_SHORT_ACKS or a bare number, or when its first 400
+    characters contain one of the _NO_TASK_ACK_PATTERNS phrases. Empty or
+    whitespace-only text returns False.
+    """
+    if not text:
+        return False
+    stripped = text.strip()
+    if not stripped:
+        return False
+    # Punctuation is removed before the exact-match checks, so "OK!" -> "ok"
+    # and "3.5" -> "35"; a plain digit run therefore covers decimals too.
+    bare = re.sub(r"[^\w\s]", "", stripped).strip().lower()
+    if bare in _NO_TASK_SHORT_ACKS:
+        return True
+    if re.fullmatch(r"\d+", bare):
+        return True
+    # Only scan the head of long messages; acks are short by nature.
+    snippet = stripped[:400]
+    return any(p.search(snippet) for p in _NO_TASK_ACK_PATTERNS)
+def _latest_user_query_text(anthropic_body: dict) -> str:
+    """Find the text of the last user message that actually contains text.
+    Scans messages newest-first and skips user turns that carry no text
+    (e.g. tool_result-only turns in an agentic loop, for which
+    ``_latest_user_text`` returns empty). A non-blank string content is
+    returned as-is; for list content the non-empty ``text`` blocks are
+    joined with newlines. Returns "" when no user text exists.
+    Tool-narrowing needs query tokens to score tools; this walker lets it
+    fall back to an earlier human query during long loops.
+    """
+    history = anthropic_body.get("messages", [])
+    for turn in reversed(history):
+        if turn.get("role") == "user":
+            payload = turn.get("content", "")
+            if isinstance(payload, str):
+                if payload.strip():
+                    return payload
+            elif isinstance(payload, list):
+                pieces = []
+                for part in payload:
+                    if not isinstance(part, dict) or part.get("type") != "text":
+                        continue
+                    fragment = part.get("text")
+                    if fragment:
+                        pieces.append(fragment)
+                if pieces:
+                    return "\n".join(pieces)
+    return ""
 def _tokenize_for_tool_ranking(text: str) -> set[str]:
     # Lowercased set of every run of 2+ ASCII letters, digits or underscores.
     return {m.group(0).lower() for m in re.finditer(r"[a-zA-Z0-9_]{2,}", text)}
@@ -2366,6 +2646,13 @@ def _narrow_tools_for_request(
     query_text = _latest_user_text(anthropic_body).lower()
     query_tokens = _tokenize_for_tool_ranking(query_text)
+    if not query_tokens:
+        # Walk back past tool_result turns to find the prior real human
+        # query. Lets narrowing stay effective during agentic loops where
+        # the latest user msg is just a tool_result block (no text).
+        fallback_query = _latest_user_query_text(anthropic_body).lower()
+        query_text = fallback_query or query_text
+        query_tokens = _tokenize_for_tool_ranking(query_text)
     if not query_tokens:
         n_msgs = len(anthropic_body.get("messages", []))
         if (
@@ -2490,6 +2777,18 @@ def _resolve_state_machine_tool_choice(
         monitor.finalize_synthetic_tool_id = ""
         return None, "fresh_user_text"
+    # 2026-05-12: No-task ack guard. When the latest user message is just a
+    # tool_result (no fresh text), walk back to the most recent human query.
+    # If that query is a short ack or "X complete. awaiting next" status,
+    # do not force tool_choice — let the model produce a natural finalization
+    # text instead of ruminating in <think> blocks.
+    last_user_query = _latest_user_query_text(anthropic_body).strip()
+    if last_user_query and _is_no_task_user_text(last_user_query):
+        monitor.reset_tool_turn_state(reason="no_task_user_text")
+        monitor.finalize_continuation_count = 0
+        monitor.finalize_synthetic_tool_id = ""
+        return None, "no_task_user_text"
     active_loop = (
         has_tool_results
         and last_user_has_tool_result
@@ -2563,7 +2862,15 @@ def _resolve_state_machine_tool_choice(
                 dup_tool,
             )
-        if cycle_looping or stagnating:
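+        # Fix K (2026-04-22): require cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
+        # before flipping phase. Low-repeat cycles are legitimate in working
+        # sessions (e.g. re-reading the same file across edits), so cycles
+        # below the threshold are filtered here.
+        # NOTE(review): the dup_target branch above is described as setting
+        # `cycle_looping = True, cycle_repeat = 2` as a strong signal (read
+        # target repeated 3+ times). With the default trigger of 3 that pair
+        # does NOT pass this check — confirm whether it should bypass the
+        # threshold or set a higher cycle_repeat.
+        # NOTE(review): `reason` below still keys off cycle_looping, so a
+        # stagnation-only trip with a sub-threshold cycle is labelled
+        # "cycle_detected" rather than "stagnation".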
+        cycle_trip = cycle_looping and cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
+        if cycle_trip or stagnating:
             reason = "cycle_detected" if cycle_looping else "stagnation"
             monitor.set_tool_turn_phase("review", reason=reason)
             monitor.tool_state_review_cycles += 1
@@ -2702,6 +3009,33 @@ def build_openai_request(
     has_tools = _has_tool_definitions(anthropic_body)
+    # Translate Anthropic `thinking` parameter to upstream `enable_thinking`.
+    # Anthropic shape: {"thinking": {"type": "enabled", "budget_tokens": 1024}}
+    # or {"type": "disabled"}. Per the Anthropic spec, thinking is OFF by
+    # default and ONLY enabled when the client opts in. Match that behaviour:
+    #   - thinking.type == "enabled" -> enable_thinking=True
+    #   - thinking.type == "disabled" or absent -> enable_thinking=False
+    # Without this, Qwen's chat template (which defaults thinking ON) would
+    # consume the client's max_tokens budget on internal reasoning, leaving
+    # nothing for the visible answer.
+    anthropic_thinking = anthropic_body.get("thinking")
+    if isinstance(anthropic_thinking, dict):
+        ttype = (anthropic_thinking.get("type") or "").lower()
+        if ttype == "enabled":
+            openai_body["enable_thinking"] = True
+        else:
+            openai_body["enable_thinking"] = False
+    else:
+        # Match Anthropic default: thinking off unless explicitly requested.
+        openai_body["enable_thinking"] = False
+    # Global thinking-off (G): apply to every request, not just tool turns.
+    # This overrides whatever the client's `thinking` parameter selected
+    # above, including an explicit {"type": "enabled"}. The per-path
+    # tool-turn handling below (DISABLE_THINKING_ON_TOOL_TURNS) is additive.
+    if PROXY_DISABLE_THINKING_ALWAYS:
+        openai_body["enable_thinking"] = False
     # Inject agentic protocol instructions only for tool-enabled turns.
     # Use minimal supplement for qwen models to reduce prompt leak surface.
     if has_tools:
@@ -2711,6 +3045,15 @@ def build_openai_request(
             if "qwen" in model_name and PROXY_AGENTIC_SUPPLEMENT_MODE != "legacy"
             else _AGENTIC_SYSTEM_SUPPLEMENT
         )
+        # When thinking is explicitly disabled (Anthropic default, plus our
+        # tool-turn forcing) but the upstream model is Qwen — which emits
+        # <think> blocks regardless of enable_thinking — append a strong
+        # directive that suppresses internal reasoning. Without this, small
+        # max_tokens budgets get fully consumed by the model's reasoning,
+        # producing required_tool_miss retries (observed in Shannon workflows
+        # with max_tokens=512 + tool_choice=required).
+        if openai_body.get("enable_thinking") is False:
+            supplement = supplement + _NO_THINKING_DIRECTIVE
         if (
             openai_body["messages"]
             and openai_body["messages"][0].get("role") == "system"
@@ -2731,23 +3074,62 @@ def build_openai_request(
     if "max_tokens" in anthropic_body:
         requested_raw = max(1, int(anthropic_body["max_tokens"]))
-        # Enforce configurable minimum floor for thinking mode: model needs
-        # tokens for reasoning (<think>...</think>) plus actual response/tool
-        # calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
+        # Enforce configurable minimum floor for tool turns: the model needs
+        # enough headroom to emit complete tool-call arguments (long heredocs,
+        # full-function oldString/newString pairs, etc.) without hitting the
+        # client-requested max_tokens in the middle of a JSON string. If the
+        # client requested >= the floor we keep their value; short preflight
+        # requests (max_tokens <= 1024) always skip the floor to avoid
+        # inflating plan-generation turns.
         #
-        # The floor is ONLY applied when thinking is actually enabled —
-        # skip it for non-tool requests (tools=0) and for tool turns
-        # with thinking disabled, to prevent inflating short preflight
-        # requests (e.g. max_tokens=100 for plan generation).
-        thinking_active_for_request = has_tools and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
+        # The earlier gating on PROXY_DISABLE_THINKING_ON_TOOL_TURNS was too
+        # restrictive: it skipped the floor on every tool turn once thinking
+        # was off, which re-introduced truncated tool calls on long edits.
+        # Set PROXY_MAX_TOKENS_FLOOR=0 to disable the floor entirely.
+        thinking_active_for_request = (
+            has_tools
+            and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
+            and not PROXY_DISABLE_THINKING_ALWAYS
+        )
+        SMALL_PREFLIGHT_THRESHOLD = 1024
+        # Qwen-style models emit <think> blocks regardless of the
+        # enable_thinking flag (template ignored by trained behaviour).
+        # For tool turns those blocks alone consume ~400-1000 tokens, so a
+        # client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
+        # budget for the tool_call itself — manifesting as required_tool_miss
+        # retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
+        # per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
+        THINKING_MIN_FOR_TOOLS = 2048
         skip_floor = (
-            not has_tools  # non-tool requests don't need thinking headroom
-            or PROXY_DISABLE_THINKING_ON_TOOL_TURNS  # thinking disabled on tool turns
+            not has_tools  # non-tool requests don't need the headroom
             or PROXY_MAX_TOKENS_FLOOR <= 0  # floor explicitly disabled
+            or requested_raw <= SMALL_PREFLIGHT_THRESHOLD  # tiny preflight request
         )
+        # Qwen-style models emit <think> blocks regardless of the
+        # enable_thinking flag (template ignored by trained behaviour).
+        # For tool turns those blocks alone consume ~400-1000 tokens, so a
+        # client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
+        # budget for the tool_call itself — manifesting as required_tool_miss
+        # retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
+        # per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
+        THINKING_MIN_FOR_TOOLS = 2048
         if skip_floor:
             requested_max = requested_raw
-            if requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
+            # Even when skipping the big floor, bump small tool-turn
+            # budgets so Qwen's mandatory thinking has room before the
+            # tool_call. Only applies when tools are present.
+            if (
+                has_tools
+                and requested_raw < THINKING_MIN_FOR_TOOLS
+                and requested_raw > 16  # leave true preflight (e.g. max_tokens=1) alone
+            ):
+                requested_max = THINKING_MIN_FOR_TOOLS
+                logger.info(
+                    "MAX_TOKENS thinking-floor: %d -> %d (tool turn, Qwen mandatory thinking)",
+                    requested_raw,
+                    requested_max,
+                )
+            elif requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
                 logger.info(
                     "MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
                     has_tools,
@@ -2946,24 +3328,35 @@ def build_openai_request(
                 monitor.tool_state_stagnation_streak,
             )
         elif state_choice == "finalize":
-            openai_body.pop("tool_choice", None)
-            openai_body.pop("tools", None)
+            # Fix H/J (2026-04-22): Do NOT strip tools from the body on
+            # cycle-limit finalize. Stripping tools lets the model emit
+            # prose that LOOKS like a tool call ("<function=edit>…") but
+            # has no structured tool_calls array, so the Anthropic client
+            # sees end_turn with no action and halts. Instead, keep tools
+            # available, set tool_choice=auto, and nudge the model to
+            # either complete with a tool call OR emit a proper summary.
+            # Grammar (when PROXY_TOOL_CALL_GRAMMAR_REQUIRED_ONLY=off) will
+            # still constrain tool-call emission to valid JSON format.
+            openai_body["tool_choice"] = "auto"
             monitor.finalize_turn_active = True
             monitor.finalize_hard_stop_count += 1  # monotonic marker: a finalize fired this session
             monitor.consecutive_forced_count = 0
             monitor.no_progress_streak = 0
-            # Option 3: Inject explicit "no tool calls" instruction to reduce XML leak
             finalize_instruction = {
                 "role": "user",
                 "content": (
-                    "Respond with plain text only. Do not emit any tool calls, "
-                    "XML tags, or JSON objects."
+                    "You have been looping on the same tools for several turns. "
+                    "Wrap up: either emit ONE decisive tool call that completes "
+                    "the task, or reply with a plain-text summary of what you "
+                    "accomplished and what is blocking further progress. Do NOT "
+                    "emit tool call text in prose form — if you call a tool, do "
+                    "it through the structured tool_call mechanism."
                 ),
             }
             msgs = openai_body.get("messages", [])
             msgs.append(finalize_instruction)
             logger.warning(
-                "TOOL STATE MACHINE: tools temporarily disabled for finalize turn (reason=%s)",
+                "TOOL STATE MACHINE: finalize turn (reason=%s) — tools kept, tool_choice=auto",
                 state_reason,
             )
         elif state_choice == "required":
@@ -3045,8 +3438,11 @@ def build_openai_request(
             monitor.consecutive_forced_count = 0
             monitor.no_progress_streak = 0
             # Force tool_choice=required on first turn to ensure local models
-            # produce a tool call instead of plain text (cold-start fix)
-            if has_tools and n_msgs == 1:
+            # produce a tool call instead of plain text (cold-start fix).
+            # Gated by PROXY_FORCE_TOOL_CHOICE_ON_COLD_START — Gemma 4 routes
+            # 'auto' correctly without needing the force, and the force
+            # triggers malformed-JSON emissions on Gemma 4 cold turns.
+            if has_tools and n_msgs == 1 and PROXY_FORCE_TOOL_CHOICE_ON_COLD_START:
                 openai_body["tool_choice"] = "required"
                 logger.info(
                     "tool_choice forced to 'required' on first turn (reason=%s n_msgs=%d cold_start_fix=true)",
@@ -3089,10 +3485,12 @@ def build_openai_request(
                 monitor.reset_tool_turn_state(reason="no_tool_results")
-        if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
+        if PROXY_DISABLE_THINKING_ALWAYS or PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
             openai_body["enable_thinking"] = False
             logger.info(
-                "Thinking disabled for tool turn (PROXY_DISABLE_THINKING_ON_TOOL_TURNS=on)"
+                "Thinking disabled (always=%s tool_turns=%s)",
+                PROXY_DISABLE_THINKING_ALWAYS,
+                PROXY_DISABLE_THINKING_ON_TOOL_TURNS,
             )
         if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
@@ -3411,7 +3809,10 @@ def _schema_type_matches(value, expected_type: str) -> bool:
 def _string_contains_tool_markup(value: str) -> bool:
     lowered = value.lower()
-    markers = ("<parameter", "</parameter", "<tool_call", "<function=", "</function")
+    markers = (
+        "<parameter", "</parameter", "<tool_call", "<function=", "</function",
+        "<|tool_call>", "<tool_call|>",  # Gemma 4 native DSL
+    )
     return any(marker in lowered for marker in markers)
@@ -3483,6 +3884,343 @@ _TOOL_CALL_XML_RE = re.compile(
     re.DOTALL,
 )
+# Hermes-style XML function call format emitted by some Qwen/Llama fine-tunes
+# when grammar is not applied:
+#   <function=name>
+#   <parameter=key>
+#   value
+#   </parameter>
+#   ...
+#   </function>
+#
+# The value of a <parameter=KEY> block may span multiple lines and include
+# arbitrary characters (code snippets, JSON, quotes). The closing
+# </parameter> tag may be missing if the model emitted EOS prematurely —
+# in which case we consume up to the next <parameter=...> tag or end of
+# string. Names are captured as alphanumeric + underscore to avoid pulling
+# in attribute-like garbage.
+_HERMES_FUNCTION_RE = re.compile(
+    r"<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>|\Z)",
+    re.DOTALL,
+)
+# Group 2 is captured verbatim: whitespace is NOT trimmed by the pattern, so
+# the caller can remove only the wrapper newlines and keep code indentation.
+_HERMES_PARAMETER_RE = re.compile(
+    r"<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?=</parameter>|<parameter=|\Z)",
+    re.DOTALL,
+)
+def _extract_hermes_tool_calls(text: str) -> tuple[list[dict], str]:
+    """Parse Hermes-style ``<function=name><parameter=k>v</parameter></function>``
+    blocks out of *text*. Used as a fallback when the Qwen JSON format
+    (``<tool_call>{...}</tool_call>``) is not present — for example on
+    finalize turns where grammar does not constrain the output. Tolerates
+    premature EOS (missing closing ``</parameter>`` / ``</function>``).
+    Parameter values are always returned as strings. A value written in the
+    multi-line form (``<parameter=k>\\nVALUE\\n</parameter>``) loses exactly
+    one leading and one trailing newline and is otherwise kept verbatim, so
+    the indentation of its first line survives. A value written inline
+    (no newline right after the opening tag) has surrounding whitespace
+    stripped.
+    Returns ``(extracted_openai_tool_calls, remaining_text)``; when nothing
+    is recovered the result is ``([], text)`` with *text* unchanged.
+    """
+    if "<function=" not in text:
+        return [], text
+    extracted: list[dict] = []
+    matched_spans: list[tuple[int, int]] = []
+    for fn_match in _HERMES_FUNCTION_RE.finditer(text):
+        # The patterns only match non-empty identifiers, so no emptiness
+        # checks are needed on the captured names.
+        name = fn_match.group(1)
+        body = fn_match.group(2) or ""
+        args: dict = {}
+        for p_match in _HERMES_PARAMETER_RE.finditer(body):
+            value = p_match.group(2)
+            if value.startswith("\n"):
+                # Multi-line form: drop only the wrapper newlines so interior
+                # and first-line whitespace (code indentation) is preserved.
+                value = value[1:]
+                if value.endswith("\n"):
+                    value = value[:-1]
+            else:
+                # Inline form: padding around the value is not significant.
+                value = value.strip()
+            args[p_match.group(1)] = value
+        extracted.append(
+            {
+                "id": f"toolu_{uuid.uuid4().hex[:24]}",
+                "type": "function",
+                "function": {
+                    "name": name,
+                    "arguments": json.dumps(args, separators=(",", ":")),
+                },
+            }
+        )
+        matched_spans.append(fn_match.span())
+    if not extracted:
+        return [], text
+    # Cut the matched function blocks out of the text, last span first so the
+    # earlier offsets stay valid.
+    remaining = text
+    for start, end in reversed(matched_spans):
+        remaining = remaining[:start] + remaining[end:]
+    # Strip leftover <tool_call>…</tool_call> envelopes that now enclose
+    # nothing useful.
+    remaining = re.sub(r"<tool_call>\s*</tool_call>", "", remaining, flags=re.DOTALL)
+    remaining = remaining.strip()
+    logger.info(
+        "TOOL CALL EXTRACTION: recovered %d Hermes-format tool call(s) from text content",
+        len(extracted),
+    )
+    return extracted, remaining
+# ---------------------------------------------------------------------------
+# Gemma 4 tool-call DSL extractors
+# ---------------------------------------------------------------------------
+# Gemma 4's chat template emits tool calls as:
+#   <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
+# Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
+# Llama-server's --jinja autoparser usually converts these to standard
+# OpenAI tool_calls, but the raw form can leak through on (a) malformed
+# emissions, (b) finalize turns, (c) non-tool-template requests where the
+# model still tries to call a tool. This parser catches those cases.
+#
+# Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
+# markdown blocks when it doesn't trust the template — observed when
+# tool_choice was forced 'required' but the model lacked confidence in the
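+# native format. A block is treated as a tool call when its JSON has a "name";
+# a bare-args block (no "name") is promoted only when it schema-matches one of
+# the available tools, otherwise it passes through as text.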
+_GEMMA4_TOOL_CALL_DSL_RE = re.compile(
+    r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
+    re.DOTALL,
+)
+# Markdown JSON code-block fallback. Group 1 = JSON content (may include
+# leading/trailing whitespace inside the block).
+_GEMMA4_MARKDOWN_JSON_RE = re.compile(
+    r"```(?:json)?\s*(\{.*?\})\s*```",
+    re.DOTALL,
+)
+# Matches either a complete JSON string literal (alternative 1, left untouched)
+# or a bare `key:` in key position (alternative 2: at the start of the body or
+# after whitespace / `,` / `{` / `[`). Consuming string literals first keeps
+# `word:` sequences inside string values from being mistaken for keys.
+_GEMMA4_DSL_STRING_OR_KEY_RE = re.compile(
+    r'"(?:[^"\\]|\\.)*"|(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:',
+    re.DOTALL,
+)
+def _parse_gemma4_dsl_args(raw: str) -> dict | None:
+    """Parse Gemma 4's tool-call DSL arg body into a Python dict.
+    Input shape (between the `{` and `}` of the DSL):
+        key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
+    Strategy: split on the `<|"|>` quote token so string contents are handled
+    separately from structure, turn each string into a JSON string literal,
+    wrap unquoted keys in quotes (outside string literals only), then feed
+    the result to json.loads.
+    Returns ``{}`` for an empty/blank body and None on parse failure
+    (caller decides), including an unterminated `<|"|>` string.
+    NOTE(review): this function is defined a second time later in this
+    module; that later definition shadows this one, so keep them in sync.
+    """
+    if not raw or not raw.strip():
+        return {}
+    segments = raw.split('<|"|>')
+    if len(segments) % 2 == 0:
+        # Odd number of quote tokens: a string was opened but never closed.
+        return None
+    def encode_string(content: str) -> str:
+        # Content that already forms a valid JSON string body (including any
+        # JSON escapes) is used as-is. Anything else — raw quotes, raw
+        # newlines, stray backslashes — is taken literally and escaped.
+        candidate = f'"{content}"'
+        try:
+            json.loads(candidate)
+        except json.JSONDecodeError:
+            return json.dumps(content)
+        return candidate
+    # Odd-indexed segments sit between quote tokens, i.e. they are strings.
+    body = "".join(
+        encode_string(segment) if index % 2 else segment
+        for index, segment in enumerate(segments)
+    )
+    def quote_key(match: re.Match) -> str:
+        if match.group(2) is None:
+            return match.group(0)  # a string literal: leave untouched
+        return f'{match.group(1)}"{match.group(2)}":'
+    body = _GEMMA4_DSL_STRING_OR_KEY_RE.sub(quote_key, body)
+    try:
+        parsed = json.loads("{" + body + "}")
+    except json.JSONDecodeError:
+        return None
+    return parsed if isinstance(parsed, dict) else None
+def _schema_match_tool(payload: dict, available_tools: list[dict]) -> str | None:
+    """Guess which tool a nameless argument dict was meant for.
+    Every usable entry of *available_tools* is scored against the keys of
+    *payload*:
+      - +10 for each required field that is present
+      - +1 for each optional (declared, non-required) property that is present
+      - -5 for each payload key the tool does not declare
+      - -100 once if at least one required field is absent
+    Entries may be Anthropic-shaped (``name`` / ``input_schema``) or
+    OpenAI-shaped (``function.name`` / ``function.parameters``); entries
+    without a string name and a dict schema are ignored.
+    Returns the name with the highest score (the earliest entry wins a tie),
+    or None when *payload* is not a dict, there are no tools, or the best
+    score is below 10.
+    """
+    if not (isinstance(payload, dict) and available_tools):
+        return None
+    supplied = set(payload)
+    def describe(tool: dict):
+        # Prefer the Anthropic layout; fall back to the OpenAI "function"
+        # wrapper only when there is no top-level name.
+        tool_name, tool_schema = tool.get("name"), tool.get("input_schema")
+        wrapper = tool.get("function")
+        if tool_name is None and isinstance(wrapper, dict):
+            tool_name, tool_schema = wrapper.get("name"), wrapper.get("parameters")
+        return tool_name, tool_schema
+    ranked: list[tuple[int, str]] = []
+    for tool in available_tools:
+        if not isinstance(tool, dict):
+            continue
+        tool_name, tool_schema = describe(tool)
+        if not (isinstance(tool_name, str) and isinstance(tool_schema, dict)):
+            continue
+        declared = tool_schema.get("properties")
+        known = set(declared) if isinstance(declared, dict) else set()
+        needed = set(tool_schema.get("required") or [])
+        points = (
+            (-100 if needed - supplied else 0)
+            + 10 * len(needed & supplied)
+            + len((supplied & known) - needed)
+            - 5 * len(supplied - known)
+        )
+        ranked.append((points, tool_name))
+    if not ranked:
+        return None
+    # max() keeps the first of several equal maxima, i.e. the earliest tool.
+    top_points, top_name = max(ranked, key=lambda entry: entry[0])
+    return top_name if top_points >= 10 else None
+def _extract_gemma4_tool_calls(
+    text: str, available_tools: list[dict] | None = None
+) -> tuple[list[dict], str]:
+    """Recover Gemma 4 tool calls that leaked into plain *text*.
+    Formats, tried in this order:
+      1. Native DSL ``<|tool_call>call:N{...}<tool_call|>``. A block whose
+         argument body cannot be parsed is skipped.
+      2. A fenced markdown JSON object carrying a ``"name"`` (arguments are
+         read from ``"arguments"``, else ``"args"``). Only attempted when
+         step 1 recovered nothing.
+      3. A fenced markdown JSON object WITHOUT a name: when *available_tools*
+         is given, the object is schema-matched against the tool definitions
+         and, on a match, used whole as the arguments. Without
+         *available_tools* such blocks stay in the text.
+    Returns ``(extracted_openai_tool_calls, remaining_text)``; the matched
+    spans are removed from the remaining text, which is then stripped. When
+    nothing is recovered the result is ``([], text)`` with *text* unchanged.
+    """
+    has_fence = "```" in text
+    if "<|tool_call>" not in text and not has_fence:
+        return [], text
+    def make_call(tool_name: str, arguments_json: str) -> dict:
+        return {
+            "id": f"toolu_{uuid.uuid4().hex[:24]}",
+            "type": "function",
+            "function": {"name": tool_name, "arguments": arguments_json},
+        }
+    def load_block(raw_json: str):
+        # Plain parse first, then one attempt through the shared JSON repair
+        # helper; None means the block is unusable.
+        try:
+            return json.loads(raw_json)
+        except json.JSONDecodeError:
+            pass
+        repaired = _repair_tool_call_json(raw_json)
+        if not repaired:
+            return None
+        try:
+            return json.loads(repaired)
+        except json.JSONDecodeError:
+            return None
+    # Each hit pairs the span to cut from the text with the recovered call.
+    hits: list[tuple[tuple[int, int], dict]] = []
+    # Format 1: native DSL
+    for dsl in _GEMMA4_TOOL_CALL_DSL_RE.finditer(text):
+        tool_name = dsl.group(1).strip()
+        if not tool_name:
+            continue
+        parsed_args = _parse_gemma4_dsl_args(dsl.group(2) or "")
+        if parsed_args is None:
+            # Unparseable DSL body: leave it in the text for the caller.
+            continue
+        hits.append(
+            (dsl.span(), make_call(tool_name, json.dumps(parsed_args, separators=(",", ":"))))
+        )
+    # Formats 2 and 3: markdown JSON, only when the DSL pass found nothing
+    if not hits and has_fence:
+        for fence in _GEMMA4_MARKDOWN_JSON_RE.finditer(text):
+            payload = load_block(fence.group(1))
+            if not isinstance(payload, dict):
+                continue
+            tool_name = payload.get("name")
+            if isinstance(tool_name, str) and tool_name:
+                # Standard {name, arguments} form
+                call_args = payload.get("arguments", payload.get("args", {}))
+            elif available_tools:
+                # Bare-args block: the whole payload is the argument object
+                tool_name = _schema_match_tool(payload, available_tools)
+                if tool_name is None:
+                    continue
+                call_args = payload
+                logger.info(
+                    "TOOL CALL EXTRACTION: schema-matched bare-args markdown JSON to tool '%s' (keys=%s)",
+                    tool_name,
+                    sorted(payload.keys())[:6],
+                )
+            else:
+                # Nameless block and nothing to match against: keep as text
+                continue
+            if isinstance(call_args, dict):
+                arguments_json = json.dumps(call_args, separators=(",", ":"))
+            elif isinstance(call_args, str):
+                arguments_json = call_args
+            else:
+                arguments_json = "{}"
+            hits.append((fence.span(), make_call(tool_name, arguments_json)))
+    if not hits:
+        return [], text
+    # Cut matched spans starting from the end so earlier offsets stay valid.
+    remaining = text
+    for start, end in sorted(
+        (span for span, _ in hits), key=lambda span: span[0], reverse=True
+    ):
+        remaining = remaining[:start] + remaining[end:]
+    remaining = remaining.strip()
+    extracted = [call for _, call in hits]
+    logger.info(
+        "TOOL CALL EXTRACTION: recovered %d Gemma 4 tool call(s) from text content",
+        len(extracted),
+    )
+    return extracted, remaining
+# ---------------------------------------------------------------------------
+# Gemma 4 tool-call DSL extractors
+# ---------------------------------------------------------------------------
+# Gemma 4's chat template emits tool calls as:
+#   <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
+# Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
+# Llama-server's --jinja autoparser usually converts these to standard
+# OpenAI tool_calls, but the raw form can leak through on (a) malformed
+# emissions, (b) finalize turns, (c) non-tool-template requests where the
+# model still tries to call a tool. This parser catches those cases.
+#
+# Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
+# markdown blocks when it doesn't trust the template — observed when
+# tool_choice was forced 'required' but the model lacked confidence in the
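+# native format. A block is treated as a tool call when its JSON has a "name";
+# a bare-args block (no "name") is promoted only when it schema-matches one of
+# the available tools, otherwise it passes through as text.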
+_GEMMA4_TOOL_CALL_DSL_RE = re.compile(
+    r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
+    re.DOTALL,
+)
+# Markdown JSON code-block fallback. Group 1 = JSON content (may include
+# leading/trailing whitespace inside the block).
+_GEMMA4_MARKDOWN_JSON_RE = re.compile(
+    r"```(?:json)?\s*(\{.*?\})\s*```",
+    re.DOTALL,
+)
+# Matches either a complete JSON string literal (alternative 1, left untouched)
+# or a bare `key:` in key position (alternative 2: at the start of the body or
+# after whitespace / `,` / `{` / `[`). Consuming string literals first keeps
+# `word:` sequences inside string values from being mistaken for keys.
+_GEMMA4_DSL_STRING_OR_KEY_RE = re.compile(
+    r'"(?:[^"\\]|\\.)*"|(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:',
+    re.DOTALL,
+)
+def _parse_gemma4_dsl_args(raw: str) -> dict | None:
+    """Parse Gemma 4's tool-call DSL arg body into a Python dict.
+    Input shape (between the `{` and `}` of the DSL):
+        key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
+    Strategy: split on the `<|"|>` quote token so string contents are handled
+    separately from structure, turn each string into a JSON string literal,
+    wrap unquoted keys in quotes (outside string literals only), then feed
+    the result to json.loads.
+    Returns ``{}`` for an empty/blank body and None on parse failure
+    (caller decides), including an unterminated `<|"|>` string.
+    NOTE(review): this is the second definition of this function in the
+    module (the section it belongs to repeats an earlier one); being the
+    later definition, it is the one in effect — consider removing the
+    earlier copy.
+    """
+    if not raw or not raw.strip():
+        return {}
+    segments = raw.split('<|"|>')
+    if len(segments) % 2 == 0:
+        # Odd number of quote tokens: a string was opened but never closed.
+        return None
+    def encode_string(content: str) -> str:
+        # Content that already forms a valid JSON string body (including any
+        # JSON escapes) is used as-is. Anything else — raw quotes, raw
+        # newlines, stray backslashes — is taken literally and escaped.
+        candidate = f'"{content}"'
+        try:
+            json.loads(candidate)
+        except json.JSONDecodeError:
+            return json.dumps(content)
+        return candidate
+    # Odd-indexed segments sit between quote tokens, i.e. they are strings.
+    body = "".join(
+        encode_string(segment) if index % 2 else segment
+        for index, segment in enumerate(segments)
+    )
+    def quote_key(match: re.Match) -> str:
+        if match.group(2) is None:
+            return match.group(0)  # a string literal: leave untouched
+        return f'{match.group(1)}"{match.group(2)}":'
+    body = _GEMMA4_DSL_STRING_OR_KEY_RE.sub(quote_key, body)
+    try:
+        parsed = json.loads("{" + body + "}")
+    except json.JSONDecodeError:
+        return None
+    return parsed if isinstance(parsed, dict) else None
 def _repair_tool_call_json(raw: str) -> str | None:
     """Attempt to repair common garbled JSON in tool call payloads.
@@ -3525,7 +4263,9 @@ def _repair_tool_call_json(raw: str) -> str | None:
     return None
-def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
+def _extract_tool_calls_from_text(
+    text: str, available_tools: list[dict] | None = None
+) -> tuple[list[dict], str]:
     """Parse ``<tool_call>{...}</tool_call>`` blocks out of *text*.
     Returns a tuple of (extracted_openai_tool_calls, remaining_text).
@@ -3535,8 +4275,18 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
     The *remaining_text* has the matched ``<tool_call>`` blocks removed.
     If no valid blocks are found the original text is returned unchanged.
+    Falls back to Hermes-style ``<function=X><parameter=K>V</parameter></function>``
+    for older Qwen/Llama fine-tunes, then to Gemma 4's
+    ``<|tool_call>call:N{...}<tool_call|>`` DSL and ```json``` markdown
+    blocks. Anything not matching any known format falls through unchanged
+    so plain prose passes the parser without mutation.
     """
-    if "<tool_call>" not in text:
+    if (
+        "<tool_call>" not in text
+        and "<function=" not in text
+        and "<|tool_call>" not in text
+        and "```" not in text
+    ):
         return [], text
     extracted: list[dict] = []
@@ -3572,14 +4322,24 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
         extracted.append(
             {
-                "id": f"call_{uuid.uuid4().hex[:12]}",
+                "id": f"toolu_{uuid.uuid4().hex[:24]}",
                 "type": "function",
                 "function": {"name": name, "arguments": arguments},
             }
         )
     if not extracted:
-        return [], text
+        # Fall back to Hermes format. This catches Qwen emissions on finalize
+        # turns where grammar is not applied and the model defaults to its
+        # base training's <function=X><parameter=K>V</parameter></function>
+        # format instead of the <tool_call>{JSON}</tool_call> Qwen template
+        # format. Without this path, tool_calls=[] and the client halts.
+        hermes_calls, hermes_remaining = _extract_hermes_tool_calls(text)
+        if hermes_calls:
+            return hermes_calls, hermes_remaining
+        # Then try Gemma 4's DSL + markdown-JSON fallback. Anything still
+        # not matching falls through as plain text.
+        return _extract_gemma4_tool_calls(text, available_tools=available_tools)
     # Strip matched tool_call blocks from the text
     remaining = _TOOL_CALL_XML_RE.sub("", text).strip()
@@ -4550,6 +5310,16 @@ def _classify_tool_response_issue(
     if "tools" not in anthropic_body:
         return ToolResponseIssue()
+    # When the upstream response was cut off by max_tokens (finish_reason=length),
+    # any garbled/unbalanced-brace appearance in the tool args is almost
+    # certainly truncation, not degenerate generation. Re-classify such
+    # issues as "truncated_tool_args" so the caller can still retry (with a
+    # larger cap) but WITHOUT triggering the forced-tool dampener, which
+    # otherwise penalises a perfectly-recoverable truncation event.
+    choice_for_finish, _ = _extract_openai_choice(openai_resp)
+    finish_reason = (choice_for_finish.get("finish_reason") or "").lower()
+    was_truncated = finish_reason == "length"
     if _is_malformed_tool_response(openai_resp, anthropic_body):
         return ToolResponseIssue(
             kind="malformed_payload",
@@ -4593,15 +5363,107 @@ def _classify_tool_response_issue(
             allowed_tools,
         )
         if issue.has_issue():
+            # Downgrade invalid_tool_args to truncated_tool_args when the
+            # response hit max_tokens — retry path still fires but the
+            # dampener/streak counters stay cold.
+            if was_truncated and issue.kind == "invalid_tool_args":
+                return ToolResponseIssue(
+                    kind="truncated_tool_args",
+                    reason=(
+                        f"tool call for '{tool_name}' truncated by max_tokens "
+                        f"({issue.reason})"
+                    ),
+                    retry_hint=issue.retry_hint,
+                )
             return issue
     return ToolResponseIssue()
+# 2026-05-12: Regex for the tool-XML tag scanner. Captures opening vs
+# closing form (group 1: "/" or ""), the tag name (group 2), and any
+# attributes (group 3). Matches <parameter>, <parameter=key>,
+# <parameter name="key">, </parameter>, <function=name>, </function>.
+_TOOL_XML_TAG_RE = re.compile(r"<(/?)(parameter|function)\b([^>]*)>")
+def _strip_orphan_tool_xml(text: str) -> str:
+    """Drop ``</parameter>`` / ``</function>`` closers that close nothing.
+    The text is scanned left to right while counting, separately for
+    ``parameter`` and ``function``, the openers seen so far that have not
+    been closed yet. Openers are always kept. A closer is kept (and consumes
+    one opener of its kind) when such an opener exists; otherwise it is an
+    orphan and is removed. All other text is left untouched.
+    Background: Qwen3.6, trained on the qwen3_coder XML format, leaks such
+    closers after its real answer when forced into tool_choice='required'
+    with no genuine tool to call, and they would otherwise trip the
+    primary_markers branch of _looks_malformed_tool_payload. Genuine
+    malformed tool-call attempts keep at least one opener, which this
+    function preserves.
+    """
+    if "</parameter" not in text and "</function" not in text:
+        return text
+    unclosed = {"parameter": 0, "function": 0}
+    def keep_or_drop(match: re.Match) -> str:
+        closing, tag = match.group(1), match.group(2)
+        if not closing:
+            # Opener: remember it and keep the tag.
+            unclosed[tag] += 1
+            return match.group(0)
+        if unclosed[tag] > 0:
+            # Closer with a pending opener of the same kind: keep it.
+            unclosed[tag] -= 1
+            return match.group(0)
+        # Orphan closer: strip it.
+        return ""
+    return _TOOL_XML_TAG_RE.sub(keep_or_drop, text)
 def _looks_malformed_tool_payload(text: str) -> bool:
     if not text:
         return False
+    # 2026-05-12: Strip balanced <think>...</think> blocks before applying
+    # the heuristic. Qwen3.6 emits <think> blocks regardless of
+    # enable_thinking, and two scenarios were tripping false positives:
+    #   1. Meta-tool reasoning inside the thinking ({"description":...},
+    #      repeated "must call a tool") triggering the structural-marker
+    #      and policy-echo branches.
+    #   2. The model wrapping its ENTIRE answer inside a single <think>
+    #      block (markdown reports, tables) — the </think> structural
+    #      marker plus content-resembling-policy then fires.
+    # Downstream response processing surfaces <think> content as proper
+    # Anthropic `thinking` blocks via _THINKING_BLOCK_RE, so stripping
+    # here loses no information. Unbalanced/stray </think> without a
+    # matching opener is NOT stripped — those remain genuinely malformed.
+    if "<think>" in text and "</think>" in text:
+        text = _THINKING_BLOCK_RE.sub("", text)
+        if not text.strip():
+            return False
+    # 2026-05-12: Strip orphan </parameter> and </function> closers that
+    # have no matching opener. Qwen3.6 leaks these training residuals
+    # after its visible answer when forced into tool_choice='required'
+    # with no valid tool to call. Real malformed tool-call attempts retain
+    # their opener and still trip the primary_markers check below.
+    text = _strip_orphan_tool_xml(text)
+    if not text.strip():
+        return False
     lowered = text.lower()
     if _contains_tool_call_apology(text):
         return True
@@ -4836,13 +5698,17 @@ def _build_malformed_retry_body(
         retry_instruction = (
             "Your previous response had invalid tool-call formatting. "
             "Respond with exactly one valid tool call using the provided tools. "
-            "Do not output prose, markdown, XML tags, or schema snippets."
+            "Do not output prose, markdown, XML tags, or schema snippets. "
+            "Do NOT use <think>...</think> blocks or internal reasoning — "
+            "emit the tool_call object as the very first token of your response."
         )
     else:
         retry_instruction = (
             "Your previous response had invalid tool-call formatting. "
             "If a tool is needed, emit exactly one valid tool call with strict JSON arguments. "
-            "If no tool is needed for this turn, return concise plain text with no protocol tags."
+            "If no tool is needed for this turn, return concise plain text with no protocol tags. "
+            "Do NOT use <think>...</think> blocks — start your response directly with "
+            "either a tool_call or the plain text answer."
         )
     malformed_retry_instruction = {
@@ -5023,7 +5889,7 @@ async def _apply_unexpected_end_turn_guardrail(
     )
     if retry_resp.status_code == 200:
         retry_json = retry_resp.json()
-        _maybe_extract_text_tool_calls(retry_json)
+        _maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
         retry_choice, retry_message = _extract_openai_choice(retry_json)
         if _openai_has_valid_tool_calls(retry_json, anthropic_body):
             logger.info("GUARDRAIL: retry produced tool_use; using retried response")
@@ -5112,8 +5978,12 @@ async def _apply_malformed_tool_guardrail(
             )
         return working_resp
-    # Mark garbled state for progressive max_tokens reduction on next turn
-    monitor.last_response_garbled = True
+    # Only set last_response_garbled for TRUE degenerate generation, not
+    # for responses merely truncated by max_tokens — otherwise the next
+    # turn gets hit with the garbled_cap (smaller max_tokens) and the
+    # problem compounds.
+    if issue.kind != "truncated_tool_args":
+        monitor.last_response_garbled = True
     if issue.kind == "malformed_payload":
         monitor.malformed_tool_streak += 1
@@ -5121,7 +5991,12 @@ async def _apply_malformed_tool_guardrail(
         monitor.invalid_tool_call_streak += 1
         monitor.arg_preflight_rejections += 1
-    monitor.maybe_activate_forced_tool_dampener(issue.kind)
+    # Truncation is a max_tokens accident, not the model misbehaving: don't
+    # feed it to the forced-tool dampener, which would otherwise relax
+    # tool_choice on the very next turn and let the model trail off with
+    # text (the exact failure mode that stopped opencode).
+    if issue.kind != "truncated_tool_args":
+        monitor.maybe_activate_forced_tool_dampener(issue.kind)
     excerpt = _openai_message_text(working_resp)[:220].replace("\n", " ")
     # Option 2: Log garbled argument content for diagnostics
     arg_excerpt = ""
@@ -5194,7 +6069,7 @@ async def _apply_malformed_tool_guardrail(
             continue
         retry_json = retry_resp.json()
-        _maybe_extract_text_tool_calls(retry_json)
+        _maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
         retry_working = retry_json
         retry_repairs = 0
         if PROXY_TOOL_ARGS_PREFLIGHT and _openai_has_tool_calls(retry_json):
@@ -5226,15 +6101,20 @@ async def _apply_malformed_tool_guardrail(
         )
         if not retry_issue.has_issue():
-            monitor.malformed_tool_streak = 0
-            monitor.invalid_tool_call_streak = 0
-            monitor.required_tool_miss_streak = 0
+            # 2026-05-12: Fix #2 — do NOT reset malformed/invalid/miss streaks
+            # to 0 on retry-success. Previously, sessions stuck in a
+            # malformed→retry-success loop never accumulated enough streak to
+            # trigger the forced-tool dampener. Healthy responses with real
+            # tool_calls still reset the streak via the upstream no-issue path
+            # (~L5655), so genuine recovery still resets counters; only
+            # repeated retry-recoveries persist toward the dampener.
             monitor.last_response_garbled = False
             logger.info(
-                "TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d",
+                "TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d malformed_streak=%d",
                 current_issue.kind,
                 attempt + 1,
                 attempts,
+                monitor.malformed_tool_streak,
             )
             if retry_repairs > 0:
                 monitor.arg_preflight_repairs += retry_repairs
@@ -5259,7 +6139,10 @@ async def _apply_malformed_tool_guardrail(
                 if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
                     failing_tools.add(fn_name)
-        monitor.maybe_activate_forced_tool_dampener(retry_issue.kind)
+        # Truncation on retry is still a max_tokens problem, not a model
+        # misbehaviour — don't dampen. The outer retry loop will try again.
+        if retry_issue.kind != "truncated_tool_args":
+            monitor.maybe_activate_forced_tool_dampener(retry_issue.kind)
         logger.warning(
             "TOOL RESPONSE RETRY invalid: session=%s attempt=%d/%d kind=%s reason=%s",
             session_id,
@@ -5440,11 +6323,19 @@ def _maybe_apply_session_contamination_breaker(
 # ===========================================================================
-def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
+def _maybe_extract_text_tool_calls(
+    openai_resp: dict, anthropic_tools: list[dict] | None = None
+) -> dict:
     """Mutate *openai_resp* in-place: if the message has no structured
-    ``tool_calls`` but contains ``<tool_call>`` XML in text, extract them
-    and promote to real ``tool_calls`` on the message.  Returns the
-    (possibly-mutated) response for chaining."""
+    ``tool_calls`` but contains tool-call markup in text, extract them
+    and promote to real ``tool_calls`` on the message.
+    *anthropic_tools* (optional): list of tool definitions from the original
+    Anthropic request. Enables schema-matching of bare-args markdown JSON
+    blocks emitted by Gemma 4 cold turns (fix D). Without it, bare-args
+    blocks pass through as text.
+    Returns the (possibly-mutated) response for chaining."""
     choice = (openai_resp.get("choices") or [{}])[0]
     message = choice.get("message", {})
@@ -5453,10 +6344,20 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
         return openai_resp
     text = message.get("content", "")
-    if not isinstance(text, str) or "<tool_call>" not in text:
+    if not isinstance(text, str):
+        return openai_resp
+    # Quick early-exit if no markers present (matches dispatcher guard)
+    if (
+        "<tool_call>" not in text
+        and "<function=" not in text
+        and "<|tool_call>" not in text
+        and "```" not in text
+    ):
         return openai_resp
-    extracted, remaining = _extract_tool_calls_from_text(text)
+    extracted, remaining = _extract_tool_calls_from_text(
+        text, available_tools=anthropic_tools
+    )
     if not extracted:
         return openai_resp
@@ -5591,8 +6492,43 @@ def _inject_synthetic_continuation(
     return anthropic_resp
-def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
-    """Convert an OpenAI Chat Completions response to Anthropic Messages format."""
+_THINKING_BLOCK_RE = re.compile(r"<think>(.*?)</think>\s*", re.DOTALL)
+def _extract_thinking_block(text: str) -> tuple[str | None, str]:
+    """Extract Qwen-style ``<think>...</think>`` blocks from *text*.
+    Returns ``(thinking_content, remaining_text)``. If no complete
+    ``<think>...</think>`` pair is present (including an opening tag that is
+    never closed), returns ``(None, text)`` unchanged. Multiple thinking blocks
+    are concatenated. Trailing whitespace after each block is consumed so
+    the remaining text starts cleanly with the model's actual answer.
+    """
+    if "<think>" not in text:
+        return None, text
+    parts: list[str] = []
+    def collect(m: re.Match) -> str:
+        parts.append(m.group(1).strip())
+        return ""
+    remaining = _THINKING_BLOCK_RE.sub(collect, text)
+    if not parts:
+        return None, text
+    return "\n\n".join(p for p in parts if p), remaining.lstrip()
+def openai_to_anthropic_response(
+    openai_resp: dict, model: str, expose_thinking: bool = True
+) -> dict:
+    """Convert an OpenAI Chat Completions response to Anthropic Messages format.
+    *expose_thinking*: when True, surface ``<think>...</think>`` content from
+    the upstream as Anthropic ``{"type": "thinking"}`` blocks. When False
+    (Anthropic default — client didn't opt in), strip thinking content
+    from the response entirely so the client only sees the actual answer.
+    Qwen's chat template seeds the model into thinking regardless of the
+    ``enable_thinking`` request param, so even thinking-off responses
+    typically still contain ``<think>`` blocks; this flag controls whether
+    they're surfaced as Anthropic blocks or silently consumed.
+    """
     # First: try to recover tool calls trapped in text XML tags
     _maybe_extract_text_tool_calls(openai_resp)
     # Second: strip garbled/degenerate tool call arguments
@@ -5603,20 +6539,46 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
     finish = choice.get("finish_reason", "stop")
     content = []
+    # Surface Qwen's <think>...</think> output as Anthropic-style thinking
+    # blocks (Anthropic extended-thinking API shape:
+    #   {"type": "thinking", "thinking": "...", "signature": ""}).
+    # The block is emitted only when expose_thinking is true; otherwise the
+    # thinking text is dropped from the response. Clients that request
+    # thinking (Claude Code) render the block in the thinking pane.
+    raw_text = ""
     if message.get("content"):
         raw_text = (
             message["content"]
             if isinstance(message["content"], str)
             else str(message["content"])
         )
-        sanitized_text = _sanitize_tool_call_apology_text(raw_text)
-        if sanitized_text != raw_text:
+    # Some llama-server builds emit the model's reasoning into a separate
+    # `reasoning_content` field instead of inline <think> tags. Surface
+    # that too so the proxy is consistent regardless of upstream behaviour.
+    inline_thinking, body_text = _extract_thinking_block(raw_text)
+    sidecar_thinking = message.get("reasoning_content") or message.get("reasoning")
+    thinking_chunks: list[str] = []
+    if isinstance(sidecar_thinking, str) and sidecar_thinking.strip():
+        thinking_chunks.append(sidecar_thinking.strip())
+    if inline_thinking:
+        thinking_chunks.append(inline_thinking)
+    if thinking_chunks and expose_thinking:
+        content.append(
+            {
+                "type": "thinking",
+                "thinking": "\n\n".join(thinking_chunks),
+                "signature": "",
+            }
+        )
+    if body_text:
+        sanitized_text = _sanitize_tool_call_apology_text(body_text)
+        if sanitized_text != body_text:
             logger.warning(
                 "SANITIZE: replaced known malformed tool-call apology text in assistant response"
             )
         # Option 1: Strip residual <tool_call> XML that wasn't extracted
         sanitized_text = _strip_residual_tool_call_xml(sanitized_text)
-        if sanitized_text != raw_text and "<tool_call>" in raw_text:
+        if sanitized_text != body_text and "<tool_call>" in body_text:
             logger.warning(
                 "SANITIZE: stripped residual <tool_call> XML from text content"
             )
@@ -5641,10 +6603,21 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
                     logger.warning(
                         "BASH SAFETY: stripped standalone protocol-tag lines from command before tool execution"
                     )
+        # Normalise IDs to Anthropic spec (toolu_ prefix). Upstream
+        # llama-server returns opaque IDs without the prefix; clients that
+        # validate the prefix would reject them. Prepend the prefix here (or
+        # mint a fresh ID when upstream supplies none), and restore the
+        # original ID in anthropic_to_openai_messages() when the client sends
+        # the tool_result back.
+        upstream_id = tc.get("id", "")
+        if upstream_id.startswith("toolu_"):
+            tool_use_id = upstream_id
+        elif upstream_id:
+            tool_use_id = f"toolu_{upstream_id}"
+        else:
+            tool_use_id = f"toolu_{uuid.uuid4().hex[:24]}"
         content.append(
             {
                 "type": "tool_use",
-                "id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
+                "id": tool_use_id,
                 "name": fn.get("name", ""),
                 "input": args,
             }
@@ -6341,7 +7314,7 @@ async def messages(request: Request):
             )
         except Exception as exc:
             # Check if upstream is hung before returning error
-            await _check_slot_hang(f"{LLAMA_CPP_BASE}/slots")
+            await _check_slot_hang(LLAMA_CPP_BASE.replace("/v1", "/slots"))
             return Response(
                 content=json.dumps(
                     {
@@ -6356,6 +7329,23 @@ async def messages(request: Request):
                 media_type="application/json",
             )
+        if strict_resp.status_code != 200:
+            error_text = strict_resp.text[:1000]
+            # Try the Gemma 4 PEG parse-failure recovery first — relax
+            # tool_choice='required' so the retry isn't constrained by the
+            # strict-grammar that triggered the parse failure.
+            relaxed = _is_gemma4_peg_parse_failure(strict_resp.status_code, error_text) and \
+                _relax_tool_choice_for_gemma4_peg_retry(strict_body, "strict-stream")
+            if relaxed:
+                try:
+                    strict_resp = await _post_with_generation_timeout(
+                        client,
+                        f"{LLAMA_CPP_BASE}/chat/completions",
+                        strict_body,
+                        {"Content-Type": "application/json"},
+                    )
+                except Exception:
+                    pass  # fall through to next handler
         if strict_resp.status_code != 200:
             error_text = strict_resp.text[:1000]
             if _maybe_disable_grammar_for_tools_error(
@@ -6430,7 +7420,7 @@ async def messages(request: Request):
         openai_resp = strict_resp.json()
         # Recover tool calls from <tool_call> XML before guardrails run
-        _maybe_extract_text_tool_calls(openai_resp)
+        _maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
         openai_resp = await _apply_unexpected_end_turn_guardrail(
             client,
             openai_resp,
@@ -6485,7 +7475,11 @@ async def messages(request: Request):
                         logger.info("DEGENERATE RETRY: retry insufficient, using truncated original")
             except Exception as exc:
                 logger.warning("DEGENERATE RETRY: failed: %s", exc)
-        anthropic_resp = openai_to_anthropic_response(openai_resp, model)
+        anthropic_resp = openai_to_anthropic_response(
+            openai_resp, model,
+            expose_thinking=isinstance(body.get("thinking"), dict)
+                and (body["thinking"].get("type") or "").lower() == "enabled",
+        )
         # FINALIZE CONTINUATION: inject synthetic tool_use to keep client loop alive
         if (
             monitor.finalize_turn_active
@@ -6601,6 +7595,29 @@ async def messages(request: Request):
             error_body = await resp.aread()
             await resp.aclose()
             error_text = error_body.decode("utf-8", errors="replace")[:1000]
+            # Gemma 4 PEG parse-failure recovery: relax tool_choice='required'
+            # so the retry isn't blocked by the strict-grammar that rejected
+            # the model's incomplete tool call.
+            if _is_gemma4_peg_parse_failure(resp.status_code, error_text) and \
+                    _relax_tool_choice_for_gemma4_peg_retry(openai_body, "stream"):
+                resp = await client.send(
+                    client.build_request(
+                        "POST",
+                        f"{LLAMA_CPP_BASE}/chat/completions",
+                        json=openai_body,
+                        headers={"Content-Type": "application/json"},
+                    ),
+                    stream=True,
+                )
+                if resp.status_code == 200:
+                    return StreamingResponse(
+                        stream_anthropic_response(resp, model, monitor, body),
+                        media_type="text/event-stream",
+                    )
+                # fall through if still failing
+                error_body = await resp.aread()
+                await resp.aclose()
+                error_text = error_body.decode("utf-8", errors="replace")[:1000]
             if _maybe_disable_grammar_for_tools_error(
                 openai_body,
                 resp.status_code,
@@ -6733,6 +7750,23 @@ async def messages(request: Request):
                 media_type="application/json",
             )
+        if resp.status_code != 200:
+            error_text = resp.text[:1000]
+            # Gemma 4 PEG parse-failure recovery (non-stream path).
+            relaxed = (
+                _is_gemma4_peg_parse_failure(resp.status_code, error_text)
+                and _relax_tool_choice_for_gemma4_peg_retry(openai_body, "non-stream")
+            )
+            if relaxed:
+                try:
+                    resp = await _post_with_generation_timeout(
+                        client,
+                        f"{LLAMA_CPP_BASE}/chat/completions",
+                        openai_body,
+                        {"Content-Type": "application/json"},
+                    )
+                except Exception:
+                    pass  # fall through
         if resp.status_code != 200:
             error_text = resp.text[:1000]
             if _maybe_disable_grammar_for_tools_error(
@@ -6785,7 +7819,7 @@ async def messages(request: Request):
         openai_resp = resp.json()
         # Recover tool calls from <tool_call> XML before guardrails run
-        _maybe_extract_text_tool_calls(openai_resp)
+        _maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
         openai_resp = await _apply_unexpected_end_turn_guardrail(
             client,
             openai_resp,
@@ -6854,7 +7888,11 @@ async def messages(request: Request):
                         logger.info("DEGENERATE RETRY (stream): no tool call, using truncated")
             except Exception as exc:
                 logger.warning("DEGENERATE RETRY (stream): failed: %s", exc)
-        anthropic_resp = openai_to_anthropic_response(openai_resp, model)
+        anthropic_resp = openai_to_anthropic_response(
+            openai_resp, model,
+            expose_thinking=isinstance(body.get("thinking"), dict)
+                and (body["thinking"].get("type") or "").lower() == "enabled",
+        )
         # FINALIZE CONTINUATION: inject synthetic tool_use (non-guarded stream path)
         if (
             monitor.finalize_turn_active