@miller-tech/uap 1.20.34 → 1.20.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -134,6 +134,11 @@ PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
134
134
  }
135
135
  PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
136
136
  PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "6"))
137
+ # Fix K (2026-04-22): minimum consecutive cycle-repeat count required to flip
138
+ # phase from act -> review. The old behaviour accepted cycle_repeat=2, which
139
+ # is normal in a working session (re-reading the same file across edits).
140
+ # Set higher to tolerate legitimate re-reads; set 1 to restore old behaviour.
141
+ PROXY_CYCLE_TRIGGER_REPEAT = int(os.environ.get("PROXY_CYCLE_TRIGGER_REPEAT", "3"))
137
142
  PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
138
143
  PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "3"))
139
144
  PROXY_CONTEXT_RELEASE_THRESHOLD = float(
@@ -247,6 +252,19 @@ PROXY_DISABLE_THINKING_ON_TOOL_TURNS = os.environ.get(
247
252
  "off",
248
253
  "no",
249
254
  }
255
# Switch thinking off on EVERY turn rather than only on tool turns. Models such
# as Gemma 4 emit ~100 thinking tokens even for trivial replies, so disabling
# it roughly halves output cost. Any value other than 0/false/off/no (compared
# case-insensitively) turns the flag on; the default is off.
PROXY_DISABLE_THINKING_ALWAYS = os.getenv(
    "PROXY_DISABLE_THINKING_ALWAYS", "off"
).lower() not in ("0", "false", "off", "no")
# Force tool_choice='required' on the first turn of a fresh session. This was
# originally tuned for Qwen, which tends to chat on a cold start instead of
# calling a tool. Gemma 4 routes 'auto' correctly, and forcing it provokes
# malformed-JSON emissions when the model would rather speak. Default off;
# set 'on' to restore the legacy Qwen-style behaviour.
PROXY_FORCE_TOOL_CHOICE_ON_COLD_START = os.getenv(
    "PROXY_FORCE_TOOL_CHOICE_ON_COLD_START", "off"
).lower() not in ("0", "false", "off", "no")
250
268
  PROXY_DISABLE_SPEC_ON_TOOL_TURNS = os.environ.get(
251
269
  "PROXY_DISABLE_SPEC_ON_TOOL_TURNS", "off"
252
270
  ).lower() not in {
@@ -576,6 +594,44 @@ def _is_grammar_tools_incompatibility(status_code: int, error_text: str) -> bool
576
594
  return "custom grammar constraints" in lowered and "with tools" in lowered
577
595
 
578
596
 
597
+ def _is_gemma4_peg_parse_failure(status_code: int, error_text: str) -> bool:
598
+ """Detect Gemma 4's PEG-parser failure on tool-turn output.
599
+
600
+ llama-server returns HTTP 500 with `failed to parse grammar` /
601
+ `Failed to parse input at pos N: <|tool_call>call:...` when the model
602
+ emits an incomplete tool call (missing required schema fields) under
603
+ tool_choice='required'. The PEG grammar enforces the schema strictly
604
+ and rejects the partial output. Caller should retry with relaxed
605
+ tool_choice='auto' so the model can emit prose or a complete call
606
+ without grammar enforcement triggering this failure mode.
607
+ """
608
+ if status_code != 500:
609
+ return False
610
+ text = error_text or ""
611
+ return (
612
+ "Failed to parse input at pos" in text
613
+ or "<|tool_call>call:" in text
614
+ )
615
+
616
+
617
+ def _relax_tool_choice_for_gemma4_peg_retry(request_body: dict, source: str) -> bool:
618
+ """When a Gemma 4 PEG parse failure is detected on a tool turn, drop
619
+ tool_choice='required' so the retry has a permissive grammar. Returns
620
+ True if the body was modified (caller should retry POST)."""
621
+ if not request_body.get("tools"):
622
+ return False
623
+ current = request_body.get("tool_choice")
624
+ if current in ("required", {"type": "any"}):
625
+ request_body["tool_choice"] = "auto"
626
+ logger.warning(
627
+ "GEMMA4 PEG RETRY (%s): relaxed tool_choice='required' -> 'auto' "
628
+ "to bypass strict-grammar parse failure on incomplete model output",
629
+ source,
630
+ )
631
+ return True
632
+ return False
633
+
634
+
579
635
  def _maybe_disable_grammar_for_tools_error(
580
636
  request_body: dict,
581
637
  status_code: int,
@@ -1409,6 +1465,66 @@ def prune_conversation(
1409
1465
  # Granular timeouts: short connect, long read for streaming LLM output.
1410
1466
  http_client: httpx.AsyncClient | None = None
1411
1467
 
1468
+ # ---------------------------------------------------------------------------
1469
+ # Concurrency Control
1470
+ # ---------------------------------------------------------------------------
1471
+ # Semaphore to serialize upstream requests. llama.cpp is configured with
1472
+ # --parallel 1 (LLAMA_PARALLEL=1), so it can only process one inference at
1473
+ # a time. Without this gate, concurrent client requests (Shannon sub-agents,
1474
+ # multiple Claude Code sessions) would all hit llama.cpp at once and the
1475
+ # server would serialize them while the proxy holds N httpx connections
1476
+ # open — potentially exhausting the proxy's connection pool while requests
1477
+ # queue inside llama.cpp opaquely.
1478
+ #
1479
+ # With the semaphore: requests queue inside the proxy (cheap, just asyncio
1480
+ # tasks waiting) and only PROXY_CONCURRENCY_LIMIT at a time reaches
1481
+ # llama.cpp. Each httpx connection is held only for the actual inference
1482
+ # duration, not the queue wait.
1483
+ #
1484
+ # Default: 1 (matches LLAMA_PARALLEL=1). Increase if you raise --parallel.
1485
+ PROXY_CONCURRENCY_LIMIT = int(os.environ.get("PROXY_CONCURRENCY_LIMIT", "1"))
1486
+ # Max time to wait for a slot before returning 503. Generous because real
1487
+ # inference can take 30-600s and queued requests must wait through that.
1488
+ # 0 = wait indefinitely.
1489
+ PROXY_CONCURRENCY_QUEUE_TIMEOUT = float(
1490
+ os.environ.get("PROXY_CONCURRENCY_QUEUE_TIMEOUT", "900")
1491
+ )
1492
+ upstream_semaphore: asyncio.Semaphore | None = None
1493
+
1494
+
1495
async def _acquire_upstream_slot() -> bool:
    """Wait for a free upstream slot.

    Behaviour depends on module state:

    * ``upstream_semaphore`` is None (not initialised yet): return True
      immediately, without any limiting.
    * ``PROXY_CONCURRENCY_QUEUE_TIMEOUT`` is zero or negative: wait on the
      semaphore with no deadline.
    * otherwise: wait at most ``PROXY_CONCURRENCY_QUEUE_TIMEOUT`` seconds.

    Returns:
        True once a slot is held (or limiting is inactive), False when the
        bounded wait timed out. Each True return should be paired with
        exactly one ``_release_upstream_slot()`` call.
    """
    gate = upstream_semaphore
    if gate is None:
        return True  # Not yet initialized; proceed without limiting

    wait_limit = PROXY_CONCURRENCY_QUEUE_TIMEOUT
    if wait_limit > 0:
        try:
            await asyncio.wait_for(gate.acquire(), timeout=wait_limit)
        except asyncio.TimeoutError:
            return False
        return True

    # Non-positive timeout: block until a slot frees up.
    await gate.acquire()
    return True
1515
+
1516
+
1517
def _release_upstream_slot() -> None:
    """Hand a slot back to the upstream semaphore.

    Must be called exactly once for every successful acquire. It is a no-op
    while ``upstream_semaphore`` is None.

    The release is intentionally unconditional: ``asyncio.Semaphore.release()``
    always increments the counter, whereas ``locked()`` is True only when the
    counter is 0. Gating the release on ``locked()`` would therefore leak
    slots when the limit is above 1 and several holders release together.
    """
    if upstream_semaphore is None:
        return
    upstream_semaphore.release()
1527
+
1412
1528
 
1413
1529
  def _is_loading_model_503(resp: httpx.Response) -> bool:
1414
1530
  """Check if response is a 503 'Loading model' from llama.cpp."""
@@ -1452,6 +1568,36 @@ async def _post_with_retry(
1452
1568
  url: str,
1453
1569
  payload: dict,
1454
1570
  headers: dict,
1571
+ ) -> httpx.Response:
1572
+ """Post with upstream-retry + concurrency-slot acquire.
1573
+
1574
+ Acquires a slot from upstream_semaphore before making the request, so
1575
+ concurrent client requests queue in the proxy (cheap asyncio waits)
1576
+ rather than all hammering llama.cpp at once. Slot is released in a
1577
+ finally block so it's always returned to the pool even on error.
1578
+ """
1579
+ acquired = await _acquire_upstream_slot()
1580
+ if not acquired:
1581
+ logger.warning(
1582
+ "CONCURRENCY: queue timeout (%ds) exceeded waiting for upstream slot",
1583
+ int(PROXY_CONCURRENCY_QUEUE_TIMEOUT),
1584
+ )
1585
+ raise httpx.RemoteProtocolError(
1586
+ f"Upstream concurrency queue timed out after {int(PROXY_CONCURRENCY_QUEUE_TIMEOUT)}s "
1587
+ f"(limit={PROXY_CONCURRENCY_LIMIT})",
1588
+ request=None,
1589
+ )
1590
+ try:
1591
+ return await _post_with_retry_inner(client, url, payload, headers)
1592
+ finally:
1593
+ _release_upstream_slot()
1594
+
1595
+
1596
+ async def _post_with_retry_inner(
1597
+ client: httpx.AsyncClient,
1598
+ url: str,
1599
+ payload: dict,
1600
+ headers: dict,
1455
1601
  ) -> httpx.Response:
1456
1602
  last_exc: Exception | None = None
1457
1603
  for attempt in range(PROXY_UPSTREAM_RETRY_MAX):
@@ -1497,6 +1643,7 @@ async def _post_with_generation_timeout(
1497
1643
  headers: dict,
1498
1644
  ) -> httpx.Response:
1499
1645
  """Wrap _post_with_retry with an explicit asyncio generation timeout.
1646
+ Also acquires a concurrency slot before making the request.
1500
1647
 
1501
1648
  The httpx read timeout may not fire for hung connections where the server
1502
1649
  keeps the socket open but produces no data (observed with llama.cpp server
@@ -1561,6 +1708,13 @@ async def lifespan(app: FastAPI):
1561
1708
  """Manage the httpx client lifecycle with the FastAPI app."""
1562
1709
  global http_client
1563
1710
  global default_context_window
1711
+ global upstream_semaphore
1712
+ upstream_semaphore = asyncio.Semaphore(PROXY_CONCURRENCY_LIMIT)
1713
+ logger.info(
1714
+ "CONCURRENCY: upstream semaphore initialized limit=%d queue_timeout=%.0fs",
1715
+ PROXY_CONCURRENCY_LIMIT,
1716
+ PROXY_CONCURRENCY_QUEUE_TIMEOUT,
1717
+ )
1564
1718
  http_client = httpx.AsyncClient(
1565
1719
  timeout=httpx.Timeout(
1566
1720
  connect=10.0, # 10s to establish connection
@@ -1643,6 +1797,8 @@ async def lifespan(app: FastAPI):
1643
1797
  yield
1644
1798
  await http_client.aclose()
1645
1799
  http_client = None
1800
+ if upstream_semaphore is not None:
1801
+ upstream_semaphore = None
1646
1802
  logger.info("Proxy shut down")
1647
1803
 
1648
1804
 
@@ -1653,6 +1809,16 @@ app = FastAPI(
1653
1809
  lifespan=lifespan,
1654
1810
  )
1655
1811
 
1812
+ # NOTE: Concurrency control is enforced by _acquire_upstream_slot() inside
1813
+ # _post_with_retry (the single point where we hit llama.cpp). An earlier
1814
+ # implementation also added an HTTP middleware that acquired the same
1815
+ # semaphore — this caused a self-deadlock (middleware holds slot, inner
1816
+ # call waits for slot, both on the same task). The middleware approach
1817
+ # also called non-existent asyncio.Semaphore methods (try_acquire /
1818
+ # acquire_nowait) and ran an async primitive in a thread executor.
1819
+ # Removed 2026-05-13.
1820
+
1821
+
1656
1822
 
1657
1823
  # ===========================================================================
1658
1824
  # Request Translation: Anthropic -> OpenAI
@@ -1686,6 +1852,31 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
1686
1852
  role = msg["role"]
1687
1853
  content = msg.get("content")
1688
1854
 
1855
+ # Strip <think>...</think> blocks from PRIOR assistant turns. Qwen is
1856
+ # heavily few-shot influenced by its own conversation history — if
1857
+ # earlier assistant turns contain reasoning blocks, the next turn
1858
+ # will pattern-match and emit <think> tags even when the system
1859
+ # prompt forbids them. Stripping breaks the copy cycle.
1860
+ if role == "assistant":
1861
+ if isinstance(content, str) and "<think>" in content:
1862
+ content = _THINKING_BLOCK_RE.sub("", content).lstrip()
1863
+ elif isinstance(content, list):
1864
+ stripped = []
1865
+ for b in content:
1866
+ if isinstance(b, dict) and b.get("type") == "text":
1867
+ t = b.get("text", "")
1868
+ if "<think>" in t:
1869
+ t = _THINKING_BLOCK_RE.sub("", t).lstrip()
1870
+ if t:
1871
+ stripped.append({**b, "text": t})
1872
+ elif isinstance(b, dict) and b.get("type") == "thinking":
1873
+ # Anthropic-style thinking block — drop entirely
1874
+ # (don't replay it back to the model).
1875
+ continue
1876
+ else:
1877
+ stripped.append(b)
1878
+ content = stripped
1879
+
1689
1880
  if isinstance(content, str):
1690
1881
  messages.append({"role": role, "content": content})
1691
1882
  elif isinstance(content, list):
@@ -1695,6 +1886,10 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
1695
1886
  parts.append(block)
1696
1887
  elif block.get("type") == "text":
1697
1888
  parts.append(block.get("text", ""))
1889
+ elif block.get("type") == "thinking":
1890
+ # Drop thinking blocks from user/assistant content when
1891
+ # echoed back into history — model shouldn't see them.
1892
+ continue
1698
1893
  elif block.get("type") == "tool_use":
1699
1894
  messages.append(
1700
1895
  {
@@ -1703,7 +1898,7 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
1703
1898
  "tool_calls": [
1704
1899
  {
1705
1900
  "id": block.get(
1706
- "id", f"call_{uuid.uuid4().hex[:8]}"
1901
+ "id", f"toolu_{uuid.uuid4().hex[:24]}"
1707
1902
  ),
1708
1903
  "type": "function",
1709
1904
  "function": {
@@ -1716,10 +1911,17 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
1716
1911
  )
1717
1912
  continue
1718
1913
  elif block.get("type") == "tool_result":
1914
+ # Strip Anthropic-spec toolu_ prefix so the upstream
1915
+ # tool_call_id matches what llama-server originally
1916
+ # emitted (we stamped the prefix on outbound; reverse it
1917
+ # here so the loop closes correctly).
1918
+ tu_id = block.get("tool_use_id", "")
1919
+ if isinstance(tu_id, str) and tu_id.startswith("toolu_"):
1920
+ tu_id = tu_id[len("toolu_"):]
1719
1921
  messages.append(
1720
1922
  {
1721
1923
  "role": "tool",
1722
- "tool_call_id": block.get("tool_use_id", ""),
1924
+ "tool_call_id": tu_id,
1723
1925
  "content": _extract_text(block.get("content", "")),
1724
1926
  }
1725
1927
  )
@@ -1899,6 +2101,18 @@ _AGENTIC_SYSTEM_SUPPLEMENT_MINIMAL = (
1899
2101
  "\n\nUse tools for all actions. Respond with tool calls, not descriptions of what to do."
1900
2102
  )
1901
2103
 
2104
+ # Directive appended when the upstream model (Qwen) is configured with
2105
+ # enable_thinking=False but consistently emits <think>...</think> blocks
2106
+ # anyway, consuming the max_tokens budget before any tool_use is generated.
2107
+ # Empirically required for Shannon-style workflows where max_tokens=512
2108
+ # leaves no room for both internal reasoning AND a tool call.
2109
+ _NO_THINKING_DIRECTIVE = (
2110
+ "\n\nCRITICAL: Do NOT output <think>...</think> tags or any internal "
2111
+ "reasoning. Begin your response IMMEDIATELY with the appropriate "
2112
+ "tool_call. If you have no tool to call, reply with plain text only — "
2113
+ "never include reasoning blocks."
2114
+ )
2115
+
1902
2116
  if PROXY_AGENTIC_SUPPLEMENT_MODE == "legacy":
1903
2117
  _AGENTIC_SYSTEM_SUPPLEMENT = _AGENTIC_SYSTEM_SUPPLEMENT_LEGACY
1904
2118
  elif PROXY_AGENTIC_SUPPLEMENT_MODE == "minimal":
@@ -2264,7 +2478,7 @@ def anthropic_to_openai_response(anthropic_resp: dict) -> dict:
2264
2478
  elif btype == "tool_use":
2265
2479
  tool_calls.append(
2266
2480
  {
2267
- "id": block.get("id", f"call_{uuid.uuid4().hex[:12]}"),
2481
+ "id": block.get("id", f"toolu_{uuid.uuid4().hex[:24]}"),
2268
2482
  "type": "function",
2269
2483
  "function": {
2270
2484
  "name": block.get("name", ""),
@@ -2347,6 +2561,72 @@ def _latest_user_text(anthropic_body: dict) -> str:
2347
2561
  return ""
2348
2562
 
2349
2563
 
2564
+ # 2026-05-12: Detect "no-task" user turns to gate the state machine's
2565
+ # force-required path. When the last actual human query is a short ack
2566
+ # ("ok", "3", "test"), an acknowledgement phrase ("standing by", "awaiting
2567
+ # next instruction"), or a status report ending in an ack ("scan complete.
2568
+ # awaiting next instruction"), there is no genuine work for the model to
2569
+ # do. Forcing tool_choice='required' in this state causes the model to
2570
+ # ruminate in <think> blocks, and the meta-tool talk inside those blocks
2571
+ # trips the malformed-pseudo-tool detector. Conservative patterns only.
2572
+ _NO_TASK_SHORT_ACKS = frozenset({
2573
+ "ok", "okay", "k", "kk", "y", "n", "yes", "no", "nope", "yep", "yeah",
2574
+ "thanks", "thank", "thx", "ty", "ack", "noted", "received", "understood",
2575
+ "test", "ping", "hi", "hello",
2576
+ })
2577
+
2578
+ _NO_TASK_ACK_PATTERNS = (
2579
+ re.compile(r"awaiting\s+(?:next|further|your)\s+(?:instruction|input|command|task|directive)", re.I),
2580
+ re.compile(r"standing\s+by(?:\s+for\s+(?:your\s+)?(?:next|further|new)\s+(?:instruction|input|command|task|directive)?)?", re.I),
2581
+ re.compile(r"\b(?:ready|waiting|holding)\s+for\s+(?:your\s+)?(?:next|further|new)\s+(?:task|instruction|command|input|directive)", re.I),
2582
+ # Status report ending in ack: "X complete. {awaiting/standing/ready/done}"
2583
+ re.compile(r"\bcomplet(?:e|ed)\b[\s.,;:!\-]+(?:awaiting|standing\s+by|ready|done|finished|over\s+to\s+you)", re.I),
2584
+ )
2585
+
2586
+
2587
+ def _is_no_task_user_text(text: str) -> bool:
2588
+ if not text:
2589
+ return False
2590
+ stripped = text.strip()
2591
+ if not stripped:
2592
+ return False
2593
+ bare = re.sub(r"[^\w\s]", "", stripped).strip().lower()
2594
+ if bare in _NO_TASK_SHORT_ACKS:
2595
+ return True
2596
+ if re.fullmatch(r"\d+(?:\.\d+)?", bare):
2597
+ return True
2598
+ snippet = stripped[:400]
2599
+ return any(p.search(snippet) for p in _NO_TASK_ACK_PATTERNS)
2600
+
2601
+
2602
+ def _latest_user_query_text(anthropic_body: dict) -> str:
2603
+ """Return the most recent user message *text* — walking past
2604
+ tool_result-only messages to find the last actual human query.
2605
+
2606
+ During agentic loops the trailing user message is a tool_result block
2607
+ with no ``text`` parts, so ``_latest_user_text`` returns empty.
2608
+ Tool-narrowing needs query tokens to score tools; without them it
2609
+ keeps all tools (defeating the purpose). This walker pulls text
2610
+ from prior user turns as a fallback so narrowing stays useful in
2611
+ long loops.
2612
+ """
2613
+ for msg in reversed(anthropic_body.get("messages", [])):
2614
+ if msg.get("role") != "user":
2615
+ continue
2616
+ content = msg.get("content", "")
2617
+ if isinstance(content, str) and content.strip():
2618
+ return content
2619
+ if isinstance(content, list):
2620
+ text_parts = [
2621
+ b.get("text", "")
2622
+ for b in content
2623
+ if isinstance(b, dict) and b.get("type") == "text" and b.get("text")
2624
+ ]
2625
+ if text_parts:
2626
+ return "\n".join(text_parts)
2627
+ return ""
2628
+
2629
+
2350
2630
  def _tokenize_for_tool_ranking(text: str) -> set[str]:
2351
2631
  return {m.group(0).lower() for m in re.finditer(r"[a-zA-Z0-9_]{2,}", text)}
2352
2632
 
@@ -2366,6 +2646,13 @@ def _narrow_tools_for_request(
2366
2646
 
2367
2647
  query_text = _latest_user_text(anthropic_body).lower()
2368
2648
  query_tokens = _tokenize_for_tool_ranking(query_text)
2649
+ if not query_tokens:
2650
+ # Walk back past tool_result turns to find the prior real human
2651
+ # query. Lets narrowing stay effective during agentic loops where
2652
+ # the latest user msg is just a tool_result block (no text).
2653
+ fallback_query = _latest_user_query_text(anthropic_body).lower()
2654
+ query_text = fallback_query or query_text
2655
+ query_tokens = _tokenize_for_tool_ranking(query_text)
2369
2656
  if not query_tokens:
2370
2657
  n_msgs = len(anthropic_body.get("messages", []))
2371
2658
  if (
@@ -2490,6 +2777,18 @@ def _resolve_state_machine_tool_choice(
2490
2777
  monitor.finalize_synthetic_tool_id = ""
2491
2778
  return None, "fresh_user_text"
2492
2779
 
2780
+ # 2026-05-12: No-task ack guard. When the latest user message is just a
2781
+ # tool_result (no fresh text), walk back to the most recent human query.
2782
+ # If that query is a short ack or "X complete. awaiting next" status,
2783
+ # do not force tool_choice — let the model produce a natural finalization
2784
+ # text instead of ruminating in <think> blocks.
2785
+ last_user_query = _latest_user_query_text(anthropic_body).strip()
2786
+ if last_user_query and _is_no_task_user_text(last_user_query):
2787
+ monitor.reset_tool_turn_state(reason="no_task_user_text")
2788
+ monitor.finalize_continuation_count = 0
2789
+ monitor.finalize_synthetic_tool_id = ""
2790
+ return None, "no_task_user_text"
2791
+
2493
2792
  active_loop = (
2494
2793
  has_tool_results
2495
2794
  and last_user_has_tool_result
@@ -2563,7 +2862,15 @@ def _resolve_state_machine_tool_choice(
2563
2862
  dup_tool,
2564
2863
  )
2565
2864
 
2566
- if cycle_looping or stagnating:
2865
+ # Fix K (2026-04-22): require cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
2866
+ # before flipping phase. Single-repeat cycles are legitimate in working
2867
+ # sessions (e.g. re-reading the same file across edits). dup_target
2868
+ # above already demands threshold=3 before asserting a cycle, so the
2869
+ # `cycle_looping = True, cycle_repeat = 2` pair from that branch is
2870
+ # kept as a strong signal (read target repeated 3+ times). Low-repeat
2871
+ # cycles detected by detect_tool_cycle get filtered here.
2872
+ cycle_trip = cycle_looping and cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
2873
+ if cycle_trip or stagnating:
2567
2874
  reason = "cycle_detected" if cycle_looping else "stagnation"
2568
2875
  monitor.set_tool_turn_phase("review", reason=reason)
2569
2876
  monitor.tool_state_review_cycles += 1
@@ -2702,6 +3009,33 @@ def build_openai_request(
2702
3009
 
2703
3010
  has_tools = _has_tool_definitions(anthropic_body)
2704
3011
 
3012
+ # Translate Anthropic `thinking` parameter to upstream `enable_thinking`.
3013
+ # Anthropic shape: {"thinking": {"type": "enabled", "budget_tokens": 1024}}
3014
+ # or {"type": "disabled"}. Per the Anthropic spec, thinking is OFF by
3015
+ # default and ONLY enabled when the client opts in. Match that behaviour:
3016
+ # - thinking.type == "enabled" -> enable_thinking=True
3017
+ # - thinking.type == "disabled" or absent -> enable_thinking=False
3018
+ # Without this, Qwen's chat template (which defaults thinking ON) would
3019
+ # consume the client's max_tokens budget on internal reasoning, leaving
3020
+ # nothing for the visible answer.
3021
+ anthropic_thinking = anthropic_body.get("thinking")
3022
+ if isinstance(anthropic_thinking, dict):
3023
+ ttype = (anthropic_thinking.get("type") or "").lower()
3024
+ if ttype == "enabled":
3025
+ openai_body["enable_thinking"] = True
3026
+ else:
3027
+ openai_body["enable_thinking"] = False
3028
+ else:
3029
+ # Match Anthropic default: thinking off unless explicitly requested.
3030
+ openai_body["enable_thinking"] = False
3031
+
3032
+ # Global thinking-off (G): apply to every request, not just tool turns.
3033
+ # This override wins even when the client explicitly requested thinking above.
3034
+ # Per-path tool-turn handling below (DISABLE_THINKING_ON_TOOL_TURNS) is
3035
+ # additive — ALWAYS supersedes when set.
3036
+ if PROXY_DISABLE_THINKING_ALWAYS:
3037
+ openai_body["enable_thinking"] = False
3038
+
2705
3039
  # Inject agentic protocol instructions only for tool-enabled turns.
2706
3040
  # Use minimal supplement for qwen models to reduce prompt leak surface.
2707
3041
  if has_tools:
@@ -2711,6 +3045,15 @@ def build_openai_request(
2711
3045
  if "qwen" in model_name and PROXY_AGENTIC_SUPPLEMENT_MODE != "legacy"
2712
3046
  else _AGENTIC_SYSTEM_SUPPLEMENT
2713
3047
  )
3048
+ # When thinking is explicitly disabled (Anthropic default, plus our
3049
+ # tool-turn forcing) but the upstream model is Qwen — which emits
3050
+ # <think> blocks regardless of enable_thinking — append a strong
3051
+ # directive that suppresses internal reasoning. Without this, small
3052
+ # max_tokens budgets get fully consumed by the model's reasoning,
3053
+ # producing required_tool_miss retries (observed in Shannon workflows
3054
+ # with max_tokens=512 + tool_choice=required).
3055
+ if openai_body.get("enable_thinking") is False:
3056
+ supplement = supplement + _NO_THINKING_DIRECTIVE
2714
3057
  if (
2715
3058
  openai_body["messages"]
2716
3059
  and openai_body["messages"][0].get("role") == "system"
@@ -2731,23 +3074,62 @@ def build_openai_request(
2731
3074
  if "max_tokens" in anthropic_body:
2732
3075
  requested_raw = max(1, int(anthropic_body["max_tokens"]))
2733
3076
 
2734
- # Enforce configurable minimum floor for thinking mode: model needs
2735
- # tokens for reasoning (<think>...</think>) plus actual response/tool
2736
- # calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
3077
+ # Enforce configurable minimum floor for tool turns: the model needs
3078
+ # enough headroom to emit complete tool-call arguments (long heredocs,
3079
+ # full-function oldString/newString pairs, etc.) without hitting the
3080
+ # client-requested max_tokens in the middle of a JSON string. If the
3081
+ # client requested >= the floor we keep their value; short preflight
3082
+ # requests (max_tokens <= 1024) always skip the floor to avoid
3083
+ # inflating plan-generation turns.
2737
3084
  #
2738
- # The floor is ONLY applied when thinking is actually enabled —
2739
- # skip it for non-tool requests (tools=0) and for tool turns
2740
- # with thinking disabled, to prevent inflating short preflight
2741
- # requests (e.g. max_tokens=100 for plan generation).
2742
- thinking_active_for_request = has_tools and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
3085
+ # The earlier gating on PROXY_DISABLE_THINKING_ON_TOOL_TURNS was too
3086
+ # restrictive: it skipped the floor on every tool turn once thinking
3087
+ # was off, which re-introduced truncated tool calls on long edits.
3088
+ # Set PROXY_MAX_TOKENS_FLOOR=0 to disable the floor entirely.
3089
+ thinking_active_for_request = (
3090
+ has_tools
3091
+ and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
3092
+ and not PROXY_DISABLE_THINKING_ALWAYS
3093
+ )
3094
+ SMALL_PREFLIGHT_THRESHOLD = 1024
3095
+ # Qwen-style models emit <think> blocks regardless of the
3096
+ # enable_thinking flag (template ignored by trained behaviour).
3097
+ # For tool turns those blocks alone consume ~400-1000 tokens, so a
3098
+ # client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
3099
+ # budget for the tool_call itself — manifesting as required_tool_miss
3100
+ # retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
3101
+ # per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
3102
+ THINKING_MIN_FOR_TOOLS = 2048
2743
3103
  skip_floor = (
2744
- not has_tools # non-tool requests don't need thinking headroom
2745
- or PROXY_DISABLE_THINKING_ON_TOOL_TURNS # thinking disabled on tool turns
3104
+ not has_tools # non-tool requests don't need the headroom
2746
3105
  or PROXY_MAX_TOKENS_FLOOR <= 0 # floor explicitly disabled
3106
+ or requested_raw <= SMALL_PREFLIGHT_THRESHOLD # tiny preflight request
2747
3107
  )
3108
+ # Qwen-style models emit <think> blocks regardless of the
3109
+ # enable_thinking flag (template ignored by trained behaviour).
3110
+ # For tool turns those blocks alone consume ~400-1000 tokens, so a
3111
+ # client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
3112
+ # budget for the tool_call itself — manifesting as required_tool_miss
3113
+ # retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
3114
+ # per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
3115
+ THINKING_MIN_FOR_TOOLS = 2048
2748
3116
  if skip_floor:
2749
3117
  requested_max = requested_raw
2750
- if requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
3118
+ # Even when skipping the big floor, bump small tool-turn
3119
+ # budgets so Qwen's mandatory thinking has room before the
3120
+ # tool_call. Only applies when tools are present.
3121
+ if (
3122
+ has_tools
3123
+ and requested_raw < THINKING_MIN_FOR_TOOLS
3124
+ and requested_raw > 16 # leave true preflight (e.g. max_tokens=1) alone
3125
+ ):
3126
+ requested_max = THINKING_MIN_FOR_TOOLS
3127
+ logger.info(
3128
+ "MAX_TOKENS thinking-floor: %d -> %d (tool turn, Qwen mandatory thinking)",
3129
+ requested_raw,
3130
+ requested_max,
3131
+ )
3132
+ elif requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
2751
3133
  logger.info(
2752
3134
  "MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
2753
3135
  has_tools,
@@ -2946,24 +3328,35 @@ def build_openai_request(
2946
3328
  monitor.tool_state_stagnation_streak,
2947
3329
  )
2948
3330
  elif state_choice == "finalize":
2949
- openai_body.pop("tool_choice", None)
2950
- openai_body.pop("tools", None)
3331
+ # Fix H/J (2026-04-22): Do NOT strip tools from the body on
3332
+ # cycle-limit finalize. Stripping tools lets the model emit
3333
+ # prose that LOOKS like a tool call ("<function=edit>…") but
3334
+ # has no structured tool_calls array, so the Anthropic client
3335
+ # sees end_turn with no action and halts. Instead, keep tools
3336
+ # available, set tool_choice=auto, and nudge the model to
3337
+ # either complete with a tool call OR emit a proper summary.
3338
+ # Grammar (when PROXY_TOOL_CALL_GRAMMAR_REQUIRED_ONLY=off) will
3339
+ # still constrain tool-call emission to valid JSON format.
3340
+ openai_body["tool_choice"] = "auto"
2951
3341
  monitor.finalize_turn_active = True
2952
3342
  monitor.finalize_hard_stop_count += 1 # monotonic marker: a finalize fired this session
2953
3343
  monitor.consecutive_forced_count = 0
2954
3344
  monitor.no_progress_streak = 0
2955
- # Option 3: Inject explicit "no tool calls" instruction to reduce XML leak
2956
3345
  finalize_instruction = {
2957
3346
  "role": "user",
2958
3347
  "content": (
2959
- "Respond with plain text only. Do not emit any tool calls, "
2960
- "XML tags, or JSON objects."
3348
+ "You have been looping on the same tools for several turns. "
3349
+ "Wrap up: either emit ONE decisive tool call that completes "
3350
+ "the task, or reply with a plain-text summary of what you "
3351
+ "accomplished and what is blocking further progress. Do NOT "
3352
+ "emit tool call text in prose form — if you call a tool, do "
3353
+ "it through the structured tool_call mechanism."
2961
3354
  ),
2962
3355
  }
2963
3356
  msgs = openai_body.get("messages", [])
2964
3357
  msgs.append(finalize_instruction)
2965
3358
  logger.warning(
2966
- "TOOL STATE MACHINE: tools temporarily disabled for finalize turn (reason=%s)",
3359
+ "TOOL STATE MACHINE: finalize turn (reason=%s) — tools kept, tool_choice=auto",
2967
3360
  state_reason,
2968
3361
  )
2969
3362
  elif state_choice == "required":
@@ -3045,8 +3438,11 @@ def build_openai_request(
3045
3438
  monitor.consecutive_forced_count = 0
3046
3439
  monitor.no_progress_streak = 0
3047
3440
  # Force tool_choice=required on first turn to ensure local models
3048
- # produce a tool call instead of plain text (cold-start fix)
3049
- if has_tools and n_msgs == 1:
3441
+ # produce a tool call instead of plain text (cold-start fix).
3442
+ # Gated by PROXY_FORCE_TOOL_CHOICE_ON_COLD_START: Gemma 4 routes
3443
+ # 'auto' correctly without needing the force, and the force
3444
+ # triggers malformed-JSON emissions on Gemma 4 cold turns.
3445
+ if has_tools and n_msgs == 1 and PROXY_FORCE_TOOL_CHOICE_ON_COLD_START:
3050
3446
  openai_body["tool_choice"] = "required"
3051
3447
  logger.info(
3052
3448
  "tool_choice forced to 'required' on first turn (reason=%s n_msgs=%d cold_start_fix=true)",
@@ -3089,10 +3485,12 @@ def build_openai_request(
3089
3485
  monitor.reset_tool_turn_state(reason="no_tool_results")
3090
3486
 
3091
3487
 
3092
- if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
3488
+ if PROXY_DISABLE_THINKING_ALWAYS or PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
3093
3489
  openai_body["enable_thinking"] = False
3094
3490
  logger.info(
3095
- "Thinking disabled for tool turn (PROXY_DISABLE_THINKING_ON_TOOL_TURNS=on)"
3491
+ "Thinking disabled (always=%s tool_turns=%s)",
3492
+ PROXY_DISABLE_THINKING_ALWAYS,
3493
+ PROXY_DISABLE_THINKING_ON_TOOL_TURNS,
3096
3494
  )
3097
3495
 
3098
3496
  if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
@@ -3411,7 +3809,10 @@ def _schema_type_matches(value, expected_type: str) -> bool:
3411
3809
 
3412
3810
  def _string_contains_tool_markup(value: str) -> bool:
3413
3811
  lowered = value.lower()
3414
- markers = ("<parameter", "</parameter", "<tool_call", "<function=", "</function")
3812
+ markers = (
3813
+ "<parameter", "</parameter", "<tool_call", "<function=", "</function",
3814
+ "<|tool_call>", "<tool_call|>", # Gemma 4 native DSL
3815
+ )
3415
3816
  return any(marker in lowered for marker in markers)
3416
3817
 
3417
3818
 
@@ -3483,6 +3884,343 @@ _TOOL_CALL_XML_RE = re.compile(
3483
3884
  re.DOTALL,
3484
3885
  )
3485
3886
 
3887
+ # Hermes-style XML function call format emitted by some Qwen/Llama fine-tunes
3888
+ # when grammar is not applied:
3889
+ # <function=name>
3890
+ # <parameter=key>
3891
+ # value
3892
+ # </parameter>
3893
+ # ...
3894
+ # </function>
3895
+ #
3896
+ # The value of a <parameter=KEY> block may span multiple lines and include
3897
+ # arbitrary characters (code snippets, JSON, quotes). The closing
3898
+ # </parameter> tag may be missing if the model emitted EOS prematurely —
3899
+ # in which case we consume up to the next <parameter=...> tag or end of
3900
+ # string. Names are captured as alphanumeric + underscore to avoid pulling
3901
+ # in attribute-like garbage.
3902
_HERMES_FUNCTION_RE = re.compile(
    r"<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>|\Z)",
    re.DOTALL,
)
_HERMES_PARAMETER_RE = re.compile(
    r"<parameter=([A-Za-z_][A-Za-z0-9_]*)>\s*(.*?)\s*(?=</parameter>|<parameter=|\Z)",
    re.DOTALL,
)


def _clean_hermes_parameter_value(raw: str) -> str:
    """Normalise the raw text found between ``<parameter=K>`` and its closer.

    Single-line values are fully stripped. Multi-line values lose only their
    leading blank line(s) and trailing whitespace, so the indentation of the
    first content line (code snippets, edit targets) is preserved.
    """
    stripped = raw.strip()
    if "\n" not in stripped:
        return stripped
    # Drop whitespace-only lines at the start, keep the first real line intact.
    return re.sub(r"\A(?:[ \t]*\r?\n)+", "", raw.rstrip())


def _extract_hermes_tool_calls(text: str) -> tuple[list[dict], str]:
    """Parse Hermes-style ``<function=name><parameter=k>v</parameter></function>``
    blocks out of *text*.

    Tolerates premature EOS: a missing ``</parameter>`` ends the value at the
    next ``<parameter=`` tag or end of string, and a missing ``</function>``
    ends the call at end of string.

    Returns ``(tool_calls, remaining_text)``. ``tool_calls`` is a list of
    OpenAI-style dicts (``id``, ``type``, ``function.name``,
    ``function.arguments`` as a compact JSON string); every parameter value is
    a string. ``remaining_text`` is *text* with the matched blocks and any
    emptied ``<tool_call></tool_call>`` envelopes removed, stripped. If nothing
    is recovered, ``([], text)`` is returned with *text* unchanged.
    """
    if "<function=" not in text:
        return [], text

    extracted: list[dict] = []
    matched_spans: list[tuple[int, int]] = []

    for fn_match in _HERMES_FUNCTION_RE.finditer(text):
        name = fn_match.group(1)
        body = fn_match.group(2) or ""
        args: dict = {}
        for p_match in _HERMES_PARAMETER_RE.finditer(body):
            # _HERMES_PARAMETER_RE swallows the whitespace around the value
            # (its ``\s*`` sits outside the capture group), which would drop
            # the first line's indentation. Re-read the raw span instead:
            # from just after the ``>`` that follows the key to the match end.
            raw_value = body[p_match.end(1) + 1 : p_match.end()]
            args[p_match.group(1)] = _clean_hermes_parameter_value(raw_value)
        extracted.append(
            {
                "id": f"toolu_{uuid.uuid4().hex[:24]}",
                "type": "function",
                "function": {
                    "name": name,
                    "arguments": json.dumps(args, separators=(",", ":")),
                },
            }
        )
        matched_spans.append(fn_match.span())

    if not extracted:
        return [], text

    # Cut the matched blocks out back-to-front so earlier offsets stay valid.
    remaining = text
    for start, end in reversed(matched_spans):
        remaining = remaining[:start] + remaining[end:]
    # Strip <tool_call>…</tool_call> envelopes that now enclose nothing.
    remaining = re.sub(r"<tool_call>\s*</tool_call>", "", remaining, flags=re.DOTALL)
    remaining = remaining.strip()

    logger.info(
        "TOOL CALL EXTRACTION: recovered %d Hermes-format tool call(s) from text content",
        len(extracted),
    )
    return extracted, remaining
3969
+
3970
+
3971
+ # ---------------------------------------------------------------------------
3972
+ # Gemma 4 tool-call DSL extractors
3973
+ # ---------------------------------------------------------------------------
3974
+ # Gemma 4's chat template emits tool calls as:
3975
+ # <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
3976
+ # Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
3977
+ # Llama-server's --jinja autoparser usually converts these to standard
3978
+ # OpenAI tool_calls, but the raw form can leak through on (a) malformed
3979
+ # emissions, (b) finalize turns, (c) non-tool-template requests where the
3980
+ # model still tries to call a tool. This parser catches those cases.
3981
+ #
3982
+ # Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
3983
+ # markdown blocks when it doesn't trust the template — observed when
3984
+ # tool_choice was forced 'required' but the model lacked confidence in the
3985
+ # native format. Treated as a tool call when the JSON has a "name"; bare-args
+ # blocks are schema-matched against the available tools when those are supplied.
3986
+ _GEMMA4_TOOL_CALL_DSL_RE = re.compile(
3987
+ r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
3988
+ re.DOTALL,
3989
+ )
3990
+ # Markdown JSON code-block fallback. Group 1 = JSON content (may include
3991
+ # leading/trailing whitespace inside the block).
3992
+ _GEMMA4_MARKDOWN_JSON_RE = re.compile(
3993
+ r"```(?:json)?\s*(\{.*?\})\s*```",
3994
+ re.DOTALL,
3995
+ )
3996
+
3997
+
3998
+ def _parse_gemma4_dsl_args(raw: str) -> dict | None:
3999
+ """Parse Gemma 4's tool-call DSL arg body into a Python dict.
4000
+
4001
+ Input shape (between the `{` and `}` of the DSL):
4002
+ key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
4003
+
4004
+ Strategy: replace `<|"|>` with `"`, wrap unquoted keys in quotes, then
4005
+ feed to json.loads. Returns None on parse failure (caller decides).
4006
+ """
4007
+ if not raw or not raw.strip():
4008
+ return {}
4009
+ s = raw.replace('<|"|>', '"')
4010
+ # Wrap unquoted keys: `key:` -> `"key":` (only at start or after `,` / `{` / whitespace).
4011
+ s = re.sub(r"(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:", r'\1"\2":', s)
4012
+ s = "{" + s + "}"
4013
+ try:
4014
+ parsed = json.loads(s)
4015
+ return parsed if isinstance(parsed, dict) else None
4016
+ except json.JSONDecodeError:
4017
+ return None
4018
+
4019
+
4020
+ def _schema_match_tool(payload: dict, available_tools: list[dict]) -> str | None:
4021
+ """Match a bare-args dict against available tool schemas.
4022
+
4023
+ Score each tool by:
4024
+ - +10 per required field present in payload
4025
+ - +1 per optional property present
4026
+ - -5 per payload key NOT in tool's properties
4027
+ - -100 if any required field is missing
4028
+ Return the name of the highest-scoring tool, or None if no clear match.
4029
+ """
4030
+ if not isinstance(payload, dict) or not available_tools:
4031
+ return None
4032
+ payload_keys = set(payload.keys())
4033
+ best_name = None
4034
+ best_score = 0
4035
+ for tool in available_tools:
4036
+ if not isinstance(tool, dict):
4037
+ continue
4038
+ # Anthropic tools format: {"name": ..., "input_schema": {...}}
4039
+ # OpenAI format: {"type": "function", "function": {"name": ..., "parameters": {...}}}
4040
+ name = tool.get("name")
4041
+ schema = tool.get("input_schema")
4042
+ if name is None and isinstance(tool.get("function"), dict):
4043
+ name = tool["function"].get("name")
4044
+ schema = tool["function"].get("parameters")
4045
+ if not isinstance(name, str) or not isinstance(schema, dict):
4046
+ continue
4047
+ properties = schema.get("properties") if isinstance(schema.get("properties"), dict) else {}
4048
+ required = set(schema.get("required") or [])
4049
+ prop_keys = set(properties.keys())
4050
+ score = 0
4051
+ missing_required = required - payload_keys
4052
+ if missing_required:
4053
+ score -= 100
4054
+ score += 10 * len(required & payload_keys)
4055
+ score += len((payload_keys & prop_keys) - required)
4056
+ score -= 5 * len(payload_keys - prop_keys)
4057
+ if score > best_score:
4058
+ best_score = score
4059
+ best_name = name
4060
+ return best_name if best_score >= 10 else None
4061
+
4062
+
4063
def _extract_gemma4_tool_calls(
    text: str, available_tools: list[dict] | None = None
) -> tuple[list[dict], str]:
    """Parse Gemma 4 tool-call emissions out of *text*.

    Three formats handled, in order:
      1. Native DSL: ``<|tool_call>call:N{...}<tool_call|>``
      2. Fenced markdown JSON in call shape: ``{"name": "N", "arguments": {...}}``
         (``args`` is accepted as an alias for ``arguments``).
      3. Fenced markdown JSON holding bare arguments, e.g. ``{"city": "Paris"}``,
         when ``available_tools`` is provided: the payload is schema-matched
         against the tool definitions via ``_schema_match_tool``.

    The markdown formats are only tried when no DSL call was recovered.

    When ``available_tools`` is provided, a fenced object whose ``name`` is not
    one of those tools AND that has no ``arguments``/``args`` key is not treated
    as call shape (it is more likely data, or bare args that happen to include
    a ``name`` field); it goes through schema-matching instead. Without
    ``available_tools``, any object with a non-empty string ``name`` is a call
    and bare-args blocks pass through as text.

    Returns ``(extracted_openai_tool_calls, remaining_text)``. If nothing is
    recovered, *text* is returned unchanged.
    """
    if "<|tool_call>" not in text and "```" not in text:
        return [], text

    def make_call(call_name: str, arguments_json: str) -> dict:
        return {
            "id": f"toolu_{uuid.uuid4().hex[:24]}",
            "type": "function",
            "function": {"name": call_name, "arguments": arguments_json},
        }

    extracted: list[dict] = []
    matched_spans: list[tuple[int, int]] = []

    # Pattern 1: native DSL
    for m in _GEMMA4_TOOL_CALL_DSL_RE.finditer(text):
        args = _parse_gemma4_dsl_args(m.group(2) or "")
        if args is None:
            # DSL body unparseable; skip and let model retry next turn.
            continue
        extracted.append(
            make_call(m.group(1), json.dumps(args, separators=(",", ":")))
        )
        matched_spans.append(m.span())

    # Pattern 2/3: markdown JSON fallback (only if no DSL hit AND text has ```)
    if not extracted and "```" in text:
        # Names of the tools we know about, in either Anthropic or OpenAI layout.
        known_names: set[str] = set()
        for tool in available_tools or []:
            if not isinstance(tool, dict):
                continue
            tool_name = tool.get("name")
            if tool_name is None and isinstance(tool.get("function"), dict):
                tool_name = tool["function"].get("name")
            if isinstance(tool_name, str):
                known_names.add(tool_name)

        for m in _GEMMA4_MARKDOWN_JSON_RE.finditer(text):
            raw_json = m.group(1)
            try:
                payload = json.loads(raw_json)
            except json.JSONDecodeError:
                # Try a JSON repair like the Qwen path does
                repaired = _repair_tool_call_json(raw_json)
                if not repaired:
                    continue
                try:
                    payload = json.loads(repaired)
                except json.JSONDecodeError:
                    continue
            if not isinstance(payload, dict):
                continue

            name = payload.get("name")
            has_args_key = "arguments" in payload or "args" in payload
            call_shaped = (
                isinstance(name, str)
                and bool(name)
                and (not known_names or name in known_names or has_args_key)
            )
            if call_shaped:
                # Standard {name, arguments} form
                arguments_obj = payload.get("arguments", payload.get("args", {}))
            elif available_tools:
                # Bare-args block — try schema-matching against available tools
                matched = _schema_match_tool(payload, available_tools)
                if matched is None:
                    continue
                name = matched
                arguments_obj = payload  # whole payload IS the args
                logger.info(
                    "TOOL CALL EXTRACTION: schema-matched bare-args markdown JSON to tool '%s' (keys=%s)",
                    name,
                    sorted(payload.keys())[:6],
                )
            else:
                # No name, no tools to match against — pass through as text
                continue

            if isinstance(arguments_obj, dict):
                arguments = json.dumps(arguments_obj, separators=(",", ":"))
            elif isinstance(arguments_obj, str):
                arguments = arguments_obj
            else:
                arguments = "{}"
            extracted.append(make_call(name, arguments))
            matched_spans.append(m.span())

    if not extracted:
        return [], text

    # Strip matched spans from text (in reverse to keep indices valid)
    remaining = text
    for start, end in sorted(matched_spans, key=lambda span: span[0], reverse=True):
        remaining = remaining[:start] + remaining[end:]
    remaining = remaining.strip()

    logger.info(
        "TOOL CALL EXTRACTION: recovered %d Gemma 4 tool call(s) from text content",
        len(extracted),
    )
    return extracted, remaining
4173
+
4174
+
4175
+ # ---------------------------------------------------------------------------
4176
+ # Gemma 4 tool-call DSL extractors
4177
+ # ---------------------------------------------------------------------------
4178
+ # Gemma 4's chat template emits tool calls as:
4179
+ # <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
4180
+ # Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
4181
+ # Llama-server's --jinja autoparser usually converts these to standard
4182
+ # OpenAI tool_calls, but the raw form can leak through on (a) malformed
4183
+ # emissions, (b) finalize turns, (c) non-tool-template requests where the
4184
+ # model still tries to call a tool. This parser catches those cases.
4185
+ #
4186
+ # Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
4187
+ # markdown blocks when it doesn't trust the template — observed when
4188
+ # tool_choice was forced 'required' but the model lacked confidence in the
4189
+ # native format. Treated as a tool call when the JSON has a "name"; bare-args
+ # blocks are schema-matched against the available tools when those are supplied.
4190
+ _GEMMA4_TOOL_CALL_DSL_RE = re.compile(
4191
+ r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
4192
+ re.DOTALL,
4193
+ )
4194
+ # Markdown JSON code-block fallback. Group 1 = JSON content (may include
4195
+ # leading/trailing whitespace inside the block).
4196
+ _GEMMA4_MARKDOWN_JSON_RE = re.compile(
4197
+ r"```(?:json)?\s*(\{.*?\})\s*```",
4198
+ re.DOTALL,
4199
+ )
4200
+
4201
+
4202
+ def _parse_gemma4_dsl_args(raw: str) -> dict | None:
4203
+ """Parse Gemma 4's tool-call DSL arg body into a Python dict.
4204
+
4205
+ Input shape (between the `{` and `}` of the DSL):
4206
+ key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
4207
+
4208
+ Strategy: replace `<|"|>` with `"`, wrap unquoted keys in quotes, then
4209
+ feed to json.loads. Returns None on parse failure (caller decides).
4210
+ """
4211
+ if not raw or not raw.strip():
4212
+ return {}
4213
+ s = raw.replace('<|"|>', '"')
4214
+ # Wrap unquoted keys: `key:` -> `"key":` (only at start or after `,` / `{` / whitespace).
4215
+ s = re.sub(r"(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:", r'\1"\2":', s)
4216
+ s = "{" + s + "}"
4217
+ try:
4218
+ parsed = json.loads(s)
4219
+ return parsed if isinstance(parsed, dict) else None
4220
+ except json.JSONDecodeError:
4221
+ return None
4222
+
4223
+
3486
4224
 
3487
4225
  def _repair_tool_call_json(raw: str) -> str | None:
3488
4226
  """Attempt to repair common garbled JSON in tool call payloads.
@@ -3525,7 +4263,9 @@ def _repair_tool_call_json(raw: str) -> str | None:
3525
4263
  return None
3526
4264
 
3527
4265
 
3528
- def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
4266
+ def _extract_tool_calls_from_text(
4267
+ text: str, available_tools: list[dict] | None = None
4268
+ ) -> tuple[list[dict], str]:
3529
4269
  """Parse ``<tool_call>{...}</tool_call>`` blocks out of *text*.
3530
4270
 
3531
4271
  Returns a tuple of (extracted_openai_tool_calls, remaining_text).
@@ -3535,8 +4275,18 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
3535
4275
 
3536
4276
  The *remaining_text* has the matched ``<tool_call>`` blocks removed.
3537
4277
  If no valid blocks are found the original text is returned unchanged.
4278
+ Falls back to Hermes-style ``<function=X><parameter=K>V</parameter></function>``
4279
+ for older Qwen/Llama fine-tunes, then to Gemma 4's
4280
+ ``<|tool_call>call:N{...}<tool_call|>`` DSL and ```json``` markdown
4281
+ blocks. Anything not matching any known format falls through unchanged
4282
+ so plain prose passes the parser without mutation.
3538
4283
  """
3539
- if "<tool_call>" not in text:
4284
+ if (
4285
+ "<tool_call>" not in text
4286
+ and "<function=" not in text
4287
+ and "<|tool_call>" not in text
4288
+ and "```" not in text
4289
+ ):
3540
4290
  return [], text
3541
4291
 
3542
4292
  extracted: list[dict] = []
@@ -3572,14 +4322,24 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
3572
4322
 
3573
4323
  extracted.append(
3574
4324
  {
3575
- "id": f"call_{uuid.uuid4().hex[:12]}",
4325
+ "id": f"toolu_{uuid.uuid4().hex[:24]}",
3576
4326
  "type": "function",
3577
4327
  "function": {"name": name, "arguments": arguments},
3578
4328
  }
3579
4329
  )
3580
4330
 
3581
4331
  if not extracted:
3582
- return [], text
4332
+ # Fall back to Hermes format. This catches Qwen emissions on finalize
4333
+ # turns where grammar is not applied and the model defaults to its
4334
+ # base training's <function=X><parameter=K>V</parameter></function>
4335
+ # format instead of the <tool_call>{JSON}</tool_call> Qwen template
4336
+ # format. Without this path, tool_calls=[] and the client halts.
4337
+ hermes_calls, hermes_remaining = _extract_hermes_tool_calls(text)
4338
+ if hermes_calls:
4339
+ return hermes_calls, hermes_remaining
4340
+ # Then try Gemma 4's DSL + markdown-JSON fallback. Anything still
4341
+ # not matching falls through as plain text.
4342
+ return _extract_gemma4_tool_calls(text, available_tools=available_tools)
3583
4343
 
3584
4344
  # Strip matched tool_call blocks from the text
3585
4345
  remaining = _TOOL_CALL_XML_RE.sub("", text).strip()
@@ -4550,6 +5310,16 @@ def _classify_tool_response_issue(
4550
5310
  if "tools" not in anthropic_body:
4551
5311
  return ToolResponseIssue()
4552
5312
 
5313
+ # When the upstream response was cut off by max_tokens (finish_reason=length),
5314
+ # any garbled/unbalanced-brace appearance in the tool args is almost
5315
+ # certainly truncation, not degenerate generation. Re-classify such
5316
+ # issues as "truncated_tool_args" so the caller can still retry (with a
5317
+ # larger cap) but WITHOUT triggering the forced-tool dampener, which
5318
+ # otherwise penalises a perfectly-recoverable truncation event.
5319
+ choice_for_finish, _ = _extract_openai_choice(openai_resp)
5320
+ finish_reason = (choice_for_finish.get("finish_reason") or "").lower()
5321
+ was_truncated = finish_reason == "length"
5322
+
4553
5323
  if _is_malformed_tool_response(openai_resp, anthropic_body):
4554
5324
  return ToolResponseIssue(
4555
5325
  kind="malformed_payload",
@@ -4593,15 +5363,107 @@ def _classify_tool_response_issue(
4593
5363
  allowed_tools,
4594
5364
  )
4595
5365
  if issue.has_issue():
5366
+ # Downgrade invalid_tool_args to truncated_tool_args when the
5367
+ # response hit max_tokens — retry path still fires but the
5368
+ # dampener/streak counters stay cold.
5369
+ if was_truncated and issue.kind == "invalid_tool_args":
5370
+ return ToolResponseIssue(
5371
+ kind="truncated_tool_args",
5372
+ reason=(
5373
+ f"tool call for '{tool_name}' truncated by max_tokens "
5374
+ f"({issue.reason})"
5375
+ ),
5376
+ retry_hint=issue.retry_hint,
5377
+ )
4596
5378
  return issue
4597
5379
 
4598
5380
  return ToolResponseIssue()
4599
5381
 
4600
5382
 
5383
+ # 2026-05-12: Regex for the tool-XML tag scanner. Captures opening vs
5384
+ # closing form (group 1: "/" or ""), the tag name (group 2), and any
5385
+ # attributes (group 3). Matches <parameter>, <parameter=key>,
5386
+ # <parameter name="key">, </parameter>, <function=name>, </function>.
5387
_TOOL_XML_TAG_RE = re.compile(r"<(/?)(parameter|function)\b([^>]*)>")


def _strip_orphan_tool_xml(text: str) -> str:
    """Drop ``</parameter>`` / ``</function>`` closers that have no opener
    before them in *text*; everything else is returned untouched.

    Openers and closers are tracked per tag name with a running depth: every
    opening tag is kept and raises the depth, a closing tag is kept only when
    the depth for its tag is positive (and lowers it), and a closing tag seen
    at depth zero is an orphan and is removed. Text that contains no closer at
    all is returned as-is without scanning.

    Per the surrounding comments, the purpose is to stop leaked trailing
    closers from tripping the malformed-tool-payload heuristic while leaving
    genuine (opener-bearing) markup in place.
    """
    if "</parameter" not in text and "</function" not in text:
        return text

    depth = {"parameter": 0, "function": 0}

    def keep_or_drop(tag_match: re.Match) -> str:
        closing, tag_name = tag_match.group(1), tag_match.group(2)
        if not closing:
            depth[tag_name] += 1
            return tag_match.group(0)
        if depth[tag_name] == 0:
            # Orphan closer: nothing open for this tag, so strip it.
            return ""
        depth[tag_name] -= 1
        return tag_match.group(0)

    return _TOOL_XML_TAG_RE.sub(keep_or_drop, text)
5434
+
5435
+
4601
5436
  def _looks_malformed_tool_payload(text: str) -> bool:
4602
5437
  if not text:
4603
5438
  return False
4604
5439
 
5440
+ # 2026-05-12: Strip balanced <think>...</think> blocks before applying
5441
+ # the heuristic. Qwen3.6 emits <think> blocks regardless of
5442
+ # enable_thinking, and two scenarios were tripping false positives:
5443
+ # 1. Meta-tool reasoning inside the thinking ({"description":...},
5444
+ # repeated "must call a tool") triggering the structural-marker
5445
+ # and policy-echo branches.
5446
+ # 2. The model wrapping its ENTIRE answer inside a single <think>
5447
+ # block (markdown reports, tables) — the </think> structural
5448
+ # marker plus content-resembling-policy then fires.
5449
+ # Downstream response processing surfaces <think> content as proper
5450
+ # Anthropic `thinking` blocks via _THINKING_BLOCK_RE, so stripping
5451
+ # here loses no information. Unbalanced/stray </think> without a
5452
+ # matching opener is NOT stripped — those remain genuinely malformed.
5453
+ if "<think>" in text and "</think>" in text:
5454
+ text = _THINKING_BLOCK_RE.sub("", text)
5455
+ if not text.strip():
5456
+ return False
5457
+
5458
+ # 2026-05-12: Strip orphan </parameter> and </function> closers that
5459
+ # have no matching opener. Qwen3.6 leaks these training residuals
5460
+ # after its visible answer when forced into tool_choice='required'
5461
+ # with no valid tool to call. Real malformed tool-call attempts retain
5462
+ # their opener and still trip the primary_markers check below.
5463
+ text = _strip_orphan_tool_xml(text)
5464
+ if not text.strip():
5465
+ return False
5466
+
4605
5467
  lowered = text.lower()
4606
5468
  if _contains_tool_call_apology(text):
4607
5469
  return True
@@ -4836,13 +5698,17 @@ def _build_malformed_retry_body(
4836
5698
  retry_instruction = (
4837
5699
  "Your previous response had invalid tool-call formatting. "
4838
5700
  "Respond with exactly one valid tool call using the provided tools. "
4839
- "Do not output prose, markdown, XML tags, or schema snippets."
5701
+ "Do not output prose, markdown, XML tags, or schema snippets. "
5702
+ "Do NOT use <think>...</think> blocks or internal reasoning — "
5703
+ "emit the tool_call object as the very first token of your response."
4840
5704
  )
4841
5705
  else:
4842
5706
  retry_instruction = (
4843
5707
  "Your previous response had invalid tool-call formatting. "
4844
5708
  "If a tool is needed, emit exactly one valid tool call with strict JSON arguments. "
4845
- "If no tool is needed for this turn, return concise plain text with no protocol tags."
5709
+ "If no tool is needed for this turn, return concise plain text with no protocol tags. "
5710
+ "Do NOT use <think>...</think> blocks — start your response directly with "
5711
+ "either a tool_call or the plain text answer."
4846
5712
  )
4847
5713
 
4848
5714
  malformed_retry_instruction = {
@@ -5023,7 +5889,7 @@ async def _apply_unexpected_end_turn_guardrail(
5023
5889
  )
5024
5890
  if retry_resp.status_code == 200:
5025
5891
  retry_json = retry_resp.json()
5026
- _maybe_extract_text_tool_calls(retry_json)
5892
+ _maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
5027
5893
  retry_choice, retry_message = _extract_openai_choice(retry_json)
5028
5894
  if _openai_has_valid_tool_calls(retry_json, anthropic_body):
5029
5895
  logger.info("GUARDRAIL: retry produced tool_use; using retried response")
@@ -5112,8 +5978,12 @@ async def _apply_malformed_tool_guardrail(
5112
5978
  )
5113
5979
  return working_resp
5114
5980
 
5115
- # Mark garbled state for progressive max_tokens reduction on next turn
5116
- monitor.last_response_garbled = True
5981
+ # Only set last_response_garbled for TRUE degenerate generation, not
5982
+ # for responses merely truncated by max_tokens — otherwise the next
5983
+ # turn gets hit with the garbled_cap (smaller max_tokens) and the
5984
+ # problem compounds.
5985
+ if issue.kind != "truncated_tool_args":
5986
+ monitor.last_response_garbled = True
5117
5987
 
5118
5988
  if issue.kind == "malformed_payload":
5119
5989
  monitor.malformed_tool_streak += 1
@@ -5121,7 +5991,12 @@ async def _apply_malformed_tool_guardrail(
5121
5991
  monitor.invalid_tool_call_streak += 1
5122
5992
  monitor.arg_preflight_rejections += 1
5123
5993
 
5124
- monitor.maybe_activate_forced_tool_dampener(issue.kind)
5994
+ # Truncation is a max_tokens accident, not the model misbehaving: don't
5995
+ # feed it to the forced-tool dampener, which would otherwise relax
5996
+ # tool_choice on the very next turn and let the model trail off with
5997
+ # text (the exact failure mode that stopped opencode).
5998
+ if issue.kind != "truncated_tool_args":
5999
+ monitor.maybe_activate_forced_tool_dampener(issue.kind)
5125
6000
  excerpt = _openai_message_text(working_resp)[:220].replace("\n", " ")
5126
6001
  # Option 2: Log garbled argument content for diagnostics
5127
6002
  arg_excerpt = ""
@@ -5194,7 +6069,7 @@ async def _apply_malformed_tool_guardrail(
5194
6069
  continue
5195
6070
 
5196
6071
  retry_json = retry_resp.json()
5197
- _maybe_extract_text_tool_calls(retry_json)
6072
+ _maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
5198
6073
  retry_working = retry_json
5199
6074
  retry_repairs = 0
5200
6075
  if PROXY_TOOL_ARGS_PREFLIGHT and _openai_has_tool_calls(retry_json):
@@ -5226,15 +6101,20 @@ async def _apply_malformed_tool_guardrail(
5226
6101
  )
5227
6102
 
5228
6103
  if not retry_issue.has_issue():
5229
- monitor.malformed_tool_streak = 0
5230
- monitor.invalid_tool_call_streak = 0
5231
- monitor.required_tool_miss_streak = 0
6104
+ # 2026-05-12: Fix #2 — do NOT reset malformed/invalid/miss streaks
6105
+ # to 0 on retry-success. Previously, sessions stuck in a
6106
+ # malformed→retry-success loop never accumulated enough streak to
6107
+ # trigger the forced-tool dampener. Healthy responses with real
6108
+ # tool_calls still reset the streak via the upstream no-issue path
6109
+ # (~L5655), so genuine recovery still resets counters; only
6110
+ # repeated retry-recoveries persist toward the dampener.
5232
6111
  monitor.last_response_garbled = False
5233
6112
  logger.info(
5234
- "TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d",
6113
+ "TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d malformed_streak=%d",
5235
6114
  current_issue.kind,
5236
6115
  attempt + 1,
5237
6116
  attempts,
6117
+ monitor.malformed_tool_streak,
5238
6118
  )
5239
6119
  if retry_repairs > 0:
5240
6120
  monitor.arg_preflight_repairs += retry_repairs
@@ -5259,7 +6139,10 @@ async def _apply_malformed_tool_guardrail(
5259
6139
  if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
5260
6140
  failing_tools.add(fn_name)
5261
6141
 
5262
- monitor.maybe_activate_forced_tool_dampener(retry_issue.kind)
6142
+ # Truncation on retry is still a max_tokens problem, not a model
6143
+ # misbehaviour — don't dampen. The outer retry loop will try again.
6144
+ if retry_issue.kind != "truncated_tool_args":
6145
+ monitor.maybe_activate_forced_tool_dampener(retry_issue.kind)
5263
6146
  logger.warning(
5264
6147
  "TOOL RESPONSE RETRY invalid: session=%s attempt=%d/%d kind=%s reason=%s",
5265
6148
  session_id,
@@ -5440,11 +6323,19 @@ def _maybe_apply_session_contamination_breaker(
5440
6323
  # ===========================================================================
5441
6324
 
5442
6325
 
5443
- def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
6326
+ def _maybe_extract_text_tool_calls(
6327
+ openai_resp: dict, anthropic_tools: list[dict] | None = None
6328
+ ) -> dict:
5444
6329
  """Mutate *openai_resp* in-place: if the message has no structured
5445
- ``tool_calls`` but contains ``<tool_call>`` XML in text, extract them
5446
- and promote to real ``tool_calls`` on the message. Returns the
5447
- (possibly-mutated) response for chaining."""
6330
+ ``tool_calls`` but contains tool-call markup in text, extract them
6331
+ and promote to real ``tool_calls`` on the message.
6332
+
6333
+ *anthropic_tools* (optional): list of tool definitions from the original
6334
+ Anthropic request. Enables schema-matching of bare-args markdown JSON
6335
+ blocks emitted by Gemma 4 cold turns (fix D). Without it, bare-args
6336
+ blocks pass through as text.
6337
+
6338
+ Returns the (possibly-mutated) response for chaining."""
5448
6339
  choice = (openai_resp.get("choices") or [{}])[0]
5449
6340
  message = choice.get("message", {})
5450
6341
 
@@ -5453,10 +6344,20 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
5453
6344
  return openai_resp
5454
6345
 
5455
6346
  text = message.get("content", "")
5456
- if not isinstance(text, str) or "<tool_call>" not in text:
6347
+ if not isinstance(text, str):
6348
+ return openai_resp
6349
+ # Quick early-exit if no markers present (matches dispatcher guard)
6350
+ if (
6351
+ "<tool_call>" not in text
6352
+ and "<function=" not in text
6353
+ and "<|tool_call>" not in text
6354
+ and "```" not in text
6355
+ ):
5457
6356
  return openai_resp
5458
6357
 
5459
- extracted, remaining = _extract_tool_calls_from_text(text)
6358
+ extracted, remaining = _extract_tool_calls_from_text(
6359
+ text, available_tools=anthropic_tools
6360
+ )
5460
6361
  if not extracted:
5461
6362
  return openai_resp
5462
6363
 
@@ -5591,8 +6492,43 @@ def _inject_synthetic_continuation(
5591
6492
  return anthropic_resp
5592
6493
 
5593
6494
 
5594
- def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
5595
- """Convert an OpenAI Chat Completions response to Anthropic Messages format."""
6495
+ _THINKING_BLOCK_RE = re.compile(r"<think>(.*?)</think>\s*", re.DOTALL)
6496
+
6497
+
6498
+ def _extract_thinking_block(text: str) -> tuple[str | None, str]:
6499
+ """Extract Qwen-style ``<think>...</think>`` blocks from *text*.
6500
+
6501
+ Returns ``(thinking_content, remaining_text)``. If no ``<think>`` tag is
6502
+ present, returns ``(None, text)`` unchanged. Multiple thinking blocks
6503
+ are concatenated. Trailing whitespace after each block is consumed so
6504
+ the remaining text starts cleanly with the model's actual answer.
6505
+ """
6506
+ if "<think>" not in text:
6507
+ return None, text
6508
+ parts: list[str] = []
6509
+ def collect(m: re.Match) -> str:
6510
+ parts.append(m.group(1).strip())
6511
+ return ""
6512
+ remaining = _THINKING_BLOCK_RE.sub(collect, text)
6513
+ if not parts:
6514
+ return None, text
6515
+ return "\n\n".join(p for p in parts if p), remaining.lstrip()
6516
+
6517
+
6518
+ def openai_to_anthropic_response(
6519
+ openai_resp: dict, model: str, expose_thinking: bool = True
6520
+ ) -> dict:
6521
+ """Convert an OpenAI Chat Completions response to Anthropic Messages format.
6522
+
6523
+ *expose_thinking*: when True, surface ``<think>...</think>`` content from
6524
+ the upstream as Anthropic ``{"type": "thinking"}`` blocks. When False
6525
+ (Anthropic default — client didn't opt in), strip thinking content
6526
+ from the response entirely so the client only sees the actual answer.
6527
+ Qwen's chat template seeds the model into thinking regardless of the
6528
+ ``enable_thinking`` request param, so even thinking-off responses
6529
+ typically still contain ``<think>`` blocks; this flag controls whether
6530
+ they're surfaced as Anthropic blocks or silently consumed.
6531
+ """
5596
6532
  # First: try to recover tool calls trapped in text XML tags
5597
6533
  _maybe_extract_text_tool_calls(openai_resp)
5598
6534
  # Second: strip garbled/degenerate tool call arguments
@@ -5603,20 +6539,46 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
5603
6539
  finish = choice.get("finish_reason", "stop")
5604
6540
 
5605
6541
  content = []
6542
+ # Surface Qwen's <think>...</think> output as Anthropic-style thinking
6543
+ # blocks (Anthropic extended-thinking API shape:
6544
+ # {"type": "thinking", "thinking": "...", "signature": ""}).
6545
+ # The block is only emitted when expose_thinking is True; otherwise the
6546
+ # thinking text is dropped. Opted-in clients (Claude Code) render it in the thinking pane.
6547
+ raw_text = ""
5606
6548
  if message.get("content"):
5607
6549
  raw_text = (
5608
6550
  message["content"]
5609
6551
  if isinstance(message["content"], str)
5610
6552
  else str(message["content"])
5611
6553
  )
5612
- sanitized_text = _sanitize_tool_call_apology_text(raw_text)
5613
- if sanitized_text != raw_text:
6554
+ # Some llama-server builds emit the model's reasoning into a separate
6555
+ # `reasoning_content` field instead of inline <think> tags. Surface
6556
+ # that too so the proxy is consistent regardless of upstream behaviour.
6557
+ inline_thinking, body_text = _extract_thinking_block(raw_text)
6558
+ sidecar_thinking = message.get("reasoning_content") or message.get("reasoning")
6559
+ thinking_chunks: list[str] = []
6560
+ if isinstance(sidecar_thinking, str) and sidecar_thinking.strip():
6561
+ thinking_chunks.append(sidecar_thinking.strip())
6562
+ if inline_thinking:
6563
+ thinking_chunks.append(inline_thinking)
6564
+ if thinking_chunks and expose_thinking:
6565
+ content.append(
6566
+ {
6567
+ "type": "thinking",
6568
+ "thinking": "\n\n".join(thinking_chunks),
6569
+ "signature": "",
6570
+ }
6571
+ )
6572
+
6573
+ if body_text:
6574
+ sanitized_text = _sanitize_tool_call_apology_text(body_text)
6575
+ if sanitized_text != body_text:
5614
6576
  logger.warning(
5615
6577
  "SANITIZE: replaced known malformed tool-call apology text in assistant response"
5616
6578
  )
5617
6579
  # Option 1: Strip residual <tool_call> XML that wasn't extracted
5618
6580
  sanitized_text = _strip_residual_tool_call_xml(sanitized_text)
5619
- if sanitized_text != raw_text and "<tool_call>" in raw_text:
6581
+ if sanitized_text != body_text and "<tool_call>" in body_text:
5620
6582
  logger.warning(
5621
6583
  "SANITIZE: stripped residual <tool_call> XML from text content"
5622
6584
  )
@@ -5641,10 +6603,21 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
5641
6603
  logger.warning(
5642
6604
  "BASH SAFETY: stripped standalone protocol-tag lines from command before tool execution"
5643
6605
  )
6606
+ # Normalise IDs to Anthropic spec (toolu_ prefix). Upstream
6607
+ # llama-server returns opaque IDs without prefix; clients that
6608
+ # validate prefix would reject. Strip-and-restamp here, restore in
6609
+ # anthropic_to_openai_messages() when client sends tool_result back.
6610
+ upstream_id = tc.get("id", "")
6611
+ if upstream_id.startswith("toolu_"):
6612
+ tool_use_id = upstream_id
6613
+ elif upstream_id:
6614
+ tool_use_id = f"toolu_{upstream_id}"
6615
+ else:
6616
+ tool_use_id = f"toolu_{uuid.uuid4().hex[:24]}"
5644
6617
  content.append(
5645
6618
  {
5646
6619
  "type": "tool_use",
5647
- "id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
6620
+ "id": tool_use_id,
5648
6621
  "name": fn.get("name", ""),
5649
6622
  "input": args,
5650
6623
  }
@@ -6341,7 +7314,7 @@ async def messages(request: Request):
6341
7314
  )
6342
7315
  except Exception as exc:
6343
7316
  # Check if upstream is hung before returning error
6344
- await _check_slot_hang(f"{LLAMA_CPP_BASE}/slots")
7317
+ await _check_slot_hang(LLAMA_CPP_BASE.replace("/v1", "/slots"))
6345
7318
  return Response(
6346
7319
  content=json.dumps(
6347
7320
  {
@@ -6356,6 +7329,23 @@ async def messages(request: Request):
6356
7329
  media_type="application/json",
6357
7330
  )
6358
7331
 
7332
+ if strict_resp.status_code != 200:
7333
+ error_text = strict_resp.text[:1000]
7334
+ # Try the Gemma 4 PEG parse-failure recovery first — relax
7335
+ # tool_choice='required' so the retry isn't constrained by the
7336
+ # strict-grammar that triggered the parse failure.
7337
+ relaxed = _is_gemma4_peg_parse_failure(strict_resp.status_code, error_text) and \
7338
+ _relax_tool_choice_for_gemma4_peg_retry(strict_body, "strict-stream")
7339
+ if relaxed:
7340
+ try:
7341
+ strict_resp = await _post_with_generation_timeout(
7342
+ client,
7343
+ f"{LLAMA_CPP_BASE}/chat/completions",
7344
+ strict_body,
7345
+ {"Content-Type": "application/json"},
7346
+ )
7347
+ except Exception:
7348
+ pass # fall through to next handler
6359
7349
  if strict_resp.status_code != 200:
6360
7350
  error_text = strict_resp.text[:1000]
6361
7351
  if _maybe_disable_grammar_for_tools_error(
@@ -6430,7 +7420,7 @@ async def messages(request: Request):
6430
7420
 
6431
7421
  openai_resp = strict_resp.json()
6432
7422
  # Recover tool calls from <tool_call> XML before guardrails run
6433
- _maybe_extract_text_tool_calls(openai_resp)
7423
+ _maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
6434
7424
  openai_resp = await _apply_unexpected_end_turn_guardrail(
6435
7425
  client,
6436
7426
  openai_resp,
@@ -6485,7 +7475,11 @@ async def messages(request: Request):
6485
7475
  logger.info("DEGENERATE RETRY: retry insufficient, using truncated original")
6486
7476
  except Exception as exc:
6487
7477
  logger.warning("DEGENERATE RETRY: failed: %s", exc)
6488
- anthropic_resp = openai_to_anthropic_response(openai_resp, model)
7478
+ anthropic_resp = openai_to_anthropic_response(
7479
+ openai_resp, model,
7480
+ expose_thinking=isinstance(body.get("thinking"), dict)
7481
+ and (body["thinking"].get("type") or "").lower() == "enabled",
7482
+ )
6489
7483
  # FINALIZE CONTINUATION: inject synthetic tool_use to keep client loop alive
6490
7484
  if (
6491
7485
  monitor.finalize_turn_active
@@ -6601,6 +7595,29 @@ async def messages(request: Request):
6601
7595
  error_body = await resp.aread()
6602
7596
  await resp.aclose()
6603
7597
  error_text = error_body.decode("utf-8", errors="replace")[:1000]
7598
+ # Gemma 4 PEG parse-failure recovery: relax tool_choice='required'
7599
+ # so the retry isn't blocked by the strict-grammar that rejected
7600
+ # the model's incomplete tool call.
7601
+ if _is_gemma4_peg_parse_failure(resp.status_code, error_text) and \
7602
+ _relax_tool_choice_for_gemma4_peg_retry(openai_body, "stream"):
7603
+ resp = await client.send(
7604
+ client.build_request(
7605
+ "POST",
7606
+ f"{LLAMA_CPP_BASE}/chat/completions",
7607
+ json=openai_body,
7608
+ headers={"Content-Type": "application/json"},
7609
+ ),
7610
+ stream=True,
7611
+ )
7612
+ if resp.status_code == 200:
7613
+ return StreamingResponse(
7614
+ stream_anthropic_response(resp, model, monitor, body),
7615
+ media_type="text/event-stream",
7616
+ )
7617
+ # fall through if still failing
7618
+ error_body = await resp.aread()
7619
+ await resp.aclose()
7620
+ error_text = error_body.decode("utf-8", errors="replace")[:1000]
6604
7621
  if _maybe_disable_grammar_for_tools_error(
6605
7622
  openai_body,
6606
7623
  resp.status_code,
@@ -6733,6 +7750,23 @@ async def messages(request: Request):
6733
7750
  media_type="application/json",
6734
7751
  )
6735
7752
 
7753
+ if resp.status_code != 200:
7754
+ error_text = resp.text[:1000]
7755
+ # Gemma 4 PEG parse-failure recovery (non-stream path).
7756
+ relaxed = (
7757
+ _is_gemma4_peg_parse_failure(resp.status_code, error_text)
7758
+ and _relax_tool_choice_for_gemma4_peg_retry(openai_body, "non-stream")
7759
+ )
7760
+ if relaxed:
7761
+ try:
7762
+ resp = await _post_with_generation_timeout(
7763
+ client,
7764
+ f"{LLAMA_CPP_BASE}/chat/completions",
7765
+ openai_body,
7766
+ {"Content-Type": "application/json"},
7767
+ )
7768
+ except Exception:
7769
+ pass # fall through
6736
7770
  if resp.status_code != 200:
6737
7771
  error_text = resp.text[:1000]
6738
7772
  if _maybe_disable_grammar_for_tools_error(
@@ -6785,7 +7819,7 @@ async def messages(request: Request):
6785
7819
 
6786
7820
  openai_resp = resp.json()
6787
7821
  # Recover tool calls from <tool_call> XML before guardrails run
6788
- _maybe_extract_text_tool_calls(openai_resp)
7822
+ _maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
6789
7823
  openai_resp = await _apply_unexpected_end_turn_guardrail(
6790
7824
  client,
6791
7825
  openai_resp,
@@ -6854,7 +7888,11 @@ async def messages(request: Request):
6854
7888
  logger.info("DEGENERATE RETRY (stream): no tool call, using truncated")
6855
7889
  except Exception as exc:
6856
7890
  logger.warning("DEGENERATE RETRY (stream): failed: %s", exc)
6857
- anthropic_resp = openai_to_anthropic_response(openai_resp, model)
7891
+ anthropic_resp = openai_to_anthropic_response(
7892
+ openai_resp, model,
7893
+ expose_thinking=isinstance(body.get("thinking"), dict)
7894
+ and (body["thinking"].get("type") or "").lower() == "enabled",
7895
+ )
6858
7896
  # FINALIZE CONTINUATION: inject synthetic tool_use (non-guarded stream path)
6859
7897
  if (
6860
7898
  monitor.finalize_turn_active