@miller-tech/uap 1.20.34 → 1.20.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/docs/deployment/QWEN35_LLAMA_CPP.md +15 -6
- package/package.json +1 -1
- package/tools/agents/config/qwen3.5-enhanced.jinja +187 -0
- package/tools/agents/scripts/anthropic_proxy.py +1097 -59
- package/tools/agents/scripts/tool-choice-proxy.cjs +12 -0
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +193 -8
|
@@ -134,6 +134,11 @@ PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
|
|
|
134
134
|
}
|
|
135
135
|
PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
|
|
136
136
|
PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "6"))
|
|
137
|
+
# Fix K (2026-04-22): minimum consecutive cycle-repeat count required to flip
|
|
138
|
+
# phase from act -> review. The old behaviour accepted cycle_repeat=2, which
|
|
139
|
+
# is normal in a working session (re-reading the same file across edits).
|
|
140
|
+
# Set higher to tolerate legitimate re-reads; set 1 to restore old behaviour.
|
|
141
|
+
PROXY_CYCLE_TRIGGER_REPEAT = int(os.environ.get("PROXY_CYCLE_TRIGGER_REPEAT", "3"))
|
|
137
142
|
PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
|
|
138
143
|
PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "3"))
|
|
139
144
|
PROXY_CONTEXT_RELEASE_THRESHOLD = float(
|
|
@@ -247,6 +252,19 @@ PROXY_DISABLE_THINKING_ON_TOOL_TURNS = os.environ.get(
|
|
|
247
252
|
"off",
|
|
248
253
|
"no",
|
|
249
254
|
}
|
|
255
|
+
# Disable thinking on EVERY turn (not just tool turns). For models like Gemma 4
|
|
256
|
+
# that emit ~100 thinking tokens for trivial replies, this halves output cost.
|
|
257
|
+
PROXY_DISABLE_THINKING_ALWAYS = os.environ.get(
|
|
258
|
+
"PROXY_DISABLE_THINKING_ALWAYS", "off"
|
|
259
|
+
).lower() not in {"0", "false", "off", "no"}
|
|
260
|
+
# Force tool_choice='required' on the first turn of a fresh session. Originally
|
|
261
|
+
# Qwen-tuned to break out of cold-start "tries to chat instead of calling a tool"
|
|
262
|
+
# behaviour. Gemma 4 doesn't need this — it routes 'auto' correctly and the
|
|
263
|
+
# force triggers malformed-JSON emissions when it would rather speak. Default
|
|
264
|
+
# off; set 'on' to restore the legacy Qwen-style behaviour.
|
|
265
|
+
PROXY_FORCE_TOOL_CHOICE_ON_COLD_START = os.environ.get(
|
|
266
|
+
"PROXY_FORCE_TOOL_CHOICE_ON_COLD_START", "off"
|
|
267
|
+
).lower() not in {"0", "false", "off", "no"}
|
|
250
268
|
PROXY_DISABLE_SPEC_ON_TOOL_TURNS = os.environ.get(
|
|
251
269
|
"PROXY_DISABLE_SPEC_ON_TOOL_TURNS", "off"
|
|
252
270
|
).lower() not in {
|
|
@@ -576,6 +594,44 @@ def _is_grammar_tools_incompatibility(status_code: int, error_text: str) -> bool
|
|
|
576
594
|
return "custom grammar constraints" in lowered and "with tools" in lowered
|
|
577
595
|
|
|
578
596
|
|
|
597
|
+
def _is_gemma4_peg_parse_failure(status_code: int, error_text: str) -> bool:
|
|
598
|
+
"""Detect Gemma 4's PEG-parser failure on tool-turn output.
|
|
599
|
+
|
|
600
|
+
llama-server returns HTTP 500 with `failed to parse grammar` /
|
|
601
|
+
`Failed to parse input at pos N: <|tool_call>call:...` when the model
|
|
602
|
+
emits an incomplete tool call (missing required schema fields) under
|
|
603
|
+
tool_choice='required'. The PEG grammar enforces the schema strictly
|
|
604
|
+
and rejects the partial output. Caller should retry with relaxed
|
|
605
|
+
tool_choice='auto' so the model can emit prose or a complete call
|
|
606
|
+
without grammar enforcement triggering this failure mode.
|
|
607
|
+
"""
|
|
608
|
+
if status_code != 500:
|
|
609
|
+
return False
|
|
610
|
+
text = error_text or ""
|
|
611
|
+
return (
|
|
612
|
+
"Failed to parse input at pos" in text
|
|
613
|
+
or "<|tool_call>call:" in text
|
|
614
|
+
)
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def _relax_tool_choice_for_gemma4_peg_retry(request_body: dict, source: str) -> bool:
|
|
618
|
+
"""When a Gemma 4 PEG parse failure is detected on a tool turn, drop
|
|
619
|
+
tool_choice='required' so the retry has a permissive grammar. Returns
|
|
620
|
+
True if the body was modified (caller should retry POST)."""
|
|
621
|
+
if not request_body.get("tools"):
|
|
622
|
+
return False
|
|
623
|
+
current = request_body.get("tool_choice")
|
|
624
|
+
if current in ("required", {"type": "any"}):
|
|
625
|
+
request_body["tool_choice"] = "auto"
|
|
626
|
+
logger.warning(
|
|
627
|
+
"GEMMA4 PEG RETRY (%s): relaxed tool_choice='required' -> 'auto' "
|
|
628
|
+
"to bypass strict-grammar parse failure on incomplete model output",
|
|
629
|
+
source,
|
|
630
|
+
)
|
|
631
|
+
return True
|
|
632
|
+
return False
|
|
633
|
+
|
|
634
|
+
|
|
579
635
|
def _maybe_disable_grammar_for_tools_error(
|
|
580
636
|
request_body: dict,
|
|
581
637
|
status_code: int,
|
|
@@ -1409,6 +1465,66 @@ def prune_conversation(
|
|
|
1409
1465
|
# Granular timeouts: short connect, long read for streaming LLM output.
|
|
1410
1466
|
http_client: httpx.AsyncClient | None = None
|
|
1411
1467
|
|
|
1468
|
+
# ---------------------------------------------------------------------------
|
|
1469
|
+
# Concurrency Control
|
|
1470
|
+
# ---------------------------------------------------------------------------
|
|
1471
|
+
# Semaphore to serialize upstream requests. llama.cpp is configured with
|
|
1472
|
+
# --parallel 1 (LLAMA_PARALLEL=1), so it can only process one inference at
|
|
1473
|
+
# a time. Without this gate, concurrent client requests (Shannon sub-agents,
|
|
1474
|
+
# multiple Claude Code sessions) would all hit llama.cpp at once and the
|
|
1475
|
+
# server would serialize them while the proxy holds N httpx connections
|
|
1476
|
+
# open — potentially exhausting the proxy's connection pool while requests
|
|
1477
|
+
# queue inside llama.cpp opaquely.
|
|
1478
|
+
#
|
|
1479
|
+
# With the semaphore: requests queue inside the proxy (cheap, just asyncio
|
|
1480
|
+
# tasks waiting) and only PROXY_CONCURRENCY_LIMIT at a time reaches
|
|
1481
|
+
# llama.cpp. Each httpx connection is held only for the actual inference
|
|
1482
|
+
# duration, not the queue wait.
|
|
1483
|
+
#
|
|
1484
|
+
# Default: 1 (matches LLAMA_PARALLEL=1). Increase if you raise --parallel.
|
|
1485
|
+
PROXY_CONCURRENCY_LIMIT = int(os.environ.get("PROXY_CONCURRENCY_LIMIT", "1"))
|
|
1486
|
+
# Max time to wait for a slot before returning 503. Generous because real
|
|
1487
|
+
# inference can take 30-600s and queued requests must wait through that.
|
|
1488
|
+
# 0 = wait indefinitely.
|
|
1489
|
+
PROXY_CONCURRENCY_QUEUE_TIMEOUT = float(
|
|
1490
|
+
os.environ.get("PROXY_CONCURRENCY_QUEUE_TIMEOUT", "900")
|
|
1491
|
+
)
|
|
1492
|
+
upstream_semaphore: asyncio.Semaphore | None = None
|
|
1493
|
+
|
|
1494
|
+
|
|
1495
|
+
async def _acquire_upstream_slot() -> bool:
    """Wait for a free upstream slot, honouring the configured queue timeout.

    Returns:
        True once a slot is held, or immediately when ``upstream_semaphore``
        is still ``None`` (nothing to limit against). False if
        ``PROXY_CONCURRENCY_QUEUE_TIMEOUT`` seconds elapsed without a slot
        becoming available.

    A timeout value <= 0 disables the deadline and waits indefinitely.
    Queue ordering is whatever ``asyncio.Semaphore.acquire()`` provides; the
    design relies on it serving waiters in arrival order.
    """
    gate = upstream_semaphore
    if gate is None:
        # Semaphore not created yet: proceed without limiting.
        return True

    deadline = PROXY_CONCURRENCY_QUEUE_TIMEOUT
    if deadline > 0:
        try:
            await asyncio.wait_for(gate.acquire(), timeout=deadline)
        except asyncio.TimeoutError:
            return False
        return True

    # No deadline configured: block until a slot frees up.
    await gate.acquire()
    return True
|
|
1515
|
+
|
|
1516
|
+
|
|
1517
|
+
def _release_upstream_slot() -> None:
    """Hand one slot back to ``upstream_semaphore``.

    Must be called exactly once for every successful acquire. Does nothing
    when the semaphore is ``None``.

    The release is deliberately unconditional: ``asyncio.Semaphore.release()``
    always increments the counter, whereas ``locked()`` is True only while
    the counter is 0. Gating the release on ``locked()`` would therefore leak
    slots when the limit is > 1 and several holders release close together.
    """
    if upstream_semaphore is None:
        return
    upstream_semaphore.release()
|
|
1527
|
+
|
|
1412
1528
|
|
|
1413
1529
|
def _is_loading_model_503(resp: httpx.Response) -> bool:
|
|
1414
1530
|
"""Check if response is a 503 'Loading model' from llama.cpp."""
|
|
@@ -1452,6 +1568,36 @@ async def _post_with_retry(
|
|
|
1452
1568
|
url: str,
|
|
1453
1569
|
payload: dict,
|
|
1454
1570
|
headers: dict,
|
|
1571
|
+
) -> httpx.Response:
|
|
1572
|
+
"""Post with upstream-retry + concurrency-slot acquire.
|
|
1573
|
+
|
|
1574
|
+
Acquires a slot from upstream_semaphore before making the request, so
|
|
1575
|
+
concurrent client requests queue in the proxy (cheap asyncio waits)
|
|
1576
|
+
rather than all hammering llama.cpp at once. Slot is released in a
|
|
1577
|
+
finally block so it's always returned to the pool even on error.
|
|
1578
|
+
"""
|
|
1579
|
+
acquired = await _acquire_upstream_slot()
|
|
1580
|
+
if not acquired:
|
|
1581
|
+
logger.warning(
|
|
1582
|
+
"CONCURRENCY: queue timeout (%ds) exceeded waiting for upstream slot",
|
|
1583
|
+
int(PROXY_CONCURRENCY_QUEUE_TIMEOUT),
|
|
1584
|
+
)
|
|
1585
|
+
raise httpx.RemoteProtocolError(
|
|
1586
|
+
f"Upstream concurrency queue timed out after {int(PROXY_CONCURRENCY_QUEUE_TIMEOUT)}s "
|
|
1587
|
+
f"(limit={PROXY_CONCURRENCY_LIMIT})",
|
|
1588
|
+
request=None,
|
|
1589
|
+
)
|
|
1590
|
+
try:
|
|
1591
|
+
return await _post_with_retry_inner(client, url, payload, headers)
|
|
1592
|
+
finally:
|
|
1593
|
+
_release_upstream_slot()
|
|
1594
|
+
|
|
1595
|
+
|
|
1596
|
+
async def _post_with_retry_inner(
|
|
1597
|
+
client: httpx.AsyncClient,
|
|
1598
|
+
url: str,
|
|
1599
|
+
payload: dict,
|
|
1600
|
+
headers: dict,
|
|
1455
1601
|
) -> httpx.Response:
|
|
1456
1602
|
last_exc: Exception | None = None
|
|
1457
1603
|
for attempt in range(PROXY_UPSTREAM_RETRY_MAX):
|
|
@@ -1497,6 +1643,7 @@ async def _post_with_generation_timeout(
|
|
|
1497
1643
|
headers: dict,
|
|
1498
1644
|
) -> httpx.Response:
|
|
1499
1645
|
"""Wrap _post_with_retry with an explicit asyncio generation timeout.
|
|
1646
|
+
Also acquires a concurrency slot before making the request.
|
|
1500
1647
|
|
|
1501
1648
|
The httpx read timeout may not fire for hung connections where the server
|
|
1502
1649
|
keeps the socket open but produces no data (observed with llama.cpp server
|
|
@@ -1561,6 +1708,13 @@ async def lifespan(app: FastAPI):
|
|
|
1561
1708
|
"""Manage the httpx client lifecycle with the FastAPI app."""
|
|
1562
1709
|
global http_client
|
|
1563
1710
|
global default_context_window
|
|
1711
|
+
global upstream_semaphore
|
|
1712
|
+
upstream_semaphore = asyncio.Semaphore(PROXY_CONCURRENCY_LIMIT)
|
|
1713
|
+
logger.info(
|
|
1714
|
+
"CONCURRENCY: upstream semaphore initialized limit=%d queue_timeout=%.0fs",
|
|
1715
|
+
PROXY_CONCURRENCY_LIMIT,
|
|
1716
|
+
PROXY_CONCURRENCY_QUEUE_TIMEOUT,
|
|
1717
|
+
)
|
|
1564
1718
|
http_client = httpx.AsyncClient(
|
|
1565
1719
|
timeout=httpx.Timeout(
|
|
1566
1720
|
connect=10.0, # 10s to establish connection
|
|
@@ -1643,6 +1797,8 @@ async def lifespan(app: FastAPI):
|
|
|
1643
1797
|
yield
|
|
1644
1798
|
await http_client.aclose()
|
|
1645
1799
|
http_client = None
|
|
1800
|
+
if upstream_semaphore is not None:
|
|
1801
|
+
upstream_semaphore = None
|
|
1646
1802
|
logger.info("Proxy shut down")
|
|
1647
1803
|
|
|
1648
1804
|
|
|
@@ -1653,6 +1809,16 @@ app = FastAPI(
|
|
|
1653
1809
|
lifespan=lifespan,
|
|
1654
1810
|
)
|
|
1655
1811
|
|
|
1812
|
+
# NOTE: Concurrency control is enforced by _acquire_upstream_slot() inside
|
|
1813
|
+
# _post_with_retry (the single point where we hit llama.cpp). An earlier
|
|
1814
|
+
# implementation also added an HTTP middleware that acquired the same
|
|
1815
|
+
# semaphore — this caused a self-deadlock (middleware holds slot, inner
|
|
1816
|
+
# call waits for slot, both on the same task). The middleware approach
|
|
1817
|
+
# also called non-existent asyncio.Semaphore methods (try_acquire /
|
|
1818
|
+
# acquire_nowait) and ran an async primitive in a thread executor.
|
|
1819
|
+
# Removed 2026-05-13.
|
|
1820
|
+
|
|
1821
|
+
|
|
1656
1822
|
|
|
1657
1823
|
# ===========================================================================
|
|
1658
1824
|
# Request Translation: Anthropic -> OpenAI
|
|
@@ -1686,6 +1852,31 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
1686
1852
|
role = msg["role"]
|
|
1687
1853
|
content = msg.get("content")
|
|
1688
1854
|
|
|
1855
|
+
# Strip <think>...</think> blocks from PRIOR assistant turns. Qwen is
|
|
1856
|
+
# heavily few-shot influenced by its own conversation history — if
|
|
1857
|
+
# earlier assistant turns contain reasoning blocks, the next turn
|
|
1858
|
+
# will pattern-match and emit <think> tags even when the system
|
|
1859
|
+
# prompt forbids them. Stripping breaks the copy cycle.
|
|
1860
|
+
if role == "assistant":
|
|
1861
|
+
if isinstance(content, str) and "<think>" in content:
|
|
1862
|
+
content = _THINKING_BLOCK_RE.sub("", content).lstrip()
|
|
1863
|
+
elif isinstance(content, list):
|
|
1864
|
+
stripped = []
|
|
1865
|
+
for b in content:
|
|
1866
|
+
if isinstance(b, dict) and b.get("type") == "text":
|
|
1867
|
+
t = b.get("text", "")
|
|
1868
|
+
if "<think>" in t:
|
|
1869
|
+
t = _THINKING_BLOCK_RE.sub("", t).lstrip()
|
|
1870
|
+
if t:
|
|
1871
|
+
stripped.append({**b, "text": t})
|
|
1872
|
+
elif isinstance(b, dict) and b.get("type") == "thinking":
|
|
1873
|
+
# Anthropic-style thinking block — drop entirely
|
|
1874
|
+
# (don't replay it back to the model).
|
|
1875
|
+
continue
|
|
1876
|
+
else:
|
|
1877
|
+
stripped.append(b)
|
|
1878
|
+
content = stripped
|
|
1879
|
+
|
|
1689
1880
|
if isinstance(content, str):
|
|
1690
1881
|
messages.append({"role": role, "content": content})
|
|
1691
1882
|
elif isinstance(content, list):
|
|
@@ -1695,6 +1886,10 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
1695
1886
|
parts.append(block)
|
|
1696
1887
|
elif block.get("type") == "text":
|
|
1697
1888
|
parts.append(block.get("text", ""))
|
|
1889
|
+
elif block.get("type") == "thinking":
|
|
1890
|
+
# Drop thinking blocks from user/assistant content when
|
|
1891
|
+
# echoed back into history — model shouldn't see them.
|
|
1892
|
+
continue
|
|
1698
1893
|
elif block.get("type") == "tool_use":
|
|
1699
1894
|
messages.append(
|
|
1700
1895
|
{
|
|
@@ -1703,7 +1898,7 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
1703
1898
|
"tool_calls": [
|
|
1704
1899
|
{
|
|
1705
1900
|
"id": block.get(
|
|
1706
|
-
"id", f"
|
|
1901
|
+
"id", f"toolu_{uuid.uuid4().hex[:24]}"
|
|
1707
1902
|
),
|
|
1708
1903
|
"type": "function",
|
|
1709
1904
|
"function": {
|
|
@@ -1716,10 +1911,17 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
1716
1911
|
)
|
|
1717
1912
|
continue
|
|
1718
1913
|
elif block.get("type") == "tool_result":
|
|
1914
|
+
# Strip Anthropic-spec toolu_ prefix so the upstream
|
|
1915
|
+
# tool_call_id matches what llama-server originally
|
|
1916
|
+
# emitted (we stamped the prefix on outbound; reverse it
|
|
1917
|
+
# here so the loop closes correctly).
|
|
1918
|
+
tu_id = block.get("tool_use_id", "")
|
|
1919
|
+
if isinstance(tu_id, str) and tu_id.startswith("toolu_"):
|
|
1920
|
+
tu_id = tu_id[len("toolu_"):]
|
|
1719
1921
|
messages.append(
|
|
1720
1922
|
{
|
|
1721
1923
|
"role": "tool",
|
|
1722
|
-
"tool_call_id":
|
|
1924
|
+
"tool_call_id": tu_id,
|
|
1723
1925
|
"content": _extract_text(block.get("content", "")),
|
|
1724
1926
|
}
|
|
1725
1927
|
)
|
|
@@ -1899,6 +2101,18 @@ _AGENTIC_SYSTEM_SUPPLEMENT_MINIMAL = (
|
|
|
1899
2101
|
"\n\nUse tools for all actions. Respond with tool calls, not descriptions of what to do."
|
|
1900
2102
|
)
|
|
1901
2103
|
|
|
2104
|
+
# Directive appended when the upstream model (Qwen) is configured with
|
|
2105
|
+
# enable_thinking=False but consistently emits <think>...</think> blocks
|
|
2106
|
+
# anyway, consuming the max_tokens budget before any tool_use is generated.
|
|
2107
|
+
# Empirically required for Shannon-style workflows where max_tokens=512
|
|
2108
|
+
# leaves no room for both internal reasoning AND a tool call.
|
|
2109
|
+
_NO_THINKING_DIRECTIVE = (
|
|
2110
|
+
"\n\nCRITICAL: Do NOT output <think>...</think> tags or any internal "
|
|
2111
|
+
"reasoning. Begin your response IMMEDIATELY with the appropriate "
|
|
2112
|
+
"tool_call. If you have no tool to call, reply with plain text only — "
|
|
2113
|
+
"never include reasoning blocks."
|
|
2114
|
+
)
|
|
2115
|
+
|
|
1902
2116
|
if PROXY_AGENTIC_SUPPLEMENT_MODE == "legacy":
|
|
1903
2117
|
_AGENTIC_SYSTEM_SUPPLEMENT = _AGENTIC_SYSTEM_SUPPLEMENT_LEGACY
|
|
1904
2118
|
elif PROXY_AGENTIC_SUPPLEMENT_MODE == "minimal":
|
|
@@ -2264,7 +2478,7 @@ def anthropic_to_openai_response(anthropic_resp: dict) -> dict:
|
|
|
2264
2478
|
elif btype == "tool_use":
|
|
2265
2479
|
tool_calls.append(
|
|
2266
2480
|
{
|
|
2267
|
-
"id": block.get("id", f"
|
|
2481
|
+
"id": block.get("id", f"toolu_{uuid.uuid4().hex[:24]}"),
|
|
2268
2482
|
"type": "function",
|
|
2269
2483
|
"function": {
|
|
2270
2484
|
"name": block.get("name", ""),
|
|
@@ -2347,6 +2561,72 @@ def _latest_user_text(anthropic_body: dict) -> str:
|
|
|
2347
2561
|
return ""
|
|
2348
2562
|
|
|
2349
2563
|
|
|
2564
|
+
# 2026-05-12: Detect "no-task" user turns to gate the state machine's
|
|
2565
|
+
# force-required path. When the last actual human query is a short ack
|
|
2566
|
+
# ("ok", "3", "test"), an acknowledgement phrase ("standing by", "awaiting
|
|
2567
|
+
# next instruction"), or a status report ending in an ack ("scan complete.
|
|
2568
|
+
# awaiting next instruction"), there is no genuine work for the model to
|
|
2569
|
+
# do. Forcing tool_choice='required' in this state causes the model to
|
|
2570
|
+
# ruminate in <think> blocks, and the meta-tool talk inside those blocks
|
|
2571
|
+
# trips the malformed-pseudo-tool detector. Conservative patterns only.
|
|
2572
|
+
_NO_TASK_SHORT_ACKS = frozenset({
|
|
2573
|
+
"ok", "okay", "k", "kk", "y", "n", "yes", "no", "nope", "yep", "yeah",
|
|
2574
|
+
"thanks", "thank", "thx", "ty", "ack", "noted", "received", "understood",
|
|
2575
|
+
"test", "ping", "hi", "hello",
|
|
2576
|
+
})
|
|
2577
|
+
|
|
2578
|
+
_NO_TASK_ACK_PATTERNS = (
|
|
2579
|
+
re.compile(r"awaiting\s+(?:next|further|your)\s+(?:instruction|input|command|task|directive)", re.I),
|
|
2580
|
+
re.compile(r"standing\s+by(?:\s+for\s+(?:your\s+)?(?:next|further|new)\s+(?:instruction|input|command|task|directive)?)?", re.I),
|
|
2581
|
+
re.compile(r"\b(?:ready|waiting|holding)\s+for\s+(?:your\s+)?(?:next|further|new)\s+(?:task|instruction|command|input|directive)", re.I),
|
|
2582
|
+
# Status report ending in ack: "X complete. {awaiting/standing/ready/done}"
|
|
2583
|
+
re.compile(r"\bcomplet(?:e|ed)\b[\s.,;:!\-]+(?:awaiting|standing\s+by|ready|done|finished|over\s+to\s+you)", re.I),
|
|
2584
|
+
)
|
|
2585
|
+
|
|
2586
|
+
|
|
2587
|
+
def _is_no_task_user_text(text: str) -> bool:
|
|
2588
|
+
if not text:
|
|
2589
|
+
return False
|
|
2590
|
+
stripped = text.strip()
|
|
2591
|
+
if not stripped:
|
|
2592
|
+
return False
|
|
2593
|
+
bare = re.sub(r"[^\w\s]", "", stripped).strip().lower()
|
|
2594
|
+
if bare in _NO_TASK_SHORT_ACKS:
|
|
2595
|
+
return True
|
|
2596
|
+
if re.fullmatch(r"\d+(?:\.\d+)?", bare):
|
|
2597
|
+
return True
|
|
2598
|
+
snippet = stripped[:400]
|
|
2599
|
+
return any(p.search(snippet) for p in _NO_TASK_ACK_PATTERNS)
|
|
2600
|
+
|
|
2601
|
+
|
|
2602
|
+
def _latest_user_query_text(anthropic_body: dict) -> str:
|
|
2603
|
+
"""Return the most recent user message *text* — walking past
|
|
2604
|
+
tool_result-only messages to find the last actual human query.
|
|
2605
|
+
|
|
2606
|
+
During agentic loops the trailing user message is a tool_result block
|
|
2607
|
+
with no ``text`` parts, so ``_latest_user_text`` returns empty.
|
|
2608
|
+
Tool-narrowing needs query tokens to score tools; without them it
|
|
2609
|
+
keeps all tools (defeating the purpose). This walker pulls text
|
|
2610
|
+
from prior user turns as a fallback so narrowing stays useful in
|
|
2611
|
+
long loops.
|
|
2612
|
+
"""
|
|
2613
|
+
for msg in reversed(anthropic_body.get("messages", [])):
|
|
2614
|
+
if msg.get("role") != "user":
|
|
2615
|
+
continue
|
|
2616
|
+
content = msg.get("content", "")
|
|
2617
|
+
if isinstance(content, str) and content.strip():
|
|
2618
|
+
return content
|
|
2619
|
+
if isinstance(content, list):
|
|
2620
|
+
text_parts = [
|
|
2621
|
+
b.get("text", "")
|
|
2622
|
+
for b in content
|
|
2623
|
+
if isinstance(b, dict) and b.get("type") == "text" and b.get("text")
|
|
2624
|
+
]
|
|
2625
|
+
if text_parts:
|
|
2626
|
+
return "\n".join(text_parts)
|
|
2627
|
+
return ""
|
|
2628
|
+
|
|
2629
|
+
|
|
2350
2630
|
def _tokenize_for_tool_ranking(text: str) -> set[str]:
|
|
2351
2631
|
return {m.group(0).lower() for m in re.finditer(r"[a-zA-Z0-9_]{2,}", text)}
|
|
2352
2632
|
|
|
@@ -2366,6 +2646,13 @@ def _narrow_tools_for_request(
|
|
|
2366
2646
|
|
|
2367
2647
|
query_text = _latest_user_text(anthropic_body).lower()
|
|
2368
2648
|
query_tokens = _tokenize_for_tool_ranking(query_text)
|
|
2649
|
+
if not query_tokens:
|
|
2650
|
+
# Walk back past tool_result turns to find the prior real human
|
|
2651
|
+
# query. Lets narrowing stay effective during agentic loops where
|
|
2652
|
+
# the latest user msg is just a tool_result block (no text).
|
|
2653
|
+
fallback_query = _latest_user_query_text(anthropic_body).lower()
|
|
2654
|
+
query_text = fallback_query or query_text
|
|
2655
|
+
query_tokens = _tokenize_for_tool_ranking(query_text)
|
|
2369
2656
|
if not query_tokens:
|
|
2370
2657
|
n_msgs = len(anthropic_body.get("messages", []))
|
|
2371
2658
|
if (
|
|
@@ -2490,6 +2777,18 @@ def _resolve_state_machine_tool_choice(
|
|
|
2490
2777
|
monitor.finalize_synthetic_tool_id = ""
|
|
2491
2778
|
return None, "fresh_user_text"
|
|
2492
2779
|
|
|
2780
|
+
# 2026-05-12: No-task ack guard. When the latest user message is just a
|
|
2781
|
+
# tool_result (no fresh text), walk back to the most recent human query.
|
|
2782
|
+
# If that query is a short ack or "X complete. awaiting next" status,
|
|
2783
|
+
# do not force tool_choice — let the model produce a natural finalization
|
|
2784
|
+
# text instead of ruminating in <think> blocks.
|
|
2785
|
+
last_user_query = _latest_user_query_text(anthropic_body).strip()
|
|
2786
|
+
if last_user_query and _is_no_task_user_text(last_user_query):
|
|
2787
|
+
monitor.reset_tool_turn_state(reason="no_task_user_text")
|
|
2788
|
+
monitor.finalize_continuation_count = 0
|
|
2789
|
+
monitor.finalize_synthetic_tool_id = ""
|
|
2790
|
+
return None, "no_task_user_text"
|
|
2791
|
+
|
|
2493
2792
|
active_loop = (
|
|
2494
2793
|
has_tool_results
|
|
2495
2794
|
and last_user_has_tool_result
|
|
@@ -2563,7 +2862,15 @@ def _resolve_state_machine_tool_choice(
|
|
|
2563
2862
|
dup_tool,
|
|
2564
2863
|
)
|
|
2565
2864
|
|
|
2566
|
-
|
|
2865
|
+
# Fix K (2026-04-22): require cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
|
|
2866
|
+
# before flipping phase. Single-repeat cycles are legitimate in working
|
|
2867
|
+
# sessions (e.g. re-reading the same file across edits). dup_target
|
|
2868
|
+
# above already demands threshold=3 before asserting a cycle, so the
|
|
2869
|
+
# `cycle_looping = True, cycle_repeat = 2` pair from that branch is
|
|
2870
|
+
# kept as a strong signal (read target repeated 3+ times). Low-repeat
|
|
2871
|
+
# cycles detected by detect_tool_cycle get filtered here.
|
|
2872
|
+
cycle_trip = cycle_looping and cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
|
|
2873
|
+
if cycle_trip or stagnating:
|
|
2567
2874
|
reason = "cycle_detected" if cycle_looping else "stagnation"
|
|
2568
2875
|
monitor.set_tool_turn_phase("review", reason=reason)
|
|
2569
2876
|
monitor.tool_state_review_cycles += 1
|
|
@@ -2702,6 +3009,33 @@ def build_openai_request(
|
|
|
2702
3009
|
|
|
2703
3010
|
has_tools = _has_tool_definitions(anthropic_body)
|
|
2704
3011
|
|
|
3012
|
+
# Translate Anthropic `thinking` parameter to upstream `enable_thinking`.
|
|
3013
|
+
# Anthropic shape: {"thinking": {"type": "enabled", "budget_tokens": 1024}}
|
|
3014
|
+
# or {"type": "disabled"}. Per the Anthropic spec, thinking is OFF by
|
|
3015
|
+
# default and ONLY enabled when the client opts in. Match that behaviour:
|
|
3016
|
+
# - thinking.type == "enabled" -> enable_thinking=True
|
|
3017
|
+
# - thinking.type == "disabled" or absent -> enable_thinking=False
|
|
3018
|
+
# Without this, Qwen's chat template (which defaults thinking ON) would
|
|
3019
|
+
# consume the client's max_tokens budget on internal reasoning, leaving
|
|
3020
|
+
# nothing for the visible answer.
|
|
3021
|
+
anthropic_thinking = anthropic_body.get("thinking")
|
|
3022
|
+
if isinstance(anthropic_thinking, dict):
|
|
3023
|
+
ttype = (anthropic_thinking.get("type") or "").lower()
|
|
3024
|
+
if ttype == "enabled":
|
|
3025
|
+
openai_body["enable_thinking"] = True
|
|
3026
|
+
else:
|
|
3027
|
+
openai_body["enable_thinking"] = False
|
|
3028
|
+
else:
|
|
3029
|
+
# Match Anthropic default: thinking off unless explicitly requested.
|
|
3030
|
+
openai_body["enable_thinking"] = False
|
|
3031
|
+
|
|
3032
|
+
# Global thinking-off (G): apply to every request, not just tool turns.
|
|
3033
|
+
# NOTE(review): as written, this overrides even an explicit client thinking request above — confirm intent.
|
|
3034
|
+
# Per-path tool-turn handling below (DISABLE_THINKING_ON_TOOL_TURNS) is
|
|
3035
|
+
# additive — ALWAYS supersedes when set.
|
|
3036
|
+
if PROXY_DISABLE_THINKING_ALWAYS:
|
|
3037
|
+
openai_body["enable_thinking"] = False
|
|
3038
|
+
|
|
2705
3039
|
# Inject agentic protocol instructions only for tool-enabled turns.
|
|
2706
3040
|
# Use minimal supplement for qwen models to reduce prompt leak surface.
|
|
2707
3041
|
if has_tools:
|
|
@@ -2711,6 +3045,15 @@ def build_openai_request(
|
|
|
2711
3045
|
if "qwen" in model_name and PROXY_AGENTIC_SUPPLEMENT_MODE != "legacy"
|
|
2712
3046
|
else _AGENTIC_SYSTEM_SUPPLEMENT
|
|
2713
3047
|
)
|
|
3048
|
+
# When thinking is explicitly disabled (Anthropic default, plus our
|
|
3049
|
+
# tool-turn forcing) but the upstream model is Qwen — which emits
|
|
3050
|
+
# <think> blocks regardless of enable_thinking — append a strong
|
|
3051
|
+
# directive that suppresses internal reasoning. Without this, small
|
|
3052
|
+
# max_tokens budgets get fully consumed by the model's reasoning,
|
|
3053
|
+
# producing required_tool_miss retries (observed in Shannon workflows
|
|
3054
|
+
# with max_tokens=512 + tool_choice=required).
|
|
3055
|
+
if openai_body.get("enable_thinking") is False:
|
|
3056
|
+
supplement = supplement + _NO_THINKING_DIRECTIVE
|
|
2714
3057
|
if (
|
|
2715
3058
|
openai_body["messages"]
|
|
2716
3059
|
and openai_body["messages"][0].get("role") == "system"
|
|
@@ -2731,23 +3074,62 @@ def build_openai_request(
|
|
|
2731
3074
|
if "max_tokens" in anthropic_body:
|
|
2732
3075
|
requested_raw = max(1, int(anthropic_body["max_tokens"]))
|
|
2733
3076
|
|
|
2734
|
-
# Enforce configurable minimum floor for
|
|
2735
|
-
#
|
|
2736
|
-
#
|
|
3077
|
+
# Enforce configurable minimum floor for tool turns: the model needs
|
|
3078
|
+
# enough headroom to emit complete tool-call arguments (long heredocs,
|
|
3079
|
+
# full-function oldString/newString pairs, etc.) without hitting the
|
|
3080
|
+
# client-requested max_tokens in the middle of a JSON string. If the
|
|
3081
|
+
# client requested >= the floor we keep their value; short preflight
|
|
3082
|
+
# requests (max_tokens <= 1024) always skip the floor to avoid
|
|
3083
|
+
# inflating plan-generation turns.
|
|
2737
3084
|
#
|
|
2738
|
-
# The
|
|
2739
|
-
#
|
|
2740
|
-
#
|
|
2741
|
-
#
|
|
2742
|
-
thinking_active_for_request =
|
|
3085
|
+
# The earlier gating on PROXY_DISABLE_THINKING_ON_TOOL_TURNS was too
|
|
3086
|
+
# restrictive: it skipped the floor on every tool turn once thinking
|
|
3087
|
+
# was off, which re-introduced truncated tool calls on long edits.
|
|
3088
|
+
# Set PROXY_MAX_TOKENS_FLOOR=0 to disable the floor entirely.
|
|
3089
|
+
thinking_active_for_request = (
|
|
3090
|
+
has_tools
|
|
3091
|
+
and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
|
|
3092
|
+
and not PROXY_DISABLE_THINKING_ALWAYS
|
|
3093
|
+
)
|
|
3094
|
+
SMALL_PREFLIGHT_THRESHOLD = 1024
|
|
3095
|
+
# Qwen-style models emit <think> blocks regardless of the
|
|
3096
|
+
# enable_thinking flag (template ignored by trained behaviour).
|
|
3097
|
+
# For tool turns those blocks alone consume ~400-1000 tokens, so a
|
|
3098
|
+
# client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
|
|
3099
|
+
# budget for the tool_call itself — manifesting as required_tool_miss
|
|
3100
|
+
# retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
|
|
3101
|
+
# per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
|
|
3102
|
+
THINKING_MIN_FOR_TOOLS = 2048
|
|
2743
3103
|
skip_floor = (
|
|
2744
|
-
not has_tools # non-tool requests don't need
|
|
2745
|
-
or PROXY_DISABLE_THINKING_ON_TOOL_TURNS # thinking disabled on tool turns
|
|
3104
|
+
not has_tools # non-tool requests don't need the headroom
|
|
2746
3105
|
or PROXY_MAX_TOKENS_FLOOR <= 0 # floor explicitly disabled
|
|
3106
|
+
or requested_raw <= SMALL_PREFLIGHT_THRESHOLD # tiny preflight request
|
|
2747
3107
|
)
|
|
3108
|
+
# Qwen-style models emit <think> blocks regardless of the
|
|
3109
|
+
# enable_thinking flag (template ignored by trained behaviour).
|
|
3110
|
+
# For tool turns those blocks alone consume ~400-1000 tokens, so a
|
|
3111
|
+
# client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
|
|
3112
|
+
# budget for the tool_call itself — manifesting as required_tool_miss
|
|
3113
|
+
# retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
|
|
3114
|
+
# per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
|
|
3115
|
+
THINKING_MIN_FOR_TOOLS = 2048
|
|
2748
3116
|
if skip_floor:
|
|
2749
3117
|
requested_max = requested_raw
|
|
2750
|
-
|
|
3118
|
+
# Even when skipping the big floor, bump small tool-turn
|
|
3119
|
+
# budgets so Qwen's mandatory thinking has room before the
|
|
3120
|
+
# tool_call. Only applies when tools are present.
|
|
3121
|
+
if (
|
|
3122
|
+
has_tools
|
|
3123
|
+
and requested_raw < THINKING_MIN_FOR_TOOLS
|
|
3124
|
+
and requested_raw > 16 # leave true preflight (e.g. max_tokens=1) alone
|
|
3125
|
+
):
|
|
3126
|
+
requested_max = THINKING_MIN_FOR_TOOLS
|
|
3127
|
+
logger.info(
|
|
3128
|
+
"MAX_TOKENS thinking-floor: %d -> %d (tool turn, Qwen mandatory thinking)",
|
|
3129
|
+
requested_raw,
|
|
3130
|
+
requested_max,
|
|
3131
|
+
)
|
|
3132
|
+
elif requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
|
|
2751
3133
|
logger.info(
|
|
2752
3134
|
"MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
|
|
2753
3135
|
has_tools,
|
|
@@ -2946,24 +3328,35 @@ def build_openai_request(
|
|
|
2946
3328
|
monitor.tool_state_stagnation_streak,
|
|
2947
3329
|
)
|
|
2948
3330
|
elif state_choice == "finalize":
|
|
2949
|
-
|
|
2950
|
-
|
|
3331
|
+
# Fix H/J (2026-04-22): Do NOT strip tools from the body on
|
|
3332
|
+
# cycle-limit finalize. Stripping tools lets the model emit
|
|
3333
|
+
# prose that LOOKS like a tool call ("<function=edit>…") but
|
|
3334
|
+
# has no structured tool_calls array, so the Anthropic client
|
|
3335
|
+
# sees end_turn with no action and halts. Instead, keep tools
|
|
3336
|
+
# available, set tool_choice=auto, and nudge the model to
|
|
3337
|
+
# either complete with a tool call OR emit a proper summary.
|
|
3338
|
+
# Grammar (when PROXY_TOOL_CALL_GRAMMAR_REQUIRED_ONLY=off) will
|
|
3339
|
+
# still constrain tool-call emission to valid JSON format.
|
|
3340
|
+
openai_body["tool_choice"] = "auto"
|
|
2951
3341
|
monitor.finalize_turn_active = True
|
|
2952
3342
|
monitor.finalize_hard_stop_count += 1 # monotonic marker: a finalize fired this session
|
|
2953
3343
|
monitor.consecutive_forced_count = 0
|
|
2954
3344
|
monitor.no_progress_streak = 0
|
|
2955
|
-
# Option 3: Inject explicit "no tool calls" instruction to reduce XML leak
|
|
2956
3345
|
finalize_instruction = {
|
|
2957
3346
|
"role": "user",
|
|
2958
3347
|
"content": (
|
|
2959
|
-
"
|
|
2960
|
-
"
|
|
3348
|
+
"You have been looping on the same tools for several turns. "
|
|
3349
|
+
"Wrap up: either emit ONE decisive tool call that completes "
|
|
3350
|
+
"the task, or reply with a plain-text summary of what you "
|
|
3351
|
+
"accomplished and what is blocking further progress. Do NOT "
|
|
3352
|
+
"emit tool call text in prose form — if you call a tool, do "
|
|
3353
|
+
"it through the structured tool_call mechanism."
|
|
2961
3354
|
),
|
|
2962
3355
|
}
|
|
2963
3356
|
msgs = openai_body.get("messages", [])
|
|
2964
3357
|
msgs.append(finalize_instruction)
|
|
2965
3358
|
logger.warning(
|
|
2966
|
-
"TOOL STATE MACHINE:
|
|
3359
|
+
"TOOL STATE MACHINE: finalize turn (reason=%s) — tools kept, tool_choice=auto",
|
|
2967
3360
|
state_reason,
|
|
2968
3361
|
)
|
|
2969
3362
|
elif state_choice == "required":
|
|
@@ -3045,8 +3438,11 @@ def build_openai_request(
|
|
|
3045
3438
|
monitor.consecutive_forced_count = 0
|
|
3046
3439
|
monitor.no_progress_streak = 0
|
|
3047
3440
|
# Force tool_choice=required on first turn to ensure local models
|
|
3048
|
-
# produce a tool call instead of plain text (cold-start fix)
|
|
3049
|
-
|
|
3441
|
+
# produce a tool call instead of plain text (cold-start fix).
|
|
3442
|
+
# Gated by PROXY_FORCE_TOOL_CHOICE_ON_COLD_START — Gemma 4 routes
|
|
3443
|
+
# 'auto' correctly without needing the force, and the force
|
|
3444
|
+
# triggers malformed-JSON emissions on Gemma 4 cold turns.
|
|
3445
|
+
if has_tools and n_msgs == 1 and PROXY_FORCE_TOOL_CHOICE_ON_COLD_START:
|
|
3050
3446
|
openai_body["tool_choice"] = "required"
|
|
3051
3447
|
logger.info(
|
|
3052
3448
|
"tool_choice forced to 'required' on first turn (reason=%s n_msgs=%d cold_start_fix=true)",
|
|
@@ -3089,10 +3485,12 @@ def build_openai_request(
|
|
|
3089
3485
|
monitor.reset_tool_turn_state(reason="no_tool_results")
|
|
3090
3486
|
|
|
3091
3487
|
|
|
3092
|
-
if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
3488
|
+
if PROXY_DISABLE_THINKING_ALWAYS or PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
3093
3489
|
openai_body["enable_thinking"] = False
|
|
3094
3490
|
logger.info(
|
|
3095
|
-
"Thinking disabled
|
|
3491
|
+
"Thinking disabled (always=%s tool_turns=%s)",
|
|
3492
|
+
PROXY_DISABLE_THINKING_ALWAYS,
|
|
3493
|
+
PROXY_DISABLE_THINKING_ON_TOOL_TURNS,
|
|
3096
3494
|
)
|
|
3097
3495
|
|
|
3098
3496
|
if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
|
|
@@ -3411,7 +3809,10 @@ def _schema_type_matches(value, expected_type: str) -> bool:
|
|
|
3411
3809
|
|
|
3412
3810
|
def _string_contains_tool_markup(value: str) -> bool:
    """Return True when *value* contains any known tool-call markup fragment.

    The comparison is case-insensitive: the input is lowercased once and then
    checked against each lowercase marker.
    """
    haystack = value.lower()
    for fragment in (
        "<parameter",
        "</parameter",
        "<tool_call",
        "<function=",
        "</function",
        # Gemma 4 native DSL (asymmetric open/close tags)
        "<|tool_call>",
        "<tool_call|>",
    ):
        if fragment in haystack:
            return True
    return False
|
|
3416
3817
|
|
|
3417
3818
|
|
|
@@ -3483,6 +3884,343 @@ _TOOL_CALL_XML_RE = re.compile(
|
|
|
3483
3884
|
re.DOTALL,
|
|
3484
3885
|
)
|
|
3485
3886
|
|
|
3887
|
+
# Hermes-style XML function call format emitted by some Qwen/Llama fine-tunes
|
|
3888
|
+
# when grammar is not applied:
|
|
3889
|
+
# <function=name>
|
|
3890
|
+
# <parameter=key>
|
|
3891
|
+
# value
|
|
3892
|
+
# </parameter>
|
|
3893
|
+
# ...
|
|
3894
|
+
# </function>
|
|
3895
|
+
#
|
|
3896
|
+
# The value of a <parameter=KEY> block may span multiple lines and include
|
|
3897
|
+
# arbitrary characters (code snippets, JSON, quotes). The closing
|
|
3898
|
+
# </parameter> tag may be missing if the model emitted EOS prematurely —
|
|
3899
|
+
# in which case we consume up to the next <parameter=...> tag or end of
|
|
3900
|
+
# string. Names are captured as alphanumeric + underscore to avoid pulling
|
|
3901
|
+
# in attribute-like garbage.
|
|
3902
|
+
_HERMES_FUNCTION_RE = re.compile(
    r"<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>|\Z)",
    re.DOTALL,
)
# After the opening tag only same-line padding and ONE line break (the
# separator the template inserts) are discarded. Indentation on the first
# line of the value is payload (code snippets) and must survive; trailing
# whitespace before the closer is still trimmed.
_HERMES_PARAMETER_RE = re.compile(
    r"<parameter=([A-Za-z_][A-Za-z0-9_]*)>[ \t]*(?:\r?\n)?(.*?)\s*(?=</parameter>|<parameter=|\Z)",
    re.DOTALL,
)


def _extract_hermes_tool_calls(text: str) -> tuple[list[dict], str]:
    """Recover Hermes-style tool calls embedded in plain text.

    Recognises ``<function=name><parameter=k>v</parameter></function>``
    blocks. Used as a fallback when the Qwen JSON format
    (``<tool_call>{...}</tool_call>``) is not present. Premature EOS is
    tolerated: a missing ``</parameter>`` ends the value at the next
    ``<parameter=`` tag or end of body, and a missing ``</function>`` ends the
    call at end of text.

    Args:
        text: Assistant text that may contain Hermes markup.

    Returns:
        ``(tool_calls, remaining_text)``. ``tool_calls`` is a list of
        OpenAI-shaped tool-call dicts whose ``arguments`` is a compact JSON
        object of string values. ``remaining_text`` is *text* with the matched
        blocks (and now-empty ``<tool_call>`` wrappers) removed and stripped.
        When nothing matches, ``([], text)`` is returned with *text* untouched.
    """
    if "<function=" not in text:
        return [], text

    extracted: list[dict] = []
    matched_spans: list[tuple[int, int]] = []

    for fn_match in _HERMES_FUNCTION_RE.finditer(text):
        body = fn_match.group(2) or ""
        # A repeated key keeps its last value, as plain dict assignment would.
        args = {
            p_match.group(1): p_match.group(2)
            for p_match in _HERMES_PARAMETER_RE.finditer(body)
        }
        extracted.append(
            {
                "id": f"toolu_{uuid.uuid4().hex[:24]}",
                "type": "function",
                "function": {
                    "name": fn_match.group(1),
                    "arguments": json.dumps(args, separators=(",", ":")),
                },
            }
        )
        matched_spans.append(fn_match.span())

    if not extracted:
        return [], text

    # Cut the matched blocks out back-to-front so earlier offsets stay valid.
    remaining = text
    for start, end in reversed(matched_spans):
        remaining = remaining[:start] + remaining[end:]
    # Drop <tool_call> envelopes that now enclose nothing, including an opener
    # left dangling at the very end when EOS cut the call short.
    remaining = re.sub(r"<tool_call>\s*(?:</tool_call>|\Z)", "", remaining)
    remaining = remaining.strip()

    logger.info(
        "TOOL CALL EXTRACTION: recovered %d Hermes-format tool call(s) from text content",
        len(extracted),
    )
    return extracted, remaining
|
|
3969
|
+
|
|
3970
|
+
|
|
3971
|
+
# ---------------------------------------------------------------------------
|
|
3972
|
+
# Gemma 4 tool-call DSL extractors
|
|
3973
|
+
# ---------------------------------------------------------------------------
|
|
3974
|
+
# Gemma 4's chat template emits tool calls as:
|
|
3975
|
+
# <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
|
|
3976
|
+
# Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
|
|
3977
|
+
# Llama-server's --jinja autoparser usually converts these to standard
|
|
3978
|
+
# OpenAI tool_calls, but the raw form can leak through on (a) malformed
|
|
3979
|
+
# emissions, (b) finalize turns, (c) non-tool-template requests where the
|
|
3980
|
+
# model still tries to call a tool. This parser catches those cases.
|
|
3981
|
+
#
|
|
3982
|
+
# Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
|
|
3983
|
+
# markdown blocks when it doesn't trust the template — observed when
|
|
3984
|
+
# tool_choice was forced 'required' but the model lacked confidence in the
|
|
3985
|
+
# native format. Treated as a tool call when the JSON has a "name"; bare-args
# JSON is only accepted when it schema-matches one of the request's tools.
|
|
3986
|
+
_GEMMA4_TOOL_CALL_DSL_RE = re.compile(
|
|
3987
|
+
r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
|
|
3988
|
+
re.DOTALL,
|
|
3989
|
+
)
|
|
3990
|
+
# Markdown JSON code-block fallback. Group 1 = JSON content (may include
|
|
3991
|
+
# leading/trailing whitespace inside the block).
|
|
3992
|
+
_GEMMA4_MARKDOWN_JSON_RE = re.compile(
|
|
3993
|
+
r"```(?:json)?\s*(\{.*?\})\s*```",
|
|
3994
|
+
re.DOTALL,
|
|
3995
|
+
)
|
|
3996
|
+
|
|
3997
|
+
|
|
3998
|
+
def _parse_gemma4_dsl_args(raw: str) -> dict | None:
|
|
3999
|
+
"""Parse Gemma 4's tool-call DSL arg body into a Python dict.
|
|
4000
|
+
|
|
4001
|
+
Input shape (between the `{` and `}` of the DSL):
|
|
4002
|
+
key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
|
|
4003
|
+
|
|
4004
|
+
Strategy: replace `<|"|>` with `"`, wrap unquoted keys in quotes, then
|
|
4005
|
+
feed to json.loads. Returns None on parse failure (caller decides).
|
|
4006
|
+
"""
|
|
4007
|
+
if not raw or not raw.strip():
|
|
4008
|
+
return {}
|
|
4009
|
+
s = raw.replace('<|"|>', '"')
|
|
4010
|
+
# Wrap unquoted keys: `key:` -> `"key":` (only at start or after `,` / `{` / whitespace).
|
|
4011
|
+
s = re.sub(r"(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:", r'\1"\2":', s)
|
|
4012
|
+
s = "{" + s + "}"
|
|
4013
|
+
try:
|
|
4014
|
+
parsed = json.loads(s)
|
|
4015
|
+
return parsed if isinstance(parsed, dict) else None
|
|
4016
|
+
except json.JSONDecodeError:
|
|
4017
|
+
return None
|
|
4018
|
+
|
|
4019
|
+
|
|
4020
|
+
def _schema_match_tool(payload: dict, available_tools: list[dict]) -> str | None:
|
|
4021
|
+
"""Match a bare-args dict against available tool schemas.
|
|
4022
|
+
|
|
4023
|
+
Score each tool by:
|
|
4024
|
+
- +10 per required field present in payload
|
|
4025
|
+
- +1 per optional property present
|
|
4026
|
+
- -5 per payload key NOT in tool's properties
|
|
4027
|
+
- -100 if any required field is missing
|
|
4028
|
+
Return the name of the highest-scoring tool, or None if no clear match.
|
|
4029
|
+
"""
|
|
4030
|
+
if not isinstance(payload, dict) or not available_tools:
|
|
4031
|
+
return None
|
|
4032
|
+
payload_keys = set(payload.keys())
|
|
4033
|
+
best_name = None
|
|
4034
|
+
best_score = 0
|
|
4035
|
+
for tool in available_tools:
|
|
4036
|
+
if not isinstance(tool, dict):
|
|
4037
|
+
continue
|
|
4038
|
+
# Anthropic tools format: {"name": ..., "input_schema": {...}}
|
|
4039
|
+
# OpenAI format: {"type": "function", "function": {"name": ..., "parameters": {...}}}
|
|
4040
|
+
name = tool.get("name")
|
|
4041
|
+
schema = tool.get("input_schema")
|
|
4042
|
+
if name is None and isinstance(tool.get("function"), dict):
|
|
4043
|
+
name = tool["function"].get("name")
|
|
4044
|
+
schema = tool["function"].get("parameters")
|
|
4045
|
+
if not isinstance(name, str) or not isinstance(schema, dict):
|
|
4046
|
+
continue
|
|
4047
|
+
properties = schema.get("properties") if isinstance(schema.get("properties"), dict) else {}
|
|
4048
|
+
required = set(schema.get("required") or [])
|
|
4049
|
+
prop_keys = set(properties.keys())
|
|
4050
|
+
score = 0
|
|
4051
|
+
missing_required = required - payload_keys
|
|
4052
|
+
if missing_required:
|
|
4053
|
+
score -= 100
|
|
4054
|
+
score += 10 * len(required & payload_keys)
|
|
4055
|
+
score += len((payload_keys & prop_keys) - required)
|
|
4056
|
+
score -= 5 * len(payload_keys - prop_keys)
|
|
4057
|
+
if score > best_score:
|
|
4058
|
+
best_score = score
|
|
4059
|
+
best_name = name
|
|
4060
|
+
return best_name if best_score >= 10 else None
|
|
4061
|
+
|
|
4062
|
+
|
|
4063
|
+
def _extract_gemma4_tool_calls(
    text: str, available_tools: list[dict] | None = None
) -> tuple[list[dict], str]:
    """Parse Gemma 4 tool-call emissions out of *text*.

    Three formats handled, in order:
      1. Native DSL: ``<|tool_call>call:N{...}<tool_call|>``
      2. Markdown with name: ```json\\n{"name": "N", "arguments": {...}}\\n```
      3. Markdown bare-args + ``available_tools`` provided — schema-match
         against tool definitions (e.g. the model emits ``{"city": "Paris"}``
         for a get_weather call instead of
         ``{"name": "get_weather", "arguments": {"city": "Paris"}}``).
         Without ``available_tools``, bare-args blocks pass through as text.

    Markdown blocks are only inspected when no native DSL call was recovered.
    When ``available_tools`` yields at least one tool name, a fenced JSON
    object whose ``"name"`` is not one of those tools is NOT taken as a named
    call (it is ordinary JSON that happens to have a "name" key, such as a
    manifest snippet); it is only considered via the bare-args schema match.

    Args:
        text: Assistant text that may contain Gemma 4 tool-call markup.
        available_tools: Tool definitions from the request, in Anthropic or
            OpenAI shape. Optional.

    Returns ``(extracted_openai_tool_calls, remaining_text)``; when nothing
    is recovered, ``([], text)`` with *text* unchanged.
    """
    if "<|tool_call>" not in text and "```" not in text:
        return [], text

    extracted: list[dict] = []
    matched_spans: list[tuple[int, int]] = []

    # Pattern 1: native DSL
    for m in _GEMMA4_TOOL_CALL_DSL_RE.finditer(text):
        args = _parse_gemma4_dsl_args(m.group(2) or "")
        if args is None:
            # DSL body unparseable; skip and let model retry next turn.
            continue
        extracted.append(
            {
                "id": f"toolu_{uuid.uuid4().hex[:24]}",
                "type": "function",
                "function": {
                    "name": m.group(1),
                    "arguments": json.dumps(args, separators=(",", ":")),
                },
            }
        )
        matched_spans.append(m.span())

    # Patterns 2 and 3: markdown JSON fallback (only if no DSL hit)
    if not extracted and "```" in text:
        # Names of the tools actually on offer, used to reject fenced JSON
        # whose "name" key is unrelated to tool calling.
        known_names: set[str] = set()
        for tool in available_tools or []:
            if not isinstance(tool, dict):
                continue
            tool_name = tool.get("name")
            if tool_name is None and isinstance(tool.get("function"), dict):
                tool_name = tool["function"].get("name")
            if isinstance(tool_name, str) and tool_name:
                known_names.add(tool_name)

        for m in _GEMMA4_MARKDOWN_JSON_RE.finditer(text):
            raw_json = m.group(1)
            try:
                payload = json.loads(raw_json)
            except json.JSONDecodeError:
                # Try a JSON repair like the Qwen path does
                repaired = _repair_tool_call_json(raw_json)
                if not repaired:
                    continue
                try:
                    payload = json.loads(repaired)
                except json.JSONDecodeError:
                    continue
            if not isinstance(payload, dict):
                continue

            name = payload.get("name")
            is_named_call = isinstance(name, str) and bool(name)
            if is_named_call and known_names and name not in known_names:
                is_named_call = False

            if is_named_call:
                # Standard {name, arguments} form
                arguments_obj = payload.get("arguments", payload.get("args", {}))
            elif available_tools:
                # Bare-args block — try schema-matching against available tools
                matched = _schema_match_tool(payload, available_tools)
                if matched is None:
                    continue
                name = matched
                arguments_obj = payload  # whole payload IS the args
                logger.info(
                    "TOOL CALL EXTRACTION: schema-matched bare-args markdown JSON to tool '%s' (keys=%s)",
                    name,
                    sorted(payload.keys())[:6],
                )
            else:
                # No name, no tools to match against — pass through as text
                continue

            if isinstance(arguments_obj, dict):
                arguments = json.dumps(arguments_obj, separators=(",", ":"))
            elif isinstance(arguments_obj, str):
                arguments = arguments_obj
            else:
                arguments = "{}"
            extracted.append(
                {
                    "id": f"toolu_{uuid.uuid4().hex[:24]}",
                    "type": "function",
                    "function": {"name": name, "arguments": arguments},
                }
            )
            matched_spans.append(m.span())

    if not extracted:
        return [], text

    # Strip matched spans from text (in reverse to keep indices valid)
    remaining = text
    for start, end in sorted(matched_spans, key=lambda s: -s[0]):
        remaining = remaining[:start] + remaining[end:]
    remaining = remaining.strip()

    logger.info(
        "TOOL CALL EXTRACTION: recovered %d Gemma 4 tool call(s) from text content",
        len(extracted),
    )
    return extracted, remaining
|
|
4173
|
+
|
|
4174
|
+
|
|
4175
|
+
# ---------------------------------------------------------------------------
|
|
4176
|
+
# Gemma 4 tool-call DSL extractors
|
|
4177
|
+
# ---------------------------------------------------------------------------
|
|
4178
|
+
# Gemma 4's chat template emits tool calls as:
|
|
4179
|
+
# <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
|
|
4180
|
+
# Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
|
|
4181
|
+
# Llama-server's --jinja autoparser usually converts these to standard
|
|
4182
|
+
# OpenAI tool_calls, but the raw form can leak through on (a) malformed
|
|
4183
|
+
# emissions, (b) finalize turns, (c) non-tool-template requests where the
|
|
4184
|
+
# model still tries to call a tool. This parser catches those cases.
|
|
4185
|
+
#
|
|
4186
|
+
# Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
|
|
4187
|
+
# markdown blocks when it doesn't trust the template — observed when
|
|
4188
|
+
# tool_choice was forced 'required' but the model lacked confidence in the
|
|
4189
|
+
# native format. Treated as a tool call when the JSON has a "name"; bare-args
# JSON is only accepted when it schema-matches one of the request's tools.
|
|
4190
|
+
_GEMMA4_TOOL_CALL_DSL_RE = re.compile(
|
|
4191
|
+
r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
|
|
4192
|
+
re.DOTALL,
|
|
4193
|
+
)
|
|
4194
|
+
# Markdown JSON code-block fallback. Group 1 = JSON content (may include
|
|
4195
|
+
# leading/trailing whitespace inside the block).
|
|
4196
|
+
_GEMMA4_MARKDOWN_JSON_RE = re.compile(
|
|
4197
|
+
r"```(?:json)?\s*(\{.*?\})\s*```",
|
|
4198
|
+
re.DOTALL,
|
|
4199
|
+
)
|
|
4200
|
+
|
|
4201
|
+
|
|
4202
|
+
def _parse_gemma4_dsl_args(raw: str) -> dict | None:
|
|
4203
|
+
"""Parse Gemma 4's tool-call DSL arg body into a Python dict.
|
|
4204
|
+
|
|
4205
|
+
Input shape (between the `{` and `}` of the DSL):
|
|
4206
|
+
key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
|
|
4207
|
+
|
|
4208
|
+
Strategy: replace `<|"|>` with `"`, wrap unquoted keys in quotes, then
|
|
4209
|
+
feed to json.loads. Returns None on parse failure (caller decides).
|
|
4210
|
+
"""
|
|
4211
|
+
if not raw or not raw.strip():
|
|
4212
|
+
return {}
|
|
4213
|
+
s = raw.replace('<|"|>', '"')
|
|
4214
|
+
# Wrap unquoted keys: `key:` -> `"key":` (only at start or after `,` / `{` / whitespace).
|
|
4215
|
+
s = re.sub(r"(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:", r'\1"\2":', s)
|
|
4216
|
+
s = "{" + s + "}"
|
|
4217
|
+
try:
|
|
4218
|
+
parsed = json.loads(s)
|
|
4219
|
+
return parsed if isinstance(parsed, dict) else None
|
|
4220
|
+
except json.JSONDecodeError:
|
|
4221
|
+
return None
|
|
4222
|
+
|
|
4223
|
+
|
|
3486
4224
|
|
|
3487
4225
|
def _repair_tool_call_json(raw: str) -> str | None:
|
|
3488
4226
|
"""Attempt to repair common garbled JSON in tool call payloads.
|
|
@@ -3525,7 +4263,9 @@ def _repair_tool_call_json(raw: str) -> str | None:
|
|
|
3525
4263
|
return None
|
|
3526
4264
|
|
|
3527
4265
|
|
|
3528
|
-
def _extract_tool_calls_from_text(
|
|
4266
|
+
def _extract_tool_calls_from_text(
|
|
4267
|
+
text: str, available_tools: list[dict] | None = None
|
|
4268
|
+
) -> tuple[list[dict], str]:
|
|
3529
4269
|
"""Parse ``<tool_call>{...}</tool_call>`` blocks out of *text*.
|
|
3530
4270
|
|
|
3531
4271
|
Returns a tuple of (extracted_openai_tool_calls, remaining_text).
|
|
@@ -3535,8 +4275,18 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
|
|
|
3535
4275
|
|
|
3536
4276
|
The *remaining_text* has the matched ``<tool_call>`` blocks removed.
|
|
3537
4277
|
If no valid blocks are found the original text is returned unchanged.
|
|
4278
|
+
Falls back to Hermes-style ``<function=X><parameter=K>V</parameter></function>``
|
|
4279
|
+
for older Qwen/Llama fine-tunes, then to Gemma 4's
|
|
4280
|
+
``<|tool_call>call:N{...}<tool_call|>`` DSL and ```json``` markdown
|
|
4281
|
+
blocks. Anything not matching any known format falls through unchanged
|
|
4282
|
+
so plain prose passes the parser without mutation.
|
|
3538
4283
|
"""
|
|
3539
|
-
if
|
|
4284
|
+
if (
|
|
4285
|
+
"<tool_call>" not in text
|
|
4286
|
+
and "<function=" not in text
|
|
4287
|
+
and "<|tool_call>" not in text
|
|
4288
|
+
and "```" not in text
|
|
4289
|
+
):
|
|
3540
4290
|
return [], text
|
|
3541
4291
|
|
|
3542
4292
|
extracted: list[dict] = []
|
|
@@ -3572,14 +4322,24 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
|
|
|
3572
4322
|
|
|
3573
4323
|
extracted.append(
|
|
3574
4324
|
{
|
|
3575
|
-
"id": f"
|
|
4325
|
+
"id": f"toolu_{uuid.uuid4().hex[:24]}",
|
|
3576
4326
|
"type": "function",
|
|
3577
4327
|
"function": {"name": name, "arguments": arguments},
|
|
3578
4328
|
}
|
|
3579
4329
|
)
|
|
3580
4330
|
|
|
3581
4331
|
if not extracted:
|
|
3582
|
-
|
|
4332
|
+
# Fall back to Hermes format. This catches Qwen emissions on finalize
|
|
4333
|
+
# turns where grammar is not applied and the model defaults to its
|
|
4334
|
+
# base training's <function=X><parameter=K>V</parameter></function>
|
|
4335
|
+
# format instead of the <tool_call>{JSON}</tool_call> Qwen template
|
|
4336
|
+
# format. Without this path, tool_calls=[] and the client halts.
|
|
4337
|
+
hermes_calls, hermes_remaining = _extract_hermes_tool_calls(text)
|
|
4338
|
+
if hermes_calls:
|
|
4339
|
+
return hermes_calls, hermes_remaining
|
|
4340
|
+
# Then try Gemma 4's DSL + markdown-JSON fallback. Anything still
|
|
4341
|
+
# not matching falls through as plain text.
|
|
4342
|
+
return _extract_gemma4_tool_calls(text, available_tools=available_tools)
|
|
3583
4343
|
|
|
3584
4344
|
# Strip matched tool_call blocks from the text
|
|
3585
4345
|
remaining = _TOOL_CALL_XML_RE.sub("", text).strip()
|
|
@@ -4550,6 +5310,16 @@ def _classify_tool_response_issue(
|
|
|
4550
5310
|
if "tools" not in anthropic_body:
|
|
4551
5311
|
return ToolResponseIssue()
|
|
4552
5312
|
|
|
5313
|
+
# When the upstream response was cut off by max_tokens (finish_reason=length),
|
|
5314
|
+
# any garbled/unbalanced-brace appearance in the tool args is almost
|
|
5315
|
+
# certainly truncation, not degenerate generation. Re-classify such
|
|
5316
|
+
# issues as "truncated_tool_args" so the caller can still retry (with a
|
|
5317
|
+
# larger cap) but WITHOUT triggering the forced-tool dampener, which
|
|
5318
|
+
# otherwise penalises a perfectly-recoverable truncation event.
|
|
5319
|
+
choice_for_finish, _ = _extract_openai_choice(openai_resp)
|
|
5320
|
+
finish_reason = (choice_for_finish.get("finish_reason") or "").lower()
|
|
5321
|
+
was_truncated = finish_reason == "length"
|
|
5322
|
+
|
|
4553
5323
|
if _is_malformed_tool_response(openai_resp, anthropic_body):
|
|
4554
5324
|
return ToolResponseIssue(
|
|
4555
5325
|
kind="malformed_payload",
|
|
@@ -4593,15 +5363,107 @@ def _classify_tool_response_issue(
|
|
|
4593
5363
|
allowed_tools,
|
|
4594
5364
|
)
|
|
4595
5365
|
if issue.has_issue():
|
|
5366
|
+
# Downgrade invalid_tool_args to truncated_tool_args when the
|
|
5367
|
+
# response hit max_tokens — retry path still fires but the
|
|
5368
|
+
# dampener/streak counters stay cold.
|
|
5369
|
+
if was_truncated and issue.kind == "invalid_tool_args":
|
|
5370
|
+
return ToolResponseIssue(
|
|
5371
|
+
kind="truncated_tool_args",
|
|
5372
|
+
reason=(
|
|
5373
|
+
f"tool call for '{tool_name}' truncated by max_tokens "
|
|
5374
|
+
f"({issue.reason})"
|
|
5375
|
+
),
|
|
5376
|
+
retry_hint=issue.retry_hint,
|
|
5377
|
+
)
|
|
4596
5378
|
return issue
|
|
4597
5379
|
|
|
4598
5380
|
return ToolResponseIssue()
|
|
4599
5381
|
|
|
4600
5382
|
|
|
5383
|
+
# 2026-05-12: Regex for the tool-XML tag scanner. Captures opening vs
|
|
5384
|
+
# closing form (group 1: "/" or ""), the tag name (group 2), and any
|
|
5385
|
+
# attributes (group 3). Matches <parameter>, <parameter=key>,
|
|
5386
|
+
# <parameter name="key">, </parameter>, <function=name>, </function>.
|
|
5387
|
+
_TOOL_XML_TAG_RE = re.compile(r"<(/?)(parameter|function)\b([^>]*)>")


def _strip_orphan_tool_xml(text: str) -> str:
    """Drop ``</parameter>`` / ``</function>`` closers that nothing opened.

    The text is scanned left to right while counting currently-open
    ``parameter`` and ``function`` tags separately. Opening tags are always
    kept and raise their counter; a closing tag is kept (and lowers its
    counter) only while that counter is positive, otherwise it is removed.
    All other text is passed through untouched.

    Text without any ``</parameter`` or ``</function`` substring is returned
    as-is without scanning.
    """
    if "</parameter" not in text and "</function" not in text:
        return text

    # Per-tag count of openers seen so far that have not been closed.
    pending = {"parameter": 0, "function": 0}

    def _keep_or_drop(tag_match):
        kind = tag_match.group(2)
        if tag_match.group(1) != "/":
            pending[kind] += 1
            return tag_match.group(0)
        if pending[kind] > 0:
            pending[kind] -= 1
            return tag_match.group(0)
        # Orphan closer: replace with nothing.
        return ""

    return _TOOL_XML_TAG_RE.sub(_keep_or_drop, text)
|
|
5434
|
+
|
|
5435
|
+
|
|
4601
5436
|
def _looks_malformed_tool_payload(text: str) -> bool:
|
|
4602
5437
|
if not text:
|
|
4603
5438
|
return False
|
|
4604
5439
|
|
|
5440
|
+
# 2026-05-12: Strip balanced <think>...</think> blocks before applying
|
|
5441
|
+
# the heuristic. Qwen3.6 emits <think> blocks regardless of
|
|
5442
|
+
# enable_thinking, and two scenarios were tripping false positives:
|
|
5443
|
+
# 1. Meta-tool reasoning inside the thinking ({"description":...},
|
|
5444
|
+
# repeated "must call a tool") triggering the structural-marker
|
|
5445
|
+
# and policy-echo branches.
|
|
5446
|
+
# 2. The model wrapping its ENTIRE answer inside a single <think>
|
|
5447
|
+
# block (markdown reports, tables) — the </think> structural
|
|
5448
|
+
# marker plus content-resembling-policy then fires.
|
|
5449
|
+
# Downstream response processing surfaces <think> content as proper
|
|
5450
|
+
# Anthropic `thinking` blocks via _THINKING_BLOCK_RE, so stripping
|
|
5451
|
+
# here loses no information. Unbalanced/stray </think> without a
|
|
5452
|
+
# matching opener is NOT stripped — those remain genuinely malformed.
|
|
5453
|
+
if "<think>" in text and "</think>" in text:
|
|
5454
|
+
text = _THINKING_BLOCK_RE.sub("", text)
|
|
5455
|
+
if not text.strip():
|
|
5456
|
+
return False
|
|
5457
|
+
|
|
5458
|
+
# 2026-05-12: Strip orphan </parameter> and </function> closers that
|
|
5459
|
+
# have no matching opener. Qwen3.6 leaks these training residuals
|
|
5460
|
+
# after its visible answer when forced into tool_choice='required'
|
|
5461
|
+
# with no valid tool to call. Real malformed tool-call attempts retain
|
|
5462
|
+
# their opener and still trip the primary_markers check below.
|
|
5463
|
+
text = _strip_orphan_tool_xml(text)
|
|
5464
|
+
if not text.strip():
|
|
5465
|
+
return False
|
|
5466
|
+
|
|
4605
5467
|
lowered = text.lower()
|
|
4606
5468
|
if _contains_tool_call_apology(text):
|
|
4607
5469
|
return True
|
|
@@ -4836,13 +5698,17 @@ def _build_malformed_retry_body(
|
|
|
4836
5698
|
retry_instruction = (
|
|
4837
5699
|
"Your previous response had invalid tool-call formatting. "
|
|
4838
5700
|
"Respond with exactly one valid tool call using the provided tools. "
|
|
4839
|
-
"Do not output prose, markdown, XML tags, or schema snippets."
|
|
5701
|
+
"Do not output prose, markdown, XML tags, or schema snippets. "
|
|
5702
|
+
"Do NOT use <think>...</think> blocks or internal reasoning — "
|
|
5703
|
+
"emit the tool_call object as the very first token of your response."
|
|
4840
5704
|
)
|
|
4841
5705
|
else:
|
|
4842
5706
|
retry_instruction = (
|
|
4843
5707
|
"Your previous response had invalid tool-call formatting. "
|
|
4844
5708
|
"If a tool is needed, emit exactly one valid tool call with strict JSON arguments. "
|
|
4845
|
-
"If no tool is needed for this turn, return concise plain text with no protocol tags."
|
|
5709
|
+
"If no tool is needed for this turn, return concise plain text with no protocol tags. "
|
|
5710
|
+
"Do NOT use <think>...</think> blocks — start your response directly with "
|
|
5711
|
+
"either a tool_call or the plain text answer."
|
|
4846
5712
|
)
|
|
4847
5713
|
|
|
4848
5714
|
malformed_retry_instruction = {
|
|
@@ -5023,7 +5889,7 @@ async def _apply_unexpected_end_turn_guardrail(
|
|
|
5023
5889
|
)
|
|
5024
5890
|
if retry_resp.status_code == 200:
|
|
5025
5891
|
retry_json = retry_resp.json()
|
|
5026
|
-
_maybe_extract_text_tool_calls(retry_json)
|
|
5892
|
+
_maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
|
|
5027
5893
|
retry_choice, retry_message = _extract_openai_choice(retry_json)
|
|
5028
5894
|
if _openai_has_valid_tool_calls(retry_json, anthropic_body):
|
|
5029
5895
|
logger.info("GUARDRAIL: retry produced tool_use; using retried response")
|
|
@@ -5112,8 +5978,12 @@ async def _apply_malformed_tool_guardrail(
|
|
|
5112
5978
|
)
|
|
5113
5979
|
return working_resp
|
|
5114
5980
|
|
|
5115
|
-
#
|
|
5116
|
-
|
|
5981
|
+
# Only set last_response_garbled for TRUE degenerate generation, not
|
|
5982
|
+
# for responses merely truncated by max_tokens — otherwise the next
|
|
5983
|
+
# turn gets hit with the garbled_cap (smaller max_tokens) and the
|
|
5984
|
+
# problem compounds.
|
|
5985
|
+
if issue.kind != "truncated_tool_args":
|
|
5986
|
+
monitor.last_response_garbled = True
|
|
5117
5987
|
|
|
5118
5988
|
if issue.kind == "malformed_payload":
|
|
5119
5989
|
monitor.malformed_tool_streak += 1
|
|
@@ -5121,7 +5991,12 @@ async def _apply_malformed_tool_guardrail(
|
|
|
5121
5991
|
monitor.invalid_tool_call_streak += 1
|
|
5122
5992
|
monitor.arg_preflight_rejections += 1
|
|
5123
5993
|
|
|
5124
|
-
|
|
5994
|
+
# Truncation is a max_tokens accident, not the model misbehaving: don't
|
|
5995
|
+
# feed it to the forced-tool dampener, which would otherwise relax
|
|
5996
|
+
# tool_choice on the very next turn and let the model trail off with
|
|
5997
|
+
# text (the exact failure mode that stopped opencode).
|
|
5998
|
+
if issue.kind != "truncated_tool_args":
|
|
5999
|
+
monitor.maybe_activate_forced_tool_dampener(issue.kind)
|
|
5125
6000
|
excerpt = _openai_message_text(working_resp)[:220].replace("\n", " ")
|
|
5126
6001
|
# Option 2: Log garbled argument content for diagnostics
|
|
5127
6002
|
arg_excerpt = ""
|
|
@@ -5194,7 +6069,7 @@ async def _apply_malformed_tool_guardrail(
|
|
|
5194
6069
|
continue
|
|
5195
6070
|
|
|
5196
6071
|
retry_json = retry_resp.json()
|
|
5197
|
-
_maybe_extract_text_tool_calls(retry_json)
|
|
6072
|
+
_maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
|
|
5198
6073
|
retry_working = retry_json
|
|
5199
6074
|
retry_repairs = 0
|
|
5200
6075
|
if PROXY_TOOL_ARGS_PREFLIGHT and _openai_has_tool_calls(retry_json):
|
|
@@ -5226,15 +6101,20 @@ async def _apply_malformed_tool_guardrail(
|
|
|
5226
6101
|
)
|
|
5227
6102
|
|
|
5228
6103
|
if not retry_issue.has_issue():
|
|
5229
|
-
|
|
5230
|
-
|
|
5231
|
-
|
|
6104
|
+
# 2026-05-12: Fix #2 — do NOT reset malformed/invalid/miss streaks
|
|
6105
|
+
# to 0 on retry-success. Previously, sessions stuck in a
|
|
6106
|
+
# malformed→retry-success loop never accumulated enough streak to
|
|
6107
|
+
# trigger the forced-tool dampener. Healthy responses with real
|
|
6108
|
+
# tool_calls still reset the streak via the upstream no-issue path
|
|
6109
|
+
# (~L5655), so genuine recovery still resets counters; only
|
|
6110
|
+
# repeated retry-recoveries persist toward the dampener.
|
|
5232
6111
|
monitor.last_response_garbled = False
|
|
5233
6112
|
logger.info(
|
|
5234
|
-
"TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d",
|
|
6113
|
+
"TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d malformed_streak=%d",
|
|
5235
6114
|
current_issue.kind,
|
|
5236
6115
|
attempt + 1,
|
|
5237
6116
|
attempts,
|
|
6117
|
+
monitor.malformed_tool_streak,
|
|
5238
6118
|
)
|
|
5239
6119
|
if retry_repairs > 0:
|
|
5240
6120
|
monitor.arg_preflight_repairs += retry_repairs
|
|
@@ -5259,7 +6139,10 @@ async def _apply_malformed_tool_guardrail(
|
|
|
5259
6139
|
if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
|
|
5260
6140
|
failing_tools.add(fn_name)
|
|
5261
6141
|
|
|
5262
|
-
|
|
6142
|
+
# Truncation on retry is still a max_tokens problem, not a model
|
|
6143
|
+
# misbehaviour — don't dampen. The outer retry loop will try again.
|
|
6144
|
+
if retry_issue.kind != "truncated_tool_args":
|
|
6145
|
+
monitor.maybe_activate_forced_tool_dampener(retry_issue.kind)
|
|
5263
6146
|
logger.warning(
|
|
5264
6147
|
"TOOL RESPONSE RETRY invalid: session=%s attempt=%d/%d kind=%s reason=%s",
|
|
5265
6148
|
session_id,
|
|
@@ -5440,11 +6323,19 @@ def _maybe_apply_session_contamination_breaker(
|
|
|
5440
6323
|
# ===========================================================================
|
|
5441
6324
|
|
|
5442
6325
|
|
|
5443
|
-
def _maybe_extract_text_tool_calls(
|
|
6326
|
+
def _maybe_extract_text_tool_calls(
|
|
6327
|
+
openai_resp: dict, anthropic_tools: list[dict] | None = None
|
|
6328
|
+
) -> dict:
|
|
5444
6329
|
"""Mutate *openai_resp* in-place: if the message has no structured
|
|
5445
|
-
``tool_calls`` but contains
|
|
5446
|
-
and promote to real ``tool_calls`` on the message.
|
|
5447
|
-
|
|
6330
|
+
``tool_calls`` but contains tool-call markup in text, extract them
|
|
6331
|
+
and promote to real ``tool_calls`` on the message.
|
|
6332
|
+
|
|
6333
|
+
*anthropic_tools* (optional): list of tool definitions from the original
|
|
6334
|
+
Anthropic request. Enables schema-matching of bare-args markdown JSON
|
|
6335
|
+
blocks emitted by Gemma 4 cold turns (fix D). Without it, bare-args
|
|
6336
|
+
blocks pass through as text.
|
|
6337
|
+
|
|
6338
|
+
Returns the (possibly-mutated) response for chaining."""
|
|
5448
6339
|
choice = (openai_resp.get("choices") or [{}])[0]
|
|
5449
6340
|
message = choice.get("message", {})
|
|
5450
6341
|
|
|
@@ -5453,10 +6344,20 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
|
|
|
5453
6344
|
return openai_resp
|
|
5454
6345
|
|
|
5455
6346
|
text = message.get("content", "")
|
|
5456
|
-
if not isinstance(text, str)
|
|
6347
|
+
if not isinstance(text, str):
|
|
6348
|
+
return openai_resp
|
|
6349
|
+
# Quick early-exit if no markers present (matches dispatcher guard)
|
|
6350
|
+
if (
|
|
6351
|
+
"<tool_call>" not in text
|
|
6352
|
+
and "<function=" not in text
|
|
6353
|
+
and "<|tool_call>" not in text
|
|
6354
|
+
and "```" not in text
|
|
6355
|
+
):
|
|
5457
6356
|
return openai_resp
|
|
5458
6357
|
|
|
5459
|
-
extracted, remaining = _extract_tool_calls_from_text(
|
|
6358
|
+
extracted, remaining = _extract_tool_calls_from_text(
|
|
6359
|
+
text, available_tools=anthropic_tools
|
|
6360
|
+
)
|
|
5460
6361
|
if not extracted:
|
|
5461
6362
|
return openai_resp
|
|
5462
6363
|
|
|
@@ -5591,8 +6492,43 @@ def _inject_synthetic_continuation(
|
|
|
5591
6492
|
return anthropic_resp
|
|
5592
6493
|
|
|
5593
6494
|
|
|
5594
|
-
|
|
5595
|
-
|
|
6495
|
+
_THINKING_BLOCK_RE = re.compile(r"<think>(.*?)</think>\s*", re.DOTALL)
|
|
6496
|
+
|
|
6497
|
+
|
|
6498
|
+
def _extract_thinking_block(text: str) -> tuple[str | None, str]:
|
|
6499
|
+
"""Extract Qwen-style ``<think>...</think>`` blocks from *text*.
|
|
6500
|
+
|
|
6501
|
+
Returns ``(thinking_content, remaining_text)``. If no ``<think>`` tag is
|
|
6502
|
+
present, returns ``(None, text)`` unchanged. Multiple thinking blocks
|
|
6503
|
+
are concatenated. Trailing whitespace after each block is consumed so
|
|
6504
|
+
the remaining text starts cleanly with the model's actual answer.
|
|
6505
|
+
"""
|
|
6506
|
+
if "<think>" not in text:
|
|
6507
|
+
return None, text
|
|
6508
|
+
parts: list[str] = []
|
|
6509
|
+
def collect(m: re.Match) -> str:
|
|
6510
|
+
parts.append(m.group(1).strip())
|
|
6511
|
+
return ""
|
|
6512
|
+
remaining = _THINKING_BLOCK_RE.sub(collect, text)
|
|
6513
|
+
if not parts:
|
|
6514
|
+
return None, text
|
|
6515
|
+
return "\n\n".join(p for p in parts if p), remaining.lstrip()
|
|
6516
|
+
|
|
6517
|
+
|
|
6518
|
+
def openai_to_anthropic_response(
|
|
6519
|
+
openai_resp: dict, model: str, expose_thinking: bool = True
|
|
6520
|
+
) -> dict:
|
|
6521
|
+
"""Convert an OpenAI Chat Completions response to Anthropic Messages format.
|
|
6522
|
+
|
|
6523
|
+
*expose_thinking*: when True, surface ``<think>...</think>`` content from
|
|
6524
|
+
the upstream as Anthropic ``{"type": "thinking"}`` blocks. When False
|
|
6525
|
+
(Anthropic default — client didn't opt in), strip thinking content
|
|
6526
|
+
from the response entirely so the client only sees the actual answer.
|
|
6527
|
+
Qwen's chat template seeds the model into thinking regardless of the
|
|
6528
|
+
``enable_thinking`` request param, so even thinking-off responses
|
|
6529
|
+
typically still contain ``<think>`` blocks; this flag controls whether
|
|
6530
|
+
they're surfaced as Anthropic blocks or silently consumed.
|
|
6531
|
+
"""
|
|
5596
6532
|
# First: try to recover tool calls trapped in text XML tags
|
|
5597
6533
|
_maybe_extract_text_tool_calls(openai_resp)
|
|
5598
6534
|
# Second: strip garbled/degenerate tool call arguments
|
|
@@ -5603,20 +6539,46 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
5603
6539
|
finish = choice.get("finish_reason", "stop")
|
|
5604
6540
|
|
|
5605
6541
|
content = []
|
|
6542
|
+
# Surface Qwen's <think>...</think> output as Anthropic-style thinking
|
|
6543
|
+
# blocks (Anthropic extended-thinking API shape:
|
|
6544
|
+
# {"type": "thinking", "thinking": "...", "signature": ""}).
|
|
6545
|
+
# Clients that don't request thinking simply ignore the block; clients
|
|
6546
|
+
# that do (Claude Code) render them in the thinking pane.
|
|
6547
|
+
raw_text = ""
|
|
5606
6548
|
if message.get("content"):
|
|
5607
6549
|
raw_text = (
|
|
5608
6550
|
message["content"]
|
|
5609
6551
|
if isinstance(message["content"], str)
|
|
5610
6552
|
else str(message["content"])
|
|
5611
6553
|
)
|
|
5612
|
-
|
|
5613
|
-
|
|
6554
|
+
# Some llama-server builds emit the model's reasoning into a separate
|
|
6555
|
+
# `reasoning_content` field instead of inline <think> tags. Surface
|
|
6556
|
+
# that too so the proxy is consistent regardless of upstream behaviour.
|
|
6557
|
+
inline_thinking, body_text = _extract_thinking_block(raw_text)
|
|
6558
|
+
sidecar_thinking = message.get("reasoning_content") or message.get("reasoning")
|
|
6559
|
+
thinking_chunks: list[str] = []
|
|
6560
|
+
if isinstance(sidecar_thinking, str) and sidecar_thinking.strip():
|
|
6561
|
+
thinking_chunks.append(sidecar_thinking.strip())
|
|
6562
|
+
if inline_thinking:
|
|
6563
|
+
thinking_chunks.append(inline_thinking)
|
|
6564
|
+
if thinking_chunks and expose_thinking:
|
|
6565
|
+
content.append(
|
|
6566
|
+
{
|
|
6567
|
+
"type": "thinking",
|
|
6568
|
+
"thinking": "\n\n".join(thinking_chunks),
|
|
6569
|
+
"signature": "",
|
|
6570
|
+
}
|
|
6571
|
+
)
|
|
6572
|
+
|
|
6573
|
+
if body_text:
|
|
6574
|
+
sanitized_text = _sanitize_tool_call_apology_text(body_text)
|
|
6575
|
+
if sanitized_text != body_text:
|
|
5614
6576
|
logger.warning(
|
|
5615
6577
|
"SANITIZE: replaced known malformed tool-call apology text in assistant response"
|
|
5616
6578
|
)
|
|
5617
6579
|
# Option 1: Strip residual <tool_call> XML that wasn't extracted
|
|
5618
6580
|
sanitized_text = _strip_residual_tool_call_xml(sanitized_text)
|
|
5619
|
-
if sanitized_text !=
|
|
6581
|
+
if sanitized_text != body_text and "<tool_call>" in body_text:
|
|
5620
6582
|
logger.warning(
|
|
5621
6583
|
"SANITIZE: stripped residual <tool_call> XML from text content"
|
|
5622
6584
|
)
|
|
@@ -5641,10 +6603,21 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
5641
6603
|
logger.warning(
|
|
5642
6604
|
"BASH SAFETY: stripped standalone protocol-tag lines from command before tool execution"
|
|
5643
6605
|
)
|
|
6606
|
+
# Normalise IDs to Anthropic spec (toolu_ prefix). Upstream
|
|
6607
|
+
# llama-server returns opaque IDs without prefix; clients that
|
|
6608
|
+
# validate prefix would reject. Strip-and-restamp here, restore in
|
|
6609
|
+
# anthropic_to_openai_messages() when client sends tool_result back.
|
|
6610
|
+
upstream_id = tc.get("id", "")
|
|
6611
|
+
if upstream_id.startswith("toolu_"):
|
|
6612
|
+
tool_use_id = upstream_id
|
|
6613
|
+
elif upstream_id:
|
|
6614
|
+
tool_use_id = f"toolu_{upstream_id}"
|
|
6615
|
+
else:
|
|
6616
|
+
tool_use_id = f"toolu_{uuid.uuid4().hex[:24]}"
|
|
5644
6617
|
content.append(
|
|
5645
6618
|
{
|
|
5646
6619
|
"type": "tool_use",
|
|
5647
|
-
"id":
|
|
6620
|
+
"id": tool_use_id,
|
|
5648
6621
|
"name": fn.get("name", ""),
|
|
5649
6622
|
"input": args,
|
|
5650
6623
|
}
|
|
@@ -6341,7 +7314,7 @@ async def messages(request: Request):
|
|
|
6341
7314
|
)
|
|
6342
7315
|
except Exception as exc:
|
|
6343
7316
|
# Check if upstream is hung before returning error
|
|
6344
|
-
await _check_slot_hang(
|
|
7317
|
+
await _check_slot_hang(LLAMA_CPP_BASE.replace("/v1", "/slots"))
|
|
6345
7318
|
return Response(
|
|
6346
7319
|
content=json.dumps(
|
|
6347
7320
|
{
|
|
@@ -6356,6 +7329,23 @@ async def messages(request: Request):
|
|
|
6356
7329
|
media_type="application/json",
|
|
6357
7330
|
)
|
|
6358
7331
|
|
|
7332
|
+
if strict_resp.status_code != 200:
|
|
7333
|
+
error_text = strict_resp.text[:1000]
|
|
7334
|
+
# Try the Gemma 4 PEG parse-failure recovery first — relax
|
|
7335
|
+
# tool_choice='required' so the retry isn't constrained by the
|
|
7336
|
+
# strict-grammar that triggered the parse failure.
|
|
7337
|
+
relaxed = _is_gemma4_peg_parse_failure(strict_resp.status_code, error_text) and \
|
|
7338
|
+
_relax_tool_choice_for_gemma4_peg_retry(strict_body, "strict-stream")
|
|
7339
|
+
if relaxed:
|
|
7340
|
+
try:
|
|
7341
|
+
strict_resp = await _post_with_generation_timeout(
|
|
7342
|
+
client,
|
|
7343
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
7344
|
+
strict_body,
|
|
7345
|
+
{"Content-Type": "application/json"},
|
|
7346
|
+
)
|
|
7347
|
+
except Exception:
|
|
7348
|
+
pass # fall through to next handler
|
|
6359
7349
|
if strict_resp.status_code != 200:
|
|
6360
7350
|
error_text = strict_resp.text[:1000]
|
|
6361
7351
|
if _maybe_disable_grammar_for_tools_error(
|
|
@@ -6430,7 +7420,7 @@ async def messages(request: Request):
|
|
|
6430
7420
|
|
|
6431
7421
|
openai_resp = strict_resp.json()
|
|
6432
7422
|
# Recover tool calls from <tool_call> XML before guardrails run
|
|
6433
|
-
_maybe_extract_text_tool_calls(openai_resp)
|
|
7423
|
+
_maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
|
|
6434
7424
|
openai_resp = await _apply_unexpected_end_turn_guardrail(
|
|
6435
7425
|
client,
|
|
6436
7426
|
openai_resp,
|
|
@@ -6485,7 +7475,11 @@ async def messages(request: Request):
|
|
|
6485
7475
|
logger.info("DEGENERATE RETRY: retry insufficient, using truncated original")
|
|
6486
7476
|
except Exception as exc:
|
|
6487
7477
|
logger.warning("DEGENERATE RETRY: failed: %s", exc)
|
|
6488
|
-
anthropic_resp = openai_to_anthropic_response(
|
|
7478
|
+
anthropic_resp = openai_to_anthropic_response(
|
|
7479
|
+
openai_resp, model,
|
|
7480
|
+
expose_thinking=isinstance(body.get("thinking"), dict)
|
|
7481
|
+
and (body["thinking"].get("type") or "").lower() == "enabled",
|
|
7482
|
+
)
|
|
6489
7483
|
# FINALIZE CONTINUATION: inject synthetic tool_use to keep client loop alive
|
|
6490
7484
|
if (
|
|
6491
7485
|
monitor.finalize_turn_active
|
|
@@ -6601,6 +7595,29 @@ async def messages(request: Request):
|
|
|
6601
7595
|
error_body = await resp.aread()
|
|
6602
7596
|
await resp.aclose()
|
|
6603
7597
|
error_text = error_body.decode("utf-8", errors="replace")[:1000]
|
|
7598
|
+
# Gemma 4 PEG parse-failure recovery: relax tool_choice='required'
|
|
7599
|
+
# so the retry isn't blocked by the strict-grammar that rejected
|
|
7600
|
+
# the model's incomplete tool call.
|
|
7601
|
+
if _is_gemma4_peg_parse_failure(resp.status_code, error_text) and \
|
|
7602
|
+
_relax_tool_choice_for_gemma4_peg_retry(openai_body, "stream"):
|
|
7603
|
+
resp = await client.send(
|
|
7604
|
+
client.build_request(
|
|
7605
|
+
"POST",
|
|
7606
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
7607
|
+
json=openai_body,
|
|
7608
|
+
headers={"Content-Type": "application/json"},
|
|
7609
|
+
),
|
|
7610
|
+
stream=True,
|
|
7611
|
+
)
|
|
7612
|
+
if resp.status_code == 200:
|
|
7613
|
+
return StreamingResponse(
|
|
7614
|
+
stream_anthropic_response(resp, model, monitor, body),
|
|
7615
|
+
media_type="text/event-stream",
|
|
7616
|
+
)
|
|
7617
|
+
# fall through if still failing
|
|
7618
|
+
error_body = await resp.aread()
|
|
7619
|
+
await resp.aclose()
|
|
7620
|
+
error_text = error_body.decode("utf-8", errors="replace")[:1000]
|
|
6604
7621
|
if _maybe_disable_grammar_for_tools_error(
|
|
6605
7622
|
openai_body,
|
|
6606
7623
|
resp.status_code,
|
|
@@ -6733,6 +7750,23 @@ async def messages(request: Request):
|
|
|
6733
7750
|
media_type="application/json",
|
|
6734
7751
|
)
|
|
6735
7752
|
|
|
7753
|
+
if resp.status_code != 200:
|
|
7754
|
+
error_text = resp.text[:1000]
|
|
7755
|
+
# Gemma 4 PEG parse-failure recovery (non-stream path).
|
|
7756
|
+
relaxed = (
|
|
7757
|
+
_is_gemma4_peg_parse_failure(resp.status_code, error_text)
|
|
7758
|
+
and _relax_tool_choice_for_gemma4_peg_retry(openai_body, "non-stream")
|
|
7759
|
+
)
|
|
7760
|
+
if relaxed:
|
|
7761
|
+
try:
|
|
7762
|
+
resp = await _post_with_generation_timeout(
|
|
7763
|
+
client,
|
|
7764
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
7765
|
+
openai_body,
|
|
7766
|
+
{"Content-Type": "application/json"},
|
|
7767
|
+
)
|
|
7768
|
+
except Exception:
|
|
7769
|
+
pass # fall through
|
|
6736
7770
|
if resp.status_code != 200:
|
|
6737
7771
|
error_text = resp.text[:1000]
|
|
6738
7772
|
if _maybe_disable_grammar_for_tools_error(
|
|
@@ -6785,7 +7819,7 @@ async def messages(request: Request):
|
|
|
6785
7819
|
|
|
6786
7820
|
openai_resp = resp.json()
|
|
6787
7821
|
# Recover tool calls from <tool_call> XML before guardrails run
|
|
6788
|
-
_maybe_extract_text_tool_calls(openai_resp)
|
|
7822
|
+
_maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
|
|
6789
7823
|
openai_resp = await _apply_unexpected_end_turn_guardrail(
|
|
6790
7824
|
client,
|
|
6791
7825
|
openai_resp,
|
|
@@ -6854,7 +7888,11 @@ async def messages(request: Request):
|
|
|
6854
7888
|
logger.info("DEGENERATE RETRY (stream): no tool call, using truncated")
|
|
6855
7889
|
except Exception as exc:
|
|
6856
7890
|
logger.warning("DEGENERATE RETRY (stream): failed: %s", exc)
|
|
6857
|
-
anthropic_resp = openai_to_anthropic_response(
|
|
7891
|
+
anthropic_resp = openai_to_anthropic_response(
|
|
7892
|
+
openai_resp, model,
|
|
7893
|
+
expose_thinking=isinstance(body.get("thinking"), dict)
|
|
7894
|
+
and (body["thinking"].get("type") or "").lower() == "enabled",
|
|
7895
|
+
)
|
|
6858
7896
|
# FINALIZE CONTINUATION: inject synthetic tool_use (non-guarded stream path)
|
|
6859
7897
|
if (
|
|
6860
7898
|
monitor.finalize_turn_active
|