@miller-tech/uap 1.20.33 → 1.20.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/model-profiles/qwen35.json +6 -5
- package/dist/.tsbuildinfo +1 -1
- package/dist/bin/cli.js +6 -1
- package/dist/bin/cli.js.map +1 -1
- package/dist/cli/hooks.js +30 -7
- package/dist/cli/hooks.js.map +1 -1
- package/dist/cli/policy.d.ts.map +1 -1
- package/dist/cli/policy.js +26 -0
- package/dist/cli/policy.js.map +1 -1
- package/dist/index.d.ts +15 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -0
- package/dist/index.js.map +1 -1
- package/dist/types/index.d.ts +20 -0
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/index.js +20 -0
- package/dist/types/index.js.map +1 -1
- package/docs/AGENTS.md +423 -0
- package/docs/AGENTS.md</path>CLAUDE.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/INDEX.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/reference/API_REFERENCE.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/reference/UAP_CLI_REFERENCE.md</path>src/index.ts</path>/src/cli/worktree.ts</path>/src/coordination/deploy-batcher.ts</path>/src/policies/policy-gate.ts</path>/src/memory/model-router.ts</path>/src/memory/embeddings.ts</path>/src/models/types.ts</path>/src/types/coordination.ts</path>/src/utils/logger.ts</path>/src/utils/config-loader.ts</path>/src/utils/performance-monitor.ts</path>/src/utils/concurrency.ts</path>/src/utils/concurrency-pool.ts</path>/src/utils/string-similarity.ts</path>/src/utils/rate-limiter.ts</path>/src/utils/system-resources.ts</path>/src/utils/adaptive-cache.ts</path>/src/utils/lazy-imports.ts</path>/src/utils/merge-claude-md.ts</path>/src/utils/stopwords.ts</path>/src/utils/config-loader.ts</path>/src/utils/performance-monitor.ts</path>/src/utils/concurrency.ts</path>/src/utils/concurrency-pool.ts</path>/src/utils/string-similarity.ts</path>/src/utils/rate-limiter.ts</path>/src/utils/system-resources.ts</path>/src/utils/adaptive-cache.ts</path>/src/utils/lazy-imports.ts</path>/src/utils/merge-claude-md.ts</path>/src/utils/stopwords.ts</path> +433 -0
- package/docs/DOCUMENTATION_AUDIT_REPORT.md +131 -0
- package/docs/GETTING_STARTED.md +288 -0
- package/docs/INDEX.md +272 -42
- package/docs/PROJECT_ANALYSIS_REPORT.md +510 -0
- package/docs/architecture/SYSTEM_ANALYSIS.md +220 -1003
- package/docs/blog/local-coding-agents.md +266 -0
- package/docs/blog/x-thread.md +254 -0
- package/docs/deployment/DEPLOY_BATCHER_ANALYSIS.md +15 -647
- package/docs/getting-started/OVERVIEW.md +10 -30
- package/docs/getting-started/SETUP.md +183 -9
- package/docs/pr/UPSTREAM_PRS.md +424 -0
- package/docs/reference/CONFIGURATION.md +208 -0
- package/docs/reference/DATABASE_SCHEMA.md +344 -0
- package/docs/reference/PATTERN_LIBRARY.md +636 -0
- package/package.json +1 -1
- package/templates/hooks/uap-policy-gate.sh +36 -0
- package/tools/agents/claude_local_agent.py +92 -0
- package/tools/agents/config/qwen3.5-enhanced.jinja +187 -0
- package/tools/agents/opencode_uap_agent.py +3 -0
- package/tools/agents/scripts/anthropic_proxy.py +1748 -76
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +64 -8
- package/tools/agents/uap_agent.py +1 -1
|
@@ -134,6 +134,11 @@ PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
|
|
|
134
134
|
}
|
|
135
135
|
PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
|
|
136
136
|
PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "6"))
|
|
137
|
+
# Fix K (2026-04-22): minimum consecutive cycle-repeat count required to flip
|
|
138
|
+
# phase from act -> review. The old behaviour accepted cycle_repeat=2, which
|
|
139
|
+
# is normal in a working session (re-reading the same file across edits).
|
|
140
|
+
# Set higher to tolerate legitimate re-reads; set 1 to restore old behaviour.
|
|
141
|
+
PROXY_CYCLE_TRIGGER_REPEAT = int(os.environ.get("PROXY_CYCLE_TRIGGER_REPEAT", "3"))
|
|
137
142
|
PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
|
|
138
143
|
PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "3"))
|
|
139
144
|
PROXY_CONTEXT_RELEASE_THRESHOLD = float(
|
|
@@ -166,6 +171,12 @@ PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
|
|
|
166
171
|
PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
|
|
167
172
|
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "3")
|
|
168
173
|
)
|
|
174
|
+
# Force finalize after N consecutive forced_budget_exhausted events where
|
|
175
|
+
# neither cycling nor stagnation was detected — catches "distinct but
|
|
176
|
+
# unproductive" tool spam that defeats per-tool cycle detection.
|
|
177
|
+
PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT = int(
|
|
178
|
+
os.environ.get("PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT", "2")
|
|
179
|
+
)
|
|
169
180
|
PROXY_COMPLETION_RECOVERY_MAX = int(
|
|
170
181
|
os.environ.get("PROXY_COMPLETION_RECOVERY_MAX", "3")
|
|
171
182
|
)
|
|
@@ -205,6 +216,13 @@ PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
|
|
|
205
216
|
PROXY_FINALIZE_CONTINUATION_MAX = int(
|
|
206
217
|
os.environ.get("PROXY_FINALIZE_CONTINUATION_MAX", "3")
|
|
207
218
|
)
|
|
219
|
+
# Session-level cap: after N total finalize continuations in a session (even
|
|
220
|
+
# across "fresh user text" state resets), stop injecting synthetic tools and
|
|
221
|
+
# let the response terminate naturally. Catches runaway loops that dodge the
|
|
222
|
+
# per-cycle cap by triggering state resets.
|
|
223
|
+
PROXY_FINALIZE_SESSION_HARD_CAP = int(
|
|
224
|
+
os.environ.get("PROXY_FINALIZE_SESSION_HARD_CAP", "3")
|
|
225
|
+
)
|
|
208
226
|
PROXY_STREAM_REASONING_FALLBACK = (
|
|
209
227
|
os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
|
|
210
228
|
)
|
|
@@ -234,6 +252,27 @@ PROXY_DISABLE_THINKING_ON_TOOL_TURNS = os.environ.get(
|
|
|
234
252
|
"off",
|
|
235
253
|
"no",
|
|
236
254
|
}
|
|
255
|
+
# Disable thinking on EVERY turn (not just tool turns). For models like Gemma 4
|
|
256
|
+
# that emit ~100 thinking tokens for trivial replies, this halves output cost.
|
|
257
|
+
PROXY_DISABLE_THINKING_ALWAYS = os.environ.get(
|
|
258
|
+
"PROXY_DISABLE_THINKING_ALWAYS", "off"
|
|
259
|
+
).lower() not in {"0", "false", "off", "no"}
|
|
260
|
+
# Force tool_choice='required' on the first turn of a fresh session. Originally
|
|
261
|
+
# Qwen-tuned to break out of cold-start "tries to chat instead of calling a tool"
|
|
262
|
+
# behaviour. Gemma 4 doesn't need this — it routes 'auto' correctly and the
|
|
263
|
+
# force triggers malformed-JSON emissions when it would rather speak. Default
|
|
264
|
+
# off; set 'on' to restore the legacy Qwen-style behaviour.
|
|
265
|
+
PROXY_FORCE_TOOL_CHOICE_ON_COLD_START = os.environ.get(
|
|
266
|
+
"PROXY_FORCE_TOOL_CHOICE_ON_COLD_START", "off"
|
|
267
|
+
).lower() not in {"0", "false", "off", "no"}
|
|
268
|
+
PROXY_DISABLE_SPEC_ON_TOOL_TURNS = os.environ.get(
|
|
269
|
+
"PROXY_DISABLE_SPEC_ON_TOOL_TURNS", "off"
|
|
270
|
+
).lower() not in {
|
|
271
|
+
"0",
|
|
272
|
+
"false",
|
|
273
|
+
"off",
|
|
274
|
+
"no",
|
|
275
|
+
}
|
|
237
276
|
PROXY_MALFORMED_TOOL_GUARDRAIL = os.environ.get(
|
|
238
277
|
"PROXY_MALFORMED_TOOL_GUARDRAIL", "on"
|
|
239
278
|
).lower() not in {
|
|
@@ -555,6 +594,44 @@ def _is_grammar_tools_incompatibility(status_code: int, error_text: str) -> bool
|
|
|
555
594
|
return "custom grammar constraints" in lowered and "with tools" in lowered
|
|
556
595
|
|
|
557
596
|
|
|
597
|
+
def _is_gemma4_peg_parse_failure(status_code: int, error_text: str) -> bool:
|
|
598
|
+
"""Detect Gemma 4's PEG-parser failure on tool-turn output.
|
|
599
|
+
|
|
600
|
+
llama-server returns HTTP 500 with `failed to parse grammar` /
|
|
601
|
+
`Failed to parse input at pos N: <|tool_call>call:...` when the model
|
|
602
|
+
emits an incomplete tool call (missing required schema fields) under
|
|
603
|
+
tool_choice='required'. The PEG grammar enforces the schema strictly
|
|
604
|
+
and rejects the partial output. Caller should retry with relaxed
|
|
605
|
+
tool_choice='auto' so the model can emit prose or a complete call
|
|
606
|
+
without grammar enforcement triggering this failure mode.
|
|
607
|
+
"""
|
|
608
|
+
if status_code != 500:
|
|
609
|
+
return False
|
|
610
|
+
text = error_text or ""
|
|
611
|
+
return (
|
|
612
|
+
"Failed to parse input at pos" in text
|
|
613
|
+
or "<|tool_call>call:" in text
|
|
614
|
+
)
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def _relax_tool_choice_for_gemma4_peg_retry(request_body: dict, source: str) -> bool:
|
|
618
|
+
"""When a Gemma 4 PEG parse failure is detected on a tool turn, drop
|
|
619
|
+
tool_choice='required' so the retry has a permissive grammar. Returns
|
|
620
|
+
True if the body was modified (caller should retry POST)."""
|
|
621
|
+
if not request_body.get("tools"):
|
|
622
|
+
return False
|
|
623
|
+
current = request_body.get("tool_choice")
|
|
624
|
+
if current in ("required", {"type": "any"}):
|
|
625
|
+
request_body["tool_choice"] = "auto"
|
|
626
|
+
logger.warning(
|
|
627
|
+
"GEMMA4 PEG RETRY (%s): relaxed tool_choice='required' -> 'auto' "
|
|
628
|
+
"to bypass strict-grammar parse failure on incomplete model output",
|
|
629
|
+
source,
|
|
630
|
+
)
|
|
631
|
+
return True
|
|
632
|
+
return False
|
|
633
|
+
|
|
634
|
+
|
|
558
635
|
def _maybe_disable_grammar_for_tools_error(
|
|
559
636
|
request_body: dict,
|
|
560
637
|
status_code: int,
|
|
@@ -654,6 +731,7 @@ class SessionMonitor:
|
|
|
654
731
|
tool_state_stagnation_streak: int = 0
|
|
655
732
|
tool_state_transitions: int = 0
|
|
656
733
|
tool_state_review_cycles: int = 0
|
|
734
|
+
tool_state_unproductive_exhaustion_streak: int = 0
|
|
657
735
|
last_tool_fingerprint: str = ""
|
|
658
736
|
cycling_tool_names: list = field(default_factory=list)
|
|
659
737
|
session_banned_tools: set = field(default_factory=set) # tools banned for entire session after repeated cycling
|
|
@@ -661,6 +739,7 @@ class SessionMonitor:
|
|
|
661
739
|
last_response_garbled: bool = False # previous turn had garbled/malformed output
|
|
662
740
|
finalize_turn_active: bool = False
|
|
663
741
|
finalize_continuation_count: int = 0
|
|
742
|
+
finalize_hard_stop_count: int = 0 # monotonic, not reset by fresh user text
|
|
664
743
|
finalize_synthetic_tool_id: str = ""
|
|
665
744
|
completion_required: bool = False
|
|
666
745
|
completion_pending: bool = False
|
|
@@ -898,6 +977,7 @@ class SessionMonitor:
|
|
|
898
977
|
self.tool_state_auto_budget_remaining = 0
|
|
899
978
|
self.tool_state_stagnation_streak = 0
|
|
900
979
|
self.tool_state_review_cycles = 0
|
|
980
|
+
self.tool_state_unproductive_exhaustion_streak = 0
|
|
901
981
|
self.cycling_tool_names = []
|
|
902
982
|
self.last_tool_fingerprint = ""
|
|
903
983
|
self.reset_tool_targets()
|
|
@@ -906,7 +986,10 @@ class SessionMonitor:
|
|
|
906
986
|
self.completion_required = _should_enforce_completion_contract(anthropic_body)
|
|
907
987
|
self.completion_progress_signals = _count_completion_progress_signals(anthropic_body)
|
|
908
988
|
blockers = _completion_blockers(
|
|
909
|
-
anthropic_body,
|
|
989
|
+
anthropic_body,
|
|
990
|
+
has_tool_results,
|
|
991
|
+
phase=self.tool_turn_phase,
|
|
992
|
+
finalize_fired=(self.finalize_hard_stop_count > 0),
|
|
910
993
|
)
|
|
911
994
|
self.completion_blockers = blockers
|
|
912
995
|
self.completion_pending = self.completion_required and bool(blockers)
|
|
@@ -1046,6 +1129,8 @@ class SessionMonitor:
|
|
|
1046
1129
|
session_monitors: dict[str, SessionMonitor] = {}
|
|
1047
1130
|
default_context_window = 0
|
|
1048
1131
|
last_session_id = ""
|
|
1132
|
+
_last_ctx_recheck_ts: float = 0.0
|
|
1133
|
+
_CTX_RECHECK_INTERVAL: float = 60.0 # Re-detect context window every 60s
|
|
1049
1134
|
|
|
1050
1135
|
|
|
1051
1136
|
def _cleanup_stale_monitors(now_ts: float) -> None:
|
|
@@ -1058,6 +1143,39 @@ def _cleanup_stale_monitors(now_ts: float) -> None:
|
|
|
1058
1143
|
session_monitors.pop(sid, None)
|
|
1059
1144
|
|
|
1060
1145
|
|
|
1146
|
+
async def _maybe_recheck_context_window() -> None:
    """Re-query the upstream server's context window at most once per interval.

    Handles the upstream server being restarted with a different --ctx-size
    mid-session: when the first entry of the ``/slots`` response reports a
    positive ``n_ctx`` that differs from ``default_context_window``, the
    module default and the ``context_window`` of every monitor in
    ``session_monitors`` are updated and a warning is logged.

    Returns immediately when fewer than ``_CTX_RECHECK_INTERVAL`` seconds
    have elapsed since the previous attempt, or when no HTTP client exists.
    The probe is best-effort: failures are never raised to the caller; they
    are logged at debug level and retried on the next interval.
    """
    global default_context_window, _last_ctx_recheck_ts
    now = time.time()
    if now - _last_ctx_recheck_ts < _CTX_RECHECK_INTERVAL:
        return
    # Stamp before awaiting so a slow or failing probe is rate-limited too.
    _last_ctx_recheck_ts = now
    if http_client is None:
        return
    try:
        slots_url = LLAMA_CPP_BASE.replace("/v1", "/slots")
        resp = await http_client.get(slots_url, timeout=2.0)
        if resp.status_code != 200:
            return
        slots = resp.json()
        if not slots or not isinstance(slots, list):
            return
        n_ctx = slots[0].get("n_ctx", 0)
        if n_ctx > 0 and n_ctx != default_context_window:
            old = default_context_window
            default_context_window = n_ctx
            for mon in session_monitors.values():
                mon.context_window = n_ctx
            logger.warning(
                "Context window changed: %d → %d (upstream server restarted?)",
                old, n_ctx,
            )
    except Exception:
        # Non-critical: keep the previous value and retry next interval, but
        # leave a trace so malformed /slots payloads or network errors can be
        # diagnosed instead of vanishing silently.
        logger.debug(
            "Context window recheck failed; will retry next interval",
            exc_info=True,
        )
|
|
1177
|
+
|
|
1178
|
+
|
|
1061
1179
|
def get_session_monitor(session_id: str) -> SessionMonitor:
|
|
1062
1180
|
now_ts = time.time()
|
|
1063
1181
|
_cleanup_stale_monitors(now_ts)
|
|
@@ -1347,6 +1465,66 @@ def prune_conversation(
|
|
|
1347
1465
|
# Granular timeouts: short connect, long read for streaming LLM output.
|
|
1348
1466
|
http_client: httpx.AsyncClient | None = None
|
|
1349
1467
|
|
|
1468
|
+
# ---------------------------------------------------------------------------
|
|
1469
|
+
# Concurrency Control
|
|
1470
|
+
# ---------------------------------------------------------------------------
|
|
1471
|
+
# Semaphore to serialize upstream requests. llama.cpp is configured with
|
|
1472
|
+
# --parallel 1 (LLAMA_PARALLEL=1), so it can only process one inference at
|
|
1473
|
+
# a time. Without this gate, concurrent client requests (Shannon sub-agents,
|
|
1474
|
+
# multiple Claude Code sessions) would all hit llama.cpp at once and the
|
|
1475
|
+
# server would serialize them while the proxy holds N httpx connections
|
|
1476
|
+
# open — potentially exhausting the proxy's connection pool while requests
|
|
1477
|
+
# queue inside llama.cpp opaquely.
|
|
1478
|
+
#
|
|
1479
|
+
# With the semaphore: requests queue inside the proxy (cheap, just asyncio
|
|
1480
|
+
# tasks waiting) and only PROXY_CONCURRENCY_LIMIT at a time reaches
|
|
1481
|
+
# llama.cpp. Each httpx connection is held only for the actual inference
|
|
1482
|
+
# duration, not the queue wait.
|
|
1483
|
+
#
|
|
1484
|
+
# Default: 1 (matches LLAMA_PARALLEL=1). Increase if you raise --parallel.
|
|
1485
|
+
PROXY_CONCURRENCY_LIMIT = int(os.environ.get("PROXY_CONCURRENCY_LIMIT", "1"))
|
|
1486
|
+
# Max time to wait for a slot before returning 503. Generous because real
|
|
1487
|
+
# inference can take 30-600s and queued requests must wait through that.
|
|
1488
|
+
# 0 = wait indefinitely.
|
|
1489
|
+
PROXY_CONCURRENCY_QUEUE_TIMEOUT = float(
|
|
1490
|
+
os.environ.get("PROXY_CONCURRENCY_QUEUE_TIMEOUT", "900")
|
|
1491
|
+
)
|
|
1492
|
+
upstream_semaphore: asyncio.Semaphore | None = None
|
|
1493
|
+
|
|
1494
|
+
|
|
1495
|
+
async def _acquire_upstream_slot() -> bool:
    """Wait for a free upstream concurrency slot.

    Behaviour by configuration:
      * ``upstream_semaphore`` is None (not yet initialized): no limiting is
        applied and True is returned at once.
      * ``PROXY_CONCURRENCY_QUEUE_TIMEOUT`` <= 0: wait indefinitely.
      * otherwise: wait at most that many seconds.

    asyncio.Semaphore.acquire() preserves wait order via futures, so
    waiters form a natural FIFO queue.

    Returns:
        True if a slot was acquired (or limiting is inactive), False if the
        wait timed out.
    """
    sem = upstream_semaphore
    if sem is None:
        return True  # Not yet initialized; proceed without limiting

    wait_limit = PROXY_CONCURRENCY_QUEUE_TIMEOUT
    if wait_limit > 0:
        try:
            await asyncio.wait_for(sem.acquire(), timeout=wait_limit)
        except asyncio.TimeoutError:
            return False
        return True

    # Non-positive timeout means "block until a slot frees up".
    await sem.acquire()
    return True
|
|
1515
|
+
|
|
1516
|
+
|
|
1517
|
+
def _release_upstream_slot() -> None:
    """Hand a concurrency slot back. MUST be called once per successful acquire.

    The release is unconditional whenever the semaphore exists:
    asyncio.Semaphore.release() always increments the counter, and it is
    deliberately NOT gated on locked(), which is True only while the counter
    is 0 (no slots left). Gating would leak a slot when the limit is above 1
    and several holders release at the same time.

    Does nothing when ``upstream_semaphore`` is None.
    """
    sem = upstream_semaphore
    if sem is None:
        return
    sem.release()
|
|
1527
|
+
|
|
1350
1528
|
|
|
1351
1529
|
def _is_loading_model_503(resp: httpx.Response) -> bool:
|
|
1352
1530
|
"""Check if response is a 503 'Loading model' from llama.cpp."""
|
|
@@ -1390,6 +1568,36 @@ async def _post_with_retry(
|
|
|
1390
1568
|
url: str,
|
|
1391
1569
|
payload: dict,
|
|
1392
1570
|
headers: dict,
|
|
1571
|
+
) -> httpx.Response:
|
|
1572
|
+
"""Post with upstream-retry + concurrency-slot acquire.
|
|
1573
|
+
|
|
1574
|
+
Acquires a slot from upstream_semaphore before making the request, so
|
|
1575
|
+
concurrent client requests queue in the proxy (cheap asyncio waits)
|
|
1576
|
+
rather than all hammering llama.cpp at once. Slot is released in a
|
|
1577
|
+
finally block so it's always returned to the pool even on error.
|
|
1578
|
+
"""
|
|
1579
|
+
acquired = await _acquire_upstream_slot()
|
|
1580
|
+
if not acquired:
|
|
1581
|
+
logger.warning(
|
|
1582
|
+
"CONCURRENCY: queue timeout (%ds) exceeded waiting for upstream slot",
|
|
1583
|
+
int(PROXY_CONCURRENCY_QUEUE_TIMEOUT),
|
|
1584
|
+
)
|
|
1585
|
+
raise httpx.RemoteProtocolError(
|
|
1586
|
+
f"Upstream concurrency queue timed out after {int(PROXY_CONCURRENCY_QUEUE_TIMEOUT)}s "
|
|
1587
|
+
f"(limit={PROXY_CONCURRENCY_LIMIT})",
|
|
1588
|
+
request=None,
|
|
1589
|
+
)
|
|
1590
|
+
try:
|
|
1591
|
+
return await _post_with_retry_inner(client, url, payload, headers)
|
|
1592
|
+
finally:
|
|
1593
|
+
_release_upstream_slot()
|
|
1594
|
+
|
|
1595
|
+
|
|
1596
|
+
async def _post_with_retry_inner(
|
|
1597
|
+
client: httpx.AsyncClient,
|
|
1598
|
+
url: str,
|
|
1599
|
+
payload: dict,
|
|
1600
|
+
headers: dict,
|
|
1393
1601
|
) -> httpx.Response:
|
|
1394
1602
|
last_exc: Exception | None = None
|
|
1395
1603
|
for attempt in range(PROXY_UPSTREAM_RETRY_MAX):
|
|
@@ -1435,6 +1643,7 @@ async def _post_with_generation_timeout(
|
|
|
1435
1643
|
headers: dict,
|
|
1436
1644
|
) -> httpx.Response:
|
|
1437
1645
|
"""Wrap _post_with_retry with an explicit asyncio generation timeout.
|
|
1646
|
+
Also acquires a concurrency slot before making the request.
|
|
1438
1647
|
|
|
1439
1648
|
The httpx read timeout may not fire for hung connections where the server
|
|
1440
1649
|
keeps the socket open but produces no data (observed with llama.cpp server
|
|
@@ -1499,6 +1708,13 @@ async def lifespan(app: FastAPI):
|
|
|
1499
1708
|
"""Manage the httpx client lifecycle with the FastAPI app."""
|
|
1500
1709
|
global http_client
|
|
1501
1710
|
global default_context_window
|
|
1711
|
+
global upstream_semaphore
|
|
1712
|
+
upstream_semaphore = asyncio.Semaphore(PROXY_CONCURRENCY_LIMIT)
|
|
1713
|
+
logger.info(
|
|
1714
|
+
"CONCURRENCY: upstream semaphore initialized limit=%d queue_timeout=%.0fs",
|
|
1715
|
+
PROXY_CONCURRENCY_LIMIT,
|
|
1716
|
+
PROXY_CONCURRENCY_QUEUE_TIMEOUT,
|
|
1717
|
+
)
|
|
1502
1718
|
http_client = httpx.AsyncClient(
|
|
1503
1719
|
timeout=httpx.Timeout(
|
|
1504
1720
|
connect=10.0, # 10s to establish connection
|
|
@@ -1581,6 +1797,8 @@ async def lifespan(app: FastAPI):
|
|
|
1581
1797
|
yield
|
|
1582
1798
|
await http_client.aclose()
|
|
1583
1799
|
http_client = None
|
|
1800
|
+
if upstream_semaphore is not None:
|
|
1801
|
+
upstream_semaphore = None
|
|
1584
1802
|
logger.info("Proxy shut down")
|
|
1585
1803
|
|
|
1586
1804
|
|
|
@@ -1591,6 +1809,16 @@ app = FastAPI(
|
|
|
1591
1809
|
lifespan=lifespan,
|
|
1592
1810
|
)
|
|
1593
1811
|
|
|
1812
|
+
# NOTE: Concurrency control is enforced by _acquire_upstream_slot() inside
|
|
1813
|
+
# _post_with_retry (the single point where we hit llama.cpp). An earlier
|
|
1814
|
+
# implementation also added an HTTP middleware that acquired the same
|
|
1815
|
+
# semaphore — this caused a self-deadlock (middleware holds slot, inner
|
|
1816
|
+
# call waits for slot, both on the same task). The middleware approach
|
|
1817
|
+
# also called non-existent asyncio.Semaphore methods (try_acquire /
|
|
1818
|
+
# acquire_nowait) and ran an async primitive in a thread executor.
|
|
1819
|
+
# Removed 2026-05-13.
|
|
1820
|
+
|
|
1821
|
+
|
|
1594
1822
|
|
|
1595
1823
|
# ===========================================================================
|
|
1596
1824
|
# Request Translation: Anthropic -> OpenAI
|
|
@@ -1624,6 +1852,31 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
1624
1852
|
role = msg["role"]
|
|
1625
1853
|
content = msg.get("content")
|
|
1626
1854
|
|
|
1855
|
+
# Strip <think>...</think> blocks from PRIOR assistant turns. Qwen is
|
|
1856
|
+
# heavily few-shot influenced by its own conversation history — if
|
|
1857
|
+
# earlier assistant turns contain reasoning blocks, the next turn
|
|
1858
|
+
# will pattern-match and emit <think> tags even when the system
|
|
1859
|
+
# prompt forbids them. Stripping breaks the copy cycle.
|
|
1860
|
+
if role == "assistant":
|
|
1861
|
+
if isinstance(content, str) and "<think>" in content:
|
|
1862
|
+
content = _THINKING_BLOCK_RE.sub("", content).lstrip()
|
|
1863
|
+
elif isinstance(content, list):
|
|
1864
|
+
stripped = []
|
|
1865
|
+
for b in content:
|
|
1866
|
+
if isinstance(b, dict) and b.get("type") == "text":
|
|
1867
|
+
t = b.get("text", "")
|
|
1868
|
+
if "<think>" in t:
|
|
1869
|
+
t = _THINKING_BLOCK_RE.sub("", t).lstrip()
|
|
1870
|
+
if t:
|
|
1871
|
+
stripped.append({**b, "text": t})
|
|
1872
|
+
elif isinstance(b, dict) and b.get("type") == "thinking":
|
|
1873
|
+
# Anthropic-style thinking block — drop entirely
|
|
1874
|
+
# (don't replay it back to the model).
|
|
1875
|
+
continue
|
|
1876
|
+
else:
|
|
1877
|
+
stripped.append(b)
|
|
1878
|
+
content = stripped
|
|
1879
|
+
|
|
1627
1880
|
if isinstance(content, str):
|
|
1628
1881
|
messages.append({"role": role, "content": content})
|
|
1629
1882
|
elif isinstance(content, list):
|
|
@@ -1633,6 +1886,10 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
1633
1886
|
parts.append(block)
|
|
1634
1887
|
elif block.get("type") == "text":
|
|
1635
1888
|
parts.append(block.get("text", ""))
|
|
1889
|
+
elif block.get("type") == "thinking":
|
|
1890
|
+
# Drop thinking blocks from user/assistant content when
|
|
1891
|
+
# echoed back into history — model shouldn't see them.
|
|
1892
|
+
continue
|
|
1636
1893
|
elif block.get("type") == "tool_use":
|
|
1637
1894
|
messages.append(
|
|
1638
1895
|
{
|
|
@@ -1641,7 +1898,7 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
1641
1898
|
"tool_calls": [
|
|
1642
1899
|
{
|
|
1643
1900
|
"id": block.get(
|
|
1644
|
-
"id", f"
|
|
1901
|
+
"id", f"toolu_{uuid.uuid4().hex[:24]}"
|
|
1645
1902
|
),
|
|
1646
1903
|
"type": "function",
|
|
1647
1904
|
"function": {
|
|
@@ -1654,10 +1911,17 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
1654
1911
|
)
|
|
1655
1912
|
continue
|
|
1656
1913
|
elif block.get("type") == "tool_result":
|
|
1914
|
+
# Strip Anthropic-spec toolu_ prefix so the upstream
|
|
1915
|
+
# tool_call_id matches what llama-server originally
|
|
1916
|
+
# emitted (we stamped the prefix on outbound; reverse it
|
|
1917
|
+
# here so the loop closes correctly).
|
|
1918
|
+
tu_id = block.get("tool_use_id", "")
|
|
1919
|
+
if isinstance(tu_id, str) and tu_id.startswith("toolu_"):
|
|
1920
|
+
tu_id = tu_id[len("toolu_"):]
|
|
1657
1921
|
messages.append(
|
|
1658
1922
|
{
|
|
1659
1923
|
"role": "tool",
|
|
1660
|
-
"tool_call_id":
|
|
1924
|
+
"tool_call_id": tu_id,
|
|
1661
1925
|
"content": _extract_text(block.get("content", "")),
|
|
1662
1926
|
}
|
|
1663
1927
|
)
|
|
@@ -1837,6 +2101,18 @@ _AGENTIC_SYSTEM_SUPPLEMENT_MINIMAL = (
|
|
|
1837
2101
|
"\n\nUse tools for all actions. Respond with tool calls, not descriptions of what to do."
|
|
1838
2102
|
)
|
|
1839
2103
|
|
|
2104
|
+
# Directive appended when the upstream model (Qwen) is configured with
|
|
2105
|
+
# enable_thinking=False but consistently emits <think>...</think> blocks
|
|
2106
|
+
# anyway, consuming the max_tokens budget before any tool_use is generated.
|
|
2107
|
+
# Empirically required for Shannon-style workflows where max_tokens=512
|
|
2108
|
+
# leaves no room for both internal reasoning AND a tool call.
|
|
2109
|
+
_NO_THINKING_DIRECTIVE = (
|
|
2110
|
+
"\n\nCRITICAL: Do NOT output <think>...</think> tags or any internal "
|
|
2111
|
+
"reasoning. Begin your response IMMEDIATELY with the appropriate "
|
|
2112
|
+
"tool_call. If you have no tool to call, reply with plain text only — "
|
|
2113
|
+
"never include reasoning blocks."
|
|
2114
|
+
)
|
|
2115
|
+
|
|
1840
2116
|
if PROXY_AGENTIC_SUPPLEMENT_MODE == "legacy":
|
|
1841
2117
|
_AGENTIC_SYSTEM_SUPPLEMENT = _AGENTIC_SYSTEM_SUPPLEMENT_LEGACY
|
|
1842
2118
|
elif PROXY_AGENTIC_SUPPLEMENT_MODE == "minimal":
|
|
@@ -1852,6 +2128,9 @@ else:
|
|
|
1852
2128
|
|
|
1853
2129
|
|
|
1854
2130
|
def _content_fingerprint(content) -> str:
|
|
2131
|
+
"""Return a STABLE fingerprint for content. Must not include volatile
|
|
2132
|
+
identifiers (tool_use_ids change per-turn), otherwise session stickiness
|
|
2133
|
+
breaks in agentic loops with stateful guardrails."""
|
|
1855
2134
|
if isinstance(content, str):
|
|
1856
2135
|
return content[:512]
|
|
1857
2136
|
if isinstance(content, list):
|
|
@@ -1866,7 +2145,10 @@ def _content_fingerprint(content) -> str:
|
|
|
1866
2145
|
elif btype == "tool_use":
|
|
1867
2146
|
parts.append(f"tool:{block.get('name', '')}")
|
|
1868
2147
|
elif btype == "tool_result":
|
|
1869
|
-
|
|
2148
|
+
# Stable: use the first 64 chars of the result text, not the volatile tool_use_id
|
|
2149
|
+
inner = block.get("content", "")
|
|
2150
|
+
inner_text = _extract_text(inner) if not isinstance(inner, str) else inner
|
|
2151
|
+
parts.append(f"result:{inner_text[:64]}")
|
|
1870
2152
|
return "\n".join(parts)[:1024]
|
|
1871
2153
|
return str(content)[:512]
|
|
1872
2154
|
|
|
@@ -1893,14 +2175,26 @@ def resolve_session_id(request: Request, anthropic_body: dict) -> str:
|
|
|
1893
2175
|
first_user = ""
|
|
1894
2176
|
for msg in anthropic_body.get("messages", []):
|
|
1895
2177
|
if msg.get("role") == "user":
|
|
1896
|
-
|
|
2178
|
+
# Only hash TEXT content of first user message, not tool_result blocks
|
|
2179
|
+
# (which may appear in /anthropic/v1/messages passthrough scenarios)
|
|
2180
|
+
content = msg.get("content", "")
|
|
2181
|
+
if isinstance(content, str):
|
|
2182
|
+
first_user = content[:512]
|
|
2183
|
+
elif isinstance(content, list):
|
|
2184
|
+
text_parts = [
|
|
2185
|
+
b.get("text", "") for b in content
|
|
2186
|
+
if isinstance(b, dict) and b.get("type") == "text"
|
|
2187
|
+
]
|
|
2188
|
+
first_user = "\n".join(text_parts)[:512]
|
|
1897
2189
|
break
|
|
1898
2190
|
|
|
1899
|
-
|
|
2191
|
+
# Deliberately exclude `system` from fingerprint — clients often inject
|
|
2192
|
+
# volatile context (timestamps, cwd, session markers) into system prompts
|
|
2193
|
+
# which would break session stickiness for ongoing conversations.
|
|
1900
2194
|
model = anthropic_body.get("model", "default")
|
|
1901
2195
|
remote = request.client.host if request.client else "unknown"
|
|
1902
2196
|
digest = hashlib.sha256(
|
|
1903
|
-
f"{remote}|{model}|{
|
|
2197
|
+
f"{remote}|{model}|{first_user}".encode(
|
|
1904
2198
|
"utf-8", errors="ignore"
|
|
1905
2199
|
)
|
|
1906
2200
|
).hexdigest()[:20]
|
|
@@ -1965,7 +2259,10 @@ def _should_enforce_completion_contract(anthropic_body: dict) -> bool:
|
|
|
1965
2259
|
|
|
1966
2260
|
|
|
1967
2261
|
def _completion_blockers(
|
|
1968
|
-
anthropic_body: dict,
|
|
2262
|
+
anthropic_body: dict,
|
|
2263
|
+
has_tool_results: bool,
|
|
2264
|
+
phase: str = "",
|
|
2265
|
+
finalize_fired: bool = False,
|
|
1969
2266
|
) -> list[str]:
|
|
1970
2267
|
blockers: list[str] = []
|
|
1971
2268
|
progress = _count_completion_progress_signals(anthropic_body)
|
|
@@ -1977,9 +2274,12 @@ def _completion_blockers(
|
|
|
1977
2274
|
if last_user_has_result:
|
|
1978
2275
|
blockers.append("awaiting_post_tool_followup")
|
|
1979
2276
|
elif _last_assistant_was_text_only(anthropic_body):
|
|
1980
|
-
#
|
|
1981
|
-
#
|
|
1982
|
-
|
|
2277
|
+
# Suppress in two cases:
|
|
2278
|
+
# 1. Currently in finalize phase — text-only is expected
|
|
2279
|
+
# 2. A finalize fired earlier this session — means the state machine
|
|
2280
|
+
# already wrapped up the loop, don't re-trigger it (was causing
|
|
2281
|
+
# finalize -> review -> cycle -> finalize -> review... infinite loop)
|
|
2282
|
+
if phase != "finalize" and not finalize_fired:
|
|
1983
2283
|
blockers.append("text_only_after_tool_results")
|
|
1984
2284
|
|
|
1985
2285
|
return blockers
|
|
@@ -2020,6 +2320,212 @@ def _sanitize_tool_schema_for_llama(schema):
|
|
|
2020
2320
|
return _walk(schema), removed
|
|
2021
2321
|
|
|
2022
2322
|
|
|
2323
|
+
def openai_to_anthropic_request(openai_body: dict) -> dict:
|
|
2324
|
+
"""Convert an OpenAI Chat Completions request to an Anthropic Messages request.
|
|
2325
|
+
|
|
2326
|
+
Inverse of anthropic_to_openai_messages. Used by /v1/chat/completions passthrough
|
|
2327
|
+
to let OpenAI-shaped clients (Forge, etc.) benefit from the Anthropic-path
|
|
2328
|
+
guardrails (loop detection, tool narrowing, cycle breaking, etc.).
|
|
2329
|
+
"""
|
|
2330
|
+
anthropic_messages: list[dict] = []
|
|
2331
|
+
system_text_parts: list[str] = []
|
|
2332
|
+
|
|
2333
|
+
for msg in openai_body.get("messages", []):
|
|
2334
|
+
role = msg.get("role", "")
|
|
2335
|
+
content = msg.get("content")
|
|
2336
|
+
|
|
2337
|
+
if role == "system":
|
|
2338
|
+
if isinstance(content, str):
|
|
2339
|
+
system_text_parts.append(content)
|
|
2340
|
+
elif isinstance(content, list):
|
|
2341
|
+
for block in content:
|
|
2342
|
+
if isinstance(block, dict) and block.get("type") == "text":
|
|
2343
|
+
system_text_parts.append(block.get("text", ""))
|
|
2344
|
+
elif isinstance(block, str):
|
|
2345
|
+
system_text_parts.append(block)
|
|
2346
|
+
continue
|
|
2347
|
+
|
|
2348
|
+
if role == "tool":
|
|
2349
|
+
# OpenAI tool response -> Anthropic user message with tool_result block
|
|
2350
|
+
tool_call_id = msg.get("tool_call_id", "")
|
|
2351
|
+
tool_text = content if isinstance(content, str) else _extract_text(content)
|
|
2352
|
+
anthropic_messages.append(
|
|
2353
|
+
{
|
|
2354
|
+
"role": "user",
|
|
2355
|
+
"content": [
|
|
2356
|
+
{
|
|
2357
|
+
"type": "tool_result",
|
|
2358
|
+
"tool_use_id": tool_call_id,
|
|
2359
|
+
"content": tool_text,
|
|
2360
|
+
}
|
|
2361
|
+
],
|
|
2362
|
+
}
|
|
2363
|
+
)
|
|
2364
|
+
continue
|
|
2365
|
+
|
|
2366
|
+
if role == "assistant":
|
|
2367
|
+
blocks: list[dict] = []
|
|
2368
|
+
if isinstance(content, str) and content:
|
|
2369
|
+
blocks.append({"type": "text", "text": content})
|
|
2370
|
+
elif isinstance(content, list):
|
|
2371
|
+
for block in content:
|
|
2372
|
+
if isinstance(block, dict) and block.get("type") == "text":
|
|
2373
|
+
blocks.append({"type": "text", "text": block.get("text", "")})
|
|
2374
|
+
elif isinstance(block, str):
|
|
2375
|
+
blocks.append({"type": "text", "text": block})
|
|
2376
|
+
|
|
2377
|
+
for tc in msg.get("tool_calls", []) or []:
|
|
2378
|
+
fn = tc.get("function", {})
|
|
2379
|
+
try:
|
|
2380
|
+
args = json.loads(fn.get("arguments", "{}") or "{}")
|
|
2381
|
+
except (ValueError, TypeError):
|
|
2382
|
+
args = {}
|
|
2383
|
+
blocks.append(
|
|
2384
|
+
{
|
|
2385
|
+
"type": "tool_use",
|
|
2386
|
+
"id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
|
|
2387
|
+
"name": fn.get("name", ""),
|
|
2388
|
+
"input": args,
|
|
2389
|
+
}
|
|
2390
|
+
)
|
|
2391
|
+
|
|
2392
|
+
anthropic_messages.append(
|
|
2393
|
+
{"role": "assistant", "content": blocks if blocks else ""}
|
|
2394
|
+
)
|
|
2395
|
+
continue
|
|
2396
|
+
|
|
2397
|
+
# role == "user" (or unknown -> treat as user)
|
|
2398
|
+
if isinstance(content, str):
|
|
2399
|
+
anthropic_messages.append({"role": "user", "content": content})
|
|
2400
|
+
elif isinstance(content, list):
|
|
2401
|
+
blocks = []
|
|
2402
|
+
for block in content:
|
|
2403
|
+
if isinstance(block, dict) and block.get("type") == "text":
|
|
2404
|
+
blocks.append({"type": "text", "text": block.get("text", "")})
|
|
2405
|
+
elif isinstance(block, str):
|
|
2406
|
+
blocks.append({"type": "text", "text": block})
|
|
2407
|
+
anthropic_messages.append(
|
|
2408
|
+
{"role": "user", "content": blocks if blocks else ""}
|
|
2409
|
+
)
|
|
2410
|
+
else:
|
|
2411
|
+
anthropic_messages.append({"role": "user", "content": ""})
|
|
2412
|
+
|
|
2413
|
+
anthropic_body: dict = {
|
|
2414
|
+
"model": openai_body.get("model", "default"),
|
|
2415
|
+
"messages": anthropic_messages,
|
|
2416
|
+
"max_tokens": int(openai_body.get("max_tokens", 4096) or 4096),
|
|
2417
|
+
}
|
|
2418
|
+
if system_text_parts:
|
|
2419
|
+
anthropic_body["system"] = "\n\n".join(p for p in system_text_parts if p)
|
|
2420
|
+
|
|
2421
|
+
for key_o, key_a in (
|
|
2422
|
+
("temperature", "temperature"),
|
|
2423
|
+
("top_p", "top_p"),
|
|
2424
|
+
("top_k", "top_k"),
|
|
2425
|
+
("stop", "stop_sequences"),
|
|
2426
|
+
("stream", "stream"),
|
|
2427
|
+
):
|
|
2428
|
+
if key_o in openai_body:
|
|
2429
|
+
val = openai_body[key_o]
|
|
2430
|
+
if key_a == "stop_sequences" and isinstance(val, str):
|
|
2431
|
+
val = [val]
|
|
2432
|
+
anthropic_body[key_a] = val
|
|
2433
|
+
|
|
2434
|
+
# Convert OpenAI tools -> Anthropic tools
|
|
2435
|
+
openai_tools = openai_body.get("tools") or []
|
|
2436
|
+
if openai_tools:
|
|
2437
|
+
anthropic_tools = []
|
|
2438
|
+
for tool in openai_tools:
|
|
2439
|
+
fn = tool.get("function", {}) if isinstance(tool, dict) else {}
|
|
2440
|
+
if not fn.get("name"):
|
|
2441
|
+
continue
|
|
2442
|
+
anthropic_tools.append(
|
|
2443
|
+
{
|
|
2444
|
+
"name": fn.get("name", ""),
|
|
2445
|
+
"description": fn.get("description", ""),
|
|
2446
|
+
"input_schema": fn.get("parameters", {"type": "object", "properties": {}}),
|
|
2447
|
+
}
|
|
2448
|
+
)
|
|
2449
|
+
if anthropic_tools:
|
|
2450
|
+
anthropic_body["tools"] = anthropic_tools
|
|
2451
|
+
|
|
2452
|
+
tool_choice = openai_body.get("tool_choice")
|
|
2453
|
+
if tool_choice == "none":
|
|
2454
|
+
anthropic_body.pop("tools", None)
|
|
2455
|
+
elif tool_choice == "required":
|
|
2456
|
+
anthropic_body["tool_choice"] = {"type": "any"}
|
|
2457
|
+
elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
|
|
2458
|
+
anthropic_body["tool_choice"] = {
|
|
2459
|
+
"type": "tool",
|
|
2460
|
+
"name": tool_choice.get("function", {}).get("name", ""),
|
|
2461
|
+
}
|
|
2462
|
+
|
|
2463
|
+
return anthropic_body
|
|
2464
|
+
|
|
2465
|
+
|
|
2466
|
+
def anthropic_to_openai_response(anthropic_resp: dict) -> dict:
    """Convert an Anthropic Messages response to OpenAI Chat Completions format.

    Args:
        anthropic_resp: Decoded Anthropic response body. ``content`` is read
            as a list of blocks; ``text`` blocks are concatenated into the
            assistant message content and ``tool_use`` blocks become OpenAI
            ``tool_calls`` entries. Blocks of any other type are ignored.

    Returns:
        A ``chat.completion`` dict with a single choice. ``message.content``
        is ``None`` when the response carried no text blocks, and
        ``tool_calls`` is only present when at least one tool_use block exists.

    Fields that are missing *or explicitly null* (``id``, ``model``, block
    ``id``/``name``/``input``, usage counters) fall back to safe defaults
    instead of leaking ``None`` into the payload or raising ``TypeError``.
    """
    text_parts: list[str] = []
    tool_calls: list[dict] = []

    for block in anthropic_resp.get("content") or []:
        if not isinstance(block, dict):
            continue
        btype = block.get("type")
        if btype == "text":
            text = block.get("text")
            # A null / non-string text would make "".join() raise below.
            if isinstance(text, str):
                text_parts.append(text)
        elif btype == "tool_use":
            tool_calls.append(
                {
                    # `or` (not a .get default) so a null id is replaced too.
                    "id": block.get("id") or f"toolu_{uuid.uuid4().hex[:24]}",
                    "type": "function",
                    "function": {
                        "name": block.get("name") or "",
                        # OpenAI carries arguments as a JSON-encoded string.
                        "arguments": json.dumps(block.get("input") or {}),
                    },
                }
            )

    # Unknown or missing stop reasons degrade to a plain "stop".
    finish_map = {
        "end_turn": "stop",
        "stop_sequence": "stop",
        "max_tokens": "length",
        "tool_use": "tool_calls",
    }
    finish_reason = finish_map.get(anthropic_resp.get("stop_reason", "end_turn"), "stop")

    message: dict = {
        "role": "assistant",
        "content": "".join(text_parts) if text_parts else None,
    }
    if tool_calls:
        message["tool_calls"] = tool_calls

    usage = anthropic_resp.get("usage")
    if not isinstance(usage, dict):
        usage = {}
    # Explicit nulls count as zero so the total can always be computed.
    prompt_tokens = usage.get("input_tokens") or 0
    completion_tokens = usage.get("output_tokens") or 0

    return {
        "id": anthropic_resp.get("id") or f"chatcmpl-{uuid.uuid4().hex[:12]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": anthropic_resp.get("model") or "unknown",
        "choices": [
            {
                "index": 0,
                "message": message,
                "finish_reason": finish_reason,
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
|
|
2527
|
+
|
|
2528
|
+
|
|
2023
2529
|
def _convert_anthropic_tools_to_openai(anthropic_tools: list[dict]) -> list[dict]:
|
|
2024
2530
|
converted = []
|
|
2025
2531
|
removed_pattern_fields = 0
|
|
@@ -2055,6 +2561,72 @@ def _latest_user_text(anthropic_body: dict) -> str:
|
|
|
2055
2561
|
return ""
|
|
2056
2562
|
|
|
2057
2563
|
|
|
2564
|
+
# 2026-05-12: Detect "no-task" user turns to gate the state machine's
|
|
2565
|
+
# force-required path. When the last actual human query is a short ack
|
|
2566
|
+
# ("ok", "3", "test"), an acknowledgement phrase ("standing by", "awaiting
|
|
2567
|
+
# next instruction"), or a status report ending in an ack ("scan complete.
|
|
2568
|
+
# awaiting next instruction"), there is no genuine work for the model to
|
|
2569
|
+
# do. Forcing tool_choice='required' in this state causes the model to
|
|
2570
|
+
# ruminate in <think> blocks, and the meta-tool talk inside those blocks
|
|
2571
|
+
# trips the malformed-pseudo-tool detector. Conservative patterns only.
|
|
2572
|
+
_NO_TASK_SHORT_ACKS = frozenset({
|
|
2573
|
+
"ok", "okay", "k", "kk", "y", "n", "yes", "no", "nope", "yep", "yeah",
|
|
2574
|
+
"thanks", "thank", "thx", "ty", "ack", "noted", "received", "understood",
|
|
2575
|
+
"test", "ping", "hi", "hello",
|
|
2576
|
+
})
|
|
2577
|
+
|
|
2578
|
+
_NO_TASK_ACK_PATTERNS = (
|
|
2579
|
+
re.compile(r"awaiting\s+(?:next|further|your)\s+(?:instruction|input|command|task|directive)", re.I),
|
|
2580
|
+
re.compile(r"standing\s+by(?:\s+for\s+(?:your\s+)?(?:next|further|new)\s+(?:instruction|input|command|task|directive)?)?", re.I),
|
|
2581
|
+
re.compile(r"\b(?:ready|waiting|holding)\s+for\s+(?:your\s+)?(?:next|further|new)\s+(?:task|instruction|command|input|directive)", re.I),
|
|
2582
|
+
# Status report ending in ack: "X complete. {awaiting/standing/ready/done}"
|
|
2583
|
+
re.compile(r"\bcomplet(?:e|ed)\b[\s.,;:!\-]+(?:awaiting|standing\s+by|ready|done|finished|over\s+to\s+you)", re.I),
|
|
2584
|
+
)
|
|
2585
|
+
|
|
2586
|
+
|
|
2587
|
+
def _is_no_task_user_text(text: str) -> bool:
|
|
2588
|
+
if not text:
|
|
2589
|
+
return False
|
|
2590
|
+
stripped = text.strip()
|
|
2591
|
+
if not stripped:
|
|
2592
|
+
return False
|
|
2593
|
+
bare = re.sub(r"[^\w\s]", "", stripped).strip().lower()
|
|
2594
|
+
if bare in _NO_TASK_SHORT_ACKS:
|
|
2595
|
+
return True
|
|
2596
|
+
if re.fullmatch(r"\d+(?:\.\d+)?", bare):
|
|
2597
|
+
return True
|
|
2598
|
+
snippet = stripped[:400]
|
|
2599
|
+
return any(p.search(snippet) for p in _NO_TASK_ACK_PATTERNS)
|
|
2600
|
+
|
|
2601
|
+
|
|
2602
|
+
def _latest_user_query_text(anthropic_body: dict) -> str:
|
|
2603
|
+
"""Return the most recent user message *text* — walking past
|
|
2604
|
+
tool_result-only messages to find the last actual human query.
|
|
2605
|
+
|
|
2606
|
+
During agentic loops the trailing user message is a tool_result block
|
|
2607
|
+
with no ``text`` parts, so ``_latest_user_text`` returns empty.
|
|
2608
|
+
Tool-narrowing needs query tokens to score tools; without them it
|
|
2609
|
+
keeps all tools (defeating the purpose). This walker pulls text
|
|
2610
|
+
from prior user turns as a fallback so narrowing stays useful in
|
|
2611
|
+
long loops.
|
|
2612
|
+
"""
|
|
2613
|
+
for msg in reversed(anthropic_body.get("messages", [])):
|
|
2614
|
+
if msg.get("role") != "user":
|
|
2615
|
+
continue
|
|
2616
|
+
content = msg.get("content", "")
|
|
2617
|
+
if isinstance(content, str) and content.strip():
|
|
2618
|
+
return content
|
|
2619
|
+
if isinstance(content, list):
|
|
2620
|
+
text_parts = [
|
|
2621
|
+
b.get("text", "")
|
|
2622
|
+
for b in content
|
|
2623
|
+
if isinstance(b, dict) and b.get("type") == "text" and b.get("text")
|
|
2624
|
+
]
|
|
2625
|
+
if text_parts:
|
|
2626
|
+
return "\n".join(text_parts)
|
|
2627
|
+
return ""
|
|
2628
|
+
|
|
2629
|
+
|
|
2058
2630
|
def _tokenize_for_tool_ranking(text: str) -> set[str]:
|
|
2059
2631
|
return {m.group(0).lower() for m in re.finditer(r"[a-zA-Z0-9_]{2,}", text)}
|
|
2060
2632
|
|
|
@@ -2074,6 +2646,13 @@ def _narrow_tools_for_request(
|
|
|
2074
2646
|
|
|
2075
2647
|
query_text = _latest_user_text(anthropic_body).lower()
|
|
2076
2648
|
query_tokens = _tokenize_for_tool_ranking(query_text)
|
|
2649
|
+
if not query_tokens:
|
|
2650
|
+
# Walk back past tool_result turns to find the prior real human
|
|
2651
|
+
# query. Lets narrowing stay effective during agentic loops where
|
|
2652
|
+
# the latest user msg is just a tool_result block (no text).
|
|
2653
|
+
fallback_query = _latest_user_query_text(anthropic_body).lower()
|
|
2654
|
+
query_text = fallback_query or query_text
|
|
2655
|
+
query_tokens = _tokenize_for_tool_ranking(query_text)
|
|
2077
2656
|
if not query_tokens:
|
|
2078
2657
|
n_msgs = len(anthropic_body.get("messages", []))
|
|
2079
2658
|
if (
|
|
@@ -2198,6 +2777,18 @@ def _resolve_state_machine_tool_choice(
|
|
|
2198
2777
|
monitor.finalize_synthetic_tool_id = ""
|
|
2199
2778
|
return None, "fresh_user_text"
|
|
2200
2779
|
|
|
2780
|
+
# 2026-05-12: No-task ack guard. When the latest user message is just a
|
|
2781
|
+
# tool_result (no fresh text), walk back to the most recent human query.
|
|
2782
|
+
# If that query is a short ack or "X complete. awaiting next" status,
|
|
2783
|
+
# do not force tool_choice — let the model produce a natural finalization
|
|
2784
|
+
# text instead of ruminating in <think> blocks.
|
|
2785
|
+
last_user_query = _latest_user_query_text(anthropic_body).strip()
|
|
2786
|
+
if last_user_query and _is_no_task_user_text(last_user_query):
|
|
2787
|
+
monitor.reset_tool_turn_state(reason="no_task_user_text")
|
|
2788
|
+
monitor.finalize_continuation_count = 0
|
|
2789
|
+
monitor.finalize_synthetic_tool_id = ""
|
|
2790
|
+
return None, "no_task_user_text"
|
|
2791
|
+
|
|
2201
2792
|
active_loop = (
|
|
2202
2793
|
has_tool_results
|
|
2203
2794
|
and last_user_has_tool_result
|
|
@@ -2271,7 +2862,15 @@ def _resolve_state_machine_tool_choice(
|
|
|
2271
2862
|
dup_tool,
|
|
2272
2863
|
)
|
|
2273
2864
|
|
|
2274
|
-
|
|
2865
|
+
# Fix K (2026-04-22): require cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
|
|
2866
|
+
# before flipping phase. Single-repeat cycles are legitimate in working
|
|
2867
|
+
# sessions (e.g. re-reading the same file across edits). dup_target
|
|
2868
|
+
# above already demands threshold=3 before asserting a cycle, so the
|
|
2869
|
+
# `cycle_looping = True, cycle_repeat = 2` pair from that branch is
|
|
2870
|
+
# kept as a strong signal (read target repeated 3+ times). Low-repeat
|
|
2871
|
+
# cycles detected by detect_tool_cycle get filtered here.
|
|
2872
|
+
cycle_trip = cycle_looping and cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
|
|
2873
|
+
if cycle_trip or stagnating:
|
|
2275
2874
|
reason = "cycle_detected" if cycle_looping else "stagnation"
|
|
2276
2875
|
monitor.set_tool_turn_phase("review", reason=reason)
|
|
2277
2876
|
monitor.tool_state_review_cycles += 1
|
|
@@ -2313,11 +2912,27 @@ def _resolve_state_machine_tool_choice(
|
|
|
2313
2912
|
|
|
2314
2913
|
if monitor.tool_state_forced_budget_remaining <= 0:
|
|
2315
2914
|
monitor.set_tool_turn_phase("review", reason="forced_budget_exhausted")
|
|
2316
|
-
# Only count toward review cycle limit if there was an actual
|
|
2317
|
-
# cycle/stagnation detected. Budget exhaustion alone means the
|
|
2318
|
-
# model is working — it just used all its turns — not cycling.
|
|
2319
2915
|
if cycle_looping or stagnating:
|
|
2320
2916
|
monitor.tool_state_review_cycles += 1
|
|
2917
|
+
monitor.tool_state_unproductive_exhaustion_streak = 0
|
|
2918
|
+
else:
|
|
2919
|
+
# Track consecutive unproductive exhaustions. Even without a
|
|
2920
|
+
# detected cycle, if the model burns through the forced budget
|
|
2921
|
+
# repeatedly with distinct-but-useless tool calls, treat it as
|
|
2922
|
+
# a loop and force finalize. Catches the 35B-A3B failure mode
|
|
2923
|
+
# where different short tool calls defeat per-tool cycle
|
|
2924
|
+
# detection.
|
|
2925
|
+
monitor.tool_state_unproductive_exhaustion_streak += 1
|
|
2926
|
+
if monitor.tool_state_unproductive_exhaustion_streak >= PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT:
|
|
2927
|
+
logger.warning(
|
|
2928
|
+
"TOOL STATE MACHINE: %d consecutive unproductive budget exhaustions — forcing finalize",
|
|
2929
|
+
monitor.tool_state_unproductive_exhaustion_streak,
|
|
2930
|
+
)
|
|
2931
|
+
monitor.set_tool_turn_phase("finalize", reason="unproductive_exhaustion")
|
|
2932
|
+
monitor.tool_state_unproductive_exhaustion_streak = 0
|
|
2933
|
+
monitor.tool_state_forced_budget_remaining = 0
|
|
2934
|
+
monitor.tool_state_auto_budget_remaining = 0
|
|
2935
|
+
return "finalize", "unproductive_exhaustion"
|
|
2321
2936
|
monitor.tool_state_auto_budget_remaining = max(
|
|
2322
2937
|
1, PROXY_TOOL_STATE_AUTO_BUDGET
|
|
2323
2938
|
)
|
|
@@ -2325,10 +2940,11 @@ def _resolve_state_machine_tool_choice(
|
|
|
2325
2940
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2326
2941
|
)
|
|
2327
2942
|
logger.warning(
|
|
2328
|
-
"TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s)",
|
|
2943
|
+
"TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s unprod_exh=%d)",
|
|
2329
2944
|
monitor.tool_state_review_cycles,
|
|
2330
2945
|
cycle_looping,
|
|
2331
2946
|
stagnating,
|
|
2947
|
+
monitor.tool_state_unproductive_exhaustion_streak,
|
|
2332
2948
|
)
|
|
2333
2949
|
return "required", "forced_budget_exhausted"
|
|
2334
2950
|
|
|
@@ -2393,6 +3009,33 @@ def build_openai_request(
|
|
|
2393
3009
|
|
|
2394
3010
|
has_tools = _has_tool_definitions(anthropic_body)
|
|
2395
3011
|
|
|
3012
|
+
# Translate Anthropic `thinking` parameter to upstream `enable_thinking`.
|
|
3013
|
+
# Anthropic shape: {"thinking": {"type": "enabled", "budget_tokens": 1024}}
|
|
3014
|
+
# or {"type": "disabled"}. Per the Anthropic spec, thinking is OFF by
|
|
3015
|
+
# default and ONLY enabled when the client opts in. Match that behaviour:
|
|
3016
|
+
# - thinking.type == "enabled" -> enable_thinking=True
|
|
3017
|
+
# - thinking.type == "disabled" or absent -> enable_thinking=False
|
|
3018
|
+
# Without this, Qwen's chat template (which defaults thinking ON) would
|
|
3019
|
+
# consume the client's max_tokens budget on internal reasoning, leaving
|
|
3020
|
+
# nothing for the visible answer.
|
|
3021
|
+
anthropic_thinking = anthropic_body.get("thinking")
|
|
3022
|
+
if isinstance(anthropic_thinking, dict):
|
|
3023
|
+
ttype = (anthropic_thinking.get("type") or "").lower()
|
|
3024
|
+
if ttype == "enabled":
|
|
3025
|
+
openai_body["enable_thinking"] = True
|
|
3026
|
+
else:
|
|
3027
|
+
openai_body["enable_thinking"] = False
|
|
3028
|
+
else:
|
|
3029
|
+
# Match Anthropic default: thinking off unless explicitly requested.
|
|
3030
|
+
openai_body["enable_thinking"] = False
|
|
3031
|
+
|
|
3032
|
+
# Global thinking-off (G): apply to every request, not just tool turns.
|
|
3033
|
+
# Only applies when the client did NOT explicitly request thinking above.
|
|
3034
|
+
# Per-path tool-turn handling below (DISABLE_THINKING_ON_TOOL_TURNS) is
|
|
3035
|
+
# additive — ALWAYS supersedes when set.
|
|
3036
|
+
if PROXY_DISABLE_THINKING_ALWAYS:
|
|
3037
|
+
openai_body["enable_thinking"] = False
|
|
3038
|
+
|
|
2396
3039
|
# Inject agentic protocol instructions only for tool-enabled turns.
|
|
2397
3040
|
# Use minimal supplement for qwen models to reduce prompt leak surface.
|
|
2398
3041
|
if has_tools:
|
|
@@ -2402,6 +3045,15 @@ def build_openai_request(
|
|
|
2402
3045
|
if "qwen" in model_name and PROXY_AGENTIC_SUPPLEMENT_MODE != "legacy"
|
|
2403
3046
|
else _AGENTIC_SYSTEM_SUPPLEMENT
|
|
2404
3047
|
)
|
|
3048
|
+
# When thinking is explicitly disabled (Anthropic default, plus our
|
|
3049
|
+
# tool-turn forcing) but the upstream model is Qwen — which emits
|
|
3050
|
+
# <think> blocks regardless of enable_thinking — append a strong
|
|
3051
|
+
# directive that suppresses internal reasoning. Without this, small
|
|
3052
|
+
# max_tokens budgets get fully consumed by the model's reasoning,
|
|
3053
|
+
# producing required_tool_miss retries (observed in Shannon workflows
|
|
3054
|
+
# with max_tokens=512 + tool_choice=required).
|
|
3055
|
+
if openai_body.get("enable_thinking") is False:
|
|
3056
|
+
supplement = supplement + _NO_THINKING_DIRECTIVE
|
|
2405
3057
|
if (
|
|
2406
3058
|
openai_body["messages"]
|
|
2407
3059
|
and openai_body["messages"][0].get("role") == "system"
|
|
@@ -2422,23 +3074,62 @@ def build_openai_request(
|
|
|
2422
3074
|
if "max_tokens" in anthropic_body:
|
|
2423
3075
|
requested_raw = max(1, int(anthropic_body["max_tokens"]))
|
|
2424
3076
|
|
|
2425
|
-
# Enforce configurable minimum floor for
|
|
2426
|
-
#
|
|
2427
|
-
#
|
|
3077
|
+
# Enforce configurable minimum floor for tool turns: the model needs
|
|
3078
|
+
# enough headroom to emit complete tool-call arguments (long heredocs,
|
|
3079
|
+
# full-function oldString/newString pairs, etc.) without hitting the
|
|
3080
|
+
# client-requested max_tokens in the middle of a JSON string. If the
|
|
3081
|
+
# client requested >= the floor we keep their value; short preflight
|
|
3082
|
+
# requests (max_tokens <= 1024) always skip the floor to avoid
|
|
3083
|
+
# inflating plan-generation turns.
|
|
2428
3084
|
#
|
|
2429
|
-
# The
|
|
2430
|
-
#
|
|
2431
|
-
#
|
|
2432
|
-
#
|
|
2433
|
-
thinking_active_for_request =
|
|
3085
|
+
# The earlier gating on PROXY_DISABLE_THINKING_ON_TOOL_TURNS was too
|
|
3086
|
+
# restrictive: it skipped the floor on every tool turn once thinking
|
|
3087
|
+
# was off, which re-introduced truncated tool calls on long edits.
|
|
3088
|
+
# Set PROXY_MAX_TOKENS_FLOOR=0 to disable the floor entirely.
|
|
3089
|
+
thinking_active_for_request = (
|
|
3090
|
+
has_tools
|
|
3091
|
+
and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
|
|
3092
|
+
and not PROXY_DISABLE_THINKING_ALWAYS
|
|
3093
|
+
)
|
|
3094
|
+
SMALL_PREFLIGHT_THRESHOLD = 1024
|
|
3095
|
+
# Qwen-style models emit <think> blocks regardless of the
|
|
3096
|
+
# enable_thinking flag (template ignored by trained behaviour).
|
|
3097
|
+
# For tool turns those blocks alone consume ~400-1000 tokens, so a
|
|
3098
|
+
# client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
|
|
3099
|
+
# budget for the tool_call itself — manifesting as required_tool_miss
|
|
3100
|
+
# retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
|
|
3101
|
+
# per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
|
|
3102
|
+
THINKING_MIN_FOR_TOOLS = 2048
|
|
2434
3103
|
skip_floor = (
|
|
2435
|
-
not has_tools # non-tool requests don't need
|
|
2436
|
-
or PROXY_DISABLE_THINKING_ON_TOOL_TURNS # thinking disabled on tool turns
|
|
3104
|
+
not has_tools # non-tool requests don't need the headroom
|
|
2437
3105
|
or PROXY_MAX_TOKENS_FLOOR <= 0 # floor explicitly disabled
|
|
3106
|
+
or requested_raw <= SMALL_PREFLIGHT_THRESHOLD # tiny preflight request
|
|
2438
3107
|
)
|
|
3108
|
+
# Qwen-style models emit <think> blocks regardless of the
|
|
3109
|
+
# enable_thinking flag (template ignored by trained behaviour).
|
|
3110
|
+
# For tool turns those blocks alone consume ~400-1000 tokens, so a
|
|
3111
|
+
# client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
|
|
3112
|
+
# budget for the tool_call itself — manifesting as required_tool_miss
|
|
3113
|
+
# retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
|
|
3114
|
+
# per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
|
|
3115
|
+
THINKING_MIN_FOR_TOOLS = 2048
|
|
2439
3116
|
if skip_floor:
|
|
2440
3117
|
requested_max = requested_raw
|
|
2441
|
-
|
|
3118
|
+
# Even when skipping the big floor, bump small tool-turn
|
|
3119
|
+
# budgets so Qwen's mandatory thinking has room before the
|
|
3120
|
+
# tool_call. Only applies when tools are present.
|
|
3121
|
+
if (
|
|
3122
|
+
has_tools
|
|
3123
|
+
and requested_raw < THINKING_MIN_FOR_TOOLS
|
|
3124
|
+
and requested_raw > 16 # leave true preflight (e.g. max_tokens=1) alone
|
|
3125
|
+
):
|
|
3126
|
+
requested_max = THINKING_MIN_FOR_TOOLS
|
|
3127
|
+
logger.info(
|
|
3128
|
+
"MAX_TOKENS thinking-floor: %d -> %d (tool turn, Qwen mandatory thinking)",
|
|
3129
|
+
requested_raw,
|
|
3130
|
+
requested_max,
|
|
3131
|
+
)
|
|
3132
|
+
elif requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
|
|
2442
3133
|
logger.info(
|
|
2443
3134
|
"MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
|
|
2444
3135
|
has_tools,
|
|
@@ -2612,6 +3303,8 @@ def build_openai_request(
|
|
|
2612
3303
|
# Skip all further tool_choice logic — no tools this turn
|
|
2613
3304
|
if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
2614
3305
|
openai_body["enable_thinking"] = False
|
|
3306
|
+
if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
|
|
3307
|
+
openai_body["speculative.n_max"] = 0
|
|
2615
3308
|
return openai_body
|
|
2616
3309
|
|
|
2617
3310
|
# Check if forced-tool dampener or loop breaker should override tool_choice
|
|
@@ -2635,23 +3328,35 @@ def build_openai_request(
|
|
|
2635
3328
|
monitor.tool_state_stagnation_streak,
|
|
2636
3329
|
)
|
|
2637
3330
|
elif state_choice == "finalize":
|
|
2638
|
-
|
|
2639
|
-
|
|
3331
|
+
# Fix H/J (2026-04-22): Do NOT strip tools from the body on
|
|
3332
|
+
# cycle-limit finalize. Stripping tools lets the model emit
|
|
3333
|
+
# prose that LOOKS like a tool call ("<function=edit>…") but
|
|
3334
|
+
# has no structured tool_calls array, so the Anthropic client
|
|
3335
|
+
# sees end_turn with no action and halts. Instead, keep tools
|
|
3336
|
+
# available, set tool_choice=auto, and nudge the model to
|
|
3337
|
+
# either complete with a tool call OR emit a proper summary.
|
|
3338
|
+
# Grammar (when PROXY_TOOL_CALL_GRAMMAR_REQUIRED_ONLY=off) will
|
|
3339
|
+
# still constrain tool-call emission to valid JSON format.
|
|
3340
|
+
openai_body["tool_choice"] = "auto"
|
|
2640
3341
|
monitor.finalize_turn_active = True
|
|
3342
|
+
monitor.finalize_hard_stop_count += 1 # monotonic marker: a finalize fired this session
|
|
2641
3343
|
monitor.consecutive_forced_count = 0
|
|
2642
3344
|
monitor.no_progress_streak = 0
|
|
2643
|
-
# Option 3: Inject explicit "no tool calls" instruction to reduce XML leak
|
|
2644
3345
|
finalize_instruction = {
|
|
2645
3346
|
"role": "user",
|
|
2646
3347
|
"content": (
|
|
2647
|
-
"
|
|
2648
|
-
"
|
|
3348
|
+
"You have been looping on the same tools for several turns. "
|
|
3349
|
+
"Wrap up: either emit ONE decisive tool call that completes "
|
|
3350
|
+
"the task, or reply with a plain-text summary of what you "
|
|
3351
|
+
"accomplished and what is blocking further progress. Do NOT "
|
|
3352
|
+
"emit tool call text in prose form — if you call a tool, do "
|
|
3353
|
+
"it through the structured tool_call mechanism."
|
|
2649
3354
|
),
|
|
2650
3355
|
}
|
|
2651
3356
|
msgs = openai_body.get("messages", [])
|
|
2652
3357
|
msgs.append(finalize_instruction)
|
|
2653
3358
|
logger.warning(
|
|
2654
|
-
"TOOL STATE MACHINE:
|
|
3359
|
+
"TOOL STATE MACHINE: finalize turn (reason=%s) — tools kept, tool_choice=auto",
|
|
2655
3360
|
state_reason,
|
|
2656
3361
|
)
|
|
2657
3362
|
elif state_choice == "required":
|
|
@@ -2732,11 +3437,24 @@ def build_openai_request(
|
|
|
2732
3437
|
elif state_reason in {"fresh_user_text", "inactive_loop"} and n_msgs <= 1:
|
|
2733
3438
|
monitor.consecutive_forced_count = 0
|
|
2734
3439
|
monitor.no_progress_streak = 0
|
|
2735
|
-
|
|
2736
|
-
|
|
2737
|
-
|
|
2738
|
-
|
|
2739
|
-
|
|
3440
|
+
# Force tool_choice=required on first turn to ensure local models
|
|
3441
|
+
# produce a tool call instead of plain text (cold-start fix).
|
|
3442
|
+
# Gated by PROXY_FORCE_TOOL_CHOICE_ON_COLD_START — Gemma 4 routes
|
|
3443
|
+
# 'auto' correctly without needing the force, and the force
|
|
3444
|
+
# triggers malformed-JSON emissions on Gemma 4 cold turns.
|
|
3445
|
+
if has_tools and n_msgs == 1 and PROXY_FORCE_TOOL_CHOICE_ON_COLD_START:
|
|
3446
|
+
openai_body["tool_choice"] = "required"
|
|
3447
|
+
logger.info(
|
|
3448
|
+
"tool_choice forced to 'required' on first turn (reason=%s n_msgs=%d cold_start_fix=true)",
|
|
3449
|
+
state_reason,
|
|
3450
|
+
n_msgs,
|
|
3451
|
+
)
|
|
3452
|
+
else:
|
|
3453
|
+
logger.info(
|
|
3454
|
+
"tool_choice left unchanged after state reset (reason=%s n_msgs=%d)",
|
|
3455
|
+
state_reason,
|
|
3456
|
+
n_msgs,
|
|
3457
|
+
)
|
|
2740
3458
|
elif monitor.should_release_tool_choice():
|
|
2741
3459
|
openai_body["tool_choice"] = "auto"
|
|
2742
3460
|
monitor.consecutive_forced_count = 0
|
|
@@ -2767,10 +3485,18 @@ def build_openai_request(
|
|
|
2767
3485
|
monitor.reset_tool_turn_state(reason="no_tool_results")
|
|
2768
3486
|
|
|
2769
3487
|
|
|
2770
|
-
if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
3488
|
+
if PROXY_DISABLE_THINKING_ALWAYS or PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
2771
3489
|
openai_body["enable_thinking"] = False
|
|
2772
3490
|
logger.info(
|
|
2773
|
-
"Thinking disabled
|
|
3491
|
+
"Thinking disabled (always=%s tool_turns=%s)",
|
|
3492
|
+
PROXY_DISABLE_THINKING_ALWAYS,
|
|
3493
|
+
PROXY_DISABLE_THINKING_ON_TOOL_TURNS,
|
|
3494
|
+
)
|
|
3495
|
+
|
|
3496
|
+
if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
|
|
3497
|
+
openai_body["speculative.n_max"] = 0
|
|
3498
|
+
logger.info(
|
|
3499
|
+
"Spec decoding disabled for tool turn (PROXY_DISABLE_SPEC_ON_TOOL_TURNS=on)"
|
|
2774
3500
|
)
|
|
2775
3501
|
|
|
2776
3502
|
_apply_tool_call_grammar(openai_body, grammar_override=profile_grammar)
|
|
@@ -3083,7 +3809,10 @@ def _schema_type_matches(value, expected_type: str) -> bool:
|
|
|
3083
3809
|
|
|
3084
3810
|
def _string_contains_tool_markup(value: str) -> bool:
|
|
3085
3811
|
lowered = value.lower()
|
|
3086
|
-
markers = (
|
|
3812
|
+
markers = (
|
|
3813
|
+
"<parameter", "</parameter", "<tool_call", "<function=", "</function",
|
|
3814
|
+
"<|tool_call>", "<tool_call|>", # Gemma 4 native DSL
|
|
3815
|
+
)
|
|
3087
3816
|
return any(marker in lowered for marker in markers)
|
|
3088
3817
|
|
|
3089
3818
|
|
|
@@ -3155,6 +3884,343 @@ _TOOL_CALL_XML_RE = re.compile(
|
|
|
3155
3884
|
re.DOTALL,
|
|
3156
3885
|
)
|
|
3157
3886
|
|
|
3887
|
+
# Hermes-style XML function call format emitted by some Qwen/Llama fine-tunes
|
|
3888
|
+
# when grammar is not applied:
|
|
3889
|
+
# <function=name>
|
|
3890
|
+
# <parameter=key>
|
|
3891
|
+
# value
|
|
3892
|
+
# </parameter>
|
|
3893
|
+
# ...
|
|
3894
|
+
# </function>
|
|
3895
|
+
#
|
|
3896
|
+
# The value of a <parameter=KEY> block may span multiple lines and include
|
|
3897
|
+
# arbitrary characters (code snippets, JSON, quotes). The closing
|
|
3898
|
+
# </parameter> tag may be missing if the model emitted EOS prematurely —
|
|
3899
|
+
# in which case we consume up to the next <parameter=...> tag or end of
|
|
3900
|
+
# string. Names are captured as alphanumeric + underscore to avoid pulling
|
|
3901
|
+
# in attribute-like garbage.
|
|
3902
|
+
_HERMES_FUNCTION_RE = re.compile(
|
|
3903
|
+
r"<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>|\Z)",
|
|
3904
|
+
re.DOTALL,
|
|
3905
|
+
)
|
|
3906
|
+
_HERMES_PARAMETER_RE = re.compile(
|
|
3907
|
+
r"<parameter=([A-Za-z_][A-Za-z0-9_]*)>\s*(.*?)\s*(?=</parameter>|<parameter=|\Z)",
|
|
3908
|
+
re.DOTALL,
|
|
3909
|
+
)
|
|
3910
|
+
|
|
3911
|
+
|
|
3912
|
+
def _extract_hermes_tool_calls(text: str) -> tuple[list[dict], str]:
|
|
3913
|
+
"""Parse Hermes-style ``<function=name><parameter=k>v</parameter></function>``
|
|
3914
|
+
blocks out of *text*. Used as a fallback when the Qwen JSON format
|
|
3915
|
+
(``<tool_call>{...}</tool_call>``) is not present — for example on
|
|
3916
|
+
finalize turns where grammar does not constrain the output. Tolerates
|
|
3917
|
+
premature EOS (missing closing ``</parameter>`` / ``</function>``)."""
|
|
3918
|
+
if "<function=" not in text:
|
|
3919
|
+
return [], text
|
|
3920
|
+
|
|
3921
|
+
extracted: list[dict] = []
|
|
3922
|
+
matched_spans: list[tuple[int, int]] = []
|
|
3923
|
+
|
|
3924
|
+
for fn_match in _HERMES_FUNCTION_RE.finditer(text):
|
|
3925
|
+
name = fn_match.group(1).strip()
|
|
3926
|
+
body = fn_match.group(2) or ""
|
|
3927
|
+
if not name:
|
|
3928
|
+
continue
|
|
3929
|
+
args: dict = {}
|
|
3930
|
+
for p_match in _HERMES_PARAMETER_RE.finditer(body):
|
|
3931
|
+
key = p_match.group(1).strip()
|
|
3932
|
+
value = p_match.group(2)
|
|
3933
|
+
if key:
|
|
3934
|
+
# Strip one leading newline that the template usually adds
|
|
3935
|
+
# but preserve interior whitespace (code indentation, etc.)
|
|
3936
|
+
if value.startswith("\n"):
|
|
3937
|
+
value = value[1:]
|
|
3938
|
+
args[key] = value
|
|
3939
|
+
extracted.append(
|
|
3940
|
+
{
|
|
3941
|
+
"id": f"toolu_{uuid.uuid4().hex[:24]}",
|
|
3942
|
+
"type": "function",
|
|
3943
|
+
"function": {
|
|
3944
|
+
"name": name,
|
|
3945
|
+
"arguments": json.dumps(args, separators=(",", ":")),
|
|
3946
|
+
},
|
|
3947
|
+
}
|
|
3948
|
+
)
|
|
3949
|
+
matched_spans.append(fn_match.span())
|
|
3950
|
+
|
|
3951
|
+
if not extracted:
|
|
3952
|
+
return [], text
|
|
3953
|
+
|
|
3954
|
+
# Remove matched function blocks from text (plus any dangling
|
|
3955
|
+
# <tool_call>/</tool_call> wrappers around them).
|
|
3956
|
+
remaining = text
|
|
3957
|
+
for start, end in reversed(matched_spans):
|
|
3958
|
+
remaining = remaining[:start] + remaining[end:]
|
|
3959
|
+
# Strip leftover <tool_call>…</tool_call> envelopes that now enclose
|
|
3960
|
+
# nothing useful.
|
|
3961
|
+
remaining = re.sub(r"<tool_call>\s*</tool_call>", "", remaining, flags=re.DOTALL)
|
|
3962
|
+
remaining = remaining.strip()
|
|
3963
|
+
|
|
3964
|
+
logger.info(
|
|
3965
|
+
"TOOL CALL EXTRACTION: recovered %d Hermes-format tool call(s) from text content",
|
|
3966
|
+
len(extracted),
|
|
3967
|
+
)
|
|
3968
|
+
return extracted, remaining
|
|
3969
|
+
|
|
3970
|
+
|
|
3971
|
+
# ---------------------------------------------------------------------------
|
|
3972
|
+
# Gemma 4 tool-call DSL extractors
|
|
3973
|
+
# ---------------------------------------------------------------------------
|
|
3974
|
+
# Gemma 4's chat template emits tool calls as:
|
|
3975
|
+
# <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
|
|
3976
|
+
# Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
|
|
3977
|
+
# Llama-server's --jinja autoparser usually converts these to standard
|
|
3978
|
+
# OpenAI tool_calls, but the raw form can leak through on (a) malformed
|
|
3979
|
+
# emissions, (b) finalize turns, (c) non-tool-template requests where the
|
|
3980
|
+
# model still tries to call a tool. This parser catches those cases.
|
|
3981
|
+
#
|
|
3982
|
+
# Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
|
|
3983
|
+
# markdown blocks when it doesn't trust the template — observed when
|
|
3984
|
+
# tool_choice was forced 'required' but the model lacked confidence in the
|
|
3985
|
+
# native format. Only treated as a tool call when the JSON has a "name".
|
|
3986
|
+
_GEMMA4_TOOL_CALL_DSL_RE = re.compile(
|
|
3987
|
+
r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
|
|
3988
|
+
re.DOTALL,
|
|
3989
|
+
)
|
|
3990
|
+
# Markdown JSON code-block fallback. Group 1 = JSON content (may include
|
|
3991
|
+
# leading/trailing whitespace inside the block).
|
|
3992
|
+
_GEMMA4_MARKDOWN_JSON_RE = re.compile(
|
|
3993
|
+
r"```(?:json)?\s*(\{.*?\})\s*```",
|
|
3994
|
+
re.DOTALL,
|
|
3995
|
+
)
|
|
3996
|
+
|
|
3997
|
+
|
|
3998
|
+
def _parse_gemma4_dsl_args(raw: str) -> dict | None:
|
|
3999
|
+
"""Parse Gemma 4's tool-call DSL arg body into a Python dict.
|
|
4000
|
+
|
|
4001
|
+
Input shape (between the `{` and `}` of the DSL):
|
|
4002
|
+
key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
|
|
4003
|
+
|
|
4004
|
+
Strategy: replace `<|"|>` with `"`, wrap unquoted keys in quotes, then
|
|
4005
|
+
feed to json.loads. Returns None on parse failure (caller decides).
|
|
4006
|
+
"""
|
|
4007
|
+
if not raw or not raw.strip():
|
|
4008
|
+
return {}
|
|
4009
|
+
s = raw.replace('<|"|>', '"')
|
|
4010
|
+
# Wrap unquoted keys: `key:` -> `"key":` (only at start or after `,` / `{` / whitespace).
|
|
4011
|
+
s = re.sub(r"(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:", r'\1"\2":', s)
|
|
4012
|
+
s = "{" + s + "}"
|
|
4013
|
+
try:
|
|
4014
|
+
parsed = json.loads(s)
|
|
4015
|
+
return parsed if isinstance(parsed, dict) else None
|
|
4016
|
+
except json.JSONDecodeError:
|
|
4017
|
+
return None
|
|
4018
|
+
|
|
4019
|
+
|
|
4020
|
+
def _schema_match_tool(payload: dict, available_tools: list[dict]) -> str | None:
|
|
4021
|
+
"""Match a bare-args dict against available tool schemas.
|
|
4022
|
+
|
|
4023
|
+
Score each tool by:
|
|
4024
|
+
- +10 per required field present in payload
|
|
4025
|
+
- +1 per optional property present
|
|
4026
|
+
- -5 per payload key NOT in tool's properties
|
|
4027
|
+
- -100 if any required field is missing
|
|
4028
|
+
Return the name of the highest-scoring tool, or None if no clear match.
|
|
4029
|
+
"""
|
|
4030
|
+
if not isinstance(payload, dict) or not available_tools:
|
|
4031
|
+
return None
|
|
4032
|
+
payload_keys = set(payload.keys())
|
|
4033
|
+
best_name = None
|
|
4034
|
+
best_score = 0
|
|
4035
|
+
for tool in available_tools:
|
|
4036
|
+
if not isinstance(tool, dict):
|
|
4037
|
+
continue
|
|
4038
|
+
# Anthropic tools format: {"name": ..., "input_schema": {...}}
|
|
4039
|
+
# OpenAI format: {"type": "function", "function": {"name": ..., "parameters": {...}}}
|
|
4040
|
+
name = tool.get("name")
|
|
4041
|
+
schema = tool.get("input_schema")
|
|
4042
|
+
if name is None and isinstance(tool.get("function"), dict):
|
|
4043
|
+
name = tool["function"].get("name")
|
|
4044
|
+
schema = tool["function"].get("parameters")
|
|
4045
|
+
if not isinstance(name, str) or not isinstance(schema, dict):
|
|
4046
|
+
continue
|
|
4047
|
+
properties = schema.get("properties") if isinstance(schema.get("properties"), dict) else {}
|
|
4048
|
+
required = set(schema.get("required") or [])
|
|
4049
|
+
prop_keys = set(properties.keys())
|
|
4050
|
+
score = 0
|
|
4051
|
+
missing_required = required - payload_keys
|
|
4052
|
+
if missing_required:
|
|
4053
|
+
score -= 100
|
|
4054
|
+
score += 10 * len(required & payload_keys)
|
|
4055
|
+
score += len((payload_keys & prop_keys) - required)
|
|
4056
|
+
score -= 5 * len(payload_keys - prop_keys)
|
|
4057
|
+
if score > best_score:
|
|
4058
|
+
best_score = score
|
|
4059
|
+
best_name = name
|
|
4060
|
+
return best_name if best_score >= 10 else None
|
|
4061
|
+
|
|
4062
|
+
|
|
4063
|
+
def _extract_gemma4_tool_calls(
|
|
4064
|
+
text: str, available_tools: list[dict] | None = None
|
|
4065
|
+
) -> tuple[list[dict], str]:
|
|
4066
|
+
"""Parse Gemma 4 tool-call emissions out of *text*.
|
|
4067
|
+
|
|
4068
|
+
Three formats handled, in order:
|
|
4069
|
+
1. Native DSL: ``<|tool_call>call:N{...}<tool_call|>``
|
|
4070
|
+
2. Markdown with name: ```json\\n{"name": "N", "arguments": {...}}\\n```
|
|
4071
|
+
3. Markdown bare-args + ``available_tools`` provided — schema-match
|
|
4072
|
+
against tool definitions (fix D for Gemma 4 cold-turn malformation
|
|
4073
|
+
where the model emits ``{"city": "Paris"}`` for a get_weather call
|
|
4074
|
+
instead of ``{"name": "get_weather", "arguments": {"city": "Paris"}}``).
|
|
4075
|
+
Without ``available_tools``, bare-args blocks pass through as text.
|
|
4076
|
+
|
|
4077
|
+
Returns ``(extracted_openai_tool_calls, remaining_text)``.
|
|
4078
|
+
"""
|
|
4079
|
+
if "<|tool_call>" not in text and "```" not in text:
|
|
4080
|
+
return [], text
|
|
4081
|
+
|
|
4082
|
+
extracted: list[dict] = []
|
|
4083
|
+
matched_spans: list[tuple[int, int]] = []
|
|
4084
|
+
|
|
4085
|
+
# Pattern 1: native DSL
|
|
4086
|
+
for m in _GEMMA4_TOOL_CALL_DSL_RE.finditer(text):
|
|
4087
|
+
name = m.group(1).strip()
|
|
4088
|
+
body = m.group(2) or ""
|
|
4089
|
+
if not name:
|
|
4090
|
+
continue
|
|
4091
|
+
args = _parse_gemma4_dsl_args(body)
|
|
4092
|
+
if args is None:
|
|
4093
|
+
# DSL body unparseable; skip and let model retry next turn.
|
|
4094
|
+
continue
|
|
4095
|
+
extracted.append(
|
|
4096
|
+
{
|
|
4097
|
+
"id": f"toolu_{uuid.uuid4().hex[:24]}",
|
|
4098
|
+
"type": "function",
|
|
4099
|
+
"function": {
|
|
4100
|
+
"name": name,
|
|
4101
|
+
"arguments": json.dumps(args, separators=(",", ":")),
|
|
4102
|
+
},
|
|
4103
|
+
}
|
|
4104
|
+
)
|
|
4105
|
+
matched_spans.append(m.span())
|
|
4106
|
+
|
|
4107
|
+
# Pattern 2: markdown JSON fallback (only if no DSL hit AND text has ```)
|
|
4108
|
+
if not extracted and "```" in text:
|
|
4109
|
+
for m in _GEMMA4_MARKDOWN_JSON_RE.finditer(text):
|
|
4110
|
+
raw_json = m.group(1)
|
|
4111
|
+
try:
|
|
4112
|
+
payload = json.loads(raw_json)
|
|
4113
|
+
except json.JSONDecodeError:
|
|
4114
|
+
# Try a JSON repair like the Qwen path does
|
|
4115
|
+
repaired = _repair_tool_call_json(raw_json)
|
|
4116
|
+
if not repaired:
|
|
4117
|
+
continue
|
|
4118
|
+
try:
|
|
4119
|
+
payload = json.loads(repaired)
|
|
4120
|
+
except json.JSONDecodeError:
|
|
4121
|
+
continue
|
|
4122
|
+
if not isinstance(payload, dict):
|
|
4123
|
+
continue
|
|
4124
|
+
name = payload.get("name")
|
|
4125
|
+
arguments_obj = None
|
|
4126
|
+
if isinstance(name, str) and name:
|
|
4127
|
+
# Standard {name, arguments} form
|
|
4128
|
+
arguments_obj = payload.get("arguments", payload.get("args", {}))
|
|
4129
|
+
elif available_tools:
|
|
4130
|
+
# Bare-args block — try schema-matching against available tools
|
|
4131
|
+
matched = _schema_match_tool(payload, available_tools)
|
|
4132
|
+
if matched is None:
|
|
4133
|
+
continue
|
|
4134
|
+
name = matched
|
|
4135
|
+
arguments_obj = payload # whole payload IS the args
|
|
4136
|
+
logger.info(
|
|
4137
|
+
"TOOL CALL EXTRACTION: schema-matched bare-args markdown JSON to tool '%s' (keys=%s)",
|
|
4138
|
+
name,
|
|
4139
|
+
sorted(payload.keys())[:6],
|
|
4140
|
+
)
|
|
4141
|
+
else:
|
|
4142
|
+
# No name, no tools to match against — pass through as text
|
|
4143
|
+
continue
|
|
4144
|
+
if isinstance(arguments_obj, dict):
|
|
4145
|
+
arguments = json.dumps(arguments_obj, separators=(",", ":"))
|
|
4146
|
+
elif isinstance(arguments_obj, str):
|
|
4147
|
+
arguments = arguments_obj
|
|
4148
|
+
else:
|
|
4149
|
+
arguments = "{}"
|
|
4150
|
+
extracted.append(
|
|
4151
|
+
{
|
|
4152
|
+
"id": f"toolu_{uuid.uuid4().hex[:24]}",
|
|
4153
|
+
"type": "function",
|
|
4154
|
+
"function": {"name": name, "arguments": arguments},
|
|
4155
|
+
}
|
|
4156
|
+
)
|
|
4157
|
+
matched_spans.append(m.span())
|
|
4158
|
+
|
|
4159
|
+
if not extracted:
|
|
4160
|
+
return [], text
|
|
4161
|
+
|
|
4162
|
+
# Strip matched spans from text (in reverse to keep indices valid)
|
|
4163
|
+
remaining = text
|
|
4164
|
+
for start, end in sorted(matched_spans, key=lambda s: -s[0]):
|
|
4165
|
+
remaining = remaining[:start] + remaining[end:]
|
|
4166
|
+
remaining = remaining.strip()
|
|
4167
|
+
|
|
4168
|
+
logger.info(
|
|
4169
|
+
"TOOL CALL EXTRACTION: recovered %d Gemma 4 tool call(s) from text content",
|
|
4170
|
+
len(extracted),
|
|
4171
|
+
)
|
|
4172
|
+
return extracted, remaining
|
|
4173
|
+
|
|
4174
|
+
|
|
4175
|
+
# ---------------------------------------------------------------------------
|
|
4176
|
+
# Gemma 4 tool-call DSL extractors
|
|
4177
|
+
# ---------------------------------------------------------------------------
|
|
4178
|
+
# Gemma 4's chat template emits tool calls as:
|
|
4179
|
+
# <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
|
|
4180
|
+
# Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
|
|
4181
|
+
# Llama-server's --jinja autoparser usually converts these to standard
|
|
4182
|
+
# OpenAI tool_calls, but the raw form can leak through on (a) malformed
|
|
4183
|
+
# emissions, (b) finalize turns, (c) non-tool-template requests where the
|
|
4184
|
+
# model still tries to call a tool. This parser catches those cases.
|
|
4185
|
+
#
|
|
4186
|
+
# Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
|
|
4187
|
+
# markdown blocks when it doesn't trust the template — observed when
|
|
4188
|
+
# tool_choice was forced 'required' but the model lacked confidence in the
|
|
4189
|
+
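# native format. Treated as a tool call when the JSON has a "name"; a
# bare-args object (no "name") is accepted only when it schema-matches one
# of the tool definitions supplied by the caller.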
|
|
4190
|
+
# NOTE(review): both patterns below are also compiled, under the same names,
# earlier in this module; these later assignments rebind them.
#
# Native DSL matcher. Group 1 = tool name (an identifier), group 2 = the raw
# argument body between the outer `{` and the `}` that directly precedes the
# closing `<tool_call|>` tag. DOTALL lets the body span multiple lines.
_GEMMA4_TOOL_CALL_DSL_RE = re.compile(
    r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
    re.DOTALL,
)
# Markdown JSON code-block fallback. Group 1 = JSON content (may include
# leading/trailing whitespace inside the block). The `json` language tag on
# the fence is optional.
_GEMMA4_MARKDOWN_JSON_RE = re.compile(
    r"```(?:json)?\s*(\{.*?\})\s*```",
    re.DOTALL,
)
|
|
4200
|
+
|
|
4201
|
+
|
|
4202
|
+
def _parse_gemma4_dsl_args(raw: str) -> dict | None:
|
|
4203
|
+
"""Parse Gemma 4's tool-call DSL arg body into a Python dict.
|
|
4204
|
+
|
|
4205
|
+
Input shape (between the `{` and `}` of the DSL):
|
|
4206
|
+
key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
|
|
4207
|
+
|
|
4208
|
+
Strategy: replace `<|"|>` with `"`, wrap unquoted keys in quotes, then
|
|
4209
|
+
feed to json.loads. Returns None on parse failure (caller decides).
|
|
4210
|
+
"""
|
|
4211
|
+
if not raw or not raw.strip():
|
|
4212
|
+
return {}
|
|
4213
|
+
s = raw.replace('<|"|>', '"')
|
|
4214
|
+
# Wrap unquoted keys: `key:` -> `"key":` (only at start or after `,` / `{` / whitespace).
|
|
4215
|
+
s = re.sub(r"(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:", r'\1"\2":', s)
|
|
4216
|
+
s = "{" + s + "}"
|
|
4217
|
+
try:
|
|
4218
|
+
parsed = json.loads(s)
|
|
4219
|
+
return parsed if isinstance(parsed, dict) else None
|
|
4220
|
+
except json.JSONDecodeError:
|
|
4221
|
+
return None
|
|
4222
|
+
|
|
4223
|
+
|
|
3158
4224
|
|
|
3159
4225
|
def _repair_tool_call_json(raw: str) -> str | None:
|
|
3160
4226
|
"""Attempt to repair common garbled JSON in tool call payloads.
|
|
@@ -3197,7 +4263,9 @@ def _repair_tool_call_json(raw: str) -> str | None:
|
|
|
3197
4263
|
return None
|
|
3198
4264
|
|
|
3199
4265
|
|
|
3200
|
-
def _extract_tool_calls_from_text(
|
|
4266
|
+
def _extract_tool_calls_from_text(
|
|
4267
|
+
text: str, available_tools: list[dict] | None = None
|
|
4268
|
+
) -> tuple[list[dict], str]:
|
|
3201
4269
|
"""Parse ``<tool_call>{...}</tool_call>`` blocks out of *text*.
|
|
3202
4270
|
|
|
3203
4271
|
Returns a tuple of (extracted_openai_tool_calls, remaining_text).
|
|
@@ -3207,8 +4275,18 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
|
|
|
3207
4275
|
|
|
3208
4276
|
The *remaining_text* has the matched ``<tool_call>`` blocks removed.
|
|
3209
4277
|
If no valid blocks are found the original text is returned unchanged.
|
|
4278
|
+
Falls back to Hermes-style ``<function=X><parameter=K>V</parameter></function>``
|
|
4279
|
+
for older Qwen/Llama fine-tunes, then to Gemma 4's
|
|
4280
|
+
``<|tool_call>call:N{...}<tool_call|>`` DSL and ```json``` markdown
|
|
4281
|
+
blocks. Anything not matching any known format falls through unchanged
|
|
4282
|
+
so plain prose passes the parser without mutation.
|
|
3210
4283
|
"""
|
|
3211
|
-
if
|
|
4284
|
+
if (
|
|
4285
|
+
"<tool_call>" not in text
|
|
4286
|
+
and "<function=" not in text
|
|
4287
|
+
and "<|tool_call>" not in text
|
|
4288
|
+
and "```" not in text
|
|
4289
|
+
):
|
|
3212
4290
|
return [], text
|
|
3213
4291
|
|
|
3214
4292
|
extracted: list[dict] = []
|
|
@@ -3244,14 +4322,24 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
|
|
|
3244
4322
|
|
|
3245
4323
|
extracted.append(
|
|
3246
4324
|
{
|
|
3247
|
-
"id": f"
|
|
4325
|
+
"id": f"toolu_{uuid.uuid4().hex[:24]}",
|
|
3248
4326
|
"type": "function",
|
|
3249
4327
|
"function": {"name": name, "arguments": arguments},
|
|
3250
4328
|
}
|
|
3251
4329
|
)
|
|
3252
4330
|
|
|
3253
4331
|
if not extracted:
|
|
3254
|
-
|
|
4332
|
+
# Fall back to Hermes format. This catches Qwen emissions on finalize
|
|
4333
|
+
# turns where grammar is not applied and the model defaults to its
|
|
4334
|
+
# base training's <function=X><parameter=K>V</parameter></function>
|
|
4335
|
+
# format instead of the <tool_call>{JSON}</tool_call> Qwen template
|
|
4336
|
+
# format. Without this path, tool_calls=[] and the client halts.
|
|
4337
|
+
hermes_calls, hermes_remaining = _extract_hermes_tool_calls(text)
|
|
4338
|
+
if hermes_calls:
|
|
4339
|
+
return hermes_calls, hermes_remaining
|
|
4340
|
+
# Then try Gemma 4's DSL + markdown-JSON fallback. Anything still
|
|
4341
|
+
# not matching falls through as plain text.
|
|
4342
|
+
return _extract_gemma4_tool_calls(text, available_tools=available_tools)
|
|
3255
4343
|
|
|
3256
4344
|
# Strip matched tool_call blocks from the text
|
|
3257
4345
|
remaining = _TOOL_CALL_XML_RE.sub("", text).strip()
|
|
@@ -4222,6 +5310,16 @@ def _classify_tool_response_issue(
|
|
|
4222
5310
|
if "tools" not in anthropic_body:
|
|
4223
5311
|
return ToolResponseIssue()
|
|
4224
5312
|
|
|
5313
|
+
# When the upstream response was cut off by max_tokens (finish_reason=length),
|
|
5314
|
+
# any garbled/unbalanced-brace appearance in the tool args is almost
|
|
5315
|
+
# certainly truncation, not degenerate generation. Re-classify such
|
|
5316
|
+
# issues as "truncated_tool_args" so the caller can still retry (with a
|
|
5317
|
+
# larger cap) but WITHOUT triggering the forced-tool dampener, which
|
|
5318
|
+
# otherwise penalises a perfectly-recoverable truncation event.
|
|
5319
|
+
choice_for_finish, _ = _extract_openai_choice(openai_resp)
|
|
5320
|
+
finish_reason = (choice_for_finish.get("finish_reason") or "").lower()
|
|
5321
|
+
was_truncated = finish_reason == "length"
|
|
5322
|
+
|
|
4225
5323
|
if _is_malformed_tool_response(openai_resp, anthropic_body):
|
|
4226
5324
|
return ToolResponseIssue(
|
|
4227
5325
|
kind="malformed_payload",
|
|
@@ -4265,15 +5363,107 @@ def _classify_tool_response_issue(
|
|
|
4265
5363
|
allowed_tools,
|
|
4266
5364
|
)
|
|
4267
5365
|
if issue.has_issue():
|
|
5366
|
+
# Downgrade invalid_tool_args to truncated_tool_args when the
|
|
5367
|
+
# response hit max_tokens — retry path still fires but the
|
|
5368
|
+
# dampener/streak counters stay cold.
|
|
5369
|
+
if was_truncated and issue.kind == "invalid_tool_args":
|
|
5370
|
+
return ToolResponseIssue(
|
|
5371
|
+
kind="truncated_tool_args",
|
|
5372
|
+
reason=(
|
|
5373
|
+
f"tool call for '{tool_name}' truncated by max_tokens "
|
|
5374
|
+
f"({issue.reason})"
|
|
5375
|
+
),
|
|
5376
|
+
retry_hint=issue.retry_hint,
|
|
5377
|
+
)
|
|
4268
5378
|
return issue
|
|
4269
5379
|
|
|
4270
5380
|
return ToolResponseIssue()
|
|
4271
5381
|
|
|
4272
5382
|
|
|
5383
|
+
# 2026-05-12: Regex for the tool-XML tag scanner. Captures opening vs
|
|
5384
|
+
# closing form (group 1: "/" or ""), the tag name (group 2), and any
|
|
5385
|
+
# attributes (group 3). Matches <parameter>, <parameter=key>,
|
|
5386
|
+
# <parameter name="key">, </parameter>, <function=name>, </function>.
|
|
5387
|
+
_TOOL_XML_TAG_RE = re.compile(r"<(/?)(parameter|function)\b([^>]*)>")
|
|
5388
|
+
|
|
5389
|
+
|
|
5390
|
+
def _strip_orphan_tool_xml(text: str) -> str:
|
|
5391
|
+
"""Remove orphan </parameter> and </function> closing tags that have
|
|
5392
|
+
no matching opener earlier in the text.
|
|
5393
|
+
|
|
5394
|
+
Qwen3.6 trained on the qwen3_coder XML format leaks these closers
|
|
5395
|
+
after its actual answer when forced into tool_choice='required' with
|
|
5396
|
+
no genuine tool to call. The closers are training residuals, not real
|
|
5397
|
+
malformed tool-call markup — keeping them in the text causes the
|
|
5398
|
+
primary_markers branch of _looks_malformed_tool_payload to fire on
|
|
5399
|
+
every clean-but-runaway-shaped response. Real malformed tool-call
|
|
5400
|
+
attempts always have at least one matching opener ('<parameter' or
|
|
5401
|
+
'<function='), which the regex preserves, so primary_markers still
|
|
5402
|
+
fires correctly on genuine bad output.
|
|
5403
|
+
"""
|
|
5404
|
+
if "</parameter" not in text and "</function" not in text:
|
|
5405
|
+
return text
|
|
5406
|
+
|
|
5407
|
+
out: list[str] = []
|
|
5408
|
+
pos = 0
|
|
5409
|
+
open_param = 0
|
|
5410
|
+
open_func = 0
|
|
5411
|
+
for m in _TOOL_XML_TAG_RE.finditer(text):
|
|
5412
|
+
out.append(text[pos:m.start()])
|
|
5413
|
+
is_close = m.group(1) == "/"
|
|
5414
|
+
tag = m.group(2)
|
|
5415
|
+
if is_close:
|
|
5416
|
+
if tag == "parameter":
|
|
5417
|
+
if open_param > 0:
|
|
5418
|
+
open_param -= 1
|
|
5419
|
+
out.append(m.group(0))
|
|
5420
|
+
else: # function
|
|
5421
|
+
if open_func > 0:
|
|
5422
|
+
open_func -= 1
|
|
5423
|
+
out.append(m.group(0))
|
|
5424
|
+
# else: orphan closer, skip (strip)
|
|
5425
|
+
else:
|
|
5426
|
+
if tag == "parameter":
|
|
5427
|
+
open_param += 1
|
|
5428
|
+
else:
|
|
5429
|
+
open_func += 1
|
|
5430
|
+
out.append(m.group(0))
|
|
5431
|
+
pos = m.end()
|
|
5432
|
+
out.append(text[pos:])
|
|
5433
|
+
return "".join(out)
|
|
5434
|
+
|
|
5435
|
+
|
|
4273
5436
|
def _looks_malformed_tool_payload(text: str) -> bool:
|
|
4274
5437
|
if not text:
|
|
4275
5438
|
return False
|
|
4276
5439
|
|
|
5440
|
+
# 2026-05-12: Strip balanced <think>...</think> blocks before applying
|
|
5441
|
+
# the heuristic. Qwen3.6 emits <think> blocks regardless of
|
|
5442
|
+
# enable_thinking, and two scenarios were tripping false positives:
|
|
5443
|
+
# 1. Meta-tool reasoning inside the thinking ({"description":...},
|
|
5444
|
+
# repeated "must call a tool") triggering the structural-marker
|
|
5445
|
+
# and policy-echo branches.
|
|
5446
|
+
# 2. The model wrapping its ENTIRE answer inside a single <think>
|
|
5447
|
+
# block (markdown reports, tables) — the </think> structural
|
|
5448
|
+
# marker plus content-resembling-policy then fires.
|
|
5449
|
+
# Downstream response processing surfaces <think> content as proper
|
|
5450
|
+
# Anthropic `thinking` blocks via _THINKING_BLOCK_RE, so stripping
|
|
5451
|
+
# here loses no information. Unbalanced/stray </think> without a
|
|
5452
|
+
# matching opener is NOT stripped — those remain genuinely malformed.
|
|
5453
|
+
if "<think>" in text and "</think>" in text:
|
|
5454
|
+
text = _THINKING_BLOCK_RE.sub("", text)
|
|
5455
|
+
if not text.strip():
|
|
5456
|
+
return False
|
|
5457
|
+
|
|
5458
|
+
# 2026-05-12: Strip orphan </parameter> and </function> closers that
|
|
5459
|
+
# have no matching opener. Qwen3.6 leaks these training residuals
|
|
5460
|
+
# after its visible answer when forced into tool_choice='required'
|
|
5461
|
+
# with no valid tool to call. Real malformed tool-call attempts retain
|
|
5462
|
+
# their opener and still trip the primary_markers check below.
|
|
5463
|
+
text = _strip_orphan_tool_xml(text)
|
|
5464
|
+
if not text.strip():
|
|
5465
|
+
return False
|
|
5466
|
+
|
|
4277
5467
|
lowered = text.lower()
|
|
4278
5468
|
if _contains_tool_call_apology(text):
|
|
4279
5469
|
return True
|
|
@@ -4508,13 +5698,17 @@ def _build_malformed_retry_body(
|
|
|
4508
5698
|
retry_instruction = (
|
|
4509
5699
|
"Your previous response had invalid tool-call formatting. "
|
|
4510
5700
|
"Respond with exactly one valid tool call using the provided tools. "
|
|
4511
|
-
"Do not output prose, markdown, XML tags, or schema snippets."
|
|
5701
|
+
"Do not output prose, markdown, XML tags, or schema snippets. "
|
|
5702
|
+
"Do NOT use <think>...</think> blocks or internal reasoning — "
|
|
5703
|
+
"emit the tool_call object as the very first token of your response."
|
|
4512
5704
|
)
|
|
4513
5705
|
else:
|
|
4514
5706
|
retry_instruction = (
|
|
4515
5707
|
"Your previous response had invalid tool-call formatting. "
|
|
4516
5708
|
"If a tool is needed, emit exactly one valid tool call with strict JSON arguments. "
|
|
4517
|
-
"If no tool is needed for this turn, return concise plain text with no protocol tags."
|
|
5709
|
+
"If no tool is needed for this turn, return concise plain text with no protocol tags. "
|
|
5710
|
+
"Do NOT use <think>...</think> blocks — start your response directly with "
|
|
5711
|
+
"either a tool_call or the plain text answer."
|
|
4518
5712
|
)
|
|
4519
5713
|
|
|
4520
5714
|
malformed_retry_instruction = {
|
|
@@ -4695,7 +5889,7 @@ async def _apply_unexpected_end_turn_guardrail(
|
|
|
4695
5889
|
)
|
|
4696
5890
|
if retry_resp.status_code == 200:
|
|
4697
5891
|
retry_json = retry_resp.json()
|
|
4698
|
-
_maybe_extract_text_tool_calls(retry_json)
|
|
5892
|
+
_maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
|
|
4699
5893
|
retry_choice, retry_message = _extract_openai_choice(retry_json)
|
|
4700
5894
|
if _openai_has_valid_tool_calls(retry_json, anthropic_body):
|
|
4701
5895
|
logger.info("GUARDRAIL: retry produced tool_use; using retried response")
|
|
@@ -4784,8 +5978,12 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4784
5978
|
)
|
|
4785
5979
|
return working_resp
|
|
4786
5980
|
|
|
4787
|
-
#
|
|
4788
|
-
|
|
5981
|
+
# Only set last_response_garbled for TRUE degenerate generation, not
|
|
5982
|
+
# for responses merely truncated by max_tokens — otherwise the next
|
|
5983
|
+
# turn gets hit with the garbled_cap (smaller max_tokens) and the
|
|
5984
|
+
# problem compounds.
|
|
5985
|
+
if issue.kind != "truncated_tool_args":
|
|
5986
|
+
monitor.last_response_garbled = True
|
|
4789
5987
|
|
|
4790
5988
|
if issue.kind == "malformed_payload":
|
|
4791
5989
|
monitor.malformed_tool_streak += 1
|
|
@@ -4793,7 +5991,12 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4793
5991
|
monitor.invalid_tool_call_streak += 1
|
|
4794
5992
|
monitor.arg_preflight_rejections += 1
|
|
4795
5993
|
|
|
4796
|
-
|
|
5994
|
+
# Truncation is a max_tokens accident, not the model misbehaving: don't
|
|
5995
|
+
# feed it to the forced-tool dampener, which would otherwise relax
|
|
5996
|
+
# tool_choice on the very next turn and let the model trail off with
|
|
5997
|
+
# text (the exact failure mode that stopped opencode).
|
|
5998
|
+
if issue.kind != "truncated_tool_args":
|
|
5999
|
+
monitor.maybe_activate_forced_tool_dampener(issue.kind)
|
|
4797
6000
|
excerpt = _openai_message_text(working_resp)[:220].replace("\n", " ")
|
|
4798
6001
|
# Option 2: Log garbled argument content for diagnostics
|
|
4799
6002
|
arg_excerpt = ""
|
|
@@ -4866,7 +6069,7 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4866
6069
|
continue
|
|
4867
6070
|
|
|
4868
6071
|
retry_json = retry_resp.json()
|
|
4869
|
-
_maybe_extract_text_tool_calls(retry_json)
|
|
6072
|
+
_maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
|
|
4870
6073
|
retry_working = retry_json
|
|
4871
6074
|
retry_repairs = 0
|
|
4872
6075
|
if PROXY_TOOL_ARGS_PREFLIGHT and _openai_has_tool_calls(retry_json):
|
|
@@ -4898,15 +6101,20 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4898
6101
|
)
|
|
4899
6102
|
|
|
4900
6103
|
if not retry_issue.has_issue():
|
|
4901
|
-
|
|
4902
|
-
|
|
4903
|
-
|
|
6104
|
+
# 2026-05-12: Fix #2 — do NOT reset malformed/invalid/miss streaks
|
|
6105
|
+
# to 0 on retry-success. Previously, sessions stuck in a
|
|
6106
|
+
# malformed→retry-success loop never accumulated enough streak to
|
|
6107
|
+
# trigger the forced-tool dampener. Healthy responses with real
|
|
6108
|
+
# tool_calls still reset the streak via the upstream no-issue path
|
|
6109
|
+
# (~L5655), so genuine recovery still resets counters; only
|
|
6110
|
+
# repeated retry-recoveries persist toward the dampener.
|
|
4904
6111
|
monitor.last_response_garbled = False
|
|
4905
6112
|
logger.info(
|
|
4906
|
-
"TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d",
|
|
6113
|
+
"TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d malformed_streak=%d",
|
|
4907
6114
|
current_issue.kind,
|
|
4908
6115
|
attempt + 1,
|
|
4909
6116
|
attempts,
|
|
6117
|
+
monitor.malformed_tool_streak,
|
|
4910
6118
|
)
|
|
4911
6119
|
if retry_repairs > 0:
|
|
4912
6120
|
monitor.arg_preflight_repairs += retry_repairs
|
|
@@ -4931,7 +6139,10 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4931
6139
|
if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
|
|
4932
6140
|
failing_tools.add(fn_name)
|
|
4933
6141
|
|
|
4934
|
-
|
|
6142
|
+
# Truncation on retry is still a max_tokens problem, not a model
|
|
6143
|
+
# misbehaviour — don't dampen. The outer retry loop will try again.
|
|
6144
|
+
if retry_issue.kind != "truncated_tool_args":
|
|
6145
|
+
monitor.maybe_activate_forced_tool_dampener(retry_issue.kind)
|
|
4935
6146
|
logger.warning(
|
|
4936
6147
|
"TOOL RESPONSE RETRY invalid: session=%s attempt=%d/%d kind=%s reason=%s",
|
|
4937
6148
|
session_id,
|
|
@@ -5112,11 +6323,19 @@ def _maybe_apply_session_contamination_breaker(
|
|
|
5112
6323
|
# ===========================================================================
|
|
5113
6324
|
|
|
5114
6325
|
|
|
5115
|
-
def _maybe_extract_text_tool_calls(
|
|
6326
|
+
def _maybe_extract_text_tool_calls(
|
|
6327
|
+
openai_resp: dict, anthropic_tools: list[dict] | None = None
|
|
6328
|
+
) -> dict:
|
|
5116
6329
|
"""Mutate *openai_resp* in-place: if the message has no structured
|
|
5117
|
-
``tool_calls`` but contains
|
|
5118
|
-
and promote to real ``tool_calls`` on the message.
|
|
5119
|
-
|
|
6330
|
+
``tool_calls`` but contains tool-call markup in text, extract them
|
|
6331
|
+
and promote to real ``tool_calls`` on the message.
|
|
6332
|
+
|
|
6333
|
+
*anthropic_tools* (optional): list of tool definitions from the original
|
|
6334
|
+
Anthropic request. Enables schema-matching of bare-args markdown JSON
|
|
6335
|
+
blocks emitted by Gemma 4 cold turns (fix D). Without it, bare-args
|
|
6336
|
+
blocks pass through as text.
|
|
6337
|
+
|
|
6338
|
+
Returns the (possibly-mutated) response for chaining."""
|
|
5120
6339
|
choice = (openai_resp.get("choices") or [{}])[0]
|
|
5121
6340
|
message = choice.get("message", {})
|
|
5122
6341
|
|
|
@@ -5125,10 +6344,20 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
|
|
|
5125
6344
|
return openai_resp
|
|
5126
6345
|
|
|
5127
6346
|
text = message.get("content", "")
|
|
5128
|
-
if not isinstance(text, str)
|
|
6347
|
+
if not isinstance(text, str):
|
|
6348
|
+
return openai_resp
|
|
6349
|
+
# Quick early-exit if no markers present (matches dispatcher guard)
|
|
6350
|
+
if (
|
|
6351
|
+
"<tool_call>" not in text
|
|
6352
|
+
and "<function=" not in text
|
|
6353
|
+
and "<|tool_call>" not in text
|
|
6354
|
+
and "```" not in text
|
|
6355
|
+
):
|
|
5129
6356
|
return openai_resp
|
|
5130
6357
|
|
|
5131
|
-
extracted, remaining = _extract_tool_calls_from_text(
|
|
6358
|
+
extracted, remaining = _extract_tool_calls_from_text(
|
|
6359
|
+
text, available_tools=anthropic_tools
|
|
6360
|
+
)
|
|
5132
6361
|
if not extracted:
|
|
5133
6362
|
return openai_resp
|
|
5134
6363
|
|
|
@@ -5212,6 +6441,18 @@ def _inject_synthetic_continuation(
|
|
|
5212
6441
|
Appends a no-op Read("/dev/null") tool_use block and changes stop_reason
|
|
5213
6442
|
from "end_turn" to "tool_use" so the client continues sending requests.
|
|
5214
6443
|
"""
|
|
6444
|
+
# Session-level hard cap: if we've already done N continuations in this
|
|
6445
|
+
# session (counter is monotonic, survives fresh-user-text resets), stop
|
|
6446
|
+
# injecting and let the response terminate. This catches runaway loops
|
|
6447
|
+
# that dodge the per-cycle cap via state resets.
|
|
6448
|
+
if monitor.finalize_hard_stop_count >= PROXY_FINALIZE_SESSION_HARD_CAP:
|
|
6449
|
+
logger.warning(
|
|
6450
|
+
"FINALIZE CONTINUATION: session hard cap reached (%d/%d) — not injecting, allowing termination",
|
|
6451
|
+
monitor.finalize_hard_stop_count,
|
|
6452
|
+
PROXY_FINALIZE_SESSION_HARD_CAP,
|
|
6453
|
+
)
|
|
6454
|
+
return anthropic_resp
|
|
6455
|
+
|
|
5215
6456
|
# Pick a safe tool the client knows about (case-insensitive match,
|
|
5216
6457
|
# then use the client's actual casing for the tool name)
|
|
5217
6458
|
if _client_has_tool(anthropic_body, "read"):
|
|
@@ -5227,6 +6468,7 @@ def _inject_synthetic_continuation(
|
|
|
5227
6468
|
synthetic_id = f"toolu_{uuid.uuid4().hex[:12]}"
|
|
5228
6469
|
monitor.finalize_synthetic_tool_id = synthetic_id
|
|
5229
6470
|
monitor.finalize_continuation_count += 1
|
|
6471
|
+
monitor.finalize_hard_stop_count += 1
|
|
5230
6472
|
|
|
5231
6473
|
content = anthropic_resp.get("content", [])
|
|
5232
6474
|
content.append({
|
|
@@ -5239,17 +6481,54 @@ def _inject_synthetic_continuation(
|
|
|
5239
6481
|
anthropic_resp["stop_reason"] = "tool_use"
|
|
5240
6482
|
|
|
5241
6483
|
logger.info(
|
|
5242
|
-
"FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d)",
|
|
6484
|
+
"FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d, session=%d/%d)",
|
|
5243
6485
|
tool_name,
|
|
5244
6486
|
synthetic_id,
|
|
5245
6487
|
monitor.finalize_continuation_count,
|
|
5246
6488
|
PROXY_FINALIZE_CONTINUATION_MAX,
|
|
6489
|
+
monitor.finalize_hard_stop_count,
|
|
6490
|
+
PROXY_FINALIZE_SESSION_HARD_CAP,
|
|
5247
6491
|
)
|
|
5248
6492
|
return anthropic_resp
|
|
5249
6493
|
|
|
5250
6494
|
|
|
5251
|
-
|
|
5252
|
-
|
|
6495
|
+
_THINKING_BLOCK_RE = re.compile(r"<think>(.*?)</think>\s*", re.DOTALL)
|
|
6496
|
+
|
|
6497
|
+
|
|
6498
|
+
def _extract_thinking_block(text: str) -> tuple[str | None, str]:
|
|
6499
|
+
"""Extract Qwen-style ``<think>...</think>`` blocks from *text*.
|
|
6500
|
+
|
|
6501
|
+
Returns ``(thinking_content, remaining_text)``. If no ``<think>`` tag is
|
|
6502
|
+
present, returns ``(None, text)`` unchanged. Multiple thinking blocks
|
|
6503
|
+
are concatenated. Trailing whitespace after each block is consumed so
|
|
6504
|
+
the remaining text starts cleanly with the model's actual answer.
|
|
6505
|
+
"""
|
|
6506
|
+
if "<think>" not in text:
|
|
6507
|
+
return None, text
|
|
6508
|
+
parts: list[str] = []
|
|
6509
|
+
def collect(m: re.Match) -> str:
|
|
6510
|
+
parts.append(m.group(1).strip())
|
|
6511
|
+
return ""
|
|
6512
|
+
remaining = _THINKING_BLOCK_RE.sub(collect, text)
|
|
6513
|
+
if not parts:
|
|
6514
|
+
return None, text
|
|
6515
|
+
return "\n\n".join(p for p in parts if p), remaining.lstrip()
|
|
6516
|
+
|
|
6517
|
+
|
|
6518
|
+
def openai_to_anthropic_response(
|
|
6519
|
+
openai_resp: dict, model: str, expose_thinking: bool = True
|
|
6520
|
+
) -> dict:
|
|
6521
|
+
"""Convert an OpenAI Chat Completions response to Anthropic Messages format.
|
|
6522
|
+
|
|
6523
|
+
*expose_thinking*: when True, surface ``<think>...</think>`` content from
|
|
6524
|
+
the upstream as Anthropic ``{"type": "thinking"}`` blocks. When False
|
|
6525
|
+
(Anthropic default — client didn't opt in), strip thinking content
|
|
6526
|
+
from the response entirely so the client only sees the actual answer.
|
|
6527
|
+
Qwen's chat template seeds the model into thinking regardless of the
|
|
6528
|
+
``enable_thinking`` request param, so even thinking-off responses
|
|
6529
|
+
typically still contain ``<think>`` blocks; this flag controls whether
|
|
6530
|
+
they're surfaced as Anthropic blocks or silently consumed.
|
|
6531
|
+
"""
|
|
5253
6532
|
# First: try to recover tool calls trapped in text XML tags
|
|
5254
6533
|
_maybe_extract_text_tool_calls(openai_resp)
|
|
5255
6534
|
# Second: strip garbled/degenerate tool call arguments
|
|
@@ -5260,20 +6539,46 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
5260
6539
|
finish = choice.get("finish_reason", "stop")
|
|
5261
6540
|
|
|
5262
6541
|
content = []
|
|
6542
|
+
# Surface Qwen's <think>...</think> output as Anthropic-style thinking
|
|
6543
|
+
# blocks (Anthropic extended-thinking API shape:
|
|
6544
|
+
# {"type": "thinking", "thinking": "...", "signature": ""}).
|
|
6545
|
+
# Clients that don't request thinking simply ignore the block; clients
|
|
6546
|
+
# that do (Claude Code) render them in the thinking pane.
|
|
6547
|
+
raw_text = ""
|
|
5263
6548
|
if message.get("content"):
|
|
5264
6549
|
raw_text = (
|
|
5265
6550
|
message["content"]
|
|
5266
6551
|
if isinstance(message["content"], str)
|
|
5267
6552
|
else str(message["content"])
|
|
5268
6553
|
)
|
|
5269
|
-
|
|
5270
|
-
|
|
6554
|
+
# Some llama-server builds emit the model's reasoning into a separate
|
|
6555
|
+
# `reasoning_content` field instead of inline <think> tags. Surface
|
|
6556
|
+
# that too so the proxy is consistent regardless of upstream behaviour.
|
|
6557
|
+
inline_thinking, body_text = _extract_thinking_block(raw_text)
|
|
6558
|
+
sidecar_thinking = message.get("reasoning_content") or message.get("reasoning")
|
|
6559
|
+
thinking_chunks: list[str] = []
|
|
6560
|
+
if isinstance(sidecar_thinking, str) and sidecar_thinking.strip():
|
|
6561
|
+
thinking_chunks.append(sidecar_thinking.strip())
|
|
6562
|
+
if inline_thinking:
|
|
6563
|
+
thinking_chunks.append(inline_thinking)
|
|
6564
|
+
if thinking_chunks and expose_thinking:
|
|
6565
|
+
content.append(
|
|
6566
|
+
{
|
|
6567
|
+
"type": "thinking",
|
|
6568
|
+
"thinking": "\n\n".join(thinking_chunks),
|
|
6569
|
+
"signature": "",
|
|
6570
|
+
}
|
|
6571
|
+
)
|
|
6572
|
+
|
|
6573
|
+
if body_text:
|
|
6574
|
+
sanitized_text = _sanitize_tool_call_apology_text(body_text)
|
|
6575
|
+
if sanitized_text != body_text:
|
|
5271
6576
|
logger.warning(
|
|
5272
6577
|
"SANITIZE: replaced known malformed tool-call apology text in assistant response"
|
|
5273
6578
|
)
|
|
5274
6579
|
# Option 1: Strip residual <tool_call> XML that wasn't extracted
|
|
5275
6580
|
sanitized_text = _strip_residual_tool_call_xml(sanitized_text)
|
|
5276
|
-
if sanitized_text !=
|
|
6581
|
+
if sanitized_text != body_text and "<tool_call>" in body_text:
|
|
5277
6582
|
logger.warning(
|
|
5278
6583
|
"SANITIZE: stripped residual <tool_call> XML from text content"
|
|
5279
6584
|
)
|
|
@@ -5298,10 +6603,21 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
5298
6603
|
logger.warning(
|
|
5299
6604
|
"BASH SAFETY: stripped standalone protocol-tag lines from command before tool execution"
|
|
5300
6605
|
)
|
|
6606
|
+
# Normalise IDs to Anthropic spec (toolu_ prefix). Upstream
|
|
6607
|
+
# llama-server returns opaque IDs without prefix; clients that
|
|
6608
|
+
# validate prefix would reject. Strip-and-restamp here, restore in
|
|
6609
|
+
# anthropic_to_openai_messages() when client sends tool_result back.
|
|
6610
|
+
upstream_id = tc.get("id", "")
|
|
6611
|
+
if upstream_id.startswith("toolu_"):
|
|
6612
|
+
tool_use_id = upstream_id
|
|
6613
|
+
elif upstream_id:
|
|
6614
|
+
tool_use_id = f"toolu_{upstream_id}"
|
|
6615
|
+
else:
|
|
6616
|
+
tool_use_id = f"toolu_{uuid.uuid4().hex[:24]}"
|
|
5301
6617
|
content.append(
|
|
5302
6618
|
{
|
|
5303
6619
|
"type": "tool_use",
|
|
5304
|
-
"id":
|
|
6620
|
+
"id": tool_use_id,
|
|
5305
6621
|
"name": fn.get("name", ""),
|
|
5306
6622
|
"input": args,
|
|
5307
6623
|
}
|
|
@@ -5804,6 +7120,10 @@ async def messages(request: Request):
|
|
|
5804
7120
|
is_stream = body.get("stream", False)
|
|
5805
7121
|
model = body.get("model", "default")
|
|
5806
7122
|
client_id = resolve_client_id(request)
|
|
7123
|
+
|
|
7124
|
+
# Periodically re-detect context window from upstream (handles server restarts)
|
|
7125
|
+
await _maybe_recheck_context_window()
|
|
7126
|
+
|
|
5807
7127
|
if _should_passthrough_model(model):
|
|
5808
7128
|
logger.info("PASSTHROUGH: model=%s -> %s", model, ANTHROPIC_API_BASE)
|
|
5809
7129
|
return await _passthrough_anthropic_request(request, body, is_stream)
|
|
@@ -5861,8 +7181,9 @@ async def messages(request: Request):
|
|
|
5861
7181
|
last_text = str(last_content)[:200]
|
|
5862
7182
|
rate_count = log_client_rate(client_id)
|
|
5863
7183
|
logger.info(
|
|
5864
|
-
"REQ: client=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
7184
|
+
"REQ: client=%s sess=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
5865
7185
|
client_id,
|
|
7186
|
+
session_id,
|
|
5866
7187
|
PROXY_CLIENT_RATE_WINDOW_SECS,
|
|
5867
7188
|
rate_count,
|
|
5868
7189
|
is_stream,
|
|
@@ -5993,7 +7314,7 @@ async def messages(request: Request):
|
|
|
5993
7314
|
)
|
|
5994
7315
|
except Exception as exc:
|
|
5995
7316
|
# Check if upstream is hung before returning error
|
|
5996
|
-
await _check_slot_hang(
|
|
7317
|
+
await _check_slot_hang(LLAMA_CPP_BASE.replace("/v1", "/slots"))
|
|
5997
7318
|
return Response(
|
|
5998
7319
|
content=json.dumps(
|
|
5999
7320
|
{
|
|
@@ -6008,6 +7329,23 @@ async def messages(request: Request):
|
|
|
6008
7329
|
media_type="application/json",
|
|
6009
7330
|
)
|
|
6010
7331
|
|
|
7332
|
+
if strict_resp.status_code != 200:
|
|
7333
|
+
error_text = strict_resp.text[:1000]
|
|
7334
|
+
# Try the Gemma 4 PEG parse-failure recovery first — relax
|
|
7335
|
+
# tool_choice='required' so the retry isn't constrained by the
|
|
7336
|
+
# strict-grammar that triggered the parse failure.
|
|
7337
|
+
relaxed = _is_gemma4_peg_parse_failure(strict_resp.status_code, error_text) and \
|
|
7338
|
+
_relax_tool_choice_for_gemma4_peg_retry(strict_body, "strict-stream")
|
|
7339
|
+
if relaxed:
|
|
7340
|
+
try:
|
|
7341
|
+
strict_resp = await _post_with_generation_timeout(
|
|
7342
|
+
client,
|
|
7343
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
7344
|
+
strict_body,
|
|
7345
|
+
{"Content-Type": "application/json"},
|
|
7346
|
+
)
|
|
7347
|
+
except Exception:
|
|
7348
|
+
pass # fall through to next handler
|
|
6011
7349
|
if strict_resp.status_code != 200:
|
|
6012
7350
|
error_text = strict_resp.text[:1000]
|
|
6013
7351
|
if _maybe_disable_grammar_for_tools_error(
|
|
@@ -6082,7 +7420,7 @@ async def messages(request: Request):
|
|
|
6082
7420
|
|
|
6083
7421
|
openai_resp = strict_resp.json()
|
|
6084
7422
|
# Recover tool calls from <tool_call> XML before guardrails run
|
|
6085
|
-
_maybe_extract_text_tool_calls(openai_resp)
|
|
7423
|
+
_maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
|
|
6086
7424
|
openai_resp = await _apply_unexpected_end_turn_guardrail(
|
|
6087
7425
|
client,
|
|
6088
7426
|
openai_resp,
|
|
@@ -6137,7 +7475,11 @@ async def messages(request: Request):
|
|
|
6137
7475
|
logger.info("DEGENERATE RETRY: retry insufficient, using truncated original")
|
|
6138
7476
|
except Exception as exc:
|
|
6139
7477
|
logger.warning("DEGENERATE RETRY: failed: %s", exc)
|
|
6140
|
-
anthropic_resp = openai_to_anthropic_response(
|
|
7478
|
+
anthropic_resp = openai_to_anthropic_response(
|
|
7479
|
+
openai_resp, model,
|
|
7480
|
+
expose_thinking=isinstance(body.get("thinking"), dict)
|
|
7481
|
+
and (body["thinking"].get("type") or "").lower() == "enabled",
|
|
7482
|
+
)
|
|
6141
7483
|
# FINALIZE CONTINUATION: inject synthetic tool_use to keep client loop alive
|
|
6142
7484
|
if (
|
|
6143
7485
|
monitor.finalize_turn_active
|
|
@@ -6253,6 +7595,29 @@ async def messages(request: Request):
|
|
|
6253
7595
|
error_body = await resp.aread()
|
|
6254
7596
|
await resp.aclose()
|
|
6255
7597
|
error_text = error_body.decode("utf-8", errors="replace")[:1000]
|
|
7598
|
+
# Gemma 4 PEG parse-failure recovery: relax tool_choice='required'
|
|
7599
|
+
# so the retry isn't blocked by the strict-grammar that rejected
|
|
7600
|
+
# the model's incomplete tool call.
|
|
7601
|
+
if _is_gemma4_peg_parse_failure(resp.status_code, error_text) and \
|
|
7602
|
+
_relax_tool_choice_for_gemma4_peg_retry(openai_body, "stream"):
|
|
7603
|
+
resp = await client.send(
|
|
7604
|
+
client.build_request(
|
|
7605
|
+
"POST",
|
|
7606
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
7607
|
+
json=openai_body,
|
|
7608
|
+
headers={"Content-Type": "application/json"},
|
|
7609
|
+
),
|
|
7610
|
+
stream=True,
|
|
7611
|
+
)
|
|
7612
|
+
if resp.status_code == 200:
|
|
7613
|
+
return StreamingResponse(
|
|
7614
|
+
stream_anthropic_response(resp, model, monitor, body),
|
|
7615
|
+
media_type="text/event-stream",
|
|
7616
|
+
)
|
|
7617
|
+
# fall through if still failing
|
|
7618
|
+
error_body = await resp.aread()
|
|
7619
|
+
await resp.aclose()
|
|
7620
|
+
error_text = error_body.decode("utf-8", errors="replace")[:1000]
|
|
6256
7621
|
if _maybe_disable_grammar_for_tools_error(
|
|
6257
7622
|
openai_body,
|
|
6258
7623
|
resp.status_code,
|
|
@@ -6385,6 +7750,23 @@ async def messages(request: Request):
|
|
|
6385
7750
|
media_type="application/json",
|
|
6386
7751
|
)
|
|
6387
7752
|
|
|
7753
|
+
if resp.status_code != 200:
|
|
7754
|
+
error_text = resp.text[:1000]
|
|
7755
|
+
# Gemma 4 PEG parse-failure recovery (non-stream path).
|
|
7756
|
+
relaxed = (
|
|
7757
|
+
_is_gemma4_peg_parse_failure(resp.status_code, error_text)
|
|
7758
|
+
and _relax_tool_choice_for_gemma4_peg_retry(openai_body, "non-stream")
|
|
7759
|
+
)
|
|
7760
|
+
if relaxed:
|
|
7761
|
+
try:
|
|
7762
|
+
resp = await _post_with_generation_timeout(
|
|
7763
|
+
client,
|
|
7764
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
7765
|
+
openai_body,
|
|
7766
|
+
{"Content-Type": "application/json"},
|
|
7767
|
+
)
|
|
7768
|
+
except Exception:
|
|
7769
|
+
pass # fall through
|
|
6388
7770
|
if resp.status_code != 200:
|
|
6389
7771
|
error_text = resp.text[:1000]
|
|
6390
7772
|
if _maybe_disable_grammar_for_tools_error(
|
|
@@ -6437,7 +7819,7 @@ async def messages(request: Request):
|
|
|
6437
7819
|
|
|
6438
7820
|
openai_resp = resp.json()
|
|
6439
7821
|
# Recover tool calls from <tool_call> XML before guardrails run
|
|
6440
|
-
_maybe_extract_text_tool_calls(openai_resp)
|
|
7822
|
+
_maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
|
|
6441
7823
|
openai_resp = await _apply_unexpected_end_turn_guardrail(
|
|
6442
7824
|
client,
|
|
6443
7825
|
openai_resp,
|
|
@@ -6506,7 +7888,11 @@ async def messages(request: Request):
|
|
|
6506
7888
|
logger.info("DEGENERATE RETRY (stream): no tool call, using truncated")
|
|
6507
7889
|
except Exception as exc:
|
|
6508
7890
|
logger.warning("DEGENERATE RETRY (stream): failed: %s", exc)
|
|
6509
|
-
anthropic_resp = openai_to_anthropic_response(
|
|
7891
|
+
anthropic_resp = openai_to_anthropic_response(
|
|
7892
|
+
openai_resp, model,
|
|
7893
|
+
expose_thinking=isinstance(body.get("thinking"), dict)
|
|
7894
|
+
and (body["thinking"].get("type") or "").lower() == "enabled",
|
|
7895
|
+
)
|
|
6510
7896
|
# FINALIZE CONTINUATION: inject synthetic tool_use (non-guarded stream path)
|
|
6511
7897
|
if (
|
|
6512
7898
|
monitor.finalize_turn_active
|
|
@@ -6532,6 +7918,292 @@ async def messages_anthropic(request: Request):
|
|
|
6532
7918
|
return await messages(request)
|
|
6533
7919
|
|
|
6534
7920
|
|
|
7921
|
+
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-compatible chat/completions endpoint for clients like Forge
    that require the OpenAI API shape.

    FULL GUARDRAIL PATH: Converts the OpenAI request to Anthropic format,
    runs the full /v1/messages pipeline (loop detection, tool narrowing,
    cycle breaking, malformed tool retry, context pruning, etc.), then
    converts the Anthropic response back to OpenAI format.

    Streaming is down-converted to a single final OpenAI SSE chunk sequence
    built from the completed Anthropic response (not token-by-token from
    upstream). This preserves guardrails at the cost of stream granularity.

    Returns:
        A JSON ``Response`` (non-streaming clients, and every error case) or
        a ``StreamingResponse`` of OpenAI ``chat.completion.chunk`` events.
        A body that is not valid JSON, or not a JSON object, yields 400; an
        inner response without a ``content`` field yields the inner status
        when it is >= 400, otherwise 502.
    """
    body_bytes = await request.body()
    try:
        openai_body = json.loads(body_bytes) if body_bytes else {}
    except (ValueError, TypeError):
        return Response(
            content=b'{"error":{"message":"invalid JSON","type":"invalid_request_error"}}',
            status_code=400,
            media_type="application/json",
        )
    # Valid JSON that is not an object (list, string, number, ...) would
    # otherwise fail on the first .get() below and surface as a 500.
    if not isinstance(openai_body, dict):
        return Response(
            content=b'{"error":{"message":"request body must be a JSON object","type":"invalid_request_error"}}',
            status_code=400,
            media_type="application/json",
        )

    requested_stream = bool(openai_body.get("stream", False))
    model = openai_body.get("model", "default")
    client_id = resolve_client_id(request)

    logger.info(
        "CHAT (guarded): client=%s model=%s stream=%s msgs=%d tools=%d",
        client_id,
        model,
        requested_stream,
        # `or []` also covers an explicit JSON null for either field.
        len(openai_body.get("messages") or []),
        len(openai_body.get("tools") or []),
    )

    # Convert OpenAI request -> Anthropic request
    anthropic_body = openai_to_anthropic_request(openai_body)
    # Force non-streaming through the pipeline; we re-stream at the end if the
    # client wanted streaming. This keeps guardrail logic simpler/consistent.
    anthropic_body["stream"] = False

    # Build a synthetic Request that the existing messages() handler can consume
    fake_body_bytes = json.dumps(anthropic_body).encode("utf-8")

    async def receive():
        return {"type": "http.request", "body": fake_body_bytes, "more_body": False}

    fake_scope = dict(request.scope)
    # Preserve client/headers but override the body + path
    fake_scope["path"] = "/v1/messages"
    fake_scope["raw_path"] = b"/v1/messages"
    # Strip content-length since the body changes
    fake_scope["headers"] = [
        (k, v)
        for (k, v) in fake_scope.get("headers", [])
        if k.lower() != b"content-length"
    ]
    fake_request = Request(fake_scope, receive)

    # Run the full guarded Anthropic pipeline
    inner_resp = await messages(fake_request)

    # Extract the Anthropic-format JSON from whatever messages() returned.
    # StreamingResponse is checked before Response, matching the original
    # branch order.
    anthropic_resp_dict: dict | None = None
    status_code = 200
    if isinstance(inner_resp, StreamingResponse):
        # Pipeline shouldn't stream because we set stream=False, but defensively
        # consume the stream and parse the final message event.
        chunks: list[bytes] = []
        async for chunk in inner_resp.body_iterator:
            if isinstance(chunk, bytes):
                chunks.append(chunk)
            elif isinstance(chunk, str):
                chunks.append(chunk.encode("utf-8"))
        raw = b"".join(chunks)
        # Try to parse as JSON directly first, then fall back to SSE parsing
        try:
            anthropic_resp_dict = json.loads(raw)
        except (ValueError, TypeError):
            anthropic_resp_dict = _parse_anthropic_sse_to_message(raw)
    elif isinstance(inner_resp, Response):
        status_code = inner_resp.status_code
        try:
            anthropic_resp_dict = json.loads(inner_resp.body)
        except (ValueError, TypeError):
            anthropic_resp_dict = None
    elif isinstance(inner_resp, dict):
        anthropic_resp_dict = inner_resp

    # A payload that parsed to something other than an object cannot be a
    # message; treat it like "no message" instead of failing further down.
    if not isinstance(anthropic_resp_dict, dict):
        anthropic_resp_dict = None

    if anthropic_resp_dict is None or "content" not in anthropic_resp_dict:
        # Upstream error: forward as-is in OpenAI error shape
        err_msg = "upstream returned no message"
        if anthropic_resp_dict is not None and "error" in anthropic_resp_dict:
            error_field = anthropic_resp_dict["error"]
            if isinstance(error_field, dict):
                err_msg = error_field.get("message") or err_msg
            elif isinstance(error_field, str) and error_field:
                # Some error bodies carry the message as a bare string.
                err_msg = error_field
        return Response(
            content=json.dumps({"error": {"message": err_msg, "type": "upstream_error"}}).encode(),
            status_code=status_code if status_code >= 400 else 502,
            media_type="application/json",
        )

    # Ensure model field is set for response
    anthropic_resp_dict.setdefault("model", model)
    openai_resp = anthropic_to_openai_response(anthropic_resp_dict)

    if not requested_stream:
        return Response(
            content=json.dumps(openai_resp).encode(),
            status_code=200,
            media_type="application/json",
        )

    # Client requested streaming: emit the response as OpenAI SSE chunks
    async def emit_openai_stream():
        resp_id = openai_resp["id"]
        created = openai_resp["created"]
        model_name = openai_resp["model"]
        choice = openai_resp["choices"][0]
        message = choice["message"]

        def sse_chunk(delta: dict, finish_reason) -> bytes:
            # Every chunk shares one envelope; only delta/finish_reason vary.
            envelope = {
                "id": resp_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": model_name,
                "choices": [
                    {"index": 0, "delta": delta, "finish_reason": finish_reason}
                ],
            }
            return f"data: {json.dumps(envelope)}\n\n".encode()

        # Opening chunk: role
        yield sse_chunk({"role": "assistant"}, None)

        # Content chunk
        if message.get("content"):
            yield sse_chunk({"content": message["content"]}, None)

        # Tool call chunks
        for idx, tc in enumerate(message.get("tool_calls", []) or []):
            yield sse_chunk(
                {
                    "tool_calls": [
                        {
                            "index": idx,
                            "id": tc["id"],
                            "type": "function",
                            "function": {
                                "name": tc["function"]["name"],
                                "arguments": tc["function"]["arguments"],
                            },
                        }
                    ]
                },
                None,
            )

        # Final chunk with finish_reason
        yield sse_chunk({}, choice["finish_reason"])
        yield b"data: [DONE]\n\n"

    return StreamingResponse(
        emit_openai_stream(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
|
|
8116
|
+
|
|
8117
|
+
|
|
8118
|
+
def _parse_anthropic_sse_to_message(raw: bytes) -> dict | None:
|
|
8119
|
+
"""Parse a concatenated Anthropic SSE stream into a final message dict.
|
|
8120
|
+
Used as a fallback when messages() returns a StreamingResponse despite stream=False.
|
|
8121
|
+
"""
|
|
8122
|
+
try:
|
|
8123
|
+
text = raw.decode("utf-8", errors="replace")
|
|
8124
|
+
except Exception:
|
|
8125
|
+
return None
|
|
8126
|
+
|
|
8127
|
+
text_parts: list[str] = []
|
|
8128
|
+
tool_uses: list[dict] = []
|
|
8129
|
+
usage = {"input_tokens": 0, "output_tokens": 0}
|
|
8130
|
+
stop_reason = "end_turn"
|
|
8131
|
+
model = "unknown"
|
|
8132
|
+
message_id = f"msg_{uuid.uuid4().hex[:24]}"
|
|
8133
|
+
|
|
8134
|
+
current_block: dict | None = None
|
|
8135
|
+
current_json_buffer = ""
|
|
8136
|
+
|
|
8137
|
+
for line in text.splitlines():
|
|
8138
|
+
if not line.startswith("data:"):
|
|
8139
|
+
continue
|
|
8140
|
+
payload = line[5:].strip()
|
|
8141
|
+
if not payload or payload == "[DONE]":
|
|
8142
|
+
continue
|
|
8143
|
+
try:
|
|
8144
|
+
evt = json.loads(payload)
|
|
8145
|
+
except (ValueError, TypeError):
|
|
8146
|
+
continue
|
|
8147
|
+
etype = evt.get("type")
|
|
8148
|
+
if etype == "message_start":
|
|
8149
|
+
m = evt.get("message", {}) or {}
|
|
8150
|
+
message_id = m.get("id", message_id)
|
|
8151
|
+
model = m.get("model", model)
|
|
8152
|
+
if "usage" in m:
|
|
8153
|
+
usage.update(m["usage"])
|
|
8154
|
+
elif etype == "content_block_start":
|
|
8155
|
+
current_block = evt.get("content_block", {})
|
|
8156
|
+
current_json_buffer = ""
|
|
8157
|
+
if current_block.get("type") == "text":
|
|
8158
|
+
text_parts.append(current_block.get("text", ""))
|
|
8159
|
+
elif etype == "content_block_delta":
|
|
8160
|
+
d = evt.get("delta", {}) or {}
|
|
8161
|
+
if d.get("type") == "text_delta":
|
|
8162
|
+
text_parts.append(d.get("text", ""))
|
|
8163
|
+
elif d.get("type") == "input_json_delta":
|
|
8164
|
+
current_json_buffer += d.get("partial_json", "")
|
|
8165
|
+
elif etype == "content_block_stop":
|
|
8166
|
+
if current_block and current_block.get("type") == "tool_use":
|
|
8167
|
+
try:
|
|
8168
|
+
input_obj = json.loads(current_json_buffer) if current_json_buffer else {}
|
|
8169
|
+
except (ValueError, TypeError):
|
|
8170
|
+
input_obj = {}
|
|
8171
|
+
tool_uses.append(
|
|
8172
|
+
{
|
|
8173
|
+
"type": "tool_use",
|
|
8174
|
+
"id": current_block.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
|
|
8175
|
+
"name": current_block.get("name", ""),
|
|
8176
|
+
"input": input_obj,
|
|
8177
|
+
}
|
|
8178
|
+
)
|
|
8179
|
+
current_block = None
|
|
8180
|
+
current_json_buffer = ""
|
|
8181
|
+
elif etype == "message_delta":
|
|
8182
|
+
d = evt.get("delta", {}) or {}
|
|
8183
|
+
if "stop_reason" in d:
|
|
8184
|
+
stop_reason = d["stop_reason"] or stop_reason
|
|
8185
|
+
u = evt.get("usage", {}) or {}
|
|
8186
|
+
if u:
|
|
8187
|
+
usage.update(u)
|
|
8188
|
+
|
|
8189
|
+
content: list[dict] = []
|
|
8190
|
+
joined_text = "".join(text_parts)
|
|
8191
|
+
if joined_text:
|
|
8192
|
+
content.append({"type": "text", "text": joined_text})
|
|
8193
|
+
content.extend(tool_uses)
|
|
8194
|
+
|
|
8195
|
+
return {
|
|
8196
|
+
"id": message_id,
|
|
8197
|
+
"type": "message",
|
|
8198
|
+
"role": "assistant",
|
|
8199
|
+
"content": content if content else [{"type": "text", "text": ""}],
|
|
8200
|
+
"model": model,
|
|
8201
|
+
"stop_reason": stop_reason,
|
|
8202
|
+
"stop_sequence": None,
|
|
8203
|
+
"usage": usage,
|
|
8204
|
+
}
|
|
8205
|
+
|
|
8206
|
+
|
|
6535
8207
|
@app.get("/v1/models")
|
|
6536
8208
|
async def models():
|
|
6537
8209
|
"""Return available model list (spoofs Anthropic model IDs for client compatibility)."""
|