@miller-tech/uap 1.20.33 → 1.20.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/config/model-profiles/qwen35.json +6 -5
  2. package/dist/.tsbuildinfo +1 -1
  3. package/dist/bin/cli.js +6 -1
  4. package/dist/bin/cli.js.map +1 -1
  5. package/dist/cli/hooks.js +30 -7
  6. package/dist/cli/hooks.js.map +1 -1
  7. package/dist/cli/policy.d.ts.map +1 -1
  8. package/dist/cli/policy.js +26 -0
  9. package/dist/cli/policy.js.map +1 -1
  10. package/dist/index.d.ts +15 -1
  11. package/dist/index.d.ts.map +1 -1
  12. package/dist/index.js +14 -0
  13. package/dist/index.js.map +1 -1
  14. package/dist/types/index.d.ts +20 -0
  15. package/dist/types/index.d.ts.map +1 -1
  16. package/dist/types/index.js +20 -0
  17. package/dist/types/index.js.map +1 -1
  18. package/docs/AGENTS.md +423 -0
  19. package/docs/AGENTS.md</path>CLAUDE.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/INDEX.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/reference/API_REFERENCE.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/reference/UAP_CLI_REFERENCE.md</path>src/index.ts</path>/src/cli/worktree.ts</path>/src/coordination/deploy-batcher.ts</path>/src/policies/policy-gate.ts</path>/src/memory/model-router.ts</path>/src/memory/embeddings.ts</path>/src/models/types.ts</path>/src/types/coordination.ts</path>/src/utils/logger.ts</path>/src/utils/config-loader.ts</path>/src/utils/performance-monitor.ts</path>/src/utils/concurrency.ts</path>/src/utils/concurrency-pool.ts</path>/src/utils/string-similarity.ts</path>/src/utils/rate-limiter.ts</path>/src/utils/system-resources.ts</path>/src/utils/adaptive-cache.ts</path>/src/utils/lazy-imports.ts</path>/src/utils/merge-claude-md.ts</path>/src/utils/stopwords.ts</path>/src/utils/config-loader.ts</path>/src/utils/performance-monitor.ts</path>/src/utils/concurrency.ts</path>/src/utils/concurrency-pool.ts</path>/src/utils/string-similarity.ts</path>/src/utils/rate-limiter.ts</path>/src/utils/system-resources.ts</path>/src/utils/adaptive-cache.ts</path>/src/utils/lazy-imports.ts</path>/src/utils/merge-claude-md.ts</path>/src/utils/stopwords.ts</path> +433 -0
  20. package/docs/DOCUMENTATION_AUDIT_REPORT.md +131 -0
  21. package/docs/GETTING_STARTED.md +288 -0
  22. package/docs/INDEX.md +272 -42
  23. package/docs/PROJECT_ANALYSIS_REPORT.md +510 -0
  24. package/docs/architecture/SYSTEM_ANALYSIS.md +220 -1003
  25. package/docs/blog/local-coding-agents.md +266 -0
  26. package/docs/blog/x-thread.md +254 -0
  27. package/docs/deployment/DEPLOY_BATCHER_ANALYSIS.md +15 -647
  28. package/docs/getting-started/OVERVIEW.md +10 -30
  29. package/docs/getting-started/SETUP.md +183 -9
  30. package/docs/pr/UPSTREAM_PRS.md +424 -0
  31. package/docs/reference/CONFIGURATION.md +208 -0
  32. package/docs/reference/DATABASE_SCHEMA.md +344 -0
  33. package/docs/reference/PATTERN_LIBRARY.md +636 -0
  34. package/package.json +1 -1
  35. package/templates/hooks/uap-policy-gate.sh +36 -0
  36. package/tools/agents/claude_local_agent.py +92 -0
  37. package/tools/agents/config/qwen3.5-enhanced.jinja +187 -0
  38. package/tools/agents/opencode_uap_agent.py +3 -0
  39. package/tools/agents/scripts/anthropic_proxy.py +1748 -76
  40. package/tools/agents/tests/test_anthropic_proxy_streaming.py +64 -8
  41. package/tools/agents/uap_agent.py +1 -1
@@ -134,6 +134,11 @@ PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
134
134
  }
135
135
  PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
136
136
  PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "6"))
137
+ # Fix K (2026-04-22): minimum consecutive cycle-repeat count required to flip
138
+ # phase from act -> review. The old behaviour accepted cycle_repeat=2, which
139
+ # is normal in a working session (re-reading the same file across edits).
140
+ # Set higher to tolerate legitimate re-reads; set 1 to restore old behaviour.
141
+ PROXY_CYCLE_TRIGGER_REPEAT = int(os.environ.get("PROXY_CYCLE_TRIGGER_REPEAT", "3"))
137
142
  PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
138
143
  PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "3"))
139
144
  PROXY_CONTEXT_RELEASE_THRESHOLD = float(
@@ -166,6 +171,12 @@ PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
166
171
  PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
167
172
  os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "3")
168
173
  )
174
+ # Force finalize after N consecutive forced_budget_exhausted events where
175
+ # neither cycling nor stagnation was detected — catches "distinct but
176
+ # unproductive" tool spam that defeats per-tool cycle detection.
177
+ PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT = int(
178
+ os.environ.get("PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT", "2")
179
+ )
169
180
  PROXY_COMPLETION_RECOVERY_MAX = int(
170
181
  os.environ.get("PROXY_COMPLETION_RECOVERY_MAX", "3")
171
182
  )
@@ -205,6 +216,13 @@ PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
205
216
  PROXY_FINALIZE_CONTINUATION_MAX = int(
206
217
  os.environ.get("PROXY_FINALIZE_CONTINUATION_MAX", "3")
207
218
  )
219
+ # Session-level cap: after N total finalize continuations in a session (even
220
+ # across "fresh user text" state resets), stop injecting synthetic tools and
221
+ # let the response terminate naturally. Catches runaway loops that dodge the
222
+ # per-cycle cap by triggering state resets.
223
+ PROXY_FINALIZE_SESSION_HARD_CAP = int(
224
+ os.environ.get("PROXY_FINALIZE_SESSION_HARD_CAP", "3")
225
+ )
208
226
  PROXY_STREAM_REASONING_FALLBACK = (
209
227
  os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
210
228
  )
@@ -234,6 +252,27 @@ PROXY_DISABLE_THINKING_ON_TOOL_TURNS = os.environ.get(
234
252
  "off",
235
253
  "no",
236
254
  }
255
+ # Disable thinking on EVERY turn (not just tool turns). For models like Gemma 4
256
+ # that emit ~100 thinking tokens for trivial replies, this halves output cost.
257
+ PROXY_DISABLE_THINKING_ALWAYS = os.environ.get(
258
+ "PROXY_DISABLE_THINKING_ALWAYS", "off"
259
+ ).lower() not in {"0", "false", "off", "no"}
260
+ # Force tool_choice='required' on the first turn of a fresh session. Originally
261
+ # Qwen-tuned to break out of cold-start "tries to chat instead of calling a tool"
262
+ # behaviour. Gemma 4 doesn't need this — it routes 'auto' correctly and the
263
+ # force triggers malformed-JSON emissions when it would rather speak. Default
264
+ # off; set 'on' to restore the legacy Qwen-style behaviour.
265
+ PROXY_FORCE_TOOL_CHOICE_ON_COLD_START = os.environ.get(
266
+ "PROXY_FORCE_TOOL_CHOICE_ON_COLD_START", "off"
267
+ ).lower() not in {"0", "false", "off", "no"}
268
+ PROXY_DISABLE_SPEC_ON_TOOL_TURNS = os.environ.get(
269
+ "PROXY_DISABLE_SPEC_ON_TOOL_TURNS", "off"
270
+ ).lower() not in {
271
+ "0",
272
+ "false",
273
+ "off",
274
+ "no",
275
+ }
237
276
  PROXY_MALFORMED_TOOL_GUARDRAIL = os.environ.get(
238
277
  "PROXY_MALFORMED_TOOL_GUARDRAIL", "on"
239
278
  ).lower() not in {
@@ -555,6 +594,44 @@ def _is_grammar_tools_incompatibility(status_code: int, error_text: str) -> bool
555
594
  return "custom grammar constraints" in lowered and "with tools" in lowered
556
595
 
557
596
 
597
+ def _is_gemma4_peg_parse_failure(status_code: int, error_text: str) -> bool:
598
+ """Detect Gemma 4's PEG-parser failure on tool-turn output.
599
+
600
+ llama-server returns HTTP 500 with `failed to parse grammar` /
601
+ `Failed to parse input at pos N: <|tool_call>call:...` when the model
602
+ emits an incomplete tool call (missing required schema fields) under
603
+ tool_choice='required'. The PEG grammar enforces the schema strictly
604
+ and rejects the partial output. Caller should retry with relaxed
605
+ tool_choice='auto' so the model can emit prose or a complete call
606
+ without grammar enforcement triggering this failure mode.
607
+ """
608
+ if status_code != 500:
609
+ return False
610
+ text = error_text or ""
611
+ return (
612
+ "Failed to parse input at pos" in text
613
+ or "<|tool_call>call:" in text
614
+ )
615
+
616
+
617
+ def _relax_tool_choice_for_gemma4_peg_retry(request_body: dict, source: str) -> bool:
618
+ """When a Gemma 4 PEG parse failure is detected on a tool turn, drop
619
+ tool_choice='required' so the retry has a permissive grammar. Returns
620
+ True if the body was modified (caller should retry POST)."""
621
+ if not request_body.get("tools"):
622
+ return False
623
+ current = request_body.get("tool_choice")
624
+ if current in ("required", {"type": "any"}):
625
+ request_body["tool_choice"] = "auto"
626
+ logger.warning(
627
+ "GEMMA4 PEG RETRY (%s): relaxed tool_choice='required' -> 'auto' "
628
+ "to bypass strict-grammar parse failure on incomplete model output",
629
+ source,
630
+ )
631
+ return True
632
+ return False
633
+
634
+
558
635
  def _maybe_disable_grammar_for_tools_error(
559
636
  request_body: dict,
560
637
  status_code: int,
@@ -654,6 +731,7 @@ class SessionMonitor:
654
731
  tool_state_stagnation_streak: int = 0
655
732
  tool_state_transitions: int = 0
656
733
  tool_state_review_cycles: int = 0
734
+ tool_state_unproductive_exhaustion_streak: int = 0
657
735
  last_tool_fingerprint: str = ""
658
736
  cycling_tool_names: list = field(default_factory=list)
659
737
  session_banned_tools: set = field(default_factory=set) # tools banned for entire session after repeated cycling
@@ -661,6 +739,7 @@ class SessionMonitor:
661
739
  last_response_garbled: bool = False # previous turn had garbled/malformed output
662
740
  finalize_turn_active: bool = False
663
741
  finalize_continuation_count: int = 0
742
+ finalize_hard_stop_count: int = 0 # monotonic, not reset by fresh user text
664
743
  finalize_synthetic_tool_id: str = ""
665
744
  completion_required: bool = False
666
745
  completion_pending: bool = False
@@ -898,6 +977,7 @@ class SessionMonitor:
898
977
  self.tool_state_auto_budget_remaining = 0
899
978
  self.tool_state_stagnation_streak = 0
900
979
  self.tool_state_review_cycles = 0
980
+ self.tool_state_unproductive_exhaustion_streak = 0
901
981
  self.cycling_tool_names = []
902
982
  self.last_tool_fingerprint = ""
903
983
  self.reset_tool_targets()
@@ -906,7 +986,10 @@ class SessionMonitor:
906
986
  self.completion_required = _should_enforce_completion_contract(anthropic_body)
907
987
  self.completion_progress_signals = _count_completion_progress_signals(anthropic_body)
908
988
  blockers = _completion_blockers(
909
- anthropic_body, has_tool_results, phase=self.tool_turn_phase
989
+ anthropic_body,
990
+ has_tool_results,
991
+ phase=self.tool_turn_phase,
992
+ finalize_fired=(self.finalize_hard_stop_count > 0),
910
993
  )
911
994
  self.completion_blockers = blockers
912
995
  self.completion_pending = self.completion_required and bool(blockers)
@@ -1046,6 +1129,8 @@ class SessionMonitor:
1046
1129
  session_monitors: dict[str, SessionMonitor] = {}
1047
1130
  default_context_window = 0
1048
1131
  last_session_id = ""
1132
+ _last_ctx_recheck_ts: float = 0.0
1133
+ _CTX_RECHECK_INTERVAL: float = 60.0 # Re-detect context window every 60s
1049
1134
 
1050
1135
 
1051
1136
  def _cleanup_stale_monitors(now_ts: float) -> None:
@@ -1058,6 +1143,39 @@ def _cleanup_stale_monitors(now_ts: float) -> None:
1058
1143
  session_monitors.pop(sid, None)
1059
1144
 
1060
1145
 
1146
async def _maybe_recheck_context_window() -> None:
    """Periodically re-query the upstream server's context window.

    Handles server restarts with different --ctx-size mid-session.
    Non-blocking: skips if the check interval hasn't elapsed.

    Best-effort: a failed or malformed /slots response leaves the current
    value untouched and is retried on the next interval. Failures are
    logged at debug level instead of being dropped silently.
    """
    global default_context_window, _last_ctx_recheck_ts
    now = time.time()
    if now - _last_ctx_recheck_ts < _CTX_RECHECK_INTERVAL:
        return
    # Stamp before doing any work so an unreachable upstream is polled at
    # most once per interval rather than on every call.
    _last_ctx_recheck_ts = now
    if http_client is None:
        return
    try:
        slots_url = LLAMA_CPP_BASE.replace("/v1", "/slots")
        resp = await http_client.get(slots_url, timeout=2.0)
        if resp.status_code != 200:
            return
        slots = resp.json()
    except Exception as exc:
        # Non-critical, will retry next interval.
        logger.debug("Context window recheck failed: %s", exc)
        return
    # Validate the payload shape explicitly instead of relying on a
    # catch-all handler to hide AttributeError/TypeError.
    if not slots or not isinstance(slots, list) or not isinstance(slots[0], dict):
        return
    n_ctx = slots[0].get("n_ctx", 0)
    if not isinstance(n_ctx, int) or n_ctx <= 0:
        return
    if n_ctx == default_context_window:
        return
    old = default_context_window
    default_context_window = n_ctx
    # Propagate the new size to every live session monitor.
    for mon in session_monitors.values():
        mon.context_window = n_ctx
    logger.warning(
        "Context window changed: %d → %d (upstream server restarted?)",
        old, n_ctx,
    )
1177
+
1178
+
1061
1179
  def get_session_monitor(session_id: str) -> SessionMonitor:
1062
1180
  now_ts = time.time()
1063
1181
  _cleanup_stale_monitors(now_ts)
@@ -1347,6 +1465,66 @@ def prune_conversation(
1347
1465
  # Granular timeouts: short connect, long read for streaming LLM output.
1348
1466
  http_client: httpx.AsyncClient | None = None
1349
1467
 
1468
+ # ---------------------------------------------------------------------------
1469
+ # Concurrency Control
1470
+ # ---------------------------------------------------------------------------
1471
+ # Semaphore to serialize upstream requests. llama.cpp is configured with
1472
+ # --parallel 1 (LLAMA_PARALLEL=1), so it can only process one inference at
1473
+ # a time. Without this gate, concurrent client requests (Shannon sub-agents,
1474
+ # multiple Claude Code sessions) would all hit llama.cpp at once and the
1475
+ # server would serialize them while the proxy holds N httpx connections
1476
+ # open — potentially exhausting the proxy's connection pool while requests
1477
+ # queue inside llama.cpp opaquely.
1478
+ #
1479
+ # With the semaphore: requests queue inside the proxy (cheap, just asyncio
1480
+ # tasks waiting) and only PROXY_CONCURRENCY_LIMIT at a time reaches
1481
+ # llama.cpp. Each httpx connection is held only for the actual inference
1482
+ # duration, not the queue wait.
1483
+ #
1484
+ # Default: 1 (matches LLAMA_PARALLEL=1). Increase if you raise --parallel.
1485
+ PROXY_CONCURRENCY_LIMIT = int(os.environ.get("PROXY_CONCURRENCY_LIMIT", "1"))
1486
+ # Max time to wait for a slot before returning 503. Generous because real
1487
+ # inference can take 30-600s and queued requests must wait through that.
1488
+ # 0 = wait indefinitely.
1489
+ PROXY_CONCURRENCY_QUEUE_TIMEOUT = float(
1490
+ os.environ.get("PROXY_CONCURRENCY_QUEUE_TIMEOUT", "900")
1491
+ )
1492
+ upstream_semaphore: asyncio.Semaphore | None = None
1493
+
1494
+
1495
async def _acquire_upstream_slot() -> bool:
    """Wait for an upstream concurrency slot.

    Returns:
        True once a slot is held (or when no semaphore has been created
        yet, in which case the request proceeds unthrottled); False if
        PROXY_CONCURRENCY_QUEUE_TIMEOUT seconds elapsed without a slot.

    A timeout of zero or less means "wait indefinitely". Waiters queue on
    asyncio.Semaphore.acquire(), which the original author relies on for
    FIFO ordering.
    """
    sem = upstream_semaphore
    if sem is None:
        # Not yet initialized; proceed without limiting.
        return True
    if PROXY_CONCURRENCY_QUEUE_TIMEOUT > 0:
        try:
            await asyncio.wait_for(
                sem.acquire(),
                timeout=PROXY_CONCURRENCY_QUEUE_TIMEOUT,
            )
        except asyncio.TimeoutError:
            return False
        return True
    # Unbounded wait.
    await sem.acquire()
    return True
1515
+
1516
+
1517
def _release_upstream_slot() -> None:
    """Hand a concurrency slot back. Call exactly once per successful acquire.

    The release is unconditional whenever the semaphore exists: it is
    deliberately NOT gated on locked(), since locked() is only True when
    the counter is 0, and gating would leak slots when limit > 1 and
    several holders release at the same time.
    """
    sem = upstream_semaphore
    if sem is None:
        return
    sem.release()
1527
+
1350
1528
 
1351
1529
  def _is_loading_model_503(resp: httpx.Response) -> bool:
1352
1530
  """Check if response is a 503 'Loading model' from llama.cpp."""
@@ -1390,6 +1568,36 @@ async def _post_with_retry(
1390
1568
  url: str,
1391
1569
  payload: dict,
1392
1570
  headers: dict,
1571
+ ) -> httpx.Response:
1572
+ """Post with upstream-retry + concurrency-slot acquire.
1573
+
1574
+ Acquires a slot from upstream_semaphore before making the request, so
1575
+ concurrent client requests queue in the proxy (cheap asyncio waits)
1576
+ rather than all hammering llama.cpp at once. Slot is released in a
1577
+ finally block so it's always returned to the pool even on error.
1578
+ """
1579
+ acquired = await _acquire_upstream_slot()
1580
+ if not acquired:
1581
+ logger.warning(
1582
+ "CONCURRENCY: queue timeout (%ds) exceeded waiting for upstream slot",
1583
+ int(PROXY_CONCURRENCY_QUEUE_TIMEOUT),
1584
+ )
1585
+ raise httpx.RemoteProtocolError(
1586
+ f"Upstream concurrency queue timed out after {int(PROXY_CONCURRENCY_QUEUE_TIMEOUT)}s "
1587
+ f"(limit={PROXY_CONCURRENCY_LIMIT})",
1588
+ request=None,
1589
+ )
1590
+ try:
1591
+ return await _post_with_retry_inner(client, url, payload, headers)
1592
+ finally:
1593
+ _release_upstream_slot()
1594
+
1595
+
1596
+ async def _post_with_retry_inner(
1597
+ client: httpx.AsyncClient,
1598
+ url: str,
1599
+ payload: dict,
1600
+ headers: dict,
1393
1601
  ) -> httpx.Response:
1394
1602
  last_exc: Exception | None = None
1395
1603
  for attempt in range(PROXY_UPSTREAM_RETRY_MAX):
@@ -1435,6 +1643,7 @@ async def _post_with_generation_timeout(
1435
1643
  headers: dict,
1436
1644
  ) -> httpx.Response:
1437
1645
  """Wrap _post_with_retry with an explicit asyncio generation timeout.
1646
+ Also acquires a concurrency slot before making the request.
1438
1647
 
1439
1648
  The httpx read timeout may not fire for hung connections where the server
1440
1649
  keeps the socket open but produces no data (observed with llama.cpp server
@@ -1499,6 +1708,13 @@ async def lifespan(app: FastAPI):
1499
1708
  """Manage the httpx client lifecycle with the FastAPI app."""
1500
1709
  global http_client
1501
1710
  global default_context_window
1711
+ global upstream_semaphore
1712
+ upstream_semaphore = asyncio.Semaphore(PROXY_CONCURRENCY_LIMIT)
1713
+ logger.info(
1714
+ "CONCURRENCY: upstream semaphore initialized limit=%d queue_timeout=%.0fs",
1715
+ PROXY_CONCURRENCY_LIMIT,
1716
+ PROXY_CONCURRENCY_QUEUE_TIMEOUT,
1717
+ )
1502
1718
  http_client = httpx.AsyncClient(
1503
1719
  timeout=httpx.Timeout(
1504
1720
  connect=10.0, # 10s to establish connection
@@ -1581,6 +1797,8 @@ async def lifespan(app: FastAPI):
1581
1797
  yield
1582
1798
  await http_client.aclose()
1583
1799
  http_client = None
1800
+ if upstream_semaphore is not None:
1801
+ upstream_semaphore = None
1584
1802
  logger.info("Proxy shut down")
1585
1803
 
1586
1804
 
@@ -1591,6 +1809,16 @@ app = FastAPI(
1591
1809
  lifespan=lifespan,
1592
1810
  )
1593
1811
 
1812
+ # NOTE: Concurrency control is enforced by _acquire_upstream_slot() inside
1813
+ # _post_with_retry (the single point where we hit llama.cpp). An earlier
1814
+ # implementation also added an HTTP middleware that acquired the same
1815
+ # semaphore — this caused a self-deadlock (middleware holds slot, inner
1816
+ # call waits for slot, both on the same task). The middleware approach
1817
+ # also called non-existent asyncio.Semaphore methods (try_acquire /
1818
+ # acquire_nowait) and ran an async primitive in a thread executor.
1819
+ # Removed 2026-05-13.
1820
+
1821
+
1594
1822
 
1595
1823
  # ===========================================================================
1596
1824
  # Request Translation: Anthropic -> OpenAI
@@ -1624,6 +1852,31 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
1624
1852
  role = msg["role"]
1625
1853
  content = msg.get("content")
1626
1854
 
1855
+ # Strip <think>...</think> blocks from PRIOR assistant turns. Qwen is
1856
+ # heavily few-shot influenced by its own conversation history — if
1857
+ # earlier assistant turns contain reasoning blocks, the next turn
1858
+ # will pattern-match and emit <think> tags even when the system
1859
+ # prompt forbids them. Stripping breaks the copy cycle.
1860
+ if role == "assistant":
1861
+ if isinstance(content, str) and "<think>" in content:
1862
+ content = _THINKING_BLOCK_RE.sub("", content).lstrip()
1863
+ elif isinstance(content, list):
1864
+ stripped = []
1865
+ for b in content:
1866
+ if isinstance(b, dict) and b.get("type") == "text":
1867
+ t = b.get("text", "")
1868
+ if "<think>" in t:
1869
+ t = _THINKING_BLOCK_RE.sub("", t).lstrip()
1870
+ if t:
1871
+ stripped.append({**b, "text": t})
1872
+ elif isinstance(b, dict) and b.get("type") == "thinking":
1873
+ # Anthropic-style thinking block — drop entirely
1874
+ # (don't replay it back to the model).
1875
+ continue
1876
+ else:
1877
+ stripped.append(b)
1878
+ content = stripped
1879
+
1627
1880
  if isinstance(content, str):
1628
1881
  messages.append({"role": role, "content": content})
1629
1882
  elif isinstance(content, list):
@@ -1633,6 +1886,10 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
1633
1886
  parts.append(block)
1634
1887
  elif block.get("type") == "text":
1635
1888
  parts.append(block.get("text", ""))
1889
+ elif block.get("type") == "thinking":
1890
+ # Drop thinking blocks from user/assistant content when
1891
+ # echoed back into history — model shouldn't see them.
1892
+ continue
1636
1893
  elif block.get("type") == "tool_use":
1637
1894
  messages.append(
1638
1895
  {
@@ -1641,7 +1898,7 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
1641
1898
  "tool_calls": [
1642
1899
  {
1643
1900
  "id": block.get(
1644
- "id", f"call_{uuid.uuid4().hex[:8]}"
1901
+ "id", f"toolu_{uuid.uuid4().hex[:24]}"
1645
1902
  ),
1646
1903
  "type": "function",
1647
1904
  "function": {
@@ -1654,10 +1911,17 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
1654
1911
  )
1655
1912
  continue
1656
1913
  elif block.get("type") == "tool_result":
1914
+ # Strip Anthropic-spec toolu_ prefix so the upstream
1915
+ # tool_call_id matches what llama-server originally
1916
+ # emitted (we stamped the prefix on outbound; reverse it
1917
+ # here so the loop closes correctly).
1918
+ tu_id = block.get("tool_use_id", "")
1919
+ if isinstance(tu_id, str) and tu_id.startswith("toolu_"):
1920
+ tu_id = tu_id[len("toolu_"):]
1657
1921
  messages.append(
1658
1922
  {
1659
1923
  "role": "tool",
1660
- "tool_call_id": block.get("tool_use_id", ""),
1924
+ "tool_call_id": tu_id,
1661
1925
  "content": _extract_text(block.get("content", "")),
1662
1926
  }
1663
1927
  )
@@ -1837,6 +2101,18 @@ _AGENTIC_SYSTEM_SUPPLEMENT_MINIMAL = (
1837
2101
  "\n\nUse tools for all actions. Respond with tool calls, not descriptions of what to do."
1838
2102
  )
1839
2103
 
2104
+ # Directive appended when the upstream model (Qwen) is configured with
2105
+ # enable_thinking=False but consistently emits <think>...</think> blocks
2106
+ # anyway, consuming the max_tokens budget before any tool_use is generated.
2107
+ # Empirically required for Shannon-style workflows where max_tokens=512
2108
+ # leaves no room for both internal reasoning AND a tool call.
2109
+ _NO_THINKING_DIRECTIVE = (
2110
+ "\n\nCRITICAL: Do NOT output <think>...</think> tags or any internal "
2111
+ "reasoning. Begin your response IMMEDIATELY with the appropriate "
2112
+ "tool_call. If you have no tool to call, reply with plain text only — "
2113
+ "never include reasoning blocks."
2114
+ )
2115
+
1840
2116
  if PROXY_AGENTIC_SUPPLEMENT_MODE == "legacy":
1841
2117
  _AGENTIC_SYSTEM_SUPPLEMENT = _AGENTIC_SYSTEM_SUPPLEMENT_LEGACY
1842
2118
  elif PROXY_AGENTIC_SUPPLEMENT_MODE == "minimal":
@@ -1852,6 +2128,9 @@ else:
1852
2128
 
1853
2129
 
1854
2130
  def _content_fingerprint(content) -> str:
2131
+ """Return a STABLE fingerprint for content. Must not include volatile
2132
+ identifiers (tool_use_ids change per-turn), otherwise session stickiness
2133
+ breaks in agentic loops with stateful guardrails."""
1855
2134
  if isinstance(content, str):
1856
2135
  return content[:512]
1857
2136
  if isinstance(content, list):
@@ -1866,7 +2145,10 @@ def _content_fingerprint(content) -> str:
1866
2145
  elif btype == "tool_use":
1867
2146
  parts.append(f"tool:{block.get('name', '')}")
1868
2147
  elif btype == "tool_result":
1869
- parts.append(f"result:{block.get('tool_use_id', '')}")
2148
+ # Stable: use the first 64 chars of the result content, not tool_use_id
2149
+ inner = block.get("content", "")
2150
+ inner_text = _extract_text(inner) if not isinstance(inner, str) else inner
2151
+ parts.append(f"result:{inner_text[:64]}")
1870
2152
  return "\n".join(parts)[:1024]
1871
2153
  return str(content)[:512]
1872
2154
 
@@ -1893,14 +2175,26 @@ def resolve_session_id(request: Request, anthropic_body: dict) -> str:
1893
2175
  first_user = ""
1894
2176
  for msg in anthropic_body.get("messages", []):
1895
2177
  if msg.get("role") == "user":
1896
- first_user = _content_fingerprint(msg.get("content", ""))
2178
+ # Only hash TEXT content of first user message, not tool_result blocks
2179
+ # (which may appear in /anthropic/v1/messages passthrough scenarios)
2180
+ content = msg.get("content", "")
2181
+ if isinstance(content, str):
2182
+ first_user = content[:512]
2183
+ elif isinstance(content, list):
2184
+ text_parts = [
2185
+ b.get("text", "") for b in content
2186
+ if isinstance(b, dict) and b.get("type") == "text"
2187
+ ]
2188
+ first_user = "\n".join(text_parts)[:512]
1897
2189
  break
1898
2190
 
1899
- system_fingerprint = _content_fingerprint(anthropic_body.get("system", ""))
2191
+ # Deliberately exclude `system` from fingerprint — clients often inject
2192
+ # volatile context (timestamps, cwd, session markers) into system prompts
2193
+ # which would break session stickiness for ongoing conversations.
1900
2194
  model = anthropic_body.get("model", "default")
1901
2195
  remote = request.client.host if request.client else "unknown"
1902
2196
  digest = hashlib.sha256(
1903
- f"{remote}|{model}|{system_fingerprint}|{first_user}".encode(
2197
+ f"{remote}|{model}|{first_user}".encode(
1904
2198
  "utf-8", errors="ignore"
1905
2199
  )
1906
2200
  ).hexdigest()[:20]
@@ -1965,7 +2259,10 @@ def _should_enforce_completion_contract(anthropic_body: dict) -> bool:
1965
2259
 
1966
2260
 
1967
2261
  def _completion_blockers(
1968
- anthropic_body: dict, has_tool_results: bool, phase: str = ""
2262
+ anthropic_body: dict,
2263
+ has_tool_results: bool,
2264
+ phase: str = "",
2265
+ finalize_fired: bool = False,
1969
2266
  ) -> list[str]:
1970
2267
  blockers: list[str] = []
1971
2268
  progress = _count_completion_progress_signals(anthropic_body)
@@ -1977,9 +2274,12 @@ def _completion_blockers(
1977
2274
  if last_user_has_result:
1978
2275
  blockers.append("awaiting_post_tool_followup")
1979
2276
  elif _last_assistant_was_text_only(anthropic_body):
1980
- # Option 2: Suppress during finalize — text-only is expected behavior
1981
- # for finalize turns, so blocking on it causes infinite ping-pong.
1982
- if phase != "finalize":
2277
+ # Suppress in two cases:
2278
+ # 1. Currently in finalize phase — text-only is expected
2279
+ # 2. A finalize fired earlier this session — means the state machine
2280
+ # already wrapped up the loop, don't re-trigger it (was causing
2281
+ # finalize -> review -> cycle -> finalize -> review... infinite loop)
2282
+ if phase != "finalize" and not finalize_fired:
1983
2283
  blockers.append("text_only_after_tool_results")
1984
2284
 
1985
2285
  return blockers
@@ -2020,6 +2320,212 @@ def _sanitize_tool_schema_for_llama(schema):
2020
2320
  return _walk(schema), removed
2021
2321
 
2022
2322
 
2323
def _openai_text_blocks(content: list) -> list[dict]:
    """Return Anthropic text blocks for the text parts of an OpenAI content list."""
    blocks: list[dict] = []
    for part in content:
        if isinstance(part, dict) and part.get("type") == "text":
            blocks.append({"type": "text", "text": part.get("text", "")})
        elif isinstance(part, str):
            blocks.append({"type": "text", "text": part})
    return blocks


def _openai_tool_call_input(raw_arguments) -> dict:
    """Decode an OpenAI function-call ``arguments`` value into a tool_use input dict."""
    if isinstance(raw_arguments, dict):
        # Some clients send already-decoded arguments; keep them as-is.
        return raw_arguments
    try:
        parsed = json.loads(raw_arguments or "{}")
    except (ValueError, TypeError):
        return {}
    # tool_use input must be an object; discard scalars/arrays.
    return parsed if isinstance(parsed, dict) else {}


def openai_to_anthropic_request(openai_body: dict) -> dict:
    """Convert an OpenAI Chat Completions request to an Anthropic Messages request.

    Inverse of anthropic_to_openai_messages. Used by /v1/chat/completions passthrough
    to let OpenAI-shaped clients (Forge, etc.) benefit from the Anthropic-path
    guardrails (loop detection, tool narrowing, cycle breaking, etc.).

    Mapping:
      * ``system`` messages are joined into the top-level ``system`` string.
      * ``tool`` messages become user messages holding one ``tool_result`` block.
      * ``assistant`` messages become text blocks plus one ``tool_use`` block
        per tool call.
      * Any other role is treated as ``user``; only text parts are kept.
      * ``max_tokens`` falls back to ``max_completion_tokens``, then 4096.

    Args:
        openai_body: Parsed OpenAI Chat Completions request payload.

    Returns:
        A new Anthropic Messages request dict.
    """
    anthropic_messages: list[dict] = []
    system_text_parts: list[str] = []

    for msg in openai_body.get("messages") or []:
        role = msg.get("role", "")
        content = msg.get("content")

        if role == "system":
            if isinstance(content, str):
                system_text_parts.append(content)
            elif isinstance(content, list):
                system_text_parts.extend(
                    block["text"] for block in _openai_text_blocks(content)
                )
            continue

        if role == "tool":
            # OpenAI tool response -> Anthropic user message with tool_result block
            tool_text = content if isinstance(content, str) else _extract_text(content)
            anthropic_messages.append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "tool_result",
                            "tool_use_id": msg.get("tool_call_id", ""),
                            "content": tool_text,
                        }
                    ],
                }
            )
            continue

        if role == "assistant":
            blocks: list[dict] = []
            if isinstance(content, str) and content:
                blocks.append({"type": "text", "text": content})
            elif isinstance(content, list):
                blocks.extend(_openai_text_blocks(content))

            for tc in msg.get("tool_calls") or []:
                fn = tc.get("function") or {}
                blocks.append(
                    {
                        "type": "tool_use",
                        # `or` also covers an id that is present but null/empty.
                        "id": tc.get("id") or f"toolu_{uuid.uuid4().hex[:12]}",
                        "name": fn.get("name", ""),
                        "input": _openai_tool_call_input(fn.get("arguments", "{}")),
                    }
                )

            anthropic_messages.append(
                {"role": "assistant", "content": blocks if blocks else ""}
            )
            continue

        # role == "user" (or unknown -> treat as user)
        if isinstance(content, str):
            anthropic_messages.append({"role": "user", "content": content})
        elif isinstance(content, list):
            blocks = _openai_text_blocks(content)
            anthropic_messages.append(
                {"role": "user", "content": blocks if blocks else ""}
            )
        else:
            anthropic_messages.append({"role": "user", "content": ""})

    max_tokens = (
        openai_body.get("max_tokens")
        or openai_body.get("max_completion_tokens")
        or 4096
    )
    anthropic_body: dict = {
        "model": openai_body.get("model", "default"),
        "messages": anthropic_messages,
        "max_tokens": int(max_tokens),
    }
    if system_text_parts:
        anthropic_body["system"] = "\n\n".join(p for p in system_text_parts if p)

    for key_o, key_a in (
        ("temperature", "temperature"),
        ("top_p", "top_p"),
        ("top_k", "top_k"),
        ("stop", "stop_sequences"),
        ("stream", "stream"),
    ):
        if key_o in openai_body:
            val = openai_body[key_o]
            # OpenAI allows a bare string for `stop`; Anthropic wants a list.
            if key_a == "stop_sequences" and isinstance(val, str):
                val = [val]
            anthropic_body[key_a] = val

    # Convert OpenAI tools -> Anthropic tools
    anthropic_tools = []
    for tool in openai_body.get("tools") or []:
        fn = tool.get("function", {}) if isinstance(tool, dict) else {}
        if not fn.get("name"):
            continue
        anthropic_tools.append(
            {
                "name": fn.get("name", ""),
                "description": fn.get("description", ""),
                "input_schema": fn.get(
                    "parameters", {"type": "object", "properties": {}}
                ),
            }
        )
    if anthropic_tools:
        anthropic_body["tools"] = anthropic_tools

    tool_choice = openai_body.get("tool_choice")
    if tool_choice == "none":
        # No Anthropic equivalent here: withhold the tools altogether.
        anthropic_body.pop("tools", None)
    elif tool_choice == "required":
        anthropic_body["tool_choice"] = {"type": "any"}
    elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
        anthropic_body["tool_choice"] = {
            "type": "tool",
            "name": tool_choice.get("function", {}).get("name", ""),
        }

    return anthropic_body
2464
+
2465
+
2466
def anthropic_to_openai_response(anthropic_resp: dict) -> dict:
    """Convert an Anthropic Messages response to OpenAI Chat Completions format.

    Text blocks are concatenated into ``message.content`` (``None`` when
    there are none), ``tool_use`` blocks become ``message.tool_calls`` with
    JSON-encoded arguments, and every other block type is dropped.
    ``stop_reason`` maps to ``finish_reason`` (unknown values -> "stop").

    Null values for ``id``, block ``text``, tool ``id`` and usage counters
    are treated the same as missing ones, so a partially populated response
    converts cleanly instead of raising or leaking ``None``.

    Args:
        anthropic_resp: Parsed Anthropic Messages response payload.

    Returns:
        A ``chat.completion`` dict with a single choice.
    """
    text_parts: list[str] = []
    tool_calls: list[dict] = []

    for block in anthropic_resp.get("content", []) or []:
        if not isinstance(block, dict):
            continue
        btype = block.get("type")
        if btype == "text":
            # A null text would break the join below.
            text_parts.append(block.get("text") or "")
        elif btype == "tool_use":
            tool_calls.append(
                {
                    "id": block.get("id") or f"toolu_{uuid.uuid4().hex[:24]}",
                    "type": "function",
                    "function": {
                        "name": block.get("name", ""),
                        "arguments": json.dumps(block.get("input", {}) or {}),
                    },
                }
            )

    finish_map = {
        "end_turn": "stop",
        "stop_sequence": "stop",
        "max_tokens": "length",
        "tool_use": "tool_calls",
    }
    finish_reason = finish_map.get(
        anthropic_resp.get("stop_reason", "end_turn"), "stop"
    )

    message: dict = {
        "role": "assistant",
        "content": "".join(text_parts) if text_parts else None,
    }
    if tool_calls:
        message["tool_calls"] = tool_calls

    usage = anthropic_resp.get("usage", {}) or {}
    prompt_tokens = usage.get("input_tokens") or 0
    completion_tokens = usage.get("output_tokens") or 0

    return {
        "id": anthropic_resp.get("id") or f"chatcmpl-{uuid.uuid4().hex[:12]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": anthropic_resp.get("model", "unknown"),
        "choices": [
            {
                "index": 0,
                "message": message,
                "finish_reason": finish_reason,
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
2527
+
2528
+
2023
2529
  def _convert_anthropic_tools_to_openai(anthropic_tools: list[dict]) -> list[dict]:
2024
2530
  converted = []
2025
2531
  removed_pattern_fields = 0
@@ -2055,6 +2561,72 @@ def _latest_user_text(anthropic_body: dict) -> str:
2055
2561
  return ""
2056
2562
 
2057
2563
 
2564
+ # 2026-05-12: Detect "no-task" user turns to gate the state machine's
2565
+ # force-required path. When the last actual human query is a short ack
2566
+ # ("ok", "3", "test"), an acknowledgement phrase ("standing by", "awaiting
2567
+ # next instruction"), or a status report ending in an ack ("scan complete.
2568
+ # awaiting next instruction"), there is no genuine work for the model to
2569
+ # do. Forcing tool_choice='required' in this state causes the model to
2570
+ # ruminate in <think> blocks, and the meta-tool talk inside those blocks
2571
+ # trips the malformed-pseudo-tool detector. Conservative patterns only.
2572
+ _NO_TASK_SHORT_ACKS = frozenset({
2573
+ "ok", "okay", "k", "kk", "y", "n", "yes", "no", "nope", "yep", "yeah",
2574
+ "thanks", "thank", "thx", "ty", "ack", "noted", "received", "understood",
2575
+ "test", "ping", "hi", "hello",
2576
+ })
2577
+
2578
+ _NO_TASK_ACK_PATTERNS = (
2579
+ re.compile(r"awaiting\s+(?:next|further|your)\s+(?:instruction|input|command|task|directive)", re.I),
2580
+ re.compile(r"standing\s+by(?:\s+for\s+(?:your\s+)?(?:next|further|new)\s+(?:instruction|input|command|task|directive)?)?", re.I),
2581
+ re.compile(r"\b(?:ready|waiting|holding)\s+for\s+(?:your\s+)?(?:next|further|new)\s+(?:task|instruction|command|input|directive)", re.I),
2582
+ # Status report ending in ack: "X complete. {awaiting/standing/ready/done}"
2583
+ re.compile(r"\bcomplet(?:e|ed)\b[\s.,;:!\-]+(?:awaiting|standing\s+by|ready|done|finished|over\s+to\s+you)", re.I),
2584
+ )
2585
+
2586
+
2587
+ def _is_no_task_user_text(text: str) -> bool:
2588
+ if not text:
2589
+ return False
2590
+ stripped = text.strip()
2591
+ if not stripped:
2592
+ return False
2593
+ bare = re.sub(r"[^\w\s]", "", stripped).strip().lower()
2594
+ if bare in _NO_TASK_SHORT_ACKS:
2595
+ return True
2596
+ if re.fullmatch(r"\d+(?:\.\d+)?", bare):
2597
+ return True
2598
+ snippet = stripped[:400]
2599
+ return any(p.search(snippet) for p in _NO_TASK_ACK_PATTERNS)
2600
+
2601
+
2602
+ def _latest_user_query_text(anthropic_body: dict) -> str:
2603
+ """Return the most recent user message *text* — walking past
2604
+ tool_result-only messages to find the last actual human query.
2605
+
2606
+ During agentic loops the trailing user message is a tool_result block
2607
+ with no ``text`` parts, so ``_latest_user_text`` returns empty.
2608
+ Tool-narrowing needs query tokens to score tools; without them it
2609
+ keeps all tools (defeating the purpose). This walker pulls text
2610
+ from prior user turns as a fallback so narrowing stays useful in
2611
+ long loops.
2612
+ """
2613
+ for msg in reversed(anthropic_body.get("messages", [])):
2614
+ if msg.get("role") != "user":
2615
+ continue
2616
+ content = msg.get("content", "")
2617
+ if isinstance(content, str) and content.strip():
2618
+ return content
2619
+ if isinstance(content, list):
2620
+ text_parts = [
2621
+ b.get("text", "")
2622
+ for b in content
2623
+ if isinstance(b, dict) and b.get("type") == "text" and b.get("text")
2624
+ ]
2625
+ if text_parts:
2626
+ return "\n".join(text_parts)
2627
+ return ""
2628
+
2629
+
2058
2630
  def _tokenize_for_tool_ranking(text: str) -> set[str]:
2059
2631
  return {m.group(0).lower() for m in re.finditer(r"[a-zA-Z0-9_]{2,}", text)}
2060
2632
 
@@ -2074,6 +2646,13 @@ def _narrow_tools_for_request(
2074
2646
 
2075
2647
  query_text = _latest_user_text(anthropic_body).lower()
2076
2648
  query_tokens = _tokenize_for_tool_ranking(query_text)
2649
+ if not query_tokens:
2650
+ # Walk back past tool_result turns to find the prior real human
2651
+ # query. Lets narrowing stay effective during agentic loops where
2652
+ # the latest user msg is just a tool_result block (no text).
2653
+ fallback_query = _latest_user_query_text(anthropic_body).lower()
2654
+ query_text = fallback_query or query_text
2655
+ query_tokens = _tokenize_for_tool_ranking(query_text)
2077
2656
  if not query_tokens:
2078
2657
  n_msgs = len(anthropic_body.get("messages", []))
2079
2658
  if (
@@ -2198,6 +2777,18 @@ def _resolve_state_machine_tool_choice(
2198
2777
  monitor.finalize_synthetic_tool_id = ""
2199
2778
  return None, "fresh_user_text"
2200
2779
 
2780
+ # 2026-05-12: No-task ack guard. When the latest user message is just a
2781
+ # tool_result (no fresh text), walk back to the most recent human query.
2782
+ # If that query is a short ack or "X complete. awaiting next" status,
2783
+ # do not force tool_choice — let the model produce a natural finalization
2784
+ # text instead of ruminating in <think> blocks.
2785
+ last_user_query = _latest_user_query_text(anthropic_body).strip()
2786
+ if last_user_query and _is_no_task_user_text(last_user_query):
2787
+ monitor.reset_tool_turn_state(reason="no_task_user_text")
2788
+ monitor.finalize_continuation_count = 0
2789
+ monitor.finalize_synthetic_tool_id = ""
2790
+ return None, "no_task_user_text"
2791
+
2201
2792
  active_loop = (
2202
2793
  has_tool_results
2203
2794
  and last_user_has_tool_result
@@ -2271,7 +2862,15 @@ def _resolve_state_machine_tool_choice(
2271
2862
  dup_tool,
2272
2863
  )
2273
2864
 
2274
- if cycle_looping or stagnating:
2865
+ # Fix K (2026-04-22): require cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
2866
+ # before flipping phase. Single-repeat cycles are legitimate in working
2867
+ # sessions (e.g. re-reading the same file across edits). dup_target
2868
+ # above already demands threshold=3 before asserting a cycle, so the
2869
+ # `cycle_looping = True, cycle_repeat = 2` pair from that branch is
2870
+ # kept as a strong signal (read target repeated 3+ times). Low-repeat
2871
+ # cycles detected by detect_tool_cycle get filtered here.
2872
+ cycle_trip = cycle_looping and cycle_repeat >= PROXY_CYCLE_TRIGGER_REPEAT
2873
+ if cycle_trip or stagnating:
2275
2874
  reason = "cycle_detected" if cycle_looping else "stagnation"
2276
2875
  monitor.set_tool_turn_phase("review", reason=reason)
2277
2876
  monitor.tool_state_review_cycles += 1
@@ -2313,11 +2912,27 @@ def _resolve_state_machine_tool_choice(
2313
2912
 
2314
2913
  if monitor.tool_state_forced_budget_remaining <= 0:
2315
2914
  monitor.set_tool_turn_phase("review", reason="forced_budget_exhausted")
2316
- # Only count toward review cycle limit if there was an actual
2317
- # cycle/stagnation detected. Budget exhaustion alone means the
2318
- # model is working — it just used all its turns — not cycling.
2319
2915
  if cycle_looping or stagnating:
2320
2916
  monitor.tool_state_review_cycles += 1
2917
+ monitor.tool_state_unproductive_exhaustion_streak = 0
2918
+ else:
2919
+ # Track consecutive unproductive exhaustions. Even without a
2920
+ # detected cycle, if the model burns through the forced budget
2921
+ # repeatedly with distinct-but-useless tool calls, treat it as
2922
+ # a loop and force finalize. Catches the 35B-A3B failure mode
2923
+ # where different short tool calls defeat per-tool cycle
2924
+ # detection.
2925
+ monitor.tool_state_unproductive_exhaustion_streak += 1
2926
+ if monitor.tool_state_unproductive_exhaustion_streak >= PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT:
2927
+ logger.warning(
2928
+ "TOOL STATE MACHINE: %d consecutive unproductive budget exhaustions — forcing finalize",
2929
+ monitor.tool_state_unproductive_exhaustion_streak,
2930
+ )
2931
+ monitor.set_tool_turn_phase("finalize", reason="unproductive_exhaustion")
2932
+ monitor.tool_state_unproductive_exhaustion_streak = 0
2933
+ monitor.tool_state_forced_budget_remaining = 0
2934
+ monitor.tool_state_auto_budget_remaining = 0
2935
+ return "finalize", "unproductive_exhaustion"
2321
2936
  monitor.tool_state_auto_budget_remaining = max(
2322
2937
  1, PROXY_TOOL_STATE_AUTO_BUDGET
2323
2938
  )
@@ -2325,10 +2940,11 @@ def _resolve_state_machine_tool_choice(
2325
2940
  1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
2326
2941
  )
2327
2942
  logger.warning(
2328
- "TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s)",
2943
+ "TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s unprod_exh=%d)",
2329
2944
  monitor.tool_state_review_cycles,
2330
2945
  cycle_looping,
2331
2946
  stagnating,
2947
+ monitor.tool_state_unproductive_exhaustion_streak,
2332
2948
  )
2333
2949
  return "required", "forced_budget_exhausted"
2334
2950
 
@@ -2393,6 +3009,33 @@ def build_openai_request(
2393
3009
 
2394
3010
  has_tools = _has_tool_definitions(anthropic_body)
2395
3011
 
3012
+ # Translate Anthropic `thinking` parameter to upstream `enable_thinking`.
3013
+ # Anthropic shape: {"thinking": {"type": "enabled", "budget_tokens": 1024}}
3014
+ # or {"type": "disabled"}. Per the Anthropic spec, thinking is OFF by
3015
+ # default and ONLY enabled when the client opts in. Match that behaviour:
3016
+ # - thinking.type == "enabled" -> enable_thinking=True
3017
+ # - thinking.type == "disabled" or absent -> enable_thinking=False
3018
+ # Without this, Qwen's chat template (which defaults thinking ON) would
3019
+ # consume the client's max_tokens budget on internal reasoning, leaving
3020
+ # nothing for the visible answer.
3021
+ anthropic_thinking = anthropic_body.get("thinking")
3022
+ if isinstance(anthropic_thinking, dict):
3023
+ ttype = (anthropic_thinking.get("type") or "").lower()
3024
+ if ttype == "enabled":
3025
+ openai_body["enable_thinking"] = True
3026
+ else:
3027
+ openai_body["enable_thinking"] = False
3028
+ else:
3029
+ # Match Anthropic default: thinking off unless explicitly requested.
3030
+ openai_body["enable_thinking"] = False
3031
+
3032
+ # Global thinking-off (G): apply to every request, not just tool turns.
3033
+ # Only applies when the client did NOT explicitly request thinking above.
3034
+ # Per-path tool-turn handling below (DISABLE_THINKING_ON_TOOL_TURNS) is
3035
+ # additive — ALWAYS supersedes when set.
3036
+ if PROXY_DISABLE_THINKING_ALWAYS:
3037
+ openai_body["enable_thinking"] = False
3038
+
2396
3039
  # Inject agentic protocol instructions only for tool-enabled turns.
2397
3040
  # Use minimal supplement for qwen models to reduce prompt leak surface.
2398
3041
  if has_tools:
@@ -2402,6 +3045,15 @@ def build_openai_request(
2402
3045
  if "qwen" in model_name and PROXY_AGENTIC_SUPPLEMENT_MODE != "legacy"
2403
3046
  else _AGENTIC_SYSTEM_SUPPLEMENT
2404
3047
  )
3048
+ # When thinking is explicitly disabled (Anthropic default, plus our
3049
+ # tool-turn forcing) but the upstream model is Qwen — which emits
3050
+ # <think> blocks regardless of enable_thinking — append a strong
3051
+ # directive that suppresses internal reasoning. Without this, small
3052
+ # max_tokens budgets get fully consumed by the model's reasoning,
3053
+ # producing required_tool_miss retries (observed in Shannon workflows
3054
+ # with max_tokens=512 + tool_choice=required).
3055
+ if openai_body.get("enable_thinking") is False:
3056
+ supplement = supplement + _NO_THINKING_DIRECTIVE
2405
3057
  if (
2406
3058
  openai_body["messages"]
2407
3059
  and openai_body["messages"][0].get("role") == "system"
@@ -2422,23 +3074,62 @@ def build_openai_request(
2422
3074
  if "max_tokens" in anthropic_body:
2423
3075
  requested_raw = max(1, int(anthropic_body["max_tokens"]))
2424
3076
 
2425
- # Enforce configurable minimum floor for thinking mode: model needs
2426
- # tokens for reasoning (<think>...</think>) plus actual response/tool
2427
- # calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
3077
+ # Enforce configurable minimum floor for tool turns: the model needs
3078
+ # enough headroom to emit complete tool-call arguments (long heredocs,
3079
+ # full-function oldString/newString pairs, etc.) without hitting the
3080
+ # client-requested max_tokens in the middle of a JSON string. If the
3081
+ # client requested >= the floor we keep their value; short preflight
3082
+ # requests (max_tokens <= 1024) always skip the floor to avoid
3083
+ # inflating plan-generation turns.
2428
3084
  #
2429
- # The floor is ONLY applied when thinking is actually enabled —
2430
- # skip it for non-tool requests (tools=0) and for tool turns
2431
- # with thinking disabled, to prevent inflating short preflight
2432
- # requests (e.g. max_tokens=100 for plan generation).
2433
- thinking_active_for_request = has_tools and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
3085
+ # The earlier gating on PROXY_DISABLE_THINKING_ON_TOOL_TURNS was too
3086
+ # restrictive: it skipped the floor on every tool turn once thinking
3087
+ # was off, which re-introduced truncated tool calls on long edits.
3088
+ # Set PROXY_MAX_TOKENS_FLOOR=0 to disable the floor entirely.
3089
+ thinking_active_for_request = (
3090
+ has_tools
3091
+ and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
3092
+ and not PROXY_DISABLE_THINKING_ALWAYS
3093
+ )
3094
+ SMALL_PREFLIGHT_THRESHOLD = 1024
3095
+ # Qwen-style models emit <think> blocks regardless of the
3096
+ # enable_thinking flag (template ignored by trained behaviour).
3097
+ # For tool turns those blocks alone consume ~400-1000 tokens, so a
3098
+ # client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
3099
+ # budget for the tool_call itself — manifesting as required_tool_miss
3100
+ # retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
3101
+ # per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
3102
+ THINKING_MIN_FOR_TOOLS = 2048
2434
3103
  skip_floor = (
2435
- not has_tools # non-tool requests don't need thinking headroom
2436
- or PROXY_DISABLE_THINKING_ON_TOOL_TURNS # thinking disabled on tool turns
3104
+ not has_tools # non-tool requests don't need the headroom
2437
3105
  or PROXY_MAX_TOKENS_FLOOR <= 0 # floor explicitly disabled
3106
+ or requested_raw <= SMALL_PREFLIGHT_THRESHOLD # tiny preflight request
2438
3107
  )
3108
+ # Qwen-style models emit <think> blocks regardless of the
3109
+ # enable_thinking flag (template ignored by trained behaviour).
3110
+ # For tool turns those blocks alone consume ~400-1000 tokens, so a
3111
+ # client-requested max_tokens < THINKING_MIN_FOR_TOOLS leaves no
3112
+ # budget for the tool_call itself — manifesting as required_tool_miss
3113
+ # retries (observed Shannon: max_tokens=512 + tools=7 -> ~5 retries
3114
+ # per turn). Bump up to THINKING_MIN_FOR_TOOLS for these requests.
3115
+ THINKING_MIN_FOR_TOOLS = 2048
2439
3116
  if skip_floor:
2440
3117
  requested_max = requested_raw
2441
- if requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
3118
+ # Even when skipping the big floor, bump small tool-turn
3119
+ # budgets so Qwen's mandatory thinking has room before the
3120
+ # tool_call. Only applies when tools are present.
3121
+ if (
3122
+ has_tools
3123
+ and requested_raw < THINKING_MIN_FOR_TOOLS
3124
+ and requested_raw > 16 # leave true preflight (e.g. max_tokens=1) alone
3125
+ ):
3126
+ requested_max = THINKING_MIN_FOR_TOOLS
3127
+ logger.info(
3128
+ "MAX_TOKENS thinking-floor: %d -> %d (tool turn, Qwen mandatory thinking)",
3129
+ requested_raw,
3130
+ requested_max,
3131
+ )
3132
+ elif requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
2442
3133
  logger.info(
2443
3134
  "MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
2444
3135
  has_tools,
@@ -2612,6 +3303,8 @@ def build_openai_request(
2612
3303
  # Skip all further tool_choice logic — no tools this turn
2613
3304
  if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
2614
3305
  openai_body["enable_thinking"] = False
3306
+ if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
3307
+ openai_body["speculative.n_max"] = 0
2615
3308
  return openai_body
2616
3309
 
2617
3310
  # Check if forced-tool dampener or loop breaker should override tool_choice
@@ -2635,23 +3328,35 @@ def build_openai_request(
2635
3328
  monitor.tool_state_stagnation_streak,
2636
3329
  )
2637
3330
  elif state_choice == "finalize":
2638
- openai_body.pop("tool_choice", None)
2639
- openai_body.pop("tools", None)
3331
+ # Fix H/J (2026-04-22): Do NOT strip tools from the body on
3332
+ # cycle-limit finalize. Stripping tools lets the model emit
3333
+ # prose that LOOKS like a tool call ("<function=edit>…") but
3334
+ # has no structured tool_calls array, so the Anthropic client
3335
+ # sees end_turn with no action and halts. Instead, keep tools
3336
+ # available, set tool_choice=auto, and nudge the model to
3337
+ # either complete with a tool call OR emit a proper summary.
3338
+ # Grammar (when PROXY_TOOL_CALL_GRAMMAR_REQUIRED_ONLY=off) will
3339
+ # still constrain tool-call emission to valid JSON format.
3340
+ openai_body["tool_choice"] = "auto"
2640
3341
  monitor.finalize_turn_active = True
3342
+ monitor.finalize_hard_stop_count += 1 # monotonic marker: a finalize fired this session
2641
3343
  monitor.consecutive_forced_count = 0
2642
3344
  monitor.no_progress_streak = 0
2643
- # Option 3: Inject explicit "no tool calls" instruction to reduce XML leak
2644
3345
  finalize_instruction = {
2645
3346
  "role": "user",
2646
3347
  "content": (
2647
- "Respond with plain text only. Do not emit any tool calls, "
2648
- "XML tags, or JSON objects."
3348
+ "You have been looping on the same tools for several turns. "
3349
+ "Wrap up: either emit ONE decisive tool call that completes "
3350
+ "the task, or reply with a plain-text summary of what you "
3351
+ "accomplished and what is blocking further progress. Do NOT "
3352
+ "emit tool call text in prose form — if you call a tool, do "
3353
+ "it through the structured tool_call mechanism."
2649
3354
  ),
2650
3355
  }
2651
3356
  msgs = openai_body.get("messages", [])
2652
3357
  msgs.append(finalize_instruction)
2653
3358
  logger.warning(
2654
- "TOOL STATE MACHINE: tools temporarily disabled for finalize turn (reason=%s)",
3359
+ "TOOL STATE MACHINE: finalize turn (reason=%s) — tools kept, tool_choice=auto",
2655
3360
  state_reason,
2656
3361
  )
2657
3362
  elif state_choice == "required":
@@ -2732,11 +3437,24 @@ def build_openai_request(
2732
3437
  elif state_reason in {"fresh_user_text", "inactive_loop"} and n_msgs <= 1:
2733
3438
  monitor.consecutive_forced_count = 0
2734
3439
  monitor.no_progress_streak = 0
2735
- logger.info(
2736
- "tool_choice left unchanged after state reset (reason=%s n_msgs=%d)",
2737
- state_reason,
2738
- n_msgs,
2739
- )
3440
+ # Force tool_choice=required on first turn to ensure local models
3441
+ # produce a tool call instead of plain text (cold-start fix).
3442
+ # Gated by PROXY_FORCE_TOOL_CHOICE_ON_COLD_START — Gemma 4 routes
3443
+ # 'auto' correctly without needing the force, and the force
3444
+ # triggers malformed-JSON emissions on Gemma 4 cold turns.
3445
+ if has_tools and n_msgs == 1 and PROXY_FORCE_TOOL_CHOICE_ON_COLD_START:
3446
+ openai_body["tool_choice"] = "required"
3447
+ logger.info(
3448
+ "tool_choice forced to 'required' on first turn (reason=%s n_msgs=%d cold_start_fix=true)",
3449
+ state_reason,
3450
+ n_msgs,
3451
+ )
3452
+ else:
3453
+ logger.info(
3454
+ "tool_choice left unchanged after state reset (reason=%s n_msgs=%d)",
3455
+ state_reason,
3456
+ n_msgs,
3457
+ )
2740
3458
  elif monitor.should_release_tool_choice():
2741
3459
  openai_body["tool_choice"] = "auto"
2742
3460
  monitor.consecutive_forced_count = 0
@@ -2767,10 +3485,18 @@ def build_openai_request(
2767
3485
  monitor.reset_tool_turn_state(reason="no_tool_results")
2768
3486
 
2769
3487
 
2770
- if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
3488
+ if PROXY_DISABLE_THINKING_ALWAYS or PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
2771
3489
  openai_body["enable_thinking"] = False
2772
3490
  logger.info(
2773
- "Thinking disabled for tool turn (PROXY_DISABLE_THINKING_ON_TOOL_TURNS=on)"
3491
+ "Thinking disabled (always=%s tool_turns=%s)",
3492
+ PROXY_DISABLE_THINKING_ALWAYS,
3493
+ PROXY_DISABLE_THINKING_ON_TOOL_TURNS,
3494
+ )
3495
+
3496
+ if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
3497
+ openai_body["speculative.n_max"] = 0
3498
+ logger.info(
3499
+ "Spec decoding disabled for tool turn (PROXY_DISABLE_SPEC_ON_TOOL_TURNS=on)"
2774
3500
  )
2775
3501
 
2776
3502
  _apply_tool_call_grammar(openai_body, grammar_override=profile_grammar)
@@ -3083,7 +3809,10 @@ def _schema_type_matches(value, expected_type: str) -> bool:
3083
3809
 
3084
3810
  def _string_contains_tool_markup(value: str) -> bool:
3085
3811
  lowered = value.lower()
3086
- markers = ("<parameter", "</parameter", "<tool_call", "<function=", "</function")
3812
+ markers = (
3813
+ "<parameter", "</parameter", "<tool_call", "<function=", "</function",
3814
+ "<|tool_call>", "<tool_call|>", # Gemma 4 native DSL
3815
+ )
3087
3816
  return any(marker in lowered for marker in markers)
3088
3817
 
3089
3818
 
@@ -3155,6 +3884,343 @@ _TOOL_CALL_XML_RE = re.compile(
3155
3884
  re.DOTALL,
3156
3885
  )
3157
3886
 
3887
+ # Hermes-style XML function call format emitted by some Qwen/Llama fine-tunes
3888
+ # when grammar is not applied:
3889
+ # <function=name>
3890
+ # <parameter=key>
3891
+ # value
3892
+ # </parameter>
3893
+ # ...
3894
+ # </function>
3895
+ #
3896
+ # The value of a <parameter=KEY> block may span multiple lines and include
3897
+ # arbitrary characters (code snippets, JSON, quotes). The closing
3898
+ # </parameter> tag may be missing if the model emitted EOS prematurely —
3899
+ # in which case we consume up to the next <parameter=...> tag or end of
3900
+ # string. Names are captured as alphanumeric + underscore to avoid pulling
3901
+ # in attribute-like garbage.
3902
+ _HERMES_FUNCTION_RE = re.compile(
3903
+ r"<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>|\Z)",
3904
+ re.DOTALL,
3905
+ )
3906
+ _HERMES_PARAMETER_RE = re.compile(
3907
+ r"<parameter=([A-Za-z_][A-Za-z0-9_]*)>\s*(.*?)\s*(?=</parameter>|<parameter=|\Z)",
3908
+ re.DOTALL,
3909
+ )
3910
+
3911
+
3912
+ def _extract_hermes_tool_calls(text: str) -> tuple[list[dict], str]:
3913
+ """Parse Hermes-style ``<function=name><parameter=k>v</parameter></function>``
3914
+ blocks out of *text*. Used as a fallback when the Qwen JSON format
3915
+ (``<tool_call>{...}</tool_call>``) is not present — for example on
3916
+ finalize turns where grammar does not constrain the output. Tolerates
3917
+ premature EOS (missing closing ``</parameter>`` / ``</function>``)."""
3918
+ if "<function=" not in text:
3919
+ return [], text
3920
+
3921
+ extracted: list[dict] = []
3922
+ matched_spans: list[tuple[int, int]] = []
3923
+
3924
+ for fn_match in _HERMES_FUNCTION_RE.finditer(text):
3925
+ name = fn_match.group(1).strip()
3926
+ body = fn_match.group(2) or ""
3927
+ if not name:
3928
+ continue
3929
+ args: dict = {}
3930
+ for p_match in _HERMES_PARAMETER_RE.finditer(body):
3931
+ key = p_match.group(1).strip()
3932
+ value = p_match.group(2)
3933
+ if key:
3934
+ # Strip one leading newline that the template usually adds
3935
+ # but preserve interior whitespace (code indentation, etc.)
3936
+ if value.startswith("\n"):
3937
+ value = value[1:]
3938
+ args[key] = value
3939
+ extracted.append(
3940
+ {
3941
+ "id": f"toolu_{uuid.uuid4().hex[:24]}",
3942
+ "type": "function",
3943
+ "function": {
3944
+ "name": name,
3945
+ "arguments": json.dumps(args, separators=(",", ":")),
3946
+ },
3947
+ }
3948
+ )
3949
+ matched_spans.append(fn_match.span())
3950
+
3951
+ if not extracted:
3952
+ return [], text
3953
+
3954
+ # Remove matched function blocks from text (plus any dangling
3955
+ # <tool_call>/</tool_call> wrappers around them).
3956
+ remaining = text
3957
+ for start, end in reversed(matched_spans):
3958
+ remaining = remaining[:start] + remaining[end:]
3959
+ # Strip leftover <tool_call>…</tool_call> envelopes that now enclose
3960
+ # nothing useful.
3961
+ remaining = re.sub(r"<tool_call>\s*</tool_call>", "", remaining, flags=re.DOTALL)
3962
+ remaining = remaining.strip()
3963
+
3964
+ logger.info(
3965
+ "TOOL CALL EXTRACTION: recovered %d Hermes-format tool call(s) from text content",
3966
+ len(extracted),
3967
+ )
3968
+ return extracted, remaining
3969
+
3970
+
3971
+ # ---------------------------------------------------------------------------
3972
+ # Gemma 4 tool-call DSL extractors
3973
+ # ---------------------------------------------------------------------------
3974
+ # Gemma 4's chat template emits tool calls as:
3975
+ # <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
3976
+ # Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
3977
+ # Llama-server's --jinja autoparser usually converts these to standard
3978
+ # OpenAI tool_calls, but the raw form can leak through on (a) malformed
3979
+ # emissions, (b) finalize turns, (c) non-tool-template requests where the
3980
+ # model still tries to call a tool. This parser catches those cases.
3981
+ #
3982
+ # Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
3983
+ # markdown blocks when it doesn't trust the template — observed when
3984
+ # tool_choice was forced 'required' but the model lacked confidence in the
3985
+ # native format. Only treated as a tool call when the JSON has a "name".
3986
+ _GEMMA4_TOOL_CALL_DSL_RE = re.compile(
3987
+ r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
3988
+ re.DOTALL,
3989
+ )
3990
+ # Markdown JSON code-block fallback. Group 1 = JSON content (may include
3991
+ # leading/trailing whitespace inside the block).
3992
+ _GEMMA4_MARKDOWN_JSON_RE = re.compile(
3993
+ r"```(?:json)?\s*(\{.*?\})\s*```",
3994
+ re.DOTALL,
3995
+ )
3996
+
3997
+
3998
+ def _parse_gemma4_dsl_args(raw: str) -> dict | None:
3999
+ """Parse Gemma 4's tool-call DSL arg body into a Python dict.
4000
+
4001
+ Input shape (between the `{` and `}` of the DSL):
4002
+ key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
4003
+
4004
+ Strategy: replace `<|"|>` with `"`, wrap unquoted keys in quotes, then
4005
+ feed to json.loads. Returns None on parse failure (caller decides).
4006
+ """
4007
+ if not raw or not raw.strip():
4008
+ return {}
4009
+ s = raw.replace('<|"|>', '"')
4010
+ # Wrap unquoted keys: `key:` -> `"key":` (only at start or after `,` / `{` / whitespace).
4011
+ s = re.sub(r"(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:", r'\1"\2":', s)
4012
+ s = "{" + s + "}"
4013
+ try:
4014
+ parsed = json.loads(s)
4015
+ return parsed if isinstance(parsed, dict) else None
4016
+ except json.JSONDecodeError:
4017
+ return None
4018
+
4019
+
4020
+ def _schema_match_tool(payload: dict, available_tools: list[dict]) -> str | None:
4021
+ """Match a bare-args dict against available tool schemas.
4022
+
4023
+ Score each tool by:
4024
+ - +10 per required field present in payload
4025
+ - +1 per optional property present
4026
+ - -5 per payload key NOT in tool's properties
4027
+ - -100 if any required field is missing
4028
+ Return the name of the highest-scoring tool, or None if no clear match.
4029
+ """
4030
+ if not isinstance(payload, dict) or not available_tools:
4031
+ return None
4032
+ payload_keys = set(payload.keys())
4033
+ best_name = None
4034
+ best_score = 0
4035
+ for tool in available_tools:
4036
+ if not isinstance(tool, dict):
4037
+ continue
4038
+ # Anthropic tools format: {"name": ..., "input_schema": {...}}
4039
+ # OpenAI format: {"type": "function", "function": {"name": ..., "parameters": {...}}}
4040
+ name = tool.get("name")
4041
+ schema = tool.get("input_schema")
4042
+ if name is None and isinstance(tool.get("function"), dict):
4043
+ name = tool["function"].get("name")
4044
+ schema = tool["function"].get("parameters")
4045
+ if not isinstance(name, str) or not isinstance(schema, dict):
4046
+ continue
4047
+ properties = schema.get("properties") if isinstance(schema.get("properties"), dict) else {}
4048
+ required = set(schema.get("required") or [])
4049
+ prop_keys = set(properties.keys())
4050
+ score = 0
4051
+ missing_required = required - payload_keys
4052
+ if missing_required:
4053
+ score -= 100
4054
+ score += 10 * len(required & payload_keys)
4055
+ score += len((payload_keys & prop_keys) - required)
4056
+ score -= 5 * len(payload_keys - prop_keys)
4057
+ if score > best_score:
4058
+ best_score = score
4059
+ best_name = name
4060
+ return best_name if best_score >= 10 else None
4061
+
4062
+
4063
def _gemma4_known_tool_names(available_tools: list[dict] | None) -> set[str]:
    """Collect tool names from Anthropic- or OpenAI-style tool definitions."""
    names: set[str] = set()
    for tool in available_tools or []:
        if not isinstance(tool, dict):
            continue
        name = tool.get("name")
        if name is None and isinstance(tool.get("function"), dict):
            name = tool["function"].get("name")
        if isinstance(name, str) and name:
            names.add(name)
    return names


def _extract_gemma4_tool_calls(
    text: str, available_tools: list[dict] | None = None
) -> tuple[list[dict], str]:
    """Parse Gemma 4 tool-call emissions out of *text*.

    Three formats handled, in order:
      1. Native DSL: ``<|tool_call>call:N{...}<tool_call|>``
      2. Markdown with name: a fenced (optionally ``json``) code block
         holding ``{"name": "N", "arguments": {...}}``. When
         ``available_tools`` yields tool names, ``N`` must be one of them;
         otherwise the block is treated like format 3, so an ordinary JSON
         document that merely has a ``"name"`` key is not turned into a
         call to a nonexistent tool.
      3. Markdown bare-args + ``available_tools`` provided — schema-match
         against tool definitions (the model emits ``{"city": "Paris"}``
         for a get_weather call instead of
         ``{"name": "get_weather", "arguments": {"city": "Paris"}}``).
         Without ``available_tools``, bare-args blocks pass through as text.

    Markdown blocks are only examined when no native DSL call was
    extracted.

    Returns ``(extracted_openai_tool_calls, remaining_text)``. The
    remaining text has the matched spans removed and is stripped; when
    nothing is extracted the original text is returned unchanged.
    """
    if "<|tool_call>" not in text and "```" not in text:
        return [], text

    extracted: list[dict] = []
    matched_spans: list[tuple[int, int]] = []

    # Pattern 1: native DSL
    for m in _GEMMA4_TOOL_CALL_DSL_RE.finditer(text):
        name = m.group(1).strip()
        body = m.group(2) or ""
        if not name:
            continue
        args = _parse_gemma4_dsl_args(body)
        if args is None:
            # DSL body unparseable; skip and let model retry next turn.
            continue
        extracted.append(
            {
                "id": f"toolu_{uuid.uuid4().hex[:24]}",
                "type": "function",
                "function": {
                    "name": name,
                    "arguments": json.dumps(args, separators=(",", ":")),
                },
            }
        )
        matched_spans.append(m.span())

    # Pattern 2: markdown JSON fallback (only if no DSL hit AND text has ```)
    if not extracted and "```" in text:
        known_names = _gemma4_known_tool_names(available_tools)
        for m in _GEMMA4_MARKDOWN_JSON_RE.finditer(text):
            raw_json = m.group(1)
            try:
                payload = json.loads(raw_json)
            except json.JSONDecodeError:
                # Try a JSON repair like the Qwen path does
                repaired = _repair_tool_call_json(raw_json)
                if not repaired:
                    continue
                try:
                    payload = json.loads(repaired)
                except json.JSONDecodeError:
                    continue
            if not isinstance(payload, dict):
                continue
            name = payload.get("name")
            arguments_obj = None
            has_name = isinstance(name, str) and bool(name)
            if has_name and (not known_names or name in known_names):
                # Standard {name, arguments} form
                arguments_obj = payload.get("arguments", payload.get("args", {}))
            elif available_tools:
                # Bare-args block (possibly with a "name" argument of its
                # own) — try schema-matching against available tools
                matched = _schema_match_tool(payload, available_tools)
                if matched is None:
                    continue
                name = matched
                arguments_obj = payload  # whole payload IS the args
                logger.info(
                    "TOOL CALL EXTRACTION: schema-matched bare-args markdown JSON to tool '%s' (keys=%s)",
                    name,
                    sorted(payload.keys())[:6],
                )
            else:
                # No name, no tools to match against — pass through as text
                continue
            if isinstance(arguments_obj, dict):
                arguments = json.dumps(arguments_obj, separators=(",", ":"))
            elif isinstance(arguments_obj, str):
                arguments = arguments_obj
            else:
                arguments = "{}"
            extracted.append(
                {
                    "id": f"toolu_{uuid.uuid4().hex[:24]}",
                    "type": "function",
                    "function": {"name": name, "arguments": arguments},
                }
            )
            matched_spans.append(m.span())

    if not extracted:
        return [], text

    # Strip matched spans from text (in reverse to keep indices valid)
    remaining = text
    for start, end in sorted(matched_spans, key=lambda s: -s[0]):
        remaining = remaining[:start] + remaining[end:]
    remaining = remaining.strip()

    logger.info(
        "TOOL CALL EXTRACTION: recovered %d Gemma 4 tool call(s) from text content",
        len(extracted),
    )
    return extracted, remaining
4173
+
4174
+
4175
+ # ---------------------------------------------------------------------------
4176
+ # Gemma 4 tool-call DSL extractors
4177
+ # ---------------------------------------------------------------------------
4178
+ # Gemma 4's chat template emits tool calls as:
4179
+ # <|tool_call>call:NAME{key1:<|"|>value1<|"|>,key2:42}<tool_call|>
4180
+ # Note the asymmetric open/close tags and `<|"|>` substitution for `"`.
4181
+ # Llama-server's --jinja autoparser usually converts these to standard
4182
+ # OpenAI tool_calls, but the raw form can leak through on (a) malformed
4183
+ # emissions, (b) finalize turns, (c) non-tool-template requests where the
4184
+ # model still tries to call a tool. This parser catches those cases.
4185
+ #
4186
+ # Gemma 4 also falls back to ```json {"name": "...", "arguments": {...}} ```
4187
+ # markdown blocks when it doesn't trust the template — observed when
4188
+ # tool_choice was forced 'required' but the model lacked confidence in the
4189
+ # native format. Only treated as a tool call when the JSON has a "name".
4190
+ _GEMMA4_TOOL_CALL_DSL_RE = re.compile(
4191
+ r"<\|tool_call>\s*call:\s*([A-Za-z_][A-Za-z0-9_]*)\s*\{(.*?)\}\s*<tool_call\|>",
4192
+ re.DOTALL,
4193
+ )
4194
+ # Markdown JSON code-block fallback. Group 1 = JSON content (may include
4195
+ # leading/trailing whitespace inside the block).
4196
+ _GEMMA4_MARKDOWN_JSON_RE = re.compile(
4197
+ r"```(?:json)?\s*(\{.*?\})\s*```",
4198
+ re.DOTALL,
4199
+ )
4200
+
4201
+
4202
+ def _parse_gemma4_dsl_args(raw: str) -> dict | None:
4203
+ """Parse Gemma 4's tool-call DSL arg body into a Python dict.
4204
+
4205
+ Input shape (between the `{` and `}` of the DSL):
4206
+ key1:<|"|>str value<|"|>,key2:42,key3:true,key4:[<|"|>a<|"|>,<|"|>b<|"|>]
4207
+
4208
+ Strategy: replace `<|"|>` with `"`, wrap unquoted keys in quotes, then
4209
+ feed to json.loads. Returns None on parse failure (caller decides).
4210
+ """
4211
+ if not raw or not raw.strip():
4212
+ return {}
4213
+ s = raw.replace('<|"|>', '"')
4214
+ # Wrap unquoted keys: `key:` -> `"key":` (only at start or after `,` / `{` / whitespace).
4215
+ s = re.sub(r"(^|[\s,{\[])([A-Za-z_][A-Za-z0-9_]*)\s*:", r'\1"\2":', s)
4216
+ s = "{" + s + "}"
4217
+ try:
4218
+ parsed = json.loads(s)
4219
+ return parsed if isinstance(parsed, dict) else None
4220
+ except json.JSONDecodeError:
4221
+ return None
4222
+
4223
+
3158
4224
 
3159
4225
  def _repair_tool_call_json(raw: str) -> str | None:
3160
4226
  """Attempt to repair common garbled JSON in tool call payloads.
@@ -3197,7 +4263,9 @@ def _repair_tool_call_json(raw: str) -> str | None:
3197
4263
  return None
3198
4264
 
3199
4265
 
3200
- def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
4266
+ def _extract_tool_calls_from_text(
4267
+ text: str, available_tools: list[dict] | None = None
4268
+ ) -> tuple[list[dict], str]:
3201
4269
  """Parse ``<tool_call>{...}</tool_call>`` blocks out of *text*.
3202
4270
 
3203
4271
  Returns a tuple of (extracted_openai_tool_calls, remaining_text).
@@ -3207,8 +4275,18 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
3207
4275
 
3208
4276
  The *remaining_text* has the matched ``<tool_call>`` blocks removed.
3209
4277
  If no valid blocks are found the original text is returned unchanged.
4278
+ Falls back to Hermes-style ``<function=X><parameter=K>V</parameter></function>``
4279
+ for older Qwen/Llama fine-tunes, then to Gemma 4's
4280
+ ``<|tool_call>call:N{...}<tool_call|>`` DSL and ```json``` markdown
4281
+ blocks. Anything not matching any known format falls through unchanged
4282
+ so plain prose passes the parser without mutation.
3210
4283
  """
3211
- if "<tool_call>" not in text:
4284
+ if (
4285
+ "<tool_call>" not in text
4286
+ and "<function=" not in text
4287
+ and "<|tool_call>" not in text
4288
+ and "```" not in text
4289
+ ):
3212
4290
  return [], text
3213
4291
 
3214
4292
  extracted: list[dict] = []
@@ -3244,14 +4322,24 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
3244
4322
 
3245
4323
  extracted.append(
3246
4324
  {
3247
- "id": f"call_{uuid.uuid4().hex[:12]}",
4325
+ "id": f"toolu_{uuid.uuid4().hex[:24]}",
3248
4326
  "type": "function",
3249
4327
  "function": {"name": name, "arguments": arguments},
3250
4328
  }
3251
4329
  )
3252
4330
 
3253
4331
  if not extracted:
3254
- return [], text
4332
+ # Fall back to Hermes format. This catches Qwen emissions on finalize
4333
+ # turns where grammar is not applied and the model defaults to its
4334
+ # base training's <function=X><parameter=K>V</parameter></function>
4335
+ # format instead of the <tool_call>{JSON}</tool_call> Qwen template
4336
+ # format. Without this path, tool_calls=[] and the client halts.
4337
+ hermes_calls, hermes_remaining = _extract_hermes_tool_calls(text)
4338
+ if hermes_calls:
4339
+ return hermes_calls, hermes_remaining
4340
+ # Then try Gemma 4's DSL + markdown-JSON fallback. Anything still
4341
+ # not matching falls through as plain text.
4342
+ return _extract_gemma4_tool_calls(text, available_tools=available_tools)
3255
4343
 
3256
4344
  # Strip matched tool_call blocks from the text
3257
4345
  remaining = _TOOL_CALL_XML_RE.sub("", text).strip()
@@ -4222,6 +5310,16 @@ def _classify_tool_response_issue(
4222
5310
  if "tools" not in anthropic_body:
4223
5311
  return ToolResponseIssue()
4224
5312
 
5313
+ # When the upstream response was cut off by max_tokens (finish_reason=length),
5314
+ # any garbled/unbalanced-brace appearance in the tool args is almost
5315
+ # certainly truncation, not degenerate generation. Re-classify such
5316
+ # issues as "truncated_tool_args" so the caller can still retry (with a
5317
+ # larger cap) but WITHOUT triggering the forced-tool dampener, which
5318
+ # otherwise penalises a perfectly-recoverable truncation event.
5319
+ choice_for_finish, _ = _extract_openai_choice(openai_resp)
5320
+ finish_reason = (choice_for_finish.get("finish_reason") or "").lower()
5321
+ was_truncated = finish_reason == "length"
5322
+
4225
5323
  if _is_malformed_tool_response(openai_resp, anthropic_body):
4226
5324
  return ToolResponseIssue(
4227
5325
  kind="malformed_payload",
@@ -4265,15 +5363,107 @@ def _classify_tool_response_issue(
4265
5363
  allowed_tools,
4266
5364
  )
4267
5365
  if issue.has_issue():
5366
+ # Downgrade invalid_tool_args to truncated_tool_args when the
5367
+ # response hit max_tokens — retry path still fires but the
5368
+ # dampener/streak counters stay cold.
5369
+ if was_truncated and issue.kind == "invalid_tool_args":
5370
+ return ToolResponseIssue(
5371
+ kind="truncated_tool_args",
5372
+ reason=(
5373
+ f"tool call for '{tool_name}' truncated by max_tokens "
5374
+ f"({issue.reason})"
5375
+ ),
5376
+ retry_hint=issue.retry_hint,
5377
+ )
4268
5378
  return issue
4269
5379
 
4270
5380
  return ToolResponseIssue()
4271
5381
 
4272
5382
 
5383
# 2026-05-12: Tag scanner for tool-call XML. Group 1 is "/" for a closing
# tag and "" for an opening tag, group 2 is the tag name ("parameter" or
# "function"), group 3 holds whatever follows the name up to ">". Covers
# <parameter>, <parameter=key>, <parameter name="key">, </parameter>,
# <function=name> and </function>.
_TOOL_XML_TAG_RE = re.compile(r"<(/?)(parameter|function)\b([^>]*)>")


def _strip_orphan_tool_xml(text: str) -> str:
    """Drop ``</parameter>`` / ``</function>`` closers that nothing opened.

    Tags are visited left to right while a per-tag count of currently open
    tags is maintained. Openers are always kept. A closer is kept (and
    balances one opener of the same tag name) only when such an opener was
    seen earlier; otherwise it is removed. All other text is preserved
    verbatim.

    Motivation (per the original author): Qwen3.6 trained on the qwen3_coder
    XML format leaks bare closers after its real answer when forced into
    tool_choice='required' with nothing to call. Removing them keeps the
    malformed-payload heuristic from firing on otherwise clean text, while
    genuine malformed attempts keep their opener and are still detected.
    """
    # Fast path: without any closer there is nothing that could be orphaned.
    if "</parameter" not in text and "</function" not in text:
        return text

    open_count = {"parameter": 0, "function": 0}

    def _keep_or_drop(match: re.Match) -> str:
        tag_name = match.group(2)
        if match.group(1) != "/":
            # Opening tag: remember it and leave it in place.
            open_count[tag_name] += 1
            return match.group(0)
        if open_count[tag_name] == 0:
            # Closing tag with no pending opener: orphan, strip it.
            return ""
        open_count[tag_name] -= 1
        return match.group(0)

    return _TOOL_XML_TAG_RE.sub(_keep_or_drop, text)
5434
+
5435
+
4273
5436
  def _looks_malformed_tool_payload(text: str) -> bool:
4274
5437
  if not text:
4275
5438
  return False
4276
5439
 
5440
+ # 2026-05-12: Strip balanced <think>...</think> blocks before applying
5441
+ # the heuristic. Qwen3.6 emits <think> blocks regardless of
5442
+ # enable_thinking, and two scenarios were tripping false positives:
5443
+ # 1. Meta-tool reasoning inside the thinking ({"description":...},
5444
+ # repeated "must call a tool") triggering the structural-marker
5445
+ # and policy-echo branches.
5446
+ # 2. The model wrapping its ENTIRE answer inside a single <think>
5447
+ # block (markdown reports, tables) — the </think> structural
5448
+ # marker plus content-resembling-policy then fires.
5449
+ # Downstream response processing surfaces <think> content as proper
5450
+ # Anthropic `thinking` blocks via _THINKING_BLOCK_RE, so stripping
5451
+ # here loses no information. Unbalanced/stray </think> without a
5452
+ # matching opener is NOT stripped — those remain genuinely malformed.
5453
+ if "<think>" in text and "</think>" in text:
5454
+ text = _THINKING_BLOCK_RE.sub("", text)
5455
+ if not text.strip():
5456
+ return False
5457
+
5458
+ # 2026-05-12: Strip orphan </parameter> and </function> closers that
5459
+ # have no matching opener. Qwen3.6 leaks these training residuals
5460
+ # after its visible answer when forced into tool_choice='required'
5461
+ # with no valid tool to call. Real malformed tool-call attempts retain
5462
+ # their opener and still trip the primary_markers check below.
5463
+ text = _strip_orphan_tool_xml(text)
5464
+ if not text.strip():
5465
+ return False
5466
+
4277
5467
  lowered = text.lower()
4278
5468
  if _contains_tool_call_apology(text):
4279
5469
  return True
@@ -4508,13 +5698,17 @@ def _build_malformed_retry_body(
4508
5698
  retry_instruction = (
4509
5699
  "Your previous response had invalid tool-call formatting. "
4510
5700
  "Respond with exactly one valid tool call using the provided tools. "
4511
- "Do not output prose, markdown, XML tags, or schema snippets."
5701
+ "Do not output prose, markdown, XML tags, or schema snippets. "
5702
+ "Do NOT use <think>...</think> blocks or internal reasoning — "
5703
+ "emit the tool_call object as the very first token of your response."
4512
5704
  )
4513
5705
  else:
4514
5706
  retry_instruction = (
4515
5707
  "Your previous response had invalid tool-call formatting. "
4516
5708
  "If a tool is needed, emit exactly one valid tool call with strict JSON arguments. "
4517
- "If no tool is needed for this turn, return concise plain text with no protocol tags."
5709
+ "If no tool is needed for this turn, return concise plain text with no protocol tags. "
5710
+ "Do NOT use <think>...</think> blocks — start your response directly with "
5711
+ "either a tool_call or the plain text answer."
4518
5712
  )
4519
5713
 
4520
5714
  malformed_retry_instruction = {
@@ -4695,7 +5889,7 @@ async def _apply_unexpected_end_turn_guardrail(
4695
5889
  )
4696
5890
  if retry_resp.status_code == 200:
4697
5891
  retry_json = retry_resp.json()
4698
- _maybe_extract_text_tool_calls(retry_json)
5892
+ _maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
4699
5893
  retry_choice, retry_message = _extract_openai_choice(retry_json)
4700
5894
  if _openai_has_valid_tool_calls(retry_json, anthropic_body):
4701
5895
  logger.info("GUARDRAIL: retry produced tool_use; using retried response")
@@ -4784,8 +5978,12 @@ async def _apply_malformed_tool_guardrail(
4784
5978
  )
4785
5979
  return working_resp
4786
5980
 
4787
- # Mark garbled state for progressive max_tokens reduction on next turn
4788
- monitor.last_response_garbled = True
5981
+ # Only set last_response_garbled for TRUE degenerate generation, not
5982
+ # for responses merely truncated by max_tokens — otherwise the next
5983
+ # turn gets hit with the garbled_cap (smaller max_tokens) and the
5984
+ # problem compounds.
5985
+ if issue.kind != "truncated_tool_args":
5986
+ monitor.last_response_garbled = True
4789
5987
 
4790
5988
  if issue.kind == "malformed_payload":
4791
5989
  monitor.malformed_tool_streak += 1
@@ -4793,7 +5991,12 @@ async def _apply_malformed_tool_guardrail(
4793
5991
  monitor.invalid_tool_call_streak += 1
4794
5992
  monitor.arg_preflight_rejections += 1
4795
5993
 
4796
- monitor.maybe_activate_forced_tool_dampener(issue.kind)
5994
+ # Truncation is a max_tokens accident, not the model misbehaving: don't
5995
+ # feed it to the forced-tool dampener, which would otherwise relax
5996
+ # tool_choice on the very next turn and let the model trail off with
5997
+ # text (the exact failure mode that stopped opencode).
5998
+ if issue.kind != "truncated_tool_args":
5999
+ monitor.maybe_activate_forced_tool_dampener(issue.kind)
4797
6000
  excerpt = _openai_message_text(working_resp)[:220].replace("\n", " ")
4798
6001
  # Option 2: Log garbled argument content for diagnostics
4799
6002
  arg_excerpt = ""
@@ -4866,7 +6069,7 @@ async def _apply_malformed_tool_guardrail(
4866
6069
  continue
4867
6070
 
4868
6071
  retry_json = retry_resp.json()
4869
- _maybe_extract_text_tool_calls(retry_json)
6072
+ _maybe_extract_text_tool_calls(retry_json, anthropic_tools=anthropic_body.get("tools"))
4870
6073
  retry_working = retry_json
4871
6074
  retry_repairs = 0
4872
6075
  if PROXY_TOOL_ARGS_PREFLIGHT and _openai_has_tool_calls(retry_json):
@@ -4898,15 +6101,20 @@ async def _apply_malformed_tool_guardrail(
4898
6101
  )
4899
6102
 
4900
6103
  if not retry_issue.has_issue():
4901
- monitor.malformed_tool_streak = 0
4902
- monitor.invalid_tool_call_streak = 0
4903
- monitor.required_tool_miss_streak = 0
6104
+ # 2026-05-12: Fix #2 — do NOT reset malformed/invalid/miss streaks
6105
+ # to 0 on retry-success. Previously, sessions stuck in a
6106
+ # malformed→retry-success loop never accumulated enough streak to
6107
+ # trigger the forced-tool dampener. Healthy responses with real
6108
+ # tool_calls still reset the streak via the upstream no-issue path
6109
+ # (~L5655), so genuine recovery still resets counters; only
6110
+ # repeated retry-recoveries persist toward the dampener.
4904
6111
  monitor.last_response_garbled = False
4905
6112
  logger.info(
4906
- "TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d",
6113
+ "TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d malformed_streak=%d",
4907
6114
  current_issue.kind,
4908
6115
  attempt + 1,
4909
6116
  attempts,
6117
+ monitor.malformed_tool_streak,
4910
6118
  )
4911
6119
  if retry_repairs > 0:
4912
6120
  monitor.arg_preflight_repairs += retry_repairs
@@ -4931,7 +6139,10 @@ async def _apply_malformed_tool_guardrail(
4931
6139
  if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
4932
6140
  failing_tools.add(fn_name)
4933
6141
 
4934
- monitor.maybe_activate_forced_tool_dampener(retry_issue.kind)
6142
+ # Truncation on retry is still a max_tokens problem, not a model
6143
+ # misbehaviour — don't dampen. The outer retry loop will try again.
6144
+ if retry_issue.kind != "truncated_tool_args":
6145
+ monitor.maybe_activate_forced_tool_dampener(retry_issue.kind)
4935
6146
  logger.warning(
4936
6147
  "TOOL RESPONSE RETRY invalid: session=%s attempt=%d/%d kind=%s reason=%s",
4937
6148
  session_id,
@@ -5112,11 +6323,19 @@ def _maybe_apply_session_contamination_breaker(
5112
6323
  # ===========================================================================
5113
6324
 
5114
6325
 
5115
- def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
6326
+ def _maybe_extract_text_tool_calls(
6327
+ openai_resp: dict, anthropic_tools: list[dict] | None = None
6328
+ ) -> dict:
5116
6329
  """Mutate *openai_resp* in-place: if the message has no structured
5117
- ``tool_calls`` but contains ``<tool_call>`` XML in text, extract them
5118
- and promote to real ``tool_calls`` on the message. Returns the
5119
- (possibly-mutated) response for chaining."""
6330
+ ``tool_calls`` but contains tool-call markup in text, extract them
6331
+ and promote to real ``tool_calls`` on the message.
6332
+
6333
+ *anthropic_tools* (optional): list of tool definitions from the original
6334
+ Anthropic request. Enables schema-matching of bare-args markdown JSON
6335
+ blocks emitted by Gemma 4 cold turns (fix D). Without it, bare-args
6336
+ blocks pass through as text.
6337
+
6338
+ Returns the (possibly-mutated) response for chaining."""
5120
6339
  choice = (openai_resp.get("choices") or [{}])[0]
5121
6340
  message = choice.get("message", {})
5122
6341
 
@@ -5125,10 +6344,20 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
5125
6344
  return openai_resp
5126
6345
 
5127
6346
  text = message.get("content", "")
5128
- if not isinstance(text, str) or "<tool_call>" not in text:
6347
+ if not isinstance(text, str):
6348
+ return openai_resp
6349
+ # Quick early-exit if no markers present (matches dispatcher guard)
6350
+ if (
6351
+ "<tool_call>" not in text
6352
+ and "<function=" not in text
6353
+ and "<|tool_call>" not in text
6354
+ and "```" not in text
6355
+ ):
5129
6356
  return openai_resp
5130
6357
 
5131
- extracted, remaining = _extract_tool_calls_from_text(text)
6358
+ extracted, remaining = _extract_tool_calls_from_text(
6359
+ text, available_tools=anthropic_tools
6360
+ )
5132
6361
  if not extracted:
5133
6362
  return openai_resp
5134
6363
 
@@ -5212,6 +6441,18 @@ def _inject_synthetic_continuation(
5212
6441
  Appends a no-op Read("/dev/null") tool_use block and changes stop_reason
5213
6442
  from "end_turn" to "tool_use" so the client continues sending requests.
5214
6443
  """
6444
+ # Session-level hard cap: if we've already done N continuations in this
6445
+ # session (counter is monotonic, survives fresh-user-text resets), stop
6446
+ # injecting and let the response terminate. This catches runaway loops
6447
+ # that dodge the per-cycle cap via state resets.
6448
+ if monitor.finalize_hard_stop_count >= PROXY_FINALIZE_SESSION_HARD_CAP:
6449
+ logger.warning(
6450
+ "FINALIZE CONTINUATION: session hard cap reached (%d/%d) — not injecting, allowing termination",
6451
+ monitor.finalize_hard_stop_count,
6452
+ PROXY_FINALIZE_SESSION_HARD_CAP,
6453
+ )
6454
+ return anthropic_resp
6455
+
5215
6456
  # Pick a safe tool the client knows about (case-insensitive match,
5216
6457
  # then use the client's actual casing for the tool name)
5217
6458
  if _client_has_tool(anthropic_body, "read"):
@@ -5227,6 +6468,7 @@ def _inject_synthetic_continuation(
5227
6468
  synthetic_id = f"toolu_{uuid.uuid4().hex[:12]}"
5228
6469
  monitor.finalize_synthetic_tool_id = synthetic_id
5229
6470
  monitor.finalize_continuation_count += 1
6471
+ monitor.finalize_hard_stop_count += 1
5230
6472
 
5231
6473
  content = anthropic_resp.get("content", [])
5232
6474
  content.append({
@@ -5239,17 +6481,54 @@ def _inject_synthetic_continuation(
5239
6481
  anthropic_resp["stop_reason"] = "tool_use"
5240
6482
 
5241
6483
  logger.info(
5242
- "FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d)",
6484
+ "FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d, session=%d/%d)",
5243
6485
  tool_name,
5244
6486
  synthetic_id,
5245
6487
  monitor.finalize_continuation_count,
5246
6488
  PROXY_FINALIZE_CONTINUATION_MAX,
6489
+ monitor.finalize_hard_stop_count,
6490
+ PROXY_FINALIZE_SESSION_HARD_CAP,
5247
6491
  )
5248
6492
  return anthropic_resp
5249
6493
 
5250
6494
 
5251
- def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
5252
- """Convert an OpenAI Chat Completions response to Anthropic Messages format."""
6495
+ _THINKING_BLOCK_RE = re.compile(r"<think>(.*?)</think>\s*", re.DOTALL)
6496
+
6497
+
6498
+ def _extract_thinking_block(text: str) -> tuple[str | None, str]:
6499
+ """Extract Qwen-style ``<think>...</think>`` blocks from *text*.
6500
+
6501
+ Returns ``(thinking_content, remaining_text)``. If no ``<think>`` tag is
6502
+ present, returns ``(None, text)`` unchanged. Multiple thinking blocks
6503
+ are concatenated. Trailing whitespace after each block is consumed so
6504
+ the remaining text starts cleanly with the model's actual answer.
6505
+ """
6506
+ if "<think>" not in text:
6507
+ return None, text
6508
+ parts: list[str] = []
6509
+ def collect(m: re.Match) -> str:
6510
+ parts.append(m.group(1).strip())
6511
+ return ""
6512
+ remaining = _THINKING_BLOCK_RE.sub(collect, text)
6513
+ if not parts:
6514
+ return None, text
6515
+ return "\n\n".join(p for p in parts if p), remaining.lstrip()
6516
+
6517
+
6518
+ def openai_to_anthropic_response(
6519
+ openai_resp: dict, model: str, expose_thinking: bool = True
6520
+ ) -> dict:
6521
+ """Convert an OpenAI Chat Completions response to Anthropic Messages format.
6522
+
6523
+ *expose_thinking*: when True, surface ``<think>...</think>`` content from
6524
+ the upstream as Anthropic ``{"type": "thinking"}`` blocks. When False
6525
+ (Anthropic default — client didn't opt in), strip thinking content
6526
+ from the response entirely so the client only sees the actual answer.
6527
+ Qwen's chat template seeds the model into thinking regardless of the
6528
+ ``enable_thinking`` request param, so even thinking-off responses
6529
+ typically still contain ``<think>`` blocks; this flag controls whether
6530
+ they're surfaced as Anthropic blocks or silently consumed.
6531
+ """
5253
6532
  # First: try to recover tool calls trapped in text XML tags
5254
6533
  _maybe_extract_text_tool_calls(openai_resp)
5255
6534
  # Second: strip garbled/degenerate tool call arguments
@@ -5260,20 +6539,46 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
5260
6539
  finish = choice.get("finish_reason", "stop")
5261
6540
 
5262
6541
  content = []
6542
+ # Surface Qwen's <think>...</think> output as Anthropic-style thinking
6543
+ # blocks (Anthropic extended-thinking API shape:
6544
+ # {"type": "thinking", "thinking": "...", "signature": ""}).
6545
+ # Clients that don't request thinking simply ignore the block; clients
6546
+ # that do (Claude Code) render them in the thinking pane.
6547
+ raw_text = ""
5263
6548
  if message.get("content"):
5264
6549
  raw_text = (
5265
6550
  message["content"]
5266
6551
  if isinstance(message["content"], str)
5267
6552
  else str(message["content"])
5268
6553
  )
5269
- sanitized_text = _sanitize_tool_call_apology_text(raw_text)
5270
- if sanitized_text != raw_text:
6554
+ # Some llama-server builds emit the model's reasoning into a separate
6555
+ # `reasoning_content` field instead of inline <think> tags. Surface
6556
+ # that too so the proxy is consistent regardless of upstream behaviour.
6557
+ inline_thinking, body_text = _extract_thinking_block(raw_text)
6558
+ sidecar_thinking = message.get("reasoning_content") or message.get("reasoning")
6559
+ thinking_chunks: list[str] = []
6560
+ if isinstance(sidecar_thinking, str) and sidecar_thinking.strip():
6561
+ thinking_chunks.append(sidecar_thinking.strip())
6562
+ if inline_thinking:
6563
+ thinking_chunks.append(inline_thinking)
6564
+ if thinking_chunks and expose_thinking:
6565
+ content.append(
6566
+ {
6567
+ "type": "thinking",
6568
+ "thinking": "\n\n".join(thinking_chunks),
6569
+ "signature": "",
6570
+ }
6571
+ )
6572
+
6573
+ if body_text:
6574
+ sanitized_text = _sanitize_tool_call_apology_text(body_text)
6575
+ if sanitized_text != body_text:
5271
6576
  logger.warning(
5272
6577
  "SANITIZE: replaced known malformed tool-call apology text in assistant response"
5273
6578
  )
5274
6579
  # Option 1: Strip residual <tool_call> XML that wasn't extracted
5275
6580
  sanitized_text = _strip_residual_tool_call_xml(sanitized_text)
5276
- if sanitized_text != raw_text and "<tool_call>" in raw_text:
6581
+ if sanitized_text != body_text and "<tool_call>" in body_text:
5277
6582
  logger.warning(
5278
6583
  "SANITIZE: stripped residual <tool_call> XML from text content"
5279
6584
  )
@@ -5298,10 +6603,21 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
5298
6603
  logger.warning(
5299
6604
  "BASH SAFETY: stripped standalone protocol-tag lines from command before tool execution"
5300
6605
  )
6606
+ # Normalise IDs to Anthropic spec (toolu_ prefix). Upstream
6607
+ # llama-server returns opaque IDs without prefix; clients that
6608
+ # validate prefix would reject. Strip-and-restamp here, restore in
6609
+ # anthropic_to_openai_messages() when client sends tool_result back.
6610
+ upstream_id = tc.get("id", "")
6611
+ if upstream_id.startswith("toolu_"):
6612
+ tool_use_id = upstream_id
6613
+ elif upstream_id:
6614
+ tool_use_id = f"toolu_{upstream_id}"
6615
+ else:
6616
+ tool_use_id = f"toolu_{uuid.uuid4().hex[:24]}"
5301
6617
  content.append(
5302
6618
  {
5303
6619
  "type": "tool_use",
5304
- "id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
6620
+ "id": tool_use_id,
5305
6621
  "name": fn.get("name", ""),
5306
6622
  "input": args,
5307
6623
  }
@@ -5804,6 +7120,10 @@ async def messages(request: Request):
5804
7120
  is_stream = body.get("stream", False)
5805
7121
  model = body.get("model", "default")
5806
7122
  client_id = resolve_client_id(request)
7123
+
7124
+ # Periodically re-detect context window from upstream (handles server restarts)
7125
+ await _maybe_recheck_context_window()
7126
+
5807
7127
  if _should_passthrough_model(model):
5808
7128
  logger.info("PASSTHROUGH: model=%s -> %s", model, ANTHROPIC_API_BASE)
5809
7129
  return await _passthrough_anthropic_request(request, body, is_stream)
@@ -5861,8 +7181,9 @@ async def messages(request: Request):
5861
7181
  last_text = str(last_content)[:200]
5862
7182
  rate_count = log_client_rate(client_id)
5863
7183
  logger.info(
5864
- "REQ: client=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
7184
+ "REQ: client=%s sess=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
5865
7185
  client_id,
7186
+ session_id,
5866
7187
  PROXY_CLIENT_RATE_WINDOW_SECS,
5867
7188
  rate_count,
5868
7189
  is_stream,
@@ -5993,7 +7314,7 @@ async def messages(request: Request):
5993
7314
  )
5994
7315
  except Exception as exc:
5995
7316
  # Check if upstream is hung before returning error
5996
- await _check_slot_hang(f"{LLAMA_CPP_BASE}/slots")
7317
+ await _check_slot_hang(LLAMA_CPP_BASE.replace("/v1", "/slots"))
5997
7318
  return Response(
5998
7319
  content=json.dumps(
5999
7320
  {
@@ -6008,6 +7329,23 @@ async def messages(request: Request):
6008
7329
  media_type="application/json",
6009
7330
  )
6010
7331
 
7332
+ if strict_resp.status_code != 200:
7333
+ error_text = strict_resp.text[:1000]
7334
+ # Try the Gemma 4 PEG parse-failure recovery first — relax
7335
+ # tool_choice='required' so the retry isn't constrained by the
7336
+ # strict-grammar that triggered the parse failure.
7337
+ relaxed = _is_gemma4_peg_parse_failure(strict_resp.status_code, error_text) and \
7338
+ _relax_tool_choice_for_gemma4_peg_retry(strict_body, "strict-stream")
7339
+ if relaxed:
7340
+ try:
7341
+ strict_resp = await _post_with_generation_timeout(
7342
+ client,
7343
+ f"{LLAMA_CPP_BASE}/chat/completions",
7344
+ strict_body,
7345
+ {"Content-Type": "application/json"},
7346
+ )
7347
+ except Exception:
7348
+ pass # fall through to next handler
6011
7349
  if strict_resp.status_code != 200:
6012
7350
  error_text = strict_resp.text[:1000]
6013
7351
  if _maybe_disable_grammar_for_tools_error(
@@ -6082,7 +7420,7 @@ async def messages(request: Request):
6082
7420
 
6083
7421
  openai_resp = strict_resp.json()
6084
7422
  # Recover tool calls from <tool_call> XML before guardrails run
6085
- _maybe_extract_text_tool_calls(openai_resp)
7423
+ _maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
6086
7424
  openai_resp = await _apply_unexpected_end_turn_guardrail(
6087
7425
  client,
6088
7426
  openai_resp,
@@ -6137,7 +7475,11 @@ async def messages(request: Request):
6137
7475
  logger.info("DEGENERATE RETRY: retry insufficient, using truncated original")
6138
7476
  except Exception as exc:
6139
7477
  logger.warning("DEGENERATE RETRY: failed: %s", exc)
6140
- anthropic_resp = openai_to_anthropic_response(openai_resp, model)
7478
+ anthropic_resp = openai_to_anthropic_response(
7479
+ openai_resp, model,
7480
+ expose_thinking=isinstance(body.get("thinking"), dict)
7481
+ and (body["thinking"].get("type") or "").lower() == "enabled",
7482
+ )
6141
7483
  # FINALIZE CONTINUATION: inject synthetic tool_use to keep client loop alive
6142
7484
  if (
6143
7485
  monitor.finalize_turn_active
@@ -6253,6 +7595,29 @@ async def messages(request: Request):
6253
7595
  error_body = await resp.aread()
6254
7596
  await resp.aclose()
6255
7597
  error_text = error_body.decode("utf-8", errors="replace")[:1000]
7598
+ # Gemma 4 PEG parse-failure recovery: relax tool_choice='required'
7599
+ # so the retry isn't blocked by the strict-grammar that rejected
7600
+ # the model's incomplete tool call.
7601
+ if _is_gemma4_peg_parse_failure(resp.status_code, error_text) and \
7602
+ _relax_tool_choice_for_gemma4_peg_retry(openai_body, "stream"):
7603
+ resp = await client.send(
7604
+ client.build_request(
7605
+ "POST",
7606
+ f"{LLAMA_CPP_BASE}/chat/completions",
7607
+ json=openai_body,
7608
+ headers={"Content-Type": "application/json"},
7609
+ ),
7610
+ stream=True,
7611
+ )
7612
+ if resp.status_code == 200:
7613
+ return StreamingResponse(
7614
+ stream_anthropic_response(resp, model, monitor, body),
7615
+ media_type="text/event-stream",
7616
+ )
7617
+ # fall through if still failing
7618
+ error_body = await resp.aread()
7619
+ await resp.aclose()
7620
+ error_text = error_body.decode("utf-8", errors="replace")[:1000]
6256
7621
  if _maybe_disable_grammar_for_tools_error(
6257
7622
  openai_body,
6258
7623
  resp.status_code,
@@ -6385,6 +7750,23 @@ async def messages(request: Request):
6385
7750
  media_type="application/json",
6386
7751
  )
6387
7752
 
7753
+ if resp.status_code != 200:
7754
+ error_text = resp.text[:1000]
7755
+ # Gemma 4 PEG parse-failure recovery (non-stream path).
7756
+ relaxed = (
7757
+ _is_gemma4_peg_parse_failure(resp.status_code, error_text)
7758
+ and _relax_tool_choice_for_gemma4_peg_retry(openai_body, "non-stream")
7759
+ )
7760
+ if relaxed:
7761
+ try:
7762
+ resp = await _post_with_generation_timeout(
7763
+ client,
7764
+ f"{LLAMA_CPP_BASE}/chat/completions",
7765
+ openai_body,
7766
+ {"Content-Type": "application/json"},
7767
+ )
7768
+ except Exception:
7769
+ pass # fall through
6388
7770
  if resp.status_code != 200:
6389
7771
  error_text = resp.text[:1000]
6390
7772
  if _maybe_disable_grammar_for_tools_error(
@@ -6437,7 +7819,7 @@ async def messages(request: Request):
6437
7819
 
6438
7820
  openai_resp = resp.json()
6439
7821
  # Recover tool calls from <tool_call> XML before guardrails run
6440
- _maybe_extract_text_tool_calls(openai_resp)
7822
+ _maybe_extract_text_tool_calls(openai_resp, anthropic_tools=body.get("tools"))
6441
7823
  openai_resp = await _apply_unexpected_end_turn_guardrail(
6442
7824
  client,
6443
7825
  openai_resp,
@@ -6506,7 +7888,11 @@ async def messages(request: Request):
6506
7888
  logger.info("DEGENERATE RETRY (stream): no tool call, using truncated")
6507
7889
  except Exception as exc:
6508
7890
  logger.warning("DEGENERATE RETRY (stream): failed: %s", exc)
6509
- anthropic_resp = openai_to_anthropic_response(openai_resp, model)
7891
+ anthropic_resp = openai_to_anthropic_response(
7892
+ openai_resp, model,
7893
+ expose_thinking=isinstance(body.get("thinking"), dict)
7894
+ and (body["thinking"].get("type") or "").lower() == "enabled",
7895
+ )
6510
7896
  # FINALIZE CONTINUATION: inject synthetic tool_use (non-guarded stream path)
6511
7897
  if (
6512
7898
  monitor.finalize_turn_active
@@ -6532,6 +7918,292 @@ async def messages_anthropic(request: Request):
6532
7918
  return await messages(request)
6533
7919
 
6534
7920
 
7921
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-compatible chat/completions endpoint for clients like Forge
    that require the OpenAI API shape.

    FULL GUARDRAIL PATH: Converts the OpenAI request to Anthropic format,
    runs the full /v1/messages pipeline (loop detection, tool narrowing,
    cycle breaking, malformed tool retry, context pruning, etc.), then
    converts the Anthropic response back to OpenAI format.

    Streaming is down-converted to a single final OpenAI SSE chunk sequence
    built from the completed Anthropic response (not token-by-token from
    upstream). This preserves guardrails at the cost of stream granularity.

    Returns:
        A JSON ``Response`` (non-stream or error) or a ``StreamingResponse``
        carrying ``text/event-stream`` when the client asked for streaming.
        A body that is not valid JSON, or is valid JSON but not an object,
        yields a 400 in the OpenAI error shape. A pipeline result without a
        ``content`` field yields the inner status code when it is >= 400,
        otherwise 502.
    """
    body_bytes = await request.body()
    try:
        openai_body = json.loads(body_bytes) if body_bytes else {}
    except (ValueError, TypeError):
        return Response(
            content=b'{"error":{"message":"invalid JSON","type":"invalid_request_error"}}',
            status_code=400,
            media_type="application/json",
        )
    if not isinstance(openai_body, dict):
        # Valid JSON such as [] or "x" would otherwise crash on .get() below
        # and surface as a 500 instead of a client error.
        return Response(
            content=(
                b'{"error":{"message":"request body must be a JSON object",'
                b'"type":"invalid_request_error"}}'
            ),
            status_code=400,
            media_type="application/json",
        )

    requested_stream = bool(openai_body.get("stream", False))
    model = openai_body.get("model", "default")
    client_id = resolve_client_id(request)

    logger.info(
        "CHAT (guarded): client=%s model=%s stream=%s msgs=%d tools=%d",
        client_id,
        model,
        requested_stream,
        # "or []" guards against an explicit null, which len() cannot take.
        len(openai_body.get("messages") or []),
        len(openai_body.get("tools") or []),
    )

    # Convert OpenAI request -> Anthropic request
    anthropic_body = openai_to_anthropic_request(openai_body)
    # Force non-streaming through the pipeline; we re-stream at the end if the
    # client wanted streaming. This keeps guardrail logic simpler/consistent.
    anthropic_body["stream"] = False

    # Build a synthetic Request that the existing messages() handler can consume
    fake_body_bytes = json.dumps(anthropic_body).encode("utf-8")

    async def receive():
        return {"type": "http.request", "body": fake_body_bytes, "more_body": False}

    fake_scope = dict(request.scope)
    # Preserve client/headers but override the body + path
    fake_scope["path"] = "/v1/messages"
    fake_scope["raw_path"] = b"/v1/messages"
    # Strip content-length since the body changes
    fake_scope["headers"] = [
        (k, v)
        for (k, v) in fake_scope.get("headers", [])
        if k.lower() != b"content-length"
    ]
    fake_request = Request(fake_scope, receive)

    # Run the full guarded Anthropic pipeline
    inner_resp = await messages(fake_request)

    # Extract the Anthropic-format JSON from whatever messages() returned
    anthropic_resp_dict: dict | None = None
    status_code = 200
    if isinstance(inner_resp, StreamingResponse):
        # Pipeline shouldn't stream because we set stream=False, but defensively
        # consume the stream and parse the final message event.
        chunks: list[bytes] = []
        async for chunk in inner_resp.body_iterator:
            if isinstance(chunk, bytes):
                chunks.append(chunk)
            elif isinstance(chunk, str):
                chunks.append(chunk.encode("utf-8"))
        raw = b"".join(chunks)
        # Try to parse as JSON directly first, then fall back to SSE parsing
        try:
            anthropic_resp_dict = json.loads(raw)
        except (ValueError, TypeError):
            anthropic_resp_dict = _parse_anthropic_sse_to_message(raw)
    elif isinstance(inner_resp, Response):
        status_code = inner_resp.status_code
        try:
            anthropic_resp_dict = json.loads(inner_resp.body)
        except (ValueError, TypeError):
            anthropic_resp_dict = None
    elif isinstance(inner_resp, dict):
        anthropic_resp_dict = inner_resp

    if not isinstance(anthropic_resp_dict, dict) or "content" not in anthropic_resp_dict:
        # Upstream error: forward as-is in OpenAI error shape
        err_msg = "upstream returned no message"
        if isinstance(anthropic_resp_dict, dict):
            upstream_err = anthropic_resp_dict.get("error")
            # The error payload may be an object or a bare string; handle
            # both instead of assuming .get() is available.
            if isinstance(upstream_err, dict):
                err_msg = upstream_err.get("message") or err_msg
            elif isinstance(upstream_err, str) and upstream_err:
                err_msg = upstream_err
        return Response(
            content=json.dumps({"error": {"message": err_msg, "type": "upstream_error"}}).encode(),
            status_code=status_code if status_code >= 400 else 502,
            media_type="application/json",
        )

    # Ensure model field is set for response
    anthropic_resp_dict.setdefault("model", model)
    openai_resp = anthropic_to_openai_response(anthropic_resp_dict)

    if not requested_stream:
        return Response(
            content=json.dumps(openai_resp).encode(),
            status_code=200,
            media_type="application/json",
        )

    # Client requested streaming: emit the response as OpenAI SSE chunks
    async def emit_openai_stream():
        choice = openai_resp["choices"][0]
        message = choice["message"]

        def sse_chunk(delta: dict, finish_reason=None) -> bytes:
            # Every chunk shares the same envelope; only delta/finish_reason vary.
            envelope = {
                "id": openai_resp["id"],
                "object": "chat.completion.chunk",
                "created": openai_resp["created"],
                "model": openai_resp["model"],
                "choices": [
                    {"index": 0, "delta": delta, "finish_reason": finish_reason}
                ],
            }
            return f"data: {json.dumps(envelope)}\n\n".encode()

        # Opening chunk: role
        yield sse_chunk({"role": "assistant"})

        # Content chunk
        if message.get("content"):
            yield sse_chunk({"content": message["content"]})

        # Tool call chunks (one chunk per call, arguments sent whole)
        for idx, tc in enumerate(message.get("tool_calls") or []):
            yield sse_chunk(
                {
                    "tool_calls": [
                        {
                            "index": idx,
                            "id": tc["id"],
                            "type": "function",
                            "function": {
                                "name": tc["function"]["name"],
                                "arguments": tc["function"]["arguments"],
                            },
                        }
                    ]
                }
            )

        # Final chunk with finish_reason
        yield sse_chunk({}, choice["finish_reason"])
        yield b"data: [DONE]\n\n"

    return StreamingResponse(
        emit_openai_stream(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
8116
+
8117
+
8118
+ def _parse_anthropic_sse_to_message(raw: bytes) -> dict | None:
8119
+ """Parse a concatenated Anthropic SSE stream into a final message dict.
8120
+ Used as a fallback when messages() returns a StreamingResponse despite stream=False.
8121
+ """
8122
+ try:
8123
+ text = raw.decode("utf-8", errors="replace")
8124
+ except Exception:
8125
+ return None
8126
+
8127
+ text_parts: list[str] = []
8128
+ tool_uses: list[dict] = []
8129
+ usage = {"input_tokens": 0, "output_tokens": 0}
8130
+ stop_reason = "end_turn"
8131
+ model = "unknown"
8132
+ message_id = f"msg_{uuid.uuid4().hex[:24]}"
8133
+
8134
+ current_block: dict | None = None
8135
+ current_json_buffer = ""
8136
+
8137
+ for line in text.splitlines():
8138
+ if not line.startswith("data:"):
8139
+ continue
8140
+ payload = line[5:].strip()
8141
+ if not payload or payload == "[DONE]":
8142
+ continue
8143
+ try:
8144
+ evt = json.loads(payload)
8145
+ except (ValueError, TypeError):
8146
+ continue
8147
+ etype = evt.get("type")
8148
+ if etype == "message_start":
8149
+ m = evt.get("message", {}) or {}
8150
+ message_id = m.get("id", message_id)
8151
+ model = m.get("model", model)
8152
+ if "usage" in m:
8153
+ usage.update(m["usage"])
8154
+ elif etype == "content_block_start":
8155
+ current_block = evt.get("content_block", {})
8156
+ current_json_buffer = ""
8157
+ if current_block.get("type") == "text":
8158
+ text_parts.append(current_block.get("text", ""))
8159
+ elif etype == "content_block_delta":
8160
+ d = evt.get("delta", {}) or {}
8161
+ if d.get("type") == "text_delta":
8162
+ text_parts.append(d.get("text", ""))
8163
+ elif d.get("type") == "input_json_delta":
8164
+ current_json_buffer += d.get("partial_json", "")
8165
+ elif etype == "content_block_stop":
8166
+ if current_block and current_block.get("type") == "tool_use":
8167
+ try:
8168
+ input_obj = json.loads(current_json_buffer) if current_json_buffer else {}
8169
+ except (ValueError, TypeError):
8170
+ input_obj = {}
8171
+ tool_uses.append(
8172
+ {
8173
+ "type": "tool_use",
8174
+ "id": current_block.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
8175
+ "name": current_block.get("name", ""),
8176
+ "input": input_obj,
8177
+ }
8178
+ )
8179
+ current_block = None
8180
+ current_json_buffer = ""
8181
+ elif etype == "message_delta":
8182
+ d = evt.get("delta", {}) or {}
8183
+ if "stop_reason" in d:
8184
+ stop_reason = d["stop_reason"] or stop_reason
8185
+ u = evt.get("usage", {}) or {}
8186
+ if u:
8187
+ usage.update(u)
8188
+
8189
+ content: list[dict] = []
8190
+ joined_text = "".join(text_parts)
8191
+ if joined_text:
8192
+ content.append({"type": "text", "text": joined_text})
8193
+ content.extend(tool_uses)
8194
+
8195
+ return {
8196
+ "id": message_id,
8197
+ "type": "message",
8198
+ "role": "assistant",
8199
+ "content": content if content else [{"type": "text", "text": ""}],
8200
+ "model": model,
8201
+ "stop_reason": stop_reason,
8202
+ "stop_sequence": None,
8203
+ "usage": usage,
8204
+ }
8205
+
8206
+
6535
8207
  @app.get("/v1/models")
6536
8208
  async def models():
6537
8209
  """Return available model list (spoofs Anthropic model IDs for client compatibility)."""