@miller-tech/uap 1.13.13 → 1.13.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -76,9 +76,11 @@ Dependencies
76
76
  """
77
77
 
78
78
  import asyncio
79
+ import hashlib
79
80
  import json
80
81
  import logging
81
82
  import os
83
+ import re
82
84
  import sys
83
85
  import time
84
86
  import uuid
@@ -100,7 +102,107 @@ PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
100
102
  PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
101
103
  PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
102
104
  PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
103
- PROXY_CONTEXT_PRUNE_THRESHOLD = float(os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75"))
105
# Spellings that switch a boolean environment flag off.
_DISABLED_FLAG_VALUES = frozenset({"0", "false", "off", "no"})


def _env_flag(name: str, default: str) -> bool:
    """Return the on/off state of the environment switch *name*.

    The raw value (or *default* when the variable is unset) is trimmed and
    lower-cased; the switch is off only when the result is one of ``0``,
    ``false``, ``off`` or ``no``.  Every other value, including an empty
    string, counts as on.

    Trimming matters: without it a value such as ``"off "`` (trailing space
    or newline from a shell/env file) would silently enable the switch.
    """
    raw = os.environ.get(name, default)
    return raw.strip().lower() not in _DISABLED_FLAG_VALUES


# --- Context pruning (fractions of the context window) ---------------------
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
    os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75")
)
PROXY_CONTEXT_PRUNE_TARGET_FRACTION = float(
    os.environ.get("PROXY_CONTEXT_PRUNE_TARGET_FRACTION", "0.65")
)

# --- Loop breaker (read by SessionMonitor.should_release_tool_choice) ------
PROXY_LOOP_BREAKER = _env_flag("PROXY_LOOP_BREAKER", "on")
PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "8"))
PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "4"))
PROXY_CONTEXT_RELEASE_THRESHOLD = float(
    os.environ.get("PROXY_CONTEXT_RELEASE_THRESHOLD", "0.90")
)
PROXY_GUARDRAIL_RETRY = _env_flag("PROXY_GUARDRAIL_RETRY", "on")

# Idle time, in seconds, after which a per-session monitor is discarded.
PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))

# --- Streaming ---------------------------------------------------------------
# Free-form mode string (trimmed, lower-cased); not a boolean switch.
PROXY_STREAM_REASONING_FALLBACK = (
    os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
)
PROXY_STREAM_REASONING_MAX_CHARS = int(
    os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
)

# Minimum max_tokens forwarded upstream; 0 disables the floor.
PROXY_MAX_TOKENS_FLOOR = int(os.environ.get("PROXY_MAX_TOKENS_FLOOR", "16384"))

# --- Tool narrowing ----------------------------------------------------------
PROXY_TOOL_NARROWING = _env_flag("PROXY_TOOL_NARROWING", "off")
PROXY_TOOL_NARROWING_KEEP = int(os.environ.get("PROXY_TOOL_NARROWING_KEEP", "8"))
PROXY_TOOL_NARROWING_MIN_TOOLS = int(
    os.environ.get("PROXY_TOOL_NARROWING_MIN_TOOLS", "12")
)
PROXY_DISABLE_THINKING_ON_TOOL_TURNS = _env_flag(
    "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", "off"
)

# --- Malformed tool-call guardrail -------------------------------------------
PROXY_MALFORMED_TOOL_GUARDRAIL = _env_flag("PROXY_MALFORMED_TOOL_GUARDRAIL", "on")
PROXY_MALFORMED_TOOL_RETRY_MAX = int(
    os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX", "1")
)
PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS = int(
    os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS", "2048")
)
PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE = float(
    os.environ.get("PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE", "0")
)
PROXY_MALFORMED_TOOL_STREAM_STRICT = _env_flag(
    "PROXY_MALFORMED_TOOL_STREAM_STRICT", "off"
)
PROXY_FORCE_NON_STREAM = _env_flag("PROXY_FORCE_NON_STREAM", "off")

# --- Session contamination breaker -------------------------------------------
PROXY_SESSION_CONTAMINATION_BREAKER = _env_flag(
    "PROXY_SESSION_CONTAMINATION_BREAKER", "on"
)
PROXY_SESSION_CONTAMINATION_THRESHOLD = int(
    os.environ.get("PROXY_SESSION_CONTAMINATION_THRESHOLD", "3")
)
PROXY_SESSION_CONTAMINATION_KEEP_LAST = int(
    os.environ.get("PROXY_SESSION_CONTAMINATION_KEEP_LAST", "8")
)

# Selects the system-prompt supplement: "legacy" or "clean" (trimmed, lower-cased).
PROXY_AGENTIC_SUPPLEMENT_MODE = (
    os.environ.get("PROXY_AGENTIC_SUPPLEMENT_MODE", "clean").strip().lower()
)
104
206
 
105
207
  # ---------------------------------------------------------------------------
106
208
  # Logging
@@ -121,19 +223,28 @@ class SessionMonitor:
121
223
  """Tracks token usage across the session to provide early warnings
122
224
  and enable proactive context management before overflow occurs."""
123
225
 
124
- context_window: int = 0 # Auto-detected or configured
226
+ context_window: int = 0 # Auto-detected or configured
125
227
  total_requests: int = 0
126
- last_input_tokens: int = 0 # Estimated input tokens of last request
127
- last_output_tokens: int = 0 # Actual output tokens of last response
128
- peak_input_tokens: int = 0 # High-water mark
129
- prune_count: int = 0 # How many times pruning was triggered
130
- overflow_count: int = 0 # How many context overflow errors caught
228
+ last_input_tokens: int = 0 # Estimated input tokens of last request
229
+ last_output_tokens: int = 0 # Actual output tokens of last response
230
+ peak_input_tokens: int = 0 # High-water mark
231
+ prune_count: int = 0 # How many times pruning was triggered
232
+ overflow_count: int = 0 # How many context overflow errors caught
131
233
  context_history: list = field(default_factory=list) # Recent token counts
132
234
 
133
235
  # --- Token Loop Protection ---
134
- tool_call_history: list = field(default_factory=list) # Recent tool call fingerprints
135
- consecutive_forced_count: int = 0 # How many times tool_choice was forced consecutively
136
- loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
236
+ tool_call_history: list = field(
237
+ default_factory=list
238
+ ) # Recent tool call fingerprints
239
+ consecutive_forced_count: int = (
240
+ 0 # How many times tool_choice was forced consecutively
241
+ )
242
+ loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
243
+ no_progress_streak: int = 0 # Forced tool turns without new tool_result
244
+ unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
245
+ malformed_tool_streak: int = 0 # consecutive malformed pseudo tool payloads
246
+ contamination_resets: int = 0 # how many contamination resets were applied
247
+ last_seen_ts: float = 0.0
137
248
 
138
249
  def record_request(self, estimated_tokens: int):
139
250
  """Record an outgoing request's estimated token count."""
@@ -150,6 +261,9 @@ class SessionMonitor:
150
261
  """Record a response's output token count."""
151
262
  self.last_output_tokens = output_tokens
152
263
 
264
def touch(self):
    """Stamp this monitor with the current wall-clock time (``time.time()``)."""
    now = time.time()
    self.last_seen_ts = now
266
+
153
267
  def get_utilization(self) -> float:
154
268
  """Get current context utilization as a fraction (0.0 - 1.0)."""
155
269
  if self.context_window <= 0:
@@ -196,25 +310,36 @@ class SessionMonitor:
196
310
  if warning == "CRITICAL":
197
311
  logger.error(
198
312
  "CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
199
- self.last_input_tokens, self.context_window, util * 100,
200
- turns_str, self.prune_count, self.overflow_count,
313
+ self.last_input_tokens,
314
+ self.context_window,
315
+ util * 100,
316
+ turns_str,
317
+ self.prune_count,
318
+ self.overflow_count,
201
319
  )
202
320
  elif warning == "HIGH":
203
321
  logger.warning(
204
322
  "CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
205
- self.last_input_tokens, self.context_window, util * 100,
206
- turns_str, self.prune_count,
323
+ self.last_input_tokens,
324
+ self.context_window,
325
+ util * 100,
326
+ turns_str,
327
+ self.prune_count,
207
328
  )
208
329
  elif warning == "ELEVATED":
209
330
  logger.warning(
210
331
  "CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
211
- self.last_input_tokens, self.context_window, util * 100,
332
+ self.last_input_tokens,
333
+ self.context_window,
334
+ util * 100,
212
335
  turns_str,
213
336
  )
214
337
  else:
215
338
  logger.info(
216
339
  "CONTEXT: %d/%d tokens (%.1f%%), %s",
217
- self.last_input_tokens, self.context_window, util * 100,
340
+ self.last_input_tokens,
341
+ self.context_window,
342
+ util * 100,
218
343
  turns_str,
219
344
  )
220
345
 
@@ -264,30 +389,42 @@ class SessionMonitor:
264
389
  - 15+ consecutive forced requests regardless -> release
265
390
  - Context utilization > 90% -> release (let model wrap up)
266
391
  """
267
- is_looping, repeat_count = self.detect_tool_loop(window=6)
392
+ if not PROXY_LOOP_BREAKER:
393
+ return False
394
+
395
+ is_looping, repeat_count = self.detect_tool_loop(window=PROXY_LOOP_WINDOW)
268
396
 
269
397
  # Pattern 1: Detected tool call loop
270
- if is_looping and repeat_count >= 8:
398
+ if (
399
+ is_looping
400
+ and repeat_count >= PROXY_LOOP_REPEAT_THRESHOLD
401
+ and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
402
+ ):
271
403
  logger.warning(
272
- "LOOP BREAKER: Same tool pattern repeated %d times. "
404
+ "LOOP BREAKER: Same tool pattern repeated %d times with no progress streak=%d. "
273
405
  "Releasing tool_choice to 'auto'.",
274
406
  repeat_count,
407
+ self.no_progress_streak,
275
408
  )
276
409
  self.loop_warnings_emitted += 1
277
410
  return True
278
411
 
279
412
  # Pattern 2: Too many consecutive forced requests
280
- if self.consecutive_forced_count >= 15:
413
+ if (
414
+ self.consecutive_forced_count >= PROXY_FORCED_THRESHOLD
415
+ and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
416
+ ):
281
417
  logger.warning(
282
- "LOOP BREAKER: %d consecutive forced tool_choice requests. "
418
+ "LOOP BREAKER: %d consecutive forced tool_choice requests with no progress streak=%d. "
283
419
  "Releasing to 'auto'.",
284
420
  self.consecutive_forced_count,
421
+ self.no_progress_streak,
285
422
  )
286
423
  self.loop_warnings_emitted += 1
287
424
  return True
288
425
 
289
426
  # Pattern 3: Context almost full -- let model wrap up naturally
290
- if self.get_utilization() >= 0.90:
427
+ if self.get_utilization() >= PROXY_CONTEXT_RELEASE_THRESHOLD:
291
428
  logger.warning(
292
429
  "LOOP BREAKER: Context utilization %.1f%% -- releasing "
293
430
  "tool_choice to let model wrap up.",
@@ -298,7 +435,35 @@ class SessionMonitor:
298
435
  return False
299
436
 
300
437
 
301
- session_monitor = SessionMonitor()
438
+ session_monitors: dict[str, SessionMonitor] = {}
439
+ default_context_window = 0
440
+ last_session_id = ""
441
+
442
+
443
def _cleanup_stale_monitors(now_ts: float) -> None:
    """Drop every session monitor that has been idle longer than the TTL.

    Args:
        now_ts: Reference timestamp, compared against each monitor's
            ``last_seen_ts``.

    A monitor whose ``last_seen_ts`` is not positive (never touched) is kept.
    """
    expired = []
    for session_key, monitor in session_monitors.items():
        if monitor.last_seen_ts <= 0:
            continue
        idle_secs = now_ts - monitor.last_seen_ts
        if idle_secs > PROXY_SESSION_TTL_SECS:
            expired.append(session_key)

    # Remove only after the scan so the dict is not mutated while iterating.
    for session_key in expired:
        session_monitors.pop(session_key, None)
451
+
452
+
453
def get_session_monitor(session_id: str) -> SessionMonitor:
    """Return the monitor tracking *session_id*, creating it on first use.

    Each call first evicts stale monitors, then refreshes the returned
    monitor's last-seen timestamp.  A monitor that still has no context
    window (``<= 0``) is given the current ``default_context_window``.
    """
    _cleanup_stale_monitors(time.time())

    try:
        tracked = session_monitors[session_id]
    except KeyError:
        tracked = SessionMonitor(context_window=default_context_window)
        session_monitors[session_id] = tracked

    tracked.touch()
    # The default may have been detected after this monitor was created.
    if tracked.context_window <= 0:
        tracked.context_window = default_context_window
    return tracked
302
467
 
303
468
 
304
469
  # ---------------------------------------------------------------------------
@@ -324,7 +489,8 @@ async def detect_context_window(client: httpx.AsyncClient) -> int:
324
489
  if n_ctx > 0:
325
490
  logger.info(
326
491
  "Auto-detected context window from upstream: %d tokens (%d slots)",
327
- n_ctx, len(slots),
492
+ n_ctx,
493
+ len(slots),
328
494
  )
329
495
  return n_ctx
330
496
  except Exception as exc:
@@ -398,7 +564,9 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
398
564
  return tokens
399
565
 
400
566
 
401
- def prune_conversation(anthropic_body: dict, context_window: int, target_fraction: float = 0.65) -> dict:
567
+ def prune_conversation(
568
+ anthropic_body: dict, context_window: int, target_fraction: float = 0.65
569
+ ) -> dict:
402
570
  """Prune the conversation to fit within the context window.
403
571
 
404
572
  Strategy:
@@ -445,19 +613,24 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
445
613
 
446
614
  # Always keep the first user message and the last N messages
447
615
  KEEP_LAST = 8 # Keep the last 8 messages (recent context)
448
- protected_head = messages[:1] # First user message
449
- protected_tail = messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
616
+ protected_head = messages[:1] # First user message
617
+ protected_tail = (
618
+ messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
619
+ )
450
620
  middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []
451
621
 
452
622
  # Calculate tokens for protected messages
453
- protected_tokens = sum(estimate_message_tokens(m) for m in protected_head + protected_tail)
623
+ protected_tokens = sum(
624
+ estimate_message_tokens(m) for m in protected_head + protected_tail
625
+ )
454
626
 
455
627
  if protected_tokens >= message_budget:
456
628
  # Even protected messages exceed budget -- truncate tool_result content
457
629
  # in the tail to fit
458
630
  logger.warning(
459
631
  "Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
460
- protected_tokens, message_budget,
632
+ protected_tokens,
633
+ message_budget,
461
634
  )
462
635
  for msg in protected_tail:
463
636
  content = msg.get("content", [])
@@ -466,7 +639,11 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
466
639
  if isinstance(block, dict) and block.get("type") == "tool_result":
467
640
  result_text = _extract_text(block.get("content", ""))
468
641
  if len(result_text) > 2000:
469
- block["content"] = result_text[:1000] + "\n...[TRUNCATED]...\n" + result_text[-500:]
642
+ block["content"] = (
643
+ result_text[:1000]
644
+ + "\n...[TRUNCATED]...\n"
645
+ + result_text[-500:]
646
+ )
470
647
  anthropic_body["messages"] = protected_head + protected_tail
471
648
  return anthropic_body
472
649
 
@@ -486,8 +663,7 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
486
663
 
487
664
  if isinstance(content, list):
488
665
  is_tool_result = any(
489
- isinstance(b, dict) and b.get("type") == "tool_result"
490
- for b in content
666
+ isinstance(b, dict) and b.get("type") == "tool_result" for b in content
491
667
  )
492
668
 
493
669
  # Lower priority = removed first
@@ -529,12 +705,17 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
529
705
  f"The conversation continues from recent context below.]"
530
706
  ),
531
707
  }
532
- anthropic_body["messages"] = protected_head + [prune_marker] + kept_msgs + protected_tail
708
+ anthropic_body["messages"] = (
709
+ protected_head + [prune_marker] + kept_msgs + protected_tail
710
+ )
533
711
  logger.warning(
534
712
  "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
535
713
  "target=%.0f%% of %d ctx",
536
- removed_count, removed_tokens, len(anthropic_body["messages"]),
537
- target_fraction * 100, context_window,
714
+ removed_count,
715
+ removed_tokens,
716
+ len(anthropic_body["messages"]),
717
+ target_fraction * 100,
718
+ context_window,
538
719
  )
539
720
  else:
540
721
  anthropic_body["messages"] = protected_head + kept_msgs + protected_tail
@@ -554,12 +735,13 @@ http_client: httpx.AsyncClient | None = None
554
735
  async def lifespan(app: FastAPI):
555
736
  """Manage the httpx client lifecycle with the FastAPI app."""
556
737
  global http_client
738
+ global default_context_window
557
739
  http_client = httpx.AsyncClient(
558
740
  timeout=httpx.Timeout(
559
- connect=10.0, # 10s to establish connection
560
- read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
561
- write=30.0, # 30s to send the request body
562
- pool=10.0, # 10s to acquire a pool connection
741
+ connect=10.0, # 10s to establish connection
742
+ read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
743
+ write=30.0, # 30s to send the request body
744
+ pool=10.0, # 10s to acquire a pool connection
563
745
  ),
564
746
  limits=httpx.Limits(
565
747
  max_connections=PROXY_MAX_CONNECTIONS,
@@ -569,15 +751,31 @@ async def lifespan(app: FastAPI):
569
751
  )
570
752
  logger.info(
571
753
  "Proxy started: listening on %s:%d -> upstream %s",
572
- PROXY_HOST, PROXY_PORT, LLAMA_CPP_BASE,
754
+ PROXY_HOST,
755
+ PROXY_PORT,
756
+ LLAMA_CPP_BASE,
573
757
  )
574
758
 
575
759
  # Auto-detect context window from upstream server
576
- session_monitor.context_window = await detect_context_window(http_client)
760
+ default_context_window = await detect_context_window(http_client)
761
+ for mon in session_monitors.values():
762
+ if mon.context_window <= 0:
763
+ mon.context_window = default_context_window
577
764
  logger.info(
578
- "Context window: %d tokens, prune threshold: %.0f%%",
579
- session_monitor.context_window,
765
+ "Context window: %d tokens, prune threshold: %.0f%%, prune target: %.0f%%",
766
+ default_context_window,
580
767
  PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
768
+ _resolve_prune_target_fraction() * 100,
769
+ )
770
+ logger.info(
771
+ "Guardrails: malformed=%s stream_strict=%s force_non_stream=%s tool_narrowing=%s thinking_off_on_tools=%s contamination_breaker=%s(%d)",
772
+ PROXY_MALFORMED_TOOL_GUARDRAIL,
773
+ PROXY_MALFORMED_TOOL_STREAM_STRICT,
774
+ PROXY_FORCE_NON_STREAM,
775
+ PROXY_TOOL_NARROWING,
776
+ PROXY_DISABLE_THINKING_ON_TOOL_TURNS,
777
+ PROXY_SESSION_CONTAMINATION_BREAKER,
778
+ PROXY_SESSION_CONTAMINATION_THRESHOLD,
581
779
  )
582
780
 
583
781
  yield
@@ -598,6 +796,7 @@ app = FastAPI(
598
796
  # Request Translation: Anthropic -> OpenAI
599
797
  # ===========================================================================
600
798
 
799
+
601
800
  def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
602
801
  """Convert Anthropic message format to OpenAI message format.
603
802
 
@@ -635,25 +834,33 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
635
834
  elif block.get("type") == "text":
636
835
  parts.append(block.get("text", ""))
637
836
  elif block.get("type") == "tool_use":
638
- messages.append({
639
- "role": "assistant",
640
- "content": None,
641
- "tool_calls": [{
642
- "id": block.get("id", f"call_{uuid.uuid4().hex[:8]}"),
643
- "type": "function",
644
- "function": {
645
- "name": block["name"],
646
- "arguments": json.dumps(block.get("input", {})),
647
- },
648
- }],
649
- })
837
+ messages.append(
838
+ {
839
+ "role": "assistant",
840
+ "content": None,
841
+ "tool_calls": [
842
+ {
843
+ "id": block.get(
844
+ "id", f"call_{uuid.uuid4().hex[:8]}"
845
+ ),
846
+ "type": "function",
847
+ "function": {
848
+ "name": block["name"],
849
+ "arguments": json.dumps(block.get("input", {})),
850
+ },
851
+ }
852
+ ],
853
+ }
854
+ )
650
855
  continue
651
856
  elif block.get("type") == "tool_result":
652
- messages.append({
653
- "role": "tool",
654
- "tool_call_id": block.get("tool_use_id", ""),
655
- "content": _extract_text(block.get("content", "")),
656
- })
857
+ messages.append(
858
+ {
859
+ "role": "tool",
860
+ "tool_call_id": block.get("tool_use_id", ""),
861
+ "content": _extract_text(block.get("content", "")),
862
+ }
863
+ )
657
864
  continue
658
865
  if parts:
659
866
  messages.append({"role": role, "content": "\n".join(parts)})
@@ -672,7 +879,7 @@ def _extract_text(content) -> str:
672
879
  return str(content)
673
880
 
674
881
 
675
- _AGENTIC_SYSTEM_SUPPLEMENT = (
882
+ _AGENTIC_SYSTEM_SUPPLEMENT_LEGACY = (
676
883
  "\n\n<agentic-protocol>\n"
677
884
  "You are operating in an agentic coding loop with tool access. Follow these rules:\n"
678
885
  "1. ALWAYS use tools to read, edit, write, and test code. Never just describe or explain what should be done.\n"
@@ -685,8 +892,183 @@ _AGENTIC_SYSTEM_SUPPLEMENT = (
685
892
  "</agentic-protocol>"
686
893
  )
687
894
 
895
# "Clean" supplement: the agentic rules plus explicit bans on leaking
# tool-call markup into assistant text.
_AGENTIC_SYSTEM_SUPPLEMENT_CLEAN = "\n\n<agentic-protocol>\n" + "\n".join(
    [
        "You are operating in an agentic coding loop with tool access. Follow these rules:",
        "1. Use tools for concrete work (read, edit, write, test) instead of stopping at analysis.",
        "2. When a fix is identified, take the next tool action immediately.",
        "3. Return final text only when the task is complete and verified.",
        "4. Never output protocol fragments or raw tool schema in assistant text.",
        "5. Never emit literal tag artifacts such as </parameter>, <tool_call>, or <function=...>.",
        "6. When a tool is needed, emit a valid tool call object instead of prose about tool-call formatting.",
        "7. If a tool call fails, adapt and try another approach.",
        "</agentic-protocol>",
    ]
)

# Pick the active supplement; anything other than "legacy" resolves to the
# clean text, with a warning when the mode is not a recognised value.
if PROXY_AGENTIC_SUPPLEMENT_MODE not in ("legacy", "clean"):
    logger.warning(
        "Unknown PROXY_AGENTIC_SUPPLEMENT_MODE=%r; using clean supplement",
        PROXY_AGENTIC_SUPPLEMENT_MODE,
    )
_AGENTIC_SYSTEM_SUPPLEMENT = (
    _AGENTIC_SYSTEM_SUPPLEMENT_LEGACY
    if PROXY_AGENTIC_SUPPLEMENT_MODE == "legacy"
    else _AGENTIC_SYSTEM_SUPPLEMENT_CLEAN
)
918
+
688
919
 
689
- def build_openai_request(anthropic_body: dict) -> dict:
920
def _content_fingerprint(content) -> str:
    """Return a short, length-bounded string summarising *content*.

    Args:
        content: A plain string, a list of content blocks (strings or dicts
            with a ``type`` key), or any other value.

    Returns:
        * ``str`` input: its first 512 characters.
        * ``list`` input: one part per recognised block (text blocks
          contribute their text, ``tool_use`` blocks ``tool:<name>``,
          ``tool_result`` blocks ``result:<tool_use_id>``), newline-joined
          and cut to 1024 characters.  Unrecognised blocks are ignored.
        * anything else: ``str(content)`` cut to 512 characters.
    """
    if isinstance(content, str):
        return content[:512]
    if not isinstance(content, list):
        return str(content)[:512]

    parts: list[str] = []
    for block in content:
        if isinstance(block, str):
            parts.append(block)
            continue
        if not isinstance(block, dict):
            continue
        kind = block.get("type", "")
        if kind == "text":
            text = block.get("text", "")
            # A JSON null or non-string "text" would make str.join raise
            # TypeError; coerce it so malformed input cannot crash the caller.
            parts.append("" if text is None else str(text))
        elif kind == "tool_use":
            parts.append(f"tool:{block.get('name', '')}")
        elif kind == "tool_result":
            parts.append(f"result:{block.get('tool_use_id', '')}")
    return "\n".join(parts)[:1024]
938
+
939
+
940
def resolve_session_id(request: Request, anthropic_body: dict) -> str:
    """Derive a session key for this request.

    Sources are tried in order and the first non-empty value wins:

    1. A session header (``x-uap-session-id``, ``x-claude-session-id``,
       ``anthropic-session-id``, ``x-session-id``) -> ``hdr:<value>``.
    2. A ``metadata`` entry in the body (``session_id``,
       ``conversation_id``, ``thread_id``) -> ``meta:<value>``.
    3. Otherwise a fingerprint: the first 20 hex characters of the SHA-256
       of client host, model, system prompt and first user message
       -> ``fp:<digest>``.
    """
    for header_name in (
        "x-uap-session-id",
        "x-claude-session-id",
        "anthropic-session-id",
        "x-session-id",
    ):
        if header_value := request.headers.get(header_name):
            return f"hdr:{header_value}"

    metadata = anthropic_body.get("metadata", {})
    if isinstance(metadata, dict):
        for field_name in ("session_id", "conversation_id", "thread_id"):
            if meta_value := metadata.get(field_name):
                return f"meta:{meta_value}"

    # No explicit identifier: fall back to a content-derived fingerprint.
    opening_user = next(
        (m for m in anthropic_body.get("messages", []) if m.get("role") == "user"),
        None,
    )
    first_user = (
        ""
        if opening_user is None
        else _content_fingerprint(opening_user.get("content", ""))
    )
    system_fingerprint = _content_fingerprint(anthropic_body.get("system", ""))
    model = anthropic_body.get("model", "default")
    remote = "unknown" if not request.client else request.client.host

    material = f"{remote}|{model}|{system_fingerprint}|{first_user}"
    hasher = hashlib.sha256(material.encode("utf-8", errors="ignore"))
    return f"fp:{hasher.hexdigest()[:20]}"
974
+
975
+
976
def _last_user_has_tool_result(anthropic_body: dict) -> bool:
    """Tell whether the most recent user message carries a ``tool_result``.

    Only the last message with ``role == "user"`` is inspected.  The result
    is ``True`` when its content is a list containing at least one dict
    block of type ``tool_result``; it is ``False`` when that content is not
    a list, when no user message exists, or when ``messages`` is missing,
    null or not a list.  Entries that are not dicts are skipped rather than
    raising, since the body comes straight from the client.
    """
    messages = anthropic_body.get("messages")
    if not isinstance(messages, list):
        return False

    for msg in reversed(messages):
        if not isinstance(msg, dict) or msg.get("role") != "user":
            continue
        content = msg.get("content")
        if not isinstance(content, list):
            return False
        return any(
            isinstance(b, dict) and b.get("type") == "tool_result" for b in content
        )
    return False
988
+
989
+
990
def _convert_anthropic_tools_to_openai(anthropic_tools: list[dict]) -> list[dict]:
    """Translate Anthropic tool definitions into OpenAI function tools.

    Each input tool becomes ``{"type": "function", "function": {...}}`` with
    ``name`` and ``description`` copied across and ``input_schema`` exposed
    as ``parameters``.  Order is preserved.  A missing name becomes ``""``;
    a missing or null description becomes ``""`` and a missing or null
    schema becomes ``{}``, so ``None`` is never forwarded for those fields.
    """
    return [
        {
            "type": "function",
            "function": {
                "name": tool.get("name", ""),
                "description": tool.get("description") or "",
                "parameters": tool.get("input_schema") or {},
            },
        }
        for tool in anthropic_tools
    ]
1004
+
1005
+
1006
def _latest_user_text(anthropic_body: dict) -> str:
    """Return the text of the most recent user message, or ``""`` if none.

    Messages are scanned from the end; the content of the first one whose
    role is ``user`` is flattened with ``_extract_text``.
    """
    newest_first = reversed(anthropic_body.get("messages", []))
    latest = next((m for m in newest_first if m.get("role") == "user"), None)
    if latest is None:
        return ""
    return _extract_text(latest.get("content", ""))
1012
+
1013
+
1014
# Runs of two or more ASCII letters, digits or underscores.  Compiled once
# because the tokenizer is called for every tool on every narrowed request.
_TOOL_RANKING_TOKEN_RE = re.compile(r"[a-zA-Z0-9_]{2,}")


def _tokenize_for_tool_ranking(text: str) -> set[str]:
    """Split *text* into the set of lower-cased word tokens used for ranking.

    A token is a run of at least two ASCII letters, digits or underscores;
    single characters and all other symbols act as separators.
    """
    return {token.lower() for token in _TOOL_RANKING_TOKEN_RE.findall(text)}
1016
+
1017
+
1018
def _narrow_tools_for_request(
    anthropic_body: dict, openai_tools: list[dict]
) -> list[dict]:
    """Reduce the tool list to the entries most relevant to the latest user text.

    The input list is returned untouched when narrowing is disabled, when
    there are fewer tools than ``PROXY_TOOL_NARROWING_MIN_TOOLS``, or when
    ``PROXY_TOOL_NARROWING_KEEP`` already covers every tool.  If the latest
    user message yields no tokens, the first ``keep`` tools are returned.

    Otherwise each tool is scored against the latest user message:

    * +3 for every token shared between the message and the tool's
      name/description,
    * +4 when the tool name appears verbatim in the message,
    * +1 when any message token is a substring of the tool name.

    The ``keep`` best-scoring tools are returned in their original order;
    ties favour the tool that appeared earlier.

    Selection is done by list position rather than ``id(tool)``, so a dict
    object that appears more than once in *openai_tools* cannot cause more
    than ``keep`` entries to be retained.
    """
    if not PROXY_TOOL_NARROWING:
        return openai_tools

    total = len(openai_tools)
    keep = max(1, PROXY_TOOL_NARROWING_KEEP)
    if total < max(1, PROXY_TOOL_NARROWING_MIN_TOOLS) or keep >= total:
        return openai_tools

    query_text = _latest_user_text(anthropic_body).lower()
    query_tokens = _tokenize_for_tool_ranking(query_text)
    if not query_tokens:
        narrowed = openai_tools[:keep]
        logger.info(
            "TOOL NARROWING: %d -> %d tools (no query tokens)",
            total,
            len(narrowed),
        )
        return narrowed

    def relevance(tool: dict) -> int:
        """Score one tool against the query (higher is more relevant)."""
        fn = tool.get("function", {})
        name = fn.get("name", "")
        desc = fn.get("description", "")
        name_lc = name.lower() if name else ""
        tool_tokens = _tokenize_for_tool_ranking(f"{name} {desc}".lower())
        score = 3 * len(query_tokens & tool_tokens)
        if name and name_lc in query_text:
            score += 4
        if name and any(tok in name_lc for tok in query_tokens):
            score += 1
        return score

    scores = [relevance(tool) for tool in openai_tools]
    # Best score first; equal scores keep the earlier tool ahead.
    ranked = sorted(range(total), key=lambda i: (-scores[i], i))
    chosen = set(ranked[:keep])
    narrowed = [tool for i, tool in enumerate(openai_tools) if i in chosen]

    top_names = [t.get("function", {}).get("name", "") for t in narrowed[:4]]
    logger.info(
        "TOOL NARROWING: %d -> %d tools (top=%s)",
        total,
        len(narrowed),
        top_names,
    )
    return narrowed
1069
+
1070
+
1071
+ def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
690
1072
  """Build an OpenAI Chat Completions request from an Anthropic Messages request."""
691
1073
  openai_body = {
692
1074
  "model": anthropic_body.get("model", "default"),
@@ -700,23 +1082,26 @@ def build_openai_request(anthropic_body: dict) -> dict:
700
1082
  openai_body["messages"][0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
701
1083
  else:
702
1084
  # No system message from the client; inject one.
703
- openai_body["messages"].insert(0, {
704
- "role": "system",
705
- "content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
706
- })
1085
+ openai_body["messages"].insert(
1086
+ 0,
1087
+ {
1088
+ "role": "system",
1089
+ "content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
1090
+ },
1091
+ )
707
1092
 
708
1093
  if "max_tokens" in anthropic_body:
709
- # Enforce minimum floor for thinking mode: model needs tokens for
710
- # reasoning (<think>...</think>) plus the actual response/tool calls.
711
- # Claude Code typically sends 4096-8192 which is too low for thinking.
712
- requested_max = max(anthropic_body["max_tokens"], 16384)
1094
+ # Enforce configurable minimum floor for thinking mode: model needs
1095
+ # tokens for reasoning (<think>...</think>) plus actual response/tool
1096
+ # calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
1097
+ requested_max = _resolve_max_tokens_request(anthropic_body["max_tokens"])
713
1098
 
714
1099
  # Option E: Smart max_tokens capping — prevent the response from
715
1100
  # consuming so many tokens that the NEXT turn's input won't fit.
716
1101
  # Formula: max_tokens = min(requested, context_window - input_tokens - safety_margin)
717
1102
  # This ensures the model's output + current input stays within bounds,
718
1103
  # leaving room for the next turn's incremental growth.
719
- ctx_window = session_monitor.context_window
1104
+ ctx_window = monitor.context_window
720
1105
  if ctx_window > 0:
721
1106
  estimated_input = estimate_total_tokens(anthropic_body)
722
1107
  # Reserve 15% of context for next-turn growth (tool results, etc.)
@@ -725,8 +1110,11 @@ def build_openai_request(anthropic_body: dict) -> dict:
725
1110
  if available_for_output < requested_max and available_for_output > 1024:
726
1111
  logger.info(
727
1112
  "MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
728
- requested_max, available_for_output,
729
- ctx_window, estimated_input, safety_margin,
1113
+ requested_max,
1114
+ available_for_output,
1115
+ ctx_window,
1116
+ estimated_input,
1117
+ safety_margin,
730
1118
  )
731
1119
  requested_max = available_for_output
732
1120
  elif available_for_output <= 1024:
@@ -734,7 +1122,9 @@ def build_openai_request(anthropic_body: dict) -> dict:
734
1122
  logger.warning(
735
1123
  "MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
736
1124
  "Response may be truncated.",
737
- available_for_output, ctx_window, estimated_input,
1125
+ available_for_output,
1126
+ ctx_window,
1127
+ estimated_input,
738
1128
  )
739
1129
  requested_max = max(1024, available_for_output)
740
1130
 
@@ -748,16 +1138,12 @@ def build_openai_request(anthropic_body: dict) -> dict:
748
1138
 
749
1139
  # Convert Anthropic tools to OpenAI function-calling tools
750
1140
  if "tools" in anthropic_body:
751
- openai_body["tools"] = []
752
- for tool in anthropic_body["tools"]:
753
- openai_body["tools"].append({
754
- "type": "function",
755
- "function": {
756
- "name": tool["name"],
757
- "description": tool.get("description", ""),
758
- "parameters": tool.get("input_schema", {}),
759
- },
760
- })
1141
+ openai_body["tools"] = _convert_anthropic_tools_to_openai(
1142
+ anthropic_body.get("tools", [])
1143
+ )
1144
+ openai_body["tools"] = _narrow_tools_for_request(
1145
+ anthropic_body, openai_body["tools"]
1146
+ )
761
1147
 
762
1148
  # Smart tool_choice: force tool calls during the agentic loop to
763
1149
  # prevent the model from producing text-only end_turn responses that
@@ -774,7 +1160,8 @@ def build_openai_request(anthropic_body: dict) -> dict:
774
1160
  # runaway token consumption.
775
1161
  n_msgs = len(anthropic_body.get("messages", []))
776
1162
  has_tool_results = any(
777
- isinstance(m.get("content"), list) and any(
1163
+ isinstance(m.get("content"), list)
1164
+ and any(
778
1165
  isinstance(b, dict) and b.get("type") == "tool_result"
779
1166
  for b in m.get("content", [])
780
1167
  )
@@ -782,28 +1169,47 @@ def build_openai_request(anthropic_body: dict) -> dict:
782
1169
  )
783
1170
 
784
1171
  # Record tool calls from the last assistant message for loop detection
785
- _record_last_assistant_tool_calls(anthropic_body)
1172
+ _record_last_assistant_tool_calls(anthropic_body, monitor)
1173
+ last_user_has_tool_result = _last_user_has_tool_result(anthropic_body)
786
1174
 
787
1175
  # Check if loop breaker should override tool_choice
788
- if session_monitor.should_release_tool_choice():
1176
+ if monitor.should_release_tool_choice():
789
1177
  openai_body["tool_choice"] = "auto"
790
- session_monitor.consecutive_forced_count = 0
1178
+ monitor.consecutive_forced_count = 0
1179
+ monitor.no_progress_streak = 0
791
1180
  logger.warning("tool_choice set to 'auto' by LOOP BREAKER")
792
1181
  elif _last_assistant_was_text_only(anthropic_body):
793
1182
  openai_body["tool_choice"] = "required"
794
- session_monitor.consecutive_forced_count += 1
795
- logger.info("tool_choice forced to 'required' (last assistant was text-only)")
1183
+ monitor.consecutive_forced_count += 1
1184
+ monitor.no_progress_streak = (
1185
+ 0 if last_user_has_tool_result else monitor.no_progress_streak + 1
1186
+ )
1187
+ logger.info(
1188
+ "tool_choice forced to 'required' (last assistant was text-only)"
1189
+ )
796
1190
  elif has_tool_results and n_msgs > 2:
797
1191
  openai_body["tool_choice"] = "required"
798
- session_monitor.consecutive_forced_count += 1
799
- logger.info("tool_choice forced to 'required' (active agentic loop with tool results)")
1192
+ monitor.consecutive_forced_count += 1
1193
+ monitor.no_progress_streak = (
1194
+ 0 if last_user_has_tool_result else monitor.no_progress_streak + 1
1195
+ )
1196
+ logger.info(
1197
+ "tool_choice forced to 'required' (active agentic loop with tool results)"
1198
+ )
800
1199
  else:
801
- session_monitor.consecutive_forced_count = 0
1200
+ monitor.consecutive_forced_count = 0
1201
+ monitor.no_progress_streak = 0
1202
+
1203
+ if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
1204
+ openai_body["enable_thinking"] = False
1205
+ logger.info(
1206
+ "Thinking disabled for tool turn (PROXY_DISABLE_THINKING_ON_TOOL_TURNS=on)"
1207
+ )
802
1208
 
803
1209
  return openai_body
804
1210
 
805
1211
 
806
- def _record_last_assistant_tool_calls(anthropic_body: dict):
1212
+ def _record_last_assistant_tool_calls(anthropic_body: dict, monitor: SessionMonitor):
807
1213
  """Extract tool call names from the last assistant message and record
808
1214
  them in the session monitor for loop detection."""
809
1215
  messages = anthropic_body.get("messages", [])
@@ -818,7 +1224,88 @@ def _record_last_assistant_tool_calls(anthropic_body: dict):
818
1224
  tool_names.append(block.get("name", "unknown"))
819
1225
  break
820
1226
  if tool_names:
821
- session_monitor.record_tool_calls(tool_names)
1227
+ monitor.record_tool_calls(tool_names)
1228
+
1229
+
1230
+ def _is_unexpected_end_turn(openai_resp: dict, anthropic_body: dict) -> bool:
1231
+ choices = openai_resp.get("choices") or []
1232
+ if not choices:
1233
+ return False
1234
+
1235
+ choice = choices[0]
1236
+ finish = choice.get("finish_reason")
1237
+ if finish not in {"stop", "end_turn"}:
1238
+ return False
1239
+
1240
+ msg = choice.get("message", {})
1241
+ if msg.get("tool_calls"):
1242
+ return False
1243
+
1244
+ if "tools" not in anthropic_body:
1245
+ return False
1246
+
1247
+ has_tool_results = any(
1248
+ isinstance(m.get("content"), list)
1249
+ and any(
1250
+ isinstance(b, dict) and b.get("type") == "tool_result"
1251
+ for b in m.get("content", [])
1252
+ )
1253
+ for m in anthropic_body.get("messages", [])
1254
+ )
1255
+
1256
+ return has_tool_results or _last_assistant_was_text_only(anthropic_body)
1257
+
1258
+
1259
def _resolve_max_tokens_request(requested_max_tokens: int) -> int:
    """Raise a client-requested ``max_tokens`` to the configured floor.

    The request is coerced to an int of at least 1. A
    ``PROXY_MAX_TOKENS_FLOOR`` of zero or less disables the floor.
    """
    wanted = max(1, int(requested_max_tokens))
    minimum = max(0, PROXY_MAX_TOKENS_FLOOR)
    # A disabled floor (0) can never exceed ``wanted`` (>= 1), so max() alone
    # covers both the "floor off" and "floor on" cases.
    return max(wanted, minimum)
1265
+
1266
+
1267
def _resolve_prune_target_fraction() -> float:
    """Return the configured prune target fraction, or 0.65 if it is invalid.

    The configured value is accepted only when it lies strictly between 0 and
    1; anything else is logged as a warning and replaced by the default.
    """
    configured = PROXY_CONTEXT_PRUNE_TARGET_FRACTION
    if not (0.0 < configured < 1.0):
        logger.warning(
            "Invalid PROXY_CONTEXT_PRUNE_TARGET_FRACTION=%s; using default 0.65",
            configured,
        )
        return 0.65
    return configured
1275
+
1276
+
1277
+ def _sanitize_reasoning_fallback_text(reasoning_text: str) -> str:
1278
+ cleaned = re.sub(r"</?think>", "", reasoning_text, flags=re.IGNORECASE)
1279
+ cleaned = re.sub(r"\s+", " ", cleaned).strip()
1280
+ if not cleaned:
1281
+ return ""
1282
+ if len(cleaned) > PROXY_STREAM_REASONING_MAX_CHARS:
1283
+ return cleaned[:PROXY_STREAM_REASONING_MAX_CHARS].rstrip() + "..."
1284
+ return cleaned
1285
+
1286
+
1287
+ def _build_reasoning_fallback_text(
1288
+ reasoning_chunks: list[str], mode: str | None = None
1289
+ ) -> str | None:
1290
+ fallback_mode = (mode or PROXY_STREAM_REASONING_FALLBACK).strip().lower()
1291
+ if fallback_mode == "off":
1292
+ return None
1293
+
1294
+ raw_text = "".join(reasoning_chunks).strip()
1295
+ if not raw_text:
1296
+ return None
1297
+
1298
+ if fallback_mode == "visible":
1299
+ return raw_text
1300
+ if fallback_mode == "sanitized":
1301
+ sanitized = _sanitize_reasoning_fallback_text(raw_text)
1302
+ return sanitized or None
1303
+
1304
+ logger.warning(
1305
+ "Unknown PROXY_STREAM_REASONING_FALLBACK=%r; disabling reasoning fallback",
1306
+ fallback_mode,
1307
+ )
1308
+ return None
822
1309
 
823
1310
 
824
1311
  def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
@@ -836,11 +1323,14 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
836
1323
  return bool(content.strip())
837
1324
  if isinstance(content, list):
838
1325
  has_tool_use = any(
839
- isinstance(b, dict) and b.get("type") == "tool_use"
840
- for b in content
1326
+ isinstance(b, dict) and b.get("type") == "tool_use" for b in content
841
1327
  )
842
1328
  has_text = any(
843
- (isinstance(b, dict) and b.get("type") == "text" and b.get("text", "").strip())
1329
+ (
1330
+ isinstance(b, dict)
1331
+ and b.get("type") == "text"
1332
+ and b.get("text", "").strip()
1333
+ )
844
1334
  or isinstance(b, str)
845
1335
  for b in content
846
1336
  )
@@ -850,10 +1340,468 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
850
1340
  return False
851
1341
 
852
1342
 
1343
+ def _extract_openai_choice(openai_resp: dict) -> tuple[dict, dict]:
1344
+ choice = (openai_resp.get("choices") or [{}])[0]
1345
+ message = choice.get("message") or {}
1346
+ return choice, message
1347
+
1348
+
1349
def _openai_message_text(openai_resp: dict) -> str:
    """Return the first choice's message content as a string.

    A missing or ``null`` content yields ``""``. Without this check a null
    content would be stringified to the literal text ``"None"`` and end up in
    log excerpts and the malformed-payload heuristics. Non-string content is
    converted with ``str()``.
    """
    _, message = _extract_openai_choice(openai_resp)
    content = message.get("content")
    if content is None:
        return ""
    return content if isinstance(content, str) else str(content)
1353
+
1354
+
1355
def _extract_openai_tool_calls(openai_resp: dict) -> list[dict]:
    """Return the first choice's ``tool_calls`` list, or ``[]``.

    An empty list is returned when the field is absent, falsy, or not a list.
    """
    message = _extract_openai_choice(openai_resp)[1]
    calls = message.get("tool_calls")
    if calls and isinstance(calls, list):
        return calls
    return []
1359
+
1360
+
1361
def _openai_has_tool_calls(openai_resp: dict) -> bool:
    """Return True when the first choice carries at least one tool call."""
    return len(_extract_openai_tool_calls(openai_resp)) > 0
1363
+
1364
+
1365
+ def _parse_openai_function_arguments(raw_args) -> tuple[dict | None, str | None]:
1366
+ if isinstance(raw_args, dict):
1367
+ return raw_args, None
1368
+ if isinstance(raw_args, str):
1369
+ try:
1370
+ parsed = json.loads(raw_args)
1371
+ except json.JSONDecodeError:
1372
+ return None, "invalid_json"
1373
+ if not isinstance(parsed, dict):
1374
+ return None, "arguments_not_object"
1375
+ return parsed, None
1376
+ return None, "invalid_arguments_type"
1377
+
1378
+
1379
+ def _schema_type_matches(value, expected_type: str) -> bool:
1380
+ if expected_type == "string":
1381
+ return isinstance(value, str)
1382
+ if expected_type == "number":
1383
+ return isinstance(value, (int, float)) and not isinstance(value, bool)
1384
+ if expected_type == "integer":
1385
+ return isinstance(value, int) and not isinstance(value, bool)
1386
+ if expected_type == "boolean":
1387
+ return isinstance(value, bool)
1388
+ if expected_type == "array":
1389
+ return isinstance(value, list)
1390
+ if expected_type == "object":
1391
+ return isinstance(value, dict)
1392
+ if expected_type == "null":
1393
+ return value is None
1394
+ return True
1395
+
1396
+
1397
+ def _string_contains_tool_markup(value: str) -> bool:
1398
+ lowered = value.lower()
1399
+ markers = ("<parameter", "</parameter", "<tool_call", "<function=", "</function")
1400
+ return any(marker in lowered for marker in markers)
1401
+
1402
+
1403
+ def _validate_tool_arguments_against_schema(
1404
+ args: dict, input_schema: dict
1405
+ ) -> tuple[bool, str]:
1406
+ if not isinstance(input_schema, dict):
1407
+ return True, ""
1408
+
1409
+ required = input_schema.get("required") or []
1410
+ if isinstance(required, list):
1411
+ for field in required:
1412
+ if not isinstance(field, str):
1413
+ continue
1414
+ if field not in args:
1415
+ return False, f"missing required field '{field}'"
1416
+ value = args.get(field)
1417
+ if value is None:
1418
+ return False, f"required field '{field}' is null"
1419
+ if isinstance(value, str) and not value.strip():
1420
+ return False, f"required field '{field}' is empty"
1421
+ if isinstance(value, str) and _string_contains_tool_markup(value):
1422
+ return (
1423
+ False,
1424
+ f"required field '{field}' contains malformed tool markup",
1425
+ )
1426
+
1427
+ properties = input_schema.get("properties") or {}
1428
+ if isinstance(properties, dict):
1429
+ for key, prop_schema in properties.items():
1430
+ if key not in args:
1431
+ continue
1432
+ if not isinstance(prop_schema, dict):
1433
+ continue
1434
+ expected = prop_schema.get("type")
1435
+ if isinstance(expected, str):
1436
+ if not _schema_type_matches(args[key], expected):
1437
+ return (
1438
+ False,
1439
+ f"type mismatch for '{key}' (expected {expected})",
1440
+ )
1441
+ if expected == "string" and isinstance(args[key], str):
1442
+ if _string_contains_tool_markup(args[key]):
1443
+ return (
1444
+ False,
1445
+ f"string field '{key}' contains malformed tool markup",
1446
+ )
1447
+ elif isinstance(expected, list) and expected:
1448
+ if not any(_schema_type_matches(args[key], t) for t in expected):
1449
+ expected_str = ",".join(str(t) for t in expected)
1450
+ return (
1451
+ False,
1452
+ f"type mismatch for '{key}' (expected one of {expected_str})",
1453
+ )
1454
+
1455
+ return True, ""
1456
+
1457
+
1458
+ def _tool_schema_map_from_anthropic_body(anthropic_body: dict) -> dict[str, dict]:
1459
+ schema_map: dict[str, dict] = {}
1460
+ for tool in anthropic_body.get("tools", []) or []:
1461
+ if not isinstance(tool, dict):
1462
+ continue
1463
+ name = tool.get("name")
1464
+ if isinstance(name, str) and name:
1465
+ schema = tool.get("input_schema")
1466
+ schema_map[name] = schema if isinstance(schema, dict) else {}
1467
+ return schema_map
1468
+
1469
+
1470
+ def _invalid_tool_call_reason(openai_resp: dict, anthropic_body: dict) -> str | None:
1471
+ if "tools" not in anthropic_body:
1472
+ return None
1473
+
1474
+ tool_calls = _extract_openai_tool_calls(openai_resp)
1475
+ if not tool_calls:
1476
+ return None
1477
+
1478
+ schema_map = _tool_schema_map_from_anthropic_body(anthropic_body)
1479
+ if not schema_map:
1480
+ return None
1481
+
1482
+ for idx, tc in enumerate(tool_calls):
1483
+ if not isinstance(tc, dict):
1484
+ return f"tool call {idx} is not an object"
1485
+ fn = tc.get("function")
1486
+ if not isinstance(fn, dict):
1487
+ return f"tool call {idx} missing function payload"
1488
+
1489
+ name = fn.get("name")
1490
+ if not isinstance(name, str) or not name:
1491
+ return f"tool call {idx} missing function name"
1492
+ if name not in schema_map:
1493
+ return f"tool call {idx} uses unknown tool '{name}'"
1494
+
1495
+ args, parse_error = _parse_openai_function_arguments(fn.get("arguments", "{}"))
1496
+ if parse_error:
1497
+ return f"tool call {idx} invalid arguments ({parse_error})"
1498
+ if args is None:
1499
+ return f"tool call {idx} has empty arguments"
1500
+
1501
+ valid, reason = _validate_tool_arguments_against_schema(args, schema_map[name])
1502
+ if not valid:
1503
+ return f"tool call {idx} failed schema validation: {reason}"
1504
+
1505
+ return None
1506
+
1507
+
1508
def _openai_has_valid_tool_calls(openai_resp: dict, anthropic_body: dict) -> bool:
    """Return True when the response has tool calls and none of them is invalid."""
    if not _openai_has_tool_calls(openai_resp):
        return False
    return _invalid_tool_call_reason(openai_resp, anthropic_body) is None
1513
+
1514
+
1515
+ def _looks_malformed_tool_payload(text: str) -> bool:
1516
+ if not text:
1517
+ return False
1518
+
1519
+ lowered = text.lower()
1520
+ primary_markers = ("</parameter", "<parameter", "<tool_call", "<function=")
1521
+ if any(marker in lowered for marker in primary_markers):
1522
+ return True
1523
+
1524
+ structural_markers = (
1525
+ '=\n{"description"',
1526
+ "</think>",
1527
+ )
1528
+ marker_hits = sum(1 for marker in structural_markers if marker in lowered)
1529
+ repeated_description = lowered.count('{"description"') >= 2
1530
+ repeated_must_call = lowered.count("you must call a tool") >= 2
1531
+ has_unicode_marker = "⎿" in text
1532
+ policy_echo_loop = repeated_must_call and (
1533
+ "do not summarize the issue and stop" in lowered
1534
+ or "must call a tool to make the fix" in lowered
1535
+ )
1536
+ policy_snippets = (
1537
+ "do not summarize the issue and stop",
1538
+ "if you have identified a problem",
1539
+ "you must call a tool to make the fix",
1540
+ "</agentic-protocol>",
1541
+ )
1542
+ policy_hits = sum(1 for snippet in policy_snippets if snippet in lowered)
1543
+
1544
+ if marker_hits >= 2:
1545
+ return True
1546
+ if marker_hits >= 1 and (
1547
+ repeated_description or repeated_must_call or has_unicode_marker
1548
+ ):
1549
+ return True
1550
+ if policy_echo_loop:
1551
+ return True
1552
+ if policy_hits >= 2:
1553
+ return True
1554
+ if lowered.count("</parameter") >= 1 and lowered.count('{"description"') >= 1:
1555
+ return True
1556
+ return False
1557
+
1558
+
1559
+ def _is_malformed_tool_response(openai_resp: dict, anthropic_body: dict) -> bool:
1560
+ if "tools" not in anthropic_body:
1561
+ return False
1562
+
1563
+ if _invalid_tool_call_reason(openai_resp, anthropic_body):
1564
+ return True
1565
+
1566
+ if _openai_has_tool_calls(openai_resp):
1567
+ return False
1568
+
1569
+ return _looks_malformed_tool_payload(_openai_message_text(openai_resp))
1570
+
1571
+
1572
def _build_malformed_retry_body(openai_body: dict, anthropic_body: dict) -> dict:
    """Derive the request body used to retry after a malformed tool payload.

    The result is a shallow copy of ``openai_body`` that is non-streaming,
    forces ``tool_choice="required"``, and uses the configured retry
    temperature. ``max_tokens`` is capped when a positive retry cap is
    configured. The full converted tool list from ``anthropic_body`` replaces
    any narrowed list, and thinking is disabled when
    ``PROXY_DISABLE_THINKING_ON_TOOL_TURNS`` is set.
    """
    retry_body = {
        **openai_body,
        "stream": False,
        "tool_choice": "required",
        "temperature": PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE,
    }

    token_cap = PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS
    if token_cap > 0:
        requested = int(retry_body.get("max_tokens", token_cap))
        retry_body["max_tokens"] = min(requested, token_cap)

    # On malformed retry, restore full tool list to avoid starving selection.
    if anthropic_body.get("tools"):
        retry_body["tools"] = _convert_anthropic_tools_to_openai(
            anthropic_body.get("tools", [])
        )

    if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
        retry_body["enable_thinking"] = False

    return retry_body
1596
+
1597
+
1598
+ def _build_clean_guardrail_openai_response(openai_resp: dict) -> dict:
1599
+ return {
1600
+ "id": openai_resp.get("id", f"chatcmpl_{uuid.uuid4().hex[:12]}"),
1601
+ "object": openai_resp.get("object", "chat.completion"),
1602
+ "created": openai_resp.get("created", int(time.time())),
1603
+ "model": openai_resp.get("model", "unknown"),
1604
+ "choices": [
1605
+ {
1606
+ "index": 0,
1607
+ "finish_reason": "stop",
1608
+ "message": {
1609
+ "role": "assistant",
1610
+ "content": (
1611
+ "I could not produce a valid tool-call format in this turn. "
1612
+ "Please continue; I will issue exactly one valid tool call next."
1613
+ ),
1614
+ },
1615
+ }
1616
+ ],
1617
+ "usage": openai_resp.get("usage", {}),
1618
+ }
1619
+
1620
+
1621
async def _apply_unexpected_end_turn_guardrail(
    client: httpx.AsyncClient,
    openai_resp: dict,
    openai_body: dict,
    anthropic_body: dict,
    monitor: SessionMonitor,
    session_id: str,
) -> dict:
    """Retry once with ``tool_choice="required"`` after an unexpected end of turn.

    Does nothing when ``PROXY_GUARDRAIL_RETRY`` is off or the response is not
    an unexpected end-of-turn. Otherwise it increments
    ``monitor.unexpected_end_turn_count``, re-posts a non-streaming copy of
    ``openai_body``, and returns the retried response only if it contains
    valid tool calls.

    The retry is best-effort. A transport error, a non-200 status, or an
    unparseable body is logged and the original ``openai_resp`` is returned,
    so a failed retry never discards a usable response.
    """
    if not PROXY_GUARDRAIL_RETRY:
        return openai_resp

    if not _is_unexpected_end_turn(openai_resp, anthropic_body):
        return openai_resp

    monitor.unexpected_end_turn_count += 1
    logger.warning(
        "GUARDRAIL: unexpected end_turn without tool_use in active loop (session=%s), retrying once with tool_choice=required",
        session_id,
    )

    retry_body = dict(openai_body)
    retry_body["tool_choice"] = "required"
    retry_body["stream"] = False

    try:
        retry_resp = await client.post(
            f"{LLAMA_CPP_BASE}/chat/completions",
            json=retry_body,
            headers={"Content-Type": "application/json"},
        )
    except httpx.HTTPError as exc:
        # Covers timeouts and connection failures raised by httpx.
        logger.warning(
            "GUARDRAIL retry request failed (%s); keeping original response",
            exc,
        )
        return openai_resp

    if retry_resp.status_code != 200:
        logger.warning(
            "GUARDRAIL retry upstream status=%d; keeping original response",
            retry_resp.status_code,
        )
        return openai_resp

    try:
        retry_json = retry_resp.json()
    except ValueError:
        retry_json = None
    if not isinstance(retry_json, dict):
        logger.warning(
            "GUARDRAIL retry returned an unparseable body; keeping original response"
        )
        return openai_resp

    if _openai_has_valid_tool_calls(retry_json, anthropic_body):
        logger.info("GUARDRAIL: retry produced tool_use; using retried response")
        return retry_json

    invalid_reason = _invalid_tool_call_reason(retry_json, anthropic_body)
    if invalid_reason:
        logger.warning(
            "GUARDRAIL: retry produced invalid tool_call payload (%s)",
            invalid_reason,
        )
    retry_choice, _ = _extract_openai_choice(retry_json)
    logger.info(
        "GUARDRAIL: retry returned finish_reason=%s without tool_use",
        retry_choice.get("finish_reason"),
    )
    return openai_resp
1673
+
1674
+
1675
async def _apply_malformed_tool_guardrail(
    client: httpx.AsyncClient,
    openai_resp: dict,
    openai_body: dict,
    anthropic_body: dict,
    monitor: SessionMonitor,
    session_id: str,
) -> dict:
    """Retry a malformed tool response, falling back to a clean text reply.

    Does nothing when ``PROXY_MALFORMED_TOOL_GUARDRAIL`` is off. A response
    that is not malformed is returned unchanged, and
    ``monitor.malformed_tool_streak`` is reset if it contains valid tool
    calls.

    For a malformed response the streak is incremented and up to
    ``PROXY_MALFORMED_TOOL_RETRY_MAX`` retries are issued with the body from
    ``_build_malformed_retry_body``. The first retry that yields valid tool
    calls, or a clean non-malformed response, is returned and resets the
    streak. Each retry that is still malformed increments the streak again.
    If every attempt fails, a fixed clean guardrail response is returned.

    A retry that fails at the transport level, returns a non-200 status, or
    returns an unparseable body is logged and skipped. It neither aborts the
    request nor changes the streak.
    """
    if not PROXY_MALFORMED_TOOL_GUARDRAIL:
        return openai_resp

    if not _is_malformed_tool_response(openai_resp, anthropic_body):
        if _openai_has_valid_tool_calls(openai_resp, anthropic_body):
            monitor.malformed_tool_streak = 0
        return openai_resp

    monitor.malformed_tool_streak += 1
    invalid_reason = _invalid_tool_call_reason(openai_resp, anthropic_body)
    if invalid_reason:
        excerpt = invalid_reason[:220]
    else:
        excerpt = _openai_message_text(openai_resp)[:220].replace("\n", " ")
    logger.warning(
        "MALFORMED TOOL PAYLOAD: session=%s streak=%d excerpt=%.220s",
        session_id,
        monitor.malformed_tool_streak,
        excerpt,
    )

    attempts = max(0, PROXY_MALFORMED_TOOL_RETRY_MAX)
    for attempt in range(attempts):
        retry_body = _build_malformed_retry_body(openai_body, anthropic_body)
        try:
            retry_resp = await client.post(
                f"{LLAMA_CPP_BASE}/chat/completions",
                json=retry_body,
                headers={"Content-Type": "application/json"},
            )
        except httpx.HTTPError as exc:
            # Timeouts and connection errors: try again or fall through to
            # the clean guardrail response instead of failing the request.
            logger.warning(
                "MALFORMED RETRY failed (attempt %d/%d): %s",
                attempt + 1,
                attempts,
                exc,
            )
            continue

        if retry_resp.status_code != 200:
            logger.warning(
                "MALFORMED RETRY failed (attempt %d/%d): HTTP %d",
                attempt + 1,
                attempts,
                retry_resp.status_code,
            )
            continue

        try:
            retry_json = retry_resp.json()
        except ValueError:
            retry_json = None
        if not isinstance(retry_json, dict):
            logger.warning(
                "MALFORMED RETRY failed (attempt %d/%d): unparseable response body",
                attempt + 1,
                attempts,
            )
            continue

        if _openai_has_valid_tool_calls(retry_json, anthropic_body):
            monitor.malformed_tool_streak = 0
            logger.info(
                "MALFORMED RETRY success: produced tool_use (attempt %d/%d)",
                attempt + 1,
                attempts,
            )
            return retry_json

        retry_invalid_reason = _invalid_tool_call_reason(retry_json, anthropic_body)
        if retry_invalid_reason:
            logger.warning(
                "MALFORMED RETRY invalid tool_call payload (attempt %d/%d): %s",
                attempt + 1,
                attempts,
                retry_invalid_reason,
            )

        if not _is_malformed_tool_response(retry_json, anthropic_body):
            monitor.malformed_tool_streak = 0
            logger.info(
                "MALFORMED RETRY produced clean text response (attempt %d/%d)",
                attempt + 1,
                attempts,
            )
            return retry_json

        monitor.malformed_tool_streak += 1

    logger.error(
        "MALFORMED TOOL PAYLOAD persisted after retries (session=%s); returning clean guardrail response",
        session_id,
    )
    return _build_clean_guardrail_openai_response(openai_resp)
1756
+
1757
+
1758
def _maybe_apply_session_contamination_breaker(
    anthropic_body: dict, monitor: SessionMonitor, session_id: str
) -> dict:
    """Trim the conversation once the malformed-tool streak hits its threshold.

    Returns ``anthropic_body`` unchanged when the breaker is disabled or the
    streak is below ``PROXY_SESSION_CONTAMINATION_THRESHOLD`` (minimum 1). If
    the history is already short (at most ``keep_last + 1`` messages), only
    the streak is reset. Otherwise a shallow copy is returned whose messages
    are the first message, a synthetic user "SESSION RESET" marker, and the
    last ``keep_last`` messages (minimum 2). The monitor's streak,
    no-progress and forced-count counters are then cleared and
    ``contamination_resets`` is incremented.
    """
    if not PROXY_SESSION_CONTAMINATION_BREAKER:
        return anthropic_body
    if monitor.malformed_tool_streak < max(1, PROXY_SESSION_CONTAMINATION_THRESHOLD):
        return anthropic_body

    history = anthropic_body.get("messages", [])
    keep_last = max(2, PROXY_SESSION_CONTAMINATION_KEEP_LAST)
    if len(history) <= keep_last + 1:
        # Nothing worth trimming; just clear the streak.
        monitor.malformed_tool_streak = 0
        return anthropic_body

    reset_marker = {
        "role": "user",
        "content": (
            "[SESSION RESET: previous turns contained malformed tool-call formatting "
            "artifacts. Continue from the recent context below and emit valid tool calls only.]"
        ),
    }
    # NOTE(review): the retained tail is a plain slice, so it may start with a
    # tool_result whose tool_use was trimmed - confirm upstream tolerates that.
    trimmed = [*history[:1], reset_marker, *history[-keep_last:]]
    updated_body = {**anthropic_body, "messages": trimmed}

    monitor.contamination_resets += 1
    monitor.malformed_tool_streak = 0
    monitor.no_progress_streak = 0
    monitor.consecutive_forced_count = 0
    logger.warning(
        "SESSION CONTAMINATION BREAKER: session=%s reset applied, kept=%d messages",
        session_id,
        len(trimmed),
    )

    return updated_body
1798
+
1799
+
853
1800
  # ===========================================================================
854
1801
  # Response Translation: OpenAI -> Anthropic
855
1802
  # ===========================================================================
856
1803
 
1804
+
857
1805
  def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
858
1806
  """Convert an OpenAI Chat Completions response to Anthropic Messages format."""
859
1807
  choice = openai_resp.get("choices", [{}])[0]
@@ -871,12 +1819,14 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
871
1819
  args = json.loads(fn.get("arguments", "{}"))
872
1820
  except json.JSONDecodeError:
873
1821
  args = {}
874
- content.append({
875
- "type": "tool_use",
876
- "id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
877
- "name": fn.get("name", ""),
878
- "input": args,
879
- })
1822
+ content.append(
1823
+ {
1824
+ "type": "tool_use",
1825
+ "id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
1826
+ "name": fn.get("name", ""),
1827
+ "input": args,
1828
+ }
1829
+ )
880
1830
 
881
1831
  stop_reason_map = {
882
1832
  "stop": "end_turn",
@@ -902,11 +1852,78 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
902
1852
  }
903
1853
 
904
1854
 
1855
async def stream_anthropic_message(anthropic_resp: dict):
    """Stream a finalized Anthropic message as SSE events.

    Yields, in order: ``message_start``; for each content block a
    ``content_block_start``, an optional ``content_block_delta`` and a
    ``content_block_stop``; then ``message_delta`` and ``message_stop``.
    ``tool_use`` blocks emit their input as one ``input_json_delta``. Any
    other block is emitted as text, with no delta when the text is empty. A
    response with no content yields a single empty text block.

    ``message_start`` reports the ``input_tokens`` from the finalized
    response's ``usage`` (0 when absent) rather than a hard-coded zero, so
    clients reading usage from the stream see the same count as the message.
    """

    def sse(event: str, payload: dict) -> str:
        # One server-sent event frame: event name, JSON data, blank line.
        return f"event: {event}\ndata: {json.dumps(payload)}\n\n"

    usage = anthropic_resp.get("usage") or {}
    message = {
        "id": anthropic_resp.get("id", f"msg_{uuid.uuid4().hex[:24]}"),
        "type": "message",
        "role": "assistant",
        "content": [],
        "model": anthropic_resp.get("model", "unknown"),
        "stop_reason": None,
        "stop_sequence": None,
        "usage": {
            "input_tokens": usage.get("input_tokens", 0),
            "output_tokens": 0,
        },
    }
    yield sse("message_start", {"type": "message_start", "message": message})

    content_blocks = anthropic_resp.get("content", []) or [{"type": "text", "text": ""}]
    for block_index, block in enumerate(content_blocks):
        if block.get("type", "text") == "tool_use":
            opening = {
                "type": "tool_use",
                "id": block.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
                "name": block.get("name", ""),
            }
            # json.dumps never returns an empty string, so tool blocks always
            # carry exactly one input delta.
            delta = {
                "type": "input_json_delta",
                "partial_json": json.dumps(
                    block.get("input", {}), separators=(",", ":")
                ),
            }
        else:
            opening = {"type": "text", "text": ""}
            text = block.get("text", "")
            delta = {"type": "text_delta", "text": text} if text else None

        yield sse(
            "content_block_start",
            {
                "type": "content_block_start",
                "index": block_index,
                "content_block": opening,
            },
        )
        if delta is not None:
            yield sse(
                "content_block_delta",
                {"type": "content_block_delta", "index": block_index, "delta": delta},
            )
        yield sse(
            "content_block_stop",
            {"type": "content_block_stop", "index": block_index},
        )

    yield sse(
        "message_delta",
        {
            "type": "message_delta",
            "delta": {
                "stop_reason": anthropic_resp.get("stop_reason", "end_turn"),
                "stop_sequence": None,
            },
            "usage": {"output_tokens": usage.get("output_tokens", 0)},
        },
    )
    yield sse("message_stop", {"type": "message_stop"})
1914
+
1915
+
905
1916
  # ===========================================================================
906
1917
  # Streaming Translation: OpenAI SSE -> Anthropic SSE
907
1918
  # ===========================================================================
908
1919
 
909
- async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
1920
+
1921
+ async def stream_anthropic_response(
1922
+ openai_stream: httpx.Response,
1923
+ model: str,
1924
+ monitor: SessionMonitor,
1925
+ anthropic_body: dict,
1926
+ ):
910
1927
  """Convert an OpenAI streaming response to Anthropic SSE stream format.
911
1928
 
912
1929
  Handles:
@@ -929,7 +1946,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
929
1946
  f"data: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
930
1947
  )
931
1948
 
932
- yield "event: ping\ndata: {\"type\": \"ping\"}\n\n"
1949
+ yield 'event: ping\ndata: {"type": "ping"}\n\n'
933
1950
 
934
1951
  output_tokens = 0
935
1952
  finish_reason = "end_turn"
@@ -1058,21 +2075,29 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
1058
2075
  f"data: {json.dumps({'type': 'content_block_stop', 'index': tc['block_index']})}\n\n"
1059
2076
  )
1060
2077
  else:
1061
- # Option E: If the response has no text AND no tool calls, but the
1062
- # model produced reasoning_content, forward the reasoning as visible
1063
- # text so the client doesn't receive a completely empty turn.
2078
+ # If the response has no text and no tool calls, optionally emit a
2079
+ # reasoning fallback (configurable) to avoid leaking malformed
2080
+ # internal chain-of-thought content by default.
1064
2081
  accumulated_text = "".join(text_chunks)
1065
2082
  if not accumulated_text and reasoning_chunks:
1066
- fallback_text = "".join(reasoning_chunks)
1067
- logger.warning(
1068
- "Empty response with %d reasoning tokens – forwarding reasoning as fallback text",
1069
- len(reasoning_chunks),
1070
- )
1071
- text_chunks.append(fallback_text)
1072
- yield (
1073
- f"event: content_block_delta\n"
1074
- f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
1075
- )
2083
+ fallback_text = _build_reasoning_fallback_text(reasoning_chunks)
2084
+ if fallback_text:
2085
+ logger.warning(
2086
+ "Empty response with %d reasoning chunks – emitting fallback text (mode=%s)",
2087
+ len(reasoning_chunks),
2088
+ PROXY_STREAM_REASONING_FALLBACK,
2089
+ )
2090
+ text_chunks.append(fallback_text)
2091
+ yield (
2092
+ f"event: content_block_delta\n"
2093
+ f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
2094
+ )
2095
+ else:
2096
+ logger.warning(
2097
+ "Empty response with %d reasoning chunks – fallback suppressed (mode=%s)",
2098
+ len(reasoning_chunks),
2099
+ PROXY_STREAM_REASONING_FALLBACK,
2100
+ )
1076
2101
 
1077
2102
  yield (
1078
2103
  f"event: content_block_stop\n"
@@ -1081,17 +2106,65 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
1081
2106
 
1082
2107
  # Log response summary
1083
2108
  accumulated_text = "".join(text_chunks)
1084
- tc_names = [tc["name"] for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
1085
- tc_args = [tc.get("arguments", "") for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
2109
+ tc_names = (
2110
+ [tc["name"] for tc in tool_calls_by_index.values()]
2111
+ if tool_calls_by_index
2112
+ else []
2113
+ )
2114
+ tc_args = (
2115
+ [tc.get("arguments", "") for tc in tool_calls_by_index.values()]
2116
+ if tool_calls_by_index
2117
+ else []
2118
+ )
1086
2119
  logger.info(
1087
2120
  "RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
1088
- finish_reason, output_tokens,
2121
+ finish_reason,
2122
+ output_tokens,
1089
2123
  len(accumulated_text),
1090
2124
  accumulated_text[:300],
1091
2125
  tc_names,
1092
2126
  [a[:200] for a in tc_args],
1093
2127
  )
1094
2128
 
2129
+ synthetic_openai_resp = {
2130
+ "choices": [
2131
+ {
2132
+ "finish_reason": "stop"
2133
+ if finish_reason == "end_turn"
2134
+ else finish_reason,
2135
+ "message": {
2136
+ "content": accumulated_text,
2137
+ "tool_calls": [
2138
+ {
2139
+ "function": {
2140
+ "name": tc["name"],
2141
+ "arguments": tc.get("arguments", ""),
2142
+ }
2143
+ }
2144
+ for tc in tool_calls_by_index.values()
2145
+ ],
2146
+ },
2147
+ }
2148
+ ]
2149
+ }
2150
+
2151
+ if _is_malformed_tool_response(synthetic_openai_resp, anthropic_body):
2152
+ monitor.malformed_tool_streak += 1
2153
+ elif (
2154
+ "tools" in anthropic_body
2155
+ and not tool_calls_by_index
2156
+ and (
2157
+ finish_reason == "max_tokens"
2158
+ or (finish_reason == "end_turn" and len(accumulated_text) > 512)
2159
+ )
2160
+ ):
2161
+ monitor.malformed_tool_streak += 1
2162
+ elif tool_calls_by_index:
2163
+ monitor.malformed_tool_streak = 0
2164
+
2165
+ if _is_unexpected_end_turn(synthetic_openai_resp, anthropic_body):
2166
+ monitor.unexpected_end_turn_count += 1
2167
+
1095
2168
  # message_delta with final stop reason
1096
2169
  yield (
1097
2170
  f"event: message_delta\n"
@@ -1106,6 +2179,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
1106
2179
  # API Endpoints
1107
2180
  # ===========================================================================
1108
2181
 
2182
+
1109
2183
  @app.post("/v1/messages")
1110
2184
  async def messages(request: Request):
1111
2185
  """Handle Anthropic Messages API requests (streaming and non-streaming).
@@ -1116,9 +2190,16 @@ async def messages(request: Request):
1116
2190
  - Option E: Smart max_tokens capping (in build_openai_request)
1117
2191
  - Option F: Session-level token monitoring with warnings
1118
2192
  """
2193
+ global last_session_id
2194
+
1119
2195
  body = await request.json()
1120
2196
  model = body.get("model", "default")
1121
2197
  is_stream = body.get("stream", False)
2198
+ session_id = resolve_session_id(request, body)
2199
+ monitor = get_session_monitor(session_id)
2200
+ last_session_id = session_id
2201
+
2202
+ body = _maybe_apply_session_contamination_breaker(body, monitor, session_id)
1122
2203
 
1123
2204
  # Debug: log request summary
1124
2205
  n_messages = len(body.get("messages", []))
@@ -1128,42 +2209,53 @@ async def messages(request: Request):
1128
2209
  last_role = last_msg.get("role", "?")
1129
2210
  last_content = last_msg.get("content", "")
1130
2211
  if isinstance(last_content, list):
1131
- last_text = next((b.get("text", "") for b in last_content if b.get("type") == "text"), "")[:200]
2212
+ last_text = next(
2213
+ (b.get("text", "") for b in last_content if b.get("type") == "text"), ""
2214
+ )[:200]
1132
2215
  elif isinstance(last_content, str):
1133
2216
  last_text = last_content[:200]
1134
2217
  else:
1135
2218
  last_text = str(last_content)[:200]
1136
2219
  logger.info(
1137
2220
  "REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
1138
- is_stream, n_messages, n_tools, max_tokens, last_role, last_text
2221
+ is_stream,
2222
+ n_messages,
2223
+ n_tools,
2224
+ max_tokens,
2225
+ last_role,
2226
+ last_text,
1139
2227
  )
1140
2228
 
1141
2229
  # --- Option F: Estimate tokens and record in session monitor ---
1142
2230
  estimated_tokens = estimate_total_tokens(body)
1143
- session_monitor.record_request(estimated_tokens)
1144
- session_monitor.log_status()
2231
+ monitor.record_request(estimated_tokens)
2232
+ monitor.log_status()
1145
2233
 
1146
2234
  # --- Option C: Prune conversation if approaching context limit ---
1147
- ctx_window = session_monitor.context_window
2235
+ ctx_window = monitor.context_window
1148
2236
  if ctx_window > 0:
1149
2237
  utilization = estimated_tokens / ctx_window
1150
2238
  if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
1151
2239
  logger.warning(
1152
2240
  "Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
1153
- utilization * 100, PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
2241
+ utilization * 100,
2242
+ PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
2243
+ )
2244
+ body = prune_conversation(
2245
+ body, ctx_window, target_fraction=_resolve_prune_target_fraction()
1154
2246
  )
1155
- body = prune_conversation(body, ctx_window, target_fraction=0.65)
1156
- session_monitor.prune_count += 1
2247
+ monitor.prune_count += 1
1157
2248
  # Re-estimate after pruning
1158
2249
  estimated_tokens = estimate_total_tokens(body)
1159
- session_monitor.record_request(estimated_tokens)
2250
+ monitor.record_request(estimated_tokens)
1160
2251
  n_messages = len(body.get("messages", []))
1161
2252
  logger.info(
1162
2253
  "After pruning: ~%d tokens, %d messages",
1163
- estimated_tokens, n_messages,
2254
+ estimated_tokens,
2255
+ n_messages,
1164
2256
  )
1165
2257
 
1166
- openai_body = build_openai_request(body)
2258
+ openai_body = build_openai_request(body, monitor)
1167
2259
 
1168
2260
  client = http_client
1169
2261
  if client is None:
@@ -1173,6 +2265,79 @@ async def messages(request: Request):
1173
2265
  media_type="application/json",
1174
2266
  )
1175
2267
 
2268
+ use_guarded_non_stream = is_stream and (
2269
+ PROXY_FORCE_NON_STREAM
2270
+ or (PROXY_MALFORMED_TOOL_STREAM_STRICT and "tools" in body)
2271
+ )
2272
+ if use_guarded_non_stream:
2273
+ strict_body = dict(openai_body)
2274
+ strict_body["stream"] = False
2275
+
2276
+ strict_resp = await client.post(
2277
+ f"{LLAMA_CPP_BASE}/chat/completions",
2278
+ json=strict_body,
2279
+ headers={"Content-Type": "application/json"},
2280
+ )
2281
+
2282
+ if strict_resp.status_code != 200:
2283
+ error_text = strict_resp.text[:1000]
2284
+ logger.error(
2285
+ "Upstream HTTP %d (strict-stream): %s",
2286
+ strict_resp.status_code,
2287
+ error_text,
2288
+ )
2289
+ return Response(
2290
+ content=json.dumps(
2291
+ {
2292
+ "type": "error",
2293
+ "error": {
2294
+ "type": "overloaded_error",
2295
+ "message": f"Upstream error (HTTP {strict_resp.status_code}): {error_text[:500]}",
2296
+ },
2297
+ }
2298
+ ),
2299
+ status_code=529,
2300
+ media_type="application/json",
2301
+ )
2302
+
2303
+ openai_resp = strict_resp.json()
2304
+ openai_resp = await _apply_unexpected_end_turn_guardrail(
2305
+ client,
2306
+ openai_resp,
2307
+ strict_body,
2308
+ body,
2309
+ monitor,
2310
+ session_id,
2311
+ )
2312
+ openai_resp = await _apply_malformed_tool_guardrail(
2313
+ client,
2314
+ openai_resp,
2315
+ strict_body,
2316
+ body,
2317
+ monitor,
2318
+ session_id,
2319
+ )
2320
+
2321
+ anthropic_resp = openai_to_anthropic_response(openai_resp, model)
2322
+ monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
2323
+ if PROXY_FORCE_NON_STREAM:
2324
+ logger.info(
2325
+ "FORCED NON-STREAM: served stream response via guarded non-stream path"
2326
+ )
2327
+ else:
2328
+ logger.info(
2329
+ "STRICT STREAM GUARDRAIL: served stream response via guarded non-stream path"
2330
+ )
2331
+
2332
+ return StreamingResponse(
2333
+ stream_anthropic_message(anthropic_resp),
2334
+ media_type="text/event-stream",
2335
+ headers={
2336
+ "Cache-Control": "no-cache",
2337
+ "Connection": "keep-alive",
2338
+ },
2339
+ )
2340
+
1176
2341
  if is_stream:
1177
2342
  openai_body["stream"] = True
1178
2343
 
@@ -1181,6 +2346,7 @@ async def messages(request: Request):
1181
2346
  MAX_UPSTREAM_RETRIES = 3
1182
2347
  RETRY_DELAY_SECS = 5.0
1183
2348
  last_exc: Exception | None = None
2349
+ resp: httpx.Response | None = None
1184
2350
 
1185
2351
  for attempt in range(MAX_UPSTREAM_RETRIES):
1186
2352
  try:
@@ -1201,25 +2367,46 @@ async def messages(request: Request):
1201
2367
  if attempt < MAX_UPSTREAM_RETRIES - 1:
1202
2368
  logger.warning(
1203
2369
  "Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
1204
- attempt + 1, MAX_UPSTREAM_RETRIES,
1205
- type(exc).__name__, RETRY_DELAY_SECS,
2370
+ attempt + 1,
2371
+ MAX_UPSTREAM_RETRIES,
2372
+ type(exc).__name__,
2373
+ RETRY_DELAY_SECS,
1206
2374
  )
1207
2375
  await asyncio.sleep(RETRY_DELAY_SECS)
1208
2376
  else:
1209
2377
  logger.error(
1210
2378
  "Upstream connect failed after %d attempts: %s: %s",
1211
- MAX_UPSTREAM_RETRIES, type(exc).__name__, exc,
2379
+ MAX_UPSTREAM_RETRIES,
2380
+ type(exc).__name__,
2381
+ exc,
1212
2382
  )
1213
2383
 
1214
2384
  if last_exc is not None:
1215
2385
  return Response(
1216
- content=json.dumps({
1217
- "type": "error",
1218
- "error": {
1219
- "type": "overloaded_error",
1220
- "message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
1221
- },
1222
- }),
2386
+ content=json.dumps(
2387
+ {
2388
+ "type": "error",
2389
+ "error": {
2390
+ "type": "overloaded_error",
2391
+ "message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
2392
+ },
2393
+ }
2394
+ ),
2395
+ status_code=529,
2396
+ media_type="application/json",
2397
+ )
2398
+
2399
+ if resp is None:
2400
+ return Response(
2401
+ content=json.dumps(
2402
+ {
2403
+ "type": "error",
2404
+ "error": {
2405
+ "type": "overloaded_error",
2406
+ "message": "Upstream response unavailable",
2407
+ },
2408
+ }
2409
+ ),
1223
2410
  status_code=529,
1224
2411
  media_type="application/json",
1225
2412
  )
@@ -1232,9 +2419,7 @@ async def messages(request: Request):
1232
2419
  error_body = await resp.aread()
1233
2420
  await resp.aclose()
1234
2421
  error_text = error_body.decode("utf-8", errors="replace")[:1000]
1235
- logger.error(
1236
- "Upstream HTTP %d: %s", resp.status_code, error_text
1237
- )
2422
+ logger.error("Upstream HTTP %d: %s", resp.status_code, error_text)
1238
2423
 
1239
2424
  # Parse the error for a user-friendly message
1240
2425
  error_message = f"Upstream server error (HTTP {resp.status_code})"
@@ -1257,47 +2442,57 @@ async def messages(request: Request):
1257
2442
  )
1258
2443
 
1259
2444
  if is_context_overflow:
1260
- session_monitor.overflow_count += 1
2445
+ monitor.overflow_count += 1
1261
2446
  logger.error(
1262
2447
  "CONTEXT OVERFLOW detected (count=%d). "
1263
2448
  "Estimated input: %d tokens, context window: %d tokens. "
1264
2449
  "Conversation needs pruning or context window increase.",
1265
- session_monitor.overflow_count, estimated_tokens, ctx_window,
2450
+ monitor.overflow_count,
2451
+ estimated_tokens,
2452
+ ctx_window,
1266
2453
  )
1267
2454
  # Return Anthropic-format error that Claude Code can handle
1268
2455
  return Response(
1269
- content=json.dumps({
1270
- "type": "error",
1271
- "error": {
1272
- "type": "overloaded_error",
1273
- "message": (
1274
- f"Context window exceeded: request requires ~{estimated_tokens} tokens "
1275
- f"but only {ctx_window} are available. "
1276
- f"The conversation is too long. Please start a new session or "
1277
- f"reduce conversation length."
1278
- ),
1279
- },
1280
- }),
2456
+ content=json.dumps(
2457
+ {
2458
+ "type": "error",
2459
+ "error": {
2460
+ "type": "overloaded_error",
2461
+ "message": (
2462
+ f"Context window exceeded: request requires ~{estimated_tokens} tokens "
2463
+ f"but only {ctx_window} are available. "
2464
+ f"The conversation is too long. Please start a new session or "
2465
+ f"reduce conversation length."
2466
+ ),
2467
+ },
2468
+ }
2469
+ ),
1281
2470
  status_code=529,
1282
2471
  media_type="application/json",
1283
2472
  )
1284
2473
 
1285
2474
  # Generic upstream error -- return as Anthropic error format
1286
- error_type = "overloaded_error" if resp.status_code >= 500 else "invalid_request_error"
2475
+ error_type = (
2476
+ "overloaded_error"
2477
+ if resp.status_code >= 500
2478
+ else "invalid_request_error"
2479
+ )
1287
2480
  return Response(
1288
- content=json.dumps({
1289
- "type": "error",
1290
- "error": {
1291
- "type": error_type,
1292
- "message": error_message,
1293
- },
1294
- }),
2481
+ content=json.dumps(
2482
+ {
2483
+ "type": "error",
2484
+ "error": {
2485
+ "type": error_type,
2486
+ "message": error_message,
2487
+ },
2488
+ }
2489
+ ),
1295
2490
  status_code=529 if resp.status_code >= 500 else 400,
1296
2491
  media_type="application/json",
1297
2492
  )
1298
2493
 
1299
2494
  return StreamingResponse(
1300
- stream_anthropic_response(resp, model),
2495
+ stream_anthropic_response(resp, model, monitor, body),
1301
2496
  media_type="text/event-stream",
1302
2497
  headers={
1303
2498
  "Cache-Control": "no-cache",
@@ -1314,25 +2509,63 @@ async def messages(request: Request):
1314
2509
  # Option B: Handle non-streaming errors too
1315
2510
  if resp.status_code != 200:
1316
2511
  error_text = resp.text[:1000]
1317
- logger.error("Upstream HTTP %d (non-stream): %s", resp.status_code, error_text)
2512
+ logger.error(
2513
+ "Upstream HTTP %d (non-stream): %s", resp.status_code, error_text
2514
+ )
1318
2515
  return Response(
1319
- content=json.dumps({
1320
- "type": "error",
1321
- "error": {
1322
- "type": "overloaded_error",
1323
- "message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
1324
- },
1325
- }),
2516
+ content=json.dumps(
2517
+ {
2518
+ "type": "error",
2519
+ "error": {
2520
+ "type": "overloaded_error",
2521
+ "message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
2522
+ },
2523
+ }
2524
+ ),
1326
2525
  status_code=529,
1327
2526
  media_type="application/json",
1328
2527
  )
1329
2528
 
1330
2529
  openai_resp = resp.json()
2530
+ openai_resp = await _apply_unexpected_end_turn_guardrail(
2531
+ client,
2532
+ openai_resp,
2533
+ openai_body,
2534
+ body,
2535
+ monitor,
2536
+ session_id,
2537
+ )
2538
+ openai_resp = await _apply_malformed_tool_guardrail(
2539
+ client,
2540
+ openai_resp,
2541
+ openai_body,
2542
+ body,
2543
+ monitor,
2544
+ session_id,
2545
+ )
2546
+
2547
+ choice, _ = _extract_openai_choice(openai_resp)
2548
+ finish_reason = choice.get("finish_reason", "")
2549
+ if (
2550
+ "tools" in body
2551
+ and not _openai_has_tool_calls(openai_resp)
2552
+ and (
2553
+ finish_reason in {"length", "max_tokens"}
2554
+ or (
2555
+ finish_reason in {"stop", "end_turn"}
2556
+ and len(_openai_message_text(openai_resp)) > 512
2557
+ )
2558
+ )
2559
+ ):
2560
+ monitor.malformed_tool_streak += 1
2561
+ elif _openai_has_tool_calls(openai_resp):
2562
+ monitor.malformed_tool_streak = 0
2563
+
1331
2564
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
1332
2565
 
1333
2566
  # Track output tokens in session monitor
1334
2567
  output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
1335
- session_monitor.record_response(output_tokens)
2568
+ monitor.record_response(output_tokens)
1336
2569
 
1337
2570
  return anthropic_resp
1338
2571
 
@@ -1377,37 +2610,51 @@ async def health():
1377
2610
 
1378
2611
 
1379
2612
  @app.get("/v1/context")
1380
- async def context_status():
2613
+ async def context_status(request: Request):
1381
2614
  """Option F: Context window monitoring endpoint.
1382
2615
 
1383
2616
  Returns current session token usage, utilization, warnings, and
1384
2617
  estimated remaining turns. Useful for dashboards and debugging.
1385
2618
  """
1386
- warning = session_monitor.get_warning_level()
1387
- turns = session_monitor.estimate_turns_remaining()
2619
+ requested_session = request.query_params.get("session_id", "")
2620
+ session_id = requested_session or last_session_id
2621
+ monitor = session_monitors.get(session_id) if session_id else None
2622
+
2623
+ if monitor is None:
2624
+ monitor = SessionMonitor(context_window=default_context_window)
2625
+
2626
+ warning = monitor.get_warning_level()
2627
+ turns = monitor.estimate_turns_remaining()
1388
2628
 
1389
2629
  return {
1390
- "context_window": session_monitor.context_window,
1391
- "last_input_tokens": session_monitor.last_input_tokens,
1392
- "last_output_tokens": session_monitor.last_output_tokens,
1393
- "peak_input_tokens": session_monitor.peak_input_tokens,
1394
- "utilization": round(session_monitor.get_utilization(), 4),
1395
- "utilization_pct": f"{session_monitor.get_utilization() * 100:.1f}%",
2630
+ "active_session_id": session_id,
2631
+ "session_count": len(session_monitors),
2632
+ "context_window": monitor.context_window,
2633
+ "last_input_tokens": monitor.last_input_tokens,
2634
+ "last_output_tokens": monitor.last_output_tokens,
2635
+ "peak_input_tokens": monitor.peak_input_tokens,
2636
+ "utilization": round(monitor.get_utilization(), 4),
2637
+ "utilization_pct": f"{monitor.get_utilization() * 100:.1f}%",
1396
2638
  "warning_level": warning,
1397
2639
  "estimated_turns_remaining": turns,
1398
- "total_requests": session_monitor.total_requests,
1399
- "prune_count": session_monitor.prune_count,
1400
- "overflow_count": session_monitor.overflow_count,
2640
+ "total_requests": monitor.total_requests,
2641
+ "prune_count": monitor.prune_count,
2642
+ "overflow_count": monitor.overflow_count,
1401
2643
  "prune_threshold": PROXY_CONTEXT_PRUNE_THRESHOLD,
1402
- "recent_history": session_monitor.context_history[-10:],
2644
+ "recent_history": monitor.context_history[-10:],
1403
2645
  # Loop protection stats
1404
2646
  "loop_protection": {
1405
- "consecutive_forced_count": session_monitor.consecutive_forced_count,
1406
- "loop_warnings_emitted": session_monitor.loop_warnings_emitted,
1407
- "tool_call_history_len": len(session_monitor.tool_call_history),
1408
- "is_looping": session_monitor.detect_tool_loop()[0],
1409
- "loop_repeat_count": session_monitor.detect_tool_loop()[1],
1410
- "recent_tool_patterns": session_monitor.tool_call_history[-5:],
2647
+ "enabled": PROXY_LOOP_BREAKER,
2648
+ "consecutive_forced_count": monitor.consecutive_forced_count,
2649
+ "no_progress_streak": monitor.no_progress_streak,
2650
+ "loop_warnings_emitted": monitor.loop_warnings_emitted,
2651
+ "unexpected_end_turn_count": monitor.unexpected_end_turn_count,
2652
+ "malformed_tool_streak": monitor.malformed_tool_streak,
2653
+ "contamination_resets": monitor.contamination_resets,
2654
+ "tool_call_history_len": len(monitor.tool_call_history),
2655
+ "is_looping": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[0],
2656
+ "loop_repeat_count": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[1],
2657
+ "recent_tool_patterns": monitor.tool_call_history[-5:],
1411
2658
  },
1412
2659
  }
1413
2660