@miller-tech/uap 1.13.13 → 1.13.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -76,9 +76,11 @@ Dependencies
76
76
  """
77
77
 
78
78
  import asyncio
79
+ import hashlib
79
80
  import json
80
81
  import logging
81
82
  import os
83
+ import re
82
84
  import sys
83
85
  import time
84
86
  import uuid
@@ -100,7 +102,35 @@ PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
100
102
  PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
101
103
  PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
102
104
  PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
103
- PROXY_CONTEXT_PRUNE_THRESHOLD = float(os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75"))
105
# Environment values (compared case-insensitively, after trimming) that turn
# a boolean flag off. Anything else -- including an empty string -- leaves
# the flag on.
_FALSY_ENV_VALUES = frozenset({"0", "false", "off", "no"})


def _env_flag(name: str, default: str = "on") -> bool:
    """Read a boolean feature flag from the environment.

    Args:
        name: Environment variable to read.
        default: Raw value used when the variable is unset.

    Returns:
        False when the trimmed, lower-cased value is one of "0", "false",
        "off" or "no"; True otherwise.
    """
    # Trim before comparing so values such as " off " or "no\n" (common in
    # .env files and shell exports) are honoured instead of silently
    # leaving the flag enabled.
    return os.environ.get(name, default).strip().lower() not in _FALSY_ENV_VALUES


# Logged at startup as the "prune threshold" percentage.
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
    os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75")
)

# Master switch: when off, SessionMonitor.should_release_tool_choice()
# returns False immediately.
PROXY_LOOP_BREAKER = _env_flag("PROXY_LOOP_BREAKER")
# Passed as `window` to SessionMonitor.detect_tool_loop().
PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
# Minimum repeat count for the "same tool pattern" release rule.
PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "8"))
# Minimum consecutive forced tool_choice requests for the second release rule.
PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
# Both rules above additionally require this many no-progress turns.
PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "4"))
# Context utilization (fraction) at or above which tool_choice is released.
PROXY_CONTEXT_RELEASE_THRESHOLD = float(
    os.environ.get("PROXY_CONTEXT_RELEASE_THRESHOLD", "0.90")
)
# NOTE(review): not referenced in the code visible here -- presumably gates a
# guardrail retry path elsewhere in the module; confirm against its caller.
PROXY_GUARDRAIL_RETRY = _env_flag("PROXY_GUARDRAIL_RETRY")
# Idle seconds after which a session's monitor is evicted.
PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
# Reasoning fallback mode: "off", "visible" or "sanitized"; any other value
# is treated as disabled (with a warning) by _build_reasoning_fallback_text().
PROXY_STREAM_REASONING_FALLBACK = (
    os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
)
# Length limit applied to sanitized reasoning fallback text.
PROXY_STREAM_REASONING_MAX_CHARS = int(
    os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
)
104
134
 
105
135
  # ---------------------------------------------------------------------------
106
136
  # Logging
@@ -121,19 +151,26 @@ class SessionMonitor:
121
151
  """Tracks token usage across the session to provide early warnings
122
152
  and enable proactive context management before overflow occurs."""
123
153
 
124
- context_window: int = 0 # Auto-detected or configured
154
+ context_window: int = 0 # Auto-detected or configured
125
155
  total_requests: int = 0
126
- last_input_tokens: int = 0 # Estimated input tokens of last request
127
- last_output_tokens: int = 0 # Actual output tokens of last response
128
- peak_input_tokens: int = 0 # High-water mark
129
- prune_count: int = 0 # How many times pruning was triggered
130
- overflow_count: int = 0 # How many context overflow errors caught
156
+ last_input_tokens: int = 0 # Estimated input tokens of last request
157
+ last_output_tokens: int = 0 # Actual output tokens of last response
158
+ peak_input_tokens: int = 0 # High-water mark
159
+ prune_count: int = 0 # How many times pruning was triggered
160
+ overflow_count: int = 0 # How many context overflow errors caught
131
161
  context_history: list = field(default_factory=list) # Recent token counts
132
162
 
133
163
  # --- Token Loop Protection ---
134
- tool_call_history: list = field(default_factory=list) # Recent tool call fingerprints
135
- consecutive_forced_count: int = 0 # How many times tool_choice was forced consecutively
136
- loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
164
+ tool_call_history: list = field(
165
+ default_factory=list
166
+ ) # Recent tool call fingerprints
167
+ consecutive_forced_count: int = (
168
+ 0 # How many times tool_choice was forced consecutively
169
+ )
170
+ loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
171
+ no_progress_streak: int = 0 # Forced tool turns without new tool_result
172
+ unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
173
+ last_seen_ts: float = 0.0
137
174
 
138
175
  def record_request(self, estimated_tokens: int):
139
176
  """Record an outgoing request's estimated token count."""
@@ -150,6 +187,9 @@ class SessionMonitor:
150
187
  """Record a response's output token count."""
151
188
  self.last_output_tokens = output_tokens
152
189
 
190
def touch(self):
    """Stamp this monitor with the current wall-clock time.

    The stored ``last_seen_ts`` is what the stale-session cleanup compares
    against the session TTL.
    """
    seen_at = time.time()
    self.last_seen_ts = seen_at
192
+
153
193
  def get_utilization(self) -> float:
154
194
  """Get current context utilization as a fraction (0.0 - 1.0)."""
155
195
  if self.context_window <= 0:
@@ -196,25 +236,36 @@ class SessionMonitor:
196
236
  if warning == "CRITICAL":
197
237
  logger.error(
198
238
  "CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
199
- self.last_input_tokens, self.context_window, util * 100,
200
- turns_str, self.prune_count, self.overflow_count,
239
+ self.last_input_tokens,
240
+ self.context_window,
241
+ util * 100,
242
+ turns_str,
243
+ self.prune_count,
244
+ self.overflow_count,
201
245
  )
202
246
  elif warning == "HIGH":
203
247
  logger.warning(
204
248
  "CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
205
- self.last_input_tokens, self.context_window, util * 100,
206
- turns_str, self.prune_count,
249
+ self.last_input_tokens,
250
+ self.context_window,
251
+ util * 100,
252
+ turns_str,
253
+ self.prune_count,
207
254
  )
208
255
  elif warning == "ELEVATED":
209
256
  logger.warning(
210
257
  "CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
211
- self.last_input_tokens, self.context_window, util * 100,
258
+ self.last_input_tokens,
259
+ self.context_window,
260
+ util * 100,
212
261
  turns_str,
213
262
  )
214
263
  else:
215
264
  logger.info(
216
265
  "CONTEXT: %d/%d tokens (%.1f%%), %s",
217
- self.last_input_tokens, self.context_window, util * 100,
266
+ self.last_input_tokens,
267
+ self.context_window,
268
+ util * 100,
218
269
  turns_str,
219
270
  )
220
271
 
@@ -264,30 +315,42 @@ class SessionMonitor:
264
315
  - 15+ consecutive forced requests regardless -> release
265
316
  - Context utilization > 90% -> release (let model wrap up)
266
317
  """
267
- is_looping, repeat_count = self.detect_tool_loop(window=6)
318
+ if not PROXY_LOOP_BREAKER:
319
+ return False
320
+
321
+ is_looping, repeat_count = self.detect_tool_loop(window=PROXY_LOOP_WINDOW)
268
322
 
269
323
  # Pattern 1: Detected tool call loop
270
- if is_looping and repeat_count >= 8:
324
+ if (
325
+ is_looping
326
+ and repeat_count >= PROXY_LOOP_REPEAT_THRESHOLD
327
+ and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
328
+ ):
271
329
  logger.warning(
272
- "LOOP BREAKER: Same tool pattern repeated %d times. "
330
+ "LOOP BREAKER: Same tool pattern repeated %d times with no progress streak=%d. "
273
331
  "Releasing tool_choice to 'auto'.",
274
332
  repeat_count,
333
+ self.no_progress_streak,
275
334
  )
276
335
  self.loop_warnings_emitted += 1
277
336
  return True
278
337
 
279
338
  # Pattern 2: Too many consecutive forced requests
280
- if self.consecutive_forced_count >= 15:
339
+ if (
340
+ self.consecutive_forced_count >= PROXY_FORCED_THRESHOLD
341
+ and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
342
+ ):
281
343
  logger.warning(
282
- "LOOP BREAKER: %d consecutive forced tool_choice requests. "
344
+ "LOOP BREAKER: %d consecutive forced tool_choice requests with no progress streak=%d. "
283
345
  "Releasing to 'auto'.",
284
346
  self.consecutive_forced_count,
347
+ self.no_progress_streak,
285
348
  )
286
349
  self.loop_warnings_emitted += 1
287
350
  return True
288
351
 
289
352
  # Pattern 3: Context almost full -- let model wrap up naturally
290
- if self.get_utilization() >= 0.90:
353
+ if self.get_utilization() >= PROXY_CONTEXT_RELEASE_THRESHOLD:
291
354
  logger.warning(
292
355
  "LOOP BREAKER: Context utilization %.1f%% -- releasing "
293
356
  "tool_choice to let model wrap up.",
@@ -298,7 +361,35 @@ class SessionMonitor:
298
361
  return False
299
362
 
300
363
 
301
- session_monitor = SessionMonitor()
364
# Live SessionMonitor instances, keyed by the session id string handed to
# get_session_monitor().
session_monitors: dict[str, SessionMonitor] = {}
# Context window copied onto monitors whose own context_window is still <= 0.
default_context_window = 0
# NOTE(review): not referenced in the code visible here -- presumably the id
# of the most recently served session; confirm against its users.
last_session_id = ""


def _cleanup_stale_monitors(now_ts: float) -> None:
    """Evict every monitor that has been idle longer than the session TTL.

    Args:
        now_ts: Current time in seconds, on the same clock as
            ``SessionMonitor.last_seen_ts``.

    A monitor whose ``last_seen_ts`` is not positive (never touched) is
    left in place.
    """
    # Iterate over a snapshot of the keys so entries can be deleted in place.
    for sid in list(session_monitors):
        seen_at = session_monitors[sid].last_seen_ts
        if seen_at > 0 and now_ts - seen_at > PROXY_SESSION_TTL_SECS:
            del session_monitors[sid]
377
+
378
+
379
def get_session_monitor(session_id: str) -> SessionMonitor:
    """Return the monitor for ``session_id``, creating it on first use.

    Stale monitors are evicted before the lookup, so a session that has
    itself been idle past the TTL starts over with a fresh monitor. The
    returned monitor is touched, and it inherits ``default_context_window``
    while its own ``context_window`` is still unset (<= 0).
    """
    _cleanup_stale_monitors(time.time())

    try:
        monitor = session_monitors[session_id]
    except KeyError:
        monitor = SessionMonitor(context_window=default_context_window)
        session_monitors[session_id] = monitor

    monitor.touch()
    # The default may have been detected after this monitor was created.
    if monitor.context_window <= 0:
        monitor.context_window = default_context_window

    return monitor
302
393
 
303
394
 
304
395
  # ---------------------------------------------------------------------------
@@ -324,7 +415,8 @@ async def detect_context_window(client: httpx.AsyncClient) -> int:
324
415
  if n_ctx > 0:
325
416
  logger.info(
326
417
  "Auto-detected context window from upstream: %d tokens (%d slots)",
327
- n_ctx, len(slots),
418
+ n_ctx,
419
+ len(slots),
328
420
  )
329
421
  return n_ctx
330
422
  except Exception as exc:
@@ -398,7 +490,9 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
398
490
  return tokens
399
491
 
400
492
 
401
- def prune_conversation(anthropic_body: dict, context_window: int, target_fraction: float = 0.65) -> dict:
493
+ def prune_conversation(
494
+ anthropic_body: dict, context_window: int, target_fraction: float = 0.65
495
+ ) -> dict:
402
496
  """Prune the conversation to fit within the context window.
403
497
 
404
498
  Strategy:
@@ -445,19 +539,24 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
445
539
 
446
540
  # Always keep the first user message and the last N messages
447
541
  KEEP_LAST = 8 # Keep the last 8 messages (recent context)
448
- protected_head = messages[:1] # First user message
449
- protected_tail = messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
542
+ protected_head = messages[:1] # First user message
543
+ protected_tail = (
544
+ messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
545
+ )
450
546
  middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []
451
547
 
452
548
  # Calculate tokens for protected messages
453
- protected_tokens = sum(estimate_message_tokens(m) for m in protected_head + protected_tail)
549
+ protected_tokens = sum(
550
+ estimate_message_tokens(m) for m in protected_head + protected_tail
551
+ )
454
552
 
455
553
  if protected_tokens >= message_budget:
456
554
  # Even protected messages exceed budget -- truncate tool_result content
457
555
  # in the tail to fit
458
556
  logger.warning(
459
557
  "Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
460
- protected_tokens, message_budget,
558
+ protected_tokens,
559
+ message_budget,
461
560
  )
462
561
  for msg in protected_tail:
463
562
  content = msg.get("content", [])
@@ -466,7 +565,11 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
466
565
  if isinstance(block, dict) and block.get("type") == "tool_result":
467
566
  result_text = _extract_text(block.get("content", ""))
468
567
  if len(result_text) > 2000:
469
- block["content"] = result_text[:1000] + "\n...[TRUNCATED]...\n" + result_text[-500:]
568
+ block["content"] = (
569
+ result_text[:1000]
570
+ + "\n...[TRUNCATED]...\n"
571
+ + result_text[-500:]
572
+ )
470
573
  anthropic_body["messages"] = protected_head + protected_tail
471
574
  return anthropic_body
472
575
 
@@ -486,8 +589,7 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
486
589
 
487
590
  if isinstance(content, list):
488
591
  is_tool_result = any(
489
- isinstance(b, dict) and b.get("type") == "tool_result"
490
- for b in content
592
+ isinstance(b, dict) and b.get("type") == "tool_result" for b in content
491
593
  )
492
594
 
493
595
  # Lower priority = removed first
@@ -529,12 +631,17 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
529
631
  f"The conversation continues from recent context below.]"
530
632
  ),
531
633
  }
532
- anthropic_body["messages"] = protected_head + [prune_marker] + kept_msgs + protected_tail
634
+ anthropic_body["messages"] = (
635
+ protected_head + [prune_marker] + kept_msgs + protected_tail
636
+ )
533
637
  logger.warning(
534
638
  "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
535
639
  "target=%.0f%% of %d ctx",
536
- removed_count, removed_tokens, len(anthropic_body["messages"]),
537
- target_fraction * 100, context_window,
640
+ removed_count,
641
+ removed_tokens,
642
+ len(anthropic_body["messages"]),
643
+ target_fraction * 100,
644
+ context_window,
538
645
  )
539
646
  else:
540
647
  anthropic_body["messages"] = protected_head + kept_msgs + protected_tail
@@ -554,12 +661,13 @@ http_client: httpx.AsyncClient | None = None
554
661
  async def lifespan(app: FastAPI):
555
662
  """Manage the httpx client lifecycle with the FastAPI app."""
556
663
  global http_client
664
+ global default_context_window
557
665
  http_client = httpx.AsyncClient(
558
666
  timeout=httpx.Timeout(
559
- connect=10.0, # 10s to establish connection
560
- read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
561
- write=30.0, # 30s to send the request body
562
- pool=10.0, # 10s to acquire a pool connection
667
+ connect=10.0, # 10s to establish connection
668
+ read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
669
+ write=30.0, # 30s to send the request body
670
+ pool=10.0, # 10s to acquire a pool connection
563
671
  ),
564
672
  limits=httpx.Limits(
565
673
  max_connections=PROXY_MAX_CONNECTIONS,
@@ -569,14 +677,19 @@ async def lifespan(app: FastAPI):
569
677
  )
570
678
  logger.info(
571
679
  "Proxy started: listening on %s:%d -> upstream %s",
572
- PROXY_HOST, PROXY_PORT, LLAMA_CPP_BASE,
680
+ PROXY_HOST,
681
+ PROXY_PORT,
682
+ LLAMA_CPP_BASE,
573
683
  )
574
684
 
575
685
  # Auto-detect context window from upstream server
576
- session_monitor.context_window = await detect_context_window(http_client)
686
+ default_context_window = await detect_context_window(http_client)
687
+ for mon in session_monitors.values():
688
+ if mon.context_window <= 0:
689
+ mon.context_window = default_context_window
577
690
  logger.info(
578
691
  "Context window: %d tokens, prune threshold: %.0f%%",
579
- session_monitor.context_window,
692
+ default_context_window,
580
693
  PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
581
694
  )
582
695
 
@@ -598,6 +711,7 @@ app = FastAPI(
598
711
  # Request Translation: Anthropic -> OpenAI
599
712
  # ===========================================================================
600
713
 
714
+
601
715
  def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
602
716
  """Convert Anthropic message format to OpenAI message format.
603
717
 
@@ -635,25 +749,33 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
635
749
  elif block.get("type") == "text":
636
750
  parts.append(block.get("text", ""))
637
751
  elif block.get("type") == "tool_use":
638
- messages.append({
639
- "role": "assistant",
640
- "content": None,
641
- "tool_calls": [{
642
- "id": block.get("id", f"call_{uuid.uuid4().hex[:8]}"),
643
- "type": "function",
644
- "function": {
645
- "name": block["name"],
646
- "arguments": json.dumps(block.get("input", {})),
647
- },
648
- }],
649
- })
752
+ messages.append(
753
+ {
754
+ "role": "assistant",
755
+ "content": None,
756
+ "tool_calls": [
757
+ {
758
+ "id": block.get(
759
+ "id", f"call_{uuid.uuid4().hex[:8]}"
760
+ ),
761
+ "type": "function",
762
+ "function": {
763
+ "name": block["name"],
764
+ "arguments": json.dumps(block.get("input", {})),
765
+ },
766
+ }
767
+ ],
768
+ }
769
+ )
650
770
  continue
651
771
  elif block.get("type") == "tool_result":
652
- messages.append({
653
- "role": "tool",
654
- "tool_call_id": block.get("tool_use_id", ""),
655
- "content": _extract_text(block.get("content", "")),
656
- })
772
+ messages.append(
773
+ {
774
+ "role": "tool",
775
+ "tool_call_id": block.get("tool_use_id", ""),
776
+ "content": _extract_text(block.get("content", "")),
777
+ }
778
+ )
657
779
  continue
658
780
  if parts:
659
781
  messages.append({"role": role, "content": "\n".join(parts)})
@@ -686,7 +808,77 @@ _AGENTIC_SYSTEM_SUPPLEMENT = (
686
808
  )
687
809
 
688
810
 
689
- def build_openai_request(anthropic_body: dict) -> dict:
811
def _content_fingerprint(content) -> str:
    """Reduce message or system content to a short, stable text fingerprint.

    Args:
        content: A plain string, a list of content blocks (strings or dicts
            carrying a ``"type"`` key), or any other value.

    Returns:
        For a string, its first 512 characters. For a list, the
        newline-joined fingerprints of its recognised blocks (text blocks
        contribute their text, tool_use blocks ``tool:<name>``, tool_result
        blocks ``result:<tool_use_id>``; anything else is skipped), cut to
        1024 characters. For any other value, the first 512 characters of
        ``str(content)``.
    """
    if isinstance(content, str):
        return content[:512]
    if isinstance(content, list):
        parts = []
        for item in content:
            if isinstance(item, str):
                parts.append(item)
            elif isinstance(item, dict):
                btype = item.get("type", "")
                if btype == "text":
                    text = item.get("text", "")
                    # The body comes straight from the client: a present but
                    # null or non-string "text" must not break the join.
                    if text is None:
                        text = ""
                    elif not isinstance(text, str):
                        text = str(text)
                    parts.append(text)
                elif btype == "tool_use":
                    parts.append(f"tool:{item.get('name', '')}")
                elif btype == "tool_result":
                    parts.append(f"result:{item.get('tool_use_id', '')}")
        return "\n".join(parts)[:1024]
    return str(content)[:512]
829
+
830
+
831
def resolve_session_id(request: Request, anthropic_body: dict) -> str:
    """Derive a stable session key for an incoming request.

    Sources are tried in order and the first non-empty one wins:

    1. A session header -> ``"hdr:<value>"``.
    2. A session-like key in the body's ``metadata`` dict -> ``"meta:<value>"``.
    3. A fingerprint of client host, model, system prompt and first user
       message -> ``"fp:<20 hex chars>"``.
    """
    for header_name in (
        "x-uap-session-id",
        "x-claude-session-id",
        "anthropic-session-id",
        "x-session-id",
    ):
        header_value = request.headers.get(header_name)
        if header_value:
            return f"hdr:{header_value}"

    metadata = anthropic_body.get("metadata", {})
    if isinstance(metadata, dict):
        for meta_key in ("session_id", "conversation_id", "thread_id"):
            meta_value = metadata.get(meta_key)
            if meta_value:
                return f"meta:{meta_value}"

    # No explicit id: fall back to hashing properties of the conversation.
    # Only the first user message is fingerprinted.
    first_user = next(
        (
            _content_fingerprint(entry.get("content", ""))
            for entry in anthropic_body.get("messages", [])
            if entry.get("role") == "user"
        ),
        "",
    )
    system_fingerprint = _content_fingerprint(anthropic_body.get("system", ""))
    model = anthropic_body.get("model", "default")
    remote = request.client.host if request.client else "unknown"

    seed = f"{remote}|{model}|{system_fingerprint}|{first_user}"
    hasher = hashlib.sha256(seed.encode("utf-8", errors="ignore"))
    return f"fp:{hasher.hexdigest()[:20]}"
865
+
866
+
867
def _last_user_has_tool_result(anthropic_body: dict) -> bool:
    """Report whether the most recent user message carries a tool_result block.

    Only the last message with role ``"user"`` is examined. The result is
    False when there is no user message or when that message's content is
    not a list of blocks.
    """
    history = anthropic_body.get("messages", [])
    latest_user = next(
        (entry for entry in reversed(history) if entry.get("role") == "user"),
        None,
    )
    if latest_user is None:
        return False

    blocks = latest_user.get("content")
    if not isinstance(blocks, list):
        return False

    for item in blocks:
        if isinstance(item, dict) and item.get("type") == "tool_result":
            return True
    return False
879
+
880
+
881
+ def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
690
882
  """Build an OpenAI Chat Completions request from an Anthropic Messages request."""
691
883
  openai_body = {
692
884
  "model": anthropic_body.get("model", "default"),
@@ -700,10 +892,13 @@ def build_openai_request(anthropic_body: dict) -> dict:
700
892
  openai_body["messages"][0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
701
893
  else:
702
894
  # No system message from the client; inject one.
703
- openai_body["messages"].insert(0, {
704
- "role": "system",
705
- "content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
706
- })
895
+ openai_body["messages"].insert(
896
+ 0,
897
+ {
898
+ "role": "system",
899
+ "content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
900
+ },
901
+ )
707
902
 
708
903
  if "max_tokens" in anthropic_body:
709
904
  # Enforce minimum floor for thinking mode: model needs tokens for
@@ -716,7 +911,7 @@ def build_openai_request(anthropic_body: dict) -> dict:
716
911
  # Formula: max_tokens = min(requested, context_window - input_tokens - safety_margin)
717
912
  # This ensures the model's output + current input stays within bounds,
718
913
  # leaving room for the next turn's incremental growth.
719
- ctx_window = session_monitor.context_window
914
+ ctx_window = monitor.context_window
720
915
  if ctx_window > 0:
721
916
  estimated_input = estimate_total_tokens(anthropic_body)
722
917
  # Reserve 15% of context for next-turn growth (tool results, etc.)
@@ -725,8 +920,11 @@ def build_openai_request(anthropic_body: dict) -> dict:
725
920
  if available_for_output < requested_max and available_for_output > 1024:
726
921
  logger.info(
727
922
  "MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
728
- requested_max, available_for_output,
729
- ctx_window, estimated_input, safety_margin,
923
+ requested_max,
924
+ available_for_output,
925
+ ctx_window,
926
+ estimated_input,
927
+ safety_margin,
730
928
  )
731
929
  requested_max = available_for_output
732
930
  elif available_for_output <= 1024:
@@ -734,7 +932,9 @@ def build_openai_request(anthropic_body: dict) -> dict:
734
932
  logger.warning(
735
933
  "MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
736
934
  "Response may be truncated.",
737
- available_for_output, ctx_window, estimated_input,
935
+ available_for_output,
936
+ ctx_window,
937
+ estimated_input,
738
938
  )
739
939
  requested_max = max(1024, available_for_output)
740
940
 
@@ -750,14 +950,16 @@ def build_openai_request(anthropic_body: dict) -> dict:
750
950
  if "tools" in anthropic_body:
751
951
  openai_body["tools"] = []
752
952
  for tool in anthropic_body["tools"]:
753
- openai_body["tools"].append({
754
- "type": "function",
755
- "function": {
756
- "name": tool["name"],
757
- "description": tool.get("description", ""),
758
- "parameters": tool.get("input_schema", {}),
759
- },
760
- })
953
+ openai_body["tools"].append(
954
+ {
955
+ "type": "function",
956
+ "function": {
957
+ "name": tool["name"],
958
+ "description": tool.get("description", ""),
959
+ "parameters": tool.get("input_schema", {}),
960
+ },
961
+ }
962
+ )
761
963
 
762
964
  # Smart tool_choice: force tool calls during the agentic loop to
763
965
  # prevent the model from producing text-only end_turn responses that
@@ -774,7 +976,8 @@ def build_openai_request(anthropic_body: dict) -> dict:
774
976
  # runaway token consumption.
775
977
  n_msgs = len(anthropic_body.get("messages", []))
776
978
  has_tool_results = any(
777
- isinstance(m.get("content"), list) and any(
979
+ isinstance(m.get("content"), list)
980
+ and any(
778
981
  isinstance(b, dict) and b.get("type") == "tool_result"
779
982
  for b in m.get("content", [])
780
983
  )
@@ -782,28 +985,41 @@ def build_openai_request(anthropic_body: dict) -> dict:
782
985
  )
783
986
 
784
987
  # Record tool calls from the last assistant message for loop detection
785
- _record_last_assistant_tool_calls(anthropic_body)
988
+ _record_last_assistant_tool_calls(anthropic_body, monitor)
989
+ last_user_has_tool_result = _last_user_has_tool_result(anthropic_body)
786
990
 
787
991
  # Check if loop breaker should override tool_choice
788
- if session_monitor.should_release_tool_choice():
992
+ if monitor.should_release_tool_choice():
789
993
  openai_body["tool_choice"] = "auto"
790
- session_monitor.consecutive_forced_count = 0
994
+ monitor.consecutive_forced_count = 0
995
+ monitor.no_progress_streak = 0
791
996
  logger.warning("tool_choice set to 'auto' by LOOP BREAKER")
792
997
  elif _last_assistant_was_text_only(anthropic_body):
793
998
  openai_body["tool_choice"] = "required"
794
- session_monitor.consecutive_forced_count += 1
795
- logger.info("tool_choice forced to 'required' (last assistant was text-only)")
999
+ monitor.consecutive_forced_count += 1
1000
+ monitor.no_progress_streak = (
1001
+ 0 if last_user_has_tool_result else monitor.no_progress_streak + 1
1002
+ )
1003
+ logger.info(
1004
+ "tool_choice forced to 'required' (last assistant was text-only)"
1005
+ )
796
1006
  elif has_tool_results and n_msgs > 2:
797
1007
  openai_body["tool_choice"] = "required"
798
- session_monitor.consecutive_forced_count += 1
799
- logger.info("tool_choice forced to 'required' (active agentic loop with tool results)")
1008
+ monitor.consecutive_forced_count += 1
1009
+ monitor.no_progress_streak = (
1010
+ 0 if last_user_has_tool_result else monitor.no_progress_streak + 1
1011
+ )
1012
+ logger.info(
1013
+ "tool_choice forced to 'required' (active agentic loop with tool results)"
1014
+ )
800
1015
  else:
801
- session_monitor.consecutive_forced_count = 0
1016
+ monitor.consecutive_forced_count = 0
1017
+ monitor.no_progress_streak = 0
802
1018
 
803
1019
  return openai_body
804
1020
 
805
1021
 
806
- def _record_last_assistant_tool_calls(anthropic_body: dict):
1022
+ def _record_last_assistant_tool_calls(anthropic_body: dict, monitor: SessionMonitor):
807
1023
  """Extract tool call names from the last assistant message and record
808
1024
  them in the session monitor for loop detection."""
809
1025
  messages = anthropic_body.get("messages", [])
@@ -818,7 +1034,70 @@ def _record_last_assistant_tool_calls(anthropic_body: dict):
818
1034
  tool_names.append(block.get("name", "unknown"))
819
1035
  break
820
1036
  if tool_names:
821
- session_monitor.record_tool_calls(tool_names)
1037
+ monitor.record_tool_calls(tool_names)
1038
+
1039
+
1040
def _is_unexpected_end_turn(openai_resp: dict, anthropic_body: dict) -> bool:
    """Report whether a response ended the turn without calling a tool
    even though the request looked like an active tool loop.

    Every one of these must hold for the first choice of ``openai_resp``:
    its finish reason is "stop" or "end_turn", it carries no tool calls,
    the request offered tools, and the request either already contains a
    tool_result block or its last assistant message was text-only.
    """
    candidates = openai_resp.get("choices") or []
    if not candidates:
        return False

    first = candidates[0]
    if first.get("finish_reason") not in {"stop", "end_turn"}:
        return False
    if first.get("message", {}).get("tool_calls"):
        return False
    if "tools" not in anthropic_body:
        return False

    # Any tool_result anywhere in the history marks an active tool loop.
    for entry in anthropic_body.get("messages", []):
        blocks = entry.get("content")
        if not isinstance(blocks, list):
            continue
        if any(
            isinstance(item, dict) and item.get("type") == "tool_result"
            for item in blocks
        ):
            return True

    return _last_assistant_was_text_only(anthropic_body)
1067
+
1068
+
1069
def _sanitize_reasoning_fallback_text(reasoning_text: str) -> str:
    """Turn raw reasoning output into a short single-line snippet.

    ``<think>`` / ``</think>`` tags (any letter case) are removed, every
    run of whitespace collapses to one space, and the result is trimmed.
    Text longer than PROXY_STREAM_REASONING_MAX_CHARS is cut at that
    length, right-stripped, and suffixed with "...".

    Returns:
        The cleaned snippet, or "" when nothing but tags and whitespace
        was supplied.
    """
    without_tags = re.sub(r"</?think>", "", reasoning_text, flags=re.IGNORECASE)
    collapsed = re.sub(r"\s+", " ", without_tags).strip()
    if not collapsed:
        return ""

    limit = PROXY_STREAM_REASONING_MAX_CHARS
    if len(collapsed) <= limit:
        return collapsed
    # The ellipsis is appended after the cut, so the result may exceed the limit.
    return collapsed[:limit].rstrip() + "..."
1077
+
1078
+
1079
+ def _build_reasoning_fallback_text(
1080
+ reasoning_chunks: list[str], mode: str | None = None
1081
+ ) -> str | None:
1082
+ fallback_mode = (mode or PROXY_STREAM_REASONING_FALLBACK).strip().lower()
1083
+ if fallback_mode == "off":
1084
+ return None
1085
+
1086
+ raw_text = "".join(reasoning_chunks).strip()
1087
+ if not raw_text:
1088
+ return None
1089
+
1090
+ if fallback_mode == "visible":
1091
+ return raw_text
1092
+ if fallback_mode == "sanitized":
1093
+ sanitized = _sanitize_reasoning_fallback_text(raw_text)
1094
+ return sanitized or None
1095
+
1096
+ logger.warning(
1097
+ "Unknown PROXY_STREAM_REASONING_FALLBACK=%r; disabling reasoning fallback",
1098
+ fallback_mode,
1099
+ )
1100
+ return None
822
1101
 
823
1102
 
824
1103
  def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
@@ -836,11 +1115,14 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
836
1115
  return bool(content.strip())
837
1116
  if isinstance(content, list):
838
1117
  has_tool_use = any(
839
- isinstance(b, dict) and b.get("type") == "tool_use"
840
- for b in content
1118
+ isinstance(b, dict) and b.get("type") == "tool_use" for b in content
841
1119
  )
842
1120
  has_text = any(
843
- (isinstance(b, dict) and b.get("type") == "text" and b.get("text", "").strip())
1121
+ (
1122
+ isinstance(b, dict)
1123
+ and b.get("type") == "text"
1124
+ and b.get("text", "").strip()
1125
+ )
844
1126
  or isinstance(b, str)
845
1127
  for b in content
846
1128
  )
@@ -854,6 +1136,7 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
854
1136
  # Response Translation: OpenAI -> Anthropic
855
1137
  # ===========================================================================
856
1138
 
1139
+
857
1140
  def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
858
1141
  """Convert an OpenAI Chat Completions response to Anthropic Messages format."""
859
1142
  choice = openai_resp.get("choices", [{}])[0]
@@ -871,12 +1154,14 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
871
1154
  args = json.loads(fn.get("arguments", "{}"))
872
1155
  except json.JSONDecodeError:
873
1156
  args = {}
874
- content.append({
875
- "type": "tool_use",
876
- "id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
877
- "name": fn.get("name", ""),
878
- "input": args,
879
- })
1157
+ content.append(
1158
+ {
1159
+ "type": "tool_use",
1160
+ "id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
1161
+ "name": fn.get("name", ""),
1162
+ "input": args,
1163
+ }
1164
+ )
880
1165
 
881
1166
  stop_reason_map = {
882
1167
  "stop": "end_turn",
@@ -906,7 +1191,13 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
906
1191
  # Streaming Translation: OpenAI SSE -> Anthropic SSE
907
1192
  # ===========================================================================
908
1193
 
909
- async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
1194
+
1195
+ async def stream_anthropic_response(
1196
+ openai_stream: httpx.Response,
1197
+ model: str,
1198
+ monitor: SessionMonitor,
1199
+ anthropic_body: dict,
1200
+ ):
910
1201
  """Convert an OpenAI streaming response to Anthropic SSE stream format.
911
1202
 
912
1203
  Handles:
@@ -929,7 +1220,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
929
1220
  f"data: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
930
1221
  )
931
1222
 
932
- yield "event: ping\ndata: {\"type\": \"ping\"}\n\n"
1223
+ yield 'event: ping\ndata: {"type": "ping"}\n\n'
933
1224
 
934
1225
  output_tokens = 0
935
1226
  finish_reason = "end_turn"
@@ -1058,21 +1349,29 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
1058
1349
  f"data: {json.dumps({'type': 'content_block_stop', 'index': tc['block_index']})}\n\n"
1059
1350
  )
1060
1351
  else:
1061
- # Option E: If the response has no text AND no tool calls, but the
1062
- # model produced reasoning_content, forward the reasoning as visible
1063
- # text so the client doesn't receive a completely empty turn.
1352
+ # If the response has no text and no tool calls, optionally emit a
1353
+ # reasoning fallback (configurable) to avoid leaking malformed
1354
+ # internal chain-of-thought content by default.
1064
1355
  accumulated_text = "".join(text_chunks)
1065
1356
  if not accumulated_text and reasoning_chunks:
1066
- fallback_text = "".join(reasoning_chunks)
1067
- logger.warning(
1068
- "Empty response with %d reasoning tokens – forwarding reasoning as fallback text",
1069
- len(reasoning_chunks),
1070
- )
1071
- text_chunks.append(fallback_text)
1072
- yield (
1073
- f"event: content_block_delta\n"
1074
- f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
1075
- )
1357
+ fallback_text = _build_reasoning_fallback_text(reasoning_chunks)
1358
+ if fallback_text:
1359
+ logger.warning(
1360
+ "Empty response with %d reasoning chunks – emitting fallback text (mode=%s)",
1361
+ len(reasoning_chunks),
1362
+ PROXY_STREAM_REASONING_FALLBACK,
1363
+ )
1364
+ text_chunks.append(fallback_text)
1365
+ yield (
1366
+ f"event: content_block_delta\n"
1367
+ f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
1368
+ )
1369
+ else:
1370
+ logger.warning(
1371
+ "Empty response with %d reasoning chunks – fallback suppressed (mode=%s)",
1372
+ len(reasoning_chunks),
1373
+ PROXY_STREAM_REASONING_FALLBACK,
1374
+ )
1076
1375
 
1077
1376
  yield (
1078
1377
  f"event: content_block_stop\n"
@@ -1081,17 +1380,52 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
1081
1380
 
1082
1381
  # Log response summary
1083
1382
  accumulated_text = "".join(text_chunks)
1084
- tc_names = [tc["name"] for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
1085
- tc_args = [tc.get("arguments", "") for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
1383
+ tc_names = (
1384
+ [tc["name"] for tc in tool_calls_by_index.values()]
1385
+ if tool_calls_by_index
1386
+ else []
1387
+ )
1388
+ tc_args = (
1389
+ [tc.get("arguments", "") for tc in tool_calls_by_index.values()]
1390
+ if tool_calls_by_index
1391
+ else []
1392
+ )
1086
1393
  logger.info(
1087
1394
  "RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
1088
- finish_reason, output_tokens,
1395
+ finish_reason,
1396
+ output_tokens,
1089
1397
  len(accumulated_text),
1090
1398
  accumulated_text[:300],
1091
1399
  tc_names,
1092
1400
  [a[:200] for a in tc_args],
1093
1401
  )
1094
1402
 
1403
+ if _is_unexpected_end_turn(
1404
+ {
1405
+ "choices": [
1406
+ {
1407
+ "finish_reason": "stop"
1408
+ if finish_reason == "end_turn"
1409
+ else finish_reason,
1410
+ "message": {
1411
+ "content": accumulated_text,
1412
+ "tool_calls": [
1413
+ {
1414
+ "function": {
1415
+ "name": tc["name"],
1416
+ "arguments": tc.get("arguments", ""),
1417
+ }
1418
+ }
1419
+ for tc in tool_calls_by_index.values()
1420
+ ],
1421
+ },
1422
+ }
1423
+ ]
1424
+ },
1425
+ anthropic_body,
1426
+ ):
1427
+ monitor.unexpected_end_turn_count += 1
1428
+
1095
1429
  # message_delta with final stop reason
1096
1430
  yield (
1097
1431
  f"event: message_delta\n"
@@ -1106,6 +1440,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
1106
1440
  # API Endpoints
1107
1441
  # ===========================================================================
1108
1442
 
1443
+
1109
1444
  @app.post("/v1/messages")
1110
1445
  async def messages(request: Request):
1111
1446
  """Handle Anthropic Messages API requests (streaming and non-streaming).
@@ -1116,9 +1451,14 @@ async def messages(request: Request):
1116
1451
  - Option E: Smart max_tokens capping (in build_openai_request)
1117
1452
  - Option F: Session-level token monitoring with warnings
1118
1453
  """
1454
+ global last_session_id
1455
+
1119
1456
  body = await request.json()
1120
1457
  model = body.get("model", "default")
1121
1458
  is_stream = body.get("stream", False)
1459
+ session_id = resolve_session_id(request, body)
1460
+ monitor = get_session_monitor(session_id)
1461
+ last_session_id = session_id
1122
1462
 
1123
1463
  # Debug: log request summary
1124
1464
  n_messages = len(body.get("messages", []))
@@ -1128,42 +1468,51 @@ async def messages(request: Request):
1128
1468
  last_role = last_msg.get("role", "?")
1129
1469
  last_content = last_msg.get("content", "")
1130
1470
  if isinstance(last_content, list):
1131
- last_text = next((b.get("text", "") for b in last_content if b.get("type") == "text"), "")[:200]
1471
+ last_text = next(
1472
+ (b.get("text", "") for b in last_content if b.get("type") == "text"), ""
1473
+ )[:200]
1132
1474
  elif isinstance(last_content, str):
1133
1475
  last_text = last_content[:200]
1134
1476
  else:
1135
1477
  last_text = str(last_content)[:200]
1136
1478
  logger.info(
1137
1479
  "REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
1138
- is_stream, n_messages, n_tools, max_tokens, last_role, last_text
1480
+ is_stream,
1481
+ n_messages,
1482
+ n_tools,
1483
+ max_tokens,
1484
+ last_role,
1485
+ last_text,
1139
1486
  )
1140
1487
 
1141
1488
  # --- Option F: Estimate tokens and record in session monitor ---
1142
1489
  estimated_tokens = estimate_total_tokens(body)
1143
- session_monitor.record_request(estimated_tokens)
1144
- session_monitor.log_status()
1490
+ monitor.record_request(estimated_tokens)
1491
+ monitor.log_status()
1145
1492
 
1146
1493
  # --- Option C: Prune conversation if approaching context limit ---
1147
- ctx_window = session_monitor.context_window
1494
+ ctx_window = monitor.context_window
1148
1495
  if ctx_window > 0:
1149
1496
  utilization = estimated_tokens / ctx_window
1150
1497
  if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
1151
1498
  logger.warning(
1152
1499
  "Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
1153
- utilization * 100, PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
1500
+ utilization * 100,
1501
+ PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
1154
1502
  )
1155
1503
  body = prune_conversation(body, ctx_window, target_fraction=0.65)
1156
- session_monitor.prune_count += 1
1504
+ monitor.prune_count += 1
1157
1505
  # Re-estimate after pruning
1158
1506
  estimated_tokens = estimate_total_tokens(body)
1159
- session_monitor.record_request(estimated_tokens)
1507
+ monitor.record_request(estimated_tokens)
1160
1508
  n_messages = len(body.get("messages", []))
1161
1509
  logger.info(
1162
1510
  "After pruning: ~%d tokens, %d messages",
1163
- estimated_tokens, n_messages,
1511
+ estimated_tokens,
1512
+ n_messages,
1164
1513
  )
1165
1514
 
1166
- openai_body = build_openai_request(body)
1515
+ openai_body = build_openai_request(body, monitor)
1167
1516
 
1168
1517
  client = http_client
1169
1518
  if client is None:
@@ -1181,6 +1530,7 @@ async def messages(request: Request):
1181
1530
  MAX_UPSTREAM_RETRIES = 3
1182
1531
  RETRY_DELAY_SECS = 5.0
1183
1532
  last_exc: Exception | None = None
1533
+ resp: httpx.Response | None = None
1184
1534
 
1185
1535
  for attempt in range(MAX_UPSTREAM_RETRIES):
1186
1536
  try:
@@ -1201,25 +1551,46 @@ async def messages(request: Request):
1201
1551
  if attempt < MAX_UPSTREAM_RETRIES - 1:
1202
1552
  logger.warning(
1203
1553
  "Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
1204
- attempt + 1, MAX_UPSTREAM_RETRIES,
1205
- type(exc).__name__, RETRY_DELAY_SECS,
1554
+ attempt + 1,
1555
+ MAX_UPSTREAM_RETRIES,
1556
+ type(exc).__name__,
1557
+ RETRY_DELAY_SECS,
1206
1558
  )
1207
1559
  await asyncio.sleep(RETRY_DELAY_SECS)
1208
1560
  else:
1209
1561
  logger.error(
1210
1562
  "Upstream connect failed after %d attempts: %s: %s",
1211
- MAX_UPSTREAM_RETRIES, type(exc).__name__, exc,
1563
+ MAX_UPSTREAM_RETRIES,
1564
+ type(exc).__name__,
1565
+ exc,
1212
1566
  )
1213
1567
 
1214
1568
  if last_exc is not None:
1215
1569
  return Response(
1216
- content=json.dumps({
1217
- "type": "error",
1218
- "error": {
1219
- "type": "overloaded_error",
1220
- "message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
1221
- },
1222
- }),
1570
+ content=json.dumps(
1571
+ {
1572
+ "type": "error",
1573
+ "error": {
1574
+ "type": "overloaded_error",
1575
+ "message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
1576
+ },
1577
+ }
1578
+ ),
1579
+ status_code=529,
1580
+ media_type="application/json",
1581
+ )
1582
+
1583
+ if resp is None:
1584
+ return Response(
1585
+ content=json.dumps(
1586
+ {
1587
+ "type": "error",
1588
+ "error": {
1589
+ "type": "overloaded_error",
1590
+ "message": "Upstream response unavailable",
1591
+ },
1592
+ }
1593
+ ),
1223
1594
  status_code=529,
1224
1595
  media_type="application/json",
1225
1596
  )
@@ -1232,9 +1603,7 @@ async def messages(request: Request):
1232
1603
  error_body = await resp.aread()
1233
1604
  await resp.aclose()
1234
1605
  error_text = error_body.decode("utf-8", errors="replace")[:1000]
1235
- logger.error(
1236
- "Upstream HTTP %d: %s", resp.status_code, error_text
1237
- )
1606
+ logger.error("Upstream HTTP %d: %s", resp.status_code, error_text)
1238
1607
 
1239
1608
  # Parse the error for a user-friendly message
1240
1609
  error_message = f"Upstream server error (HTTP {resp.status_code})"
@@ -1257,47 +1626,57 @@ async def messages(request: Request):
1257
1626
  )
1258
1627
 
1259
1628
  if is_context_overflow:
1260
- session_monitor.overflow_count += 1
1629
+ monitor.overflow_count += 1
1261
1630
  logger.error(
1262
1631
  "CONTEXT OVERFLOW detected (count=%d). "
1263
1632
  "Estimated input: %d tokens, context window: %d tokens. "
1264
1633
  "Conversation needs pruning or context window increase.",
1265
- session_monitor.overflow_count, estimated_tokens, ctx_window,
1634
+ monitor.overflow_count,
1635
+ estimated_tokens,
1636
+ ctx_window,
1266
1637
  )
1267
1638
  # Return Anthropic-format error that Claude Code can handle
1268
1639
  return Response(
1269
- content=json.dumps({
1270
- "type": "error",
1271
- "error": {
1272
- "type": "overloaded_error",
1273
- "message": (
1274
- f"Context window exceeded: request requires ~{estimated_tokens} tokens "
1275
- f"but only {ctx_window} are available. "
1276
- f"The conversation is too long. Please start a new session or "
1277
- f"reduce conversation length."
1278
- ),
1279
- },
1280
- }),
1640
+ content=json.dumps(
1641
+ {
1642
+ "type": "error",
1643
+ "error": {
1644
+ "type": "overloaded_error",
1645
+ "message": (
1646
+ f"Context window exceeded: request requires ~{estimated_tokens} tokens "
1647
+ f"but only {ctx_window} are available. "
1648
+ f"The conversation is too long. Please start a new session or "
1649
+ f"reduce conversation length."
1650
+ ),
1651
+ },
1652
+ }
1653
+ ),
1281
1654
  status_code=529,
1282
1655
  media_type="application/json",
1283
1656
  )
1284
1657
 
1285
1658
  # Generic upstream error -- return as Anthropic error format
1286
- error_type = "overloaded_error" if resp.status_code >= 500 else "invalid_request_error"
1659
+ error_type = (
1660
+ "overloaded_error"
1661
+ if resp.status_code >= 500
1662
+ else "invalid_request_error"
1663
+ )
1287
1664
  return Response(
1288
- content=json.dumps({
1289
- "type": "error",
1290
- "error": {
1291
- "type": error_type,
1292
- "message": error_message,
1293
- },
1294
- }),
1665
+ content=json.dumps(
1666
+ {
1667
+ "type": "error",
1668
+ "error": {
1669
+ "type": error_type,
1670
+ "message": error_message,
1671
+ },
1672
+ }
1673
+ ),
1295
1674
  status_code=529 if resp.status_code >= 500 else 400,
1296
1675
  media_type="application/json",
1297
1676
  )
1298
1677
 
1299
1678
  return StreamingResponse(
1300
- stream_anthropic_response(resp, model),
1679
+ stream_anthropic_response(resp, model, monitor, body),
1301
1680
  media_type="text/event-stream",
1302
1681
  headers={
1303
1682
  "Cache-Control": "no-cache",
@@ -1314,25 +1693,56 @@ async def messages(request: Request):
1314
1693
  # Option B: Handle non-streaming errors too
1315
1694
  if resp.status_code != 200:
1316
1695
  error_text = resp.text[:1000]
1317
- logger.error("Upstream HTTP %d (non-stream): %s", resp.status_code, error_text)
1696
+ logger.error(
1697
+ "Upstream HTTP %d (non-stream): %s", resp.status_code, error_text
1698
+ )
1318
1699
  return Response(
1319
- content=json.dumps({
1320
- "type": "error",
1321
- "error": {
1322
- "type": "overloaded_error",
1323
- "message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
1324
- },
1325
- }),
1700
+ content=json.dumps(
1701
+ {
1702
+ "type": "error",
1703
+ "error": {
1704
+ "type": "overloaded_error",
1705
+ "message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
1706
+ },
1707
+ }
1708
+ ),
1326
1709
  status_code=529,
1327
1710
  media_type="application/json",
1328
1711
  )
1329
1712
 
1330
1713
  openai_resp = resp.json()
1714
+
1715
+ if PROXY_GUARDRAIL_RETRY and _is_unexpected_end_turn(openai_resp, body):
1716
+ monitor.unexpected_end_turn_count += 1
1717
+ logger.warning(
1718
+ "GUARDRAIL: unexpected end_turn without tool_use in active loop (session=%s), retrying once with tool_choice=required",
1719
+ session_id,
1720
+ )
1721
+
1722
+ retry_body = dict(openai_body)
1723
+ retry_body["tool_choice"] = "required"
1724
+ retry_body["stream"] = False
1725
+
1726
+ retry_resp = await client.post(
1727
+ f"{LLAMA_CPP_BASE}/chat/completions",
1728
+ json=retry_body,
1729
+ headers={"Content-Type": "application/json"},
1730
+ )
1731
+ if retry_resp.status_code == 200:
1732
+ retry_json = retry_resp.json()
1733
+ retry_choice = (retry_json.get("choices") or [{}])[0]
1734
+ retry_message = retry_choice.get("message", {})
1735
+ if retry_message.get("tool_calls"):
1736
+ openai_resp = retry_json
1737
+ logger.info(
1738
+ "GUARDRAIL: retry produced tool_use; using retried response"
1739
+ )
1740
+
1331
1741
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
1332
1742
 
1333
1743
  # Track output tokens in session monitor
1334
1744
  output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
1335
- session_monitor.record_response(output_tokens)
1745
+ monitor.record_response(output_tokens)
1336
1746
 
1337
1747
  return anthropic_resp
1338
1748
 
@@ -1377,37 +1787,49 @@ async def health():
1377
1787
 
1378
1788
 
1379
1789
  @app.get("/v1/context")
1380
- async def context_status():
1790
+ async def context_status(request: Request):
1381
1791
  """Option F: Context window monitoring endpoint.
1382
1792
 
1383
1793
  Returns current session token usage, utilization, warnings, and
1384
1794
  estimated remaining turns. Useful for dashboards and debugging.
1385
1795
  """
1386
- warning = session_monitor.get_warning_level()
1387
- turns = session_monitor.estimate_turns_remaining()
1796
+ requested_session = request.query_params.get("session_id", "")
1797
+ session_id = requested_session or last_session_id
1798
+ monitor = session_monitors.get(session_id) if session_id else None
1799
+
1800
+ if monitor is None:
1801
+ monitor = SessionMonitor(context_window=default_context_window)
1802
+
1803
+ warning = monitor.get_warning_level()
1804
+ turns = monitor.estimate_turns_remaining()
1388
1805
 
1389
1806
  return {
1390
- "context_window": session_monitor.context_window,
1391
- "last_input_tokens": session_monitor.last_input_tokens,
1392
- "last_output_tokens": session_monitor.last_output_tokens,
1393
- "peak_input_tokens": session_monitor.peak_input_tokens,
1394
- "utilization": round(session_monitor.get_utilization(), 4),
1395
- "utilization_pct": f"{session_monitor.get_utilization() * 100:.1f}%",
1807
+ "active_session_id": session_id,
1808
+ "session_count": len(session_monitors),
1809
+ "context_window": monitor.context_window,
1810
+ "last_input_tokens": monitor.last_input_tokens,
1811
+ "last_output_tokens": monitor.last_output_tokens,
1812
+ "peak_input_tokens": monitor.peak_input_tokens,
1813
+ "utilization": round(monitor.get_utilization(), 4),
1814
+ "utilization_pct": f"{monitor.get_utilization() * 100:.1f}%",
1396
1815
  "warning_level": warning,
1397
1816
  "estimated_turns_remaining": turns,
1398
- "total_requests": session_monitor.total_requests,
1399
- "prune_count": session_monitor.prune_count,
1400
- "overflow_count": session_monitor.overflow_count,
1817
+ "total_requests": monitor.total_requests,
1818
+ "prune_count": monitor.prune_count,
1819
+ "overflow_count": monitor.overflow_count,
1401
1820
  "prune_threshold": PROXY_CONTEXT_PRUNE_THRESHOLD,
1402
- "recent_history": session_monitor.context_history[-10:],
1821
+ "recent_history": monitor.context_history[-10:],
1403
1822
  # Loop protection stats
1404
1823
  "loop_protection": {
1405
- "consecutive_forced_count": session_monitor.consecutive_forced_count,
1406
- "loop_warnings_emitted": session_monitor.loop_warnings_emitted,
1407
- "tool_call_history_len": len(session_monitor.tool_call_history),
1408
- "is_looping": session_monitor.detect_tool_loop()[0],
1409
- "loop_repeat_count": session_monitor.detect_tool_loop()[1],
1410
- "recent_tool_patterns": session_monitor.tool_call_history[-5:],
1824
+ "enabled": PROXY_LOOP_BREAKER,
1825
+ "consecutive_forced_count": monitor.consecutive_forced_count,
1826
+ "no_progress_streak": monitor.no_progress_streak,
1827
+ "loop_warnings_emitted": monitor.loop_warnings_emitted,
1828
+ "unexpected_end_turn_count": monitor.unexpected_end_turn_count,
1829
+ "tool_call_history_len": len(monitor.tool_call_history),
1830
+ "is_looping": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[0],
1831
+ "loop_repeat_count": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[1],
1832
+ "recent_tool_patterns": monitor.tool_call_history[-5:],
1411
1833
  },
1412
1834
  }
1413
1835