@miller-tech/uap 1.13.12 → 1.13.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/.tsbuildinfo +1 -1
  2. package/dist/benchmarks/speculative-autotune.d.ts +46 -0
  3. package/dist/benchmarks/speculative-autotune.d.ts.map +1 -0
  4. package/dist/benchmarks/speculative-autotune.js +145 -0
  5. package/dist/benchmarks/speculative-autotune.js.map +1 -0
  6. package/dist/benchmarks/token-throughput.d.ts +46 -46
  7. package/dist/bin/cli.js +2 -0
  8. package/dist/bin/cli.js.map +1 -1
  9. package/dist/bin/llama-server-optimize.js +176 -0
  10. package/dist/bin/llama-server-optimize.js.map +1 -1
  11. package/dist/bin/policy.js +0 -0
  12. package/dist/cli/hooks.js +1 -0
  13. package/dist/cli/hooks.js.map +1 -1
  14. package/dist/cli/init.d.ts +1 -0
  15. package/dist/cli/init.d.ts.map +1 -1
  16. package/dist/cli/init.js +18 -0
  17. package/dist/cli/init.js.map +1 -1
  18. package/dist/cli/setup.d.ts +1 -0
  19. package/dist/cli/setup.d.ts.map +1 -1
  20. package/dist/cli/setup.js +1 -0
  21. package/dist/cli/setup.js.map +1 -1
  22. package/dist/cli/systemd-services.d.ts +12 -0
  23. package/dist/cli/systemd-services.d.ts.map +1 -0
  24. package/dist/cli/systemd-services.js +179 -0
  25. package/dist/cli/systemd-services.js.map +1 -0
  26. package/dist/models/types.d.ts +12 -12
  27. package/dist/policies/schemas/policy.d.ts +12 -12
  28. package/dist/types/config.d.ts +24 -24
  29. package/docs/deployment/QWEN35_LLAMA_CPP.md +49 -0
  30. package/docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md +279 -0
  31. package/package.json +1 -1
  32. package/templates/hooks/loop-protection.sh +250 -0
  33. package/templates/hooks/post-compact.sh +14 -0
  34. package/templates/hooks/post-tool-use-edit-write.sh +15 -0
  35. package/templates/hooks/pre-compact.sh +9 -0
  36. package/templates/hooks/pre-tool-use-bash.sh +6 -0
  37. package/templates/hooks/pre-tool-use-edit-write.sh +10 -0
  38. package/templates/hooks/session-start.sh +64 -44
  39. package/templates/hooks/stop.sh +9 -0
  40. package/tools/agents/scripts/anthropic_proxy.py +716 -166
  41. package/tools/agents/tests/test_anthropic_proxy_streaming.py +51 -0
  42. package/tools/agents/scripts/__pycache__/anthropic_proxy.cpython-313.pyc +0 -0
  43. package/tools/agents/scripts/__pycache__/tool_call_wrapper.cpython-313.pyc +0 -0
@@ -76,9 +76,11 @@ Dependencies
76
76
  """
77
77
 
78
78
  import asyncio
79
+ import hashlib
79
80
  import json
80
81
  import logging
81
82
  import os
83
+ import re
82
84
  import sys
83
85
  import time
84
86
  import uuid
@@ -100,7 +102,35 @@ PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
100
102
  PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
101
103
  PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
102
104
  PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
103
- PROXY_CONTEXT_PRUNE_THRESHOLD = float(os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75"))
105
+ PROXY_CONTEXT_PRUNE_THRESHOLD = float(
106
+ os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75")
107
+ )
108
+ PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
109
+ "0",
110
+ "false",
111
+ "off",
112
+ "no",
113
+ }
114
+ PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
115
+ PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "8"))
116
+ PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
117
+ PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "4"))
118
+ PROXY_CONTEXT_RELEASE_THRESHOLD = float(
119
+ os.environ.get("PROXY_CONTEXT_RELEASE_THRESHOLD", "0.90")
120
+ )
121
+ PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() not in {
122
+ "0",
123
+ "false",
124
+ "off",
125
+ "no",
126
+ }
127
+ PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
128
+ PROXY_STREAM_REASONING_FALLBACK = (
129
+ os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
130
+ )
131
+ PROXY_STREAM_REASONING_MAX_CHARS = int(
132
+ os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
133
+ )
104
134
 
105
135
  # ---------------------------------------------------------------------------
106
136
  # Logging
@@ -121,15 +151,27 @@ class SessionMonitor:
121
151
  """Tracks token usage across the session to provide early warnings
122
152
  and enable proactive context management before overflow occurs."""
123
153
 
124
- context_window: int = 0 # Auto-detected or configured
154
+ context_window: int = 0 # Auto-detected or configured
125
155
  total_requests: int = 0
126
- last_input_tokens: int = 0 # Estimated input tokens of last request
127
- last_output_tokens: int = 0 # Actual output tokens of last response
128
- peak_input_tokens: int = 0 # High-water mark
129
- prune_count: int = 0 # How many times pruning was triggered
130
- overflow_count: int = 0 # How many context overflow errors caught
156
+ last_input_tokens: int = 0 # Estimated input tokens of last request
157
+ last_output_tokens: int = 0 # Actual output tokens of last response
158
+ peak_input_tokens: int = 0 # High-water mark
159
+ prune_count: int = 0 # How many times pruning was triggered
160
+ overflow_count: int = 0 # How many context overflow errors caught
131
161
  context_history: list = field(default_factory=list) # Recent token counts
132
162
 
163
+ # --- Token Loop Protection ---
164
+ tool_call_history: list = field(
165
+ default_factory=list
166
+ ) # Recent tool call fingerprints
167
+ consecutive_forced_count: int = (
168
+ 0 # How many times tool_choice was forced consecutively
169
+ )
170
+ loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
171
+ no_progress_streak: int = 0 # Forced tool turns without new tool_result
172
+ unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
173
+ last_seen_ts: float = 0.0
174
+
133
175
  def record_request(self, estimated_tokens: int):
134
176
  """Record an outgoing request's estimated token count."""
135
177
  self.total_requests += 1
@@ -145,6 +187,9 @@ class SessionMonitor:
145
187
  """Record a response's output token count."""
146
188
  self.last_output_tokens = output_tokens
147
189
 
190
+ def touch(self):
191
+ self.last_seen_ts = time.time()
192
+
148
193
  def get_utilization(self) -> float:
149
194
  """Get current context utilization as a fraction (0.0 - 1.0)."""
150
195
  if self.context_window <= 0:
@@ -191,30 +236,160 @@ class SessionMonitor:
191
236
  if warning == "CRITICAL":
192
237
  logger.error(
193
238
  "CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
194
- self.last_input_tokens, self.context_window, util * 100,
195
- turns_str, self.prune_count, self.overflow_count,
239
+ self.last_input_tokens,
240
+ self.context_window,
241
+ util * 100,
242
+ turns_str,
243
+ self.prune_count,
244
+ self.overflow_count,
196
245
  )
197
246
  elif warning == "HIGH":
198
247
  logger.warning(
199
248
  "CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
200
- self.last_input_tokens, self.context_window, util * 100,
201
- turns_str, self.prune_count,
249
+ self.last_input_tokens,
250
+ self.context_window,
251
+ util * 100,
252
+ turns_str,
253
+ self.prune_count,
202
254
  )
203
255
  elif warning == "ELEVATED":
204
256
  logger.warning(
205
257
  "CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
206
- self.last_input_tokens, self.context_window, util * 100,
258
+ self.last_input_tokens,
259
+ self.context_window,
260
+ util * 100,
207
261
  turns_str,
208
262
  )
209
263
  else:
210
264
  logger.info(
211
265
  "CONTEXT: %d/%d tokens (%.1f%%), %s",
212
- self.last_input_tokens, self.context_window, util * 100,
266
+ self.last_input_tokens,
267
+ self.context_window,
268
+ util * 100,
213
269
  turns_str,
214
270
  )
215
271
 
272
+ # --- Token Loop Protection Methods ---
273
+
274
+ def record_tool_calls(self, tool_names: list[str]):
275
+ """Record tool call names for loop detection."""
276
+ fingerprint = "|".join(sorted(tool_names)) if tool_names else ""
277
+ self.tool_call_history.append(fingerprint)
278
+ # Keep last 30 entries
279
+ if len(self.tool_call_history) > 30:
280
+ self.tool_call_history = self.tool_call_history[-30:]
281
+
282
+ def detect_tool_loop(self, window: int = 6) -> tuple[bool, int]:
283
+ """Detect if the model is stuck in a tool call loop.
284
+
285
+ Checks if the last `window` tool call fingerprints are identical.
286
+ Returns (is_looping, repeat_count).
287
+ """
288
+ if len(self.tool_call_history) < window:
289
+ return False, 0
290
+
291
+ recent = self.tool_call_history[-window:]
292
+ if not recent[0]:
293
+ return False, 0
294
+
295
+ # Check if all recent entries are the same fingerprint
296
+ if all(fp == recent[0] for fp in recent):
297
+ # Count total consecutive repeats from the end
298
+ count = 0
299
+ target = recent[0]
300
+ for fp in reversed(self.tool_call_history):
301
+ if fp == target:
302
+ count += 1
303
+ else:
304
+ break
305
+ return True, count
306
+
307
+ return False, 0
308
+
309
+ def should_release_tool_choice(self) -> bool:
310
+ """Determine if tool_choice should be relaxed to 'auto' to break a loop.
311
+
312
+ Returns True if the model appears stuck and forcing tool_choice=required
313
+ is making it worse. Thresholds:
314
+ - 8+ consecutive forced requests with same tool pattern -> release
315
+ - 15+ consecutive forced requests regardless -> release
316
+ - Context utilization > 90% -> release (let model wrap up)
317
+ """
318
+ if not PROXY_LOOP_BREAKER:
319
+ return False
320
+
321
+ is_looping, repeat_count = self.detect_tool_loop(window=PROXY_LOOP_WINDOW)
322
+
323
+ # Pattern 1: Detected tool call loop
324
+ if (
325
+ is_looping
326
+ and repeat_count >= PROXY_LOOP_REPEAT_THRESHOLD
327
+ and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
328
+ ):
329
+ logger.warning(
330
+ "LOOP BREAKER: Same tool pattern repeated %d times with no progress streak=%d. "
331
+ "Releasing tool_choice to 'auto'.",
332
+ repeat_count,
333
+ self.no_progress_streak,
334
+ )
335
+ self.loop_warnings_emitted += 1
336
+ return True
337
+
338
+ # Pattern 2: Too many consecutive forced requests
339
+ if (
340
+ self.consecutive_forced_count >= PROXY_FORCED_THRESHOLD
341
+ and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
342
+ ):
343
+ logger.warning(
344
+ "LOOP BREAKER: %d consecutive forced tool_choice requests with no progress streak=%d. "
345
+ "Releasing to 'auto'.",
346
+ self.consecutive_forced_count,
347
+ self.no_progress_streak,
348
+ )
349
+ self.loop_warnings_emitted += 1
350
+ return True
351
+
352
+ # Pattern 3: Context almost full -- let model wrap up naturally
353
+ if self.get_utilization() >= PROXY_CONTEXT_RELEASE_THRESHOLD:
354
+ logger.warning(
355
+ "LOOP BREAKER: Context utilization %.1f%% -- releasing "
356
+ "tool_choice to let model wrap up.",
357
+ self.get_utilization() * 100,
358
+ )
359
+ return True
360
+
361
+ return False
362
+
363
+
364
+ session_monitors: dict[str, SessionMonitor] = {}
365
+ default_context_window = 0
366
+ last_session_id = ""
367
+
216
368
 
217
- session_monitor = SessionMonitor()
369
+ def _cleanup_stale_monitors(now_ts: float) -> None:
370
+ stale = [
371
+ sid
372
+ for sid, mon in session_monitors.items()
373
+ if mon.last_seen_ts > 0 and now_ts - mon.last_seen_ts > PROXY_SESSION_TTL_SECS
374
+ ]
375
+ for sid in stale:
376
+ session_monitors.pop(sid, None)
377
+
378
+
379
+ def get_session_monitor(session_id: str) -> SessionMonitor:
380
+ now_ts = time.time()
381
+ _cleanup_stale_monitors(now_ts)
382
+
383
+ monitor = session_monitors.get(session_id)
384
+ if monitor is None:
385
+ monitor = SessionMonitor(context_window=default_context_window)
386
+ session_monitors[session_id] = monitor
387
+
388
+ monitor.touch()
389
+ if monitor.context_window <= 0:
390
+ monitor.context_window = default_context_window
391
+
392
+ return monitor
218
393
 
219
394
 
220
395
  # ---------------------------------------------------------------------------
@@ -240,7 +415,8 @@ async def detect_context_window(client: httpx.AsyncClient) -> int:
240
415
  if n_ctx > 0:
241
416
  logger.info(
242
417
  "Auto-detected context window from upstream: %d tokens (%d slots)",
243
- n_ctx, len(slots),
418
+ n_ctx,
419
+ len(slots),
244
420
  )
245
421
  return n_ctx
246
422
  except Exception as exc:
@@ -314,7 +490,9 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
314
490
  return tokens
315
491
 
316
492
 
317
- def prune_conversation(anthropic_body: dict, context_window: int, target_fraction: float = 0.65) -> dict:
493
+ def prune_conversation(
494
+ anthropic_body: dict, context_window: int, target_fraction: float = 0.65
495
+ ) -> dict:
318
496
  """Prune the conversation to fit within the context window.
319
497
 
320
498
  Strategy:
@@ -361,19 +539,24 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
361
539
 
362
540
  # Always keep the first user message and the last N messages
363
541
  KEEP_LAST = 8 # Keep the last 8 messages (recent context)
364
- protected_head = messages[:1] # First user message
365
- protected_tail = messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
542
+ protected_head = messages[:1] # First user message
543
+ protected_tail = (
544
+ messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
545
+ )
366
546
  middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []
367
547
 
368
548
  # Calculate tokens for protected messages
369
- protected_tokens = sum(estimate_message_tokens(m) for m in protected_head + protected_tail)
549
+ protected_tokens = sum(
550
+ estimate_message_tokens(m) for m in protected_head + protected_tail
551
+ )
370
552
 
371
553
  if protected_tokens >= message_budget:
372
554
  # Even protected messages exceed budget -- truncate tool_result content
373
555
  # in the tail to fit
374
556
  logger.warning(
375
557
  "Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
376
- protected_tokens, message_budget,
558
+ protected_tokens,
559
+ message_budget,
377
560
  )
378
561
  for msg in protected_tail:
379
562
  content = msg.get("content", [])
@@ -382,7 +565,11 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
382
565
  if isinstance(block, dict) and block.get("type") == "tool_result":
383
566
  result_text = _extract_text(block.get("content", ""))
384
567
  if len(result_text) > 2000:
385
- block["content"] = result_text[:1000] + "\n...[TRUNCATED]...\n" + result_text[-500:]
568
+ block["content"] = (
569
+ result_text[:1000]
570
+ + "\n...[TRUNCATED]...\n"
571
+ + result_text[-500:]
572
+ )
386
573
  anthropic_body["messages"] = protected_head + protected_tail
387
574
  return anthropic_body
388
575
 
@@ -402,8 +589,7 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
402
589
 
403
590
  if isinstance(content, list):
404
591
  is_tool_result = any(
405
- isinstance(b, dict) and b.get("type") == "tool_result"
406
- for b in content
592
+ isinstance(b, dict) and b.get("type") == "tool_result" for b in content
407
593
  )
408
594
 
409
595
  # Lower priority = removed first
@@ -445,12 +631,17 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
445
631
  f"The conversation continues from recent context below.]"
446
632
  ),
447
633
  }
448
- anthropic_body["messages"] = protected_head + [prune_marker] + kept_msgs + protected_tail
634
+ anthropic_body["messages"] = (
635
+ protected_head + [prune_marker] + kept_msgs + protected_tail
636
+ )
449
637
  logger.warning(
450
638
  "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
451
639
  "target=%.0f%% of %d ctx",
452
- removed_count, removed_tokens, len(anthropic_body["messages"]),
453
- target_fraction * 100, context_window,
640
+ removed_count,
641
+ removed_tokens,
642
+ len(anthropic_body["messages"]),
643
+ target_fraction * 100,
644
+ context_window,
454
645
  )
455
646
  else:
456
647
  anthropic_body["messages"] = protected_head + kept_msgs + protected_tail
@@ -470,12 +661,13 @@ http_client: httpx.AsyncClient | None = None
470
661
  async def lifespan(app: FastAPI):
471
662
  """Manage the httpx client lifecycle with the FastAPI app."""
472
663
  global http_client
664
+ global default_context_window
473
665
  http_client = httpx.AsyncClient(
474
666
  timeout=httpx.Timeout(
475
- connect=10.0, # 10s to establish connection
476
- read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
477
- write=30.0, # 30s to send the request body
478
- pool=10.0, # 10s to acquire a pool connection
667
+ connect=10.0, # 10s to establish connection
668
+ read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
669
+ write=30.0, # 30s to send the request body
670
+ pool=10.0, # 10s to acquire a pool connection
479
671
  ),
480
672
  limits=httpx.Limits(
481
673
  max_connections=PROXY_MAX_CONNECTIONS,
@@ -485,14 +677,19 @@ async def lifespan(app: FastAPI):
485
677
  )
486
678
  logger.info(
487
679
  "Proxy started: listening on %s:%d -> upstream %s",
488
- PROXY_HOST, PROXY_PORT, LLAMA_CPP_BASE,
680
+ PROXY_HOST,
681
+ PROXY_PORT,
682
+ LLAMA_CPP_BASE,
489
683
  )
490
684
 
491
685
  # Auto-detect context window from upstream server
492
- session_monitor.context_window = await detect_context_window(http_client)
686
+ default_context_window = await detect_context_window(http_client)
687
+ for mon in session_monitors.values():
688
+ if mon.context_window <= 0:
689
+ mon.context_window = default_context_window
493
690
  logger.info(
494
691
  "Context window: %d tokens, prune threshold: %.0f%%",
495
- session_monitor.context_window,
692
+ default_context_window,
496
693
  PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
497
694
  )
498
695
 
@@ -514,6 +711,7 @@ app = FastAPI(
514
711
  # Request Translation: Anthropic -> OpenAI
515
712
  # ===========================================================================
516
713
 
714
+
517
715
  def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
518
716
  """Convert Anthropic message format to OpenAI message format.
519
717
 
@@ -551,25 +749,33 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
551
749
  elif block.get("type") == "text":
552
750
  parts.append(block.get("text", ""))
553
751
  elif block.get("type") == "tool_use":
554
- messages.append({
555
- "role": "assistant",
556
- "content": None,
557
- "tool_calls": [{
558
- "id": block.get("id", f"call_{uuid.uuid4().hex[:8]}"),
559
- "type": "function",
560
- "function": {
561
- "name": block["name"],
562
- "arguments": json.dumps(block.get("input", {})),
563
- },
564
- }],
565
- })
752
+ messages.append(
753
+ {
754
+ "role": "assistant",
755
+ "content": None,
756
+ "tool_calls": [
757
+ {
758
+ "id": block.get(
759
+ "id", f"call_{uuid.uuid4().hex[:8]}"
760
+ ),
761
+ "type": "function",
762
+ "function": {
763
+ "name": block["name"],
764
+ "arguments": json.dumps(block.get("input", {})),
765
+ },
766
+ }
767
+ ],
768
+ }
769
+ )
566
770
  continue
567
771
  elif block.get("type") == "tool_result":
568
- messages.append({
569
- "role": "tool",
570
- "tool_call_id": block.get("tool_use_id", ""),
571
- "content": _extract_text(block.get("content", "")),
572
- })
772
+ messages.append(
773
+ {
774
+ "role": "tool",
775
+ "tool_call_id": block.get("tool_use_id", ""),
776
+ "content": _extract_text(block.get("content", "")),
777
+ }
778
+ )
573
779
  continue
574
780
  if parts:
575
781
  messages.append({"role": role, "content": "\n".join(parts)})
@@ -602,7 +808,77 @@ _AGENTIC_SYSTEM_SUPPLEMENT = (
602
808
  )
603
809
 
604
810
 
605
- def build_openai_request(anthropic_body: dict) -> dict:
811
+ def _content_fingerprint(content) -> str:
812
+ if isinstance(content, str):
813
+ return content[:512]
814
+ if isinstance(content, list):
815
+ parts = []
816
+ for block in content:
817
+ if isinstance(block, str):
818
+ parts.append(block)
819
+ elif isinstance(block, dict):
820
+ btype = block.get("type", "")
821
+ if btype == "text":
822
+ parts.append(block.get("text", ""))
823
+ elif btype == "tool_use":
824
+ parts.append(f"tool:{block.get('name', '')}")
825
+ elif btype == "tool_result":
826
+ parts.append(f"result:{block.get('tool_use_id', '')}")
827
+ return "\n".join(parts)[:1024]
828
+ return str(content)[:512]
829
+
830
+
831
+ def resolve_session_id(request: Request, anthropic_body: dict) -> str:
832
+ header_keys = (
833
+ "x-uap-session-id",
834
+ "x-claude-session-id",
835
+ "anthropic-session-id",
836
+ "x-session-id",
837
+ )
838
+ for key in header_keys:
839
+ value = request.headers.get(key)
840
+ if value:
841
+ return f"hdr:{value}"
842
+
843
+ metadata = anthropic_body.get("metadata", {})
844
+ if isinstance(metadata, dict):
845
+ for key in ("session_id", "conversation_id", "thread_id"):
846
+ value = metadata.get(key)
847
+ if value:
848
+ return f"meta:{value}"
849
+
850
+ first_user = ""
851
+ for msg in anthropic_body.get("messages", []):
852
+ if msg.get("role") == "user":
853
+ first_user = _content_fingerprint(msg.get("content", ""))
854
+ break
855
+
856
+ system_fingerprint = _content_fingerprint(anthropic_body.get("system", ""))
857
+ model = anthropic_body.get("model", "default")
858
+ remote = request.client.host if request.client else "unknown"
859
+ digest = hashlib.sha256(
860
+ f"{remote}|{model}|{system_fingerprint}|{first_user}".encode(
861
+ "utf-8", errors="ignore"
862
+ )
863
+ ).hexdigest()[:20]
864
+ return f"fp:{digest}"
865
+
866
+
867
+ def _last_user_has_tool_result(anthropic_body: dict) -> bool:
868
+ messages = anthropic_body.get("messages", [])
869
+ for msg in reversed(messages):
870
+ if msg.get("role") != "user":
871
+ continue
872
+ content = msg.get("content")
873
+ if not isinstance(content, list):
874
+ return False
875
+ return any(
876
+ isinstance(b, dict) and b.get("type") == "tool_result" for b in content
877
+ )
878
+ return False
879
+
880
+
881
+ def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
606
882
  """Build an OpenAI Chat Completions request from an Anthropic Messages request."""
607
883
  openai_body = {
608
884
  "model": anthropic_body.get("model", "default"),
@@ -616,10 +892,13 @@ def build_openai_request(anthropic_body: dict) -> dict:
616
892
  openai_body["messages"][0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
617
893
  else:
618
894
  # No system message from the client; inject one.
619
- openai_body["messages"].insert(0, {
620
- "role": "system",
621
- "content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
622
- })
895
+ openai_body["messages"].insert(
896
+ 0,
897
+ {
898
+ "role": "system",
899
+ "content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
900
+ },
901
+ )
623
902
 
624
903
  if "max_tokens" in anthropic_body:
625
904
  # Enforce minimum floor for thinking mode: model needs tokens for
@@ -632,7 +911,7 @@ def build_openai_request(anthropic_body: dict) -> dict:
632
911
  # Formula: max_tokens = min(requested, context_window - input_tokens - safety_margin)
633
912
  # This ensures the model's output + current input stays within bounds,
634
913
  # leaving room for the next turn's incremental growth.
635
- ctx_window = session_monitor.context_window
914
+ ctx_window = monitor.context_window
636
915
  if ctx_window > 0:
637
916
  estimated_input = estimate_total_tokens(anthropic_body)
638
917
  # Reserve 15% of context for next-turn growth (tool results, etc.)
@@ -641,8 +920,11 @@ def build_openai_request(anthropic_body: dict) -> dict:
641
920
  if available_for_output < requested_max and available_for_output > 1024:
642
921
  logger.info(
643
922
  "MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
644
- requested_max, available_for_output,
645
- ctx_window, estimated_input, safety_margin,
923
+ requested_max,
924
+ available_for_output,
925
+ ctx_window,
926
+ estimated_input,
927
+ safety_margin,
646
928
  )
647
929
  requested_max = available_for_output
648
930
  elif available_for_output <= 1024:
@@ -650,7 +932,9 @@ def build_openai_request(anthropic_body: dict) -> dict:
650
932
  logger.warning(
651
933
  "MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
652
934
  "Response may be truncated.",
653
- available_for_output, ctx_window, estimated_input,
935
+ available_for_output,
936
+ ctx_window,
937
+ estimated_input,
654
938
  )
655
939
  requested_max = max(1024, available_for_output)
656
940
 
@@ -666,14 +950,16 @@ def build_openai_request(anthropic_body: dict) -> dict:
666
950
  if "tools" in anthropic_body:
667
951
  openai_body["tools"] = []
668
952
  for tool in anthropic_body["tools"]:
669
- openai_body["tools"].append({
670
- "type": "function",
671
- "function": {
672
- "name": tool["name"],
673
- "description": tool.get("description", ""),
674
- "parameters": tool.get("input_schema", {}),
675
- },
676
- })
953
+ openai_body["tools"].append(
954
+ {
955
+ "type": "function",
956
+ "function": {
957
+ "name": tool["name"],
958
+ "description": tool.get("description", ""),
959
+ "parameters": tool.get("input_schema", {}),
960
+ },
961
+ }
962
+ )
677
963
 
678
964
  # Smart tool_choice: force tool calls during the agentic loop to
679
965
  # prevent the model from producing text-only end_turn responses that
@@ -684,24 +970,136 @@ def build_openai_request(anthropic_body: dict) -> dict:
684
970
  # - More than 1 message (conversation is in progress)
685
971
  # - Last assistant was text-only (would cause premature stop)
686
972
  # - OR conversation has tool_result messages (active agentic loop)
973
+ #
974
+ # LOOP PROTECTION: Release to "auto" if the session monitor detects
975
+ # a tool call loop (same tools called repeatedly), to prevent
976
+ # runaway token consumption.
687
977
  n_msgs = len(anthropic_body.get("messages", []))
688
978
  has_tool_results = any(
689
- isinstance(m.get("content"), list) and any(
979
+ isinstance(m.get("content"), list)
980
+ and any(
690
981
  isinstance(b, dict) and b.get("type") == "tool_result"
691
982
  for b in m.get("content", [])
692
983
  )
693
984
  for m in anthropic_body.get("messages", [])
694
985
  )
695
- if _last_assistant_was_text_only(anthropic_body):
986
+
987
+ # Record tool calls from the last assistant message for loop detection
988
+ _record_last_assistant_tool_calls(anthropic_body, monitor)
989
+ last_user_has_tool_result = _last_user_has_tool_result(anthropic_body)
990
+
991
+ # Check if loop breaker should override tool_choice
992
+ if monitor.should_release_tool_choice():
993
+ openai_body["tool_choice"] = "auto"
994
+ monitor.consecutive_forced_count = 0
995
+ monitor.no_progress_streak = 0
996
+ logger.warning("tool_choice set to 'auto' by LOOP BREAKER")
997
+ elif _last_assistant_was_text_only(anthropic_body):
696
998
  openai_body["tool_choice"] = "required"
697
- logger.info("tool_choice forced to 'required' (last assistant was text-only)")
999
+ monitor.consecutive_forced_count += 1
1000
+ monitor.no_progress_streak = (
1001
+ 0 if last_user_has_tool_result else monitor.no_progress_streak + 1
1002
+ )
1003
+ logger.info(
1004
+ "tool_choice forced to 'required' (last assistant was text-only)"
1005
+ )
698
1006
  elif has_tool_results and n_msgs > 2:
699
1007
  openai_body["tool_choice"] = "required"
700
- logger.info("tool_choice forced to 'required' (active agentic loop with tool results)")
1008
+ monitor.consecutive_forced_count += 1
1009
+ monitor.no_progress_streak = (
1010
+ 0 if last_user_has_tool_result else monitor.no_progress_streak + 1
1011
+ )
1012
+ logger.info(
1013
+ "tool_choice forced to 'required' (active agentic loop with tool results)"
1014
+ )
1015
+ else:
1016
+ monitor.consecutive_forced_count = 0
1017
+ monitor.no_progress_streak = 0
701
1018
 
702
1019
  return openai_body
703
1020
 
704
1021
 
1022
+ def _record_last_assistant_tool_calls(anthropic_body: dict, monitor: SessionMonitor):
1023
+ """Extract tool call names from the last assistant message and record
1024
+ them in the session monitor for loop detection."""
1025
+ messages = anthropic_body.get("messages", [])
1026
+ tool_names = []
1027
+ for msg in reversed(messages):
1028
+ if msg.get("role") != "assistant":
1029
+ continue
1030
+ content = msg.get("content")
1031
+ if isinstance(content, list):
1032
+ for block in content:
1033
+ if isinstance(block, dict) and block.get("type") == "tool_use":
1034
+ tool_names.append(block.get("name", "unknown"))
1035
+ break
1036
+ if tool_names:
1037
+ monitor.record_tool_calls(tool_names)
1038
+
1039
+
1040
+ def _is_unexpected_end_turn(openai_resp: dict, anthropic_body: dict) -> bool:
1041
+ choices = openai_resp.get("choices") or []
1042
+ if not choices:
1043
+ return False
1044
+
1045
+ choice = choices[0]
1046
+ finish = choice.get("finish_reason")
1047
+ if finish not in {"stop", "end_turn"}:
1048
+ return False
1049
+
1050
+ msg = choice.get("message", {})
1051
+ if msg.get("tool_calls"):
1052
+ return False
1053
+
1054
+ if "tools" not in anthropic_body:
1055
+ return False
1056
+
1057
+ has_tool_results = any(
1058
+ isinstance(m.get("content"), list)
1059
+ and any(
1060
+ isinstance(b, dict) and b.get("type") == "tool_result"
1061
+ for b in m.get("content", [])
1062
+ )
1063
+ for m in anthropic_body.get("messages", [])
1064
+ )
1065
+
1066
+ return has_tool_results or _last_assistant_was_text_only(anthropic_body)
1067
+
1068
+
1069
+ def _sanitize_reasoning_fallback_text(reasoning_text: str) -> str:
1070
+ cleaned = re.sub(r"</?think>", "", reasoning_text, flags=re.IGNORECASE)
1071
+ cleaned = re.sub(r"\s+", " ", cleaned).strip()
1072
+ if not cleaned:
1073
+ return ""
1074
+ if len(cleaned) > PROXY_STREAM_REASONING_MAX_CHARS:
1075
+ return cleaned[:PROXY_STREAM_REASONING_MAX_CHARS].rstrip() + "..."
1076
+ return cleaned
1077
+
1078
+
1079
+ def _build_reasoning_fallback_text(
1080
+ reasoning_chunks: list[str], mode: str | None = None
1081
+ ) -> str | None:
1082
+ fallback_mode = (mode or PROXY_STREAM_REASONING_FALLBACK).strip().lower()
1083
+ if fallback_mode == "off":
1084
+ return None
1085
+
1086
+ raw_text = "".join(reasoning_chunks).strip()
1087
+ if not raw_text:
1088
+ return None
1089
+
1090
+ if fallback_mode == "visible":
1091
+ return raw_text
1092
+ if fallback_mode == "sanitized":
1093
+ sanitized = _sanitize_reasoning_fallback_text(raw_text)
1094
+ return sanitized or None
1095
+
1096
+ logger.warning(
1097
+ "Unknown PROXY_STREAM_REASONING_FALLBACK=%r; disabling reasoning fallback",
1098
+ fallback_mode,
1099
+ )
1100
+ return None
1101
+
1102
+
705
1103
  def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
706
1104
  """Check if the last assistant message in the conversation was text-only
707
1105
  (no tool_use blocks). This indicates the model may be prematurely ending
@@ -717,11 +1115,14 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
717
1115
  return bool(content.strip())
718
1116
  if isinstance(content, list):
719
1117
  has_tool_use = any(
720
- isinstance(b, dict) and b.get("type") == "tool_use"
721
- for b in content
1118
+ isinstance(b, dict) and b.get("type") == "tool_use" for b in content
722
1119
  )
723
1120
  has_text = any(
724
- (isinstance(b, dict) and b.get("type") == "text" and b.get("text", "").strip())
1121
+ (
1122
+ isinstance(b, dict)
1123
+ and b.get("type") == "text"
1124
+ and b.get("text", "").strip()
1125
+ )
725
1126
  or isinstance(b, str)
726
1127
  for b in content
727
1128
  )
@@ -735,6 +1136,7 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
735
1136
  # Response Translation: OpenAI -> Anthropic
736
1137
  # ===========================================================================
737
1138
 
1139
+
738
1140
  def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
739
1141
  """Convert an OpenAI Chat Completions response to Anthropic Messages format."""
740
1142
  choice = openai_resp.get("choices", [{}])[0]
@@ -752,12 +1154,14 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
752
1154
  args = json.loads(fn.get("arguments", "{}"))
753
1155
  except json.JSONDecodeError:
754
1156
  args = {}
755
- content.append({
756
- "type": "tool_use",
757
- "id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
758
- "name": fn.get("name", ""),
759
- "input": args,
760
- })
1157
+ content.append(
1158
+ {
1159
+ "type": "tool_use",
1160
+ "id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
1161
+ "name": fn.get("name", ""),
1162
+ "input": args,
1163
+ }
1164
+ )
761
1165
 
762
1166
  stop_reason_map = {
763
1167
  "stop": "end_turn",
@@ -787,7 +1191,13 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
787
1191
  # Streaming Translation: OpenAI SSE -> Anthropic SSE
788
1192
  # ===========================================================================
789
1193
 
790
- async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
1194
+
1195
+ async def stream_anthropic_response(
1196
+ openai_stream: httpx.Response,
1197
+ model: str,
1198
+ monitor: SessionMonitor,
1199
+ anthropic_body: dict,
1200
+ ):
791
1201
  """Convert an OpenAI streaming response to Anthropic SSE stream format.
792
1202
 
793
1203
  Handles:
@@ -810,7 +1220,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
810
1220
  f"data: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
811
1221
  )
812
1222
 
813
- yield "event: ping\ndata: {\"type\": \"ping\"}\n\n"
1223
+ yield 'event: ping\ndata: {"type": "ping"}\n\n'
814
1224
 
815
1225
  output_tokens = 0
816
1226
  finish_reason = "end_turn"
@@ -939,21 +1349,29 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
939
1349
  f"data: {json.dumps({'type': 'content_block_stop', 'index': tc['block_index']})}\n\n"
940
1350
  )
941
1351
  else:
942
- # Option E: If the response has no text AND no tool calls, but the
943
- # model produced reasoning_content, forward the reasoning as visible
944
- # text so the client doesn't receive a completely empty turn.
1352
+ # If the response has no text and no tool calls, optionally emit a
1353
+ # reasoning fallback (configurable) to avoid leaking malformed
1354
+ # internal chain-of-thought content by default.
945
1355
  accumulated_text = "".join(text_chunks)
946
1356
  if not accumulated_text and reasoning_chunks:
947
- fallback_text = "".join(reasoning_chunks)
948
- logger.warning(
949
- "Empty response with %d reasoning tokens – forwarding reasoning as fallback text",
950
- len(reasoning_chunks),
951
- )
952
- text_chunks.append(fallback_text)
953
- yield (
954
- f"event: content_block_delta\n"
955
- f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
956
- )
1357
+ fallback_text = _build_reasoning_fallback_text(reasoning_chunks)
1358
+ if fallback_text:
1359
+ logger.warning(
1360
+ "Empty response with %d reasoning chunks – emitting fallback text (mode=%s)",
1361
+ len(reasoning_chunks),
1362
+ PROXY_STREAM_REASONING_FALLBACK,
1363
+ )
1364
+ text_chunks.append(fallback_text)
1365
+ yield (
1366
+ f"event: content_block_delta\n"
1367
+ f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
1368
+ )
1369
+ else:
1370
+ logger.warning(
1371
+ "Empty response with %d reasoning chunks – fallback suppressed (mode=%s)",
1372
+ len(reasoning_chunks),
1373
+ PROXY_STREAM_REASONING_FALLBACK,
1374
+ )
957
1375
 
958
1376
  yield (
959
1377
  f"event: content_block_stop\n"
@@ -962,17 +1380,52 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
962
1380
 
963
1381
  # Log response summary
964
1382
  accumulated_text = "".join(text_chunks)
965
- tc_names = [tc["name"] for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
966
- tc_args = [tc.get("arguments", "") for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
1383
+ tc_names = (
1384
+ [tc["name"] for tc in tool_calls_by_index.values()]
1385
+ if tool_calls_by_index
1386
+ else []
1387
+ )
1388
+ tc_args = (
1389
+ [tc.get("arguments", "") for tc in tool_calls_by_index.values()]
1390
+ if tool_calls_by_index
1391
+ else []
1392
+ )
967
1393
  logger.info(
968
1394
  "RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
969
- finish_reason, output_tokens,
1395
+ finish_reason,
1396
+ output_tokens,
970
1397
  len(accumulated_text),
971
1398
  accumulated_text[:300],
972
1399
  tc_names,
973
1400
  [a[:200] for a in tc_args],
974
1401
  )
975
1402
 
1403
+ if _is_unexpected_end_turn(
1404
+ {
1405
+ "choices": [
1406
+ {
1407
+ "finish_reason": "stop"
1408
+ if finish_reason == "end_turn"
1409
+ else finish_reason,
1410
+ "message": {
1411
+ "content": accumulated_text,
1412
+ "tool_calls": [
1413
+ {
1414
+ "function": {
1415
+ "name": tc["name"],
1416
+ "arguments": tc.get("arguments", ""),
1417
+ }
1418
+ }
1419
+ for tc in tool_calls_by_index.values()
1420
+ ],
1421
+ },
1422
+ }
1423
+ ]
1424
+ },
1425
+ anthropic_body,
1426
+ ):
1427
+ monitor.unexpected_end_turn_count += 1
1428
+
976
1429
  # message_delta with final stop reason
977
1430
  yield (
978
1431
  f"event: message_delta\n"
@@ -987,6 +1440,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
987
1440
  # API Endpoints
988
1441
  # ===========================================================================
989
1442
 
1443
+
990
1444
  @app.post("/v1/messages")
991
1445
  async def messages(request: Request):
992
1446
  """Handle Anthropic Messages API requests (streaming and non-streaming).
@@ -997,9 +1451,14 @@ async def messages(request: Request):
997
1451
  - Option E: Smart max_tokens capping (in build_openai_request)
998
1452
  - Option F: Session-level token monitoring with warnings
999
1453
  """
1454
+ global last_session_id
1455
+
1000
1456
  body = await request.json()
1001
1457
  model = body.get("model", "default")
1002
1458
  is_stream = body.get("stream", False)
1459
+ session_id = resolve_session_id(request, body)
1460
+ monitor = get_session_monitor(session_id)
1461
+ last_session_id = session_id
1003
1462
 
1004
1463
  # Debug: log request summary
1005
1464
  n_messages = len(body.get("messages", []))
@@ -1009,42 +1468,51 @@ async def messages(request: Request):
1009
1468
  last_role = last_msg.get("role", "?")
1010
1469
  last_content = last_msg.get("content", "")
1011
1470
  if isinstance(last_content, list):
1012
- last_text = next((b.get("text", "") for b in last_content if b.get("type") == "text"), "")[:200]
1471
+ last_text = next(
1472
+ (b.get("text", "") for b in last_content if b.get("type") == "text"), ""
1473
+ )[:200]
1013
1474
  elif isinstance(last_content, str):
1014
1475
  last_text = last_content[:200]
1015
1476
  else:
1016
1477
  last_text = str(last_content)[:200]
1017
1478
  logger.info(
1018
1479
  "REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
1019
- is_stream, n_messages, n_tools, max_tokens, last_role, last_text
1480
+ is_stream,
1481
+ n_messages,
1482
+ n_tools,
1483
+ max_tokens,
1484
+ last_role,
1485
+ last_text,
1020
1486
  )
1021
1487
 
1022
1488
  # --- Option F: Estimate tokens and record in session monitor ---
1023
1489
  estimated_tokens = estimate_total_tokens(body)
1024
- session_monitor.record_request(estimated_tokens)
1025
- session_monitor.log_status()
1490
+ monitor.record_request(estimated_tokens)
1491
+ monitor.log_status()
1026
1492
 
1027
1493
  # --- Option C: Prune conversation if approaching context limit ---
1028
- ctx_window = session_monitor.context_window
1494
+ ctx_window = monitor.context_window
1029
1495
  if ctx_window > 0:
1030
1496
  utilization = estimated_tokens / ctx_window
1031
1497
  if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
1032
1498
  logger.warning(
1033
1499
  "Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
1034
- utilization * 100, PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
1500
+ utilization * 100,
1501
+ PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
1035
1502
  )
1036
1503
  body = prune_conversation(body, ctx_window, target_fraction=0.65)
1037
- session_monitor.prune_count += 1
1504
+ monitor.prune_count += 1
1038
1505
  # Re-estimate after pruning
1039
1506
  estimated_tokens = estimate_total_tokens(body)
1040
- session_monitor.record_request(estimated_tokens)
1507
+ monitor.record_request(estimated_tokens)
1041
1508
  n_messages = len(body.get("messages", []))
1042
1509
  logger.info(
1043
1510
  "After pruning: ~%d tokens, %d messages",
1044
- estimated_tokens, n_messages,
1511
+ estimated_tokens,
1512
+ n_messages,
1045
1513
  )
1046
1514
 
1047
- openai_body = build_openai_request(body)
1515
+ openai_body = build_openai_request(body, monitor)
1048
1516
 
1049
1517
  client = http_client
1050
1518
  if client is None:
@@ -1062,6 +1530,7 @@ async def messages(request: Request):
1062
1530
  MAX_UPSTREAM_RETRIES = 3
1063
1531
  RETRY_DELAY_SECS = 5.0
1064
1532
  last_exc: Exception | None = None
1533
+ resp: httpx.Response | None = None
1065
1534
 
1066
1535
  for attempt in range(MAX_UPSTREAM_RETRIES):
1067
1536
  try:
@@ -1082,25 +1551,46 @@ async def messages(request: Request):
1082
1551
  if attempt < MAX_UPSTREAM_RETRIES - 1:
1083
1552
  logger.warning(
1084
1553
  "Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
1085
- attempt + 1, MAX_UPSTREAM_RETRIES,
1086
- type(exc).__name__, RETRY_DELAY_SECS,
1554
+ attempt + 1,
1555
+ MAX_UPSTREAM_RETRIES,
1556
+ type(exc).__name__,
1557
+ RETRY_DELAY_SECS,
1087
1558
  )
1088
1559
  await asyncio.sleep(RETRY_DELAY_SECS)
1089
1560
  else:
1090
1561
  logger.error(
1091
1562
  "Upstream connect failed after %d attempts: %s: %s",
1092
- MAX_UPSTREAM_RETRIES, type(exc).__name__, exc,
1563
+ MAX_UPSTREAM_RETRIES,
1564
+ type(exc).__name__,
1565
+ exc,
1093
1566
  )
1094
1567
 
1095
1568
  if last_exc is not None:
1096
1569
  return Response(
1097
- content=json.dumps({
1098
- "type": "error",
1099
- "error": {
1100
- "type": "overloaded_error",
1101
- "message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
1102
- },
1103
- }),
1570
+ content=json.dumps(
1571
+ {
1572
+ "type": "error",
1573
+ "error": {
1574
+ "type": "overloaded_error",
1575
+ "message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
1576
+ },
1577
+ }
1578
+ ),
1579
+ status_code=529,
1580
+ media_type="application/json",
1581
+ )
1582
+
1583
+ if resp is None:
1584
+ return Response(
1585
+ content=json.dumps(
1586
+ {
1587
+ "type": "error",
1588
+ "error": {
1589
+ "type": "overloaded_error",
1590
+ "message": "Upstream response unavailable",
1591
+ },
1592
+ }
1593
+ ),
1104
1594
  status_code=529,
1105
1595
  media_type="application/json",
1106
1596
  )
@@ -1113,9 +1603,7 @@ async def messages(request: Request):
1113
1603
  error_body = await resp.aread()
1114
1604
  await resp.aclose()
1115
1605
  error_text = error_body.decode("utf-8", errors="replace")[:1000]
1116
- logger.error(
1117
- "Upstream HTTP %d: %s", resp.status_code, error_text
1118
- )
1606
+ logger.error("Upstream HTTP %d: %s", resp.status_code, error_text)
1119
1607
 
1120
1608
  # Parse the error for a user-friendly message
1121
1609
  error_message = f"Upstream server error (HTTP {resp.status_code})"
@@ -1138,47 +1626,57 @@ async def messages(request: Request):
1138
1626
  )
1139
1627
 
1140
1628
  if is_context_overflow:
1141
- session_monitor.overflow_count += 1
1629
+ monitor.overflow_count += 1
1142
1630
  logger.error(
1143
1631
  "CONTEXT OVERFLOW detected (count=%d). "
1144
1632
  "Estimated input: %d tokens, context window: %d tokens. "
1145
1633
  "Conversation needs pruning or context window increase.",
1146
- session_monitor.overflow_count, estimated_tokens, ctx_window,
1634
+ monitor.overflow_count,
1635
+ estimated_tokens,
1636
+ ctx_window,
1147
1637
  )
1148
1638
  # Return Anthropic-format error that Claude Code can handle
1149
1639
  return Response(
1150
- content=json.dumps({
1151
- "type": "error",
1152
- "error": {
1153
- "type": "overloaded_error",
1154
- "message": (
1155
- f"Context window exceeded: request requires ~{estimated_tokens} tokens "
1156
- f"but only {ctx_window} are available. "
1157
- f"The conversation is too long. Please start a new session or "
1158
- f"reduce conversation length."
1159
- ),
1160
- },
1161
- }),
1640
+ content=json.dumps(
1641
+ {
1642
+ "type": "error",
1643
+ "error": {
1644
+ "type": "overloaded_error",
1645
+ "message": (
1646
+ f"Context window exceeded: request requires ~{estimated_tokens} tokens "
1647
+ f"but only {ctx_window} are available. "
1648
+ f"The conversation is too long. Please start a new session or "
1649
+ f"reduce conversation length."
1650
+ ),
1651
+ },
1652
+ }
1653
+ ),
1162
1654
  status_code=529,
1163
1655
  media_type="application/json",
1164
1656
  )
1165
1657
 
1166
1658
  # Generic upstream error -- return as Anthropic error format
1167
- error_type = "overloaded_error" if resp.status_code >= 500 else "invalid_request_error"
1659
+ error_type = (
1660
+ "overloaded_error"
1661
+ if resp.status_code >= 500
1662
+ else "invalid_request_error"
1663
+ )
1168
1664
  return Response(
1169
- content=json.dumps({
1170
- "type": "error",
1171
- "error": {
1172
- "type": error_type,
1173
- "message": error_message,
1174
- },
1175
- }),
1665
+ content=json.dumps(
1666
+ {
1667
+ "type": "error",
1668
+ "error": {
1669
+ "type": error_type,
1670
+ "message": error_message,
1671
+ },
1672
+ }
1673
+ ),
1176
1674
  status_code=529 if resp.status_code >= 500 else 400,
1177
1675
  media_type="application/json",
1178
1676
  )
1179
1677
 
1180
1678
  return StreamingResponse(
1181
- stream_anthropic_response(resp, model),
1679
+ stream_anthropic_response(resp, model, monitor, body),
1182
1680
  media_type="text/event-stream",
1183
1681
  headers={
1184
1682
  "Cache-Control": "no-cache",
@@ -1195,25 +1693,56 @@ async def messages(request: Request):
1195
1693
  # Option B: Handle non-streaming errors too
1196
1694
  if resp.status_code != 200:
1197
1695
  error_text = resp.text[:1000]
1198
- logger.error("Upstream HTTP %d (non-stream): %s", resp.status_code, error_text)
1696
+ logger.error(
1697
+ "Upstream HTTP %d (non-stream): %s", resp.status_code, error_text
1698
+ )
1199
1699
  return Response(
1200
- content=json.dumps({
1201
- "type": "error",
1202
- "error": {
1203
- "type": "overloaded_error",
1204
- "message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
1205
- },
1206
- }),
1700
+ content=json.dumps(
1701
+ {
1702
+ "type": "error",
1703
+ "error": {
1704
+ "type": "overloaded_error",
1705
+ "message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
1706
+ },
1707
+ }
1708
+ ),
1207
1709
  status_code=529,
1208
1710
  media_type="application/json",
1209
1711
  )
1210
1712
 
1211
1713
  openai_resp = resp.json()
1714
+
1715
+ if PROXY_GUARDRAIL_RETRY and _is_unexpected_end_turn(openai_resp, body):
1716
+ monitor.unexpected_end_turn_count += 1
1717
+ logger.warning(
1718
+ "GUARDRAIL: unexpected end_turn without tool_use in active loop (session=%s), retrying once with tool_choice=required",
1719
+ session_id,
1720
+ )
1721
+
1722
+ retry_body = dict(openai_body)
1723
+ retry_body["tool_choice"] = "required"
1724
+ retry_body["stream"] = False
1725
+
1726
+ retry_resp = await client.post(
1727
+ f"{LLAMA_CPP_BASE}/chat/completions",
1728
+ json=retry_body,
1729
+ headers={"Content-Type": "application/json"},
1730
+ )
1731
+ if retry_resp.status_code == 200:
1732
+ retry_json = retry_resp.json()
1733
+ retry_choice = (retry_json.get("choices") or [{}])[0]
1734
+ retry_message = retry_choice.get("message", {})
1735
+ if retry_message.get("tool_calls"):
1736
+ openai_resp = retry_json
1737
+ logger.info(
1738
+ "GUARDRAIL: retry produced tool_use; using retried response"
1739
+ )
1740
+
1212
1741
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
1213
1742
 
1214
1743
  # Track output tokens in session monitor
1215
1744
  output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
1216
- session_monitor.record_response(output_tokens)
1745
+ monitor.record_response(output_tokens)
1217
1746
 
1218
1747
  return anthropic_resp
1219
1748
 
@@ -1258,29 +1787,50 @@ async def health():
1258
1787
 
1259
1788
 
1260
1789
  @app.get("/v1/context")
1261
- async def context_status():
1790
+ async def context_status(request: Request):
1262
1791
  """Option F: Context window monitoring endpoint.
1263
1792
 
1264
1793
  Returns current session token usage, utilization, warnings, and
1265
1794
  estimated remaining turns. Useful for dashboards and debugging.
1266
1795
  """
1267
- warning = session_monitor.get_warning_level()
1268
- turns = session_monitor.estimate_turns_remaining()
1796
+ requested_session = request.query_params.get("session_id", "")
1797
+ session_id = requested_session or last_session_id
1798
+ monitor = session_monitors.get(session_id) if session_id else None
1799
+
1800
+ if monitor is None:
1801
+ monitor = SessionMonitor(context_window=default_context_window)
1802
+
1803
+ warning = monitor.get_warning_level()
1804
+ turns = monitor.estimate_turns_remaining()
1269
1805
 
1270
1806
  return {
1271
- "context_window": session_monitor.context_window,
1272
- "last_input_tokens": session_monitor.last_input_tokens,
1273
- "last_output_tokens": session_monitor.last_output_tokens,
1274
- "peak_input_tokens": session_monitor.peak_input_tokens,
1275
- "utilization": round(session_monitor.get_utilization(), 4),
1276
- "utilization_pct": f"{session_monitor.get_utilization() * 100:.1f}%",
1807
+ "active_session_id": session_id,
1808
+ "session_count": len(session_monitors),
1809
+ "context_window": monitor.context_window,
1810
+ "last_input_tokens": monitor.last_input_tokens,
1811
+ "last_output_tokens": monitor.last_output_tokens,
1812
+ "peak_input_tokens": monitor.peak_input_tokens,
1813
+ "utilization": round(monitor.get_utilization(), 4),
1814
+ "utilization_pct": f"{monitor.get_utilization() * 100:.1f}%",
1277
1815
  "warning_level": warning,
1278
1816
  "estimated_turns_remaining": turns,
1279
- "total_requests": session_monitor.total_requests,
1280
- "prune_count": session_monitor.prune_count,
1281
- "overflow_count": session_monitor.overflow_count,
1817
+ "total_requests": monitor.total_requests,
1818
+ "prune_count": monitor.prune_count,
1819
+ "overflow_count": monitor.overflow_count,
1282
1820
  "prune_threshold": PROXY_CONTEXT_PRUNE_THRESHOLD,
1283
- "recent_history": session_monitor.context_history[-10:],
1821
+ "recent_history": monitor.context_history[-10:],
1822
+ # Loop protection stats
1823
+ "loop_protection": {
1824
+ "enabled": PROXY_LOOP_BREAKER,
1825
+ "consecutive_forced_count": monitor.consecutive_forced_count,
1826
+ "no_progress_streak": monitor.no_progress_streak,
1827
+ "loop_warnings_emitted": monitor.loop_warnings_emitted,
1828
+ "unexpected_end_turn_count": monitor.unexpected_end_turn_count,
1829
+ "tool_call_history_len": len(monitor.tool_call_history),
1830
+ "is_looping": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[0],
1831
+ "loop_repeat_count": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[1],
1832
+ "recent_tool_patterns": monitor.tool_call_history[-5:],
1833
+ },
1284
1834
  }
1285
1835
 
1286
1836