@miller-tech/uap 1.13.12 → 1.13.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/benchmarks/speculative-autotune.d.ts +46 -0
- package/dist/benchmarks/speculative-autotune.d.ts.map +1 -0
- package/dist/benchmarks/speculative-autotune.js +145 -0
- package/dist/benchmarks/speculative-autotune.js.map +1 -0
- package/dist/benchmarks/token-throughput.d.ts +46 -46
- package/dist/bin/cli.js +2 -0
- package/dist/bin/cli.js.map +1 -1
- package/dist/bin/llama-server-optimize.js +176 -0
- package/dist/bin/llama-server-optimize.js.map +1 -1
- package/dist/bin/policy.js +0 -0
- package/dist/cli/hooks.js +1 -0
- package/dist/cli/hooks.js.map +1 -1
- package/dist/cli/init.d.ts +1 -0
- package/dist/cli/init.d.ts.map +1 -1
- package/dist/cli/init.js +18 -0
- package/dist/cli/init.js.map +1 -1
- package/dist/cli/setup.d.ts +1 -0
- package/dist/cli/setup.d.ts.map +1 -1
- package/dist/cli/setup.js +1 -0
- package/dist/cli/setup.js.map +1 -1
- package/dist/cli/systemd-services.d.ts +12 -0
- package/dist/cli/systemd-services.d.ts.map +1 -0
- package/dist/cli/systemd-services.js +179 -0
- package/dist/cli/systemd-services.js.map +1 -0
- package/dist/models/types.d.ts +12 -12
- package/dist/policies/schemas/policy.d.ts +12 -12
- package/dist/types/config.d.ts +24 -24
- package/docs/deployment/QWEN35_LLAMA_CPP.md +49 -0
- package/docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md +279 -0
- package/package.json +1 -1
- package/templates/hooks/loop-protection.sh +250 -0
- package/templates/hooks/post-compact.sh +14 -0
- package/templates/hooks/post-tool-use-edit-write.sh +15 -0
- package/templates/hooks/pre-compact.sh +9 -0
- package/templates/hooks/pre-tool-use-bash.sh +6 -0
- package/templates/hooks/pre-tool-use-edit-write.sh +10 -0
- package/templates/hooks/session-start.sh +64 -44
- package/templates/hooks/stop.sh +9 -0
- package/tools/agents/scripts/anthropic_proxy.py +716 -166
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +51 -0
- package/tools/agents/scripts/__pycache__/anthropic_proxy.cpython-313.pyc +0 -0
- package/tools/agents/scripts/__pycache__/tool_call_wrapper.cpython-313.pyc +0 -0
|
@@ -76,9 +76,11 @@ Dependencies
|
|
|
76
76
|
"""
|
|
77
77
|
|
|
78
78
|
import asyncio
|
|
79
|
+
import hashlib
|
|
79
80
|
import json
|
|
80
81
|
import logging
|
|
81
82
|
import os
|
|
83
|
+
import re
|
|
82
84
|
import sys
|
|
83
85
|
import time
|
|
84
86
|
import uuid
|
|
@@ -100,7 +102,35 @@ PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
|
|
|
100
102
|
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
|
|
101
103
|
PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
|
|
102
104
|
PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
|
|
103
|
-
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
|
|
105
|
+
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
|
|
106
|
+
os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75")
|
|
107
|
+
)
|
|
108
|
+
PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
|
|
109
|
+
"0",
|
|
110
|
+
"false",
|
|
111
|
+
"off",
|
|
112
|
+
"no",
|
|
113
|
+
}
|
|
114
|
+
PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
|
|
115
|
+
PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "8"))
|
|
116
|
+
PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
|
|
117
|
+
PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "4"))
|
|
118
|
+
PROXY_CONTEXT_RELEASE_THRESHOLD = float(
|
|
119
|
+
os.environ.get("PROXY_CONTEXT_RELEASE_THRESHOLD", "0.90")
|
|
120
|
+
)
|
|
121
|
+
PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() not in {
|
|
122
|
+
"0",
|
|
123
|
+
"false",
|
|
124
|
+
"off",
|
|
125
|
+
"no",
|
|
126
|
+
}
|
|
127
|
+
PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
|
|
128
|
+
PROXY_STREAM_REASONING_FALLBACK = (
|
|
129
|
+
os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
|
|
130
|
+
)
|
|
131
|
+
PROXY_STREAM_REASONING_MAX_CHARS = int(
|
|
132
|
+
os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
|
|
133
|
+
)
|
|
104
134
|
|
|
105
135
|
# ---------------------------------------------------------------------------
|
|
106
136
|
# Logging
|
|
@@ -121,15 +151,27 @@ class SessionMonitor:
|
|
|
121
151
|
"""Tracks token usage across the session to provide early warnings
|
|
122
152
|
and enable proactive context management before overflow occurs."""
|
|
123
153
|
|
|
124
|
-
context_window: int = 0
|
|
154
|
+
context_window: int = 0 # Auto-detected or configured
|
|
125
155
|
total_requests: int = 0
|
|
126
|
-
last_input_tokens: int = 0
|
|
127
|
-
last_output_tokens: int = 0
|
|
128
|
-
peak_input_tokens: int = 0
|
|
129
|
-
prune_count: int = 0
|
|
130
|
-
overflow_count: int = 0
|
|
156
|
+
last_input_tokens: int = 0 # Estimated input tokens of last request
|
|
157
|
+
last_output_tokens: int = 0 # Actual output tokens of last response
|
|
158
|
+
peak_input_tokens: int = 0 # High-water mark
|
|
159
|
+
prune_count: int = 0 # How many times pruning was triggered
|
|
160
|
+
overflow_count: int = 0 # How many context overflow errors caught
|
|
131
161
|
context_history: list = field(default_factory=list) # Recent token counts
|
|
132
162
|
|
|
163
|
+
# --- Token Loop Protection ---
|
|
164
|
+
tool_call_history: list = field(
|
|
165
|
+
default_factory=list
|
|
166
|
+
) # Recent tool call fingerprints
|
|
167
|
+
consecutive_forced_count: int = (
|
|
168
|
+
0 # How many times tool_choice was forced consecutively
|
|
169
|
+
)
|
|
170
|
+
loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
|
|
171
|
+
no_progress_streak: int = 0 # Forced tool turns without new tool_result
|
|
172
|
+
unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
|
|
173
|
+
last_seen_ts: float = 0.0
|
|
174
|
+
|
|
133
175
|
def record_request(self, estimated_tokens: int):
|
|
134
176
|
"""Record an outgoing request's estimated token count."""
|
|
135
177
|
self.total_requests += 1
|
|
@@ -145,6 +187,9 @@ class SessionMonitor:
|
|
|
145
187
|
"""Record a response's output token count."""
|
|
146
188
|
self.last_output_tokens = output_tokens
|
|
147
189
|
|
|
190
|
+
def touch(self):
    """Stamp this monitor with the current wall-clock time.

    Stores ``time.time()`` in ``last_seen_ts``; the stale-session cleanup
    compares that value against the configured session TTL.
    """
    now = time.time()
    self.last_seen_ts = now
|
|
192
|
+
|
|
148
193
|
def get_utilization(self) -> float:
|
|
149
194
|
"""Get current context utilization as a fraction (0.0 - 1.0)."""
|
|
150
195
|
if self.context_window <= 0:
|
|
@@ -191,30 +236,160 @@ class SessionMonitor:
|
|
|
191
236
|
if warning == "CRITICAL":
|
|
192
237
|
logger.error(
|
|
193
238
|
"CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
|
|
194
|
-
self.last_input_tokens,
|
|
195
|
-
|
|
239
|
+
self.last_input_tokens,
|
|
240
|
+
self.context_window,
|
|
241
|
+
util * 100,
|
|
242
|
+
turns_str,
|
|
243
|
+
self.prune_count,
|
|
244
|
+
self.overflow_count,
|
|
196
245
|
)
|
|
197
246
|
elif warning == "HIGH":
|
|
198
247
|
logger.warning(
|
|
199
248
|
"CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
|
|
200
|
-
self.last_input_tokens,
|
|
201
|
-
|
|
249
|
+
self.last_input_tokens,
|
|
250
|
+
self.context_window,
|
|
251
|
+
util * 100,
|
|
252
|
+
turns_str,
|
|
253
|
+
self.prune_count,
|
|
202
254
|
)
|
|
203
255
|
elif warning == "ELEVATED":
|
|
204
256
|
logger.warning(
|
|
205
257
|
"CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
|
|
206
|
-
self.last_input_tokens,
|
|
258
|
+
self.last_input_tokens,
|
|
259
|
+
self.context_window,
|
|
260
|
+
util * 100,
|
|
207
261
|
turns_str,
|
|
208
262
|
)
|
|
209
263
|
else:
|
|
210
264
|
logger.info(
|
|
211
265
|
"CONTEXT: %d/%d tokens (%.1f%%), %s",
|
|
212
|
-
self.last_input_tokens,
|
|
266
|
+
self.last_input_tokens,
|
|
267
|
+
self.context_window,
|
|
268
|
+
util * 100,
|
|
213
269
|
turns_str,
|
|
214
270
|
)
|
|
215
271
|
|
|
272
|
+
# --- Token Loop Protection Methods ---
|
|
273
|
+
|
|
274
|
+
def record_tool_calls(self, tool_names: list[str]):
    """Append one fingerprint for a turn's tool calls to the loop history.

    The fingerprint is the tool names sorted and joined with ``|``, so the
    order in which tools were called within a turn does not matter. An
    empty or missing list is recorded as the empty string. Only the 30
    most recent fingerprints are retained.
    """
    max_entries = 30
    signature = "|".join(sorted(tool_names or ()))

    history = self.tool_call_history
    history.append(signature)
    if len(history) > max_entries:
        # Rebind to a trimmed copy, dropping the oldest entries.
        self.tool_call_history = history[-max_entries:]
|
|
281
|
+
|
|
282
|
+
def detect_tool_loop(self, window: int = 6) -> tuple[bool, int]:
    """Detect whether the model is stuck repeating the same tool calls.

    A loop is reported when the last ``window`` fingerprints in
    ``tool_call_history`` are identical and non-empty.

    Args:
        window: Number of trailing fingerprints that must all match.
            A value of zero or less disables detection.

    Returns:
        ``(is_looping, repeat_count)``. ``repeat_count`` is the length of
        the run of identical fingerprints at the end of the history (it can
        exceed ``window``); it is 0 when no loop is detected.
    """
    # A non-positive window cannot describe a run of repeats. Without this
    # guard, history[-0:] is the whole list and indexing an empty slice
    # raises IndexError.
    if window <= 0:
        return False, 0

    history = self.tool_call_history
    if len(history) < window:
        return False, 0

    target = history[-1]
    if not target:
        # Empty fingerprints (turns without tool names) never count as a loop.
        return False, 0

    if any(fp != target for fp in history[-window:]):
        return False, 0

    # Count the full run of consecutive repeats from the end of the history.
    repeats = 0
    for fp in reversed(history):
        if fp != target:
            break
        repeats += 1
    return True, repeats
|
|
308
|
+
|
|
309
|
+
def should_release_tool_choice(self) -> bool:
    """Decide whether tool_choice should be relaxed to 'auto' to break a loop.

    Always returns False when ``PROXY_LOOP_BREAKER`` is disabled. Otherwise
    returns True as soon as one of these rules fires, checked in order:

    1. The last ``PROXY_LOOP_WINDOW`` tool-call fingerprints are identical,
       the run is at least ``PROXY_LOOP_REPEAT_THRESHOLD`` long, and
       ``no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD``.
    2. ``consecutive_forced_count >= PROXY_FORCED_THRESHOLD`` and
       ``no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD``.
    3. Context utilization is at least ``PROXY_CONTEXT_RELEASE_THRESHOLD``.

    Side effects: logs a warning for the rule that fired. Rules 1 and 2
    also increment ``loop_warnings_emitted``; rule 3 does not.
    """
    if not PROXY_LOOP_BREAKER:
        return False

    is_looping, repeat_count = self.detect_tool_loop(window=PROXY_LOOP_WINDOW)
    stalled = self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD

    # Rule 1: the same tool pattern keeps repeating without progress.
    if is_looping and repeat_count >= PROXY_LOOP_REPEAT_THRESHOLD and stalled:
        logger.warning(
            "LOOP BREAKER: Same tool pattern repeated %d times with no progress streak=%d. "
            "Releasing tool_choice to 'auto'.",
            repeat_count,
            self.no_progress_streak,
        )
        self.loop_warnings_emitted += 1
        return True

    # Rule 2: tool_choice has been forced too many turns in a row.
    if self.consecutive_forced_count >= PROXY_FORCED_THRESHOLD and stalled:
        logger.warning(
            "LOOP BREAKER: %d consecutive forced tool_choice requests with no progress streak=%d. "
            "Releasing to 'auto'.",
            self.consecutive_forced_count,
            self.no_progress_streak,
        )
        self.loop_warnings_emitted += 1
        return True

    # Rule 3: context almost full -- let the model wrap up naturally.
    # Read utilization once so the logged value is the one that was tested.
    utilization = self.get_utilization()
    if utilization >= PROXY_CONTEXT_RELEASE_THRESHOLD:
        logger.warning(
            "LOOP BREAKER: Context utilization %.1f%% -- releasing "
            "tool_choice to let model wrap up.",
            utilization * 100,
        )
        return True

    return False
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
session_monitors: dict[str, SessionMonitor] = {}
|
|
365
|
+
default_context_window = 0
|
|
366
|
+
last_session_id = ""
|
|
367
|
+
|
|
216
368
|
|
|
217
|
-
|
|
369
|
+
def _cleanup_stale_monitors(now_ts: float) -> None:
    """Drop session monitors that have been idle longer than the session TTL.

    A monitor is removed from ``session_monitors`` when its ``last_seen_ts``
    is set (greater than 0) and ``now_ts - last_seen_ts`` exceeds
    ``PROXY_SESSION_TTL_SECS``. Monitors that were never touched are kept.
    """
    # Iterate over a snapshot so entries can be removed while scanning.
    for sid, mon in list(session_monitors.items()):
        if mon.last_seen_ts <= 0:
            continue
        idle_for = now_ts - mon.last_seen_ts
        if idle_for > PROXY_SESSION_TTL_SECS:
            session_monitors.pop(sid, None)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def get_session_monitor(session_id: str) -> SessionMonitor:
    """Return the monitor for ``session_id``, creating it on first use.

    Every call first evicts stale monitors, then marks the returned monitor
    as seen. A monitor whose context window is still unset (<= 0) is given
    the current ``default_context_window``.
    """
    _cleanup_stale_monitors(time.time())

    try:
        tracked = session_monitors[session_id]
    except KeyError:
        tracked = SessionMonitor(context_window=default_context_window)
        session_monitors[session_id] = tracked

    tracked.touch()
    # The default may have been detected after this monitor was created.
    if tracked.context_window <= 0:
        tracked.context_window = default_context_window

    return tracked
|
|
218
393
|
|
|
219
394
|
|
|
220
395
|
# ---------------------------------------------------------------------------
|
|
@@ -240,7 +415,8 @@ async def detect_context_window(client: httpx.AsyncClient) -> int:
|
|
|
240
415
|
if n_ctx > 0:
|
|
241
416
|
logger.info(
|
|
242
417
|
"Auto-detected context window from upstream: %d tokens (%d slots)",
|
|
243
|
-
n_ctx,
|
|
418
|
+
n_ctx,
|
|
419
|
+
len(slots),
|
|
244
420
|
)
|
|
245
421
|
return n_ctx
|
|
246
422
|
except Exception as exc:
|
|
@@ -314,7 +490,9 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
|
|
|
314
490
|
return tokens
|
|
315
491
|
|
|
316
492
|
|
|
317
|
-
def prune_conversation(
|
|
493
|
+
def prune_conversation(
|
|
494
|
+
anthropic_body: dict, context_window: int, target_fraction: float = 0.65
|
|
495
|
+
) -> dict:
|
|
318
496
|
"""Prune the conversation to fit within the context window.
|
|
319
497
|
|
|
320
498
|
Strategy:
|
|
@@ -361,19 +539,24 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
361
539
|
|
|
362
540
|
# Always keep the first user message and the last N messages
|
|
363
541
|
KEEP_LAST = 8 # Keep the last 8 messages (recent context)
|
|
364
|
-
protected_head = messages[:1]
|
|
365
|
-
protected_tail =
|
|
542
|
+
protected_head = messages[:1] # First user message
|
|
543
|
+
protected_tail = (
|
|
544
|
+
messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
|
|
545
|
+
)
|
|
366
546
|
middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []
|
|
367
547
|
|
|
368
548
|
# Calculate tokens for protected messages
|
|
369
|
-
protected_tokens = sum(
|
|
549
|
+
protected_tokens = sum(
|
|
550
|
+
estimate_message_tokens(m) for m in protected_head + protected_tail
|
|
551
|
+
)
|
|
370
552
|
|
|
371
553
|
if protected_tokens >= message_budget:
|
|
372
554
|
# Even protected messages exceed budget -- truncate tool_result content
|
|
373
555
|
# in the tail to fit
|
|
374
556
|
logger.warning(
|
|
375
557
|
"Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
|
|
376
|
-
protected_tokens,
|
|
558
|
+
protected_tokens,
|
|
559
|
+
message_budget,
|
|
377
560
|
)
|
|
378
561
|
for msg in protected_tail:
|
|
379
562
|
content = msg.get("content", [])
|
|
@@ -382,7 +565,11 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
382
565
|
if isinstance(block, dict) and block.get("type") == "tool_result":
|
|
383
566
|
result_text = _extract_text(block.get("content", ""))
|
|
384
567
|
if len(result_text) > 2000:
|
|
385
|
-
block["content"] =
|
|
568
|
+
block["content"] = (
|
|
569
|
+
result_text[:1000]
|
|
570
|
+
+ "\n...[TRUNCATED]...\n"
|
|
571
|
+
+ result_text[-500:]
|
|
572
|
+
)
|
|
386
573
|
anthropic_body["messages"] = protected_head + protected_tail
|
|
387
574
|
return anthropic_body
|
|
388
575
|
|
|
@@ -402,8 +589,7 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
402
589
|
|
|
403
590
|
if isinstance(content, list):
|
|
404
591
|
is_tool_result = any(
|
|
405
|
-
isinstance(b, dict) and b.get("type") == "tool_result"
|
|
406
|
-
for b in content
|
|
592
|
+
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
|
407
593
|
)
|
|
408
594
|
|
|
409
595
|
# Lower priority = removed first
|
|
@@ -445,12 +631,17 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
445
631
|
f"The conversation continues from recent context below.]"
|
|
446
632
|
),
|
|
447
633
|
}
|
|
448
|
-
anthropic_body["messages"] =
|
|
634
|
+
anthropic_body["messages"] = (
|
|
635
|
+
protected_head + [prune_marker] + kept_msgs + protected_tail
|
|
636
|
+
)
|
|
449
637
|
logger.warning(
|
|
450
638
|
"PRUNED: removed %d messages (~%d tokens), kept %d messages, "
|
|
451
639
|
"target=%.0f%% of %d ctx",
|
|
452
|
-
removed_count,
|
|
453
|
-
|
|
640
|
+
removed_count,
|
|
641
|
+
removed_tokens,
|
|
642
|
+
len(anthropic_body["messages"]),
|
|
643
|
+
target_fraction * 100,
|
|
644
|
+
context_window,
|
|
454
645
|
)
|
|
455
646
|
else:
|
|
456
647
|
anthropic_body["messages"] = protected_head + kept_msgs + protected_tail
|
|
@@ -470,12 +661,13 @@ http_client: httpx.AsyncClient | None = None
|
|
|
470
661
|
async def lifespan(app: FastAPI):
|
|
471
662
|
"""Manage the httpx client lifecycle with the FastAPI app."""
|
|
472
663
|
global http_client
|
|
664
|
+
global default_context_window
|
|
473
665
|
http_client = httpx.AsyncClient(
|
|
474
666
|
timeout=httpx.Timeout(
|
|
475
|
-
connect=10.0,
|
|
476
|
-
read=PROXY_READ_TIMEOUT,
|
|
477
|
-
write=30.0,
|
|
478
|
-
pool=10.0,
|
|
667
|
+
connect=10.0, # 10s to establish connection
|
|
668
|
+
read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
|
|
669
|
+
write=30.0, # 30s to send the request body
|
|
670
|
+
pool=10.0, # 10s to acquire a pool connection
|
|
479
671
|
),
|
|
480
672
|
limits=httpx.Limits(
|
|
481
673
|
max_connections=PROXY_MAX_CONNECTIONS,
|
|
@@ -485,14 +677,19 @@ async def lifespan(app: FastAPI):
|
|
|
485
677
|
)
|
|
486
678
|
logger.info(
|
|
487
679
|
"Proxy started: listening on %s:%d -> upstream %s",
|
|
488
|
-
PROXY_HOST,
|
|
680
|
+
PROXY_HOST,
|
|
681
|
+
PROXY_PORT,
|
|
682
|
+
LLAMA_CPP_BASE,
|
|
489
683
|
)
|
|
490
684
|
|
|
491
685
|
# Auto-detect context window from upstream server
|
|
492
|
-
|
|
686
|
+
default_context_window = await detect_context_window(http_client)
|
|
687
|
+
for mon in session_monitors.values():
|
|
688
|
+
if mon.context_window <= 0:
|
|
689
|
+
mon.context_window = default_context_window
|
|
493
690
|
logger.info(
|
|
494
691
|
"Context window: %d tokens, prune threshold: %.0f%%",
|
|
495
|
-
|
|
692
|
+
default_context_window,
|
|
496
693
|
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
497
694
|
)
|
|
498
695
|
|
|
@@ -514,6 +711,7 @@ app = FastAPI(
|
|
|
514
711
|
# Request Translation: Anthropic -> OpenAI
|
|
515
712
|
# ===========================================================================
|
|
516
713
|
|
|
714
|
+
|
|
517
715
|
def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
518
716
|
"""Convert Anthropic message format to OpenAI message format.
|
|
519
717
|
|
|
@@ -551,25 +749,33 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
551
749
|
elif block.get("type") == "text":
|
|
552
750
|
parts.append(block.get("text", ""))
|
|
553
751
|
elif block.get("type") == "tool_use":
|
|
554
|
-
messages.append(
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
"
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
752
|
+
messages.append(
|
|
753
|
+
{
|
|
754
|
+
"role": "assistant",
|
|
755
|
+
"content": None,
|
|
756
|
+
"tool_calls": [
|
|
757
|
+
{
|
|
758
|
+
"id": block.get(
|
|
759
|
+
"id", f"call_{uuid.uuid4().hex[:8]}"
|
|
760
|
+
),
|
|
761
|
+
"type": "function",
|
|
762
|
+
"function": {
|
|
763
|
+
"name": block["name"],
|
|
764
|
+
"arguments": json.dumps(block.get("input", {})),
|
|
765
|
+
},
|
|
766
|
+
}
|
|
767
|
+
],
|
|
768
|
+
}
|
|
769
|
+
)
|
|
566
770
|
continue
|
|
567
771
|
elif block.get("type") == "tool_result":
|
|
568
|
-
messages.append(
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
772
|
+
messages.append(
|
|
773
|
+
{
|
|
774
|
+
"role": "tool",
|
|
775
|
+
"tool_call_id": block.get("tool_use_id", ""),
|
|
776
|
+
"content": _extract_text(block.get("content", "")),
|
|
777
|
+
}
|
|
778
|
+
)
|
|
573
779
|
continue
|
|
574
780
|
if parts:
|
|
575
781
|
messages.append({"role": role, "content": "\n".join(parts)})
|
|
@@ -602,7 +808,77 @@ _AGENTIC_SYSTEM_SUPPLEMENT = (
|
|
|
602
808
|
)
|
|
603
809
|
|
|
604
810
|
|
|
605
|
-
def
|
|
811
|
+
def _content_fingerprint(content) -> str:
    """Return a short, bounded text summary of message content.

    - ``str``: the first 512 characters.
    - ``list``: one part per recognised block, joined with newlines and cut
      to 1024 characters. Plain strings are used as-is, ``text`` blocks
      contribute their text, ``tool_use`` blocks contribute ``tool:<name>``
      and ``tool_result`` blocks contribute ``result:<tool_use_id>``. Other
      blocks are ignored.
    - anything else: the first 512 characters of ``str(content)``.
    """
    if isinstance(content, str):
        return content[:512]
    if not isinstance(content, list):
        return str(content)[:512]

    parts: list[str] = []
    for block in content:
        if isinstance(block, str):
            parts.append(block)
            continue
        if not isinstance(block, dict):
            continue

        btype = block.get("type", "")
        if btype == "text":
            text = block.get("text", "")
            # The body comes from the client: a null or non-string "text"
            # would otherwise make str.join raise TypeError.
            if text is None:
                text = ""
            parts.append(text if isinstance(text, str) else str(text))
        elif btype == "tool_use":
            parts.append(f"tool:{block.get('name', '')}")
        elif btype == "tool_result":
            parts.append(f"result:{block.get('tool_use_id', '')}")

    return "\n".join(parts)[:1024]
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
def resolve_session_id(request: Request, anthropic_body: dict) -> str:
    """Derive a stable session key for a request.

    Sources are tried in order and the first non-empty value wins:

    1. A session header, returned as ``hdr:<value>``.
    2. A ``metadata`` entry in the body (``session_id``,
       ``conversation_id`` or ``thread_id``), returned as ``meta:<value>``.
    3. A fingerprint, returned as ``fp:<20 hex chars>``: a SHA-256 over the
       client host, the model, the system prompt fingerprint and the first
       user message fingerprint. Requests that agree on all four map to the
       same key.
    """
    # 1) Explicit session header.
    for header_name in (
        "x-uap-session-id",
        "x-claude-session-id",
        "anthropic-session-id",
        "x-session-id",
    ):
        header_value = request.headers.get(header_name)
        if header_value:
            return f"hdr:{header_value}"

    # 2) Session-like identifier in the request metadata.
    meta = anthropic_body.get("metadata", {})
    if isinstance(meta, dict):
        for meta_key in ("session_id", "conversation_id", "thread_id"):
            meta_value = meta.get(meta_key)
            if meta_value:
                return f"meta:{meta_value}"

    # 3) Fall back to a content-derived fingerprint.
    opening_user_text = next(
        (
            _content_fingerprint(entry.get("content", ""))
            for entry in anthropic_body.get("messages", [])
            if entry.get("role") == "user"
        ),
        "",
    )
    system_text = _content_fingerprint(anthropic_body.get("system", ""))
    model_name = anthropic_body.get("model", "default")
    client_host = request.client.host if request.client else "unknown"

    material = f"{client_host}|{model_name}|{system_text}|{opening_user_text}"
    hex_digest = hashlib.sha256(
        material.encode("utf-8", errors="ignore")
    ).hexdigest()
    return f"fp:{hex_digest[:20]}"
|
|
865
|
+
|
|
866
|
+
|
|
867
|
+
def _last_user_has_tool_result(anthropic_body: dict) -> bool:
    """Report whether the most recent user message carries a tool_result.

    Only the last message with role ``user`` is inspected. Returns False
    when there is no user message or when its content is not a list of
    blocks.
    """
    latest_user = next(
        (
            entry
            for entry in reversed(anthropic_body.get("messages", []))
            if entry.get("role") == "user"
        ),
        None,
    )
    if latest_user is None:
        return False

    blocks = latest_user.get("content")
    if not isinstance(blocks, list):
        return False

    for item in blocks:
        if isinstance(item, dict) and item.get("type") == "tool_result":
            return True
    return False
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
|
|
606
882
|
"""Build an OpenAI Chat Completions request from an Anthropic Messages request."""
|
|
607
883
|
openai_body = {
|
|
608
884
|
"model": anthropic_body.get("model", "default"),
|
|
@@ -616,10 +892,13 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
616
892
|
openai_body["messages"][0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
|
|
617
893
|
else:
|
|
618
894
|
# No system message from the client; inject one.
|
|
619
|
-
openai_body["messages"].insert(
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
895
|
+
openai_body["messages"].insert(
|
|
896
|
+
0,
|
|
897
|
+
{
|
|
898
|
+
"role": "system",
|
|
899
|
+
"content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
|
|
900
|
+
},
|
|
901
|
+
)
|
|
623
902
|
|
|
624
903
|
if "max_tokens" in anthropic_body:
|
|
625
904
|
# Enforce minimum floor for thinking mode: model needs tokens for
|
|
@@ -632,7 +911,7 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
632
911
|
# Formula: max_tokens = min(requested, context_window - input_tokens - safety_margin)
|
|
633
912
|
# This ensures the model's output + current input stays within bounds,
|
|
634
913
|
# leaving room for the next turn's incremental growth.
|
|
635
|
-
ctx_window =
|
|
914
|
+
ctx_window = monitor.context_window
|
|
636
915
|
if ctx_window > 0:
|
|
637
916
|
estimated_input = estimate_total_tokens(anthropic_body)
|
|
638
917
|
# Reserve 15% of context for next-turn growth (tool results, etc.)
|
|
@@ -641,8 +920,11 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
641
920
|
if available_for_output < requested_max and available_for_output > 1024:
|
|
642
921
|
logger.info(
|
|
643
922
|
"MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
|
|
644
|
-
requested_max,
|
|
645
|
-
|
|
923
|
+
requested_max,
|
|
924
|
+
available_for_output,
|
|
925
|
+
ctx_window,
|
|
926
|
+
estimated_input,
|
|
927
|
+
safety_margin,
|
|
646
928
|
)
|
|
647
929
|
requested_max = available_for_output
|
|
648
930
|
elif available_for_output <= 1024:
|
|
@@ -650,7 +932,9 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
650
932
|
logger.warning(
|
|
651
933
|
"MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
|
|
652
934
|
"Response may be truncated.",
|
|
653
|
-
available_for_output,
|
|
935
|
+
available_for_output,
|
|
936
|
+
ctx_window,
|
|
937
|
+
estimated_input,
|
|
654
938
|
)
|
|
655
939
|
requested_max = max(1024, available_for_output)
|
|
656
940
|
|
|
@@ -666,14 +950,16 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
666
950
|
if "tools" in anthropic_body:
|
|
667
951
|
openai_body["tools"] = []
|
|
668
952
|
for tool in anthropic_body["tools"]:
|
|
669
|
-
openai_body["tools"].append(
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
"
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
953
|
+
openai_body["tools"].append(
|
|
954
|
+
{
|
|
955
|
+
"type": "function",
|
|
956
|
+
"function": {
|
|
957
|
+
"name": tool["name"],
|
|
958
|
+
"description": tool.get("description", ""),
|
|
959
|
+
"parameters": tool.get("input_schema", {}),
|
|
960
|
+
},
|
|
961
|
+
}
|
|
962
|
+
)
|
|
677
963
|
|
|
678
964
|
# Smart tool_choice: force tool calls during the agentic loop to
|
|
679
965
|
# prevent the model from producing text-only end_turn responses that
|
|
@@ -684,24 +970,136 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
684
970
|
# - More than 1 message (conversation is in progress)
|
|
685
971
|
# - Last assistant was text-only (would cause premature stop)
|
|
686
972
|
# - OR conversation has tool_result messages (active agentic loop)
|
|
973
|
+
#
|
|
974
|
+
# LOOP PROTECTION: Release to "auto" if the session monitor detects
|
|
975
|
+
# a tool call loop (same tools called repeatedly), to prevent
|
|
976
|
+
# runaway token consumption.
|
|
687
977
|
n_msgs = len(anthropic_body.get("messages", []))
|
|
688
978
|
has_tool_results = any(
|
|
689
|
-
isinstance(m.get("content"), list)
|
|
979
|
+
isinstance(m.get("content"), list)
|
|
980
|
+
and any(
|
|
690
981
|
isinstance(b, dict) and b.get("type") == "tool_result"
|
|
691
982
|
for b in m.get("content", [])
|
|
692
983
|
)
|
|
693
984
|
for m in anthropic_body.get("messages", [])
|
|
694
985
|
)
|
|
695
|
-
|
|
986
|
+
|
|
987
|
+
# Record tool calls from the last assistant message for loop detection
|
|
988
|
+
_record_last_assistant_tool_calls(anthropic_body, monitor)
|
|
989
|
+
last_user_has_tool_result = _last_user_has_tool_result(anthropic_body)
|
|
990
|
+
|
|
991
|
+
# Check if loop breaker should override tool_choice
|
|
992
|
+
if monitor.should_release_tool_choice():
|
|
993
|
+
openai_body["tool_choice"] = "auto"
|
|
994
|
+
monitor.consecutive_forced_count = 0
|
|
995
|
+
monitor.no_progress_streak = 0
|
|
996
|
+
logger.warning("tool_choice set to 'auto' by LOOP BREAKER")
|
|
997
|
+
elif _last_assistant_was_text_only(anthropic_body):
|
|
696
998
|
openai_body["tool_choice"] = "required"
|
|
697
|
-
|
|
999
|
+
monitor.consecutive_forced_count += 1
|
|
1000
|
+
monitor.no_progress_streak = (
|
|
1001
|
+
0 if last_user_has_tool_result else monitor.no_progress_streak + 1
|
|
1002
|
+
)
|
|
1003
|
+
logger.info(
|
|
1004
|
+
"tool_choice forced to 'required' (last assistant was text-only)"
|
|
1005
|
+
)
|
|
698
1006
|
elif has_tool_results and n_msgs > 2:
|
|
699
1007
|
openai_body["tool_choice"] = "required"
|
|
700
|
-
|
|
1008
|
+
monitor.consecutive_forced_count += 1
|
|
1009
|
+
monitor.no_progress_streak = (
|
|
1010
|
+
0 if last_user_has_tool_result else monitor.no_progress_streak + 1
|
|
1011
|
+
)
|
|
1012
|
+
logger.info(
|
|
1013
|
+
"tool_choice forced to 'required' (active agentic loop with tool results)"
|
|
1014
|
+
)
|
|
1015
|
+
else:
|
|
1016
|
+
monitor.consecutive_forced_count = 0
|
|
1017
|
+
monitor.no_progress_streak = 0
|
|
701
1018
|
|
|
702
1019
|
return openai_body
|
|
703
1020
|
|
|
704
1021
|
|
|
1022
|
+
def _record_last_assistant_tool_calls(anthropic_body: dict, monitor: SessionMonitor):
    """Feed the latest assistant turn's tool names to the session monitor.

    Looks only at the most recent assistant message. If its content is a
    list containing ``tool_use`` blocks, their names (``"unknown"`` when a
    block has no name) are passed to ``monitor.record_tool_calls``. Nothing
    is recorded when that message has no tool_use blocks.
    """
    latest_assistant = next(
        (
            entry
            for entry in reversed(anthropic_body.get("messages", []))
            if entry.get("role") == "assistant"
        ),
        None,
    )
    if latest_assistant is None:
        return

    blocks = latest_assistant.get("content")
    if not isinstance(blocks, list):
        return

    called = [
        item.get("name", "unknown")
        for item in blocks
        if isinstance(item, dict) and item.get("type") == "tool_use"
    ]
    if called:
        monitor.record_tool_calls(called)
|
|
1038
|
+
|
|
1039
|
+
|
|
1040
|
+
def _is_unexpected_end_turn(openai_resp: dict, anthropic_body: dict) -> bool:
    """Report whether the upstream reply ended the turn without a tool call.

    Returns True only when all of the following hold:

    - the first choice finished with ``stop`` or ``end_turn``;
    - that choice's message has no ``tool_calls``;
    - the request declared ``tools``;
    - the conversation contains a ``tool_result`` block, or the last
      assistant message was text-only.
    """
    candidates = openai_resp.get("choices") or []
    if not candidates:
        return False

    first = candidates[0]
    if first.get("finish_reason") not in {"stop", "end_turn"}:
        return False

    if first.get("message", {}).get("tool_calls"):
        return False

    if "tools" not in anthropic_body:
        return False

    # Any tool_result anywhere in the history marks an active tool loop.
    for entry in anthropic_body.get("messages", []):
        blocks = entry.get("content")
        if not isinstance(blocks, list):
            continue
        if any(
            isinstance(item, dict) and item.get("type") == "tool_result"
            for item in blocks
        ):
            return True

    return _last_assistant_was_text_only(anthropic_body)
|
|
1067
|
+
|
|
1068
|
+
|
|
1069
|
+
def _sanitize_reasoning_fallback_text(reasoning_text: str) -> str:
    """Turn raw reasoning output into a compact single-line snippet.

    Removes ``<think>`` / ``</think>`` tags (case-insensitive), collapses
    every whitespace run to one space and trims the ends. Text longer than
    ``PROXY_STREAM_REASONING_MAX_CHARS`` is cut to that many characters,
    right-stripped, and suffixed with ``...``. Returns an empty string when
    nothing remains after cleaning.
    """
    without_tags = re.sub(r"</?think>", "", reasoning_text, flags=re.IGNORECASE)
    collapsed = re.sub(r"\s+", " ", without_tags).strip()
    if not collapsed:
        return ""

    if len(collapsed) <= PROXY_STREAM_REASONING_MAX_CHARS:
        return collapsed
    return collapsed[:PROXY_STREAM_REASONING_MAX_CHARS].rstrip() + "..."
|
|
1077
|
+
|
|
1078
|
+
|
|
1079
|
+
def _build_reasoning_fallback_text(
|
|
1080
|
+
reasoning_chunks: list[str], mode: str | None = None
|
|
1081
|
+
) -> str | None:
|
|
1082
|
+
fallback_mode = (mode or PROXY_STREAM_REASONING_FALLBACK).strip().lower()
|
|
1083
|
+
if fallback_mode == "off":
|
|
1084
|
+
return None
|
|
1085
|
+
|
|
1086
|
+
raw_text = "".join(reasoning_chunks).strip()
|
|
1087
|
+
if not raw_text:
|
|
1088
|
+
return None
|
|
1089
|
+
|
|
1090
|
+
if fallback_mode == "visible":
|
|
1091
|
+
return raw_text
|
|
1092
|
+
if fallback_mode == "sanitized":
|
|
1093
|
+
sanitized = _sanitize_reasoning_fallback_text(raw_text)
|
|
1094
|
+
return sanitized or None
|
|
1095
|
+
|
|
1096
|
+
logger.warning(
|
|
1097
|
+
"Unknown PROXY_STREAM_REASONING_FALLBACK=%r; disabling reasoning fallback",
|
|
1098
|
+
fallback_mode,
|
|
1099
|
+
)
|
|
1100
|
+
return None
|
|
1101
|
+
|
|
1102
|
+
|
|
705
1103
|
def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
706
1104
|
"""Check if the last assistant message in the conversation was text-only
|
|
707
1105
|
(no tool_use blocks). This indicates the model may be prematurely ending
|
|
@@ -717,11 +1115,14 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
|
717
1115
|
return bool(content.strip())
|
|
718
1116
|
if isinstance(content, list):
|
|
719
1117
|
has_tool_use = any(
|
|
720
|
-
isinstance(b, dict) and b.get("type") == "tool_use"
|
|
721
|
-
for b in content
|
|
1118
|
+
isinstance(b, dict) and b.get("type") == "tool_use" for b in content
|
|
722
1119
|
)
|
|
723
1120
|
has_text = any(
|
|
724
|
-
(
|
|
1121
|
+
(
|
|
1122
|
+
isinstance(b, dict)
|
|
1123
|
+
and b.get("type") == "text"
|
|
1124
|
+
and b.get("text", "").strip()
|
|
1125
|
+
)
|
|
725
1126
|
or isinstance(b, str)
|
|
726
1127
|
for b in content
|
|
727
1128
|
)
|
|
@@ -735,6 +1136,7 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
|
735
1136
|
# Response Translation: OpenAI -> Anthropic
|
|
736
1137
|
# ===========================================================================
|
|
737
1138
|
|
|
1139
|
+
|
|
738
1140
|
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
739
1141
|
"""Convert an OpenAI Chat Completions response to Anthropic Messages format."""
|
|
740
1142
|
choice = openai_resp.get("choices", [{}])[0]
|
|
@@ -752,12 +1154,14 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
752
1154
|
args = json.loads(fn.get("arguments", "{}"))
|
|
753
1155
|
except json.JSONDecodeError:
|
|
754
1156
|
args = {}
|
|
755
|
-
content.append(
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
1157
|
+
content.append(
|
|
1158
|
+
{
|
|
1159
|
+
"type": "tool_use",
|
|
1160
|
+
"id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
|
|
1161
|
+
"name": fn.get("name", ""),
|
|
1162
|
+
"input": args,
|
|
1163
|
+
}
|
|
1164
|
+
)
|
|
761
1165
|
|
|
762
1166
|
stop_reason_map = {
|
|
763
1167
|
"stop": "end_turn",
|
|
@@ -787,7 +1191,13 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
787
1191
|
# Streaming Translation: OpenAI SSE -> Anthropic SSE
|
|
788
1192
|
# ===========================================================================
|
|
789
1193
|
|
|
790
|
-
|
|
1194
|
+
|
|
1195
|
+
async def stream_anthropic_response(
|
|
1196
|
+
openai_stream: httpx.Response,
|
|
1197
|
+
model: str,
|
|
1198
|
+
monitor: SessionMonitor,
|
|
1199
|
+
anthropic_body: dict,
|
|
1200
|
+
):
|
|
791
1201
|
"""Convert an OpenAI streaming response to Anthropic SSE stream format.
|
|
792
1202
|
|
|
793
1203
|
Handles:
|
|
@@ -810,7 +1220,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
810
1220
|
f"data: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
|
|
811
1221
|
)
|
|
812
1222
|
|
|
813
|
-
yield
|
|
1223
|
+
yield 'event: ping\ndata: {"type": "ping"}\n\n'
|
|
814
1224
|
|
|
815
1225
|
output_tokens = 0
|
|
816
1226
|
finish_reason = "end_turn"
|
|
@@ -939,21 +1349,29 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
939
1349
|
f"data: {json.dumps({'type': 'content_block_stop', 'index': tc['block_index']})}\n\n"
|
|
940
1350
|
)
|
|
941
1351
|
else:
|
|
942
|
-
#
|
|
943
|
-
#
|
|
944
|
-
#
|
|
1352
|
+
# If the response has no text and no tool calls, optionally emit a
|
|
1353
|
+
# reasoning fallback (configurable) to avoid leaking malformed
|
|
1354
|
+
# internal chain-of-thought content by default.
|
|
945
1355
|
accumulated_text = "".join(text_chunks)
|
|
946
1356
|
if not accumulated_text and reasoning_chunks:
|
|
947
|
-
fallback_text =
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
1357
|
+
fallback_text = _build_reasoning_fallback_text(reasoning_chunks)
|
|
1358
|
+
if fallback_text:
|
|
1359
|
+
logger.warning(
|
|
1360
|
+
"Empty response with %d reasoning chunks – emitting fallback text (mode=%s)",
|
|
1361
|
+
len(reasoning_chunks),
|
|
1362
|
+
PROXY_STREAM_REASONING_FALLBACK,
|
|
1363
|
+
)
|
|
1364
|
+
text_chunks.append(fallback_text)
|
|
1365
|
+
yield (
|
|
1366
|
+
f"event: content_block_delta\n"
|
|
1367
|
+
f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
|
|
1368
|
+
)
|
|
1369
|
+
else:
|
|
1370
|
+
logger.warning(
|
|
1371
|
+
"Empty response with %d reasoning chunks – fallback suppressed (mode=%s)",
|
|
1372
|
+
len(reasoning_chunks),
|
|
1373
|
+
PROXY_STREAM_REASONING_FALLBACK,
|
|
1374
|
+
)
|
|
957
1375
|
|
|
958
1376
|
yield (
|
|
959
1377
|
f"event: content_block_stop\n"
|
|
@@ -962,17 +1380,52 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
962
1380
|
|
|
963
1381
|
# Log response summary
|
|
964
1382
|
accumulated_text = "".join(text_chunks)
|
|
965
|
-
tc_names =
|
|
966
|
-
|
|
1383
|
+
tc_names = (
|
|
1384
|
+
[tc["name"] for tc in tool_calls_by_index.values()]
|
|
1385
|
+
if tool_calls_by_index
|
|
1386
|
+
else []
|
|
1387
|
+
)
|
|
1388
|
+
tc_args = (
|
|
1389
|
+
[tc.get("arguments", "") for tc in tool_calls_by_index.values()]
|
|
1390
|
+
if tool_calls_by_index
|
|
1391
|
+
else []
|
|
1392
|
+
)
|
|
967
1393
|
logger.info(
|
|
968
1394
|
"RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
|
|
969
|
-
finish_reason,
|
|
1395
|
+
finish_reason,
|
|
1396
|
+
output_tokens,
|
|
970
1397
|
len(accumulated_text),
|
|
971
1398
|
accumulated_text[:300],
|
|
972
1399
|
tc_names,
|
|
973
1400
|
[a[:200] for a in tc_args],
|
|
974
1401
|
)
|
|
975
1402
|
|
|
1403
|
+
if _is_unexpected_end_turn(
|
|
1404
|
+
{
|
|
1405
|
+
"choices": [
|
|
1406
|
+
{
|
|
1407
|
+
"finish_reason": "stop"
|
|
1408
|
+
if finish_reason == "end_turn"
|
|
1409
|
+
else finish_reason,
|
|
1410
|
+
"message": {
|
|
1411
|
+
"content": accumulated_text,
|
|
1412
|
+
"tool_calls": [
|
|
1413
|
+
{
|
|
1414
|
+
"function": {
|
|
1415
|
+
"name": tc["name"],
|
|
1416
|
+
"arguments": tc.get("arguments", ""),
|
|
1417
|
+
}
|
|
1418
|
+
}
|
|
1419
|
+
for tc in tool_calls_by_index.values()
|
|
1420
|
+
],
|
|
1421
|
+
},
|
|
1422
|
+
}
|
|
1423
|
+
]
|
|
1424
|
+
},
|
|
1425
|
+
anthropic_body,
|
|
1426
|
+
):
|
|
1427
|
+
monitor.unexpected_end_turn_count += 1
|
|
1428
|
+
|
|
976
1429
|
# message_delta with final stop reason
|
|
977
1430
|
yield (
|
|
978
1431
|
f"event: message_delta\n"
|
|
@@ -987,6 +1440,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
987
1440
|
# API Endpoints
|
|
988
1441
|
# ===========================================================================
|
|
989
1442
|
|
|
1443
|
+
|
|
990
1444
|
@app.post("/v1/messages")
|
|
991
1445
|
async def messages(request: Request):
|
|
992
1446
|
"""Handle Anthropic Messages API requests (streaming and non-streaming).
|
|
@@ -997,9 +1451,14 @@ async def messages(request: Request):
|
|
|
997
1451
|
- Option E: Smart max_tokens capping (in build_openai_request)
|
|
998
1452
|
- Option F: Session-level token monitoring with warnings
|
|
999
1453
|
"""
|
|
1454
|
+
global last_session_id
|
|
1455
|
+
|
|
1000
1456
|
body = await request.json()
|
|
1001
1457
|
model = body.get("model", "default")
|
|
1002
1458
|
is_stream = body.get("stream", False)
|
|
1459
|
+
session_id = resolve_session_id(request, body)
|
|
1460
|
+
monitor = get_session_monitor(session_id)
|
|
1461
|
+
last_session_id = session_id
|
|
1003
1462
|
|
|
1004
1463
|
# Debug: log request summary
|
|
1005
1464
|
n_messages = len(body.get("messages", []))
|
|
@@ -1009,42 +1468,51 @@ async def messages(request: Request):
|
|
|
1009
1468
|
last_role = last_msg.get("role", "?")
|
|
1010
1469
|
last_content = last_msg.get("content", "")
|
|
1011
1470
|
if isinstance(last_content, list):
|
|
1012
|
-
last_text = next(
|
|
1471
|
+
last_text = next(
|
|
1472
|
+
(b.get("text", "") for b in last_content if b.get("type") == "text"), ""
|
|
1473
|
+
)[:200]
|
|
1013
1474
|
elif isinstance(last_content, str):
|
|
1014
1475
|
last_text = last_content[:200]
|
|
1015
1476
|
else:
|
|
1016
1477
|
last_text = str(last_content)[:200]
|
|
1017
1478
|
logger.info(
|
|
1018
1479
|
"REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
1019
|
-
is_stream,
|
|
1480
|
+
is_stream,
|
|
1481
|
+
n_messages,
|
|
1482
|
+
n_tools,
|
|
1483
|
+
max_tokens,
|
|
1484
|
+
last_role,
|
|
1485
|
+
last_text,
|
|
1020
1486
|
)
|
|
1021
1487
|
|
|
1022
1488
|
# --- Option F: Estimate tokens and record in session monitor ---
|
|
1023
1489
|
estimated_tokens = estimate_total_tokens(body)
|
|
1024
|
-
|
|
1025
|
-
|
|
1490
|
+
monitor.record_request(estimated_tokens)
|
|
1491
|
+
monitor.log_status()
|
|
1026
1492
|
|
|
1027
1493
|
# --- Option C: Prune conversation if approaching context limit ---
|
|
1028
|
-
ctx_window =
|
|
1494
|
+
ctx_window = monitor.context_window
|
|
1029
1495
|
if ctx_window > 0:
|
|
1030
1496
|
utilization = estimated_tokens / ctx_window
|
|
1031
1497
|
if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
1032
1498
|
logger.warning(
|
|
1033
1499
|
"Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
|
|
1034
|
-
utilization * 100,
|
|
1500
|
+
utilization * 100,
|
|
1501
|
+
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
1035
1502
|
)
|
|
1036
1503
|
body = prune_conversation(body, ctx_window, target_fraction=0.65)
|
|
1037
|
-
|
|
1504
|
+
monitor.prune_count += 1
|
|
1038
1505
|
# Re-estimate after pruning
|
|
1039
1506
|
estimated_tokens = estimate_total_tokens(body)
|
|
1040
|
-
|
|
1507
|
+
monitor.record_request(estimated_tokens)
|
|
1041
1508
|
n_messages = len(body.get("messages", []))
|
|
1042
1509
|
logger.info(
|
|
1043
1510
|
"After pruning: ~%d tokens, %d messages",
|
|
1044
|
-
estimated_tokens,
|
|
1511
|
+
estimated_tokens,
|
|
1512
|
+
n_messages,
|
|
1045
1513
|
)
|
|
1046
1514
|
|
|
1047
|
-
openai_body = build_openai_request(body)
|
|
1515
|
+
openai_body = build_openai_request(body, monitor)
|
|
1048
1516
|
|
|
1049
1517
|
client = http_client
|
|
1050
1518
|
if client is None:
|
|
@@ -1062,6 +1530,7 @@ async def messages(request: Request):
|
|
|
1062
1530
|
MAX_UPSTREAM_RETRIES = 3
|
|
1063
1531
|
RETRY_DELAY_SECS = 5.0
|
|
1064
1532
|
last_exc: Exception | None = None
|
|
1533
|
+
resp: httpx.Response | None = None
|
|
1065
1534
|
|
|
1066
1535
|
for attempt in range(MAX_UPSTREAM_RETRIES):
|
|
1067
1536
|
try:
|
|
@@ -1082,25 +1551,46 @@ async def messages(request: Request):
|
|
|
1082
1551
|
if attempt < MAX_UPSTREAM_RETRIES - 1:
|
|
1083
1552
|
logger.warning(
|
|
1084
1553
|
"Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
|
|
1085
|
-
attempt + 1,
|
|
1086
|
-
|
|
1554
|
+
attempt + 1,
|
|
1555
|
+
MAX_UPSTREAM_RETRIES,
|
|
1556
|
+
type(exc).__name__,
|
|
1557
|
+
RETRY_DELAY_SECS,
|
|
1087
1558
|
)
|
|
1088
1559
|
await asyncio.sleep(RETRY_DELAY_SECS)
|
|
1089
1560
|
else:
|
|
1090
1561
|
logger.error(
|
|
1091
1562
|
"Upstream connect failed after %d attempts: %s: %s",
|
|
1092
|
-
MAX_UPSTREAM_RETRIES,
|
|
1563
|
+
MAX_UPSTREAM_RETRIES,
|
|
1564
|
+
type(exc).__name__,
|
|
1565
|
+
exc,
|
|
1093
1566
|
)
|
|
1094
1567
|
|
|
1095
1568
|
if last_exc is not None:
|
|
1096
1569
|
return Response(
|
|
1097
|
-
content=json.dumps(
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
"
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1570
|
+
content=json.dumps(
|
|
1571
|
+
{
|
|
1572
|
+
"type": "error",
|
|
1573
|
+
"error": {
|
|
1574
|
+
"type": "overloaded_error",
|
|
1575
|
+
"message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
|
|
1576
|
+
},
|
|
1577
|
+
}
|
|
1578
|
+
),
|
|
1579
|
+
status_code=529,
|
|
1580
|
+
media_type="application/json",
|
|
1581
|
+
)
|
|
1582
|
+
|
|
1583
|
+
if resp is None:
|
|
1584
|
+
return Response(
|
|
1585
|
+
content=json.dumps(
|
|
1586
|
+
{
|
|
1587
|
+
"type": "error",
|
|
1588
|
+
"error": {
|
|
1589
|
+
"type": "overloaded_error",
|
|
1590
|
+
"message": "Upstream response unavailable",
|
|
1591
|
+
},
|
|
1592
|
+
}
|
|
1593
|
+
),
|
|
1104
1594
|
status_code=529,
|
|
1105
1595
|
media_type="application/json",
|
|
1106
1596
|
)
|
|
@@ -1113,9 +1603,7 @@ async def messages(request: Request):
|
|
|
1113
1603
|
error_body = await resp.aread()
|
|
1114
1604
|
await resp.aclose()
|
|
1115
1605
|
error_text = error_body.decode("utf-8", errors="replace")[:1000]
|
|
1116
|
-
logger.error(
|
|
1117
|
-
"Upstream HTTP %d: %s", resp.status_code, error_text
|
|
1118
|
-
)
|
|
1606
|
+
logger.error("Upstream HTTP %d: %s", resp.status_code, error_text)
|
|
1119
1607
|
|
|
1120
1608
|
# Parse the error for a user-friendly message
|
|
1121
1609
|
error_message = f"Upstream server error (HTTP {resp.status_code})"
|
|
@@ -1138,47 +1626,57 @@ async def messages(request: Request):
|
|
|
1138
1626
|
)
|
|
1139
1627
|
|
|
1140
1628
|
if is_context_overflow:
|
|
1141
|
-
|
|
1629
|
+
monitor.overflow_count += 1
|
|
1142
1630
|
logger.error(
|
|
1143
1631
|
"CONTEXT OVERFLOW detected (count=%d). "
|
|
1144
1632
|
"Estimated input: %d tokens, context window: %d tokens. "
|
|
1145
1633
|
"Conversation needs pruning or context window increase.",
|
|
1146
|
-
|
|
1634
|
+
monitor.overflow_count,
|
|
1635
|
+
estimated_tokens,
|
|
1636
|
+
ctx_window,
|
|
1147
1637
|
)
|
|
1148
1638
|
# Return Anthropic-format error that Claude Code can handle
|
|
1149
1639
|
return Response(
|
|
1150
|
-
content=json.dumps(
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
"
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1640
|
+
content=json.dumps(
|
|
1641
|
+
{
|
|
1642
|
+
"type": "error",
|
|
1643
|
+
"error": {
|
|
1644
|
+
"type": "overloaded_error",
|
|
1645
|
+
"message": (
|
|
1646
|
+
f"Context window exceeded: request requires ~{estimated_tokens} tokens "
|
|
1647
|
+
f"but only {ctx_window} are available. "
|
|
1648
|
+
f"The conversation is too long. Please start a new session or "
|
|
1649
|
+
f"reduce conversation length."
|
|
1650
|
+
),
|
|
1651
|
+
},
|
|
1652
|
+
}
|
|
1653
|
+
),
|
|
1162
1654
|
status_code=529,
|
|
1163
1655
|
media_type="application/json",
|
|
1164
1656
|
)
|
|
1165
1657
|
|
|
1166
1658
|
# Generic upstream error -- return as Anthropic error format
|
|
1167
|
-
error_type =
|
|
1659
|
+
error_type = (
|
|
1660
|
+
"overloaded_error"
|
|
1661
|
+
if resp.status_code >= 500
|
|
1662
|
+
else "invalid_request_error"
|
|
1663
|
+
)
|
|
1168
1664
|
return Response(
|
|
1169
|
-
content=json.dumps(
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
"
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1665
|
+
content=json.dumps(
|
|
1666
|
+
{
|
|
1667
|
+
"type": "error",
|
|
1668
|
+
"error": {
|
|
1669
|
+
"type": error_type,
|
|
1670
|
+
"message": error_message,
|
|
1671
|
+
},
|
|
1672
|
+
}
|
|
1673
|
+
),
|
|
1176
1674
|
status_code=529 if resp.status_code >= 500 else 400,
|
|
1177
1675
|
media_type="application/json",
|
|
1178
1676
|
)
|
|
1179
1677
|
|
|
1180
1678
|
return StreamingResponse(
|
|
1181
|
-
stream_anthropic_response(resp, model),
|
|
1679
|
+
stream_anthropic_response(resp, model, monitor, body),
|
|
1182
1680
|
media_type="text/event-stream",
|
|
1183
1681
|
headers={
|
|
1184
1682
|
"Cache-Control": "no-cache",
|
|
@@ -1195,25 +1693,56 @@ async def messages(request: Request):
|
|
|
1195
1693
|
# Option B: Handle non-streaming errors too
|
|
1196
1694
|
if resp.status_code != 200:
|
|
1197
1695
|
error_text = resp.text[:1000]
|
|
1198
|
-
logger.error(
|
|
1696
|
+
logger.error(
|
|
1697
|
+
"Upstream HTTP %d (non-stream): %s", resp.status_code, error_text
|
|
1698
|
+
)
|
|
1199
1699
|
return Response(
|
|
1200
|
-
content=json.dumps(
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
"
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1700
|
+
content=json.dumps(
|
|
1701
|
+
{
|
|
1702
|
+
"type": "error",
|
|
1703
|
+
"error": {
|
|
1704
|
+
"type": "overloaded_error",
|
|
1705
|
+
"message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
|
|
1706
|
+
},
|
|
1707
|
+
}
|
|
1708
|
+
),
|
|
1207
1709
|
status_code=529,
|
|
1208
1710
|
media_type="application/json",
|
|
1209
1711
|
)
|
|
1210
1712
|
|
|
1211
1713
|
openai_resp = resp.json()
|
|
1714
|
+
|
|
1715
|
+
if PROXY_GUARDRAIL_RETRY and _is_unexpected_end_turn(openai_resp, body):
|
|
1716
|
+
monitor.unexpected_end_turn_count += 1
|
|
1717
|
+
logger.warning(
|
|
1718
|
+
"GUARDRAIL: unexpected end_turn without tool_use in active loop (session=%s), retrying once with tool_choice=required",
|
|
1719
|
+
session_id,
|
|
1720
|
+
)
|
|
1721
|
+
|
|
1722
|
+
retry_body = dict(openai_body)
|
|
1723
|
+
retry_body["tool_choice"] = "required"
|
|
1724
|
+
retry_body["stream"] = False
|
|
1725
|
+
|
|
1726
|
+
retry_resp = await client.post(
|
|
1727
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
1728
|
+
json=retry_body,
|
|
1729
|
+
headers={"Content-Type": "application/json"},
|
|
1730
|
+
)
|
|
1731
|
+
if retry_resp.status_code == 200:
|
|
1732
|
+
retry_json = retry_resp.json()
|
|
1733
|
+
retry_choice = (retry_json.get("choices") or [{}])[0]
|
|
1734
|
+
retry_message = retry_choice.get("message", {})
|
|
1735
|
+
if retry_message.get("tool_calls"):
|
|
1736
|
+
openai_resp = retry_json
|
|
1737
|
+
logger.info(
|
|
1738
|
+
"GUARDRAIL: retry produced tool_use; using retried response"
|
|
1739
|
+
)
|
|
1740
|
+
|
|
1212
1741
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
1213
1742
|
|
|
1214
1743
|
# Track output tokens in session monitor
|
|
1215
1744
|
output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
|
|
1216
|
-
|
|
1745
|
+
monitor.record_response(output_tokens)
|
|
1217
1746
|
|
|
1218
1747
|
return anthropic_resp
|
|
1219
1748
|
|
|
@@ -1258,29 +1787,50 @@ async def health():
|
|
|
1258
1787
|
|
|
1259
1788
|
|
|
1260
1789
|
@app.get("/v1/context")
async def context_status(request: Request):
    """Option F: Context window monitoring endpoint.

    Returns current session token usage, utilization, warnings, and
    estimated remaining turns. Useful for dashboards and debugging.

    The session is taken from the ``session_id`` query parameter, falling
    back to ``last_session_id``. When no monitor is registered for that id,
    a fresh ``SessionMonitor`` built with ``default_context_window`` is
    reported instead (it is not stored in ``session_monitors``).
    """
    requested_session = request.query_params.get("session_id", "")
    session_id = requested_session or last_session_id
    monitor = session_monitors.get(session_id) if session_id else None

    if monitor is None:
        monitor = SessionMonitor(context_window=default_context_window)

    warning = monitor.get_warning_level()
    turns = monitor.estimate_turns_remaining()

    # Query each derived value once so the paired fields below
    # (utilization / utilization_pct, is_looping / loop_repeat_count)
    # are guaranteed to come from the same call.
    utilization = monitor.get_utilization()
    loop_state = monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)

    return {
        "active_session_id": session_id,
        "session_count": len(session_monitors),
        "context_window": monitor.context_window,
        "last_input_tokens": monitor.last_input_tokens,
        "last_output_tokens": monitor.last_output_tokens,
        "peak_input_tokens": monitor.peak_input_tokens,
        "utilization": round(utilization, 4),
        "utilization_pct": f"{utilization * 100:.1f}%",
        "warning_level": warning,
        "estimated_turns_remaining": turns,
        "total_requests": monitor.total_requests,
        "prune_count": monitor.prune_count,
        "overflow_count": monitor.overflow_count,
        "prune_threshold": PROXY_CONTEXT_PRUNE_THRESHOLD,
        "recent_history": monitor.context_history[-10:],
        # Loop protection stats
        "loop_protection": {
            "enabled": PROXY_LOOP_BREAKER,
            "consecutive_forced_count": monitor.consecutive_forced_count,
            "no_progress_streak": monitor.no_progress_streak,
            "loop_warnings_emitted": monitor.loop_warnings_emitted,
            "unexpected_end_turn_count": monitor.unexpected_end_turn_count,
            "tool_call_history_len": len(monitor.tool_call_history),
            "is_looping": loop_state[0],
            "loop_repeat_count": loop_state[1],
            "recent_tool_patterns": monitor.tool_call_history[-5:],
        },
    }
|
|
1285
1835
|
|
|
1286
1836
|
|