@miller-tech/uap 1.13.13 → 1.13.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/benchmarks/speculative-autotune.d.ts +46 -0
- package/dist/benchmarks/speculative-autotune.d.ts.map +1 -0
- package/dist/benchmarks/speculative-autotune.js +145 -0
- package/dist/benchmarks/speculative-autotune.js.map +1 -0
- package/dist/bin/cli.js +2 -0
- package/dist/bin/cli.js.map +1 -1
- package/dist/bin/llama-server-optimize.js +176 -0
- package/dist/bin/llama-server-optimize.js.map +1 -1
- package/dist/cli/init.d.ts +1 -0
- package/dist/cli/init.d.ts.map +1 -1
- package/dist/cli/init.js +18 -0
- package/dist/cli/init.js.map +1 -1
- package/dist/cli/setup.d.ts +1 -0
- package/dist/cli/setup.d.ts.map +1 -1
- package/dist/cli/setup.js +1 -0
- package/dist/cli/setup.js.map +1 -1
- package/dist/cli/systemd-services.d.ts +12 -0
- package/dist/cli/systemd-services.d.ts.map +1 -0
- package/dist/cli/systemd-services.js +179 -0
- package/dist/cli/systemd-services.js.map +1 -0
- package/docs/deployment/QWEN35_LLAMA_CPP.md +49 -0
- package/docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md +279 -0
- package/package.json +1 -1
- package/tools/agents/scripts/anthropic_proxy.py +610 -188
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +51 -0
|
@@ -76,9 +76,11 @@ Dependencies
|
|
|
76
76
|
"""
|
|
77
77
|
|
|
78
78
|
import asyncio
|
|
79
|
+
import hashlib
|
|
79
80
|
import json
|
|
80
81
|
import logging
|
|
81
82
|
import os
|
|
83
|
+
import re
|
|
82
84
|
import sys
|
|
83
85
|
import time
|
|
84
86
|
import uuid
|
|
@@ -100,7 +102,35 @@ PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
|
|
|
100
102
|
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
|
|
101
103
|
PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
|
|
102
104
|
PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
|
|
103
|
-
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
|
|
105
|
+
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
|
|
106
|
+
os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75")
|
|
107
|
+
)
|
|
108
|
+
PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
|
|
109
|
+
"0",
|
|
110
|
+
"false",
|
|
111
|
+
"off",
|
|
112
|
+
"no",
|
|
113
|
+
}
|
|
114
|
+
PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
|
|
115
|
+
PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "8"))
|
|
116
|
+
PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
|
|
117
|
+
PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "4"))
|
|
118
|
+
PROXY_CONTEXT_RELEASE_THRESHOLD = float(
|
|
119
|
+
os.environ.get("PROXY_CONTEXT_RELEASE_THRESHOLD", "0.90")
|
|
120
|
+
)
|
|
121
|
+
PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() not in {
|
|
122
|
+
"0",
|
|
123
|
+
"false",
|
|
124
|
+
"off",
|
|
125
|
+
"no",
|
|
126
|
+
}
|
|
127
|
+
PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
|
|
128
|
+
PROXY_STREAM_REASONING_FALLBACK = (
|
|
129
|
+
os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
|
|
130
|
+
)
|
|
131
|
+
PROXY_STREAM_REASONING_MAX_CHARS = int(
|
|
132
|
+
os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
|
|
133
|
+
)
|
|
104
134
|
|
|
105
135
|
# ---------------------------------------------------------------------------
|
|
106
136
|
# Logging
|
|
@@ -121,19 +151,26 @@ class SessionMonitor:
|
|
|
121
151
|
"""Tracks token usage across the session to provide early warnings
|
|
122
152
|
and enable proactive context management before overflow occurs."""
|
|
123
153
|
|
|
124
|
-
context_window: int = 0
|
|
154
|
+
context_window: int = 0 # Auto-detected or configured
|
|
125
155
|
total_requests: int = 0
|
|
126
|
-
last_input_tokens: int = 0
|
|
127
|
-
last_output_tokens: int = 0
|
|
128
|
-
peak_input_tokens: int = 0
|
|
129
|
-
prune_count: int = 0
|
|
130
|
-
overflow_count: int = 0
|
|
156
|
+
last_input_tokens: int = 0 # Estimated input tokens of last request
|
|
157
|
+
last_output_tokens: int = 0 # Actual output tokens of last response
|
|
158
|
+
peak_input_tokens: int = 0 # High-water mark
|
|
159
|
+
prune_count: int = 0 # How many times pruning was triggered
|
|
160
|
+
overflow_count: int = 0 # How many context overflow errors caught
|
|
131
161
|
context_history: list = field(default_factory=list) # Recent token counts
|
|
132
162
|
|
|
133
163
|
# --- Token Loop Protection ---
|
|
134
|
-
tool_call_history: list = field(
|
|
135
|
-
|
|
136
|
-
|
|
164
|
+
tool_call_history: list = field(
|
|
165
|
+
default_factory=list
|
|
166
|
+
) # Recent tool call fingerprints
|
|
167
|
+
consecutive_forced_count: int = (
|
|
168
|
+
0 # How many times tool_choice was forced consecutively
|
|
169
|
+
)
|
|
170
|
+
loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
|
|
171
|
+
no_progress_streak: int = 0 # Forced tool turns without new tool_result
|
|
172
|
+
unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
|
|
173
|
+
last_seen_ts: float = 0.0
|
|
137
174
|
|
|
138
175
|
def record_request(self, estimated_tokens: int):
|
|
139
176
|
"""Record an outgoing request's estimated token count."""
|
|
@@ -150,6 +187,9 @@ class SessionMonitor:
|
|
|
150
187
|
"""Record a response's output token count."""
|
|
151
188
|
self.last_output_tokens = output_tokens
|
|
152
189
|
|
|
190
|
+
def touch(self):
|
|
191
|
+
self.last_seen_ts = time.time()
|
|
192
|
+
|
|
153
193
|
def get_utilization(self) -> float:
|
|
154
194
|
"""Get current context utilization as a fraction (0.0 - 1.0)."""
|
|
155
195
|
if self.context_window <= 0:
|
|
@@ -196,25 +236,36 @@ class SessionMonitor:
|
|
|
196
236
|
if warning == "CRITICAL":
|
|
197
237
|
logger.error(
|
|
198
238
|
"CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
|
|
199
|
-
self.last_input_tokens,
|
|
200
|
-
|
|
239
|
+
self.last_input_tokens,
|
|
240
|
+
self.context_window,
|
|
241
|
+
util * 100,
|
|
242
|
+
turns_str,
|
|
243
|
+
self.prune_count,
|
|
244
|
+
self.overflow_count,
|
|
201
245
|
)
|
|
202
246
|
elif warning == "HIGH":
|
|
203
247
|
logger.warning(
|
|
204
248
|
"CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
|
|
205
|
-
self.last_input_tokens,
|
|
206
|
-
|
|
249
|
+
self.last_input_tokens,
|
|
250
|
+
self.context_window,
|
|
251
|
+
util * 100,
|
|
252
|
+
turns_str,
|
|
253
|
+
self.prune_count,
|
|
207
254
|
)
|
|
208
255
|
elif warning == "ELEVATED":
|
|
209
256
|
logger.warning(
|
|
210
257
|
"CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
|
|
211
|
-
self.last_input_tokens,
|
|
258
|
+
self.last_input_tokens,
|
|
259
|
+
self.context_window,
|
|
260
|
+
util * 100,
|
|
212
261
|
turns_str,
|
|
213
262
|
)
|
|
214
263
|
else:
|
|
215
264
|
logger.info(
|
|
216
265
|
"CONTEXT: %d/%d tokens (%.1f%%), %s",
|
|
217
|
-
self.last_input_tokens,
|
|
266
|
+
self.last_input_tokens,
|
|
267
|
+
self.context_window,
|
|
268
|
+
util * 100,
|
|
218
269
|
turns_str,
|
|
219
270
|
)
|
|
220
271
|
|
|
@@ -264,30 +315,42 @@ class SessionMonitor:
|
|
|
264
315
|
- 15+ consecutive forced requests regardless -> release
|
|
265
316
|
- Context utilization > 90% -> release (let model wrap up)
|
|
266
317
|
"""
|
|
267
|
-
|
|
318
|
+
if not PROXY_LOOP_BREAKER:
|
|
319
|
+
return False
|
|
320
|
+
|
|
321
|
+
is_looping, repeat_count = self.detect_tool_loop(window=PROXY_LOOP_WINDOW)
|
|
268
322
|
|
|
269
323
|
# Pattern 1: Detected tool call loop
|
|
270
|
-
if
|
|
324
|
+
if (
|
|
325
|
+
is_looping
|
|
326
|
+
and repeat_count >= PROXY_LOOP_REPEAT_THRESHOLD
|
|
327
|
+
and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
|
|
328
|
+
):
|
|
271
329
|
logger.warning(
|
|
272
|
-
"LOOP BREAKER: Same tool pattern repeated %d times. "
|
|
330
|
+
"LOOP BREAKER: Same tool pattern repeated %d times with no progress streak=%d. "
|
|
273
331
|
"Releasing tool_choice to 'auto'.",
|
|
274
332
|
repeat_count,
|
|
333
|
+
self.no_progress_streak,
|
|
275
334
|
)
|
|
276
335
|
self.loop_warnings_emitted += 1
|
|
277
336
|
return True
|
|
278
337
|
|
|
279
338
|
# Pattern 2: Too many consecutive forced requests
|
|
280
|
-
if
|
|
339
|
+
if (
|
|
340
|
+
self.consecutive_forced_count >= PROXY_FORCED_THRESHOLD
|
|
341
|
+
and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
|
|
342
|
+
):
|
|
281
343
|
logger.warning(
|
|
282
|
-
"LOOP BREAKER: %d consecutive forced tool_choice requests. "
|
|
344
|
+
"LOOP BREAKER: %d consecutive forced tool_choice requests with no progress streak=%d. "
|
|
283
345
|
"Releasing to 'auto'.",
|
|
284
346
|
self.consecutive_forced_count,
|
|
347
|
+
self.no_progress_streak,
|
|
285
348
|
)
|
|
286
349
|
self.loop_warnings_emitted += 1
|
|
287
350
|
return True
|
|
288
351
|
|
|
289
352
|
# Pattern 3: Context almost full -- let model wrap up naturally
|
|
290
|
-
if self.get_utilization() >=
|
|
353
|
+
if self.get_utilization() >= PROXY_CONTEXT_RELEASE_THRESHOLD:
|
|
291
354
|
logger.warning(
|
|
292
355
|
"LOOP BREAKER: Context utilization %.1f%% -- releasing "
|
|
293
356
|
"tool_choice to let model wrap up.",
|
|
@@ -298,7 +361,35 @@ class SessionMonitor:
|
|
|
298
361
|
return False
|
|
299
362
|
|
|
300
363
|
|
|
301
|
-
|
|
364
|
+
session_monitors: dict[str, SessionMonitor] = {}
|
|
365
|
+
default_context_window = 0
|
|
366
|
+
last_session_id = ""
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _cleanup_stale_monitors(now_ts: float) -> None:
    """Drop session monitors that have been idle longer than the session TTL.

    Args:
        now_ts: Current time in seconds (same clock as ``last_seen_ts``).

    A monitor whose ``last_seen_ts`` is zero or negative is never evicted
    here; only monitors with a positive timestamp that is more than
    ``PROXY_SESSION_TTL_SECS`` seconds older than ``now_ts`` are removed
    from ``session_monitors``.
    """
    # Iterate over a snapshot so entries can be removed during the walk.
    for session_key, tracked in list(session_monitors.items()):
        if tracked.last_seen_ts <= 0:
            continue
        idle_secs = now_ts - tracked.last_seen_ts
        if idle_secs > PROXY_SESSION_TTL_SECS:
            session_monitors.pop(session_key, None)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def get_session_monitor(session_id: str) -> SessionMonitor:
    """Return the monitor for ``session_id``, creating it on first use.

    Stale monitors are evicted before the lookup, so a session that has
    already exceeded the TTL receives a fresh monitor. The returned monitor
    has its last-seen timestamp refreshed, and a monitor without a positive
    context window is given the current ``default_context_window``.
    """
    _cleanup_stale_monitors(time.time())

    tracked = session_monitors.get(session_id)
    if tracked is None:
        tracked = SessionMonitor(context_window=default_context_window)
        session_monitors[session_id] = tracked

    tracked.touch()
    # The default may have been unknown (0) when the monitor was created;
    # backfill it once a value is available.
    if tracked.context_window <= 0:
        tracked.context_window = default_context_window
    return tracked
|
|
302
393
|
|
|
303
394
|
|
|
304
395
|
# ---------------------------------------------------------------------------
|
|
@@ -324,7 +415,8 @@ async def detect_context_window(client: httpx.AsyncClient) -> int:
|
|
|
324
415
|
if n_ctx > 0:
|
|
325
416
|
logger.info(
|
|
326
417
|
"Auto-detected context window from upstream: %d tokens (%d slots)",
|
|
327
|
-
n_ctx,
|
|
418
|
+
n_ctx,
|
|
419
|
+
len(slots),
|
|
328
420
|
)
|
|
329
421
|
return n_ctx
|
|
330
422
|
except Exception as exc:
|
|
@@ -398,7 +490,9 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
|
|
|
398
490
|
return tokens
|
|
399
491
|
|
|
400
492
|
|
|
401
|
-
def prune_conversation(
|
|
493
|
+
def prune_conversation(
|
|
494
|
+
anthropic_body: dict, context_window: int, target_fraction: float = 0.65
|
|
495
|
+
) -> dict:
|
|
402
496
|
"""Prune the conversation to fit within the context window.
|
|
403
497
|
|
|
404
498
|
Strategy:
|
|
@@ -445,19 +539,24 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
445
539
|
|
|
446
540
|
# Always keep the first user message and the last N messages
|
|
447
541
|
KEEP_LAST = 8 # Keep the last 8 messages (recent context)
|
|
448
|
-
protected_head = messages[:1]
|
|
449
|
-
protected_tail =
|
|
542
|
+
protected_head = messages[:1] # First user message
|
|
543
|
+
protected_tail = (
|
|
544
|
+
messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
|
|
545
|
+
)
|
|
450
546
|
middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []
|
|
451
547
|
|
|
452
548
|
# Calculate tokens for protected messages
|
|
453
|
-
protected_tokens = sum(
|
|
549
|
+
protected_tokens = sum(
|
|
550
|
+
estimate_message_tokens(m) for m in protected_head + protected_tail
|
|
551
|
+
)
|
|
454
552
|
|
|
455
553
|
if protected_tokens >= message_budget:
|
|
456
554
|
# Even protected messages exceed budget -- truncate tool_result content
|
|
457
555
|
# in the tail to fit
|
|
458
556
|
logger.warning(
|
|
459
557
|
"Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
|
|
460
|
-
protected_tokens,
|
|
558
|
+
protected_tokens,
|
|
559
|
+
message_budget,
|
|
461
560
|
)
|
|
462
561
|
for msg in protected_tail:
|
|
463
562
|
content = msg.get("content", [])
|
|
@@ -466,7 +565,11 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
466
565
|
if isinstance(block, dict) and block.get("type") == "tool_result":
|
|
467
566
|
result_text = _extract_text(block.get("content", ""))
|
|
468
567
|
if len(result_text) > 2000:
|
|
469
|
-
block["content"] =
|
|
568
|
+
block["content"] = (
|
|
569
|
+
result_text[:1000]
|
|
570
|
+
+ "\n...[TRUNCATED]...\n"
|
|
571
|
+
+ result_text[-500:]
|
|
572
|
+
)
|
|
470
573
|
anthropic_body["messages"] = protected_head + protected_tail
|
|
471
574
|
return anthropic_body
|
|
472
575
|
|
|
@@ -486,8 +589,7 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
486
589
|
|
|
487
590
|
if isinstance(content, list):
|
|
488
591
|
is_tool_result = any(
|
|
489
|
-
isinstance(b, dict) and b.get("type") == "tool_result"
|
|
490
|
-
for b in content
|
|
592
|
+
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
|
491
593
|
)
|
|
492
594
|
|
|
493
595
|
# Lower priority = removed first
|
|
@@ -529,12 +631,17 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
529
631
|
f"The conversation continues from recent context below.]"
|
|
530
632
|
),
|
|
531
633
|
}
|
|
532
|
-
anthropic_body["messages"] =
|
|
634
|
+
anthropic_body["messages"] = (
|
|
635
|
+
protected_head + [prune_marker] + kept_msgs + protected_tail
|
|
636
|
+
)
|
|
533
637
|
logger.warning(
|
|
534
638
|
"PRUNED: removed %d messages (~%d tokens), kept %d messages, "
|
|
535
639
|
"target=%.0f%% of %d ctx",
|
|
536
|
-
removed_count,
|
|
537
|
-
|
|
640
|
+
removed_count,
|
|
641
|
+
removed_tokens,
|
|
642
|
+
len(anthropic_body["messages"]),
|
|
643
|
+
target_fraction * 100,
|
|
644
|
+
context_window,
|
|
538
645
|
)
|
|
539
646
|
else:
|
|
540
647
|
anthropic_body["messages"] = protected_head + kept_msgs + protected_tail
|
|
@@ -554,12 +661,13 @@ http_client: httpx.AsyncClient | None = None
|
|
|
554
661
|
async def lifespan(app: FastAPI):
|
|
555
662
|
"""Manage the httpx client lifecycle with the FastAPI app."""
|
|
556
663
|
global http_client
|
|
664
|
+
global default_context_window
|
|
557
665
|
http_client = httpx.AsyncClient(
|
|
558
666
|
timeout=httpx.Timeout(
|
|
559
|
-
connect=10.0,
|
|
560
|
-
read=PROXY_READ_TIMEOUT,
|
|
561
|
-
write=30.0,
|
|
562
|
-
pool=10.0,
|
|
667
|
+
connect=10.0, # 10s to establish connection
|
|
668
|
+
read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
|
|
669
|
+
write=30.0, # 30s to send the request body
|
|
670
|
+
pool=10.0, # 10s to acquire a pool connection
|
|
563
671
|
),
|
|
564
672
|
limits=httpx.Limits(
|
|
565
673
|
max_connections=PROXY_MAX_CONNECTIONS,
|
|
@@ -569,14 +677,19 @@ async def lifespan(app: FastAPI):
|
|
|
569
677
|
)
|
|
570
678
|
logger.info(
|
|
571
679
|
"Proxy started: listening on %s:%d -> upstream %s",
|
|
572
|
-
PROXY_HOST,
|
|
680
|
+
PROXY_HOST,
|
|
681
|
+
PROXY_PORT,
|
|
682
|
+
LLAMA_CPP_BASE,
|
|
573
683
|
)
|
|
574
684
|
|
|
575
685
|
# Auto-detect context window from upstream server
|
|
576
|
-
|
|
686
|
+
default_context_window = await detect_context_window(http_client)
|
|
687
|
+
for mon in session_monitors.values():
|
|
688
|
+
if mon.context_window <= 0:
|
|
689
|
+
mon.context_window = default_context_window
|
|
577
690
|
logger.info(
|
|
578
691
|
"Context window: %d tokens, prune threshold: %.0f%%",
|
|
579
|
-
|
|
692
|
+
default_context_window,
|
|
580
693
|
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
581
694
|
)
|
|
582
695
|
|
|
@@ -598,6 +711,7 @@ app = FastAPI(
|
|
|
598
711
|
# Request Translation: Anthropic -> OpenAI
|
|
599
712
|
# ===========================================================================
|
|
600
713
|
|
|
714
|
+
|
|
601
715
|
def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
602
716
|
"""Convert Anthropic message format to OpenAI message format.
|
|
603
717
|
|
|
@@ -635,25 +749,33 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
635
749
|
elif block.get("type") == "text":
|
|
636
750
|
parts.append(block.get("text", ""))
|
|
637
751
|
elif block.get("type") == "tool_use":
|
|
638
|
-
messages.append(
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
"
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
752
|
+
messages.append(
|
|
753
|
+
{
|
|
754
|
+
"role": "assistant",
|
|
755
|
+
"content": None,
|
|
756
|
+
"tool_calls": [
|
|
757
|
+
{
|
|
758
|
+
"id": block.get(
|
|
759
|
+
"id", f"call_{uuid.uuid4().hex[:8]}"
|
|
760
|
+
),
|
|
761
|
+
"type": "function",
|
|
762
|
+
"function": {
|
|
763
|
+
"name": block["name"],
|
|
764
|
+
"arguments": json.dumps(block.get("input", {})),
|
|
765
|
+
},
|
|
766
|
+
}
|
|
767
|
+
],
|
|
768
|
+
}
|
|
769
|
+
)
|
|
650
770
|
continue
|
|
651
771
|
elif block.get("type") == "tool_result":
|
|
652
|
-
messages.append(
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
772
|
+
messages.append(
|
|
773
|
+
{
|
|
774
|
+
"role": "tool",
|
|
775
|
+
"tool_call_id": block.get("tool_use_id", ""),
|
|
776
|
+
"content": _extract_text(block.get("content", "")),
|
|
777
|
+
}
|
|
778
|
+
)
|
|
657
779
|
continue
|
|
658
780
|
if parts:
|
|
659
781
|
messages.append({"role": role, "content": "\n".join(parts)})
|
|
@@ -686,7 +808,77 @@ _AGENTIC_SYSTEM_SUPPLEMENT = (
|
|
|
686
808
|
)
|
|
687
809
|
|
|
688
810
|
|
|
689
|
-
def
|
|
811
|
+
def _content_fingerprint(content) -> str:
|
|
812
|
+
if isinstance(content, str):
|
|
813
|
+
return content[:512]
|
|
814
|
+
if isinstance(content, list):
|
|
815
|
+
parts = []
|
|
816
|
+
for block in content:
|
|
817
|
+
if isinstance(block, str):
|
|
818
|
+
parts.append(block)
|
|
819
|
+
elif isinstance(block, dict):
|
|
820
|
+
btype = block.get("type", "")
|
|
821
|
+
if btype == "text":
|
|
822
|
+
parts.append(block.get("text", ""))
|
|
823
|
+
elif btype == "tool_use":
|
|
824
|
+
parts.append(f"tool:{block.get('name', '')}")
|
|
825
|
+
elif btype == "tool_result":
|
|
826
|
+
parts.append(f"result:{block.get('tool_use_id', '')}")
|
|
827
|
+
return "\n".join(parts)[:1024]
|
|
828
|
+
return str(content)[:512]
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
def resolve_session_id(request: Request, anthropic_body: dict) -> str:
    """Derive a session key for the incoming request.

    Resolution order:
      1. The first non-empty session header, returned as ``hdr:<value>``.
      2. The first non-empty ``metadata`` entry among ``session_id``,
         ``conversation_id`` and ``thread_id``, returned as ``meta:<value>``.
      3. Otherwise ``fp:<digest>``: the first 20 hex characters of a SHA-256
         over the client host, model, system-prompt fingerprint and the
         fingerprint of the first user message.
    """
    for header_name in (
        "x-uap-session-id",
        "x-claude-session-id",
        "anthropic-session-id",
        "x-session-id",
    ):
        if header_value := request.headers.get(header_name):
            return f"hdr:{header_value}"

    meta = anthropic_body.get("metadata", {})
    if isinstance(meta, dict):
        for meta_key in ("session_id", "conversation_id", "thread_id"):
            if meta_value := meta.get(meta_key):
                return f"meta:{meta_value}"

    # Fingerprint only the first user message; later turns are ignored.
    first_user = next(
        (
            _content_fingerprint(turn.get("content", ""))
            for turn in anthropic_body.get("messages", [])
            if turn.get("role") == "user"
        ),
        "",
    )
    system_fingerprint = _content_fingerprint(anthropic_body.get("system", ""))
    model = anthropic_body.get("model", "default")
    remote = "unknown" if not request.client else request.client.host

    seed = f"{remote}|{model}|{system_fingerprint}|{first_user}"
    digest = hashlib.sha256(seed.encode("utf-8", errors="ignore")).hexdigest()
    return f"fp:{digest[:20]}"
|
|
865
|
+
|
|
866
|
+
|
|
867
|
+
def _last_user_has_tool_result(anthropic_body: dict) -> bool:
|
|
868
|
+
messages = anthropic_body.get("messages", [])
|
|
869
|
+
for msg in reversed(messages):
|
|
870
|
+
if msg.get("role") != "user":
|
|
871
|
+
continue
|
|
872
|
+
content = msg.get("content")
|
|
873
|
+
if not isinstance(content, list):
|
|
874
|
+
return False
|
|
875
|
+
return any(
|
|
876
|
+
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
|
877
|
+
)
|
|
878
|
+
return False
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
|
|
690
882
|
"""Build an OpenAI Chat Completions request from an Anthropic Messages request."""
|
|
691
883
|
openai_body = {
|
|
692
884
|
"model": anthropic_body.get("model", "default"),
|
|
@@ -700,10 +892,13 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
700
892
|
openai_body["messages"][0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
|
|
701
893
|
else:
|
|
702
894
|
# No system message from the client; inject one.
|
|
703
|
-
openai_body["messages"].insert(
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
895
|
+
openai_body["messages"].insert(
|
|
896
|
+
0,
|
|
897
|
+
{
|
|
898
|
+
"role": "system",
|
|
899
|
+
"content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
|
|
900
|
+
},
|
|
901
|
+
)
|
|
707
902
|
|
|
708
903
|
if "max_tokens" in anthropic_body:
|
|
709
904
|
# Enforce minimum floor for thinking mode: model needs tokens for
|
|
@@ -716,7 +911,7 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
716
911
|
# Formula: max_tokens = min(requested, context_window - input_tokens - safety_margin)
|
|
717
912
|
# This ensures the model's output + current input stays within bounds,
|
|
718
913
|
# leaving room for the next turn's incremental growth.
|
|
719
|
-
ctx_window =
|
|
914
|
+
ctx_window = monitor.context_window
|
|
720
915
|
if ctx_window > 0:
|
|
721
916
|
estimated_input = estimate_total_tokens(anthropic_body)
|
|
722
917
|
# Reserve 15% of context for next-turn growth (tool results, etc.)
|
|
@@ -725,8 +920,11 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
725
920
|
if available_for_output < requested_max and available_for_output > 1024:
|
|
726
921
|
logger.info(
|
|
727
922
|
"MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
|
|
728
|
-
requested_max,
|
|
729
|
-
|
|
923
|
+
requested_max,
|
|
924
|
+
available_for_output,
|
|
925
|
+
ctx_window,
|
|
926
|
+
estimated_input,
|
|
927
|
+
safety_margin,
|
|
730
928
|
)
|
|
731
929
|
requested_max = available_for_output
|
|
732
930
|
elif available_for_output <= 1024:
|
|
@@ -734,7 +932,9 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
734
932
|
logger.warning(
|
|
735
933
|
"MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
|
|
736
934
|
"Response may be truncated.",
|
|
737
|
-
available_for_output,
|
|
935
|
+
available_for_output,
|
|
936
|
+
ctx_window,
|
|
937
|
+
estimated_input,
|
|
738
938
|
)
|
|
739
939
|
requested_max = max(1024, available_for_output)
|
|
740
940
|
|
|
@@ -750,14 +950,16 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
750
950
|
if "tools" in anthropic_body:
|
|
751
951
|
openai_body["tools"] = []
|
|
752
952
|
for tool in anthropic_body["tools"]:
|
|
753
|
-
openai_body["tools"].append(
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
"
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
953
|
+
openai_body["tools"].append(
|
|
954
|
+
{
|
|
955
|
+
"type": "function",
|
|
956
|
+
"function": {
|
|
957
|
+
"name": tool["name"],
|
|
958
|
+
"description": tool.get("description", ""),
|
|
959
|
+
"parameters": tool.get("input_schema", {}),
|
|
960
|
+
},
|
|
961
|
+
}
|
|
962
|
+
)
|
|
761
963
|
|
|
762
964
|
# Smart tool_choice: force tool calls during the agentic loop to
|
|
763
965
|
# prevent the model from producing text-only end_turn responses that
|
|
@@ -774,7 +976,8 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
774
976
|
# runaway token consumption.
|
|
775
977
|
n_msgs = len(anthropic_body.get("messages", []))
|
|
776
978
|
has_tool_results = any(
|
|
777
|
-
isinstance(m.get("content"), list)
|
|
979
|
+
isinstance(m.get("content"), list)
|
|
980
|
+
and any(
|
|
778
981
|
isinstance(b, dict) and b.get("type") == "tool_result"
|
|
779
982
|
for b in m.get("content", [])
|
|
780
983
|
)
|
|
@@ -782,28 +985,41 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
782
985
|
)
|
|
783
986
|
|
|
784
987
|
# Record tool calls from the last assistant message for loop detection
|
|
785
|
-
_record_last_assistant_tool_calls(anthropic_body)
|
|
988
|
+
_record_last_assistant_tool_calls(anthropic_body, monitor)
|
|
989
|
+
last_user_has_tool_result = _last_user_has_tool_result(anthropic_body)
|
|
786
990
|
|
|
787
991
|
# Check if loop breaker should override tool_choice
|
|
788
|
-
if
|
|
992
|
+
if monitor.should_release_tool_choice():
|
|
789
993
|
openai_body["tool_choice"] = "auto"
|
|
790
|
-
|
|
994
|
+
monitor.consecutive_forced_count = 0
|
|
995
|
+
monitor.no_progress_streak = 0
|
|
791
996
|
logger.warning("tool_choice set to 'auto' by LOOP BREAKER")
|
|
792
997
|
elif _last_assistant_was_text_only(anthropic_body):
|
|
793
998
|
openai_body["tool_choice"] = "required"
|
|
794
|
-
|
|
795
|
-
|
|
999
|
+
monitor.consecutive_forced_count += 1
|
|
1000
|
+
monitor.no_progress_streak = (
|
|
1001
|
+
0 if last_user_has_tool_result else monitor.no_progress_streak + 1
|
|
1002
|
+
)
|
|
1003
|
+
logger.info(
|
|
1004
|
+
"tool_choice forced to 'required' (last assistant was text-only)"
|
|
1005
|
+
)
|
|
796
1006
|
elif has_tool_results and n_msgs > 2:
|
|
797
1007
|
openai_body["tool_choice"] = "required"
|
|
798
|
-
|
|
799
|
-
|
|
1008
|
+
monitor.consecutive_forced_count += 1
|
|
1009
|
+
monitor.no_progress_streak = (
|
|
1010
|
+
0 if last_user_has_tool_result else monitor.no_progress_streak + 1
|
|
1011
|
+
)
|
|
1012
|
+
logger.info(
|
|
1013
|
+
"tool_choice forced to 'required' (active agentic loop with tool results)"
|
|
1014
|
+
)
|
|
800
1015
|
else:
|
|
801
|
-
|
|
1016
|
+
monitor.consecutive_forced_count = 0
|
|
1017
|
+
monitor.no_progress_streak = 0
|
|
802
1018
|
|
|
803
1019
|
return openai_body
|
|
804
1020
|
|
|
805
1021
|
|
|
806
|
-
def _record_last_assistant_tool_calls(anthropic_body: dict):
|
|
1022
|
+
def _record_last_assistant_tool_calls(anthropic_body: dict, monitor: SessionMonitor):
|
|
807
1023
|
"""Extract tool call names from the last assistant message and record
|
|
808
1024
|
them in the session monitor for loop detection."""
|
|
809
1025
|
messages = anthropic_body.get("messages", [])
|
|
@@ -818,7 +1034,70 @@ def _record_last_assistant_tool_calls(anthropic_body: dict):
|
|
|
818
1034
|
tool_names.append(block.get("name", "unknown"))
|
|
819
1035
|
break
|
|
820
1036
|
if tool_names:
|
|
821
|
-
|
|
1037
|
+
monitor.record_tool_calls(tool_names)
|
|
1038
|
+
|
|
1039
|
+
|
|
1040
|
+
def _is_unexpected_end_turn(openai_resp: dict, anthropic_body: dict) -> bool:
|
|
1041
|
+
choices = openai_resp.get("choices") or []
|
|
1042
|
+
if not choices:
|
|
1043
|
+
return False
|
|
1044
|
+
|
|
1045
|
+
choice = choices[0]
|
|
1046
|
+
finish = choice.get("finish_reason")
|
|
1047
|
+
if finish not in {"stop", "end_turn"}:
|
|
1048
|
+
return False
|
|
1049
|
+
|
|
1050
|
+
msg = choice.get("message", {})
|
|
1051
|
+
if msg.get("tool_calls"):
|
|
1052
|
+
return False
|
|
1053
|
+
|
|
1054
|
+
if "tools" not in anthropic_body:
|
|
1055
|
+
return False
|
|
1056
|
+
|
|
1057
|
+
has_tool_results = any(
|
|
1058
|
+
isinstance(m.get("content"), list)
|
|
1059
|
+
and any(
|
|
1060
|
+
isinstance(b, dict) and b.get("type") == "tool_result"
|
|
1061
|
+
for b in m.get("content", [])
|
|
1062
|
+
)
|
|
1063
|
+
for m in anthropic_body.get("messages", [])
|
|
1064
|
+
)
|
|
1065
|
+
|
|
1066
|
+
return has_tool_results or _last_assistant_was_text_only(anthropic_body)
|
|
1067
|
+
|
|
1068
|
+
|
|
1069
|
+
def _sanitize_reasoning_fallback_text(reasoning_text: str) -> str:
|
|
1070
|
+
cleaned = re.sub(r"</?think>", "", reasoning_text, flags=re.IGNORECASE)
|
|
1071
|
+
cleaned = re.sub(r"\s+", " ", cleaned).strip()
|
|
1072
|
+
if not cleaned:
|
|
1073
|
+
return ""
|
|
1074
|
+
if len(cleaned) > PROXY_STREAM_REASONING_MAX_CHARS:
|
|
1075
|
+
return cleaned[:PROXY_STREAM_REASONING_MAX_CHARS].rstrip() + "..."
|
|
1076
|
+
return cleaned
|
|
1077
|
+
|
|
1078
|
+
|
|
1079
|
+
def _build_reasoning_fallback_text(
|
|
1080
|
+
reasoning_chunks: list[str], mode: str | None = None
|
|
1081
|
+
) -> str | None:
|
|
1082
|
+
fallback_mode = (mode or PROXY_STREAM_REASONING_FALLBACK).strip().lower()
|
|
1083
|
+
if fallback_mode == "off":
|
|
1084
|
+
return None
|
|
1085
|
+
|
|
1086
|
+
raw_text = "".join(reasoning_chunks).strip()
|
|
1087
|
+
if not raw_text:
|
|
1088
|
+
return None
|
|
1089
|
+
|
|
1090
|
+
if fallback_mode == "visible":
|
|
1091
|
+
return raw_text
|
|
1092
|
+
if fallback_mode == "sanitized":
|
|
1093
|
+
sanitized = _sanitize_reasoning_fallback_text(raw_text)
|
|
1094
|
+
return sanitized or None
|
|
1095
|
+
|
|
1096
|
+
logger.warning(
|
|
1097
|
+
"Unknown PROXY_STREAM_REASONING_FALLBACK=%r; disabling reasoning fallback",
|
|
1098
|
+
fallback_mode,
|
|
1099
|
+
)
|
|
1100
|
+
return None
|
|
822
1101
|
|
|
823
1102
|
|
|
824
1103
|
def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
@@ -836,11 +1115,14 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
|
836
1115
|
return bool(content.strip())
|
|
837
1116
|
if isinstance(content, list):
|
|
838
1117
|
has_tool_use = any(
|
|
839
|
-
isinstance(b, dict) and b.get("type") == "tool_use"
|
|
840
|
-
for b in content
|
|
1118
|
+
isinstance(b, dict) and b.get("type") == "tool_use" for b in content
|
|
841
1119
|
)
|
|
842
1120
|
has_text = any(
|
|
843
|
-
(
|
|
1121
|
+
(
|
|
1122
|
+
isinstance(b, dict)
|
|
1123
|
+
and b.get("type") == "text"
|
|
1124
|
+
and b.get("text", "").strip()
|
|
1125
|
+
)
|
|
844
1126
|
or isinstance(b, str)
|
|
845
1127
|
for b in content
|
|
846
1128
|
)
|
|
@@ -854,6 +1136,7 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
|
854
1136
|
# Response Translation: OpenAI -> Anthropic
|
|
855
1137
|
# ===========================================================================
|
|
856
1138
|
|
|
1139
|
+
|
|
857
1140
|
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
858
1141
|
"""Convert an OpenAI Chat Completions response to Anthropic Messages format."""
|
|
859
1142
|
choice = openai_resp.get("choices", [{}])[0]
|
|
@@ -871,12 +1154,14 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
871
1154
|
args = json.loads(fn.get("arguments", "{}"))
|
|
872
1155
|
except json.JSONDecodeError:
|
|
873
1156
|
args = {}
|
|
874
|
-
content.append(
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
1157
|
+
content.append(
|
|
1158
|
+
{
|
|
1159
|
+
"type": "tool_use",
|
|
1160
|
+
"id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
|
|
1161
|
+
"name": fn.get("name", ""),
|
|
1162
|
+
"input": args,
|
|
1163
|
+
}
|
|
1164
|
+
)
|
|
880
1165
|
|
|
881
1166
|
stop_reason_map = {
|
|
882
1167
|
"stop": "end_turn",
|
|
@@ -906,7 +1191,13 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
906
1191
|
# Streaming Translation: OpenAI SSE -> Anthropic SSE
|
|
907
1192
|
# ===========================================================================
|
|
908
1193
|
|
|
909
|
-
|
|
1194
|
+
|
|
1195
|
+
async def stream_anthropic_response(
|
|
1196
|
+
openai_stream: httpx.Response,
|
|
1197
|
+
model: str,
|
|
1198
|
+
monitor: SessionMonitor,
|
|
1199
|
+
anthropic_body: dict,
|
|
1200
|
+
):
|
|
910
1201
|
"""Convert an OpenAI streaming response to Anthropic SSE stream format.
|
|
911
1202
|
|
|
912
1203
|
Handles:
|
|
@@ -929,7 +1220,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
929
1220
|
f"data: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
|
|
930
1221
|
)
|
|
931
1222
|
|
|
932
|
-
yield
|
|
1223
|
+
yield 'event: ping\ndata: {"type": "ping"}\n\n'
|
|
933
1224
|
|
|
934
1225
|
output_tokens = 0
|
|
935
1226
|
finish_reason = "end_turn"
|
|
@@ -1058,21 +1349,29 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
1058
1349
|
f"data: {json.dumps({'type': 'content_block_stop', 'index': tc['block_index']})}\n\n"
|
|
1059
1350
|
)
|
|
1060
1351
|
else:
|
|
1061
|
-
#
|
|
1062
|
-
#
|
|
1063
|
-
#
|
|
1352
|
+
# If the response has no text and no tool calls, optionally emit a
|
|
1353
|
+
# reasoning fallback (configurable) to avoid leaking malformed
|
|
1354
|
+
# internal chain-of-thought content by default.
|
|
1064
1355
|
accumulated_text = "".join(text_chunks)
|
|
1065
1356
|
if not accumulated_text and reasoning_chunks:
|
|
1066
|
-
fallback_text =
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1357
|
+
fallback_text = _build_reasoning_fallback_text(reasoning_chunks)
|
|
1358
|
+
if fallback_text:
|
|
1359
|
+
logger.warning(
|
|
1360
|
+
"Empty response with %d reasoning chunks – emitting fallback text (mode=%s)",
|
|
1361
|
+
len(reasoning_chunks),
|
|
1362
|
+
PROXY_STREAM_REASONING_FALLBACK,
|
|
1363
|
+
)
|
|
1364
|
+
text_chunks.append(fallback_text)
|
|
1365
|
+
yield (
|
|
1366
|
+
f"event: content_block_delta\n"
|
|
1367
|
+
f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
|
|
1368
|
+
)
|
|
1369
|
+
else:
|
|
1370
|
+
logger.warning(
|
|
1371
|
+
"Empty response with %d reasoning chunks – fallback suppressed (mode=%s)",
|
|
1372
|
+
len(reasoning_chunks),
|
|
1373
|
+
PROXY_STREAM_REASONING_FALLBACK,
|
|
1374
|
+
)
|
|
1076
1375
|
|
|
1077
1376
|
yield (
|
|
1078
1377
|
f"event: content_block_stop\n"
|
|
@@ -1081,17 +1380,52 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
1081
1380
|
|
|
1082
1381
|
# Log response summary
|
|
1083
1382
|
accumulated_text = "".join(text_chunks)
|
|
1084
|
-
tc_names =
|
|
1085
|
-
|
|
1383
|
+
tc_names = (
|
|
1384
|
+
[tc["name"] for tc in tool_calls_by_index.values()]
|
|
1385
|
+
if tool_calls_by_index
|
|
1386
|
+
else []
|
|
1387
|
+
)
|
|
1388
|
+
tc_args = (
|
|
1389
|
+
[tc.get("arguments", "") for tc in tool_calls_by_index.values()]
|
|
1390
|
+
if tool_calls_by_index
|
|
1391
|
+
else []
|
|
1392
|
+
)
|
|
1086
1393
|
logger.info(
|
|
1087
1394
|
"RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
|
|
1088
|
-
finish_reason,
|
|
1395
|
+
finish_reason,
|
|
1396
|
+
output_tokens,
|
|
1089
1397
|
len(accumulated_text),
|
|
1090
1398
|
accumulated_text[:300],
|
|
1091
1399
|
tc_names,
|
|
1092
1400
|
[a[:200] for a in tc_args],
|
|
1093
1401
|
)
|
|
1094
1402
|
|
|
1403
|
+
if _is_unexpected_end_turn(
|
|
1404
|
+
{
|
|
1405
|
+
"choices": [
|
|
1406
|
+
{
|
|
1407
|
+
"finish_reason": "stop"
|
|
1408
|
+
if finish_reason == "end_turn"
|
|
1409
|
+
else finish_reason,
|
|
1410
|
+
"message": {
|
|
1411
|
+
"content": accumulated_text,
|
|
1412
|
+
"tool_calls": [
|
|
1413
|
+
{
|
|
1414
|
+
"function": {
|
|
1415
|
+
"name": tc["name"],
|
|
1416
|
+
"arguments": tc.get("arguments", ""),
|
|
1417
|
+
}
|
|
1418
|
+
}
|
|
1419
|
+
for tc in tool_calls_by_index.values()
|
|
1420
|
+
],
|
|
1421
|
+
},
|
|
1422
|
+
}
|
|
1423
|
+
]
|
|
1424
|
+
},
|
|
1425
|
+
anthropic_body,
|
|
1426
|
+
):
|
|
1427
|
+
monitor.unexpected_end_turn_count += 1
|
|
1428
|
+
|
|
1095
1429
|
# message_delta with final stop reason
|
|
1096
1430
|
yield (
|
|
1097
1431
|
f"event: message_delta\n"
|
|
@@ -1106,6 +1440,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
1106
1440
|
# API Endpoints
|
|
1107
1441
|
# ===========================================================================
|
|
1108
1442
|
|
|
1443
|
+
|
|
1109
1444
|
@app.post("/v1/messages")
|
|
1110
1445
|
async def messages(request: Request):
|
|
1111
1446
|
"""Handle Anthropic Messages API requests (streaming and non-streaming).
|
|
@@ -1116,9 +1451,14 @@ async def messages(request: Request):
|
|
|
1116
1451
|
- Option E: Smart max_tokens capping (in build_openai_request)
|
|
1117
1452
|
- Option F: Session-level token monitoring with warnings
|
|
1118
1453
|
"""
|
|
1454
|
+
global last_session_id
|
|
1455
|
+
|
|
1119
1456
|
body = await request.json()
|
|
1120
1457
|
model = body.get("model", "default")
|
|
1121
1458
|
is_stream = body.get("stream", False)
|
|
1459
|
+
session_id = resolve_session_id(request, body)
|
|
1460
|
+
monitor = get_session_monitor(session_id)
|
|
1461
|
+
last_session_id = session_id
|
|
1122
1462
|
|
|
1123
1463
|
# Debug: log request summary
|
|
1124
1464
|
n_messages = len(body.get("messages", []))
|
|
@@ -1128,42 +1468,51 @@ async def messages(request: Request):
|
|
|
1128
1468
|
last_role = last_msg.get("role", "?")
|
|
1129
1469
|
last_content = last_msg.get("content", "")
|
|
1130
1470
|
if isinstance(last_content, list):
|
|
1131
|
-
last_text = next(
|
|
1471
|
+
last_text = next(
|
|
1472
|
+
(b.get("text", "") for b in last_content if b.get("type") == "text"), ""
|
|
1473
|
+
)[:200]
|
|
1132
1474
|
elif isinstance(last_content, str):
|
|
1133
1475
|
last_text = last_content[:200]
|
|
1134
1476
|
else:
|
|
1135
1477
|
last_text = str(last_content)[:200]
|
|
1136
1478
|
logger.info(
|
|
1137
1479
|
"REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
1138
|
-
is_stream,
|
|
1480
|
+
is_stream,
|
|
1481
|
+
n_messages,
|
|
1482
|
+
n_tools,
|
|
1483
|
+
max_tokens,
|
|
1484
|
+
last_role,
|
|
1485
|
+
last_text,
|
|
1139
1486
|
)
|
|
1140
1487
|
|
|
1141
1488
|
# --- Option F: Estimate tokens and record in session monitor ---
|
|
1142
1489
|
estimated_tokens = estimate_total_tokens(body)
|
|
1143
|
-
|
|
1144
|
-
|
|
1490
|
+
monitor.record_request(estimated_tokens)
|
|
1491
|
+
monitor.log_status()
|
|
1145
1492
|
|
|
1146
1493
|
# --- Option C: Prune conversation if approaching context limit ---
|
|
1147
|
-
ctx_window =
|
|
1494
|
+
ctx_window = monitor.context_window
|
|
1148
1495
|
if ctx_window > 0:
|
|
1149
1496
|
utilization = estimated_tokens / ctx_window
|
|
1150
1497
|
if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
1151
1498
|
logger.warning(
|
|
1152
1499
|
"Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
|
|
1153
|
-
utilization * 100,
|
|
1500
|
+
utilization * 100,
|
|
1501
|
+
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
1154
1502
|
)
|
|
1155
1503
|
body = prune_conversation(body, ctx_window, target_fraction=0.65)
|
|
1156
|
-
|
|
1504
|
+
monitor.prune_count += 1
|
|
1157
1505
|
# Re-estimate after pruning
|
|
1158
1506
|
estimated_tokens = estimate_total_tokens(body)
|
|
1159
|
-
|
|
1507
|
+
monitor.record_request(estimated_tokens)
|
|
1160
1508
|
n_messages = len(body.get("messages", []))
|
|
1161
1509
|
logger.info(
|
|
1162
1510
|
"After pruning: ~%d tokens, %d messages",
|
|
1163
|
-
estimated_tokens,
|
|
1511
|
+
estimated_tokens,
|
|
1512
|
+
n_messages,
|
|
1164
1513
|
)
|
|
1165
1514
|
|
|
1166
|
-
openai_body = build_openai_request(body)
|
|
1515
|
+
openai_body = build_openai_request(body, monitor)
|
|
1167
1516
|
|
|
1168
1517
|
client = http_client
|
|
1169
1518
|
if client is None:
|
|
@@ -1181,6 +1530,7 @@ async def messages(request: Request):
|
|
|
1181
1530
|
MAX_UPSTREAM_RETRIES = 3
|
|
1182
1531
|
RETRY_DELAY_SECS = 5.0
|
|
1183
1532
|
last_exc: Exception | None = None
|
|
1533
|
+
resp: httpx.Response | None = None
|
|
1184
1534
|
|
|
1185
1535
|
for attempt in range(MAX_UPSTREAM_RETRIES):
|
|
1186
1536
|
try:
|
|
@@ -1201,25 +1551,46 @@ async def messages(request: Request):
|
|
|
1201
1551
|
if attempt < MAX_UPSTREAM_RETRIES - 1:
|
|
1202
1552
|
logger.warning(
|
|
1203
1553
|
"Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
|
|
1204
|
-
attempt + 1,
|
|
1205
|
-
|
|
1554
|
+
attempt + 1,
|
|
1555
|
+
MAX_UPSTREAM_RETRIES,
|
|
1556
|
+
type(exc).__name__,
|
|
1557
|
+
RETRY_DELAY_SECS,
|
|
1206
1558
|
)
|
|
1207
1559
|
await asyncio.sleep(RETRY_DELAY_SECS)
|
|
1208
1560
|
else:
|
|
1209
1561
|
logger.error(
|
|
1210
1562
|
"Upstream connect failed after %d attempts: %s: %s",
|
|
1211
|
-
MAX_UPSTREAM_RETRIES,
|
|
1563
|
+
MAX_UPSTREAM_RETRIES,
|
|
1564
|
+
type(exc).__name__,
|
|
1565
|
+
exc,
|
|
1212
1566
|
)
|
|
1213
1567
|
|
|
1214
1568
|
if last_exc is not None:
|
|
1215
1569
|
return Response(
|
|
1216
|
-
content=json.dumps(
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
"
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1570
|
+
content=json.dumps(
|
|
1571
|
+
{
|
|
1572
|
+
"type": "error",
|
|
1573
|
+
"error": {
|
|
1574
|
+
"type": "overloaded_error",
|
|
1575
|
+
"message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
|
|
1576
|
+
},
|
|
1577
|
+
}
|
|
1578
|
+
),
|
|
1579
|
+
status_code=529,
|
|
1580
|
+
media_type="application/json",
|
|
1581
|
+
)
|
|
1582
|
+
|
|
1583
|
+
if resp is None:
|
|
1584
|
+
return Response(
|
|
1585
|
+
content=json.dumps(
|
|
1586
|
+
{
|
|
1587
|
+
"type": "error",
|
|
1588
|
+
"error": {
|
|
1589
|
+
"type": "overloaded_error",
|
|
1590
|
+
"message": "Upstream response unavailable",
|
|
1591
|
+
},
|
|
1592
|
+
}
|
|
1593
|
+
),
|
|
1223
1594
|
status_code=529,
|
|
1224
1595
|
media_type="application/json",
|
|
1225
1596
|
)
|
|
@@ -1232,9 +1603,7 @@ async def messages(request: Request):
|
|
|
1232
1603
|
error_body = await resp.aread()
|
|
1233
1604
|
await resp.aclose()
|
|
1234
1605
|
error_text = error_body.decode("utf-8", errors="replace")[:1000]
|
|
1235
|
-
logger.error(
|
|
1236
|
-
"Upstream HTTP %d: %s", resp.status_code, error_text
|
|
1237
|
-
)
|
|
1606
|
+
logger.error("Upstream HTTP %d: %s", resp.status_code, error_text)
|
|
1238
1607
|
|
|
1239
1608
|
# Parse the error for a user-friendly message
|
|
1240
1609
|
error_message = f"Upstream server error (HTTP {resp.status_code})"
|
|
@@ -1257,47 +1626,57 @@ async def messages(request: Request):
|
|
|
1257
1626
|
)
|
|
1258
1627
|
|
|
1259
1628
|
if is_context_overflow:
|
|
1260
|
-
|
|
1629
|
+
monitor.overflow_count += 1
|
|
1261
1630
|
logger.error(
|
|
1262
1631
|
"CONTEXT OVERFLOW detected (count=%d). "
|
|
1263
1632
|
"Estimated input: %d tokens, context window: %d tokens. "
|
|
1264
1633
|
"Conversation needs pruning or context window increase.",
|
|
1265
|
-
|
|
1634
|
+
monitor.overflow_count,
|
|
1635
|
+
estimated_tokens,
|
|
1636
|
+
ctx_window,
|
|
1266
1637
|
)
|
|
1267
1638
|
# Return Anthropic-format error that Claude Code can handle
|
|
1268
1639
|
return Response(
|
|
1269
|
-
content=json.dumps(
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
"
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1640
|
+
content=json.dumps(
|
|
1641
|
+
{
|
|
1642
|
+
"type": "error",
|
|
1643
|
+
"error": {
|
|
1644
|
+
"type": "overloaded_error",
|
|
1645
|
+
"message": (
|
|
1646
|
+
f"Context window exceeded: request requires ~{estimated_tokens} tokens "
|
|
1647
|
+
f"but only {ctx_window} are available. "
|
|
1648
|
+
f"The conversation is too long. Please start a new session or "
|
|
1649
|
+
f"reduce conversation length."
|
|
1650
|
+
),
|
|
1651
|
+
},
|
|
1652
|
+
}
|
|
1653
|
+
),
|
|
1281
1654
|
status_code=529,
|
|
1282
1655
|
media_type="application/json",
|
|
1283
1656
|
)
|
|
1284
1657
|
|
|
1285
1658
|
# Generic upstream error -- return as Anthropic error format
|
|
1286
|
-
error_type =
|
|
1659
|
+
error_type = (
|
|
1660
|
+
"overloaded_error"
|
|
1661
|
+
if resp.status_code >= 500
|
|
1662
|
+
else "invalid_request_error"
|
|
1663
|
+
)
|
|
1287
1664
|
return Response(
|
|
1288
|
-
content=json.dumps(
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
"
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1665
|
+
content=json.dumps(
|
|
1666
|
+
{
|
|
1667
|
+
"type": "error",
|
|
1668
|
+
"error": {
|
|
1669
|
+
"type": error_type,
|
|
1670
|
+
"message": error_message,
|
|
1671
|
+
},
|
|
1672
|
+
}
|
|
1673
|
+
),
|
|
1295
1674
|
status_code=529 if resp.status_code >= 500 else 400,
|
|
1296
1675
|
media_type="application/json",
|
|
1297
1676
|
)
|
|
1298
1677
|
|
|
1299
1678
|
return StreamingResponse(
|
|
1300
|
-
stream_anthropic_response(resp, model),
|
|
1679
|
+
stream_anthropic_response(resp, model, monitor, body),
|
|
1301
1680
|
media_type="text/event-stream",
|
|
1302
1681
|
headers={
|
|
1303
1682
|
"Cache-Control": "no-cache",
|
|
@@ -1314,25 +1693,56 @@ async def messages(request: Request):
|
|
|
1314
1693
|
# Option B: Handle non-streaming errors too
|
|
1315
1694
|
if resp.status_code != 200:
|
|
1316
1695
|
error_text = resp.text[:1000]
|
|
1317
|
-
logger.error(
|
|
1696
|
+
logger.error(
|
|
1697
|
+
"Upstream HTTP %d (non-stream): %s", resp.status_code, error_text
|
|
1698
|
+
)
|
|
1318
1699
|
return Response(
|
|
1319
|
-
content=json.dumps(
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
"
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1700
|
+
content=json.dumps(
|
|
1701
|
+
{
|
|
1702
|
+
"type": "error",
|
|
1703
|
+
"error": {
|
|
1704
|
+
"type": "overloaded_error",
|
|
1705
|
+
"message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
|
|
1706
|
+
},
|
|
1707
|
+
}
|
|
1708
|
+
),
|
|
1326
1709
|
status_code=529,
|
|
1327
1710
|
media_type="application/json",
|
|
1328
1711
|
)
|
|
1329
1712
|
|
|
1330
1713
|
openai_resp = resp.json()
|
|
1714
|
+
|
|
1715
|
+
if PROXY_GUARDRAIL_RETRY and _is_unexpected_end_turn(openai_resp, body):
|
|
1716
|
+
monitor.unexpected_end_turn_count += 1
|
|
1717
|
+
logger.warning(
|
|
1718
|
+
"GUARDRAIL: unexpected end_turn without tool_use in active loop (session=%s), retrying once with tool_choice=required",
|
|
1719
|
+
session_id,
|
|
1720
|
+
)
|
|
1721
|
+
|
|
1722
|
+
retry_body = dict(openai_body)
|
|
1723
|
+
retry_body["tool_choice"] = "required"
|
|
1724
|
+
retry_body["stream"] = False
|
|
1725
|
+
|
|
1726
|
+
retry_resp = await client.post(
|
|
1727
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
1728
|
+
json=retry_body,
|
|
1729
|
+
headers={"Content-Type": "application/json"},
|
|
1730
|
+
)
|
|
1731
|
+
if retry_resp.status_code == 200:
|
|
1732
|
+
retry_json = retry_resp.json()
|
|
1733
|
+
retry_choice = (retry_json.get("choices") or [{}])[0]
|
|
1734
|
+
retry_message = retry_choice.get("message", {})
|
|
1735
|
+
if retry_message.get("tool_calls"):
|
|
1736
|
+
openai_resp = retry_json
|
|
1737
|
+
logger.info(
|
|
1738
|
+
"GUARDRAIL: retry produced tool_use; using retried response"
|
|
1739
|
+
)
|
|
1740
|
+
|
|
1331
1741
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
1332
1742
|
|
|
1333
1743
|
# Track output tokens in session monitor
|
|
1334
1744
|
output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
|
|
1335
|
-
|
|
1745
|
+
monitor.record_response(output_tokens)
|
|
1336
1746
|
|
|
1337
1747
|
return anthropic_resp
|
|
1338
1748
|
|
|
@@ -1377,37 +1787,49 @@ async def health():
|
|
|
1377
1787
|
|
|
1378
1788
|
|
|
1379
1789
|
@app.get("/v1/context")
|
|
1380
|
-
async def context_status():
|
|
1790
|
+
async def context_status(request: Request):
|
|
1381
1791
|
"""Option F: Context window monitoring endpoint.
|
|
1382
1792
|
|
|
1383
1793
|
Returns current session token usage, utilization, warnings, and
|
|
1384
1794
|
estimated remaining turns. Useful for dashboards and debugging.
|
|
1385
1795
|
"""
|
|
1386
|
-
|
|
1387
|
-
|
|
1796
|
+
requested_session = request.query_params.get("session_id", "")
|
|
1797
|
+
session_id = requested_session or last_session_id
|
|
1798
|
+
monitor = session_monitors.get(session_id) if session_id else None
|
|
1799
|
+
|
|
1800
|
+
if monitor is None:
|
|
1801
|
+
monitor = SessionMonitor(context_window=default_context_window)
|
|
1802
|
+
|
|
1803
|
+
warning = monitor.get_warning_level()
|
|
1804
|
+
turns = monitor.estimate_turns_remaining()
|
|
1388
1805
|
|
|
1389
1806
|
return {
|
|
1390
|
-
"
|
|
1391
|
-
"
|
|
1392
|
-
"
|
|
1393
|
-
"
|
|
1394
|
-
"
|
|
1395
|
-
"
|
|
1807
|
+
"active_session_id": session_id,
|
|
1808
|
+
"session_count": len(session_monitors),
|
|
1809
|
+
"context_window": monitor.context_window,
|
|
1810
|
+
"last_input_tokens": monitor.last_input_tokens,
|
|
1811
|
+
"last_output_tokens": monitor.last_output_tokens,
|
|
1812
|
+
"peak_input_tokens": monitor.peak_input_tokens,
|
|
1813
|
+
"utilization": round(monitor.get_utilization(), 4),
|
|
1814
|
+
"utilization_pct": f"{monitor.get_utilization() * 100:.1f}%",
|
|
1396
1815
|
"warning_level": warning,
|
|
1397
1816
|
"estimated_turns_remaining": turns,
|
|
1398
|
-
"total_requests":
|
|
1399
|
-
"prune_count":
|
|
1400
|
-
"overflow_count":
|
|
1817
|
+
"total_requests": monitor.total_requests,
|
|
1818
|
+
"prune_count": monitor.prune_count,
|
|
1819
|
+
"overflow_count": monitor.overflow_count,
|
|
1401
1820
|
"prune_threshold": PROXY_CONTEXT_PRUNE_THRESHOLD,
|
|
1402
|
-
"recent_history":
|
|
1821
|
+
"recent_history": monitor.context_history[-10:],
|
|
1403
1822
|
# Loop protection stats
|
|
1404
1823
|
"loop_protection": {
|
|
1405
|
-
"
|
|
1406
|
-
"
|
|
1407
|
-
"
|
|
1408
|
-
"
|
|
1409
|
-
"
|
|
1410
|
-
"
|
|
1824
|
+
"enabled": PROXY_LOOP_BREAKER,
|
|
1825
|
+
"consecutive_forced_count": monitor.consecutive_forced_count,
|
|
1826
|
+
"no_progress_streak": monitor.no_progress_streak,
|
|
1827
|
+
"loop_warnings_emitted": monitor.loop_warnings_emitted,
|
|
1828
|
+
"unexpected_end_turn_count": monitor.unexpected_end_turn_count,
|
|
1829
|
+
"tool_call_history_len": len(monitor.tool_call_history),
|
|
1830
|
+
"is_looping": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[0],
|
|
1831
|
+
"loop_repeat_count": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[1],
|
|
1832
|
+
"recent_tool_patterns": monitor.tool_call_history[-5:],
|
|
1411
1833
|
},
|
|
1412
1834
|
}
|
|
1413
1835
|
|