@miller-tech/uap 1.13.13 → 1.13.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/benchmarks/speculative-autotune.d.ts +46 -0
- package/dist/benchmarks/speculative-autotune.d.ts.map +1 -0
- package/dist/benchmarks/speculative-autotune.js +145 -0
- package/dist/benchmarks/speculative-autotune.js.map +1 -0
- package/dist/bin/cli.js +2 -0
- package/dist/bin/cli.js.map +1 -1
- package/dist/bin/llama-server-optimize.js +176 -0
- package/dist/bin/llama-server-optimize.js.map +1 -1
- package/dist/cli/init.d.ts +1 -0
- package/dist/cli/init.d.ts.map +1 -1
- package/dist/cli/init.js +18 -0
- package/dist/cli/init.js.map +1 -1
- package/dist/cli/setup.d.ts +1 -0
- package/dist/cli/setup.d.ts.map +1 -1
- package/dist/cli/setup.js +1 -0
- package/dist/cli/setup.js.map +1 -1
- package/dist/cli/systemd-services.d.ts +12 -0
- package/dist/cli/systemd-services.d.ts.map +1 -0
- package/dist/cli/systemd-services.js +179 -0
- package/dist/cli/systemd-services.js.map +1 -0
- package/docs/INDEX.md +1 -0
- package/docs/benchmarks/SPECULATIVE_DECODING_JOURNEY_2026-03.md +221 -0
- package/docs/deployment/QWEN35_LLAMA_CPP.md +76 -0
- package/docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md +279 -0
- package/package.json +1 -1
- package/tools/agents/scripts/anthropic_proxy.py +1444 -197
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +525 -0
|
@@ -76,9 +76,11 @@ Dependencies
|
|
|
76
76
|
"""
|
|
77
77
|
|
|
78
78
|
import asyncio
|
|
79
|
+
import hashlib
|
|
79
80
|
import json
|
|
80
81
|
import logging
|
|
81
82
|
import os
|
|
83
|
+
import re
|
|
82
84
|
import sys
|
|
83
85
|
import time
|
|
84
86
|
import uuid
|
|
@@ -100,7 +102,107 @@ PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
|
|
|
100
102
|
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
|
|
101
103
|
PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
|
|
102
104
|
PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
|
|
103
|
-
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
|
|
105
|
+
# Values that switch a boolean environment flag off (compared after
# stripping whitespace and lower-casing).
_FALSY_ENV_VALUES = frozenset({"0", "false", "off", "no"})


def _env_flag(name: str, default: str) -> bool:
    """Read a boolean feature flag from the environment.

    Args:
        name: Environment variable to read.
        default: Raw string used when the variable is unset ("on"/"off").

    Returns:
        False when the (stripped, lower-cased) value is one of
        "0", "false", "off" or "no"; True for anything else.
    """
    # Strip before comparing so that values such as "off " or " 0" are not
    # silently treated as enabled; the string-mode settings below already
    # normalise their values the same way.
    raw_value = os.environ.get(name, default)
    return raw_value.strip().lower() not in _FALSY_ENV_VALUES


# --- Context pruning ---
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
    os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75")
)
PROXY_CONTEXT_PRUNE_TARGET_FRACTION = float(
    os.environ.get("PROXY_CONTEXT_PRUNE_TARGET_FRACTION", "0.65")
)

# --- Loop breaker (thresholds used when deciding to release tool_choice) ---
PROXY_LOOP_BREAKER = _env_flag("PROXY_LOOP_BREAKER", "on")
PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "8"))
PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "4"))
PROXY_CONTEXT_RELEASE_THRESHOLD = float(
    os.environ.get("PROXY_CONTEXT_RELEASE_THRESHOLD", "0.90")
)
PROXY_GUARDRAIL_RETRY = _env_flag("PROXY_GUARDRAIL_RETRY", "on")

# Session monitors idle for longer than this many seconds are discarded.
PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))

# --- Streaming / reasoning ---
PROXY_STREAM_REASONING_FALLBACK = (
    os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
)
PROXY_STREAM_REASONING_MAX_CHARS = int(
    os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
)

# Minimum max_tokens floor applied to requests; 0 disables the floor.
PROXY_MAX_TOKENS_FLOOR = int(os.environ.get("PROXY_MAX_TOKENS_FLOOR", "16384"))

# --- Tool narrowing ---
PROXY_TOOL_NARROWING = _env_flag("PROXY_TOOL_NARROWING", "off")
PROXY_TOOL_NARROWING_KEEP = int(os.environ.get("PROXY_TOOL_NARROWING_KEEP", "8"))
PROXY_TOOL_NARROWING_MIN_TOOLS = int(
    os.environ.get("PROXY_TOOL_NARROWING_MIN_TOOLS", "12")
)
PROXY_DISABLE_THINKING_ON_TOOL_TURNS = _env_flag(
    "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", "off"
)

# --- Malformed tool-call guardrail ---
PROXY_MALFORMED_TOOL_GUARDRAIL = _env_flag("PROXY_MALFORMED_TOOL_GUARDRAIL", "on")
PROXY_MALFORMED_TOOL_RETRY_MAX = int(
    os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX", "1")
)
PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS = int(
    os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS", "2048")
)
PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE = float(
    os.environ.get("PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE", "0")
)
PROXY_MALFORMED_TOOL_STREAM_STRICT = _env_flag(
    "PROXY_MALFORMED_TOOL_STREAM_STRICT", "off"
)
PROXY_FORCE_NON_STREAM = _env_flag("PROXY_FORCE_NON_STREAM", "off")

# --- Session contamination breaker ---
PROXY_SESSION_CONTAMINATION_BREAKER = _env_flag(
    "PROXY_SESSION_CONTAMINATION_BREAKER", "on"
)
PROXY_SESSION_CONTAMINATION_THRESHOLD = int(
    os.environ.get("PROXY_SESSION_CONTAMINATION_THRESHOLD", "3")
)
PROXY_SESSION_CONTAMINATION_KEEP_LAST = int(
    os.environ.get("PROXY_SESSION_CONTAMINATION_KEEP_LAST", "8")
)

# Which agentic system-prompt supplement to append ("clean" or "legacy").
PROXY_AGENTIC_SUPPLEMENT_MODE = (
    os.environ.get("PROXY_AGENTIC_SUPPLEMENT_MODE", "clean").strip().lower()
)
|
|
104
206
|
|
|
105
207
|
# ---------------------------------------------------------------------------
|
|
106
208
|
# Logging
|
|
@@ -121,19 +223,28 @@ class SessionMonitor:
|
|
|
121
223
|
"""Tracks token usage across the session to provide early warnings
|
|
122
224
|
and enable proactive context management before overflow occurs."""
|
|
123
225
|
|
|
124
|
-
context_window: int = 0
|
|
226
|
+
context_window: int = 0 # Auto-detected or configured
|
|
125
227
|
total_requests: int = 0
|
|
126
|
-
last_input_tokens: int = 0
|
|
127
|
-
last_output_tokens: int = 0
|
|
128
|
-
peak_input_tokens: int = 0
|
|
129
|
-
prune_count: int = 0
|
|
130
|
-
overflow_count: int = 0
|
|
228
|
+
last_input_tokens: int = 0 # Estimated input tokens of last request
|
|
229
|
+
last_output_tokens: int = 0 # Actual output tokens of last response
|
|
230
|
+
peak_input_tokens: int = 0 # High-water mark
|
|
231
|
+
prune_count: int = 0 # How many times pruning was triggered
|
|
232
|
+
overflow_count: int = 0 # How many context overflow errors caught
|
|
131
233
|
context_history: list = field(default_factory=list) # Recent token counts
|
|
132
234
|
|
|
133
235
|
# --- Token Loop Protection ---
|
|
134
|
-
tool_call_history: list = field(
|
|
135
|
-
|
|
136
|
-
|
|
236
|
+
tool_call_history: list = field(
|
|
237
|
+
default_factory=list
|
|
238
|
+
) # Recent tool call fingerprints
|
|
239
|
+
consecutive_forced_count: int = (
|
|
240
|
+
0 # How many times tool_choice was forced consecutively
|
|
241
|
+
)
|
|
242
|
+
loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
|
|
243
|
+
no_progress_streak: int = 0 # Forced tool turns without new tool_result
|
|
244
|
+
unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
|
|
245
|
+
malformed_tool_streak: int = 0 # consecutive malformed pseudo tool payloads
|
|
246
|
+
contamination_resets: int = 0 # how many contamination resets were applied
|
|
247
|
+
last_seen_ts: float = 0.0
|
|
137
248
|
|
|
138
249
|
def record_request(self, estimated_tokens: int):
|
|
139
250
|
"""Record an outgoing request's estimated token count."""
|
|
@@ -150,6 +261,9 @@ class SessionMonitor:
|
|
|
150
261
|
"""Record a response's output token count."""
|
|
151
262
|
self.last_output_tokens = output_tokens
|
|
152
263
|
|
|
264
|
+
def touch(self):
    """Stamp this monitor with the current wall-clock time."""
    current_ts = time.time()
    self.last_seen_ts = current_ts
|
|
266
|
+
|
|
153
267
|
def get_utilization(self) -> float:
|
|
154
268
|
"""Get current context utilization as a fraction (0.0 - 1.0)."""
|
|
155
269
|
if self.context_window <= 0:
|
|
@@ -196,25 +310,36 @@ class SessionMonitor:
|
|
|
196
310
|
if warning == "CRITICAL":
|
|
197
311
|
logger.error(
|
|
198
312
|
"CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
|
|
199
|
-
self.last_input_tokens,
|
|
200
|
-
|
|
313
|
+
self.last_input_tokens,
|
|
314
|
+
self.context_window,
|
|
315
|
+
util * 100,
|
|
316
|
+
turns_str,
|
|
317
|
+
self.prune_count,
|
|
318
|
+
self.overflow_count,
|
|
201
319
|
)
|
|
202
320
|
elif warning == "HIGH":
|
|
203
321
|
logger.warning(
|
|
204
322
|
"CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
|
|
205
|
-
self.last_input_tokens,
|
|
206
|
-
|
|
323
|
+
self.last_input_tokens,
|
|
324
|
+
self.context_window,
|
|
325
|
+
util * 100,
|
|
326
|
+
turns_str,
|
|
327
|
+
self.prune_count,
|
|
207
328
|
)
|
|
208
329
|
elif warning == "ELEVATED":
|
|
209
330
|
logger.warning(
|
|
210
331
|
"CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
|
|
211
|
-
self.last_input_tokens,
|
|
332
|
+
self.last_input_tokens,
|
|
333
|
+
self.context_window,
|
|
334
|
+
util * 100,
|
|
212
335
|
turns_str,
|
|
213
336
|
)
|
|
214
337
|
else:
|
|
215
338
|
logger.info(
|
|
216
339
|
"CONTEXT: %d/%d tokens (%.1f%%), %s",
|
|
217
|
-
self.last_input_tokens,
|
|
340
|
+
self.last_input_tokens,
|
|
341
|
+
self.context_window,
|
|
342
|
+
util * 100,
|
|
218
343
|
turns_str,
|
|
219
344
|
)
|
|
220
345
|
|
|
@@ -264,30 +389,42 @@ class SessionMonitor:
|
|
|
264
389
|
- 15+ consecutive forced requests regardless -> release
|
|
265
390
|
- Context utilization > 90% -> release (let model wrap up)
|
|
266
391
|
"""
|
|
267
|
-
|
|
392
|
+
if not PROXY_LOOP_BREAKER:
|
|
393
|
+
return False
|
|
394
|
+
|
|
395
|
+
is_looping, repeat_count = self.detect_tool_loop(window=PROXY_LOOP_WINDOW)
|
|
268
396
|
|
|
269
397
|
# Pattern 1: Detected tool call loop
|
|
270
|
-
if
|
|
398
|
+
if (
|
|
399
|
+
is_looping
|
|
400
|
+
and repeat_count >= PROXY_LOOP_REPEAT_THRESHOLD
|
|
401
|
+
and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
|
|
402
|
+
):
|
|
271
403
|
logger.warning(
|
|
272
|
-
"LOOP BREAKER: Same tool pattern repeated %d times. "
|
|
404
|
+
"LOOP BREAKER: Same tool pattern repeated %d times with no progress streak=%d. "
|
|
273
405
|
"Releasing tool_choice to 'auto'.",
|
|
274
406
|
repeat_count,
|
|
407
|
+
self.no_progress_streak,
|
|
275
408
|
)
|
|
276
409
|
self.loop_warnings_emitted += 1
|
|
277
410
|
return True
|
|
278
411
|
|
|
279
412
|
# Pattern 2: Too many consecutive forced requests
|
|
280
|
-
if
|
|
413
|
+
if (
|
|
414
|
+
self.consecutive_forced_count >= PROXY_FORCED_THRESHOLD
|
|
415
|
+
and self.no_progress_streak >= PROXY_NO_PROGRESS_THRESHOLD
|
|
416
|
+
):
|
|
281
417
|
logger.warning(
|
|
282
|
-
"LOOP BREAKER: %d consecutive forced tool_choice requests. "
|
|
418
|
+
"LOOP BREAKER: %d consecutive forced tool_choice requests with no progress streak=%d. "
|
|
283
419
|
"Releasing to 'auto'.",
|
|
284
420
|
self.consecutive_forced_count,
|
|
421
|
+
self.no_progress_streak,
|
|
285
422
|
)
|
|
286
423
|
self.loop_warnings_emitted += 1
|
|
287
424
|
return True
|
|
288
425
|
|
|
289
426
|
# Pattern 3: Context almost full -- let model wrap up naturally
|
|
290
|
-
if self.get_utilization() >=
|
|
427
|
+
if self.get_utilization() >= PROXY_CONTEXT_RELEASE_THRESHOLD:
|
|
291
428
|
logger.warning(
|
|
292
429
|
"LOOP BREAKER: Context utilization %.1f%% -- releasing "
|
|
293
430
|
"tool_choice to let model wrap up.",
|
|
@@ -298,7 +435,35 @@ class SessionMonitor:
|
|
|
298
435
|
return False
|
|
299
436
|
|
|
300
437
|
|
|
301
|
-
|
|
438
|
+
# Registry of per-session monitors, keyed by resolved session id.
session_monitors: dict[str, SessionMonitor] = {}
# Context window handed to new monitors; assigned during app startup from
# the upstream auto-detection.
default_context_window = 0
# NOTE(review): presumably the id of the most recently served session --
# its readers/writers are not visible here; confirm against callers.
last_session_id = ""


def _cleanup_stale_monitors(now_ts: float) -> None:
    """Drop every session monitor that has been idle past the session TTL.

    A monitor is considered stale when it has been touched at least once
    (``last_seen_ts > 0``) and more than ``PROXY_SESSION_TTL_SECS`` seconds
    separate ``now_ts`` from that last touch.

    Args:
        now_ts: Reference timestamp (seconds) to measure idleness against.
    """
    expired_ids = []
    for session_key, tracked in session_monitors.items():
        if tracked.last_seen_ts <= 0:
            # Never touched yet -- nothing to age out.
            continue
        if now_ts - tracked.last_seen_ts > PROXY_SESSION_TTL_SECS:
            expired_ids.append(session_key)

    # Remove in a second pass so the dict is not mutated while iterating it.
    for session_key in expired_ids:
        session_monitors.pop(session_key, None)
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def get_session_monitor(session_id: str) -> SessionMonitor:
    """Return the monitor for ``session_id``, creating it on first use.

    Stale monitors are purged first. The returned monitor is touched (its
    last-seen timestamp refreshed) and, if it has no positive context
    window yet, inherits the module-level ``default_context_window``.

    Args:
        session_id: Key identifying the session in ``session_monitors``.

    Returns:
        The existing or newly registered ``SessionMonitor``.
    """
    _cleanup_stale_monitors(time.time())

    tracked = session_monitors.get(session_id)
    if tracked is None:
        tracked = SessionMonitor(context_window=default_context_window)
        session_monitors[session_id] = tracked

    tracked.touch()
    # A monitor created before the window was known picks it up here.
    if tracked.context_window <= 0:
        tracked.context_window = default_context_window
    return tracked
|
|
302
467
|
|
|
303
468
|
|
|
304
469
|
# ---------------------------------------------------------------------------
|
|
@@ -324,7 +489,8 @@ async def detect_context_window(client: httpx.AsyncClient) -> int:
|
|
|
324
489
|
if n_ctx > 0:
|
|
325
490
|
logger.info(
|
|
326
491
|
"Auto-detected context window from upstream: %d tokens (%d slots)",
|
|
327
|
-
n_ctx,
|
|
492
|
+
n_ctx,
|
|
493
|
+
len(slots),
|
|
328
494
|
)
|
|
329
495
|
return n_ctx
|
|
330
496
|
except Exception as exc:
|
|
@@ -398,7 +564,9 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
|
|
|
398
564
|
return tokens
|
|
399
565
|
|
|
400
566
|
|
|
401
|
-
def prune_conversation(
|
|
567
|
+
def prune_conversation(
|
|
568
|
+
anthropic_body: dict, context_window: int, target_fraction: float = 0.65
|
|
569
|
+
) -> dict:
|
|
402
570
|
"""Prune the conversation to fit within the context window.
|
|
403
571
|
|
|
404
572
|
Strategy:
|
|
@@ -445,19 +613,24 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
445
613
|
|
|
446
614
|
# Always keep the first user message and the last N messages
|
|
447
615
|
KEEP_LAST = 8 # Keep the last 8 messages (recent context)
|
|
448
|
-
protected_head = messages[:1]
|
|
449
|
-
protected_tail =
|
|
616
|
+
protected_head = messages[:1] # First user message
|
|
617
|
+
protected_tail = (
|
|
618
|
+
messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
|
|
619
|
+
)
|
|
450
620
|
middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []
|
|
451
621
|
|
|
452
622
|
# Calculate tokens for protected messages
|
|
453
|
-
protected_tokens = sum(
|
|
623
|
+
protected_tokens = sum(
|
|
624
|
+
estimate_message_tokens(m) for m in protected_head + protected_tail
|
|
625
|
+
)
|
|
454
626
|
|
|
455
627
|
if protected_tokens >= message_budget:
|
|
456
628
|
# Even protected messages exceed budget -- truncate tool_result content
|
|
457
629
|
# in the tail to fit
|
|
458
630
|
logger.warning(
|
|
459
631
|
"Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
|
|
460
|
-
protected_tokens,
|
|
632
|
+
protected_tokens,
|
|
633
|
+
message_budget,
|
|
461
634
|
)
|
|
462
635
|
for msg in protected_tail:
|
|
463
636
|
content = msg.get("content", [])
|
|
@@ -466,7 +639,11 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
466
639
|
if isinstance(block, dict) and block.get("type") == "tool_result":
|
|
467
640
|
result_text = _extract_text(block.get("content", ""))
|
|
468
641
|
if len(result_text) > 2000:
|
|
469
|
-
block["content"] =
|
|
642
|
+
block["content"] = (
|
|
643
|
+
result_text[:1000]
|
|
644
|
+
+ "\n...[TRUNCATED]...\n"
|
|
645
|
+
+ result_text[-500:]
|
|
646
|
+
)
|
|
470
647
|
anthropic_body["messages"] = protected_head + protected_tail
|
|
471
648
|
return anthropic_body
|
|
472
649
|
|
|
@@ -486,8 +663,7 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
486
663
|
|
|
487
664
|
if isinstance(content, list):
|
|
488
665
|
is_tool_result = any(
|
|
489
|
-
isinstance(b, dict) and b.get("type") == "tool_result"
|
|
490
|
-
for b in content
|
|
666
|
+
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
|
491
667
|
)
|
|
492
668
|
|
|
493
669
|
# Lower priority = removed first
|
|
@@ -529,12 +705,17 @@ def prune_conversation(anthropic_body: dict, context_window: int, target_fractio
|
|
|
529
705
|
f"The conversation continues from recent context below.]"
|
|
530
706
|
),
|
|
531
707
|
}
|
|
532
|
-
anthropic_body["messages"] =
|
|
708
|
+
anthropic_body["messages"] = (
|
|
709
|
+
protected_head + [prune_marker] + kept_msgs + protected_tail
|
|
710
|
+
)
|
|
533
711
|
logger.warning(
|
|
534
712
|
"PRUNED: removed %d messages (~%d tokens), kept %d messages, "
|
|
535
713
|
"target=%.0f%% of %d ctx",
|
|
536
|
-
removed_count,
|
|
537
|
-
|
|
714
|
+
removed_count,
|
|
715
|
+
removed_tokens,
|
|
716
|
+
len(anthropic_body["messages"]),
|
|
717
|
+
target_fraction * 100,
|
|
718
|
+
context_window,
|
|
538
719
|
)
|
|
539
720
|
else:
|
|
540
721
|
anthropic_body["messages"] = protected_head + kept_msgs + protected_tail
|
|
@@ -554,12 +735,13 @@ http_client: httpx.AsyncClient | None = None
|
|
|
554
735
|
async def lifespan(app: FastAPI):
|
|
555
736
|
"""Manage the httpx client lifecycle with the FastAPI app."""
|
|
556
737
|
global http_client
|
|
738
|
+
global default_context_window
|
|
557
739
|
http_client = httpx.AsyncClient(
|
|
558
740
|
timeout=httpx.Timeout(
|
|
559
|
-
connect=10.0,
|
|
560
|
-
read=PROXY_READ_TIMEOUT,
|
|
561
|
-
write=30.0,
|
|
562
|
-
pool=10.0,
|
|
741
|
+
connect=10.0, # 10s to establish connection
|
|
742
|
+
read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
|
|
743
|
+
write=30.0, # 30s to send the request body
|
|
744
|
+
pool=10.0, # 10s to acquire a pool connection
|
|
563
745
|
),
|
|
564
746
|
limits=httpx.Limits(
|
|
565
747
|
max_connections=PROXY_MAX_CONNECTIONS,
|
|
@@ -569,15 +751,31 @@ async def lifespan(app: FastAPI):
|
|
|
569
751
|
)
|
|
570
752
|
logger.info(
|
|
571
753
|
"Proxy started: listening on %s:%d -> upstream %s",
|
|
572
|
-
PROXY_HOST,
|
|
754
|
+
PROXY_HOST,
|
|
755
|
+
PROXY_PORT,
|
|
756
|
+
LLAMA_CPP_BASE,
|
|
573
757
|
)
|
|
574
758
|
|
|
575
759
|
# Auto-detect context window from upstream server
|
|
576
|
-
|
|
760
|
+
default_context_window = await detect_context_window(http_client)
|
|
761
|
+
for mon in session_monitors.values():
|
|
762
|
+
if mon.context_window <= 0:
|
|
763
|
+
mon.context_window = default_context_window
|
|
577
764
|
logger.info(
|
|
578
|
-
"Context window: %d tokens, prune threshold: %.0f%%",
|
|
579
|
-
|
|
765
|
+
"Context window: %d tokens, prune threshold: %.0f%%, prune target: %.0f%%",
|
|
766
|
+
default_context_window,
|
|
580
767
|
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
768
|
+
_resolve_prune_target_fraction() * 100,
|
|
769
|
+
)
|
|
770
|
+
logger.info(
|
|
771
|
+
"Guardrails: malformed=%s stream_strict=%s force_non_stream=%s tool_narrowing=%s thinking_off_on_tools=%s contamination_breaker=%s(%d)",
|
|
772
|
+
PROXY_MALFORMED_TOOL_GUARDRAIL,
|
|
773
|
+
PROXY_MALFORMED_TOOL_STREAM_STRICT,
|
|
774
|
+
PROXY_FORCE_NON_STREAM,
|
|
775
|
+
PROXY_TOOL_NARROWING,
|
|
776
|
+
PROXY_DISABLE_THINKING_ON_TOOL_TURNS,
|
|
777
|
+
PROXY_SESSION_CONTAMINATION_BREAKER,
|
|
778
|
+
PROXY_SESSION_CONTAMINATION_THRESHOLD,
|
|
581
779
|
)
|
|
582
780
|
|
|
583
781
|
yield
|
|
@@ -598,6 +796,7 @@ app = FastAPI(
|
|
|
598
796
|
# Request Translation: Anthropic -> OpenAI
|
|
599
797
|
# ===========================================================================
|
|
600
798
|
|
|
799
|
+
|
|
601
800
|
def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
602
801
|
"""Convert Anthropic message format to OpenAI message format.
|
|
603
802
|
|
|
@@ -635,25 +834,33 @@ def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
|
635
834
|
elif block.get("type") == "text":
|
|
636
835
|
parts.append(block.get("text", ""))
|
|
637
836
|
elif block.get("type") == "tool_use":
|
|
638
|
-
messages.append(
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
"
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
837
|
+
messages.append(
|
|
838
|
+
{
|
|
839
|
+
"role": "assistant",
|
|
840
|
+
"content": None,
|
|
841
|
+
"tool_calls": [
|
|
842
|
+
{
|
|
843
|
+
"id": block.get(
|
|
844
|
+
"id", f"call_{uuid.uuid4().hex[:8]}"
|
|
845
|
+
),
|
|
846
|
+
"type": "function",
|
|
847
|
+
"function": {
|
|
848
|
+
"name": block["name"],
|
|
849
|
+
"arguments": json.dumps(block.get("input", {})),
|
|
850
|
+
},
|
|
851
|
+
}
|
|
852
|
+
],
|
|
853
|
+
}
|
|
854
|
+
)
|
|
650
855
|
continue
|
|
651
856
|
elif block.get("type") == "tool_result":
|
|
652
|
-
messages.append(
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
857
|
+
messages.append(
|
|
858
|
+
{
|
|
859
|
+
"role": "tool",
|
|
860
|
+
"tool_call_id": block.get("tool_use_id", ""),
|
|
861
|
+
"content": _extract_text(block.get("content", "")),
|
|
862
|
+
}
|
|
863
|
+
)
|
|
657
864
|
continue
|
|
658
865
|
if parts:
|
|
659
866
|
messages.append({"role": role, "content": "\n".join(parts)})
|
|
@@ -672,7 +879,7 @@ def _extract_text(content) -> str:
|
|
|
672
879
|
return str(content)
|
|
673
880
|
|
|
674
881
|
|
|
675
|
-
|
|
882
|
+
_AGENTIC_SYSTEM_SUPPLEMENT_LEGACY = (
|
|
676
883
|
"\n\n<agentic-protocol>\n"
|
|
677
884
|
"You are operating in an agentic coding loop with tool access. Follow these rules:\n"
|
|
678
885
|
"1. ALWAYS use tools to read, edit, write, and test code. Never just describe or explain what should be done.\n"
|
|
@@ -685,8 +892,183 @@ _AGENTIC_SYSTEM_SUPPLEMENT = (
|
|
|
685
892
|
"</agentic-protocol>"
|
|
686
893
|
)
|
|
687
894
|
|
|
895
|
+
_AGENTIC_SYSTEM_SUPPLEMENT_CLEAN = (
|
|
896
|
+
"\n\n<agentic-protocol>\n"
|
|
897
|
+
"You are operating in an agentic coding loop with tool access. Follow these rules:\n"
|
|
898
|
+
"1. Use tools for concrete work (read, edit, write, test) instead of stopping at analysis.\n"
|
|
899
|
+
"2. When a fix is identified, take the next tool action immediately.\n"
|
|
900
|
+
"3. Return final text only when the task is complete and verified.\n"
|
|
901
|
+
"4. Never output protocol fragments or raw tool schema in assistant text.\n"
|
|
902
|
+
"5. Never emit literal tag artifacts such as </parameter>, <tool_call>, or <function=...>.\n"
|
|
903
|
+
"6. When a tool is needed, emit a valid tool call object instead of prose about tool-call formatting.\n"
|
|
904
|
+
"7. If a tool call fails, adapt and try another approach.\n"
|
|
905
|
+
"</agentic-protocol>"
|
|
906
|
+
)
|
|
907
|
+
|
|
908
|
+
# Choose the active supplement: "legacy" selects the legacy text, while
# "clean" -- and, after a warning, any unrecognised mode -- selects the
# clean text.
if PROXY_AGENTIC_SUPPLEMENT_MODE not in ("legacy", "clean"):
    logger.warning(
        "Unknown PROXY_AGENTIC_SUPPLEMENT_MODE=%r; using clean supplement",
        PROXY_AGENTIC_SUPPLEMENT_MODE,
    )
_AGENTIC_SYSTEM_SUPPLEMENT = (
    _AGENTIC_SYSTEM_SUPPLEMENT_LEGACY
    if PROXY_AGENTIC_SUPPLEMENT_MODE == "legacy"
    else _AGENTIC_SYSTEM_SUPPLEMENT_CLEAN
)
|
|
918
|
+
|
|
688
919
|
|
|
689
|
-
def
|
|
920
|
+
def _content_fingerprint(content) -> str:
    """Condense message/system content into a short, stable string.

    * ``str`` content is truncated to 512 characters.
    * ``list`` content is reduced block by block -- raw strings as-is, text
      blocks to their text, ``tool_use`` blocks to ``tool:<name>`` and
      ``tool_result`` blocks to ``result:<tool_use_id>`` -- joined with
      newlines and truncated to 1024 characters. Other entries are ignored.
    * Anything else is stringified and truncated to 512 characters.
    """
    if isinstance(content, str):
        return content[:512]
    if not isinstance(content, list):
        return str(content)[:512]

    def _summaries():
        # Yield one summary per recognised entry, in order.
        for entry in content:
            if isinstance(entry, str):
                yield entry
                continue
            if not isinstance(entry, dict):
                continue
            kind = entry.get("type", "")
            if kind == "text":
                yield entry.get("text", "")
            elif kind == "tool_use":
                yield f"tool:{entry.get('name', '')}"
            elif kind == "tool_result":
                yield f"result:{entry.get('tool_use_id', '')}"

    return "\n".join(list(_summaries()))[:1024]
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
def resolve_session_id(request: Request, anthropic_body: dict) -> str:
    """Derive a session identifier for an incoming request.

    Resolution order:
      1. The first non-empty header among ``x-uap-session-id``,
         ``x-claude-session-id``, ``anthropic-session-id`` and
         ``x-session-id`` -> ``hdr:<value>``.
      2. The first non-empty ``metadata`` entry among ``session_id``,
         ``conversation_id`` and ``thread_id`` -> ``meta:<value>``.
      3. Otherwise ``fp:<digest>``: the first 20 hex characters of a
         SHA-256 over client host, model, system-prompt fingerprint and the
         fingerprint of the first user message.
    """
    for header_name in (
        "x-uap-session-id",
        "x-claude-session-id",
        "anthropic-session-id",
        "x-session-id",
    ):
        header_value = request.headers.get(header_name)
        if header_value:
            return f"hdr:{header_value}"

    meta = anthropic_body.get("metadata", {})
    if isinstance(meta, dict):
        for meta_key in ("session_id", "conversation_id", "thread_id"):
            meta_value = meta.get(meta_key)
            if meta_value:
                return f"meta:{meta_value}"

    # No explicit id supplied: fall back to a content-derived fingerprint.
    opening_user_msg = next(
        (
            entry
            for entry in anthropic_body.get("messages", [])
            if entry.get("role") == "user"
        ),
        None,
    )
    if opening_user_msg is None:
        first_user = ""
    else:
        first_user = _content_fingerprint(opening_user_msg.get("content", ""))

    system_fingerprint = _content_fingerprint(anthropic_body.get("system", ""))
    model = anthropic_body.get("model", "default")
    remote = "unknown" if not request.client else request.client.host

    seed = f"{remote}|{model}|{system_fingerprint}|{first_user}"
    hasher = hashlib.sha256(seed.encode("utf-8", errors="ignore"))
    digest = hasher.hexdigest()[:20]
    return f"fp:{digest}"
|
|
974
|
+
|
|
975
|
+
|
|
976
|
+
def _last_user_has_tool_result(anthropic_body: dict) -> bool:
    """Report whether the most recent user message carries a tool_result.

    Only the last message with role ``user`` is inspected. The answer is
    False when there is no user message, when that message's content is not
    a list, or when none of its blocks is a ``tool_result`` dict.
    """
    history = anthropic_body.get("messages", [])
    latest_user = next(
        (entry for entry in reversed(history) if entry.get("role") == "user"),
        None,
    )
    if latest_user is None:
        return False

    blocks = latest_user.get("content")
    if not isinstance(blocks, list):
        return False

    for part in blocks:
        if isinstance(part, dict) and part.get("type") == "tool_result":
            return True
    return False
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
def _convert_anthropic_tools_to_openai(anthropic_tools: list[dict]) -> list[dict]:
    """Translate Anthropic tool definitions into OpenAI function tools.

    Each input tool becomes ``{"type": "function", "function": {...}}``
    where ``name`` and ``description`` default to an empty string and
    ``parameters`` is taken from ``input_schema`` (default ``{}``).
    """
    return [
        {
            "type": "function",
            "function": {
                "name": spec.get("name", ""),
                "description": spec.get("description", ""),
                "parameters": spec.get("input_schema", {}),
            },
        }
        for spec in anthropic_tools
    ]
|
|
1004
|
+
|
|
1005
|
+
|
|
1006
|
+
def _latest_user_text(anthropic_body: dict) -> str:
    """Return the extracted text of the most recent user message.

    Yields an empty string when the request holds no user message.
    """
    history = anthropic_body.get("messages", [])
    newest_user = next(
        (entry for entry in reversed(history) if entry.get("role") == "user"),
        None,
    )
    if newest_user is None:
        return ""
    return _extract_text(newest_user.get("content", ""))
|
|
1012
|
+
|
|
1013
|
+
|
|
1014
|
+
def _tokenize_for_tool_ranking(text: str) -> set[str]:
    """Split ``text`` into a set of lower-cased word tokens.

    A token is a run of two or more ASCII letters, digits or underscores.
    """
    words = re.findall(r"[a-zA-Z0-9_]{2,}", text)
    return set(map(str.lower, words))
|
|
1016
|
+
|
|
1017
|
+
|
|
1018
|
+
def _narrow_tools_for_request(
    anthropic_body: dict, openai_tools: list[dict]
) -> list[dict]:
    """Trim the tool list to the entries most relevant to the latest user text.

    The input list is returned untouched when narrowing is disabled, when
    there are fewer tools than ``PROXY_TOOL_NARROWING_MIN_TOOLS``, or when
    the keep limit already covers every tool. Otherwise each tool is scored
    against the tokens of the latest user message:

    * 3 points per token shared with the tool's name/description,
    * 4 points if the lower-cased tool name occurs in the user text,
    * 1 point if any user token occurs inside the lower-cased tool name.

    The ``PROXY_TOOL_NARROWING_KEEP`` best tools (ties go to the earlier
    tool) are kept, in their original order. With no usable tokens the
    first ``keep`` tools are returned.
    """
    if not PROXY_TOOL_NARROWING:
        return openai_tools

    total = len(openai_tools)
    if total < max(1, PROXY_TOOL_NARROWING_MIN_TOOLS):
        return openai_tools

    limit = max(1, PROXY_TOOL_NARROWING_KEEP)
    if limit >= total:
        return openai_tools

    prompt = _latest_user_text(anthropic_body).lower()
    prompt_terms = _tokenize_for_tool_ranking(prompt)
    if not prompt_terms:
        # Nothing to rank against -- fall back to the leading tools.
        narrowed = openai_tools[:limit]
        logger.info(
            "TOOL NARROWING: %d -> %d tools (no query tokens)",
            total,
            len(narrowed),
        )
        return narrowed

    def _relevance(tool: dict) -> int:
        # Score a single tool against the prompt tokens.
        spec = tool.get("function", {})
        tool_name = spec.get("name", "")
        summary = spec.get("description", "")
        vocabulary = _tokenize_for_tool_ranking(f"{tool_name} {summary}".lower())
        points = 3 * len(prompt_terms & vocabulary)
        if tool_name:
            lowered = tool_name.lower()
            if lowered in prompt:
                points += 4
            if any(term in lowered for term in prompt_terms):
                points += 1
        return points

    relevance = [_relevance(tool) for tool in openai_tools]
    # Highest score first; equal scores keep the earlier position first.
    ranking = sorted(range(total), key=lambda pos: (-relevance[pos], pos))
    winners = {id(openai_tools[pos]) for pos in ranking[:limit]}
    # Re-filter the input so the survivors keep their original ordering.
    narrowed = [tool for tool in openai_tools if id(tool) in winners]

    top_names = [entry.get("function", {}).get("name", "") for entry in narrowed[:4]]
    logger.info(
        "TOOL NARROWING: %d -> %d tools (top=%s)",
        total,
        len(narrowed),
        top_names,
    )
    return narrowed
|
|
1069
|
+
|
|
1070
|
+
|
|
1071
|
+
def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
|
|
690
1072
|
"""Build an OpenAI Chat Completions request from an Anthropic Messages request."""
|
|
691
1073
|
openai_body = {
|
|
692
1074
|
"model": anthropic_body.get("model", "default"),
|
|
@@ -700,23 +1082,26 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
700
1082
|
openai_body["messages"][0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
|
|
701
1083
|
else:
|
|
702
1084
|
# No system message from the client; inject one.
|
|
703
|
-
openai_body["messages"].insert(
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
1085
|
+
openai_body["messages"].insert(
|
|
1086
|
+
0,
|
|
1087
|
+
{
|
|
1088
|
+
"role": "system",
|
|
1089
|
+
"content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
|
|
1090
|
+
},
|
|
1091
|
+
)
|
|
707
1092
|
|
|
708
1093
|
if "max_tokens" in anthropic_body:
|
|
709
|
-
# Enforce minimum floor for thinking mode: model needs
|
|
710
|
-
# reasoning (<think>...</think>) plus
|
|
711
|
-
#
|
|
712
|
-
requested_max =
|
|
1094
|
+
# Enforce configurable minimum floor for thinking mode: model needs
|
|
1095
|
+
# tokens for reasoning (<think>...</think>) plus actual response/tool
|
|
1096
|
+
# calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
|
|
1097
|
+
requested_max = _resolve_max_tokens_request(anthropic_body["max_tokens"])
|
|
713
1098
|
|
|
714
1099
|
# Option E: Smart max_tokens capping — prevent the response from
|
|
715
1100
|
# consuming so many tokens that the NEXT turn's input won't fit.
|
|
716
1101
|
# Formula: max_tokens = min(requested, context_window - input_tokens - safety_margin)
|
|
717
1102
|
# This ensures the model's output + current input stays within bounds,
|
|
718
1103
|
# leaving room for the next turn's incremental growth.
|
|
719
|
-
ctx_window =
|
|
1104
|
+
ctx_window = monitor.context_window
|
|
720
1105
|
if ctx_window > 0:
|
|
721
1106
|
estimated_input = estimate_total_tokens(anthropic_body)
|
|
722
1107
|
# Reserve 15% of context for next-turn growth (tool results, etc.)
|
|
@@ -725,8 +1110,11 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
725
1110
|
if available_for_output < requested_max and available_for_output > 1024:
|
|
726
1111
|
logger.info(
|
|
727
1112
|
"MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
|
|
728
|
-
requested_max,
|
|
729
|
-
|
|
1113
|
+
requested_max,
|
|
1114
|
+
available_for_output,
|
|
1115
|
+
ctx_window,
|
|
1116
|
+
estimated_input,
|
|
1117
|
+
safety_margin,
|
|
730
1118
|
)
|
|
731
1119
|
requested_max = available_for_output
|
|
732
1120
|
elif available_for_output <= 1024:
|
|
@@ -734,7 +1122,9 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
734
1122
|
logger.warning(
|
|
735
1123
|
"MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
|
|
736
1124
|
"Response may be truncated.",
|
|
737
|
-
available_for_output,
|
|
1125
|
+
available_for_output,
|
|
1126
|
+
ctx_window,
|
|
1127
|
+
estimated_input,
|
|
738
1128
|
)
|
|
739
1129
|
requested_max = max(1024, available_for_output)
|
|
740
1130
|
|
|
@@ -748,16 +1138,12 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
748
1138
|
|
|
749
1139
|
# Convert Anthropic tools to OpenAI function-calling tools
|
|
750
1140
|
if "tools" in anthropic_body:
|
|
751
|
-
openai_body["tools"] =
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
"description": tool.get("description", ""),
|
|
758
|
-
"parameters": tool.get("input_schema", {}),
|
|
759
|
-
},
|
|
760
|
-
})
|
|
1141
|
+
openai_body["tools"] = _convert_anthropic_tools_to_openai(
|
|
1142
|
+
anthropic_body.get("tools", [])
|
|
1143
|
+
)
|
|
1144
|
+
openai_body["tools"] = _narrow_tools_for_request(
|
|
1145
|
+
anthropic_body, openai_body["tools"]
|
|
1146
|
+
)
|
|
761
1147
|
|
|
762
1148
|
# Smart tool_choice: force tool calls during the agentic loop to
|
|
763
1149
|
# prevent the model from producing text-only end_turn responses that
|
|
@@ -774,7 +1160,8 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
774
1160
|
# runaway token consumption.
|
|
775
1161
|
n_msgs = len(anthropic_body.get("messages", []))
|
|
776
1162
|
has_tool_results = any(
|
|
777
|
-
isinstance(m.get("content"), list)
|
|
1163
|
+
isinstance(m.get("content"), list)
|
|
1164
|
+
and any(
|
|
778
1165
|
isinstance(b, dict) and b.get("type") == "tool_result"
|
|
779
1166
|
for b in m.get("content", [])
|
|
780
1167
|
)
|
|
@@ -782,28 +1169,47 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
782
1169
|
)
|
|
783
1170
|
|
|
784
1171
|
# Record tool calls from the last assistant message for loop detection
|
|
785
|
-
_record_last_assistant_tool_calls(anthropic_body)
|
|
1172
|
+
_record_last_assistant_tool_calls(anthropic_body, monitor)
|
|
1173
|
+
last_user_has_tool_result = _last_user_has_tool_result(anthropic_body)
|
|
786
1174
|
|
|
787
1175
|
# Check if loop breaker should override tool_choice
|
|
788
|
-
if
|
|
1176
|
+
if monitor.should_release_tool_choice():
|
|
789
1177
|
openai_body["tool_choice"] = "auto"
|
|
790
|
-
|
|
1178
|
+
monitor.consecutive_forced_count = 0
|
|
1179
|
+
monitor.no_progress_streak = 0
|
|
791
1180
|
logger.warning("tool_choice set to 'auto' by LOOP BREAKER")
|
|
792
1181
|
elif _last_assistant_was_text_only(anthropic_body):
|
|
793
1182
|
openai_body["tool_choice"] = "required"
|
|
794
|
-
|
|
795
|
-
|
|
1183
|
+
monitor.consecutive_forced_count += 1
|
|
1184
|
+
monitor.no_progress_streak = (
|
|
1185
|
+
0 if last_user_has_tool_result else monitor.no_progress_streak + 1
|
|
1186
|
+
)
|
|
1187
|
+
logger.info(
|
|
1188
|
+
"tool_choice forced to 'required' (last assistant was text-only)"
|
|
1189
|
+
)
|
|
796
1190
|
elif has_tool_results and n_msgs > 2:
|
|
797
1191
|
openai_body["tool_choice"] = "required"
|
|
798
|
-
|
|
799
|
-
|
|
1192
|
+
monitor.consecutive_forced_count += 1
|
|
1193
|
+
monitor.no_progress_streak = (
|
|
1194
|
+
0 if last_user_has_tool_result else monitor.no_progress_streak + 1
|
|
1195
|
+
)
|
|
1196
|
+
logger.info(
|
|
1197
|
+
"tool_choice forced to 'required' (active agentic loop with tool results)"
|
|
1198
|
+
)
|
|
800
1199
|
else:
|
|
801
|
-
|
|
1200
|
+
monitor.consecutive_forced_count = 0
|
|
1201
|
+
monitor.no_progress_streak = 0
|
|
1202
|
+
|
|
1203
|
+
if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
1204
|
+
openai_body["enable_thinking"] = False
|
|
1205
|
+
logger.info(
|
|
1206
|
+
"Thinking disabled for tool turn (PROXY_DISABLE_THINKING_ON_TOOL_TURNS=on)"
|
|
1207
|
+
)
|
|
802
1208
|
|
|
803
1209
|
return openai_body
|
|
804
1210
|
|
|
805
1211
|
|
|
806
|
-
def _record_last_assistant_tool_calls(anthropic_body: dict):
|
|
1212
|
+
def _record_last_assistant_tool_calls(anthropic_body: dict, monitor: SessionMonitor):
|
|
807
1213
|
"""Extract tool call names from the last assistant message and record
|
|
808
1214
|
them in the session monitor for loop detection."""
|
|
809
1215
|
messages = anthropic_body.get("messages", [])
|
|
@@ -818,7 +1224,88 @@ def _record_last_assistant_tool_calls(anthropic_body: dict):
|
|
|
818
1224
|
tool_names.append(block.get("name", "unknown"))
|
|
819
1225
|
break
|
|
820
1226
|
if tool_names:
|
|
821
|
-
|
|
1227
|
+
monitor.record_tool_calls(tool_names)
|
|
1228
|
+
|
|
1229
|
+
|
|
1230
|
+
def _is_unexpected_end_turn(openai_resp: dict, anthropic_body: dict) -> bool:
|
|
1231
|
+
choices = openai_resp.get("choices") or []
|
|
1232
|
+
if not choices:
|
|
1233
|
+
return False
|
|
1234
|
+
|
|
1235
|
+
choice = choices[0]
|
|
1236
|
+
finish = choice.get("finish_reason")
|
|
1237
|
+
if finish not in {"stop", "end_turn"}:
|
|
1238
|
+
return False
|
|
1239
|
+
|
|
1240
|
+
msg = choice.get("message", {})
|
|
1241
|
+
if msg.get("tool_calls"):
|
|
1242
|
+
return False
|
|
1243
|
+
|
|
1244
|
+
if "tools" not in anthropic_body:
|
|
1245
|
+
return False
|
|
1246
|
+
|
|
1247
|
+
has_tool_results = any(
|
|
1248
|
+
isinstance(m.get("content"), list)
|
|
1249
|
+
and any(
|
|
1250
|
+
isinstance(b, dict) and b.get("type") == "tool_result"
|
|
1251
|
+
for b in m.get("content", [])
|
|
1252
|
+
)
|
|
1253
|
+
for m in anthropic_body.get("messages", [])
|
|
1254
|
+
)
|
|
1255
|
+
|
|
1256
|
+
return has_tool_results or _last_assistant_was_text_only(anthropic_body)
|
|
1257
|
+
|
|
1258
|
+
|
|
1259
|
+
def _resolve_max_tokens_request(requested_max_tokens: int) -> int:
    """Clamp a client-requested ``max_tokens`` to the configured floor.

    The request is coerced to an int of at least 1. When
    ``PROXY_MAX_TOKENS_FLOOR`` is zero or negative the floor is disabled and
    the coerced request is returned unchanged; otherwise the larger of the two
    values wins.
    """
    at_least_one = max(1, int(requested_max_tokens))
    minimum = max(0, PROXY_MAX_TOKENS_FLOOR)
    return at_least_one if minimum == 0 else max(at_least_one, minimum)
|
|
1265
|
+
|
|
1266
|
+
|
|
1267
|
+
def _resolve_prune_target_fraction() -> float:
    """Return the configured prune target fraction, or 0.65 if it is invalid.

    A value is valid only when it lies strictly between 0 and 1; anything else
    is logged as a warning and replaced by the default.
    """
    configured = PROXY_CONTEXT_PRUNE_TARGET_FRACTION
    if not (0.0 < configured < 1.0):
        logger.warning(
            "Invalid PROXY_CONTEXT_PRUNE_TARGET_FRACTION=%s; using default 0.65",
            configured,
        )
        return 0.65
    return configured
|
|
1275
|
+
|
|
1276
|
+
|
|
1277
|
+
def _sanitize_reasoning_fallback_text(reasoning_text: str) -> str:
    """Strip ``<think>`` tags, collapse whitespace and cap the length.

    Returns an empty string when nothing is left after cleaning. Text longer
    than ``PROXY_STREAM_REASONING_MAX_CHARS`` is cut at that length,
    right-stripped, and suffixed with ``...``.
    """
    without_tags = re.sub(r"</?think>", "", reasoning_text, flags=re.IGNORECASE)
    collapsed = re.sub(r"\s+", " ", without_tags).strip()
    if not collapsed:
        return ""
    if len(collapsed) <= PROXY_STREAM_REASONING_MAX_CHARS:
        return collapsed
    return collapsed[:PROXY_STREAM_REASONING_MAX_CHARS].rstrip() + "..."
|
|
1285
|
+
|
|
1286
|
+
|
|
1287
|
+
def _build_reasoning_fallback_text(
|
|
1288
|
+
reasoning_chunks: list[str], mode: str | None = None
|
|
1289
|
+
) -> str | None:
|
|
1290
|
+
fallback_mode = (mode or PROXY_STREAM_REASONING_FALLBACK).strip().lower()
|
|
1291
|
+
if fallback_mode == "off":
|
|
1292
|
+
return None
|
|
1293
|
+
|
|
1294
|
+
raw_text = "".join(reasoning_chunks).strip()
|
|
1295
|
+
if not raw_text:
|
|
1296
|
+
return None
|
|
1297
|
+
|
|
1298
|
+
if fallback_mode == "visible":
|
|
1299
|
+
return raw_text
|
|
1300
|
+
if fallback_mode == "sanitized":
|
|
1301
|
+
sanitized = _sanitize_reasoning_fallback_text(raw_text)
|
|
1302
|
+
return sanitized or None
|
|
1303
|
+
|
|
1304
|
+
logger.warning(
|
|
1305
|
+
"Unknown PROXY_STREAM_REASONING_FALLBACK=%r; disabling reasoning fallback",
|
|
1306
|
+
fallback_mode,
|
|
1307
|
+
)
|
|
1308
|
+
return None
|
|
822
1309
|
|
|
823
1310
|
|
|
824
1311
|
def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
@@ -836,11 +1323,14 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
|
836
1323
|
return bool(content.strip())
|
|
837
1324
|
if isinstance(content, list):
|
|
838
1325
|
has_tool_use = any(
|
|
839
|
-
isinstance(b, dict) and b.get("type") == "tool_use"
|
|
840
|
-
for b in content
|
|
1326
|
+
isinstance(b, dict) and b.get("type") == "tool_use" for b in content
|
|
841
1327
|
)
|
|
842
1328
|
has_text = any(
|
|
843
|
-
(
|
|
1329
|
+
(
|
|
1330
|
+
isinstance(b, dict)
|
|
1331
|
+
and b.get("type") == "text"
|
|
1332
|
+
and b.get("text", "").strip()
|
|
1333
|
+
)
|
|
844
1334
|
or isinstance(b, str)
|
|
845
1335
|
for b in content
|
|
846
1336
|
)
|
|
@@ -850,10 +1340,468 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
|
850
1340
|
return False
|
|
851
1341
|
|
|
852
1342
|
|
|
1343
|
+
def _extract_openai_choice(openai_resp: dict) -> tuple[dict, dict]:
|
|
1344
|
+
choice = (openai_resp.get("choices") or [{}])[0]
|
|
1345
|
+
message = choice.get("message") or {}
|
|
1346
|
+
return choice, message
|
|
1347
|
+
|
|
1348
|
+
|
|
1349
|
+
def _openai_message_text(openai_resp: dict) -> str:
    """Return the first choice's message content as a string.

    A missing or null ``content`` yields ``""``. Previously a null value was
    passed through ``str()`` and came back as the literal text ``"None"``.
    Non-string content is stringified with ``str()``.
    """
    _, message = _extract_openai_choice(openai_resp)
    content = message.get("content")
    if content is None:
        return ""
    return content if isinstance(content, str) else str(content)
|
|
1353
|
+
|
|
1354
|
+
|
|
1355
|
+
def _extract_openai_tool_calls(openai_resp: dict) -> list[dict]:
    """Return the first choice's ``tool_calls`` list, or ``[]``.

    Anything falsy or not a list is treated as "no tool calls".
    """
    message = _extract_openai_choice(openai_resp)[1]
    calls = message.get("tool_calls")
    if calls and isinstance(calls, list):
        return calls
    return []
|
|
1359
|
+
|
|
1360
|
+
|
|
1361
|
+
def _openai_has_tool_calls(openai_resp: dict) -> bool:
    """Return True when the first choice carries at least one tool call."""
    return len(_extract_openai_tool_calls(openai_resp)) > 0
|
|
1363
|
+
|
|
1364
|
+
|
|
1365
|
+
def _parse_openai_function_arguments(raw_args) -> tuple[dict | None, str | None]:
|
|
1366
|
+
if isinstance(raw_args, dict):
|
|
1367
|
+
return raw_args, None
|
|
1368
|
+
if isinstance(raw_args, str):
|
|
1369
|
+
try:
|
|
1370
|
+
parsed = json.loads(raw_args)
|
|
1371
|
+
except json.JSONDecodeError:
|
|
1372
|
+
return None, "invalid_json"
|
|
1373
|
+
if not isinstance(parsed, dict):
|
|
1374
|
+
return None, "arguments_not_object"
|
|
1375
|
+
return parsed, None
|
|
1376
|
+
return None, "invalid_arguments_type"
|
|
1377
|
+
|
|
1378
|
+
|
|
1379
|
+
def _schema_type_matches(value, expected_type: str) -> bool:
|
|
1380
|
+
if expected_type == "string":
|
|
1381
|
+
return isinstance(value, str)
|
|
1382
|
+
if expected_type == "number":
|
|
1383
|
+
return isinstance(value, (int, float)) and not isinstance(value, bool)
|
|
1384
|
+
if expected_type == "integer":
|
|
1385
|
+
return isinstance(value, int) and not isinstance(value, bool)
|
|
1386
|
+
if expected_type == "boolean":
|
|
1387
|
+
return isinstance(value, bool)
|
|
1388
|
+
if expected_type == "array":
|
|
1389
|
+
return isinstance(value, list)
|
|
1390
|
+
if expected_type == "object":
|
|
1391
|
+
return isinstance(value, dict)
|
|
1392
|
+
if expected_type == "null":
|
|
1393
|
+
return value is None
|
|
1394
|
+
return True
|
|
1395
|
+
|
|
1396
|
+
|
|
1397
|
+
def _string_contains_tool_markup(value: str) -> bool:
|
|
1398
|
+
lowered = value.lower()
|
|
1399
|
+
markers = ("<parameter", "</parameter", "<tool_call", "<function=", "</function")
|
|
1400
|
+
return any(marker in lowered for marker in markers)
|
|
1401
|
+
|
|
1402
|
+
|
|
1403
|
+
def _validate_tool_arguments_against_schema(
|
|
1404
|
+
args: dict, input_schema: dict
|
|
1405
|
+
) -> tuple[bool, str]:
|
|
1406
|
+
if not isinstance(input_schema, dict):
|
|
1407
|
+
return True, ""
|
|
1408
|
+
|
|
1409
|
+
required = input_schema.get("required") or []
|
|
1410
|
+
if isinstance(required, list):
|
|
1411
|
+
for field in required:
|
|
1412
|
+
if not isinstance(field, str):
|
|
1413
|
+
continue
|
|
1414
|
+
if field not in args:
|
|
1415
|
+
return False, f"missing required field '{field}'"
|
|
1416
|
+
value = args.get(field)
|
|
1417
|
+
if value is None:
|
|
1418
|
+
return False, f"required field '{field}' is null"
|
|
1419
|
+
if isinstance(value, str) and not value.strip():
|
|
1420
|
+
return False, f"required field '{field}' is empty"
|
|
1421
|
+
if isinstance(value, str) and _string_contains_tool_markup(value):
|
|
1422
|
+
return (
|
|
1423
|
+
False,
|
|
1424
|
+
f"required field '{field}' contains malformed tool markup",
|
|
1425
|
+
)
|
|
1426
|
+
|
|
1427
|
+
properties = input_schema.get("properties") or {}
|
|
1428
|
+
if isinstance(properties, dict):
|
|
1429
|
+
for key, prop_schema in properties.items():
|
|
1430
|
+
if key not in args:
|
|
1431
|
+
continue
|
|
1432
|
+
if not isinstance(prop_schema, dict):
|
|
1433
|
+
continue
|
|
1434
|
+
expected = prop_schema.get("type")
|
|
1435
|
+
if isinstance(expected, str):
|
|
1436
|
+
if not _schema_type_matches(args[key], expected):
|
|
1437
|
+
return (
|
|
1438
|
+
False,
|
|
1439
|
+
f"type mismatch for '{key}' (expected {expected})",
|
|
1440
|
+
)
|
|
1441
|
+
if expected == "string" and isinstance(args[key], str):
|
|
1442
|
+
if _string_contains_tool_markup(args[key]):
|
|
1443
|
+
return (
|
|
1444
|
+
False,
|
|
1445
|
+
f"string field '{key}' contains malformed tool markup",
|
|
1446
|
+
)
|
|
1447
|
+
elif isinstance(expected, list) and expected:
|
|
1448
|
+
if not any(_schema_type_matches(args[key], t) for t in expected):
|
|
1449
|
+
expected_str = ",".join(str(t) for t in expected)
|
|
1450
|
+
return (
|
|
1451
|
+
False,
|
|
1452
|
+
f"type mismatch for '{key}' (expected one of {expected_str})",
|
|
1453
|
+
)
|
|
1454
|
+
|
|
1455
|
+
return True, ""
|
|
1456
|
+
|
|
1457
|
+
|
|
1458
|
+
def _tool_schema_map_from_anthropic_body(anthropic_body: dict) -> dict[str, dict]:
|
|
1459
|
+
schema_map: dict[str, dict] = {}
|
|
1460
|
+
for tool in anthropic_body.get("tools", []) or []:
|
|
1461
|
+
if not isinstance(tool, dict):
|
|
1462
|
+
continue
|
|
1463
|
+
name = tool.get("name")
|
|
1464
|
+
if isinstance(name, str) and name:
|
|
1465
|
+
schema = tool.get("input_schema")
|
|
1466
|
+
schema_map[name] = schema if isinstance(schema, dict) else {}
|
|
1467
|
+
return schema_map
|
|
1468
|
+
|
|
1469
|
+
|
|
1470
|
+
def _invalid_tool_call_reason(openai_resp: dict, anthropic_body: dict) -> str | None:
|
|
1471
|
+
if "tools" not in anthropic_body:
|
|
1472
|
+
return None
|
|
1473
|
+
|
|
1474
|
+
tool_calls = _extract_openai_tool_calls(openai_resp)
|
|
1475
|
+
if not tool_calls:
|
|
1476
|
+
return None
|
|
1477
|
+
|
|
1478
|
+
schema_map = _tool_schema_map_from_anthropic_body(anthropic_body)
|
|
1479
|
+
if not schema_map:
|
|
1480
|
+
return None
|
|
1481
|
+
|
|
1482
|
+
for idx, tc in enumerate(tool_calls):
|
|
1483
|
+
if not isinstance(tc, dict):
|
|
1484
|
+
return f"tool call {idx} is not an object"
|
|
1485
|
+
fn = tc.get("function")
|
|
1486
|
+
if not isinstance(fn, dict):
|
|
1487
|
+
return f"tool call {idx} missing function payload"
|
|
1488
|
+
|
|
1489
|
+
name = fn.get("name")
|
|
1490
|
+
if not isinstance(name, str) or not name:
|
|
1491
|
+
return f"tool call {idx} missing function name"
|
|
1492
|
+
if name not in schema_map:
|
|
1493
|
+
return f"tool call {idx} uses unknown tool '{name}'"
|
|
1494
|
+
|
|
1495
|
+
args, parse_error = _parse_openai_function_arguments(fn.get("arguments", "{}"))
|
|
1496
|
+
if parse_error:
|
|
1497
|
+
return f"tool call {idx} invalid arguments ({parse_error})"
|
|
1498
|
+
if args is None:
|
|
1499
|
+
return f"tool call {idx} has empty arguments"
|
|
1500
|
+
|
|
1501
|
+
valid, reason = _validate_tool_arguments_against_schema(args, schema_map[name])
|
|
1502
|
+
if not valid:
|
|
1503
|
+
return f"tool call {idx} failed schema validation: {reason}"
|
|
1504
|
+
|
|
1505
|
+
return None
|
|
1506
|
+
|
|
1507
|
+
|
|
1508
|
+
def _openai_has_valid_tool_calls(openai_resp: dict, anthropic_body: dict) -> bool:
    """Return True when the response has tool calls and none is invalid."""
    if not _openai_has_tool_calls(openai_resp):
        return False
    return _invalid_tool_call_reason(openai_resp, anthropic_body) is None
|
|
1513
|
+
|
|
1514
|
+
|
|
1515
|
+
def _looks_malformed_tool_payload(text: str) -> bool:
|
|
1516
|
+
if not text:
|
|
1517
|
+
return False
|
|
1518
|
+
|
|
1519
|
+
lowered = text.lower()
|
|
1520
|
+
primary_markers = ("</parameter", "<parameter", "<tool_call", "<function=")
|
|
1521
|
+
if any(marker in lowered for marker in primary_markers):
|
|
1522
|
+
return True
|
|
1523
|
+
|
|
1524
|
+
structural_markers = (
|
|
1525
|
+
'=\n{"description"',
|
|
1526
|
+
"</think>",
|
|
1527
|
+
)
|
|
1528
|
+
marker_hits = sum(1 for marker in structural_markers if marker in lowered)
|
|
1529
|
+
repeated_description = lowered.count('{"description"') >= 2
|
|
1530
|
+
repeated_must_call = lowered.count("you must call a tool") >= 2
|
|
1531
|
+
has_unicode_marker = "⎿" in text
|
|
1532
|
+
policy_echo_loop = repeated_must_call and (
|
|
1533
|
+
"do not summarize the issue and stop" in lowered
|
|
1534
|
+
or "must call a tool to make the fix" in lowered
|
|
1535
|
+
)
|
|
1536
|
+
policy_snippets = (
|
|
1537
|
+
"do not summarize the issue and stop",
|
|
1538
|
+
"if you have identified a problem",
|
|
1539
|
+
"you must call a tool to make the fix",
|
|
1540
|
+
"</agentic-protocol>",
|
|
1541
|
+
)
|
|
1542
|
+
policy_hits = sum(1 for snippet in policy_snippets if snippet in lowered)
|
|
1543
|
+
|
|
1544
|
+
if marker_hits >= 2:
|
|
1545
|
+
return True
|
|
1546
|
+
if marker_hits >= 1 and (
|
|
1547
|
+
repeated_description or repeated_must_call or has_unicode_marker
|
|
1548
|
+
):
|
|
1549
|
+
return True
|
|
1550
|
+
if policy_echo_loop:
|
|
1551
|
+
return True
|
|
1552
|
+
if policy_hits >= 2:
|
|
1553
|
+
return True
|
|
1554
|
+
if lowered.count("</parameter") >= 1 and lowered.count('{"description"') >= 1:
|
|
1555
|
+
return True
|
|
1556
|
+
return False
|
|
1557
|
+
|
|
1558
|
+
|
|
1559
|
+
def _is_malformed_tool_response(openai_resp: dict, anthropic_body: dict) -> bool:
|
|
1560
|
+
if "tools" not in anthropic_body:
|
|
1561
|
+
return False
|
|
1562
|
+
|
|
1563
|
+
if _invalid_tool_call_reason(openai_resp, anthropic_body):
|
|
1564
|
+
return True
|
|
1565
|
+
|
|
1566
|
+
if _openai_has_tool_calls(openai_resp):
|
|
1567
|
+
return False
|
|
1568
|
+
|
|
1569
|
+
return _looks_malformed_tool_payload(_openai_message_text(openai_resp))
|
|
1570
|
+
|
|
1571
|
+
|
|
1572
|
+
def _build_malformed_retry_body(openai_body: dict, anthropic_body: dict) -> dict:
    """Build the request body used to retry after a malformed tool payload.

    Starts from a shallow copy of ``openai_body`` and then:

    * turns streaming off, forces ``tool_choice="required"`` and applies
      ``PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE``;
    * caps ``max_tokens`` at ``PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS`` when
      that setting is positive;
    * re-converts the full Anthropic tool list, replacing ``tools``;
    * disables thinking when ``PROXY_DISABLE_THINKING_ON_TOOL_TURNS`` is set.
    """
    retry = {
        **openai_body,
        "stream": False,
        "tool_choice": "required",
        "temperature": PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE,
    }

    token_cap = PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS
    if token_cap > 0:
        retry["max_tokens"] = min(int(retry.get("max_tokens", token_cap)), token_cap)

    # On malformed retry, restore full tool list to avoid starving selection.
    declared_tools = anthropic_body.get("tools")
    if declared_tools:
        retry["tools"] = _convert_anthropic_tools_to_openai(declared_tools)

    if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
        retry["enable_thinking"] = False

    return retry
|
|
1596
|
+
|
|
1597
|
+
|
|
1598
|
+
def _build_clean_guardrail_openai_response(openai_resp: dict) -> dict:
|
|
1599
|
+
return {
|
|
1600
|
+
"id": openai_resp.get("id", f"chatcmpl_{uuid.uuid4().hex[:12]}"),
|
|
1601
|
+
"object": openai_resp.get("object", "chat.completion"),
|
|
1602
|
+
"created": openai_resp.get("created", int(time.time())),
|
|
1603
|
+
"model": openai_resp.get("model", "unknown"),
|
|
1604
|
+
"choices": [
|
|
1605
|
+
{
|
|
1606
|
+
"index": 0,
|
|
1607
|
+
"finish_reason": "stop",
|
|
1608
|
+
"message": {
|
|
1609
|
+
"role": "assistant",
|
|
1610
|
+
"content": (
|
|
1611
|
+
"I could not produce a valid tool-call format in this turn. "
|
|
1612
|
+
"Please continue; I will issue exactly one valid tool call next."
|
|
1613
|
+
),
|
|
1614
|
+
},
|
|
1615
|
+
}
|
|
1616
|
+
],
|
|
1617
|
+
"usage": openai_resp.get("usage", {}),
|
|
1618
|
+
}
|
|
1619
|
+
|
|
1620
|
+
|
|
1621
|
+
async def _apply_unexpected_end_turn_guardrail(
    client: httpx.AsyncClient,
    openai_resp: dict,
    openai_body: dict,
    anthropic_body: dict,
    monitor: SessionMonitor,
    session_id: str,
) -> dict:
    """Retry once with ``tool_choice="required"`` after an unexpected end_turn.

    No-op when ``PROXY_GUARDRAIL_RETRY`` is off or the response is not an
    unexpected end_turn. Otherwise ``monitor.unexpected_end_turn_count`` is
    incremented and a single non-streaming retry is posted upstream.

    Returns:
        The retried response when it contains valid tool calls; in every other
        case (non-200, transport failure, undecodable body, no valid tool
        call) the original ``openai_resp``.
    """
    if not PROXY_GUARDRAIL_RETRY:
        return openai_resp

    if not _is_unexpected_end_turn(openai_resp, anthropic_body):
        return openai_resp

    monitor.unexpected_end_turn_count += 1
    logger.warning(
        "GUARDRAIL: unexpected end_turn without tool_use in active loop (session=%s), retrying once with tool_choice=required",
        session_id,
    )

    retry_body = dict(openai_body)
    retry_body["tool_choice"] = "required"
    retry_body["stream"] = False

    # The retry is best-effort: the caller already holds a usable response, so
    # a failed retry must not turn the whole request into an error.
    try:
        retry_resp = await client.post(
            f"{LLAMA_CPP_BASE}/chat/completions",
            json=retry_body,
            headers={"Content-Type": "application/json"},
        )
    except httpx.HTTPError as exc:
        logger.warning(
            "GUARDRAIL retry request failed (%s); keeping original response", exc
        )
        return openai_resp

    if retry_resp.status_code != 200:
        logger.warning(
            "GUARDRAIL retry upstream status=%d; keeping original response",
            retry_resp.status_code,
        )
        return openai_resp

    try:
        retry_json = retry_resp.json()
    except ValueError:
        logger.warning(
            "GUARDRAIL retry returned a non-JSON body; keeping original response"
        )
        return openai_resp

    if _openai_has_valid_tool_calls(retry_json, anthropic_body):
        logger.info("GUARDRAIL: retry produced tool_use; using retried response")
        return retry_json

    invalid_reason = _invalid_tool_call_reason(retry_json, anthropic_body)
    if invalid_reason:
        logger.warning(
            "GUARDRAIL: retry produced invalid tool_call payload (%s)",
            invalid_reason,
        )
    retry_choice, _ = _extract_openai_choice(retry_json)
    logger.info(
        "GUARDRAIL: retry returned finish_reason=%s without tool_use",
        retry_choice.get("finish_reason"),
    )
    return openai_resp
|
|
1673
|
+
|
|
1674
|
+
|
|
1675
|
+
async def _apply_malformed_tool_guardrail(
    client: httpx.AsyncClient,
    openai_resp: dict,
    openai_body: dict,
    anthropic_body: dict,
    monitor: SessionMonitor,
    session_id: str,
) -> dict:
    """Detect a malformed tool payload and retry upstream until it is clean.

    No-op when ``PROXY_MALFORMED_TOOL_GUARDRAIL`` is off. A response that is
    not malformed is returned as-is; if it also has valid tool calls,
    ``monitor.malformed_tool_streak`` is reset.

    For a malformed response the streak is incremented and up to
    ``PROXY_MALFORMED_TOOL_RETRY_MAX`` non-streaming retries are made. A retry
    that yields valid tool calls, or a clean response, resets the streak and is
    returned. A retry that is still malformed increments the streak again.
    Retries that fail at the transport level, return a non-200 status, or
    return an undecodable body are logged and skipped without touching the
    streak.

    Returns:
        The original response, a successful retry, or the canned clean
        guardrail response once all retries are exhausted.
    """
    if not PROXY_MALFORMED_TOOL_GUARDRAIL:
        return openai_resp

    if not _is_malformed_tool_response(openai_resp, anthropic_body):
        if _openai_has_valid_tool_calls(openai_resp, anthropic_body):
            monitor.malformed_tool_streak = 0
        return openai_resp

    monitor.malformed_tool_streak += 1
    invalid_reason = _invalid_tool_call_reason(openai_resp, anthropic_body)
    if invalid_reason:
        excerpt = invalid_reason[:220]
    else:
        excerpt = _openai_message_text(openai_resp)[:220].replace("\n", " ")
    logger.warning(
        "MALFORMED TOOL PAYLOAD: session=%s streak=%d excerpt=%.220s",
        session_id,
        monitor.malformed_tool_streak,
        excerpt,
    )

    attempts = max(0, PROXY_MALFORMED_TOOL_RETRY_MAX)
    for attempt in range(attempts):
        retry_body = _build_malformed_retry_body(openai_body, anthropic_body)
        # A failing retry must not abort the guardrail: fall through to the
        # next attempt and ultimately to the clean guardrail response.
        try:
            retry_resp = await client.post(
                f"{LLAMA_CPP_BASE}/chat/completions",
                json=retry_body,
                headers={"Content-Type": "application/json"},
            )
        except httpx.HTTPError as exc:
            logger.warning(
                "MALFORMED RETRY failed (attempt %d/%d): %s",
                attempt + 1,
                attempts,
                exc,
            )
            continue

        if retry_resp.status_code != 200:
            logger.warning(
                "MALFORMED RETRY failed (attempt %d/%d): HTTP %d",
                attempt + 1,
                attempts,
                retry_resp.status_code,
            )
            continue

        try:
            retry_json = retry_resp.json()
        except ValueError:
            logger.warning(
                "MALFORMED RETRY failed (attempt %d/%d): non-JSON body",
                attempt + 1,
                attempts,
            )
            continue

        if _openai_has_valid_tool_calls(retry_json, anthropic_body):
            monitor.malformed_tool_streak = 0
            logger.info(
                "MALFORMED RETRY success: produced tool_use (attempt %d/%d)",
                attempt + 1,
                attempts,
            )
            return retry_json

        retry_invalid_reason = _invalid_tool_call_reason(retry_json, anthropic_body)
        if retry_invalid_reason:
            logger.warning(
                "MALFORMED RETRY invalid tool_call payload (attempt %d/%d): %s",
                attempt + 1,
                attempts,
                retry_invalid_reason,
            )

        if not _is_malformed_tool_response(retry_json, anthropic_body):
            monitor.malformed_tool_streak = 0
            logger.info(
                "MALFORMED RETRY produced clean text response (attempt %d/%d)",
                attempt + 1,
                attempts,
            )
            return retry_json

        monitor.malformed_tool_streak += 1

    logger.error(
        "MALFORMED TOOL PAYLOAD persisted after retries (session=%s); returning clean guardrail response",
        session_id,
    )
    return _build_clean_guardrail_openai_response(openai_resp)
|
|
1756
|
+
|
|
1757
|
+
|
|
1758
|
+
def _maybe_apply_session_contamination_breaker(
    anthropic_body: dict, monitor: SessionMonitor, session_id: str
) -> dict:
    """Trim conversation history after repeated malformed tool payloads.

    Nothing happens unless ``PROXY_SESSION_CONTAMINATION_BREAKER`` is on and
    ``monitor.malformed_tool_streak`` has reached the threshold (at least 1).
    If the history is already no longer than ``keep_last + 1`` messages, only
    the streak is reset. Otherwise a shallow copy of the body is returned whose
    ``messages`` are: the first message, a synthetic user reset notice, and the
    last ``keep_last`` (at least 2) messages. The monitor's streak counters are
    zeroed and ``contamination_resets`` is incremented.
    """
    if not PROXY_SESSION_CONTAMINATION_BREAKER:
        return anthropic_body

    trip_at = max(1, PROXY_SESSION_CONTAMINATION_THRESHOLD)
    if monitor.malformed_tool_streak < trip_at:
        return anthropic_body

    history = anthropic_body.get("messages", [])
    tail_size = max(2, PROXY_SESSION_CONTAMINATION_KEEP_LAST)
    if len(history) <= tail_size + 1:
        # Too short to be worth trimming; just clear the streak.
        monitor.malformed_tool_streak = 0
        return anthropic_body

    notice = {
        "role": "user",
        "content": (
            "[SESSION RESET: previous turns contained malformed tool-call formatting "
            "artifacts. Continue from the recent context below and emit valid tool calls only.]"
        ),
    }
    trimmed = {
        **anthropic_body,
        "messages": history[:1] + [notice] + history[-tail_size:],
    }

    monitor.contamination_resets += 1
    monitor.malformed_tool_streak = 0
    monitor.no_progress_streak = 0
    monitor.consecutive_forced_count = 0
    logger.warning(
        "SESSION CONTAMINATION BREAKER: session=%s reset applied, kept=%d messages",
        session_id,
        len(trimmed["messages"]),
    )

    return trimmed
|
|
1798
|
+
|
|
1799
|
+
|
|
853
1800
|
# ===========================================================================
|
|
854
1801
|
# Response Translation: OpenAI -> Anthropic
|
|
855
1802
|
# ===========================================================================
|
|
856
1803
|
|
|
1804
|
+
|
|
857
1805
|
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
858
1806
|
"""Convert an OpenAI Chat Completions response to Anthropic Messages format."""
|
|
859
1807
|
choice = openai_resp.get("choices", [{}])[0]
|
|
@@ -871,12 +1819,14 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
871
1819
|
args = json.loads(fn.get("arguments", "{}"))
|
|
872
1820
|
except json.JSONDecodeError:
|
|
873
1821
|
args = {}
|
|
874
|
-
content.append(
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
1822
|
+
content.append(
|
|
1823
|
+
{
|
|
1824
|
+
"type": "tool_use",
|
|
1825
|
+
"id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
|
|
1826
|
+
"name": fn.get("name", ""),
|
|
1827
|
+
"input": args,
|
|
1828
|
+
}
|
|
1829
|
+
)
|
|
880
1830
|
|
|
881
1831
|
stop_reason_map = {
|
|
882
1832
|
"stop": "end_turn",
|
|
@@ -902,11 +1852,78 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
902
1852
|
}
|
|
903
1853
|
|
|
904
1854
|
|
|
1855
|
+
async def stream_anthropic_message(anthropic_resp: dict):
    """Stream a finalized Anthropic message as SSE events.

    Emits, in order: ``message_start``; for each content block a
    ``content_block_start``, at most one ``content_block_delta`` (the whole
    text, or the compact JSON of a tool's input) and ``content_block_stop``;
    then ``message_delta`` and ``message_stop``. An empty or missing
    ``content`` is streamed as a single empty text block.

    ``message_start`` now reports the message's known ``usage.input_tokens``
    (0 if absent) instead of a hard-coded 0, and a null ``usage`` no longer
    raises.

    Yields:
        SSE frames of the form ``event: <name>\\ndata: <json>\\n\\n``.
    """

    def _sse(event: str, payload: dict) -> str:
        # One SSE frame: event name line, JSON data line, blank terminator.
        return f"event: {event}\ndata: {json.dumps(payload)}\n\n"

    usage = anthropic_resp.get("usage") or {}
    message = {
        "id": anthropic_resp.get("id", f"msg_{uuid.uuid4().hex[:24]}"),
        "type": "message",
        "role": "assistant",
        "content": [],
        "model": anthropic_resp.get("model", "unknown"),
        "stop_reason": None,
        "stop_sequence": None,
        "usage": {"input_tokens": usage.get("input_tokens", 0), "output_tokens": 0},
    }
    yield _sse("message_start", {"type": "message_start", "message": message})

    content_blocks = anthropic_resp.get("content", []) or [{"type": "text", "text": ""}]
    for block_index, block in enumerate(content_blocks):
        if block.get("type", "text") == "tool_use":
            start_block = {
                "type": "tool_use",
                "id": block.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
                "name": block.get("name", ""),
            }
            # json.dumps never returns an empty string, so a delta is always
            # emitted for tool blocks.
            delta = {
                "type": "input_json_delta",
                "partial_json": json.dumps(
                    block.get("input", {}), separators=(",", ":")
                ),
            }
        else:
            start_block = {"type": "text", "text": ""}
            text = block.get("text", "")
            delta = {"type": "text_delta", "text": text} if text else None

        yield _sse(
            "content_block_start",
            {
                "type": "content_block_start",
                "index": block_index,
                "content_block": start_block,
            },
        )
        if delta is not None:
            yield _sse(
                "content_block_delta",
                {"type": "content_block_delta", "index": block_index, "delta": delta},
            )
        yield _sse(
            "content_block_stop",
            {"type": "content_block_stop", "index": block_index},
        )

    yield _sse(
        "message_delta",
        {
            "type": "message_delta",
            "delta": {
                "stop_reason": anthropic_resp.get("stop_reason", "end_turn"),
                "stop_sequence": None,
            },
            "usage": {"output_tokens": usage.get("output_tokens", 0)},
        },
    )
    yield _sse("message_stop", {"type": "message_stop"})
|
|
1914
|
+
|
|
1915
|
+
|
|
905
1916
|
# ===========================================================================
|
|
906
1917
|
# Streaming Translation: OpenAI SSE -> Anthropic SSE
|
|
907
1918
|
# ===========================================================================
|
|
908
1919
|
|
|
909
|
-
|
|
1920
|
+
|
|
1921
|
+
async def stream_anthropic_response(
|
|
1922
|
+
openai_stream: httpx.Response,
|
|
1923
|
+
model: str,
|
|
1924
|
+
monitor: SessionMonitor,
|
|
1925
|
+
anthropic_body: dict,
|
|
1926
|
+
):
|
|
910
1927
|
"""Convert an OpenAI streaming response to Anthropic SSE stream format.
|
|
911
1928
|
|
|
912
1929
|
Handles:
|
|
@@ -929,7 +1946,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
929
1946
|
f"data: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
|
|
930
1947
|
)
|
|
931
1948
|
|
|
932
|
-
yield
|
|
1949
|
+
yield 'event: ping\ndata: {"type": "ping"}\n\n'
|
|
933
1950
|
|
|
934
1951
|
output_tokens = 0
|
|
935
1952
|
finish_reason = "end_turn"
|
|
@@ -1058,21 +2075,29 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
1058
2075
|
f"data: {json.dumps({'type': 'content_block_stop', 'index': tc['block_index']})}\n\n"
|
|
1059
2076
|
)
|
|
1060
2077
|
else:
|
|
1061
|
-
#
|
|
1062
|
-
#
|
|
1063
|
-
#
|
|
2078
|
+
# If the response has no text and no tool calls, optionally emit a
|
|
2079
|
+
# reasoning fallback (configurable) to avoid leaking malformed
|
|
2080
|
+
# internal chain-of-thought content by default.
|
|
1064
2081
|
accumulated_text = "".join(text_chunks)
|
|
1065
2082
|
if not accumulated_text and reasoning_chunks:
|
|
1066
|
-
fallback_text =
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
2083
|
+
fallback_text = _build_reasoning_fallback_text(reasoning_chunks)
|
|
2084
|
+
if fallback_text:
|
|
2085
|
+
logger.warning(
|
|
2086
|
+
"Empty response with %d reasoning chunks – emitting fallback text (mode=%s)",
|
|
2087
|
+
len(reasoning_chunks),
|
|
2088
|
+
PROXY_STREAM_REASONING_FALLBACK,
|
|
2089
|
+
)
|
|
2090
|
+
text_chunks.append(fallback_text)
|
|
2091
|
+
yield (
|
|
2092
|
+
f"event: content_block_delta\n"
|
|
2093
|
+
f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
|
|
2094
|
+
)
|
|
2095
|
+
else:
|
|
2096
|
+
logger.warning(
|
|
2097
|
+
"Empty response with %d reasoning chunks – fallback suppressed (mode=%s)",
|
|
2098
|
+
len(reasoning_chunks),
|
|
2099
|
+
PROXY_STREAM_REASONING_FALLBACK,
|
|
2100
|
+
)
|
|
1076
2101
|
|
|
1077
2102
|
yield (
|
|
1078
2103
|
f"event: content_block_stop\n"
|
|
@@ -1081,17 +2106,65 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
1081
2106
|
|
|
1082
2107
|
# Log response summary
|
|
1083
2108
|
accumulated_text = "".join(text_chunks)
|
|
1084
|
-
tc_names =
|
|
1085
|
-
|
|
2109
|
+
tc_names = (
|
|
2110
|
+
[tc["name"] for tc in tool_calls_by_index.values()]
|
|
2111
|
+
if tool_calls_by_index
|
|
2112
|
+
else []
|
|
2113
|
+
)
|
|
2114
|
+
tc_args = (
|
|
2115
|
+
[tc.get("arguments", "") for tc in tool_calls_by_index.values()]
|
|
2116
|
+
if tool_calls_by_index
|
|
2117
|
+
else []
|
|
2118
|
+
)
|
|
1086
2119
|
logger.info(
|
|
1087
2120
|
"RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
|
|
1088
|
-
finish_reason,
|
|
2121
|
+
finish_reason,
|
|
2122
|
+
output_tokens,
|
|
1089
2123
|
len(accumulated_text),
|
|
1090
2124
|
accumulated_text[:300],
|
|
1091
2125
|
tc_names,
|
|
1092
2126
|
[a[:200] for a in tc_args],
|
|
1093
2127
|
)
|
|
1094
2128
|
|
|
2129
|
+
synthetic_openai_resp = {
|
|
2130
|
+
"choices": [
|
|
2131
|
+
{
|
|
2132
|
+
"finish_reason": "stop"
|
|
2133
|
+
if finish_reason == "end_turn"
|
|
2134
|
+
else finish_reason,
|
|
2135
|
+
"message": {
|
|
2136
|
+
"content": accumulated_text,
|
|
2137
|
+
"tool_calls": [
|
|
2138
|
+
{
|
|
2139
|
+
"function": {
|
|
2140
|
+
"name": tc["name"],
|
|
2141
|
+
"arguments": tc.get("arguments", ""),
|
|
2142
|
+
}
|
|
2143
|
+
}
|
|
2144
|
+
for tc in tool_calls_by_index.values()
|
|
2145
|
+
],
|
|
2146
|
+
},
|
|
2147
|
+
}
|
|
2148
|
+
]
|
|
2149
|
+
}
|
|
2150
|
+
|
|
2151
|
+
if _is_malformed_tool_response(synthetic_openai_resp, anthropic_body):
|
|
2152
|
+
monitor.malformed_tool_streak += 1
|
|
2153
|
+
elif (
|
|
2154
|
+
"tools" in anthropic_body
|
|
2155
|
+
and not tool_calls_by_index
|
|
2156
|
+
and (
|
|
2157
|
+
finish_reason == "max_tokens"
|
|
2158
|
+
or (finish_reason == "end_turn" and len(accumulated_text) > 512)
|
|
2159
|
+
)
|
|
2160
|
+
):
|
|
2161
|
+
monitor.malformed_tool_streak += 1
|
|
2162
|
+
elif tool_calls_by_index:
|
|
2163
|
+
monitor.malformed_tool_streak = 0
|
|
2164
|
+
|
|
2165
|
+
if _is_unexpected_end_turn(synthetic_openai_resp, anthropic_body):
|
|
2166
|
+
monitor.unexpected_end_turn_count += 1
|
|
2167
|
+
|
|
1095
2168
|
# message_delta with final stop reason
|
|
1096
2169
|
yield (
|
|
1097
2170
|
f"event: message_delta\n"
|
|
@@ -1106,6 +2179,7 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
1106
2179
|
# API Endpoints
|
|
1107
2180
|
# ===========================================================================
|
|
1108
2181
|
|
|
2182
|
+
|
|
1109
2183
|
@app.post("/v1/messages")
|
|
1110
2184
|
async def messages(request: Request):
|
|
1111
2185
|
"""Handle Anthropic Messages API requests (streaming and non-streaming).
|
|
@@ -1116,9 +2190,16 @@ async def messages(request: Request):
|
|
|
1116
2190
|
- Option E: Smart max_tokens capping (in build_openai_request)
|
|
1117
2191
|
- Option F: Session-level token monitoring with warnings
|
|
1118
2192
|
"""
|
|
2193
|
+
global last_session_id
|
|
2194
|
+
|
|
1119
2195
|
body = await request.json()
|
|
1120
2196
|
model = body.get("model", "default")
|
|
1121
2197
|
is_stream = body.get("stream", False)
|
|
2198
|
+
session_id = resolve_session_id(request, body)
|
|
2199
|
+
monitor = get_session_monitor(session_id)
|
|
2200
|
+
last_session_id = session_id
|
|
2201
|
+
|
|
2202
|
+
body = _maybe_apply_session_contamination_breaker(body, monitor, session_id)
|
|
1122
2203
|
|
|
1123
2204
|
# Debug: log request summary
|
|
1124
2205
|
n_messages = len(body.get("messages", []))
|
|
@@ -1128,42 +2209,53 @@ async def messages(request: Request):
|
|
|
1128
2209
|
last_role = last_msg.get("role", "?")
|
|
1129
2210
|
last_content = last_msg.get("content", "")
|
|
1130
2211
|
if isinstance(last_content, list):
|
|
1131
|
-
last_text = next(
|
|
2212
|
+
last_text = next(
|
|
2213
|
+
(b.get("text", "") for b in last_content if b.get("type") == "text"), ""
|
|
2214
|
+
)[:200]
|
|
1132
2215
|
elif isinstance(last_content, str):
|
|
1133
2216
|
last_text = last_content[:200]
|
|
1134
2217
|
else:
|
|
1135
2218
|
last_text = str(last_content)[:200]
|
|
1136
2219
|
logger.info(
|
|
1137
2220
|
"REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
1138
|
-
is_stream,
|
|
2221
|
+
is_stream,
|
|
2222
|
+
n_messages,
|
|
2223
|
+
n_tools,
|
|
2224
|
+
max_tokens,
|
|
2225
|
+
last_role,
|
|
2226
|
+
last_text,
|
|
1139
2227
|
)
|
|
1140
2228
|
|
|
1141
2229
|
# --- Option F: Estimate tokens and record in session monitor ---
|
|
1142
2230
|
estimated_tokens = estimate_total_tokens(body)
|
|
1143
|
-
|
|
1144
|
-
|
|
2231
|
+
monitor.record_request(estimated_tokens)
|
|
2232
|
+
monitor.log_status()
|
|
1145
2233
|
|
|
1146
2234
|
# --- Option C: Prune conversation if approaching context limit ---
|
|
1147
|
-
ctx_window =
|
|
2235
|
+
ctx_window = monitor.context_window
|
|
1148
2236
|
if ctx_window > 0:
|
|
1149
2237
|
utilization = estimated_tokens / ctx_window
|
|
1150
2238
|
if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
1151
2239
|
logger.warning(
|
|
1152
2240
|
"Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
|
|
1153
|
-
utilization * 100,
|
|
2241
|
+
utilization * 100,
|
|
2242
|
+
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
2243
|
+
)
|
|
2244
|
+
body = prune_conversation(
|
|
2245
|
+
body, ctx_window, target_fraction=_resolve_prune_target_fraction()
|
|
1154
2246
|
)
|
|
1155
|
-
|
|
1156
|
-
session_monitor.prune_count += 1
|
|
2247
|
+
monitor.prune_count += 1
|
|
1157
2248
|
# Re-estimate after pruning
|
|
1158
2249
|
estimated_tokens = estimate_total_tokens(body)
|
|
1159
|
-
|
|
2250
|
+
monitor.record_request(estimated_tokens)
|
|
1160
2251
|
n_messages = len(body.get("messages", []))
|
|
1161
2252
|
logger.info(
|
|
1162
2253
|
"After pruning: ~%d tokens, %d messages",
|
|
1163
|
-
estimated_tokens,
|
|
2254
|
+
estimated_tokens,
|
|
2255
|
+
n_messages,
|
|
1164
2256
|
)
|
|
1165
2257
|
|
|
1166
|
-
openai_body = build_openai_request(body)
|
|
2258
|
+
openai_body = build_openai_request(body, monitor)
|
|
1167
2259
|
|
|
1168
2260
|
client = http_client
|
|
1169
2261
|
if client is None:
|
|
@@ -1173,6 +2265,79 @@ async def messages(request: Request):
|
|
|
1173
2265
|
media_type="application/json",
|
|
1174
2266
|
)
|
|
1175
2267
|
|
|
2268
|
+
use_guarded_non_stream = is_stream and (
|
|
2269
|
+
PROXY_FORCE_NON_STREAM
|
|
2270
|
+
or (PROXY_MALFORMED_TOOL_STREAM_STRICT and "tools" in body)
|
|
2271
|
+
)
|
|
2272
|
+
if use_guarded_non_stream:
|
|
2273
|
+
strict_body = dict(openai_body)
|
|
2274
|
+
strict_body["stream"] = False
|
|
2275
|
+
|
|
2276
|
+
strict_resp = await client.post(
|
|
2277
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
2278
|
+
json=strict_body,
|
|
2279
|
+
headers={"Content-Type": "application/json"},
|
|
2280
|
+
)
|
|
2281
|
+
|
|
2282
|
+
if strict_resp.status_code != 200:
|
|
2283
|
+
error_text = strict_resp.text[:1000]
|
|
2284
|
+
logger.error(
|
|
2285
|
+
"Upstream HTTP %d (strict-stream): %s",
|
|
2286
|
+
strict_resp.status_code,
|
|
2287
|
+
error_text,
|
|
2288
|
+
)
|
|
2289
|
+
return Response(
|
|
2290
|
+
content=json.dumps(
|
|
2291
|
+
{
|
|
2292
|
+
"type": "error",
|
|
2293
|
+
"error": {
|
|
2294
|
+
"type": "overloaded_error",
|
|
2295
|
+
"message": f"Upstream error (HTTP {strict_resp.status_code}): {error_text[:500]}",
|
|
2296
|
+
},
|
|
2297
|
+
}
|
|
2298
|
+
),
|
|
2299
|
+
status_code=529,
|
|
2300
|
+
media_type="application/json",
|
|
2301
|
+
)
|
|
2302
|
+
|
|
2303
|
+
openai_resp = strict_resp.json()
|
|
2304
|
+
openai_resp = await _apply_unexpected_end_turn_guardrail(
|
|
2305
|
+
client,
|
|
2306
|
+
openai_resp,
|
|
2307
|
+
strict_body,
|
|
2308
|
+
body,
|
|
2309
|
+
monitor,
|
|
2310
|
+
session_id,
|
|
2311
|
+
)
|
|
2312
|
+
openai_resp = await _apply_malformed_tool_guardrail(
|
|
2313
|
+
client,
|
|
2314
|
+
openai_resp,
|
|
2315
|
+
strict_body,
|
|
2316
|
+
body,
|
|
2317
|
+
monitor,
|
|
2318
|
+
session_id,
|
|
2319
|
+
)
|
|
2320
|
+
|
|
2321
|
+
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
2322
|
+
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
2323
|
+
if PROXY_FORCE_NON_STREAM:
|
|
2324
|
+
logger.info(
|
|
2325
|
+
"FORCED NON-STREAM: served stream response via guarded non-stream path"
|
|
2326
|
+
)
|
|
2327
|
+
else:
|
|
2328
|
+
logger.info(
|
|
2329
|
+
"STRICT STREAM GUARDRAIL: served stream response via guarded non-stream path"
|
|
2330
|
+
)
|
|
2331
|
+
|
|
2332
|
+
return StreamingResponse(
|
|
2333
|
+
stream_anthropic_message(anthropic_resp),
|
|
2334
|
+
media_type="text/event-stream",
|
|
2335
|
+
headers={
|
|
2336
|
+
"Cache-Control": "no-cache",
|
|
2337
|
+
"Connection": "keep-alive",
|
|
2338
|
+
},
|
|
2339
|
+
)
|
|
2340
|
+
|
|
1176
2341
|
if is_stream:
|
|
1177
2342
|
openai_body["stream"] = True
|
|
1178
2343
|
|
|
@@ -1181,6 +2346,7 @@ async def messages(request: Request):
|
|
|
1181
2346
|
MAX_UPSTREAM_RETRIES = 3
|
|
1182
2347
|
RETRY_DELAY_SECS = 5.0
|
|
1183
2348
|
last_exc: Exception | None = None
|
|
2349
|
+
resp: httpx.Response | None = None
|
|
1184
2350
|
|
|
1185
2351
|
for attempt in range(MAX_UPSTREAM_RETRIES):
|
|
1186
2352
|
try:
|
|
@@ -1201,25 +2367,46 @@ async def messages(request: Request):
|
|
|
1201
2367
|
if attempt < MAX_UPSTREAM_RETRIES - 1:
|
|
1202
2368
|
logger.warning(
|
|
1203
2369
|
"Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
|
|
1204
|
-
attempt + 1,
|
|
1205
|
-
|
|
2370
|
+
attempt + 1,
|
|
2371
|
+
MAX_UPSTREAM_RETRIES,
|
|
2372
|
+
type(exc).__name__,
|
|
2373
|
+
RETRY_DELAY_SECS,
|
|
1206
2374
|
)
|
|
1207
2375
|
await asyncio.sleep(RETRY_DELAY_SECS)
|
|
1208
2376
|
else:
|
|
1209
2377
|
logger.error(
|
|
1210
2378
|
"Upstream connect failed after %d attempts: %s: %s",
|
|
1211
|
-
MAX_UPSTREAM_RETRIES,
|
|
2379
|
+
MAX_UPSTREAM_RETRIES,
|
|
2380
|
+
type(exc).__name__,
|
|
2381
|
+
exc,
|
|
1212
2382
|
)
|
|
1213
2383
|
|
|
1214
2384
|
if last_exc is not None:
|
|
1215
2385
|
return Response(
|
|
1216
|
-
content=json.dumps(
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
"
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
2386
|
+
content=json.dumps(
|
|
2387
|
+
{
|
|
2388
|
+
"type": "error",
|
|
2389
|
+
"error": {
|
|
2390
|
+
"type": "overloaded_error",
|
|
2391
|
+
"message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
|
|
2392
|
+
},
|
|
2393
|
+
}
|
|
2394
|
+
),
|
|
2395
|
+
status_code=529,
|
|
2396
|
+
media_type="application/json",
|
|
2397
|
+
)
|
|
2398
|
+
|
|
2399
|
+
if resp is None:
|
|
2400
|
+
return Response(
|
|
2401
|
+
content=json.dumps(
|
|
2402
|
+
{
|
|
2403
|
+
"type": "error",
|
|
2404
|
+
"error": {
|
|
2405
|
+
"type": "overloaded_error",
|
|
2406
|
+
"message": "Upstream response unavailable",
|
|
2407
|
+
},
|
|
2408
|
+
}
|
|
2409
|
+
),
|
|
1223
2410
|
status_code=529,
|
|
1224
2411
|
media_type="application/json",
|
|
1225
2412
|
)
|
|
@@ -1232,9 +2419,7 @@ async def messages(request: Request):
|
|
|
1232
2419
|
error_body = await resp.aread()
|
|
1233
2420
|
await resp.aclose()
|
|
1234
2421
|
error_text = error_body.decode("utf-8", errors="replace")[:1000]
|
|
1235
|
-
logger.error(
|
|
1236
|
-
"Upstream HTTP %d: %s", resp.status_code, error_text
|
|
1237
|
-
)
|
|
2422
|
+
logger.error("Upstream HTTP %d: %s", resp.status_code, error_text)
|
|
1238
2423
|
|
|
1239
2424
|
# Parse the error for a user-friendly message
|
|
1240
2425
|
error_message = f"Upstream server error (HTTP {resp.status_code})"
|
|
@@ -1257,47 +2442,57 @@ async def messages(request: Request):
|
|
|
1257
2442
|
)
|
|
1258
2443
|
|
|
1259
2444
|
if is_context_overflow:
|
|
1260
|
-
|
|
2445
|
+
monitor.overflow_count += 1
|
|
1261
2446
|
logger.error(
|
|
1262
2447
|
"CONTEXT OVERFLOW detected (count=%d). "
|
|
1263
2448
|
"Estimated input: %d tokens, context window: %d tokens. "
|
|
1264
2449
|
"Conversation needs pruning or context window increase.",
|
|
1265
|
-
|
|
2450
|
+
monitor.overflow_count,
|
|
2451
|
+
estimated_tokens,
|
|
2452
|
+
ctx_window,
|
|
1266
2453
|
)
|
|
1267
2454
|
# Return Anthropic-format error that Claude Code can handle
|
|
1268
2455
|
return Response(
|
|
1269
|
-
content=json.dumps(
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
"
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
2456
|
+
content=json.dumps(
|
|
2457
|
+
{
|
|
2458
|
+
"type": "error",
|
|
2459
|
+
"error": {
|
|
2460
|
+
"type": "overloaded_error",
|
|
2461
|
+
"message": (
|
|
2462
|
+
f"Context window exceeded: request requires ~{estimated_tokens} tokens "
|
|
2463
|
+
f"but only {ctx_window} are available. "
|
|
2464
|
+
f"The conversation is too long. Please start a new session or "
|
|
2465
|
+
f"reduce conversation length."
|
|
2466
|
+
),
|
|
2467
|
+
},
|
|
2468
|
+
}
|
|
2469
|
+
),
|
|
1281
2470
|
status_code=529,
|
|
1282
2471
|
media_type="application/json",
|
|
1283
2472
|
)
|
|
1284
2473
|
|
|
1285
2474
|
# Generic upstream error -- return as Anthropic error format
|
|
1286
|
-
error_type =
|
|
2475
|
+
error_type = (
|
|
2476
|
+
"overloaded_error"
|
|
2477
|
+
if resp.status_code >= 500
|
|
2478
|
+
else "invalid_request_error"
|
|
2479
|
+
)
|
|
1287
2480
|
return Response(
|
|
1288
|
-
content=json.dumps(
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
"
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
2481
|
+
content=json.dumps(
|
|
2482
|
+
{
|
|
2483
|
+
"type": "error",
|
|
2484
|
+
"error": {
|
|
2485
|
+
"type": error_type,
|
|
2486
|
+
"message": error_message,
|
|
2487
|
+
},
|
|
2488
|
+
}
|
|
2489
|
+
),
|
|
1295
2490
|
status_code=529 if resp.status_code >= 500 else 400,
|
|
1296
2491
|
media_type="application/json",
|
|
1297
2492
|
)
|
|
1298
2493
|
|
|
1299
2494
|
return StreamingResponse(
|
|
1300
|
-
stream_anthropic_response(resp, model),
|
|
2495
|
+
stream_anthropic_response(resp, model, monitor, body),
|
|
1301
2496
|
media_type="text/event-stream",
|
|
1302
2497
|
headers={
|
|
1303
2498
|
"Cache-Control": "no-cache",
|
|
@@ -1314,25 +2509,63 @@ async def messages(request: Request):
|
|
|
1314
2509
|
# Option B: Handle non-streaming errors too
|
|
1315
2510
|
if resp.status_code != 200:
|
|
1316
2511
|
error_text = resp.text[:1000]
|
|
1317
|
-
logger.error(
|
|
2512
|
+
logger.error(
|
|
2513
|
+
"Upstream HTTP %d (non-stream): %s", resp.status_code, error_text
|
|
2514
|
+
)
|
|
1318
2515
|
return Response(
|
|
1319
|
-
content=json.dumps(
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
"
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
2516
|
+
content=json.dumps(
|
|
2517
|
+
{
|
|
2518
|
+
"type": "error",
|
|
2519
|
+
"error": {
|
|
2520
|
+
"type": "overloaded_error",
|
|
2521
|
+
"message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
|
|
2522
|
+
},
|
|
2523
|
+
}
|
|
2524
|
+
),
|
|
1326
2525
|
status_code=529,
|
|
1327
2526
|
media_type="application/json",
|
|
1328
2527
|
)
|
|
1329
2528
|
|
|
1330
2529
|
openai_resp = resp.json()
|
|
2530
|
+
openai_resp = await _apply_unexpected_end_turn_guardrail(
|
|
2531
|
+
client,
|
|
2532
|
+
openai_resp,
|
|
2533
|
+
openai_body,
|
|
2534
|
+
body,
|
|
2535
|
+
monitor,
|
|
2536
|
+
session_id,
|
|
2537
|
+
)
|
|
2538
|
+
openai_resp = await _apply_malformed_tool_guardrail(
|
|
2539
|
+
client,
|
|
2540
|
+
openai_resp,
|
|
2541
|
+
openai_body,
|
|
2542
|
+
body,
|
|
2543
|
+
monitor,
|
|
2544
|
+
session_id,
|
|
2545
|
+
)
|
|
2546
|
+
|
|
2547
|
+
choice, _ = _extract_openai_choice(openai_resp)
|
|
2548
|
+
finish_reason = choice.get("finish_reason", "")
|
|
2549
|
+
if (
|
|
2550
|
+
"tools" in body
|
|
2551
|
+
and not _openai_has_tool_calls(openai_resp)
|
|
2552
|
+
and (
|
|
2553
|
+
finish_reason in {"length", "max_tokens"}
|
|
2554
|
+
or (
|
|
2555
|
+
finish_reason in {"stop", "end_turn"}
|
|
2556
|
+
and len(_openai_message_text(openai_resp)) > 512
|
|
2557
|
+
)
|
|
2558
|
+
)
|
|
2559
|
+
):
|
|
2560
|
+
monitor.malformed_tool_streak += 1
|
|
2561
|
+
elif _openai_has_tool_calls(openai_resp):
|
|
2562
|
+
monitor.malformed_tool_streak = 0
|
|
2563
|
+
|
|
1331
2564
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
1332
2565
|
|
|
1333
2566
|
# Track output tokens in session monitor
|
|
1334
2567
|
output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
|
|
1335
|
-
|
|
2568
|
+
monitor.record_response(output_tokens)
|
|
1336
2569
|
|
|
1337
2570
|
return anthropic_resp
|
|
1338
2571
|
|
|
@@ -1377,37 +2610,51 @@ async def health():
|
|
|
1377
2610
|
|
|
1378
2611
|
|
|
1379
2612
|
@app.get("/v1/context")
|
|
1380
|
-
async def context_status():
|
|
2613
|
+
async def context_status(request: Request):
|
|
1381
2614
|
"""Option F: Context window monitoring endpoint.
|
|
1382
2615
|
|
|
1383
2616
|
Returns current session token usage, utilization, warnings, and
|
|
1384
2617
|
estimated remaining turns. Useful for dashboards and debugging.
|
|
1385
2618
|
"""
|
|
1386
|
-
|
|
1387
|
-
|
|
2619
|
+
requested_session = request.query_params.get("session_id", "")
|
|
2620
|
+
session_id = requested_session or last_session_id
|
|
2621
|
+
monitor = session_monitors.get(session_id) if session_id else None
|
|
2622
|
+
|
|
2623
|
+
if monitor is None:
|
|
2624
|
+
monitor = SessionMonitor(context_window=default_context_window)
|
|
2625
|
+
|
|
2626
|
+
warning = monitor.get_warning_level()
|
|
2627
|
+
turns = monitor.estimate_turns_remaining()
|
|
1388
2628
|
|
|
1389
2629
|
return {
|
|
1390
|
-
"
|
|
1391
|
-
"
|
|
1392
|
-
"
|
|
1393
|
-
"
|
|
1394
|
-
"
|
|
1395
|
-
"
|
|
2630
|
+
"active_session_id": session_id,
|
|
2631
|
+
"session_count": len(session_monitors),
|
|
2632
|
+
"context_window": monitor.context_window,
|
|
2633
|
+
"last_input_tokens": monitor.last_input_tokens,
|
|
2634
|
+
"last_output_tokens": monitor.last_output_tokens,
|
|
2635
|
+
"peak_input_tokens": monitor.peak_input_tokens,
|
|
2636
|
+
"utilization": round(monitor.get_utilization(), 4),
|
|
2637
|
+
"utilization_pct": f"{monitor.get_utilization() * 100:.1f}%",
|
|
1396
2638
|
"warning_level": warning,
|
|
1397
2639
|
"estimated_turns_remaining": turns,
|
|
1398
|
-
"total_requests":
|
|
1399
|
-
"prune_count":
|
|
1400
|
-
"overflow_count":
|
|
2640
|
+
"total_requests": monitor.total_requests,
|
|
2641
|
+
"prune_count": monitor.prune_count,
|
|
2642
|
+
"overflow_count": monitor.overflow_count,
|
|
1401
2643
|
"prune_threshold": PROXY_CONTEXT_PRUNE_THRESHOLD,
|
|
1402
|
-
"recent_history":
|
|
2644
|
+
"recent_history": monitor.context_history[-10:],
|
|
1403
2645
|
# Loop protection stats
|
|
1404
2646
|
"loop_protection": {
|
|
1405
|
-
"
|
|
1406
|
-
"
|
|
1407
|
-
"
|
|
1408
|
-
"
|
|
1409
|
-
"
|
|
1410
|
-
"
|
|
2647
|
+
"enabled": PROXY_LOOP_BREAKER,
|
|
2648
|
+
"consecutive_forced_count": monitor.consecutive_forced_count,
|
|
2649
|
+
"no_progress_streak": monitor.no_progress_streak,
|
|
2650
|
+
"loop_warnings_emitted": monitor.loop_warnings_emitted,
|
|
2651
|
+
"unexpected_end_turn_count": monitor.unexpected_end_turn_count,
|
|
2652
|
+
"malformed_tool_streak": monitor.malformed_tool_streak,
|
|
2653
|
+
"contamination_resets": monitor.contamination_resets,
|
|
2654
|
+
"tool_call_history_len": len(monitor.tool_call_history),
|
|
2655
|
+
"is_looping": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[0],
|
|
2656
|
+
"loop_repeat_count": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[1],
|
|
2657
|
+
"recent_tool_patterns": monitor.tool_call_history[-5:],
|
|
1411
2658
|
},
|
|
1412
2659
|
}
|
|
1413
2660
|
|