@miller-tech/uap 1.13.6 → 1.13.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/config/chat_template.jinja +126 -44
- package/config/model-profiles/qwen35.json +3 -3
- package/dist/.tsbuildinfo +1 -1
- package/dist/benchmarks/token-throughput.d.ts +259 -0
- package/dist/benchmarks/token-throughput.d.ts.map +1 -0
- package/dist/benchmarks/token-throughput.js +198 -0
- package/dist/benchmarks/token-throughput.js.map +1 -0
- package/dist/bin/cli.js +12 -0
- package/dist/bin/cli.js.map +1 -1
- package/dist/bin/llama-server-optimize.js +0 -0
- package/dist/bin/policy.js +0 -0
- package/dist/cli/dashboard.d.ts.map +1 -1
- package/dist/cli/dashboard.js +10 -20
- package/dist/cli/dashboard.js.map +1 -1
- package/dist/cli/init.d.ts.map +1 -1
- package/dist/cli/init.js +5 -0
- package/dist/cli/init.js.map +1 -1
- package/dist/cli/memory.d.ts.map +1 -1
- package/dist/cli/memory.js +9 -18
- package/dist/cli/memory.js.map +1 -1
- package/dist/cli/worktree.d.ts +4 -1
- package/dist/cli/worktree.d.ts.map +1 -1
- package/dist/cli/worktree.js +73 -1
- package/dist/cli/worktree.js.map +1 -1
- package/dist/coordination/adaptive-patterns.d.ts +3 -1
- package/dist/coordination/adaptive-patterns.d.ts.map +1 -1
- package/dist/coordination/adaptive-patterns.js +31 -3
- package/dist/coordination/adaptive-patterns.js.map +1 -1
- package/dist/dashboard/data-service.d.ts +44 -0
- package/dist/dashboard/data-service.d.ts.map +1 -1
- package/dist/dashboard/data-service.js +326 -17
- package/dist/dashboard/data-service.js.map +1 -1
- package/dist/memory/embeddings.d.ts.map +1 -1
- package/dist/memory/embeddings.js +1 -1
- package/dist/memory/embeddings.js.map +1 -1
- package/dist/models/router.js +1 -1
- package/dist/models/router.js.map +1 -1
- package/dist/models/types.d.ts +12 -12
- package/dist/models/types.js +13 -13
- package/dist/models/types.js.map +1 -1
- package/dist/policies/schemas/policy.d.ts +13 -13
- package/dist/policies/schemas/policy.js +1 -1
- package/dist/policies/schemas/policy.js.map +1 -1
- package/dist/tasks/coordination.js +1 -1
- package/dist/tasks/coordination.js.map +1 -1
- package/dist/types/config.d.ts +24 -24
- package/package.json +1 -1
- package/templates/hooks/session-start.sh +49 -48
- package/tools/agents/install-opencode-local.sh.j2 +57 -7
- package/tools/agents/opencode_uap_agent.py +63 -1
- package/tools/agents/scripts/__pycache__/anthropic_proxy.cpython-313.pyc +0 -0
- package/tools/agents/scripts/__pycache__/tool_call_wrapper.cpython-313.pyc +0 -0
- package/tools/agents/scripts/anthropic_proxy.py +759 -12
- package/tools/agents/scripts/tool_call_wrapper.py +9 -5
|
@@ -21,6 +21,9 @@ Key Features
|
|
|
21
21
|
- Granular timeouts (short connect, long read for LLM generation)
|
|
22
22
|
- Graceful error recovery on upstream connection drops
|
|
23
23
|
- Proper upstream cleanup on client disconnect
|
|
24
|
+
- Context window overflow protection with conversation pruning
|
|
25
|
+
- Smart max_tokens capping to prevent next-turn overflow
|
|
26
|
+
- Session-level token monitoring with warnings
|
|
24
27
|
|
|
25
28
|
Configuration (Environment Variables)
|
|
26
29
|
--------------------------------------
|
|
@@ -42,6 +45,14 @@ Configuration (Environment Variables)
|
|
|
42
45
|
PROXY_MAX_CONNECTIONS Max concurrent connections to upstream
|
|
43
46
|
Default: 20
|
|
44
47
|
|
|
48
|
+
PROXY_CONTEXT_WINDOW Override context window size (auto-detected from
|
|
49
|
+
upstream /slots endpoint if not set)
|
|
50
|
+
Default: 0 (auto-detect)
|
|
51
|
+
|
|
52
|
+
PROXY_CONTEXT_PRUNE_THRESHOLD Fraction of context window at which
|
|
53
|
+
conversation pruning activates (0.0-1.0)
|
|
54
|
+
Default: 0.75
|
|
55
|
+
|
|
45
56
|
Usage
|
|
46
57
|
-----
|
|
47
58
|
# Basic usage (connects to llama.cpp on default port):
|
|
@@ -71,6 +82,7 @@ import os
|
|
|
71
82
|
import sys
|
|
72
83
|
import time
|
|
73
84
|
import uuid
|
|
85
|
+
from dataclasses import dataclass, field
|
|
74
86
|
|
|
75
87
|
import httpx
|
|
76
88
|
from contextlib import asynccontextmanager
|
|
@@ -87,6 +99,8 @@ PROXY_HOST = os.environ.get("PROXY_HOST", "0.0.0.0")
|
|
|
87
99
|
PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
|
|
88
100
|
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
|
|
89
101
|
PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
|
|
102
|
+
PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
|
|
103
|
+
PROXY_CONTEXT_PRUNE_THRESHOLD = float(os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75"))
|
|
90
104
|
|
|
91
105
|
# ---------------------------------------------------------------------------
|
|
92
106
|
# Logging
|
|
@@ -98,6 +112,352 @@ logging.basicConfig(
|
|
|
98
112
|
)
|
|
99
113
|
logger = logging.getLogger("uap.anthropic_proxy")
|
|
100
114
|
|
|
115
|
+
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
# Option F: Session-level Context Window Monitor
|
|
118
|
+
# ---------------------------------------------------------------------------
|
|
119
|
+
@dataclass
|
|
120
|
+
class SessionMonitor:
|
|
121
|
+
"""Tracks token usage across the session to provide early warnings
|
|
122
|
+
and enable proactive context management before overflow occurs."""
|
|
123
|
+
|
|
124
|
+
context_window: int = 0 # Auto-detected or configured
|
|
125
|
+
total_requests: int = 0
|
|
126
|
+
last_input_tokens: int = 0 # Estimated input tokens of last request
|
|
127
|
+
last_output_tokens: int = 0 # Actual output tokens of last response
|
|
128
|
+
peak_input_tokens: int = 0 # High-water mark
|
|
129
|
+
prune_count: int = 0 # How many times pruning was triggered
|
|
130
|
+
overflow_count: int = 0 # How many context overflow errors caught
|
|
131
|
+
context_history: list = field(default_factory=list) # Recent token counts
|
|
132
|
+
|
|
133
|
+
def record_request(self, estimated_tokens: int):
|
|
134
|
+
"""Record an outgoing request's estimated token count."""
|
|
135
|
+
self.total_requests += 1
|
|
136
|
+
self.last_input_tokens = estimated_tokens
|
|
137
|
+
if estimated_tokens > self.peak_input_tokens:
|
|
138
|
+
self.peak_input_tokens = estimated_tokens
|
|
139
|
+
self.context_history.append(estimated_tokens)
|
|
140
|
+
# Keep last 50 entries
|
|
141
|
+
if len(self.context_history) > 50:
|
|
142
|
+
self.context_history = self.context_history[-50:]
|
|
143
|
+
|
|
144
|
+
def record_response(self, output_tokens: int):
|
|
145
|
+
"""Record a response's output token count."""
|
|
146
|
+
self.last_output_tokens = output_tokens
|
|
147
|
+
|
|
148
|
+
def get_utilization(self) -> float:
|
|
149
|
+
"""Get current context utilization as a fraction (0.0 - 1.0)."""
|
|
150
|
+
if self.context_window <= 0:
|
|
151
|
+
return 0.0
|
|
152
|
+
return self.last_input_tokens / self.context_window
|
|
153
|
+
|
|
154
|
+
def get_warning_level(self) -> str | None:
|
|
155
|
+
"""Return warning level based on context utilization.
|
|
156
|
+
Returns None if no warning needed."""
|
|
157
|
+
util = self.get_utilization()
|
|
158
|
+
if util >= 0.95:
|
|
159
|
+
return "CRITICAL"
|
|
160
|
+
elif util >= 0.85:
|
|
161
|
+
return "HIGH"
|
|
162
|
+
elif util >= 0.75:
|
|
163
|
+
return "ELEVATED"
|
|
164
|
+
return None
|
|
165
|
+
|
|
166
|
+
def estimate_turns_remaining(self) -> int | None:
|
|
167
|
+
"""Estimate how many more agentic turns can fit before overflow."""
|
|
168
|
+
if self.context_window <= 0 or len(self.context_history) < 2:
|
|
169
|
+
return None
|
|
170
|
+
# Average growth per turn from recent history
|
|
171
|
+
deltas = [
|
|
172
|
+
self.context_history[i] - self.context_history[i - 1]
|
|
173
|
+
for i in range(1, len(self.context_history))
|
|
174
|
+
if self.context_history[i] > self.context_history[i - 1]
|
|
175
|
+
]
|
|
176
|
+
if not deltas:
|
|
177
|
+
return None
|
|
178
|
+
avg_growth = sum(deltas) / len(deltas)
|
|
179
|
+
if avg_growth <= 0:
|
|
180
|
+
return None
|
|
181
|
+
remaining_tokens = self.context_window - self.last_input_tokens
|
|
182
|
+
return max(0, int(remaining_tokens / avg_growth))
|
|
183
|
+
|
|
184
|
+
def log_status(self):
|
|
185
|
+
"""Log current session status."""
|
|
186
|
+
util = self.get_utilization()
|
|
187
|
+
warning = self.get_warning_level()
|
|
188
|
+
turns = self.estimate_turns_remaining()
|
|
189
|
+
turns_str = f"~{turns} turns remaining" if turns is not None else "unknown"
|
|
190
|
+
|
|
191
|
+
if warning == "CRITICAL":
|
|
192
|
+
logger.error(
|
|
193
|
+
"CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
|
|
194
|
+
self.last_input_tokens, self.context_window, util * 100,
|
|
195
|
+
turns_str, self.prune_count, self.overflow_count,
|
|
196
|
+
)
|
|
197
|
+
elif warning == "HIGH":
|
|
198
|
+
logger.warning(
|
|
199
|
+
"CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
|
|
200
|
+
self.last_input_tokens, self.context_window, util * 100,
|
|
201
|
+
turns_str, self.prune_count,
|
|
202
|
+
)
|
|
203
|
+
elif warning == "ELEVATED":
|
|
204
|
+
logger.warning(
|
|
205
|
+
"CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
|
|
206
|
+
self.last_input_tokens, self.context_window, util * 100,
|
|
207
|
+
turns_str,
|
|
208
|
+
)
|
|
209
|
+
else:
|
|
210
|
+
logger.info(
|
|
211
|
+
"CONTEXT: %d/%d tokens (%.1f%%), %s",
|
|
212
|
+
self.last_input_tokens, self.context_window, util * 100,
|
|
213
|
+
turns_str,
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
session_monitor = SessionMonitor()
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# ---------------------------------------------------------------------------
|
|
221
|
+
# Context Window Detection
|
|
222
|
+
# ---------------------------------------------------------------------------
|
|
223
|
+
async def detect_context_window(client: httpx.AsyncClient) -> int:
    """Determine the upstream server's per-slot context window size.

    Resolution order:
      1. ``PROXY_CONTEXT_WINDOW`` when it is set to a positive value.
      2. ``n_ctx`` of the first slot reported by the upstream ``/slots``
         endpoint.
      3. A fixed default of 131072 tokens.

    Args:
        client: HTTP client used to query the upstream server.

    Returns:
        The context window size in tokens (always > 0).
    """
    if PROXY_CONTEXT_WINDOW > 0:
        logger.info("Using configured context window: %d tokens", PROXY_CONTEXT_WINDOW)
        return PROXY_CONTEXT_WINDOW

    # Derive the /slots URL from the server root. Only a *trailing* "/v1"
    # is stripped: a blanket str.replace would also rewrite "/v1" occurring
    # elsewhere in the URL (e.g. a host name starting with "v1") and would
    # leave the URL untouched when the base has no "/v1" at all.
    server_root = LLAMA_CPP_BASE.rstrip("/").removesuffix("/v1")
    slots_url = server_root + "/slots"

    try:
        resp = await client.get(slots_url, timeout=5.0)
        if resp.status_code == 200:
            slots = resp.json()
            # Expect a non-empty list of slot objects; anything else is
            # treated as "not detectable" rather than an error.
            if slots and isinstance(slots, list) and isinstance(slots[0], dict):
                n_ctx = slots[0].get("n_ctx", 0)
                if isinstance(n_ctx, int) and n_ctx > 0:
                    logger.info(
                        "Auto-detected context window from upstream: %d tokens (%d slots)",
                        n_ctx, len(slots),
                    )
                    return n_ctx
        else:
            logger.debug(
                "Context window auto-detect: %s returned HTTP %d",
                slots_url, resp.status_code,
            )
    except Exception as exc:
        # Deliberately broad: detection is best-effort at startup and must
        # never prevent the proxy from coming up (network errors, invalid
        # JSON, unexpected payloads all fall through to the default).
        logger.warning("Failed to auto-detect context window: %s", exc)

    # Safe default: 128K (common for modern models)
    default = 131072
    logger.warning("Using default context window: %d tokens", default)
    return default
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# ---------------------------------------------------------------------------
|
|
256
|
+
# Option C: Conversation Pruning
|
|
257
|
+
# ---------------------------------------------------------------------------
|
|
258
|
+
# Heuristic characters-per-token ratio. Plain English averages ~4 chars/token,
# while tool-call JSON and code are denser (~3.2 chars/token); 3.5 sits
# between the two.
CHARS_PER_TOKEN = 3.5


def estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* from its character length.

    The result is never below 1, even for an empty string.
    """
    approx = int(len(text) / CHARS_PER_TOKEN)
    return approx if approx > 1 else 1
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def estimate_message_tokens(msg: dict) -> int:
    """Approximate the token cost of one Anthropic-format message.

    A flat overhead of 4 tokens (role, separators) is charged per message.
    String content is estimated directly. For list content, bare strings
    and ``text``, ``tool_use`` and ``tool_result`` blocks are counted; any
    other block type contributes nothing.
    """
    overhead = 4  # role + separators
    body = msg.get("content", "")

    if isinstance(body, str):
        return overhead + estimate_tokens(body)
    if not isinstance(body, list):
        return overhead

    total = overhead
    for part in body:
        if isinstance(part, str):
            total += estimate_tokens(part)
            continue
        if not isinstance(part, dict):
            continue

        kind = part.get("type")
        if kind == "text":
            total += estimate_tokens(part.get("text", ""))
        elif kind == "tool_use":
            # Tool name plus the JSON-serialized arguments.
            total += estimate_tokens(part.get("name", ""))
            total += estimate_tokens(json.dumps(part.get("input", {})))
        elif kind == "tool_result":
            total += estimate_tokens(_extract_text(part.get("content", "")))
    return total
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def estimate_total_tokens(anthropic_body: dict) -> int:
    """Approximate the input token count of an Anthropic Messages request.

    Adds up four parts: the system prompt (string, or the ``text`` blocks of
    a list), the agentic supplement, every message, and the JSON-serialized
    tool definitions.
    """
    # System prompt
    system = anthropic_body.get("system", "")
    if isinstance(system, str):
        system_tokens = estimate_tokens(system)
    elif isinstance(system, list):
        system_tokens = sum(
            estimate_tokens(part.get("text", ""))
            for part in system
            if isinstance(part, dict) and part.get("type") == "text"
        )
    else:
        system_tokens = 0

    # Agentic supplement (always counted)
    supplement_tokens = estimate_tokens(_AGENTIC_SYSTEM_SUPPLEMENT)

    # Conversation messages
    message_tokens = sum(
        estimate_message_tokens(message)
        for message in anthropic_body.get("messages", [])
    )

    # Tool definitions
    tool_defs = anthropic_body.get("tools", [])
    tool_tokens = estimate_tokens(json.dumps(tool_defs)) if tool_defs else 0

    return system_tokens + supplement_tokens + message_tokens + tool_tokens
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _build_prune_marker(removed_count: int, removed_tokens: int) -> dict:
    """Build the synthetic user message announcing that history was trimmed."""
    return {
        "role": "user",
        "content": (
            f"[CONTEXT PRUNED: {removed_count} older messages (~{removed_tokens} tokens) "
            f"were removed to fit within the context window. "
            f"The conversation continues from recent context below.]"
        ),
    }


def prune_conversation(anthropic_body: dict, context_window: int, target_fraction: float = 0.65) -> dict:
    """Prune the conversation to fit within the context window.

    Strategy:
    - Always keep: system prompt, first user message, last N messages
    - Remove from the middle: oldest tool_result messages first (they're
      the largest -- file contents, command output, etc.), then oldest
      assistant messages, then oldest user messages.
    - Inject a [CONTEXT PRUNED] marker whenever messages were removed, so
      the model knows history was trimmed. This includes the case where the
      protected messages alone exceed the budget and the whole middle is
      dropped.

    The request body is modified in place; in the over-budget case large
    tool_result blocks in the protected tail are also truncated in place.

    Args:
        anthropic_body: The full Anthropic request body
        context_window: Maximum context window in tokens
        target_fraction: Target utilization after pruning (0.0-1.0)

    Returns:
        The same anthropic_body object, with ``messages`` replaced by the
        pruned list (unchanged when there are 4 or fewer messages, or when
        system prompt + tools already exceed the target budget).
    """
    # NOTE(review): pruning works per message and does not keep an assistant
    # tool_use together with its tool_result -- confirm the upstream chat
    # template tolerates a tool call whose result was pruned (or vice versa).
    messages = anthropic_body.get("messages", [])
    if len(messages) <= 4:
        # Too few messages to prune meaningfully
        return anthropic_body

    target_tokens = int(context_window * target_fraction)

    # Non-message tokens (system, agentic supplement, tools): reuse the
    # request-level estimator on a copy of the body without any messages.
    overhead_tokens = estimate_total_tokens({**anthropic_body, "messages": []})

    # Budget for messages
    message_budget = target_tokens - overhead_tokens
    if message_budget <= 0:
        logger.error("System prompt + tools alone exceed target budget!")
        return anthropic_body

    # Always keep the first user message and the last N messages
    KEEP_LAST = 8  # Keep the last 8 messages (recent context)
    protected_head = messages[:1]  # First user message
    protected_tail = messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
    middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []

    # Calculate tokens for protected messages
    protected_tokens = sum(estimate_message_tokens(m) for m in protected_head + protected_tail)

    if protected_tokens >= message_budget:
        # Even protected messages exceed budget -- truncate tool_result content
        # in the tail to fit
        logger.warning(
            "Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
            protected_tokens, message_budget,
        )
        for msg in protected_tail:
            content = msg.get("content", [])
            if not isinstance(content, list):
                continue
            for block in content:
                if isinstance(block, dict) and block.get("type") == "tool_result":
                    result_text = _extract_text(block.get("content", ""))
                    if len(result_text) > 2000:
                        # Keep the head and the tail of the output; the
                        # middle is the least likely part to matter.
                        block["content"] = result_text[:1000] + "\n...[TRUNCATED]...\n" + result_text[-500:]

        if middle:
            # The entire middle is dropped here; tell the model (and the
            # log) about it exactly like the regular pruning path does.
            dropped_tokens = sum(estimate_message_tokens(m) for m in middle)
            anthropic_body["messages"] = (
                protected_head
                + [_build_prune_marker(len(middle), dropped_tokens)]
                + protected_tail
            )
            logger.warning(
                "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
                "target=%.0f%% of %d ctx",
                len(middle), dropped_tokens, len(anthropic_body["messages"]),
                target_fraction * 100, context_window,
            )
        else:
            anthropic_body["messages"] = protected_head + protected_tail
        return anthropic_body

    remaining_budget = message_budget - protected_tokens

    # Score middle messages for removal priority:
    # - tool_result messages: remove first (biggest, least important historically)
    # - assistant text-only: remove second
    # - user messages: remove last (provide context for the model's actions)
    # Within each category, remove oldest first.
    scored_middle = []
    for i, msg in enumerate(middle):
        content = msg.get("content", [])
        tokens = estimate_message_tokens(msg)
        is_assistant = msg.get("role") == "assistant"
        is_tool_result = isinstance(content, list) and any(
            isinstance(b, dict) and b.get("type") == "tool_result"
            for b in content
        )

        # Lower priority = removed first
        if is_tool_result:
            priority = 0  # Remove first
        elif is_assistant:
            priority = 1  # Remove second
        else:
            priority = 2  # Remove last (user messages)

        scored_middle.append((priority, i, tokens, msg))

    # Sort by priority (ascending = remove first), then by index (oldest first)
    scored_middle.sort(key=lambda x: (x[0], x[1]))

    # Greedily keep messages, walking from "keep last" (highest priority,
    # newest) towards "remove first", while they still fit into the budget.
    kept_middle = []
    used_tokens = 0
    for _priority, idx, tokens, msg in reversed(scored_middle):
        if used_tokens + tokens <= remaining_budget:
            kept_middle.append((idx, msg))
            used_tokens += tokens

    # Sort kept messages back into original order
    kept_middle.sort(key=lambda x: x[0])
    kept_msgs = [m for _, m in kept_middle]

    removed_count = len(middle) - len(kept_msgs)
    removed_tokens = sum(t for _, _, t, _ in scored_middle) - used_tokens

    if removed_count > 0:
        # Insert a context-pruned marker right after the first user message
        prune_marker = _build_prune_marker(removed_count, removed_tokens)
        anthropic_body["messages"] = protected_head + [prune_marker] + kept_msgs + protected_tail
        logger.warning(
            "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
            "target=%.0f%% of %d ctx",
            removed_count, removed_tokens, len(anthropic_body["messages"]),
            target_fraction * 100, context_window,
        )
    else:
        anthropic_body["messages"] = protected_head + kept_msgs + protected_tail

    return anthropic_body
|
|
459
|
+
|
|
460
|
+
|
|
101
461
|
# ---------------------------------------------------------------------------
|
|
102
462
|
# HTTP Client Lifecycle
|
|
103
463
|
# ---------------------------------------------------------------------------
|
|
@@ -127,6 +487,15 @@ async def lifespan(app: FastAPI):
|
|
|
127
487
|
"Proxy started: listening on %s:%d -> upstream %s",
|
|
128
488
|
PROXY_HOST, PROXY_PORT, LLAMA_CPP_BASE,
|
|
129
489
|
)
|
|
490
|
+
|
|
491
|
+
# Auto-detect context window from upstream server
|
|
492
|
+
session_monitor.context_window = await detect_context_window(http_client)
|
|
493
|
+
logger.info(
|
|
494
|
+
"Context window: %d tokens, prune threshold: %.0f%%",
|
|
495
|
+
session_monitor.context_window,
|
|
496
|
+
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
497
|
+
)
|
|
498
|
+
|
|
130
499
|
yield
|
|
131
500
|
await http_client.aclose()
|
|
132
501
|
http_client = None
|
|
@@ -219,6 +588,20 @@ def _extract_text(content) -> str:
|
|
|
219
588
|
return str(content)
|
|
220
589
|
|
|
221
590
|
|
|
591
|
+
# Extra system-prompt text instructing the model to act through tools
# instead of only describing what should be done. build_openai_request
# appends it to the client's system message, or inserts it (stripped) as a
# new system message when the client sent none. estimate_total_tokens counts
# it as fixed overhead.
_AGENTIC_SYSTEM_SUPPLEMENT = (
    "\n\n<agentic-protocol>\n"
    "You are operating in an agentic coding loop with tool access. Follow these rules:\n"
    "1. ALWAYS use tools to read, edit, write, and test code. Never just describe or explain what should be done.\n"
    "2. After reading files and identifying an issue, proceed IMMEDIATELY to make the fix using Edit/Write tools. Do NOT stop after explaining the problem.\n"
    "3. After making changes, run the relevant tests or build commands to verify your fix.\n"
    "4. Only produce a final text response WITHOUT tool calls when the ENTIRE task is fully complete, verified, and you have nothing left to do.\n"
    "5. If you have identified a problem but have not yet fixed it, you MUST call a tool to make the fix. Do NOT summarize the issue and stop.\n"
    "6. When the user asks you to do something, DO it with tools. Do not ask for permission or confirmation.\n"
    "7. If a tool call fails, analyze the error and try a different approach. Do not give up after one failure.\n"
    "</agentic-protocol>"
)
|
|
603
|
+
|
|
604
|
+
|
|
222
605
|
def build_openai_request(anthropic_body: dict) -> dict:
|
|
223
606
|
"""Build an OpenAI Chat Completions request from an Anthropic Messages request."""
|
|
224
607
|
openai_body = {
|
|
@@ -227,8 +610,51 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
227
610
|
"stream": anthropic_body.get("stream", False),
|
|
228
611
|
}
|
|
229
612
|
|
|
613
|
+
# Inject agentic protocol instructions into the system message so
|
|
614
|
+
# the model knows it must use tools to complete work, not just explain.
|
|
615
|
+
if openai_body["messages"] and openai_body["messages"][0].get("role") == "system":
|
|
616
|
+
openai_body["messages"][0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
|
|
617
|
+
else:
|
|
618
|
+
# No system message from the client; inject one.
|
|
619
|
+
openai_body["messages"].insert(0, {
|
|
620
|
+
"role": "system",
|
|
621
|
+
"content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
|
|
622
|
+
})
|
|
623
|
+
|
|
230
624
|
if "max_tokens" in anthropic_body:
|
|
231
|
-
|
|
625
|
+
# Enforce minimum floor for thinking mode: model needs tokens for
|
|
626
|
+
# reasoning (<think>...</think>) plus the actual response/tool calls.
|
|
627
|
+
# Claude Code typically sends 4096-8192 which is too low for thinking.
|
|
628
|
+
requested_max = max(anthropic_body["max_tokens"], 16384)
|
|
629
|
+
|
|
630
|
+
# Option E: Smart max_tokens capping — prevent the response from
|
|
631
|
+
# consuming so many tokens that the NEXT turn's input won't fit.
|
|
632
|
+
# Formula: max_tokens = min(requested, context_window - input_tokens - safety_margin)
|
|
633
|
+
# This ensures the model's output + current input stays within bounds,
|
|
634
|
+
# leaving room for the next turn's incremental growth.
|
|
635
|
+
ctx_window = session_monitor.context_window
|
|
636
|
+
if ctx_window > 0:
|
|
637
|
+
estimated_input = estimate_total_tokens(anthropic_body)
|
|
638
|
+
# Reserve 15% of context for next-turn growth (tool results, etc.)
|
|
639
|
+
safety_margin = int(ctx_window * 0.15)
|
|
640
|
+
available_for_output = ctx_window - estimated_input - safety_margin
|
|
641
|
+
if available_for_output < requested_max and available_for_output > 1024:
|
|
642
|
+
logger.info(
|
|
643
|
+
"MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
|
|
644
|
+
requested_max, available_for_output,
|
|
645
|
+
ctx_window, estimated_input, safety_margin,
|
|
646
|
+
)
|
|
647
|
+
requested_max = available_for_output
|
|
648
|
+
elif available_for_output <= 1024:
|
|
649
|
+
# Very tight on space -- allow minimum but warn
|
|
650
|
+
logger.warning(
|
|
651
|
+
"MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
|
|
652
|
+
"Response may be truncated.",
|
|
653
|
+
available_for_output, ctx_window, estimated_input,
|
|
654
|
+
)
|
|
655
|
+
requested_max = max(1024, available_for_output)
|
|
656
|
+
|
|
657
|
+
openai_body["max_tokens"] = requested_max
|
|
232
658
|
if "temperature" in anthropic_body:
|
|
233
659
|
openai_body["temperature"] = anthropic_body["temperature"]
|
|
234
660
|
if "top_p" in anthropic_body:
|
|
@@ -249,9 +675,62 @@ def build_openai_request(anthropic_body: dict) -> dict:
|
|
|
249
675
|
},
|
|
250
676
|
})
|
|
251
677
|
|
|
678
|
+
# Smart tool_choice: force tool calls during the agentic loop to
|
|
679
|
+
# prevent the model from producing text-only end_turn responses that
|
|
680
|
+
# prematurely stop the loop. The model can still produce text alongside
|
|
681
|
+
# tool calls when tool_choice="required".
|
|
682
|
+
#
|
|
683
|
+
# Force "required" when:
|
|
684
|
+
# - More than 1 message (conversation is in progress)
|
|
685
|
+
# - Last assistant was text-only (would cause premature stop)
|
|
686
|
+
# - OR conversation has tool_result messages (active agentic loop)
|
|
687
|
+
n_msgs = len(anthropic_body.get("messages", []))
|
|
688
|
+
has_tool_results = any(
|
|
689
|
+
isinstance(m.get("content"), list) and any(
|
|
690
|
+
isinstance(b, dict) and b.get("type") == "tool_result"
|
|
691
|
+
for b in m.get("content", [])
|
|
692
|
+
)
|
|
693
|
+
for m in anthropic_body.get("messages", [])
|
|
694
|
+
)
|
|
695
|
+
if _last_assistant_was_text_only(anthropic_body):
|
|
696
|
+
openai_body["tool_choice"] = "required"
|
|
697
|
+
logger.info("tool_choice forced to 'required' (last assistant was text-only)")
|
|
698
|
+
elif has_tool_results and n_msgs > 2:
|
|
699
|
+
openai_body["tool_choice"] = "required"
|
|
700
|
+
logger.info("tool_choice forced to 'required' (active agentic loop with tool results)")
|
|
701
|
+
|
|
252
702
|
return openai_body
|
|
253
703
|
|
|
254
704
|
|
|
705
|
+
def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
706
|
+
"""Check if the last assistant message in the conversation was text-only
|
|
707
|
+
(no tool_use blocks). This indicates the model may be prematurely ending
|
|
708
|
+
the agentic loop by explaining instead of acting."""
|
|
709
|
+
messages = anthropic_body.get("messages", [])
|
|
710
|
+
# Walk backwards to find the last assistant message
|
|
711
|
+
for msg in reversed(messages):
|
|
712
|
+
if msg.get("role") != "assistant":
|
|
713
|
+
continue
|
|
714
|
+
content = msg.get("content")
|
|
715
|
+
if isinstance(content, str):
|
|
716
|
+
# Pure text assistant message -- text-only
|
|
717
|
+
return bool(content.strip())
|
|
718
|
+
if isinstance(content, list):
|
|
719
|
+
has_tool_use = any(
|
|
720
|
+
isinstance(b, dict) and b.get("type") == "tool_use"
|
|
721
|
+
for b in content
|
|
722
|
+
)
|
|
723
|
+
has_text = any(
|
|
724
|
+
(isinstance(b, dict) and b.get("type") == "text" and b.get("text", "").strip())
|
|
725
|
+
or isinstance(b, str)
|
|
726
|
+
for b in content
|
|
727
|
+
)
|
|
728
|
+
# Text-only if there's text but no tool_use
|
|
729
|
+
return has_text and not has_tool_use
|
|
730
|
+
return False
|
|
731
|
+
return False
|
|
732
|
+
|
|
733
|
+
|
|
255
734
|
# ===========================================================================
|
|
256
735
|
# Response Translation: OpenAI -> Anthropic
|
|
257
736
|
# ===========================================================================
|
|
@@ -339,6 +818,8 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
339
818
|
# Track tool call state for streaming tool_calls
|
|
340
819
|
tool_calls_by_index: dict[int, dict] = {}
|
|
341
820
|
tool_block_index = 1 # anthropic block index (0 = text)
|
|
821
|
+
text_chunks: list[str] = [] # accumulate text for logging
|
|
822
|
+
reasoning_chunks: list[str] = [] # accumulate reasoning for fallback
|
|
342
823
|
|
|
343
824
|
try:
|
|
344
825
|
async for line in openai_stream.aiter_lines():
|
|
@@ -355,9 +836,16 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
355
836
|
choice = (chunk.get("choices") or [{}])[0]
|
|
356
837
|
delta = choice.get("delta", {})
|
|
357
838
|
|
|
839
|
+
# Collect reasoning_content (normally stripped; used as fallback
|
|
840
|
+
# if the model produces only reasoning with no visible output)
|
|
841
|
+
reasoning = delta.get("reasoning_content", "")
|
|
842
|
+
if reasoning:
|
|
843
|
+
reasoning_chunks.append(reasoning)
|
|
844
|
+
|
|
358
845
|
# Handle text content deltas
|
|
359
846
|
if delta.get("content"):
|
|
360
847
|
output_tokens += 1 # rough token estimate
|
|
848
|
+
text_chunks.append(delta["content"])
|
|
361
849
|
yield (
|
|
362
850
|
f"event: content_block_delta\n"
|
|
363
851
|
f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': delta['content']}})}\n\n"
|
|
@@ -372,10 +860,11 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
372
860
|
# New tool call starting
|
|
373
861
|
tc_id = tc_delta.get("id", f"toolu_{uuid.uuid4().hex[:12]}")
|
|
374
862
|
fn = tc_delta.get("function", {})
|
|
863
|
+
initial_args = fn.get("arguments", "")
|
|
375
864
|
tool_calls_by_index[tc_idx] = {
|
|
376
865
|
"id": tc_id,
|
|
377
866
|
"name": fn.get("name", ""),
|
|
378
|
-
"arguments":
|
|
867
|
+
"arguments": initial_args,
|
|
379
868
|
"block_index": tool_block_index,
|
|
380
869
|
}
|
|
381
870
|
|
|
@@ -391,6 +880,18 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
391
880
|
f"event: content_block_start\n"
|
|
392
881
|
f"data: {json.dumps({'type': 'content_block_start', 'index': tool_block_index, 'content_block': {'type': 'tool_use', 'id': tc_id, 'name': fn.get('name', '')}})}\n\n"
|
|
393
882
|
)
|
|
883
|
+
|
|
884
|
+
# Emit initial arguments fragment (e.g. "{") that
|
|
885
|
+
# arrives with the first tool_call chunk. Without
|
|
886
|
+
# this the opening brace is swallowed and the client
|
|
887
|
+
# receives invalid JSON like "command":"ls"} instead
|
|
888
|
+
# of {"command":"ls"}.
|
|
889
|
+
if initial_args:
|
|
890
|
+
yield (
|
|
891
|
+
f"event: content_block_delta\n"
|
|
892
|
+
f"data: {json.dumps({'type': 'content_block_delta', 'index': tool_block_index, 'delta': {'type': 'input_json_delta', 'partial_json': initial_args}})}\n\n"
|
|
893
|
+
)
|
|
894
|
+
|
|
394
895
|
tool_block_index += 1
|
|
395
896
|
else:
|
|
396
897
|
# Continuation: argument chunks
|
|
@@ -406,6 +907,11 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
406
907
|
|
|
407
908
|
if choice.get("finish_reason"):
|
|
408
909
|
fr = choice["finish_reason"]
|
|
910
|
+
if fr == "length":
|
|
911
|
+
logger.warning(
|
|
912
|
+
"Response truncated by token limit (finish_reason=length). "
|
|
913
|
+
"Consider increasing --n-predict or max_tokens."
|
|
914
|
+
)
|
|
409
915
|
finish_reason = {
|
|
410
916
|
"stop": "end_turn",
|
|
411
917
|
"length": "max_tokens",
|
|
@@ -433,11 +939,40 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
433
939
|
f"data: {json.dumps({'type': 'content_block_stop', 'index': tc['block_index']})}\n\n"
|
|
434
940
|
)
|
|
435
941
|
else:
|
|
942
|
+
# Option E: If the response has no text AND no tool calls, but the
|
|
943
|
+
# model produced reasoning_content, forward the reasoning as visible
|
|
944
|
+
# text so the client doesn't receive a completely empty turn.
|
|
945
|
+
accumulated_text = "".join(text_chunks)
|
|
946
|
+
if not accumulated_text and reasoning_chunks:
|
|
947
|
+
fallback_text = "".join(reasoning_chunks)
|
|
948
|
+
logger.warning(
|
|
949
|
+
"Empty response with %d reasoning tokens – forwarding reasoning as fallback text",
|
|
950
|
+
len(reasoning_chunks),
|
|
951
|
+
)
|
|
952
|
+
text_chunks.append(fallback_text)
|
|
953
|
+
yield (
|
|
954
|
+
f"event: content_block_delta\n"
|
|
955
|
+
f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
|
|
956
|
+
)
|
|
957
|
+
|
|
436
958
|
yield (
|
|
437
959
|
f"event: content_block_stop\n"
|
|
438
960
|
f"data: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
|
|
439
961
|
)
|
|
440
962
|
|
|
963
|
+
# Log response summary
|
|
964
|
+
accumulated_text = "".join(text_chunks)
|
|
965
|
+
tc_names = [tc["name"] for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
|
|
966
|
+
tc_args = [tc.get("arguments", "") for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
|
|
967
|
+
logger.info(
|
|
968
|
+
"RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
|
|
969
|
+
finish_reason, output_tokens,
|
|
970
|
+
len(accumulated_text),
|
|
971
|
+
accumulated_text[:300],
|
|
972
|
+
tc_names,
|
|
973
|
+
[a[:200] for a in tc_args],
|
|
974
|
+
)
|
|
975
|
+
|
|
441
976
|
# message_delta with final stop reason
|
|
442
977
|
yield (
|
|
443
978
|
f"event: message_delta\n"
|
|
@@ -454,10 +989,61 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
|
454
989
|
|
|
455
990
|
@app.post("/v1/messages")
|
|
456
991
|
async def messages(request: Request):
|
|
457
|
-
"""Handle Anthropic Messages API requests (streaming and non-streaming).
|
|
992
|
+
"""Handle Anthropic Messages API requests (streaming and non-streaming).
|
|
993
|
+
|
|
994
|
+
Integrates context management:
|
|
995
|
+
- Option B: HTTP error handling for upstream 4xx/5xx responses
|
|
996
|
+
- Option C: Conversation pruning when approaching context limits
|
|
997
|
+
- Option E: Smart max_tokens capping (in build_openai_request)
|
|
998
|
+
- Option F: Session-level token monitoring with warnings
|
|
999
|
+
"""
|
|
458
1000
|
body = await request.json()
|
|
459
1001
|
model = body.get("model", "default")
|
|
460
1002
|
is_stream = body.get("stream", False)
|
|
1003
|
+
|
|
1004
|
+
# Debug: log request summary
|
|
1005
|
+
n_messages = len(body.get("messages", []))
|
|
1006
|
+
n_tools = len(body.get("tools", []))
|
|
1007
|
+
max_tokens = body.get("max_tokens", "unset")
|
|
1008
|
+
last_msg = body.get("messages", [{}])[-1]
|
|
1009
|
+
last_role = last_msg.get("role", "?")
|
|
1010
|
+
last_content = last_msg.get("content", "")
|
|
1011
|
+
if isinstance(last_content, list):
|
|
1012
|
+
last_text = next((b.get("text", "") for b in last_content if b.get("type") == "text"), "")[:200]
|
|
1013
|
+
elif isinstance(last_content, str):
|
|
1014
|
+
last_text = last_content[:200]
|
|
1015
|
+
else:
|
|
1016
|
+
last_text = str(last_content)[:200]
|
|
1017
|
+
logger.info(
|
|
1018
|
+
"REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
1019
|
+
is_stream, n_messages, n_tools, max_tokens, last_role, last_text
|
|
1020
|
+
)
|
|
1021
|
+
|
|
1022
|
+
# --- Option F: Estimate tokens and record in session monitor ---
|
|
1023
|
+
estimated_tokens = estimate_total_tokens(body)
|
|
1024
|
+
session_monitor.record_request(estimated_tokens)
|
|
1025
|
+
session_monitor.log_status()
|
|
1026
|
+
|
|
1027
|
+
# --- Option C: Prune conversation if approaching context limit ---
|
|
1028
|
+
ctx_window = session_monitor.context_window
|
|
1029
|
+
if ctx_window > 0:
|
|
1030
|
+
utilization = estimated_tokens / ctx_window
|
|
1031
|
+
if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
1032
|
+
logger.warning(
|
|
1033
|
+
"Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
|
|
1034
|
+
utilization * 100, PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
1035
|
+
)
|
|
1036
|
+
body = prune_conversation(body, ctx_window, target_fraction=0.65)
|
|
1037
|
+
session_monitor.prune_count += 1
|
|
1038
|
+
# Re-estimate after pruning
|
|
1039
|
+
estimated_tokens = estimate_total_tokens(body)
|
|
1040
|
+
session_monitor.record_request(estimated_tokens)
|
|
1041
|
+
n_messages = len(body.get("messages", []))
|
|
1042
|
+
logger.info(
|
|
1043
|
+
"After pruning: ~%d tokens, %d messages",
|
|
1044
|
+
estimated_tokens, n_messages,
|
|
1045
|
+
)
|
|
1046
|
+
|
|
461
1047
|
openai_body = build_openai_request(body)
|
|
462
1048
|
|
|
463
1049
|
client = http_client
|
|
@@ -470,15 +1056,127 @@ async def messages(request: Request):
|
|
|
470
1056
|
|
|
471
1057
|
if is_stream:
|
|
472
1058
|
openai_body["stream"] = True
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
1059
|
+
|
|
1060
|
+
# Retry upstream connection after a fixed delay to handle
|
|
1061
|
+
# llama-server restarts gracefully instead of 500-ing to the client.
|
|
1062
|
+
MAX_UPSTREAM_RETRIES = 3
|
|
1063
|
+
RETRY_DELAY_SECS = 5.0
|
|
1064
|
+
last_exc: Exception | None = None
|
|
1065
|
+
|
|
1066
|
+
for attempt in range(MAX_UPSTREAM_RETRIES):
|
|
1067
|
+
try:
|
|
1068
|
+
resp = await client.send(
|
|
1069
|
+
client.build_request(
|
|
1070
|
+
"POST",
|
|
1071
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
1072
|
+
json=openai_body,
|
|
1073
|
+
headers={"Content-Type": "application/json"},
|
|
1074
|
+
),
|
|
1075
|
+
stream=True,
|
|
1076
|
+
)
|
|
1077
|
+
# Connection succeeded – break out of retry loop
|
|
1078
|
+
last_exc = None
|
|
1079
|
+
break
|
|
1080
|
+
except (httpx.ConnectError, httpx.RemoteProtocolError) as exc:
|
|
1081
|
+
last_exc = exc
|
|
1082
|
+
if attempt < MAX_UPSTREAM_RETRIES - 1:
|
|
1083
|
+
logger.warning(
|
|
1084
|
+
"Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
|
|
1085
|
+
attempt + 1, MAX_UPSTREAM_RETRIES,
|
|
1086
|
+
type(exc).__name__, RETRY_DELAY_SECS,
|
|
1087
|
+
)
|
|
1088
|
+
await asyncio.sleep(RETRY_DELAY_SECS)
|
|
1089
|
+
else:
|
|
1090
|
+
logger.error(
|
|
1091
|
+
"Upstream connect failed after %d attempts: %s: %s",
|
|
1092
|
+
MAX_UPSTREAM_RETRIES, type(exc).__name__, exc,
|
|
1093
|
+
)
|
|
1094
|
+
|
|
1095
|
+
if last_exc is not None:
|
|
1096
|
+
return Response(
|
|
1097
|
+
content=json.dumps({
|
|
1098
|
+
"type": "error",
|
|
1099
|
+
"error": {
|
|
1100
|
+
"type": "overloaded_error",
|
|
1101
|
+
"message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
|
|
1102
|
+
},
|
|
1103
|
+
}),
|
|
1104
|
+
status_code=529,
|
|
1105
|
+
media_type="application/json",
|
|
1106
|
+
)
|
|
1107
|
+
|
|
1108
|
+
# --- Option B: Check HTTP status before streaming ---
|
|
1109
|
+
# llama-server returns 400 for context overflow, 500 for internal errors, etc.
|
|
1110
|
+
# Without this check, the proxy would try to stream-translate an error body,
|
|
1111
|
+
# producing an empty response that silently kills the agentic loop.
|
|
1112
|
+
if resp.status_code != 200:
|
|
1113
|
+
error_body = await resp.aread()
|
|
1114
|
+
await resp.aclose()
|
|
1115
|
+
error_text = error_body.decode("utf-8", errors="replace")[:1000]
|
|
1116
|
+
logger.error(
|
|
1117
|
+
"Upstream HTTP %d: %s", resp.status_code, error_text
|
|
1118
|
+
)
|
|
1119
|
+
|
|
1120
|
+
# Parse the error for a user-friendly message
|
|
1121
|
+
error_message = f"Upstream server error (HTTP {resp.status_code})"
|
|
1122
|
+
try:
|
|
1123
|
+
error_json = json.loads(error_body)
|
|
1124
|
+
if "error" in error_json:
|
|
1125
|
+
upstream_error = error_json["error"]
|
|
1126
|
+
if isinstance(upstream_error, dict):
|
|
1127
|
+
error_message = upstream_error.get("message", error_message)
|
|
1128
|
+
else:
|
|
1129
|
+
error_message = str(upstream_error)
|
|
1130
|
+
except (json.JSONDecodeError, KeyError):
|
|
1131
|
+
error_message = error_text[:500] if error_text else error_message
|
|
1132
|
+
|
|
1133
|
+
# Detect context overflow specifically
|
|
1134
|
+
is_context_overflow = (
|
|
1135
|
+
resp.status_code == 400
|
|
1136
|
+
and "exceeds" in error_message.lower()
|
|
1137
|
+
and "context" in error_message.lower()
|
|
1138
|
+
)
|
|
1139
|
+
|
|
1140
|
+
if is_context_overflow:
|
|
1141
|
+
session_monitor.overflow_count += 1
|
|
1142
|
+
logger.error(
|
|
1143
|
+
"CONTEXT OVERFLOW detected (count=%d). "
|
|
1144
|
+
"Estimated input: %d tokens, context window: %d tokens. "
|
|
1145
|
+
"Conversation needs pruning or context window increase.",
|
|
1146
|
+
session_monitor.overflow_count, estimated_tokens, ctx_window,
|
|
1147
|
+
)
|
|
1148
|
+
# Return Anthropic-format error that Claude Code can handle
|
|
1149
|
+
return Response(
|
|
1150
|
+
content=json.dumps({
|
|
1151
|
+
"type": "error",
|
|
1152
|
+
"error": {
|
|
1153
|
+
"type": "overloaded_error",
|
|
1154
|
+
"message": (
|
|
1155
|
+
f"Context window exceeded: request requires ~{estimated_tokens} tokens "
|
|
1156
|
+
f"but only {ctx_window} are available. "
|
|
1157
|
+
f"The conversation is too long. Please start a new session or "
|
|
1158
|
+
f"reduce conversation length."
|
|
1159
|
+
),
|
|
1160
|
+
},
|
|
1161
|
+
}),
|
|
1162
|
+
status_code=529,
|
|
1163
|
+
media_type="application/json",
|
|
1164
|
+
)
|
|
1165
|
+
|
|
1166
|
+
# Generic upstream error -- return as Anthropic error format
|
|
1167
|
+
error_type = "overloaded_error" if resp.status_code >= 500 else "invalid_request_error"
|
|
1168
|
+
return Response(
|
|
1169
|
+
content=json.dumps({
|
|
1170
|
+
"type": "error",
|
|
1171
|
+
"error": {
|
|
1172
|
+
"type": error_type,
|
|
1173
|
+
"message": error_message,
|
|
1174
|
+
},
|
|
1175
|
+
}),
|
|
1176
|
+
status_code=529 if resp.status_code >= 500 else 400,
|
|
1177
|
+
media_type="application/json",
|
|
1178
|
+
)
|
|
1179
|
+
|
|
482
1180
|
return StreamingResponse(
|
|
483
1181
|
stream_anthropic_response(resp, model),
|
|
484
1182
|
media_type="text/event-stream",
|
|
@@ -493,8 +1191,30 @@ async def messages(request: Request):
|
|
|
493
1191
|
json=openai_body,
|
|
494
1192
|
headers={"Content-Type": "application/json"},
|
|
495
1193
|
)
|
|
1194
|
+
|
|
1195
|
+
# Option B: Handle non-streaming errors too
|
|
1196
|
+
if resp.status_code != 200:
|
|
1197
|
+
error_text = resp.text[:1000]
|
|
1198
|
+
logger.error("Upstream HTTP %d (non-stream): %s", resp.status_code, error_text)
|
|
1199
|
+
return Response(
|
|
1200
|
+
content=json.dumps({
|
|
1201
|
+
"type": "error",
|
|
1202
|
+
"error": {
|
|
1203
|
+
"type": "overloaded_error",
|
|
1204
|
+
"message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
|
|
1205
|
+
},
|
|
1206
|
+
}),
|
|
1207
|
+
status_code=529,
|
|
1208
|
+
media_type="application/json",
|
|
1209
|
+
)
|
|
1210
|
+
|
|
496
1211
|
openai_resp = resp.json()
|
|
497
1212
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
1213
|
+
|
|
1214
|
+
# Track output tokens in session monitor
|
|
1215
|
+
output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
|
|
1216
|
+
session_monitor.record_response(output_tokens)
|
|
1217
|
+
|
|
498
1218
|
return anthropic_resp
|
|
499
1219
|
|
|
500
1220
|
|
|
@@ -537,6 +1257,33 @@ async def health():
|
|
|
537
1257
|
}
|
|
538
1258
|
|
|
539
1259
|
|
|
1260
|
+
@app.get("/v1/context")
async def context_status():
    """Option F: Context window monitoring endpoint.

    Returns a snapshot of the module-level ``session_monitor`` for use by
    dashboards and for debugging. The response is a dict with these keys:

    - ``context_window``, ``last_input_tokens``, ``last_output_tokens``,
      ``peak_input_tokens``, ``total_requests``, ``prune_count``,
      ``overflow_count``: copied straight from the monitor's attributes.
    - ``utilization``: ``session_monitor.get_utilization()`` rounded to
      four decimal places.
    - ``utilization_pct``: the same reading multiplied by 100 and formatted
      with one decimal place and a trailing ``%``.
    - ``warning_level``: result of ``session_monitor.get_warning_level()``.
    - ``estimated_turns_remaining``: result of
      ``session_monitor.estimate_turns_remaining()``.
    - ``prune_threshold``: the ``PROXY_CONTEXT_PRUNE_THRESHOLD`` setting.
    - ``recent_history``: the last ten entries of
      ``session_monitor.context_history``.
    """
    warning = session_monitor.get_warning_level()
    turns = session_monitor.estimate_turns_remaining()

    # Read utilization once so the numeric field and the formatted
    # percentage are guaranteed to describe the same value.
    utilization = session_monitor.get_utilization()

    return {
        "context_window": session_monitor.context_window,
        "last_input_tokens": session_monitor.last_input_tokens,
        "last_output_tokens": session_monitor.last_output_tokens,
        "peak_input_tokens": session_monitor.peak_input_tokens,
        "utilization": round(utilization, 4),
        "utilization_pct": f"{utilization * 100:.1f}%",
        "warning_level": warning,
        "estimated_turns_remaining": turns,
        "total_requests": session_monitor.total_requests,
        "prune_count": session_monitor.prune_count,
        "overflow_count": session_monitor.overflow_count,
        "prune_threshold": PROXY_CONTEXT_PRUNE_THRESHOLD,
        # Only the ten most recent history entries are exposed.
        "recent_history": session_monitor.context_history[-10:],
    }
|
|
1285
|
+
|
|
1286
|
+
|
|
540
1287
|
# ===========================================================================
|
|
541
1288
|
# Entry Point
|
|
542
1289
|
# ===========================================================================
|