@miller-tech/uap 1.13.5 → 1.13.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/config/chat_template.jinja +126 -44
- package/config/model-profiles/qwen35.json +3 -3
- package/dist/.tsbuildinfo +1 -1
- package/dist/benchmarks/token-throughput.d.ts +259 -0
- package/dist/benchmarks/token-throughput.d.ts.map +1 -0
- package/dist/benchmarks/token-throughput.js +198 -0
- package/dist/benchmarks/token-throughput.js.map +1 -0
- package/dist/bin/cli.js +12 -0
- package/dist/bin/cli.js.map +1 -1
- package/dist/cli/dashboard.d.ts.map +1 -1
- package/dist/cli/dashboard.js +10 -20
- package/dist/cli/dashboard.js.map +1 -1
- package/dist/cli/init.d.ts.map +1 -1
- package/dist/cli/init.js +5 -0
- package/dist/cli/init.js.map +1 -1
- package/dist/cli/memory.d.ts.map +1 -1
- package/dist/cli/memory.js +9 -18
- package/dist/cli/memory.js.map +1 -1
- package/dist/cli/worktree.d.ts +4 -1
- package/dist/cli/worktree.d.ts.map +1 -1
- package/dist/cli/worktree.js +73 -1
- package/dist/cli/worktree.js.map +1 -1
- package/dist/coordination/adaptive-patterns.d.ts +3 -1
- package/dist/coordination/adaptive-patterns.d.ts.map +1 -1
- package/dist/coordination/adaptive-patterns.js +31 -3
- package/dist/coordination/adaptive-patterns.js.map +1 -1
- package/dist/dashboard/data-service.d.ts +27 -0
- package/dist/dashboard/data-service.d.ts.map +1 -1
- package/dist/dashboard/data-service.js +210 -17
- package/dist/dashboard/data-service.js.map +1 -1
- package/dist/memory/embeddings.d.ts.map +1 -1
- package/dist/memory/embeddings.js +1 -1
- package/dist/memory/embeddings.js.map +1 -1
- package/dist/models/router.js +1 -1
- package/dist/models/router.js.map +1 -1
- package/dist/models/types.js +13 -13
- package/dist/models/types.js.map +1 -1
- package/dist/tasks/coordination.js +1 -1
- package/dist/tasks/coordination.js.map +1 -1
- package/docs/deployment/QWEN35_LLAMA_CPP.md +76 -0
- package/package.json +3 -2
- package/templates/hooks/session-start.sh +91 -51
- package/tools/agents/README.md +22 -0
- package/tools/agents/install-opencode-local.sh.j2 +57 -7
- package/tools/agents/opencode_uap_agent.py +63 -1
- package/tools/agents/scripts/anthropic_proxy.py +1297 -0
- package/tools/agents/scripts/requirements-proxy.txt +5 -0
- package/tools/agents/scripts/tool_call_wrapper.py +9 -5
|
@@ -0,0 +1,1297 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
UAP Anthropic-to-OpenAI Proxy
|
|
4
|
+
==============================
|
|
5
|
+
|
|
6
|
+
A lightweight, production-ready proxy that translates Anthropic Messages API
|
|
7
|
+
requests into OpenAI Chat Completions API requests. Designed for use with
|
|
8
|
+
local LLM servers (llama.cpp, vLLM, Ollama, etc.) that expose an OpenAI-
|
|
9
|
+
compatible endpoint but need to be accessed from clients that speak the
|
|
10
|
+
Anthropic protocol (e.g., Claude Code, Forge Code).
|
|
11
|
+
|
|
12
|
+
Architecture
|
|
13
|
+
------------
|
|
14
|
+
Claude Code --(Anthropic API)--> This Proxy --(OpenAI API)--> llama.cpp
|
|
15
|
+
:4000 :8080
|
|
16
|
+
|
|
17
|
+
Key Features
|
|
18
|
+
- Full streaming support (SSE translation between protocols)
|
|
19
|
+
- Tool/function calling translation (both streaming and non-streaming)
|
|
20
|
+
- Module-level httpx.AsyncClient with connection pooling and keep-alive
|
|
21
|
+
- Granular timeouts (short connect, long read for LLM generation)
|
|
22
|
+
- Graceful error recovery on upstream connection drops
|
|
23
|
+
- Proper upstream cleanup on client disconnect
|
|
24
|
+
- Context window overflow protection with conversation pruning
|
|
25
|
+
- Smart max_tokens capping to prevent next-turn overflow
|
|
26
|
+
- Session-level token monitoring with warnings
|
|
27
|
+
|
|
28
|
+
Configuration (Environment Variables)
|
|
29
|
+
--------------------------------------
|
|
30
|
+
LLAMA_CPP_BASE Base URL of the OpenAI-compatible server
|
|
31
|
+
Default: http://192.168.1.165:8080/v1
|
|
32
|
+
|
|
33
|
+
PROXY_PORT Port for this proxy to listen on
|
|
34
|
+
Default: 4000
|
|
35
|
+
|
|
36
|
+
PROXY_HOST Host/IP to bind to
|
|
37
|
+
Default: 0.0.0.0
|
|
38
|
+
|
|
39
|
+
PROXY_LOG_LEVEL Logging level (DEBUG, INFO, WARNING, ERROR)
|
|
40
|
+
Default: INFO
|
|
41
|
+
|
|
42
|
+
PROXY_READ_TIMEOUT Read timeout in seconds for upstream LLM streaming
|
|
43
|
+
Default: 600 (10 minutes)
|
|
44
|
+
|
|
45
|
+
PROXY_MAX_CONNECTIONS Max concurrent connections to upstream
|
|
46
|
+
Default: 20
|
|
47
|
+
|
|
48
|
+
PROXY_CONTEXT_WINDOW Override context window size (auto-detected from
|
|
49
|
+
upstream /slots endpoint if not set)
|
|
50
|
+
Default: 0 (auto-detect)
|
|
51
|
+
|
|
52
|
+
PROXY_CONTEXT_PRUNE_THRESHOLD Fraction of context window at which
|
|
53
|
+
conversation pruning activates (0.0-1.0)
|
|
54
|
+
Default: 0.75
|
|
55
|
+
|
|
56
|
+
Usage
|
|
57
|
+
-----
|
|
58
|
+
# Basic usage (connects to llama.cpp on default port):
|
|
59
|
+
python anthropic_proxy.py
|
|
60
|
+
|
|
61
|
+
# Custom upstream server:
|
|
62
|
+
LLAMA_CPP_BASE=http://localhost:8080/v1 python anthropic_proxy.py
|
|
63
|
+
|
|
64
|
+
# Custom proxy port:
|
|
65
|
+
PROXY_PORT=5000 python anthropic_proxy.py
|
|
66
|
+
|
|
67
|
+
# Via npx (after npm install):
|
|
68
|
+
npx uap-anthropic-proxy
|
|
69
|
+
|
|
70
|
+
Dependencies
|
|
71
|
+
------------
|
|
72
|
+
pip install fastapi uvicorn httpx
|
|
73
|
+
|
|
74
|
+
Or from the project root:
|
|
75
|
+
pip install -r tools/agents/scripts/requirements-proxy.txt
|
|
76
|
+
"""
|
|
77
|
+
|
|
78
|
+
import asyncio
|
|
79
|
+
import json
|
|
80
|
+
import logging
|
|
81
|
+
import os
|
|
82
|
+
import sys
|
|
83
|
+
import time
|
|
84
|
+
import uuid
|
|
85
|
+
from dataclasses import dataclass, field
|
|
86
|
+
|
|
87
|
+
import httpx
|
|
88
|
+
from contextlib import asynccontextmanager
|
|
89
|
+
from fastapi import FastAPI, Request, Response
|
|
90
|
+
from fastapi.responses import StreamingResponse
|
|
91
|
+
import uvicorn
|
|
92
|
+
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
# Configuration (all configurable via environment variables)
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
# Upstream OpenAI-compatible endpoint that requests are forwarded to.
LLAMA_CPP_BASE = os.getenv("LLAMA_CPP_BASE", "http://192.168.1.165:8080/v1")

# Address this proxy listens on.
PROXY_PORT = int(os.getenv("PROXY_PORT", "4000"))
PROXY_HOST = os.getenv("PROXY_HOST", "0.0.0.0")

# Logging level name; upper-cased so lower-case values are accepted too.
PROXY_LOG_LEVEL = os.getenv("PROXY_LOG_LEVEL", "INFO").upper()

# Upstream client tuning: read timeout in seconds and connection cap.
PROXY_READ_TIMEOUT = float(os.getenv("PROXY_READ_TIMEOUT", "600"))
PROXY_MAX_CONNECTIONS = int(os.getenv("PROXY_MAX_CONNECTIONS", "20"))

# Context handling: explicit window size (0 = auto-detect) and the fraction
# of the window at which conversation pruning activates.
PROXY_CONTEXT_WINDOW = int(os.getenv("PROXY_CONTEXT_WINDOW", "0"))
PROXY_CONTEXT_PRUNE_THRESHOLD = float(os.getenv("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75"))
|
|
104
|
+
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
# Logging
|
|
107
|
+
# ---------------------------------------------------------------------------
|
|
108
|
+
# PROXY_LOG_LEVEL comes from the environment, so it may name a ``logging``
# attribute that is not a level at all (e.g. "BASIC_FORMAT"); passing that to
# basicConfig would raise at import time. Accept only integer levels and fall
# back to INFO for anything else.
_resolved_log_level = getattr(logging, PROXY_LOG_LEVEL, None)
if not isinstance(_resolved_log_level, int):
    _resolved_log_level = logging.INFO

logging.basicConfig(
    level=_resolved_log_level,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("uap.anthropic_proxy")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
# Option F: Session-level Context Window Monitor
|
|
118
|
+
# ---------------------------------------------------------------------------
|
|
119
|
+
@dataclass
|
|
120
|
+
class SessionMonitor:
|
|
121
|
+
"""Tracks token usage across the session to provide early warnings
|
|
122
|
+
and enable proactive context management before overflow occurs."""
|
|
123
|
+
|
|
124
|
+
context_window: int = 0 # Auto-detected or configured
|
|
125
|
+
total_requests: int = 0
|
|
126
|
+
last_input_tokens: int = 0 # Estimated input tokens of last request
|
|
127
|
+
last_output_tokens: int = 0 # Actual output tokens of last response
|
|
128
|
+
peak_input_tokens: int = 0 # High-water mark
|
|
129
|
+
prune_count: int = 0 # How many times pruning was triggered
|
|
130
|
+
overflow_count: int = 0 # How many context overflow errors caught
|
|
131
|
+
context_history: list = field(default_factory=list) # Recent token counts
|
|
132
|
+
|
|
133
|
+
def record_request(self, estimated_tokens: int):
|
|
134
|
+
"""Record an outgoing request's estimated token count."""
|
|
135
|
+
self.total_requests += 1
|
|
136
|
+
self.last_input_tokens = estimated_tokens
|
|
137
|
+
if estimated_tokens > self.peak_input_tokens:
|
|
138
|
+
self.peak_input_tokens = estimated_tokens
|
|
139
|
+
self.context_history.append(estimated_tokens)
|
|
140
|
+
# Keep last 50 entries
|
|
141
|
+
if len(self.context_history) > 50:
|
|
142
|
+
self.context_history = self.context_history[-50:]
|
|
143
|
+
|
|
144
|
+
def record_response(self, output_tokens: int):
|
|
145
|
+
"""Record a response's output token count."""
|
|
146
|
+
self.last_output_tokens = output_tokens
|
|
147
|
+
|
|
148
|
+
def get_utilization(self) -> float:
|
|
149
|
+
"""Get current context utilization as a fraction (0.0 - 1.0)."""
|
|
150
|
+
if self.context_window <= 0:
|
|
151
|
+
return 0.0
|
|
152
|
+
return self.last_input_tokens / self.context_window
|
|
153
|
+
|
|
154
|
+
def get_warning_level(self) -> str | None:
|
|
155
|
+
"""Return warning level based on context utilization.
|
|
156
|
+
Returns None if no warning needed."""
|
|
157
|
+
util = self.get_utilization()
|
|
158
|
+
if util >= 0.95:
|
|
159
|
+
return "CRITICAL"
|
|
160
|
+
elif util >= 0.85:
|
|
161
|
+
return "HIGH"
|
|
162
|
+
elif util >= 0.75:
|
|
163
|
+
return "ELEVATED"
|
|
164
|
+
return None
|
|
165
|
+
|
|
166
|
+
def estimate_turns_remaining(self) -> int | None:
|
|
167
|
+
"""Estimate how many more agentic turns can fit before overflow."""
|
|
168
|
+
if self.context_window <= 0 or len(self.context_history) < 2:
|
|
169
|
+
return None
|
|
170
|
+
# Average growth per turn from recent history
|
|
171
|
+
deltas = [
|
|
172
|
+
self.context_history[i] - self.context_history[i - 1]
|
|
173
|
+
for i in range(1, len(self.context_history))
|
|
174
|
+
if self.context_history[i] > self.context_history[i - 1]
|
|
175
|
+
]
|
|
176
|
+
if not deltas:
|
|
177
|
+
return None
|
|
178
|
+
avg_growth = sum(deltas) / len(deltas)
|
|
179
|
+
if avg_growth <= 0:
|
|
180
|
+
return None
|
|
181
|
+
remaining_tokens = self.context_window - self.last_input_tokens
|
|
182
|
+
return max(0, int(remaining_tokens / avg_growth))
|
|
183
|
+
|
|
184
|
+
def log_status(self):
|
|
185
|
+
"""Log current session status."""
|
|
186
|
+
util = self.get_utilization()
|
|
187
|
+
warning = self.get_warning_level()
|
|
188
|
+
turns = self.estimate_turns_remaining()
|
|
189
|
+
turns_str = f"~{turns} turns remaining" if turns is not None else "unknown"
|
|
190
|
+
|
|
191
|
+
if warning == "CRITICAL":
|
|
192
|
+
logger.error(
|
|
193
|
+
"CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
|
|
194
|
+
self.last_input_tokens, self.context_window, util * 100,
|
|
195
|
+
turns_str, self.prune_count, self.overflow_count,
|
|
196
|
+
)
|
|
197
|
+
elif warning == "HIGH":
|
|
198
|
+
logger.warning(
|
|
199
|
+
"CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
|
|
200
|
+
self.last_input_tokens, self.context_window, util * 100,
|
|
201
|
+
turns_str, self.prune_count,
|
|
202
|
+
)
|
|
203
|
+
elif warning == "ELEVATED":
|
|
204
|
+
logger.warning(
|
|
205
|
+
"CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
|
|
206
|
+
self.last_input_tokens, self.context_window, util * 100,
|
|
207
|
+
turns_str,
|
|
208
|
+
)
|
|
209
|
+
else:
|
|
210
|
+
logger.info(
|
|
211
|
+
"CONTEXT: %d/%d tokens (%.1f%%), %s",
|
|
212
|
+
self.last_input_tokens, self.context_window, util * 100,
|
|
213
|
+
turns_str,
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
session_monitor = SessionMonitor()
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# ---------------------------------------------------------------------------
|
|
221
|
+
# Context Window Detection
|
|
222
|
+
# ---------------------------------------------------------------------------
|
|
223
|
+
async def detect_context_window(client: httpx.AsyncClient) -> int:
    """Determine the context window size (in tokens) to assume for the upstream server.

    Resolution order:
    1. ``PROXY_CONTEXT_WINDOW`` when it is set to a positive value.
    2. The ``n_ctx`` field of the first entry returned by the upstream
       ``/slots`` endpoint.
    3. A fallback of 131072 tokens.

    Args:
        client: HTTP client used to query the upstream server.

    Returns:
        The context window size in tokens (always positive).
    """
    if PROXY_CONTEXT_WINDOW > 0:
        logger.info("Using configured context window: %d tokens", PROXY_CONTEXT_WINDOW)
        return PROXY_CONTEXT_WINDOW

    # Build the /slots URL from the server root. Only a trailing "/v1" path
    # segment is stripped: a blanket str.replace would also rewrite "/v1"
    # elsewhere in the URL and would leave a base without "/v1" unchanged,
    # probing the wrong endpoint in both cases.
    server_root = LLAMA_CPP_BASE.rstrip("/")
    if server_root.endswith("/v1"):
        server_root = server_root[: -len("/v1")]
    slots_url = server_root + "/slots"

    try:
        resp = await client.get(slots_url, timeout=5.0)
        if resp.status_code == 200:
            slots = resp.json()
            if slots and isinstance(slots, list) and isinstance(slots[0], dict):
                n_ctx = slots[0].get("n_ctx", 0)
                if isinstance(n_ctx, int) and n_ctx > 0:
                    logger.info(
                        "Auto-detected context window from upstream: %d tokens (%d slots)",
                        n_ctx, len(slots),
                    )
                    return n_ctx
    except Exception as exc:
        # Best-effort probe: any failure (network, bad JSON, ...) falls
        # through to the default below instead of aborting startup.
        logger.warning("Failed to auto-detect context window: %s", exc)

    # Fallback when nothing could be detected: 128K tokens.
    default = 131072
    logger.warning("Using default context window: %d tokens", default)
    return default
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# ---------------------------------------------------------------------------
|
|
256
|
+
# Option C: Conversation Pruning
|
|
257
|
+
# ---------------------------------------------------------------------------
|
|
258
|
+
# Characters-per-token ratio for estimation. English text averages ~4 chars/token,
|
|
259
|
+
# but tool call JSON and code tend to be denser (~3.2 chars/token).
|
|
260
|
+
CHARS_PER_TOKEN = 3.5


def estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* from its length.

    The character count is divided by ``CHARS_PER_TOKEN`` and truncated;
    the result is never below 1, so an empty string still counts as one
    token.
    """
    approx = int(len(text) / CHARS_PER_TOKEN)
    return approx if approx >= 1 else 1
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def estimate_message_tokens(msg: dict) -> int:
    """Approximate the token count of one Anthropic-format message.

    Starts from a fixed allowance of 4 tokens and adds the estimate for
    the message content: a plain string, or a list whose string items and
    ``text`` / ``tool_use`` / ``tool_result`` blocks are counted. Any other
    content or block type contributes nothing.
    """
    overhead = 4  # Fixed allowance for role and separators
    body = msg.get("content", "")

    if isinstance(body, str):
        return overhead + estimate_tokens(body)
    if not isinstance(body, list):
        return overhead

    total = overhead
    for item in body:
        if isinstance(item, str):
            total += estimate_tokens(item)
            continue
        if not isinstance(item, dict):
            continue
        kind = item.get("type")
        if kind == "text":
            total += estimate_tokens(item.get("text", ""))
        elif kind == "tool_use":
            # Count both the tool name and its serialized arguments.
            total += estimate_tokens(item.get("name", ""))
            total += estimate_tokens(json.dumps(item.get("input", {})))
        elif kind == "tool_result":
            total += estimate_tokens(_extract_text(item.get("content", "")))
    return total
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def estimate_total_tokens(anthropic_body: dict) -> int:
    """Approximate the token count of a whole Anthropic Messages request.

    Sums the estimates for the system prompt (string or list of text
    blocks), the agentic supplement, every message, and the JSON-serialized
    tool definitions when present.
    """
    system = anthropic_body.get("system", "")
    if isinstance(system, str):
        system_tokens = estimate_tokens(system)
    elif isinstance(system, list):
        system_tokens = sum(
            estimate_tokens(part.get("text", ""))
            for part in system
            if isinstance(part, dict) and part.get("type") == "text"
        )
    else:
        system_tokens = 0

    # The supplement is counted unconditionally.
    supplement_tokens = estimate_tokens(_AGENTIC_SYSTEM_SUPPLEMENT)

    message_tokens = sum(
        estimate_message_tokens(message)
        for message in anthropic_body.get("messages", [])
    )

    tool_defs = anthropic_body.get("tools", [])
    tool_tokens = estimate_tokens(json.dumps(tool_defs)) if tool_defs else 0

    return system_tokens + supplement_tokens + message_tokens + tool_tokens
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _build_prune_marker(removed_count: int, removed_tokens: int) -> dict:
    """Build the synthetic user message announcing that history was trimmed."""
    return {
        "role": "user",
        "content": (
            f"[CONTEXT PRUNED: {removed_count} older messages (~{removed_tokens} tokens) "
            f"were removed to fit within the context window. "
            f"The conversation continues from recent context below.]"
        ),
    }


def prune_conversation(anthropic_body: dict, context_window: int, target_fraction: float = 0.65) -> dict:
    """Prune the conversation so its estimated size fits the context window.

    Strategy:
    - The first message and the last ``KEEP_LAST`` (8) messages are always
      kept; the system prompt and tool definitions are never modified.
    - Of the messages in between, as many as fit the remaining budget are
      kept, preferring plain user messages, then assistant messages, then
      messages carrying tool_result blocks; within a category newer
      messages are preferred.
    - Whenever middle messages are dropped, a [CONTEXT PRUNED] user message
      is inserted right after the first message.
    - If the protected messages alone exceed the budget, every middle
      message is dropped and tool_result content longer than 2000
      characters in the protected tail is shortened to its first 1000 and
      last 500 characters.

    The request body (and, in the truncation case, its tool_result blocks)
    is modified in place.

    Args:
        anthropic_body: The full Anthropic request body.
        context_window: Maximum context window in tokens.
        target_fraction: Target utilization after pruning (0.0-1.0).

    Returns:
        The same ``anthropic_body`` object, with ``messages`` possibly replaced.
    """
    messages = anthropic_body.get("messages", [])
    if len(messages) <= 4:
        # Too few messages to prune meaningfully
        return anthropic_body

    target_tokens = int(context_window * target_fraction)

    # Estimate non-message tokens (system, tools, agentic supplement)
    overhead_tokens = 0
    system = anthropic_body.get("system", "")
    if isinstance(system, str):
        overhead_tokens += estimate_tokens(system)
    elif isinstance(system, list):
        for block in system:
            if isinstance(block, dict) and block.get("type") == "text":
                overhead_tokens += estimate_tokens(block.get("text", ""))
    overhead_tokens += estimate_tokens(_AGENTIC_SYSTEM_SUPPLEMENT)
    tools = anthropic_body.get("tools", [])
    if tools:
        overhead_tokens += estimate_tokens(json.dumps(tools))

    # Budget left for messages once the fixed overhead is accounted for
    message_budget = target_tokens - overhead_tokens
    if message_budget <= 0:
        logger.error("System prompt + tools alone exceed target budget!")
        return anthropic_body

    # Always keep the first message and the last KEEP_LAST messages
    KEEP_LAST = 8
    protected_head = messages[:1]
    protected_tail = messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
    middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []

    protected_tokens = sum(estimate_message_tokens(m) for m in protected_head + protected_tail)

    if protected_tokens >= message_budget:
        # Even the protected messages exceed the budget: shorten large
        # tool_result payloads in the tail and drop the whole middle.
        logger.warning(
            "Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
            protected_tokens, message_budget,
        )
        for msg in protected_tail:
            content = msg.get("content", [])
            if isinstance(content, list):
                for block in content:
                    if isinstance(block, dict) and block.get("type") == "tool_result":
                        result_text = _extract_text(block.get("content", ""))
                        if len(result_text) > 2000:
                            block["content"] = result_text[:1000] + "\n...[TRUNCATED]...\n" + result_text[-500:]
        if middle:
            # Dropped history must be announced here as well, otherwise the
            # model silently loses the middle of the conversation.
            dropped_tokens = sum(estimate_message_tokens(m) for m in middle)
            marker = _build_prune_marker(len(middle), dropped_tokens)
            anthropic_body["messages"] = protected_head + [marker] + protected_tail
        else:
            anthropic_body["messages"] = protected_head + protected_tail
        return anthropic_body

    remaining_budget = message_budget - protected_tokens

    # Score middle messages for removal priority (lower = removed first):
    #   0 - messages containing tool_result blocks
    #   1 - assistant messages
    #   2 - everything else (user messages)
    scored_middle = []
    for i, msg in enumerate(middle):
        content = msg.get("content", [])
        tokens = estimate_message_tokens(msg)
        is_tool_result = False
        is_assistant = msg.get("role") == "assistant"

        if isinstance(content, list):
            is_tool_result = any(
                isinstance(b, dict) and b.get("type") == "tool_result"
                for b in content
            )

        if is_tool_result:
            priority = 0
        elif is_assistant:
            priority = 1
        else:
            priority = 2

        scored_middle.append((priority, i, tokens, msg))

    # Sort by priority (ascending = remove first), then by index (oldest first)
    scored_middle.sort(key=lambda x: (x[0], x[1]))

    # Walk from the most valuable/newest end and greedily keep whatever
    # still fits; a message that does not fit is skipped, not a stop signal.
    kept_middle = []
    used_tokens = 0
    for priority, idx, tokens, msg in reversed(scored_middle):
        if used_tokens + tokens <= remaining_budget:
            kept_middle.append((idx, msg))
            used_tokens += tokens

    # Restore chronological order of the survivors
    kept_middle.sort(key=lambda x: x[0])
    kept_msgs = [m for _, m in kept_middle]

    removed_count = len(middle) - len(kept_msgs)
    removed_tokens = sum(t for _, _, t, _ in scored_middle) - used_tokens

    if removed_count > 0:
        prune_marker = _build_prune_marker(removed_count, removed_tokens)
        anthropic_body["messages"] = protected_head + [prune_marker] + kept_msgs + protected_tail
        logger.warning(
            "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
            "target=%.0f%% of %d ctx",
            removed_count, removed_tokens, len(anthropic_body["messages"]),
            target_fraction * 100, context_window,
        )
    else:
        anthropic_body["messages"] = protected_head + kept_msgs + protected_tail

    return anthropic_body
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
# ---------------------------------------------------------------------------
|
|
462
|
+
# HTTP Client Lifecycle
|
|
463
|
+
# ---------------------------------------------------------------------------
|
|
464
|
+
# Module-level httpx.AsyncClient for connection reuse + keep-alive.
|
|
465
|
+
# Granular timeouts: short connect, long read for streaming LLM output.
|
|
466
|
+
http_client: httpx.AsyncClient | None = None
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared httpx client on startup and close it on shutdown.

    On startup the module-level ``http_client`` is created with the
    configured timeouts and connection limits, and the context window is
    detected and stored on ``session_monitor``. The client is closed and
    reset to ``None`` when the context exits, including when startup or
    the application run raises.
    """
    global http_client
    http_client = httpx.AsyncClient(
        timeout=httpx.Timeout(
            connect=10.0,             # 10s to establish connection
            read=PROXY_READ_TIMEOUT,  # configurable (default 10 min)
            write=30.0,               # 30s to send the request body
            pool=10.0,                # 10s to acquire a pool connection
        ),
        limits=httpx.Limits(
            max_connections=PROXY_MAX_CONNECTIONS,
            max_keepalive_connections=PROXY_MAX_CONNECTIONS // 2,
            keepalive_expiry=120,
        ),
    )
    try:
        logger.info(
            "Proxy started: listening on %s:%d -> upstream %s",
            PROXY_HOST, PROXY_PORT, LLAMA_CPP_BASE,
        )

        # Auto-detect context window from upstream server
        session_monitor.context_window = await detect_context_window(http_client)
        logger.info(
            "Context window: %d tokens, prune threshold: %.0f%%",
            session_monitor.context_window,
            PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
        )

        yield
    finally:
        # Runs on normal shutdown and when an exception is thrown into the
        # generator, so pooled connections are always released.
        await http_client.aclose()
        http_client = None
        logger.info("Proxy shut down")
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
# ASGI application object; ``lifespan`` is registered as its lifespan handler.
app = FastAPI(
    lifespan=lifespan,
    version="1.0.0",
    title="UAP Anthropic Proxy",
    description="Translates Anthropic Messages API to OpenAI Chat Completions API",
)
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
# ===========================================================================
|
|
514
|
+
# Request Translation: Anthropic -> OpenAI
|
|
515
|
+
# ===========================================================================
|
|
516
|
+
|
|
517
|
+
def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
    """Convert Anthropic message format to OpenAI message format.

    Handles:
    - System prompt (string or content block array) -> leading system message
    - Text content blocks (joined with newlines)
    - Tool use blocks -> ``tool_calls`` on a single assistant message
    - Tool result blocks -> ``tool`` role messages

    For each Anthropic message the output order is: the assistant message
    carrying all tool calls (with the message's text as its content when
    the source role is ``assistant``), then the tool results, then any
    remaining text under the original role. Keeping every tool call of a
    turn in one assistant message, with nothing between it and the
    following tool results, is what the OpenAI format expects.

    Blocks that are neither strings nor dicts, and dict blocks of other
    types, are ignored.

    Args:
        anthropic_body: Anthropic Messages API request body.

    Returns:
        List of OpenAI chat messages.

    Raises:
        KeyError: If a message has no ``role`` or a tool_use block has no ``name``.
    """
    messages: list[dict] = []

    # Anthropic has system as a top-level param
    system = anthropic_body.get("system")
    if isinstance(system, str):
        if system:
            messages.append({"role": "system", "content": system})
    elif isinstance(system, list):
        text = "\n".join(
            b.get("text", "")
            for b in system
            if isinstance(b, dict) and b.get("type") == "text"
        )
        if text:
            messages.append({"role": "system", "content": text})

    for msg in anthropic_body.get("messages", []):
        role = msg["role"]
        content = msg.get("content")

        if isinstance(content, str):
            messages.append({"role": role, "content": content})
            continue
        if not isinstance(content, list):
            continue

        parts: list[str] = []
        tool_calls: list[dict] = []
        tool_results: list[dict] = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
                continue
            if not isinstance(block, dict):
                continue
            block_type = block.get("type")
            if block_type == "text":
                parts.append(block.get("text", ""))
            elif block_type == "tool_use":
                tool_calls.append({
                    "id": block.get("id", f"call_{uuid.uuid4().hex[:8]}"),
                    "type": "function",
                    "function": {
                        "name": block["name"],
                        "arguments": json.dumps(block.get("input", {})),
                    },
                })
            elif block_type == "tool_result":
                tool_results.append({
                    "role": "tool",
                    "tool_call_id": block.get("tool_use_id", ""),
                    "content": _extract_text(block.get("content", "")),
                })

        if tool_calls:
            call_message = {"role": "assistant", "content": None, "tool_calls": tool_calls}
            if role == "assistant" and parts:
                # Fold the assistant's own text into the tool-call message so
                # no extra message ends up between the call and its results.
                call_message["content"] = "\n".join(parts)
                parts = []
            messages.append(call_message)
        messages.extend(tool_results)
        if parts:
            messages.append({"role": role, "content": "\n".join(parts)})

    return messages
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def _extract_text(content) -> str:
    """Extract plain text from Anthropic content (string, list, or other).

    - ``None`` yields an empty string, so a null payload does not turn
      into the literal text "None".
    - A string is returned unchanged.
    - A list is joined with newlines: dict items contribute their ``text``
      value (empty when missing or null), other items their ``str()`` form.
    - Anything else is converted with ``str()``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for item in content:
            if isinstance(item, dict):
                text = item.get("text", "")
                # Coerce so a null or non-string "text" cannot break the join.
                pieces.append("" if text is None else str(text))
            else:
                pieces.append(str(item))
        return "\n".join(pieces)
    return str(content)
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
_AGENTIC_SYSTEM_SUPPLEMENT = (
|
|
592
|
+
"\n\n<agentic-protocol>\n"
|
|
593
|
+
"You are operating in an agentic coding loop with tool access. Follow these rules:\n"
|
|
594
|
+
"1. ALWAYS use tools to read, edit, write, and test code. Never just describe or explain what should be done.\n"
|
|
595
|
+
"2. After reading files and identifying an issue, proceed IMMEDIATELY to make the fix using Edit/Write tools. Do NOT stop after explaining the problem.\n"
|
|
596
|
+
"3. After making changes, run the relevant tests or build commands to verify your fix.\n"
|
|
597
|
+
"4. Only produce a final text response WITHOUT tool calls when the ENTIRE task is fully complete, verified, and you have nothing left to do.\n"
|
|
598
|
+
"5. If you have identified a problem but have not yet fixed it, you MUST call a tool to make the fix. Do NOT summarize the issue and stop.\n"
|
|
599
|
+
"6. When the user asks you to do something, DO it with tools. Do not ask for permission or confirmation.\n"
|
|
600
|
+
"7. If a tool call fails, analyze the error and try a different approach. Do not give up after one failure.\n"
|
|
601
|
+
"</agentic-protocol>"
|
|
602
|
+
)
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def build_openai_request(anthropic_body: dict) -> dict:
    """Build an OpenAI Chat Completions request from an Anthropic Messages request.

    Steps performed, in order:
      1. Copy ``model`` / ``stream`` and convert the message list via
         ``anthropic_to_openai_messages``.
      2. Append ``_AGENTIC_SYSTEM_SUPPLEMENT`` to the leading system message,
         or insert a system message when there is none.
      3. Raise ``max_tokens`` to a floor of 16384, then cap it against the
         context window reported by ``session_monitor``.
      4. Copy ``temperature``, ``top_p`` and ``stop_sequences`` (as ``stop``).
      5. Convert Anthropic tool definitions to OpenAI function tools and,
         when at least one tool exists, optionally force
         ``tool_choice="required"``.

    Args:
        anthropic_body: The decoded Anthropic Messages request body.

    Returns:
        A dict suitable as the JSON body of a ``/chat/completions`` call.
    """
    openai_body = {
        "model": anthropic_body.get("model", "default"),
        "messages": anthropic_to_openai_messages(anthropic_body),
        "stream": anthropic_body.get("stream", False),
    }

    # Inject agentic protocol instructions into the system message so
    # the model knows it must use tools to complete work, not just explain.
    converted = openai_body["messages"]
    if converted and converted[0].get("role") == "system":
        converted[0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
    else:
        # No system message from the client; inject one.
        converted.insert(0, {
            "role": "system",
            "content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
        })

    # An explicit null max_tokens is treated like an absent one; comparing
    # None with an int below would raise TypeError.
    if anthropic_body.get("max_tokens") is not None:
        # Enforce minimum floor for thinking mode: the model needs tokens for
        # its reasoning section plus the actual response/tool calls.
        # Claude Code typically sends 4096-8192 which is too low for thinking.
        requested_max = max(anthropic_body["max_tokens"], 16384)

        # Option E: Smart max_tokens capping -- prevent the response from
        # consuming so many tokens that the NEXT turn's input won't fit.
        # Formula: max_tokens = min(requested, context_window - input_tokens - safety_margin)
        ctx_window = session_monitor.context_window
        if ctx_window > 0:
            estimated_input = estimate_total_tokens(anthropic_body)
            # Reserve 15% of context for next-turn growth (tool results, etc.)
            safety_margin = int(ctx_window * 0.15)
            available_for_output = ctx_window - estimated_input - safety_margin
            if 1024 < available_for_output < requested_max:
                logger.info(
                    "MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
                    requested_max, available_for_output,
                    ctx_window, estimated_input, safety_margin,
                )
                requested_max = available_for_output
            elif available_for_output <= 1024:
                # Very tight on space -- allow minimum but warn
                logger.warning(
                    "MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
                    "Response may be truncated.",
                    available_for_output, ctx_window, estimated_input,
                )
                requested_max = max(1024, available_for_output)

        openai_body["max_tokens"] = requested_max

    if "temperature" in anthropic_body:
        openai_body["temperature"] = anthropic_body["temperature"]
    if "top_p" in anthropic_body:
        openai_body["top_p"] = anthropic_body["top_p"]
    if "stop_sequences" in anthropic_body:
        openai_body["stop"] = anthropic_body["stop_sequences"]

    # Convert Anthropic tools to OpenAI function-calling tools
    if "tools" in anthropic_body:
        openai_body["tools"] = [
            {
                "type": "function",
                "function": {
                    "name": tool["name"],
                    "description": tool.get("description", ""),
                    "parameters": tool.get("input_schema", {}),
                },
            }
            for tool in (anthropic_body["tools"] or [])
        ]

        # Smart tool_choice: force tool calls during the agentic loop to
        # prevent the model from producing text-only end_turn responses that
        # prematurely stop the loop.
        #
        # Force "required" when:
        #   - the last assistant message was text-only, or
        #   - the conversation already contains tool_result blocks and has
        #     more than two messages (active agentic loop).
        #
        # Only done when at least one tool was actually converted: requiring
        # a tool call with an empty tool list leaves the model nothing to call.
        if openai_body["tools"]:
            conversation = anthropic_body.get("messages") or []
            n_msgs = len(conversation)
            has_tool_results = any(
                isinstance(m.get("content"), list) and any(
                    isinstance(b, dict) and b.get("type") == "tool_result"
                    for b in m.get("content", [])
                )
                for m in conversation
            )
            if _last_assistant_was_text_only(anthropic_body):
                openai_body["tool_choice"] = "required"
                logger.info("tool_choice forced to 'required' (last assistant was text-only)")
            elif has_tool_results and n_msgs > 2:
                openai_body["tool_choice"] = "required"
                logger.info("tool_choice forced to 'required' (active agentic loop with tool results)")

    return openai_body
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
706
|
+
"""Check if the last assistant message in the conversation was text-only
|
|
707
|
+
(no tool_use blocks). This indicates the model may be prematurely ending
|
|
708
|
+
the agentic loop by explaining instead of acting."""
|
|
709
|
+
messages = anthropic_body.get("messages", [])
|
|
710
|
+
# Walk backwards to find the last assistant message
|
|
711
|
+
for msg in reversed(messages):
|
|
712
|
+
if msg.get("role") != "assistant":
|
|
713
|
+
continue
|
|
714
|
+
content = msg.get("content")
|
|
715
|
+
if isinstance(content, str):
|
|
716
|
+
# Pure text assistant message -- text-only
|
|
717
|
+
return bool(content.strip())
|
|
718
|
+
if isinstance(content, list):
|
|
719
|
+
has_tool_use = any(
|
|
720
|
+
isinstance(b, dict) and b.get("type") == "tool_use"
|
|
721
|
+
for b in content
|
|
722
|
+
)
|
|
723
|
+
has_text = any(
|
|
724
|
+
(isinstance(b, dict) and b.get("type") == "text" and b.get("text", "").strip())
|
|
725
|
+
or isinstance(b, str)
|
|
726
|
+
for b in content
|
|
727
|
+
)
|
|
728
|
+
# Text-only if there's text but no tool_use
|
|
729
|
+
return has_text and not has_tool_use
|
|
730
|
+
return False
|
|
731
|
+
return False
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
# ===========================================================================
|
|
735
|
+
# Response Translation: OpenAI -> Anthropic
|
|
736
|
+
# ===========================================================================
|
|
737
|
+
|
|
738
|
+
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
    """Convert an OpenAI Chat Completions response to Anthropic Messages format.

    Args:
        openai_resp: Decoded (non-streaming) Chat Completions response.
        model: Model name to echo back in the Anthropic response.

    Returns:
        An Anthropic ``message`` dict with ``text`` / ``tool_use`` content
        blocks, a mapped ``stop_reason`` and token usage. When the upstream
        message carries neither text nor tool calls, a single empty text
        block is returned.
    """
    # Upstream servers may send explicit nulls or empty lists for these
    # fields, so fall back with `or` rather than relying on .get() defaults.
    choices = openai_resp.get("choices") or [{}]
    choice = choices[0] or {}
    message = choice.get("message") or {}
    finish = choice.get("finish_reason") or "stop"

    content = []
    if message.get("content"):
        content.append({"type": "text", "text": message["content"]})

    # Convert tool calls
    for tc in message.get("tool_calls") or []:
        fn = tc.get("function") or {}
        raw_args = fn.get("arguments")
        if isinstance(raw_args, dict):
            # Arguments already arrived as a decoded object.
            args = raw_args
        else:
            try:
                args = json.loads(raw_args) if raw_args else {}
            except (json.JSONDecodeError, TypeError):
                args = {}
            # A tool_use "input" is emitted as a JSON object; drop anything else.
            if not isinstance(args, dict):
                args = {}
        content.append({
            "type": "tool_use",
            # Synthesize an id when upstream omits it or sends null.
            "id": tc.get("id") or f"toolu_{uuid.uuid4().hex[:12]}",
            "name": fn.get("name") or "",
            "input": args,
        })

    stop_reason_map = {
        "stop": "end_turn",
        "length": "max_tokens",
        "tool_calls": "tool_use",
        "function_call": "tool_use",
    }

    usage = openai_resp.get("usage") or {}

    return {
        "id": f"msg_{uuid.uuid4().hex[:24]}",
        "type": "message",
        "role": "assistant",
        "content": content if content else [{"type": "text", "text": ""}],
        "model": model,
        "stop_reason": stop_reason_map.get(finish, "end_turn"),
        "stop_sequence": None,
        "usage": {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        },
    }
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
# ===========================================================================
|
|
787
|
+
# Streaming Translation: OpenAI SSE -> Anthropic SSE
|
|
788
|
+
# ===========================================================================
|
|
789
|
+
|
|
790
|
+
async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
    """Convert an OpenAI streaming response to Anthropic SSE stream format.

    Handles:
    - Text content deltas -> content_block_delta (text_delta)
    - Tool call deltas -> content_block_start (tool_use) + input_json_delta
    - Graceful error recovery on upstream connection drops
    - Proper upstream response closure on client disconnect

    Args:
        openai_stream: Open upstream response whose body is an OpenAI-style
            SSE stream; it is always closed before the final events are sent.
        model: Model name reported in the ``message_start`` event.

    Yields:
        Anthropic SSE frames as strings (``event: ...`` / ``data: ...``),
        starting with ``message_start`` and ending with ``message_stop``.
    """

    def sse(event: str, payload: dict) -> str:
        """Format one server-sent event frame."""
        return f"event: {event}\ndata: {json.dumps(payload)}\n\n"

    # Same mapping as the non-streaming translator, including the legacy
    # "function_call" finish reason.
    finish_map = {
        "stop": "end_turn",
        "length": "max_tokens",
        "tool_calls": "tool_use",
        "function_call": "tool_use",
    }

    msg_id = f"msg_{uuid.uuid4().hex[:24]}"

    # message_start
    yield sse("message_start", {
        "type": "message_start",
        "message": {
            "id": msg_id,
            "type": "message",
            "role": "assistant",
            "content": [],
            "model": model,
            "stop_reason": None,
            "stop_sequence": None,
            "usage": {"input_tokens": 0, "output_tokens": 0},
        },
    })

    # content_block_start for text (index 0)
    yield sse("content_block_start", {
        "type": "content_block_start",
        "index": 0,
        "content_block": {"type": "text", "text": ""},
    })

    yield sse("ping", {"type": "ping"})

    output_tokens = 0
    finish_reason = "end_turn"

    # Track tool call state for streaming tool_calls
    tool_calls_by_index: dict[int, dict] = {}
    tool_block_index = 1  # anthropic block index (0 = text)
    text_chunks: list[str] = []  # accumulate text for logging
    reasoning_chunks: list[str] = []  # accumulate reasoning for fallback

    try:
        async for line in openai_stream.aiter_lines():
            if not line.startswith("data: "):
                continue
            data = line[6:].strip()
            if data == "[DONE]":
                break
            try:
                chunk = json.loads(data)
            except json.JSONDecodeError:
                continue
            if not isinstance(chunk, dict):
                continue

            # Tolerate explicit nulls for "choices"/"delta": without the
            # fallbacks an AttributeError would land in the generic handler
            # below and silently cut the stream short.
            choice = (chunk.get("choices") or [{}])[0] or {}
            delta = choice.get("delta") or {}

            # Collect reasoning_content (normally stripped; used as fallback
            # if the model produces only reasoning with no visible output)
            reasoning = delta.get("reasoning_content", "")
            if reasoning:
                reasoning_chunks.append(reasoning)

            # Handle text content deltas
            if delta.get("content"):
                output_tokens += 1  # rough token estimate: one per delta
                text_chunks.append(delta["content"])
                yield sse("content_block_delta", {
                    "type": "content_block_delta",
                    "index": 0,
                    "delta": {"type": "text_delta", "text": delta["content"]},
                })

            # Handle tool_calls deltas
            for tc_delta in delta.get("tool_calls") or []:
                tc_idx = tc_delta.get("index", 0)
                fn = tc_delta.get("function") or {}

                if tc_idx not in tool_calls_by_index:
                    # New tool call starting
                    tc_id = tc_delta.get("id") or f"toolu_{uuid.uuid4().hex[:12]}"
                    tc_name = fn.get("name") or ""
                    initial_args = fn.get("arguments") or ""
                    tool_calls_by_index[tc_idx] = {
                        "id": tc_id,
                        "name": tc_name,
                        "arguments": initial_args,
                        "block_index": tool_block_index,
                    }

                    # Close text block before first tool block
                    if tool_block_index == 1:
                        yield sse("content_block_stop", {
                            "type": "content_block_stop",
                            "index": 0,
                        })

                    # Emit content_block_start for this tool_use
                    yield sse("content_block_start", {
                        "type": "content_block_start",
                        "index": tool_block_index,
                        "content_block": {
                            "type": "tool_use",
                            "id": tc_id,
                            "name": tc_name,
                        },
                    })

                    # Emit the initial arguments fragment (e.g. "{") that
                    # arrives with the first tool_call chunk. Without this
                    # the opening brace is swallowed and the client receives
                    # invalid JSON like "command":"ls"} instead of
                    # {"command":"ls"}.
                    if initial_args:
                        yield sse("content_block_delta", {
                            "type": "content_block_delta",
                            "index": tool_block_index,
                            "delta": {
                                "type": "input_json_delta",
                                "partial_json": initial_args,
                            },
                        })

                    tool_block_index += 1
                else:
                    # Continuation: argument chunks
                    arg_chunk = fn.get("arguments") or ""
                    if arg_chunk:
                        tool_calls_by_index[tc_idx]["arguments"] += arg_chunk
                        bidx = tool_calls_by_index[tc_idx]["block_index"]
                        yield sse("content_block_delta", {
                            "type": "content_block_delta",
                            "index": bidx,
                            "delta": {
                                "type": "input_json_delta",
                                "partial_json": arg_chunk,
                            },
                        })

            if choice.get("finish_reason"):
                fr = choice["finish_reason"]
                if fr == "length":
                    logger.warning(
                        "Response truncated by token limit (finish_reason=length). "
                        "Consider increasing --n-predict or max_tokens."
                    )
                finish_reason = finish_map.get(fr, "end_turn")

    except (httpx.ReadError, httpx.RemoteProtocolError, httpx.StreamClosed) as exc:
        logger.warning("Upstream stream error: %s: %s", type(exc).__name__, exc)
        finish_reason = "end_turn"
    except asyncio.CancelledError:
        logger.info("Client disconnected, closing upstream stream")
        raise
    except Exception as exc:
        # Best effort: finish the Anthropic stream cleanly with whatever was
        # already forwarded rather than breaking the client mid-message.
        logger.error("Unexpected stream error: %s: %s", type(exc).__name__, exc)
        finish_reason = "end_turn"
    finally:
        # Always close the upstream response to stop LLM generation
        await openai_stream.aclose()

    # Close any open tool call blocks
    if tool_calls_by_index:
        for tc in tool_calls_by_index.values():
            yield sse("content_block_stop", {
                "type": "content_block_stop",
                "index": tc["block_index"],
            })
    else:
        # Option E: If the response has no text AND no tool calls, but the
        # model produced reasoning_content, forward the reasoning as visible
        # text so the client doesn't receive a completely empty turn.
        if not text_chunks and reasoning_chunks:
            fallback_text = "".join(reasoning_chunks)
            logger.warning(
                "Empty response with %d reasoning tokens – forwarding reasoning as fallback text",
                len(reasoning_chunks),
            )
            text_chunks.append(fallback_text)
            yield sse("content_block_delta", {
                "type": "content_block_delta",
                "index": 0,
                "delta": {"type": "text_delta", "text": fallback_text},
            })

        # The text block (index 0) was never closed in this branch.
        yield sse("content_block_stop", {"type": "content_block_stop", "index": 0})

    # Log response summary
    accumulated_text = "".join(text_chunks)
    tc_names = [tc["name"] for tc in tool_calls_by_index.values()]
    tc_args = [tc.get("arguments", "") for tc in tool_calls_by_index.values()]
    logger.info(
        "RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
        finish_reason, output_tokens,
        len(accumulated_text),
        accumulated_text[:300],
        tc_names,
        [a[:200] for a in tc_args],
    )

    # message_delta with final stop reason
    yield sse("message_delta", {
        "type": "message_delta",
        "delta": {"stop_reason": finish_reason, "stop_sequence": None},
        "usage": {"output_tokens": output_tokens},
    })

    # message_stop
    yield sse("message_stop", {"type": "message_stop"})
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
# ===========================================================================
|
|
987
|
+
# API Endpoints
|
|
988
|
+
# ===========================================================================
|
|
989
|
+
|
|
990
|
+
@app.post("/v1/messages")
async def messages(request: Request):
    """Handle Anthropic Messages API requests (streaming and non-streaming).

    Integrates context management:
    - Option B: HTTP error handling for upstream 4xx/5xx responses
    - Option C: Conversation pruning when approaching context limits
    - Option E: Smart max_tokens capping (in build_openai_request)
    - Option F: Session-level token monitoring with warnings

    Args:
        request: Incoming request whose JSON body is an Anthropic Messages
            payload.

    Returns:
        A ``StreamingResponse`` of Anthropic SSE events when ``stream`` is
        set, the translated Anthropic response dict otherwise, or a JSON
        error ``Response`` (503 when the HTTP client is not initialised,
        529/400 for upstream failures).
    """

    def error_response(status_code: int, error_type: str, message: str) -> Response:
        """Build an Anthropic-format JSON error response."""
        return Response(
            content=json.dumps({
                "type": "error",
                "error": {
                    "type": error_type,
                    "message": message,
                },
            }),
            status_code=status_code,
            media_type="application/json",
        )

    body = await request.json()
    model = body.get("model", "default")
    is_stream = body.get("stream", False)

    # Debug: log request summary. This is diagnostics only, so it must never
    # raise: an empty message list, null fields and non-dict content blocks
    # are all tolerated.
    conversation = body.get("messages") or []
    n_messages = len(conversation)
    n_tools = len(body.get("tools") or [])
    max_tokens = body.get("max_tokens", "unset")
    last_msg = conversation[-1] if conversation else {}
    if not isinstance(last_msg, dict):
        last_msg = {}
    last_role = last_msg.get("role", "?")
    last_content = last_msg.get("content", "")
    if isinstance(last_content, list):
        first_text = next(
            (
                b.get("text", "")
                for b in last_content
                if isinstance(b, dict) and b.get("type") == "text"
            ),
            "",
        )
        last_text = str(first_text or "")[:200]
    elif isinstance(last_content, str):
        last_text = last_content[:200]
    else:
        last_text = str(last_content)[:200]
    logger.info(
        "REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
        is_stream, n_messages, n_tools, max_tokens, last_role, last_text
    )

    # --- Option F: Estimate tokens and record in session monitor ---
    estimated_tokens = estimate_total_tokens(body)
    session_monitor.record_request(estimated_tokens)
    session_monitor.log_status()

    # --- Option C: Prune conversation if approaching context limit ---
    ctx_window = session_monitor.context_window
    if ctx_window > 0:
        utilization = estimated_tokens / ctx_window
        if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
            logger.warning(
                "Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
                utilization * 100, PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
            )
            body = prune_conversation(body, ctx_window, target_fraction=0.65)
            session_monitor.prune_count += 1
            # Re-estimate after pruning
            estimated_tokens = estimate_total_tokens(body)
            session_monitor.record_request(estimated_tokens)
            n_messages = len(body.get("messages") or [])
            logger.info(
                "After pruning: ~%d tokens, %d messages",
                estimated_tokens, n_messages,
            )

    openai_body = build_openai_request(body)

    client = http_client
    if client is None:
        return Response(
            content=json.dumps({"error": "Proxy not initialized"}),
            status_code=503,
            media_type="application/json",
        )

    if is_stream:
        openai_body["stream"] = True

        # Retry upstream connection with backoff to handle
        # llama-server restarts gracefully instead of 500-ing to the client.
        MAX_UPSTREAM_RETRIES = 3
        RETRY_DELAY_SECS = 5.0
        last_exc: Exception | None = None

        for attempt in range(MAX_UPSTREAM_RETRIES):
            try:
                resp = await client.send(
                    client.build_request(
                        "POST",
                        f"{LLAMA_CPP_BASE}/chat/completions",
                        json=openai_body,
                        headers={"Content-Type": "application/json"},
                    ),
                    stream=True,
                )
                # Connection succeeded -- break out of retry loop
                last_exc = None
                break
            except (httpx.ConnectError, httpx.RemoteProtocolError) as exc:
                last_exc = exc
                if attempt < MAX_UPSTREAM_RETRIES - 1:
                    logger.warning(
                        "Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
                        attempt + 1, MAX_UPSTREAM_RETRIES,
                        type(exc).__name__, RETRY_DELAY_SECS,
                    )
                    await asyncio.sleep(RETRY_DELAY_SECS)
                else:
                    logger.error(
                        "Upstream connect failed after %d attempts: %s: %s",
                        MAX_UPSTREAM_RETRIES, type(exc).__name__, exc,
                    )

        if last_exc is not None:
            return error_response(
                529,
                "overloaded_error",
                f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
            )

        # --- Option B: Check HTTP status before streaming ---
        # llama-server returns 400 for context overflow, 500 for internal errors, etc.
        # Without this check, the proxy would try to stream-translate an error body,
        # producing an empty response that silently kills the agentic loop.
        if resp.status_code != 200:
            error_body = await resp.aread()
            await resp.aclose()
            error_text = error_body.decode("utf-8", errors="replace")[:1000]
            logger.error(
                "Upstream HTTP %d: %s", resp.status_code, error_text
            )

            # Parse the error for a user-friendly message
            error_message = f"Upstream server error (HTTP {resp.status_code})"
            try:
                error_json = json.loads(error_body)
            except ValueError:
                # Not JSON (or not decodable): fall back to the raw text.
                error_message = error_text[:500] if error_text else error_message
            else:
                # Only a JSON object can carry an "error" member; other JSON
                # values keep the generic message.
                if isinstance(error_json, dict) and "error" in error_json:
                    upstream_error = error_json["error"]
                    if isinstance(upstream_error, dict):
                        error_message = str(upstream_error.get("message") or error_message)
                    else:
                        error_message = str(upstream_error)

            # Detect context overflow specifically
            lowered = error_message.lower()
            is_context_overflow = (
                resp.status_code == 400
                and "exceeds" in lowered
                and "context" in lowered
            )

            if is_context_overflow:
                session_monitor.overflow_count += 1
                logger.error(
                    "CONTEXT OVERFLOW detected (count=%d). "
                    "Estimated input: %d tokens, context window: %d tokens. "
                    "Conversation needs pruning or context window increase.",
                    session_monitor.overflow_count, estimated_tokens, ctx_window,
                )
                # Return Anthropic-format error that Claude Code can handle
                return error_response(
                    529,
                    "overloaded_error",
                    (
                        f"Context window exceeded: request requires ~{estimated_tokens} tokens "
                        f"but only {ctx_window} are available. "
                        f"The conversation is too long. Please start a new session or "
                        f"reduce conversation length."
                    ),
                )

            # Generic upstream error -- return as Anthropic error format
            if resp.status_code >= 500:
                return error_response(529, "overloaded_error", error_message)
            return error_response(400, "invalid_request_error", error_message)

        return StreamingResponse(
            stream_anthropic_response(resp, model),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
            },
        )

    # Non-streaming request.
    try:
        resp = await client.post(
            f"{LLAMA_CPP_BASE}/chat/completions",
            json=openai_body,
            headers={"Content-Type": "application/json"},
        )
    except (httpx.ConnectError, httpx.RemoteProtocolError) as exc:
        # Same failure class the streaming branch handles: report it in
        # Anthropic error format instead of letting it escape as a bare 500.
        logger.error(
            "Upstream connect failed (non-stream): %s: %s", type(exc).__name__, exc
        )
        return error_response(
            529, "overloaded_error", f"Upstream server unavailable: {exc}"
        )

    # Option B: Handle non-streaming errors too
    if resp.status_code != 200:
        error_text = resp.text[:1000]
        logger.error("Upstream HTTP %d (non-stream): %s", resp.status_code, error_text)
        return error_response(
            529,
            "overloaded_error",
            f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
        )

    openai_resp = resp.json()
    anthropic_resp = openai_to_anthropic_response(openai_resp, model)

    # Track output tokens in session monitor
    output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
    session_monitor.record_response(output_tokens)

    return anthropic_resp
|
|
1219
|
+
|
|
1220
|
+
|
|
1221
|
+
@app.post("/anthropic/v1/messages")
async def messages_anthropic(request: Request):
    """Serve the alternate ``/anthropic/v1/messages`` path.

    Some Claude Code configurations use this path; the request is handed
    unchanged to ``messages`` and its result is returned as-is.
    """
    delegated = await messages(request)
    return delegated
|
|
1225
|
+
|
|
1226
|
+
|
|
1227
|
+
@app.get("/v1/models")
async def models():
    """List the model IDs this proxy advertises.

    The IDs are fixed Anthropic model names (spoofed for client
    compatibility); they are not queried from the upstream server.
    """
    advertised_ids = (
        "claude-sonnet-4-20250514",
        "claude-3-5-sonnet-20241022",
    )
    catalog = [{"id": model_id, "object": "model"} for model_id in advertised_ids]
    return {"data": catalog}
|
|
1236
|
+
|
|
1237
|
+
|
|
1238
|
+
@app.get("/health")
async def health():
    """Health check endpoint for monitoring and load balancers.

    Probes the upstream server's ``/health`` URL (derived from
    ``LLAMA_CPP_BASE``) with a 5 second timeout. The proxy itself is always
    reported as ``"ok"``; the overall status is ``"degraded"`` when the
    upstream probe fails or does not return HTTP 200.

    Returns:
        A dict with ``status``, ``proxy``, ``upstream`` and ``upstream_url``.
    """
    upstream_ok = False
    try:
        if http_client:
            # Swap only the trailing "/v1" API prefix for "/health". A blanket
            # str.replace would also rewrite any other "/v1" occurrence in
            # the URL (e.g. in a path segment such as "/v1-proxy").
            base = LLAMA_CPP_BASE.rstrip("/")
            if base.endswith("/v1"):
                health_url = base[: -len("/v1")] + "/health"
            else:
                # No trailing "/v1": keep the historical substitution.
                health_url = LLAMA_CPP_BASE.replace("/v1", "/health")
            resp = await http_client.get(health_url, timeout=5.0)
            upstream_ok = resp.status_code == 200
    except Exception as exc:
        # Best-effort probe: any failure just means "upstream unreachable",
        # but leave a trace for debugging instead of discarding it silently.
        logger.debug("Upstream health probe failed: %s: %s", type(exc).__name__, exc)

    return {
        "status": "ok" if upstream_ok else "degraded",
        "proxy": "ok",
        "upstream": "ok" if upstream_ok else "unreachable",
        "upstream_url": LLAMA_CPP_BASE,
    }
|
|
1258
|
+
|
|
1259
|
+
|
|
1260
|
+
@app.get("/v1/context")
async def context_status():
    """Option F: report the session monitor's context-window statistics.

    The response is a snapshot of ``session_monitor``: token counters,
    utilization (as a rounded fraction and as a percentage string), the
    current warning level, the estimated number of turns remaining, the
    prune/overflow counters, the configured prune threshold and the last
    ten entries of the context history.
    """
    monitor = session_monitor
    level = monitor.get_warning_level()
    turns_left = monitor.estimate_turns_remaining()

    snapshot = {
        "context_window": monitor.context_window,
        "last_input_tokens": monitor.last_input_tokens,
        "last_output_tokens": monitor.last_output_tokens,
        "peak_input_tokens": monitor.peak_input_tokens,
        "utilization": round(monitor.get_utilization(), 4),
        "utilization_pct": f"{monitor.get_utilization() * 100:.1f}%",
        "warning_level": level,
        "estimated_turns_remaining": turns_left,
        "total_requests": monitor.total_requests,
        "prune_count": monitor.prune_count,
        "overflow_count": monitor.overflow_count,
        "prune_threshold": PROXY_CONTEXT_PRUNE_THRESHOLD,
        "recent_history": monitor.context_history[-10:],
    }
    return snapshot
|
|
1285
|
+
|
|
1286
|
+
|
|
1287
|
+
# ===========================================================================
|
|
1288
|
+
# Entry Point
|
|
1289
|
+
# ===========================================================================
|
|
1290
|
+
|
|
1291
|
+
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn using the PROXY_*
    # settings (uvicorn receives the log level lower-cased).
    server_options = {
        "host": PROXY_HOST,
        "port": PROXY_PORT,
        "log_level": PROXY_LOG_LEVEL.lower(),
    }
    uvicorn.run(app, **server_options)
|