@miller-tech/uap 1.13.5 → 1.13.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/README.md +20 -0
  2. package/config/chat_template.jinja +126 -44
  3. package/config/model-profiles/qwen35.json +3 -3
  4. package/dist/.tsbuildinfo +1 -1
  5. package/dist/benchmarks/token-throughput.d.ts +259 -0
  6. package/dist/benchmarks/token-throughput.d.ts.map +1 -0
  7. package/dist/benchmarks/token-throughput.js +198 -0
  8. package/dist/benchmarks/token-throughput.js.map +1 -0
  9. package/dist/bin/cli.js +12 -0
  10. package/dist/bin/cli.js.map +1 -1
  11. package/dist/cli/dashboard.d.ts.map +1 -1
  12. package/dist/cli/dashboard.js +10 -20
  13. package/dist/cli/dashboard.js.map +1 -1
  14. package/dist/cli/init.d.ts.map +1 -1
  15. package/dist/cli/init.js +5 -0
  16. package/dist/cli/init.js.map +1 -1
  17. package/dist/cli/memory.d.ts.map +1 -1
  18. package/dist/cli/memory.js +9 -18
  19. package/dist/cli/memory.js.map +1 -1
  20. package/dist/cli/worktree.d.ts +4 -1
  21. package/dist/cli/worktree.d.ts.map +1 -1
  22. package/dist/cli/worktree.js +73 -1
  23. package/dist/cli/worktree.js.map +1 -1
  24. package/dist/coordination/adaptive-patterns.d.ts +3 -1
  25. package/dist/coordination/adaptive-patterns.d.ts.map +1 -1
  26. package/dist/coordination/adaptive-patterns.js +31 -3
  27. package/dist/coordination/adaptive-patterns.js.map +1 -1
  28. package/dist/dashboard/data-service.d.ts +27 -0
  29. package/dist/dashboard/data-service.d.ts.map +1 -1
  30. package/dist/dashboard/data-service.js +210 -17
  31. package/dist/dashboard/data-service.js.map +1 -1
  32. package/dist/memory/embeddings.d.ts.map +1 -1
  33. package/dist/memory/embeddings.js +1 -1
  34. package/dist/memory/embeddings.js.map +1 -1
  35. package/dist/models/router.js +1 -1
  36. package/dist/models/router.js.map +1 -1
  37. package/dist/models/types.js +13 -13
  38. package/dist/models/types.js.map +1 -1
  39. package/dist/tasks/coordination.js +1 -1
  40. package/dist/tasks/coordination.js.map +1 -1
  41. package/docs/deployment/QWEN35_LLAMA_CPP.md +76 -0
  42. package/package.json +3 -2
  43. package/templates/hooks/session-start.sh +91 -51
  44. package/tools/agents/README.md +22 -0
  45. package/tools/agents/install-opencode-local.sh.j2 +57 -7
  46. package/tools/agents/opencode_uap_agent.py +63 -1
  47. package/tools/agents/scripts/anthropic_proxy.py +1297 -0
  48. package/tools/agents/scripts/requirements-proxy.txt +5 -0
  49. package/tools/agents/scripts/tool_call_wrapper.py +9 -5
@@ -0,0 +1,1297 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ UAP Anthropic-to-OpenAI Proxy
4
+ ==============================
5
+
6
+ A lightweight, production-ready proxy that translates Anthropic Messages API
7
+ requests into OpenAI Chat Completions API requests. Designed for use with
8
+ local LLM servers (llama.cpp, vLLM, Ollama, etc.) that expose an OpenAI-
9
+ compatible endpoint but need to be accessed from clients that speak the
10
+ Anthropic protocol (e.g., Claude Code, Forge Code).
11
+
12
+ Architecture
13
+ ------------
14
+ Claude Code --(Anthropic API)--> This Proxy --(OpenAI API)--> llama.cpp
15
+ :4000 :8080
16
+
17
+ Key Features
18
+ - Full streaming support (SSE translation between protocols)
19
+ - Tool/function calling translation (both streaming and non-streaming)
20
+ - Module-level httpx.AsyncClient with connection pooling and keep-alive
21
+ - Granular timeouts (short connect, long read for LLM generation)
22
+ - Graceful error recovery on upstream connection drops
23
+ - Proper upstream cleanup on client disconnect
24
+ - Context window overflow protection with conversation pruning
25
+ - Smart max_tokens capping to prevent next-turn overflow
26
+ - Session-level token monitoring with warnings
27
+
28
+ Configuration (Environment Variables)
29
+ --------------------------------------
30
+ LLAMA_CPP_BASE Base URL of the OpenAI-compatible server
31
+ Default: http://192.168.1.165:8080/v1
32
+
33
+ PROXY_PORT Port for this proxy to listen on
34
+ Default: 4000
35
+
36
+ PROXY_HOST Host/IP to bind to
37
+ Default: 0.0.0.0
38
+
39
+ PROXY_LOG_LEVEL Logging level (DEBUG, INFO, WARNING, ERROR)
40
+ Default: INFO
41
+
42
+ PROXY_READ_TIMEOUT Read timeout in seconds for upstream LLM streaming
43
+ Default: 600 (10 minutes)
44
+
45
+ PROXY_MAX_CONNECTIONS Max concurrent connections to upstream
46
+ Default: 20
47
+
48
+ PROXY_CONTEXT_WINDOW Override context window size (auto-detected from
49
+ upstream /slots endpoint if not set)
50
+ Default: 0 (auto-detect)
51
+
52
+ PROXY_CONTEXT_PRUNE_THRESHOLD Fraction of context window at which
53
+ conversation pruning activates (0.0-1.0)
54
+ Default: 0.75
55
+
56
+ Usage
57
+ -----
58
+ # Basic usage (connects to llama.cpp on default port):
59
+ python anthropic_proxy.py
60
+
61
+ # Custom upstream server:
62
+ LLAMA_CPP_BASE=http://localhost:8080/v1 python anthropic_proxy.py
63
+
64
+ # Custom proxy port:
65
+ PROXY_PORT=5000 python anthropic_proxy.py
66
+
67
+ # Via npx (after npm install):
68
+ npx uap-anthropic-proxy
69
+
70
+ Dependencies
71
+ ------------
72
+ pip install fastapi uvicorn httpx
73
+
74
+ Or from the project root:
75
+ pip install -r tools/agents/scripts/requirements-proxy.txt
76
+ """
77
+
78
+ import asyncio
79
+ import json
80
+ import logging
81
+ import os
82
+ import sys
83
+ import time
84
+ import uuid
85
+ from dataclasses import dataclass, field
86
+
87
+ import httpx
88
+ from contextlib import asynccontextmanager
89
+ from fastapi import FastAPI, Request, Response
90
+ from fastapi.responses import StreamingResponse
91
+ import uvicorn
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Configuration (all configurable via environment variables)
95
+ # ---------------------------------------------------------------------------
96
# Base URL of the upstream OpenAI-compatible server.
# NOTE(review): the default is a specific private-LAN address; most
# deployments will need to override it via the environment.
LLAMA_CPP_BASE = os.environ.get("LLAMA_CPP_BASE", "http://192.168.1.165:8080/v1")
# Numeric settings are parsed at import time, so a non-numeric value raises
# ValueError before the app is created.
PROXY_PORT = int(os.environ.get("PROXY_PORT", "4000"))
# Default bind address listens on every interface.
PROXY_HOST = os.environ.get("PROXY_HOST", "0.0.0.0")
# Upper-cased so it can be looked up as an attribute of the logging module.
PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
# Upstream read timeout, in seconds.
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
# 0 (or any non-positive value) means "auto-detect from the upstream server".
PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
# Fraction of the context window at which pruning is meant to activate.
PROXY_CONTEXT_PRUNE_THRESHOLD = float(os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75"))
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Logging
107
+ # ---------------------------------------------------------------------------
108
# Configure root logging once at import time. An unrecognised PROXY_LOG_LEVEL
# name falls back to INFO through the getattr default.
logging.basicConfig(
    level=getattr(logging, PROXY_LOG_LEVEL, logging.INFO),
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-wide logger used by every function in this file.
logger = logging.getLogger("uap.anthropic_proxy")
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # Option F: Session-level Context Window Monitor
118
+ # ---------------------------------------------------------------------------
119
+ @dataclass
120
+ class SessionMonitor:
121
+ """Tracks token usage across the session to provide early warnings
122
+ and enable proactive context management before overflow occurs."""
123
+
124
+ context_window: int = 0 # Auto-detected or configured
125
+ total_requests: int = 0
126
+ last_input_tokens: int = 0 # Estimated input tokens of last request
127
+ last_output_tokens: int = 0 # Actual output tokens of last response
128
+ peak_input_tokens: int = 0 # High-water mark
129
+ prune_count: int = 0 # How many times pruning was triggered
130
+ overflow_count: int = 0 # How many context overflow errors caught
131
+ context_history: list = field(default_factory=list) # Recent token counts
132
+
133
+ def record_request(self, estimated_tokens: int):
134
+ """Record an outgoing request's estimated token count."""
135
+ self.total_requests += 1
136
+ self.last_input_tokens = estimated_tokens
137
+ if estimated_tokens > self.peak_input_tokens:
138
+ self.peak_input_tokens = estimated_tokens
139
+ self.context_history.append(estimated_tokens)
140
+ # Keep last 50 entries
141
+ if len(self.context_history) > 50:
142
+ self.context_history = self.context_history[-50:]
143
+
144
+ def record_response(self, output_tokens: int):
145
+ """Record a response's output token count."""
146
+ self.last_output_tokens = output_tokens
147
+
148
+ def get_utilization(self) -> float:
149
+ """Get current context utilization as a fraction (0.0 - 1.0)."""
150
+ if self.context_window <= 0:
151
+ return 0.0
152
+ return self.last_input_tokens / self.context_window
153
+
154
+ def get_warning_level(self) -> str | None:
155
+ """Return warning level based on context utilization.
156
+ Returns None if no warning needed."""
157
+ util = self.get_utilization()
158
+ if util >= 0.95:
159
+ return "CRITICAL"
160
+ elif util >= 0.85:
161
+ return "HIGH"
162
+ elif util >= 0.75:
163
+ return "ELEVATED"
164
+ return None
165
+
166
+ def estimate_turns_remaining(self) -> int | None:
167
+ """Estimate how many more agentic turns can fit before overflow."""
168
+ if self.context_window <= 0 or len(self.context_history) < 2:
169
+ return None
170
+ # Average growth per turn from recent history
171
+ deltas = [
172
+ self.context_history[i] - self.context_history[i - 1]
173
+ for i in range(1, len(self.context_history))
174
+ if self.context_history[i] > self.context_history[i - 1]
175
+ ]
176
+ if not deltas:
177
+ return None
178
+ avg_growth = sum(deltas) / len(deltas)
179
+ if avg_growth <= 0:
180
+ return None
181
+ remaining_tokens = self.context_window - self.last_input_tokens
182
+ return max(0, int(remaining_tokens / avg_growth))
183
+
184
+ def log_status(self):
185
+ """Log current session status."""
186
+ util = self.get_utilization()
187
+ warning = self.get_warning_level()
188
+ turns = self.estimate_turns_remaining()
189
+ turns_str = f"~{turns} turns remaining" if turns is not None else "unknown"
190
+
191
+ if warning == "CRITICAL":
192
+ logger.error(
193
+ "CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
194
+ self.last_input_tokens, self.context_window, util * 100,
195
+ turns_str, self.prune_count, self.overflow_count,
196
+ )
197
+ elif warning == "HIGH":
198
+ logger.warning(
199
+ "CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
200
+ self.last_input_tokens, self.context_window, util * 100,
201
+ turns_str, self.prune_count,
202
+ )
203
+ elif warning == "ELEVATED":
204
+ logger.warning(
205
+ "CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
206
+ self.last_input_tokens, self.context_window, util * 100,
207
+ turns_str,
208
+ )
209
+ else:
210
+ logger.info(
211
+ "CONTEXT: %d/%d tokens (%.1f%%), %s",
212
+ self.last_input_tokens, self.context_window, util * 100,
213
+ turns_str,
214
+ )
215
+
216
+
217
+ session_monitor = SessionMonitor()
218
+
219
+
220
+ # ---------------------------------------------------------------------------
221
+ # Context Window Detection
222
+ # ---------------------------------------------------------------------------
223
async def detect_context_window(client: httpx.AsyncClient) -> int:
    """Determine the upstream server's per-slot context window size.

    Resolution order:
      1. PROXY_CONTEXT_WINDOW, when set to a positive value.
      2. ``n_ctx`` of the first entry returned by the upstream ``/slots``
         endpoint (llama.cpp).
      3. A fixed default of 131072 tokens.

    Args:
        client: HTTP client used to probe the upstream server.

    Returns:
        The context window size in tokens (always positive).
    """
    if PROXY_CONTEXT_WINDOW > 0:
        logger.info("Using configured context window: %d tokens", PROXY_CONTEXT_WINDOW)
        return PROXY_CONTEXT_WINDOW

    # Strip only a trailing "/v1" API prefix. A plain str.replace would also
    # rewrite "/v1" occurring elsewhere in the URL, and would leave the URL
    # without any "/slots" path when the base has no "/v1" at all.
    slots_url = LLAMA_CPP_BASE.rstrip("/").removesuffix("/v1") + "/slots"

    try:
        resp = await client.get(slots_url, timeout=5.0)
        if resp.status_code == 200:
            slots = resp.json()
            if slots and isinstance(slots, list) and isinstance(slots[0], dict):
                n_ctx = slots[0].get("n_ctx", 0)
                if isinstance(n_ctx, int) and n_ctx > 0:
                    logger.info(
                        "Auto-detected context window from upstream: %d tokens (%d slots)",
                        n_ctx, len(slots),
                    )
                    return n_ctx
    except Exception as exc:
        # Best-effort probe: any failure (network, bad JSON, ...) falls
        # through to the default below instead of aborting startup.
        logger.warning("Failed to auto-detect context window: %s", exc)

    # Safe default: 128K (common for modern models)
    default = 131072
    logger.warning("Using default context window: %d tokens", default)
    return default
253
+
254
+
255
+ # ---------------------------------------------------------------------------
256
+ # Option C: Conversation Pruning
257
+ # ---------------------------------------------------------------------------
258
+ # Characters-per-token ratio for estimation. English text averages ~4 chars/token,
259
+ # but tool call JSON and code tend to be denser (~3.2 chars/token).
260
CHARS_PER_TOKEN = 3.5


def estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* from its character length.

    The result is len(text) / CHARS_PER_TOKEN truncated to an int, and is
    never lower than 1 (even for an empty string).
    """
    approx = int(len(text) / CHARS_PER_TOKEN)
    return approx if approx >= 1 else 1
266
+
267
+
268
def estimate_message_tokens(msg: dict) -> int:
    """Approximate the token cost of one Anthropic-format message.

    Starts from a fixed 4-token overhead and adds the estimate for string
    content, or for each text / tool_use / tool_result block of list
    content. Other block types and other content types add nothing.
    """
    total = 4  # Fixed per-message overhead (role, separators)
    content = msg.get("content", "")

    if isinstance(content, str):
        return total + estimate_tokens(content)
    if not isinstance(content, list):
        return total

    for block in content:
        if isinstance(block, str):
            total += estimate_tokens(block)
            continue
        if not isinstance(block, dict):
            continue
        kind = block.get("type")
        if kind == "text":
            total += estimate_tokens(block.get("text", ""))
        elif kind == "tool_use":
            # Both the tool name and its serialized arguments count.
            total += estimate_tokens(block.get("name", ""))
            total += estimate_tokens(json.dumps(block.get("input", {})))
        elif kind == "tool_result":
            total += estimate_tokens(_extract_text(block.get("content", "")))
    return total
287
+
288
+
289
def estimate_total_tokens(anthropic_body: dict) -> int:
    """Approximate the input token count of a whole Anthropic Messages request.

    Sums the estimates for the system prompt, the agentic supplement that
    is added to the system message, every message, and the serialized
    tool definitions.
    """
    # System prompt: a plain string or a list of text blocks.
    system = anthropic_body.get("system", "")
    if isinstance(system, str):
        system_tokens = estimate_tokens(system)
    elif isinstance(system, list):
        system_tokens = sum(
            estimate_tokens(block.get("text", ""))
            for block in system
            if isinstance(block, dict) and block.get("type") == "text"
        )
    else:
        system_tokens = 0

    # The supplement is counted unconditionally.
    supplement_tokens = estimate_tokens(_AGENTIC_SYSTEM_SUPPLEMENT)

    message_tokens = sum(
        estimate_message_tokens(message)
        for message in anthropic_body.get("messages", [])
    )

    tools = anthropic_body.get("tools", [])
    tool_tokens = estimate_tokens(json.dumps(tools)) if tools else 0

    return system_tokens + supplement_tokens + message_tokens + tool_tokens
315
+
316
+
317
def _prune_marker(removed_count: int, removed_tokens: int) -> dict:
    """Build the user message that tells the model history was trimmed."""
    return {
        "role": "user",
        "content": (
            f"[CONTEXT PRUNED: {removed_count} older messages (~{removed_tokens} tokens) "
            f"were removed to fit within the context window. "
            f"The conversation continues from recent context below.]"
        ),
    }


def _prune_priority(msg: dict) -> int:
    """Rank a message for removal: 0 = drop first, 2 = drop last."""
    content = msg.get("content", [])
    if isinstance(content, list) and any(
        isinstance(b, dict) and b.get("type") == "tool_result" for b in content
    ):
        return 0  # Tool results: largest, least valuable historically
    if msg.get("role") == "assistant":
        return 1
    return 2  # User messages give context for the model's actions


def prune_conversation(anthropic_body: dict, context_window: int, target_fraction: float = 0.65) -> dict:
    """Prune the conversation to fit within the context window.

    Strategy:
      - Always keep: system prompt, first message, last 8 messages.
      - Remove from the middle: oldest tool_result messages first, then
        oldest assistant messages, then oldest user messages.
      - Insert a [CONTEXT PRUNED] marker whenever messages were removed.
      - If the protected messages alone exceed the budget, drop the whole
        middle and shorten long tool_result payloads in the tail.

    The body is modified in place (its "messages" list is replaced and
    tool_result blocks may be rewritten) and also returned.

    NOTE(review): pruning works message by message and does not keep
    tool_use / tool_result pairs together; confirm the upstream tolerates
    a tool result whose originating call was removed (and vice versa).

    Args:
        anthropic_body: The full Anthropic request body.
        context_window: Maximum context window in tokens.
        target_fraction: Target utilization after pruning (0.0-1.0).

    Returns:
        The same anthropic_body object, with pruned messages.
    """
    messages = anthropic_body.get("messages", [])
    if len(messages) <= 4:
        # Too few messages to prune meaningfully
        return anthropic_body

    target_tokens = int(context_window * target_fraction)

    # Estimate non-message tokens (system, tools, agentic supplement)
    overhead_tokens = 0
    system = anthropic_body.get("system", "")
    if isinstance(system, str):
        overhead_tokens += estimate_tokens(system)
    elif isinstance(system, list):
        for block in system:
            if isinstance(block, dict) and block.get("type") == "text":
                overhead_tokens += estimate_tokens(block.get("text", ""))
    overhead_tokens += estimate_tokens(_AGENTIC_SYSTEM_SUPPLEMENT)
    tools = anthropic_body.get("tools", [])
    if tools:
        overhead_tokens += estimate_tokens(json.dumps(tools))

    # Budget left for messages once the fixed overhead is paid
    message_budget = target_tokens - overhead_tokens
    if message_budget <= 0:
        logger.error("System prompt + tools alone exceed target budget!")
        return anthropic_body

    # Always keep the first message and the last KEEP_LAST messages
    KEEP_LAST = 8
    protected_head = messages[:1]
    protected_tail = messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
    middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []

    protected_tokens = sum(estimate_message_tokens(m) for m in protected_head + protected_tail)

    if protected_tokens >= message_budget:
        # Even the protected messages exceed the budget: drop the whole
        # middle and shorten large tool results in the tail.
        logger.warning(
            "Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
            protected_tokens, message_budget,
        )
        for msg in protected_tail:
            content = msg.get("content", [])
            if not isinstance(content, list):
                continue
            for block in content:
                if isinstance(block, dict) and block.get("type") == "tool_result":
                    result_text = _extract_text(block.get("content", ""))
                    if len(result_text) > 2000:
                        # Keep the start and the end of the output.
                        block["content"] = result_text[:1000] + "\n...[TRUNCATED]...\n" + result_text[-500:]
        # Announce the dropped middle so this path honours the marker
        # contract too; previously the messages vanished silently.
        marker = []
        if middle:
            dropped_tokens = sum(estimate_message_tokens(m) for m in middle)
            marker = [_prune_marker(len(middle), dropped_tokens)]
        anthropic_body["messages"] = protected_head + marker + protected_tail
        return anthropic_body

    remaining_budget = message_budget - protected_tokens

    # (priority, original index, tokens, message) for every middle message
    scored_middle = [
        (_prune_priority(msg), i, estimate_message_tokens(msg), msg)
        for i, msg in enumerate(middle)
    ]

    # Greedily keep the most valuable messages first (highest priority,
    # newest within a priority) while they still fit. A message that does
    # not fit is skipped, but smaller later candidates may still be kept.
    kept_middle = []
    used_tokens = 0
    for _priority, idx, tokens, msg in sorted(
        scored_middle, key=lambda s: (s[0], s[1]), reverse=True
    ):
        if used_tokens + tokens <= remaining_budget:
            kept_middle.append((idx, msg))
            used_tokens += tokens

    # Restore chronological order
    kept_middle.sort(key=lambda pair: pair[0])
    kept_msgs = [m for _, m in kept_middle]

    removed_count = len(middle) - len(kept_msgs)
    removed_tokens = sum(t for _, _, t, _ in scored_middle) - used_tokens

    if removed_count > 0:
        anthropic_body["messages"] = (
            protected_head
            + [_prune_marker(removed_count, removed_tokens)]
            + kept_msgs
            + protected_tail
        )
        logger.warning(
            "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
            "target=%.0f%% of %d ctx",
            removed_count, removed_tokens, len(anthropic_body["messages"]),
            target_fraction * 100, context_window,
        )
    else:
        anthropic_body["messages"] = protected_head + kept_msgs + protected_tail

    return anthropic_body
459
+
460
+
461
+ # ---------------------------------------------------------------------------
462
+ # HTTP Client Lifecycle
463
+ # ---------------------------------------------------------------------------
464
+ # Module-level httpx.AsyncClient for connection reuse + keep-alive.
465
+ # Granular timeouts: short connect, long read for streaming LLM output.
466
http_client: httpx.AsyncClient | None = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared httpx client on startup and close it on shutdown.

    Startup also resolves the context window and stores it on the
    module-level session monitor. The client is closed in a ``finally``
    block so it is released even when startup fails part-way or an
    exception is thrown into the generator at ``yield``.

    Args:
        app: The FastAPI application this lifespan belongs to (unused).
    """
    global http_client
    http_client = httpx.AsyncClient(
        timeout=httpx.Timeout(
            connect=10.0,  # 10s to establish connection
            read=PROXY_READ_TIMEOUT,  # configurable (default 10 min)
            write=30.0,  # 30s to send the request body
            pool=10.0,  # 10s to acquire a pool connection
        ),
        limits=httpx.Limits(
            max_connections=PROXY_MAX_CONNECTIONS,
            max_keepalive_connections=PROXY_MAX_CONNECTIONS // 2,
            keepalive_expiry=120,
        ),
    )
    try:
        logger.info(
            "Proxy started: listening on %s:%d -> upstream %s",
            PROXY_HOST, PROXY_PORT, LLAMA_CPP_BASE,
        )

        # Auto-detect context window from upstream server
        session_monitor.context_window = await detect_context_window(http_client)
        logger.info(
            "Context window: %d tokens, prune threshold: %.0f%%",
            session_monitor.context_window,
            PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
        )

        yield
    finally:
        await http_client.aclose()
        http_client = None
        logger.info("Proxy shut down")
503
+
504
+
505
# ASGI application object; `lifespan` creates and closes the shared HTTP
# client around the app's lifetime.
app = FastAPI(
    title="UAP Anthropic Proxy",
    description="Translates Anthropic Messages API to OpenAI Chat Completions API",
    version="1.0.0",
    lifespan=lifespan,
)
511
+
512
+
513
+ # ===========================================================================
514
+ # Request Translation: Anthropic -> OpenAI
515
+ # ===========================================================================
516
+
517
def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
    """Convert Anthropic message format to OpenAI message format.

    Handles:
      - System prompt (string or content block array) -> one system message.
      - Text content blocks -> message content, joined with newlines.
      - Tool use blocks -> ``tool_calls`` on a single assistant message
        that also carries the text from the same Anthropic message.
      - Tool result blocks -> one ``tool`` message each, emitted before any
        remaining text of the same message.

    Blocks of any other type, and blocks that are neither strings nor
    dicts, are ignored.

    Args:
        anthropic_body: Anthropic Messages API request body.

    Returns:
        The list of OpenAI chat messages.

    Raises:
        KeyError: If a message lacks "role" or a tool_use block lacks "name".
    """
    messages = []

    # Anthropic has system as a top-level param
    system = anthropic_body.get("system")
    if system:
        if isinstance(system, str):
            messages.append({"role": "system", "content": system})
        elif isinstance(system, list):
            text = "\n".join(
                b.get("text") or ""
                for b in system
                if isinstance(b, dict) and b.get("type") == "text"
            )
            if text:
                messages.append({"role": "system", "content": text})

    for msg in anthropic_body.get("messages", []):
        role = msg["role"]
        content = msg.get("content")

        if isinstance(content, str):
            messages.append({"role": role, "content": content})
            continue
        if not isinstance(content, list):
            continue

        text_parts = []
        tool_calls = []
        tool_results = []
        for block in content:
            if isinstance(block, str):
                text_parts.append(block)
                continue
            if not isinstance(block, dict):
                continue
            block_type = block.get("type")
            if block_type == "text":
                text_parts.append(block.get("text") or "")
            elif block_type == "tool_use":
                tool_calls.append({
                    "id": block.get("id", f"call_{uuid.uuid4().hex[:8]}"),
                    "type": "function",
                    "function": {
                        "name": block["name"],
                        "arguments": json.dumps(block.get("input", {})),
                    },
                })
            elif block_type == "tool_result":
                tool_results.append({
                    "role": "tool",
                    "tool_call_id": block.get("tool_use_id", ""),
                    "content": _extract_text(block.get("content", "")),
                })

        joined_text = "\n".join(text_parts) if text_parts else None

        if tool_calls:
            # One assistant turn = one OpenAI message. Emitting a separate
            # message per tool call, with the text trailing behind them,
            # reorders the turn and separates calls from their results.
            messages.append({
                "role": "assistant",
                "content": joined_text,
                "tool_calls": tool_calls,
            })

        # Tool results precede any free text of the same message.
        messages.extend(tool_results)

        if text_parts and not tool_calls:
            messages.append({"role": role, "content": joined_text})

    return messages
578
+
579
+
580
def _extract_text(content) -> str:
    """Extract plain text from Anthropic content (string, list, or other).

    - ``None`` yields an empty string rather than the literal "None".
    - A string is returned unchanged.
    - A list is joined with newlines: dict entries contribute their "text"
      value (empty when missing or None), other entries their ``str()``.
    - Anything else is converted with ``str()``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "\n".join(
            # str() guards the join against a non-string "text" value.
            str(b.get("text") or "") if isinstance(b, dict) else str(b)
            for b in content
        )
    return str(content)
589
+
590
+
591
# Instruction block added to the system message of every upstream request
# (see build_openai_request). The leading blank lines separate it from a
# client-provided system prompt when it is appended to one. Its size is also
# counted by the token estimators, so edits here change pruning budgets.
_AGENTIC_SYSTEM_SUPPLEMENT = (
    "\n\n<agentic-protocol>\n"
    "You are operating in an agentic coding loop with tool access. Follow these rules:\n"
    "1. ALWAYS use tools to read, edit, write, and test code. Never just describe or explain what should be done.\n"
    "2. After reading files and identifying an issue, proceed IMMEDIATELY to make the fix using Edit/Write tools. Do NOT stop after explaining the problem.\n"
    "3. After making changes, run the relevant tests or build commands to verify your fix.\n"
    "4. Only produce a final text response WITHOUT tool calls when the ENTIRE task is fully complete, verified, and you have nothing left to do.\n"
    "5. If you have identified a problem but have not yet fixed it, you MUST call a tool to make the fix. Do NOT summarize the issue and stop.\n"
    "6. When the user asks you to do something, DO it with tools. Do not ask for permission or confirmation.\n"
    "7. If a tool call fails, analyze the error and try a different approach. Do not give up after one failure.\n"
    "</agentic-protocol>"
)
603
+
604
+
605
def build_openai_request(anthropic_body: dict) -> dict:
    """Build an OpenAI Chat Completions request from an Anthropic Messages request.

    Besides translating messages, sampling parameters and tool definitions,
    this function:
      - adds the agentic protocol supplement to the system message (or
        inserts a system message when the client sent none);
      - raises max_tokens to at least 16384, then caps it by the space left
        in the context window after the estimated input and a 15% margin;
      - forces ``tool_choice="required"`` in the situations described next
        to the tool handling below.

    Args:
        anthropic_body: Anthropic Messages API request body.

    Returns:
        The request body to send to the OpenAI-compatible upstream.
    """
    openai_body = {
        "model": anthropic_body.get("model", "default"),
        "messages": anthropic_to_openai_messages(anthropic_body),
        "stream": anthropic_body.get("stream", False),
    }

    # Inject agentic protocol instructions into the system message so
    # the model knows it must use tools to complete work, not just explain.
    translated = openai_body["messages"]
    if translated and translated[0].get("role") == "system":
        translated[0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
    else:
        # No system message from the client; inject one.
        translated.insert(0, {
            "role": "system",
            "content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
        })

    if "max_tokens" in anthropic_body:
        # Enforce minimum floor for thinking mode: model needs tokens for
        # reasoning (<think>...</think>) plus the actual response/tool calls.
        requested_max = max(anthropic_body["max_tokens"], 16384)

        # Smart max_tokens capping: keep output + current input inside the
        # window, leaving a margin for the next turn's growth.
        ctx_window = session_monitor.context_window
        if ctx_window > 0:
            estimated_input = estimate_total_tokens(anthropic_body)
            # Reserve 15% of context for next-turn growth (tool results, etc.)
            safety_margin = int(ctx_window * 0.15)
            available_for_output = ctx_window - estimated_input - safety_margin
            if available_for_output <= 1024:
                # Very tight on space -- allow the 1024-token minimum but warn
                logger.warning(
                    "MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
                    "Response may be truncated.",
                    available_for_output, ctx_window, estimated_input,
                )
                requested_max = max(1024, available_for_output)
            elif available_for_output < requested_max:
                logger.info(
                    "MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
                    requested_max, available_for_output,
                    ctx_window, estimated_input, safety_margin,
                )
                requested_max = available_for_output

        openai_body["max_tokens"] = requested_max
    if "temperature" in anthropic_body:
        openai_body["temperature"] = anthropic_body["temperature"]
    if "top_p" in anthropic_body:
        openai_body["top_p"] = anthropic_body["top_p"]
    if "stop_sequences" in anthropic_body:
        openai_body["stop"] = anthropic_body["stop_sequences"]

    # Convert Anthropic tools to OpenAI function-calling tools. An empty or
    # null tool list is treated as "no tools": forwarding "tools": [] and
    # possibly forcing tool_choice="required" on top of it would ask the
    # upstream to call a tool that does not exist.
    tools = anthropic_body.get("tools")
    if tools:
        openai_body["tools"] = [
            {
                "type": "function",
                "function": {
                    "name": tool["name"],
                    "description": tool.get("description", ""),
                    "parameters": tool.get("input_schema", {}),
                },
            }
            for tool in tools
        ]

        # Smart tool_choice: force tool calls during the agentic loop to
        # prevent text-only end_turn responses that stop the loop early.
        #
        # "required" is forced when either:
        #   - the last assistant message was text-only, or
        #   - the conversation has more than 2 messages and contains at
        #     least one tool_result block (active agentic loop).
        conversation = anthropic_body.get("messages", [])
        n_msgs = len(conversation)
        has_tool_results = any(
            isinstance(m.get("content"), list) and any(
                isinstance(b, dict) and b.get("type") == "tool_result"
                for b in m.get("content", [])
            )
            for m in conversation
        )
        if _last_assistant_was_text_only(anthropic_body):
            openai_body["tool_choice"] = "required"
            logger.info("tool_choice forced to 'required' (last assistant was text-only)")
        elif has_tool_results and n_msgs > 2:
            openai_body["tool_choice"] = "required"
            logger.info("tool_choice forced to 'required' (active agentic loop with tool results)")

    return openai_body
703
+
704
+
705
def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
    """Report whether the most recent assistant message had text but no tool_use.

    Only the last assistant message is inspected. String content counts as
    text-only when it is non-blank; list content counts when it has at
    least one text entry and no tool_use block. Any other content, or a
    conversation without assistant messages, yields False.
    """
    history = anthropic_body.get("messages", [])
    last_assistant = next(
        (m for m in reversed(history) if m.get("role") == "assistant"),
        None,
    )
    if last_assistant is None:
        return False

    content = last_assistant.get("content")
    if isinstance(content, str):
        # Plain-string message: text-only unless blank
        return bool(content.strip())
    if not isinstance(content, list):
        return False

    def is_tool_use(entry) -> bool:
        return isinstance(entry, dict) and entry.get("type") == "tool_use"

    def is_text(entry) -> bool:
        if isinstance(entry, dict) and entry.get("type") == "text" and entry.get("text", "").strip():
            return True
        # Bare strings count as text even when empty.
        return isinstance(entry, str)

    has_tool_use = any(map(is_tool_use, content))
    has_text = any(map(is_text, content))
    return not has_tool_use and has_text
732
+
733
+
734
+ # ===========================================================================
735
+ # Response Translation: OpenAI -> Anthropic
736
+ # ===========================================================================
737
+
738
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
    """Convert an OpenAI Chat Completions response to Anthropic Messages format.

    Only the first choice is translated. Fields that are missing or ``null``
    upstream (``choices``, ``message``, ``tool_calls``, ``usage``, a tool
    call's ``id`` / ``function`` / ``arguments``) are treated as empty instead
    of raising.

    Args:
        openai_resp: Decoded JSON body of the upstream chat-completion response.
        model: Model identifier to echo back in the Anthropic response.

    Returns:
        An Anthropic ``message`` dict. ``content`` always holds at least one
        block; an empty text block is used when upstream produced nothing.
    """
    choices = openai_resp.get("choices") or [{}]
    choice = choices[0] or {}
    message = choice.get("message") or {}
    finish = choice.get("finish_reason") or "stop"

    content = []
    text = message.get("content")
    if text:
        content.append({"type": "text", "text": text})

    # Convert tool calls
    for tc in message.get("tool_calls") or []:
        if not isinstance(tc, dict):
            continue
        fn = tc.get("function") or {}
        raw_args = fn.get("arguments")
        if isinstance(raw_args, dict):
            # Already-decoded arguments are accepted as they are.
            args = raw_args
        else:
            try:
                args = json.loads(raw_args) if raw_args else {}
            except (json.JSONDecodeError, TypeError):
                args = {}
            # Decoded values that are not JSON objects are replaced by {}.
            if not isinstance(args, dict):
                args = {}
        content.append({
            "type": "tool_use",
            # `or` also replaces an explicit null id, not just a missing key.
            "id": tc.get("id") or f"toolu_{uuid.uuid4().hex[:12]}",
            "name": fn.get("name") or "",
            "input": args,
        })

    stop_reason_map = {
        "stop": "end_turn",
        "length": "max_tokens",
        "tool_calls": "tool_use",
        "function_call": "tool_use",
    }

    usage = openai_resp.get("usage") or {}

    return {
        "id": f"msg_{uuid.uuid4().hex[:24]}",
        "type": "message",
        "role": "assistant",
        "content": content if content else [{"type": "text", "text": ""}],
        "model": model,
        "stop_reason": stop_reason_map.get(finish, "end_turn"),
        "stop_sequence": None,
        "usage": {
            "input_tokens": usage.get("prompt_tokens") or 0,
            "output_tokens": usage.get("completion_tokens") or 0,
        },
    }
784
+
785
+
786
+ # ===========================================================================
787
+ # Streaming Translation: OpenAI SSE -> Anthropic SSE
788
+ # ===========================================================================
789
+
790
async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
    """Translate an OpenAI streaming chat completion into Anthropic SSE events.

    Emitted sequence: ``message_start``, a text ``content_block_start`` at
    index 0, ``ping``, then one ``content_block_delta`` per upstream delta
    (``text_delta`` for text, ``input_json_delta`` for tool-call arguments),
    the matching ``content_block_stop`` events, ``message_delta`` with the
    final stop reason, and ``message_stop``.

    Behaviour worth knowing:
    - Tool calls get Anthropic block indexes 1, 2, ... in order of first
      appearance; the text block (index 0) is closed before the first one.
    - ``reasoning_content`` deltas are collected but only forwarded (as text)
      when the response has neither visible text nor tool calls.
    - Upstream read/protocol errors end the message with ``end_turn`` instead
      of propagating; ``asyncio.CancelledError`` is re-raised.
    - The upstream response is always closed in ``finally``.
    - ``output_tokens`` is the upstream-reported ``usage.completion_tokens``
      when a chunk carries it, otherwise a count of text deltas.

    Args:
        openai_stream: Open streaming response from the upstream server.
        model: Model identifier echoed in ``message_start``.

    Yields:
        SSE frames as strings (``event: ...\\ndata: ...\\n\\n``).
    """

    def sse(event: str, payload: dict) -> str:
        # One Server-Sent Event frame: event name, JSON payload, blank line.
        return f"event: {event}\ndata: {json.dumps(payload)}\n\n"

    msg_id = f"msg_{uuid.uuid4().hex[:24]}"

    # message_start
    yield sse("message_start", {
        "type": "message_start",
        "message": {
            "id": msg_id,
            "type": "message",
            "role": "assistant",
            "content": [],
            "model": model,
            "stop_reason": None,
            "stop_sequence": None,
            "usage": {"input_tokens": 0, "output_tokens": 0},
        },
    })

    # content_block_start for text (index 0)
    yield sse("content_block_start", {
        "type": "content_block_start",
        "index": 0,
        "content_block": {"type": "text", "text": ""},
    })

    yield sse("ping", {"type": "ping"})

    # Same mapping as the non-streaming translation, including function_call.
    stop_reason_map = {
        "stop": "end_turn",
        "length": "max_tokens",
        "tool_calls": "tool_use",
        "function_call": "tool_use",
    }

    output_tokens = 0  # fallback estimate: one per text delta
    reported_output_tokens = None  # exact count, if upstream sends usage
    finish_reason = "end_turn"

    # Track tool call state for streaming tool_calls
    tool_calls_by_index: dict[int, dict] = {}
    tool_block_index = 1  # anthropic block index (0 = text)
    text_chunks: list[str] = []  # accumulate text for logging
    reasoning_chunks: list[str] = []  # accumulate reasoning for fallback

    try:
        async for line in openai_stream.aiter_lines():
            # Accept both "data: {...}" and "data:{...}".
            if not line.startswith("data:"):
                continue
            data = line[5:].strip()
            if data == "[DONE]":
                break
            try:
                chunk = json.loads(data)
            except json.JSONDecodeError:
                continue
            if not isinstance(chunk, dict):
                continue

            usage = chunk.get("usage")
            if isinstance(usage, dict) and isinstance(usage.get("completion_tokens"), int):
                reported_output_tokens = usage["completion_tokens"]

            # `or` guards tolerate explicit nulls as well as missing keys, so
            # one odd chunk does not abort the whole stream via the broad
            # exception handler below.
            choice = (chunk.get("choices") or [{}])[0] or {}
            delta = choice.get("delta") or {}

            # Collect reasoning_content (normally stripped; used as fallback
            # if the model produces only reasoning with no visible output)
            reasoning = delta.get("reasoning_content")
            if reasoning:
                reasoning_chunks.append(reasoning)

            # Handle text content deltas.
            # NOTE: text that arrives after the first tool call still targets
            # index 0, although that block has already been closed.
            text = delta.get("content")
            if text:
                output_tokens += 1  # rough token estimate
                text_chunks.append(text)
                yield sse("content_block_delta", {
                    "type": "content_block_delta",
                    "index": 0,
                    "delta": {"type": "text_delta", "text": text},
                })

            # Handle tool_calls deltas
            for tc_delta in delta.get("tool_calls") or []:
                if not isinstance(tc_delta, dict):
                    continue
                tc_idx = tc_delta.get("index", 0)
                fn = tc_delta.get("function") or {}
                arg_chunk = fn.get("arguments") or ""
                state = tool_calls_by_index.get(tc_idx)

                if state is None:
                    # New tool call starting
                    tc_id = tc_delta.get("id") or f"toolu_{uuid.uuid4().hex[:12]}"
                    tc_name = fn.get("name") or ""
                    tool_calls_by_index[tc_idx] = {
                        "id": tc_id,
                        "name": tc_name,
                        "arguments": arg_chunk,
                        "block_index": tool_block_index,
                    }

                    # Close text block before first tool block
                    if tool_block_index == 1:
                        yield sse("content_block_stop", {
                            "type": "content_block_stop",
                            "index": 0,
                        })

                    # Emit content_block_start for this tool_use
                    yield sse("content_block_start", {
                        "type": "content_block_start",
                        "index": tool_block_index,
                        "content_block": {"type": "tool_use", "id": tc_id, "name": tc_name},
                    })

                    # Emit the argument fragment (e.g. "{") that arrives with
                    # the first tool_call chunk; dropping it would leave the
                    # client with invalid JSON such as "command":"ls"}.
                    if arg_chunk:
                        yield sse("content_block_delta", {
                            "type": "content_block_delta",
                            "index": tool_block_index,
                            "delta": {"type": "input_json_delta", "partial_json": arg_chunk},
                        })

                    tool_block_index += 1
                elif arg_chunk:
                    # Continuation: argument chunks
                    state["arguments"] += arg_chunk
                    yield sse("content_block_delta", {
                        "type": "content_block_delta",
                        "index": state["block_index"],
                        "delta": {"type": "input_json_delta", "partial_json": arg_chunk},
                    })

            fr = choice.get("finish_reason")
            if fr:
                if fr == "length":
                    logger.warning(
                        "Response truncated by token limit (finish_reason=length). "
                        "Consider increasing --n-predict or max_tokens."
                    )
                finish_reason = stop_reason_map.get(fr, "end_turn")

    except (httpx.ReadError, httpx.RemoteProtocolError, httpx.StreamClosed) as exc:
        logger.warning("Upstream stream error: %s: %s", type(exc).__name__, exc)
        finish_reason = "end_turn"
    except asyncio.CancelledError:
        logger.info("Client disconnected, closing upstream stream")
        raise
    except Exception as exc:
        # Boundary handler: finish the SSE message cleanly rather than
        # breaking the client connection mid-stream.
        logger.error("Unexpected stream error: %s: %s", type(exc).__name__, exc)
        finish_reason = "end_turn"
    finally:
        # Always close the upstream response to stop LLM generation
        await openai_stream.aclose()

    if reported_output_tokens is not None:
        output_tokens = reported_output_tokens

    # Close any open tool call blocks
    if tool_calls_by_index:
        for tc in tool_calls_by_index.values():
            yield sse("content_block_stop", {
                "type": "content_block_stop",
                "index": tc["block_index"],
            })
    else:
        # Option E: If the response has no text AND no tool calls, but the
        # model produced reasoning_content, forward the reasoning as visible
        # text so the client doesn't receive a completely empty turn.
        if not "".join(text_chunks) and reasoning_chunks:
            fallback_text = "".join(reasoning_chunks)
            logger.warning(
                "Empty response with %d reasoning tokens – forwarding reasoning as fallback text",
                len(reasoning_chunks),
            )
            text_chunks.append(fallback_text)
            yield sse("content_block_delta", {
                "type": "content_block_delta",
                "index": 0,
                "delta": {"type": "text_delta", "text": fallback_text},
            })

        yield sse("content_block_stop", {"type": "content_block_stop", "index": 0})

    # Log response summary
    accumulated_text = "".join(text_chunks)
    tc_names = [tc["name"] for tc in tool_calls_by_index.values()]
    tc_args = [tc.get("arguments", "") for tc in tool_calls_by_index.values()]
    logger.info(
        "RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
        finish_reason, output_tokens,
        len(accumulated_text),
        accumulated_text[:300],
        tc_names,
        [a[:200] for a in tc_args],
    )

    # message_delta with final stop reason
    yield sse("message_delta", {
        "type": "message_delta",
        "delta": {"stop_reason": finish_reason, "stop_sequence": None},
        "usage": {"output_tokens": output_tokens},
    })

    # message_stop
    yield sse("message_stop", {"type": "message_stop"})
984
+
985
+
986
+ # ===========================================================================
987
+ # API Endpoints
988
+ # ===========================================================================
989
+
990
_MAX_UPSTREAM_RETRIES = 3
_UPSTREAM_RETRY_DELAY_SECS = 5.0


def _anthropic_error_response(error_type: str, message: str, status_code: int):
    """Build a JSON ``Response`` carrying an Anthropic-format error body."""
    return Response(
        content=json.dumps({
            "type": "error",
            "error": {
                "type": error_type,
                "message": message,
            },
        }),
        status_code=status_code,
        media_type="application/json",
    )


def _summarize_last_message(body: dict) -> tuple:
    """Return ``(role, text_preview)`` of the final message, for logging only."""
    msgs = body.get("messages")
    last_msg = msgs[-1] if isinstance(msgs, list) and msgs else {}
    if not isinstance(last_msg, dict):
        return "?", str(last_msg)[:200]

    last_role = last_msg.get("role", "?")
    last_content = last_msg.get("content", "")
    if isinstance(last_content, list):
        first_text = next(
            (
                b.get("text")
                for b in last_content
                if isinstance(b, dict) and b.get("type") == "text"
            ),
            "",
        )
        return last_role, str(first_text or "")[:200]
    if isinstance(last_content, str):
        return last_role, last_content[:200]
    return last_role, str(last_content)[:200]


async def _send_upstream_with_retry(client, openai_body: dict, *, stream: bool):
    """POST to the upstream chat-completions endpoint, retrying connect failures.

    Returns ``(response, None)`` on success or ``(None, last_exception)`` once
    all attempts have failed with a connection-level error.
    """
    last_exc = None
    for attempt in range(_MAX_UPSTREAM_RETRIES):
        try:
            resp = await client.send(
                client.build_request(
                    "POST",
                    f"{LLAMA_CPP_BASE}/chat/completions",
                    json=openai_body,
                    headers={"Content-Type": "application/json"},
                ),
                stream=stream,
            )
            # Connection succeeded – break out of retry loop
            return resp, None
        except (httpx.ConnectError, httpx.RemoteProtocolError) as exc:
            last_exc = exc
            if attempt < _MAX_UPSTREAM_RETRIES - 1:
                logger.warning(
                    "Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
                    attempt + 1, _MAX_UPSTREAM_RETRIES,
                    type(exc).__name__, _UPSTREAM_RETRY_DELAY_SECS,
                )
                await asyncio.sleep(_UPSTREAM_RETRY_DELAY_SECS)
            else:
                logger.error(
                    "Upstream connect failed after %d attempts: %s: %s",
                    _MAX_UPSTREAM_RETRIES, type(exc).__name__, exc,
                )
    return None, last_exc


def _upstream_error_response(status_code: int, error_text: str,
                             estimated_tokens: int, ctx_window: int):
    """Translate a non-200 upstream reply into an Anthropic-format error response.

    Context overflow (HTTP 400 whose message mentions both "exceeds" and
    "context") bumps ``session_monitor.overflow_count`` and is reported as 529
    ``overloaded_error``. Other 5xx replies map to 529 ``overloaded_error`` and
    everything else to 400 ``invalid_request_error``.
    """
    error_message = f"Upstream server error (HTTP {status_code})"
    try:
        error_json = json.loads(error_text)
    except ValueError:
        # Not JSON: fall back to a prefix of the raw body, if there is one.
        error_message = error_text[:500] if error_text else error_message
    else:
        upstream_error = error_json.get("error") if isinstance(error_json, dict) else None
        if isinstance(upstream_error, dict):
            error_message = str(upstream_error.get("message") or error_message)
        elif upstream_error is not None:
            error_message = str(upstream_error)

    # Detect context overflow specifically
    lowered = error_message.lower()
    if status_code == 400 and "exceeds" in lowered and "context" in lowered:
        session_monitor.overflow_count += 1
        logger.error(
            "CONTEXT OVERFLOW detected (count=%d). "
            "Estimated input: %d tokens, context window: %d tokens. "
            "Conversation needs pruning or context window increase.",
            session_monitor.overflow_count, estimated_tokens, ctx_window,
        )
        # Return Anthropic-format error that Claude Code can handle
        return _anthropic_error_response(
            "overloaded_error",
            (
                f"Context window exceeded: request requires ~{estimated_tokens} tokens "
                f"but only {ctx_window} are available. "
                f"The conversation is too long. Please start a new session or "
                f"reduce conversation length."
            ),
            529,
        )

    # Generic upstream error -- return as Anthropic error format
    if status_code >= 500:
        return _anthropic_error_response("overloaded_error", error_message, 529)
    return _anthropic_error_response("invalid_request_error", error_message, 400)


@app.post("/v1/messages")
async def messages(request: Request):
    """Handle Anthropic Messages API requests (streaming and non-streaming).

    Integrates context management:
    - Option B: HTTP error handling for upstream 4xx/5xx responses
    - Option C: Conversation pruning when approaching context limits
    - Option E: Smart max_tokens capping (in build_openai_request)
    - Option F: Session-level token monitoring with warnings

    Both the streaming and the non-streaming path retry upstream connection
    failures and translate upstream errors the same way. A request body that
    is not a JSON object is rejected with a 400 ``invalid_request_error``.

    Returns:
        A ``StreamingResponse`` of Anthropic SSE events when ``stream`` is
        truthy, the translated Anthropic message dict otherwise, or a JSON
        error ``Response`` on failure.
    """
    try:
        body = await request.json()
    except ValueError:
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return _anthropic_error_response(
            "invalid_request_error", "Request body is not valid JSON", 400
        )
    if not isinstance(body, dict):
        return _anthropic_error_response(
            "invalid_request_error", "Request body must be a JSON object", 400
        )

    model = body.get("model", "default")
    is_stream = body.get("stream", False)

    # Debug: log request summary
    n_messages = len(body.get("messages") or [])
    n_tools = len(body.get("tools") or [])
    max_tokens = body.get("max_tokens", "unset")
    last_role, last_text = _summarize_last_message(body)
    logger.info(
        "REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
        is_stream, n_messages, n_tools, max_tokens, last_role, last_text
    )

    # --- Option F: Estimate tokens and record in session monitor ---
    estimated_tokens = estimate_total_tokens(body)
    session_monitor.record_request(estimated_tokens)
    session_monitor.log_status()

    # --- Option C: Prune conversation if approaching context limit ---
    ctx_window = session_monitor.context_window
    if ctx_window > 0:
        utilization = estimated_tokens / ctx_window
        if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
            logger.warning(
                "Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
                utilization * 100, PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
            )
            body = prune_conversation(body, ctx_window, target_fraction=0.65)
            session_monitor.prune_count += 1
            # Re-estimate after pruning
            estimated_tokens = estimate_total_tokens(body)
            session_monitor.record_request(estimated_tokens)
            n_messages = len(body.get("messages") or [])
            logger.info(
                "After pruning: ~%d tokens, %d messages",
                estimated_tokens, n_messages,
            )

    openai_body = build_openai_request(body)

    client = http_client
    if client is None:
        return Response(
            content=json.dumps({"error": "Proxy not initialized"}),
            status_code=503,
            media_type="application/json",
        )

    if is_stream:
        openai_body["stream"] = True

    # Retry upstream connection with backoff to handle llama-server restarts
    # gracefully instead of 500-ing to the client.
    resp, last_exc = await _send_upstream_with_retry(
        client, openai_body, stream=bool(is_stream)
    )
    if last_exc is not None:
        return _anthropic_error_response(
            "overloaded_error",
            f"Upstream server unavailable after {_MAX_UPSTREAM_RETRIES} retries: {last_exc}",
            529,
        )

    if is_stream:
        # --- Option B: Check HTTP status before streaming ---
        # llama-server returns 400 for context overflow, 500 for internal errors, etc.
        # Without this check, the proxy would try to stream-translate an error body,
        # producing an empty response that silently kills the agentic loop.
        if resp.status_code != 200:
            try:
                error_body = await resp.aread()
            finally:
                # Release the connection even if reading the body fails.
                await resp.aclose()
            error_text = error_body.decode("utf-8", errors="replace")
            logger.error(
                "Upstream HTTP %d: %s", resp.status_code, error_text[:1000]
            )
            return _upstream_error_response(
                resp.status_code, error_text, estimated_tokens, ctx_window
            )

        return StreamingResponse(
            stream_anthropic_response(resp, model),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
            },
        )

    # Option B: Handle non-streaming errors too
    if resp.status_code != 200:
        error_text = resp.text
        logger.error(
            "Upstream HTTP %d (non-stream): %s", resp.status_code, error_text[:1000]
        )
        return _upstream_error_response(
            resp.status_code, error_text, estimated_tokens, ctx_window
        )

    try:
        openai_resp = resp.json()
    except ValueError:
        logger.error("Upstream returned a non-JSON body: %s", resp.text[:1000])
        return _anthropic_error_response(
            "overloaded_error", "Upstream returned an invalid JSON response", 529
        )
    anthropic_resp = openai_to_anthropic_response(openai_resp, model)

    # Track output tokens in session monitor
    output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
    session_monitor.record_response(output_tokens)

    return anthropic_resp
1219
+
1220
+
1221
@app.post("/anthropic/v1/messages")
async def messages_anthropic(request: Request):
    """Serve the Messages handler under the ``/anthropic`` path prefix.

    Some Claude Code configurations call this path instead of
    ``/v1/messages``. The request is passed unchanged to ``messages`` and
    its result is returned as-is.
    """
    response = await messages(request)
    return response
1225
+
1226
+
1227
@app.get("/v1/models")
async def models():
    """List the model IDs this proxy advertises.

    The entries are fixed Anthropic model IDs, spoofed for client
    compatibility.
    """
    advertised_ids = (
        "claude-sonnet-4-20250514",
        "claude-3-5-sonnet-20241022",
    )
    catalogue = [{"id": model_id, "object": "model"} for model_id in advertised_ids]
    return {"data": catalogue}
1236
+
1237
+
1238
@app.get("/health")
async def health():
    """Health check endpoint for monitoring and load balancers.

    Probes ``<upstream root>/health``, where the upstream root is
    ``LLAMA_CPP_BASE`` with any trailing slash and a trailing ``/v1`` removed.
    Only a trailing ``/v1`` is stripped, so a ``/v1`` elsewhere in the URL
    is left alone and a base without ``/v1`` still gets ``/health`` appended.

    Returns:
        A dict with ``status`` (``"ok"`` or ``"degraded"``), ``proxy``
        (always ``"ok"``), ``upstream`` (``"ok"`` or ``"unreachable"``) and
        ``upstream_url``. The probe is best-effort: any failure is reported
        as ``"unreachable"`` rather than raised.
    """
    upstream_ok = False
    if http_client:
        upstream_root = LLAMA_CPP_BASE.rstrip("/")
        if upstream_root.endswith("/v1"):
            upstream_root = upstream_root[: -len("/v1")]
        try:
            resp = await http_client.get(f"{upstream_root}/health", timeout=5.0)
            upstream_ok = resp.status_code == 200
        except Exception as exc:
            # Deliberately broad: a health probe must never fail the endpoint.
            logger.debug(
                "Upstream health probe failed: %s: %s", type(exc).__name__, exc
            )

    return {
        "status": "ok" if upstream_ok else "degraded",
        "proxy": "ok",
        "upstream": "ok" if upstream_ok else "unreachable",
        "upstream_url": LLAMA_CPP_BASE,
    }
1258
+
1259
+
1260
@app.get("/v1/context")
async def context_status():
    """Option F: report the session monitor's view of context-window usage.

    The payload combines counters read from ``session_monitor`` with its
    warning level, estimated remaining turns, utilization (as a rounded
    fraction and as a percentage string), the configured prune threshold and
    the last ten entries of the context history.
    """
    monitor = session_monitor
    level = monitor.get_warning_level()
    remaining_turns = monitor.estimate_turns_remaining()

    report = {
        field: getattr(monitor, field)
        for field in (
            "context_window",
            "last_input_tokens",
            "last_output_tokens",
            "peak_input_tokens",
        )
    }
    report["utilization"] = round(monitor.get_utilization(), 4)
    report["utilization_pct"] = f"{monitor.get_utilization() * 100:.1f}%"
    report["warning_level"] = level
    report["estimated_turns_remaining"] = remaining_turns
    for field in ("total_requests", "prune_count", "overflow_count"):
        report[field] = getattr(monitor, field)
    report["prune_threshold"] = PROXY_CONTEXT_PRUNE_THRESHOLD
    report["recent_history"] = monitor.context_history[-10:]
    return report
1285
+
1286
+
1287
+ # ===========================================================================
1288
+ # Entry Point
1289
+ # ===========================================================================
1290
+
1291
if __name__ == "__main__":
    # Run the proxy under uvicorn when this file is executed as a script;
    # bind address, port and log level come from the PROXY_* settings.
    server_options = {
        "host": PROXY_HOST,
        "port": PROXY_PORT,
        "log_level": PROXY_LOG_LEVEL.lower(),
    }
    uvicorn.run(app, **server_options)