@miller-tech/uap 1.13.6 → 1.13.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/README.md +20 -0
  2. package/config/chat_template.jinja +126 -44
  3. package/config/model-profiles/qwen35.json +3 -3
  4. package/dist/.tsbuildinfo +1 -1
  5. package/dist/benchmarks/token-throughput.d.ts +259 -0
  6. package/dist/benchmarks/token-throughput.d.ts.map +1 -0
  7. package/dist/benchmarks/token-throughput.js +198 -0
  8. package/dist/benchmarks/token-throughput.js.map +1 -0
  9. package/dist/bin/cli.js +12 -0
  10. package/dist/bin/cli.js.map +1 -1
  11. package/dist/bin/llama-server-optimize.js +0 -0
  12. package/dist/bin/policy.js +0 -0
  13. package/dist/cli/dashboard.d.ts.map +1 -1
  14. package/dist/cli/dashboard.js +10 -20
  15. package/dist/cli/dashboard.js.map +1 -1
  16. package/dist/cli/init.d.ts.map +1 -1
  17. package/dist/cli/init.js +5 -0
  18. package/dist/cli/init.js.map +1 -1
  19. package/dist/cli/memory.d.ts.map +1 -1
  20. package/dist/cli/memory.js +9 -18
  21. package/dist/cli/memory.js.map +1 -1
  22. package/dist/cli/worktree.d.ts +4 -1
  23. package/dist/cli/worktree.d.ts.map +1 -1
  24. package/dist/cli/worktree.js +73 -1
  25. package/dist/cli/worktree.js.map +1 -1
  26. package/dist/coordination/adaptive-patterns.d.ts +3 -1
  27. package/dist/coordination/adaptive-patterns.d.ts.map +1 -1
  28. package/dist/coordination/adaptive-patterns.js +31 -3
  29. package/dist/coordination/adaptive-patterns.js.map +1 -1
  30. package/dist/dashboard/data-service.d.ts +44 -0
  31. package/dist/dashboard/data-service.d.ts.map +1 -1
  32. package/dist/dashboard/data-service.js +326 -17
  33. package/dist/dashboard/data-service.js.map +1 -1
  34. package/dist/memory/embeddings.d.ts.map +1 -1
  35. package/dist/memory/embeddings.js +1 -1
  36. package/dist/memory/embeddings.js.map +1 -1
  37. package/dist/models/router.js +1 -1
  38. package/dist/models/router.js.map +1 -1
  39. package/dist/models/types.d.ts +12 -12
  40. package/dist/models/types.js +13 -13
  41. package/dist/models/types.js.map +1 -1
  42. package/dist/policies/schemas/policy.d.ts +13 -13
  43. package/dist/policies/schemas/policy.js +1 -1
  44. package/dist/policies/schemas/policy.js.map +1 -1
  45. package/dist/tasks/coordination.js +1 -1
  46. package/dist/tasks/coordination.js.map +1 -1
  47. package/dist/types/config.d.ts +24 -24
  48. package/package.json +1 -1
  49. package/templates/hooks/session-start.sh +49 -48
  50. package/tools/agents/install-opencode-local.sh.j2 +57 -7
  51. package/tools/agents/opencode_uap_agent.py +63 -1
  52. package/tools/agents/scripts/__pycache__/anthropic_proxy.cpython-313.pyc +0 -0
  53. package/tools/agents/scripts/__pycache__/tool_call_wrapper.cpython-313.pyc +0 -0
  54. package/tools/agents/scripts/anthropic_proxy.py +759 -12
  55. package/tools/agents/scripts/tool_call_wrapper.py +9 -5
@@ -21,6 +21,9 @@ Key Features
21
21
  - Granular timeouts (short connect, long read for LLM generation)
22
22
  - Graceful error recovery on upstream connection drops
23
23
  - Proper upstream cleanup on client disconnect
24
+ - Context window overflow protection with conversation pruning
25
+ - Smart max_tokens capping to prevent next-turn overflow
26
+ - Session-level token monitoring with warnings
24
27
 
25
28
  Configuration (Environment Variables)
26
29
  --------------------------------------
@@ -42,6 +45,14 @@ Configuration (Environment Variables)
42
45
  PROXY_MAX_CONNECTIONS Max concurrent connections to upstream
43
46
  Default: 20
44
47
 
48
+ PROXY_CONTEXT_WINDOW Override context window size (auto-detected from
49
+ upstream /slots endpoint if not set)
50
+ Default: 0 (auto-detect)
51
+
52
+ PROXY_CONTEXT_PRUNE_THRESHOLD Fraction of context window at which
53
+ conversation pruning activates (0.0-1.0)
54
+ Default: 0.75
55
+
45
56
  Usage
46
57
  -----
47
58
  # Basic usage (connects to llama.cpp on default port):
@@ -71,6 +82,7 @@ import os
71
82
  import sys
72
83
  import time
73
84
  import uuid
85
+ from dataclasses import dataclass, field
74
86
 
75
87
  import httpx
76
88
  from contextlib import asynccontextmanager
@@ -87,6 +99,8 @@ PROXY_HOST = os.environ.get("PROXY_HOST", "0.0.0.0")
87
99
  PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
88
100
  PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
89
101
  PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
102
+ PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
103
+ PROXY_CONTEXT_PRUNE_THRESHOLD = float(os.environ.get("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75"))
90
104
 
91
105
  # ---------------------------------------------------------------------------
92
106
  # Logging
@@ -98,6 +112,352 @@ logging.basicConfig(
98
112
  )
99
113
  logger = logging.getLogger("uap.anthropic_proxy")
100
114
 
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # Option F: Session-level Context Window Monitor
118
+ # ---------------------------------------------------------------------------
119
+ @dataclass
120
+ class SessionMonitor:
121
+ """Tracks token usage across the session to provide early warnings
122
+ and enable proactive context management before overflow occurs."""
123
+
124
+ context_window: int = 0 # Auto-detected or configured
125
+ total_requests: int = 0
126
+ last_input_tokens: int = 0 # Estimated input tokens of last request
127
+ last_output_tokens: int = 0 # Actual output tokens of last response
128
+ peak_input_tokens: int = 0 # High-water mark
129
+ prune_count: int = 0 # How many times pruning was triggered
130
+ overflow_count: int = 0 # How many context overflow errors caught
131
+ context_history: list = field(default_factory=list) # Recent token counts
132
+
133
+ def record_request(self, estimated_tokens: int):
134
+ """Record an outgoing request's estimated token count."""
135
+ self.total_requests += 1
136
+ self.last_input_tokens = estimated_tokens
137
+ if estimated_tokens > self.peak_input_tokens:
138
+ self.peak_input_tokens = estimated_tokens
139
+ self.context_history.append(estimated_tokens)
140
+ # Keep last 50 entries
141
+ if len(self.context_history) > 50:
142
+ self.context_history = self.context_history[-50:]
143
+
144
+ def record_response(self, output_tokens: int):
145
+ """Record a response's output token count."""
146
+ self.last_output_tokens = output_tokens
147
+
148
+ def get_utilization(self) -> float:
149
+ """Get current context utilization as a fraction (0.0 - 1.0)."""
150
+ if self.context_window <= 0:
151
+ return 0.0
152
+ return self.last_input_tokens / self.context_window
153
+
154
+ def get_warning_level(self) -> str | None:
155
+ """Return warning level based on context utilization.
156
+ Returns None if no warning needed."""
157
+ util = self.get_utilization()
158
+ if util >= 0.95:
159
+ return "CRITICAL"
160
+ elif util >= 0.85:
161
+ return "HIGH"
162
+ elif util >= 0.75:
163
+ return "ELEVATED"
164
+ return None
165
+
166
+ def estimate_turns_remaining(self) -> int | None:
167
+ """Estimate how many more agentic turns can fit before overflow."""
168
+ if self.context_window <= 0 or len(self.context_history) < 2:
169
+ return None
170
+ # Average growth per turn from recent history
171
+ deltas = [
172
+ self.context_history[i] - self.context_history[i - 1]
173
+ for i in range(1, len(self.context_history))
174
+ if self.context_history[i] > self.context_history[i - 1]
175
+ ]
176
+ if not deltas:
177
+ return None
178
+ avg_growth = sum(deltas) / len(deltas)
179
+ if avg_growth <= 0:
180
+ return None
181
+ remaining_tokens = self.context_window - self.last_input_tokens
182
+ return max(0, int(remaining_tokens / avg_growth))
183
+
184
+ def log_status(self):
185
+ """Log current session status."""
186
+ util = self.get_utilization()
187
+ warning = self.get_warning_level()
188
+ turns = self.estimate_turns_remaining()
189
+ turns_str = f"~{turns} turns remaining" if turns is not None else "unknown"
190
+
191
+ if warning == "CRITICAL":
192
+ logger.error(
193
+ "CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
194
+ self.last_input_tokens, self.context_window, util * 100,
195
+ turns_str, self.prune_count, self.overflow_count,
196
+ )
197
+ elif warning == "HIGH":
198
+ logger.warning(
199
+ "CONTEXT HIGH: %d/%d tokens (%.1f%%), %s, pruned=%d",
200
+ self.last_input_tokens, self.context_window, util * 100,
201
+ turns_str, self.prune_count,
202
+ )
203
+ elif warning == "ELEVATED":
204
+ logger.warning(
205
+ "CONTEXT ELEVATED: %d/%d tokens (%.1f%%), %s",
206
+ self.last_input_tokens, self.context_window, util * 100,
207
+ turns_str,
208
+ )
209
+ else:
210
+ logger.info(
211
+ "CONTEXT: %d/%d tokens (%.1f%%), %s",
212
+ self.last_input_tokens, self.context_window, util * 100,
213
+ turns_str,
214
+ )
215
+
216
+
217
+ session_monitor = SessionMonitor()
218
+
219
+
220
+ # ---------------------------------------------------------------------------
221
+ # Context Window Detection
222
+ # ---------------------------------------------------------------------------
223
async def detect_context_window(client: httpx.AsyncClient) -> int:
    """Determine the context window size (in tokens) to use for this session.

    Resolution order:
      1. PROXY_CONTEXT_WINDOW, when set to a positive value (explicit override).
      2. The ``n_ctx`` field of the first entry returned by the upstream
         ``/slots`` endpoint (llama.cpp).
      3. A fixed default of 131072 tokens.

    Args:
        client: HTTP client used to query the upstream server.

    Returns:
        The context window size in tokens; always a positive integer.
    """
    if PROXY_CONTEXT_WINDOW > 0:
        logger.info("Using configured context window: %d tokens", PROXY_CONTEXT_WINDOW)
        return PROXY_CONTEXT_WINDOW

    # Strip only a trailing "/v1" from the base URL. A blanket
    # str.replace("/v1", "/slots") would also rewrite "/v1" appearing earlier
    # in the URL, and would leave the URL untouched if the base has no "/v1".
    server_root = LLAMA_CPP_BASE.rstrip("/").removesuffix("/v1")
    slots_url = f"{server_root}/slots"

    # Best-effort probe: any failure here falls through to the default.
    try:
        resp = await client.get(slots_url, timeout=5.0)
        if resp.status_code == 200:
            slots = resp.json()
            if slots and isinstance(slots, list) and isinstance(slots[0], dict):
                n_ctx = slots[0].get("n_ctx", 0)
                # bool is an int subclass; reject it so True is never taken as 1.
                if isinstance(n_ctx, int) and not isinstance(n_ctx, bool) and n_ctx > 0:
                    logger.info(
                        "Auto-detected context window from upstream: %d tokens (%d slots)",
                        n_ctx, len(slots),
                    )
                    return n_ctx
            logger.warning("Upstream %s returned no usable n_ctx value", slots_url)
        else:
            logger.warning(
                "Upstream %s returned HTTP %d; cannot auto-detect context window",
                slots_url, resp.status_code,
            )
    except Exception as exc:
        logger.warning("Failed to auto-detect context window: %s", exc)

    # Safe default: 128K (common for modern models)
    default = 131072
    logger.warning("Using default context window: %d tokens", default)
    return default
253
+
254
+
255
+ # ---------------------------------------------------------------------------
256
+ # Option C: Conversation Pruning
257
+ # ---------------------------------------------------------------------------
258
# Heuristic characters-per-token ratio. English prose averages ~4 chars/token,
# while tool-call JSON and code are denser (~3.2 chars/token); 3.5 sits between.
CHARS_PER_TOKEN = 3.5


def estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* from its character length.

    The result is never less than 1, even for an empty string.
    """
    approx = int(len(text) / CHARS_PER_TOKEN)
    return approx if approx > 1 else 1
266
+
267
+
268
def estimate_message_tokens(msg: dict) -> int:
    """Approximate the token cost of one Anthropic-format message.

    Starts from a flat 4-token overhead and adds the estimate for string
    content, or for each text / tool_use / tool_result block in list content.
    Blocks of any other type contribute nothing.
    """
    total = 4  # Message overhead (role, separators)
    payload = msg.get("content", "")

    if isinstance(payload, str):
        return total + estimate_tokens(payload)
    if not isinstance(payload, list):
        return total

    for item in payload:
        if isinstance(item, str):
            total += estimate_tokens(item)
            continue
        if not isinstance(item, dict):
            continue
        kind = item.get("type")
        if kind == "text":
            total += estimate_tokens(item.get("text", ""))
        elif kind == "tool_use":
            # Both the tool name and its serialized arguments are counted.
            total += estimate_tokens(item.get("name", ""))
            total += estimate_tokens(json.dumps(item.get("input", {})))
        elif kind == "tool_result":
            total += estimate_tokens(_extract_text(item.get("content", "")))
    return total
287
+
288
+
289
def estimate_total_tokens(anthropic_body: dict) -> int:
    """Approximate the total input tokens of an Anthropic Messages request.

    Sums four parts: the system prompt (string or list of text blocks), the
    agentic supplement that is always injected, every message, and the
    JSON-serialized tool definitions when any are present.
    """
    system = anthropic_body.get("system", "")
    if isinstance(system, str):
        system_tokens = estimate_tokens(system)
    elif isinstance(system, list):
        system_tokens = sum(
            estimate_tokens(part.get("text", ""))
            for part in system
            if isinstance(part, dict) and part.get("type") == "text"
        )
    else:
        system_tokens = 0

    # Agentic supplement tokens (always injected)
    supplement_tokens = estimate_tokens(_AGENTIC_SYSTEM_SUPPLEMENT)

    message_tokens = sum(
        estimate_message_tokens(entry)
        for entry in anthropic_body.get("messages", [])
    )

    tool_defs = anthropic_body.get("tools", [])
    tool_tokens = estimate_tokens(json.dumps(tool_defs)) if tool_defs else 0

    return system_tokens + supplement_tokens + message_tokens + tool_tokens
315
+
316
+
317
def _build_prune_marker(removed_count: int, removed_tokens: int) -> dict:
    """Build the user-role notice telling the model that history was trimmed."""
    return {
        "role": "user",
        "content": (
            f"[CONTEXT PRUNED: {removed_count} older messages (~{removed_tokens} tokens) "
            f"were removed to fit within the context window. "
            f"The conversation continues from recent context below.]"
        ),
    }


def prune_conversation(anthropic_body: dict, context_window: int, target_fraction: float = 0.65) -> dict:
    """Prune the conversation to fit within the context window.

    Strategy:
      - Always keep: system prompt, first user message, last 8 messages.
      - Drop from the middle: tool_result messages first, then assistant
        messages, then user messages; within a category, oldest first.
      - Whenever messages are dropped, insert a [CONTEXT PRUNED] marker after
        the first message so the model knows history was trimmed. This also
        applies when the protected messages alone exceed the budget and the
        whole middle is discarded.

    The body is modified in place and also returned. Conversations of four
    messages or fewer, or requests whose system prompt and tools alone exceed
    the target, are returned unchanged.

    NOTE(review): pruning works per message and does not keep tool_use /
    tool_result pairs together; confirm the upstream chat template tolerates
    a tool_result whose originating tool_use was removed.

    Args:
        anthropic_body: The full Anthropic request body.
        context_window: Maximum context window in tokens.
        target_fraction: Target utilization after pruning (0.0-1.0).

    Returns:
        The same ``anthropic_body`` object with ``messages`` possibly replaced.
    """
    messages = anthropic_body.get("messages", [])
    if len(messages) <= 4:
        # Too few messages to prune meaningfully
        return anthropic_body

    target_tokens = int(context_window * target_fraction)

    # Estimate non-message tokens (system, tools, agentic supplement)
    overhead_tokens = 0
    system = anthropic_body.get("system", "")
    if isinstance(system, str):
        overhead_tokens += estimate_tokens(system)
    elif isinstance(system, list):
        for block in system:
            if isinstance(block, dict) and block.get("type") == "text":
                overhead_tokens += estimate_tokens(block.get("text", ""))
    overhead_tokens += estimate_tokens(_AGENTIC_SYSTEM_SUPPLEMENT)
    tools = anthropic_body.get("tools", [])
    if tools:
        overhead_tokens += estimate_tokens(json.dumps(tools))

    # Budget for messages
    message_budget = target_tokens - overhead_tokens
    if message_budget <= 0:
        logger.error("System prompt + tools alone exceed target budget!")
        return anthropic_body

    # Always keep the first user message and the last N messages
    KEEP_LAST = 8  # Keep the last 8 messages (recent context)
    protected_head = messages[:1]  # First user message
    protected_tail = messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
    middle = messages[1:-KEEP_LAST] if len(messages) > KEEP_LAST + 1 else []

    # Calculate tokens for protected messages
    protected_tokens = sum(estimate_message_tokens(m) for m in protected_head + protected_tail)

    if protected_tokens >= message_budget:
        # Even protected messages exceed budget -- truncate tool_result content
        # in the tail to fit
        logger.warning(
            "Protected messages (%d tokens) exceed budget (%d) -- truncating tool results",
            protected_tokens, message_budget,
        )
        for msg in protected_tail:
            content = msg.get("content", [])
            if isinstance(content, list):
                for block in content:
                    if isinstance(block, dict) and block.get("type") == "tool_result":
                        result_text = _extract_text(block.get("content", ""))
                        if len(result_text) > 2000:
                            # Keep the start and end, which usually carry the
                            # most useful parts of command or file output.
                            block["content"] = result_text[:1000] + "\n...[TRUNCATED]...\n" + result_text[-500:]

        if middle:
            # The whole middle is discarded here; tell the model and the log,
            # exactly as the normal pruning path does.
            dropped_tokens = sum(estimate_message_tokens(m) for m in middle)
            anthropic_body["messages"] = (
                protected_head
                + [_build_prune_marker(len(middle), dropped_tokens)]
                + protected_tail
            )
            logger.warning(
                "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
                "target=%.0f%% of %d ctx",
                len(middle), dropped_tokens, len(anthropic_body["messages"]),
                target_fraction * 100, context_window,
            )
        else:
            anthropic_body["messages"] = protected_head + protected_tail
        return anthropic_body

    remaining_budget = message_budget - protected_tokens

    # Score middle messages for removal priority:
    #   - tool_result messages: remove first (biggest, least important historically)
    #   - assistant messages: remove second
    #   - user messages: remove last (provide context for the model's actions)
    # Within each category, remove oldest first.
    scored_middle = []
    for i, msg in enumerate(middle):
        content = msg.get("content", [])
        tokens = estimate_message_tokens(msg)
        is_tool_result = isinstance(content, list) and any(
            isinstance(b, dict) and b.get("type") == "tool_result"
            for b in content
        )

        # Lower priority = removed first
        if is_tool_result:
            priority = 0
        elif msg.get("role") == "assistant":
            priority = 1
        else:
            priority = 2

        scored_middle.append((priority, i, tokens, msg))

    # Sort by priority (ascending = remove first), then by index (oldest first)
    scored_middle.sort(key=lambda entry: (entry[0], entry[1]))

    # Greedily keep messages, most valuable and newest first, while they fit.
    # A message that does not fit is skipped; smaller later ones may still fit.
    kept_middle = []
    used_tokens = 0
    for _priority, idx, tokens, msg in reversed(scored_middle):
        if used_tokens + tokens <= remaining_budget:
            kept_middle.append((idx, msg))
            used_tokens += tokens

    # Sort kept messages back into original order
    kept_middle.sort(key=lambda entry: entry[0])
    kept_msgs = [m for _, m in kept_middle]

    removed_count = len(middle) - len(kept_msgs)
    removed_tokens = sum(t for _, _, t, _ in scored_middle) - used_tokens

    if removed_count > 0:
        anthropic_body["messages"] = (
            protected_head
            + [_build_prune_marker(removed_count, removed_tokens)]
            + kept_msgs
            + protected_tail
        )
        logger.warning(
            "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
            "target=%.0f%% of %d ctx",
            removed_count, removed_tokens, len(anthropic_body["messages"]),
            target_fraction * 100, context_window,
        )
    else:
        anthropic_body["messages"] = protected_head + kept_msgs + protected_tail

    return anthropic_body
459
+
460
+
101
461
  # ---------------------------------------------------------------------------
102
462
  # HTTP Client Lifecycle
103
463
  # ---------------------------------------------------------------------------
@@ -127,6 +487,15 @@ async def lifespan(app: FastAPI):
127
487
  "Proxy started: listening on %s:%d -> upstream %s",
128
488
  PROXY_HOST, PROXY_PORT, LLAMA_CPP_BASE,
129
489
  )
490
+
491
+ # Auto-detect context window from upstream server
492
+ session_monitor.context_window = await detect_context_window(http_client)
493
+ logger.info(
494
+ "Context window: %d tokens, prune threshold: %.0f%%",
495
+ session_monitor.context_window,
496
+ PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
497
+ )
498
+
130
499
  yield
131
500
  await http_client.aclose()
132
501
  http_client = None
@@ -219,6 +588,20 @@ def _extract_text(content) -> str:
219
588
  return str(content)
220
589
 
221
590
 
591
+ _AGENTIC_SYSTEM_SUPPLEMENT = (
592
+ "\n\n<agentic-protocol>\n"
593
+ "You are operating in an agentic coding loop with tool access. Follow these rules:\n"
594
+ "1. ALWAYS use tools to read, edit, write, and test code. Never just describe or explain what should be done.\n"
595
+ "2. After reading files and identifying an issue, proceed IMMEDIATELY to make the fix using Edit/Write tools. Do NOT stop after explaining the problem.\n"
596
+ "3. After making changes, run the relevant tests or build commands to verify your fix.\n"
597
+ "4. Only produce a final text response WITHOUT tool calls when the ENTIRE task is fully complete, verified, and you have nothing left to do.\n"
598
+ "5. If you have identified a problem but have not yet fixed it, you MUST call a tool to make the fix. Do NOT summarize the issue and stop.\n"
599
+ "6. When the user asks you to do something, DO it with tools. Do not ask for permission or confirmation.\n"
600
+ "7. If a tool call fails, analyze the error and try a different approach. Do not give up after one failure.\n"
601
+ "</agentic-protocol>"
602
+ )
603
+
604
+
222
605
  def build_openai_request(anthropic_body: dict) -> dict:
223
606
  """Build an OpenAI Chat Completions request from an Anthropic Messages request."""
224
607
  openai_body = {
@@ -227,8 +610,51 @@ def build_openai_request(anthropic_body: dict) -> dict:
227
610
  "stream": anthropic_body.get("stream", False),
228
611
  }
229
612
 
613
+ # Inject agentic protocol instructions into the system message so
614
+ # the model knows it must use tools to complete work, not just explain.
615
+ if openai_body["messages"] and openai_body["messages"][0].get("role") == "system":
616
+ openai_body["messages"][0]["content"] += _AGENTIC_SYSTEM_SUPPLEMENT
617
+ else:
618
+ # No system message from the client; inject one.
619
+ openai_body["messages"].insert(0, {
620
+ "role": "system",
621
+ "content": _AGENTIC_SYSTEM_SUPPLEMENT.strip(),
622
+ })
623
+
230
624
  if "max_tokens" in anthropic_body:
231
- openai_body["max_tokens"] = anthropic_body["max_tokens"]
625
+ # Enforce minimum floor for thinking mode: model needs tokens for
626
+ # reasoning (<think>...</think>) plus the actual response/tool calls.
627
+ # Claude Code typically sends 4096-8192 which is too low for thinking.
628
+ requested_max = max(anthropic_body["max_tokens"], 16384)
629
+
630
+ # Option E: Smart max_tokens capping — prevent the response from
631
+ # consuming so many tokens that the NEXT turn's input won't fit.
632
+ # Formula: max_tokens = min(requested, context_window - input_tokens - safety_margin)
633
+ # This ensures the model's output + current input stays within bounds,
634
+ # leaving room for the next turn's incremental growth.
635
+ ctx_window = session_monitor.context_window
636
+ if ctx_window > 0:
637
+ estimated_input = estimate_total_tokens(anthropic_body)
638
+ # Reserve 15% of context for next-turn growth (tool results, etc.)
639
+ safety_margin = int(ctx_window * 0.15)
640
+ available_for_output = ctx_window - estimated_input - safety_margin
641
+ if available_for_output < requested_max and available_for_output > 1024:
642
+ logger.info(
643
+ "MAX_TOKENS capped: %d -> %d (ctx=%d, input~%d, margin=%d)",
644
+ requested_max, available_for_output,
645
+ ctx_window, estimated_input, safety_margin,
646
+ )
647
+ requested_max = available_for_output
648
+ elif available_for_output <= 1024:
649
+ # Very tight on space -- allow minimum but warn
650
+ logger.warning(
651
+ "MAX_TOKENS: only %d tokens available for output (ctx=%d, input~%d). "
652
+ "Response may be truncated.",
653
+ available_for_output, ctx_window, estimated_input,
654
+ )
655
+ requested_max = max(1024, available_for_output)
656
+
657
+ openai_body["max_tokens"] = requested_max
232
658
  if "temperature" in anthropic_body:
233
659
  openai_body["temperature"] = anthropic_body["temperature"]
234
660
  if "top_p" in anthropic_body:
@@ -249,9 +675,62 @@ def build_openai_request(anthropic_body: dict) -> dict:
249
675
  },
250
676
  })
251
677
 
678
+ # Smart tool_choice: force tool calls during the agentic loop to
679
+ # prevent the model from producing text-only end_turn responses that
680
+ # prematurely stop the loop. The model can still produce text alongside
681
+ # tool calls when tool_choice="required".
682
+ #
683
+ # Force "required" when:
684
+ # - More than 1 message (conversation is in progress)
685
+ # - Last assistant was text-only (would cause premature stop)
686
+ # - OR conversation has tool_result messages (active agentic loop)
687
+ n_msgs = len(anthropic_body.get("messages", []))
688
+ has_tool_results = any(
689
+ isinstance(m.get("content"), list) and any(
690
+ isinstance(b, dict) and b.get("type") == "tool_result"
691
+ for b in m.get("content", [])
692
+ )
693
+ for m in anthropic_body.get("messages", [])
694
+ )
695
+ if _last_assistant_was_text_only(anthropic_body):
696
+ openai_body["tool_choice"] = "required"
697
+ logger.info("tool_choice forced to 'required' (last assistant was text-only)")
698
+ elif has_tool_results and n_msgs > 2:
699
+ openai_body["tool_choice"] = "required"
700
+ logger.info("tool_choice forced to 'required' (active agentic loop with tool results)")
701
+
252
702
  return openai_body
253
703
 
254
704
 
705
+ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
706
+ """Check if the last assistant message in the conversation was text-only
707
+ (no tool_use blocks). This indicates the model may be prematurely ending
708
+ the agentic loop by explaining instead of acting."""
709
+ messages = anthropic_body.get("messages", [])
710
+ # Walk backwards to find the last assistant message
711
+ for msg in reversed(messages):
712
+ if msg.get("role") != "assistant":
713
+ continue
714
+ content = msg.get("content")
715
+ if isinstance(content, str):
716
+ # Pure text assistant message -- text-only
717
+ return bool(content.strip())
718
+ if isinstance(content, list):
719
+ has_tool_use = any(
720
+ isinstance(b, dict) and b.get("type") == "tool_use"
721
+ for b in content
722
+ )
723
+ has_text = any(
724
+ (isinstance(b, dict) and b.get("type") == "text" and b.get("text", "").strip())
725
+ or isinstance(b, str)
726
+ for b in content
727
+ )
728
+ # Text-only if there's text but no tool_use
729
+ return has_text and not has_tool_use
730
+ return False
731
+ return False
732
+
733
+
255
734
  # ===========================================================================
256
735
  # Response Translation: OpenAI -> Anthropic
257
736
  # ===========================================================================
@@ -339,6 +818,8 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
339
818
  # Track tool call state for streaming tool_calls
340
819
  tool_calls_by_index: dict[int, dict] = {}
341
820
  tool_block_index = 1 # anthropic block index (0 = text)
821
+ text_chunks: list[str] = [] # accumulate text for logging
822
+ reasoning_chunks: list[str] = [] # accumulate reasoning for fallback
342
823
 
343
824
  try:
344
825
  async for line in openai_stream.aiter_lines():
@@ -355,9 +836,16 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
355
836
  choice = (chunk.get("choices") or [{}])[0]
356
837
  delta = choice.get("delta", {})
357
838
 
839
+ # Collect reasoning_content (normally stripped; used as fallback
840
+ # if the model produces only reasoning with no visible output)
841
+ reasoning = delta.get("reasoning_content", "")
842
+ if reasoning:
843
+ reasoning_chunks.append(reasoning)
844
+
358
845
  # Handle text content deltas
359
846
  if delta.get("content"):
360
847
  output_tokens += 1 # rough token estimate
848
+ text_chunks.append(delta["content"])
361
849
  yield (
362
850
  f"event: content_block_delta\n"
363
851
  f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': delta['content']}})}\n\n"
@@ -372,10 +860,11 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
372
860
  # New tool call starting
373
861
  tc_id = tc_delta.get("id", f"toolu_{uuid.uuid4().hex[:12]}")
374
862
  fn = tc_delta.get("function", {})
863
+ initial_args = fn.get("arguments", "")
375
864
  tool_calls_by_index[tc_idx] = {
376
865
  "id": tc_id,
377
866
  "name": fn.get("name", ""),
378
- "arguments": fn.get("arguments", ""),
867
+ "arguments": initial_args,
379
868
  "block_index": tool_block_index,
380
869
  }
381
870
 
@@ -391,6 +880,18 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
391
880
  f"event: content_block_start\n"
392
881
  f"data: {json.dumps({'type': 'content_block_start', 'index': tool_block_index, 'content_block': {'type': 'tool_use', 'id': tc_id, 'name': fn.get('name', '')}})}\n\n"
393
882
  )
883
+
884
+ # Emit initial arguments fragment (e.g. "{") that
885
+ # arrives with the first tool_call chunk. Without
886
+ # this the opening brace is swallowed and the client
887
+ # receives invalid JSON like "command":"ls"} instead
888
+ # of {"command":"ls"}.
889
+ if initial_args:
890
+ yield (
891
+ f"event: content_block_delta\n"
892
+ f"data: {json.dumps({'type': 'content_block_delta', 'index': tool_block_index, 'delta': {'type': 'input_json_delta', 'partial_json': initial_args}})}\n\n"
893
+ )
894
+
394
895
  tool_block_index += 1
395
896
  else:
396
897
  # Continuation: argument chunks
@@ -406,6 +907,11 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
406
907
 
407
908
  if choice.get("finish_reason"):
408
909
  fr = choice["finish_reason"]
910
+ if fr == "length":
911
+ logger.warning(
912
+ "Response truncated by token limit (finish_reason=length). "
913
+ "Consider increasing --n-predict or max_tokens."
914
+ )
409
915
  finish_reason = {
410
916
  "stop": "end_turn",
411
917
  "length": "max_tokens",
@@ -433,11 +939,40 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
433
939
  f"data: {json.dumps({'type': 'content_block_stop', 'index': tc['block_index']})}\n\n"
434
940
  )
435
941
  else:
942
+ # Option E: If the response has no text AND no tool calls, but the
943
+ # model produced reasoning_content, forward the reasoning as visible
944
+ # text so the client doesn't receive a completely empty turn.
945
+ accumulated_text = "".join(text_chunks)
946
+ if not accumulated_text and reasoning_chunks:
947
+ fallback_text = "".join(reasoning_chunks)
948
+ logger.warning(
949
+ "Empty response with %d reasoning tokens – forwarding reasoning as fallback text",
950
+ len(reasoning_chunks),
951
+ )
952
+ text_chunks.append(fallback_text)
953
+ yield (
954
+ f"event: content_block_delta\n"
955
+ f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': fallback_text}})}\n\n"
956
+ )
957
+
436
958
  yield (
437
959
  f"event: content_block_stop\n"
438
960
  f"data: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
439
961
  )
440
962
 
963
+ # Log response summary
964
+ accumulated_text = "".join(text_chunks)
965
+ tc_names = [tc["name"] for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
966
+ tc_args = [tc.get("arguments", "") for tc in tool_calls_by_index.values()] if tool_calls_by_index else []
967
+ logger.info(
968
+ "RESP: finish=%s output_tokens=%d text_len=%d text=%.300s tool_calls=%s args=%s",
969
+ finish_reason, output_tokens,
970
+ len(accumulated_text),
971
+ accumulated_text[:300],
972
+ tc_names,
973
+ [a[:200] for a in tc_args],
974
+ )
975
+
441
976
  # message_delta with final stop reason
442
977
  yield (
443
978
  f"event: message_delta\n"
@@ -454,10 +989,61 @@ async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
454
989
 
455
990
  @app.post("/v1/messages")
456
991
  async def messages(request: Request):
457
- """Handle Anthropic Messages API requests (streaming and non-streaming)."""
992
+ """Handle Anthropic Messages API requests (streaming and non-streaming).
993
+
994
+ Integrates context management:
995
+ - Option B: HTTP error handling for upstream 4xx/5xx responses
996
+ - Option C: Conversation pruning when approaching context limits
997
+ - Option E: Smart max_tokens capping (in build_openai_request)
998
+ - Option F: Session-level token monitoring with warnings
999
+ """
458
1000
  body = await request.json()
459
1001
  model = body.get("model", "default")
460
1002
  is_stream = body.get("stream", False)
1003
+
1004
+ # Debug: log request summary
1005
+ n_messages = len(body.get("messages", []))
1006
+ n_tools = len(body.get("tools", []))
1007
+ max_tokens = body.get("max_tokens", "unset")
1008
+ last_msg = body.get("messages", [{}])[-1]
1009
+ last_role = last_msg.get("role", "?")
1010
+ last_content = last_msg.get("content", "")
1011
+ if isinstance(last_content, list):
1012
+ last_text = next((b.get("text", "") for b in last_content if b.get("type") == "text"), "")[:200]
1013
+ elif isinstance(last_content, str):
1014
+ last_text = last_content[:200]
1015
+ else:
1016
+ last_text = str(last_content)[:200]
1017
+ logger.info(
1018
+ "REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
1019
+ is_stream, n_messages, n_tools, max_tokens, last_role, last_text
1020
+ )
1021
+
1022
+ # --- Option F: Estimate tokens and record in session monitor ---
1023
+ estimated_tokens = estimate_total_tokens(body)
1024
+ session_monitor.record_request(estimated_tokens)
1025
+ session_monitor.log_status()
1026
+
1027
+ # --- Option C: Prune conversation if approaching context limit ---
1028
+ ctx_window = session_monitor.context_window
1029
+ if ctx_window > 0:
1030
+ utilization = estimated_tokens / ctx_window
1031
+ if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
1032
+ logger.warning(
1033
+ "Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
1034
+ utilization * 100, PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
1035
+ )
1036
+ body = prune_conversation(body, ctx_window, target_fraction=0.65)
1037
+ session_monitor.prune_count += 1
1038
+ # Re-estimate after pruning
1039
+ estimated_tokens = estimate_total_tokens(body)
1040
+ session_monitor.record_request(estimated_tokens)
1041
+ n_messages = len(body.get("messages", []))
1042
+ logger.info(
1043
+ "After pruning: ~%d tokens, %d messages",
1044
+ estimated_tokens, n_messages,
1045
+ )
1046
+
461
1047
  openai_body = build_openai_request(body)
462
1048
 
463
1049
  client = http_client
@@ -470,15 +1056,127 @@ async def messages(request: Request):
470
1056
 
471
1057
  if is_stream:
472
1058
  openai_body["stream"] = True
473
- resp = await client.send(
474
- client.build_request(
475
- "POST",
476
- f"{LLAMA_CPP_BASE}/chat/completions",
477
- json=openai_body,
478
- headers={"Content-Type": "application/json"},
479
- ),
480
- stream=True,
481
- )
1059
+
1060
+ # Retry upstream connection with backoff to handle
1061
+ # llama-server restarts gracefully instead of 500-ing to the client.
1062
+ MAX_UPSTREAM_RETRIES = 3
1063
+ RETRY_DELAY_SECS = 5.0
1064
+ last_exc: Exception | None = None
1065
+
1066
+ for attempt in range(MAX_UPSTREAM_RETRIES):
1067
+ try:
1068
+ resp = await client.send(
1069
+ client.build_request(
1070
+ "POST",
1071
+ f"{LLAMA_CPP_BASE}/chat/completions",
1072
+ json=openai_body,
1073
+ headers={"Content-Type": "application/json"},
1074
+ ),
1075
+ stream=True,
1076
+ )
1077
+ # Connection succeeded – break out of retry loop
1078
+ last_exc = None
1079
+ break
1080
+ except (httpx.ConnectError, httpx.RemoteProtocolError) as exc:
1081
+ last_exc = exc
1082
+ if attempt < MAX_UPSTREAM_RETRIES - 1:
1083
+ logger.warning(
1084
+ "Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
1085
+ attempt + 1, MAX_UPSTREAM_RETRIES,
1086
+ type(exc).__name__, RETRY_DELAY_SECS,
1087
+ )
1088
+ await asyncio.sleep(RETRY_DELAY_SECS)
1089
+ else:
1090
+ logger.error(
1091
+ "Upstream connect failed after %d attempts: %s: %s",
1092
+ MAX_UPSTREAM_RETRIES, type(exc).__name__, exc,
1093
+ )
1094
+
1095
+ if last_exc is not None:
1096
+ return Response(
1097
+ content=json.dumps({
1098
+ "type": "error",
1099
+ "error": {
1100
+ "type": "overloaded_error",
1101
+ "message": f"Upstream server unavailable after {MAX_UPSTREAM_RETRIES} retries: {last_exc}",
1102
+ },
1103
+ }),
1104
+ status_code=529,
1105
+ media_type="application/json",
1106
+ )
1107
+
1108
+ # --- Option B: Check HTTP status before streaming ---
1109
+ # llama-server returns 400 for context overflow, 500 for internal errors, etc.
1110
+ # Without this check, the proxy would try to stream-translate an error body,
1111
+ # producing an empty response that silently kills the agentic loop.
1112
+ if resp.status_code != 200:
1113
+ error_body = await resp.aread()
1114
+ await resp.aclose()
1115
+ error_text = error_body.decode("utf-8", errors="replace")[:1000]
1116
+ logger.error(
1117
+ "Upstream HTTP %d: %s", resp.status_code, error_text
1118
+ )
1119
+
1120
+ # Parse the error for a user-friendly message
1121
+ error_message = f"Upstream server error (HTTP {resp.status_code})"
1122
+ try:
1123
+ error_json = json.loads(error_body)
1124
+ if "error" in error_json:
1125
+ upstream_error = error_json["error"]
1126
+ if isinstance(upstream_error, dict):
1127
+ error_message = upstream_error.get("message", error_message)
1128
+ else:
1129
+ error_message = str(upstream_error)
1130
+ except (json.JSONDecodeError, KeyError):
1131
+ error_message = error_text[:500] if error_text else error_message
1132
+
1133
+ # Detect context overflow specifically
1134
+ is_context_overflow = (
1135
+ resp.status_code == 400
1136
+ and "exceeds" in error_message.lower()
1137
+ and "context" in error_message.lower()
1138
+ )
1139
+
1140
+ if is_context_overflow:
1141
+ session_monitor.overflow_count += 1
1142
+ logger.error(
1143
+ "CONTEXT OVERFLOW detected (count=%d). "
1144
+ "Estimated input: %d tokens, context window: %d tokens. "
1145
+ "Conversation needs pruning or context window increase.",
1146
+ session_monitor.overflow_count, estimated_tokens, ctx_window,
1147
+ )
1148
+ # Return Anthropic-format error that Claude Code can handle
1149
+ return Response(
1150
+ content=json.dumps({
1151
+ "type": "error",
1152
+ "error": {
1153
+ "type": "overloaded_error",
1154
+ "message": (
1155
+ f"Context window exceeded: request requires ~{estimated_tokens} tokens "
1156
+ f"but only {ctx_window} are available. "
1157
+ f"The conversation is too long. Please start a new session or "
1158
+ f"reduce conversation length."
1159
+ ),
1160
+ },
1161
+ }),
1162
+ status_code=529,
1163
+ media_type="application/json",
1164
+ )
1165
+
1166
+ # Generic upstream error -- return as Anthropic error format
1167
+ error_type = "overloaded_error" if resp.status_code >= 500 else "invalid_request_error"
1168
+ return Response(
1169
+ content=json.dumps({
1170
+ "type": "error",
1171
+ "error": {
1172
+ "type": error_type,
1173
+ "message": error_message,
1174
+ },
1175
+ }),
1176
+ status_code=529 if resp.status_code >= 500 else 400,
1177
+ media_type="application/json",
1178
+ )
1179
+
482
1180
  return StreamingResponse(
483
1181
  stream_anthropic_response(resp, model),
484
1182
  media_type="text/event-stream",
@@ -493,8 +1191,30 @@ async def messages(request: Request):
493
1191
  json=openai_body,
494
1192
  headers={"Content-Type": "application/json"},
495
1193
  )
1194
+
1195
+ # Option B: Handle non-streaming errors too
1196
+ if resp.status_code != 200:
1197
+ error_text = resp.text[:1000]
1198
+ logger.error("Upstream HTTP %d (non-stream): %s", resp.status_code, error_text)
1199
+ return Response(
1200
+ content=json.dumps({
1201
+ "type": "error",
1202
+ "error": {
1203
+ "type": "overloaded_error",
1204
+ "message": f"Upstream error (HTTP {resp.status_code}): {error_text[:500]}",
1205
+ },
1206
+ }),
1207
+ status_code=529,
1208
+ media_type="application/json",
1209
+ )
1210
+
496
1211
  openai_resp = resp.json()
497
1212
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
1213
+
1214
+ # Track output tokens in session monitor
1215
+ output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
1216
+ session_monitor.record_response(output_tokens)
1217
+
498
1218
  return anthropic_resp
499
1219
 
500
1220
 
@@ -537,6 +1257,33 @@ async def health():
537
1257
  }
538
1258
 
539
1259
 
1260
+ @app.get("/v1/context")
1261
+ async def context_status():
1262
+ """Option F: Context window monitoring endpoint.
1263
+
1264
+ Returns current session token usage, utilization, warnings, and
1265
+ estimated remaining turns. Useful for dashboards and debugging.
1266
+ """
1267
+ warning = session_monitor.get_warning_level()
1268
+ turns = session_monitor.estimate_turns_remaining()
1269
+
1270
+ return {
1271
+ "context_window": session_monitor.context_window,
1272
+ "last_input_tokens": session_monitor.last_input_tokens,
1273
+ "last_output_tokens": session_monitor.last_output_tokens,
1274
+ "peak_input_tokens": session_monitor.peak_input_tokens,
1275
+ "utilization": round(session_monitor.get_utilization(), 4),
1276
+ "utilization_pct": f"{session_monitor.get_utilization() * 100:.1f}%",
1277
+ "warning_level": warning,
1278
+ "estimated_turns_remaining": turns,
1279
+ "total_requests": session_monitor.total_requests,
1280
+ "prune_count": session_monitor.prune_count,
1281
+ "overflow_count": session_monitor.overflow_count,
1282
+ "prune_threshold": PROXY_CONTEXT_PRUNE_THRESHOLD,
1283
+ "recent_history": session_monitor.context_history[-10:],
1284
+ }
1285
+
1286
+
540
1287
  # ===========================================================================
541
1288
  # Entry Point
542
1289
  # ===========================================================================