@miller-tech/uap 1.20.14 → 1.20.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.14",
3
+ "version": "1.20.16",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -107,7 +107,9 @@ ANTHROPIC_PASSTHROUGH_MODELS = os.environ.get("ANTHROPIC_PASSTHROUGH_MODELS", ""
107
107
  PROXY_PORT = int(os.environ.get("PROXY_PORT", "4000"))
108
108
  PROXY_HOST = os.environ.get("PROXY_HOST", "0.0.0.0")
109
109
  PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
110
- PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
110
+ PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "180"))
111
+ PROXY_GENERATION_TIMEOUT = float(os.environ.get("PROXY_GENERATION_TIMEOUT", "300"))
112
+ PROXY_SLOT_HANG_TIMEOUT = float(os.environ.get("PROXY_SLOT_HANG_TIMEOUT", "120"))
111
113
  PROXY_UPSTREAM_RETRY_MAX = int(os.environ.get("PROXY_UPSTREAM_RETRY_MAX", "3"))
112
114
  PROXY_UPSTREAM_RETRY_DELAY_SECS = float(os.environ.get("PROXY_UPSTREAM_RETRY_DELAY_SECS", "5"))
113
115
  PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
@@ -1311,6 +1313,72 @@ async def _post_with_retry(
1311
1313
  raise last_exc if last_exc else RuntimeError("upstream retry failed")
1312
1314
 
1313
1315
 
1316
async def _post_with_generation_timeout(
    client: httpx.AsyncClient,
    url: str,
    payload: dict,
    headers: dict,
) -> httpx.Response:
    """POST through ``_post_with_retry`` under a hard wall-clock deadline.

    The deadline (``PROXY_GENERATION_TIMEOUT``, in seconds) is enforced with
    ``asyncio.wait_for`` around the entire ``_post_with_retry`` call. It
    exists because the httpx read timeout may not fire when the upstream
    keeps the socket open without producing data (observed with the
    llama.cpp server hanging after prompt processing).

    A deadline of zero or less disables the wrapper and the call is
    forwarded unchanged.

    Raises:
        httpx.ReadTimeout: if the deadline elapses before a response arrives.
    """
    deadline = PROXY_GENERATION_TIMEOUT
    pending = _post_with_retry(client, url, payload, headers)

    # Non-positive deadline: behave exactly like a plain _post_with_retry call.
    if deadline <= 0:
        return await pending

    try:
        response = await asyncio.wait_for(pending, timeout=deadline)
    except asyncio.TimeoutError:
        logger.error(
            "GENERATION TIMEOUT: request to %s exceeded %ds hard deadline",
            url,
            int(deadline),
        )
        # Surface the deadline as a ReadTimeout so callers see an httpx error.
        raise httpx.ReadTimeout(
            f"Generation timeout after {int(deadline)}s (PROXY_GENERATION_TIMEOUT)"
        )
    return response
1346
+
1347
+
1348
async def _check_slot_hang(slot_url: str) -> bool:
    """Report whether any upstream slot looks hung (processing, zero tokens decoded).

    This is a best-effort diagnostic only: it logs a warning for the first
    suspicious slot and returns. It does not restart, cancel, or otherwise
    touch the upstream server; cancellation is left to the generation
    timeout.

    ``PROXY_SLOT_HANG_TIMEOUT`` is used purely as an on/off switch here
    (a value <= 0 disables the check); the slot payload carries no start
    time, so the value is not applied as a duration.

    Args:
        slot_url: URL of the upstream slots endpoint, expected to return a
            JSON list of slot objects.

    Returns:
        True if some slot reports ``is_processing`` truthy with
        ``n_decoded == 0``. False otherwise, including when the check is
        disabled, the endpoint does not answer 200, the payload has an
        unexpected shape, or the request fails for any reason.
    """
    if PROXY_SLOT_HANG_TIMEOUT <= 0:
        return False
    try:
        async with httpx.AsyncClient() as check_client:
            resp = await check_client.get(slot_url, timeout=5.0)
            if resp.status_code != 200:
                return False
            slots = resp.json()
            if not isinstance(slots, list):
                # e.g. an error object instead of the slot array.
                logger.debug(
                    "Slot hang check: unexpected payload type %s",
                    type(slots).__name__,
                )
                return False
            for slot in slots:
                if not isinstance(slot, dict):
                    continue
                if not slot.get("is_processing", False):
                    continue
                n_decoded = slot.get("n_decoded")
                if n_decoded is None:
                    # NOTE(review): newer llama.cpp builds appear to nest
                    # n_decoded under "next_token" — confirm against the
                    # deployed server version.
                    next_token = slot.get("next_token")
                    if isinstance(next_token, dict):
                        n_decoded = next_token.get("n_decoded")
                if n_decoded == 0:
                    # Processing but nothing decoded yet. Only warn here;
                    # the generation timeout handles actual cancellation.
                    logger.warning(
                        "SLOT HANG DETECTED: slot %d is_processing=True n_decoded=0 task=%s",
                        slot.get("id", -1),
                        slot.get("id_task", "?"),
                    )
                    return True
    except Exception as exc:
        # Deliberately broad: a failed diagnostic must never mask the
        # original upstream error the caller is already handling.
        logger.debug("Slot hang check failed: %s", exc)
    return False
1380
+
1381
+
1314
1382
  @asynccontextmanager
1315
1383
  async def lifespan(app: FastAPI):
1316
1384
  """Manage the httpx client lifecycle with the FastAPI app."""
@@ -1383,6 +1451,12 @@ async def lifespan(app: FastAPI):
1383
1451
  TOOL_CALL_GRAMMAR_TOOLS_COMPATIBLE,
1384
1452
  PROXY_TOOL_CALL_GRAMMAR_PATH,
1385
1453
  )
1454
+ logger.info(
1455
+ "Timeouts: read=%ds generation=%ds slot_hang=%ds",
1456
+ int(PROXY_READ_TIMEOUT),
1457
+ int(PROXY_GENERATION_TIMEOUT),
1458
+ int(PROXY_SLOT_HANG_TIMEOUT),
1459
+ )
1386
1460
 
1387
1461
  yield
1388
1462
  await http_client.aclose()
@@ -2167,16 +2241,24 @@ def build_openai_request(
2167
2241
  # Enforce configurable minimum floor for thinking mode: model needs
2168
2242
  # tokens for reasoning (<think>...</think>) plus actual response/tool
2169
2243
  # calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
2170
- floor_bypassed_for_tool_turn = (
2171
- has_tools
2172
- and PROXY_DISABLE_THINKING_ON_TOOL_TURNS
2173
- and PROXY_MAX_TOKENS_FLOOR > 0
2244
+ #
2245
+ # The floor is ONLY applied when thinking is actually enabled —
2246
+ # skip it for non-tool requests (tools=0) and for tool turns
2247
+ # with thinking disabled, to prevent inflating short preflight
2248
+ # requests (e.g. max_tokens=100 for plan generation).
2249
+ thinking_active_for_request = has_tools and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
2250
+ skip_floor = (
2251
+ not has_tools # non-tool requests don't need thinking headroom
2252
+ or PROXY_DISABLE_THINKING_ON_TOOL_TURNS # thinking disabled on tool turns
2253
+ or PROXY_MAX_TOKENS_FLOOR <= 0 # floor explicitly disabled
2174
2254
  )
2175
- if floor_bypassed_for_tool_turn:
2255
+ if skip_floor:
2176
2256
  requested_max = requested_raw
2177
- if requested_raw < PROXY_MAX_TOKENS_FLOOR:
2257
+ if requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
2178
2258
  logger.info(
2179
- "MAX_TOKENS floor bypassed for tool turn with thinking disabled: requested=%d floor=%d",
2259
+ "MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
2260
+ has_tools,
2261
+ thinking_active_for_request,
2180
2262
  requested_raw,
2181
2263
  PROXY_MAX_TOKENS_FLOOR,
2182
2264
  )
@@ -4435,6 +4517,48 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
4435
4517
  return openai_resp
4436
4518
 
4437
4519
 
4520
def _detect_and_truncate_degenerate_repetition(openai_resp: dict) -> dict:
    """Truncate a response whose text has collapsed into a repeating loop.

    Heuristic: a probe substring is taken starting at the midpoint of the
    message text, first 60 characters long, then 40, then 20. If the exact
    probe occurs at least 8 times (non-overlapping, per ``str.count``), the
    text is cut just before the probe's second occurrence, right-stripped,
    written back as the first choice's message content, and that choice's
    ``finish_reason`` is set to ``"stop"``.

    Texts shorter than 200 characters are never modified.

    NOTE(review): ``finish_reason`` is overwritten unconditionally, also for
    responses that carry tool calls alongside text — confirm downstream
    conversion does not depend on the original value in that case.

    Args:
        openai_resp: OpenAI Chat Completions style response dict.

    Returns:
        The same dict object, modified in place when repetition was found.
    """
    min_text_len = 200
    min_repeats = 8
    probe_lengths = (60, 40, 20)  # longest first: prefer the most specific match

    text = _openai_message_text(openai_resp)
    if not text or len(text) < min_text_len:
        return openai_resp

    # Degenerate loops dominate the tail of the output, so a probe taken
    # from the middle is likely to sit inside the repeating pattern.
    mid = len(text) // 2

    for substr_len in probe_lengths:
        sample = text[mid : mid + substr_len]
        if not sample.strip():
            continue
        count = text.count(sample)
        if count < min_repeats:
            continue

        # Keep everything up to (not including) the second occurrence.
        first_pos = text.find(sample)
        second_pos = text.find(sample, first_pos + len(sample))
        if second_pos <= first_pos:
            continue

        truncated = text[:second_pos].rstrip()
        logger.warning(
            "DEGENERATE REPETITION: detected %d repeats of %d-char substring, truncating %d -> %d chars",
            count,
            substr_len,
            len(text),
            len(truncated),
        )
        choices = openai_resp.get("choices") or []
        if choices:
            msg = choices[0].get("message")
            # Only touch the response when there is a real message dict to
            # update; never write into a throwaway default.
            if isinstance(msg, dict):
                msg["content"] = truncated
                choices[0]["finish_reason"] = "stop"
        return openai_resp

    return openai_resp
4560
+
4561
+
4438
4562
  def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
4439
4563
  """Convert an OpenAI Chat Completions response to Anthropic Messages format."""
4440
4564
  # First: try to recover tool calls trapped in text XML tags
@@ -5166,13 +5290,15 @@ async def messages(request: Request):
5166
5290
  strict_body["stream"] = False
5167
5291
 
5168
5292
  try:
5169
- strict_resp = await _post_with_retry(
5293
+ strict_resp = await _post_with_generation_timeout(
5170
5294
  client,
5171
5295
  f"{LLAMA_CPP_BASE}/chat/completions",
5172
5296
  strict_body,
5173
5297
  {"Content-Type": "application/json"},
5174
5298
  )
5175
5299
  except Exception as exc:
5300
+ # Check if upstream is hung before returning error
5301
+ await _check_slot_hang(f"{LLAMA_CPP_BASE}/slots")
5176
5302
  return Response(
5177
5303
  content=json.dumps(
5178
5304
  {
@@ -5196,7 +5322,7 @@ async def messages(request: Request):
5196
5322
  "strict-stream",
5197
5323
  ):
5198
5324
  try:
5199
- strict_resp = await _post_with_retry(
5325
+ strict_resp = await _post_with_generation_timeout(
5200
5326
  client,
5201
5327
  f"{LLAMA_CPP_BASE}/chat/completions",
5202
5328
  strict_body,
@@ -5258,6 +5384,7 @@ async def messages(request: Request):
5258
5384
  session_id,
5259
5385
  )
5260
5386
 
5387
+ openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
5261
5388
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
5262
5389
  monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
5263
5390
  # Update last_input_tokens from upstream's actual prompt_tokens
@@ -5478,7 +5605,7 @@ async def messages(request: Request):
5478
5605
  )
5479
5606
  else:
5480
5607
  try:
5481
- resp = await _post_with_retry(
5608
+ resp = await _post_with_generation_timeout(
5482
5609
  client,
5483
5610
  f"{LLAMA_CPP_BASE}/chat/completions",
5484
5611
  openai_body,
@@ -5508,7 +5635,7 @@ async def messages(request: Request):
5508
5635
  "non-stream",
5509
5636
  ):
5510
5637
  try:
5511
- resp = await _post_with_retry(
5638
+ resp = await _post_with_generation_timeout(
5512
5639
  client,
5513
5640
  f"{LLAMA_CPP_BASE}/chat/completions",
5514
5641
  openai_body,
@@ -5596,6 +5723,7 @@ async def messages(request: Request):
5596
5723
  monitor.invalid_tool_call_streak = 0
5597
5724
  monitor.required_tool_miss_streak = 0
5598
5725
 
5726
+ openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
5599
5727
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
5600
5728
 
5601
5729
  # Track output tokens in session monitor
@@ -6,6 +6,8 @@ import json
6
6
  import unittest
7
7
  from pathlib import Path
8
8
 
9
+ import httpx
10
+
9
11
 
10
12
  def _load_proxy_module():
11
13
  proxy_path = Path(__file__).resolve().parents[1] / "scripts" / "anthropic_proxy.py"
@@ -116,7 +118,8 @@ class TestProxyConfigTuning(unittest.TestCase):
116
118
  setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
117
119
  setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
118
120
 
119
- def test_build_request_keeps_floor_for_non_tool_turns(self):
121
+ def test_build_request_skips_floor_for_non_tool_turns(self):
122
+ """Non-tool requests should NOT have the max_tokens floor applied."""
120
123
  old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
121
124
  old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
122
125
  try:
@@ -132,7 +135,8 @@ class TestProxyConfigTuning(unittest.TestCase):
132
135
  openai = proxy.build_openai_request(
133
136
  body, proxy.SessionMonitor(context_window=0)
134
137
  )
135
- self.assertEqual(openai.get("max_tokens"), 4096)
138
+ # Floor should NOT inflate max_tokens for non-tool requests
139
+ self.assertEqual(openai.get("max_tokens"), 512)
136
140
  finally:
137
141
  setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
138
142
  setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
@@ -3437,6 +3441,143 @@ class TestMalformedRetryHardening(unittest.TestCase):
3437
3441
  self.assertNotIn("<tool_call>", m.get("content", ""))
3438
3442
 
3439
3443
 
3444
class TestDegenerateRepetitionDetection(unittest.TestCase):
    """Covers degenerate-repetition truncation and the max_tokens floor rules."""

    def _override_proxy_setting(self, name, value):
        """Set ``proxy.<name>`` for this test; the old value is restored on cleanup."""
        self.addCleanup(setattr, proxy, name, getattr(proxy, name))
        setattr(proxy, name, value)

    @staticmethod
    def _single_choice_response(content, finish_reason):
        """Build a minimal one-choice OpenAI-style response dict."""
        return {
            "choices": [{"message": {"content": content}, "finish_reason": finish_reason}]
        }

    def test_detects_and_truncates_repetitive_text(self):
        """A heavily looping message is shortened and marked as stopped."""
        repeated = "Mermaid Diagrams](docs/mermaid-diagrams" * 50
        result = proxy._detect_and_truncate_degenerate_repetition(
            self._single_choice_response(repeated, "length")
        )
        first_choice = result["choices"][0]
        self.assertLess(len(first_choice["message"]["content"]), len(repeated))
        self.assertEqual(first_choice["finish_reason"], "stop")

    def test_preserves_non_repetitive_text(self):
        """Ordinary text passes through unchanged."""
        text = "This is a perfectly normal response with varied content. " * 5
        result = proxy._detect_and_truncate_degenerate_repetition(
            self._single_choice_response(text, "stop")
        )
        self.assertEqual(result["choices"][0]["message"]["content"], text)

    def test_preserves_short_text(self):
        """Text under 200 characters is never inspected."""
        text = "Short response."
        result = proxy._detect_and_truncate_degenerate_repetition(
            self._single_choice_response(text, "stop")
        )
        self.assertEqual(result["choices"][0]["message"]["content"], text)

    def test_max_tokens_floor_skipped_for_non_tool_requests(self):
        """Requests without tools keep their requested max_tokens."""
        self._override_proxy_setting("PROXY_MAX_TOKENS_FLOOR", 16384)
        self._override_proxy_setting("PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)

        body = {
            "model": "test",
            "max_tokens": 100,
            "messages": [{"role": "user", "content": "generate a title"}],
        }
        openai = proxy.build_openai_request(
            body, proxy.SessionMonitor(context_window=0)
        )
        # No tools = no floor inflation
        self.assertEqual(openai.get("max_tokens"), 100)

    def test_max_tokens_floor_applied_when_thinking_active(self):
        """With tools present and thinking enabled, the floor raises max_tokens."""
        self._override_proxy_setting("PROXY_MAX_TOKENS_FLOOR", 4096)
        self._override_proxy_setting("PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)

        body = {
            "model": "test",
            "max_tokens": 512,
            "messages": [{"role": "user", "content": "run command"}],
            "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
        }
        openai = proxy.build_openai_request(
            body, proxy.SessionMonitor(context_window=0)
        )
        # Tools + thinking enabled = floor applied
        self.assertEqual(openai.get("max_tokens"), 4096)
3520
+
3521
+
3522
class TestGenerationHangRecovery(unittest.TestCase):
    """Tests for generation hang recovery: timeouts, slot hang detection."""

    # NOTE: the three default-value tests compare the values the proxy module
    # read from the environment at import time, so they fail if the matching
    # PROXY_* environment variable is overridden in the test environment.

    def test_read_timeout_default_is_180(self):
        """Option 3: default read timeout reduced from 600 to 180."""
        self.assertEqual(proxy.PROXY_READ_TIMEOUT, 180)

    def test_generation_timeout_default_is_300(self):
        """Option 1: generation timeout is 300s."""
        self.assertEqual(proxy.PROXY_GENERATION_TIMEOUT, 300)

    def test_slot_hang_timeout_default_is_120(self):
        """Option 2: slot hang timeout is 120s."""
        self.assertEqual(proxy.PROXY_SLOT_HANG_TIMEOUT, 120)

    def test_generation_timeout_wraps_post_with_retry(self):
        """_post_with_generation_timeout raises ReadTimeout on asyncio timeout."""
        import asyncio
        from unittest import mock

        class HangingClient:
            """Stand-in client whose POST never completes."""

            async def post(self, *args, **kwargs):
                await asyncio.sleep(999)

        async def _run():
            with self.assertRaises(httpx.ReadTimeout):
                await proxy._post_with_generation_timeout(
                    HangingClient(),
                    "http://localhost:9999/fake",
                    {},
                    {},
                )

        # 100ms deadline and a single attempt keep the test fast.
        with mock.patch.object(proxy, "PROXY_GENERATION_TIMEOUT", 0.1), \
                mock.patch.object(proxy, "PROXY_UPSTREAM_RETRY_MAX", 1):
            asyncio.run(_run())

    def _run_slot_check_with_payload(self, payload):
        """Run _check_slot_hang against a faked slots endpoint returning *payload*."""
        import asyncio
        from unittest import mock

        class FakeSlotsResponse:
            status_code = 200

            def json(self):
                return payload

        class FakeAsyncClient:
            """Minimal async-context-manager replacement for httpx.AsyncClient."""

            async def __aenter__(self):
                return self

            async def __aexit__(self, *exc_info):
                return False

            async def get(self, url, **kwargs):
                return FakeSlotsResponse()

        # Force the check on regardless of the environment, and keep the
        # test off the network by swapping the client class.
        with mock.patch.object(proxy, "PROXY_SLOT_HANG_TIMEOUT", 120.0), \
                mock.patch.object(httpx, "AsyncClient", FakeAsyncClient):
            return asyncio.run(proxy._check_slot_hang("http://upstream.invalid/slots"))

    def test_check_slot_hang_detects_stuck_slot(self):
        """_check_slot_hang returns True when a slot is processing with n_decoded=0."""
        slots = [
            {"id": 0, "id_task": 1, "is_processing": False, "n_decoded": 0},
            {"id": 1, "id_task": 7, "is_processing": True, "n_decoded": 0},
        ]
        self.assertTrue(self._run_slot_check_with_payload(slots))

    def test_check_slot_hang_ignores_healthy_slots(self):
        """Slots that are idle or already decoding are not reported as hung."""
        slots = [
            {"id": 0, "id_task": 1, "is_processing": False, "n_decoded": 0},
            {"id": 1, "id_task": 7, "is_processing": True, "n_decoded": 5},
        ]
        self.assertFalse(self._run_slot_check_with_payload(slots))

    def test_check_slot_hang_returns_false_when_unreachable(self):
        """_check_slot_hang swallows connection errors and returns False."""
        import asyncio

        result = asyncio.run(
            proxy._check_slot_hang("http://localhost:9999/nonexistent")
        )
        self.assertFalse(result)
3579
+
3580
+
3440
3581
  if __name__ == "__main__":
3441
3582
  unittest.main()
3442
3583