@miller-tech/uap 1.20.15 → 1.20.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -107,7 +107,9 @@ ANTHROPIC_PASSTHROUGH_MODELS = os.environ.get("ANTHROPIC_PASSTHROUGH_MODELS", ""
|
|
|
107
107
|
PROXY_PORT = int(os.environ.get("PROXY_PORT", "4000"))
|
|
108
108
|
PROXY_HOST = os.environ.get("PROXY_HOST", "0.0.0.0")
|
|
109
109
|
PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
|
|
110
|
-
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "
|
|
110
|
+
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "180"))
|
|
111
|
+
PROXY_GENERATION_TIMEOUT = float(os.environ.get("PROXY_GENERATION_TIMEOUT", "300"))
|
|
112
|
+
PROXY_SLOT_HANG_TIMEOUT = float(os.environ.get("PROXY_SLOT_HANG_TIMEOUT", "120"))
|
|
111
113
|
PROXY_UPSTREAM_RETRY_MAX = int(os.environ.get("PROXY_UPSTREAM_RETRY_MAX", "3"))
|
|
112
114
|
PROXY_UPSTREAM_RETRY_DELAY_SECS = float(os.environ.get("PROXY_UPSTREAM_RETRY_DELAY_SECS", "5"))
|
|
113
115
|
PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
|
|
@@ -1311,6 +1313,72 @@ async def _post_with_retry(
|
|
|
1311
1313
|
raise last_exc if last_exc else RuntimeError("upstream retry failed")
|
|
1312
1314
|
|
|
1313
1315
|
|
|
1316
|
+
async def _post_with_generation_timeout(
    client: httpx.AsyncClient,
    url: str,
    payload: dict,
    headers: dict,
) -> httpx.Response:
    """Wrap _post_with_retry with an explicit asyncio generation timeout.

    The httpx read timeout may not fire for hung connections where the server
    keeps the socket open but produces no data (observed with llama.cpp server
    hanging after prompt processing). This wrapper uses asyncio.wait_for to
    enforce a hard deadline on the whole ``_post_with_retry`` call.

    Args:
        client: HTTP client passed straight through to ``_post_with_retry``.
        url: Upstream endpoint to POST to.
        payload: Request body, passed through unchanged.
        headers: Request headers, passed through unchanged.

    Returns:
        The response returned by ``_post_with_retry``.

    Raises:
        httpx.ReadTimeout: If ``PROXY_GENERATION_TIMEOUT`` seconds elapse
            before ``_post_with_retry`` completes. The asyncio timeout is
            attached as ``__cause__``.
    """
    # Read the module global at call time so runtime overrides take effect.
    timeout = PROXY_GENERATION_TIMEOUT
    if timeout <= 0:
        # A non-positive value disables the hard deadline entirely.
        return await _post_with_retry(client, url, payload, headers)
    try:
        return await asyncio.wait_for(
            _post_with_retry(client, url, payload, headers),
            timeout=timeout,
        )
    except asyncio.TimeoutError as exc:
        # %g keeps "300" for 300.0 but no longer truncates 0.5 down to "0".
        logger.error(
            "GENERATION TIMEOUT: request to %s exceeded %gs hard deadline",
            url,
            timeout,
        )
        # Surface as an httpx timeout so callers handle it like a read timeout.
        raise httpx.ReadTimeout(
            f"Generation timeout after {timeout:g}s (PROXY_GENERATION_TIMEOUT)"
        ) from exc
|
|
1346
|
+
|
|
1347
|
+
|
|
1348
|
+
async def _check_slot_hang(slot_url: str) -> bool:
    """Check if any upstream slot looks hung (processing but n_decoded=0).

    This is detection only: a warning is logged for the first matching slot.
    Nothing is restarted or cancelled here; the generation timeout is what
    actually aborts a stuck request.

    ``PROXY_SLOT_HANG_TIMEOUT`` is used purely as an on/off switch (a value
    <= 0 disables the check). Its magnitude is not applied as a duration
    because no start time is read from the slot payload.

    Args:
        slot_url: URL of the upstream slots endpoint to query.

    Returns:
        True if a slot reports ``is_processing`` with ``n_decoded == 0``.
        False if the check is disabled, the endpoint does not answer 200,
        the payload is not a list of objects, no slot matches, or the
        request fails for any reason.
    """
    if PROXY_SLOT_HANG_TIMEOUT <= 0:
        return False
    try:
        async with httpx.AsyncClient() as check_client:
            resp = await check_client.get(slot_url, timeout=5.0)
            if resp.status_code != 200:
                return False
            slots = resp.json()
            if not isinstance(slots, list):
                # e.g. an error object instead of the expected slot array.
                logger.debug(
                    "Slot hang check: unexpected payload type %s",
                    type(slots).__name__,
                )
                return False
            for slot in slots:
                if not isinstance(slot, dict):
                    continue
                if (
                    slot.get("is_processing", False)
                    and slot.get("n_decoded", -1) == 0
                ):
                    # Slot is processing but hasn't decoded any tokens.
                    # How long it has been in that state is not known here,
                    # so this only warns and reports the finding.
                    # %s (not %d) so an unexpected id type cannot break the
                    # log record.
                    logger.warning(
                        "SLOT HANG DETECTED: slot %s is_processing=True n_decoded=0 task=%s",
                        slot.get("id", -1),
                        slot.get("id_task", "?"),
                    )
                    return True
    except Exception as exc:
        # Best-effort diagnostic: never let it mask the caller's own error.
        logger.debug("Slot hang check failed: %s", exc)
    return False
|
|
1380
|
+
|
|
1381
|
+
|
|
1314
1382
|
@asynccontextmanager
|
|
1315
1383
|
async def lifespan(app: FastAPI):
|
|
1316
1384
|
"""Manage the httpx client lifecycle with the FastAPI app."""
|
|
@@ -1383,6 +1451,12 @@ async def lifespan(app: FastAPI):
|
|
|
1383
1451
|
TOOL_CALL_GRAMMAR_TOOLS_COMPATIBLE,
|
|
1384
1452
|
PROXY_TOOL_CALL_GRAMMAR_PATH,
|
|
1385
1453
|
)
|
|
1454
|
+
logger.info(
|
|
1455
|
+
"Timeouts: read=%ds generation=%ds slot_hang=%ds",
|
|
1456
|
+
int(PROXY_READ_TIMEOUT),
|
|
1457
|
+
int(PROXY_GENERATION_TIMEOUT),
|
|
1458
|
+
int(PROXY_SLOT_HANG_TIMEOUT),
|
|
1459
|
+
)
|
|
1386
1460
|
|
|
1387
1461
|
yield
|
|
1388
1462
|
await http_client.aclose()
|
|
@@ -5216,13 +5290,15 @@ async def messages(request: Request):
|
|
|
5216
5290
|
strict_body["stream"] = False
|
|
5217
5291
|
|
|
5218
5292
|
try:
|
|
5219
|
-
strict_resp = await
|
|
5293
|
+
strict_resp = await _post_with_generation_timeout(
|
|
5220
5294
|
client,
|
|
5221
5295
|
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
5222
5296
|
strict_body,
|
|
5223
5297
|
{"Content-Type": "application/json"},
|
|
5224
5298
|
)
|
|
5225
5299
|
except Exception as exc:
|
|
5300
|
+
# Check if upstream is hung before returning error
|
|
5301
|
+
await _check_slot_hang(f"{LLAMA_CPP_BASE}/slots")
|
|
5226
5302
|
return Response(
|
|
5227
5303
|
content=json.dumps(
|
|
5228
5304
|
{
|
|
@@ -5246,7 +5322,7 @@ async def messages(request: Request):
|
|
|
5246
5322
|
"strict-stream",
|
|
5247
5323
|
):
|
|
5248
5324
|
try:
|
|
5249
|
-
strict_resp = await
|
|
5325
|
+
strict_resp = await _post_with_generation_timeout(
|
|
5250
5326
|
client,
|
|
5251
5327
|
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
5252
5328
|
strict_body,
|
|
@@ -5529,7 +5605,7 @@ async def messages(request: Request):
|
|
|
5529
5605
|
)
|
|
5530
5606
|
else:
|
|
5531
5607
|
try:
|
|
5532
|
-
resp = await
|
|
5608
|
+
resp = await _post_with_generation_timeout(
|
|
5533
5609
|
client,
|
|
5534
5610
|
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
5535
5611
|
openai_body,
|
|
@@ -5559,7 +5635,7 @@ async def messages(request: Request):
|
|
|
5559
5635
|
"non-stream",
|
|
5560
5636
|
):
|
|
5561
5637
|
try:
|
|
5562
|
-
resp = await
|
|
5638
|
+
resp = await _post_with_generation_timeout(
|
|
5563
5639
|
client,
|
|
5564
5640
|
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
5565
5641
|
openai_body,
|
|
@@ -6,6 +6,8 @@ import json
|
|
|
6
6
|
import unittest
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
|
|
9
|
+
import httpx
|
|
10
|
+
|
|
9
11
|
|
|
10
12
|
def _load_proxy_module():
|
|
11
13
|
proxy_path = Path(__file__).resolve().parents[1] / "scripts" / "anthropic_proxy.py"
|
|
@@ -3517,6 +3519,65 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
|
|
|
3517
3519
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
3518
3520
|
|
|
3519
3521
|
|
|
3522
|
+
class TestGenerationHangRecovery(unittest.TestCase):
    """Tests for generation hang recovery: timeouts, slot hang detection.

    The three ``*_default_is_*`` tests compare module constants that the
    proxy reads from the environment at import time, so they assume the
    corresponding ``PROXY_*`` variables are unset when the module loads.
    """

    def test_read_timeout_default_is_180(self):
        """Option 3: default read timeout reduced from 600 to 180."""
        self.assertEqual(proxy.PROXY_READ_TIMEOUT, 180)

    def test_generation_timeout_default_is_300(self):
        """Option 1: generation timeout is 300s."""
        self.assertEqual(proxy.PROXY_GENERATION_TIMEOUT, 300)

    def test_slot_hang_timeout_default_is_120(self):
        """Option 2: slot hang timeout is 120s."""
        self.assertEqual(proxy.PROXY_SLOT_HANG_TIMEOUT, 120)

    def test_generation_timeout_wraps_post_with_retry(self):
        """_post_with_generation_timeout raises ReadTimeout on asyncio timeout."""
        import asyncio
        from unittest import mock

        class FakeClient:
            """Client whose post() never completes, simulating a hung upstream."""

            async def post(self, *args, **kwargs):
                await asyncio.sleep(999)

        async def _run():
            with self.assertRaises(httpx.ReadTimeout):
                await proxy._post_with_generation_timeout(
                    FakeClient(),
                    "http://localhost:9999/fake",
                    {},
                    {},
                )

        # patch.object restores both globals even if the assertion fails.
        with mock.patch.object(proxy, "PROXY_GENERATION_TIMEOUT", 0.1):  # 100ms
            with mock.patch.object(proxy, "PROXY_UPSTREAM_RETRY_MAX", 1):
                asyncio.run(_run())

    @staticmethod
    def _run_slot_check(status_code, payload):
        """Run proxy._check_slot_hang against a stubbed slots response.

        The stub replaces ``httpx.AsyncClient`` as seen by the proxy module,
        so no network access happens.
        """
        import asyncio
        from unittest import mock

        class _StubResponse:
            def __init__(self):
                self.status_code = status_code

            def json(self):
                return payload

        class _StubAsyncClient:
            def __init__(self, *args, **kwargs):
                pass

            async def __aenter__(self):
                return self

            async def __aexit__(self, *exc_info):
                return False

            async def get(self, *args, **kwargs):
                return _StubResponse()

        # Force the check on regardless of the environment the module saw.
        with mock.patch.object(proxy, "PROXY_SLOT_HANG_TIMEOUT", 120.0):
            with mock.patch.object(proxy.httpx, "AsyncClient", _StubAsyncClient):
                return asyncio.run(
                    proxy._check_slot_hang("http://upstream.invalid/slots")
                )

    def test_check_slot_hang_detects_stuck_slot(self):
        """_check_slot_hang returns True when a slot is processing with n_decoded=0."""
        stuck = [
            {"id": 0, "id_task": 7, "is_processing": True, "n_decoded": 0},
        ]
        self.assertTrue(self._run_slot_check(200, stuck))

    def test_check_slot_hang_ignores_healthy_slots(self):
        """Slots that are idle or already decoding are not reported as hung."""
        healthy = [
            {"id": 0, "id_task": 7, "is_processing": True, "n_decoded": 12},
            {"id": 1, "id_task": -1, "is_processing": False, "n_decoded": 0},
        ]
        self.assertFalse(self._run_slot_check(200, healthy))

    def test_check_slot_hang_non_200_returns_false(self):
        """A non-200 answer from the slots endpoint is treated as 'no hang'."""
        stuck = [
            {"id": 0, "id_task": 7, "is_processing": True, "n_decoded": 0},
        ]
        self.assertFalse(self._run_slot_check(503, stuck))

    def test_check_slot_hang_unreachable_server_returns_false(self):
        """_check_slot_hang swallows connection errors and returns False."""
        import asyncio

        async def _run():
            # Nothing is expected to listen on this port; the function must
            # not raise when the server is unreachable.
            result = await proxy._check_slot_hang("http://localhost:9999/nonexistent")
            self.assertFalse(result)

        asyncio.run(_run())
|
|
3579
|
+
|
|
3580
|
+
|
|
3520
3581
|
if __name__ == "__main__":
|
|
3521
3582
|
unittest.main()
|
|
3522
3583
|
|