@miller-tech/uap 1.20.29 → 1.20.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1348,6 +1348,43 @@ def prune_conversation(
|
|
|
1348
1348
|
http_client: httpx.AsyncClient | None = None
|
|
1349
1349
|
|
|
1350
1350
|
|
|
1351
|
+
def _is_loading_model_503(resp: httpx.Response) -> bool:
    """Check if response is a 503 'Loading model' from llama.cpp.

    llama.cpp answers requests with HTTP 503 and a body containing
    "Loading model" while the model weights are still being loaded;
    this predicate identifies exactly that case so callers can treat
    it as transient rather than a hard failure.
    """
    # Any non-503 status is definitively not the "loading" condition.
    if resp.status_code != 503:
        return False
    # Reading .text can fail (e.g. on a streamed/unread response body);
    # treat an unreadable body as "not the loading-model case".
    try:
        body = resp.text
    except Exception:
        return False
    return "loading model" in body.lower()
|
|
1359
|
+
|
|
1360
|
+
|
|
1361
|
+
async def _wait_for_upstream_health(
    client: httpx.AsyncClient,
    max_wait: float = 60.0,
    poll_interval: float = 5.0,
) -> bool:
    """Poll upstream /health until ready or timeout. Returns True if healthy.

    Derives the health endpoint from ``LLAMA_CPP_BASE`` by swapping the
    ``/v1`` suffix for ``/health`` and polls it every ``poll_interval``
    seconds for up to ``max_wait`` seconds of accumulated sleep time.

    Args:
        client: Shared async HTTP client used for the health probes.
        max_wait: Maximum total sleep time (seconds) before giving up.
        poll_interval: Delay (seconds) between consecutive probes.

    Returns:
        True as soon as the upstream answers HTTP 200; False on timeout.
    """
    health_url = LLAMA_CPP_BASE.replace("/v1", "/health")
    elapsed = 0.0
    while elapsed < max_wait:
        try:
            resp = await client.get(health_url, timeout=5.0)
            # Fix: the previous check parsed the JSON body and tested
            # `data.get("status") == "ok" or resp.status_code == 200`, which
            # is a tautology inside this branch — and a 200 response with a
            # malformed JSON body would raise in `resp.json()` and be
            # misclassified as "not healthy". HTTP 200 from /health is the
            # readiness signal; accept it unconditionally.
            if resp.status_code == 200:
                if elapsed > 0:
                    logger.info(
                        "UPSTREAM HEALTH: recovered after %.0fs wait", elapsed
                    )
                return True
        except Exception:
            # Connection errors / timeouts while the server restarts are
            # expected; keep polling until the budget runs out.
            pass
        await asyncio.sleep(poll_interval)
        elapsed += poll_interval
    # NOTE(review): `elapsed` only counts sleep time, so wall-clock wait can
    # exceed max_wait by up to one probe's 5s timeout per iteration.
    logger.error("UPSTREAM HEALTH: not ready after %.0fs", max_wait)
    return False
|
|
1386
|
+
|
|
1387
|
+
|
|
1351
1388
|
async def _post_with_retry(
|
|
1352
1389
|
client: httpx.AsyncClient,
|
|
1353
1390
|
url: str,
|
|
@@ -1357,7 +1394,19 @@ async def _post_with_retry(
|
|
|
1357
1394
|
last_exc: Exception | None = None
|
|
1358
1395
|
for attempt in range(PROXY_UPSTREAM_RETRY_MAX):
|
|
1359
1396
|
try:
|
|
1360
|
-
|
|
1397
|
+
resp = await client.post(url, json=payload, headers=headers)
|
|
1398
|
+
# Cycle 19 Option 1: if 503 "Loading model", wait for health then retry
|
|
1399
|
+
if _is_loading_model_503(resp):
|
|
1400
|
+
logger.warning(
|
|
1401
|
+
"Upstream 503 Loading model (attempt %d/%d) – waiting for health",
|
|
1402
|
+
attempt + 1,
|
|
1403
|
+
PROXY_UPSTREAM_RETRY_MAX,
|
|
1404
|
+
)
|
|
1405
|
+
healthy = await _wait_for_upstream_health(client, max_wait=60.0)
|
|
1406
|
+
if healthy and attempt < PROXY_UPSTREAM_RETRY_MAX - 1:
|
|
1407
|
+
continue # retry the request now that upstream is healthy
|
|
1408
|
+
return resp # return the 503 if health wait timed out
|
|
1409
|
+
return resp
|
|
1361
1410
|
except (httpx.ConnectError, httpx.RemoteProtocolError, httpx.ReadTimeout) as exc:
|
|
1362
1411
|
last_exc = exc
|
|
1363
1412
|
if attempt < PROXY_UPSTREAM_RETRY_MAX - 1:
|
|
@@ -5991,6 +6040,27 @@ async def messages(request: Request):
|
|
|
5991
6040
|
|
|
5992
6041
|
if strict_resp.status_code != 200:
|
|
5993
6042
|
error_text = strict_resp.text[:1000]
|
|
6043
|
+
# Cycle 19 Option 2: For 503 "Loading model", don't advance state
|
|
6044
|
+
# machine — return retriable 503 with Retry-After header so the
|
|
6045
|
+
# client can retry without wasting state machine budget.
|
|
6046
|
+
if _is_loading_model_503(strict_resp):
|
|
6047
|
+
logger.warning(
|
|
6048
|
+
"Upstream 503 Loading model (strict-stream) — returning retriable 503 without advancing state",
|
|
6049
|
+
)
|
|
6050
|
+
return Response(
|
|
6051
|
+
content=json.dumps(
|
|
6052
|
+
{
|
|
6053
|
+
"type": "error",
|
|
6054
|
+
"error": {
|
|
6055
|
+
"type": "overloaded_error",
|
|
6056
|
+
"message": "Upstream model is loading. Retry in 10 seconds.",
|
|
6057
|
+
},
|
|
6058
|
+
}
|
|
6059
|
+
),
|
|
6060
|
+
status_code=503,
|
|
6061
|
+
headers={"Retry-After": "10"},
|
|
6062
|
+
media_type="application/json",
|
|
6063
|
+
)
|
|
5994
6064
|
logger.error(
|
|
5995
6065
|
"Upstream HTTP %d (strict-stream): %s",
|
|
5996
6066
|
strict_resp.status_code,
|
|
@@ -4796,3 +4796,25 @@ class TestCycle18SessionBanAndLogNoise(unittest.TestCase):
|
|
|
4796
4796
|
finally:
|
|
4797
4797
|
for k, v in old_vals.items():
|
|
4798
4798
|
setattr(proxy, k, v)
|
|
4799
|
+
|
|
4800
|
+
|
|
4801
|
+
class TestUpstream503Resilience(unittest.TestCase):
    """Tests for Cycle 19: upstream 503 Loading model resilience."""

    def test_is_loading_model_503_detects_loading(self):
        """Detects 503 Loading model response."""
        loading_body = (
            '{"error":{"message":"Loading model",'
            '"type":"unavailable_error","code":503}}'
        )
        response = httpx.Response(503, text=loading_body)
        self.assertTrue(proxy._is_loading_model_503(response))

    def test_is_loading_model_503_ignores_other_503(self):
        """Does not match 503 with different message."""
        response = httpx.Response(
            503, text='{"error":{"message":"Server busy"}}'
        )
        self.assertFalse(proxy._is_loading_model_503(response))

    def test_is_loading_model_503_ignores_200(self):
        """Does not match 200 even with loading text."""
        response = httpx.Response(200, text='{"status":"loading model"}')
        self.assertFalse(proxy._is_loading_model_503(response))
|