@miller-tech/uap 1.20.29 → 1.20.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.29",
3
+ "version": "1.20.32",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -1348,6 +1348,43 @@ def prune_conversation(
1348
1348
  http_client: httpx.AsyncClient | None = None
1349
1349
 
1350
1350
 
1351
def _is_loading_model_503(resp: httpx.Response) -> bool:
    """Return True if *resp* is llama.cpp's transient 503 'Loading model' reply.

    Anything other than a 503 never matches; for a 503, the (case-insensitive)
    body text must contain "loading model".
    """
    if resp.status_code != 503:
        return False
    try:
        body = resp.text
    except Exception:
        # Reading the body can itself fail (e.g. stream already consumed);
        # treat that conservatively as "not the loading-model case".
        return False
    return "loading model" in body.lower()
1360
+
1361
async def _wait_for_upstream_health(
    client: httpx.AsyncClient,
    max_wait: float = 60.0,
    poll_interval: float = 5.0,
) -> bool:
    """Poll the upstream llama.cpp ``/health`` endpoint until it is ready.

    Args:
        client: Shared async HTTP client used for the health probes.
        max_wait: Maximum total nominal wait (seconds) before giving up.
        poll_interval: Delay (seconds) between successive probes.

    Returns:
        True as soon as the endpoint answers HTTP 200; False if ``max_wait``
        elapses without a healthy response.

    NOTE(review): ``elapsed`` accumulates only the sleep intervals — time
    spent inside each GET (up to its 5s timeout) is not counted, so real
    wall time can exceed ``max_wait``.
    """
    health_url = LLAMA_CPP_BASE.replace("/v1", "/health")
    elapsed = 0.0
    while elapsed < max_wait:
        try:
            resp = await client.get(health_url, timeout=5.0)
            # Any HTTP 200 counts as healthy.  The previous version also
            # parsed the JSON body, but its check was a tautology
            # (`data.get("status") == "ok" or resp.status_code == 200`
            # inside the 200 branch), so the body never affected the
            # outcome; the dead parse/check is removed here.
            if resp.status_code == 200:
                if elapsed > 0:
                    logger.info(
                        "UPSTREAM HEALTH: recovered after %.0fs wait", elapsed
                    )
                return True
        except Exception:
            # Connection errors are expected while the server restarts;
            # keep polling until the deadline.
            pass
        await asyncio.sleep(poll_interval)
        elapsed += poll_interval
    logger.error("UPSTREAM HEALTH: not ready after %.0fs", max_wait)
    return False
1386
+
1387
+
1351
1388
  async def _post_with_retry(
1352
1389
  client: httpx.AsyncClient,
1353
1390
  url: str,
@@ -1357,7 +1394,19 @@ async def _post_with_retry(
1357
1394
  last_exc: Exception | None = None
1358
1395
  for attempt in range(PROXY_UPSTREAM_RETRY_MAX):
1359
1396
  try:
1360
- return await client.post(url, json=payload, headers=headers)
1397
+ resp = await client.post(url, json=payload, headers=headers)
1398
+ # Cycle 19 Option 1: if 503 "Loading model", wait for health then retry
1399
+ if _is_loading_model_503(resp):
1400
+ logger.warning(
1401
+ "Upstream 503 Loading model (attempt %d/%d) – waiting for health",
1402
+ attempt + 1,
1403
+ PROXY_UPSTREAM_RETRY_MAX,
1404
+ )
1405
+ healthy = await _wait_for_upstream_health(client, max_wait=60.0)
1406
+ if healthy and attempt < PROXY_UPSTREAM_RETRY_MAX - 1:
1407
+ continue # retry the request now that upstream is healthy
1408
+ return resp # return the 503 if health wait timed out
1409
+ return resp
1361
1410
  except (httpx.ConnectError, httpx.RemoteProtocolError, httpx.ReadTimeout) as exc:
1362
1411
  last_exc = exc
1363
1412
  if attempt < PROXY_UPSTREAM_RETRY_MAX - 1:
@@ -5991,6 +6040,27 @@ async def messages(request: Request):
5991
6040
 
5992
6041
  if strict_resp.status_code != 200:
5993
6042
  error_text = strict_resp.text[:1000]
6043
+ # Cycle 19 Option 2: For 503 "Loading model", don't advance state
6044
+ # machine — return retriable 503 with Retry-After header so the
6045
+ # client can retry without wasting state machine budget.
6046
+ if _is_loading_model_503(strict_resp):
6047
+ logger.warning(
6048
+ "Upstream 503 Loading model (strict-stream) — returning retriable 503 without advancing state",
6049
+ )
6050
+ return Response(
6051
+ content=json.dumps(
6052
+ {
6053
+ "type": "error",
6054
+ "error": {
6055
+ "type": "overloaded_error",
6056
+ "message": "Upstream model is loading. Retry in 10 seconds.",
6057
+ },
6058
+ }
6059
+ ),
6060
+ status_code=503,
6061
+ headers={"Retry-After": "10"},
6062
+ media_type="application/json",
6063
+ )
5994
6064
  logger.error(
5995
6065
  "Upstream HTTP %d (strict-stream): %s",
5996
6066
  strict_resp.status_code,
@@ -4796,3 +4796,25 @@ class TestCycle18SessionBanAndLogNoise(unittest.TestCase):
4796
4796
  finally:
4797
4797
  for k, v in old_vals.items():
4798
4798
  setattr(proxy, k, v)
4799
+
4800
+
4801
class TestUpstream503Resilience(unittest.TestCase):
    """Cycle 19: resilience against upstream 503 'Loading model' replies."""

    @staticmethod
    def _response(status, body):
        # Build a minimal httpx.Response carrying the given body text.
        return httpx.Response(status, text=body)

    def test_is_loading_model_503_detects_loading(self):
        """A 503 whose body mentions 'Loading model' is recognised."""
        body = '{"error":{"message":"Loading model","type":"unavailable_error","code":503}}'
        self.assertTrue(proxy._is_loading_model_503(self._response(503, body)))

    def test_is_loading_model_503_ignores_other_503(self):
        """A 503 carrying an unrelated error message does not match."""
        resp = self._response(503, '{"error":{"message":"Server busy"}}')
        self.assertFalse(proxy._is_loading_model_503(resp))

    def test_is_loading_model_503_ignores_200(self):
        """A 200 never matches, even when the body says loading model."""
        resp = self._response(200, '{"status":"loading model"}')
        self.assertFalse(proxy._is_loading_model_503(resp))