@miller-tech/uap 1.20.45 → 1.20.46
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1523,6 +1523,14 @@ PROXY_SLOT_SAVE_DIR = os.environ.get(
|
|
|
1523
1523
|
PROXY_SLOT_CACHE_MAX_FILES = int(os.environ.get("PROXY_SLOT_CACHE_MAX_FILES", "12"))
|
|
1524
1524
|
# llama-server slot id — always 0 under --parallel 1.
|
|
1525
1525
|
PROXY_SLOT_ID = int(os.environ.get("PROXY_SLOT_ID", "0"))
|
|
1526
|
+
# HTTP timeouts for the /slots save|restore calls. A large session's KV
|
|
1527
|
+
# state (131k ctx) is ~1 GiB; serializing it to / loading it from disk on
|
|
1528
|
+
# a slower model (e.g. Qwen3.6-35B-A3B MoE) can exceed the original
|
|
1529
|
+
# hardcoded 60s/120s, surfacing as `SLOT SAVE/RESTORE error` with an empty
|
|
1530
|
+
# httpx-timeout exception. Restore is given more headroom than save since
|
|
1531
|
+
# it also waits on the disk read + KV reload.
|
|
1532
|
+
PROXY_SLOT_SAVE_TIMEOUT = float(os.environ.get("PROXY_SLOT_SAVE_TIMEOUT", "180"))
|
|
1533
|
+
PROXY_SLOT_RESTORE_TIMEOUT = float(os.environ.get("PROXY_SLOT_RESTORE_TIMEOUT", "300"))
|
|
1526
1534
|
|
|
1527
1535
|
# Module state. Mutated only inside the upstream_semaphore-held section
|
|
1528
1536
|
# (_post_with_retry), so no extra lock is needed.
|
|
@@ -1556,7 +1564,9 @@ async def _save_slot(client: httpx.AsyncClient, session_id: str) -> bool:
|
|
|
1556
1564
|
fn = _slot_filename(session_id)
|
|
1557
1565
|
url = f"{_slot_endpoint_base()}/slots/{PROXY_SLOT_ID}?action=save"
|
|
1558
1566
|
try:
|
|
1559
|
-
resp = await client.post(
|
|
1567
|
+
resp = await client.post(
|
|
1568
|
+
url, json={"filename": fn}, timeout=PROXY_SLOT_SAVE_TIMEOUT
|
|
1569
|
+
)
|
|
1560
1570
|
if resp.status_code == 200:
|
|
1561
1571
|
logger.info("SLOT SAVE: session=%s -> %s", session_id, fn)
|
|
1562
1572
|
return True
|
|
@@ -1565,7 +1575,12 @@ async def _save_slot(client: httpx.AsyncClient, session_id: str) -> bool:
|
|
|
1565
1575
|
session_id, resp.status_code, resp.text[:200],
|
|
1566
1576
|
)
|
|
1567
1577
|
except Exception as exc:
|
|
1568
|
-
|
|
1578
|
+
# Include the exception TYPE — httpx timeout exceptions stringify
|
|
1579
|
+
# to "" and an empty message log line is undiagnosable.
|
|
1580
|
+
logger.warning(
|
|
1581
|
+
"SLOT SAVE error: session=%s %s: %s",
|
|
1582
|
+
session_id, type(exc).__name__, exc,
|
|
1583
|
+
)
|
|
1569
1584
|
return False
|
|
1570
1585
|
|
|
1571
1586
|
|
|
@@ -1581,7 +1596,9 @@ async def _restore_slot(client: httpx.AsyncClient, session_id: str) -> bool:
|
|
|
1581
1596
|
return False
|
|
1582
1597
|
url = f"{_slot_endpoint_base()}/slots/{PROXY_SLOT_ID}?action=restore"
|
|
1583
1598
|
try:
|
|
1584
|
-
resp = await client.post(
|
|
1599
|
+
resp = await client.post(
|
|
1600
|
+
url, json={"filename": fn}, timeout=PROXY_SLOT_RESTORE_TIMEOUT
|
|
1601
|
+
)
|
|
1585
1602
|
if resp.status_code == 200:
|
|
1586
1603
|
logger.info("SLOT RESTORE: session=%s <- %s", session_id, fn)
|
|
1587
1604
|
return True
|
|
@@ -1590,7 +1607,12 @@ async def _restore_slot(client: httpx.AsyncClient, session_id: str) -> bool:
|
|
|
1590
1607
|
session_id, resp.status_code, resp.text[:200],
|
|
1591
1608
|
)
|
|
1592
1609
|
except Exception as exc:
|
|
1593
|
-
|
|
1610
|
+
# Include the exception TYPE — httpx timeout exceptions stringify
|
|
1611
|
+
# to "" and an empty message log line is undiagnosable.
|
|
1612
|
+
logger.warning(
|
|
1613
|
+
"SLOT RESTORE error: session=%s %s: %s",
|
|
1614
|
+
session_id, type(exc).__name__, exc,
|
|
1615
|
+
)
|
|
1594
1616
|
return False
|
|
1595
1617
|
|
|
1596
1618
|
|
|
@@ -5212,14 +5212,14 @@ class TestThinkingBlockExtraction(unittest.TestCase):
|
|
|
5212
5212
|
|
|
5213
5213
|
|
|
5214
5214
|
class _SlotFakeClient:
|
|
5215
|
-
"""Records POST calls for slot tests."""
|
|
5215
|
+
"""Records POST calls (incl. the timeout kwarg) for slot tests."""
|
|
5216
5216
|
|
|
5217
5217
|
def __init__(self, status_code=200):
|
|
5218
5218
|
self.calls = []
|
|
5219
5219
|
self._status = status_code
|
|
5220
5220
|
|
|
5221
5221
|
async def post(self, url, json=None, timeout=None): # noqa: A002
|
|
5222
|
-
self.calls.append({"url": url, "json": json})
|
|
5222
|
+
self.calls.append({"url": url, "json": json, "timeout": timeout})
|
|
5223
5223
|
return _FakeResponse({}, status_code=self._status)
|
|
5224
5224
|
|
|
5225
5225
|
|
|
@@ -5240,6 +5240,8 @@ class TestSlotSaveRestore(unittest.TestCase):
|
|
|
5240
5240
|
"PROXY_SLOT_SAVE_RESTORE",
|
|
5241
5241
|
"PROXY_SLOT_CACHE_MAX_FILES",
|
|
5242
5242
|
"PROXY_SLOT_ID",
|
|
5243
|
+
"PROXY_SLOT_SAVE_TIMEOUT",
|
|
5244
|
+
"PROXY_SLOT_RESTORE_TIMEOUT",
|
|
5243
5245
|
"_slot_owner_session",
|
|
5244
5246
|
)
|
|
5245
5247
|
}
|
|
@@ -5326,6 +5328,29 @@ class TestSlotSaveRestore(unittest.TestCase):
|
|
|
5326
5328
|
self.assertIn("fp:aaaa", proxy._slot_lru)
|
|
5327
5329
|
self.assertIn("fp:bbbb", proxy._slot_lru)
|
|
5328
5330
|
|
|
5331
|
+
def test_slot_timeout_defaults_are_sane(self):
|
|
5332
|
+
"""Slot save/restore HTTP timeouts must be configurable and large
|
|
5333
|
+
enough for a slow model's ~1 GiB KV serialization. Restore gets more
|
|
5334
|
+
headroom than save (it also waits on disk read + KV reload)."""
|
|
5335
|
+
self.assertIsInstance(proxy.PROXY_SLOT_SAVE_TIMEOUT, float)
|
|
5336
|
+
self.assertIsInstance(proxy.PROXY_SLOT_RESTORE_TIMEOUT, float)
|
|
5337
|
+
# Both above the original hardcoded 60s/120s that were too tight
|
|
5338
|
+
# for the 35B-A3B (surfaced as empty-message SLOT SAVE/RESTORE errors).
|
|
5339
|
+
self.assertGreaterEqual(proxy.PROXY_SLOT_SAVE_TIMEOUT, 120.0)
|
|
5340
|
+
self.assertGreaterEqual(proxy.PROXY_SLOT_RESTORE_TIMEOUT, 180.0)
|
|
5341
|
+
self.assertGreaterEqual(
|
|
5342
|
+
proxy.PROXY_SLOT_RESTORE_TIMEOUT, proxy.PROXY_SLOT_SAVE_TIMEOUT
|
|
5343
|
+
)
|
|
5344
|
+
|
|
5345
|
+
def test_save_slot_passes_configured_timeout(self):
|
|
5346
|
+
"""_save_slot must hand its httpx POST the configured
|
|
5347
|
+
PROXY_SLOT_SAVE_TIMEOUT, not a hardcoded value."""
|
|
5348
|
+
proxy.PROXY_SLOT_SAVE_TIMEOUT = 222.0
|
|
5349
|
+
client = _SlotFakeClient(status_code=200)
|
|
5350
|
+
asyncio.run(proxy._save_slot(client, "fp:timeoutcheck"))
|
|
5351
|
+
self.assertEqual(len(client.calls), 1)
|
|
5352
|
+
self.assertEqual(client.calls[0]["timeout"], 222.0)
|
|
5353
|
+
|
|
5329
5354
|
def test_evict_slot_files_respects_lru_cap_and_owner(self):
|
|
5330
5355
|
"""LRU eviction removes oldest entries beyond the cap but never the
|
|
5331
5356
|
session currently owning the slot."""
|