@miller-tech/uap 1.20.45 → 1.20.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -812,7 +812,12 @@ class SessionMonitor:
|
|
|
812
812
|
turns_str = f"~{turns} turns remaining" if turns is not None else "unknown"
|
|
813
813
|
|
|
814
814
|
if warning == "CRITICAL":
|
|
815
|
-
|
|
815
|
+
# WARNING, not ERROR: critical context utilization is a *handled*
|
|
816
|
+
# condition — the proxy force-prunes and the session continues.
|
|
817
|
+
# Logging it at ERROR floods the error stream (100+/2h for a few
|
|
818
|
+
# context-saturated agentic sessions) and drowns genuine failures.
|
|
819
|
+
# CONTEXT HIGH below is already WARNING; this keeps parity.
|
|
820
|
+
logger.warning(
|
|
816
821
|
"CONTEXT CRITICAL: %d/%d tokens (%.1f%%), %s, pruned=%d, overflows=%d",
|
|
817
822
|
self.last_input_tokens,
|
|
818
823
|
self.context_window,
|
|
@@ -1523,6 +1528,14 @@ PROXY_SLOT_SAVE_DIR = os.environ.get(
|
|
|
1523
1528
|
PROXY_SLOT_CACHE_MAX_FILES = int(os.environ.get("PROXY_SLOT_CACHE_MAX_FILES", "12"))
|
|
1524
1529
|
# llama-server slot id — always 0 under --parallel 1.
|
|
1525
1530
|
PROXY_SLOT_ID = int(os.environ.get("PROXY_SLOT_ID", "0"))
|
|
1531
|
+
# HTTP timeouts for the /slots save|restore calls. A large session's KV
|
|
1532
|
+
# state (131k ctx) is ~1 GiB; serializing it to / loading it from disk on
|
|
1533
|
+
# a slower model (e.g. Qwen3.6-35B-A3B MoE) can exceed the original
|
|
1534
|
+
# hardcoded 60s/120s, surfacing as `SLOT SAVE/RESTORE error` with an empty
|
|
1535
|
+
# httpx-timeout exception. Restore is given more headroom than save since
|
|
1536
|
+
# it also waits on the disk read + KV reload.
|
|
1537
|
+
PROXY_SLOT_SAVE_TIMEOUT = float(os.environ.get("PROXY_SLOT_SAVE_TIMEOUT", "180"))
|
|
1538
|
+
PROXY_SLOT_RESTORE_TIMEOUT = float(os.environ.get("PROXY_SLOT_RESTORE_TIMEOUT", "300"))
|
|
1526
1539
|
|
|
1527
1540
|
# Module state. Mutated only inside the upstream_semaphore-held section
|
|
1528
1541
|
# (_post_with_retry), so no extra lock is needed.
|
|
@@ -1556,7 +1569,9 @@ async def _save_slot(client: httpx.AsyncClient, session_id: str) -> bool:
|
|
|
1556
1569
|
fn = _slot_filename(session_id)
|
|
1557
1570
|
url = f"{_slot_endpoint_base()}/slots/{PROXY_SLOT_ID}?action=save"
|
|
1558
1571
|
try:
|
|
1559
|
-
resp = await client.post(
|
|
1572
|
+
resp = await client.post(
|
|
1573
|
+
url, json={"filename": fn}, timeout=PROXY_SLOT_SAVE_TIMEOUT
|
|
1574
|
+
)
|
|
1560
1575
|
if resp.status_code == 200:
|
|
1561
1576
|
logger.info("SLOT SAVE: session=%s -> %s", session_id, fn)
|
|
1562
1577
|
return True
|
|
@@ -1565,7 +1580,12 @@ async def _save_slot(client: httpx.AsyncClient, session_id: str) -> bool:
|
|
|
1565
1580
|
session_id, resp.status_code, resp.text[:200],
|
|
1566
1581
|
)
|
|
1567
1582
|
except Exception as exc:
|
|
1568
|
-
|
|
1583
|
+
# Include the exception TYPE — httpx timeout exceptions stringify
|
|
1584
|
+
# to "" and an empty message log line is undiagnosable.
|
|
1585
|
+
logger.warning(
|
|
1586
|
+
"SLOT SAVE error: session=%s %s: %s",
|
|
1587
|
+
session_id, type(exc).__name__, exc,
|
|
1588
|
+
)
|
|
1569
1589
|
return False
|
|
1570
1590
|
|
|
1571
1591
|
|
|
@@ -1581,7 +1601,9 @@ async def _restore_slot(client: httpx.AsyncClient, session_id: str) -> bool:
|
|
|
1581
1601
|
return False
|
|
1582
1602
|
url = f"{_slot_endpoint_base()}/slots/{PROXY_SLOT_ID}?action=restore"
|
|
1583
1603
|
try:
|
|
1584
|
-
resp = await client.post(
|
|
1604
|
+
resp = await client.post(
|
|
1605
|
+
url, json={"filename": fn}, timeout=PROXY_SLOT_RESTORE_TIMEOUT
|
|
1606
|
+
)
|
|
1585
1607
|
if resp.status_code == 200:
|
|
1586
1608
|
logger.info("SLOT RESTORE: session=%s <- %s", session_id, fn)
|
|
1587
1609
|
return True
|
|
@@ -1590,7 +1612,12 @@ async def _restore_slot(client: httpx.AsyncClient, session_id: str) -> bool:
|
|
|
1590
1612
|
session_id, resp.status_code, resp.text[:200],
|
|
1591
1613
|
)
|
|
1592
1614
|
except Exception as exc:
|
|
1593
|
-
|
|
1615
|
+
# Include the exception TYPE — httpx timeout exceptions stringify
|
|
1616
|
+
# to "" and an empty message log line is undiagnosable.
|
|
1617
|
+
logger.warning(
|
|
1618
|
+
"SLOT RESTORE error: session=%s %s: %s",
|
|
1619
|
+
session_id, type(exc).__name__, exc,
|
|
1620
|
+
)
|
|
1594
1621
|
return False
|
|
1595
1622
|
|
|
1596
1623
|
|
|
@@ -5212,14 +5212,14 @@ class TestThinkingBlockExtraction(unittest.TestCase):
|
|
|
5212
5212
|
|
|
5213
5213
|
|
|
5214
5214
|
class _SlotFakeClient:
|
|
5215
|
-
"""Records POST calls for slot tests."""
|
|
5215
|
+
"""Records POST calls (incl. the timeout kwarg) for slot tests."""
|
|
5216
5216
|
|
|
5217
5217
|
def __init__(self, status_code=200):
|
|
5218
5218
|
self.calls = []
|
|
5219
5219
|
self._status = status_code
|
|
5220
5220
|
|
|
5221
5221
|
async def post(self, url, json=None, timeout=None): # noqa: A002
|
|
5222
|
-
self.calls.append({"url": url, "json": json})
|
|
5222
|
+
self.calls.append({"url": url, "json": json, "timeout": timeout})
|
|
5223
5223
|
return _FakeResponse({}, status_code=self._status)
|
|
5224
5224
|
|
|
5225
5225
|
|
|
@@ -5240,6 +5240,8 @@ class TestSlotSaveRestore(unittest.TestCase):
|
|
|
5240
5240
|
"PROXY_SLOT_SAVE_RESTORE",
|
|
5241
5241
|
"PROXY_SLOT_CACHE_MAX_FILES",
|
|
5242
5242
|
"PROXY_SLOT_ID",
|
|
5243
|
+
"PROXY_SLOT_SAVE_TIMEOUT",
|
|
5244
|
+
"PROXY_SLOT_RESTORE_TIMEOUT",
|
|
5243
5245
|
"_slot_owner_session",
|
|
5244
5246
|
)
|
|
5245
5247
|
}
|
|
@@ -5326,6 +5328,29 @@ class TestSlotSaveRestore(unittest.TestCase):
|
|
|
5326
5328
|
self.assertIn("fp:aaaa", proxy._slot_lru)
|
|
5327
5329
|
self.assertIn("fp:bbbb", proxy._slot_lru)
|
|
5328
5330
|
|
|
5331
|
+
def test_slot_timeout_defaults_are_sane(self):
|
|
5332
|
+
"""Slot save/restore HTTP timeouts must be configurable and large
|
|
5333
|
+
enough for a slow model's ~1 GiB KV serialization. Restore gets more
|
|
5334
|
+
headroom than save (it also waits on disk read + KV reload)."""
|
|
5335
|
+
self.assertIsInstance(proxy.PROXY_SLOT_SAVE_TIMEOUT, float)
|
|
5336
|
+
self.assertIsInstance(proxy.PROXY_SLOT_RESTORE_TIMEOUT, float)
|
|
5337
|
+
# Both above the original hardcoded 60s/120s that were too tight
|
|
5338
|
+
# for the 35B-A3B (surfaced as empty-message SLOT SAVE/RESTORE errors).
|
|
5339
|
+
self.assertGreaterEqual(proxy.PROXY_SLOT_SAVE_TIMEOUT, 120.0)
|
|
5340
|
+
self.assertGreaterEqual(proxy.PROXY_SLOT_RESTORE_TIMEOUT, 180.0)
|
|
5341
|
+
self.assertGreaterEqual(
|
|
5342
|
+
proxy.PROXY_SLOT_RESTORE_TIMEOUT, proxy.PROXY_SLOT_SAVE_TIMEOUT
|
|
5343
|
+
)
|
|
5344
|
+
|
|
5345
|
+
def test_save_slot_passes_configured_timeout(self):
|
|
5346
|
+
"""_save_slot must hand its httpx POST the configured
|
|
5347
|
+
PROXY_SLOT_SAVE_TIMEOUT, not a hardcoded value."""
|
|
5348
|
+
proxy.PROXY_SLOT_SAVE_TIMEOUT = 222.0
|
|
5349
|
+
client = _SlotFakeClient(status_code=200)
|
|
5350
|
+
asyncio.run(proxy._save_slot(client, "fp:timeoutcheck"))
|
|
5351
|
+
self.assertEqual(len(client.calls), 1)
|
|
5352
|
+
self.assertEqual(client.calls[0]["timeout"], 222.0)
|
|
5353
|
+
|
|
5329
5354
|
def test_evict_slot_files_respects_lru_cap_and_owner(self):
|
|
5330
5355
|
"""LRU eviction removes oldest entries beyond the cap but never the
|
|
5331
5356
|
session currently owning the slot."""
|