@miller-tech/uap 1.20.41 → 1.20.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.41",
3
+ "version": "1.20.42",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -82,6 +82,7 @@ Dependencies
82
82
  """
83
83
 
84
84
  import asyncio
85
+ import contextvars
85
86
  import copy
86
87
  import hashlib
87
88
  import json
@@ -91,7 +92,7 @@ import re
91
92
  import sys
92
93
  import time
93
94
  import uuid
94
- from collections import defaultdict, deque
95
+ from collections import OrderedDict, defaultdict, deque
95
96
  from dataclasses import dataclass, field
96
97
  from pathlib import Path
97
98
 
@@ -1492,6 +1493,185 @@ PROXY_CONCURRENCY_QUEUE_TIMEOUT = float(
1492
1493
  upstream_semaphore: asyncio.Semaphore | None = None
1493
1494
 
1494
1495
 
1496
# ---------------------------------------------------------------------------
# Slot save/restore — cross-session KV-cache preservation
# ---------------------------------------------------------------------------
# llama.cpp runs --parallel 1 (a single slot). When N distinct client
# sessions multiplex onto that slot, each session switch evicts the prior
# session's KV cache: the incoming request shares only the ~32-token
# chat-template header, so llama-server force-reprocesses the entire prompt
# (observed: ~17% of requests, 60-96s of prompt eval each).
#
# When PROXY_SLOT_SAVE_RESTORE is on, the proxy saves the outgoing session's
# slot KV state to disk and restores the incoming session's state on a
# switch, via llama-server's /slots/{id}?action=save|restore API (requires
# the server to be launched with --slot-save-path). A restore reads
# ~150-940 MiB from disk (~1-3s) instead of a 60-96s full recompute.
#
# Default OFF — opt in per-deployment via PROXY_SLOT_SAVE_RESTORE=on.
# Parsing is a deny-list: only "", "0", "off", "false" and "no"
# (case-insensitive) disable the feature. Every other value enables it,
# including unexpected spellings such as "disabled".
PROXY_SLOT_SAVE_RESTORE = os.environ.get(
    "PROXY_SLOT_SAVE_RESTORE", "off"
).lower() not in {"", "0", "off", "false", "no"}
# Directory the proxy uses for its own LRU bookkeeping + startup cleanup.
# MUST match the llama-server --slot-save-path value: the server resolves
# the filename the proxy sends relative to its own --slot-save-path.
# NOTE(review): the fallback is an absolute path under one specific user's
# home directory; deployments running as another user presumably need to
# set PROXY_SLOT_SAVE_DIR explicitly — confirm.
PROXY_SLOT_SAVE_DIR = os.environ.get(
    "PROXY_SLOT_SAVE_DIR", "/home/cogtek/.cache/uap/llama-slots"
)
# Max saved slot files kept on disk; least-recently-used files are evicted
# beyond this. Each file can be ~1 GiB for a 100k-token session.
# A non-integer value raises ValueError from int() while the module loads.
PROXY_SLOT_CACHE_MAX_FILES = int(os.environ.get("PROXY_SLOT_CACHE_MAX_FILES", "12"))
# llama-server slot id — always 0 under --parallel 1.
# Same int() parsing as above: a non-integer value raises at import time.
PROXY_SLOT_ID = int(os.environ.get("PROXY_SLOT_ID", "0"))

# Module state. Mutated only inside the upstream_semaphore-held section
# (_post_with_retry), so no extra lock is needed.
# NOTE(review): that holds only while the semaphore admits a single request
# at a time — the semaphore's limit is not visible here; confirm.
_slot_owner_session: str | None = None  # session id currently loaded in the slot
# Insertion order doubles as recency order (oldest first); the float value is
# the wall-clock time of the last access.
_slot_lru: "OrderedDict[str, float]" = OrderedDict()  # session -> last-access ts

# Per-request session id, set by the request handler and read by
# _ensure_slot_for_session inside _post_with_retry. A ContextVar keeps the
# value request-local without threading it through every call signature.
# Reads return None when no handler has set a value in the current context.
_current_request_session: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "uap_current_request_session", default=None
)
1538
+
1539
+
1540
def _slot_endpoint_base() -> str:
    """Return the llama-server root URL that hosts the /slots API.

    Derived from LLAMA_CPP_BASE: trailing slashes are dropped first, then a
    single trailing "/v1" path segment is removed if present.
    """
    api_suffix = "/v1"
    root = LLAMA_CPP_BASE.rstrip("/")
    # The slot endpoints sit at the server root rather than under /v1.
    return root[: -len(api_suffix)] if root.endswith(api_suffix) else root
1546
+
1547
+
1548
+ def _slot_filename(session_id: str) -> str:
1549
+ """Map a session id to a filesystem-safe slot-state filename."""
1550
+ safe = re.sub(r"[^A-Za-z0-9_.-]", "_", session_id)
1551
+ return f"slot-{safe}.bin"
1552
+
1553
+
1554
async def _save_slot(client: httpx.AsyncClient, session_id: str) -> bool:
    """Ask llama-server to write the slot's KV state to *session_id*'s file.

    Sends ``POST /slots/{PROXY_SLOT_ID}?action=save`` with the session's
    slot filename and a 60 second timeout.

    Returns:
        True only when the server answers HTTP 200. Any other status, or any
        exception raised by the request, is logged as a warning and yields
        False — the function never raises for request failures.
    """
    filename = _slot_filename(session_id)
    save_url = f"{_slot_endpoint_base()}/slots/{PROXY_SLOT_ID}?action=save"
    try:
        response = await client.post(
            save_url, json={"filename": filename}, timeout=60.0
        )
        if response.status_code != 200:
            # Only the first 200 characters of the body are logged.
            logger.warning(
                "SLOT SAVE failed: session=%s http=%d %s",
                session_id, response.status_code, response.text[:200],
            )
            return False
        logger.info("SLOT SAVE: session=%s -> %s", session_id, filename)
        return True
    except Exception as exc:
        # Best effort: a failed save must not fail the client's request.
        logger.warning("SLOT SAVE error: session=%s %s", session_id, exc)
        return False
1570
+
1571
+
1572
async def _restore_slot(client: httpx.AsyncClient, session_id: str) -> bool:
    """Ask llama-server to load *session_id*'s saved KV state into the slot.

    The request is skipped entirely when the session's file is not present
    under PROXY_SLOT_SAVE_DIR. Otherwise this sends
    ``POST /slots/{PROXY_SLOT_ID}?action=restore`` with a 120 second timeout.

    Returns:
        True only when the server answers HTTP 200. A missing file, a
        non-200 status, or any exception from the request yields False, in
        which case the caller carries on with an ordinary upstream call
        (full prompt reprocess).
    """
    filename = _slot_filename(session_id)
    if not os.path.exists(os.path.join(PROXY_SLOT_SAVE_DIR, filename)):
        # Nothing was saved for this session: a clean cache miss.
        return False
    restore_url = f"{_slot_endpoint_base()}/slots/{PROXY_SLOT_ID}?action=restore"
    try:
        response = await client.post(
            restore_url, json={"filename": filename}, timeout=120.0
        )
        if response.status_code != 200:
            # Only the first 200 characters of the body are logged.
            logger.warning(
                "SLOT RESTORE failed: session=%s http=%d %s",
                session_id, response.status_code, response.text[:200],
            )
            return False
        logger.info("SLOT RESTORE: session=%s <- %s", session_id, filename)
        return True
    except Exception as exc:
        # Best effort: a failed restore must not fail the client's request.
        logger.warning("SLOT RESTORE error: session=%s %s", session_id, exc)
        return False
1595
+
1596
+
1597
def _evict_slot_files() -> None:
    """LRU-evict saved slot files beyond PROXY_SLOT_CACHE_MAX_FILES.

    Walks ``_slot_lru`` oldest-access first and drops as many entries as the
    table exceeds the cap by, deleting each entry's file from
    PROXY_SLOT_SAVE_DIR. The session currently owning the slot is never
    evicted (its file is the live restore point), so fewer entries than the
    excess may be removed when the owner is among the oldest.

    A file that is already missing counts as evicted. When deletion fails
    for any other reason the entry is still dropped from the table so the
    bookkeeping stays bounded, but the failure is reported as a warning
    instead of being logged as a successful removal.
    """
    excess = len(_slot_lru) - PROXY_SLOT_CACHE_MAX_FILES
    if excess <= 0:
        return
    # Snapshot the victims first: the table is mutated inside the loop.
    victims = [s for s in _slot_lru if s != _slot_owner_session][:excess]
    for old_session in victims:
        old_path = os.path.join(PROXY_SLOT_SAVE_DIR, _slot_filename(old_session))
        file_gone = True
        try:
            os.remove(old_path)
        except FileNotFoundError:
            # Tracked sessions are not guaranteed to have a file on disk.
            pass
        except OSError as exc:
            file_gone = False
            logger.warning(
                "SLOT EVICT error: session=%s path=%s %s "
                "(file left on disk, no longer tracked)",
                old_session, old_path, exc,
            )
        del _slot_lru[old_session]
        if file_gone:
            logger.info(
                "SLOT EVICT: removed LRU slot file for session=%s", old_session
            )
1617
+
1618
+
1619
async def _ensure_slot_for_session(
    client: httpx.AsyncClient | None, session_id: str | None
) -> None:
    """Make the upstream slot hold *session_id*'s KV state.

    Called inside the upstream_semaphore-held section, so module state is
    mutated without an extra lock. No-op when slot save/restore is disabled
    or the slot already belongs to this session. On a session switch, saves
    the outgoing session's state and restores the incoming session's.

    Args:
        client: HTTP client used for the save/restore calls; ``None`` makes
            the call a no-op.
        session_id: Session that is about to use the slot; ``None`` or an
            empty string makes the call a no-op.

    Save and restore failures are absorbed by the helpers (they return
    False), so this function records the new owner either way.
    """
    global _slot_owner_session
    # Feature off, no session to key on, or no client to talk to the server.
    if not PROXY_SLOT_SAVE_RESTORE or not session_id or client is None:
        return
    if session_id == _slot_owner_session:
        # Same session as last time: only refresh its recency, if tracked.
        if session_id in _slot_lru:
            _slot_lru.move_to_end(session_id)
        return
    if _slot_owner_session is not None:
        # Track the outgoing session only when its state was actually saved.
        if await _save_slot(client, _slot_owner_session):
            _slot_lru[_slot_owner_session] = time.time()
            # Assigning to an existing key keeps its position, hence the move.
            _slot_lru.move_to_end(_slot_owner_session)
    # The result is deliberately ignored: on a miss or failure the upstream
    # call simply reprocesses the prompt.
    await _restore_slot(client, session_id)
    _slot_owner_session = session_id
    # The incoming session is tracked even if it has no file on disk yet, so
    # it counts toward the cap enforced by _evict_slot_files.
    _slot_lru[session_id] = time.time()
    _slot_lru.move_to_end(session_id)
    _evict_slot_files()
1645
+
1646
+
1647
def _prepare_slot_save_dir() -> None:
    """Create + clear the slot-save directory at proxy startup.

    Stale files from a previous run may be shape-incompatible with the
    current model (e.g. after a 35B->27B switch); restoring a mismatched
    file could crash or corrupt the slot. Clearing on startup is the safe
    belt-and-suspenders move — cross-restart cache reuse is sacrificed for
    correctness. llama-server itself also rejects mismatched restores, but
    we do not rely on that alone.

    Does nothing when PROXY_SLOT_SAVE_RESTORE is off. Otherwise the LRU
    table is emptied, the directory is created if needed, and every entry
    named ``slot-*.bin`` is deleted. Deletion failures are handled per file:
    one undeletable entry is logged and skipped rather than aborting the
    cleanup of the remaining files. Never raises for filesystem errors.
    """
    if not PROXY_SLOT_SAVE_RESTORE:
        return
    # Reset bookkeeping first so it is cleared even if the directory work
    # below fails part-way.
    _slot_lru.clear()
    try:
        os.makedirs(PROXY_SLOT_SAVE_DIR, exist_ok=True)
        entries = os.listdir(PROXY_SLOT_SAVE_DIR)
    except OSError as exc:
        logger.warning("SLOT SAVE/RESTORE: startup dir prep failed: %s", exc)
        return
    removed = 0
    failed = 0
    for name in entries:
        # Only touch files that follow the proxy's own naming scheme.
        if not (name.startswith("slot-") and name.endswith(".bin")):
            continue
        try:
            os.remove(os.path.join(PROXY_SLOT_SAVE_DIR, name))
        except FileNotFoundError:
            # Vanished between listdir() and remove(); nothing left to do.
            continue
        except OSError as exc:
            failed += 1
            logger.warning(
                "SLOT SAVE/RESTORE: could not remove stale file %s: %s", name, exc
            )
        else:
            removed += 1
    logger.info(
        "SLOT SAVE/RESTORE: enabled, dir=%s, cleared %d stale file(s) on startup",
        PROXY_SLOT_SAVE_DIR, removed,
    )
    if failed:
        # Leftover files can still be picked up by a later restore attempt.
        logger.warning(
            "SLOT SAVE/RESTORE: %d stale file(s) could not be removed from %s",
            failed, PROXY_SLOT_SAVE_DIR,
        )
1673
+
1674
+
1495
1675
  async def _acquire_upstream_slot() -> bool:
1496
1676
  """Acquire a semaphore slot for an upstream request.
1497
1677
 
@@ -1588,6 +1768,9 @@ async def _post_with_retry(
1588
1768
  request=None,
1589
1769
  )
1590
1770
  try:
1771
+ # Inside the serialized section: swap the upstream slot's KV state to
1772
+ # this request's session if needed (no-op when disabled or unchanged).
1773
+ await _ensure_slot_for_session(client, _current_request_session.get())
1591
1774
  return await _post_with_retry_inner(client, url, payload, headers)
1592
1775
  finally:
1593
1776
  _release_upstream_slot()
@@ -1715,6 +1898,7 @@ async def lifespan(app: FastAPI):
1715
1898
  PROXY_CONCURRENCY_LIMIT,
1716
1899
  PROXY_CONCURRENCY_QUEUE_TIMEOUT,
1717
1900
  )
1901
+ _prepare_slot_save_dir()
1718
1902
  http_client = httpx.AsyncClient(
1719
1903
  timeout=httpx.Timeout(
1720
1904
  connect=10.0, # 10s to establish connection
@@ -7155,6 +7339,12 @@ async def messages(request: Request):
7155
7339
  session_id = resolve_session_id(request, body)
7156
7340
  monitor = get_session_monitor(session_id)
7157
7341
  last_session_id = session_id
7342
+ # Make the session id visible to _ensure_slot_for_session inside
7343
+ # _post_with_retry. The /v1/chat/completions handler also reaches this
7344
+ # path (it builds a synthetic request and calls messages()), so this
7345
+ # single set covers both the Anthropic and OpenAI-passthrough entry
7346
+ # points for local llama-server requests.
7347
+ _current_request_session.set(session_id)
7158
7348
 
7159
7349
  profile_prompt_suffix = None
7160
7350
  profile_grammar = None
@@ -5135,3 +5135,140 @@ class TestThinkingBlockExtraction(unittest.TestCase):
5135
5135
  # Body has the prose between the two blocks
5136
5136
  self.assertEqual(body, "partial answer")
5137
5137
  self.assertNotIn("<think>", body)
5138
+
5139
+
5140
+ class _SlotFakeClient:
5141
+ """Records POST calls for slot save/restore tests."""
5142
+
5143
+ def __init__(self, status_code=200):
5144
+ self.calls = []
5145
+ self._status = status_code
5146
+
5147
+ async def post(self, url, json=None, timeout=None): # noqa: A002
5148
+ self.calls.append({"url": url, "json": json})
5149
+ return _FakeResponse({}, status_code=self._status)
5150
+
5151
+
5152
class TestSlotSaveRestore(unittest.TestCase):
    """Tests for the cross-session KV-cache slot save/restore feature.

    Prevents the regression where N agentic sessions multiplexing onto
    llama-server's single slot (--parallel 1) each evict the prior
    session's KV cache, forcing a 60-96s full prompt reprocess (~17% of
    requests). The proxy saves the outgoing session's slot state and
    restores the incoming session's on a switch.

    Every test runs against a private temporary slot directory so results
    do not depend on — and cannot delete — files in the real
    PROXY_SLOT_SAVE_DIR of the machine running the suite.
    """

    def setUp(self):
        import tempfile

        # Snapshot + reset module state touched by these tests.
        self._saved = {
            k: getattr(proxy, k)
            for k in (
                "PROXY_SLOT_SAVE_RESTORE",
                "PROXY_SLOT_SAVE_DIR",
                "PROXY_SLOT_CACHE_MAX_FILES",
                "PROXY_SLOT_ID",
                "_slot_owner_session",
            )
        }
        self._saved_lru = list(proxy._slot_lru.items())
        proxy._slot_owner_session = None
        proxy._slot_lru.clear()
        # Redirect all slot-file lookups/removals to an isolated directory.
        self._tmpdir = tempfile.TemporaryDirectory()
        self.addCleanup(self._tmpdir.cleanup)
        proxy.PROXY_SLOT_SAVE_DIR = self._tmpdir.name

    def tearDown(self):
        for k, v in self._saved.items():
            setattr(proxy, k, v)
        proxy._slot_lru.clear()
        proxy._slot_lru.update(self._saved_lru)

    def _touch_slot_file(self, session_id):
        """Create an empty saved-slot file for *session_id*; return its path."""
        import os

        path = os.path.join(
            proxy.PROXY_SLOT_SAVE_DIR, proxy._slot_filename(session_id)
        )
        with open(path, "wb"):
            pass
        return path

    def test_slot_filename_sanitizes_session_id(self):
        """Session ids like 'fp:abc123' / 'hdr:weird/value' must become
        filesystem-safe filenames."""
        self.assertEqual(
            proxy._slot_filename("fp:5735f94edf4bccb31e1e"),
            "slot-fp_5735f94edf4bccb31e1e.bin",
        )
        self.assertEqual(
            proxy._slot_filename("hdr:weird/value with spaces"),
            "slot-hdr_weird_value_with_spaces.bin",
        )
        # Already-safe characters are preserved
        self.assertEqual(proxy._slot_filename("meta.abc-1_2"), "slot-meta.abc-1_2.bin")

    def test_slot_endpoint_base_strips_v1_suffix(self):
        """The /slots API lives at the server root, not under /v1."""
        old = proxy.LLAMA_CPP_BASE
        try:
            proxy.LLAMA_CPP_BASE = "http://127.0.0.1:8080/v1"
            self.assertEqual(proxy._slot_endpoint_base(), "http://127.0.0.1:8080")
            proxy.LLAMA_CPP_BASE = "http://127.0.0.1:8080/v1/"
            self.assertEqual(proxy._slot_endpoint_base(), "http://127.0.0.1:8080")
        finally:
            proxy.LLAMA_CPP_BASE = old

    def test_ensure_slot_noop_when_disabled(self):
        """With PROXY_SLOT_SAVE_RESTORE off, no client calls are made."""
        proxy.PROXY_SLOT_SAVE_RESTORE = False
        client = _SlotFakeClient()
        asyncio.run(proxy._ensure_slot_for_session(client, "fp:abc"))
        self.assertEqual(client.calls, [])
        self.assertIsNone(proxy._slot_owner_session)

    def test_ensure_slot_noop_when_session_unchanged(self):
        """When the slot already owns the session, no save/restore fires."""
        proxy.PROXY_SLOT_SAVE_RESTORE = True
        proxy._slot_owner_session = "fp:same"
        client = _SlotFakeClient()
        asyncio.run(proxy._ensure_slot_for_session(client, "fp:same"))
        self.assertEqual(client.calls, [])
        self.assertEqual(proxy._slot_owner_session, "fp:same")

    def test_ensure_slot_first_session_no_save(self):
        """The very first session (owner is None) triggers no save — there
        is nothing in the slot to preserve. Restore is attempted but a
        missing file is a clean miss (no client call)."""
        proxy.PROXY_SLOT_SAVE_RESTORE = True
        proxy._slot_owner_session = None
        client = _SlotFakeClient()
        asyncio.run(proxy._ensure_slot_for_session(client, "fp:first"))
        # No saved file for fp:first exists -> _restore_slot returns early,
        # no save needed for a None owner -> zero client calls.
        self.assertEqual(client.calls, [])
        self.assertEqual(proxy._slot_owner_session, "fp:first")

    def test_ensure_slot_switch_saves_outgoing_session(self):
        """Switching from session A to B saves A's slot state. B's restore
        is a clean miss (no file) so only the save POST is observed."""
        proxy.PROXY_SLOT_SAVE_RESTORE = True
        proxy._slot_owner_session = "fp:aaaa"
        client = _SlotFakeClient(status_code=200)
        asyncio.run(proxy._ensure_slot_for_session(client, "fp:bbbb"))
        # Exactly one POST: the save of the outgoing session A.
        self.assertEqual(len(client.calls), 1)
        self.assertIn("action=save", client.calls[0]["url"])
        self.assertEqual(
            client.calls[0]["json"]["filename"], "slot-fp_aaaa.bin"
        )
        self.assertEqual(proxy._slot_owner_session, "fp:bbbb")
        # Both sessions are now tracked in the LRU.
        self.assertIn("fp:aaaa", proxy._slot_lru)
        self.assertIn("fp:bbbb", proxy._slot_lru)

    def test_ensure_slot_switch_restores_incoming_session(self):
        """When B already has a saved file, switching A -> B issues the save
        of A followed by the restore of B, in that order."""
        proxy.PROXY_SLOT_SAVE_RESTORE = True
        proxy._slot_owner_session = "fp:aaaa"
        self._touch_slot_file("fp:bbbb")
        client = _SlotFakeClient(status_code=200)
        asyncio.run(proxy._ensure_slot_for_session(client, "fp:bbbb"))
        self.assertEqual(len(client.calls), 2)
        self.assertIn("action=save", client.calls[0]["url"])
        self.assertEqual(
            client.calls[0]["json"]["filename"], "slot-fp_aaaa.bin"
        )
        self.assertIn("action=restore", client.calls[1]["url"])
        self.assertEqual(
            client.calls[1]["json"]["filename"], "slot-fp_bbbb.bin"
        )
        self.assertEqual(proxy._slot_owner_session, "fp:bbbb")

    def test_evict_slot_files_respects_lru_cap_and_owner(self):
        """LRU eviction removes oldest entries beyond the cap but never the
        session currently owning the slot."""
        proxy.PROXY_SLOT_SAVE_RESTORE = True
        proxy.PROXY_SLOT_CACHE_MAX_FILES = 3
        proxy._slot_owner_session = "fp:owner"
        # Insert 5 entries oldest-first; owner inserted early so it would be
        # an eviction candidate by age — but must be protected.
        for i, sess in enumerate(
            ["fp:owner", "fp:old1", "fp:old2", "fp:new1", "fp:new2"]
        ):
            proxy._slot_lru[sess] = float(i)
        proxy._evict_slot_files()
        # Cap is 3; 5 entries -> 2 evicted. Owner protected, so the 2 oldest
        # non-owner entries (old1, old2) are evicted.
        self.assertNotIn("fp:old1", proxy._slot_lru)
        self.assertNotIn("fp:old2", proxy._slot_lru)
        self.assertIn("fp:owner", proxy._slot_lru)
        self.assertIn("fp:new1", proxy._slot_lru)
        self.assertIn("fp:new2", proxy._slot_lru)

    def test_evict_slot_files_deletes_files_on_disk(self):
        """Eviction removes the evicted session's file from the slot
        directory and leaves the owner's file in place."""
        import os

        proxy.PROXY_SLOT_SAVE_RESTORE = True
        proxy.PROXY_SLOT_CACHE_MAX_FILES = 1
        proxy._slot_owner_session = "fp:owner"
        paths = {}
        for i, sess in enumerate(["fp:old", "fp:owner"]):
            proxy._slot_lru[sess] = float(i)
            paths[sess] = self._touch_slot_file(sess)
        proxy._evict_slot_files()
        self.assertFalse(os.path.exists(paths["fp:old"]))
        self.assertTrue(os.path.exists(paths["fp:owner"]))
        self.assertNotIn("fp:old", proxy._slot_lru)
        self.assertIn("fp:owner", proxy._slot_lru)