npm - @miller-tech/uap - Versions diffs - 1.20.41 → 1.20.42 - Mend

@miller-tech/uap 1.20.41 → 1.20.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/package.json +1 -1
package/tools/agents/scripts/anthropic_proxy.py +191 -1
package/tools/agents/tests/test_anthropic_proxy_streaming.py +137 -0

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@miller-tech/uap",
-  "version": "1.20.41",
+  "version": "1.20.42",
   "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
   "type": "module",
   "main": "dist/index.js",

package/tools/agents/scripts/anthropic_proxy.py CHANGED

@@ -82,6 +82,7 @@ Dependencies
 """
 import asyncio
+import contextvars
 import copy
 import hashlib
 import json
@@ -91,7 +92,7 @@ import re
 import sys
 import time
 import uuid
-from collections import defaultdict, deque
+from collections import OrderedDict, defaultdict, deque
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -1492,6 +1493,185 @@ PROXY_CONCURRENCY_QUEUE_TIMEOUT = float(
 upstream_semaphore: asyncio.Semaphore | None = None
+# ---------------------------------------------------------------------------
+# Slot save/restore — cross-session KV-cache preservation
+# ---------------------------------------------------------------------------
+# llama.cpp runs --parallel 1 (a single slot). When N distinct client
+# sessions multiplex onto that slot, each session switch evicts the prior
+# session's KV cache: the incoming request shares only the ~32-token
+# chat-template header, so llama-server force-reprocesses the entire prompt
+# (observed: ~17% of requests, 60-96s of prompt eval each).
+#
+# When PROXY_SLOT_SAVE_RESTORE is on, the proxy saves the outgoing session's
+# slot KV state to disk and restores the incoming session's state on a
+# switch, via llama-server's /slots/{id}?action=save|restore API (requires
+# the server to be launched with --slot-save-path). A restore reads
+# ~150-940 MiB from disk (~1-3s) instead of a 60-96s full recompute.
+#
+# Default OFF — opt in per-deployment via PROXY_SLOT_SAVE_RESTORE=on.
+PROXY_SLOT_SAVE_RESTORE = os.environ.get(
+    "PROXY_SLOT_SAVE_RESTORE", "off"
+).lower() not in {"", "0", "off", "false", "no"}
+# Directory the proxy uses for its own LRU bookkeeping + startup cleanup.
+# MUST match the llama-server --slot-save-path value: the server resolves
+# the filename the proxy sends relative to its own --slot-save-path.
+PROXY_SLOT_SAVE_DIR = os.environ.get(
+    "PROXY_SLOT_SAVE_DIR", "/home/cogtek/.cache/uap/llama-slots"
+)
+# Max saved slot files kept on disk; least-recently-used files are evicted
+# beyond this. Each file can be ~1 GiB for a 100k-token session.
+PROXY_SLOT_CACHE_MAX_FILES = int(os.environ.get("PROXY_SLOT_CACHE_MAX_FILES", "12"))
+# llama-server slot id — always 0 under --parallel 1.
+PROXY_SLOT_ID = int(os.environ.get("PROXY_SLOT_ID", "0"))
+# Module state. Mutated only inside the upstream_semaphore-held section
+# (_post_with_retry), so no extra lock is needed.
+_slot_owner_session: str | None = None  # session id currently loaded in the slot
+_slot_lru: "OrderedDict[str, float]" = OrderedDict()  # session -> last-access ts
+# Per-request session id, set by the request handler and read by
+# _ensure_slot_for_session inside _post_with_retry. A ContextVar keeps the
+# value request-local without threading it through every call signature.
+_current_request_session: contextvars.ContextVar[str | None] = contextvars.ContextVar(
+    "uap_current_request_session", default=None
+)
+def _slot_endpoint_base() -> str:
+    """Base URL for llama-server's /slots endpoint (LLAMA_CPP_BASE without /v1)."""
+    base = LLAMA_CPP_BASE.rstrip("/")
+    if base.endswith("/v1"):
+        base = base[: -len("/v1")]
+    return base
+def _slot_filename(session_id: str) -> str:
+    """Map a session id to a filesystem-safe slot-state filename."""
+    safe = re.sub(r"[^A-Za-z0-9_.-]", "_", session_id)
+    return f"slot-{safe}.bin"
+async def _save_slot(client: httpx.AsyncClient, session_id: str) -> bool:
+    """Persist the current slot KV state under *session_id*'s filename."""
+    fn = _slot_filename(session_id)
+    url = f"{_slot_endpoint_base()}/slots/{PROXY_SLOT_ID}?action=save"
+    try:
+        resp = await client.post(url, json={"filename": fn}, timeout=60.0)
+        if resp.status_code == 200:
+            logger.info("SLOT SAVE: session=%s -> %s", session_id, fn)
+            return True
+        logger.warning(
+            "SLOT SAVE failed: session=%s http=%d %s",
+            session_id, resp.status_code, resp.text[:200],
+        )
+    except Exception as exc:
+        logger.warning("SLOT SAVE error: session=%s %s", session_id, exc)
+    return False
+async def _restore_slot(client: httpx.AsyncClient, session_id: str) -> bool:
+    """Restore *session_id*'s saved slot KV state.
+    Returns False if no saved file exists or the restore failed — the caller
+    then proceeds with a normal (full-reprocess) upstream call.
+    """
+    fn = _slot_filename(session_id)
+    path = os.path.join(PROXY_SLOT_SAVE_DIR, fn)
+    if not os.path.exists(path):
+        return False
+    url = f"{_slot_endpoint_base()}/slots/{PROXY_SLOT_ID}?action=restore"
+    try:
+        resp = await client.post(url, json={"filename": fn}, timeout=120.0)
+        if resp.status_code == 200:
+            logger.info("SLOT RESTORE: session=%s <- %s", session_id, fn)
+            return True
+        logger.warning(
+            "SLOT RESTORE failed: session=%s http=%d %s",
+            session_id, resp.status_code, resp.text[:200],
+        )
+    except Exception as exc:
+        logger.warning("SLOT RESTORE error: session=%s %s", session_id, exc)
+    return False
+def _evict_slot_files() -> None:
+    """LRU-evict saved slot files beyond PROXY_SLOT_CACHE_MAX_FILES.
+    The session currently owning the slot is never evicted (its file is the
+    live restore point). Eviction order is oldest-access first.
+    """
+    if len(_slot_lru) <= PROXY_SLOT_CACHE_MAX_FILES:
+        return
+    evictable = [s for s in _slot_lru if s != _slot_owner_session]
+    excess = len(_slot_lru) - PROXY_SLOT_CACHE_MAX_FILES
+    for old_session in evictable[:excess]:
+        old_path = os.path.join(PROXY_SLOT_SAVE_DIR, _slot_filename(old_session))
+        try:
+            os.remove(old_path)
+        except FileNotFoundError:
+            pass
+        except OSError as exc:
+            logger.warning("SLOT EVICT error: %s", exc)
+        del _slot_lru[old_session]
+        logger.info("SLOT EVICT: removed LRU slot file for session=%s", old_session)
+async def _ensure_slot_for_session(
+    client: httpx.AsyncClient | None, session_id: str | None
+) -> None:
+    """Make the upstream slot hold *session_id*'s KV state.
+    Called inside the upstream_semaphore-held section, so module state is
+    mutated without an extra lock. No-op when slot save/restore is disabled
+    or the slot already belongs to this session. On a session switch, saves
+    the outgoing session's state and restores the incoming session's.
+    """
+    global _slot_owner_session
+    if not PROXY_SLOT_SAVE_RESTORE or not session_id or client is None:
+        return
+    if session_id == _slot_owner_session:
+        if session_id in _slot_lru:
+            _slot_lru.move_to_end(session_id)
+        return
+    if _slot_owner_session is not None:
+        if await _save_slot(client, _slot_owner_session):
+            _slot_lru[_slot_owner_session] = time.time()
+            _slot_lru.move_to_end(_slot_owner_session)
+    await _restore_slot(client, session_id)
+    _slot_owner_session = session_id
+    _slot_lru[session_id] = time.time()
+    _slot_lru.move_to_end(session_id)
+    _evict_slot_files()
+def _prepare_slot_save_dir() -> None:
+    """Create + clear the slot-save directory at proxy startup.
+    Stale files from a previous run may be shape-incompatible with the
+    current model (e.g. after a 35B->27B switch); restoring a mismatched
+    file could crash or corrupt the slot. Clearing on startup is the safe
+    belt-and-suspenders move — cross-restart cache reuse is sacrificed for
+    correctness. llama-server itself also rejects mismatched restores, but
+    we do not rely on that alone.
+    """
+    if not PROXY_SLOT_SAVE_RESTORE:
+        return
+    try:
+        os.makedirs(PROXY_SLOT_SAVE_DIR, exist_ok=True)
+        removed = 0
+        for f in os.listdir(PROXY_SLOT_SAVE_DIR):
+            if f.startswith("slot-") and f.endswith(".bin"):
+                os.remove(os.path.join(PROXY_SLOT_SAVE_DIR, f))
+                removed += 1
+        _slot_lru.clear()
+        logger.info(
+            "SLOT SAVE/RESTORE: enabled, dir=%s, cleared %d stale file(s) on startup",
+            PROXY_SLOT_SAVE_DIR, removed,
+        )
+    except OSError as exc:
+        logger.warning("SLOT SAVE/RESTORE: startup dir prep failed: %s", exc)
 async def _acquire_upstream_slot() -> bool:
     """Acquire a semaphore slot for an upstream request.
@@ -1588,6 +1768,9 @@ async def _post_with_retry(
             request=None,
         )
     try:
+        # Inside the serialized section: swap the upstream slot's KV state to
+        # this request's session if needed (no-op when disabled or unchanged).
+        await _ensure_slot_for_session(client, _current_request_session.get())
         return await _post_with_retry_inner(client, url, payload, headers)
     finally:
         _release_upstream_slot()
@@ -1715,6 +1898,7 @@ async def lifespan(app: FastAPI):
         PROXY_CONCURRENCY_LIMIT,
         PROXY_CONCURRENCY_QUEUE_TIMEOUT,
     )
+    _prepare_slot_save_dir()
     http_client = httpx.AsyncClient(
         timeout=httpx.Timeout(
             connect=10.0,  # 10s to establish connection
@@ -7155,6 +7339,12 @@ async def messages(request: Request):
     session_id = resolve_session_id(request, body)
     monitor = get_session_monitor(session_id)
     last_session_id = session_id
+    # Make the session id visible to _ensure_slot_for_session inside
+    # _post_with_retry. The /v1/chat/completions handler also reaches this
+    # path (it builds a synthetic request and calls messages()), so this
+    # single set covers both the Anthropic and OpenAI-passthrough entry
+    # points for local llama-server requests.
+    _current_request_session.set(session_id)
     profile_prompt_suffix = None
     profile_grammar = None

package/tools/agents/tests/test_anthropic_proxy_streaming.py CHANGED

@@ -5135,3 +5135,140 @@ class TestThinkingBlockExtraction(unittest.TestCase):
         # Body has the prose between the two blocks
         self.assertEqual(body, "partial answer")
         self.assertNotIn("<think>", body)
+class _SlotFakeClient:
+    """Records POST calls for slot save/restore tests."""
+    def __init__(self, status_code=200):
+        self.calls = []
+        self._status = status_code
+    async def post(self, url, json=None, timeout=None):  # noqa: A002
+        self.calls.append({"url": url, "json": json})
+        return _FakeResponse({}, status_code=self._status)
+class TestSlotSaveRestore(unittest.TestCase):
+    """Tests for the cross-session KV-cache slot save/restore feature.
+    Prevents the regression where N agentic sessions multiplexing onto
+    llama-server's single slot (--parallel 1) each evict the prior
+    session's KV cache, forcing a 60-96s full prompt reprocess (~17% of
+    requests). The proxy saves the outgoing session's slot state and
+    restores the incoming session's on a switch."""
+    def setUp(self):
+        # Snapshot + reset module state touched by these tests.
+        self._saved = {
+            k: getattr(proxy, k)
+            for k in (
+                "PROXY_SLOT_SAVE_RESTORE",
+                "PROXY_SLOT_CACHE_MAX_FILES",
+                "PROXY_SLOT_ID",
+                "_slot_owner_session",
+            )
+        }
+        self._saved_lru = list(proxy._slot_lru.items())
+        proxy._slot_owner_session = None
+        proxy._slot_lru.clear()
+    def tearDown(self):
+        for k, v in self._saved.items():
+            setattr(proxy, k, v)
+        proxy._slot_lru.clear()
+        proxy._slot_lru.update(self._saved_lru)
+    def test_slot_filename_sanitizes_session_id(self):
+        """Session ids like 'fp:abc123' / 'hdr:weird/value' must become
+        filesystem-safe filenames."""
+        self.assertEqual(
+            proxy._slot_filename("fp:5735f94edf4bccb31e1e"),
+            "slot-fp_5735f94edf4bccb31e1e.bin",
+        )
+        self.assertEqual(
+            proxy._slot_filename("hdr:weird/value with spaces"),
+            "slot-hdr_weird_value_with_spaces.bin",
+        )
+        # Already-safe characters are preserved
+        self.assertEqual(proxy._slot_filename("meta.abc-1_2"), "slot-meta.abc-1_2.bin")
+    def test_slot_endpoint_base_strips_v1_suffix(self):
+        """The /slots API lives at the server root, not under /v1."""
+        old = proxy.LLAMA_CPP_BASE
+        try:
+            proxy.LLAMA_CPP_BASE = "http://127.0.0.1:8080/v1"
+            self.assertEqual(proxy._slot_endpoint_base(), "http://127.0.0.1:8080")
+            proxy.LLAMA_CPP_BASE = "http://127.0.0.1:8080/v1/"
+            self.assertEqual(proxy._slot_endpoint_base(), "http://127.0.0.1:8080")
+        finally:
+            proxy.LLAMA_CPP_BASE = old
+    def test_ensure_slot_noop_when_disabled(self):
+        """With PROXY_SLOT_SAVE_RESTORE off, no client calls are made."""
+        proxy.PROXY_SLOT_SAVE_RESTORE = False
+        client = _SlotFakeClient()
+        asyncio.run(proxy._ensure_slot_for_session(client, "fp:abc"))
+        self.assertEqual(client.calls, [])
+        self.assertIsNone(proxy._slot_owner_session)
+    def test_ensure_slot_noop_when_session_unchanged(self):
+        """When the slot already owns the session, no save/restore fires."""
+        proxy.PROXY_SLOT_SAVE_RESTORE = True
+        proxy._slot_owner_session = "fp:same"
+        client = _SlotFakeClient()
+        asyncio.run(proxy._ensure_slot_for_session(client, "fp:same"))
+        self.assertEqual(client.calls, [])
+        self.assertEqual(proxy._slot_owner_session, "fp:same")
+    def test_ensure_slot_first_session_no_save(self):
+        """The very first session (owner is None) triggers no save — there
+        is nothing in the slot to preserve. Restore is attempted but a
+        missing file is a clean miss (no client call)."""
+        proxy.PROXY_SLOT_SAVE_RESTORE = True
+        proxy._slot_owner_session = None
+        client = _SlotFakeClient()
+        asyncio.run(proxy._ensure_slot_for_session(client, "fp:first"))
+        # No saved file for fp:first exists -> _restore_slot returns early,
+        # no save needed for a None owner -> zero client calls.
+        self.assertEqual(client.calls, [])
+        self.assertEqual(proxy._slot_owner_session, "fp:first")
+    def test_ensure_slot_switch_saves_outgoing_session(self):
+        """Switching from session A to B saves A's slot state. B's restore
+        is a clean miss (no file) so only the save POST is observed."""
+        proxy.PROXY_SLOT_SAVE_RESTORE = True
+        proxy._slot_owner_session = "fp:aaaa"
+        client = _SlotFakeClient(status_code=200)
+        asyncio.run(proxy._ensure_slot_for_session(client, "fp:bbbb"))
+        # Exactly one POST: the save of the outgoing session A.
+        self.assertEqual(len(client.calls), 1)
+        self.assertIn("action=save", client.calls[0]["url"])
+        self.assertEqual(
+            client.calls[0]["json"]["filename"], "slot-fp_aaaa.bin"
+        )
+        self.assertEqual(proxy._slot_owner_session, "fp:bbbb")
+        # Both sessions are now tracked in the LRU.
+        self.assertIn("fp:aaaa", proxy._slot_lru)
+        self.assertIn("fp:bbbb", proxy._slot_lru)
+    def test_evict_slot_files_respects_lru_cap_and_owner(self):
+        """LRU eviction removes oldest entries beyond the cap but never the
+        session currently owning the slot."""
+        proxy.PROXY_SLOT_SAVE_RESTORE = True
+        proxy.PROXY_SLOT_CACHE_MAX_FILES = 3
+        proxy._slot_owner_session = "fp:owner"
+        # Insert 5 entries oldest-first; owner inserted early so it would be
+        # an eviction candidate by age — but must be protected.
+        for i, sess in enumerate(
+            ["fp:owner", "fp:old1", "fp:old2", "fp:new1", "fp:new2"]
+        ):
+            proxy._slot_lru[sess] = float(i)
+        proxy._evict_slot_files()
+        # Cap is 3; 5 entries -> 2 evicted. Owner protected, so the 2 oldest
+        # non-owner entries (old1, old2) are evicted.
+        self.assertNotIn("fp:old1", proxy._slot_lru)
+        self.assertNotIn("fp:old2", proxy._slot_lru)
+        self.assertIn("fp:owner", proxy._slot_lru)
+        self.assertIn("fp:new1", proxy._slot_lru)
+        self.assertIn("fp:new2", proxy._slot_lru)