PyPI - atelya - Versions diffs - 0.1.19__py3-none-any.whl - Mend

atelya 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

amem/__init__.py +17 -0
amem/bench.py +238 -0
amem/cli.py +59 -0
amem/client.py +200 -0
amem/engine.py +234 -0
amem/evict_regime.py +298 -0
amem/extract.py +89 -0
amem/integrations/__init__.py +4 -0
amem/integrations/langgraph.py +188 -0
amem/kv_engine.py +362 -0
amem/kv_serve.py +465 -0
amem/mcp_server.py +127 -0
amem/memory.py +271 -0
amem/proxy.py +314 -0
amem/serve.py +320 -0
amem/serve_common.py +96 -0
amem/usage.py +142 -0
atelya-0.1.19.dist-info/METADATA +171 -0
atelya-0.1.19.dist-info/RECORD +23 -0
atelya-0.1.19.dist-info/WHEEL +5 -0
atelya-0.1.19.dist-info/entry_points.txt +2 -0
atelya-0.1.19.dist-info/licenses/LICENSE +202 -0
atelya-0.1.19.dist-info/top_level.txt +1 -0

amem/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""amem — AI-first memory OS: semantic recall that returns reusable KV.
+Four front doors on ONE lifecycle memory store (~/amem_data):
+  - kv-serve : self-host KV-native moat (CacheBlend position-independent reuse
+               + KvPolicy value-model eviction). Hardened: API-key auth,
+               input validation, durable startup, per-user concurrency,
+               $-saved observability. -> the full KV moat, production-shaped.
+  - serve    : self-host APC serve (local open model + KV residency).
+  - proxy    : closed-LLM drop-in (Claude/GPT) + cache orchestration -> on-ramp.
+  - mcp      : MCP server for Claude Desktop / Cursor / Claude Code (stdio).
+"""
+__version__ = "0.1.19"
+from .memory import MemoryStore, Session  # noqa: F401
+from .client import Amem, AmemError  # noqa: F401  (stdlib-only SDK; cheap to import)
+__all__ = ["MemoryStore", "Session", "Amem", "AmemError", "__version__"]

amem/bench.py ADDED Viewed

@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+# =====================================================================
+# amem.bench  —  reproducible cost-curve benchmark (the adoption proof).
+#
+# Runs the REAL product path: a big memory -> ONE session -> many DIFFERENT queries via
+# amem's session KV-residency (amem.memory) + vLLM APC (amem.engine.MemoryEngine). It
+# measures the headline market-redesign claim (§0): the cost of memory is the CURVE SHAPE.
+#
+#   amem (KV-residency): the generous full-fidelity working set is kept RESIDENT as reused
+#     KV; each new query only prefills the NEWLY-APPENDED tail (the resident prefix is an
+#     APC hit). -> per-query new prefill plateaus -> cumulative cost is ~FLAT.
+#   text-layer baseline (Mem0/Letta/LangChain): the working set is TEXT re-fed and
+#     RE-PREFILLED every query. -> cumulative cost is LINEAR in #queries.
+#
+# The advantage is the RATIO baseline/amem, which GROWS with the number of queries M.
+# Run it on YOUR model and YOUR memory; it prints the numbers and writes a JSON (and an
+# optional self-contained HTML chart) from the MEASURED values on your stack.
+#
+# USAGE (needs the self-host extra: pip install 'amem[selfhost]')
+#   amem bench                                   # uses a LongMemEval slice (default)
+#   amem bench --memory-file mem.txt --queries-file q.txt   # YOUR data (one item per line)
+#   amem bench --model <hf-model> --items 8 --queries 24 --out bench.json --chart bench.html
+#
+# Cost is exact token bookkeeping (the economic claim). Wall TTFT is reported too as an
+# engine-level corroboration of reuse, but it is box-dependent (secondary).
+# =====================================================================
+from __future__ import annotations
+import argparse
+import json
+import statistics
+from pathlib import Path
+from typing import List, Tuple
+from .engine import MemoryEngine, load_memory
+from .memory import MemoryStore, Session
+def _build_corpus(args, ntok) -> Tuple[List[str], List[str]]:
+    """Return (memory_items, queries). From user files if given, else a LongMemEval slice."""
+    if args.memory_file:
+        mem = [ln.strip() for ln in Path(args.memory_file).read_text().splitlines() if ln.strip()]
+        if not args.queries_file:
+            raise SystemExit("--memory-file requires --queries-file (one query per line)")
+        qs = [ln.strip() for ln in Path(args.queries_file).read_text().splitlines() if ln.strip()]
+        if not mem or not qs:
+            raise SystemExit("empty memory-file or queries-file")
+        return mem, qs
+    data = json.loads(Path(args.lme).read_text())
+    M = max(1, min(args.items, len(data)))
+    idxs = [int(round(i * (len(data) - 1) / max(M - 1, 1))) for i in range(M)]
+    per_item, qs = [], []
+    for ix in idxs:
+        item = data[ix]
+        turns, _, _ = load_memory(item)
+        if turns:
+            per_item.append(turns)
+        q = (item.get("question") or "").strip()
+        if q:
+            qs.append(q)
+    # round-robin across items up to the cap so every item (and its query) is represented,
+    # and bge indexing stays bounded even though a single LME memory can be huge.
+    mem, cap, pos = [], args.max_memory_turns, 0
+    while len(mem) < cap and any(pos < len(t) for t in per_item):
+        for t in per_item:
+            if pos < len(t) and len(mem) < cap:
+                mem.append(t[pos])
+        pos += 1
+    if args.queries and args.queries < len(qs):
+        qs = qs[:args.queries]
+    return mem, qs
+def run(args) -> dict:
+    print(f"[bench] loading engine: {args.model}  (first start compiles; silence is normal) ...")
+    eng = MemoryEngine(args.model, args.max_model_len, args.gpu_mem_frac, args.eager, args.bge_model)
+    def ntok(s: str) -> int:
+        return len(eng.tok.encode(s, add_special_tokens=False)) if s else 0
+    mem, queries = _build_corpus(args, ntok)
+    mem_tokens = sum(ntok(t) for t in mem)
+    print(f"[bench] memory: {len(mem)} items / {mem_tokens} tokens  |  queries: {len(queries)}  "
+          f"|  budget: {args.budget_tokens} tok/working-set")
+    import tempfile, os
+    store_path = os.path.join(tempfile.mkdtemp(), "bench.jsonl")
+    store = MemoryStore("bench", store_path, eng.bge.embed, ntok, embed_model=args.bge_model)
+    print("[bench] indexing memory (bge) ...")
+    store.add(mem, dedup=False)                       # full-fidelity memory; no dedup for a clean corpus
+    sess = Session("bench", "bench", args.budget_tokens, ntok)
+    eng.ask("warmup", pool="hello", max_tokens=1)     # warm the engine before timing
+    rows = []
+    print("\n  q  | new_prefill | resident | reuse% | TTFT(ms) | amem_in | base_in")
+    print("  ---+-------------+----------+--------+----------+---------+--------")
+    for i, q in enumerate(queries, 1):
+        pool, reused, appended, n_live = sess.grow(store, q, args.max_turns)
+        qtok = ntok(q)
+        _, secs = eng.ask(q, pool=(pool if pool else " "), max_tokens=1)   # TTFT-ish (prefill cost)
+        resident = sess.resident_tokens
+        amem_in = appended + qtok                     # amem prefills only the new tail + query (APC reuses prefix)
+        base_in = resident + qtok                     # text-layer re-prefills the WHOLE working set every query
+        reuse_frac = reused / (reused + appended) if (reused + appended) else 1.0
+        rows.append({"q": i, "new_prefill_tokens": appended, "resident_tokens": resident,
+                     "reuse_fraction": round(reuse_frac, 4), "ttft_ms": round(secs * 1000, 1),
+                     "amem_input_tokens": amem_in, "base_input_tokens": base_in,
+                     "query_tokens": qtok, "n_turns_resident": n_live})
+        print(f"  {i:2} | {appended:11} | {resident:8} | {reuse_frac*100:5.0f}% | {secs*1000:8.1f} | "
+              f"{amem_in:7} | {base_in:7}")
+    # ---- cumulative curves + ratios (the headline) ----
+    cum_amem, cum_base, a, b = [], [], 0, 0
+    for r in rows:
+        a += r["amem_input_tokens"]; b += r["base_input_tokens"]
+        cum_amem.append(a); cum_base.append(b)
+    N = len(rows)
+    def ratio_at(k):
+        k = min(max(k, 1), N) - 1
+        return round(cum_base[k] / cum_amem[k], 2) if cum_amem[k] else None
+    milestones = sorted(set([1, max(1, N // 4), max(1, N // 2), N]))
+    ratios = {f"M={m}": ratio_at(m) for m in milestones}
+    ttfts = [r["ttft_ms"] for r in rows]
+    cold = ttfts[0] if ttfts else None
+    warm = statistics.mean(ttfts[1:]) if len(ttfts) > 1 else None
+    reuse_final = rows[-1]["reuse_fraction"] if rows else 0.0
+    hit_budget = any(r["resident_tokens"] >= args.budget_tokens for r in rows)
+    result = {
+        "model": args.model, "memory_items": len(mem), "memory_tokens": mem_tokens,
+        "n_queries": N, "budget_tokens": args.budget_tokens, "max_turns": args.max_turns,
+        "rows": rows,
+        "cumulative_amem_input_tokens": cum_amem,
+        "cumulative_baseline_input_tokens": cum_base,
+        "ratios_baseline_over_amem": ratios,
+        "reuse_fraction_final": reuse_final,
+        "ttft_cold_ms": cold, "ttft_warm_mean_ms": warm,
+        "resident_hit_budget": hit_budget,
+    }
+    print("\n" + "=" * 68)
+    print("RESULT — amem cost-curve benchmark (KV-residency vs text-layer re-prefill)")
+    print("=" * 68)
+    print(f"memory                 : {len(mem)} items / {mem_tokens} tokens")
+    print(f"queries (one session)  : {N}")
+    print(f"cumulative prefill     : amem {cum_amem[-1]:,} tok   vs   text-layer {cum_base[-1]:,} tok")
+    print("advantage (baseline/amem): " + "  ".join(f"{k} {v}x" for k, v in ratios.items()) + "   (grows with M)")
+    print(f"reuse fraction (final) : {reuse_final*100:.0f}%   (-> 1.0 = every query served from resident KV)")
+    if cold and warm:
+        print(f"engine TTFT (corrob.)  : cold {cold:.0f}ms -> warm-mean {warm:.0f}ms  "
+              f"({cold/warm:.1f}x)   [box-dependent, secondary]")
+    if hit_budget:
+        print("note: working set hit the budget cap -> later queries reuse the resident KV (flat); "
+              "content beyond budget needs eviction policy (amem-kv kvpolicy, plan §4.3c).")
+    print("=" * 68)
+    print("text-layer RE-PREFILLS the working set every query (linear $). amem keeps it RESIDENT")
+    print("as reused KV: full-fidelity answers (plan §7.6) at a flat, reuse-amortized cost (§7.7/§7.12).")
+    if args.out:
+        Path(args.out).write_text(json.dumps(result, indent=2))
+        print(f"\n[saved] {args.out}")
+    if args.chart:
+        _render_chart(result, args.chart)
+        print(f"[chart] {args.chart}  (self-contained; open in a browser)")
+    return result
+def _render_chart(d: dict, path: str):
+    """Self-contained HTML: amem (flat) vs text-layer (linear) cumulative prefill, from MEASURED data."""
+    amem = d["cumulative_amem_input_tokens"]; base = d["cumulative_baseline_input_tokens"]
+    n = len(amem); top = max(base[-1], 1)
+    W, H, P = 720, 420, 56
+    def X(i): return P + (W - 2 * P) * (i / max(n - 1, 1))
+    def Y(v): return H - P - (H - 2 * P) * (v / top)
+    amem_pts = " ".join(f"{X(i):.1f},{Y(v):.1f}" for i, v in enumerate(amem))
+    base_pts = " ".join(f"{X(i):.1f},{Y(v):.1f}" for i, v in enumerate(base))
+    ratios = d["ratios_baseline_over_amem"]
+    headline = list(ratios.items())[-1]
+    yticks = "".join(
+        f'<line x1="{P}" y1="{Y(top*f):.1f}" x2="{W-P}" y2="{Y(top*f):.1f}" stroke="#1f2937"/>'
+        f'<text x="{P-8}" y="{Y(top*f)+4:.1f}" fill="#6b7280" font-size="11" text-anchor="end">{int(top*f/1000)}k</text>'
+        for f in (0, .25, .5, .75, 1))
+    html = f"""<!doctype html><html><head><meta charset="utf-8"><title>amem cost curve</title>
+<style>body{{margin:0;background:#0b0f17;color:#e5e7eb;font-family:ui-monospace,Menlo,Consolas,monospace}}
+.wrap{{max-width:760px;margin:32px auto;padding:0 16px}}h1{{font-size:18px;font-weight:600;margin:0 0 2px}}
+.sub{{color:#9ca3af;font-size:12px;margin-bottom:16px}}.lg{{display:flex;gap:18px;font-size:12px;margin-top:10px}}
+.dot{{display:inline-block;width:10px;height:10px;border-radius:2px;margin-right:6px;vertical-align:middle}}
+.big{{font-size:13px;color:#cbd5e1;margin-top:12px}}.big b{{color:#34d399}}</style></head>
+<body><div class="wrap">
+<h1>Cost of memory = the curve shape</h1>
+<div class="sub">{d['model']} · memory {d['memory_tokens']:,} tok · {d['n_queries']} queries · measured on your stack</div>
+<svg viewBox="0 0 {W} {H}" width="100%">
+{yticks}
+<line x1="{P}" y1="{H-P}" x2="{W-P}" y2="{H-P}" stroke="#374151"/>
+<polyline points="{base_pts}" fill="none" stroke="#f87171" stroke-width="2.5"/>
+<polyline points="{amem_pts}" fill="none" stroke="#34d399" stroke-width="2.5"/>
+<text x="{W-P}" y="{Y(base[-1])-8:.1f}" fill="#f87171" font-size="12" text-anchor="end">text-layer (re-prefill)</text>
+<text x="{W-P}" y="{Y(amem[-1])-8:.1f}" fill="#34d399" font-size="12" text-anchor="end">amem (KV-resident)</text>
+<text x="{W/2}" y="{H-16}" fill="#6b7280" font-size="11" text-anchor="middle">queries in one session →</text>
+</svg>
+<div class="lg"><span><span class="dot" style="background:#34d399"></span>amem cumulative prefill</span>
+<span><span class="dot" style="background:#f87171"></span>text-layer cumulative prefill</span></div>
+<div class="big">At <b>{headline[0]}</b>, amem prefills <b>{headline[1]}×</b> fewer input tokens — and the gap grows with every query.
+reuse fraction → <b>{d['reuse_fraction_final']*100:.0f}%</b>.</div>
+</div></body></html>"""
+    Path(path).write_text(html)
+def main(argv=None):
+    ap = argparse.ArgumentParser(prog="amem bench",
+        description="Reproducible cost-curve benchmark: KV-residency vs text-layer re-prefill, on your stack.")
+    ap.add_argument("--model", default="Qwen/Qwen2.5-7B-Instruct-AWQ")
+    ap.add_argument("--memory-file", help="text file, one memory item per line (your data)")
+    ap.add_argument("--queries-file", help="text file, one query per line (required with --memory-file)")
+    ap.add_argument("--lme", default=str(Path.home() / "lme_data" / "longmemeval_s_cleaned.json"),
+                    help="LongMemEval json used to build a memory if --memory-file is not given")
+    ap.add_argument("--items", type=int, default=8, help="LME items to union into the memory")
+    ap.add_argument("--max-memory-turns", type=int, default=1500,
+                    help="cap total memory turns (round-robin across items) so indexing stays bounded")
+    ap.add_argument("--queries", type=int, default=0, help="cap number of queries (0 = all)")
+    ap.add_argument("--budget-tokens", type=int, default=6000)
+    ap.add_argument("--max-turns", type=int, default=40)
+    ap.add_argument("--max-model-len", type=int, default=8192)
+    ap.add_argument("--gpu-mem-frac", type=float, default=0.85)
+    ap.add_argument("--bge-model", default="BAAI/bge-small-en-v1.5")
+    ap.add_argument("--eager", action="store_true")
+    ap.add_argument("--out", default=str(Path.home() / "amem_bench.json"))
+    ap.add_argument("--chart", default=None, help="optional path to write a self-contained HTML chart")
+    args = ap.parse_args(argv)
+    run(args)
+    return 0
+if __name__ == "__main__":
+    main()

amem/cli.py ADDED Viewed

@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""amem CLI — one entry point for every amem command (main front doors below; `amem --help` lists all).
+  amem proxy   closed-LLM drop-in (Claude/GPT) + cache orchestration  (light, no GPU)
+  amem mcp     MCP server for Claude Desktop / Cursor / Claude Code    (stdio)
+  amem serve   self-host serve (local open model + KV residency)       [pip install amem[selfhost]]
+Subcommand options are passed straight through (e.g. `amem proxy --port 8100 --help`).
+"""
+import importlib.util
+import sys
+from . import __version__
+USAGE = (
+    f"amem {__version__} — AI-first memory OS (recall that returns reusable KV)\n\n"
+    "usage: amem <command> [options]\n\n"
+    "commands:\n"
+    "  proxy   closed-LLM drop-in (Claude/GPT) + cache orchestration  (light, no GPU)\n"
+    "  mcp     MCP server for Claude Desktop / Cursor / Claude Code    (stdio)\n"
+    "  serve   self-host serve (local open model + KV residency)       [needs: pip install amem[selfhost]]\n"
+    "  kv-serve KV-native serve: CacheBlend reuse + KvPolicy eviction (moat) [needs: amem[selfhost]]\n"
+    "  bench   reproducible cost-curve benchmark (KV-residency vs re-prefill) [needs: amem[selfhost]]\n"
+    "  version print version\n\n"
+    "run `amem <command> --help` for that command's options.\n"
+)
+def main(argv=None):
+    argv = list(sys.argv[1:] if argv is None else argv)
+    if not argv or argv[0] in ("-h", "--help", "help"):
+        print(USAGE); return 0
+    cmd, rest = argv[0], argv[1:]
+    if cmd in ("version", "--version", "-V"):
+        print(f"amem {__version__}"); return 0
+    # subcommands re-parse argv themselves -> hand them their own argv
+    sys.argv = [f"amem-{cmd}"] + rest
+    if cmd == "proxy":
+        from . import proxy; return proxy.main()
+    if cmd == "mcp":
+        from . import mcp_server; return mcp_server.main()
+    if cmd == "serve":
+        if importlib.util.find_spec("vllm") is None:
+            sys.exit("`amem serve` needs the self-host extra:  pip install 'amem[selfhost]'")
+        from . import serve; return serve.main()
+    if cmd in ("kv-serve", "kvserve"):
+        if importlib.util.find_spec("vllm") is None:
+            sys.exit("`amem kv-serve` needs the self-host extra:  pip install 'amem[selfhost]'")
+        from . import kv_serve; return kv_serve.main()
+    if cmd == "bench":
+        if importlib.util.find_spec("vllm") is None:
+            sys.exit("`amem bench` needs the self-host extra:  pip install 'amem[selfhost]'")
+        from . import bench; return bench.main()
+    sys.exit(f"unknown command: {cmd}\n\n{USAGE}")
+if __name__ == "__main__":
+    main()

amem/client.py ADDED Viewed

@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+# =====================================================================
+# amem/client.py  —  tiny Python SDK for amem serve v1 (session residency).
+#
+#   from amem import Amem
+#   mem = Amem("http://localhost:8000")
+#   mem.remember("alice", ["alice lives in Seattle and works at Boeing", ...])
+#   sid = mem.start_session("alice")                       # one resident set for the chat
+#   print(mem.ask("alice", "Where does Alice's sister live?", session=sid))
+#   print(mem.ask("alice", "What company does she work for?", session=sid))  # DIFFERENT q -> reuses KV
+#
+# DROP-IN (the on-ramp): the standard OpenAI SDK works unchanged -- point base_url at
+# amem and pass `user` as the memory namespace. Residency defaults ON (per-user session):
+#   from openai import OpenAI
+#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
+#   r = client.chat.completions.create(model="amem", user="alice",
+#         messages=[{"role":"user","content":"Where does Alice's sister live?"}])
+#   print(r.choices[0].message.content)
+#
+# SECURED server (started with --api-key):
+#   mem = Amem("http://localhost:8001", api_key="<key>")          # serve/kv-serve: Bearer
+#   mem = Amem.for_proxy("http://localhost:8100", api_key="<key>")  # proxy: X-Amem-Key
+# =====================================================================
+import json
+from typing import List, Optional, Union
+from urllib import request as _rq
+from urllib.error import HTTPError, URLError
+class AmemError(RuntimeError):
+    """Raised on a non-2xx response (carries .status and the server's .detail)."""
+    def __init__(self, status: int, detail: str):
+        self.status = status
+        self.detail = detail
+        super().__init__(f"amem {status}: {detail}")
+class Amem:
+    def __init__(self, base_url: str = "http://localhost:8000", api_key: Optional[str] = None,
+                 auth_header: str = "authorization", auth_scheme: Optional[str] = "Bearer",
+                 timeout: float = 600.0):
+        """SDK for amem serve / kv-serve.
+        If the server was started with --api-key, pass `api_key` here. Defaults to the
+        self-host scheme `Authorization: Bearer <key>`. For the closed `proxy`, use
+        `Amem.for_proxy(base_url, api_key=...)` (it sends `X-Amem-Key`, leaving
+        Authorization free for your upstream provider key).
+        """
+        self.base = base_url.rstrip("/")
+        self.timeout = timeout
+        self._headers = {"content-type": "application/json"}
+        if api_key:
+            try:
+                str(api_key).encode("latin-1")          # HTTP headers are latin-1; a non-ASCII key (e.g. a pasted placeholder) must fail clearly, not crash deep in urllib
+            except UnicodeEncodeError as e:
+                raise AmemError(0, "api_key contains non-ASCII characters; API keys must be ASCII (did you paste a placeholder instead of the real key?)") from e
+            self._headers[auth_header.lower()] = f"{auth_scheme} {api_key}" if auth_scheme else str(api_key)
+    @classmethod
+    def for_proxy(cls, base_url: str = "http://localhost:8100", api_key: Optional[str] = None,
+                  timeout: float = 600.0) -> "Amem":
+        """Client for the closed proxy: amem-auth rides on X-Amem-Key (raw), so the
+        Authorization header stays reserved for the upstream provider key."""
+        return cls(base_url, api_key=api_key, auth_header="x-amem-key", auth_scheme=None, timeout=timeout)
+    def _req(self, path: str, method: str, payload: Optional[dict] = None) -> dict:
+        data = json.dumps(payload).encode() if payload is not None else None
+        req = _rq.Request(self.base + path, data=data, headers=self._headers, method=method)
+        try:
+            with _rq.urlopen(req, timeout=self.timeout) as r:
+                body = r.read().decode()
+                return json.loads(body) if body else {}
+        except HTTPError as e:                       # 401/400/404/500 -> clean AmemError, no raw stack
+            raw = e.read().decode(errors="replace")
+            try:
+                raw = json.loads(raw).get("detail", raw)
+            except Exception:
+                pass
+            raise AmemError(e.code, raw) from None
+        except URLError as e:
+            raise AmemError(0, f"cannot reach {self.base}: {e.reason}") from None
+    def _post(self, path: str, payload: dict) -> dict:
+        return self._req(path, "POST", payload)
+    def _get(self, path: str) -> dict:
+        return self._req(path, "GET")
+    # ---- core API ----
+    def health(self) -> dict:
+        return self._get("/health")
+    def stats(self) -> dict:
+        return self._get("/v1/stats")
+    def remember(self, user: str, texts: Union[str, List[str]], keys: Optional[List[str]] = None) -> dict:
+        if isinstance(texts, str):
+            texts = [texts]
+        payload = {"user": user, "texts": texts}
+        if keys is not None:
+            payload["keys"] = [keys] if isinstance(keys, str) else keys
+        return self._post("/v1/memories", payload)
+    def list_memories(self, user: str) -> list:
+        return self._get(f"/v1/memories/{user}").get("memories", [])
+    def forget(self, user: str, memory_id: str) -> dict:
+        return self._req(f"/v1/memories/{user}/{memory_id}", "DELETE")
+    def update(self, user: str, memory_id: str, text: str) -> dict:
+        return self._req(f"/v1/memories/{user}/{memory_id}", "PATCH", {"text": text})
+    def users(self) -> list:
+        """kv-serve only: list tenants with active-memory counts."""
+        return self._get("/v1/users").get("users", [])
+    def extract(self, user: str, text: str) -> dict:
+        """kv-serve only: extract atomic facts from raw text with the local model and
+        store them (the §7.8 front-end). Returns {facts, added, active}."""
+        return self._post("/v1/extract", {"user": user, "text": text})
+    def usage(self, user: str = None, since=None, until=None) -> dict:
+        """kv-serve only: durable PER-TENANT usage over [since, until] (the billing source of
+        truth; epoch or ISO-8601 for since/until). Returns {per_user, totals, instance, window}.
+        Bill on per_user[u]['prefill_tokens_saved'] (value-aligned, §2.5-G) or memory_ops.
+        For an invoice-ready CSV, GET /v1/usage?format=csv directly."""
+        from urllib.parse import urlencode
+        q = {k: v for k, v in (("user", user), ("since", since), ("until", until)) if v is not None}
+        return self._get("/v1/usage" + ("?" + urlencode(q) if q else ""))
+    def start_session(self, user: str, seed_query: str = None, budget_tokens: int = None) -> str:
+        payload = {"user": user}
+        if seed_query is not None:
+            payload["seed_query"] = seed_query
+        if budget_tokens is not None:
+            payload["budget_tokens"] = budget_tokens
+        return self._post("/v1/sessions", payload)["session"]
+    def session_state(self, sid: str) -> dict:
+        return self._get(f"/v1/sessions/{sid}")
+    def end_session(self, sid: str) -> dict:
+        return self._req(f"/v1/sessions/{sid}", "DELETE")
+    def recall(self, user: str, query: str, session: str = None, budget_tokens: int = None) -> dict:
+        payload = {"user": user, "query": query}
+        if session is not None:
+            payload["session"] = session
+        if budget_tokens is not None:
+            payload["budget_tokens"] = budget_tokens
+        return self._post("/v1/recall", payload)
+    def ask_full(self, user: str, query: str, session: str = None,
+                 residency: bool = True, max_tokens: int = 300) -> dict:
+        payload = {"model": "amem", "user": user, "max_tokens": max_tokens,
+                   "residency": residency, "messages": [{"role": "user", "content": query}]}
+        if session is not None:
+            payload["session"] = session
+        return self._post("/v1/chat/completions", payload)
+    def ask(self, user: str, query: str, session: str = None,
+            residency: bool = True, max_tokens: int = 300) -> str:
+        return self.ask_full(user, query, session, residency, max_tokens)["choices"][0]["message"]["content"]
+if __name__ == "__main__":
+    # Demo / smoke run against a live server: argv[1] overrides the default base URL.
+    # It stores five facts for "alice", asks five different questions in one session,
+    # then prints a recall result and the server stats.
+    import sys
+    base = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8000"
+    mem = Amem(base)
+    print("health:", json.dumps(mem.health(), indent=2))
+    print("\nremember:", mem.remember("alice", [
+        "alice lives in Seattle and works at Boeing",
+        "alice's sister Maria lives in Denver",
+        "alice's launch budget was approved at 4417 thousand dollars",
+        "alice is allergic to peanuts and prefers window seats",
+        "alice's manager is named Tom and the project ships in March",
+    ]))
+    # THE POINT: one session, several DIFFERENT questions -> the resident working set
+    # grows append-only, then reuse fires (memory_tokens_reused rises, prefilled_now -> small).
+    sid = mem.start_session("alice")
+    print(f"\nsession {sid} — asking DIFFERENT questions, watch reuse grow:\n")
+    for q in ["Where does Alice's sister live?",
+              "What company does Alice work for?",
+              "How big was the approved launch budget?",
+              "What is Alice allergic to?",
+              "Who is Alice's manager?"]:
+        r = mem.ask_full("alice", q, session=sid)
+        a = r["amem"]          # the reuse/latency counters printed below are read from the response's "amem" object
+        print(f"  Q: {q}")
+        print(f"     -> {r['choices'][0]['message']['content']}")
+        print(f"     resident={a['resident_tokens']}tok  reused={a['memory_tokens_reused']}tok  "
+              f"prefilled_now={a['memory_tokens_prefilled_now']}tok  kv_reused={a['kv_reused']}  "
+              f"{a['latency_s']}s")
+    # the recall JSON is truncated to its first 400 characters to keep the demo output short
+    print("\nrecall (inverted, same session):",
+          json.dumps(mem.recall("alice", "Tell me about Alice's travel preferences", session=sid), indent=2)[:400])
+    print("\nstats:", json.dumps(mem.stats(), indent=2))