atelya 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
amem/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """amem — AI-first memory OS: semantic recall that returns reusable KV.
2
+
3
+ Four front doors on ONE lifecycle memory store (~/amem_data):
4
+ - kv-serve : self-host KV-native moat (CacheBlend position-independent reuse
5
+ + KvPolicy value-model eviction). Hardened: API-key auth,
6
+ input validation, durable startup, per-user concurrency,
7
+ $-saved observability. -> the full KV moat, production-shaped.
8
+ - serve : self-host APC serve (local open model + KV residency).
9
+ - proxy : closed-LLM drop-in (Claude/GPT) + cache orchestration -> on-ramp.
10
+ - mcp : MCP server for Claude Desktop / Cursor / Claude Code (stdio).
11
+ """
12
+ __version__ = "0.1.19"
13
+
14
+ from .memory import MemoryStore, Session # noqa: F401
15
+ from .client import Amem, AmemError # noqa: F401 (stdlib-only SDK; cheap to import)
16
+
17
+ __all__ = ["MemoryStore", "Session", "Amem", "AmemError", "__version__"]
amem/bench.py ADDED
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env python3
2
+ # =====================================================================
3
+ # amem.bench — reproducible cost-curve benchmark (the adoption proof).
4
+ #
5
+ # Runs the REAL product path: a big memory -> ONE session -> many DIFFERENT queries via
6
+ # amem's session KV-residency (amem.memory) + vLLM APC (amem.engine.MemoryEngine). It
7
+ # measures the headline market-redesign claim (§0): the cost of memory is the CURVE SHAPE.
8
+ #
9
+ # amem (KV-residency): the generous full-fidelity working set is kept RESIDENT as reused
10
+ # KV; each new query only prefills the NEWLY-APPENDED tail (the resident prefix is an
11
+ # APC hit). -> per-query new prefill plateaus -> cumulative cost is ~FLAT.
12
+ # text-layer baseline (Mem0/Letta/LangChain): the working set is TEXT re-fed and
13
+ # RE-PREFILLED every query. -> cumulative cost is LINEAR in #queries.
14
+ #
15
+ # The advantage is the RATIO baseline/amem, which GROWS with the number of queries M.
16
+ # Run it on YOUR model and YOUR memory; it prints the numbers and writes a JSON (and an
17
+ # optional self-contained HTML chart) from the MEASURED values on your stack.
18
+ #
19
+ # USAGE (needs the self-host extra: pip install 'amem[selfhost]')
20
+ # amem bench # uses a LongMemEval slice (default)
21
+ # amem bench --memory-file mem.txt --queries-file q.txt # YOUR data (one item per line)
22
+ # amem bench --model <hf-model> --items 8 --queries 24 --out bench.json --chart bench.html
23
+ #
24
+ # Cost is exact token bookkeeping (the economic claim). Wall TTFT is reported too as an
25
+ # engine-level corroboration of reuse, but it is box-dependent (secondary).
26
+ # =====================================================================
27
+
28
+ from __future__ import annotations
29
+
30
+ import argparse
31
+ import json
32
+ import statistics
33
+ from pathlib import Path
34
+ from typing import List, Tuple
35
+
36
+ from .engine import MemoryEngine, load_memory
37
+ from .memory import MemoryStore, Session
38
+
39
+
40
+ def _build_corpus(args, ntok) -> Tuple[List[str], List[str]]:
41
+ """Return (memory_items, queries). From user files if given, else a LongMemEval slice."""
42
+ if args.memory_file:
43
+ mem = [ln.strip() for ln in Path(args.memory_file).read_text().splitlines() if ln.strip()]
44
+ if not args.queries_file:
45
+ raise SystemExit("--memory-file requires --queries-file (one query per line)")
46
+ qs = [ln.strip() for ln in Path(args.queries_file).read_text().splitlines() if ln.strip()]
47
+ if not mem or not qs:
48
+ raise SystemExit("empty memory-file or queries-file")
49
+ return mem, qs
50
+
51
+ data = json.loads(Path(args.lme).read_text())
52
+ M = max(1, min(args.items, len(data)))
53
+ idxs = [int(round(i * (len(data) - 1) / max(M - 1, 1))) for i in range(M)]
54
+ per_item, qs = [], []
55
+ for ix in idxs:
56
+ item = data[ix]
57
+ turns, _, _ = load_memory(item)
58
+ if turns:
59
+ per_item.append(turns)
60
+ q = (item.get("question") or "").strip()
61
+ if q:
62
+ qs.append(q)
63
+ # round-robin across items up to the cap so every item (and its query) is represented,
64
+ # and bge indexing stays bounded even though a single LME memory can be huge.
65
+ mem, cap, pos = [], args.max_memory_turns, 0
66
+ while len(mem) < cap and any(pos < len(t) for t in per_item):
67
+ for t in per_item:
68
+ if pos < len(t) and len(mem) < cap:
69
+ mem.append(t[pos])
70
+ pos += 1
71
+ if args.queries and args.queries < len(qs):
72
+ qs = qs[:args.queries]
73
+ return mem, qs
74
+
75
+
76
def run(args) -> dict:
    """Run the cost-curve benchmark and return the result dict.

    Steps: load the engine, build the corpus, index the memory into a throwaway store, then
    issue every query through ONE session. For each query it records the newly appended
    (prefilled) tokens, the resident working-set tokens, the reuse fraction and the
    wall-clock time of a 1-token generation. From those rows it derives cumulative input
    tokens for amem (appended + query) and for the text-layer baseline (resident + query),
    plus the baseline/amem ratio at a few milestones.

    Side effects: prints progress and a summary; writes ``args.out`` (JSON) and
    ``args.chart`` (HTML) when they are set.

    Raises:
        SystemExit: when the corpus yields no queries (nothing can be measured).
    """
    import os
    import shutil
    import tempfile

    print(f"[bench] loading engine: {args.model} (first start compiles; silence is normal) ...")
    eng = MemoryEngine(args.model, args.max_model_len, args.gpu_mem_frac, args.eager, args.bge_model)

    def ntok(s: str) -> int:
        return len(eng.tok.encode(s, add_special_tokens=False)) if s else 0

    mem, queries = _build_corpus(args, ntok)
    if not queries:
        # The cumulative curves and ratios below index the last row; with zero rows that
        # would surface as an opaque IndexError.
        raise SystemExit("[bench] no queries to run (the corpus produced zero questions)")
    mem_tokens = sum(ntok(t) for t in mem)
    print(f"[bench] memory: {len(mem)} items / {mem_tokens} tokens | queries: {len(queries)} "
          f"| budget: {args.budget_tokens} tok/working-set")

    rows = []
    tmp_dir = tempfile.mkdtemp()
    try:
        store_path = os.path.join(tmp_dir, "bench.jsonl")
        store = MemoryStore("bench", store_path, eng.bge.embed, ntok, embed_model=args.bge_model)
        print("[bench] indexing memory (bge) ...")
        store.add(mem, dedup=False)  # full-fidelity memory; no dedup for a clean corpus

        sess = Session("bench", "bench", args.budget_tokens, ntok)
        eng.ask("warmup", pool="hello", max_tokens=1)  # warm the engine before timing

        print("\n q | new_prefill | resident | reuse% | TTFT(ms) | amem_in | base_in")
        print(" ---+-------------+----------+--------+----------+---------+--------")
        for i, q in enumerate(queries, 1):
            pool, reused, appended, n_live = sess.grow(store, q, args.max_turns)
            qtok = ntok(q)
            _, secs = eng.ask(q, pool=(pool if pool else " "), max_tokens=1)  # TTFT-ish (prefill cost)
            resident = sess.resident_tokens
            amem_in = appended + qtok  # amem prefills only the new tail + query (APC reuses prefix)
            base_in = resident + qtok  # text-layer re-prefills the WHOLE working set every query
            reuse_frac = reused / (reused + appended) if (reused + appended) else 1.0
            rows.append({"q": i, "new_prefill_tokens": appended, "resident_tokens": resident,
                         "reuse_fraction": round(reuse_frac, 4), "ttft_ms": round(secs * 1000, 1),
                         "amem_input_tokens": amem_in, "base_input_tokens": base_in,
                         "query_tokens": qtok, "n_turns_resident": n_live})
            print(f" {i:2} | {appended:11} | {resident:8} | {reuse_frac*100:5.0f}% | {secs*1000:8.1f} | "
                  f"{amem_in:7} | {base_in:7}")
    finally:
        # The store file is scratch space for this run only; do not leave it behind.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # ---- cumulative curves + ratios (the headline) ----
    cum_amem, cum_base, a, b = [], [], 0, 0
    for r in rows:
        a += r["amem_input_tokens"]
        b += r["base_input_tokens"]
        cum_amem.append(a)
        cum_base.append(b)
    N = len(rows)

    def ratio_at(k):
        # Clamp the 1-based milestone into [1, N], then convert to a 0-based index.
        k = min(max(k, 1), N) - 1
        return round(cum_base[k] / cum_amem[k], 2) if cum_amem[k] else None

    milestones = sorted(set([1, max(1, N // 4), max(1, N // 2), N]))
    ratios = {f"M={m}": ratio_at(m) for m in milestones}
    ttfts = [r["ttft_ms"] for r in rows]
    cold = ttfts[0] if ttfts else None
    warm = statistics.mean(ttfts[1:]) if len(ttfts) > 1 else None
    reuse_final = rows[-1]["reuse_fraction"] if rows else 0.0
    hit_budget = any(r["resident_tokens"] >= args.budget_tokens for r in rows)

    result = {
        "model": args.model, "memory_items": len(mem), "memory_tokens": mem_tokens,
        "n_queries": N, "budget_tokens": args.budget_tokens, "max_turns": args.max_turns,
        "rows": rows,
        "cumulative_amem_input_tokens": cum_amem,
        "cumulative_baseline_input_tokens": cum_base,
        "ratios_baseline_over_amem": ratios,
        "reuse_fraction_final": reuse_final,
        "ttft_cold_ms": cold, "ttft_warm_mean_ms": warm,
        "resident_hit_budget": hit_budget,
    }

    print("\n" + "=" * 68)
    print("RESULT — amem cost-curve benchmark (KV-residency vs text-layer re-prefill)")
    print("=" * 68)
    print(f"memory : {len(mem)} items / {mem_tokens} tokens")
    print(f"queries (one session) : {N}")
    print(f"cumulative prefill : amem {cum_amem[-1]:,} tok vs text-layer {cum_base[-1]:,} tok")
    print("advantage (baseline/amem): " + " ".join(f"{k} {v}x" for k, v in ratios.items()) + " (grows with M)")
    print(f"reuse fraction (final) : {reuse_final*100:.0f}% (-> 1.0 = every query served from resident KV)")
    if cold and warm:
        print(f"engine TTFT (corrob.) : cold {cold:.0f}ms -> warm-mean {warm:.0f}ms "
              f"({cold/warm:.1f}x) [box-dependent, secondary]")
    if hit_budget:
        print("note: working set hit the budget cap -> later queries reuse the resident KV (flat); "
              "content beyond budget needs eviction policy (amem-kv kvpolicy, plan §4.3c).")
    print("=" * 68)
    print("text-layer RE-PREFILLS the working set every query (linear $). amem keeps it RESIDENT")
    print("as reused KV: full-fidelity answers (plan §7.6) at a flat, reuse-amortized cost (§7.7/§7.12).")

    if args.out:
        Path(args.out).write_text(json.dumps(result, indent=2), encoding="utf-8")
        print(f"\n[saved] {args.out}")
    if args.chart:
        _render_chart(result, args.chart)
        print(f"[chart] {args.chart} (self-contained; open in a browser)")
    return result
169
+
170
+
171
+ def _render_chart(d: dict, path: str):
172
+ """Self-contained HTML: amem (flat) vs text-layer (linear) cumulative prefill, from MEASURED data."""
173
+ amem = d["cumulative_amem_input_tokens"]; base = d["cumulative_baseline_input_tokens"]
174
+ n = len(amem); top = max(base[-1], 1)
175
+ W, H, P = 720, 420, 56
176
+ def X(i): return P + (W - 2 * P) * (i / max(n - 1, 1))
177
+ def Y(v): return H - P - (H - 2 * P) * (v / top)
178
+ amem_pts = " ".join(f"{X(i):.1f},{Y(v):.1f}" for i, v in enumerate(amem))
179
+ base_pts = " ".join(f"{X(i):.1f},{Y(v):.1f}" for i, v in enumerate(base))
180
+ ratios = d["ratios_baseline_over_amem"]
181
+ headline = list(ratios.items())[-1]
182
+ yticks = "".join(
183
+ f'<line x1="{P}" y1="{Y(top*f):.1f}" x2="{W-P}" y2="{Y(top*f):.1f}" stroke="#1f2937"/>'
184
+ f'<text x="{P-8}" y="{Y(top*f)+4:.1f}" fill="#6b7280" font-size="11" text-anchor="end">{int(top*f/1000)}k</text>'
185
+ for f in (0, .25, .5, .75, 1))
186
+ html = f"""<!doctype html><html><head><meta charset="utf-8"><title>amem cost curve</title>
187
+ <style>body{{margin:0;background:#0b0f17;color:#e5e7eb;font-family:ui-monospace,Menlo,Consolas,monospace}}
188
+ .wrap{{max-width:760px;margin:32px auto;padding:0 16px}}h1{{font-size:18px;font-weight:600;margin:0 0 2px}}
189
+ .sub{{color:#9ca3af;font-size:12px;margin-bottom:16px}}.lg{{display:flex;gap:18px;font-size:12px;margin-top:10px}}
190
+ .dot{{display:inline-block;width:10px;height:10px;border-radius:2px;margin-right:6px;vertical-align:middle}}
191
+ .big{{font-size:13px;color:#cbd5e1;margin-top:12px}}.big b{{color:#34d399}}</style></head>
192
+ <body><div class="wrap">
193
+ <h1>Cost of memory = the curve shape</h1>
194
+ <div class="sub">{d['model']} · memory {d['memory_tokens']:,} tok · {d['n_queries']} queries · measured on your stack</div>
195
+ <svg viewBox="0 0 {W} {H}" width="100%">
196
+ {yticks}
197
+ <line x1="{P}" y1="{H-P}" x2="{W-P}" y2="{H-P}" stroke="#374151"/>
198
+ <polyline points="{base_pts}" fill="none" stroke="#f87171" stroke-width="2.5"/>
199
+ <polyline points="{amem_pts}" fill="none" stroke="#34d399" stroke-width="2.5"/>
200
+ <text x="{W-P}" y="{Y(base[-1])-8:.1f}" fill="#f87171" font-size="12" text-anchor="end">text-layer (re-prefill)</text>
201
+ <text x="{W-P}" y="{Y(amem[-1])-8:.1f}" fill="#34d399" font-size="12" text-anchor="end">amem (KV-resident)</text>
202
+ <text x="{W/2}" y="{H-16}" fill="#6b7280" font-size="11" text-anchor="middle">queries in one session →</text>
203
+ </svg>
204
+ <div class="lg"><span><span class="dot" style="background:#34d399"></span>amem cumulative prefill</span>
205
+ <span><span class="dot" style="background:#f87171"></span>text-layer cumulative prefill</span></div>
206
+ <div class="big">At <b>{headline[0]}</b>, amem prefills <b>{headline[1]}×</b> fewer input tokens — and the gap grows with every query.
207
+ reuse fraction → <b>{d['reuse_fraction_final']*100:.0f}%</b>.</div>
208
+ </div></body></html>"""
209
+ Path(path).write_text(html)
210
+
211
+
212
def main(argv=None):
    """Entry point for ``amem bench``: parse options, run the benchmark, return 0.

    ``argv`` is the option list to parse; ``None`` lets argparse read ``sys.argv[1:]``.
    """
    home = Path.home()
    parser = argparse.ArgumentParser(
        prog="amem bench",
        description="Reproducible cost-curve benchmark: KV-residency vs text-layer re-prefill, on your stack.",
    )
    # One (flag, add_argument-kwargs) pair per option, in the order shown by --help.
    option_table = [
        ("--model", {"default": "Qwen/Qwen2.5-7B-Instruct-AWQ"}),
        ("--memory-file", {"help": "text file, one memory item per line (your data)"}),
        ("--queries-file", {"help": "text file, one query per line (required with --memory-file)"}),
        ("--lme", {"default": str(home / "lme_data" / "longmemeval_s_cleaned.json"),
                   "help": "LongMemEval json used to build a memory if --memory-file is not given"}),
        ("--items", {"type": int, "default": 8, "help": "LME items to union into the memory"}),
        ("--max-memory-turns", {"type": int, "default": 1500,
                                "help": "cap total memory turns (round-robin across items) so indexing stays bounded"}),
        ("--queries", {"type": int, "default": 0, "help": "cap number of queries (0 = all)"}),
        ("--budget-tokens", {"type": int, "default": 6000}),
        ("--max-turns", {"type": int, "default": 40}),
        ("--max-model-len", {"type": int, "default": 8192}),
        ("--gpu-mem-frac", {"type": float, "default": 0.85}),
        ("--bge-model", {"default": "BAAI/bge-small-en-v1.5"}),
        ("--eager", {"action": "store_true"}),
        ("--out", {"default": str(home / "amem_bench.json")}),
        ("--chart", {"default": None, "help": "optional path to write a self-contained HTML chart"}),
    ]
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    parsed = parser.parse_args(argv)
    run(parsed)
    return 0
235
+
236
+
237
+ if __name__ == "__main__":
238
+ main()
amem/cli.py ADDED
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python3
2
+ """amem CLI — one entry point for every amem command (main front doors below; `amem --help` lists all).
3
+
4
+ amem proxy closed-LLM drop-in (Claude/GPT) + cache orchestration (light, no GPU)
5
+ amem mcp MCP server for Claude Desktop / Cursor / Claude Code (stdio)
6
+ amem serve self-host serve (local open model + KV residency) [pip install amem[selfhost]]
7
+
8
+ Subcommand options are passed straight through (e.g. `amem proxy --port 8100 --help`).
9
+ """
10
+ import importlib.util
11
+ import sys
12
+
13
+ from . import __version__
14
+
15
+ USAGE = (
16
+ f"amem {__version__} — AI-first memory OS (recall that returns reusable KV)\n\n"
17
+ "usage: amem <command> [options]\n\n"
18
+ "commands:\n"
19
+ " proxy closed-LLM drop-in (Claude/GPT) + cache orchestration (light, no GPU)\n"
20
+ " mcp MCP server for Claude Desktop / Cursor / Claude Code (stdio)\n"
21
+ " serve self-host serve (local open model + KV residency) [needs: pip install amem[selfhost]]\n"
22
+ " kv-serve KV-native serve: CacheBlend reuse + KvPolicy eviction (moat) [needs: amem[selfhost]]\n"
23
+ " bench reproducible cost-curve benchmark (KV-residency vs re-prefill) [needs: amem[selfhost]]\n"
24
+ " version print version\n\n"
25
+ "run `amem <command> --help` for that command's options.\n"
26
+ )
27
+
28
+
29
def main(argv=None):
    """Dispatch ``amem <command> [options]`` to the matching subcommand module.

    ``argv`` defaults to ``sys.argv[1:]``. Help and version are handled here and return 0.
    Every other command has ``sys.argv`` rewritten to ``["amem-<command>", *options]``
    before its module's ``main()`` is called, because the subcommands parse ``sys.argv``
    themselves. Self-host commands exit with an install hint when ``vllm`` is not
    importable; an unknown command exits with the usage text.
    """
    args = list(sys.argv[1:] if argv is None else argv)
    if not args or args[0] in ("-h", "--help", "help"):
        print(USAGE)
        return 0

    cmd, rest = args[0], args[1:]
    if cmd in ("version", "--version", "-V"):
        print(f"amem {__version__}")
        return 0

    # subcommands re-parse argv themselves -> hand them their own argv
    sys.argv = [f"amem-{cmd}"] + rest

    # Light commands: no GPU stack required.
    if cmd == "proxy":
        from . import proxy
        return proxy.main()
    if cmd == "mcp":
        from . import mcp_server
        return mcp_server.main()

    # Self-host commands, keyed by accepted spelling -> name shown in the install hint.
    selfhost_names = {"serve": "serve", "kv-serve": "kv-serve", "kvserve": "kv-serve", "bench": "bench"}
    shown = selfhost_names.get(cmd)
    if shown is None:
        sys.exit(f"unknown command: {cmd}\n\n{USAGE}")
    if importlib.util.find_spec("vllm") is None:
        sys.exit(f"`amem {shown}` needs the self-host extra: pip install 'amem[selfhost]'")

    if shown == "serve":
        from . import serve
        return serve.main()
    if shown == "kv-serve":
        from . import kv_serve
        return kv_serve.main()
    from . import bench
    return bench.main()
56
+
57
+
58
+ if __name__ == "__main__":
59
+ main()
amem/client.py ADDED
@@ -0,0 +1,200 @@
1
+ #!/usr/bin/env python3
2
+ # =====================================================================
3
+ # amem/client.py — tiny Python SDK for amem serve v1 (session residency).
4
+ #
5
+ # from amem import Amem
6
+ # mem = Amem("http://localhost:8000")
7
+ # mem.remember("alice", ["alice lives in Seattle and works at Boeing", ...])
8
+ # sid = mem.start_session("alice") # one resident set for the chat
9
+ # print(mem.ask("alice", "Where does Alice's sister live?", session=sid))
10
+ # print(mem.ask("alice", "What company does she work for?", session=sid)) # DIFFERENT q -> reuses KV
11
+ #
12
+ # DROP-IN (the on-ramp): the standard OpenAI SDK works unchanged -- point base_url at
13
+ # amem and pass `user` as the memory namespace. Residency defaults ON (per-user session):
14
+ # from openai import OpenAI
15
+ # client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
16
+ # r = client.chat.completions.create(model="amem", user="alice",
17
+ # messages=[{"role":"user","content":"Where does Alice's sister live?"}])
18
+ # print(r.choices[0].message.content)
19
+ #
20
+ # SECURED server (started with --api-key):
21
+ # mem = Amem("http://localhost:8001", api_key="<key>") # serve/kv-serve: Bearer
22
+ # mem = Amem.for_proxy("http://localhost:8100", api_key="<key>") # proxy: X-Amem-Key
23
+ # =====================================================================
24
+
25
+ import json
26
+ from typing import List, Optional, Union
27
+ from urllib import request as _rq
28
+ from urllib.error import HTTPError, URLError
29
+
30
+
31
class AmemError(RuntimeError):
    """Error raised by the SDK for a failed request.

    Attributes:
        status: HTTP status code of the response (the SDK passes 0 when no HTTP response
            exists, e.g. the server is unreachable).
        detail: the server-provided detail, or a local description of the failure.
    """

    def __init__(self, status: int, detail: str):
        message = "amem {}: {}".format(status, detail)
        super().__init__(message)
        self.status, self.detail = status, detail
37
+
38
+
39
class Amem:
    """Minimal stdlib-only HTTP client for an amem server.

    Every method issues one JSON request through ``_req`` and returns the decoded JSON
    body (or a field of it). Non-2xx responses and connection failures raise ``AmemError``.
    """

    def __init__(self, base_url: str = "http://localhost:8000", api_key: Optional[str] = None,
                 auth_header: str = "authorization", auth_scheme: Optional[str] = "Bearer",
                 timeout: float = 600.0):
        """SDK for amem serve / kv-serve.

        If the server was started with --api-key, pass `api_key` here. Defaults to the
        self-host scheme `Authorization: Bearer <key>`. For the closed `proxy`, use
        `Amem.for_proxy(base_url, api_key=...)` (it sends `X-Amem-Key`, leaving
        Authorization free for your upstream provider key).

        Raises:
            AmemError: (status 0) when `api_key` cannot be encoded as latin-1.
        """
        self.base = base_url.rstrip("/")
        self.timeout = timeout
        self._headers = {"content-type": "application/json"}
        if api_key:
            try:
                # HTTP headers are latin-1; a non-encodable key (e.g. a pasted placeholder)
                # must fail clearly here, not crash deep in urllib.
                str(api_key).encode("latin-1")
            except UnicodeEncodeError as e:
                raise AmemError(0, "api_key contains non-ASCII characters; API keys must be ASCII (did you paste a placeholder instead of the real key?)") from e
            self._headers[auth_header.lower()] = f"{auth_scheme} {api_key}" if auth_scheme else str(api_key)

    @classmethod
    def for_proxy(cls, base_url: str = "http://localhost:8100", api_key: Optional[str] = None,
                  timeout: float = 600.0) -> "Amem":
        """Client for the closed proxy: amem-auth rides on X-Amem-Key (raw), so the
        Authorization header stays reserved for the upstream provider key."""
        return cls(base_url, api_key=api_key, auth_header="x-amem-key", auth_scheme=None, timeout=timeout)

    @staticmethod
    def _seg(value) -> str:
        """Percent-encode one URL path segment (``/`` included) so ids cannot break the path."""
        from urllib.parse import quote
        return quote(str(value), safe="")

    def _req(self, path: str, method: str, payload: Optional[dict] = None) -> dict:
        """Send one request and return the decoded JSON body (``{}`` for an empty body).

        Raises:
            AmemError: with the HTTP status and the response's ``detail`` field (or the raw
                body) on an HTTP error; with status 0 when the server cannot be reached.
        """
        data = json.dumps(payload).encode() if payload is not None else None
        req = _rq.Request(self.base + path, data=data, headers=self._headers, method=method)
        try:
            with _rq.urlopen(req, timeout=self.timeout) as r:
                body = r.read().decode()
                return json.loads(body) if body else {}
        except HTTPError as e:  # 401/400/404/500 -> clean AmemError, no raw stack
            raw = e.read().decode(errors="replace")
            try:
                raw = json.loads(raw).get("detail", raw)
            except (ValueError, AttributeError):
                # Body is not JSON, or is JSON but not an object: keep the raw text.
                pass
            raise AmemError(e.code, raw) from None
        except URLError as e:
            raise AmemError(0, f"cannot reach {self.base}: {e.reason}") from None

    def _post(self, path: str, payload: dict) -> dict:
        return self._req(path, "POST", payload)

    def _get(self, path: str) -> dict:
        return self._req(path, "GET")

    # ---- core API ----
    def health(self) -> dict:
        """GET /health."""
        return self._get("/health")

    def stats(self) -> dict:
        """GET /v1/stats."""
        return self._get("/v1/stats")

    def remember(self, user: str, texts: Union[str, List[str]], keys: Optional[List[str]] = None) -> dict:
        """POST /v1/memories. A single string for ``texts`` or ``keys`` is wrapped in a list."""
        if isinstance(texts, str):
            texts = [texts]
        payload = {"user": user, "texts": texts}
        if keys is not None:
            payload["keys"] = [keys] if isinstance(keys, str) else keys
        return self._post("/v1/memories", payload)

    def list_memories(self, user: str) -> list:
        """GET /v1/memories/<user>; returns the ``memories`` list (``[]`` if absent)."""
        return self._get(f"/v1/memories/{self._seg(user)}").get("memories", [])

    def forget(self, user: str, memory_id: str) -> dict:
        """DELETE /v1/memories/<user>/<memory_id>."""
        return self._req(f"/v1/memories/{self._seg(user)}/{self._seg(memory_id)}", "DELETE")

    def update(self, user: str, memory_id: str, text: str) -> dict:
        """PATCH /v1/memories/<user>/<memory_id> with the new ``text``."""
        return self._req(f"/v1/memories/{self._seg(user)}/{self._seg(memory_id)}", "PATCH", {"text": text})

    def users(self) -> list:
        """kv-serve only: list tenants with active-memory counts."""
        return self._get("/v1/users").get("users", [])

    def extract(self, user: str, text: str) -> dict:
        """kv-serve only: extract atomic facts from raw text with the local model and
        store them (the §7.8 front-end). Returns {facts, added, active}."""
        return self._post("/v1/extract", {"user": user, "text": text})

    def usage(self, user: Optional[str] = None, since=None, until=None) -> dict:
        """kv-serve only: durable PER-TENANT usage over [since, until] (the billing source of
        truth; epoch or ISO-8601 for since/until). Returns {per_user, totals, instance, window}.
        Bill on per_user[u]['prefill_tokens_saved'] (value-aligned, §2.5-G) or memory_ops.
        For an invoice-ready CSV, GET /v1/usage?format=csv directly."""
        from urllib.parse import urlencode
        q = {k: v for k, v in (("user", user), ("since", since), ("until", until)) if v is not None}
        return self._get("/v1/usage" + ("?" + urlencode(q) if q else ""))

    def start_session(self, user: str, seed_query: Optional[str] = None,
                      budget_tokens: Optional[int] = None) -> str:
        """POST /v1/sessions; returns the ``session`` id from the response."""
        payload = {"user": user}
        if seed_query is not None:
            payload["seed_query"] = seed_query
        if budget_tokens is not None:
            payload["budget_tokens"] = budget_tokens
        return self._post("/v1/sessions", payload)["session"]

    def session_state(self, sid: str) -> dict:
        """GET /v1/sessions/<sid>."""
        return self._get(f"/v1/sessions/{self._seg(sid)}")

    def end_session(self, sid: str) -> dict:
        """DELETE /v1/sessions/<sid>."""
        return self._req(f"/v1/sessions/{self._seg(sid)}", "DELETE")

    def recall(self, user: str, query: str, session: Optional[str] = None,
               budget_tokens: Optional[int] = None) -> dict:
        """POST /v1/recall; ``session`` and ``budget_tokens`` are sent only when given."""
        payload = {"user": user, "query": query}
        if session is not None:
            payload["session"] = session
        if budget_tokens is not None:
            payload["budget_tokens"] = budget_tokens
        return self._post("/v1/recall", payload)

    def ask_full(self, user: str, query: str, session: Optional[str] = None,
                 residency: bool = True, max_tokens: int = 300) -> dict:
        """POST /v1/chat/completions with a single user message; returns the full response."""
        payload = {"model": "amem", "user": user, "max_tokens": max_tokens,
                   "residency": residency, "messages": [{"role": "user", "content": query}]}
        if session is not None:
            payload["session"] = session
        return self._post("/v1/chat/completions", payload)

    def ask(self, user: str, query: str, session: Optional[str] = None,
            residency: bool = True, max_tokens: int = 300) -> str:
        """Like ``ask_full`` but returns only the first choice's message content."""
        return self.ask_full(user, query, session, residency, max_tokens)["choices"][0]["message"]["content"]
165
+
166
+
167
if __name__ == "__main__":
    import sys

    # Smoke demo against a running server; the base URL may be given as the first argument.
    base = "http://localhost:8000" if len(sys.argv) <= 1 else sys.argv[1]
    mem = Amem(base)

    health_report = json.dumps(mem.health(), indent=2)
    print("health:", health_report)

    seed_facts = [
        "alice lives in Seattle and works at Boeing",
        "alice's sister Maria lives in Denver",
        "alice's launch budget was approved at 4417 thousand dollars",
        "alice is allergic to peanuts and prefers window seats",
        "alice's manager is named Tom and the project ships in March",
    ]
    print("\nremember:", mem.remember("alice", seed_facts))

    # THE POINT: one session, several DIFFERENT questions -> the resident working set
    # grows append-only, then reuse fires (memory_tokens_reused rises, prefilled_now -> small).
    sid = mem.start_session("alice")
    print(f"\nsession {sid} — asking DIFFERENT questions, watch reuse grow:\n")
    questions = (
        "Where does Alice's sister live?",
        "What company does Alice work for?",
        "How big was the approved launch budget?",
        "What is Alice allergic to?",
        "Who is Alice's manager?",
    )
    for q in questions:
        r = mem.ask_full("alice", q, session=sid)
        a = r["amem"]  # per-request reuse accounting block of the response
        print(f" Q: {q}")
        print(f" -> {r['choices'][0]['message']['content']}")
        print(f" resident={a['resident_tokens']}tok reused={a['memory_tokens_reused']}tok "
              f"prefilled_now={a['memory_tokens_prefilled_now']}tok kv_reused={a['kv_reused']} "
              f"{a['latency_s']}s")

    recalled = mem.recall("alice", "Tell me about Alice's travel preferences", session=sid)
    print("\nrecall (inverted, same session):", json.dumps(recalled, indent=2)[:400])
    print("\nstats:", json.dumps(mem.stats(), indent=2))