grafomem 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- grafomem-0.2.0/LICENSE +21 -0
- grafomem-0.2.0/PKG-INFO +227 -0
- grafomem-0.2.0/README.md +183 -0
- grafomem-0.2.0/pyproject.toml +52 -0
- grafomem-0.2.0/setup.cfg +4 -0
- grafomem-0.2.0/src/aml/__init__.py +3 -0
- grafomem-0.2.0/src/aml/adapter_check.py +136 -0
- grafomem-0.2.0/src/aml/backends/__init__.py +1 -0
- grafomem-0.2.0/src/aml/backends/bi_temporal.py +225 -0
- grafomem-0.2.0/src/aml/backends/bounded_vector.py +197 -0
- grafomem-0.2.0/src/aml/backends/conflict_backends.py +333 -0
- grafomem-0.2.0/src/aml/backends/cross_session_backends.py +267 -0
- grafomem-0.2.0/src/aml/backends/delete_backends.py +254 -0
- grafomem-0.2.0/src/aml/backends/gmp_reference.py +285 -0
- grafomem-0.2.0/src/aml/backends/interface.py +517 -0
- grafomem-0.2.0/src/aml/backends/isolation_backends.py +524 -0
- grafomem-0.2.0/src/aml/backends/persistence.py +143 -0
- grafomem-0.2.0/src/aml/backends/retention_backends.py +215 -0
- grafomem-0.2.0/src/aml/backends/sqlite_gmp.py +525 -0
- grafomem-0.2.0/src/aml/backends/supersession_chain.py +187 -0
- grafomem-0.2.0/src/aml/backends/tenant_backends.py +251 -0
- grafomem-0.2.0/src/aml/backends/vector_only.py +226 -0
- grafomem-0.2.0/src/aml/cli.py +627 -0
- grafomem-0.2.0/src/aml/eval/__init__.py +1 -0
- grafomem-0.2.0/src/aml/eval/concurrency.py +456 -0
- grafomem-0.2.0/src/aml/eval/concurrency_runner.py +320 -0
- grafomem-0.2.0/src/aml/eval/conformance.py +545 -0
- grafomem-0.2.0/src/aml/eval/harness.py +279 -0
- grafomem-0.2.0/src/aml/eval/metrics.py +268 -0
- grafomem-0.2.0/src/aml/eval/report.py +307 -0
- grafomem-0.2.0/src/aml/generator/__init__.py +1 -0
- grafomem-0.2.0/src/aml/generator/oracle.py +502 -0
- grafomem-0.2.0/src/aml/generator/trace.py +1020 -0
- grafomem-0.2.0/src/aml/generator/validators.py +447 -0
- grafomem-0.2.0/src/aml/generator/workloads/__init__.py +1 -0
- grafomem-0.2.0/src/aml/generator/workloads/w1.py +369 -0
- grafomem-0.2.0/src/aml/generator/workloads/w10.py +412 -0
- grafomem-0.2.0/src/aml/generator/workloads/w2.py +280 -0
- grafomem-0.2.0/src/aml/generator/workloads/w3.py +269 -0
- grafomem-0.2.0/src/aml/generator/workloads/w4.py +234 -0
- grafomem-0.2.0/src/aml/generator/workloads/w5.py +253 -0
- grafomem-0.2.0/src/aml/generator/workloads/w6.py +254 -0
- grafomem-0.2.0/src/aml/generator/workloads/w7.py +259 -0
- grafomem-0.2.0/src/aml/generator/workloads/w8.py +242 -0
- grafomem-0.2.0/src/aml/generator/workloads/w9.py +309 -0
- grafomem-0.2.0/src/aml/provenance.py +114 -0
- grafomem-0.2.0/src/aml/server/__init__.py +1 -0
- grafomem-0.2.0/src/aml/server/app.py +362 -0
- grafomem-0.2.0/src/aml/server/auth.py +102 -0
- grafomem-0.2.0/src/aml/server/ingestion.py +244 -0
- grafomem-0.2.0/src/aml/server/mcp.py +266 -0
- grafomem-0.2.0/src/aml/server/stores.py +113 -0
- grafomem-0.2.0/src/aml/wire.py +446 -0
- grafomem-0.2.0/src/grafomem.egg-info/PKG-INFO +227 -0
- grafomem-0.2.0/src/grafomem.egg-info/SOURCES.txt +61 -0
- grafomem-0.2.0/src/grafomem.egg-info/dependency_links.txt +1 -0
- grafomem-0.2.0/src/grafomem.egg-info/entry_points.txt +2 -0
- grafomem-0.2.0/src/grafomem.egg-info/requires.txt +26 -0
- grafomem-0.2.0/src/grafomem.egg-info/top_level.txt +1 -0
- grafomem-0.2.0/tests/test_concurrency.py +97 -0
- grafomem-0.2.0/tests/test_conformance.py +64 -0
- grafomem-0.2.0/tests/test_corpus.py +66 -0
- grafomem-0.2.0/tests/test_interface.py +184 -0
grafomem-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 GNS Foundation
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
grafomem-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: grafomem
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: GRAFOMEM — agent-memory conformance benchmark and compliance toolkit
|
|
5
|
+
Author: Camilo Ayerbe Posada
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/GNS-Foundation/grafomem
|
|
8
|
+
Project-URL: Documentation, https://github.com/GNS-Foundation/grafomem/tree/main/docs
|
|
9
|
+
Project-URL: Repository, https://github.com/GNS-Foundation/grafomem
|
|
10
|
+
Project-URL: Issues, https://github.com/GNS-Foundation/grafomem/issues
|
|
11
|
+
Project-URL: API, https://grafomem-production.up.railway.app/docs
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: jsonschema>=4
|
|
24
|
+
Requires-Dist: click>=8
|
|
25
|
+
Provides-Extra: backends
|
|
26
|
+
Requires-Dist: sentence-transformers>=2.2; extra == "backends"
|
|
27
|
+
Requires-Dist: numpy>=1.24; extra == "backends"
|
|
28
|
+
Provides-Extra: store
|
|
29
|
+
Requires-Dist: sqlite-vec; extra == "store"
|
|
30
|
+
Requires-Dist: apsw; extra == "store"
|
|
31
|
+
Provides-Extra: crypto
|
|
32
|
+
Requires-Dist: cryptography>=41; extra == "crypto"
|
|
33
|
+
Provides-Extra: server
|
|
34
|
+
Requires-Dist: fastapi>=0.110; extra == "server"
|
|
35
|
+
Requires-Dist: uvicorn[standard]>=0.29; extra == "server"
|
|
36
|
+
Requires-Dist: mcp>=1.0; extra == "server"
|
|
37
|
+
Requires-Dist: pydantic>=2.0; extra == "server"
|
|
38
|
+
Provides-Extra: all
|
|
39
|
+
Requires-Dist: grafomem[backends,crypto,server,store]; extra == "all"
|
|
40
|
+
Provides-Extra: dev
|
|
41
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
42
|
+
Requires-Dist: httpx>=0.27; extra == "dev"
|
|
43
|
+
Dynamic: license-file
|
|
44
|
+
|
|
45
|
+
# GRAFOMEM
|
|
46
|
+
|
|
47
|
+
**An agent-memory benchmark that became a memory protocol.**
|
|
48
|
+
|
|
49
|
+
GRAFOMEM began as a benchmark for one question — *what should a standard for agent
|
|
50
|
+
memory actually specify?* — and turned into the answer: a benchmark, a protocol
|
|
51
|
+
(**GMP**), an executable conformance suite, and a certified, persistent reference
|
|
52
|
+
implementation. The thesis in one line:
|
|
53
|
+
|
|
54
|
+
> Memory capabilities are **orthogonal**, a **declared** capability is not the same as
|
|
55
|
+
> **observed** behavior, and the only way to tell them apart is to **test** — so agent
|
|
56
|
+
> memory should be specified and conformance-checked like any other protocol.
|
|
57
|
+
|
|
58
|
+
Clean-room research project. [grafomem.com](https://grafomem.com)
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## The thesis, in three results
|
|
63
|
+
|
|
64
|
+
1. **Four orthogonal axes.** A memory standard must separately specify: representational
|
|
65
|
+
capability (versioning / supersession), embedding quality, retention policy, and a
|
|
66
|
+
two-sided privacy primitive (deletion **and** tenant isolation). The benchmark shows
|
|
67
|
+
these are separately specifiable and verifiable.
|
|
68
|
+
2. **Claims ≠ behavior.** A backend can *declare* `HARD_DELETE` or `MULTI_TENANT` and
|
|
69
|
+
still leak forbidden data (findings F10, F12). The declaration is not the guarantee.
|
|
70
|
+
This is the load-bearing result — and the reason a conformance suite has to exist.
|
|
71
|
+
3. **Protocol + conformance.** "Supports capability X" is defined operationally:
|
|
72
|
+
*passes the conformance suite for X.* The spec, the suite, and implementations that
|
|
73
|
+
certify against it all exist and agree.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## The stack
|
|
78
|
+
|
|
79
|
+
| Layer | What it is | Where |
|
|
80
|
+
|---|---|---|
|
|
81
|
+
| **Benchmark** | 10 workloads (W1–W10), 20 findings; locked corpus 135 traces / 61,754 turns / 17,612 queries (v0.2.0) | `src/aml/generator/`, `scripts/run_w*.py` |
|
|
82
|
+
| **Paper** | arXiv technical report | `docs/grafomem-paper.pdf` |
|
|
83
|
+
| **Spec** | GMP v0.2 (draft) — protocol semantics (RFC 2119) | `docs/gmp-spec-v0.2.md` |
|
|
84
|
+
| **Conformance** | executable §8: `supports X` ≝ passes the suite for X | `src/aml/eval/conformance.py` |
|
|
85
|
+
| **Reference** | in-memory backend, self-certifying | `src/aml/backends/gmp_reference.py` |
|
|
86
|
+
| **Wire** | HTTP + JSON binding; the client *is* a `MemoryBackend` | `src/aml/wire.py` |
|
|
87
|
+
| **Store** | persistent SQLite + sqlite-vec; survives restart | `src/aml/backends/sqlite_gmp.py` |
|
|
88
|
+
|
|
89
|
+
Each layer certifies the one beside it. The reference backend runs the conformance
|
|
90
|
+
suite **on itself**; the wire client runs the *same* suite **over a socket**; the
|
|
91
|
+
SQLite store runs it **on a file**. The contract is transport- and
|
|
92
|
+
implementation-independent by construction — not by assertion.
|
|
93
|
+
|
|
94
|
+
**v0.2:** W7–W10 built (findings F14–F20); W7, W9, W10 corpus-locked into v0.2.0, W8 held out; W10 (operational concurrency & isolation) gated by M8 isolation conformance — §4.10, gmp-spec §10.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Key findings
|
|
99
|
+
|
|
100
|
+
The benchmark is the evidence base. Full table in the paper (Appendix C); the
|
|
101
|
+
load-bearing ones:
|
|
102
|
+
|
|
103
|
+
- **Capabilities are inert without the workload that needs them.** On a pure-vector
|
|
104
|
+
retrieval task, declaring `SUPERSESSION_CHAIN` / `BI_TEMPORAL` changes nothing
|
|
105
|
+
(Δ = +0.000). On a drift task, supersession recovers recall from 0.281 → 0.867 at a
|
|
106
|
+
tight budget (**+0.585**). The capability matters exactly when the workload exercises it.
|
|
107
|
+
- **The embedder is the lever, not the capabilities.** Swapping the stub for a real
|
|
108
|
+
embedder moves recall +0.510 at budget 32; toggling capabilities on the same task
|
|
109
|
+
moves it +0.000.
|
|
110
|
+
- **Declared ≠ honest (F10, F12).** A backend that claims `HARD_DELETE` but soft-deletes
|
|
111
|
+
leaks deleted facts with probability 1.0; one that claims `MULTI_TENANT` but shares an
|
|
112
|
+
index leaks across tenants with probability 1.0. Both *pass their own type signature*
|
|
113
|
+
and *fail conformance*. Deletion and tenant isolation unify at the read path — a single
|
|
114
|
+
`Forbidden(q)` set — which is why one two-sided test catches both.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Latency — reference store, locked corpus
|
|
119
|
+
|
|
120
|
+
The W1–W6 subset ingested into one growing SQLite + sqlite-vec store (N = 38,882
|
|
121
|
+
rows), BGE-small embeddings, on an Apple-Silicon laptop. Numbers are post-v0.2 (the
|
|
122
|
+
metadata-column pre-filter):
|
|
123
|
+
|
|
124
|
+
| op | count | p50 | p95 |
|
|
125
|
+
|---|---:|---:|---:|
|
|
126
|
+
| write | 37,015 | 10.17ms | 13.66ms |
|
|
127
|
+
| supersede | 2,262 | 9.85ms | 12.90ms |
|
|
128
|
+
| delete | 395 | **0.03ms** | 0.05ms |
|
|
129
|
+
| retrieve | 15,342 | 11.27ms | **27.80ms** |
|
|
130
|
+
|
|
131
|
+
What the numbers say:
|
|
132
|
+
|
|
133
|
+
- **The embedder is the floor.** Every op that embeds sits at ~10ms; `delete` (the one
|
|
134
|
+
op that doesn't) is 0.03ms. The store's own machinery is sub-millisecond. Single-item
|
|
135
|
+
write throughput is ~97/s on MPS — and a `write_many` bulk-ingest path that batches the
|
|
136
|
+
embedder under one transaction hits **847/s (8.6×)** with an identical resulting store,
|
|
137
|
+
confirming the embedder, not the store, was the entire write cost.
|
|
138
|
+
- **The v0.2 pre-filter crushed the tail.** In v0.1, selective queries (`as_of`, tenant)
|
|
139
|
+
ranked-then-filtered and triggered an adaptive widening loop, putting retrieve p95 at
|
|
140
|
+
**82ms**. v0.2 pushes the tenant/valid-time predicate into the KNN as metadata columns
|
|
141
|
+
and bounds `k` by the char budget — p95 fell to **27.80ms** (−66%) with identical
|
|
142
|
+
results, and the high-N p50 sits *at or below* v0.1's flat region.
|
|
143
|
+
- **Retrieve p50 still grows with N** (~10ms under 10k → ~25ms at 25–50k). sqlite-vec is
|
|
144
|
+
brute-force — there is no ANN index — so the scan is O(N). The pre-filter made
|
|
145
|
+
retrieval *correct and tight*, not *sublinear*; the next lever for retrieve-at-scale is
|
|
146
|
+
an actual ANN index, not more tuning. At 38k rows / 25ms p50 it isn't needed yet.
|
|
147
|
+
- **sqlite-vec ≈ numpy brute force** on pure-vector workloads (every bucket within ~1ms).
|
|
148
|
+
The store's value at this scale is **persistence and keeping the corpus out of RAM**,
|
|
149
|
+
not speed — confirming the in-memory reference is the right default below large scale.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Running it
|
|
154
|
+
|
|
155
|
+
Editable install (src-layout; this also puts `aml` on the path permanently):
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
python -m venv .venv && source .venv/bin/activate
|
|
159
|
+
pip install -e ".[backends]" # aml + sentence-transformers + torch
|
|
160
|
+
pip install sqlite-vec apsw           # for the persistent store (same packages as the "store" extra)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
> **macOS note:** the store needs SQLite extension loading. If your Python's `sqlite3`
|
|
164
|
+
> lacks it, the backend transparently falls back to `apsw` (bundled SQLite). Keep `apsw`
|
|
165
|
+
> installed.
|
|
166
|
+
|
|
167
|
+
**Self-validating smokes** — each stands up a backend and runs the conformance suite
|
|
168
|
+
against it:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
python -m aml.backends.gmp_reference # reference impl certifies itself
|
|
172
|
+
python -m aml.wire # conformance suite passes over HTTP
|
|
173
|
+
python -m aml.backends.sqlite_gmp # persistent store: survives reopen + passes suite
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**Benchmark experiments** and the **scale probe**:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
python -m scripts.run_w1 # F1, F2 — vector vs recency floor + budget sweep
|
|
180
|
+
python -m scripts.run_w2 # F3–F5 — drift, supersession, bi-temporal
|
|
181
|
+
python -m scripts.run_w3 # F6, F7 — distractor noise; the embedder lever
|
|
182
|
+
python -m scripts.scale_probe # corpus latency + sqlite-vec vs brute force
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Status & roadmap
|
|
188
|
+
|
|
189
|
+
**v0.1 — complete.** Benchmark, paper, GMP v0.1 spec, conformance suite, in-memory
|
|
190
|
+
reference (certified), HTTP+JSON wire binding (suite passes over a socket), persistent
|
|
191
|
+
SQLite + sqlite-vec store (survives restart, passes the full profile), scale probe.
|
|
192
|
+
v0.1 normative subset: `{AUDIT, SUPERSESSION_CHAIN, BI_TEMPORAL, HARD_DELETE, MULTI_TENANT}`.
|
|
193
|
+
|
|
194
|
+
**v0.2 — in progress.**
|
|
195
|
+
|
|
196
|
+
- **Metadata-column pre-filter** in the store — **done.** `tenant_id` / valid-time live
|
|
197
|
+
in the vec0 table (nulls sentinel-encoded, since sqlite-vec metadata filters don't
|
|
198
|
+
support `IS NULL`); the KNN filters natively and `k` is bounded by the char budget.
|
|
199
|
+
Drops retrieve p95 82→28ms with identical results and exact selective-filter retrieval;
|
|
200
|
+
the O(N) brute-force scan remains (no ANN index — a separate lever, not needed at this
|
|
201
|
+
scale).
|
|
202
|
+
- **Batched embedding** on the ingest path — **done.** A `write_many` fast-path embeds a
|
|
203
|
+
batch in one forward pass under one transaction: **97 → 847 items/s (8.6×)**, same
|
|
204
|
+
resulting store. Optional accelerator; the single-`write` Protocol path is unchanged.
|
|
205
|
+
- **Reserved capabilities — provenance pair done.** `PROVENANCE` (normative) and
|
|
206
|
+
`CRYPTOGRAPHIC_PROVENANCE` (optional) are implemented in both backends and certified by the
|
|
207
|
+
v0.2 conformance suite: Ed25519 over a content-store fact_id, `source` persisted (signatures
|
|
208
|
+
survive restart). Provenance has **no benchmark workload by design** — it is integrity metadata,
|
|
209
|
+
"verifiability, not ranking" (gmp-spec §7.5), so the suite tests it with constructed probes, as it does for
|
|
210
|
+
`AUDIT`, not a retrieval workload (§8.3).
|
|
211
|
+
- **Workloads W7–W9 — built.** W7 (Conflict Detection), W8 (Forgetting Curve), W9 (Cross-Session
|
|
212
|
+
Deletion); generators, backend spectrums, runners, findings F14–F18. W7 and W9 are corpus-locked
|
|
213
|
+
into v0.2.0; W8 is held out pending the summarise/merge retention variant. These workloads give a home to the last
|
|
214
|
+
two reserved flags — `CONFLICT_DETECTION` (W7) and `CROSS_SESSION_PROPAGATION` (W9), now
|
|
215
|
+
**un-reserved** in gmp-spec §7.4.
|
|
216
|
+
- **W10 — Operational Concurrency & Isolation — built and corpus-locked.** Trace-schema v0.2
|
|
217
|
+
carries set-valued ground truth; `interface.py` adds `submit_concurrent` (gated by
|
|
218
|
+
`CONCURRENCY_CONTROL`, the 10th flag), `IsolationPolicy`, and a `declared_policy`
|
|
219
|
+
self-description on the concurrent backend. A five-store spectrum (serializable →
|
|
220
|
+
resurrecting) drives **M8 isolation conformance**, which catches both the over-claimer
|
|
221
|
+
(declares serializable, delivers read-committed — F19) and the §10.4 durability violator
|
|
222
|
+
(resurrects a committed delete — F20). Locked into v0.2.0 (135 traces). The suite's last
|
|
223
|
+
open frontier, now closed.
|
|
224
|
+
|
|
225
|
+
The arc is protocol-first: the spec and suite are the standard; the implementations are
|
|
226
|
+
the proof it's real and runnable. "Postgres for agent memory" is the destination, not
|
|
227
|
+
the starting point.
|
grafomem-0.2.0/README.md
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# GRAFOMEM
|
|
2
|
+
|
|
3
|
+
**An agent-memory benchmark that became a memory protocol.**
|
|
4
|
+
|
|
5
|
+
GRAFOMEM began as a benchmark for one question — *what should a standard for agent
|
|
6
|
+
memory actually specify?* — and turned into the answer: a benchmark, a protocol
|
|
7
|
+
(**GMP**), an executable conformance suite, and a certified, persistent reference
|
|
8
|
+
implementation. The thesis in one line:
|
|
9
|
+
|
|
10
|
+
> Memory capabilities are **orthogonal**, a **declared** capability is not the same as
|
|
11
|
+
> **observed** behavior, and the only way to tell them apart is to **test** — so agent
|
|
12
|
+
> memory should be specified and conformance-checked like any other protocol.
|
|
13
|
+
|
|
14
|
+
Clean-room research project. [grafomem.com](https://grafomem.com)
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## The thesis, in three results
|
|
19
|
+
|
|
20
|
+
1. **Four orthogonal axes.** A memory standard must separately specify: representational
|
|
21
|
+
capability (versioning / supersession), embedding quality, retention policy, and a
|
|
22
|
+
two-sided privacy primitive (deletion **and** tenant isolation). The benchmark shows
|
|
23
|
+
these are separately specifiable and verifiable.
|
|
24
|
+
2. **Claims ≠ behavior.** A backend can *declare* `HARD_DELETE` or `MULTI_TENANT` and
|
|
25
|
+
still leak forbidden data (findings F10, F12). The declaration is not the guarantee.
|
|
26
|
+
This is the load-bearing result — and the reason a conformance suite has to exist.
|
|
27
|
+
3. **Protocol + conformance.** "Supports capability X" is defined operationally:
|
|
28
|
+
*passes the conformance suite for X.* The spec, the suite, and implementations that
|
|
29
|
+
certify against it all exist and agree.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## The stack
|
|
34
|
+
|
|
35
|
+
| Layer | What it is | Where |
|
|
36
|
+
|---|---|---|
|
|
37
|
+
| **Benchmark** | 10 workloads (W1–W10), 20 findings; locked corpus 135 traces / 61,754 turns / 17,612 queries (v0.2.0) | `src/aml/generator/`, `scripts/run_w*.py` |
|
|
38
|
+
| **Paper** | arXiv technical report | `docs/grafomem-paper.pdf` |
|
|
39
|
+
| **Spec** | GMP v0.2 (draft) — protocol semantics (RFC 2119) | `docs/gmp-spec-v0.2.md` |
|
|
40
|
+
| **Conformance** | executable §8: `supports X` ≝ passes the suite for X | `src/aml/eval/conformance.py` |
|
|
41
|
+
| **Reference** | in-memory backend, self-certifying | `src/aml/backends/gmp_reference.py` |
|
|
42
|
+
| **Wire** | HTTP + JSON binding; the client *is* a `MemoryBackend` | `src/aml/wire.py` |
|
|
43
|
+
| **Store** | persistent SQLite + sqlite-vec; survives restart | `src/aml/backends/sqlite_gmp.py` |
|
|
44
|
+
|
|
45
|
+
Each layer certifies the one beside it. The reference backend runs the conformance
|
|
46
|
+
suite **on itself**; the wire client runs the *same* suite **over a socket**; the
|
|
47
|
+
SQLite store runs it **on a file**. The contract is transport- and
|
|
48
|
+
implementation-independent by construction — not by assertion.
|
|
49
|
+
|
|
50
|
+
**v0.2:** W7–W10 built (findings F14–F20); W7, W9, W10 corpus-locked into v0.2.0, W8 held out; W10 (operational concurrency & isolation) gated by M8 isolation conformance — §4.10, gmp-spec §10.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Key findings
|
|
55
|
+
|
|
56
|
+
The benchmark is the evidence base. Full table in the paper (Appendix C); the
|
|
57
|
+
load-bearing ones:
|
|
58
|
+
|
|
59
|
+
- **Capabilities are inert without the workload that needs them.** On a pure-vector
|
|
60
|
+
retrieval task, declaring `SUPERSESSION_CHAIN` / `BI_TEMPORAL` changes nothing
|
|
61
|
+
(Δ = +0.000). On a drift task, supersession recovers recall from 0.281 → 0.867 at a
|
|
62
|
+
tight budget (**+0.585**). The capability matters exactly when the workload exercises it.
|
|
63
|
+
- **The embedder is the lever, not the capabilities.** Swapping the stub for a real
|
|
64
|
+
embedder moves recall +0.510 at budget 32; toggling capabilities on the same task
|
|
65
|
+
moves it +0.000.
|
|
66
|
+
- **Declared ≠ honest (F10, F12).** A backend that claims `HARD_DELETE` but soft-deletes
|
|
67
|
+
leaks deleted facts with probability 1.0; one that claims `MULTI_TENANT` but shares an
|
|
68
|
+
index leaks across tenants with probability 1.0. Both *pass their own type signature*
|
|
69
|
+
and *fail conformance*. Deletion and tenant isolation unify at the read path — a single
|
|
70
|
+
`Forbidden(q)` set — which is why one two-sided test catches both.
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Latency — reference store, locked corpus
|
|
75
|
+
|
|
76
|
+
The W1–W6 subset ingested into one growing SQLite + sqlite-vec store (N = 38,882
|
|
77
|
+
rows), BGE-small embeddings, on an Apple-Silicon laptop. Numbers are post-v0.2 (the
|
|
78
|
+
metadata-column pre-filter):
|
|
79
|
+
|
|
80
|
+
| op | count | p50 | p95 |
|
|
81
|
+
|---|---:|---:|---:|
|
|
82
|
+
| write | 37,015 | 10.17ms | 13.66ms |
|
|
83
|
+
| supersede | 2,262 | 9.85ms | 12.90ms |
|
|
84
|
+
| delete | 395 | **0.03ms** | 0.05ms |
|
|
85
|
+
| retrieve | 15,342 | 11.27ms | **27.80ms** |
|
|
86
|
+
|
|
87
|
+
What the numbers say:
|
|
88
|
+
|
|
89
|
+
- **The embedder is the floor.** Every op that embeds sits at ~10ms; `delete` (the one
|
|
90
|
+
op that doesn't) is 0.03ms. The store's own machinery is sub-millisecond. Single-item
|
|
91
|
+
write throughput is ~97/s on MPS — and a `write_many` bulk-ingest path that batches the
|
|
92
|
+
embedder under one transaction hits **847/s (8.6×)** with an identical resulting store,
|
|
93
|
+
confirming the embedder, not the store, was the entire write cost.
|
|
94
|
+
- **The v0.2 pre-filter crushed the tail.** In v0.1, selective queries (`as_of`, tenant)
|
|
95
|
+
ranked-then-filtered and triggered an adaptive widening loop, putting retrieve p95 at
|
|
96
|
+
**82ms**. v0.2 pushes the tenant/valid-time predicate into the KNN as metadata columns
|
|
97
|
+
and bounds `k` by the char budget — p95 fell to **27.80ms** (−66%) with identical
|
|
98
|
+
results, and the high-N p50 sits *at or below* v0.1's flat region.
|
|
99
|
+
- **Retrieve p50 still grows with N** (~10ms under 10k → ~25ms at 25–50k). sqlite-vec is
|
|
100
|
+
brute-force — there is no ANN index — so the scan is O(N). The pre-filter made
|
|
101
|
+
retrieval *correct and tight*, not *sublinear*; the next lever for retrieve-at-scale is
|
|
102
|
+
an actual ANN index, not more tuning. At 38k rows / 25ms p50 it isn't needed yet.
|
|
103
|
+
- **sqlite-vec ≈ numpy brute force** on pure-vector workloads (every bucket within ~1ms).
|
|
104
|
+
The store's value at this scale is **persistence and keeping the corpus out of RAM**,
|
|
105
|
+
not speed — confirming the in-memory reference is the right default below large scale.
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Running it
|
|
110
|
+
|
|
111
|
+
Editable install (src-layout; this also puts `aml` on the path permanently):
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
python -m venv .venv && source .venv/bin/activate
|
|
115
|
+
pip install -e ".[backends]" # aml + sentence-transformers + torch
|
|
116
|
+
pip install sqlite-vec apsw           # for the persistent store (same packages as the "store" extra)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
> **macOS note:** the store needs SQLite extension loading. If your Python's `sqlite3`
|
|
120
|
+
> lacks it, the backend transparently falls back to `apsw` (bundled SQLite). Keep `apsw`
|
|
121
|
+
> installed.
|
|
122
|
+
|
|
123
|
+
**Self-validating smokes** — each stands up a backend and runs the conformance suite
|
|
124
|
+
against it:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
python -m aml.backends.gmp_reference # reference impl certifies itself
|
|
128
|
+
python -m aml.wire # conformance suite passes over HTTP
|
|
129
|
+
python -m aml.backends.sqlite_gmp # persistent store: survives reopen + passes suite
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
**Benchmark experiments** and the **scale probe**:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
python -m scripts.run_w1 # F1, F2 — vector vs recency floor + budget sweep
|
|
136
|
+
python -m scripts.run_w2 # F3–F5 — drift, supersession, bi-temporal
|
|
137
|
+
python -m scripts.run_w3 # F6, F7 — distractor noise; the embedder lever
|
|
138
|
+
python -m scripts.scale_probe # corpus latency + sqlite-vec vs brute force
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Status & roadmap
|
|
144
|
+
|
|
145
|
+
**v0.1 — complete.** Benchmark, paper, GMP v0.1 spec, conformance suite, in-memory
|
|
146
|
+
reference (certified), HTTP+JSON wire binding (suite passes over a socket), persistent
|
|
147
|
+
SQLite + sqlite-vec store (survives restart, passes the full profile), scale probe.
|
|
148
|
+
v0.1 normative subset: `{AUDIT, SUPERSESSION_CHAIN, BI_TEMPORAL, HARD_DELETE, MULTI_TENANT}`.
|
|
149
|
+
|
|
150
|
+
**v0.2 — in progress.**
|
|
151
|
+
|
|
152
|
+
- **Metadata-column pre-filter** in the store — **done.** `tenant_id` / valid-time live
|
|
153
|
+
in the vec0 table (nulls sentinel-encoded, since sqlite-vec metadata filters don't
|
|
154
|
+
support `IS NULL`); the KNN filters natively and `k` is bounded by the char budget.
|
|
155
|
+
Drops retrieve p95 82→28ms with identical results and exact selective-filter retrieval;
|
|
156
|
+
the O(N) brute-force scan remains (no ANN index — a separate lever, not needed at this
|
|
157
|
+
scale).
|
|
158
|
+
- **Batched embedding** on the ingest path — **done.** A `write_many` fast-path embeds a
|
|
159
|
+
batch in one forward pass under one transaction: **97 → 847 items/s (8.6×)**, same
|
|
160
|
+
resulting store. Optional accelerator; the single-`write` Protocol path is unchanged.
|
|
161
|
+
- **Reserved capabilities — provenance pair done.** `PROVENANCE` (normative) and
|
|
162
|
+
`CRYPTOGRAPHIC_PROVENANCE` (optional) are implemented in both backends and certified by the
|
|
163
|
+
v0.2 conformance suite: Ed25519 over a content-store fact_id, `source` persisted (signatures
|
|
164
|
+
survive restart). Provenance has **no benchmark workload by design** — it is integrity metadata,
|
|
165
|
+
"verifiability, not ranking" (gmp-spec §7.5), so the suite tests it with constructed probes, as it does for
|
|
166
|
+
`AUDIT`, not a retrieval workload (§8.3).
|
|
167
|
+
- **Workloads W7–W9 — built.** W7 (Conflict Detection), W8 (Forgetting Curve), W9 (Cross-Session
|
|
168
|
+
Deletion); generators, backend spectrums, runners, findings F14–F18. W7 and W9 are corpus-locked
|
|
169
|
+
into v0.2.0; W8 is held out pending the summarise/merge retention variant. These workloads give a home to the last
|
|
170
|
+
two reserved flags — `CONFLICT_DETECTION` (W7) and `CROSS_SESSION_PROPAGATION` (W9), now
|
|
171
|
+
**un-reserved** in gmp-spec §7.4.
|
|
172
|
+
- **W10 — Operational Concurrency & Isolation — built and corpus-locked.** Trace-schema v0.2
|
|
173
|
+
carries set-valued ground truth; `interface.py` adds `submit_concurrent` (gated by
|
|
174
|
+
`CONCURRENCY_CONTROL`, the 10th flag), `IsolationPolicy`, and a `declared_policy`
|
|
175
|
+
self-description on the concurrent backend. A five-store spectrum (serializable →
|
|
176
|
+
resurrecting) drives **M8 isolation conformance**, which catches both the over-claimer
|
|
177
|
+
(declares serializable, delivers read-committed — F19) and the §10.4 durability violator
|
|
178
|
+
(resurrects a committed delete — F20). Locked into v0.2.0 (135 traces). The suite's last
|
|
179
|
+
open frontier, now closed.
|
|
180
|
+
|
|
181
|
+
The arc is protocol-first: the spec and suite are the standard; the implementations are
|
|
182
|
+
the proof it's real and runnable. "Postgres for agent memory" is the destination, not
|
|
183
|
+
the starting point.
|
grafomem-0.2.0/pyproject.toml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "grafomem"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "GRAFOMEM — agent-memory conformance benchmark and compliance toolkit"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Camilo Ayerbe Posada" }]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Programming Language :: Python :: 3.13",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"jsonschema>=4",
|
|
25
|
+
"click>=8",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://github.com/GNS-Foundation/grafomem"
|
|
30
|
+
Documentation = "https://github.com/GNS-Foundation/grafomem/tree/main/docs"
|
|
31
|
+
Repository = "https://github.com/GNS-Foundation/grafomem"
|
|
32
|
+
Issues = "https://github.com/GNS-Foundation/grafomem/issues"
|
|
33
|
+
API = "https://grafomem-production.up.railway.app/docs"
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
grafomem = "aml.cli:main"
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
# Reference vector adapter (vector_only): pinned embedding model + exact cosine.
|
|
40
|
+
backends = ["sentence-transformers>=2.2", "numpy>=1.24"]
|
|
41
|
+
store = ["sqlite-vec", "apsw"]
|
|
42
|
+
crypto = ["cryptography>=41"]
|
|
43
|
+
# Production server: FastAPI + MCP + batched ingestion.
|
|
44
|
+
server = ["fastapi>=0.110", "uvicorn[standard]>=0.29", "mcp>=1.0", "pydantic>=2.0"]
|
|
45
|
+
all = ["grafomem[crypto,backends,store,server]"]
|
|
46
|
+
dev = ["pytest>=8", "httpx>=0.27"]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["src"]
|
|
50
|
+
|
|
51
|
+
[tool.pytest.ini_options]
|
|
52
|
+
testpaths = ["tests"]
|
grafomem-0.2.0/setup.cfg
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GRAFOMEM adapter pre-flight checker.
|
|
3
|
+
|
|
4
|
+
grafomem check -b my_module:MyBackend
|
|
5
|
+
|
|
6
|
+
Quick structural validation BEFORE running the full conformance suite:
|
|
7
|
+
checks Protocol compliance, method signatures, capability coherence,
|
|
8
|
+
and basic round-trip. Fails fast with actionable error messages.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import inspect
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def check_adapter(cls: type) -> list[str]:
    """Run pre-flight checks against a backend class.

    The checks run in order and stop early when a later step could not
    meaningfully proceed:

    1. every required method exists on the class and is callable;
    2. the class can be constructed with no arguments;
    3. ``capabilities()`` returns a ``set`` of ``Capability`` members;
    4. the instance passes ``isinstance(..., MemoryBackend)``;
    5. a ``write`` / ``flush`` / ``retrieve`` round-trip runs and returns
       a non-``None`` ref and a ``list``.

    Args:
        cls: The backend class (not an instance) to validate.

    Returns:
        A list of human-readable error messages. An empty list means the
        adapter is structurally conformant.
    """
    errors: list[str] = []

    # 1. Check required methods exist
    required_methods = ["capabilities", "write", "retrieve", "delete",
                        "supersede", "audit", "flush"]
    for method in required_methods:
        if not hasattr(cls, method):
            errors.append(f"Missing required method: {method}()")
        elif not callable(getattr(cls, method)):
            errors.append(f"{method} exists but is not callable")

    if errors:
        return errors  # can't proceed without basic methods

    # 2. Try to instantiate
    instance = None
    try:
        # Check if __init__ needs arguments beyond self.
        # *args / **kwargs never have a default but are not required either;
        # a class without its own __init__ inherits object.__init__, whose
        # signature is (self, /, *args, **kwargs), so they must be skipped
        # or every such class would be reported as needing arguments.
        variadic_kinds = (inspect.Parameter.VAR_POSITIONAL,
                          inspect.Parameter.VAR_KEYWORD)
        sig = inspect.signature(cls.__init__)
        params = [p for p in sig.parameters.values()
                  if p.name != "self"
                  and p.kind not in variadic_kinds
                  and p.default is inspect.Parameter.empty]
        if params:
            errors.append(
                f"Constructor requires arguments: {[p.name for p in params]}. "
                f"The conformance suite needs a zero-arg factory. Consider using "
                f"a wrapper lambda: lambda: MyBackend(arg1, arg2)"
            )
            return errors
        instance = cls()
    except Exception as e:
        errors.append(f"Failed to instantiate {cls.__name__}(): {e}")
        return errors

    # 3. Check capabilities() returns set[Capability]
    # The import lives inside the try, so an import failure is reported as an
    # error string rather than propagating to the caller.
    try:
        from aml.backends.interface import Capability
        caps = instance.capabilities()
        if not isinstance(caps, set):
            errors.append(f"capabilities() returned {type(caps).__name__}, expected set")
        else:
            for c in caps:
                if not isinstance(c, Capability):
                    errors.append(f"capabilities() contains {c!r} which is not a Capability enum member")
    except Exception as e:
        errors.append(f"capabilities() raised: {e}")

    # 4. Check Protocol compliance
    # NOTE(review): presumably MemoryBackend is a runtime-checkable Protocol;
    # if it is not, isinstance raises and the failure is reported below.
    try:
        from aml.backends.interface import MemoryBackend
        if not isinstance(instance, MemoryBackend):
            errors.append(
                f"{cls.__name__} does not satisfy the MemoryBackend Protocol. "
                f"Check method signatures match the Protocol definition."
            )
    except Exception as e:
        errors.append(f"Protocol check failed: {e}")

    # 5. Basic round-trip: write + retrieve
    try:
        from aml.backends.interface import WriteOptions, RetrieveOptions
        ref = instance.write("test content", WriteOptions())
        if ref is None:
            errors.append("write() returned None; must return a MemoryRef")
        instance.flush()
        mems = instance.retrieve("test", RetrieveOptions(budget_tokens=1024))
        if not isinstance(mems, list):
            errors.append(f"retrieve() returned {type(mems).__name__}, expected list")
    except Exception as e:
        errors.append(f"Basic write/retrieve round-trip failed: {e}")

    return errors
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def print_check(cls: type) -> bool:
    """Run the pre-flight checks for *cls* and print a human-readable report.

    Prints a header, then either one line per error plus a summary, or a
    set of success lines including the capabilities declared by a freshly
    constructed instance.

    Returns:
        True when ``check_adapter`` reported no errors, False otherwise.
    """
    print(f"GRAFOMEM adapter check — {cls.__name__}\n")

    problems = check_adapter(cls)

    # Failure path first: list every problem, then the summary line.
    if problems:
        for problem in problems:
            print(f"✗ {problem}")
        print(f"\n{len(problems)} error(s). Fix these before running the conformance suite.")
        return False

    # Success path: also show declared capabilities. Any exception while
    # building the detailed report falls back to a single generic line.
    try:
        backend = cls()
        declared = backend.capabilities()
        for line in (
            "✓ Protocol compliance OK",
            "✓ Method signatures OK",
            "✓ Write/retrieve round-trip OK",
        ):
            print(line)
        print(f"✓ Declared capabilities: {{{', '.join(sorted(c.value for c in declared))}}}")
        print("\nAdapter is structurally conformant. Run `grafomem conformance` for full suite.")
    except Exception:
        print("✓ Structural checks passed")
    return True
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
# Smoke
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
if __name__ == "__main__":
    # Smoke run: check one shipped backend and one deliberately broken class.
    from aml.backends.gmp_reference import GMPReferenceBackend
    from aml.backends.vector_only import _stub_embedder, VectorOnlyBackend
    from aml.backends.persistence import PersistenceBackend

    smoke_targets = (
        PersistenceBackend,
        type("BadBackend", (), {}),  # deliberately broken
    )
    for target in smoke_targets:
        print_check(target)
        print()

    print("✓ Adapter check module smoke green.")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Backend adapters + the MemoryBackend interface contract."""
|