grafomem 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. grafomem-0.2.0/LICENSE +21 -0
  2. grafomem-0.2.0/PKG-INFO +227 -0
  3. grafomem-0.2.0/README.md +183 -0
  4. grafomem-0.2.0/pyproject.toml +52 -0
  5. grafomem-0.2.0/setup.cfg +4 -0
  6. grafomem-0.2.0/src/aml/__init__.py +3 -0
  7. grafomem-0.2.0/src/aml/adapter_check.py +136 -0
  8. grafomem-0.2.0/src/aml/backends/__init__.py +1 -0
  9. grafomem-0.2.0/src/aml/backends/bi_temporal.py +225 -0
  10. grafomem-0.2.0/src/aml/backends/bounded_vector.py +197 -0
  11. grafomem-0.2.0/src/aml/backends/conflict_backends.py +333 -0
  12. grafomem-0.2.0/src/aml/backends/cross_session_backends.py +267 -0
  13. grafomem-0.2.0/src/aml/backends/delete_backends.py +254 -0
  14. grafomem-0.2.0/src/aml/backends/gmp_reference.py +285 -0
  15. grafomem-0.2.0/src/aml/backends/interface.py +517 -0
  16. grafomem-0.2.0/src/aml/backends/isolation_backends.py +524 -0
  17. grafomem-0.2.0/src/aml/backends/persistence.py +143 -0
  18. grafomem-0.2.0/src/aml/backends/retention_backends.py +215 -0
  19. grafomem-0.2.0/src/aml/backends/sqlite_gmp.py +525 -0
  20. grafomem-0.2.0/src/aml/backends/supersession_chain.py +187 -0
  21. grafomem-0.2.0/src/aml/backends/tenant_backends.py +251 -0
  22. grafomem-0.2.0/src/aml/backends/vector_only.py +226 -0
  23. grafomem-0.2.0/src/aml/cli.py +627 -0
  24. grafomem-0.2.0/src/aml/eval/__init__.py +1 -0
  25. grafomem-0.2.0/src/aml/eval/concurrency.py +456 -0
  26. grafomem-0.2.0/src/aml/eval/concurrency_runner.py +320 -0
  27. grafomem-0.2.0/src/aml/eval/conformance.py +545 -0
  28. grafomem-0.2.0/src/aml/eval/harness.py +279 -0
  29. grafomem-0.2.0/src/aml/eval/metrics.py +268 -0
  30. grafomem-0.2.0/src/aml/eval/report.py +307 -0
  31. grafomem-0.2.0/src/aml/generator/__init__.py +1 -0
  32. grafomem-0.2.0/src/aml/generator/oracle.py +502 -0
  33. grafomem-0.2.0/src/aml/generator/trace.py +1020 -0
  34. grafomem-0.2.0/src/aml/generator/validators.py +447 -0
  35. grafomem-0.2.0/src/aml/generator/workloads/__init__.py +1 -0
  36. grafomem-0.2.0/src/aml/generator/workloads/w1.py +369 -0
  37. grafomem-0.2.0/src/aml/generator/workloads/w10.py +412 -0
  38. grafomem-0.2.0/src/aml/generator/workloads/w2.py +280 -0
  39. grafomem-0.2.0/src/aml/generator/workloads/w3.py +269 -0
  40. grafomem-0.2.0/src/aml/generator/workloads/w4.py +234 -0
  41. grafomem-0.2.0/src/aml/generator/workloads/w5.py +253 -0
  42. grafomem-0.2.0/src/aml/generator/workloads/w6.py +254 -0
  43. grafomem-0.2.0/src/aml/generator/workloads/w7.py +259 -0
  44. grafomem-0.2.0/src/aml/generator/workloads/w8.py +242 -0
  45. grafomem-0.2.0/src/aml/generator/workloads/w9.py +309 -0
  46. grafomem-0.2.0/src/aml/provenance.py +114 -0
  47. grafomem-0.2.0/src/aml/server/__init__.py +1 -0
  48. grafomem-0.2.0/src/aml/server/app.py +362 -0
  49. grafomem-0.2.0/src/aml/server/auth.py +102 -0
  50. grafomem-0.2.0/src/aml/server/ingestion.py +244 -0
  51. grafomem-0.2.0/src/aml/server/mcp.py +266 -0
  52. grafomem-0.2.0/src/aml/server/stores.py +113 -0
  53. grafomem-0.2.0/src/aml/wire.py +446 -0
  54. grafomem-0.2.0/src/grafomem.egg-info/PKG-INFO +227 -0
  55. grafomem-0.2.0/src/grafomem.egg-info/SOURCES.txt +61 -0
  56. grafomem-0.2.0/src/grafomem.egg-info/dependency_links.txt +1 -0
  57. grafomem-0.2.0/src/grafomem.egg-info/entry_points.txt +2 -0
  58. grafomem-0.2.0/src/grafomem.egg-info/requires.txt +26 -0
  59. grafomem-0.2.0/src/grafomem.egg-info/top_level.txt +1 -0
  60. grafomem-0.2.0/tests/test_concurrency.py +97 -0
  61. grafomem-0.2.0/tests/test_conformance.py +64 -0
  62. grafomem-0.2.0/tests/test_corpus.py +66 -0
  63. grafomem-0.2.0/tests/test_interface.py +184 -0
grafomem-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 GNS Foundation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,227 @@
1
+ Metadata-Version: 2.4
2
+ Name: grafomem
3
+ Version: 0.2.0
4
+ Summary: GRAFOMEM — agent-memory conformance benchmark and compliance toolkit
5
+ Author: Camilo Ayerbe Posada
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/GNS-Foundation/grafomem
8
+ Project-URL: Documentation, https://github.com/GNS-Foundation/grafomem/tree/main/docs
9
+ Project-URL: Repository, https://github.com/GNS-Foundation/grafomem
10
+ Project-URL: Issues, https://github.com/GNS-Foundation/grafomem/issues
11
+ Project-URL: API, https://grafomem-production.up.railway.app/docs
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.11
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: jsonschema>=4
24
+ Requires-Dist: click>=8
25
+ Provides-Extra: backends
26
+ Requires-Dist: sentence-transformers>=2.2; extra == "backends"
27
+ Requires-Dist: numpy>=1.24; extra == "backends"
28
+ Provides-Extra: store
29
+ Requires-Dist: sqlite-vec; extra == "store"
30
+ Requires-Dist: apsw; extra == "store"
31
+ Provides-Extra: crypto
32
+ Requires-Dist: cryptography>=41; extra == "crypto"
33
+ Provides-Extra: server
34
+ Requires-Dist: fastapi>=0.110; extra == "server"
35
+ Requires-Dist: uvicorn[standard]>=0.29; extra == "server"
36
+ Requires-Dist: mcp>=1.0; extra == "server"
37
+ Requires-Dist: pydantic>=2.0; extra == "server"
38
+ Provides-Extra: all
39
+ Requires-Dist: grafomem[backends,crypto,server,store]; extra == "all"
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest>=8; extra == "dev"
42
+ Requires-Dist: httpx>=0.27; extra == "dev"
43
+ Dynamic: license-file
44
+
45
+ # GRAFOMEM
46
+
47
+ **An agent-memory benchmark that became a memory protocol.**
48
+
49
+ GRAFOMEM began as a benchmark for one question — *what should a standard for agent
50
+ memory actually specify?* — and turned into the answer: a benchmark, a protocol
51
+ (**GMP**), an executable conformance suite, and a certified, persistent reference
52
+ implementation. The thesis in one line:
53
+
54
+ > Memory capabilities are **orthogonal**, a **declared** capability is not the same as
55
+ > **observed** behavior, and the only way to tell them apart is to **test** — so agent
56
+ > memory should be specified and conformance-checked like any other protocol.
57
+
58
+ Clean-room research project. [grafomem.com](https://grafomem.com)
59
+
60
+ ---
61
+
62
+ ## The thesis, in three results
63
+
64
+ 1. **Four orthogonal axes.** A memory standard must separately specify: representational
65
+ capability (versioning / supersession), embedding quality, retention policy, and a
66
+ two-sided privacy primitive (deletion **and** tenant isolation). The benchmark shows
67
+ these are separately specifiable and verifiable.
68
+ 2. **Claims ≠ behavior.** A backend can *declare* `HARD_DELETE` or `MULTI_TENANT` and
69
+ still leak forbidden data (findings F10, F12). The declaration is not the guarantee.
70
+ This is the load-bearing result — and the reason a conformance suite has to exist.
71
+ 3. **Protocol + conformance.** "Supports capability X" is defined operationally:
72
+ *passes the conformance suite for X.* The spec, the suite, and implementations that
73
+ certify against it all exist and agree.
74
+
75
+ ---
76
+
77
+ ## The stack
78
+
79
+ | Layer | What it is | Where |
80
+ |---|---|---|
81
+ | **Benchmark** | 10 workloads (W1–W10), 20 findings; locked corpus 135 traces / 61,754 turns / 17,612 queries (v0.2.0) | `src/aml/generator/`, `scripts/run_w*.py` |
82
+ | **Paper** | arXiv technical report | `docs/grafomem-paper.pdf` |
83
+ | **Spec** | GMP v0.2 (draft) — protocol semantics (RFC 2119) | `docs/gmp-spec-v0.2.md` |
84
+ | **Conformance** | executable §8: `supports X` ≝ passes the suite for X | `src/aml/eval/conformance.py` |
85
+ | **Reference** | in-memory backend, self-certifying | `src/aml/backends/gmp_reference.py` |
86
+ | **Wire** | HTTP + JSON binding; the client *is* a `MemoryBackend` | `src/aml/wire.py` |
87
+ | **Store** | persistent SQLite + sqlite-vec; survives restart | `src/aml/backends/sqlite_gmp.py` |
88
+
89
+ Each layer certifies the one beside it. The reference backend runs the conformance
90
+ suite **on itself**; the wire client runs the *same* suite **over a socket**; the
91
+ SQLite store runs it **on a file**. The contract is transport- and
92
+ implementation-independent by construction — not by assertion.
93
+
94
+ **v0.2:** W7–W10 built (findings F14–F20); W7, W9, W10 corpus-locked into v0.2.0, W8 held out; W10 (operational concurrency & isolation) gated by M8 isolation conformance — §4.10, gmp-spec §10.
95
+
96
+ ---
97
+
98
+ ## Key findings
99
+
100
+ The benchmark is the evidence base. Full table in the paper (Appendix C); the
101
+ load-bearing ones:
102
+
103
+ - **Capabilities are inert without the workload that needs them.** On a pure-vector
104
+ retrieval task, declaring `SUPERSESSION_CHAIN` / `BI_TEMPORAL` changes nothing
105
+ (Δ = +0.000). On a drift task, supersession recovers recall from 0.281 → 0.867 at a
106
+ tight budget (**+0.585**). The capability matters exactly when the workload exercises it.
107
+ - **The embedder is the lever, not the capabilities.** Swapping the stub for a real
108
+ embedder moves recall +0.510 at budget 32; toggling capabilities on the same task
109
+ moves it +0.000.
110
+ - **Declared ≠ honest (F10, F12).** A backend that claims `HARD_DELETE` but soft-deletes
111
+ leaks deleted facts with probability 1.0; one that claims `MULTI_TENANT` but shares an
112
+ index leaks across tenants with probability 1.0. Both *pass their own type signature*
113
+ and *fail conformance*. Deletion and tenant isolation unify at the read path — a single
114
+ `Forbidden(q)` set — which is why one two-sided test catches both.
115
+
116
+ ---
117
+
118
+ ## Latency — reference store, locked corpus
119
+
120
+ The W1–W6 subset ingested into one growing SQLite + sqlite-vec store (N = 38,882
121
+ rows), BGE-small embeddings, on an Apple-Silicon laptop. Numbers are post-v0.2 (the
122
+ metadata-column pre-filter):
123
+
124
+ | op | count | p50 | p95 |
125
+ |---|---:|---:|---:|
126
+ | write | 37,015 | 10.17ms | 13.66ms |
127
+ | supersede | 2,262 | 9.85ms | 12.90ms |
128
+ | delete | 395 | **0.03ms** | 0.05ms |
129
+ | retrieve | 15,342 | 11.27ms | **27.80ms** |
130
+
131
+ What the numbers say:
132
+
133
+ - **The embedder is the floor.** Every op that embeds sits at ~10ms; `delete` (the one
134
+ op that doesn't) is 0.03ms. The store's own machinery is sub-millisecond. Single-item
135
+ write throughput is ~97/s on MPS — and a `write_many` bulk-ingest path that batches the
136
+ embedder under one transaction hits **847/s (8.6×)** with an identical resulting store,
137
+ confirming the embedder, not the store, was the entire write cost.
138
+ - **The v0.2 pre-filter crushed the tail.** In v0.1, selective queries (`as_of`, tenant)
139
+ ranked-then-filtered and triggered an adaptive widening loop, putting retrieve p95 at
140
+ **82ms**. v0.2 pushes the tenant/valid-time predicate into the KNN as metadata columns
141
+ and bounds `k` by the char budget — p95 fell to **27.80ms** (−66%) with identical
142
+ results, and the high-N p50 sits *at or below* v0.1's flat region.
143
+ - **Retrieve p50 still grows with N** (~10ms under 10k → ~25ms at 25–50k). sqlite-vec is
144
+ brute-force — there is no ANN index — so the scan is O(N). The pre-filter made
145
+ retrieval *correct and tight*, not *sublinear*; the next lever for retrieve-at-scale is
146
+ an actual ANN index, not more tuning. At 38k rows / 25ms p50 it isn't needed yet.
147
+ - **sqlite-vec ≈ numpy brute force** on pure-vector workloads (every bucket within ~1ms).
148
+ The store's value at this scale is **persistence and not pinning the corpus in RAM**,
149
+ not speed — confirming the in-memory reference is the right default below large scale.
150
+
151
+ ---
152
+
153
+ ## Running it
154
+
155
+ Editable install (src-layout; this also puts `aml` on the path permanently):
156
+
157
+ ```bash
158
+ python -m venv .venv && source .venv/bin/activate
159
+ pip install -e ".[backends]" # aml + sentence-transformers + torch
160
+ pip install sqlite-vec apsw # for the persistent store
161
+ ```
162
+
163
+ > **macOS note:** the store needs SQLite extension loading. If your Python's `sqlite3`
164
+ > lacks it, the backend transparently falls back to `apsw` (bundled SQLite). Keep `apsw`
165
+ > installed.
166
+
167
+ **Self-validating smokes** — each stands up a backend and runs the conformance suite
168
+ against it:
169
+
170
+ ```bash
171
+ python -m aml.backends.gmp_reference # reference impl certifies itself
172
+ python -m aml.wire # conformance suite passes over HTTP
173
+ python -m aml.backends.sqlite_gmp # persistent store: survives reopen + passes suite
174
+ ```
175
+
176
+ **Benchmark experiments** and the **scale probe**:
177
+
178
+ ```bash
179
+ python -m scripts.run_w1 # F1, F2 — vector vs recency floor + budget sweep
180
+ python -m scripts.run_w2 # F3–F5 — drift, supersession, bi-temporal
181
+ python -m scripts.run_w3 # F6, F7 — distractor noise; the embedder lever
182
+ python -m scripts.scale_probe # corpus latency + sqlite-vec vs brute force
183
+ ```
184
+
185
+ ---
186
+
187
+ ## Status & roadmap
188
+
189
+ **v0.1 — complete.** Benchmark, paper, GMP v0.1 spec, conformance suite, in-memory
190
+ reference (certified), HTTP+JSON wire binding (suite passes over a socket), persistent
191
+ SQLite + sqlite-vec store (survives restart, passes the full profile), scale probe.
192
+ v0.1 normative subset: `{AUDIT, SUPERSESSION_CHAIN, BI_TEMPORAL, HARD_DELETE, MULTI_TENANT}`.
193
+
194
+ **v0.2 — in progress.**
195
+
196
+ - **Metadata-column pre-filter** in the store — **done.** `tenant_id` / valid-time live
197
+ in the vec0 table (nulls sentinel-encoded, since sqlite-vec metadata filters don't
198
+ support `IS NULL`); the KNN filters natively and `k` is bounded by the char budget.
199
+ Drops retrieve p95 82→28ms with identical results and exact selective-filter retrieval;
200
+ the O(N) brute-force scan remains (no ANN index — a separate lever, not needed at this
201
+ scale).
202
+ - **Batched embedding** on the ingest path — **done.** A `write_many` fast-path embeds a
203
+ batch in one forward pass under one transaction: **97 → 847 items/s (8.6×)**, same
204
+ resulting store. Optional accelerator; the single-`write` Protocol path is unchanged.
205
+ - **Reserved capabilities — provenance pair done.** `PROVENANCE` (normative) and
206
+ `CRYPTOGRAPHIC_PROVENANCE` (optional) are implemented in both backends and certified by the
207
+ v0.2 conformance suite: Ed25519 over a content-store fact_id, `source` persisted (signatures
208
+ survive restart). Provenance has **no benchmark workload by design** — it is integrity metadata,
209
+ "verifiability, not ranking" (gmp-spec §7.5), so the suite tests it with constructed probes like
210
+ `AUDIT`, not a retrieval workload (§8.3).
211
+ - **Workloads W7–W9 — built.** W7 (Conflict Detection), W8 (Forgetting Curve), W9 (Cross-Session
212
+ Deletion); generators, backend spectrums, runners, findings F14–F18. W7 and W9 are corpus-locked
213
+ into v0.2.0; W8 is held out pending the summarise/merge retention variant. These workloads house the last
214
+ two reserved flags — `CONFLICT_DETECTION` (W7) and `CROSS_SESSION_PROPAGATION` (W9), now
215
+ **un-reserved** in gmp-spec §7.4.
216
+ - **W10 — Operational Concurrency & Isolation — built and corpus-locked.** Trace-schema v0.2
217
+ carries set-valued ground truth; `interface.py` adds `submit_concurrent` (gated by
218
+ `CONCURRENCY_CONTROL`, the 10th flag), `IsolationPolicy`, and a `declared_policy`
219
+ self-description on the concurrent backend. A five-store spectrum (serializable →
220
+ resurrecting) drives **M8 isolation conformance**, which catches both the over-claimer
221
+ (declares serializable, delivers read-committed — F19) and the §10.4 durability violator
222
+ (resurrects a committed delete — F20). Locked into v0.2.0 (135 traces). The suite's last
223
+ open frontier, now closed.
224
+
225
+ The arc is protocol-first: the spec and suite are the standard; the implementations are
226
+ the proof it's real and runnable. "Postgres for agent memory" is the destination, not
227
+ the starting point.
@@ -0,0 +1,183 @@
1
+ # GRAFOMEM
2
+
3
+ **An agent-memory benchmark that became a memory protocol.**
4
+
5
+ GRAFOMEM began as a benchmark for one question — *what should a standard for agent
6
+ memory actually specify?* — and turned into the answer: a benchmark, a protocol
7
+ (**GMP**), an executable conformance suite, and a certified, persistent reference
8
+ implementation. The thesis in one line:
9
+
10
+ > Memory capabilities are **orthogonal**, a **declared** capability is not the same as
11
+ > **observed** behavior, and the only way to tell them apart is to **test** — so agent
12
+ > memory should be specified and conformance-checked like any other protocol.
13
+
14
+ Clean-room research project. [grafomem.com](https://grafomem.com)
15
+
16
+ ---
17
+
18
+ ## The thesis, in three results
19
+
20
+ 1. **Four orthogonal axes.** A memory standard must separately specify: representational
21
+ capability (versioning / supersession), embedding quality, retention policy, and a
22
+ two-sided privacy primitive (deletion **and** tenant isolation). The benchmark shows
23
+ these are separately specifiable and verifiable.
24
+ 2. **Claims ≠ behavior.** A backend can *declare* `HARD_DELETE` or `MULTI_TENANT` and
25
+ still leak forbidden data (findings F10, F12). The declaration is not the guarantee.
26
+ This is the load-bearing result — and the reason a conformance suite has to exist.
27
+ 3. **Protocol + conformance.** "Supports capability X" is defined operationally:
28
+ *passes the conformance suite for X.* The spec, the suite, and implementations that
29
+ certify against it all exist and agree.
30
+
31
+ ---
32
+
33
+ ## The stack
34
+
35
+ | Layer | What it is | Where |
36
+ |---|---|---|
37
+ | **Benchmark** | 10 workloads (W1–W10), 20 findings; locked corpus 135 traces / 61,754 turns / 17,612 queries (v0.2.0) | `src/aml/generator/`, `scripts/run_w*.py` |
38
+ | **Paper** | arXiv technical report | `docs/grafomem-paper.pdf` |
39
+ | **Spec** | GMP v0.2 (draft) — protocol semantics (RFC 2119) | `docs/gmp-spec-v0.2.md` |
40
+ | **Conformance** | executable §8: `supports X` ≝ passes the suite for X | `src/aml/eval/conformance.py` |
41
+ | **Reference** | in-memory backend, self-certifying | `src/aml/backends/gmp_reference.py` |
42
+ | **Wire** | HTTP + JSON binding; the client *is* a `MemoryBackend` | `src/aml/wire.py` |
43
+ | **Store** | persistent SQLite + sqlite-vec; survives restart | `src/aml/backends/sqlite_gmp.py` |
44
+
45
+ Each layer certifies the one beside it. The reference backend runs the conformance
46
+ suite **on itself**; the wire client runs the *same* suite **over a socket**; the
47
+ SQLite store runs it **on a file**. The contract is transport- and
48
+ implementation-independent by construction — not by assertion.
49
+
50
+ **v0.2:** W7–W10 built (findings F14–F20); W7, W9, W10 corpus-locked into v0.2.0, W8 held out; W10 (operational concurrency & isolation) gated by M8 isolation conformance — §4.10, gmp-spec §10.
51
+
52
+ ---
53
+
54
+ ## Key findings
55
+
56
+ The benchmark is the evidence base. Full table in the paper (Appendix C); the
57
+ load-bearing ones:
58
+
59
+ - **Capabilities are inert without the workload that needs them.** On a pure-vector
60
+ retrieval task, declaring `SUPERSESSION_CHAIN` / `BI_TEMPORAL` changes nothing
61
+ (Δ = +0.000). On a drift task, supersession recovers recall from 0.281 → 0.867 at a
62
+ tight budget (**+0.585**). The capability matters exactly when the workload exercises it.
63
+ - **The embedder is the lever, not the capabilities.** Swapping the stub for a real
64
+ embedder moves recall +0.510 at budget 32; toggling capabilities on the same task
65
+ moves it +0.000.
66
+ - **Declared ≠ honest (F10, F12).** A backend that claims `HARD_DELETE` but soft-deletes
67
+ leaks deleted facts with probability 1.0; one that claims `MULTI_TENANT` but shares an
68
+ index leaks across tenants with probability 1.0. Both *pass their own type signature*
69
+ and *fail conformance*. Deletion and tenant isolation unify at the read path — a single
70
+ `Forbidden(q)` set — which is why one two-sided test catches both.
71
+
72
+ ---
73
+
74
+ ## Latency — reference store, locked corpus
75
+
76
+ The W1–W6 subset ingested into one growing SQLite + sqlite-vec store (N = 38,882
77
+ rows), BGE-small embeddings, on an Apple-Silicon laptop. Numbers are post-v0.2 (the
78
+ metadata-column pre-filter):
79
+
80
+ | op | count | p50 | p95 |
81
+ |---|---:|---:|---:|
82
+ | write | 37,015 | 10.17ms | 13.66ms |
83
+ | supersede | 2,262 | 9.85ms | 12.90ms |
84
+ | delete | 395 | **0.03ms** | 0.05ms |
85
+ | retrieve | 15,342 | 11.27ms | **27.80ms** |
86
+
87
+ What the numbers say:
88
+
89
+ - **The embedder is the floor.** Every op that embeds sits at ~10ms; `delete` (the one
90
+ op that doesn't) is 0.03ms. The store's own machinery is sub-millisecond. Single-item
91
+ write throughput is ~97/s on MPS — and a `write_many` bulk-ingest path that batches the
92
+ embedder under one transaction hits **847/s (8.6×)** with an identical resulting store,
93
+ confirming the embedder, not the store, was the entire write cost.
94
+ - **The v0.2 pre-filter crushed the tail.** In v0.1, selective queries (`as_of`, tenant)
95
+ ranked-then-filtered and triggered an adaptive widening loop, putting retrieve p95 at
96
+ **82ms**. v0.2 pushes the tenant/valid-time predicate into the KNN as metadata columns
97
+ and bounds `k` by the char budget — p95 fell to **27.80ms** (−66%) with identical
98
+ results, and the high-N p50 sits *at or below* v0.1's flat region.
99
+ - **Retrieve p50 still grows with N** (~10ms under 10k → ~25ms at 25–50k). sqlite-vec is
100
+ brute-force — there is no ANN index — so the scan is O(N). The pre-filter made
101
+ retrieval *correct and tight*, not *sublinear*; the next lever for retrieve-at-scale is
102
+ an actual ANN index, not more tuning. At 38k rows / 25ms p50 it isn't needed yet.
103
+ - **sqlite-vec ≈ numpy brute force** on pure-vector workloads (every bucket within ~1ms).
104
+ The store's value at this scale is **persistence and not pinning the corpus in RAM**,
105
+ not speed — confirming the in-memory reference is the right default below large scale.
106
+
107
+ ---
108
+
109
+ ## Running it
110
+
111
+ Editable install (src-layout; this also puts `aml` on the path permanently):
112
+
113
+ ```bash
114
+ python -m venv .venv && source .venv/bin/activate
115
+ pip install -e ".[backends]" # aml + sentence-transformers + torch
116
+ pip install sqlite-vec apsw # for the persistent store
117
+ ```
118
+
119
+ > **macOS note:** the store needs SQLite extension loading. If your Python's `sqlite3`
120
+ > lacks it, the backend transparently falls back to `apsw` (bundled SQLite). Keep `apsw`
121
+ > installed.
122
+
123
+ **Self-validating smokes** — each stands up a backend and runs the conformance suite
124
+ against it:
125
+
126
+ ```bash
127
+ python -m aml.backends.gmp_reference # reference impl certifies itself
128
+ python -m aml.wire # conformance suite passes over HTTP
129
+ python -m aml.backends.sqlite_gmp # persistent store: survives reopen + passes suite
130
+ ```
131
+
132
+ **Benchmark experiments** and the **scale probe**:
133
+
134
+ ```bash
135
+ python -m scripts.run_w1 # F1, F2 — vector vs recency floor + budget sweep
136
+ python -m scripts.run_w2 # F3–F5 — drift, supersession, bi-temporal
137
+ python -m scripts.run_w3 # F6, F7 — distractor noise; the embedder lever
138
+ python -m scripts.scale_probe # corpus latency + sqlite-vec vs brute force
139
+ ```
140
+
141
+ ---
142
+
143
+ ## Status & roadmap
144
+
145
+ **v0.1 — complete.** Benchmark, paper, GMP v0.1 spec, conformance suite, in-memory
146
+ reference (certified), HTTP+JSON wire binding (suite passes over a socket), persistent
147
+ SQLite + sqlite-vec store (survives restart, passes the full profile), scale probe.
148
+ v0.1 normative subset: `{AUDIT, SUPERSESSION_CHAIN, BI_TEMPORAL, HARD_DELETE, MULTI_TENANT}`.
149
+
150
+ **v0.2 — in progress.**
151
+
152
+ - **Metadata-column pre-filter** in the store — **done.** `tenant_id` / valid-time live
153
+ in the vec0 table (nulls sentinel-encoded, since sqlite-vec metadata filters don't
154
+ support `IS NULL`); the KNN filters natively and `k` is bounded by the char budget.
155
+ Drops retrieve p95 82→28ms with identical results and exact selective-filter retrieval;
156
+ the O(N) brute-force scan remains (no ANN index — a separate lever, not needed at this
157
+ scale).
158
+ - **Batched embedding** on the ingest path — **done.** A `write_many` fast-path embeds a
159
+ batch in one forward pass under one transaction: **97 → 847 items/s (8.6×)**, same
160
+ resulting store. Optional accelerator; the single-`write` Protocol path is unchanged.
161
+ - **Reserved capabilities — provenance pair done.** `PROVENANCE` (normative) and
162
+ `CRYPTOGRAPHIC_PROVENANCE` (optional) are implemented in both backends and certified by the
163
+ v0.2 conformance suite: Ed25519 over a content-store fact_id, `source` persisted (signatures
164
+ survive restart). Provenance has **no benchmark workload by design** — it is integrity metadata,
165
+ "verifiability, not ranking" (gmp-spec §7.5), so the suite tests it with constructed probes like
166
+ `AUDIT`, not a retrieval workload (§8.3).
167
+ - **Workloads W7–W9 — built.** W7 (Conflict Detection), W8 (Forgetting Curve), W9 (Cross-Session
168
+ Deletion); generators, backend spectrums, runners, findings F14–F18. W7 and W9 are corpus-locked
169
+ into v0.2.0; W8 is held out pending the summarise/merge retention variant. These workloads house the last
170
+ two reserved flags — `CONFLICT_DETECTION` (W7) and `CROSS_SESSION_PROPAGATION` (W9), now
171
+ **un-reserved** in gmp-spec §7.4.
172
+ - **W10 — Operational Concurrency & Isolation — built and corpus-locked.** Trace-schema v0.2
173
+ carries set-valued ground truth; `interface.py` adds `submit_concurrent` (gated by
174
+ `CONCURRENCY_CONTROL`, the 10th flag), `IsolationPolicy`, and a `declared_policy`
175
+ self-description on the concurrent backend. A five-store spectrum (serializable →
176
+ resurrecting) drives **M8 isolation conformance**, which catches both the over-claimer
177
+ (declares serializable, delivers read-committed — F19) and the §10.4 durability violator
178
+ (resurrects a committed delete — F20). Locked into v0.2.0 (135 traces). The suite's last
179
+ open frontier, now closed.
180
+
181
+ The arc is protocol-first: the spec and suite are the standard; the implementations are
182
+ the proof it's real and runnable. "Postgres for agent memory" is the destination, not
183
+ the starting point.
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "grafomem"
7
+ version = "0.2.0"
8
+ description = "GRAFOMEM — agent-memory conformance benchmark and compliance toolkit"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Camilo Ayerbe Posada" }]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
+ ]
23
+ dependencies = [
24
+ "jsonschema>=4",
25
+ "click>=8",
26
+ ]
27
+
28
+ [project.urls]
29
+ Homepage = "https://github.com/GNS-Foundation/grafomem"
30
+ Documentation = "https://github.com/GNS-Foundation/grafomem/tree/main/docs"
31
+ Repository = "https://github.com/GNS-Foundation/grafomem"
32
+ Issues = "https://github.com/GNS-Foundation/grafomem/issues"
33
+ API = "https://grafomem-production.up.railway.app/docs"
34
+
35
+ [project.scripts]
36
+ grafomem = "aml.cli:main"
37
+
38
+ [project.optional-dependencies]
39
+ # Reference vector adapter (vector_only): pinned embedding model + exact cosine.
40
+ backends = ["sentence-transformers>=2.2", "numpy>=1.24"]
41
+ store = ["sqlite-vec", "apsw"]
42
+ crypto = ["cryptography>=41"]
43
+ # Production server: FastAPI + MCP + batched ingestion.
44
+ server = ["fastapi>=0.110", "uvicorn[standard]>=0.29", "mcp>=1.0", "pydantic>=2.0"]
45
+ all = ["grafomem[crypto,backends,store,server]"]
46
+ dev = ["pytest>=8", "httpx>=0.27"]
47
+
48
+ [tool.setuptools.packages.find]
49
+ where = ["src"]
50
+
51
+ [tool.pytest.ini_options]
52
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ """GRAFOMEM — agent-memory benchmark framework."""
2
+
3
# Runtime version of the `aml` package; kept equal to the distribution
# version declared in pyproject.toml / PKG-INFO (0.2.0).
__version__ = "0.2.0"
@@ -0,0 +1,136 @@
1
+ """
2
+ GRAFOMEM adapter pre-flight checker.
3
+
4
+ grafomem check -b my_module:MyBackend
5
+
6
+ Quick structural validation BEFORE running the full conformance suite:
7
+ checks required methods, zero-arg construction, capabilities() types, Protocol compliance,
8
+ and basic round-trip. Fails fast with actionable error messages.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import inspect
14
+ from typing import Any
15
+
16
+
17
def check_adapter(cls: type) -> list[str]:
    """Run pre-flight checks against a backend class.

    The checks run in order; the first two return immediately on failure
    because later steps need a usable instance:

    1. every required method exists on the class and is callable;
    2. the class can be instantiated with no arguments;
    3. ``capabilities()`` returns a ``set`` whose members are ``Capability``;
    4. the instance passes ``isinstance(..., MemoryBackend)``;
    5. ``write`` / ``flush`` / ``retrieve`` run, ``write`` does not return
       ``None`` and ``retrieve`` returns a ``list``.

    Note that step 5 really writes ``"test content"`` into the freshly
    created instance.

    Args:
        cls: The backend class to inspect.

    Returns:
        A list of human-readable error strings. An empty list means the
        adapter is structurally conformant.
    """
    errors: list[str] = []

    # 1. Check required methods exist
    required_methods = ["capabilities", "write", "retrieve", "delete",
                        "supersede", "audit", "flush"]
    for method in required_methods:
        if not hasattr(cls, method):
            errors.append(f"Missing required method: {method}()")
        elif not callable(getattr(cls, method)):
            errors.append(f"{method} exists but is not callable")

    if errors:
        return errors  # can't proceed without basic methods

    # 2. Try to instantiate
    instance = None
    try:
        # Check if __init__ needs arguments beyond self.
        # *args / **kwargs never have a default but are not required either;
        # skipping them matters because a class without its own __init__
        # exposes object.__init__(self, /, *args, **kwargs).
        variadic = (inspect.Parameter.VAR_POSITIONAL,
                    inspect.Parameter.VAR_KEYWORD)
        sig = inspect.signature(cls.__init__)
        params = [p for p in sig.parameters.values()
                  if p.name != "self"
                  and p.kind not in variadic
                  and p.default is inspect.Parameter.empty]
        if params:
            errors.append(
                f"Constructor requires arguments: {[p.name for p in params]}. "
                f"The conformance suite needs a zero-arg factory. Consider using "
                f"a wrapper lambda: lambda: MyBackend(arg1, arg2)"
            )
            return errors
        instance = cls()
    except Exception as e:
        errors.append(f"Failed to instantiate {cls.__name__}(): {e}")
        return errors

    # 3. Check capabilities() returns set[Capability]
    try:
        from aml.backends.interface import Capability
        caps = instance.capabilities()
        if not isinstance(caps, set):
            errors.append(f"capabilities() returned {type(caps).__name__}, expected set")
        else:
            for c in caps:
                if not isinstance(c, Capability):
                    errors.append(f"capabilities() contains {c!r} which is not a Capability enum member")
    except Exception as e:
        errors.append(f"capabilities() raised: {e}")

    # 4. Check Protocol compliance
    try:
        from aml.backends.interface import MemoryBackend
        if not isinstance(instance, MemoryBackend):
            errors.append(
                f"{cls.__name__} does not satisfy the MemoryBackend Protocol. "
                f"Check method signatures match the Protocol definition."
            )
    except Exception as e:
        errors.append(f"Protocol check failed: {e}")

    # 5. Basic round-trip: write + retrieve
    try:
        from aml.backends.interface import WriteOptions, RetrieveOptions
        ref = instance.write("test content", WriteOptions())
        if ref is None:
            errors.append("write() returned None; must return a MemoryRef")
        instance.flush()
        mems = instance.retrieve("test", RetrieveOptions(budget_tokens=1024))
        if not isinstance(mems, list):
            errors.append(f"retrieve() returned {type(mems).__name__}, expected list")
    except Exception as e:
        errors.append(f"Basic write/retrieve round-trip failed: {e}")

    return errors
91
+
92
+
93
def print_check(cls: type) -> bool:
    """Run the pre-flight checks for *cls* and print a report.

    Prints one line per error when ``check_adapter`` reports problems;
    otherwise prints the success summary, including the capabilities a
    fresh instance declares.

    Returns:
        True when ``check_adapter`` found no errors, False otherwise.
    """
    print(f"GRAFOMEM adapter check — {cls.__name__}\n")

    problems = check_adapter(cls)

    # Failure path first: list every problem and bail out.
    if problems:
        for problem in problems:
            print(f"✗ {problem}")
        print(f"\n{len(problems)} error(s). Fix these before running the conformance suite.")
        return False

    # Success path: also show declared capabilities. Any failure while
    # building the detailed summary falls back to a one-line confirmation.
    try:
        backend = cls()
        declared = backend.capabilities()
        print("✓ Protocol compliance OK")
        print("✓ Method signatures OK")
        print("✓ Write/retrieve round-trip OK")
        names = ', '.join(sorted(cap.value for cap in declared))
        print(f"✓ Declared capabilities: {{{names}}}")
        print("\nAdapter is structurally conformant. Run `grafomem conformance` for full suite.")
    except Exception:
        print("✓ Structural checks passed")
    return True
117
+
118
+
119
+ # ---------------------------------------------------------------------------
120
+ # Smoke
121
+ # ---------------------------------------------------------------------------
122
+
123
if __name__ == "__main__":
    from aml.backends.gmp_reference import GMPReferenceBackend
    from aml.backends.vector_only import _stub_embedder, VectorOnlyBackend
    from aml.backends.persistence import PersistenceBackend

    # Smoke run: report on one shipped backend class and on an empty class
    # that is missing every required method.
    smoke_targets = (
        PersistenceBackend,
        type("BadBackend", (), {}),  # deliberately broken
    )
    for target in smoke_targets:
        print_check(target)
        print()

    print("✓ Adapter check module smoke green.")
@@ -0,0 +1 @@
1
+ """Backend adapters + the MemoryBackend interface contract."""