midas-memory 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. midas_memory-0.0.1/.env.example +5 -0
  2. midas_memory-0.0.1/.github/workflows/ci.yml +22 -0
  3. midas_memory-0.0.1/.gitignore +41 -0
  4. midas_memory-0.0.1/BENCHMARKS.md +260 -0
  5. midas_memory-0.0.1/CHANGELOG.md +115 -0
  6. midas_memory-0.0.1/CONTRIBUTING.md +24 -0
  7. midas_memory-0.0.1/LICENSE +21 -0
  8. midas_memory-0.0.1/PKG-INFO +343 -0
  9. midas_memory-0.0.1/README.md +316 -0
  10. midas_memory-0.0.1/docs/long-horizon-memory.md +323 -0
  11. midas_memory-0.0.1/docs/research-notes.md +149 -0
  12. midas_memory-0.0.1/eval/__init__.py +6 -0
  13. midas_memory-0.0.1/eval/adapters/__init__.py +5 -0
  14. midas_memory-0.0.1/eval/adapters/base.py +35 -0
  15. midas_memory-0.0.1/eval/adapters/baseline_raw.py +41 -0
  16. midas_memory-0.0.1/eval/adapters/mem0_adapter.py +149 -0
  17. midas_memory-0.0.1/eval/adapters/midas_adapter.py +184 -0
  18. midas_memory-0.0.1/eval/bench_ann.py +108 -0
  19. midas_memory-0.0.1/eval/bench_perf.py +113 -0
  20. midas_memory-0.0.1/eval/datasets.py +440 -0
  21. midas_memory-0.0.1/eval/llm.py +140 -0
  22. midas_memory-0.0.1/eval/metrics.py +235 -0
  23. midas_memory-0.0.1/eval/midas_sweep.py +83 -0
  24. midas_memory-0.0.1/eval/multiday.py +278 -0
  25. midas_memory-0.0.1/eval/retention.py +227 -0
  26. midas_memory-0.0.1/eval/runner.py +658 -0
  27. midas_memory-0.0.1/eval/schema.py +47 -0
  28. midas_memory-0.0.1/examples/coding_agent_demo.py +58 -0
  29. midas_memory-0.0.1/midas/__init__.py +64 -0
  30. midas_memory-0.0.1/midas/ann.py +156 -0
  31. midas_memory-0.0.1/midas/bm25.py +51 -0
  32. midas_memory-0.0.1/midas/embeddings.py +334 -0
  33. midas_memory-0.0.1/midas/entity.py +51 -0
  34. midas_memory-0.0.1/midas/importance.py +146 -0
  35. midas_memory-0.0.1/midas/integrations/__init__.py +2 -0
  36. midas_memory-0.0.1/midas/integrations/langgraph_store.py +150 -0
  37. midas_memory-0.0.1/midas/mcp_server.py +220 -0
  38. midas_memory-0.0.1/midas/memory.py +1028 -0
  39. midas_memory-0.0.1/midas/nli.py +90 -0
  40. midas_memory-0.0.1/midas/policy.py +70 -0
  41. midas_memory-0.0.1/midas/py.typed +0 -0
  42. midas_memory-0.0.1/midas/sqlite_store.py +120 -0
  43. midas_memory-0.0.1/midas/store.py +114 -0
  44. midas_memory-0.0.1/midas/types.py +47 -0
  45. midas_memory-0.0.1/pyproject.toml +43 -0
  46. midas_memory-0.0.1/quickstart.py +32 -0
  47. midas_memory-0.0.1/tests/test_ann.py +72 -0
  48. midas_memory-0.0.1/tests/test_bm25.py +27 -0
  49. midas_memory-0.0.1/tests/test_capture.py +58 -0
  50. midas_memory-0.0.1/tests/test_cost_metrics.py +25 -0
  51. midas_memory-0.0.1/tests/test_embeddings_cache.py +79 -0
  52. midas_memory-0.0.1/tests/test_entity.py +53 -0
  53. midas_memory-0.0.1/tests/test_importance.py +69 -0
  54. midas_memory-0.0.1/tests/test_langgraph_store.py +31 -0
  55. midas_memory-0.0.1/tests/test_mcp_server.py +102 -0
  56. midas_memory-0.0.1/tests/test_memory_consolidate.py +83 -0
  57. midas_memory-0.0.1/tests/test_memory_forget.py +125 -0
  58. midas_memory-0.0.1/tests/test_memory_supersede.py +404 -0
  59. midas_memory-0.0.1/tests/test_novelty.py +68 -0
  60. midas_memory-0.0.1/tests/test_reinforce.py +76 -0
  61. midas_memory-0.0.1/tests/test_runner_trace.py +84 -0
  62. midas_memory-0.0.1/tests/test_sqlite_store.py +32 -0
  63. midas_memory-0.0.1/tests/test_store_vectorized.py +49 -0
  64. midas_memory-0.0.1/tests/test_structural_importance.py +44 -0
@@ -0,0 +1,5 @@
1
+ # Copy to .env and fill in. Only needed for --openai (real embeddings / LLM-judge).
2
+ OPENAI_API_KEY=
3
+
4
+ # Optional: alternative providers (DeepSeek/Mistral/Grok are OpenAI-compatible).
5
+ # OPENAI_BASE_URL=
@@ -0,0 +1,22 @@
1
+ name: tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ matrix:
13
+ python: ["3.11", "3.12"]
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: ${{ matrix.python }}
19
+ - name: Install (all extras + dev)
20
+ run: pip install ".[all,dev]"
21
+ - name: Run the suite
22
+ run: python -m pytest -q
@@ -0,0 +1,41 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ dist/
8
+ build/
9
+
10
+ # Environments
11
+ .venv/
12
+ venv/
13
+ .env
14
+
15
+ # uv
16
+ uv.lock
17
+
18
+ # Eval outputs
19
+ results/
20
+ *.local.json
21
+
22
+ # OS / editor
23
+ .DS_Store
24
+ Thumbs.db
25
+
26
+ # Large / downloadable benchmark datasets (fetched via the loader — see eval/datasets.py)
27
+ data/longmemeval_s.json
28
+ data/longmemeval_oracle.json
29
+ data/longmemeval_m.json
30
+
31
+ # Local uv cache (set via UV_CACHE_DIR for this project)
32
+ .uv-cache/
33
+
34
+ # Ad-hoc scratch
35
+ /test_category_breakdown.py
36
+
37
+ # Internal strategy / competitive / handoff docs — keep OUT of any public release
38
+ _private/
39
+
40
+ # Datasets are fetched locally via eval/datasets.py, never committed
41
+ /data/
@@ -0,0 +1,260 @@
1
+ # Midas Benchmarks
2
+
3
+ Honest, reproducible benchmarks for the Midas agentic-memory SDK. Every number here comes from a
4
+ real run with the command to reproduce it. We deliberately **lead with reader-independent metrics**
5
+ (retrieval + cost) and treat end-to-end answer correctness as a secondary, noisy signal — see
6
+ *Methodology* for why that is the honest choice, not a convenient one.
7
+
8
+ ## TL;DR
9
+
10
+ Midas isolates and wins the two axes that actually measure a *memory layer* (as opposed to the
11
+ reader LLM stacked on top):
12
+
13
+ - **Retrieval** — on LongMemEval-`s` (evidence buried among distractors), Midas retrieves the
14
+ supporting turns at **recall@k 0.95** vs a recency-window baseline's **0.03**.
15
+ - **Cost** — Midas does **0 LLM calls, $0 API spend, and 0 data egress at ingest** (local embeddings
16
+ only), versus LLM-at-ingest memory systems that call an LLM per session to extract facts.
17
+
18
+ ## 1. Retrieval quality — `recall@k` (deterministic)
19
+
20
+ Fraction of gold supporting turns retrieved into the context. Fully deterministic (local embeddings,
21
+ no LLM), so it reproduces exactly.
22
+
23
+ | dataset | setting | baseline-raw | **Midas** |
24
+ |---|---|---:|---:|
25
+ | **LongMemEval-`s`** (buried evidence, hard retrieval) | n=40, bge-base, no rerank, seed 0 | 0.03 | **0.95** |
26
+ | **LoCoMo** (5 conversations) | n=50, bge-base, no rerank, seed 0 | 0.02 | **0.85** |
27
+
28
+ Across **both** datasets Midas retrieves the supporting turns at **0.85–0.95** while a recency window
29
+ gets **≤0.03** — the wedge holds beyond a single benchmark. On LongMemEval-`s` (n=40) the per-category
30
+ recall@k is strong across the board: fact 0.89 · multi-session 0.97 · knowledge-update 1.00 ·
31
+ temporal 0.95 · preference 1.00. A recency window finds essentially **none** of the buried evidence;
32
+ Midas finds ~9 in 10 — exactly the multi-session setting where retrieval quality decides whether the
33
+ answer is even *possible*. (`min_relevance` parsimony is a separate cost/quality knob; the numbers
34
+ above are pure retrieval, no pruning.)
35
+
36
+ **Time-aware retrieval (LLM-free).** Memories carry real **event time** (parsed from the dataset's
37
+ session timestamps), so recency and chronological context reflect *when things happened*, not load
38
+ order — the bitemporal signal long-horizon memory needs. Turning it on lifts **temporal recall@k
39
+ 0.86 → 0.95** (n=40, A/B via `--midas-no-time`), in line with the LongMemEval paper's +7–11% for
40
+ temporal handling — but done with regex/relative-date math, **not** an LLM, preserving the no-LLM
41
+ ingest/query edge. (fact dips 0.92 → 0.89, within n=13 noise and with no effect on fact *answer*.)
42
+
43
+ ```bash
44
+ # reproduce (deterministic; downloads LongMemEval-s on first run; quick 15-question run — use --max-questions 40 to match the n=40 table above)
45
+ python -m eval.runner --dataset longmemeval --variant s --local \
46
+ --local-max-text-chars 600 --local-batch-size 16 --midas-no-rerank \
47
+ --max-questions 15 --limit 20 --seed 0
48
+ ```
49
+
50
+ ## 2. Cost / latency — the no-LLM edge (memory layer only)
51
+
52
+ Measured with the runner's cost instrumentation; excludes the shared reader/judge LLM (identical
53
+ across systems).
54
+
55
+ | system | ingest ms/event | memory-layer LLM | API $ | data egress |
56
+ |---|---:|---|---|---|
57
+ | **Midas** | ~116 (cold) · ~0 (cached) | **0** | **$0** | **none** |
58
+ | Mem0 *(LLM-at-ingest class)* | ~668 | ≥1 call / session | yes (per token) | yes (every turn) |
59
+
60
+ Midas's ingest cost is pure local ONNX embedding. LLM-at-ingest systems (Mem0, and **Hindsight**,
61
+ whose TEMPR extracts facts with an LLM at `retain` and CARA reasons with an LLM at `reflect`) pay an
62
+ LLM call per ingested session — which means **$/token forever at scale, seconds of latency, and every
63
+ conversation turn leaving the box**. At the scale where agent memory actually matters, that cost
64
+ structure — not a few points of benchmark accuracy — is what decides build-vs-buy.
65
+
66
+ **Microbenchmark (`eval/bench_perf.py`, bge-base on a modest CPU box — measured, not estimated):** a
67
+ single `remember` is **~16 ms p50** on short records (embed-bound — there is no LLM, just the ONNX
68
+ embedding forward pass; the time scales with text length, so the longer real turns above land at the
69
+ ~116 ms/event figure), `build_context` **~51 ms p50** over a 2,000-record store; ingest batches far
70
+ faster via `remember_many`. Honest framing: these are **tens of milliseconds, embed-bound** — fast and local (no
71
+ per-turn network round-trip), not sub-millisecond. **Footprint: ~3.6 MB per 1,000 records** at 768 dims
72
+ — embeddings are stored as **float32 arrays**; a Python `list[float]` would cost ~7× more (~24 MB/1k),
73
+ and switching to float32 left `recall@k` unchanged (LoCoMo 0.27 → 0.27). SQLite persistence already
74
+ stored float32.
75
+
76
+ ```bash
77
+ python -m eval.bench_perf --local --n 2000 --q 200 # latency · throughput · real (tracemalloc) footprint
78
+ ```
79
+
80
+ **Every Midas mechanism is local, $0, zero-egress** — embeddings (bge-base ONNX), recall, supersession,
81
+ the NLI contradiction/entailment checks (`midas/nli.py`, int8 ONNX MNLI), and the abstention metric.
82
+ The only LLM is the *reader*, which is pluggable. **Demonstrated end-to-end fully offline** — Midas +
83
+ a local `llama3.2:1b` reader/judge via Ollama (on a local GPU): LongMemEval-`s` n=10 → recall@k **0.80**,
84
+ answer **0.40**, **0 API calls / $0 / nothing leaves the box**. (The modest answer rate is the 1B
85
+ reader; a larger local model lifts it — correctness is reader-bound. The point: the pipeline is 100%
86
+ local at zero marginal cost.)
87
+
88
+ ```bash
89
+ # fully-local, $0, offline pipeline (local Ollama reader; no API key):
90
+ ollama serve & # then: ollama pull llama3.2:1b
91
+ python -m eval.runner --dataset longmemeval --variant s --local --midas-no-rerank \
92
+ --judge --judge-ollama --judge-model llama3.2:1b --max-questions 10 --seed 0
93
+ ```
94
+
95
+ ```bash
96
+ # reproduce (cold ingest cost: disable the embedding cache)
97
+ python -m eval.runner --dataset longmemeval --variant s --local \
98
+ --no-local-embedding-cache --midas-no-rerank --max-questions 3 --limit 20 --seed 0
99
+ ```
100
+
101
+ ## 3. Provenance (auditability)
102
+
103
+ `recall@k` is computable for Midas and the recency baseline because they return **source turn IDs**.
104
+ It is **N/A for fact-synthesizing systems** (Mem0, Hindsight) — they return LLM-rewritten facts, not
105
+ traceable sources. For Midas this is a feature: retrieved context is **auditable back to the exact
106
+ source turn**, with no extraction-time LLM that can silently hallucinate. This matters for enterprise
107
+ and compliance.
108
+
109
+ ## 4. Scaling — sub-linear search past the exact scan (ANN)
110
+
111
+ The default `InMemoryStore` runs an **exact** cached cosine scan: O(N) per query but fast in absolute
112
+ terms (~5 ms/query at 36k × 768-d; ~130–230 ms extrapolated to 1M). For larger corpora, `IVFStore`
113
+ wraps a **numpy-only** inverted-file index — **no native dependency** (unlike faiss/hnswlib): the
114
+ corpus is k-means-clustered into `nlist` cells and a query scans only the `nprobe` nearest cells, so
115
+ search is **sub-linear**. `nprobe` tunes recall vs latency at query time, with no rebuild.
116
+
117
+ Measured on the **36k real bge-base embeddings** cached from the runs above (k=10, 500 held-out
118
+ queries — real embeddings cluster, which is IVF's intended regime; uniform-random vectors are its
119
+ worst case):
120
+
121
+ | nprobe | recall@10 vs exact | IVF ms/q | speedup vs exact |
122
+ |---:|---:|---:|---:|
123
+ | 1 | 0.52 | 0.13 | 37× |
124
+ | 4 | 0.82 | 0.76 | 6× |
125
+ | 8 | 0.91 | 1.49 | 3× |
126
+ | 16 | 0.95 | 3.27 | 1.5× |
127
+
128
+ The win **grows with N** (IVF scans ≈ `nprobe·√N` candidates vs exact's N): the exact↔IVF crossover
129
+ is ~10k records, and at nprobe=8 the speedup rises 0×→1×→2×→3× across 5k→36k (rounded; below 1×, IVF is slower than exact). Extrapolating by the
130
+ candidate count, at **1M** records exact ≈ 130–230 ms vs IVF(nprobe=8) ≈ 8 ms — **~20× at recall
131
+ ~0.90**. Below ~10k the exact scan wins (clustering overhead dominates), which is why `InMemoryStore`
132
+ stays the default and `IVFStore` is opt-in for large, read-heavy corpora.
133
+
134
+ ```bash
135
+ python -m eval.bench_ann # real cached embeddings if present, else synthetic clustered
136
+ ```
137
+
138
+ ## 5. Retention — selective forgetting beats recency (no LLM)
139
+
140
+ Long-horizon memory must stay **bounded**. Midas forgets by `memory_value` (importance × recency); the
141
+ real question is whether that keeps the *right* memories under pressure. Measured on **LongMemEval-`s`
142
+ (n=40, evidence buried among distractors)** by evicting to a fixed budget and comparing policies
143
+ (`eval/retention.py`, recall@k averaged over all 40 questions):
144
+
145
+ | keep | importance — `StructuralImportance` | importance — `ContentImportance` | recency (FIFO) | random |
146
+ |---|---:|---:|---:|---:|
147
+ | 50% | **0.56** | 0.43 | 0.36 | 0.25 |
148
+ | 25% | **0.36** | 0.26 | 0.19 | 0.12 |
149
+
150
+ **Importance-ranked forgetting beats recency at every level** (`value > fifo > random`), and a structural
151
+ salience signal (boost an *assertion of a durable attribute*; demote questions/meta) beats the plain
152
+ content score by **+0.10–0.13 recall@k** under forgetting — all **no-LLM**. On undifferentiated chat
153
+ (LoCoMo) the signal is neutral, because recall there doesn't gate on importance; the buried-fact setting
154
+ is where it shows. (The honest negatives along the way — *novelty-vs-store* and *reinforcement* — are
155
+ documented in the design doc; the moat is reporting them too.)
156
+
157
+ ```bash
158
+ python -m eval.retention --dataset longmemeval --variant s --local --no-rerank \
159
+ --structural-importance --value-rank-only --max-questions 40 --fractions 0.5,0.25
160
+ ```
161
+
162
+ ## 6. Correctness with a fixed strong reader (secondary)
163
+
164
+ `recall@k` measures the memory layer directly; *answer correctness* additionally depends on the reader
165
+ LLM (see Methodology). Holding the reader **fixed and identical across systems** (`gpt-4.1-mini` at
166
+ temp 0 — the same non-reasoning reader class the LongMemEval leaderboard uses), Midas's retrieval edge
167
+ converts to a large answer edge:
168
+
169
+ | dataset (reader = gpt-4.1-mini) | baseline-raw answer | **Midas** answer |
170
+ |---|---:|---:|
171
+ | LongMemEval-`s` (n=40, seed 0) | 0.05 | **0.82** |
172
+
173
+ Per-category Midas answer (indicative, wide bars at n=4–13): fact **1.00** · knowledge-update **1.00** ·
174
+ multi-session **0.89** · temporal **~0.64–0.82** (noisy) · preference 0.33. Same reader for both, so the
175
+ ~16× gap (0.82 vs 0.05) is the memory layer: a recency window almost never holds the buried evidence
176
+ (recall@k 0.03), so the reader cannot answer.
177
+ For scale, 2026 SOTA on LongMemEval is **reader-dominated and LLM-ingest-based**: Mastra Observational
178
+ Memory scores **84.2% (gpt-4o) → 94.9% (gpt-5-mini)** — a +11pt swing from the *reader alone* — using
179
+ an LLM Observer/Reflector at ingest; Mem0 reports ~94.4%.
180
+
181
+ **Reader sweep — same reader as SOTA, but Midas does ZERO LLM at ingest** (LongMemEval-`s`, n=40,
182
+ seed 0; judge fixed = gpt-4o to match Observational Memory's protocol; structured answerer):
183
+
184
+ | reader (Midas, no-LLM ingest) | **Midas** answer | Observational Memory (LLM ingest) |
185
+ |---|---:|---:|
186
+ | gpt-4o | **0.84** | 0.84 — **match** |
187
+ | gpt-5-mini | 0.87–0.89 | 0.95 |
188
+
189
+ **At gpt-4o, Midas ties the SOTA (0.84) with $0 LLM ingest** — OM gets the same number by running an
190
+ LLM Observer+Reflector on every conversation at ingest. With gpt-5-mini Midas reaches 0.87–0.89 vs
191
+ OM's 0.95: OM's curated observations help a strong reader more than raw retrieved turns do. Across the
192
+ sweep Midas pays **$0 at ingest, no data egress, and returns source-traceable turns** — none of which
193
+ the LLM-ingest systems offer — and its retrieval **scales to ~500-session corpora (LongMemEval-`m`) by
194
+ retrieving + forgetting** (measured: a ~4,944-turn haystack assembles a bounded ~480-token context,
195
+ recall@k 0.78 over n=3), where a keep-every-observation-in-context design does not fit by construction.
196
+ *(We measure Midas's side at that scale; we do not run OM — the overflow is an architectural inference,
197
+ not a head-to-head.)* By category Midas **leads
198
+ multi-session (0.89 vs OM's 0.872)** and matches knowledge-update (1.00 vs 0.962); the remaining gap is
199
+ **temporal** (0.82 vs 0.955; per-category n=4–13 → wide bars).
200
+
201
+ A **structured answerer** (ask the reader to pull the relevant dated entries and do the date arithmetic
202
+ before answering) lifts **non-reasoning** readers a lot — gpt-4o 0.76 → 0.84, multi-session 0.56 → 0.89
203
+ — and is neutral for reasoning readers that already do this internally (gpt-5-mini 0.89 → 0.87, within
204
+ noise). Since the cheap, deployable readers are the non-reasoning ones, it is on by default.
205
+
206
+ **Time-awareness — measured on the deterministic metric.** The LLM-free event-time grounding lifts
207
+ **temporal `recall@k` 0.86 → 0.95** (A/B via `--midas-no-time`, deterministic and reproducible) with
208
+ no real regression elsewhere (multi-session holds at 0.97; fact 0.92 → 0.89 is within n=13 noise). Its
209
+ effect on *answer* correctness is real in principle — the reader can resolve "how many days ago…" from
210
+ the dated context + a "today" anchor — but at **n=11 per category the answer deltas are inside run-to-run
211
+ judge noise** (the temporal answer alone bounced 0.64–0.82 across identical-config runs), so we do
212
+ **not** quote a per-category answer lift. This is the methodology working as intended: trust `recall@k`,
213
+ distrust small-n correctness deltas.
214
+
215
+ **Caveat:** n=40 sample with gpt-4.1-mini; the published Zep/Mem0 numbers run their full systems over
216
+ the full set with GPT-4o. Correctness also moves far more with the reader than with the memory layer
217
+ (see Methodology) — a strong reader can still miss multi-hop reasoning even when recall@k is high (the
218
+ evidence is present; the reasoning is the bottleneck). So we treat correctness as a secondary, wide-bar
219
+ signal and lead with `recall@k`.
220
+
221
+ ```bash
222
+ # reproduce (needs an LLM key; this used OpenRouter gpt-4.1-mini as reader + judge)
223
+ JUDGE_PROVIDER=openrouter JUDGE_MODEL=openai/gpt-4.1-mini \
224
+ python -m eval.runner --dataset longmemeval --variant s --local \
225
+ --local-max-text-chars 600 --local-batch-size 16 --midas-no-rerank \
226
+ --judge --max-questions 40 --limit 20 --seed 0 # add --midas-no-time for the A/B
227
+ ```
228
+
229
+ ## Methodology — why reader-independent metrics
230
+
231
+ End-to-end "answer correctness" on these benchmarks is **dominated by the reader LLM, not the memory
232
+ layer**:
233
+
234
+ - Holding the reader fixed, a memory layer's lift is real; but swapping in a bigger reader moves the
235
+ *headline* far more than the memory does. (Public SOTA on LongMemEval reports ~39% → ~83% from the
236
+ memory system but ~83% → ~91% from *just a larger reader* — most of the headline is the reader.)
237
+ - Our own hosted LLM judge (an MoE served via API) is **not reproducible across sessions**: identical
238
+ inputs scored ~0.46 one day and ~0.13 the next, even at temperature 0. We added a local,
239
+ seed-pinned, serialized judge (`--judge-ollama`) to make correctness reproducible, but a small local
240
+ reader is too weak to *use* good context — so correctness still does not cleanly isolate memory
241
+ quality.
242
+
243
+ Therefore: **`recall@k` (deterministic, reader-independent) and ingest cost (structural) are our
244
+ primary metrics.** We report correctness only with a fixed reader and wide error bars, and never as a
245
+ headline.
246
+
247
+ ### Honest caveats
248
+ - **Sample** is n=40 on LongMemEval-`s` and n=50 across 5 LoCoMo conversations. `recall@k` is
249
+ deterministic, so these numbers reproduce exactly, but they remain small-sample estimates; the full LongMemEval set / all 10 LoCoMo conversations would
250
+ tighten it further.
251
+ - **Latency is hardware/provider-dependent** (the ~668 ms for the LLM-at-ingest class includes API
252
+ round-trip). The durable, hardware-independent claim is the **0-LLM / $0 / no-egress** column.
253
+ - **baseline-raw** = "stuff recent turns into the window" (the naive big-context approach).
254
+ - Numbers measured on CPU with `BAAI/bge-base-en-v1.5`. GPU / a faster embedder lowers Midas latency.
255
+ - **Reranking is off by default on large haystacks.** A cross-encoder reranker is available, but on
256
+ LongMemEval-`s` (CPU) it added ~80× query latency (4.2 s vs 53 ms) with **no `recall@k` change**
257
+ (0.88 → 0.88): it reorders the records that already fit the budget (which can help the *reader*) but
258
+ does not change *which* evidence fits. So it is not on the retrieval-quality path here.
259
+
260
+ *All commands run from the repo root. `recall@k` requires no API key; `--judge` requires one unless it is paired with `--judge-ollama` (local Ollama reader/judge).*
@@ -0,0 +1,115 @@
1
+ # Changelog
2
+
3
+ Notable changes to Midas. Pre-1.0 — the API may change. Format loosely follows
4
+ [Keep a Changelog](https://keepachangelog.com/).
5
+
6
+ ## [Unreleased]
7
+
8
+ ### Added
9
+ - **Core SDK** — `Memory` (`remember` / `recall` / `build_context` / `assemble`) ranking by
10
+ relevance × importance × recency, with same-thread neighbour-window expansion and budgeted,
11
+ highest-value-first context assembly. No LLM at ingest or query.
12
+ - **Embedders** — `HashingEmbedder` (offline, zero-dep), `LocalEmbedder` (fastembed/ONNX, bge-base),
13
+ `OpenAIEmbedder`, and `DiskCachedEmbedder` (persistent SQLite cache keyed by model + dim + text).
14
+ `LocalReranker` (cross-encoder, length-capped to avoid ONNX crashes).
15
+ - **Stores** — `InMemoryStore` with a vectorised cosine scan over a **cached** embedding matrix
16
+ (numpy; comfortable to ~1M memories) and an identical pure-Python fallback; `SQLiteStore` for
17
+ **persistence across restarts** with **no native extension** (pure stdlib sqlite3).
18
+ - **float32 in-memory embeddings** — records store the embedding as a float32 numpy array, not a
19
+ Python `list[float]` (~32 B/value). Measured ~**7× smaller footprint** at 768 dims (a 1M-record
20
+ in-memory store drops from ~24 GB to ~3.5 GB) and **faster queries** (float32 matmul); SQLite already
21
+ persisted float32. Measured by `eval/bench_perf.py` (latency / throughput / real tracemalloc footprint
22
+ — the numbers the project had never measured).
23
+ - **Hybrid retrieval** (BM25 fused with semantic) — off by default; see `BENCHMARKS.md` for the
24
+ honest negative result on conversational data.
25
+ - **Belief revision** (supersession) for typed durable facts — off by default; chat never supersedes
26
+ chat. Paraphrased updates are caught by the embedder's cosine similarity (no hand-tuned synonym map),
27
+ so it generalizes beyond any one dataset.
28
+ - **Local NLI** (`midas/nli.py`, LLM-free) — a small int8 ONNX MNLI cross-encoder (onnxruntime +
29
+ tokenizers, ~70 MB, no torch/API). Powers **contradiction-gated conversational belief revision**:
30
+ a chat turn revises an earlier belief only when NLI scores it an actual contradiction. This *fixes*
31
+ the cue-heuristic's over-supersession (LongMemEval temporal recall restored 0.76 → **0.95**) while
32
+ staying precise on real updates — closing the "cheap no-LLM contradiction detection" open problem.
33
+ Also exposes **post-hoc answer-grounding** (`--answer-verify-nli`) — override to "I don't know" when
34
+ no retrieved turn entails the answer. Honest result: it does NOT reliably improve abstention (a
35
+ deterministic-reader A/B is unchanged, 0.37→0.37) because the confabulation is drawn from a retrieved
36
+ distractor that *entails* it. Abstention/Calibrated remains the open frontier; see docs.
37
+ - **Entity-grounded abstention** (`midas/entity.py`, LLM-free) — a new abstention lever orthogonal to
38
+ cosine/NLI: abstain when the answer's source turn is about a *different entity* than the question asks
39
+ (the diagnosed confab-from-distractor root cause). Dropping recurring *attribute* words makes the focus
40
+ the entity noun; **offline-validated 8/8** on the diagnosed failure cases (incl. "favorite colour" vs
41
+ "favorite food", "city" vs "Barcelona"), 11 tests. Honest limit: crafted cases — the end-to-end win
42
+ needs a capable reader (the local 1B reader doesn't confabulate-from-distractor, and hosted-reader credits were exhausted, so it is not yet measured end-to-end).
43
+ - **Time-aware retrieval** (LLM-free) — memories carry real **event time** (`remember(created_at=…)`);
44
+ `recall`/`build_context` take a query `now` so recency decays from when a question is asked, context
45
+ renders true dates (UTC), and a "today" header anchors relative-time reasoning. Bitemporal signal,
46
+ no LLM. Eval ablation: `--midas-no-time`.
47
+ - **Selective forgetting + temporal tiers** (LLM-free) — `Memory.forget_decayed()` evicts the
48
+ lowest-value memories (`memory_value` = importance × recency) to bound storage and context growth,
49
+ **protecting the durable tier** (facts/preferences/constraints, high importance) and never orphaning
50
+ a supersession chain; returns the forgotten ids (deletion audit trail). `Memory.tier()` names a
51
+ memory's horizon — short (≤1d) / medium (≤1w) / long (multi-day). Measured with `eval/retention.py`
52
+ (eviction policies at the same retained budget): on data with an importance signal, value-based
53
+ forgetting **holds recall@k 1.00 at 25–50% retention** while recency/random eviction fall to
54
+ 0.17–0.60; on uniform-importance chat it **reduces to recency** (honest — needs a per-turn importance
55
+ signal, the next step) while cutting context tokens ~3×. Purely additive: no-forget recall@k
56
+ unchanged (LoCoMo 0.62).
57
+ - **Content importance scoring** (`ContentImportance`, LLM-free) — derive a turn's importance 1–5 from
58
+ content alone (content-word density, numbers/dates, proper nouns, anti-backchannel); `Memory(
59
+ importance_scorer=…)` auto-applies it to turns ingested without one, so raw chat gets a salience for
60
+ forgetting/tiering. Measured: as a forgetting **protection** it lifts LoCoMo recall@k under eviction
61
+ from 0.10 (recency) to **0.18** (sheds filler, keeps facts); as a pure rank it helps only at moderate
62
+ compression.
63
+ - **Novelty-vs-store importance** (`Memory(novelty_weight=…)`, LLM-free) — blends importance with
64
+ `1 − max-cosine-to-store` so a *new* fact can outrank a *repeated* one. **Off by default: a measured
65
+ negative.** At equal budget it is neutral on LoCoMo/synthetic recall@k and *harmful* on multiday
66
+ (1.00 → 0.60), because repetition usually signals importance and demoting restated gold evicts it.
67
+ Kept as a tested, opt-in knob; its right home is consolidation (dedup), not eviction-ranking.
68
+ - **Reinforcement importance** (`Memory(reinforce=True)`, LLM-free) — the inverse of novelty: a restated
69
+ turn *boosts* the matched memory's importance + recency (repetition ⇒ salience); in `capture` a
70
+ restatement reinforces the existing memory and is skipped. **Off by default: also a measured negative**
71
+ — recall@k drops at equal budget (LoCoMo 0.08→0.03 @25%; multiday 0.60→0.40 @25%). Unifying finding:
72
+ on raw conversation **repetition tracks commonness, not importance**, so neither novelty nor
73
+ reinforcement improves no-LLM forgetting. Content-salience as a *protection* stays the best signal.
74
+ - **Extractive consolidation** (`Memory.consolidate`, LLM-free) — collapse near-**duplicate** restatements
75
+ to the single highest-value copy (cosine ≥ threshold, chains preserved); extractive (drops redundant
76
+ records, keeps provenance — never LLM-rewrites). Measured safe (recall@k held: LoCoMo 0.27→0.26 dropping
77
+ 10 dups at 0.92); yield is modest at safe thresholds on paraphrase-heavy data and grows with literal
78
+ redundancy/scale.
79
+ - **MCP server** (`python -m midas.mcp_server`) — `remember` (auto-derives importance from content),
80
+ `recall` (source-traceable), `build_context`, `maintain` (no-LLM retention: dedup + selective
81
+ forgetting, returns the **deletion audit** of removed ids), `stats` (counts + temporal-tier
82
+ distribution), `forget`, `forget_all`. Optional SQLite persistence via `MIDAS_MCP_DB`; optional
83
+ **bounded memory** via `MIDAS_MCP_MAX_RECORDS` (auto-forget the lowest-value tail over the cap). The
84
+ privacy/cost/provenance/retention surface for long-running and enterprise agents.
85
+ - **Zero-config auto-memory** (LLM-free) — install the MCP server and Midas starts remembering on its
86
+ own. The server **injects a memory policy** into the agent (MCP `instructions` + a `memory_session`
87
+ prompt): recall-then-`capture`. `Memory.capture()` + `MemoryPolicy` impose the relevance parameters —
88
+ it scores each turn's importance, enforces a floor (`MIDAS_MCP_MIN_IMPORTANCE`, default 2) and skips
89
+ duplicates, and reports stored/skipped + why. The agent captures freely; Midas decides what's kept.
90
+ - **Eval harness** (`eval/`, dev-only) — LoCoMo + LongMemEval loaders, deterministic `recall@k`,
91
+ per-adapter cost/latency instrumentation, and an optional LLM judge (hosted or local Ollama,
92
+ seed-pinned + serialized for reproducibility). **Reader and judge models are decoupled**
93
+ (`--reader-model` vs `--judge-model`) so correctness can be measured with a fixed judge while
94
+ sweeping readers — the apples-to-apples protocol published leaderboards use (e.g. gpt-4o judge).
95
+ - **Artifacts** — `BENCHMARKS.md` (reader-independent results + reproduce commands),
96
+ `docs/research-notes.md` (measured findings), a coding-agent demo, PEP 561 typing (`py.typed`),
97
+ and an MIT license.
98
+
99
+ ### Measured (see BENCHMARKS.md)
100
+ - Retrieval `recall@k`: LongMemEval-`s` **0.95** (n=40, time-aware) and LoCoMo **0.85** (5
101
+ conversations) vs a recency-window baseline ≤0.03. Time-awareness lifts **temporal recall@k
102
+ 0.86→0.95** (deterministic A/B, `--midas-no-time`), no real regression elsewhere.
103
+ - Answer correctness (reader = gpt-4.1-mini, n=40): Midas **0.82** vs baseline **0.05**. Per-category
104
+ answer deltas are within run-to-run judge noise at n≤13, so we lead with `recall@k`.
105
+ - **Same-reader head-to-head (judge=gpt-4o, structured answerer):** Midas **0.84 @ gpt-4o = SOTA
106
+ Observational Memory's 0.84**, with **zero LLM at ingest** (OM runs an LLM per conversation);
107
+ 0.87–0.89 @ gpt-5-mini vs OM 0.95. Midas leads multi-session (0.89 vs 0.872).
108
+ - Structured answerer (extract relevant dated entries + compute time deltas before answering) lifts
109
+ non-reasoning readers (gpt-4o 0.76→0.84) and is neutral for reasoning readers.
110
+ - Ingest cost: **0 LLM calls, $0 API, 0 data egress** (local embeddings only).
111
+ - In-memory recall latency ~0.2 µs/record after matrix caching (~70× faster than the naive Python scan).
112
+
113
+ ### Notes
114
+ - Reader-independent metrics (`recall@k`, cost) are primary; end-to-end answer correctness is
115
+ reader-dominated and reported as secondary/noisy — see `docs/research-notes.md`.
@@ -0,0 +1,24 @@
1
+ # Contributing to Midas
2
+
3
+ Thanks for considering a contribution. Midas is **eval-first**: the project's one durable asset is that
4
+ its reported numbers are true. A few principles keep it that way.
5
+
6
+ 1. **Measure, don't claim.** Any change that affects retrieval or forgetting must show its effect on a
7
+ reproducible metric — `recall@k` is deterministic (`python -m eval.runner …` / `python -m
8
+ eval.retention …`). Quote numbers with the command, dataset, `n`, and caveats.
9
+ 2. **Regression-check.** Run the suite (`python -m pytest -q`); for retrieval changes, confirm LoCoMo
10
+ `recall@k` is unchanged. A win on a toy dataset can regress on real data — that has happened here before.
11
+ 3. **No LLM at ingest or query.** The wedge is local, cheap, auditable. New *no-LLM* mechanisms are very
12
+ welcome; an LLM in the ingest/query path is not.
13
+ 4. **Honest negatives are valued.** A measured "this didn't work" is a real contribution — the design
14
+ doc (`docs/long-horizon-memory.md`) keeps several on purpose.
15
+
16
+ ## Dev setup
17
+
18
+ ```bash
19
+ git clone https://github.com/vornicx/Midas && cd Midas
20
+ pip install -e ".[all,dev]"
21
+ python -m pytest -q
22
+ ```
23
+
24
+ Open an issue first for anything non-trivial. Small, measured, well-tested PRs merge fastest.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Midas authors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.