midas-memory 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- midas_memory-0.0.1/.env.example +5 -0
- midas_memory-0.0.1/.github/workflows/ci.yml +22 -0
- midas_memory-0.0.1/.gitignore +41 -0
- midas_memory-0.0.1/BENCHMARKS.md +260 -0
- midas_memory-0.0.1/CHANGELOG.md +115 -0
- midas_memory-0.0.1/CONTRIBUTING.md +24 -0
- midas_memory-0.0.1/LICENSE +21 -0
- midas_memory-0.0.1/PKG-INFO +343 -0
- midas_memory-0.0.1/README.md +316 -0
- midas_memory-0.0.1/docs/long-horizon-memory.md +323 -0
- midas_memory-0.0.1/docs/research-notes.md +149 -0
- midas_memory-0.0.1/eval/__init__.py +6 -0
- midas_memory-0.0.1/eval/adapters/__init__.py +5 -0
- midas_memory-0.0.1/eval/adapters/base.py +35 -0
- midas_memory-0.0.1/eval/adapters/baseline_raw.py +41 -0
- midas_memory-0.0.1/eval/adapters/mem0_adapter.py +149 -0
- midas_memory-0.0.1/eval/adapters/midas_adapter.py +184 -0
- midas_memory-0.0.1/eval/bench_ann.py +108 -0
- midas_memory-0.0.1/eval/bench_perf.py +113 -0
- midas_memory-0.0.1/eval/datasets.py +440 -0
- midas_memory-0.0.1/eval/llm.py +140 -0
- midas_memory-0.0.1/eval/metrics.py +235 -0
- midas_memory-0.0.1/eval/midas_sweep.py +83 -0
- midas_memory-0.0.1/eval/multiday.py +278 -0
- midas_memory-0.0.1/eval/retention.py +227 -0
- midas_memory-0.0.1/eval/runner.py +658 -0
- midas_memory-0.0.1/eval/schema.py +47 -0
- midas_memory-0.0.1/examples/coding_agent_demo.py +58 -0
- midas_memory-0.0.1/midas/__init__.py +64 -0
- midas_memory-0.0.1/midas/ann.py +156 -0
- midas_memory-0.0.1/midas/bm25.py +51 -0
- midas_memory-0.0.1/midas/embeddings.py +334 -0
- midas_memory-0.0.1/midas/entity.py +51 -0
- midas_memory-0.0.1/midas/importance.py +146 -0
- midas_memory-0.0.1/midas/integrations/__init__.py +2 -0
- midas_memory-0.0.1/midas/integrations/langgraph_store.py +150 -0
- midas_memory-0.0.1/midas/mcp_server.py +220 -0
- midas_memory-0.0.1/midas/memory.py +1028 -0
- midas_memory-0.0.1/midas/nli.py +90 -0
- midas_memory-0.0.1/midas/policy.py +70 -0
- midas_memory-0.0.1/midas/py.typed +0 -0
- midas_memory-0.0.1/midas/sqlite_store.py +120 -0
- midas_memory-0.0.1/midas/store.py +114 -0
- midas_memory-0.0.1/midas/types.py +47 -0
- midas_memory-0.0.1/pyproject.toml +43 -0
- midas_memory-0.0.1/quickstart.py +32 -0
- midas_memory-0.0.1/tests/test_ann.py +72 -0
- midas_memory-0.0.1/tests/test_bm25.py +27 -0
- midas_memory-0.0.1/tests/test_capture.py +58 -0
- midas_memory-0.0.1/tests/test_cost_metrics.py +25 -0
- midas_memory-0.0.1/tests/test_embeddings_cache.py +79 -0
- midas_memory-0.0.1/tests/test_entity.py +53 -0
- midas_memory-0.0.1/tests/test_importance.py +69 -0
- midas_memory-0.0.1/tests/test_langgraph_store.py +31 -0
- midas_memory-0.0.1/tests/test_mcp_server.py +102 -0
- midas_memory-0.0.1/tests/test_memory_consolidate.py +83 -0
- midas_memory-0.0.1/tests/test_memory_forget.py +125 -0
- midas_memory-0.0.1/tests/test_memory_supersede.py +404 -0
- midas_memory-0.0.1/tests/test_novelty.py +68 -0
- midas_memory-0.0.1/tests/test_reinforce.py +76 -0
- midas_memory-0.0.1/tests/test_runner_trace.py +84 -0
- midas_memory-0.0.1/tests/test_sqlite_store.py +32 -0
- midas_memory-0.0.1/tests/test_store_vectorized.py +49 -0
- midas_memory-0.0.1/tests/test_structural_importance.py +44 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python: ["3.11", "3.12"]
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: ${{ matrix.python }}
|
|
19
|
+
- name: Install (all extras + dev)
|
|
20
|
+
run: pip install ".[all,dev]"
|
|
21
|
+
- name: Run the suite
|
|
22
|
+
run: python -m pytest -q
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.pytest_cache/
|
|
6
|
+
.ruff_cache/
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
|
|
10
|
+
# Environments
|
|
11
|
+
.venv/
|
|
12
|
+
venv/
|
|
13
|
+
.env
|
|
14
|
+
|
|
15
|
+
# uv
|
|
16
|
+
uv.lock
|
|
17
|
+
|
|
18
|
+
# Eval outputs
|
|
19
|
+
results/
|
|
20
|
+
*.local.json
|
|
21
|
+
|
|
22
|
+
# OS / editor
|
|
23
|
+
.DS_Store
|
|
24
|
+
Thumbs.db
|
|
25
|
+
|
|
26
|
+
# Large / downloadable benchmark datasets (fetched via the loader — see eval/datasets.py)
|
|
27
|
+
data/longmemeval_s.json
|
|
28
|
+
data/longmemeval_oracle.json
|
|
29
|
+
data/longmemeval_m.json
|
|
30
|
+
|
|
31
|
+
# Local uv cache (set via UV_CACHE_DIR for this project)
|
|
32
|
+
.uv-cache/
|
|
33
|
+
|
|
34
|
+
# Ad-hoc scratch
|
|
35
|
+
/test_category_breakdown.py
|
|
36
|
+
|
|
37
|
+
# Internal strategy / competitive / handoff docs — keep OUT of any public release
|
|
38
|
+
_private/
|
|
39
|
+
|
|
40
|
+
# Datasets are fetched locally via eval/datasets.py, never committed
|
|
41
|
+
/data/
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
# Midas Benchmarks
|
|
2
|
+
|
|
3
|
+
Honest, reproducible benchmarks for the Midas agentic-memory SDK. Every number here comes from a
|
|
4
|
+
real run with the command to reproduce it. We deliberately **lead with reader-independent metrics**
|
|
5
|
+
(retrieval + cost) and treat end-to-end answer correctness as a secondary, noisy signal — see
|
|
6
|
+
*Methodology* for why that is the honest choice, not a convenient one.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
Midas isolates and wins the two axes that actually measure a *memory layer* (as opposed to the
|
|
11
|
+
reader LLM stacked on top):
|
|
12
|
+
|
|
13
|
+
- **Retrieval** — on LongMemEval-`s` (evidence buried among distractors), Midas retrieves the
|
|
14
|
+
supporting turns at **recall@k 0.95** vs a recency-window baseline's **0.03**.
|
|
15
|
+
- **Cost** — Midas does **0 LLM calls, $0 API spend, and 0 data egress at ingest** (local embeddings
|
|
16
|
+
only), versus LLM-at-ingest memory systems that call an LLM per session to extract facts.
|
|
17
|
+
|
|
18
|
+
## 1. Retrieval quality — `recall@k` (deterministic)
|
|
19
|
+
|
|
20
|
+
Fraction of gold supporting turns retrieved into the context. Fully deterministic (local embeddings,
|
|
21
|
+
no LLM), so it reproduces exactly.
|
|
22
|
+
|
|
23
|
+
| dataset | setting | baseline-raw | **Midas** |
|
|
24
|
+
|---|---|---:|---:|
|
|
25
|
+
| **LongMemEval-`s`** (buried evidence, hard retrieval) | n=40, bge-base, no rerank, seed 0 | 0.03 | **0.95** |
|
|
26
|
+
| **LoCoMo** (5 conversations) | n=50, bge-base, no rerank, seed 0 | 0.02 | **0.85** |
|
|
27
|
+
|
|
28
|
+
Across **both** datasets Midas retrieves the supporting turns at **0.85–0.95** while a recency window
|
|
29
|
+
gets **≤0.03** — the wedge holds beyond a single benchmark. On LongMemEval-`s` (n=40) the per-category
|
|
30
|
+
recall@k is strong across the board: fact 0.89 · multi-session 0.97 · knowledge-update 1.00 ·
|
|
31
|
+
temporal 0.95 · preference 1.00. A recency window finds essentially **none** of the buried evidence;
|
|
32
|
+
Midas finds ~9 in 10 — exactly the multi-session setting where retrieval quality decides whether the
|
|
33
|
+
answer is even *possible*. (`min_relevance` parsimony is a separate cost/quality knob; the numbers
|
|
34
|
+
above are pure retrieval, no pruning.)
|
|
35
|
+
|
|
36
|
+
**Time-aware retrieval (LLM-free).** Memories carry real **event time** (parsed from the dataset's
|
|
37
|
+
session timestamps), so recency and chronological context reflect *when things happened*, not load
|
|
38
|
+
order — the bitemporal signal long-horizon memory needs. Turning it on lifts **temporal recall@k
|
|
39
|
+
0.86 → 0.95** (n=40, A/B via `--midas-no-time`), in line with the LongMemEval paper's +7–11% for
|
|
40
|
+
temporal handling — but done with regex/relative-date math, **not** an LLM, preserving the no-LLM
|
|
41
|
+
ingest/query edge. (fact dips 0.92 → 0.89, within n=13 noise and with no effect on fact *answer*.)
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# reproduce (deterministic; downloads LongMemEval-s on first run)
|
|
45
|
+
python -m eval.runner --dataset longmemeval --variant s --local \
|
|
46
|
+
--local-max-text-chars 600 --local-batch-size 16 --midas-no-rerank \
|
|
47
|
+
--max-questions 40 --limit 20 --seed 0
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## 2. Cost / latency — the no-LLM edge (memory layer only)
|
|
51
|
+
|
|
52
|
+
Measured with the runner's cost instrumentation; excludes the shared reader/judge LLM (identical
|
|
53
|
+
across systems).
|
|
54
|
+
|
|
55
|
+
| system | ingest ms/event | memory-layer LLM | API $ | data egress |
|
|
56
|
+
|---|---:|---|---|---|
|
|
57
|
+
| **Midas** | ~116 (cold) · ~0 (cached) | **0** | **$0** | **none** |
|
|
58
|
+
| Mem0 *(LLM-at-ingest class)* | ~668 | ≥1 call / session | yes (per token) | yes (every turn) |
|
|
59
|
+
|
|
60
|
+
Midas's ingest cost is pure local ONNX embedding. LLM-at-ingest systems (Mem0, and **Hindsight**,
|
|
61
|
+
whose TEMPR extracts facts with an LLM at `retain` and CARA reasons with an LLM at `reflect`) pay an
|
|
62
|
+
LLM call per ingested session — which means **$/token forever at scale, seconds of latency, and every
|
|
63
|
+
conversation turn leaving the box**. At the scale where agent memory actually matters, that cost
|
|
64
|
+
structure — not a few points of benchmark accuracy — is what decides build-vs-buy.
|
|
65
|
+
|
|
66
|
+
**Microbenchmark (`eval/bench_perf.py`, bge-base on a modest CPU box — measured, not estimated):** a
|
|
67
|
+
single `remember` is **~16 ms p50** on short records (embed-bound — there is no LLM, just the ONNX
|
|
68
|
+
embedding forward pass; the time scales with text length, so the longer real turns above land at the
|
|
69
|
+
~116 ms/event figure), `build_context` **~51 ms p50** over a 2,000-record store; ingest batches far
|
|
70
|
+
faster via `remember_many`. Honest framing: these are **tens of milliseconds, embed-bound** — fast and local (no
|
|
71
|
+
per-turn network round-trip), not sub-millisecond. **Footprint: ~3.6 MB per 1,000 records** at 768 dims
|
|
72
|
+
— embeddings are stored as **float32 arrays**; a Python `list[float]` would cost ~7× more (~24 MB/1k),
|
|
73
|
+
and switching to float32 left `recall@k` unchanged (LoCoMo 0.27 → 0.27). SQLite persistence already
|
|
74
|
+
stored float32.
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
python -m eval.bench_perf --local --n 2000 --q 200 # latency · throughput · real (tracemalloc) footprint
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Every Midas mechanism is local, $0, zero-egress** — embeddings (bge-base ONNX), recall, supersession,
|
|
81
|
+
the NLI contradiction/entailment checks (`midas/nli.py`, int8 ONNX MNLI), and the abstention metric.
|
|
82
|
+
The only LLM is the *reader*, which is pluggable. **Demonstrated end-to-end fully offline** — Midas +
|
|
83
|
+
a local `llama3.2:1b` reader/judge via Ollama (on a local GPU): LongMemEval-`s` n=10 → recall@k **0.80**,
|
|
84
|
+
answer **0.40**, **0 API calls / $0 / nothing leaves the box**. (The modest answer rate is the 1B
|
|
85
|
+
reader; a larger local model lifts it — correctness is reader-bound. The point: the pipeline is 100%
|
|
86
|
+
local at zero marginal cost.)
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# fully-local, $0, offline pipeline (local Ollama reader; no API key):
|
|
90
|
+
ollama serve & # then: ollama pull llama3.2:1b
|
|
91
|
+
python -m eval.runner --dataset longmemeval --variant s --local --midas-no-rerank \
|
|
92
|
+
--judge --judge-ollama --judge-model llama3.2:1b --max-questions 10 --seed 0
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# reproduce (cold ingest cost: disable the embedding cache)
|
|
97
|
+
python -m eval.runner --dataset longmemeval --variant s --local \
|
|
98
|
+
--no-local-embedding-cache --midas-no-rerank --max-questions 3 --limit 20 --seed 0
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## 3. Provenance (auditability)
|
|
102
|
+
|
|
103
|
+
`recall@k` is computable for Midas and the recency baseline because they return **source turn IDs**.
|
|
104
|
+
It is **N/A for fact-synthesizing systems** (Mem0, Hindsight) — they return LLM-rewritten facts, not
|
|
105
|
+
traceable sources. For Midas this is a feature: retrieved context is **auditable back to the exact
|
|
106
|
+
source turn**, with no extraction-time LLM that can silently hallucinate. This matters for enterprise
|
|
107
|
+
and compliance.
|
|
108
|
+
|
|
109
|
+
## 4. Scaling — sub-linear search past the exact scan (ANN)
|
|
110
|
+
|
|
111
|
+
The default `InMemoryStore` runs an **exact** cached cosine scan: O(N) per query but fast in absolute
|
|
112
|
+
terms (~5 ms/query at 36k × 768-d; ~130–230 ms extrapolated to 1M). For larger corpora, `IVFStore`
|
|
113
|
+
wraps a **numpy-only** inverted-file index — **no native dependency** (unlike faiss/hnswlib): the
|
|
114
|
+
corpus is k-means-clustered into `nlist` cells and a query scans only the `nprobe` nearest cells, so
|
|
115
|
+
search is **sub-linear**. `nprobe` tunes recall vs latency at query time, with no rebuild.
|
|
116
|
+
|
|
117
|
+
Measured on the **36k real bge-base embeddings** cached from the runs above (k=10, 500 held-out
|
|
118
|
+
queries — real embeddings cluster, which is IVF's intended regime; uniform-random vectors are its
|
|
119
|
+
worst case):
|
|
120
|
+
|
|
121
|
+
| nprobe | recall@10 vs exact | IVF ms/q | speedup vs exact |
|
|
122
|
+
|---:|---:|---:|---:|
|
|
123
|
+
| 1 | 0.52 | 0.13 | 37× |
|
|
124
|
+
| 4 | 0.82 | 0.76 | 6× |
|
|
125
|
+
| 8 | 0.91 | 1.49 | 3× |
|
|
126
|
+
| 16 | 0.95 | 3.27 | 1.5× |
|
|
127
|
+
|
|
128
|
+
The win **grows with N** (IVF scans ≈ `nprobe·√N` candidates vs exact's N): the exact↔IVF crossover
|
|
129
|
+
is ~10k records, and at nprobe=8 the speedup rises <1×→1×→2×→3× across 5k→36k. Extrapolating by the
|
|
130
|
+
candidate count, at **1M** records exact ≈ 130–230 ms vs IVF(nprobe=8) ≈ 8 ms — **~20× at recall
|
|
131
|
+
~0.90**. Below ~10k the exact scan wins (clustering overhead dominates), which is why `InMemoryStore`
|
|
132
|
+
stays the default and `IVFStore` is opt-in for large, read-heavy corpora.
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
python -m eval.bench_ann # real cached embeddings if present, else synthetic clustered
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## 5. Retention — selective forgetting beats recency (no LLM)
|
|
139
|
+
|
|
140
|
+
Long-horizon memory must stay **bounded**. Midas forgets by `memory_value` (importance × recency); the
|
|
141
|
+
real question is whether that keeps the *right* memories under pressure. Measured on **LongMemEval-`s`
|
|
142
|
+
(n=40, evidence buried among distractors)** by evicting to a fixed budget and comparing policies
|
|
143
|
+
(`eval/retention.py`, recall@k averaged over all 40 questions):
|
|
144
|
+
|
|
145
|
+
| keep | importance — `StructuralImportance` | importance — `ContentImportance` | recency (FIFO) | random |
|
|
146
|
+
|---|---:|---:|---:|---:|
|
|
147
|
+
| 50% | **0.56** | 0.43 | 0.36 | 0.25 |
|
|
148
|
+
| 25% | **0.36** | 0.26 | 0.19 | 0.12 |
|
|
149
|
+
|
|
150
|
+
**Importance-ranked forgetting beats recency at every level** (`value > fifo > random`), and a structural
|
|
151
|
+
salience signal (boost an *assertion of a durable attribute*; demote questions/meta) beats the plain
|
|
152
|
+
content score by **+0.10–0.13 recall@k** under forgetting — all **no-LLM**. On undifferentiated chat
|
|
153
|
+
(LoCoMo) the signal is neutral, because recall there doesn't gate on importance; the buried-fact setting
|
|
154
|
+
is where it shows. (The honest negatives along the way — *novelty-vs-store* and *reinforcement* — are
|
|
155
|
+
documented in the design doc; the moat is reporting them too.)
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
python -m eval.retention --dataset longmemeval --variant s --local --no-rerank \
|
|
159
|
+
--structural-importance --value-rank-only --max-questions 40 --fractions 0.5,0.25
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## 6. Correctness with a fixed strong reader (secondary)
|
|
163
|
+
|
|
164
|
+
`recall@k` measures the memory layer directly; *answer correctness* additionally depends on the reader
|
|
165
|
+
LLM (see Methodology). Holding the reader **fixed and identical across systems** (`gpt-4.1-mini` at
|
|
166
|
+
temp 0 — the same non-reasoning reader class the LongMemEval leaderboard uses), Midas's retrieval edge
|
|
167
|
+
converts to a large answer edge:
|
|
168
|
+
|
|
169
|
+
| dataset (reader = gpt-4.1-mini) | baseline-raw answer | **Midas** answer |
|
|
170
|
+
|---|---:|---:|
|
|
171
|
+
| LongMemEval-`s` (n=40, seed 0) | 0.05 | **0.82** |
|
|
172
|
+
|
|
173
|
+
Per-category Midas answer (indicative, wide bars at n=4–13): fact **1.00** · knowledge-update **1.00** ·
|
|
174
|
+
multi-session **0.89** · temporal **~0.64–0.82** (noisy) · preference 0.33. Same reader for both, so the
|
|
175
|
+
~16× gap (0.82 vs 0.05) is the memory layer: a recency window almost never holds the buried evidence
|
|
176
|
+
(recall@k 0.03), so the reader cannot answer.
|
|
177
|
+
For scale, 2026 SOTA on LongMemEval is **reader-dominated and LLM-ingest-based**: Mastra Observational
|
|
178
|
+
Memory scores **84.2% (gpt-4o) → 94.9% (gpt-5-mini)** — a +11pt swing from the *reader alone* — using
|
|
179
|
+
an LLM Observer/Reflector at ingest; Mem0 ~94.4.
|
|
180
|
+
|
|
181
|
+
**Reader sweep — same reader as SOTA, but Midas does ZERO LLM at ingest** (LongMemEval-`s`, n=40,
|
|
182
|
+
seed 0; judge fixed = gpt-4o to match Observational Memory's protocol; structured answerer):
|
|
183
|
+
|
|
184
|
+
| reader (Midas, no-LLM ingest) | **Midas** answer | Observational Memory (LLM ingest) |
|
|
185
|
+
|---|---:|---:|
|
|
186
|
+
| gpt-4o | **0.84** | 0.84 — **match** |
|
|
187
|
+
| gpt-5-mini | 0.87–0.89 | 0.95 |
|
|
188
|
+
|
|
189
|
+
**At gpt-4o, Midas ties the SOTA (0.84) with $0 LLM ingest** — OM gets the same number by running an
|
|
190
|
+
LLM Observer+Reflector on every conversation at ingest. With gpt-5-mini Midas reaches 0.87–0.89 vs
|
|
191
|
+
OM's 0.95: OM's curated observations help a strong reader more than raw retrieved turns do. Across the
|
|
192
|
+
sweep Midas pays **$0 at ingest, no data egress, and returns source-traceable turns** — none of which
|
|
193
|
+
the LLM-ingest systems offer — and its retrieval **scales to ~500-session corpora (LongMemEval-`m`) by
|
|
194
|
+
retrieving + forgetting** (measured: a ~4,944-turn haystack assembles a bounded ~480-token context,
|
|
195
|
+
recall@k 0.78 over n=3), where a keep-every-observation-in-context design does not fit by construction.
|
|
196
|
+
*(We measure Midas's side at that scale; we do not run OM — the overflow is an architectural inference,
|
|
197
|
+
not a head-to-head.)* By category Midas **leads
|
|
198
|
+
multi-session (0.89 vs OM's 0.872)** and matches knowledge-update (1.00 vs 0.962); the remaining gap is
|
|
199
|
+
**temporal** (0.82 vs 0.955; per-category n=4–13 → wide bars).
|
|
200
|
+
|
|
201
|
+
A **structured answerer** (ask the reader to pull the relevant dated entries and do the date arithmetic
|
|
202
|
+
before answering) lifts **non-reasoning** readers a lot — gpt-4o 0.76 → 0.84, multi-session 0.56 → 0.89
|
|
203
|
+
— and is neutral for reasoning readers that already do this internally (gpt-5-mini 0.89 → 0.87, within
|
|
204
|
+
noise). Since the cheap, deployable readers are the non-reasoning ones, it is on by default.
|
|
205
|
+
|
|
206
|
+
**Time-awareness — measured on the deterministic metric.** The LLM-free event-time grounding lifts
|
|
207
|
+
**temporal `recall@k` 0.86 → 0.95** (A/B via `--midas-no-time`, deterministic and reproducible) with
|
|
208
|
+
no real regression elsewhere (multi-session holds at 0.97; fact 0.92 → 0.89 is within n=13 noise). Its
|
|
209
|
+
effect on *answer* correctness is real in principle — the reader can resolve "how many days ago…" from
|
|
210
|
+
the dated context + a "today" anchor — but at **n=11 per category the answer deltas are inside run-to-run
|
|
211
|
+
judge noise** (the temporal answer alone bounced 0.64–0.82 across identical-config runs), so we do
|
|
212
|
+
**not** quote a per-category answer lift. This is the methodology working as intended: trust `recall@k`,
|
|
213
|
+
distrust small-n correctness deltas.
|
|
214
|
+
|
|
215
|
+
**Caveat:** n=40 sample with gpt-4.1-mini; the published Zep/Mem0 numbers run their full systems over
|
|
216
|
+
the full set with GPT-4o. Correctness also moves far more with the reader than with the memory layer
|
|
217
|
+
(see Methodology) — a strong reader can still miss multi-hop reasoning even when recall@k is high (the
|
|
218
|
+
evidence is present; the reasoning is the bottleneck). So we treat correctness as a secondary, wide-bar
|
|
219
|
+
signal and lead with `recall@k`.
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
# reproduce (needs an LLM key; this used OpenRouter gpt-4.1-mini as reader + judge)
|
|
223
|
+
JUDGE_PROVIDER=openrouter JUDGE_MODEL=openai/gpt-4.1-mini \
|
|
224
|
+
python -m eval.runner --dataset longmemeval --variant s --local \
|
|
225
|
+
--local-max-text-chars 600 --local-batch-size 16 --midas-no-rerank \
|
|
226
|
+
--judge --max-questions 40 --limit 20 --seed 0 # add --midas-no-time for the A/B
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## Methodology — why reader-independent metrics
|
|
230
|
+
|
|
231
|
+
End-to-end "answer correctness" on these benchmarks is **dominated by the reader LLM, not the memory
|
|
232
|
+
layer**:
|
|
233
|
+
|
|
234
|
+
- Holding the reader fixed, a memory layer's lift is real; but swapping in a bigger reader moves the
|
|
235
|
+
*headline* far more than the memory does. (Public SOTA on LongMemEval reports ~39% → ~83% from the
|
|
236
|
+
memory system but ~83% → ~91% from *just a larger reader* — most of the headline is the reader.)
|
|
237
|
+
- Our own hosted LLM judge (an MoE served via API) is **not reproducible across sessions**: identical
|
|
238
|
+
inputs scored ~0.46 one day and ~0.13 the next, even at temperature 0. We added a local,
|
|
239
|
+
seed-pinned, serialized judge (`--judge-ollama`) to make correctness reproducible, but a small local
|
|
240
|
+
reader is too weak to *use* good context — so correctness still does not cleanly isolate memory
|
|
241
|
+
quality.
|
|
242
|
+
|
|
243
|
+
Therefore: **`recall@k` (deterministic, reader-independent) and ingest cost (structural) are our
|
|
244
|
+
primary metrics.** We report correctness only with a fixed reader and wide error bars, and never as a
|
|
245
|
+
headline.
|
|
246
|
+
|
|
247
|
+
### Honest caveats
|
|
248
|
+
- **Sample** is n=40 on LongMemEval-`s` and n=50 across 5 LoCoMo conversations. `recall@k` is
|
|
249
|
+
deterministic, so these values reproduce exactly on this sample; the full LongMemEval set / all 10 LoCoMo conversations would
|
|
250
|
+
tighten the estimate further.
|
|
251
|
+
- **Latency is hardware/provider-dependent** (the ~668 ms for the LLM-at-ingest class includes API
|
|
252
|
+
round-trip). The durable, hardware-independent claim is the **0-LLM / $0 / no-egress** column.
|
|
253
|
+
- **baseline-raw** = "stuff recent turns into the window" (the naive big-context approach).
|
|
254
|
+
- Numbers measured on CPU with `BAAI/bge-base-en-v1.5`. GPU / a faster embedder lowers Midas latency.
|
|
255
|
+
- **Reranking is off by default on large haystacks.** A cross-encoder reranker is available, but on
|
|
256
|
+
LongMemEval-`s` (CPU) it added ~80× query latency (4.2 s vs 53 ms) with **no `recall@k` change**
|
|
257
|
+
(0.88 → 0.88): it reorders the records that already fit the budget (which can help the *reader*) but
|
|
258
|
+
does not change *which* evidence fits. So it is not on the retrieval-quality path here.
|
|
259
|
+
|
|
260
|
+
*All commands run from the repo root. `recall@k` requires no API key; `--judge*` flags do, except with `--judge-ollama`, which uses a local Ollama reader/judge.*
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
Notable changes to Midas. Pre-1.0 — the API may change. Format loosely follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/).
|
|
5
|
+
|
|
6
|
+
## [Unreleased]
|
|
7
|
+
|
|
8
|
+
### Added
|
|
9
|
+
- **Core SDK** — `Memory` (`remember` / `recall` / `build_context` / `assemble`) ranking by
|
|
10
|
+
relevance × importance × recency, with same-thread neighbour-window expansion and budgeted,
|
|
11
|
+
highest-value-first context assembly. No LLM at ingest or query.
|
|
12
|
+
- **Embedders** — `HashingEmbedder` (offline, zero-dep), `LocalEmbedder` (fastembed/ONNX, bge-base),
|
|
13
|
+
`OpenAIEmbedder`, and `DiskCachedEmbedder` (persistent SQLite cache keyed by model + dim + text).
|
|
14
|
+
`LocalReranker` (cross-encoder, length-capped to avoid ONNX crashes).
|
|
15
|
+
- **Stores** — `InMemoryStore` with a vectorised cosine scan over a **cached** embedding matrix
|
|
16
|
+
(numpy; comfortable to ~1M memories) and an identical pure-Python fallback; `SQLiteStore` for
|
|
17
|
+
**persistence across restarts** with **no native extension** (pure stdlib sqlite3).
|
|
18
|
+
- **float32 in-memory embeddings** — records store the embedding as a float32 numpy array, not a
|
|
19
|
+
Python `list[float]` (~32 B/value). Measured ~**7× smaller footprint** at 768 dims (a 1M-record
|
|
20
|
+
in-memory store drops from ~24 GB to ~3.5 GB) and **faster queries** (float32 matmul); SQLite already
|
|
21
|
+
persisted float32. Measured by `eval/bench_perf.py` (latency / throughput / real tracemalloc footprint
|
|
22
|
+
— the numbers the project had never measured).
|
|
23
|
+
- **Hybrid retrieval** (BM25 fused with semantic) — off by default; see `BENCHMARKS.md` for the
|
|
24
|
+
honest negative result on conversational data.
|
|
25
|
+
- **Belief revision** (supersession) for typed durable facts — off by default; chat never supersedes
|
|
26
|
+
chat. Paraphrased updates are caught by the embedder's cosine similarity (no hand-tuned synonym map),
|
|
27
|
+
so it generalizes beyond any one dataset.
|
|
28
|
+
- **Local NLI** (`midas/nli.py`, LLM-free) — a small int8 ONNX MNLI cross-encoder (onnxruntime +
|
|
29
|
+
tokenizers, ~70 MB, no torch/API). Powers **contradiction-gated conversational belief revision**:
|
|
30
|
+
a chat turn revises an earlier belief only when NLI scores it an actual contradiction. This *fixes*
|
|
31
|
+
the cue-heuristic's over-supersession (LongMemEval temporal recall restored 0.76 → **0.95**) while
|
|
32
|
+
staying precise on real updates — closing the "cheap no-LLM contradiction detection" open problem.
|
|
33
|
+
Also exposes **post-hoc answer-grounding** (`--answer-verify-nli`) — override to "I don't know" when
|
|
34
|
+
no retrieved turn entails the answer. Honest result: it does NOT reliably improve abstention (a
|
|
35
|
+
deterministic-reader A/B is unchanged, 0.37→0.37) because the confabulation is drawn from a retrieved
|
|
36
|
+
distractor that *entails* it. Abstention/Calibrated remains the open frontier; see docs.
|
|
37
|
+
- **Entity-grounded abstention** (`midas/entity.py`, LLM-free) — a new abstention lever orthogonal to
|
|
38
|
+
cosine/NLI: abstain when the answer's source turn is about a *different entity* than the question asks
|
|
39
|
+
(the diagnosed confab-from-distractor root cause). Dropping recurring *attribute* words makes the focus
|
|
40
|
+
the entity noun; **offline-validated 8/8** on the diagnosed failure cases (incl. "favorite colour" vs
|
|
41
|
+
"favorite food", "city" vs "Barcelona"), 11 tests. Honest limit: crafted cases — the end-to-end win
|
|
42
|
+
needs a capable reader (local 1B doesn't confabulate-from-distractor; hosted credits exhausted).
|
|
43
|
+
- **Time-aware retrieval** (LLM-free) — memories carry real **event time** (`remember(created_at=…)`);
|
|
44
|
+
`recall`/`build_context` take a query `now` so recency decays from when a question is asked, context
|
|
45
|
+
renders true dates (UTC), and a "today" header anchors relative-time reasoning. Bitemporal signal,
|
|
46
|
+
no LLM. Eval ablation: `--midas-no-time`.
|
|
47
|
+
- **Selective forgetting + temporal tiers** (LLM-free) — `Memory.forget_decayed()` evicts the
|
|
48
|
+
lowest-value memories (`memory_value` = importance × recency) to bound storage and context growth,
|
|
49
|
+
**protecting the durable tier** (facts/preferences/constraints, high importance) and never orphaning
|
|
50
|
+
a supersession chain; returns the forgotten ids (deletion audit trail). `Memory.tier()` names a
|
|
51
|
+
memory's horizon — short (≤1d) / medium (≤1w) / long (>1w, multi-day). Measured with `eval/retention.py`
|
|
52
|
+
(eviction policies at the same retained budget): on data with an importance signal, value-based
|
|
53
|
+
forgetting **holds recall@k 1.00 at 25–50% retention** while recency/random eviction fall to
|
|
54
|
+
0.17–0.60; on uniform-importance chat it **reduces to recency** (honest — needs a per-turn importance
|
|
55
|
+
signal, the next step) while cutting context tokens ~3×. Purely additive: no-forget recall@k
|
|
56
|
+
unchanged (LoCoMo 0.62).
|
|
57
|
+
- **Content importance scoring** (`ContentImportance`, LLM-free) — derive a turn's importance 1–5 from
|
|
58
|
+
content alone (content-word density, numbers/dates, proper nouns, anti-backchannel); `Memory(
|
|
59
|
+
importance_scorer=…)` auto-applies it to turns ingested without one, so raw chat gets a salience for
|
|
60
|
+
forgetting/tiering. Measured: as a forgetting **protection** it lifts LoCoMo recall@k under eviction
|
|
61
|
+
from 0.10 (recency) to **0.18** (sheds filler, keeps facts); as a pure rank it helps only at moderate
|
|
62
|
+
compression.
|
|
63
|
+
- **Novelty-vs-store importance** (`Memory(novelty_weight=…)`, LLM-free) — blends importance with
|
|
64
|
+
`1 − max-cosine-to-store` so a *new* fact can outrank a *repeated* one. **Off by default: a measured
|
|
65
|
+
negative.** At equal budget it is neutral on LoCoMo/synthetic recall@k and *harmful* on multiday
|
|
66
|
+
(1.00 → 0.60), because repetition usually signals importance and demoting restated gold evicts it.
|
|
67
|
+
Kept as a tested, opt-in knob; its right home is consolidation (dedup), not eviction-ranking.
|
|
68
|
+
- **Reinforcement importance** (`Memory(reinforce=True)`, LLM-free) — the inverse of novelty: a restated
|
|
69
|
+
turn *boosts* the matched memory's importance + recency (repetition ⇒ salience); in `capture` a
|
|
70
|
+
restatement reinforces the existing memory and is skipped. **Off by default: also a measured negative**
|
|
71
|
+
— recall@k drops at equal budget (LoCoMo 0.08→0.03 @25%; multiday 0.60→0.40 @25%). Unifying finding:
|
|
72
|
+
on raw conversation **repetition tracks commonness, not importance**, so neither novelty nor
|
|
73
|
+
reinforcement improves no-LLM forgetting. Content-salience as a *protection* stays the best signal.
|
|
74
|
+
- **Extractive consolidation** (`Memory.consolidate`, LLM-free) — collapse near-**duplicate** restatements
|
|
75
|
+
to the single highest-value copy (cosine ≥ threshold, chains preserved); extractive (drops redundant
|
|
76
|
+
records, keeps provenance — never LLM-rewrites). Measured safe (recall@k held: LoCoMo 0.27→0.26 dropping
|
|
77
|
+
10 dups at 0.92); yield is modest at safe thresholds on paraphrase-heavy data and grows with literal
|
|
78
|
+
redundancy/scale.
|
|
79
|
+
- **MCP server** (`python -m midas.mcp_server`) — `remember` (auto-derives importance from content),
|
|
80
|
+
`recall` (source-traceable), `build_context`, `maintain` (no-LLM retention: dedup + selective
|
|
81
|
+
forgetting, returns the **deletion audit** of removed ids), `stats` (counts + temporal-tier
|
|
82
|
+
distribution), `forget`, `forget_all`. Optional SQLite persistence via `MIDAS_MCP_DB`; optional
|
|
83
|
+
**bounded memory** via `MIDAS_MCP_MAX_RECORDS` (auto-forget the lowest-value tail over the cap). The
|
|
84
|
+
privacy/cost/provenance/retention surface for long-running and enterprise agents.
|
|
85
|
+
- **Zero-config auto-memory** (LLM-free) — install the MCP server and Midas starts remembering on its
|
|
86
|
+
own. The server **injects a memory policy** into the agent (MCP `instructions` + a `memory_session`
|
|
87
|
+
prompt): recall-then-`capture`. `Memory.capture()` + `MemoryPolicy` impose the relevance parameters —
|
|
88
|
+
it scores each turn's importance, enforces a floor (`MIDAS_MCP_MIN_IMPORTANCE`, default 2), skips
|
|
89
|
+
duplicates, and reports stored/skipped + why. The agent captures freely; Midas decides what's kept.
|
|
90
|
+
- **Eval harness** (`eval/`, dev-only) — LoCoMo + LongMemEval loaders, deterministic `recall@k`,
|
|
91
|
+
per-adapter cost/latency instrumentation, and an optional LLM judge (hosted or local Ollama,
|
|
92
|
+
seed-pinned + serialized for reproducibility). **Reader and judge models are decoupled**
|
|
93
|
+
(`--reader-model` vs `--judge-model`) so correctness can be measured with a fixed judge while
|
|
94
|
+
sweeping readers — the apples-to-apples protocol that published leaderboards use (e.g. gpt-4o judge).
|
|
95
|
+
- **Artifacts** — `BENCHMARKS.md` (reader-independent results + reproduce commands),
|
|
96
|
+
`docs/research-notes.md` (measured findings), a coding-agent demo, PEP 561 typing (`py.typed`),
|
|
97
|
+
and an MIT license.
|
|
98
|
+
|
|
99
|
+
### Measured (see BENCHMARKS.md)
|
|
100
|
+
- Retrieval `recall@k`: LongMemEval-`s` **0.95** (n=40, time-aware) and LoCoMo **0.85** (5
|
|
101
|
+
conversations) vs a recency-window baseline ≤0.03. Time-awareness lifts **temporal recall@k
|
|
102
|
+
0.86→0.95** (deterministic A/B, `--midas-no-time`), no real regression elsewhere.
|
|
103
|
+
- Answer correctness (reader = gpt-4.1-mini, n=40): Midas **0.82** vs baseline **0.05**. Per-category
|
|
104
|
+
answer deltas are within run-to-run judge noise at n≤13, so we lead with `recall@k`.
|
|
105
|
+
- **Same-reader head-to-head (judge=gpt-4o, structured answerer):** Midas **0.84 @ gpt-4o = SOTA
|
|
106
|
+
Observational Memory's 0.84**, with **zero LLM at ingest** (OM runs an LLM per conversation);
|
|
107
|
+
0.87–0.89 @ gpt-5-mini vs OM 0.95. Midas leads multi-session (0.89 vs 0.872).
|
|
108
|
+
- Structured answerer (extract relevant dated entries + compute time deltas before answering) lifts
|
|
109
|
+
non-reasoning readers (gpt-4o 0.76→0.84) and is neutral for reasoning readers.
|
|
110
|
+
- Ingest cost: **0 LLM calls, $0 API, 0 data egress** (local embeddings only).
|
|
111
|
+
- In-memory recall latency ~0.2 µs/record after matrix caching (~70× faster than the naive Python scan).
|
|
112
|
+
|
|
113
|
+
### Notes
|
|
114
|
+
- Reader-independent metrics (`recall@k`, cost) are primary; end-to-end answer correctness is
|
|
115
|
+
reader-dominated and reported as secondary/noisy — see `docs/research-notes.md`.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Contributing to Midas
|
|
2
|
+
|
|
3
|
+
Thanks for considering a contribution. Midas is **eval-first**: the project's one durable asset is that
|
|
4
|
+
its reported numbers are true. A few principles keep it that way.
|
|
5
|
+
|
|
6
|
+
1. **Measure, don't claim.** Any change that affects retrieval or forgetting must show its effect on a
|
|
7
|
+
reproducible metric — `recall@k` is deterministic (`python -m eval.runner …` / `python -m
|
|
8
|
+
eval.retention …`). Quote numbers with the command, dataset, `n`, and caveats.
|
|
9
|
+
2. **Regression-check.** Run the suite (`python -m pytest -q`); for retrieval changes, confirm LoCoMo
|
|
10
|
+
`recall@k` is unchanged. A win on a toy dataset can regress on real data — that has happened here before.
|
|
11
|
+
3. **No LLM at ingest or query.** The wedge is local, cheap, auditable. New *no-LLM* mechanisms are very
|
|
12
|
+
welcome; an LLM in the ingest/query path is not.
|
|
13
|
+
4. **Honest negatives are valued.** A measured "this didn't work" is a real contribution — the design
|
|
14
|
+
doc (`docs/long-horizon-memory.md`) keeps several on purpose.
|
|
15
|
+
|
|
16
|
+
## Dev setup
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
git clone https://github.com/vornicx/Midas && cd Midas
|
|
20
|
+
pip install -e ".[all,dev]"
|
|
21
|
+
python -m pytest -q
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Open an issue first for anything non-trivial. Small, measured, well-tested PRs merge fastest.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Midas authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|