coremem 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coremem-0.1.0/.gitignore +11 -0
- coremem-0.1.0/LICENSE +21 -0
- coremem-0.1.0/PKG-INFO +170 -0
- coremem-0.1.0/README.md +144 -0
- coremem-0.1.0/benchmarks/longmemeval/adapter.py +104 -0
- coremem-0.1.0/benchmarks/longmemeval/eval.py +230 -0
- coremem-0.1.0/coremem/__init__.py +21 -0
- coremem-0.1.0/coremem/backends/__init__.py +5 -0
- coremem-0.1.0/coremem/backends/base.py +44 -0
- coremem-0.1.0/coremem/backends/chroma.py +130 -0
- coremem-0.1.0/coremem/backends/hybrid.py +185 -0
- coremem-0.1.0/coremem/core.py +103 -0
- coremem-0.1.0/coremem/heuristics.py +150 -0
- coremem-0.1.0/coremem/ingest.py +73 -0
- coremem-0.1.0/coremem/layers.py +88 -0
- coremem-0.1.0/coremem/types.py +51 -0
- coremem-0.1.0/pyproject.toml +43 -0
- coremem-0.1.0/tests/conftest.py +35 -0
- coremem-0.1.0/tests/test_backend_chroma.py +54 -0
- coremem-0.1.0/tests/test_backend_hybrid.py +82 -0
- coremem-0.1.0/tests/test_core.py +90 -0
- coremem-0.1.0/tests/test_heuristics.py +77 -0
- coremem-0.1.0/tests/test_layers.py +54 -0
coremem-0.1.0/.gitignore
ADDED
coremem-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Eddy Vinck
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
coremem-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: coremem
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Zero-LLM memory retrieval for AI agents — semantic search and deterministic heuristics
|
|
5
|
+
Author: Eddy Vinck
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: ai-agents,llm,memory,retrieval,semantic-search
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Requires-Python: >=3.11
|
|
16
|
+
Requires-Dist: chromadb>=0.5.0
|
|
17
|
+
Requires-Dist: numpy>=1.24.0
|
|
18
|
+
Requires-Dist: pyyaml>=6.0
|
|
19
|
+
Requires-Dist: sentence-transformers>=2.0.0
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
23
|
+
Provides-Extra: hybrid
|
|
24
|
+
Requires-Dist: hybriddb>=0.2.0; extra == 'hybrid'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# CoreMem
|
|
28
|
+
|
|
29
|
+
> **Zero-LLM memory retrieval for AI agents.** CoreMem gives agents instant access to conversation history — semantic search plus deterministic retrieval heuristics, all without a single API call. Scores **98.0% R@5 on LongMemEval (500 questions)** in the Executive Assistant retrieval stack — no LLM, no tuning, no cloud.
|
|
30
|
+
|
|
31
|
+
> **Embedded. Local. Open source.** No external APIs, no vector DB services, no internet connection required. Runs entirely on-device with ChromaDB or HybridDB + sentence-transformers. Ships as a single Python package with zero infrastructure dependencies.
|
|
32
|
+
|
|
33
|
+
**Dual-backend architecture.** Drop-in backends (ChromaDB baseline, HybridDB enhanced) with the same API. Ranking pipeline: backend retrieval → deterministic heuristics → recency-aware rescoring → session-aware retrieval.
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from coremem import MemoryCore
|
|
37
|
+
from coremem.backends.chroma import ChromaBackend
|
|
38
|
+
|
|
39
|
+
core = MemoryCore(backend=ChromaBackend(path="./memory"))
|
|
40
|
+
|
|
41
|
+
# Ingest conversation turns
|
|
42
|
+
core.ingest("user", "I visited the Museum of Modern Art today")
|
|
43
|
+
core.ingest("assistant", "That sounds wonderful! How was it?")
|
|
44
|
+
core.ingest("user", "I went to an Ancient Civilizations exhibition at the Natural History Museum")
|
|
45
|
+
|
|
46
|
+
# Search with deterministic heuristic reranking
|
|
47
|
+
results = core.search("When did I visit art museums?")
|
|
48
|
+
|
|
49
|
+
for r in results:
|
|
50
|
+
print(f"[{r.memory.ts}] [{r.memory.role}] {r.memory.content}")
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Why CoreMem?
|
|
54
|
+
|
|
55
|
+
Every AI agent needs memory. But cloud-based vector search is expensive, slow, and doesn't work offline. Pure embedding similarity misses keyword matches and temporal context. LLM-based memory systems cost tokens per query.
|
|
56
|
+
|
|
57
|
+
CoreMem solves all three:
|
|
58
|
+
|
|
59
|
+
| Component | What it does |
|
|
60
|
+
|-----------|-------------|
|
|
61
|
+
| **Semantic search** | Embedding similarity via ChromaDB or HybridDB |
|
|
62
|
+
| **Deterministic heuristics** | Keyword overlap, temporal recency, person-name boost, quoted-phrase matching |
|
|
63
|
+
| **Session deduplication** | One result per conversation, with full context retrieval |
|
|
64
|
+
|
|
65
|
+
## LongMemEval Results (500 questions, no LLM, no tuning)
|
|
66
|
+
|
|
67
|
+
| Metric | Score |
|
|
68
|
+
|--------|-------|
|
|
69
|
+
| R@5 | **98.0%** |
|
|
70
|
+
| R@10 | **98.4%** |
|
|
71
|
+
| MRR | 0.944 |
|
|
72
|
+
| P@5 | 0.592 |
|
|
73
|
+
| F1@5 | 0.684 |
|
|
74
|
+
| Selectivity | 11.5% haystack scanned |
|
|
75
|
+
| Rank distribution | #1: 91.8%, #2-3: 5.0%, #4-5: 1.2%, #6-10: 0.4%, >10: 1.6% |
|
|
76
|
+
|
|
77
|
+
Outperforms MemPalace raw (96.6%) and matches their hybrid v4 held-out (98.4%) — with zero tuning, zero dev-set peeking.
|
|
78
|
+
|
|
79
|
+
## Installation
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install coremem
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
With HybridDB backend for enhanced FTS5 + vector hybrid search:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install coremem[hybrid]
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Core Concepts
|
|
92
|
+
|
|
93
|
+
### Backends
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
# ChromaDB baseline — pure vector search
|
|
97
|
+
from coremem.backends.chroma import ChromaBackend
|
|
98
|
+
core = MemoryCore(backend=ChromaBackend(path="./data"))
|
|
99
|
+
|
|
100
|
+
# HybridDB enhanced — FTS5 + vector hybrid search
|
|
101
|
+
from coremem.backends.hybrid import HybridBackend
|
|
102
|
+
core = MemoryCore(backend=HybridBackend(path="./data"))
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Ingestion
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
# Simple ingestion
|
|
109
|
+
core.ingest("user", "I built a Spitfire model kit", session_id="conv_001")
|
|
110
|
+
|
|
111
|
+
# Batch ingestion
|
|
112
|
+
from coremem import ingest_batch
|
|
113
|
+
ingest_batch(core, [
|
|
114
|
+
("user", "What's the weather today?"),
|
|
115
|
+
("assistant", "Sunny with a high of 72°F"),
|
|
116
|
+
], session_id="conv_001")
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Search
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
# Basic search
|
|
123
|
+
results = core.search("How many model kits?", limit=10)
|
|
124
|
+
|
|
125
|
+
# Limit results
|
|
126
|
+
results = core.search("model building projects", limit=5)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Heuristics
|
|
130
|
+
|
|
131
|
+
Deterministic, zero-LLM scoring boosts applied to every result:
|
|
132
|
+
|
|
133
|
+
| Heuristic | What it catches |
|
|
134
|
+
|-----------|----------------|
|
|
135
|
+
| `keyword_overlap` | Exact word matches between query and content |
|
|
136
|
+
| `temporal_boost` | Queries with "latest", "current", "recently" |
|
|
137
|
+
| `recency_decay` | Unconditional exponential decay (30-day half-life) |
|
|
138
|
+
| `person_name_boost` | Proper name mentions in content |
|
|
139
|
+
| `quoted_phrase_boost` | Exact phrase matches in quotes |
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from coremem import SearchHeuristics
|
|
143
|
+
|
|
144
|
+
# Apply all heuristics to a single result
|
|
145
|
+
score = SearchHeuristics.apply_all(
|
|
146
|
+
query="latest project",
|
|
147
|
+
content="Just finished the Q3 project report",
|
|
148
|
+
score=0.75,
|
|
149
|
+
ts="2026-05-28T10:00:00Z",
|
|
150
|
+
)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Wake-Up Context
|
|
154
|
+
|
|
155
|
+
Give the agent instant situational awareness:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
context = core.wake_up(user_id="alice")
|
|
159
|
+
# Returns a compact string with L0 identity and L1 recent context.
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## License
|
|
163
|
+
|
|
164
|
+
MIT — see [LICENSE](LICENSE).
|
|
165
|
+
|
|
166
|
+
## Author
|
|
167
|
+
|
|
168
|
+
Eddy Vinck
|
|
169
|
+
|
|
170
|
+
CoreMem is the retrieval engine behind the [Executive Assistant](https://github.com/open-assistants-lab) agent system. Pairs with [HybridDB](https://github.com/open-assistants-lab) for storage and ConnectKit for real-time sync.
|
coremem-0.1.0/README.md
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# CoreMem
|
|
2
|
+
|
|
3
|
+
> **Zero-LLM memory retrieval for AI agents.** CoreMem gives agents instant access to conversation history — semantic search plus deterministic retrieval heuristics, all without a single API call. Scores **98.0% R@5 on LongMemEval (500 questions)** in the Executive Assistant retrieval stack — no LLM, no tuning, no cloud.
|
|
4
|
+
|
|
5
|
+
> **Embedded. Local. Open source.** No external APIs, no vector DB services, no internet connection required. Runs entirely on-device with ChromaDB or HybridDB + sentence-transformers. Ships as a single Python package with zero infrastructure dependencies.
|
|
6
|
+
|
|
7
|
+
**Dual-backend architecture.** Drop-in backends (ChromaDB baseline, HybridDB enhanced) with the same API. Ranking pipeline: backend retrieval → deterministic heuristics → recency-aware rescoring → session-aware retrieval.
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from coremem import MemoryCore
|
|
11
|
+
from coremem.backends.chroma import ChromaBackend
|
|
12
|
+
|
|
13
|
+
core = MemoryCore(backend=ChromaBackend(path="./memory"))
|
|
14
|
+
|
|
15
|
+
# Ingest conversation turns
|
|
16
|
+
core.ingest("user", "I visited the Museum of Modern Art today")
|
|
17
|
+
core.ingest("assistant", "That sounds wonderful! How was it?")
|
|
18
|
+
core.ingest("user", "I went to an Ancient Civilizations exhibition at the Natural History Museum")
|
|
19
|
+
|
|
20
|
+
# Search with deterministic heuristic reranking
|
|
21
|
+
results = core.search("When did I visit art museums?")
|
|
22
|
+
|
|
23
|
+
for r in results:
|
|
24
|
+
print(f"[{r.memory.ts}] [{r.memory.role}] {r.memory.content}")
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Why CoreMem?
|
|
28
|
+
|
|
29
|
+
Every AI agent needs memory. But cloud-based vector search is expensive, slow, and doesn't work offline. Pure embedding similarity misses keyword matches and temporal context. LLM-based memory systems cost tokens per query.
|
|
30
|
+
|
|
31
|
+
CoreMem solves all three:
|
|
32
|
+
|
|
33
|
+
| Component | What it does |
|
|
34
|
+
|-----------|-------------|
|
|
35
|
+
| **Semantic search** | Embedding similarity via ChromaDB or HybridDB |
|
|
36
|
+
| **Deterministic heuristics** | Keyword overlap, temporal recency, person-name boost, quoted-phrase matching |
|
|
37
|
+
| **Session deduplication** | One result per conversation, with full context retrieval |
|
|
38
|
+
|
|
39
|
+
## LongMemEval Results (500 questions, no LLM, no tuning)
|
|
40
|
+
|
|
41
|
+
| Metric | Score |
|
|
42
|
+
|--------|-------|
|
|
43
|
+
| R@5 | **98.0%** |
|
|
44
|
+
| R@10 | **98.4%** |
|
|
45
|
+
| MRR | 0.944 |
|
|
46
|
+
| P@5 | 0.592 |
|
|
47
|
+
| F1@5 | 0.684 |
|
|
48
|
+
| Selectivity | 11.5% haystack scanned |
|
|
49
|
+
| Rank distribution | #1: 91.8%, #2-3: 5.0%, #4-5: 1.2%, #6-10: 0.4%, >10: 1.6% |
|
|
50
|
+
|
|
51
|
+
Outperforms MemPalace raw (96.6%) and matches their hybrid v4 held-out (98.4%) — with zero tuning, zero dev-set peeking.
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install coremem
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
With HybridDB backend for enhanced FTS5 + vector hybrid search:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install coremem[hybrid]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Core Concepts
|
|
66
|
+
|
|
67
|
+
### Backends
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
# ChromaDB baseline — pure vector search
|
|
71
|
+
from coremem.backends.chroma import ChromaBackend
|
|
72
|
+
core = MemoryCore(backend=ChromaBackend(path="./data"))
|
|
73
|
+
|
|
74
|
+
# HybridDB enhanced — FTS5 + vector hybrid search
|
|
75
|
+
from coremem.backends.hybrid import HybridBackend
|
|
76
|
+
core = MemoryCore(backend=HybridBackend(path="./data"))
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Ingestion
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
# Simple ingestion
|
|
83
|
+
core.ingest("user", "I built a Spitfire model kit", session_id="conv_001")
|
|
84
|
+
|
|
85
|
+
# Batch ingestion
|
|
86
|
+
from coremem import ingest_batch
|
|
87
|
+
ingest_batch(core, [
|
|
88
|
+
("user", "What's the weather today?"),
|
|
89
|
+
("assistant", "Sunny with a high of 72°F"),
|
|
90
|
+
], session_id="conv_001")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Search
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
# Basic search
|
|
97
|
+
results = core.search("How many model kits?", limit=10)
|
|
98
|
+
|
|
99
|
+
# Limit results
|
|
100
|
+
results = core.search("model building projects", limit=5)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Heuristics
|
|
104
|
+
|
|
105
|
+
Deterministic, zero-LLM scoring boosts applied to every result:
|
|
106
|
+
|
|
107
|
+
| Heuristic | What it catches |
|
|
108
|
+
|-----------|----------------|
|
|
109
|
+
| `keyword_overlap` | Exact word matches between query and content |
|
|
110
|
+
| `temporal_boost` | Queries with "latest", "current", "recently" |
|
|
111
|
+
| `recency_decay` | Unconditional exponential decay (30-day half-life) |
|
|
112
|
+
| `person_name_boost` | Proper name mentions in content |
|
|
113
|
+
| `quoted_phrase_boost` | Exact phrase matches in quotes |
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from coremem import SearchHeuristics
|
|
117
|
+
|
|
118
|
+
# Apply all heuristics to a single result
|
|
119
|
+
score = SearchHeuristics.apply_all(
|
|
120
|
+
query="latest project",
|
|
121
|
+
content="Just finished the Q3 project report",
|
|
122
|
+
score=0.75,
|
|
123
|
+
ts="2026-05-28T10:00:00Z",
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Wake-Up Context
|
|
128
|
+
|
|
129
|
+
Give the agent instant situational awareness:
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
context = core.wake_up(user_id="alice")
|
|
133
|
+
# Returns a compact string with L0 identity and L1 recent context.
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
|
|
138
|
+
MIT — see [LICENSE](LICENSE).
|
|
139
|
+
|
|
140
|
+
## Author
|
|
141
|
+
|
|
142
|
+
Eddy Vinck
|
|
143
|
+
|
|
144
|
+
CoreMem is the retrieval engine behind the [Executive Assistant](https://github.com/open-assistants-lab) agent system. Pairs with [HybridDB](https://github.com/open-assistants-lab) for storage and ConnectKit for real-time sync.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""LongMemEval benchmark adapter for coremem.
|
|
2
|
+
|
|
3
|
+
Direct-injection mode — injects haystack sessions into coremem,
|
|
4
|
+
runs search, measures Recall@K. No LLM. No HTTP. No agent loop.
|
|
5
|
+
Pure retrieval benchmarking.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python -m coremem.benchmarks.longmemeval.eval --data longmemeval_s_cleaned.json --backend chroma --limit 5
|
|
9
|
+
python -m coremem.benchmarks.longmemeval.eval --data longmemeval_s_cleaned.json --backend hybrid --limit 10
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from coremem.core import MemoryCore
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class LongMemEvalAdapter:
    """Inject LongMemEval sessions into a memory core and measure retrieval recall.

    The adapter only uses two methods of the wrapped core:
    ``ingest_many(session, session_id=...)`` and ``search(query, limit=...)``.
    """

    def __init__(self, core: MemoryCore):
        self._core = core

    def inject_sessions(
        self,
        haystack_sessions: list[list[dict]],
        verbose: bool = False,
    ) -> dict[str, list[str]]:
        """Inject all haystack sessions into coremem.

        Each session is a list of message dicts. Every session is ingested
        under a synthetic ID ``session_{index:04d}`` derived from its position
        in ``haystack_sessions``, so results can later be matched by session.

        Args:
            haystack_sessions: List of sessions, each a list of
                {"role", "content"} dicts.
            verbose: Print injection progress.

        Returns:
            Dict mapping session_id → the list of memory IDs returned by
            ``ingest_many`` for that session.
        """
        session_memory_ids: dict[str, list[str]] = {}
        for si, session in enumerate(haystack_sessions):
            sid = f"session_{si:04d}"
            ids = self._core.ingest_many(session, session_id=sid)
            session_memory_ids[sid] = ids
            if verbose:
                print(f" Session {sid}: {len(ids)}/{len(session)} messages ingested", flush=True)
        return session_memory_ids

    def search(self, query: str, limit: int = 10) -> list[dict]:
        """Search and return one summary dict per result.

        Each dict carries ``session_id``, ``content`` (truncated to the first
        200 characters), ``score`` and ``source``.
        """
        results = self._core.search(query, limit=limit)
        return [
            {
                "session_id": r.memory.session_id,
                "content": r.memory.content[:200],
                "score": r.score,
                "source": r.source,
            }
            for r in results
        ]

    def recall_at_k(
        self,
        query: str,
        answer_session_ids: list[str] | str,
        k: int = 5,
    ) -> tuple[bool, int]:
        """Check if any answer session appears in top-K results.

        Args:
            query: Question text to search for.
            answer_session_ids: Injected session ID(s) that contain the answer.
                A single ID may be passed as a plain string.
            k: Number of results to request from the core.

        Returns:
            (is_hit, count_of_answer_sessions_found).
        """
        # A bare string would otherwise be split into a set of characters by
        # set(...) below and could never match a session ID.
        if isinstance(answer_session_ids, str):
            answer_session_ids = [answer_session_ids]

        results = self._core.search(query, limit=k)
        found_sessions = {r.memory.session_id for r in results}
        matches = found_sessions & set(answer_session_ids)
        return bool(matches), len(matches)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def load_longmemeval_questions(
    data_path: str | Path,
    question_types: list[str] | None = None,
    limit: int | None = None,
) -> list[dict[str, Any]]:
    """Load LongMemEval questions from JSON.

    The file may hold either a top-level list of question dicts or an object
    with the list stored under a ``"questions"`` key.

    Args:
        data_path: Path to LongMemEval JSON data file.
        question_types: Optional filter by question_type field. ``None`` or an
            empty list keeps every question.
        limit: Optional max number of questions to load, applied after the
            type filter. ``None`` means no limit; ``0`` (or a negative value)
            yields an empty list.

    Returns:
        List of question dicts exactly as stored in the file. LongMemEval
        entries are expected to carry keys such as question_id, question,
        question_type, answer, answer_session_ids, haystack_session_ids and
        haystack_sessions (not validated here).
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)

    questions = data if isinstance(data, list) else data.get("questions", [])

    if question_types:
        questions = [q for q in questions if q.get("question_type") in question_types]

    # Compare against None so that limit=0 means "no questions" rather than
    # "no limit"; clamp negatives so they cannot slice from the end.
    if limit is not None:
        questions = questions[:max(limit, 0)]

    return questions
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""LongMemEval retrieval benchmark runner for coremem.
|
|
2
|
+
|
|
3
|
+
Measures Recall@K without any LLM involvement. Two backends supported:
|
|
4
|
+
--backend chroma → ChromaBackend (baseline, target 95%+)
|
|
5
|
+
--backend hybrid → HybridBackend (enhanced, requires hybriddb)
|
|
6
|
+
|
|
7
|
+
Dataset format (LongMemEval):
|
|
8
|
+
Each question has: question_id, question_type, question, answer_session_ids,
|
|
9
|
+
haystack_session_ids, haystack_sessions.
|
|
10
|
+
haystack_session_ids[i] maps to haystack_sessions[i].
|
|
11
|
+
|
|
12
|
+
Injection: sessions are injected in batch, tagged as session_{i:04d}.
|
|
13
|
+
|
|
14
|
+
Recall check: for each aid in answer_session_ids → find aid in haystack_session_ids → get
|
|
15
|
+
index → our injected id session_{index:04d} → check if in top-K results.
|
|
16
|
+
|
|
17
|
+
Usage:
|
|
18
|
+
uv run python -m coremem.benchmarks.longmemeval.eval \
|
|
19
|
+
--data /tmp/lme_cache/.../longmemeval_s_cleaned.json \
|
|
20
|
+
--backend chroma \
|
|
21
|
+
--limit 20 \
|
|
22
|
+
--k 5
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import json
|
|
29
|
+
import os
|
|
30
|
+
import shutil
|
|
31
|
+
import time
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import Any
|
|
34
|
+
|
|
35
|
+
from coremem.core import MemoryCore
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _load_questions(data_path: str, question_types: list[str] | None = None, limit: int | None = None) -> list[dict]:
    """Read benchmark questions from a JSON file.

    Args:
        data_path: Path to the JSON data file.
        question_types: Optional filter on each question's ``question_type``.
            ``None`` or an empty list keeps every question.
        limit: Optional max number of questions, applied after the type
            filter. ``None`` means no limit; ``0`` (or a negative value)
            yields an empty list.

    Returns:
        List of question dicts. When the file's top-level value is not a
        list, its ``.values()`` are used as the question list.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        data = list(data.values())

    if question_types:
        data = [q for q in data if q.get("question_type") in question_types]

    # Compare against None so that limit=0 means "no questions" rather than
    # "no limit"; clamp negatives so they cannot slice from the end.
    if limit is not None:
        data = data[:max(limit, 0)]

    return data
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _setup_backend(backend: str, path: str):
|
|
55
|
+
if backend == "chroma":
|
|
56
|
+
from coremem.backends.chroma import ChromaBackend
|
|
57
|
+
|
|
58
|
+
return ChromaBackend(path=path)
|
|
59
|
+
elif backend == "hybrid":
|
|
60
|
+
from coremem.backends.hybrid import HybridBackend
|
|
61
|
+
|
|
62
|
+
return HybridBackend(path=path)
|
|
63
|
+
else:
|
|
64
|
+
raise ValueError(f"Unknown backend: {backend}. Use 'chroma' or 'hybrid'.")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _map_answer_sids(
    haystack_session_ids: list[str],
    answer_session_ids: list[str] | str,
) -> list[str]:
    """Translate dataset answer session IDs into injected ``session_NNNN`` IDs.

    An answer ID is mapped through its position in ``haystack_session_ids``;
    answer IDs that do not occur in the haystack are dropped. If an ID occurs
    more than once in the haystack, its last position is used.

    Args:
        haystack_session_ids: Dataset session IDs, in haystack order.
        answer_session_ids: One answer ID as a string, or a list of them.

    Returns:
        Injected IDs (``session_{index:04d}``) in the order the answer IDs
        were given.
    """
    wanted = [answer_session_ids] if isinstance(answer_session_ids, str) else answer_session_ids
    position_of = {hid: pos for pos, hid in enumerate(haystack_session_ids)}
    return [f"session_{position_of[aid]:04d}" for aid in wanted if aid in position_of]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _inject_sessions_batch(core: MemoryCore, haystack_sessions: list) -> float:
    """Ingest every haystack message with a single backend batch call.

    Messages of the session at position ``i`` are tagged with the session ID
    ``session_{i:04d}``. Missing ``content`` defaults to ``""`` and missing
    ``role`` to ``"user"``. Nothing is sent to the backend when there are no
    messages at all.

    Args:
        core: Memory core whose ``backend.ingest_batch`` receives the batch.
        haystack_sessions: Sessions, each an iterable of message dicts.

    Returns:
        Seconds spent building and ingesting the batch.
    """
    from coremem.types import Memory

    started = time.time()
    pending: list[Memory] = [
        Memory(
            # Empty id — presumably the backend assigns real IDs; verify
            # against the backend implementation.
            id="",
            content=message.get("content", ""),
            role=message.get("role", "user"),
            session_id=f"session_{position:04d}",
        )
        for position, messages in enumerate(haystack_sessions)
        for message in messages
    ]
    if pending:
        core.backend.ingest_batch(pending)
    return time.time() - started
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def run_retrieval_benchmark(
    data_path: str | Path,
    backend: str = "chroma",
    question_types: list[str] | None = None,
    limit: int | None = None,
    k: int = 5,
    verbose: bool = True,
    memory_base: str = "/tmp/coremem_bench",
) -> dict[str, Any]:
    """Run the Recall@K retrieval benchmark over a LongMemEval data file.

    For every question a fresh store is created at
    ``f"{memory_base}_{question_id}_{pid}"``, the question's haystack sessions
    are injected, the question text is searched with ``limit=k``, and the
    question counts as a hit when any mapped answer session appears among the
    results. The store directory is removed afterwards, even on error.

    Questions whose answer session IDs cannot be mapped onto the haystack are
    skipped; they are excluded from ``total`` and reported under ``skipped``.

    Args:
        data_path: Path to the LongMemEval JSON file.
        backend: Backend name, ``"chroma"`` or ``"hybrid"``.
        question_types: Optional filter on ``question_type``.
        limit: Optional max number of questions to evaluate.
        k: Number of search results considered for recall.
        verbose: Print per-question progress and a final summary.
        memory_base: Path prefix for the per-question store directories.

    Returns:
        Dict with ``backend``, ``k``, ``total`` (questions evaluated),
        ``skipped``, ``hits``, ``recall`` (0.0 when nothing was evaluated),
        ``by_type``, per-question ``results`` and ``elapsed_s``.

    Raises:
        ValueError: If no questions are loaded, or the backend name is unknown.
    """
    questions = _load_questions(str(data_path), question_types=question_types, limit=limit)

    if not questions:
        raise ValueError(f"No questions found in {data_path}")

    results: list[dict] = []
    type_scores: dict[str, list[bool]] = {}
    skipped = 0

    if verbose:
        print(f"Backend: {backend}")
        print(f"Questions: {len(questions)}")
        print(f"Recall: R@{k}")
        print("-" * 60)

    # perf_counter is monotonic, so durations are immune to clock adjustments.
    start_time = time.perf_counter()

    for qi, q in enumerate(questions):
        q_id = q.get("question_id", f"q_{qi}")
        q_text = q.get("question", "")
        q_type = q.get("question_type", "unknown")
        haystack_ids = q.get("haystack_session_ids", [])
        haystack = q.get("haystack_sessions", [])

        answer_sids = _map_answer_sids(haystack_ids, q.get("answer_session_ids", []))
        if not answer_sids:
            skipped += 1
            if verbose:
                print(f" [{qi+1}/{len(questions)}] {q_id}: SKIP (no answer IDs mapped)", flush=True)
            continue

        mem_path = f"{memory_base}_{q_id}_{os.getpid()}"

        try:
            # Build the backend inside the try so a store that was only
            # partly created is still removed by the finally clause.
            be = _setup_backend(backend, mem_path)
            core = MemoryCore(backend=be)

            inject_time = _inject_sessions_batch(core, haystack)

            t0 = time.perf_counter()
            search_results = core.search(q_text, limit=k)
            found = {r.memory.session_id for r in search_results}
            hits = found & set(answer_sids)
            is_hit = bool(hits)
            search_time = time.perf_counter() - t0

            results.append({
                "question_id": q_id,
                "question_type": q_type,
                "recall": is_hit,
                "sessions_injected": len(haystack),
                "inject_time_s": round(inject_time, 3),
                "search_time_s": round(search_time, 4),
                "matches": sorted(hits),
            })

            type_scores.setdefault(q_type, []).append(is_hit)

            if verbose:
                status = f"HIT ({len(hits)} of {len(answer_sids)})" if is_hit else "MISS"
                print(
                    f" [{qi+1}/{len(questions)}] {q_id} ({q_type}): {status} "
                    f"| inject={inject_time:.1f}s search={search_time:.4f}s",
                    flush=True,
                )

        finally:
            shutil.rmtree(mem_path, ignore_errors=True)

    elapsed = time.perf_counter() - start_time
    total_hits = sum(r["recall"] for r in results)
    # Report the true number of evaluated questions; guard only the division.
    total = len(results)
    overall = total_hits / total if total else 0.0

    if verbose:
        print("-" * 60)
        print(f"Overall R@{k}: {overall:.1%} ({total_hits}/{total})")
        for qt, scores in sorted(type_scores.items()):
            s = sum(scores)
            print(f" {qt}: {s}/{len(scores)} = {s/len(scores):.1%}")
        if skipped:
            print(f"Skipped: {skipped}")
        print(f"Time: {elapsed:.1f}s")

    return {
        "backend": backend,
        "k": k,
        "total": total,
        "skipped": skipped,
        "hits": total_hits,
        "recall": overall,
        "by_type": {
            qt: {"hits": sum(s), "total": len(s), "recall": sum(s) / len(s) if s else 0}
            for qt, s in type_scores.items()
        },
        "results": results,
        "elapsed_s": round(elapsed, 1),
    }
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def main():
    """Command-line entry point: parse options and run the retrieval benchmark.

    Options: ``--data`` (required path to the dataset JSON), ``--backend``
    (``chroma`` or ``hybrid``, default ``chroma``), ``--question-types``
    (zero or more type names), ``--limit`` (max questions) and ``--k``
    (recall cutoff, default 5). The benchmark always runs in verbose mode.
    """
    cli = argparse.ArgumentParser(description="coremem LongMemEval retrieval benchmark")

    # Flag name paired with the keyword arguments for add_argument, registered
    # in this order so the generated help lists them the same way.
    option_specs = (
        ("--data", {"type": str, "required": True, "help": "Path to longmemeval_s_cleaned.json"}),
        ("--backend", {"type": str, "default": "chroma", "choices": ["chroma", "hybrid"]}),
        ("--question-types", {"type": str, "nargs": "*", "default": None}),
        ("--limit", {"type": int, "default": None}),
        ("--k", {"type": int, "default": 5}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    run_retrieval_benchmark(
        data_path=opts.data,
        backend=opts.backend,
        question_types=opts.question_types,
        limit=opts.limit,
        k=opts.k,
        verbose=True,
    )


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|