coremem 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ .venv/
6
+ .pytest_cache/
7
+ *.db
8
+ vectors/
9
+
10
+ # Benchmark results
11
+ results/
coremem-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Eddy Vinck
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
coremem-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,170 @@
1
+ Metadata-Version: 2.4
2
+ Name: coremem
3
+ Version: 0.1.0
4
+ Summary: Zero-LLM memory retrieval for AI agents — semantic search and deterministic heuristics
5
+ Author: Eddy Vinck
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Keywords: ai-agents,llm,memory,retrieval,semantic-search
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Requires-Python: >=3.11
16
+ Requires-Dist: chromadb>=0.5.0
17
+ Requires-Dist: numpy>=1.24.0
18
+ Requires-Dist: pyyaml>=6.0
19
+ Requires-Dist: sentence-transformers>=2.0.0
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
22
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
23
+ Provides-Extra: hybrid
24
+ Requires-Dist: hybriddb>=0.2.0; extra == 'hybrid'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # CoreMem
28
+
29
+ > **Zero-LLM memory retrieval for AI agents.** CoreMem gives agents instant access to conversation history — semantic search plus deterministic retrieval heuristics, all without a single API call. Scores **98.0% R@5 on LongMemEval (500 questions)** in the Executive Assistant retrieval stack — no LLM, no tuning, no cloud.
30
+
31
+ > **Embedded. Local. Open source.** No external APIs, no vector DB services, no internet connection required. Runs entirely on-device with ChromaDB or HybridDB + sentence-transformers. Ships as a single Python package with zero infrastructure dependencies.
32
+
33
+ **Dual-backend architecture.** Drop-in backends (ChromaDB baseline, HybridDB enhanced) with the same API. Ranking pipeline: backend retrieval → deterministic heuristics → recency-aware rescoring → session-aware retrieval.
34
+
35
+ ```python
36
+ from coremem import MemoryCore
37
+ from coremem.backends.chroma import ChromaBackend
38
+
39
+ core = MemoryCore(backend=ChromaBackend(path="./memory"))
40
+
41
+ # Ingest conversation turns
42
+ core.ingest("user", "I visited the Museum of Modern Art today")
43
+ core.ingest("assistant", "That sounds wonderful! How was it?")
44
+ core.ingest("user", "I went to an Ancient Civilizations exhibition at the Natural History Museum")
45
+
46
+ # Search with deterministic heuristic reranking
47
+ results = core.search("When did I visit art museums?")
48
+
49
+ for r in results:
50
+ print(f"[{r.memory.ts}] [{r.memory.role}] {r.memory.content}")
51
+ ```
52
+
53
+ ## Why CoreMem?
54
+
55
+ Every AI agent needs memory. But cloud-based vector search is expensive, slow, and doesn't work offline. Pure embedding similarity misses keyword matches and temporal context. LLM-based memory systems cost tokens per query.
56
+
57
+ CoreMem solves all three:
58
+
59
+ | Component | What it does |
60
+ |-----------|-------------|
61
+ | **Semantic search** | Embedding similarity via ChromaDB or HybridDB |
62
+ | **Deterministic heuristics** | Keyword overlap, temporal recency, person-name boost, quoted-phrase matching |
63
+ | **Session deduplication** | One result per conversation, with full context retrieval |
64
+
65
+ ## LongMemEval Results (500 questions, no LLM, no tuning)
66
+
67
+ | Metric | Score |
68
+ |--------|-------|
69
+ | R@5 | **98.0%** |
70
+ | R@10 | **98.4%** |
71
+ | MRR | 0.944 |
72
+ | P@5 | 0.592 |
73
+ | F1@5 | 0.684 |
74
+ | Selectivity | 11.5% haystack scanned |
75
+ | Rank distribution | #1: 91.8%, #2-3: 5.0%, #4-5: 1.2%, #6-10: 0.4%, >10: 1.6% |
76
+
77
+ Outperforms MemPalace raw (96.6%) and matches their hybrid v4 held-out (98.4%) — with zero tuning, zero dev-set peeking.
78
+
79
+ ## Installation
80
+
81
+ ```bash
82
+ pip install coremem
83
+ ```
84
+
85
+ With HybridDB backend for enhanced FTS5 + vector hybrid search:
86
+
87
+ ```bash
88
+ pip install coremem[hybrid]
89
+ ```
90
+
91
+ ## Core Concepts
92
+
93
+ ### Backends
94
+
95
+ ```python
96
+ # ChromaDB baseline — pure vector search
97
+ from coremem.backends.chroma import ChromaBackend
98
+ core = MemoryCore(backend=ChromaBackend(path="./data"))
99
+
100
+ # HybridDB enhanced — FTS5 + vector hybrid search
101
+ from coremem.backends.hybrid import HybridBackend
102
+ core = MemoryCore(backend=HybridBackend(path="./data"))
103
+ ```
104
+
105
+ ### Ingestion
106
+
107
+ ```python
108
+ # Simple ingestion
109
+ core.ingest("user", "I built a Spitfire model kit", session_id="conv_001")
110
+
111
+ # Batch ingestion
112
+ from coremem import ingest_batch
113
+ ingest_batch(core, [
114
+ ("user", "What's the weather today?"),
115
+ ("assistant", "Sunny with a high of 72°F"),
116
+ ], session_id="conv_001")
117
+ ```
118
+
119
+ ### Search
120
+
121
+ ```python
122
+ # Basic search
123
+ results = core.search("How many model kits?", limit=10)
124
+
125
+ # Limit results
126
+ results = core.search("model building projects", limit=5)
127
+ ```
128
+
129
+ ### Heuristics
130
+
131
+ Deterministic, zero-LLM scoring boosts applied to every result:
132
+
133
+ | Heuristic | What it catches |
134
+ |-----------|----------------|
135
+ | `keyword_overlap` | Exact word matches between query and content |
136
+ | `temporal_boost` | Queries with "latest", "current", "recently" |
137
+ | `recency_decay` | Unconditional exponential decay (30-day half-life) |
138
+ | `person_name_boost` | Proper name mentions in content |
139
+ | `quoted_phrase_boost` | Exact phrase matches in quotes |
140
+
141
+ ```python
142
+ from coremem import SearchHeuristics
143
+
144
+ # Apply all heuristics to a single result
145
+ score = SearchHeuristics.apply_all(
146
+ query="latest project",
147
+ content="Just finished the Q3 project report",
148
+ score=0.75,
149
+ ts="2026-05-28T10:00:00Z",
150
+ )
151
+ ```
152
+
153
+ ### Wake-Up Context
154
+
155
+ Give the agent instant situational awareness:
156
+
157
+ ```python
158
+ context = core.wake_up(user_id="alice")
159
+ # Returns a compact string with L0 identity and L1 recent context.
160
+ ```
161
+
162
+ ## License
163
+
164
+ MIT — see [LICENSE](LICENSE).
165
+
166
+ ## Author
167
+
168
+ Eddy Vinck
169
+
170
+ CoreMem is the retrieval engine behind the [Executive Assistant](https://github.com/open-assistants-lab) agent system. Pairs with [HybridDB](https://github.com/open-assistants-lab) for storage and ConnectKit for real-time sync.
@@ -0,0 +1,144 @@
1
+ # CoreMem
2
+
3
+ > **Zero-LLM memory retrieval for AI agents.** CoreMem gives agents instant access to conversation history — semantic search plus deterministic retrieval heuristics, all without a single API call. Scores **98.0% R@5 on LongMemEval (500 questions)** in the Executive Assistant retrieval stack — no LLM, no tuning, no cloud.
4
+
5
+ > **Embedded. Local. Open source.** No external APIs, no vector DB services, no internet connection required. Runs entirely on-device with ChromaDB or HybridDB + sentence-transformers. Ships as a single Python package with zero infrastructure dependencies.
6
+
7
+ **Dual-backend architecture.** Drop-in backends (ChromaDB baseline, HybridDB enhanced) with the same API. Ranking pipeline: backend retrieval → deterministic heuristics → recency-aware rescoring → session-aware retrieval.
8
+
9
+ ```python
10
+ from coremem import MemoryCore
11
+ from coremem.backends.chroma import ChromaBackend
12
+
13
+ core = MemoryCore(backend=ChromaBackend(path="./memory"))
14
+
15
+ # Ingest conversation turns
16
+ core.ingest("user", "I visited the Museum of Modern Art today")
17
+ core.ingest("assistant", "That sounds wonderful! How was it?")
18
+ core.ingest("user", "I went to an Ancient Civilizations exhibition at the Natural History Museum")
19
+
20
+ # Search with deterministic heuristic reranking
21
+ results = core.search("When did I visit art museums?")
22
+
23
+ for r in results:
24
+ print(f"[{r.memory.ts}] [{r.memory.role}] {r.memory.content}")
25
+ ```
26
+
27
+ ## Why CoreMem?
28
+
29
+ Every AI agent needs memory. But cloud-based vector search is expensive, slow, and doesn't work offline. Pure embedding similarity misses keyword matches and temporal context. LLM-based memory systems cost tokens per query.
30
+
31
+ CoreMem solves all three:
32
+
33
+ | Component | What it does |
34
+ |-----------|-------------|
35
+ | **Semantic search** | Embedding similarity via ChromaDB or HybridDB |
36
+ | **Deterministic heuristics** | Keyword overlap, temporal recency, person-name boost, quoted-phrase matching |
37
+ | **Session deduplication** | One result per conversation, with full context retrieval |
38
+
39
+ ## LongMemEval Results (500 questions, no LLM, no tuning)
40
+
41
+ | Metric | Score |
42
+ |--------|-------|
43
+ | R@5 | **98.0%** |
44
+ | R@10 | **98.4%** |
45
+ | MRR | 0.944 |
46
+ | P@5 | 0.592 |
47
+ | F1@5 | 0.684 |
48
+ | Selectivity | 11.5% haystack scanned |
49
+ | Rank distribution | #1: 91.8%, #2-3: 5.0%, #4-5: 1.2%, #6-10: 0.4%, >10: 1.6% |
50
+
51
+ Outperforms MemPalace raw (96.6%) and matches their hybrid v4 held-out (98.4%) — with zero tuning, zero dev-set peeking.
52
+
53
+ ## Installation
54
+
55
+ ```bash
56
+ pip install coremem
57
+ ```
58
+
59
+ With HybridDB backend for enhanced FTS5 + vector hybrid search:
60
+
61
+ ```bash
62
+ pip install coremem[hybrid]
63
+ ```
64
+
65
+ ## Core Concepts
66
+
67
+ ### Backends
68
+
69
+ ```python
70
+ # ChromaDB baseline — pure vector search
71
+ from coremem.backends.chroma import ChromaBackend
72
+ core = MemoryCore(backend=ChromaBackend(path="./data"))
73
+
74
+ # HybridDB enhanced — FTS5 + vector hybrid search
75
+ from coremem.backends.hybrid import HybridBackend
76
+ core = MemoryCore(backend=HybridBackend(path="./data"))
77
+ ```
78
+
79
+ ### Ingestion
80
+
81
+ ```python
82
+ # Simple ingestion
83
+ core.ingest("user", "I built a Spitfire model kit", session_id="conv_001")
84
+
85
+ # Batch ingestion
86
+ from coremem import ingest_batch
87
+ ingest_batch(core, [
88
+ ("user", "What's the weather today?"),
89
+ ("assistant", "Sunny with a high of 72°F"),
90
+ ], session_id="conv_001")
91
+ ```
92
+
93
+ ### Search
94
+
95
+ ```python
96
+ # Basic search
97
+ results = core.search("How many model kits?", limit=10)
98
+
99
+ # Limit results
100
+ results = core.search("model building projects", limit=5)
101
+ ```
102
+
103
+ ### Heuristics
104
+
105
+ Deterministic, zero-LLM scoring boosts applied to every result:
106
+
107
+ | Heuristic | What it catches |
108
+ |-----------|----------------|
109
+ | `keyword_overlap` | Exact word matches between query and content |
110
+ | `temporal_boost` | Queries with "latest", "current", "recently" |
111
+ | `recency_decay` | Unconditional exponential decay (30-day half-life) |
112
+ | `person_name_boost` | Proper name mentions in content |
113
+ | `quoted_phrase_boost` | Exact phrase matches in quotes |
114
+
115
+ ```python
116
+ from coremem import SearchHeuristics
117
+
118
+ # Apply all heuristics to a single result
119
+ score = SearchHeuristics.apply_all(
120
+ query="latest project",
121
+ content="Just finished the Q3 project report",
122
+ score=0.75,
123
+ ts="2026-05-28T10:00:00Z",
124
+ )
125
+ ```
126
+
127
+ ### Wake-Up Context
128
+
129
+ Give the agent instant situational awareness:
130
+
131
+ ```python
132
+ context = core.wake_up(user_id="alice")
133
+ # Returns a compact string with L0 identity and L1 recent context.
134
+ ```
135
+
136
+ ## License
137
+
138
+ MIT — see [LICENSE](LICENSE).
139
+
140
+ ## Author
141
+
142
+ Eddy Vinck
143
+
144
+ CoreMem is the retrieval engine behind the [Executive Assistant](https://github.com/open-assistants-lab) agent system. Pairs with [HybridDB](https://github.com/open-assistants-lab) for storage and ConnectKit for real-time sync.
@@ -0,0 +1,104 @@
1
+ """LongMemEval benchmark adapter for coremem.
2
+
3
+ Direct-injection mode — injects haystack sessions into coremem,
4
+ runs search, measures Recall@K. No LLM. No HTTP. No agent loop.
5
+ Pure retrieval benchmarking.
6
+
7
+ Usage:
8
+ python -m coremem.benchmarks.longmemeval.eval --data longmemeval_s_cleaned.json --backend chroma --limit 5
9
+ python -m coremem.benchmarks.longmemeval.eval --data longmemeval_s_cleaned.json --backend hybrid --limit 10
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from coremem.core import MemoryCore
19
+
20
+
21
class LongMemEvalAdapter:
    """Feed LongMemEval haystacks into a memory core and score retrieval recall."""

    def __init__(self, core: MemoryCore):
        # The adapter only talks to the core through ``ingest_many`` and ``search``.
        self._core = core

    def inject_sessions(
        self,
        haystack_sessions: list[dict],
        verbose: bool = False,
    ) -> dict[str, list[str]]:
        """Ingest every haystack session under a position-derived session id.

        The session at position ``i`` is handed to the core's ``ingest_many``
        with ``session_id="session_{i:04d}"`` so that search results can later
        be traced back to the session they came from.

        Args:
            haystack_sessions: Sessions to ingest. Each entry is passed to
                ``ingest_many`` unchanged (presumably a list of
                ``{"role", "content"}`` message dicts -- confirm against the core).
            verbose: When true, print one progress line per session.

        Returns:
            Mapping of generated session id to the value ``ingest_many``
            returned for that session (expected to be the ingested memory IDs).
        """
        ingested: dict[str, list[str]] = {}
        for position, messages in enumerate(haystack_sessions):
            tag = f"session_{position:04d}"
            memory_ids = self._core.ingest_many(messages, session_id=tag)
            ingested[tag] = memory_ids
            if not verbose:
                continue
            print(f" Session {tag}: {len(memory_ids)}/{len(messages)} messages ingested", flush=True)
        return ingested

    def search(self, query: str, limit: int = 10) -> list[dict]:
        """Run a core search and flatten each hit into a plain summary dict.

        Each summary holds the hit's ``session_id``, the first 200 characters
        of its content, its ``score`` and its ``source``.
        """
        summaries: list[dict] = []
        for hit in self._core.search(query, limit=limit):
            summaries.append(
                {
                    "session_id": hit.memory.session_id,
                    "content": hit.memory.content[:200],  # truncated preview only
                    "score": hit.score,
                    "source": hit.source,
                }
            )
        return summaries

    def recall_at_k(self, query: str, answer_session_ids: list[str], k: int = 5) -> tuple[bool, int]:
        """Report whether any answer session shows up in the top-``k`` results.

        Returns:
            ``(is_hit, n_found)`` where ``n_found`` is the number of distinct
            answer sessions present among the retrieved session ids.
        """
        retrieved = set()
        for hit in self._core.search(query, limit=k):
            retrieved.add(hit.memory.session_id)
        overlap = retrieved.intersection(answer_session_ids)
        return bool(overlap), len(overlap)
75
+
76
+
77
def load_longmemeval_questions(
    data_path: str | Path,
    question_types: list[str] | None = None,
    limit: int | None = None,
) -> list[dict[str, Any]]:
    """Load LongMemEval questions from a JSON file.

    The file may hold either a top-level list of question dicts or an object
    with a ``"questions"`` list. Filtering by type happens before ``limit``
    is applied.

    Args:
        data_path: Path to the LongMemEval JSON data file.
        question_types: If given and non-empty, keep only questions whose
            ``question_type`` is one of these values.
        limit: Maximum number of questions to return. ``None`` means no
            limit; ``0`` returns an empty list.

    Returns:
        The question dicts exactly as stored in the file. This function only
        reads ``question_type``; LongMemEval records are expected to also
        carry keys such as ``question_id``, ``question``, ``answer``,
        ``answer_session_ids`` and ``haystack_sessions`` (not validated here).
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding, which breaks on non-ASCII text on some systems.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)

    questions = data if isinstance(data, list) else data.get("questions", [])

    if question_types:
        questions = [q for q in questions if q.get("question_type") in question_types]

    # ``is not None`` (rather than truthiness) so that limit=0 really means
    # "zero questions" instead of silently disabling the limit.
    if limit is not None:
        questions = questions[:limit]

    return questions
@@ -0,0 +1,230 @@
1
+ """LongMemEval retrieval benchmark runner for coremem.
2
+
3
+ Measures Recall@K without any LLM involvement. Two backends supported:
4
+ --backend chroma → ChromaBackend (baseline, target 95%+)
5
+ --backend hybrid → HybridBackend (enhanced, requires hybriddb)
6
+
7
+ Dataset format (LongMemEval):
8
+ Each question has: question_id, question_type, question, answer_session_ids,
9
+ haystack_session_ids, haystack_sessions.
10
+ haystack_session_ids[i] maps to haystack_sessions[i].
11
+
12
+ Injection: sessions are injected in batch, tagged as session_{i:04d}.
13
+
14
+ Recall check: for each aid in answer_session_ids, find its index in haystack_session_ids,
15
+ map that index to our injected id session_{index:04d}, and check whether it is in the top-K results.
16
+
17
+ Usage:
18
+ uv run python -m coremem.benchmarks.longmemeval.eval \
19
+ --data /tmp/lme_cache/.../longmemeval_s_cleaned.json \
20
+ --backend chroma \
21
+ --limit 20 \
22
+ --k 5
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import json
29
+ import os
30
+ import shutil
31
+ import time
32
+ from pathlib import Path
33
+ from typing import Any
34
+
35
+ from coremem.core import MemoryCore
36
+
37
+
38
def _load_questions(data_path: str, question_types: list[str] | None = None, limit: int | None = None) -> list[dict]:
    """Read benchmark questions from a JSON file, optionally filtered and capped.

    Accepted top-level shapes:
      * a list of question dicts;
      * an object with a ``"questions"`` list (the shape the adapter module's
        loader also accepts);
      * any other object, whose values are taken as the question dicts.

    Args:
        data_path: Path to the JSON data file.
        question_types: If given and non-empty, keep only questions whose
            ``question_type`` is one of these values.
        limit: Maximum number of questions to keep after filtering.
            ``None`` means no limit; ``0`` yields an empty list.

    Returns:
        The selected question dicts, in file order.
    """
    # JSON is UTF-8 by specification; do not rely on the locale's default encoding.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        # Unwrap {"questions": [...]} first: taking .values() of that wrapper
        # would produce a list containing one list, and every later
        # ``q.get(...)`` call would then fail.
        wrapped = data.get("questions")
        data = wrapped if isinstance(wrapped, list) else list(data.values())

    if question_types:
        data = [q for q in data if q.get("question_type") in question_types]

    # Explicit None check so that limit=0 is honoured rather than ignored.
    if limit is not None:
        data = data[:limit]

    return data
52
+
53
+
54
def _setup_backend(backend: str, path: str):
    """Build the storage backend selected by ``backend``, rooted at ``path``.

    The backend module is imported only once its name has been selected, so
    choosing ``"chroma"`` never imports the hybrid backend module.

    Raises:
        ValueError: If ``backend`` is neither ``"chroma"`` nor ``"hybrid"``.
    """
    if backend not in ("chroma", "hybrid"):
        raise ValueError(f"Unknown backend: {backend}. Use 'chroma' or 'hybrid'.")

    if backend == "chroma":
        from coremem.backends.chroma import ChromaBackend as backend_cls
    else:
        from coremem.backends.hybrid import HybridBackend as backend_cls

    return backend_cls(path=path)
65
+
66
+
67
def _map_answer_sids(
    haystack_session_ids: list[str],
    answer_session_ids: list[str] | str,
) -> list[str]:
    """Translate dataset answer-session ids into injected ``session_NNNN`` ids.

    An answer id is located in ``haystack_session_ids`` and replaced by
    ``session_{index:04d}``, the tag used when sessions are injected by
    position. Answer ids absent from the haystack are dropped. If a haystack
    id occurs more than once, its last position is used.

    Args:
        haystack_session_ids: Dataset session ids, in haystack order.
        answer_session_ids: One answer id, or a list of them.

    Returns:
        Injected session ids, in the order of the answer ids that were found.
    """
    wanted = [answer_session_ids] if isinstance(answer_session_ids, str) else answer_session_ids
    position_of = {original_id: pos for pos, original_id in enumerate(haystack_session_ids)}
    return [f"session_{position_of[aid]:04d}" for aid in wanted if aid in position_of]
81
+
82
+
83
def _inject_sessions_batch(core: MemoryCore, haystack_sessions: list) -> float:
    """Ingest every message of every haystack session with one backend batch call.

    Messages of the session at position ``i`` are tagged with
    ``session_id="session_{i:04d}"``. A missing ``content`` defaults to ``""``
    and a missing ``role`` to ``"user"``. The backend is not called at all
    when there are no messages.

    Args:
        core: Memory core whose ``backend.ingest_batch`` receives the batch.
        haystack_sessions: Sessions in haystack order, each an iterable of
            message dicts.

    Returns:
        Elapsed time in seconds for building and ingesting the batch.
    """
    from coremem.types import Memory

    # perf_counter() is monotonic and high resolution, so the measured
    # interval cannot be distorted by system clock adjustments the way a
    # time.time() difference can.
    started = time.perf_counter()
    batch: list[Memory] = []
    for si, session_messages in enumerate(haystack_sessions):
        sid = f"session_{si:04d}"
        for msg in session_messages:
            batch.append(Memory(
                id="",  # left empty here; presumably assigned downstream -- confirm
                content=msg.get("content", ""),
                role=msg.get("role", "user"),
                session_id=sid,
            ))
    if batch:
        core.backend.ingest_batch(batch)
    return time.perf_counter() - started
101
+
102
+
103
def run_retrieval_benchmark(
    data_path: str | Path,
    backend: str = "chroma",
    question_types: list[str] | None = None,
    limit: int | None = None,
    k: int = 5,
    verbose: bool = True,
    memory_base: str = "/tmp/coremem_bench",
) -> dict[str, Any]:
    """Run the retrieval benchmark and return Recall@K statistics.

    For every question a fresh backend is created at a per-question path, the
    question's haystack sessions are injected, one search is run, and the
    question counts as a hit when any mapped answer session id appears among
    the top-``k`` results. The per-question path is removed afterwards.
    Questions whose answer ids cannot be mapped onto the haystack are skipped
    and do not count towards the totals.

    Args:
        data_path: Path to the dataset JSON file.
        backend: Backend name, ``"chroma"`` or ``"hybrid"``.
        question_types: Optional filter on the ``question_type`` field.
        limit: Optional maximum number of questions to evaluate.
        k: Number of search results considered per question.
        verbose: Print progress and a summary to stdout.
        memory_base: Path prefix for the per-question backend storage.

    Returns:
        Dict with ``backend``, ``k``, ``total`` (questions actually scored),
        ``hits``, ``recall``, ``by_type``, per-question ``results`` and
        ``elapsed_s``.

    Raises:
        ValueError: If no questions were loaded from ``data_path``.
    """
    questions = _load_questions(str(data_path), question_types=question_types, limit=limit)

    if not questions:
        raise ValueError(f"No questions found in {data_path}")

    results: list[dict] = []
    type_scores: dict[str, list[bool]] = {}

    if verbose:
        print(f"Backend: {backend}")
        print(f"Questions: {len(questions)}")
        print(f"Recall: R@{k}")
        print("-" * 60)

    # perf_counter() is monotonic, unlike time.time(), so intervals are not
    # affected by system clock adjustments.
    start_time = time.perf_counter()

    for qi, q in enumerate(questions):
        q_id = q.get("question_id", f"q_{qi}")
        q_text = q.get("question", "")
        q_type = q.get("question_type", "unknown")
        haystack_ids = q.get("haystack_session_ids", [])
        haystack = q.get("haystack_sessions", [])

        answer_sids = _map_answer_sids(haystack_ids, q.get("answer_session_ids", []))
        if not answer_sids:
            if verbose:
                print(f" [{qi+1}/{len(questions)}] {q_id}: SKIP (no answer IDs mapped)", flush=True)
            continue

        # The pid suffix keeps concurrent benchmark processes from sharing a path.
        mem_path = f"{memory_base}_{q_id}_{os.getpid()}"

        try:
            # Backend/core construction lives inside the try so that anything
            # already written under mem_path is cleaned up even if setup fails.
            be = _setup_backend(backend, mem_path)
            core = MemoryCore(backend=be)

            inject_time = _inject_sessions_batch(core, haystack)

            t0 = time.perf_counter()
            search_results = core.search(q_text, limit=k)
            search_time = time.perf_counter() - t0

            found = {r.memory.session_id for r in search_results}
            hits = found & set(answer_sids)
            is_hit = len(hits) > 0

            results.append({
                "question_id": q_id,
                "question_type": q_type,
                "recall": is_hit,
                "sessions_injected": len(haystack),
                "inject_time_s": round(inject_time, 3),
                "search_time_s": round(search_time, 4),
                "matches": sorted(hits),
            })

            type_scores.setdefault(q_type, []).append(is_hit)

            if verbose:
                status = f"HIT ({len(hits)} of {len(answer_sids)})" if is_hit else "MISS"
                print(
                    f" [{qi+1}/{len(questions)}] {q_id} ({q_type}): {status} "
                    f"| inject={inject_time:.1f}s search={search_time:.4f}s",
                    flush=True,
                )

        finally:
            shutil.rmtree(mem_path, ignore_errors=True)

    elapsed = time.perf_counter() - start_time
    total_hits = sum(r["recall"] for r in results)
    # Report the real number of scored questions; when every question was
    # skipped this is 0 (recall 0.0) rather than a made-up total of 1.
    total = len(results)
    overall = total_hits / total if total else 0.0

    if verbose:
        print("-" * 60)
        print(f"Overall R@{k}: {overall:.1%} ({total_hits}/{total})")
        for qt, scores in sorted(type_scores.items()):
            s = sum(scores)
            print(f" {qt}: {s}/{len(scores)} = {s/len(scores):.1%}")
        print(f"Time: {elapsed:.1f}s")

    return {
        "backend": backend,
        "k": k,
        "total": total,
        "hits": total_hits,
        "recall": overall,
        "by_type": {
            qt: {"hits": sum(s), "total": len(s), "recall": sum(s) / len(s) if s else 0}
            for qt, s in type_scores.items()
        },
        "results": results,
        "elapsed_s": round(elapsed, 1),
    }
206
+
207
+
208
def main():
    """Command-line entry point: parse the benchmark options and run it verbosely."""
    cli = argparse.ArgumentParser(description="coremem LongMemEval retrieval benchmark")
    cli.add_argument(
        "--data",
        required=True,
        type=str,
        help="Path to longmemeval_s_cleaned.json",
    )
    cli.add_argument("--backend", choices=["chroma", "hybrid"], default="chroma", type=str)
    cli.add_argument("--question-types", nargs="*", default=None, type=str)
    cli.add_argument("--limit", default=None, type=int)
    cli.add_argument("--k", default=5, type=int)
    opts = cli.parse_args()

    # Collect the settings first, then hand them over in a single call.
    settings = {
        "data_path": opts.data,
        "backend": opts.backend,
        "question_types": opts.question_types,
        "limit": opts.limit,
        "k": opts.k,
        "verbose": True,
    }
    run_retrieval_benchmark(**settings)


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()