mindcore-memory 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mindcore_memory-0.1.0.dist-info/METADATA +217 -0
- mindcore_memory-0.1.0.dist-info/RECORD +10 -0
- mindcore_memory-0.1.0.dist-info/WHEEL +4 -0
- mindcore_memory-0.1.0.dist-info/entry_points.txt +2 -0
- mindcore_memory-0.1.0.dist-info/licenses/LICENSE +21 -0
- src/__init__.py +13 -0
- src/eval_framework.py +353 -0
- src/http_app.py +118 -0
- src/memory_engine.py +369 -0
- src/server.py +334 -0
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mindcore-memory
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP Server for AI long-term memory and context management
|
|
5
|
+
Author-email: woshilaohei <1410770089@qq.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: agent,ai,context,llm,mcp,memory
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Requires-Dist: json-rpc>=1.5.0
|
|
17
|
+
Requires-Dist: mcp[cli]>=1.0.0
|
|
18
|
+
Requires-Dist: pydantic>=2.0.0
|
|
19
|
+
Requires-Dist: structlog>=24.0.0
|
|
20
|
+
Requires-Dist: tinydb>=4.8.0
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: mypy>=1.9.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: ruff>=0.3.0; extra == 'dev'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# 🧠 MindCore Memory MCP
|
|
30
|
+
|
|
31
|
+
**让 AI 记住一切,不再遗忘。生产级长期记忆 MCP Server。**
|
|
32
|
+
|
|
33
|
+
> "The best AI agent isn't the smartest — it's the one that remembers."
|
|
34
|
+
|
|
35
|
+
[GitHub Stars](https://github.com/woshilaohei/mindcore-memory-mcp/stargazers)
|
|
36
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
37
|
+
[Python 3.10+](https://python.org)
|
|
38
|
+
|
|
39
|
+
## ⚡ 一句话价值
|
|
40
|
+
|
|
41
|
+
**MindCore Memory** 解决 AI Agent 最大痛点:上下文窗口有限、长对话信息丢失、跨session记忆断裂。
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## 🎯 解决什么问题
|
|
46
|
+
|
|
47
|
+
| 痛点 | 现状 | MindCore Memory |
|
|
48
|
+
|------|------|----------------|
|
|
49
|
+
| AI 上下文忘性大 | 对话结束什么都忘 | ✅ 持久化长期记忆 |
|
|
50
|
+
| 跨session无法回忆 | 每次都重新教 | ✅ 跨会话知识复用 |
|
|
51
|
+
| 记忆混乱无优先级 | 所有记忆权重一样 | ✅ 重要性分级+置信度 |
|
|
52
|
+
| RAG暴力灌入 | 上下文过载质量下降 | ✅ 精准上下文窗口 |
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## 🚀 3行上手
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# 1. 安装
|
|
60
|
+
pip install mindcore-memory
|
|
61
|
+
|
|
62
|
+
# 2. 启动 MCP Server
|
|
63
|
+
mindcore-memory
|
|
64
|
+
|
|
65
|
+
# 3. 在你的 AI Agent 中调用
|
|
66
|
+
memory_id = memory_store("用户说他叫张三,周三有空")
|
|
67
|
+
context = memory_recall("用户的时间安排")
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## 📊 Eval Framework 实测
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
✅ Storage Integrity: 100% (存储持久化正确)
|
|
76
|
+
✅ Recall Relevance: 100% (相关记忆优先召回)
|
|
77
|
+
✅ Confidence Calibration: 100% (置信度正确校准)
|
|
78
|
+
✅ Importance Weighting: 100% (高优先级记忆排名靠前)
|
|
79
|
+
✅ Context Efficiency: 100% (上下文窗口不过载)
|
|
80
|
+
|
|
81
|
+
Overall Score: 100%
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## 🔧 核心工具
|
|
87
|
+
|
|
88
|
+
### `memory_store` - 存储记忆
|
|
89
|
+
```python
|
|
90
|
+
memory_store(
|
|
91
|
+
content="Python是荷兰人Guido van Rossum创建的",
|
|
92
|
+
importance=3, # 1-4级重要性
|
|
93
|
+
tags=["python", "history"],
|
|
94
|
+
confidence=0.95, # 置信度
|
|
95
|
+
source="agent" # agent/user/tool
|
|
96
|
+
)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### `memory_recall` - 召回记忆
|
|
100
|
+
```python
|
|
101
|
+
memory_recall(
|
|
102
|
+
query="Python创始人是谁",
|
|
103
|
+
tags=["python"], # 可选标签过滤
|
|
104
|
+
limit=10 # 返回数量
|
|
105
|
+
)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### `memory_context` - 构建上下文窗口
|
|
109
|
+
```python
|
|
110
|
+
# 为当前任务构建最优上下文(自动去重+优先级排序)
|
|
111
|
+
context = memory_context(
|
|
112
|
+
query="当前项目状态",
|
|
113
|
+
max_tokens=2000 # 自动截断
|
|
114
|
+
)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### `memory_stats` - 系统状态
|
|
118
|
+
```python
|
|
119
|
+
# 查看记忆统计:总数/分布/置信度
|
|
120
|
+
stats = memory_stats()
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## 💰 定价
|
|
126
|
+
|
|
127
|
+
| 方案 | 价格 | 能力 |
|
|
128
|
+
|------|------|------|
|
|
129
|
+
| **Free** | $0/月 | 100次存储/天 |
|
|
130
|
+
| **Pro** | $25/月 | 无限次 + 优先队列 |
|
|
131
|
+
| **Enterprise** | $99/月 | 私有部署 + SLA |
|
|
132
|
+
|
|
133
|
+
**[→ 获取 API Key](https://github.com/woshilaohei/mindcore-memory-mcp/wiki/API-Key)**
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## 🏗️ 项目结构
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
mindcore-memory-mcp/
|
|
141
|
+
├── src/
|
|
142
|
+
│ ├── memory_engine.py # 核心记忆引擎
|
|
143
|
+
│ ├── server.py # MCP Server(stdio+HTTP双传输)
|
|
144
|
+
│ ├── http_app.py # HTTP端点(生产部署)
|
|
145
|
+
│ └── eval_framework.py # 评测框架
|
|
146
|
+
├── tests/
|
|
147
|
+
│ └── test_memory.py # 单元测试
|
|
148
|
+
├── .github/workflows/
|
|
149
|
+
│ └── ci.yml # CI/CD
|
|
150
|
+
├── pyproject.toml
|
|
151
|
+
├── README.md
|
|
152
|
+
└── LICENSE
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## 🔌 集成方式
|
|
158
|
+
|
|
159
|
+
### Claude Desktop
|
|
160
|
+
```json
|
|
161
|
+
{
|
|
162
|
+
"mcpServers": {
|
|
163
|
+
"mindcore-memory": {
|
|
164
|
+
"command": "mindcore-memory",
"args": []
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### VS Code AI
|
|
172
|
+
直接在扩展市场搜索 `MindCore Memory`。
|
|
173
|
+
|
|
174
|
+
### HTTP API(生产环境)
|
|
175
|
+
```bash
|
|
176
|
+
curl -X POST http://localhost:8080/mcp \
|
|
177
|
+
-H "Content-Type: application/json" \
|
|
178
|
+
-H "Authorization: Bearer YOUR_TOKEN" \
|
|
179
|
+
-d '{"jsonrpc":"2.0","method":"tools/call","params":{"name":"memory_store","arguments":{"content":"test"}},"id":1}'
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## 📐 生产级标准
|
|
185
|
+
|
|
186
|
+
| 标准 | 实现 |
|
|
187
|
+
|------|------|
|
|
188
|
+
| **JSON-RPC 2.0** | ✅ stdio + HTTP 双传输 |
|
|
189
|
+
| **Bearer Token认证** | ✅ HTTP端点可选认证 |
|
|
190
|
+
| **输入验证** | ✅ Pydantic schemas |
|
|
191
|
+
| **CI/CD** | ✅ GitHub Actions |
|
|
192
|
+
| **单元测试** | ✅ pytest + 覆盖率 |
|
|
193
|
+
| **Eval Framework** | ✅ 5项核心指标 |
|
|
194
|
+
| **可观测性** | ✅ structlog完整日志 |
|
|
195
|
+
| **用户数据主权** | ✅ JSONL本地文件,无vendor lock-in |
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## 🤝 贡献
|
|
200
|
+
|
|
201
|
+
欢迎提交 Issue 和 PR!
|
|
202
|
+
|
|
203
|
+
## 📄 许可证
|
|
204
|
+
|
|
205
|
+
MIT License - 详见 [LICENSE](LICENSE)
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
<!-- MCP registry metadata -->
|
|
210
|
+
<!-- mcp-name: io.github.woshilaohei/mindcore-memory-mcp -->
|
|
211
|
+
<!-- mcp-registry: https://registry.modelcontextprotocol.io -->
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
<p align="center">
|
|
216
|
+
<strong>让 AI 拥有记忆,让人类更信任 AI。</strong>
|
|
217
|
+
</p>
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
src/__init__.py,sha256=i4fAVvxNR2DF04TTLWlWvxjsXpcKhfwFT0AAJpUFci0,240
|
|
2
|
+
src/eval_framework.py,sha256=oemrW3pT9kEJ0hIjIkm9ZeUxwDoMpt_BnevY0p_2wq4,12121
|
|
3
|
+
src/http_app.py,sha256=61XxTqS5RAxNGJBxBWD-r8lt_kW0fnfujOnimn4tE1U,3841
|
|
4
|
+
src/memory_engine.py,sha256=YJ-JVy14y-3KXhjE4lNse1-_dgqzstgrOTSVv9pCi1o,12161
|
|
5
|
+
src/server.py,sha256=VQkNzwfH5OmwcbvKqRnU6M5VuE6T4EzwQNAyW4dU8tg,11875
|
|
6
|
+
mindcore_memory-0.1.0.dist-info/METADATA,sha256=2UHEHhqkUI7O2kkWizoJwh1WshygfO7nWcdub8gt4UM,5836
|
|
7
|
+
mindcore_memory-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
8
|
+
mindcore_memory-0.1.0.dist-info/entry_points.txt,sha256=xxOMRvMeospwKLJAx7VCMpVPHhJn8qbVIHb0Ne21_WY,64
|
|
9
|
+
mindcore_memory-0.1.0.dist-info/licenses/LICENSE,sha256=JM14F0XK3MRSFCieT2hZl5P20kbBxUymAiw0bxer24E,1068
|
|
10
|
+
mindcore_memory-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 woshilaohei
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
src/__init__.py
ADDED
src/eval_framework.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation Framework for MindCore Memory MCP.
|
|
3
|
+
|
|
4
|
+
Measures:
|
|
5
|
+
1. Memory Recall Accuracy - Does the right memory come back?
|
|
6
|
+
2. Context Window Efficiency - How much relevant info per token?
|
|
7
|
+
3. Confidence Calibration - Are confidence scores accurate?
|
|
8
|
+
4. Storage Integrity - Are memories persisted correctly?
|
|
9
|
+
5. Importance Weighting - Are higher-importance memories ranked first?
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
pytest tests/test_eval.py -v
|
|
13
|
+
python -m mindcore_memory.eval_framework
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import shutil
|
|
21
|
+
import tempfile
|
|
22
|
+
import uuid
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Optional
|
|
26
|
+
|
|
27
|
+
import structlog
|
|
28
|
+
|
|
29
|
+
logger = structlog.get_logger()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class EvalResult:
    """Outcome of one evaluation check."""

    # Display name of the check, e.g. "Storage Integrity".
    name: str
    # Whether the check met its pass criterion.
    passed: bool
    # Score in the range 0.0-1.0.
    score: float
    # Human-readable explanation of the outcome.
    details: str
    # Time spent running the check, in milliseconds.
    duration_ms: float
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class EvalSuite:
    """Aggregated outcome of one full evaluation run."""

    # Number of checks that were executed.
    total: int
    # Number of checks whose result has ``passed`` set.
    passed: int
    # Number of checks that did not pass.
    failed: int
    # Per-check results, in execution order.
    results: list[EvalResult]
    # Aggregate score of the run, printed as a percentage.
    overall_score: float
    # Time of the run as a string.
    timestamp: str

    def print_summary(self) -> None:
        """Print a framed, human-readable report of the run to stdout."""
        rule = "=" * 60

        # Header: totals and the aggregate score.
        print(f"\n{rule}")
        print(" MindCore Memory Eval Suite")
        print(rule)
        print(f" Total: {self.total} | Passed: {self.passed} | Failed: {self.failed}")
        print(f" Overall Score: {self.overall_score:.1%}")
        print(rule)

        # One line per check, prefixed with a pass/fail marker.
        for result in self.results:
            marker = "✅" if result.passed else "❌"
            print(f" {marker} [{result.score:.0%}] {result.name}: {result.details}")

        print(f"{rule}\n")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class MemoryEvaluator:
    """Evaluation harness that exercises a ``MemoryEngine`` end to end.

    Every ``eval_*`` method stores a few fixture memories in the engine,
    queries them back and condenses the outcome into an ``EvalResult``.
    All evals share one engine instance, so memories stored by one eval stay
    visible to the evals that run after it.
    """

    def __init__(self, storage_path: Optional[str] = None):
        """Create the evaluator and the engine under test.

        Args:
            storage_path: Directory handed to ``MemoryEngine``. When omitted
                (or empty) a fresh temporary directory is created; only such
                a self-created directory is removed by ``cleanup()``.
        """
        # Remember whether the directory is ours. cleanup() must never delete
        # a directory supplied by the caller, and must always delete the one
        # created here regardless of where the platform puts temp files.
        self._owns_storage = not storage_path
        if storage_path:
            self.storage_path = Path(storage_path)
        else:
            self.storage_path = Path(tempfile.mkdtemp(prefix="mindcore_eval_"))

        # Imported here rather than at module level to avoid a circular import.
        try:
            from memory_engine import MemoryEngine
        except ModuleNotFoundError as exc:
            if exc.name != "memory_engine":
                # A dependency of memory_engine is missing; do not mask that.
                raise
            # Flat module name not importable (e.g. this module was imported
            # as part of a package): fall back to the sibling module.
            from .memory_engine import MemoryEngine
        self.engine = MemoryEngine(storage_path=str(self.storage_path))

    def cleanup(self) -> None:
        """Remove the temporary storage directory created by ``__init__``.

        A caller-supplied ``storage_path`` is left untouched. Removal errors
        are ignored.
        """
        if self._owns_storage:
            shutil.rmtree(self.storage_path, ignore_errors=True)

    def _run_check(self, name: str, check) -> EvalResult:
        """Time ``check`` and wrap its outcome in an ``EvalResult``.

        Args:
            name: Display name recorded on the result.
            check: Zero-argument callable returning ``(passed, score, details)``.

        Returns:
            The result of the check. Any exception raised by ``check`` is
            reported as a failed result with score 0.0 instead of propagating,
            so one broken eval cannot abort the whole suite.
        """
        import time

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start = time.perf_counter()
        try:
            passed, score, details = check()
        except Exception as e:
            passed, score, details = False, 0.0, f"Exception: {e}"
        return EvalResult(
            name=name,
            passed=passed,
            score=score,
            details=details,
            duration_ms=(time.perf_counter() - start) * 1000,
        )

    def eval_storage_integrity(self) -> EvalResult:
        """Test 1: Memories are stored and retrieved correctly."""

        def check() -> tuple[bool, float, str]:
            # Store a memory.
            test_id = self.engine.store(
                content="The capital of France is Paris.",
                importance=3,
                tags=["geography", "fact"],
                confidence=0.95,
            )

            # Retrieve it.
            results = self.engine.recall("France capital")
            if not results:
                return False, 0.0, "Memory stored but not retrieved"

            # The top hit must be the memory just stored, with its content
            # and tags intact.
            retrieved = results[0].memory
            correct = (
                retrieved.id == test_id
                and "Paris" in retrieved.content
                and "geography" in retrieved.tags
            )
            return (
                correct,
                1.0 if correct else 0.5,
                f"Stored and retrieved correctly: {correct}",
            )

        return self._run_check("Storage Integrity", check)

    def eval_recall_relevance(self) -> EvalResult:
        """Test 2: Relevant memories are ranked higher than irrelevant ones."""

        def check() -> tuple[bool, float, str]:
            # Two memories that mention Python, two that do not.
            self.engine.store("Python is a programming language", importance=3, tags=["tech"])
            self.engine.store("The sky is blue", importance=1, tags=["fact"])
            self.engine.store("Python was created by Guido van Rossum", importance=3, tags=["tech", "python"])
            self.engine.store("The ocean is deep", importance=1, tags=["fact"])

            results = self.engine.recall("Python programming", limit=10)

            # python_in_top2 is informational only; the pass criterion is
            # that both Python memories appear within the top 4 results.
            python_in_top2 = any("Python" in r.memory.content for r in results[:2])
            python_count = sum(1 for r in results[:4] if "Python" in r.memory.content)
            passed = python_count >= 2

            return (
                passed,
                1.0 if passed else 0.6,
                f"Python memories in top4: {python_count}/2, Python in top2: {python_in_top2}",
            )

        return self._run_check("Recall Relevance", check)

    def eval_confidence_calibration(self) -> EvalResult:
        """Test 3: Confidence scores are properly set and retrievable."""

        def check() -> tuple[bool, float, str]:
            # Store one high-confidence and one low-confidence memory.
            self.engine.store("Water boils at 100°C", importance=3, confidence=0.98)
            self.engine.store("I think the meeting is at 3pm", importance=1, confidence=0.4)

            results = self.engine.recall("boiling point water")
            results2 = self.engine.recall("meeting time")
            if not results or not results2:
                return False, 0.0, "Memories not retrieved"

            # NOTE(review): confidence is read from the recall result itself,
            # not from its .memory, and the top hit is not verified to be the
            # memory stored above - confirm against the engine's result type.
            retrieved_high = results[0].confidence
            retrieved_low = results2[0].confidence

            # Calibration check: high should be > low.
            calibrated = retrieved_high > retrieved_low
            return (
                calibrated,
                1.0 if calibrated else 0.5,
                f"High={retrieved_high}, Low={retrieved_low}, Calibrated={calibrated}",
            )

        return self._run_check("Confidence Calibration", check)

    def eval_importance_weighting(self) -> EvalResult:
        """Test 4: Higher importance memories are retrieved first."""

        def check() -> tuple[bool, float, str]:
            # Same content, different importance.
            self.engine.store("Meeting notes from yesterday", importance=1, tags=["work"])
            self.engine.store("Meeting notes from yesterday", importance=4, tags=["work"])

            results = self.engine.recall("meeting notes work", limit=5)
            if not results:
                return False, 0.0, "No memories retrieved"

            # The top result should be the importance=4 one.
            top = results[0]
            correct = top.memory.importance == 4
            return (
                correct,
                1.0 if correct else 0.5,
                f"Top importance: {top.memory.importance} (expected 4)",
            )

        return self._run_check("Importance Weighting", check)

    def eval_context_window_efficiency(self) -> EvalResult:
        """Test 5: Context window builds efficiently within token limit."""

        def check() -> tuple[bool, float, str]:
            # Store many memories, cycling importance through 1..4.
            for i in range(20):
                self.engine.store(
                    f"Memory number {i} about project X",
                    importance=(i % 4) + 1,
                    tags=["project"],
                )

            # Build the context window with a 500-token budget (~2000 chars).
            context = self.engine.get_context_window(
                query="project X",
                max_tokens=500,
            )
            char_count = len(context)

            # NOTE(review): the pass threshold is 2500 chars while the
            # reported limit is 500 * 4 = 2000 - confirm the slack is intended.
            under_limit = char_count < 2500
            return (
                under_limit,
                1.0 if under_limit else 0.5,
                f"Context: {char_count} chars (limit ~{500*4})",
            )

        return self._run_check("Context Window Efficiency", check)

    def run_all(self) -> EvalSuite:
        """Run the full evaluation suite.

        Returns:
            An ``EvalSuite`` holding every result in execution order, the
            pass/fail counts, the mean score and a naive-UTC ISO timestamp.
        """
        from datetime import datetime, timezone

        tests = [
            self.eval_storage_integrity,
            self.eval_recall_relevance,
            self.eval_confidence_calibration,
            self.eval_importance_weighting,
            self.eval_context_window_efficiency,
        ]

        results = []
        for test in tests:
            r = test()
            results.append(r)
            status = "PASS" if r.passed else "FAIL"
            logger.info("eval_result", name=r.name, status=status, score=r.score)

        passed = sum(1 for r in results if r.passed)
        total = len(results)
        overall_score = sum(r.score for r in results) / total

        # Same naive-UTC ISO string that datetime.utcnow() produced, without
        # calling the API deprecated since Python 3.12.
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        return EvalSuite(
            total=total,
            passed=passed,
            failed=total - passed,
            results=results,
            overall_score=overall_score,
            timestamp=timestamp,
        )
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def main():
    """Run the evaluation suite from the command line.

    Prints the summary, removes the evaluator's storage, then exits with
    status 0 when every eval passed and 1 otherwise.
    """
    import sys

    evaluator = MemoryEvaluator()
    try:
        suite = evaluator.run_all()
        suite.print_summary()
        # Non-zero exit status if any test failed.
        exit_code = 1 if suite.failed else 0
    finally:
        evaluator.cleanup()
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
|