agmem-0.1.1-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package exactly as it appears in its public registry.
- agmem-0.1.1.dist-info/METADATA +656 -0
- agmem-0.1.1.dist-info/RECORD +67 -0
- agmem-0.1.1.dist-info/WHEEL +5 -0
- agmem-0.1.1.dist-info/entry_points.txt +2 -0
- agmem-0.1.1.dist-info/licenses/LICENSE +21 -0
- agmem-0.1.1.dist-info/top_level.txt +1 -0
- memvcs/__init__.py +9 -0
- memvcs/cli.py +178 -0
- memvcs/commands/__init__.py +23 -0
- memvcs/commands/add.py +258 -0
- memvcs/commands/base.py +23 -0
- memvcs/commands/blame.py +169 -0
- memvcs/commands/branch.py +110 -0
- memvcs/commands/checkout.py +101 -0
- memvcs/commands/clean.py +76 -0
- memvcs/commands/clone.py +91 -0
- memvcs/commands/commit.py +174 -0
- memvcs/commands/daemon.py +267 -0
- memvcs/commands/diff.py +157 -0
- memvcs/commands/fsck.py +203 -0
- memvcs/commands/garden.py +107 -0
- memvcs/commands/graph.py +151 -0
- memvcs/commands/init.py +61 -0
- memvcs/commands/log.py +103 -0
- memvcs/commands/mcp.py +59 -0
- memvcs/commands/merge.py +88 -0
- memvcs/commands/pull.py +65 -0
- memvcs/commands/push.py +143 -0
- memvcs/commands/reflog.py +52 -0
- memvcs/commands/remote.py +51 -0
- memvcs/commands/reset.py +98 -0
- memvcs/commands/search.py +163 -0
- memvcs/commands/serve.py +54 -0
- memvcs/commands/show.py +125 -0
- memvcs/commands/stash.py +97 -0
- memvcs/commands/status.py +112 -0
- memvcs/commands/tag.py +117 -0
- memvcs/commands/test.py +132 -0
- memvcs/commands/tree.py +156 -0
- memvcs/core/__init__.py +21 -0
- memvcs/core/config_loader.py +245 -0
- memvcs/core/constants.py +12 -0
- memvcs/core/diff.py +380 -0
- memvcs/core/gardener.py +466 -0
- memvcs/core/hooks.py +151 -0
- memvcs/core/knowledge_graph.py +381 -0
- memvcs/core/merge.py +474 -0
- memvcs/core/objects.py +323 -0
- memvcs/core/pii_scanner.py +343 -0
- memvcs/core/refs.py +447 -0
- memvcs/core/remote.py +278 -0
- memvcs/core/repository.py +522 -0
- memvcs/core/schema.py +414 -0
- memvcs/core/staging.py +227 -0
- memvcs/core/storage/__init__.py +72 -0
- memvcs/core/storage/base.py +359 -0
- memvcs/core/storage/gcs.py +308 -0
- memvcs/core/storage/local.py +182 -0
- memvcs/core/storage/s3.py +369 -0
- memvcs/core/test_runner.py +371 -0
- memvcs/core/vector_store.py +313 -0
- memvcs/integrations/__init__.py +5 -0
- memvcs/integrations/mcp_server.py +267 -0
- memvcs/integrations/web_ui/__init__.py +1 -0
- memvcs/integrations/web_ui/server.py +352 -0
- memvcs/utils/__init__.py +9 -0
- memvcs/utils/helpers.py +178 -0

memvcs/core/test_runner.py
@@ -0,0 +1,371 @@
"""
Test runner for agmem memory tests.

Implements CI/CD-style testing for agent memory to prevent hallucinated facts
from corrupting the knowledge base.
"""

import os
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
from datetime import datetime

try:
    import yaml
    YAML_AVAILABLE = True
except ImportError:
    YAML_AVAILABLE = False


@dataclass
class TestCase:
    """A single test case for memory validation."""
    name: str
    query: str
    expected_fact: str
    confidence_threshold: float = 0.7
    required: bool = False  # If True, blocks commit on failure
    tags: List[str] = field(default_factory=list)


@dataclass
class TestFailure:
    """Represents a failed test."""
    test_name: str
    query: str
    expected: str
    actual: Optional[str]
    message: str
    is_critical: bool = False


@dataclass
class TestResult:
    """Result of running memory tests."""
    passed: bool
    total_count: int
    passed_count: int
    failed_count: int
    failures: List[TestFailure] = field(default_factory=list)
    duration_ms: int = 0


class TestRunner:
    """
    Runner for memory regression tests.

    Tests are defined in YAML files in the tests/ directory of the memory repo.
    """

    def __init__(self, repo, vector_store=None):
        """
        Initialize test runner.

        Args:
            repo: Repository instance
            vector_store: Optional VectorStore for semantic search tests
        """
        self.repo = repo
        self.vector_store = vector_store
        self.tests_dir = repo.root / 'tests'

    def load_tests(self) -> List[TestCase]:
        """
        Load all test cases from the tests/ directory.

        Returns:
            List of TestCase objects
        """
        tests = []

        if not self.tests_dir.exists():
            return tests

        for test_file in self.tests_dir.glob('**/*.yaml'):
            tests.extend(self._load_test_file(test_file))

        for test_file in self.tests_dir.glob('**/*.yml'):
            tests.extend(self._load_test_file(test_file))

        for test_file in self.tests_dir.glob('**/*.json'):
            tests.extend(self._load_json_test_file(test_file))

        return tests

    def _load_test_file(self, path: Path) -> List[TestCase]:
        """Load tests from a YAML file."""
        if not YAML_AVAILABLE:
            return []

        try:
            with open(path) as f:
                data = yaml.safe_load(f)

            if not data or 'tests' not in data:
                return []

            tests = []
            file_name = path.stem

            for i, test_data in enumerate(data['tests']):
                name = test_data.get('name', f"{file_name}_{i}")
                tests.append(TestCase(
                    name=name,
                    query=test_data['query'],
                    expected_fact=test_data['expected_fact'],
                    confidence_threshold=test_data.get('confidence_threshold', 0.7),
                    required=test_data.get('required', False),
                    tags=test_data.get('tags', [])
                ))

            return tests

        except Exception as e:
            print(f"Warning: Failed to load test file {path}: {e}")
            return []

    def _load_json_test_file(self, path: Path) -> List[TestCase]:
        """Load tests from a JSON file."""
        try:
            with open(path) as f:
                data = json.load(f)

            if not data:
                return []

            # Support both array of tests and object with 'tests' key
            if isinstance(data, list):
                test_list = data
            elif 'tests' in data:
                test_list = data['tests']
            else:
                return []

            tests = []
            file_name = path.stem

            for i, test_data in enumerate(test_list):
                name = test_data.get('name', f"{file_name}_{i}")
                tests.append(TestCase(
                    name=name,
                    query=test_data['query'],
                    expected_fact=test_data['expected_fact'],
                    confidence_threshold=test_data.get('confidence_threshold', 0.7),
                    required=test_data.get('required', False),
                    tags=test_data.get('tags', [])
                ))

            return tests

        except Exception as e:
            print(f"Warning: Failed to load test file {path}: {e}")
            return []

    def run_test(self, test: TestCase) -> Optional[TestFailure]:
        """
        Run a single test case.

        Returns:
            TestFailure if test failed, None if passed
        """
        # If we have a vector store, use semantic search
        if self.vector_store:
            return self._run_semantic_test(test)
        else:
            # Fall back to simple text matching
            return self._run_text_test(test)

    def _run_semantic_test(self, test: TestCase) -> Optional[TestFailure]:
        """Run test using semantic search."""
        try:
            # NOTE: this expects a store whose search() accepts k= and returns
            # dicts with 'content' and 'similarity' keys; VectorStore.search()
            # in core/vector_store.py takes limit= and returns tuples, so a
            # compatible wrapper (cf. search_with_provenance) is assumed here.
            results = self.vector_store.search(test.query, k=5)

            if not results:
                return TestFailure(
                    test_name=test.name,
                    query=test.query,
                    expected=test.expected_fact,
                    actual=None,
                    message="No results found for query",
                    is_critical=test.required
                )

            # Check if any result contains the expected fact
            for result in results:
                content = result.get('content', '')
                similarity = result.get('similarity', 0)

                if similarity >= test.confidence_threshold:
                    # Use simple string matching as judge
                    if self._fact_matches(test.expected_fact, content):
                        return None  # Test passed

            # No matching result found
            best_result = results[0] if results else {}
            return TestFailure(
                test_name=test.name,
                query=test.query,
                expected=test.expected_fact,
                actual=best_result.get('content', '')[:200],
                message=f"Expected fact not found in top results (best similarity: {best_result.get('similarity', 0):.2f})",
                is_critical=test.required
            )

        except Exception as e:
            return TestFailure(
                test_name=test.name,
                query=test.query,
                expected=test.expected_fact,
                actual=None,
                message=f"Error running semantic test: {e}",
                is_critical=test.required
            )

    def _run_text_test(self, test: TestCase) -> Optional[TestFailure]:
        """Run test using simple text search through memory files."""
        try:
            current_dir = self.repo.root / 'current'

            if not current_dir.exists():
                return TestFailure(
                    test_name=test.name,
                    query=test.query,
                    expected=test.expected_fact,
                    actual=None,
                    message="No current/ directory found",
                    is_critical=test.required
                )

            # Search through all memory files
            for memory_file in current_dir.glob('**/*.md'):
                try:
                    content = memory_file.read_text()
                    if self._fact_matches(test.expected_fact, content):
                        return None  # Test passed
                except Exception:
                    continue

            return TestFailure(
                test_name=test.name,
                query=test.query,
                expected=test.expected_fact,
                actual=None,
                message="Expected fact not found in any memory file",
                is_critical=test.required
            )

        except Exception as e:
            return TestFailure(
                test_name=test.name,
                query=test.query,
                expected=test.expected_fact,
                actual=None,
                message=f"Error running text test: {e}",
                is_critical=test.required
            )

    def _fact_matches(self, expected: str, content: str) -> bool:
        """
        Check if expected fact is present in content.

        Uses case-insensitive substring matching.
        For more sophisticated matching, this could use an LLM judge.
        """
        expected_lower = expected.lower()
        content_lower = content.lower()

        # Direct substring match
        if expected_lower in content_lower:
            return True

        # Check if all key words are present
        key_words = expected_lower.split()
        if len(key_words) > 2:
            matches = sum(1 for word in key_words if word in content_lower)
            if matches >= len(key_words) * 0.8:  # 80% of words match
                return True

        return False

    def run_all(self, tags: Optional[List[str]] = None) -> TestResult:
        """
        Run all tests.

        Args:
            tags: Optional list of tags to filter tests

        Returns:
            TestResult with overall results
        """
        start_time = datetime.now()
        tests = self.load_tests()

        # Filter by tags if specified
        if tags:
            tests = [t for t in tests if any(tag in t.tags for tag in tags)]

        failures = []
        passed_count = 0

        for test in tests:
            failure = self.run_test(test)
            if failure:
                failures.append(failure)
            else:
                passed_count += 1

        duration = (datetime.now() - start_time).total_seconds() * 1000

        # Check if any critical tests failed
        critical_failures = [f for f in failures if f.is_critical]
        passed = len(critical_failures) == 0

        return TestResult(
            passed=passed,
            total_count=len(tests),
            passed_count=passed_count,
            failed_count=len(failures),
            failures=failures,
            duration_ms=int(duration)
        )

    def run_for_branch(self, branch: str) -> TestResult:
        """
        Run tests against a specific branch.

        Creates a temporary vector store with only the branch's data.

        Args:
            branch: Branch name to test

        Returns:
            TestResult
        """
        # For now, just run normal tests
        # TODO: Implement branch-specific testing with temporary vector store
        return self.run_all()


def create_test_template() -> str:
    """Create a template test file."""
    return """# Memory Tests
# Tests are run with 'agmem test' to validate memory consistency

tests:
  - name: "example_test"
    query: "What is the main purpose of this project?"
    expected_fact: "version control for agent memory"
    confidence_threshold: 0.7
    required: false
    tags:
      - "core"
      - "basics"

# Add more tests below:
# - name: "test_name"
#   query: "Your query here"
#   expected_fact: "Expected answer"
#   required: true  # Set to true for critical tests that block commits
"""
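A minimal usage sketch (not shipped in the wheel): it fabricates a tiny memory repo on disk and drives the runner in its text-matching fallback, assuming PyYAML is installed. The `SimpleNamespace` stand-in works only because `TestRunner` reads nothing but `repo.root`; a real caller would pass a memvcs Repository instance, and the file names and content below are invented for illustration.

```python
# Illustrative sketch: exercise TestRunner against a throwaway repo layout
# of <root>/tests/*.yaml and <root>/current/**/*.md.
from pathlib import Path
from types import SimpleNamespace

from memvcs.core.test_runner import TestRunner, create_test_template

root = Path("demo-memory-repo")
(root / "tests").mkdir(parents=True, exist_ok=True)
(root / "tests" / "memory.yaml").write_text(create_test_template())
(root / "current").mkdir(exist_ok=True)
(root / "current" / "notes.md").write_text(
    "agmem provides version control for agent memory."
)

# TestRunner only reads repo.root, so a stand-in object is enough for a demo.
runner = TestRunner(repo=SimpleNamespace(root=root))  # no vector store: text matching
result = runner.run_all(tags=["core"])
print(f"{result.passed_count}/{result.total_count} passed in {result.duration_ms} ms")
for failure in result.failures:
    print(f"FAIL {failure.test_name}: {failure.message}")
```

Note that because the template's only test is not `required`, even a failure would leave `result.passed` true: `run_all()` flips the overall verdict only on critical (required) failures.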

memvcs/core/vector_store.py
@@ -0,0 +1,313 @@
"""
Vector store for semantic search over agmem memory.

Uses sqlite-vec for local vector storage and sentence-transformers for embeddings.
Requires: pip install agmem[vector]
"""

import logging
from typing import List, Optional, Tuple

from .constants import MEMORY_TYPES
import struct
from pathlib import Path

logger = logging.getLogger("agmem.vector_store")

# Embedding dimension for all-MiniLM-L6-v2
EMBEDDING_DIM = 384


def _serialize_f32(vector: List[float]) -> bytes:
    """Serialize float list to bytes for sqlite-vec."""
    return struct.pack(f"{len(vector)}f", *vector)


class VectorStore:
    """Semantic search over memory using vector embeddings."""

    def __init__(self, mem_dir: Path):
        self.mem_dir = Path(mem_dir)
        self.db_path = self.mem_dir / "vectors.db"
        self._model = None
        self._conn = None

    def _get_connection(self):
        """Get SQLite connection with sqlite-vec loaded."""
        if self._conn is not None:
            return self._conn

        try:
            import sqlite3
            import sqlite_vec

            self._conn = sqlite3.connect(str(self.db_path))
            self._conn.enable_load_extension(True)
            sqlite_vec.load(self._conn)
            self._conn.enable_load_extension(False)
            return self._conn
        except ImportError as e:
            raise ImportError(
                "Vector search requires sqlite-vec. Install with: pip install agmem[vector]"
            ) from e
        except AttributeError as e:
            raise ImportError(
                "SQLite extension loading not supported. "
                "On macOS, try: brew install python (for Homebrew SQLite)"
            ) from e

    def _get_model(self):
        """Lazy-load the sentence-transformers model."""
        if self._model is not None:
            return self._model

        try:
            from sentence_transformers import SentenceTransformer

            self._model = SentenceTransformer("all-MiniLM-L6-v2")
            return self._model
        except ImportError as e:
            raise ImportError(
                "Vector search requires sentence-transformers. "
                "Install with: pip install agmem[vector]"
            ) from e

    def _ensure_tables(self):
        """Create vector and metadata tables if they don't exist."""
        conn = self._get_connection()
        conn.execute("""
            CREATE TABLE IF NOT EXISTS memory_meta (
                rowid INTEGER PRIMARY KEY,
                path TEXT NOT NULL,
                content TEXT NOT NULL,
                blob_hash TEXT,
                commit_hash TEXT,
                author TEXT,
                indexed_at TEXT
            )
        """)
        # Try to add new columns to existing tables (for upgrades)
        for col in ['commit_hash TEXT', 'author TEXT', 'indexed_at TEXT']:
            try:
                conn.execute(f"ALTER TABLE memory_meta ADD COLUMN {col}")
            except Exception:
                pass  # Column already exists
        try:
            conn.execute(f"""
                CREATE VIRTUAL TABLE IF NOT EXISTS vec_memory
                USING vec0(embedding float[{EMBEDDING_DIM}])
            """)
        except Exception as e:
            # vec0 might already exist with different schema
            logger.debug("vec_memory creation: %s", e)
        conn.commit()

    def _embed(self, text: str) -> List[float]:
        """Generate embedding for text."""
        model = self._get_model()
        emb = model.encode(text, convert_to_numpy=True)
        return emb.astype("float32").tolist()

    def index_content(
        self,
        path: str,
        content: str,
        blob_hash: Optional[str] = None,
        commit_hash: Optional[str] = None,
        author: Optional[str] = None
    ) -> None:
        """
        Index a memory file for semantic search.

        Args:
            path: File path relative to current/
            content: File content to index
            blob_hash: Optional blob hash from object store
            commit_hash: Optional commit hash for provenance tracking
            author: Optional author string for provenance tracking
        """
        from datetime import datetime

        self._ensure_tables()
        conn = self._get_connection()

        embedding = self._embed(content)
        emb_bytes = _serialize_f32(embedding)
        indexed_at = datetime.utcnow().isoformat() + 'Z'

        with conn:
            conn.execute(
                """INSERT INTO memory_meta
                   (path, content, blob_hash, commit_hash, author, indexed_at)
                   VALUES (?, ?, ?, ?, ?, ?)""",
                (path, content[:10000], blob_hash, commit_hash, author, indexed_at),
            )
            rowid = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
            conn.execute(
                "INSERT INTO vec_memory (rowid, embedding) VALUES (?, ?)",
                (rowid, emb_bytes),
            )
        conn.commit()

    def index_directory(self, current_dir: Path) -> int:
        """Index all memory files in current/ directory. Returns count indexed."""
        self._ensure_tables()
        count = 0

        for subdir in MEMORY_TYPES:
            dir_path = current_dir / subdir
            if not dir_path.exists():
                continue
            for f in dir_path.rglob("*"):
                if f.is_file():
                    try:
                        content = f.read_text(encoding="utf-8", errors="replace")
                        rel_path = str(f.relative_to(current_dir))
                        self.index_content(rel_path, content)
                        count += 1
                    except Exception as e:
                        logger.warning("Failed to index %s: %s", f, e)

        return count

    def search(
        self, query: str, limit: int = 10, min_score: Optional[float] = None
    ) -> List[Tuple[str, str, float]]:
        """
        Semantic search. Returns list of (path, content_snippet, distance).
        Lower distance = more similar.
        """
        self._ensure_tables()
        conn = self._get_connection()

        query_embedding = self._embed(query)
        emb_bytes = _serialize_f32(query_embedding)

        rows = conn.execute(
            """
            SELECT m.path, m.content, v.distance
            FROM vec_memory v
            JOIN memory_meta m ON v.rowid = m.rowid
            WHERE v.embedding MATCH ?
            ORDER BY v.distance
            LIMIT ?
            """,
            (emb_bytes, limit),
        ).fetchall()

        results = []
        for path, content, distance in rows:
            if min_score is not None and distance > min_score:
                continue
            snippet = content[:500] + ("..." if len(content) > 500 else "")
            results.append((path, snippet, float(distance)))

        return results

    def search_with_provenance(
        self, query: str, limit: int = 10, min_score: Optional[float] = None
    ) -> List[dict]:
        """
        Semantic search with provenance metadata.

        Returns list of dicts with: path, content, distance, commit_hash, author, indexed_at
        """
        self._ensure_tables()
        conn = self._get_connection()

        query_embedding = self._embed(query)
        emb_bytes = _serialize_f32(query_embedding)

        rows = conn.execute(
            """
            SELECT m.path, m.content, v.distance, m.commit_hash, m.author, m.indexed_at, m.blob_hash
            FROM vec_memory v
            JOIN memory_meta m ON v.rowid = m.rowid
            WHERE v.embedding MATCH ?
            ORDER BY v.distance
            LIMIT ?
            """,
            (emb_bytes, limit),
        ).fetchall()

        results = []
        for path, content, distance, commit_hash, author, indexed_at, blob_hash in rows:
            if min_score is not None and distance > min_score:
                continue
            snippet = content[:500] + ("..." if len(content) > 500 else "")
            results.append({
                'path': path,
                'content': snippet,
                'distance': float(distance),
                'similarity': 1.0 - float(distance),  # Convert to similarity score
                'commit_hash': commit_hash,
                'author': author,
                'indexed_at': indexed_at,
                'blob_hash': blob_hash
            })

        return results

    def get_all_entries(self) -> List[dict]:
        """
        Get all indexed entries with their metadata.

        Used for fsck operations to check for dangling vectors.
        """
        self._ensure_tables()
        conn = self._get_connection()

        rows = conn.execute(
            """
            SELECT rowid, path, blob_hash, commit_hash, author, indexed_at
            FROM memory_meta
            """
        ).fetchall()

        return [
            {
                'rowid': rowid,
                'path': path,
                'blob_hash': blob_hash,
                'commit_hash': commit_hash,
                'author': author,
                'indexed_at': indexed_at
            }
            for rowid, path, blob_hash, commit_hash, author, indexed_at in rows
        ]

    def delete_entry(self, rowid: int) -> bool:
        """
        Delete an entry by rowid.

        Used by fsck to remove dangling vectors.
        """
        conn = self._get_connection()
        try:
            with conn:
                conn.execute("DELETE FROM memory_meta WHERE rowid = ?", (rowid,))
                conn.execute("DELETE FROM vec_memory WHERE rowid = ?", (rowid,))
            conn.commit()
            return True
        except Exception as e:
            logger.warning("Failed to delete entry %s: %s", rowid, e)
            return False

    def rebuild_index(self, current_dir: Path) -> int:
        """Clear and rebuild the vector index from current/."""
        conn = self._get_connection()
        with conn:
            try:
                conn.execute("DROP TABLE IF EXISTS vec_memory")
            except Exception:
                pass
            conn.execute("DELETE FROM memory_meta")
        conn.commit()
        self._ensure_tables()
        return self.index_directory(current_dir)

    def close(self):
        """Close the database connection."""
        if self._conn:
            self._conn.close()
            self._conn = None
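A minimal round-trip sketch for the store above, assuming the optional extras are installed (`pip install agmem[vector]` pulls in sqlite-vec and sentence-transformers). The directory, file paths, commit hash, and author below are invented for illustration; `index_content()` is called directly so the example does not depend on the `MEMORY_TYPES` layout that `index_directory()` expects.

```python
# Illustrative sketch, not shipped in the wheel.
from pathlib import Path
from memvcs.core.vector_store import VectorStore

mem_dir = Path("/tmp/agmem-demo")
mem_dir.mkdir(parents=True, exist_ok=True)   # vectors.db is created inside this dir
store = VectorStore(mem_dir)

store.index_content("semantic/deploys.md", "Production deploys happen on Tuesdays.")
store.index_content("semantic/owners.md", "Alice owns the billing service.",
                    commit_hash="abc123", author="alice")

# Tuple results: (path, snippet, distance), lower distance = more similar.
for path, snippet, distance in store.search("when do we deploy?", limit=2):
    print(f"{distance:.3f}  {path}  {snippet}")

# Dict results carry provenance metadata plus a derived similarity score.
hit = store.search_with_provenance("who owns billing?", limit=1)[0]
print(hit["path"], hit["author"], f"similarity={hit['similarity']:.2f}")

store.close()
```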