oghma 0.0.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oghma/__init__.py +1 -3
- oghma/cli.py +342 -0
- oghma/config.py +262 -0
- oghma/daemon.py +198 -0
- oghma/embedder.py +107 -0
- oghma/exporter.py +177 -0
- oghma/extractor.py +180 -0
- oghma/mcp_server.py +112 -0
- oghma/migration.py +63 -0
- oghma/parsers/__init__.py +26 -0
- oghma/parsers/base.py +24 -0
- oghma/parsers/claude_code.py +62 -0
- oghma/parsers/codex.py +84 -0
- oghma/parsers/openclaw.py +64 -0
- oghma/parsers/opencode.py +90 -0
- oghma/storage.py +753 -0
- oghma/watcher.py +97 -0
- oghma-0.3.0.dist-info/METADATA +26 -0
- oghma-0.3.0.dist-info/RECORD +22 -0
- {oghma-0.0.1.dist-info → oghma-0.3.0.dist-info}/WHEEL +2 -1
- oghma-0.3.0.dist-info/entry_points.txt +3 -0
- oghma-0.3.0.dist-info/top_level.txt +1 -0
- oghma-0.0.1.dist-info/METADATA +0 -33
- oghma-0.0.1.dist-info/RECORD +0 -4
oghma/storage.py
ADDED
|
@@ -0,0 +1,753 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import sqlite3
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, TypedDict
|
|
7
|
+
|
|
8
|
+
from oghma.config import Config, get_db_path
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import sqlite_vec
|
|
12
|
+
except ImportError: # pragma: no cover - optional runtime dependency in tests
|
|
13
|
+
sqlite_vec = None
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MemoryRecord(TypedDict):
    """One row of the ``memories`` table as returned by Storage, with the
    JSON ``metadata`` column already decoded into a dict."""

    id: int
    content: str
    category: str
    source_tool: str
    source_file: str
    source_session: str | None
    confidence: float
    created_at: str
    updated_at: str
    status: str
    # 1 when a vector for this row exists in memories_vec, else 0.
    has_embedding: int
    metadata: dict[str, Any]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ExtractionStateRecord(TypedDict):
    """Per-source-file bookkeeping row from ``extraction_state``: the last
    seen mtime/size snapshot used to detect whether a file changed."""

    id: int
    source_path: str
    last_mtime: float
    last_size: int
    last_extracted_at: str
    message_count: int
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ExtractionLogRecord(TypedDict):
    """One audit entry from ``extraction_log`` describing a single
    extraction run (counts, token usage, duration and any error text)."""

    id: int
    source_path: str
    memories_extracted: int
    tokens_used: int
    duration_ms: int
    # NULL/None when the run succeeded.
    error: str | None
    created_at: str
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class Storage:
    """SQLite-backed store for memories and extraction bookkeeping.

    Supports keyword search (FTS5), vector search (sqlite-vec, optional) and
    hybrid search fused with reciprocal rank fusion. When the sqlite-vec
    extension is missing or fails to load, vector/hybrid searches degrade to
    keyword search instead of raising.
    """

    # Hybrid search tuning constants.
    MIN_HYBRID_QUERY_LENGTH = 3  # shorter queries fall back to keyword search
    VECTOR_K_MULTIPLIER = 4  # KNN candidate pool = limit * multiplier ...
    VECTOR_K_MIN = 25  # ... but never fewer than this many candidates
    RRF_K_DEFAULT = 60  # reciprocal-rank-fusion dampening constant

    def __init__(
        self,
        db_path: str | None = None,
        config: Config | None = None,
        read_only: bool = False,
    ):
        """Open (and, unless read-only, initialize) the database.

        Args:
            db_path: Explicit database path; defaults to ``get_db_path(config)``.
            config: Optional Config; also supplies embedding dimensions.
            read_only: Open via a ``mode=ro`` URI and skip schema creation.

        Raises:
            FileNotFoundError: If ``read_only`` and the database file is absent.
        """
        self.db_path = db_path or get_db_path(config)
        self.read_only = read_only
        self._config = config
        # Dimension of vectors stored in the vec0 table (defaults to 1536).
        self.embedding_dimensions = (
            config.get("embedding", {}).get("dimensions", 1536) if config else 1536
        )
        self._vec_available = sqlite_vec is not None
        # May be flipped off later if the extension fails to load at runtime.
        self._vector_search_enabled = self._vec_available

        if self.read_only:
            db_file = Path(self.db_path)
            if not db_file.exists():
                raise FileNotFoundError(f"Database not found: {self.db_path}")
            # URI form is required for sqlite3 to honor mode=ro.
            self._connection_target = f"file:{db_file.resolve()}?mode=ro"
            self._use_uri = True
        else:
            self._connection_target = self.db_path
            self._use_uri = False
            # Only writable instances create/migrate the schema.
            self._init_db()
|
|
84
|
+
|
|
85
|
+
@contextmanager
|
|
86
|
+
def _get_connection(self):
|
|
87
|
+
conn = sqlite3.connect(self._connection_target, uri=self._use_uri)
|
|
88
|
+
conn.row_factory = sqlite3.Row
|
|
89
|
+
self._configure_connection(conn)
|
|
90
|
+
try:
|
|
91
|
+
yield conn
|
|
92
|
+
if not self.read_only:
|
|
93
|
+
conn.commit()
|
|
94
|
+
except Exception:
|
|
95
|
+
if not self.read_only:
|
|
96
|
+
conn.rollback()
|
|
97
|
+
raise
|
|
98
|
+
finally:
|
|
99
|
+
conn.close()
|
|
100
|
+
|
|
101
|
+
def _configure_connection(self, conn: sqlite3.Connection) -> None:
|
|
102
|
+
if not self._vec_available:
|
|
103
|
+
return
|
|
104
|
+
try:
|
|
105
|
+
sqlite_vec.load(conn)
|
|
106
|
+
except Exception:
|
|
107
|
+
self._vector_search_enabled = False
|
|
108
|
+
logger.warning(
|
|
109
|
+
"sqlite-vec extension failed to load; vector search disabled",
|
|
110
|
+
exc_info=True,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
def _fallback_keyword_search(
|
|
114
|
+
self,
|
|
115
|
+
*,
|
|
116
|
+
query: str,
|
|
117
|
+
category: str | None,
|
|
118
|
+
source_tool: str | None,
|
|
119
|
+
status: str,
|
|
120
|
+
limit: int,
|
|
121
|
+
offset: int,
|
|
122
|
+
reason: str,
|
|
123
|
+
exc_info: bool = False,
|
|
124
|
+
) -> list[MemoryRecord]:
|
|
125
|
+
log_fn = logger.warning if exc_info else logger.info
|
|
126
|
+
log_fn("Hybrid/vector search fell back to keyword search: %s", reason, exc_info=exc_info)
|
|
127
|
+
return self.search_memories(
|
|
128
|
+
query=query,
|
|
129
|
+
category=category,
|
|
130
|
+
source_tool=source_tool,
|
|
131
|
+
status=status,
|
|
132
|
+
limit=limit,
|
|
133
|
+
offset=offset,
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
def _ensure_column(
|
|
137
|
+
self,
|
|
138
|
+
cursor: sqlite3.Cursor,
|
|
139
|
+
table_name: str,
|
|
140
|
+
column_name: str,
|
|
141
|
+
definition: str,
|
|
142
|
+
) -> None:
|
|
143
|
+
cursor.execute(f"PRAGMA table_info({table_name})")
|
|
144
|
+
columns = {row[1] for row in cursor.fetchall()}
|
|
145
|
+
if column_name not in columns:
|
|
146
|
+
cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column_name} {definition}")
|
|
147
|
+
|
|
148
|
+
def _serialize_embedding(self, embedding: list[float]) -> Any:
|
|
149
|
+
if sqlite_vec and hasattr(sqlite_vec, "serialize_float32"):
|
|
150
|
+
return sqlite_vec.serialize_float32(embedding)
|
|
151
|
+
return json.dumps(embedding)
|
|
152
|
+
|
|
153
|
+
def _init_db(self) -> None:
|
|
154
|
+
Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
|
|
155
|
+
|
|
156
|
+
with self._get_connection() as conn:
|
|
157
|
+
cursor = conn.cursor()
|
|
158
|
+
|
|
159
|
+
cursor.execute("""
|
|
160
|
+
CREATE TABLE IF NOT EXISTS memories (
|
|
161
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
162
|
+
content TEXT NOT NULL,
|
|
163
|
+
category TEXT NOT NULL,
|
|
164
|
+
source_tool TEXT NOT NULL,
|
|
165
|
+
source_file TEXT NOT NULL,
|
|
166
|
+
source_session TEXT,
|
|
167
|
+
confidence REAL DEFAULT 1.0,
|
|
168
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
169
|
+
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
170
|
+
status TEXT DEFAULT 'active',
|
|
171
|
+
metadata JSON
|
|
172
|
+
)
|
|
173
|
+
""")
|
|
174
|
+
self._ensure_column(cursor, "memories", "has_embedding", "INTEGER DEFAULT 0")
|
|
175
|
+
|
|
176
|
+
cursor.execute("""
|
|
177
|
+
CREATE INDEX IF NOT EXISTS idx_memories_category ON memories(category)
|
|
178
|
+
""")
|
|
179
|
+
|
|
180
|
+
cursor.execute("""
|
|
181
|
+
CREATE INDEX IF NOT EXISTS idx_memories_source_tool ON memories(source_tool)
|
|
182
|
+
""")
|
|
183
|
+
|
|
184
|
+
cursor.execute("""
|
|
185
|
+
CREATE INDEX IF NOT EXISTS idx_memories_created_at ON memories(created_at DESC)
|
|
186
|
+
""")
|
|
187
|
+
|
|
188
|
+
cursor.execute("""
|
|
189
|
+
CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status)
|
|
190
|
+
""")
|
|
191
|
+
|
|
192
|
+
cursor.execute("""
|
|
193
|
+
CREATE VIRTUAL TABLE IF NOT EXISTS memories_fts USING fts5(
|
|
194
|
+
content,
|
|
195
|
+
category,
|
|
196
|
+
source_tool,
|
|
197
|
+
content=memories,
|
|
198
|
+
content_rowid=id
|
|
199
|
+
)
|
|
200
|
+
""")
|
|
201
|
+
|
|
202
|
+
if self._vector_search_enabled:
|
|
203
|
+
cursor.execute(
|
|
204
|
+
f"""
|
|
205
|
+
CREATE VIRTUAL TABLE IF NOT EXISTS memories_vec USING vec0(
|
|
206
|
+
memory_id INTEGER PRIMARY KEY,
|
|
207
|
+
embedding float[{self.embedding_dimensions}]
|
|
208
|
+
)
|
|
209
|
+
"""
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
cursor.execute("""
|
|
213
|
+
CREATE TABLE IF NOT EXISTS extraction_state (
|
|
214
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
215
|
+
source_path TEXT UNIQUE NOT NULL,
|
|
216
|
+
last_mtime REAL NOT NULL,
|
|
217
|
+
last_size INTEGER NOT NULL,
|
|
218
|
+
last_extracted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
219
|
+
message_count INTEGER DEFAULT 0
|
|
220
|
+
)
|
|
221
|
+
""")
|
|
222
|
+
|
|
223
|
+
cursor.execute(
|
|
224
|
+
"""
|
|
225
|
+
CREATE INDEX IF NOT EXISTS idx_extraction_state_path
|
|
226
|
+
ON extraction_state(source_path)
|
|
227
|
+
"""
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
cursor.execute("""
|
|
231
|
+
CREATE TABLE IF NOT EXISTS extraction_log (
|
|
232
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
233
|
+
source_path TEXT NOT NULL,
|
|
234
|
+
memories_extracted INTEGER DEFAULT 0,
|
|
235
|
+
tokens_used INTEGER DEFAULT 0,
|
|
236
|
+
duration_ms INTEGER DEFAULT 0,
|
|
237
|
+
error TEXT,
|
|
238
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
|
239
|
+
)
|
|
240
|
+
""")
|
|
241
|
+
|
|
242
|
+
cursor.execute("""
|
|
243
|
+
CREATE INDEX IF NOT EXISTS idx_extraction_log_path ON extraction_log(source_path)
|
|
244
|
+
""")
|
|
245
|
+
|
|
246
|
+
cursor.execute(
|
|
247
|
+
"""
|
|
248
|
+
CREATE INDEX IF NOT EXISTS idx_extraction_log_created_at
|
|
249
|
+
ON extraction_log(created_at DESC)
|
|
250
|
+
"""
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
cursor.execute("""
|
|
254
|
+
CREATE TRIGGER IF NOT EXISTS memories_fts_insert AFTER INSERT ON memories BEGIN
|
|
255
|
+
INSERT INTO memories_fts(rowid, content, category, source_tool)
|
|
256
|
+
VALUES (NEW.id, NEW.content, NEW.category, NEW.source_tool);
|
|
257
|
+
END
|
|
258
|
+
""")
|
|
259
|
+
|
|
260
|
+
cursor.execute("""
|
|
261
|
+
CREATE TRIGGER IF NOT EXISTS memories_fts_delete AFTER DELETE ON memories BEGIN
|
|
262
|
+
DELETE FROM memories_fts WHERE rowid = OLD.id;
|
|
263
|
+
END
|
|
264
|
+
""")
|
|
265
|
+
|
|
266
|
+
cursor.execute("""
|
|
267
|
+
CREATE TRIGGER IF NOT EXISTS memories_fts_update AFTER UPDATE ON memories BEGIN
|
|
268
|
+
DELETE FROM memories_fts WHERE rowid = OLD.id;
|
|
269
|
+
INSERT INTO memories_fts(rowid, content, category, source_tool)
|
|
270
|
+
VALUES (NEW.id, NEW.content, NEW.category, NEW.source_tool);
|
|
271
|
+
END
|
|
272
|
+
""")
|
|
273
|
+
|
|
274
|
+
def add_memory(
|
|
275
|
+
self,
|
|
276
|
+
content: str,
|
|
277
|
+
category: str,
|
|
278
|
+
source_tool: str,
|
|
279
|
+
source_file: str,
|
|
280
|
+
source_session: str | None = None,
|
|
281
|
+
confidence: float = 1.0,
|
|
282
|
+
metadata: dict[str, Any] | None = None,
|
|
283
|
+
embedding: list[float] | None = None,
|
|
284
|
+
) -> int:
|
|
285
|
+
with self._get_connection() as conn:
|
|
286
|
+
cursor = conn.cursor()
|
|
287
|
+
metadata_json = json.dumps(metadata) if metadata else None
|
|
288
|
+
|
|
289
|
+
cursor.execute(
|
|
290
|
+
"""
|
|
291
|
+
INSERT INTO memories
|
|
292
|
+
(content, category, source_tool, source_file,
|
|
293
|
+
source_session, confidence, metadata, has_embedding)
|
|
294
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
295
|
+
""",
|
|
296
|
+
(
|
|
297
|
+
content,
|
|
298
|
+
category,
|
|
299
|
+
source_tool,
|
|
300
|
+
source_file,
|
|
301
|
+
source_session,
|
|
302
|
+
confidence,
|
|
303
|
+
metadata_json,
|
|
304
|
+
1 if embedding is not None and self._vector_search_enabled else 0,
|
|
305
|
+
),
|
|
306
|
+
)
|
|
307
|
+
memory_id = cursor.lastrowid or 0
|
|
308
|
+
|
|
309
|
+
if embedding is not None and self._vector_search_enabled:
|
|
310
|
+
cursor.execute(
|
|
311
|
+
"INSERT OR REPLACE INTO memories_vec (memory_id, embedding) VALUES (?, ?)",
|
|
312
|
+
(memory_id, self._serialize_embedding(embedding)),
|
|
313
|
+
)
|
|
314
|
+
|
|
315
|
+
return memory_id
|
|
316
|
+
|
|
317
|
+
def search_memories(
|
|
318
|
+
self,
|
|
319
|
+
query: str,
|
|
320
|
+
category: str | None = None,
|
|
321
|
+
source_tool: str | None = None,
|
|
322
|
+
status: str = "active",
|
|
323
|
+
limit: int = 10,
|
|
324
|
+
offset: int = 0,
|
|
325
|
+
) -> list[MemoryRecord]:
|
|
326
|
+
with self._get_connection() as conn:
|
|
327
|
+
cursor = conn.cursor()
|
|
328
|
+
|
|
329
|
+
sql = """
|
|
330
|
+
SELECT m.*
|
|
331
|
+
FROM memories m
|
|
332
|
+
WHERE m.id IN (
|
|
333
|
+
SELECT rowid FROM memories_fts WHERE memories_fts MATCH ?
|
|
334
|
+
)
|
|
335
|
+
AND m.status = ?
|
|
336
|
+
"""
|
|
337
|
+
params: list[str | int] = [query, status]
|
|
338
|
+
|
|
339
|
+
if category:
|
|
340
|
+
sql += " AND m.category = ?"
|
|
341
|
+
params.append(category)
|
|
342
|
+
|
|
343
|
+
if source_tool:
|
|
344
|
+
sql += " AND m.source_tool = ?"
|
|
345
|
+
params.append(source_tool)
|
|
346
|
+
|
|
347
|
+
sql += " ORDER BY m.created_at DESC LIMIT ? OFFSET ?"
|
|
348
|
+
params.extend([limit, offset])
|
|
349
|
+
|
|
350
|
+
cursor.execute(sql, params)
|
|
351
|
+
rows = cursor.fetchall()
|
|
352
|
+
|
|
353
|
+
return [self._row_to_memory_record(row) for row in rows]
|
|
354
|
+
|
|
355
|
+
def _row_to_memory_record(self, row: sqlite3.Row) -> MemoryRecord:
|
|
356
|
+
return {
|
|
357
|
+
"id": row["id"],
|
|
358
|
+
"content": row["content"],
|
|
359
|
+
"category": row["category"],
|
|
360
|
+
"source_tool": row["source_tool"],
|
|
361
|
+
"source_file": row["source_file"],
|
|
362
|
+
"source_session": row["source_session"],
|
|
363
|
+
"confidence": row["confidence"],
|
|
364
|
+
"created_at": row["created_at"],
|
|
365
|
+
"updated_at": row["updated_at"],
|
|
366
|
+
"status": row["status"],
|
|
367
|
+
"has_embedding": row["has_embedding"] if "has_embedding" in row.keys() else 0,
|
|
368
|
+
"metadata": json.loads(row["metadata"]) if row["metadata"] else {},
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
def upsert_memory_embedding(self, memory_id: int, embedding: list[float]) -> bool:
|
|
372
|
+
if not self._vector_search_enabled:
|
|
373
|
+
return False
|
|
374
|
+
|
|
375
|
+
with self._get_connection() as conn:
|
|
376
|
+
cursor = conn.cursor()
|
|
377
|
+
cursor.execute("SELECT id FROM memories WHERE id = ?", (memory_id,))
|
|
378
|
+
if cursor.fetchone() is None:
|
|
379
|
+
return False
|
|
380
|
+
|
|
381
|
+
cursor.execute(
|
|
382
|
+
"INSERT OR REPLACE INTO memories_vec (memory_id, embedding) VALUES (?, ?)",
|
|
383
|
+
(memory_id, self._serialize_embedding(embedding)),
|
|
384
|
+
)
|
|
385
|
+
cursor.execute(
|
|
386
|
+
(
|
|
387
|
+
"UPDATE memories SET has_embedding = 1, "
|
|
388
|
+
"updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
389
|
+
),
|
|
390
|
+
(memory_id,),
|
|
391
|
+
)
|
|
392
|
+
return True
|
|
393
|
+
|
|
394
|
+
def get_memories_without_embeddings(self, limit: int = 100) -> list[MemoryRecord]:
|
|
395
|
+
with self._get_connection() as conn:
|
|
396
|
+
cursor = conn.cursor()
|
|
397
|
+
cursor.execute(
|
|
398
|
+
"""
|
|
399
|
+
SELECT * FROM memories
|
|
400
|
+
WHERE status = 'active' AND has_embedding = 0
|
|
401
|
+
ORDER BY id ASC
|
|
402
|
+
LIMIT ?
|
|
403
|
+
""",
|
|
404
|
+
(limit,),
|
|
405
|
+
)
|
|
406
|
+
rows = cursor.fetchall()
|
|
407
|
+
return [self._row_to_memory_record(row) for row in rows]
|
|
408
|
+
|
|
409
|
+
def get_embedding_progress(self) -> tuple[int, int]:
|
|
410
|
+
with self._get_connection() as conn:
|
|
411
|
+
cursor = conn.cursor()
|
|
412
|
+
cursor.execute("SELECT COUNT(*) FROM memories WHERE status = 'active'")
|
|
413
|
+
total_row = cursor.fetchone()
|
|
414
|
+
cursor.execute(
|
|
415
|
+
"SELECT COUNT(*) FROM memories WHERE status = 'active' AND has_embedding = 1"
|
|
416
|
+
)
|
|
417
|
+
done_row = cursor.fetchone()
|
|
418
|
+
total = int(total_row[0]) if total_row else 0
|
|
419
|
+
done = int(done_row[0]) if done_row else 0
|
|
420
|
+
return done, total
|
|
421
|
+
|
|
422
|
+
    def search_memories_hybrid(
        self,
        query: str,
        query_embedding: list[float] | None = None,
        category: str | None = None,
        source_tool: str | None = None,
        status: str = "active",
        limit: int = 10,
        offset: int = 0,
        search_mode: str = "hybrid",
        rrf_k: int = RRF_K_DEFAULT,
    ) -> list[MemoryRecord]:
        """Search memories by keyword, vector similarity, or both.

        In "hybrid" mode, FTS5 (bm25) ranks and vector (KNN distance) ranks
        are fused with reciprocal rank fusion: each source contributes
        ``0.5 / (rrf_k + rank)`` per hit. Vector and hybrid modes fall back
        to keyword search whenever sqlite-vec is unavailable, the query is
        too short, no query embedding was supplied, no rows have embeddings
        yet, or the SQL itself errors.

        Args:
            query: FTS5 match expression / user query text.
            query_embedding: Pre-computed embedding of *query*; required for
                vector and hybrid modes (falls back to keyword otherwise).
            category, source_tool: Optional equality filters.
            status: Row status to search within (default "active").
            limit, offset: Pagination of the final result.
            search_mode: "keyword", "vector" or "hybrid".
            rrf_k: RRF dampening constant (larger flattens rank differences).

        Raises:
            ValueError: If *search_mode* is not one of the three modes.
        """
        if search_mode not in {"keyword", "vector", "hybrid"}:
            raise ValueError("search_mode must be one of: keyword, vector, hybrid")

        # Keyword mode never touches the vector index.
        if search_mode == "keyword":
            return self.search_memories(
                query=query,
                category=category,
                source_tool=source_tool,
                status=status,
                limit=limit,
                offset=offset,
            )

        if not self._vector_search_enabled:
            return self._fallback_keyword_search(
                query=query,
                category=category,
                source_tool=source_tool,
                status=status,
                limit=limit,
                offset=offset,
                reason="sqlite-vec unavailable",
            )

        # Very short queries produce noisy embeddings; keyword search wins.
        if len(query.strip()) < self.MIN_HYBRID_QUERY_LENGTH:
            return self._fallback_keyword_search(
                query=query,
                category=category,
                source_tool=source_tool,
                status=status,
                limit=limit,
                offset=offset,
                reason=f"query shorter than {self.MIN_HYBRID_QUERY_LENGTH} chars",
            )

        if not query_embedding:
            return self._fallback_keyword_search(
                query=query,
                category=category,
                source_tool=source_tool,
                status=status,
                limit=limit,
                offset=offset,
                reason="query embedding missing",
            )

        with self._get_connection() as conn:
            cursor = conn.cursor()

            # A KNN query against an empty vec index is pointless; bail early.
            cursor.execute(
                "SELECT COUNT(*) FROM memories WHERE status = ? AND has_embedding = 1",
                (status,),
            )
            row = cursor.fetchone()
            if not row or row[0] == 0:
                return self._fallback_keyword_search(
                    query=query,
                    category=category,
                    source_tool=source_tool,
                    status=status,
                    limit=limit,
                    offset=offset,
                    reason="no embedded memories available",
                )

            # filter_params starts with `status` because every branch below
            # binds `m.status = ?` immediately before `{filters}`.
            filters = ""
            filter_params: list[str] = [status]
            if category:
                filters += " AND m.category = ?"
                filter_params.append(category)
            if source_tool:
                filters += " AND m.source_tool = ?"
                filter_params.append(source_tool)

            # Over-fetch KNN candidates so post-filtering/fusion still has
            # enough rows to fill `limit`.
            vector_k = max(limit * self.VECTOR_K_MULTIPLIER, self.VECTOR_K_MIN)
            vec_query = self._serialize_embedding(query_embedding)

            try:
                if search_mode == "vector":
                    # vec0 KNN: `embedding MATCH ? AND k = ?` selects the k
                    # nearest rows; final ordering is recency, not distance.
                    sql = f"""
                        WITH vec AS (
                            SELECT m.id AS memory_id
                            FROM memories_vec v
                            JOIN memories m ON m.id = v.memory_id
                            WHERE v.embedding MATCH ? AND k = ?
                            AND m.status = ?
                            {filters}
                            ORDER BY v.distance
                            LIMIT ?
                        )
                        SELECT m.*
                        FROM vec
                        JOIN memories m ON m.id = vec.memory_id
                        ORDER BY m.created_at DESC
                        LIMIT ? OFFSET ?
                    """
                    # Positional order must mirror the SQL: MATCH, k,
                    # status+filters, CTE LIMIT, outer LIMIT/OFFSET.
                    params: list[Any] = [
                        vec_query,
                        vector_k,
                        *filter_params,
                        vector_k,
                        limit,
                        offset,
                    ]
                else:
                    # Hybrid: rank candidates in both indexes, then RRF-fuse.
                    sql = f"""
                        WITH
                        fts AS (
                            SELECT
                                m.id AS memory_id,
                                ROW_NUMBER() OVER (ORDER BY bm25(memories_fts)) AS fts_rank
                            FROM memories_fts
                            JOIN memories m ON m.id = memories_fts.rowid
                            WHERE memories_fts MATCH ?
                            AND m.status = ?
                            {filters}
                            LIMIT ?
                        ),
                        vec AS (
                            SELECT
                                m.id AS memory_id,
                                ROW_NUMBER() OVER (ORDER BY v.distance) AS vec_rank
                            FROM memories_vec v
                            JOIN memories m ON m.id = v.memory_id
                            WHERE v.embedding MATCH ? AND k = ?
                            AND m.status = ?
                            {filters}
                            LIMIT ?
                        ),
                        rrf AS (
                            SELECT memory_id, (1.0 / (? + fts_rank)) * 0.5 AS score FROM fts
                            UNION ALL
                            SELECT memory_id, (1.0 / (? + vec_rank)) * 0.5 AS score FROM vec
                        ),
                        ranked AS (
                            SELECT memory_id, SUM(score) AS rrf_score
                            FROM rrf
                            GROUP BY memory_id
                        )
                        SELECT m.*
                        FROM ranked
                        JOIN memories m ON m.id = ranked.memory_id
                        ORDER BY ranked.rrf_score DESC, m.created_at DESC
                        LIMIT ? OFFSET ?
                    """
                    # filter_params is bound twice: once for the fts CTE and
                    # once for the vec CTE. Order mirrors the SQL above.
                    params = [
                        query,
                        *filter_params,
                        vector_k,
                        vec_query,
                        vector_k,
                        *filter_params,
                        vector_k,
                        rrf_k,
                        rrf_k,
                        limit,
                        offset,
                    ]

                cursor.execute(sql, params)
                rows = cursor.fetchall()
                return [self._row_to_memory_record(row) for row in rows]
            except sqlite3.Error:
                # e.g. FTS5 syntax errors in user queries, or vec0 KNN
                # restrictions — degrade rather than surface to the caller.
                return self._fallback_keyword_search(
                    query=query,
                    category=category,
                    source_tool=source_tool,
                    status=status,
                    limit=limit,
                    offset=offset,
                    reason="sqlite query error",
                    exc_info=True,
                )
|
|
607
|
+
|
|
608
|
+
def get_memory_by_id(self, memory_id: int) -> MemoryRecord | None:
|
|
609
|
+
with self._get_connection() as conn:
|
|
610
|
+
cursor = conn.cursor()
|
|
611
|
+
cursor.execute("SELECT * FROM memories WHERE id = ?", (memory_id,))
|
|
612
|
+
row = cursor.fetchone()
|
|
613
|
+
|
|
614
|
+
if row is None:
|
|
615
|
+
return None
|
|
616
|
+
|
|
617
|
+
return self._row_to_memory_record(row)
|
|
618
|
+
|
|
619
|
+
def update_memory_status(self, memory_id: int, status: str) -> bool:
|
|
620
|
+
with self._get_connection() as conn:
|
|
621
|
+
cursor = conn.cursor()
|
|
622
|
+
cursor.execute(
|
|
623
|
+
"UPDATE memories SET status = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
|
|
624
|
+
(status, memory_id),
|
|
625
|
+
)
|
|
626
|
+
return cursor.rowcount > 0
|
|
627
|
+
|
|
628
|
+
def get_extraction_state(self, source_path: str) -> ExtractionStateRecord | None:
|
|
629
|
+
with self._get_connection() as conn:
|
|
630
|
+
cursor = conn.cursor()
|
|
631
|
+
cursor.execute("SELECT * FROM extraction_state WHERE source_path = ?", (source_path,))
|
|
632
|
+
row = cursor.fetchone()
|
|
633
|
+
|
|
634
|
+
if row is None:
|
|
635
|
+
return None
|
|
636
|
+
|
|
637
|
+
return {
|
|
638
|
+
"id": row["id"],
|
|
639
|
+
"source_path": row["source_path"],
|
|
640
|
+
"last_mtime": row["last_mtime"],
|
|
641
|
+
"last_size": row["last_size"],
|
|
642
|
+
"last_extracted_at": row["last_extracted_at"],
|
|
643
|
+
"message_count": row["message_count"],
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
def update_extraction_state(
|
|
647
|
+
self,
|
|
648
|
+
source_path: str,
|
|
649
|
+
last_mtime: float,
|
|
650
|
+
last_size: int,
|
|
651
|
+
message_count: int = 0,
|
|
652
|
+
) -> None:
|
|
653
|
+
with self._get_connection() as conn:
|
|
654
|
+
cursor = conn.cursor()
|
|
655
|
+
cursor.execute(
|
|
656
|
+
"""
|
|
657
|
+
INSERT INTO extraction_state
|
|
658
|
+
(source_path, last_mtime, last_size, message_count, last_extracted_at)
|
|
659
|
+
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
|
|
660
|
+
ON CONFLICT(source_path) DO UPDATE SET
|
|
661
|
+
last_mtime = excluded.last_mtime,
|
|
662
|
+
last_size = excluded.last_size,
|
|
663
|
+
message_count = excluded.message_count,
|
|
664
|
+
last_extracted_at = CURRENT_TIMESTAMP
|
|
665
|
+
""",
|
|
666
|
+
(source_path, last_mtime, last_size, message_count),
|
|
667
|
+
)
|
|
668
|
+
|
|
669
|
+
def log_extraction(
|
|
670
|
+
self,
|
|
671
|
+
source_path: str,
|
|
672
|
+
memories_extracted: int = 0,
|
|
673
|
+
tokens_used: int = 0,
|
|
674
|
+
duration_ms: int = 0,
|
|
675
|
+
error: str | None = None,
|
|
676
|
+
) -> int:
|
|
677
|
+
with self._get_connection() as conn:
|
|
678
|
+
cursor = conn.cursor()
|
|
679
|
+
cursor.execute(
|
|
680
|
+
"""
|
|
681
|
+
INSERT INTO extraction_log
|
|
682
|
+
(source_path, memories_extracted, tokens_used, duration_ms, error)
|
|
683
|
+
VALUES (?, ?, ?, ?, ?)
|
|
684
|
+
""",
|
|
685
|
+
(source_path, memories_extracted, tokens_used, duration_ms, error),
|
|
686
|
+
)
|
|
687
|
+
return cursor.lastrowid or 0
|
|
688
|
+
|
|
689
|
+
def get_memory_count(self, status: str = "active") -> int:
|
|
690
|
+
with self._get_connection() as conn:
|
|
691
|
+
cursor = conn.cursor()
|
|
692
|
+
cursor.execute("SELECT COUNT(*) FROM memories WHERE status = ?", (status,))
|
|
693
|
+
row = cursor.fetchone()
|
|
694
|
+
return row[0] if row else 0
|
|
695
|
+
|
|
696
|
+
def get_all_memories(
|
|
697
|
+
self, status: str = "active", category: str | None = None
|
|
698
|
+
) -> list[MemoryRecord]:
|
|
699
|
+
with self._get_connection() as conn:
|
|
700
|
+
cursor = conn.cursor()
|
|
701
|
+
|
|
702
|
+
sql = "SELECT * FROM memories WHERE status = ?"
|
|
703
|
+
params: list[str] = [status]
|
|
704
|
+
|
|
705
|
+
if category:
|
|
706
|
+
sql += " AND category = ?"
|
|
707
|
+
params.append(category)
|
|
708
|
+
|
|
709
|
+
sql += " ORDER BY created_at DESC"
|
|
710
|
+
cursor.execute(sql, params)
|
|
711
|
+
rows = cursor.fetchall()
|
|
712
|
+
|
|
713
|
+
return [self._row_to_memory_record(row) for row in rows]
|
|
714
|
+
|
|
715
|
+
def get_all_extraction_states(self) -> list[ExtractionStateRecord]:
|
|
716
|
+
with self._get_connection() as conn:
|
|
717
|
+
cursor = conn.cursor()
|
|
718
|
+
cursor.execute("SELECT * FROM extraction_state")
|
|
719
|
+
rows = cursor.fetchall()
|
|
720
|
+
|
|
721
|
+
return [
|
|
722
|
+
{
|
|
723
|
+
"id": row["id"],
|
|
724
|
+
"source_path": row["source_path"],
|
|
725
|
+
"last_mtime": row["last_mtime"],
|
|
726
|
+
"last_size": row["last_size"],
|
|
727
|
+
"last_extracted_at": row["last_extracted_at"],
|
|
728
|
+
"message_count": row["message_count"],
|
|
729
|
+
}
|
|
730
|
+
for row in rows
|
|
731
|
+
]
|
|
732
|
+
|
|
733
|
+
def get_recent_extraction_logs(self, limit: int = 10) -> list[ExtractionLogRecord]:
|
|
734
|
+
with self._get_connection() as conn:
|
|
735
|
+
cursor = conn.cursor()
|
|
736
|
+
cursor.execute(
|
|
737
|
+
"SELECT * FROM extraction_log ORDER BY created_at DESC LIMIT ?",
|
|
738
|
+
(limit,),
|
|
739
|
+
)
|
|
740
|
+
rows = cursor.fetchall()
|
|
741
|
+
|
|
742
|
+
return [
|
|
743
|
+
{
|
|
744
|
+
"id": row["id"],
|
|
745
|
+
"source_path": row["source_path"],
|
|
746
|
+
"memories_extracted": row["memories_extracted"],
|
|
747
|
+
"tokens_used": row["tokens_used"],
|
|
748
|
+
"duration_ms": row["duration_ms"],
|
|
749
|
+
"error": row["error"],
|
|
750
|
+
"created_at": row["created_at"],
|
|
751
|
+
}
|
|
752
|
+
for row in rows
|
|
753
|
+
]
|