keep-skill 0.1.0-py3-none-any.whl → 0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keep/__init__.py +3 -6
- keep/api.py +1052 -145
- keep/cli.py +705 -132
- keep/config.py +172 -41
- keep/context.py +1 -125
- keep/document_store.py +908 -0
- keep/errors.py +33 -0
- keep/indexing.py +1 -1
- keep/logging_config.py +34 -3
- keep/paths.py +81 -17
- keep/pending_summaries.py +52 -40
- keep/providers/embedding_cache.py +59 -46
- keep/providers/embeddings.py +43 -13
- keep/providers/mlx.py +23 -21
- keep/store.py +169 -25
- keep_skill-0.3.0.dist-info/METADATA +218 -0
- keep_skill-0.3.0.dist-info/RECORD +28 -0
- keep_skill-0.1.0.dist-info/METADATA +0 -290
- keep_skill-0.1.0.dist-info/RECORD +0 -26
- {keep_skill-0.1.0.dist-info → keep_skill-0.3.0.dist-info}/WHEEL +0 -0
- {keep_skill-0.1.0.dist-info → keep_skill-0.3.0.dist-info}/entry_points.txt +0 -0
- {keep_skill-0.1.0.dist-info → keep_skill-0.3.0.dist-info}/licenses/LICENSE +0 -0
keep/document_store.py
ADDED
|
@@ -0,0 +1,908 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document store using SQLite.
|
|
3
|
+
|
|
4
|
+
Stores canonical document records separate from embeddings.
|
|
5
|
+
This enables multiple embedding providers to index the same documents.
|
|
6
|
+
|
|
7
|
+
The document store is the source of truth for:
|
|
8
|
+
- Document identity (URI / custom ID)
|
|
9
|
+
- Summary text
|
|
10
|
+
- Tags (source + system)
|
|
11
|
+
- Timestamps
|
|
12
|
+
|
|
13
|
+
Embeddings are stored in ChromaDB collections, keyed by embedding provider.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import sqlite3
|
|
18
|
+
import threading
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from datetime import datetime, timezone
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any, Optional
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Schema version for migrations
|
|
26
|
+
SCHEMA_VERSION = 1
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class VersionInfo:
    """
    Information about a document version.

    Used for version navigation and history display.
    """
    version: int  # 1 = oldest archived; numbers increase as versions are archived
    summary: str  # summary text as it was at this version
    tags: dict[str, str]  # full tag set (source + system) at this version
    # ISO-8601 timestamp; set from the record's updated_at when it was archived
    created_at: str
    content_hash: Optional[str] = None  # SHA256 of content, when known (change detection)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class DocumentRecord:
    """
    A canonical document record.

    This is the source of truth, independent of any embedding index.
    """
    id: str  # document identifier (URI or custom ID)
    collection: str  # collection the document belongs to
    summary: str  # document summary text
    tags: dict[str, str]  # all tags (source + system)
    created_at: str  # ISO-8601 creation timestamp (preserved across upserts)
    updated_at: str  # ISO-8601 timestamp of the most recent write
    content_hash: Optional[str] = None  # SHA256 hash of content (for change detection)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class DocumentStore:
    """
    SQLite-backed store for canonical document records.

    Separates document metadata from embedding storage, enabling:
    - Multiple embedding providers per document
    - Efficient tag/metadata queries without ChromaDB
    - Clear separation of concerns

    Writes are serialized through an internal lock. Read methods do not
    take the lock; WAL mode plus a busy timeout provide cross-process
    tolerance. NOTE(review): concurrent in-process reads share one
    connection without locking — confirm this matches the callers'
    threading model.
    """

    def __init__(self, store_path: Path):
        """
        Args:
            store_path: Path to SQLite database file
        """
        self._db_path = store_path
        self._conn: Optional[sqlite3.Connection] = None
        self._lock = threading.Lock()
        self._init_db()

    def _init_db(self) -> None:
        """Initialize the SQLite database: file, base schema, and indexes."""
        self._db_path.parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(str(self._db_path), check_same_thread=False)
        self._conn.row_factory = sqlite3.Row

        # Enable WAL mode for better concurrent access across processes
        self._conn.execute("PRAGMA journal_mode=WAL")
        # Wait up to 5 seconds for locks instead of failing immediately
        self._conn.execute("PRAGMA busy_timeout=5000")

        self._conn.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                id TEXT NOT NULL,
                collection TEXT NOT NULL,
                summary TEXT NOT NULL,
                tags_json TEXT NOT NULL DEFAULT '{}',
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                content_hash TEXT,
                PRIMARY KEY (id, collection)
            )
        """)

        # Migration: add content_hash column if missing (for existing databases)
        cursor = self._conn.execute("PRAGMA table_info(documents)")
        columns = {row[1] for row in cursor.fetchall()}
        if "content_hash" not in columns:
            self._conn.execute("ALTER TABLE documents ADD COLUMN content_hash TEXT")

        # Index for collection queries
        self._conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_documents_collection
            ON documents(collection)
        """)

        # Index for timestamp queries
        self._conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_documents_updated
            ON documents(updated_at)
        """)

        self._conn.commit()

        # Run schema migrations
        self._migrate_schema()

    def _migrate_schema(self) -> None:
        """
        Run schema migrations using PRAGMA user_version.

        Migrations:
        - Version 0 → 1: Create document_versions table
        """
        cursor = self._conn.execute("PRAGMA user_version")
        current_version = cursor.fetchone()[0]

        if current_version < 1:
            # Create versions table for document history
            self._conn.execute("""
                CREATE TABLE IF NOT EXISTS document_versions (
                    id TEXT NOT NULL,
                    collection TEXT NOT NULL,
                    version INTEGER NOT NULL,
                    summary TEXT NOT NULL,
                    tags_json TEXT NOT NULL,
                    content_hash TEXT,
                    created_at TEXT NOT NULL,
                    PRIMARY KEY (id, collection, version)
                )
            """)

            # Index for efficient version lookups
            self._conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_versions_doc
                ON document_versions(id, collection, version DESC)
            """)

        self._conn.execute(f"PRAGMA user_version = {SCHEMA_VERSION}")
        self._conn.commit()

    def _now(self) -> str:
        """Current timestamp in ISO format."""
        return datetime.now(timezone.utc).isoformat()

    @staticmethod
    def _row_to_record(row: sqlite3.Row) -> DocumentRecord:
        """Convert a documents-table row into a DocumentRecord."""
        return DocumentRecord(
            id=row["id"],
            collection=row["collection"],
            summary=row["summary"],
            tags=json.loads(row["tags_json"]),
            created_at=row["created_at"],
            updated_at=row["updated_at"],
            content_hash=row["content_hash"],
        )

    @staticmethod
    def _row_to_version(row: sqlite3.Row) -> VersionInfo:
        """Convert a document_versions-table row into a VersionInfo."""
        return VersionInfo(
            version=row["version"],
            summary=row["summary"],
            tags=json.loads(row["tags_json"]),
            created_at=row["created_at"],
            content_hash=row["content_hash"],
        )

    def _get_unlocked(self, collection: str, id: str) -> Optional[DocumentRecord]:
        """Get a document by ID without acquiring the lock (for use within locked contexts)."""
        cursor = self._conn.execute("""
            SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
            FROM documents
            WHERE id = ? AND collection = ?
        """, (id, collection))

        row = cursor.fetchone()
        if row is None:
            return None
        return self._row_to_record(row)

    # -------------------------------------------------------------------------
    # Write Operations
    # -------------------------------------------------------------------------

    def upsert(
        self,
        collection: str,
        id: str,
        summary: str,
        tags: dict[str, str],
        content_hash: Optional[str] = None,
    ) -> tuple[DocumentRecord, bool]:
        """
        Insert or update a document record.

        Preserves created_at on update. Updates updated_at always.
        Archives the current version to history before updating.

        Args:
            collection: Collection name
            id: Document identifier (URI or custom)
            summary: Document summary text
            tags: All tags (source + system)
            content_hash: SHA256 hash of content (for change detection)

        Returns:
            Tuple of (stored DocumentRecord, content_changed bool).
            content_changed is True if content hash differs from previous,
            False if only tags/summary changed or if new document.
        """
        now = self._now()
        tags_json = json.dumps(tags, ensure_ascii=False)

        with self._lock:
            # Check if exists to preserve created_at and archive
            existing = self._get_unlocked(collection, id)
            created_at = existing.created_at if existing else now
            content_changed = False

            if existing:
                # Archive current version before updating
                self._archive_current_unlocked(collection, id, existing)
                # Detect content change (None hash means "unknown", never a change)
                content_changed = (
                    content_hash is not None
                    and existing.content_hash != content_hash
                )

            self._conn.execute("""
                INSERT OR REPLACE INTO documents
                (id, collection, summary, tags_json, created_at, updated_at, content_hash)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (id, collection, summary, tags_json, created_at, now, content_hash))
            self._conn.commit()

        return DocumentRecord(
            id=id,
            collection=collection,
            summary=summary,
            tags=tags,
            created_at=created_at,
            updated_at=now,
            content_hash=content_hash,
        ), content_changed

    def _archive_current_unlocked(
        self,
        collection: str,
        id: str,
        current: DocumentRecord,
    ) -> int:
        """
        Archive the current version to the versions table.

        Must be called within a lock context.

        Args:
            collection: Collection name
            id: Document identifier
            current: Current document record to archive

        Returns:
            The version number assigned to the archived version
        """
        # Get the next version number
        cursor = self._conn.execute("""
            SELECT COALESCE(MAX(version), 0) + 1
            FROM document_versions
            WHERE id = ? AND collection = ?
        """, (id, collection))
        next_version = cursor.fetchone()[0]

        # Insert the current state as a version
        self._conn.execute("""
            INSERT INTO document_versions
            (id, collection, version, summary, tags_json, content_hash, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, (
            id,
            collection,
            next_version,
            current.summary,
            json.dumps(current.tags, ensure_ascii=False),
            current.content_hash,
            current.updated_at,  # Use updated_at as the version's timestamp
        ))

        return next_version

    def update_summary(self, collection: str, id: str, summary: str) -> bool:
        """
        Update just the summary of an existing document.

        Used by lazy summarization to replace placeholder summaries.

        Args:
            collection: Collection name
            id: Document identifier
            summary: New summary text

        Returns:
            True if document was found and updated, False otherwise
        """
        now = self._now()

        with self._lock:
            cursor = self._conn.execute("""
                UPDATE documents
                SET summary = ?, updated_at = ?
                WHERE id = ? AND collection = ?
            """, (summary, now, id, collection))
            self._conn.commit()

        return cursor.rowcount > 0

    def update_tags(
        self,
        collection: str,
        id: str,
        tags: dict[str, str],
    ) -> bool:
        """
        Update tags of an existing document.

        Args:
            collection: Collection name
            id: Document identifier
            tags: New tags dict (replaces existing)

        Returns:
            True if document was found and updated, False otherwise
        """
        now = self._now()
        tags_json = json.dumps(tags, ensure_ascii=False)

        with self._lock:
            cursor = self._conn.execute("""
                UPDATE documents
                SET tags_json = ?, updated_at = ?
                WHERE id = ? AND collection = ?
            """, (tags_json, now, id, collection))
            self._conn.commit()

        return cursor.rowcount > 0

    def delete(self, collection: str, id: str, delete_versions: bool = True) -> bool:
        """
        Delete a document record and optionally its version history.

        Args:
            collection: Collection name
            id: Document identifier
            delete_versions: If True, also delete version history

        Returns:
            True if document existed and was deleted
        """
        with self._lock:
            cursor = self._conn.execute("""
                DELETE FROM documents
                WHERE id = ? AND collection = ?
            """, (id, collection))

            if delete_versions:
                self._conn.execute("""
                    DELETE FROM document_versions
                    WHERE id = ? AND collection = ?
                """, (id, collection))

            self._conn.commit()

        return cursor.rowcount > 0

    # -------------------------------------------------------------------------
    # Read Operations
    # -------------------------------------------------------------------------

    def get(self, collection: str, id: str) -> Optional[DocumentRecord]:
        """
        Get a document by ID.

        Args:
            collection: Collection name
            id: Document identifier

        Returns:
            DocumentRecord if found, None otherwise
        """
        # Same query as the internal helper; reads do not take the lock.
        return self._get_unlocked(collection, id)

    def get_version(
        self,
        collection: str,
        id: str,
        offset: int = 0,
    ) -> Optional[VersionInfo]:
        """
        Get a specific version of a document by offset.

        Offset semantics:
        - 0 = current version (returns None, use get() instead)
        - 1 = previous version (most recent archived)
        - 2 = two versions ago
        - etc.

        Args:
            collection: Collection name
            id: Document identifier
            offset: Version offset (0=current, 1=previous, etc.)

        Returns:
            VersionInfo if found, None if offset 0 or version doesn't exist
        """
        if offset == 0:
            # Offset 0 means current - caller should use get()
            return None

        # Get max version to calculate the target
        cursor = self._conn.execute("""
            SELECT MAX(version) FROM document_versions
            WHERE id = ? AND collection = ?
        """, (id, collection))
        max_version = cursor.fetchone()[0]

        if max_version is None:
            return None  # No versions archived

        # offset=1 → max_version, offset=2 → max_version-1, etc.
        target_version = max_version - (offset - 1)

        if target_version < 1:
            return None  # Requested version doesn't exist

        cursor = self._conn.execute("""
            SELECT version, summary, tags_json, content_hash, created_at
            FROM document_versions
            WHERE id = ? AND collection = ? AND version = ?
        """, (id, collection, target_version))

        row = cursor.fetchone()
        if row is None:
            return None
        return self._row_to_version(row)

    def list_versions(
        self,
        collection: str,
        id: str,
        limit: int = 10,
    ) -> list[VersionInfo]:
        """
        List version history for a document.

        Returns versions in reverse chronological order (newest first).

        Args:
            collection: Collection name
            id: Document identifier
            limit: Maximum versions to return

        Returns:
            List of VersionInfo, newest archived first
        """
        cursor = self._conn.execute("""
            SELECT version, summary, tags_json, content_hash, created_at
            FROM document_versions
            WHERE id = ? AND collection = ?
            ORDER BY version DESC
            LIMIT ?
        """, (id, collection, limit))

        return [self._row_to_version(row) for row in cursor]

    def get_version_nav(
        self,
        collection: str,
        id: str,
        current_version: Optional[int] = None,
        limit: int = 3,
    ) -> dict[str, list[VersionInfo]]:
        """
        Get version navigation info (prev/next) for display.

        Args:
            collection: Collection name
            id: Document identifier
            current_version: The version being viewed (None = current/live version)
            limit: Max previous versions to return when viewing current

        Returns:
            Dict with 'prev' and optionally 'next' lists of VersionInfo.
            When viewing current (None): {'prev': [up to limit versions]}
            When viewing old version N: {'prev': [N-1 if exists], 'next': [N+1 if exists]}
        """
        result: dict[str, list[VersionInfo]] = {"prev": []}

        if current_version is None:
            # Viewing current version: get up to `limit` previous versions
            result["prev"] = self.list_versions(collection, id, limit=limit)
            return result

        # Viewing an old version: get prev (N-1) and next (N+1)
        # Previous version (older)
        if current_version > 1:
            cursor = self._conn.execute("""
                SELECT version, summary, tags_json, content_hash, created_at
                FROM document_versions
                WHERE id = ? AND collection = ? AND version = ?
            """, (id, collection, current_version - 1))
            row = cursor.fetchone()
            if row:
                result["prev"] = [self._row_to_version(row)]

        # Next version (newer)
        cursor = self._conn.execute("""
            SELECT version, summary, tags_json, content_hash, created_at
            FROM document_versions
            WHERE id = ? AND collection = ? AND version = ?
        """, (id, collection, current_version + 1))
        row = cursor.fetchone()
        if row:
            result["next"] = [self._row_to_version(row)]
        else:
            # Check if there's a current version (meaning we're at newest archived)
            if self.exists(collection, id):
                # Next is "current" - indicate this with empty next
                # (caller knows to check current doc)
                result["next"] = []

        return result

    def version_count(self, collection: str, id: str) -> int:
        """Count archived versions for a document."""
        cursor = self._conn.execute("""
            SELECT COUNT(*) FROM document_versions
            WHERE id = ? AND collection = ?
        """, (id, collection))
        return cursor.fetchone()[0]

    def get_many(
        self,
        collection: str,
        ids: list[str],
    ) -> dict[str, DocumentRecord]:
        """
        Get multiple documents by ID.

        Args:
            collection: Collection name
            ids: List of document identifiers

        Returns:
            Dict mapping id → DocumentRecord (missing IDs omitted)
        """
        if not ids:
            return {}

        placeholders = ",".join("?" * len(ids))
        cursor = self._conn.execute(f"""
            SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
            FROM documents
            WHERE collection = ? AND id IN ({placeholders})
        """, (collection, *ids))

        return {row["id"]: self._row_to_record(row) for row in cursor}

    def exists(self, collection: str, id: str) -> bool:
        """Check if a document exists."""
        cursor = self._conn.execute("""
            SELECT 1 FROM documents
            WHERE id = ? AND collection = ?
        """, (id, collection))
        return cursor.fetchone() is not None

    def list_ids(
        self,
        collection: str,
        limit: Optional[int] = None,
    ) -> list[str]:
        """
        List document IDs in a collection.

        Args:
            collection: Collection name
            limit: Maximum number to return (None for all)

        Returns:
            List of document IDs, most recently updated first
        """
        # Fix: test against None, not truthiness — limit=0 previously
        # fell into the "no limit" branch and returned ALL ids.
        if limit is not None:
            cursor = self._conn.execute("""
                SELECT id FROM documents
                WHERE collection = ?
                ORDER BY updated_at DESC
                LIMIT ?
            """, (collection, limit))
        else:
            cursor = self._conn.execute("""
                SELECT id FROM documents
                WHERE collection = ?
                ORDER BY updated_at DESC
            """, (collection,))

        return [row["id"] for row in cursor]

    def list_recent(
        self,
        collection: str,
        limit: int = 10,
    ) -> list[DocumentRecord]:
        """
        List recent documents ordered by update time.

        Args:
            collection: Collection name
            limit: Maximum number to return

        Returns:
            List of DocumentRecords, most recently updated first
        """
        cursor = self._conn.execute("""
            SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
            FROM documents
            WHERE collection = ?
            ORDER BY updated_at DESC
            LIMIT ?
        """, (collection, limit))

        return [self._row_to_record(row) for row in cursor]

    def count(self, collection: str) -> int:
        """Count documents in a collection."""
        cursor = self._conn.execute("""
            SELECT COUNT(*) FROM documents
            WHERE collection = ?
        """, (collection,))
        return cursor.fetchone()[0]

    def count_all(self) -> int:
        """Count total documents across all collections."""
        cursor = self._conn.execute("SELECT COUNT(*) FROM documents")
        return cursor.fetchone()[0]

    def query_by_id_prefix(
        self,
        collection: str,
        prefix: str,
    ) -> list[DocumentRecord]:
        """
        Query documents by ID prefix.

        Args:
            collection: Collection name
            prefix: ID prefix to match (e.g., "_system:")

        Returns:
            List of matching DocumentRecords, ordered by id
        """
        cursor = self._conn.execute("""
            SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
            FROM documents
            WHERE collection = ? AND id LIKE ?
            ORDER BY id
        """, (collection, f"{prefix}%"))

        return [self._row_to_record(row) for row in cursor]

    # -------------------------------------------------------------------------
    # Tag Queries
    # -------------------------------------------------------------------------

    def list_distinct_tag_keys(self, collection: str) -> list[str]:
        """
        List all distinct tag keys used in the collection.

        Excludes system tags (prefixed with _).

        Returns:
            Sorted list of distinct tag keys
        """
        cursor = self._conn.execute("""
            SELECT tags_json FROM documents
            WHERE collection = ?
        """, (collection,))

        keys: set[str] = set()
        for row in cursor:
            tags = json.loads(row["tags_json"])
            for key in tags:
                if not key.startswith("_"):
                    keys.add(key)

        return sorted(keys)

    def list_distinct_tag_values(self, collection: str, key: str) -> list[str]:
        """
        List all distinct values for a given tag key.

        Args:
            collection: Collection name
            key: Tag key to get values for

        Returns:
            Sorted list of distinct values
        """
        cursor = self._conn.execute("""
            SELECT tags_json FROM documents
            WHERE collection = ?
        """, (collection,))

        values: set[str] = set()
        for row in cursor:
            tags = json.loads(row["tags_json"])
            if key in tags:
                values.add(tags[key])

        return sorted(values)

    def query_by_tag_key(
        self,
        collection: str,
        key: str,
        limit: int = 100,
        since_date: Optional[str] = None,
    ) -> list[DocumentRecord]:
        """
        Find documents that have a specific tag key (any value).

        Args:
            collection: Collection name
            key: Tag key to search for
            limit: Maximum results
            since_date: Only include items updated on or after this date (YYYY-MM-DD)

        Returns:
            List of matching DocumentRecords
        """
        # SQLite JSON functions for tag key existence
        # json_extract returns NULL if key doesn't exist
        # NOTE(review): the "$.{key}" path assumes keys without dots/quotes —
        # keys containing JSON-path metacharacters would need quoting.
        params: list[Any] = [collection, f"$.{key}"]

        sql = """
            SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
            FROM documents
            WHERE collection = ?
              AND json_extract(tags_json, ?) IS NOT NULL
        """

        if since_date is not None:
            # Compare against the date portion of updated_at
            sql += " AND updated_at >= ?\n"
            params.append(since_date)

        sql += "ORDER BY updated_at DESC\nLIMIT ?"
        params.append(limit)

        cursor = self._conn.execute(sql, params)

        return [self._row_to_record(row) for row in cursor]

    # -------------------------------------------------------------------------
    # Collection Management
    # -------------------------------------------------------------------------

    def list_collections(self) -> list[str]:
        """List all collection names."""
        cursor = self._conn.execute("""
            SELECT DISTINCT collection FROM documents
            ORDER BY collection
        """)
        return [row["collection"] for row in cursor]

    def delete_collection(self, collection: str) -> int:
        """
        Delete all documents in a collection.

        Note: version history for the collection's documents is NOT removed.

        Args:
            collection: Collection name

        Returns:
            Number of documents deleted
        """
        with self._lock:
            cursor = self._conn.execute("""
                DELETE FROM documents
                WHERE collection = ?
            """, (collection,))
            self._conn.commit()
            return cursor.rowcount

    # -------------------------------------------------------------------------
    # Lifecycle
    # -------------------------------------------------------------------------

    def close(self) -> None:
        """Close the database connection. Safe to call more than once."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    def __del__(self):
        # Guard against a partially-initialized instance (e.g. __init__
        # raised before _conn was assigned).
        if getattr(self, "_conn", None) is not None:
            self.close()
|