keep-skill 0.2.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keep/__init__.py +1 -1
- keep/api.py +325 -11
- keep/cli.py +453 -83
- keep/config.py +2 -2
- keep/document_store.py +351 -12
- keep/pending_summaries.py +6 -0
- keep/providers/embedding_cache.py +6 -0
- keep/store.py +128 -11
- keep_skill-0.4.1.dist-info/METADATA +219 -0
- {keep_skill-0.2.0.dist-info → keep_skill-0.4.1.dist-info}/RECORD +13 -13
- keep_skill-0.2.0.dist-info/METADATA +0 -304
- {keep_skill-0.2.0.dist-info → keep_skill-0.4.1.dist-info}/WHEEL +0 -0
- {keep_skill-0.2.0.dist-info → keep_skill-0.4.1.dist-info}/entry_points.txt +0 -0
- {keep_skill-0.2.0.dist-info → keep_skill-0.4.1.dist-info}/licenses/LICENSE +0 -0
keep/config.py
CHANGED
|
@@ -21,7 +21,7 @@ except ImportError:
|
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
CONFIG_FILENAME = "keep.toml"
|
|
24
|
-
CONFIG_VERSION =
|
|
24
|
+
CONFIG_VERSION = 3 # Bumped for document versioning support
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
@dataclass
|
|
@@ -89,7 +89,7 @@ class StoreConfig:
|
|
|
89
89
|
default_tags: dict[str, str] = field(default_factory=dict)
|
|
90
90
|
|
|
91
91
|
# Maximum length for summaries (used for smart remember and validation)
|
|
92
|
-
max_summary_length: int =
|
|
92
|
+
max_summary_length: int = 500
|
|
93
93
|
|
|
94
94
|
@property
|
|
95
95
|
def config_path(self) -> Path:
|
keep/document_store.py
CHANGED
|
@@ -22,6 +22,24 @@ from pathlib import Path
|
|
|
22
22
|
from typing import Any, Optional
|
|
23
23
|
|
|
24
24
|
|
|
25
|
+
# Schema version for migrations
|
|
26
|
+
SCHEMA_VERSION = 1
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
|
|
30
|
+
class VersionInfo:
|
|
31
|
+
"""
|
|
32
|
+
Information about a document version.
|
|
33
|
+
|
|
34
|
+
Used for version navigation and history display.
|
|
35
|
+
"""
|
|
36
|
+
version: int # 1=oldest archived, increasing
|
|
37
|
+
summary: str
|
|
38
|
+
tags: dict[str, str]
|
|
39
|
+
created_at: str
|
|
40
|
+
content_hash: Optional[str] = None
|
|
41
|
+
|
|
42
|
+
|
|
25
43
|
@dataclass
|
|
26
44
|
class DocumentRecord:
|
|
27
45
|
"""
|
|
@@ -63,7 +81,12 @@ class DocumentStore:
|
|
|
63
81
|
self._db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
64
82
|
self._conn = sqlite3.connect(str(self._db_path), check_same_thread=False)
|
|
65
83
|
self._conn.row_factory = sqlite3.Row
|
|
66
|
-
|
|
84
|
+
|
|
85
|
+
# Enable WAL mode for better concurrent access across processes
|
|
86
|
+
self._conn.execute("PRAGMA journal_mode=WAL")
|
|
87
|
+
# Wait up to 5 seconds for locks instead of failing immediately
|
|
88
|
+
self._conn.execute("PRAGMA busy_timeout=5000")
|
|
89
|
+
|
|
67
90
|
self._conn.execute("""
|
|
68
91
|
CREATE TABLE IF NOT EXISTS documents (
|
|
69
92
|
id TEXT NOT NULL,
|
|
@@ -82,20 +105,57 @@ class DocumentStore:
|
|
|
82
105
|
columns = {row[1] for row in cursor.fetchall()}
|
|
83
106
|
if "content_hash" not in columns:
|
|
84
107
|
self._conn.execute("ALTER TABLE documents ADD COLUMN content_hash TEXT")
|
|
85
|
-
|
|
108
|
+
|
|
86
109
|
# Index for collection queries
|
|
87
110
|
self._conn.execute("""
|
|
88
111
|
CREATE INDEX IF NOT EXISTS idx_documents_collection
|
|
89
112
|
ON documents(collection)
|
|
90
113
|
""")
|
|
91
|
-
|
|
114
|
+
|
|
92
115
|
# Index for timestamp queries
|
|
93
116
|
self._conn.execute("""
|
|
94
117
|
CREATE INDEX IF NOT EXISTS idx_documents_updated
|
|
95
118
|
ON documents(updated_at)
|
|
96
119
|
""")
|
|
97
|
-
|
|
120
|
+
|
|
98
121
|
self._conn.commit()
|
|
122
|
+
|
|
123
|
+
# Run schema migrations
|
|
124
|
+
self._migrate_schema()
|
|
125
|
+
|
|
126
|
+
def _migrate_schema(self) -> None:
|
|
127
|
+
"""
|
|
128
|
+
Run schema migrations using PRAGMA user_version.
|
|
129
|
+
|
|
130
|
+
Migrations:
|
|
131
|
+
- Version 0 → 1: Create document_versions table
|
|
132
|
+
"""
|
|
133
|
+
cursor = self._conn.execute("PRAGMA user_version")
|
|
134
|
+
current_version = cursor.fetchone()[0]
|
|
135
|
+
|
|
136
|
+
if current_version < 1:
|
|
137
|
+
# Create versions table for document history
|
|
138
|
+
self._conn.execute("""
|
|
139
|
+
CREATE TABLE IF NOT EXISTS document_versions (
|
|
140
|
+
id TEXT NOT NULL,
|
|
141
|
+
collection TEXT NOT NULL,
|
|
142
|
+
version INTEGER NOT NULL,
|
|
143
|
+
summary TEXT NOT NULL,
|
|
144
|
+
tags_json TEXT NOT NULL,
|
|
145
|
+
content_hash TEXT,
|
|
146
|
+
created_at TEXT NOT NULL,
|
|
147
|
+
PRIMARY KEY (id, collection, version)
|
|
148
|
+
)
|
|
149
|
+
""")
|
|
150
|
+
|
|
151
|
+
# Index for efficient version lookups
|
|
152
|
+
self._conn.execute("""
|
|
153
|
+
CREATE INDEX IF NOT EXISTS idx_versions_doc
|
|
154
|
+
ON document_versions(id, collection, version DESC)
|
|
155
|
+
""")
|
|
156
|
+
|
|
157
|
+
self._conn.execute(f"PRAGMA user_version = {SCHEMA_VERSION}")
|
|
158
|
+
self._conn.commit()
|
|
99
159
|
|
|
100
160
|
def _now(self) -> str:
|
|
101
161
|
"""Current timestamp in ISO format."""
|
|
@@ -134,11 +194,12 @@ class DocumentStore:
|
|
|
134
194
|
summary: str,
|
|
135
195
|
tags: dict[str, str],
|
|
136
196
|
content_hash: Optional[str] = None,
|
|
137
|
-
) -> DocumentRecord:
|
|
197
|
+
) -> tuple[DocumentRecord, bool]:
|
|
138
198
|
"""
|
|
139
199
|
Insert or update a document record.
|
|
140
200
|
|
|
141
201
|
Preserves created_at on update. Updates updated_at always.
|
|
202
|
+
Archives the current version to history before updating.
|
|
142
203
|
|
|
143
204
|
Args:
|
|
144
205
|
collection: Collection name
|
|
@@ -148,15 +209,27 @@ class DocumentStore:
|
|
|
148
209
|
content_hash: SHA256 hash of content (for change detection)
|
|
149
210
|
|
|
150
211
|
Returns:
|
|
151
|
-
|
|
212
|
+
Tuple of (stored DocumentRecord, content_changed bool).
|
|
213
|
+
content_changed is True if content hash differs from previous,
|
|
214
|
+
False if only tags/summary changed or if new document.
|
|
152
215
|
"""
|
|
153
216
|
now = self._now()
|
|
154
217
|
tags_json = json.dumps(tags, ensure_ascii=False)
|
|
155
218
|
|
|
156
219
|
with self._lock:
|
|
157
|
-
# Check if exists to preserve created_at
|
|
220
|
+
# Check if exists to preserve created_at and archive
|
|
158
221
|
existing = self._get_unlocked(collection, id)
|
|
159
222
|
created_at = existing.created_at if existing else now
|
|
223
|
+
content_changed = False
|
|
224
|
+
|
|
225
|
+
if existing:
|
|
226
|
+
# Archive current version before updating
|
|
227
|
+
self._archive_current_unlocked(collection, id, existing)
|
|
228
|
+
# Detect content change
|
|
229
|
+
content_changed = (
|
|
230
|
+
content_hash is not None
|
|
231
|
+
and existing.content_hash != content_hash
|
|
232
|
+
)
|
|
160
233
|
|
|
161
234
|
self._conn.execute("""
|
|
162
235
|
INSERT OR REPLACE INTO documents
|
|
@@ -173,7 +246,51 @@ class DocumentStore:
|
|
|
173
246
|
created_at=created_at,
|
|
174
247
|
updated_at=now,
|
|
175
248
|
content_hash=content_hash,
|
|
176
|
-
)
|
|
249
|
+
), content_changed
|
|
250
|
+
|
|
251
|
+
def _archive_current_unlocked(
|
|
252
|
+
self,
|
|
253
|
+
collection: str,
|
|
254
|
+
id: str,
|
|
255
|
+
current: DocumentRecord,
|
|
256
|
+
) -> int:
|
|
257
|
+
"""
|
|
258
|
+
Archive the current version to the versions table.
|
|
259
|
+
|
|
260
|
+
Must be called within a lock context.
|
|
261
|
+
|
|
262
|
+
Args:
|
|
263
|
+
collection: Collection name
|
|
264
|
+
id: Document identifier
|
|
265
|
+
current: Current document record to archive
|
|
266
|
+
|
|
267
|
+
Returns:
|
|
268
|
+
The version number assigned to the archived version
|
|
269
|
+
"""
|
|
270
|
+
# Get the next version number
|
|
271
|
+
cursor = self._conn.execute("""
|
|
272
|
+
SELECT COALESCE(MAX(version), 0) + 1
|
|
273
|
+
FROM document_versions
|
|
274
|
+
WHERE id = ? AND collection = ?
|
|
275
|
+
""", (id, collection))
|
|
276
|
+
next_version = cursor.fetchone()[0]
|
|
277
|
+
|
|
278
|
+
# Insert the current state as a version
|
|
279
|
+
self._conn.execute("""
|
|
280
|
+
INSERT INTO document_versions
|
|
281
|
+
(id, collection, version, summary, tags_json, content_hash, created_at)
|
|
282
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
283
|
+
""", (
|
|
284
|
+
id,
|
|
285
|
+
collection,
|
|
286
|
+
next_version,
|
|
287
|
+
current.summary,
|
|
288
|
+
json.dumps(current.tags, ensure_ascii=False),
|
|
289
|
+
current.content_hash,
|
|
290
|
+
current.updated_at, # Use updated_at as the version's timestamp
|
|
291
|
+
))
|
|
292
|
+
|
|
293
|
+
return next_version
|
|
177
294
|
|
|
178
295
|
def update_summary(self, collection: str, id: str, summary: str) -> bool:
|
|
179
296
|
"""
|
|
@@ -231,13 +348,14 @@ class DocumentStore:
|
|
|
231
348
|
|
|
232
349
|
return cursor.rowcount > 0
|
|
233
350
|
|
|
234
|
-
def delete(self, collection: str, id: str) -> bool:
|
|
351
|
+
def delete(self, collection: str, id: str, delete_versions: bool = True) -> bool:
|
|
235
352
|
"""
|
|
236
|
-
Delete a document record.
|
|
353
|
+
Delete a document record and optionally its version history.
|
|
237
354
|
|
|
238
355
|
Args:
|
|
239
356
|
collection: Collection name
|
|
240
357
|
id: Document identifier
|
|
358
|
+
delete_versions: If True, also delete version history
|
|
241
359
|
|
|
242
360
|
Returns:
|
|
243
361
|
True if document existed and was deleted
|
|
@@ -247,6 +365,13 @@ class DocumentStore:
|
|
|
247
365
|
DELETE FROM documents
|
|
248
366
|
WHERE id = ? AND collection = ?
|
|
249
367
|
""", (id, collection))
|
|
368
|
+
|
|
369
|
+
if delete_versions:
|
|
370
|
+
self._conn.execute("""
|
|
371
|
+
DELETE FROM document_versions
|
|
372
|
+
WHERE id = ? AND collection = ?
|
|
373
|
+
""", (id, collection))
|
|
374
|
+
|
|
250
375
|
self._conn.commit()
|
|
251
376
|
|
|
252
377
|
return cursor.rowcount > 0
|
|
@@ -285,7 +410,185 @@ class DocumentStore:
|
|
|
285
410
|
updated_at=row["updated_at"],
|
|
286
411
|
content_hash=row["content_hash"],
|
|
287
412
|
)
|
|
288
|
-
|
|
413
|
+
|
|
414
|
+
def get_version(
|
|
415
|
+
self,
|
|
416
|
+
collection: str,
|
|
417
|
+
id: str,
|
|
418
|
+
offset: int = 0,
|
|
419
|
+
) -> Optional[VersionInfo]:
|
|
420
|
+
"""
|
|
421
|
+
Get a specific version of a document by offset.
|
|
422
|
+
|
|
423
|
+
Offset semantics:
|
|
424
|
+
- 0 = current version (returns None, use get() instead)
|
|
425
|
+
- 1 = previous version (most recent archived)
|
|
426
|
+
- 2 = two versions ago
|
|
427
|
+
- etc.
|
|
428
|
+
|
|
429
|
+
Args:
|
|
430
|
+
collection: Collection name
|
|
431
|
+
id: Document identifier
|
|
432
|
+
offset: Version offset (0=current, 1=previous, etc.)
|
|
433
|
+
|
|
434
|
+
Returns:
|
|
435
|
+
VersionInfo if found, None if offset 0 or version doesn't exist
|
|
436
|
+
"""
|
|
437
|
+
if offset == 0:
|
|
438
|
+
# Offset 0 means current - caller should use get()
|
|
439
|
+
return None
|
|
440
|
+
|
|
441
|
+
# Get max version to calculate the target
|
|
442
|
+
cursor = self._conn.execute("""
|
|
443
|
+
SELECT MAX(version) FROM document_versions
|
|
444
|
+
WHERE id = ? AND collection = ?
|
|
445
|
+
""", (id, collection))
|
|
446
|
+
max_version = cursor.fetchone()[0]
|
|
447
|
+
|
|
448
|
+
if max_version is None:
|
|
449
|
+
return None # No versions archived
|
|
450
|
+
|
|
451
|
+
# offset=1 → max_version, offset=2 → max_version-1, etc.
|
|
452
|
+
target_version = max_version - (offset - 1)
|
|
453
|
+
|
|
454
|
+
if target_version < 1:
|
|
455
|
+
return None # Requested version doesn't exist
|
|
456
|
+
|
|
457
|
+
cursor = self._conn.execute("""
|
|
458
|
+
SELECT version, summary, tags_json, content_hash, created_at
|
|
459
|
+
FROM document_versions
|
|
460
|
+
WHERE id = ? AND collection = ? AND version = ?
|
|
461
|
+
""", (id, collection, target_version))
|
|
462
|
+
|
|
463
|
+
row = cursor.fetchone()
|
|
464
|
+
if row is None:
|
|
465
|
+
return None
|
|
466
|
+
|
|
467
|
+
return VersionInfo(
|
|
468
|
+
version=row["version"],
|
|
469
|
+
summary=row["summary"],
|
|
470
|
+
tags=json.loads(row["tags_json"]),
|
|
471
|
+
created_at=row["created_at"],
|
|
472
|
+
content_hash=row["content_hash"],
|
|
473
|
+
)
|
|
474
|
+
|
|
475
|
+
def list_versions(
|
|
476
|
+
self,
|
|
477
|
+
collection: str,
|
|
478
|
+
id: str,
|
|
479
|
+
limit: int = 10,
|
|
480
|
+
) -> list[VersionInfo]:
|
|
481
|
+
"""
|
|
482
|
+
List version history for a document.
|
|
483
|
+
|
|
484
|
+
Returns versions in reverse chronological order (newest first).
|
|
485
|
+
|
|
486
|
+
Args:
|
|
487
|
+
collection: Collection name
|
|
488
|
+
id: Document identifier
|
|
489
|
+
limit: Maximum versions to return
|
|
490
|
+
|
|
491
|
+
Returns:
|
|
492
|
+
List of VersionInfo, newest archived first
|
|
493
|
+
"""
|
|
494
|
+
cursor = self._conn.execute("""
|
|
495
|
+
SELECT version, summary, tags_json, content_hash, created_at
|
|
496
|
+
FROM document_versions
|
|
497
|
+
WHERE id = ? AND collection = ?
|
|
498
|
+
ORDER BY version DESC
|
|
499
|
+
LIMIT ?
|
|
500
|
+
""", (id, collection, limit))
|
|
501
|
+
|
|
502
|
+
versions = []
|
|
503
|
+
for row in cursor:
|
|
504
|
+
versions.append(VersionInfo(
|
|
505
|
+
version=row["version"],
|
|
506
|
+
summary=row["summary"],
|
|
507
|
+
tags=json.loads(row["tags_json"]),
|
|
508
|
+
created_at=row["created_at"],
|
|
509
|
+
content_hash=row["content_hash"],
|
|
510
|
+
))
|
|
511
|
+
|
|
512
|
+
return versions
|
|
513
|
+
|
|
514
|
+
def get_version_nav(
|
|
515
|
+
self,
|
|
516
|
+
collection: str,
|
|
517
|
+
id: str,
|
|
518
|
+
current_version: Optional[int] = None,
|
|
519
|
+
limit: int = 3,
|
|
520
|
+
) -> dict[str, list[VersionInfo]]:
|
|
521
|
+
"""
|
|
522
|
+
Get version navigation info (prev/next) for display.
|
|
523
|
+
|
|
524
|
+
Args:
|
|
525
|
+
collection: Collection name
|
|
526
|
+
id: Document identifier
|
|
527
|
+
current_version: The version being viewed (None = current/live version)
|
|
528
|
+
limit: Max previous versions to return when viewing current
|
|
529
|
+
|
|
530
|
+
Returns:
|
|
531
|
+
Dict with 'prev' and optionally 'next' lists of VersionInfo.
|
|
532
|
+
When viewing current (None): {'prev': [up to limit versions]}
|
|
533
|
+
When viewing old version N: {'prev': [N-1 if exists], 'next': [N+1 if exists]}
|
|
534
|
+
"""
|
|
535
|
+
result: dict[str, list[VersionInfo]] = {"prev": []}
|
|
536
|
+
|
|
537
|
+
if current_version is None:
|
|
538
|
+
# Viewing current version: get up to `limit` previous versions
|
|
539
|
+
versions = self.list_versions(collection, id, limit=limit)
|
|
540
|
+
result["prev"] = versions
|
|
541
|
+
else:
|
|
542
|
+
# Viewing an old version: get prev (N-1) and next (N+1)
|
|
543
|
+
# Previous version (older)
|
|
544
|
+
if current_version > 1:
|
|
545
|
+
cursor = self._conn.execute("""
|
|
546
|
+
SELECT version, summary, tags_json, content_hash, created_at
|
|
547
|
+
FROM document_versions
|
|
548
|
+
WHERE id = ? AND collection = ? AND version = ?
|
|
549
|
+
""", (id, collection, current_version - 1))
|
|
550
|
+
row = cursor.fetchone()
|
|
551
|
+
if row:
|
|
552
|
+
result["prev"] = [VersionInfo(
|
|
553
|
+
version=row["version"],
|
|
554
|
+
summary=row["summary"],
|
|
555
|
+
tags=json.loads(row["tags_json"]),
|
|
556
|
+
created_at=row["created_at"],
|
|
557
|
+
content_hash=row["content_hash"],
|
|
558
|
+
)]
|
|
559
|
+
|
|
560
|
+
# Next version (newer)
|
|
561
|
+
cursor = self._conn.execute("""
|
|
562
|
+
SELECT version, summary, tags_json, content_hash, created_at
|
|
563
|
+
FROM document_versions
|
|
564
|
+
WHERE id = ? AND collection = ? AND version = ?
|
|
565
|
+
""", (id, collection, current_version + 1))
|
|
566
|
+
row = cursor.fetchone()
|
|
567
|
+
if row:
|
|
568
|
+
result["next"] = [VersionInfo(
|
|
569
|
+
version=row["version"],
|
|
570
|
+
summary=row["summary"],
|
|
571
|
+
tags=json.loads(row["tags_json"]),
|
|
572
|
+
created_at=row["created_at"],
|
|
573
|
+
content_hash=row["content_hash"],
|
|
574
|
+
)]
|
|
575
|
+
else:
|
|
576
|
+
# Check if there's a current version (meaning we're at newest archived)
|
|
577
|
+
if self.exists(collection, id):
|
|
578
|
+
# Next is "current" - indicate this with empty next
|
|
579
|
+
# (caller knows to check current doc)
|
|
580
|
+
result["next"] = []
|
|
581
|
+
|
|
582
|
+
return result
|
|
583
|
+
|
|
584
|
+
def version_count(self, collection: str, id: str) -> int:
|
|
585
|
+
"""Count archived versions for a document."""
|
|
586
|
+
cursor = self._conn.execute("""
|
|
587
|
+
SELECT COUNT(*) FROM document_versions
|
|
588
|
+
WHERE id = ? AND collection = ?
|
|
589
|
+
""", (id, collection))
|
|
590
|
+
return cursor.fetchone()[0]
|
|
591
|
+
|
|
289
592
|
def get_many(
|
|
290
593
|
self,
|
|
291
594
|
collection: str,
|
|
@@ -363,7 +666,43 @@ class DocumentStore:
|
|
|
363
666
|
""", (collection,))
|
|
364
667
|
|
|
365
668
|
return [row["id"] for row in cursor]
|
|
366
|
-
|
|
669
|
+
|
|
670
|
+
def list_recent(
|
|
671
|
+
self,
|
|
672
|
+
collection: str,
|
|
673
|
+
limit: int = 10,
|
|
674
|
+
) -> list[DocumentRecord]:
|
|
675
|
+
"""
|
|
676
|
+
List recent documents ordered by update time.
|
|
677
|
+
|
|
678
|
+
Args:
|
|
679
|
+
collection: Collection name
|
|
680
|
+
limit: Maximum number to return
|
|
681
|
+
|
|
682
|
+
Returns:
|
|
683
|
+
List of DocumentRecords, most recently updated first
|
|
684
|
+
"""
|
|
685
|
+
cursor = self._conn.execute("""
|
|
686
|
+
SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
|
|
687
|
+
FROM documents
|
|
688
|
+
WHERE collection = ?
|
|
689
|
+
ORDER BY updated_at DESC
|
|
690
|
+
LIMIT ?
|
|
691
|
+
""", (collection, limit))
|
|
692
|
+
|
|
693
|
+
return [
|
|
694
|
+
DocumentRecord(
|
|
695
|
+
id=row["id"],
|
|
696
|
+
collection=row["collection"],
|
|
697
|
+
summary=row["summary"],
|
|
698
|
+
tags=json.loads(row["tags_json"]),
|
|
699
|
+
created_at=row["created_at"],
|
|
700
|
+
updated_at=row["updated_at"],
|
|
701
|
+
content_hash=row["content_hash"],
|
|
702
|
+
)
|
|
703
|
+
for row in cursor
|
|
704
|
+
]
|
|
705
|
+
|
|
367
706
|
def count(self, collection: str) -> int:
|
|
368
707
|
"""Count documents in a collection."""
|
|
369
708
|
cursor = self._conn.execute("""
|
keep/pending_summaries.py
CHANGED
|
@@ -45,6 +45,12 @@ class PendingSummaryQueue:
|
|
|
45
45
|
"""Initialize the SQLite database."""
|
|
46
46
|
self._queue_path.parent.mkdir(parents=True, exist_ok=True)
|
|
47
47
|
self._conn = sqlite3.connect(str(self._queue_path), check_same_thread=False)
|
|
48
|
+
|
|
49
|
+
# Enable WAL mode for better concurrent access across processes
|
|
50
|
+
self._conn.execute("PRAGMA journal_mode=WAL")
|
|
51
|
+
# Wait up to 5 seconds for locks instead of failing immediately
|
|
52
|
+
self._conn.execute("PRAGMA busy_timeout=5000")
|
|
53
|
+
|
|
48
54
|
self._conn.execute("""
|
|
49
55
|
CREATE TABLE IF NOT EXISTS pending_summaries (
|
|
50
56
|
id TEXT NOT NULL,
|
keep/providers/embedding_cache.py
CHANGED
|
@@ -40,6 +40,12 @@ class EmbeddingCache:
|
|
|
40
40
|
"""Initialize the SQLite database."""
|
|
41
41
|
self._cache_path.parent.mkdir(parents=True, exist_ok=True)
|
|
42
42
|
self._conn = sqlite3.connect(str(self._cache_path), check_same_thread=False)
|
|
43
|
+
|
|
44
|
+
# Enable WAL mode for better concurrent access across processes
|
|
45
|
+
self._conn.execute("PRAGMA journal_mode=WAL")
|
|
46
|
+
# Wait up to 5 seconds for locks instead of failing immediately
|
|
47
|
+
self._conn.execute("PRAGMA busy_timeout=5000")
|
|
48
|
+
|
|
43
49
|
self._conn.execute("""
|
|
44
50
|
CREATE TABLE IF NOT EXISTS embedding_cache (
|
|
45
51
|
content_hash TEXT PRIMARY KEY,
|
keep/store.py
CHANGED
|
@@ -124,7 +124,7 @@ class ChromaStore:
|
|
|
124
124
|
) -> None:
|
|
125
125
|
"""
|
|
126
126
|
Insert or update an item in the store.
|
|
127
|
-
|
|
127
|
+
|
|
128
128
|
Args:
|
|
129
129
|
collection: Collection name
|
|
130
130
|
id: Item identifier (URI or custom)
|
|
@@ -140,9 +140,9 @@ class ChromaStore:
|
|
|
140
140
|
f"Embedding dimension mismatch: expected {self._embedding_dimension}, "
|
|
141
141
|
f"got {len(embedding)}"
|
|
142
142
|
)
|
|
143
|
-
|
|
143
|
+
|
|
144
144
|
coll = self._get_collection(collection)
|
|
145
|
-
|
|
145
|
+
|
|
146
146
|
# Add timestamp if not present
|
|
147
147
|
now = datetime.now(timezone.utc).isoformat()
|
|
148
148
|
if "_updated" not in tags:
|
|
@@ -158,36 +158,122 @@ class ChromaStore:
|
|
|
158
158
|
tags = {**tags, "_created": now}
|
|
159
159
|
else:
|
|
160
160
|
tags = {**tags, "_created": now}
|
|
161
|
-
|
|
161
|
+
|
|
162
162
|
# Add date portion for easier date queries
|
|
163
163
|
tags = {**tags, "_updated_date": now[:10]}
|
|
164
|
-
|
|
164
|
+
|
|
165
165
|
coll.upsert(
|
|
166
166
|
ids=[id],
|
|
167
167
|
embeddings=[embedding],
|
|
168
168
|
documents=[summary],
|
|
169
169
|
metadatas=[self._tags_to_metadata(tags)],
|
|
170
170
|
)
|
|
171
|
+
|
|
172
|
+
def upsert_version(
|
|
173
|
+
self,
|
|
174
|
+
collection: str,
|
|
175
|
+
id: str,
|
|
176
|
+
version: int,
|
|
177
|
+
embedding: list[float],
|
|
178
|
+
summary: str,
|
|
179
|
+
tags: dict[str, str],
|
|
180
|
+
) -> None:
|
|
181
|
+
"""
|
|
182
|
+
Store an archived version with a versioned ID.
|
|
183
|
+
|
|
184
|
+
The versioned ID format is: {id}@v{version}
|
|
185
|
+
Metadata includes _version and _base_id for filtering/navigation.
|
|
186
|
+
|
|
187
|
+
Args:
|
|
188
|
+
collection: Collection name
|
|
189
|
+
id: Base item identifier (not versioned)
|
|
190
|
+
version: Version number (1=oldest archived)
|
|
191
|
+
embedding: Vector embedding
|
|
192
|
+
summary: Human-readable summary
|
|
193
|
+
tags: All tags from the archived version
|
|
194
|
+
"""
|
|
195
|
+
if self._embedding_dimension is None:
|
|
196
|
+
self._embedding_dimension = len(embedding)
|
|
197
|
+
elif len(embedding) != self._embedding_dimension:
|
|
198
|
+
raise ValueError(
|
|
199
|
+
f"Embedding dimension mismatch: expected {self._embedding_dimension}, "
|
|
200
|
+
f"got {len(embedding)}"
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
coll = self._get_collection(collection)
|
|
204
|
+
|
|
205
|
+
# Versioned ID format
|
|
206
|
+
versioned_id = f"{id}@v{version}"
|
|
207
|
+
|
|
208
|
+
# Add version metadata
|
|
209
|
+
versioned_tags = dict(tags)
|
|
210
|
+
versioned_tags["_version"] = str(version)
|
|
211
|
+
versioned_tags["_base_id"] = id
|
|
212
|
+
|
|
213
|
+
coll.upsert(
|
|
214
|
+
ids=[versioned_id],
|
|
215
|
+
embeddings=[embedding],
|
|
216
|
+
documents=[summary],
|
|
217
|
+
metadatas=[self._tags_to_metadata(versioned_tags)],
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
def get_content_hash(self, collection: str, id: str) -> Optional[str]:
|
|
221
|
+
"""
|
|
222
|
+
Get the content hash of an existing item.
|
|
223
|
+
|
|
224
|
+
Used to check if content changed (to skip re-embedding).
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
collection: Collection name
|
|
228
|
+
id: Item identifier
|
|
229
|
+
|
|
230
|
+
Returns:
|
|
231
|
+
Content hash if item exists and has one, None otherwise
|
|
232
|
+
"""
|
|
233
|
+
coll = self._get_collection(collection)
|
|
234
|
+
result = coll.get(ids=[id], include=["metadatas"])
|
|
235
|
+
|
|
236
|
+
if not result["ids"]:
|
|
237
|
+
return None
|
|
238
|
+
|
|
239
|
+
metadata = result["metadatas"][0] or {}
|
|
240
|
+
return metadata.get("_content_hash")
|
|
171
241
|
|
|
172
|
-
def delete(self, collection: str, id: str) -> bool:
|
|
242
|
+
def delete(self, collection: str, id: str, delete_versions: bool = True) -> bool:
|
|
173
243
|
"""
|
|
174
244
|
Delete an item from the store.
|
|
175
|
-
|
|
245
|
+
|
|
176
246
|
Args:
|
|
177
247
|
collection: Collection name
|
|
178
248
|
id: Item identifier
|
|
179
|
-
|
|
249
|
+
delete_versions: If True, also delete versioned copies ({id}@v{N})
|
|
250
|
+
|
|
180
251
|
Returns:
|
|
181
252
|
True if item existed and was deleted, False if not found
|
|
182
253
|
"""
|
|
183
254
|
coll = self._get_collection(collection)
|
|
184
|
-
|
|
255
|
+
|
|
185
256
|
# Check existence first
|
|
186
257
|
existing = coll.get(ids=[id])
|
|
187
258
|
if not existing["ids"]:
|
|
188
259
|
return False
|
|
189
|
-
|
|
260
|
+
|
|
190
261
|
coll.delete(ids=[id])
|
|
262
|
+
|
|
263
|
+
if delete_versions:
|
|
264
|
+
# Delete all versioned copies
|
|
265
|
+
# Query by _base_id metadata to find all versions
|
|
266
|
+
try:
|
|
267
|
+
versions = coll.get(
|
|
268
|
+
where={"_base_id": id},
|
|
269
|
+
include=[],
|
|
270
|
+
)
|
|
271
|
+
if versions["ids"]:
|
|
272
|
+
coll.delete(ids=versions["ids"])
|
|
273
|
+
except Exception:
|
|
274
|
+
# Metadata filter might fail if no versions exist
|
|
275
|
+
pass
|
|
276
|
+
|
|
191
277
|
return True
|
|
192
278
|
|
|
193
279
|
def update_summary(self, collection: str, id: str, summary: str) -> bool:
|
|
@@ -295,7 +381,38 @@ class ChromaStore:
|
|
|
295
381
|
coll = self._get_collection(collection)
|
|
296
382
|
result = coll.get(ids=[id], include=[])
|
|
297
383
|
return bool(result["ids"])
|
|
298
|
-
|
|
384
|
+
|
|
385
|
+
def get_embedding(self, collection: str, id: str) -> list[float] | None:
|
|
386
|
+
"""
|
|
387
|
+
Retrieve the stored embedding for a document.
|
|
388
|
+
|
|
389
|
+
Args:
|
|
390
|
+
collection: Collection name
|
|
391
|
+
id: Item identifier
|
|
392
|
+
|
|
393
|
+
Returns:
|
|
394
|
+
Embedding vector if found, None otherwise
|
|
395
|
+
"""
|
|
396
|
+
coll = self._get_collection(collection)
|
|
397
|
+
result = coll.get(ids=[id], include=["embeddings"])
|
|
398
|
+
if not result["ids"] or result["embeddings"] is None or len(result["embeddings"]) == 0:
|
|
399
|
+
return None
|
|
400
|
+
return list(result["embeddings"][0])
|
|
401
|
+
|
|
402
|
+
def list_ids(self, collection: str) -> list[str]:
|
|
403
|
+
"""
|
|
404
|
+
List all document IDs in a collection.
|
|
405
|
+
|
|
406
|
+
Args:
|
|
407
|
+
collection: Collection name
|
|
408
|
+
|
|
409
|
+
Returns:
|
|
410
|
+
List of document IDs
|
|
411
|
+
"""
|
|
412
|
+
coll = self._get_collection(collection)
|
|
413
|
+
result = coll.get(include=[])
|
|
414
|
+
return result["ids"]
|
|
415
|
+
|
|
299
416
|
def query_embedding(
|
|
300
417
|
self,
|
|
301
418
|
collection: str,
|