keep-skill 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keep/document_store.py ADDED
@@ -0,0 +1,908 @@
1
+ """
2
+ Document store using SQLite.
3
+
4
+ Stores canonical document records separate from embeddings.
5
+ This enables multiple embedding providers to index the same documents.
6
+
7
+ The document store is the source of truth for:
8
+ - Document identity (URI / custom ID)
9
+ - Summary text
10
+ - Tags (source + system)
11
+ - Timestamps
12
+
13
+ Embeddings are stored in ChromaDB collections, keyed by embedding provider.
14
+ """
15
+
16
+ import json
17
+ import sqlite3
18
+ import threading
19
+ from dataclasses import dataclass
20
+ from datetime import datetime, timezone
21
+ from pathlib import Path
22
+ from typing import Any, Optional
23
+
24
+
25
# Schema version for migrations; persisted in SQLite's PRAGMA user_version.
SCHEMA_VERSION = 1


@dataclass
class VersionInfo:
    """
    Information about one archived version of a document.

    Used for version navigation and history display.
    """

    version: int  # 1 = oldest archived version; increases with every archive
    summary: str
    tags: dict[str, str]
    created_at: str  # the archived record's updated_at at the time of archiving
    content_hash: Optional[str] = None


@dataclass
class DocumentRecord:
    """
    A canonical document record.

    This is the source of truth, independent of any embedding index.
    """

    id: str
    collection: str
    summary: str
    tags: dict[str, str]
    created_at: str
    updated_at: str
    content_hash: Optional[str] = None


class DocumentStore:
    """
    SQLite-backed store for canonical document records.

    Separates document metadata from embedding storage, enabling:
    - Multiple embedding providers per document
    - Efficient tag/metadata queries without ChromaDB
    - Clear separation of concerns

    Write operations are serialized with an internal lock; read operations
    use the shared connection directly.
    """

    # Column lists shared by every query that builds a dataclass from a row.
    _DOCUMENT_COLUMNS = (
        "id, collection, summary, tags_json, created_at, updated_at, content_hash"
    )
    _VERSION_COLUMNS = "version, summary, tags_json, content_hash, created_at"

    # get_many() binds one SQL parameter per id. SQLite caps the number of
    # bound parameters per statement, so large id lists are queried in batches.
    _ID_BATCH_SIZE = 500

    def __init__(self, store_path: Path):
        """
        Open (creating if necessary) the database at ``store_path``.

        Args:
            store_path: Path to SQLite database file. A plain string is
                accepted as well and converted to a Path.
        """
        self._db_path = Path(store_path)
        self._conn: Optional[sqlite3.Connection] = None
        self._lock = threading.Lock()
        self._init_db()

    def _init_db(self) -> None:
        """Open the connection and create tables/indexes if they are missing."""
        self._db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(str(self._db_path), check_same_thread=False)
        conn.row_factory = sqlite3.Row
        self._conn = conn

        # Enable WAL mode for better concurrent access across processes
        conn.execute("PRAGMA journal_mode=WAL")
        # Wait up to 5 seconds for locks instead of failing immediately
        conn.execute("PRAGMA busy_timeout=5000")

        conn.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                id TEXT NOT NULL,
                collection TEXT NOT NULL,
                summary TEXT NOT NULL,
                tags_json TEXT NOT NULL DEFAULT '{}',
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                content_hash TEXT,
                PRIMARY KEY (id, collection)
            )
        """)

        # Migration: add content_hash column if missing (for existing databases).
        # Column 1 of each PRAGMA table_info row is the column name.
        columns = {row[1] for row in conn.execute("PRAGMA table_info(documents)")}
        if "content_hash" not in columns:
            conn.execute("ALTER TABLE documents ADD COLUMN content_hash TEXT")

        # Index for collection queries
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_documents_collection
            ON documents(collection)
        """)

        # Index for timestamp queries
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_documents_updated
            ON documents(updated_at)
        """)

        conn.commit()

        # Run schema migrations
        self._migrate_schema()

    def _migrate_schema(self) -> None:
        """
        Run schema migrations using PRAGMA user_version.

        Migrations:
        - Version 0 → 1: Create document_versions table
        """
        current_version = self._conn.execute("PRAGMA user_version").fetchone()[0]

        if current_version < 1:
            # Create versions table for document history
            self._conn.execute("""
                CREATE TABLE IF NOT EXISTS document_versions (
                    id TEXT NOT NULL,
                    collection TEXT NOT NULL,
                    version INTEGER NOT NULL,
                    summary TEXT NOT NULL,
                    tags_json TEXT NOT NULL,
                    content_hash TEXT,
                    created_at TEXT NOT NULL,
                    PRIMARY KEY (id, collection, version)
                )
            """)

            # Index for efficient version lookups
            self._conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_versions_doc
                ON document_versions(id, collection, version DESC)
            """)

            # PRAGMA statements cannot take bound parameters; the value is a
            # module constant, not user input.
            self._conn.execute(f"PRAGMA user_version = {SCHEMA_VERSION}")
            self._conn.commit()

    def _now(self) -> str:
        """Current UTC timestamp in ISO format."""
        return datetime.now(timezone.utc).isoformat()

    @staticmethod
    def _record_from_row(row: sqlite3.Row) -> DocumentRecord:
        """Build a DocumentRecord from a row selected with _DOCUMENT_COLUMNS."""
        return DocumentRecord(
            id=row["id"],
            collection=row["collection"],
            summary=row["summary"],
            tags=json.loads(row["tags_json"]),
            created_at=row["created_at"],
            updated_at=row["updated_at"],
            content_hash=row["content_hash"],
        )

    @staticmethod
    def _version_from_row(row: sqlite3.Row) -> VersionInfo:
        """Build a VersionInfo from a row selected with _VERSION_COLUMNS."""
        return VersionInfo(
            version=row["version"],
            summary=row["summary"],
            tags=json.loads(row["tags_json"]),
            created_at=row["created_at"],
            content_hash=row["content_hash"],
        )

    def _fetch_version(
        self,
        collection: str,
        id: str,
        version: int,
    ) -> Optional[VersionInfo]:
        """Return the archived version with this exact number, or None."""
        row = self._conn.execute(
            f"""
            SELECT {self._VERSION_COLUMNS}
            FROM document_versions
            WHERE id = ? AND collection = ? AND version = ?
            """,
            (id, collection, version),
        ).fetchone()
        return None if row is None else self._version_from_row(row)

    def _get_unlocked(self, collection: str, id: str) -> Optional[DocumentRecord]:
        """Get a document by ID without acquiring the lock (for use within locked contexts)."""
        row = self._conn.execute(
            f"""
            SELECT {self._DOCUMENT_COLUMNS}
            FROM documents
            WHERE id = ? AND collection = ?
            """,
            (id, collection),
        ).fetchone()
        return None if row is None else self._record_from_row(row)

    # -------------------------------------------------------------------------
    # Write Operations
    # -------------------------------------------------------------------------

    def upsert(
        self,
        collection: str,
        id: str,
        summary: str,
        tags: dict[str, str],
        content_hash: Optional[str] = None,
    ) -> tuple[DocumentRecord, bool]:
        """
        Insert or update a document record.

        Preserves created_at on update. Updates updated_at always.
        Archives the current version to history before updating.

        Args:
            collection: Collection name
            id: Document identifier (URI or custom)
            summary: Document summary text
            tags: All tags (source + system)
            content_hash: SHA256 hash of content (for change detection)

        Returns:
            Tuple of (stored DocumentRecord, content_changed bool).
            content_changed is True if content hash differs from previous,
            False if only tags/summary changed or if new document.
        """
        now = self._now()
        tags_json = json.dumps(tags, ensure_ascii=False)

        with self._lock:
            # Look up the existing row to preserve created_at and to archive it.
            existing = self._get_unlocked(collection, id)
            created_at = existing.created_at if existing is not None else now
            content_changed = (
                existing is not None
                and content_hash is not None
                and existing.content_hash != content_hash
            )

            # The connection context manager commits on success and rolls back
            # on error, so the archive row and the replacement row are written
            # together or not at all.
            with self._conn:
                if existing is not None:
                    self._archive_current_unlocked(collection, id, existing)
                self._conn.execute(
                    """
                    INSERT OR REPLACE INTO documents
                    (id, collection, summary, tags_json, created_at, updated_at, content_hash)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    """,
                    (id, collection, summary, tags_json, created_at, now, content_hash),
                )

        record = DocumentRecord(
            id=id,
            collection=collection,
            summary=summary,
            tags=tags,
            created_at=created_at,
            updated_at=now,
            content_hash=content_hash,
        )
        return record, content_changed

    def _archive_current_unlocked(
        self,
        collection: str,
        id: str,
        current: DocumentRecord,
    ) -> int:
        """
        Archive the current version to the versions table.

        Must be called within a lock context. Does not commit; the caller
        owns the transaction.

        Args:
            collection: Collection name
            id: Document identifier
            current: Current document record to archive

        Returns:
            The version number assigned to the archived version
        """
        # Next version number: one past the highest archived so far (1 if none).
        next_version = self._conn.execute(
            """
            SELECT COALESCE(MAX(version), 0) + 1
            FROM document_versions
            WHERE id = ? AND collection = ?
            """,
            (id, collection),
        ).fetchone()[0]

        self._conn.execute(
            """
            INSERT INTO document_versions
            (id, collection, version, summary, tags_json, content_hash, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                id,
                collection,
                next_version,
                current.summary,
                json.dumps(current.tags, ensure_ascii=False),
                current.content_hash,
                current.updated_at,  # Use updated_at as the version's timestamp
            ),
        )

        return next_version

    def update_summary(self, collection: str, id: str, summary: str) -> bool:
        """
        Update just the summary of an existing document.

        Used by lazy summarization to replace placeholder summaries.
        Does not archive a version.

        Args:
            collection: Collection name
            id: Document identifier
            summary: New summary text

        Returns:
            True if document was found and updated, False otherwise
        """
        now = self._now()

        with self._lock, self._conn:
            cursor = self._conn.execute(
                """
                UPDATE documents
                SET summary = ?, updated_at = ?
                WHERE id = ? AND collection = ?
                """,
                (summary, now, id, collection),
            )

        return cursor.rowcount > 0

    def update_tags(
        self,
        collection: str,
        id: str,
        tags: dict[str, str],
    ) -> bool:
        """
        Update tags of an existing document.

        Does not archive a version.

        Args:
            collection: Collection name
            id: Document identifier
            tags: New tags dict (replaces existing)

        Returns:
            True if document was found and updated, False otherwise
        """
        now = self._now()
        tags_json = json.dumps(tags, ensure_ascii=False)

        with self._lock, self._conn:
            cursor = self._conn.execute(
                """
                UPDATE documents
                SET tags_json = ?, updated_at = ?
                WHERE id = ? AND collection = ?
                """,
                (tags_json, now, id, collection),
            )

        return cursor.rowcount > 0

    def delete(self, collection: str, id: str, delete_versions: bool = True) -> bool:
        """
        Delete a document record and optionally its version history.

        Args:
            collection: Collection name
            id: Document identifier
            delete_versions: If True, also delete version history

        Returns:
            True if document existed and was deleted
        """
        # Both deletes run in one transaction: committed together on success,
        # rolled back together on error.
        with self._lock, self._conn:
            cursor = self._conn.execute(
                """
                DELETE FROM documents
                WHERE id = ? AND collection = ?
                """,
                (id, collection),
            )

            if delete_versions:
                self._conn.execute(
                    """
                    DELETE FROM document_versions
                    WHERE id = ? AND collection = ?
                    """,
                    (id, collection),
                )

        return cursor.rowcount > 0

    # -------------------------------------------------------------------------
    # Read Operations
    # -------------------------------------------------------------------------

    def get(self, collection: str, id: str) -> Optional[DocumentRecord]:
        """
        Get a document by ID.

        Args:
            collection: Collection name
            id: Document identifier

        Returns:
            DocumentRecord if found, None otherwise
        """
        return self._get_unlocked(collection, id)

    def get_version(
        self,
        collection: str,
        id: str,
        offset: int = 0,
    ) -> Optional[VersionInfo]:
        """
        Get a specific version of a document by offset.

        Offset semantics:
        - 0 = current version (returns None, use get() instead)
        - 1 = previous version (most recent archived)
        - 2 = two versions ago
        - etc.

        Args:
            collection: Collection name
            id: Document identifier
            offset: Version offset (0=current, 1=previous, etc.)

        Returns:
            VersionInfo if found, None if offset is 0 (or negative) or the
            version doesn't exist
        """
        if offset <= 0:
            # Offset 0 means current - caller should use get(). Negative
            # offsets can never address an archived version.
            return None

        max_version = self._conn.execute(
            """
            SELECT MAX(version) FROM document_versions
            WHERE id = ? AND collection = ?
            """,
            (id, collection),
        ).fetchone()[0]

        if max_version is None:
            return None  # No versions archived

        # offset=1 → max_version, offset=2 → max_version-1, etc.
        target_version = max_version - (offset - 1)
        if target_version < 1:
            return None  # Requested version doesn't exist

        return self._fetch_version(collection, id, target_version)

    def list_versions(
        self,
        collection: str,
        id: str,
        limit: int = 10,
    ) -> list[VersionInfo]:
        """
        List version history for a document.

        Returns versions in reverse chronological order (newest first).

        Args:
            collection: Collection name
            id: Document identifier
            limit: Maximum versions to return

        Returns:
            List of VersionInfo, newest archived first
        """
        cursor = self._conn.execute(
            f"""
            SELECT {self._VERSION_COLUMNS}
            FROM document_versions
            WHERE id = ? AND collection = ?
            ORDER BY version DESC
            LIMIT ?
            """,
            (id, collection, limit),
        )
        return [self._version_from_row(row) for row in cursor]

    def get_version_nav(
        self,
        collection: str,
        id: str,
        current_version: Optional[int] = None,
        limit: int = 3,
    ) -> dict[str, list[VersionInfo]]:
        """
        Get version navigation info (prev/next) for display.

        Args:
            collection: Collection name
            id: Document identifier
            current_version: The version being viewed (None = current/live version)
            limit: Max previous versions to return when viewing current

        Returns:
            Dict with 'prev' and optionally 'next' lists of VersionInfo.
            When viewing current (None): {'prev': [up to limit versions]}
            When viewing old version N: {'prev': [N-1 if exists], 'next': [N+1 if exists]}
            An empty 'next' list means the next step is the live document;
            'next' is absent when neither a newer version nor a live
            document exists.
        """
        if current_version is None:
            # Viewing current version: get up to `limit` previous versions
            return {"prev": self.list_versions(collection, id, limit=limit)}

        nav: dict[str, list[VersionInfo]] = {"prev": []}

        # Previous version (older)
        if current_version > 1:
            older = self._fetch_version(collection, id, current_version - 1)
            if older is not None:
                nav["prev"] = [older]

        # Next version (newer)
        newer = self._fetch_version(collection, id, current_version + 1)
        if newer is not None:
            nav["next"] = [newer]
        elif self.exists(collection, id):
            # We're at the newest archived version and a live document exists:
            # signal "next is current" with an empty list (caller checks the
            # current doc).
            nav["next"] = []

        return nav

    def version_count(self, collection: str, id: str) -> int:
        """Count archived versions for a document."""
        cursor = self._conn.execute(
            """
            SELECT COUNT(*) FROM document_versions
            WHERE id = ? AND collection = ?
            """,
            (id, collection),
        )
        return cursor.fetchone()[0]

    def get_many(
        self,
        collection: str,
        ids: list[str],
    ) -> dict[str, DocumentRecord]:
        """
        Get multiple documents by ID.

        IDs are looked up in batches so that arbitrarily long lists do not
        exceed SQLite's limit on bound parameters per statement.

        Args:
            collection: Collection name
            ids: List of document identifiers

        Returns:
            Dict mapping id → DocumentRecord (missing IDs omitted)
        """
        id_list = list(ids)
        found: dict[str, DocumentRecord] = {}

        step = self._ID_BATCH_SIZE
        for start in range(0, len(id_list), step):
            batch = id_list[start:start + step]
            placeholders = ",".join("?" * len(batch))
            cursor = self._conn.execute(
                f"""
                SELECT {self._DOCUMENT_COLUMNS}
                FROM documents
                WHERE collection = ? AND id IN ({placeholders})
                """,
                (collection, *batch),
            )
            for row in cursor:
                found[row["id"]] = self._record_from_row(row)

        return found

    def exists(self, collection: str, id: str) -> bool:
        """Check if a document exists."""
        cursor = self._conn.execute(
            """
            SELECT 1 FROM documents
            WHERE id = ? AND collection = ?
            """,
            (id, collection),
        )
        return cursor.fetchone() is not None

    def list_ids(
        self,
        collection: str,
        limit: Optional[int] = None,
    ) -> list[str]:
        """
        List document IDs in a collection, most recently updated first.

        Args:
            collection: Collection name
            limit: Maximum number to return (None for all; 0 returns nothing)

        Returns:
            List of document IDs
        """
        sql = """
            SELECT id FROM documents
            WHERE collection = ?
            ORDER BY updated_at DESC
        """
        params: list[Any] = [collection]

        # Compare against None explicitly: a limit of 0 is a real limit and
        # must not be mistaken for "no limit".
        if limit is not None:
            sql += " LIMIT ?"
            params.append(limit)

        return [row["id"] for row in self._conn.execute(sql, params)]

    def list_recent(
        self,
        collection: str,
        limit: int = 10,
    ) -> list[DocumentRecord]:
        """
        List recent documents ordered by update time.

        Args:
            collection: Collection name
            limit: Maximum number to return

        Returns:
            List of DocumentRecords, most recently updated first
        """
        cursor = self._conn.execute(
            f"""
            SELECT {self._DOCUMENT_COLUMNS}
            FROM documents
            WHERE collection = ?
            ORDER BY updated_at DESC
            LIMIT ?
            """,
            (collection, limit),
        )
        return [self._record_from_row(row) for row in cursor]

    def count(self, collection: str) -> int:
        """Count documents in a collection."""
        cursor = self._conn.execute(
            """
            SELECT COUNT(*) FROM documents
            WHERE collection = ?
            """,
            (collection,),
        )
        return cursor.fetchone()[0]

    def count_all(self) -> int:
        """Count total documents across all collections."""
        cursor = self._conn.execute("SELECT COUNT(*) FROM documents")
        return cursor.fetchone()[0]

    def query_by_id_prefix(
        self,
        collection: str,
        prefix: str,
    ) -> list[DocumentRecord]:
        """
        Query documents by ID prefix.

        The prefix is matched literally: the LIKE wildcards ``%`` and ``_``
        inside it are escaped, so a prefix such as "_system:" only matches
        IDs that really start with an underscore.

        Args:
            collection: Collection name
            prefix: ID prefix to match (e.g., "_system:")

        Returns:
            List of matching DocumentRecords, ordered by id
        """
        # Escape the escape character first, then the two LIKE wildcards.
        escaped = (
            prefix.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        cursor = self._conn.execute(
            f"""
            SELECT {self._DOCUMENT_COLUMNS}
            FROM documents
            WHERE collection = ? AND id LIKE ? ESCAPE '\\'
            ORDER BY id
            """,
            (collection, f"{escaped}%"),
        )
        return [self._record_from_row(row) for row in cursor]

    # -------------------------------------------------------------------------
    # Tag Queries
    # -------------------------------------------------------------------------

    def list_distinct_tag_keys(self, collection: str) -> list[str]:
        """
        List all distinct tag keys used in the collection.

        Excludes system tags (prefixed with _).

        Returns:
            Sorted list of distinct tag keys
        """
        cursor = self._conn.execute(
            """
            SELECT tags_json FROM documents
            WHERE collection = ?
            """,
            (collection,),
        )
        keys = {
            key
            for row in cursor
            for key in json.loads(row["tags_json"])
            if not key.startswith("_")
        }
        return sorted(keys)

    def list_distinct_tag_values(self, collection: str, key: str) -> list[str]:
        """
        List all distinct values for a given tag key.

        Args:
            collection: Collection name
            key: Tag key to get values for

        Returns:
            Sorted list of distinct values
        """
        cursor = self._conn.execute(
            """
            SELECT tags_json FROM documents
            WHERE collection = ?
            """,
            (collection,),
        )

        values: set[str] = set()
        for row in cursor:
            tags = json.loads(row["tags_json"])
            if key in tags:
                values.add(tags[key])

        return sorted(values)

    def query_by_tag_key(
        self,
        collection: str,
        key: str,
        limit: int = 100,
        since_date: Optional[str] = None,
    ) -> list[DocumentRecord]:
        """
        Find documents that have a specific tag key (any value).

        Args:
            collection: Collection name
            key: Tag key to search for
            limit: Maximum results
            since_date: Only include items updated on or after this date (YYYY-MM-DD)

        Returns:
            List of matching DocumentRecords, most recently updated first
        """
        # Match the key as a bound value via json_each rather than splicing
        # it into a JSON path: keys containing '.', '[' or '"' would be
        # misparsed (or rejected) as path syntax. A JSON null value is
        # treated as "key absent".
        sql = f"""
            SELECT {self._DOCUMENT_COLUMNS}
            FROM documents
            WHERE collection = ?
              AND EXISTS (
                  SELECT 1 FROM json_each(documents.tags_json) AS tag
                  WHERE tag.key = ? AND tag.type != 'null'
              )
        """
        params: list[Any] = [collection, key]

        if since_date is not None:
            # Plain string comparison against the stored ISO timestamp: any
            # timestamp on day D sorts at or after the bare string "D".
            sql += " AND updated_at >= ?\n"
            params.append(since_date)

        sql += " ORDER BY updated_at DESC\n LIMIT ?"
        params.append(limit)

        cursor = self._conn.execute(sql, params)
        return [self._record_from_row(row) for row in cursor]

    # -------------------------------------------------------------------------
    # Collection Management
    # -------------------------------------------------------------------------

    def list_collections(self) -> list[str]:
        """List all collection names."""
        cursor = self._conn.execute("""
            SELECT DISTINCT collection FROM documents
            ORDER BY collection
        """)
        return [row["collection"] for row in cursor]

    def delete_collection(self, collection: str, delete_versions: bool = True) -> int:
        """
        Delete all documents in a collection.

        Args:
            collection: Collection name
            delete_versions: If True (default, matching delete()), also remove
                the collection's archived versions so that stale history
                cannot reattach to a document later created with the same id

        Returns:
            Number of documents deleted (archived versions are not counted)
        """
        with self._lock, self._conn:
            cursor = self._conn.execute(
                """
                DELETE FROM documents
                WHERE collection = ?
                """,
                (collection,),
            )

            if delete_versions:
                self._conn.execute(
                    """
                    DELETE FROM document_versions
                    WHERE collection = ?
                    """,
                    (collection,),
                )

        return cursor.rowcount

    # -------------------------------------------------------------------------
    # Lifecycle
    # -------------------------------------------------------------------------

    def close(self) -> None:
        """Close the database connection. Safe to call more than once."""
        # getattr: __del__ may call this on an instance whose __init__ never
        # ran (e.g. construction failed), in which case _conn does not exist.
        conn = getattr(self, "_conn", None)
        if conn is not None:
            conn.close()
            self._conn = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False  # never suppress exceptions from the with-body

    def __del__(self):
        self.close()