keep-skill 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keep/document_store.py ADDED
@@ -0,0 +1,569 @@
1
+ """
2
+ Document store using SQLite.
3
+
4
+ Stores canonical document records separate from embeddings.
5
+ This enables multiple embedding providers to index the same documents.
6
+
7
+ The document store is the source of truth for:
8
+ - Document identity (URI / custom ID)
9
+ - Summary text
10
+ - Tags (source + system)
11
+ - Timestamps
12
+
13
+ Embeddings are stored in ChromaDB collections, keyed by embedding provider.
14
+ """
15
+
16
+ import json
17
+ import sqlite3
18
+ import threading
19
+ from dataclasses import dataclass
20
+ from datetime import datetime, timezone
21
+ from pathlib import Path
22
+ from typing import Any, Optional
23
+
24
+
25
+ @dataclass
26
+ class DocumentRecord:
27
+ """
28
+ A canonical document record.
29
+
30
+ This is the source of truth, independent of any embedding index.
31
+ """
32
+ id: str
33
+ collection: str
34
+ summary: str
35
+ tags: dict[str, str]
36
+ created_at: str
37
+ updated_at: str
38
+ content_hash: Optional[str] = None
39
+
40
+
41
+ class DocumentStore:
42
+ """
43
+ SQLite-backed store for canonical document records.
44
+
45
+ Separates document metadata from embedding storage, enabling:
46
+ - Multiple embedding providers per document
47
+ - Efficient tag/metadata queries without ChromaDB
48
+ - Clear separation of concerns
49
+ """
50
+
51
+ def __init__(self, store_path: Path):
52
+ """
53
+ Args:
54
+ store_path: Path to SQLite database file
55
+ """
56
+ self._db_path = store_path
57
+ self._conn: Optional[sqlite3.Connection] = None
58
+ self._lock = threading.Lock()
59
+ self._init_db()
60
+
61
+ def _init_db(self) -> None:
62
+ """Initialize the SQLite database."""
63
+ self._db_path.parent.mkdir(parents=True, exist_ok=True)
64
+ self._conn = sqlite3.connect(str(self._db_path), check_same_thread=False)
65
+ self._conn.row_factory = sqlite3.Row
66
+
67
+ self._conn.execute("""
68
+ CREATE TABLE IF NOT EXISTS documents (
69
+ id TEXT NOT NULL,
70
+ collection TEXT NOT NULL,
71
+ summary TEXT NOT NULL,
72
+ tags_json TEXT NOT NULL DEFAULT '{}',
73
+ created_at TEXT NOT NULL,
74
+ updated_at TEXT NOT NULL,
75
+ content_hash TEXT,
76
+ PRIMARY KEY (id, collection)
77
+ )
78
+ """)
79
+
80
+ # Migration: add content_hash column if missing (for existing databases)
81
+ cursor = self._conn.execute("PRAGMA table_info(documents)")
82
+ columns = {row[1] for row in cursor.fetchall()}
83
+ if "content_hash" not in columns:
84
+ self._conn.execute("ALTER TABLE documents ADD COLUMN content_hash TEXT")
85
+
86
+ # Index for collection queries
87
+ self._conn.execute("""
88
+ CREATE INDEX IF NOT EXISTS idx_documents_collection
89
+ ON documents(collection)
90
+ """)
91
+
92
+ # Index for timestamp queries
93
+ self._conn.execute("""
94
+ CREATE INDEX IF NOT EXISTS idx_documents_updated
95
+ ON documents(updated_at)
96
+ """)
97
+
98
+ self._conn.commit()
99
+
100
+ def _now(self) -> str:
101
+ """Current timestamp in ISO format."""
102
+ return datetime.now(timezone.utc).isoformat()
103
+
104
+ def _get_unlocked(self, collection: str, id: str) -> Optional[DocumentRecord]:
105
+ """Get a document by ID without acquiring the lock (for use within locked contexts)."""
106
+ cursor = self._conn.execute("""
107
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
108
+ FROM documents
109
+ WHERE id = ? AND collection = ?
110
+ """, (id, collection))
111
+
112
+ row = cursor.fetchone()
113
+ if row is None:
114
+ return None
115
+
116
+ return DocumentRecord(
117
+ id=row["id"],
118
+ collection=row["collection"],
119
+ summary=row["summary"],
120
+ tags=json.loads(row["tags_json"]),
121
+ created_at=row["created_at"],
122
+ updated_at=row["updated_at"],
123
+ content_hash=row["content_hash"],
124
+ )
125
+
126
+ # -------------------------------------------------------------------------
127
+ # Write Operations
128
+ # -------------------------------------------------------------------------
129
+
130
+ def upsert(
131
+ self,
132
+ collection: str,
133
+ id: str,
134
+ summary: str,
135
+ tags: dict[str, str],
136
+ content_hash: Optional[str] = None,
137
+ ) -> DocumentRecord:
138
+ """
139
+ Insert or update a document record.
140
+
141
+ Preserves created_at on update. Updates updated_at always.
142
+
143
+ Args:
144
+ collection: Collection name
145
+ id: Document identifier (URI or custom)
146
+ summary: Document summary text
147
+ tags: All tags (source + system)
148
+ content_hash: SHA256 hash of content (for change detection)
149
+
150
+ Returns:
151
+ The stored DocumentRecord
152
+ """
153
+ now = self._now()
154
+ tags_json = json.dumps(tags, ensure_ascii=False)
155
+
156
+ with self._lock:
157
+ # Check if exists to preserve created_at
158
+ existing = self._get_unlocked(collection, id)
159
+ created_at = existing.created_at if existing else now
160
+
161
+ self._conn.execute("""
162
+ INSERT OR REPLACE INTO documents
163
+ (id, collection, summary, tags_json, created_at, updated_at, content_hash)
164
+ VALUES (?, ?, ?, ?, ?, ?, ?)
165
+ """, (id, collection, summary, tags_json, created_at, now, content_hash))
166
+ self._conn.commit()
167
+
168
+ return DocumentRecord(
169
+ id=id,
170
+ collection=collection,
171
+ summary=summary,
172
+ tags=tags,
173
+ created_at=created_at,
174
+ updated_at=now,
175
+ content_hash=content_hash,
176
+ )
177
+
178
+ def update_summary(self, collection: str, id: str, summary: str) -> bool:
179
+ """
180
+ Update just the summary of an existing document.
181
+
182
+ Used by lazy summarization to replace placeholder summaries.
183
+
184
+ Args:
185
+ collection: Collection name
186
+ id: Document identifier
187
+ summary: New summary text
188
+
189
+ Returns:
190
+ True if document was found and updated, False otherwise
191
+ """
192
+ now = self._now()
193
+
194
+ with self._lock:
195
+ cursor = self._conn.execute("""
196
+ UPDATE documents
197
+ SET summary = ?, updated_at = ?
198
+ WHERE id = ? AND collection = ?
199
+ """, (summary, now, id, collection))
200
+ self._conn.commit()
201
+
202
+ return cursor.rowcount > 0
203
+
204
+ def update_tags(
205
+ self,
206
+ collection: str,
207
+ id: str,
208
+ tags: dict[str, str],
209
+ ) -> bool:
210
+ """
211
+ Update tags of an existing document.
212
+
213
+ Args:
214
+ collection: Collection name
215
+ id: Document identifier
216
+ tags: New tags dict (replaces existing)
217
+
218
+ Returns:
219
+ True if document was found and updated, False otherwise
220
+ """
221
+ now = self._now()
222
+ tags_json = json.dumps(tags, ensure_ascii=False)
223
+
224
+ with self._lock:
225
+ cursor = self._conn.execute("""
226
+ UPDATE documents
227
+ SET tags_json = ?, updated_at = ?
228
+ WHERE id = ? AND collection = ?
229
+ """, (tags_json, now, id, collection))
230
+ self._conn.commit()
231
+
232
+ return cursor.rowcount > 0
233
+
234
+ def delete(self, collection: str, id: str) -> bool:
235
+ """
236
+ Delete a document record.
237
+
238
+ Args:
239
+ collection: Collection name
240
+ id: Document identifier
241
+
242
+ Returns:
243
+ True if document existed and was deleted
244
+ """
245
+ with self._lock:
246
+ cursor = self._conn.execute("""
247
+ DELETE FROM documents
248
+ WHERE id = ? AND collection = ?
249
+ """, (id, collection))
250
+ self._conn.commit()
251
+
252
+ return cursor.rowcount > 0
253
+
254
+ # -------------------------------------------------------------------------
255
+ # Read Operations
256
+ # -------------------------------------------------------------------------
257
+
258
+ def get(self, collection: str, id: str) -> Optional[DocumentRecord]:
259
+ """
260
+ Get a document by ID.
261
+
262
+ Args:
263
+ collection: Collection name
264
+ id: Document identifier
265
+
266
+ Returns:
267
+ DocumentRecord if found, None otherwise
268
+ """
269
+ cursor = self._conn.execute("""
270
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
271
+ FROM documents
272
+ WHERE id = ? AND collection = ?
273
+ """, (id, collection))
274
+
275
+ row = cursor.fetchone()
276
+ if row is None:
277
+ return None
278
+
279
+ return DocumentRecord(
280
+ id=row["id"],
281
+ collection=row["collection"],
282
+ summary=row["summary"],
283
+ tags=json.loads(row["tags_json"]),
284
+ created_at=row["created_at"],
285
+ updated_at=row["updated_at"],
286
+ content_hash=row["content_hash"],
287
+ )
288
+
289
+ def get_many(
290
+ self,
291
+ collection: str,
292
+ ids: list[str],
293
+ ) -> dict[str, DocumentRecord]:
294
+ """
295
+ Get multiple documents by ID.
296
+
297
+ Args:
298
+ collection: Collection name
299
+ ids: List of document identifiers
300
+
301
+ Returns:
302
+ Dict mapping id → DocumentRecord (missing IDs omitted)
303
+ """
304
+ if not ids:
305
+ return {}
306
+
307
+ placeholders = ",".join("?" * len(ids))
308
+ cursor = self._conn.execute(f"""
309
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
310
+ FROM documents
311
+ WHERE collection = ? AND id IN ({placeholders})
312
+ """, (collection, *ids))
313
+
314
+ results = {}
315
+ for row in cursor:
316
+ results[row["id"]] = DocumentRecord(
317
+ id=row["id"],
318
+ collection=row["collection"],
319
+ summary=row["summary"],
320
+ tags=json.loads(row["tags_json"]),
321
+ created_at=row["created_at"],
322
+ updated_at=row["updated_at"],
323
+ content_hash=row["content_hash"],
324
+ )
325
+
326
+ return results
327
+
328
+ def exists(self, collection: str, id: str) -> bool:
329
+ """Check if a document exists."""
330
+ cursor = self._conn.execute("""
331
+ SELECT 1 FROM documents
332
+ WHERE id = ? AND collection = ?
333
+ """, (id, collection))
334
+ return cursor.fetchone() is not None
335
+
336
+ def list_ids(
337
+ self,
338
+ collection: str,
339
+ limit: Optional[int] = None,
340
+ ) -> list[str]:
341
+ """
342
+ List document IDs in a collection.
343
+
344
+ Args:
345
+ collection: Collection name
346
+ limit: Maximum number to return (None for all)
347
+
348
+ Returns:
349
+ List of document IDs
350
+ """
351
+ if limit:
352
+ cursor = self._conn.execute("""
353
+ SELECT id FROM documents
354
+ WHERE collection = ?
355
+ ORDER BY updated_at DESC
356
+ LIMIT ?
357
+ """, (collection, limit))
358
+ else:
359
+ cursor = self._conn.execute("""
360
+ SELECT id FROM documents
361
+ WHERE collection = ?
362
+ ORDER BY updated_at DESC
363
+ """, (collection,))
364
+
365
+ return [row["id"] for row in cursor]
366
+
367
+ def count(self, collection: str) -> int:
368
+ """Count documents in a collection."""
369
+ cursor = self._conn.execute("""
370
+ SELECT COUNT(*) FROM documents
371
+ WHERE collection = ?
372
+ """, (collection,))
373
+ return cursor.fetchone()[0]
374
+
375
+ def count_all(self) -> int:
376
+ """Count total documents across all collections."""
377
+ cursor = self._conn.execute("SELECT COUNT(*) FROM documents")
378
+ return cursor.fetchone()[0]
379
+
380
+ def query_by_id_prefix(
381
+ self,
382
+ collection: str,
383
+ prefix: str,
384
+ ) -> list[DocumentRecord]:
385
+ """
386
+ Query documents by ID prefix.
387
+
388
+ Args:
389
+ collection: Collection name
390
+ prefix: ID prefix to match (e.g., "_system:")
391
+
392
+ Returns:
393
+ List of matching DocumentRecords
394
+ """
395
+ cursor = self._conn.execute("""
396
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
397
+ FROM documents
398
+ WHERE collection = ? AND id LIKE ?
399
+ ORDER BY id
400
+ """, (collection, f"{prefix}%"))
401
+
402
+ results = []
403
+ for row in cursor:
404
+ results.append(DocumentRecord(
405
+ id=row["id"],
406
+ collection=row["collection"],
407
+ summary=row["summary"],
408
+ tags=json.loads(row["tags_json"]),
409
+ created_at=row["created_at"],
410
+ updated_at=row["updated_at"],
411
+ content_hash=row["content_hash"],
412
+ ))
413
+
414
+ return results
415
+
416
+ # -------------------------------------------------------------------------
417
+ # Tag Queries
418
+ # -------------------------------------------------------------------------
419
+
420
+ def list_distinct_tag_keys(self, collection: str) -> list[str]:
421
+ """
422
+ List all distinct tag keys used in the collection.
423
+
424
+ Excludes system tags (prefixed with _).
425
+
426
+ Returns:
427
+ Sorted list of distinct tag keys
428
+ """
429
+ cursor = self._conn.execute("""
430
+ SELECT tags_json FROM documents
431
+ WHERE collection = ?
432
+ """, (collection,))
433
+
434
+ keys: set[str] = set()
435
+ for row in cursor:
436
+ tags = json.loads(row["tags_json"])
437
+ for key in tags:
438
+ if not key.startswith("_"):
439
+ keys.add(key)
440
+
441
+ return sorted(keys)
442
+
443
+ def list_distinct_tag_values(self, collection: str, key: str) -> list[str]:
444
+ """
445
+ List all distinct values for a given tag key.
446
+
447
+ Args:
448
+ collection: Collection name
449
+ key: Tag key to get values for
450
+
451
+ Returns:
452
+ Sorted list of distinct values
453
+ """
454
+ cursor = self._conn.execute("""
455
+ SELECT tags_json FROM documents
456
+ WHERE collection = ?
457
+ """, (collection,))
458
+
459
+ values: set[str] = set()
460
+ for row in cursor:
461
+ tags = json.loads(row["tags_json"])
462
+ if key in tags:
463
+ values.add(tags[key])
464
+
465
+ return sorted(values)
466
+
467
+ def query_by_tag_key(
468
+ self,
469
+ collection: str,
470
+ key: str,
471
+ limit: int = 100,
472
+ since_date: Optional[str] = None,
473
+ ) -> list[DocumentRecord]:
474
+ """
475
+ Find documents that have a specific tag key (any value).
476
+
477
+ Args:
478
+ collection: Collection name
479
+ key: Tag key to search for
480
+ limit: Maximum results
481
+ since_date: Only include items updated on or after this date (YYYY-MM-DD)
482
+
483
+ Returns:
484
+ List of matching DocumentRecords
485
+ """
486
+ # SQLite JSON functions for tag key existence
487
+ # json_extract returns NULL if key doesn't exist
488
+ params: list[Any] = [collection, f"$.{key}"]
489
+
490
+ sql = """
491
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
492
+ FROM documents
493
+ WHERE collection = ?
494
+ AND json_extract(tags_json, ?) IS NOT NULL
495
+ """
496
+
497
+ if since_date is not None:
498
+ # Compare against the date portion of updated_at
499
+ sql += " AND updated_at >= ?\n"
500
+ params.append(since_date)
501
+
502
+ sql += "ORDER BY updated_at DESC\nLIMIT ?"
503
+ params.append(limit)
504
+
505
+ cursor = self._conn.execute(sql, params)
506
+
507
+ results = []
508
+ for row in cursor:
509
+ results.append(DocumentRecord(
510
+ id=row["id"],
511
+ collection=row["collection"],
512
+ summary=row["summary"],
513
+ tags=json.loads(row["tags_json"]),
514
+ created_at=row["created_at"],
515
+ updated_at=row["updated_at"],
516
+ content_hash=row["content_hash"],
517
+ ))
518
+
519
+ return results
520
+
521
+ # -------------------------------------------------------------------------
522
+ # Collection Management
523
+ # -------------------------------------------------------------------------
524
+
525
+ def list_collections(self) -> list[str]:
526
+ """List all collection names."""
527
+ cursor = self._conn.execute("""
528
+ SELECT DISTINCT collection FROM documents
529
+ ORDER BY collection
530
+ """)
531
+ return [row["collection"] for row in cursor]
532
+
533
+ def delete_collection(self, collection: str) -> int:
534
+ """
535
+ Delete all documents in a collection.
536
+
537
+ Args:
538
+ collection: Collection name
539
+
540
+ Returns:
541
+ Number of documents deleted
542
+ """
543
+ with self._lock:
544
+ cursor = self._conn.execute("""
545
+ DELETE FROM documents
546
+ WHERE collection = ?
547
+ """, (collection,))
548
+ self._conn.commit()
549
+ return cursor.rowcount
550
+
551
+ # -------------------------------------------------------------------------
552
+ # Lifecycle
553
+ # -------------------------------------------------------------------------
554
+
555
+ def close(self) -> None:
556
+ """Close the database connection."""
557
+ if self._conn is not None:
558
+ self._conn.close()
559
+ self._conn = None
560
+
561
+ def __enter__(self):
562
+ return self
563
+
564
+ def __exit__(self, exc_type, exc_val, exc_tb):
565
+ self.close()
566
+ return False
567
+
568
+ def __del__(self):
569
+ self.close()
keep/errors.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Error logging utilities for keep CLI.
3
+
4
+ Logs full stack traces to /tmp for debugging while showing clean messages to users.
5
+ """
6
+
7
+ import traceback
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+
11
# NOTE(review): a fixed, predictable path under /tmp is writable by all local
# users (symlink/pre-creation risk on shared hosts); consider tempfile or a
# per-user directory.
ERROR_LOG_PATH = Path("/tmp/keep-errors.log")


def log_exception(exc: Exception, context: str = "") -> Path:
    """
    Log exception with full traceback to file.

    Formats the traceback from the passed exception object rather than the
    ambient sys.exc_info() state: traceback.format_exc() only sees the
    currently-handled exception, so the previous implementation logged
    "NoneType: None" (and ignored ``exc``) whenever this was called outside
    an active ``except`` block.

    Args:
        exc: The exception that occurred
        context: Optional context string (e.g., command name)

    Returns:
        Path to the error log file
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # exc.__traceback__ may be None (e.g. for a never-raised exception);
    # format_exception handles that and still prints the type and message.
    trace = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
    with open(ERROR_LOG_PATH, "a") as f:
        f.write(f"\n{'='*60}\n")
        f.write(f"[{timestamp}]")
        if context:
            f.write(f" {context}")
        f.write("\n")
        f.write(trace)
    return ERROR_LOG_PATH
keep/indexing.py CHANGED
@@ -71,7 +71,7 @@ class IndexingConfig:
71
71
  """Approximation for token estimation."""
72
72
 
73
73
  # Summarization settings (always used)
74
- summary_max_chars: int = 500
74
+ summary_max_chars: int = 1000
75
75
  """Maximum summary length in characters."""
76
76
 
77
77
  # BM25 settings
keep/logging_config.py CHANGED
@@ -57,17 +57,48 @@ def configure_quiet_mode(quiet: bool = True):
57
57
  def enable_verbose_mode():
58
58
  """Re-enable verbose output for debugging."""
59
59
  configure_quiet_mode(quiet=False)
60
-
60
+
61
61
  # Restore defaults
62
62
  os.environ.pop("HF_HUB_DISABLE_PROGRESS_BARS", None)
63
63
  os.environ.pop("TRANSFORMERS_VERBOSITY", None)
64
-
64
+
65
65
  # Re-enable warnings
66
66
  warnings.filterwarnings("default")
67
-
67
+
68
68
  # Reset logging levels
69
69
  import logging
70
70
  logging.getLogger("transformers").setLevel(logging.INFO)
71
71
  logging.getLogger("sentence_transformers").setLevel(logging.INFO)
72
72
  logging.getLogger("mlx").setLevel(logging.INFO)
73
73
  logging.getLogger("chromadb").setLevel(logging.INFO)
74
+
75
+
76
+ def enable_debug_mode():
77
+ """Enable debug-level logging to stderr."""
78
+ import logging
79
+
80
+ # Re-enable warnings
81
+ warnings.filterwarnings("default")
82
+
83
+ # Restore library verbosity
84
+ os.environ.pop("HF_HUB_DISABLE_PROGRESS_BARS", None)
85
+ os.environ.pop("TRANSFORMERS_VERBOSITY", None)
86
+
87
+ # Configure root logger for debug output
88
+ root_logger = logging.getLogger()
89
+ root_logger.setLevel(logging.DEBUG)
90
+
91
+ # Add stderr handler if not already present
92
+ if not any(isinstance(h, logging.StreamHandler) and h.stream == sys.stderr
93
+ for h in root_logger.handlers):
94
+ handler = logging.StreamHandler(sys.stderr)
95
+ handler.setLevel(logging.DEBUG)
96
+ handler.setFormatter(logging.Formatter(
97
+ "%(asctime)s %(levelname)s %(name)s: %(message)s",
98
+ datefmt="%H:%M:%S"
99
+ ))
100
+ root_logger.addHandler(handler)
101
+
102
+ # Set library loggers to DEBUG
103
+ for name in ("keep", "transformers", "sentence_transformers", "mlx", "chromadb"):
104
+ logging.getLogger(name).setLevel(logging.DEBUG)