keep-skill 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keep/api.py ADDED
@@ -0,0 +1,686 @@
+"""
+Core API for associative memory.
+
+This is the minimal working implementation focused on:
+- update(): fetch → embed → summarize → store
+- remember(): embed → summarize → store
+- find(): embed query → search
+- get(): retrieve by ID
+"""
+
+import os
+import re
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+from .config import load_or_create_config, StoreConfig
+from .paths import get_default_store_path
+from .pending_summaries import PendingSummaryQueue
+from .providers import get_registry
+from .providers.base import (
+    DocumentProvider,
+    EmbeddingProvider,
+    SummarizationProvider,
+)
+from .providers.embedding_cache import CachingEmbeddingProvider
+from .store import ChromaStore
+from .types import Item, filter_non_system_tags
+
+
+# Default max length for truncated placeholder summaries
+TRUNCATE_LENGTH = 500
+
+# Maximum attempts before giving up on a pending summary
+MAX_SUMMARY_ATTEMPTS = 5
+
+
+# Collection name validation: lowercase ASCII letters, digits, and
+# underscores, starting with a letter
+COLLECTION_NAME_PATTERN = re.compile(r"^[a-z][a-z0-9_]*$")
+
+
+class Keeper:
+    """
+    Semantic memory keeper - persistent storage with similarity search.
+
+    Example:
+        kp = Keeper()
+        kp.update("file:///path/to/readme.md")
+        results = kp.find("installation instructions")
+    """
+
+    def __init__(
+        self,
+        store_path: Optional[str | Path] = None,
+        collection: str = "default",
+        decay_half_life_days: float = 30.0,
+    ) -> None:
+        """
+        Initialize or open an existing associative memory store.
+
+        Args:
+            store_path: Path to the store directory. Uses the default if not
+                specified.
+            collection: Default collection name.
+            decay_half_life_days: Memory decay half-life in days (ACT-R model).
+                After this many days, an item's effective relevance is halved.
+                Set to 0 or negative to disable decay.
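+
+        Example (illustrative; the half-life values are arbitrary):
+            kp = Keeper(decay_half_life_days=7.0)  # faster, one-week decay
+            kp = Keeper(decay_half_life_days=0)    # recency decay disabled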
+        """
+        # Resolve store path
+        if store_path is None:
+            self._store_path = get_default_store_path()
+        else:
+            self._store_path = Path(store_path).resolve()
+
+        # Validate collection name
+        if not COLLECTION_NAME_PATTERN.match(collection):
+            raise ValueError(
+                f"Invalid collection name '{collection}'. "
+                "Must be lowercase ASCII letters, digits, or underscores, "
+                "starting with a letter."
+            )
+        self._default_collection = collection
+        self._decay_half_life_days = decay_half_life_days
+
+        # Load or create configuration
+        self._config: StoreConfig = load_or_create_config(self._store_path)
+
+        # Initialize providers
+        registry = get_registry()
+
+        self._document_provider: DocumentProvider = registry.create_document(
+            self._config.document.name,
+            self._config.document.params,
+        )
+
+        # Create embedding provider with caching
+        base_embedding_provider = registry.create_embedding(
+            self._config.embedding.name,
+            self._config.embedding.params,
+        )
+        cache_path = self._store_path / "embedding_cache.db"
+        self._embedding_provider: EmbeddingProvider = CachingEmbeddingProvider(
+            base_embedding_provider,
+            cache_path=cache_path,
+        )
+
+        self._summarization_provider: SummarizationProvider = registry.create_summarization(
+            self._config.summarization.name,
+            self._config.summarization.params,
+        )
+
+        # Initialize pending summary queue
+        queue_path = self._store_path / "pending_summaries.db"
+        self._pending_queue = PendingSummaryQueue(queue_path)
+
+        # Initialize store
+        self._store = ChromaStore(
+            self._store_path,
+            embedding_dimension=self._embedding_provider.dimension,
+        )
+
+    def _resolve_collection(self, collection: Optional[str]) -> str:
+        """Resolve the collection name, validating it if provided."""
+        if collection is None:
+            return self._default_collection
+        if not COLLECTION_NAME_PATTERN.match(collection):
+            raise ValueError(f"Invalid collection name: {collection}")
+        return collection
+
+    # -------------------------------------------------------------------------
+    # Write Operations
+    # -------------------------------------------------------------------------
+
+    def update(
+        self,
+        id: str,
+        source_tags: Optional[dict[str, str]] = None,
+        *,
+        collection: Optional[str] = None,
+        lazy: bool = False,
+    ) -> Item:
+        """
+        Insert or update a document in the store.
+
+        Fetches the document, generates an embedding and a summary, then
+        stores it.
+
+        **Update behavior:**
+        - Summary: always replaced with a newly generated summary.
+        - Tags: merged - existing source tags are preserved, and new
+          source_tags override them on key collision. System tags (prefixed
+          with _) are always managed by the system.
+
+        Args:
+            id: URI of the document to fetch and index
+            source_tags: User-provided tags to merge with existing tags
+            collection: Target collection (uses default if None)
+            lazy: If True, use a truncated placeholder summary and queue the
+                item for background processing. Use `process_pending()` to
+                generate real summaries later.
+
+        Returns:
+            The stored Item with merged tags and the new summary
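+
+        Example (an illustrative sketch; the URI and tag values are
+        placeholders):
+            kp = Keeper()
+            item = kp.update(
+                "file:///path/to/readme.md",
+                source_tags={"project": "demo"},
+                lazy=True,  # placeholder summary now, real one in background
+            )
+            kp.process_pending()  # or let the spawned processor handle it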
+        """
+        coll = self._resolve_collection(collection)
+
+        # Get the existing item so its tags can be preserved
+        existing_tags = {}
+        existing = self._store.get(coll, id)
+        if existing:
+            # Extract existing non-system tags
+            existing_tags = filter_non_system_tags(existing.tags)
+
+        # Fetch document
+        doc = self._document_provider.fetch(id)
+
+        # Generate embedding
+        embedding = self._embedding_provider.embed(doc.content)
+
+        # Generate summary (or queue for later if lazy)
+        if lazy:
+            # Truncated placeholder
+            if len(doc.content) > TRUNCATE_LENGTH:
+                summary = doc.content[:TRUNCATE_LENGTH] + "..."
+            else:
+                summary = doc.content
+            # Queue for background processing
+            self._pending_queue.enqueue(id, coll, doc.content)
+        else:
+            summary = self._summarization_provider.summarize(doc.content)
+
+        # Build tags: existing + new (new overrides on collision)
+        tags = {**existing_tags}
+
+        # Merge in new source tags (filtered to prevent system-tag override)
+        if source_tags:
+            tags.update(filter_non_system_tags(source_tags))
+
+        # Add system tags
+        tags["_source"] = "uri"
+        if doc.content_type:
+            tags["_content_type"] = doc.content_type
+
+        # Store
+        self._store.upsert(
+            collection=coll,
+            id=id,
+            embedding=embedding,
+            summary=summary,
+            tags=tags,
+        )
+
+        # Spawn background processor if lazy
+        if lazy:
+            self._spawn_processor()
+
+        # Return the stored item
+        result = self._store.get(coll, id)
+        return result.to_item()
+
+    def remember(
+        self,
+        content: str,
+        *,
+        id: Optional[str] = None,
+        source_tags: Optional[dict[str, str]] = None,
+        collection: Optional[str] = None,
+        lazy: bool = False,
+    ) -> Item:
+        """
+        Store inline content directly (without fetching from a URI).
+
+        Use it for conversation snippets, notes, and insights.
+
+        **Update behavior (when id already exists):**
+        - Summary: replaced with a newly generated summary of the content.
+        - Tags: merged - existing source tags are preserved, and new
+          source_tags override them on key collision. System tags (prefixed
+          with _) are always managed by the system.
+
+        Args:
+            content: Text to store and index
+            id: Optional custom ID (auto-generated if None)
+            source_tags: User-provided tags to merge with existing tags
+            collection: Target collection (uses default if None)
+            lazy: If True, use a truncated placeholder summary and queue the
+                item for background processing. Use `process_pending()` to
+                generate real summaries later.
+
+        Returns:
+            The stored Item with merged tags and the new summary
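+
+        Example (illustrative; the content and tag values are placeholders):
+            kp = Keeper()
+            item = kp.remember(
+                "Deploys go out on Tuesdays after standup.",
+                source_tags={"topic": "process"},
+            )
+            # item.id was auto-generated with a 'mem:' timestamp prefix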
+        """
+        coll = self._resolve_collection(collection)
+
+        # Generate an ID if not provided
+        if id is None:
+            timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")
+            id = f"mem:{timestamp}"
+
+        # Get the existing item so its tags can be preserved
+        existing_tags = {}
+        existing = self._store.get(coll, id)
+        if existing:
+            # Extract existing non-system tags
+            existing_tags = filter_non_system_tags(existing.tags)
+
+        # Generate embedding
+        embedding = self._embedding_provider.embed(content)
+
+        # Generate summary (or queue for later if lazy)
+        if lazy:
+            # Truncated placeholder
+            if len(content) > TRUNCATE_LENGTH:
+                summary = content[:TRUNCATE_LENGTH] + "..."
+            else:
+                summary = content
+            # Queue for background processing
+            self._pending_queue.enqueue(id, coll, content)
+        else:
+            summary = self._summarization_provider.summarize(content)
+
+        # Build tags: existing + new (new overrides on collision)
+        tags = {**existing_tags}
+
+        # Merge in new source tags (filtered)
+        if source_tags:
+            tags.update(filter_non_system_tags(source_tags))
+
+        # Add system tags
+        tags["_source"] = "inline"
+
+        # Store
+        self._store.upsert(
+            collection=coll,
+            id=id,
+            embedding=embedding,
+            summary=summary,
+            tags=tags,
+        )
+
+        # Spawn background processor if lazy
+        if lazy:
+            self._spawn_processor()
+
+        # Return the stored item
+        result = self._store.get(coll, id)
+        return result.to_item()
+
+    # -------------------------------------------------------------------------
+    # Query Operations
+    # -------------------------------------------------------------------------
+
+    def _apply_recency_decay(self, items: list[Item]) -> list[Item]:
+        """
+        Apply ACT-R style recency decay to search results.
+
+        Multiplies each item's similarity score by a decay factor based on
+        the time since its last update. Uses exponential decay with a
+        configurable half-life.
+
+        Formula: effective_score = similarity × 0.5^(days_elapsed / half_life)
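+
+        For example, with the default 30-day half-life an item last updated
+        30 days ago keeps 50% of its raw similarity score, one updated 60
+        days ago keeps 25%, and one updated moments ago is unchanged.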
+        """
+        if self._decay_half_life_days <= 0:
+            return items  # Decay disabled
+
+        now = datetime.now(timezone.utc)
+        decayed_items = []
+
+        for item in items:
+            # Get the last update time from tags
+            updated_str = item.tags.get("_updated")
+            if updated_str and item.score is not None:
+                try:
+                    # Parse ISO timestamp
+                    updated = datetime.fromisoformat(updated_str.replace("Z", "+00:00"))
+                    days_elapsed = (now - updated).total_seconds() / 86400
+
+                    # Exponential decay: 0.5^(days/half_life)
+                    decay_factor = 0.5 ** (days_elapsed / self._decay_half_life_days)
+                    decayed_score = item.score * decay_factor
+
+                    # Create a new Item with the decayed score
+                    decayed_items.append(Item(
+                        id=item.id,
+                        summary=item.summary,
+                        tags=item.tags,
+                        score=decayed_score,
+                    ))
+                except (ValueError, TypeError):
+                    # If timestamp parsing fails, keep the original
+                    decayed_items.append(item)
+            else:
+                decayed_items.append(item)
+
+        # Re-sort by decayed score (highest first)
+        decayed_items.sort(key=lambda x: x.score if x.score is not None else 0, reverse=True)
+
+        return decayed_items
+
+    def find(
+        self,
+        query: str,
+        *,
+        limit: int = 10,
+        collection: Optional[str] = None,
+    ) -> list[Item]:
+        """
+        Find items using semantic similarity search.
+
+        Scores are adjusted by recency decay (ACT-R model) - older items
+        have reduced effective relevance unless recently updated.
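+
+        Example (illustrative):
+            kp = Keeper()
+            for item in kp.find("installation instructions", limit=5):
+                print(item.score, item.id)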
+        """
+        coll = self._resolve_collection(collection)
+
+        # Embed the query
+        embedding = self._embedding_provider.embed(query)
+
+        # Search (fetch extra results to account for re-ranking)
+        fetch_limit = limit * 2 if self._decay_half_life_days > 0 else limit
+        results = self._store.query_embedding(coll, embedding, limit=fetch_limit)
+
+        # Convert to Items and apply decay
+        items = [r.to_item() for r in results]
+        items = self._apply_recency_decay(items)
+
+        return items[:limit]
+
+    def find_similar(
+        self,
+        id: str,
+        *,
+        limit: int = 10,
+        include_self: bool = False,
+        collection: Optional[str] = None,
+    ) -> list[Item]:
+        """
+        Find items similar to an existing item.
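+
+        Example (illustrative; the ID must already exist in the store):
+            related = kp.find_similar("file:///path/to/readme.md", limit=5)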
+        """
+        coll = self._resolve_collection(collection)
+
+        # Get the item so its summary can be embedded
+        item = self._store.get(coll, id)
+        if item is None:
+            raise KeyError(f"Item not found: {id}")
+
+        # Search using the summary's embedding
+        embedding = self._embedding_provider.embed(item.summary)
+        actual_limit = limit + 1 if not include_self else limit
+        results = self._store.query_embedding(coll, embedding, limit=actual_limit)
+
+        # Filter out the item itself if needed
+        if not include_self:
+            results = [r for r in results if r.id != id]
+
+        # Convert to Items and apply decay
+        items = [r.to_item() for r in results]
+        items = self._apply_recency_decay(items)
+
+        return items[:limit]
+
+    def query_fulltext(
+        self,
+        query: str,
+        *,
+        limit: int = 10,
+        collection: Optional[str] = None,
+    ) -> list[Item]:
+        """
+        Search item summaries using full-text search.
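+
+        Only the stored summaries are searched, not the original document
+        content.
+
+        Example (illustrative):
+            hits = kp.query_fulltext("installation", limit=5)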
+        """
+        coll = self._resolve_collection(collection)
+        results = self._store.query_fulltext(coll, query, limit=limit)
+        return [r.to_item() for r in results]
+
+    def query_tag(
+        self,
+        key: Optional[str] = None,
+        value: Optional[str] = None,
+        *,
+        limit: int = 100,
+        collection: Optional[str] = None,
+        **tags: str,
+    ) -> list[Item]:
+        """
+        Find items by tag(s).
+
+        Usage:
+            # Simple: single key-value pair
+            query_tag("project", "myapp")
+            query_tag("tradition", "buddhist")
+
+            # Advanced: multiple tags via kwargs
+            query_tag(tradition="buddhist", source="mn22")
+        """
+        coll = self._resolve_collection(collection)
+
+        # Build the tag filter from positional or keyword args
+        tag_filter = {}
+
+        if key is not None:
+            if value is None:
+                raise ValueError(f"Value required when querying by key '{key}'")
+            tag_filter[key] = value
+
+        if tags:
+            tag_filter.update(tags)
+
+        if not tag_filter:
+            raise ValueError("At least one tag must be specified")
+
+        # Build the where clause
+        where = dict(tag_filter)
+
+        results = self._store.query_metadata(coll, where, limit=limit)
+        return [r.to_item() for r in results]
+
+    # -------------------------------------------------------------------------
+    # Direct Access
+    # -------------------------------------------------------------------------
+
+    def get(self, id: str, *, collection: Optional[str] = None) -> Optional[Item]:
+        """
+        Retrieve a specific item by ID.
+        """
+        coll = self._resolve_collection(collection)
+        result = self._store.get(coll, id)
+        if result is None:
+            return None
+        return result.to_item()
+
+    def exists(self, id: str, *, collection: Optional[str] = None) -> bool:
+        """
+        Check whether an item exists in the store.
+        """
+        coll = self._resolve_collection(collection)
+        return self._store.exists(coll, id)
+
+    def delete(self, id: str, *, collection: Optional[str] = None) -> bool:
+        """
+        Delete an item from the store.
+
+        Returns True if the item existed and was deleted.
+        """
+        coll = self._resolve_collection(collection)
+        return self._store.delete(coll, id)
+
+    # -------------------------------------------------------------------------
+    # Collection Management
+    # -------------------------------------------------------------------------
+
+    def list_collections(self) -> list[str]:
+        """
+        List all collections in the store.
+        """
+        return self._store.list_collections()
+
+    def count(self, *, collection: Optional[str] = None) -> int:
+        """
+        Count the items in a collection.
+        """
+        coll = self._resolve_collection(collection)
+        return self._store.count(coll)
+
+    def embedding_cache_stats(self) -> dict:
+        """
+        Get embedding cache statistics.
+
+        Returns a dict with: entries, hits, misses, hit_rate, cache_path
+        """
+        if isinstance(self._embedding_provider, CachingEmbeddingProvider):
+            return self._embedding_provider.stats()
+        return {"enabled": False}
+
+    # -------------------------------------------------------------------------
+    # Pending Summaries
+    # -------------------------------------------------------------------------
+
+    def process_pending(self, limit: int = 10) -> int:
+        """
+        Process pending summaries queued by lazy update/remember.
+
+        Generates real summaries for items that were indexed with truncated
+        placeholders and updates the stored items in place.
+
+        Items that fail MAX_SUMMARY_ATTEMPTS times are removed from the
+        queue (the truncated placeholder remains in the store).
+
+        Args:
+            limit: Maximum number of items to process in this batch
+
+        Returns:
+            Number of items successfully processed
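+
+        Example (an illustrative drain loop):
+            kp = Keeper()
+            while kp.pending_count() > 0:
+                if kp.process_pending(limit=20) == 0:
+                    break  # nothing processed this batch; items are failing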
+        """
+        items = self._pending_queue.dequeue(limit=limit)
+        processed = 0
+
+        for item in items:
+            # Skip items that have failed too many times
+            # (attempts was already incremented by dequeue, so check >= MAX)
+            if item.attempts >= MAX_SUMMARY_ATTEMPTS:
+                # Give up - remove from the queue, keep the truncated placeholder
+                self._pending_queue.complete(item.id, item.collection)
+                continue
+
+            try:
+                # Generate the real summary
+                summary = self._summarization_provider.summarize(item.content)
+
+                # Update the stored item's summary
+                self._store.update_summary(item.collection, item.id, summary)
+
+                # Remove from the queue
+                self._pending_queue.complete(item.id, item.collection)
+                processed += 1
+
+            except Exception:
+                # Leave in the queue for retry (attempt counter already incremented)
+                pass
+
+        return processed
+
+    def pending_count(self) -> int:
+        """Get the count of pending summaries awaiting processing."""
+        return self._pending_queue.count()
+
+    def pending_stats(self) -> dict:
+        """
+        Get pending summary queue statistics.
+
+        Returns a dict with: pending, collections, max_attempts, oldest, queue_path
+        """
+        return self._pending_queue.stats()
+
+    @property
+    def _processor_pid_path(self) -> Path:
+        """Path to the processor PID file."""
+        return self._store_path / "processor.pid"
+
+    def _is_processor_running(self) -> bool:
+        """Check whether a processor is already running."""
+        pid_path = self._processor_pid_path
+        if not pid_path.exists():
+            return False
+
+        try:
+            pid = int(pid_path.read_text().strip())
+            # Check if the process is alive by sending signal 0
+            os.kill(pid, 0)
+            return True
+        except (ValueError, ProcessLookupError, PermissionError):
+            # PID file invalid, process dead, or permission issue:
+            # clean up the stale PID file
+            try:
+                pid_path.unlink()
+            except OSError:
+                pass
+            return False
+
+    def _spawn_processor(self) -> bool:
+        """
+        Spawn a background processor if one is not already running.
+
+        Returns True if a new processor was spawned, False if one was
+        already running or the spawn failed.
+        """
+        if self._is_processor_running():
+            return False
+
+        try:
+            # Spawn a detached process.
+            # Use sys.executable to ensure we run the same Python.
+            cmd = [
+                sys.executable, "-m", "keep.cli",
+                "process-pending",
+                "--daemon",
+                "--store", str(self._store_path),
+            ]
+
+            # Platform-specific detachment
+            kwargs: dict = {
+                "stdout": subprocess.DEVNULL,
+                "stderr": subprocess.DEVNULL,
+                "stdin": subprocess.DEVNULL,
+            }
+
+            if sys.platform != "win32":
+                # Unix: start a new session to fully detach
+                kwargs["start_new_session"] = True
+            else:
+                # Windows: use CREATE_NEW_PROCESS_GROUP
+                kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP
+
+            subprocess.Popen(cmd, **kwargs)
+            return True
+
+        except Exception:
+            # Spawn failed - not critical, the queue will be processed later
+            return False
+
+    def close(self) -> None:
+        """
+        Close resources (embedding cache connection, pending queue, etc.).
+
+        Good practice to call when done, though Python's GC will clean up
+        eventually.
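+
+        Example (illustrative; the context-manager form calls close()
+        automatically):
+            with Keeper() as kp:
+                kp.remember("note to self")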
+        """
+        # Close the embedding cache if it exists. Guard with hasattr on the
+        # attribute itself: __del__ may call close() on a partially
+        # initialized instance.
+        if hasattr(self, '_embedding_provider') and hasattr(self._embedding_provider, '_cache'):
+            cache = self._embedding_provider._cache
+            if hasattr(cache, 'close'):
+                cache.close()
+
+        # Close the pending summary queue
+        if hasattr(self, '_pending_queue'):
+            self._pending_queue.close()
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - close resources."""
+        self.close()
+        return False
+
+    def __del__(self):
+        """Cleanup on deletion."""
+        self.close()