keep-skill 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keep/api.py CHANGED
@@ -8,17 +8,104 @@ This is the minimal working implementation focused on:
  - get(): retrieve by ID
  """
 
+ import hashlib
+ import logging
  import re
- from datetime import datetime, timezone
+ from datetime import datetime, timezone, timedelta
  from pathlib import Path
  from typing import Any, Optional
 
+ logger = logging.getLogger(__name__)
+
+
+ def _parse_since(since: str) -> str:
+     """
+     Parse a 'since' string and return a YYYY-MM-DD cutoff date.
+
+     Accepts:
+     - ISO 8601 duration: P3D (3 days), P1W (1 week), PT1H (1 hour), P1DT12H, etc.
+     - ISO date: 2026-01-15
+     - Date with slashes: 2026/01/15
+
+     Returns:
+         YYYY-MM-DD string for the cutoff date
+     """
+     since = since.strip()
+
+     # ISO 8601 duration: P[n]Y[n]M[n]W[n]DT[n]H[n]M[n]S
+     if since.upper().startswith("P"):
+         duration_str = since.upper()
+
+         # Parse duration components
+         years = months = weeks = days = hours = minutes = seconds = 0
+
+         # Split on T to separate date and time parts
+         if "T" in duration_str:
+             date_part, time_part = duration_str.split("T", 1)
+         else:
+             date_part = duration_str
+             time_part = ""
+
+         # Parse date part (P[n]Y[n]M[n]W[n]D)
+         date_part = date_part[1:]  # Remove leading P
+         for match in re.finditer(r"(\d+)([YMWD])", date_part):
+             value, unit = int(match.group(1)), match.group(2)
+             if unit == "Y":
+                 years = value
+             elif unit == "M":
+                 months = value
+             elif unit == "W":
+                 weeks = value
+             elif unit == "D":
+                 days = value
+
+         # Parse time part ([n]H[n]M[n]S)
+         for match in re.finditer(r"(\d+)([HMS])", time_part):
+             value, unit = int(match.group(1)), match.group(2)
+             if unit == "H":
+                 hours = value
+             elif unit == "M":
+                 minutes = value
+             elif unit == "S":
+                 seconds = value
+
+         # Convert to timedelta (approximate months/years)
+         total_days = years * 365 + months * 30 + weeks * 7 + days
+         delta = timedelta(days=total_days, hours=hours, minutes=minutes, seconds=seconds)
+         cutoff = datetime.now(timezone.utc) - delta
+         return cutoff.strftime("%Y-%m-%d")
+
+     # Try parsing as date
+     # ISO format: 2026-01-15 or 2026-01-15T...
+     # Slash format: 2026/01/15
+     date_str = since.replace("/", "-").split("T")[0]
+
+     try:
+         parsed = datetime.strptime(date_str, "%Y-%m-%d")
+         return parsed.strftime("%Y-%m-%d")
+     except ValueError:
+         pass
+
+     raise ValueError(
+         f"Invalid 'since' format: {since}. "
+         "Use ISO duration (P3D, PT1H, P1W) or date (2026-01-15)"
+     )
+
+
+ def _filter_by_date(items: list, since: str) -> list:
+     """Filter items to only those updated since the given date/duration."""
+     cutoff = _parse_since(since)
+     return [
+         item for item in items
+         if item.tags.get("_updated_date", "0000-00-00") >= cutoff
+     ]
+
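A quick sketch of how the new `since` parsing behaves (the example dates and inputs are illustrative; `_parse_since` is the private helper added above):

```python
from keep.api import _parse_since

# ISO 8601 durations are interpreted relative to the current UTC time
_parse_since("P3D")    # -> the date three days ago, e.g. "2026-01-12"
_parse_since("PT12H")  # -> today or yesterday, depending on the hour
_parse_since("P1W")    # -> the date one week ago

# Explicit dates pass through, normalized to YYYY-MM-DD
_parse_since("2026/01/15")  # -> "2026-01-15"
_parse_since("yesterday")   # -> raises ValueError: Invalid 'since' format
```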
  import os
  import subprocess
  import sys
 
- from .config import load_or_create_config, StoreConfig
- from .paths import get_default_store_path
+ from .config import load_or_create_config, save_config, StoreConfig, EmbeddingIdentity
+ from .paths import get_config_dir, get_default_store_path
  from .pending_summaries import PendingSummaryQueue
  from .providers import get_registry
  from .providers.base import (
@@ -27,8 +114,9 @@ from .providers.base import (
      SummarizationProvider,
  )
  from .providers.embedding_cache import CachingEmbeddingProvider
+ from .document_store import VersionInfo
  from .store import ChromaStore
- from .types import Item, filter_non_system_tags
+ from .types import Item, filter_non_system_tags, SYSTEM_TAG_PREFIX
 
 
  # Default max length for truncated placeholder summaries
@@ -41,6 +129,88 @@ MAX_SUMMARY_ATTEMPTS = 5
  # Collection name validation: lowercase ASCII and underscores only
  COLLECTION_NAME_PATTERN = re.compile(r"^[a-z][a-z0-9_]*$")
 
+ # Environment variable prefix for auto-applied tags
+ ENV_TAG_PREFIX = "KEEP_TAG_"
+
+ # Fixed ID for the current working context (singleton)
+ NOWDOC_ID = "_now:default"
+
+ # Path to system documents
+ SYSTEM_DOC_DIR = Path(__file__).parent.parent / "docs" / "system"
+
+
+ def _load_frontmatter(path: Path) -> tuple[str, dict[str, str]]:
+     """
+     Load content and tags from a file with optional YAML frontmatter.
+
+     Args:
+         path: Path to the file
+
+     Returns:
+         (content, tags) tuple. Tags empty if no frontmatter.
+
+     Raises:
+         FileNotFoundError: If the file doesn't exist
+     """
+     text = path.read_text()
+
+     # Parse YAML frontmatter if present
+     if text.startswith("---"):
+         parts = text.split("---", 2)
+         if len(parts) >= 3:
+             import yaml
+             frontmatter = yaml.safe_load(parts[1])
+             content = parts[2].lstrip("\n")
+             if frontmatter:
+                 tags = frontmatter.get("tags", {})
+                 # Ensure all tag values are strings
+                 tags = {k: str(v) for k, v in tags.items()}
+                 return content, tags
+             return content, {}
+
+     return text, {}
+
+
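For reference, a sketch of the frontmatter convention this helper expects (the file name and contents are illustrative):

```python
from pathlib import Path
from keep.api import _load_frontmatter

doc = Path("example.md")  # hypothetical file
doc.write_text(
    "---\n"
    "tags:\n"
    "  topic: memory\n"
    "  priority: 1\n"
    "---\n"
    "# Now\n"
)

content, tags = _load_frontmatter(doc)
# content == "# Now\n"; tags == {"topic": "memory", "priority": "1"}
# (values are coerced to str; a file without a leading "---" yields tags == {})
```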
+ def _get_env_tags() -> dict[str, str]:
+     """
+     Collect tags from KEEP_TAG_* environment variables.
+
+     KEEP_TAG_PROJECT=foo -> {"project": "foo"}
+     KEEP_TAG_MyTag=bar   -> {"mytag": "bar"}
+
+     Tag keys are lowercased for consistency.
+     """
+     tags = {}
+     for key, value in os.environ.items():
+         if key.startswith(ENV_TAG_PREFIX) and value:
+             tag_key = key[len(ENV_TAG_PREFIX):].lower()
+             tags[tag_key] = value
+     return tags
+
+
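A minimal check of the environment-tag behavior (variable names and values are illustrative):

```python
import os
from keep.api import _get_env_tags

os.environ["KEEP_TAG_PROJECT"] = "keep"
os.environ["KEEP_TAG_Branch"] = "main"
os.environ["KEEP_TAG_EMPTY"] = ""   # empty values are skipped

assert _get_env_tags() == {"project": "keep", "branch": "main"}
```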
+ def _content_hash(content: str) -> str:
+     """SHA256 hash of content for change detection."""
+     return hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+
+ def _text_content_id(content: str) -> str:
+     """
+     Generate a content-addressed ID for text updates.
+
+     This makes text updates versioned by content:
+     - `keep update "my note"` → ID = _text:{hash[:12]}
+     - `keep update "my note" -t status=done` → same ID, new version
+     - `keep update "different note"` → different ID
+
+     Args:
+         content: The text content
+
+     Returns:
+         Content-addressed ID in format _text:{hash[:12]}
+     """
+     content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()[:12]
+     return f"_text:{content_hash}"
+
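The content-addressing is deterministic, so identical text always maps to the same ID (a small sketch):

```python
from keep.api import _text_content_id

a = _text_content_id("my note")
b = _text_content_id("my note")
c = _text_content_id("different note")

assert a == b  # same content, same ID (tag-only edits become new versions)
assert a != c  # different content, different ID
assert a.startswith("_text:") and len(a) == len("_text:") + 12
```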
 
  class Keeper:
      """
@@ -60,20 +230,15 @@ class Keeper:
      ) -> None:
          """
          Initialize or open an existing associative memory store.
-
+
          Args:
              store_path: Path to store directory. Uses default if not specified.
+                 Overrides any store.path setting in config.
              collection: Default collection name.
              decay_half_life_days: Memory decay half-life in days (ACT-R model).
                  After this many days, an item's effective relevance is halved.
                  Set to 0 or negative to disable decay.
          """
-         # Resolve store path
-         if store_path is None:
-             self._store_path = get_default_store_path()
-         else:
-             self._store_path = Path(store_path).resolve()
-
          # Validate collection name
          if not COLLECTION_NAME_PATTERN.match(collection):
              raise ValueError(
@@ -82,43 +247,161 @@ class Keeper:
              )
          self._default_collection = collection
          self._decay_half_life_days = decay_half_life_days
-
+
+         # Resolve config and store paths
+         # If store_path is explicitly provided, use it as both config and store location
+         # Otherwise, discover config via tree-walk and let config determine store
+         if store_path is not None:
+             self._store_path = Path(store_path).resolve()
+             config_dir = self._store_path
+         else:
+             # Discover config directory (tree-walk or envvar)
+             config_dir = get_config_dir()
+
          # Load or create configuration
-         self._config: StoreConfig = load_or_create_config(self._store_path)
-
-         # Initialize providers
+         self._config: StoreConfig = load_or_create_config(config_dir)
+
+         # If store_path wasn't explicit, resolve from config
+         if store_path is None:
+             self._store_path = get_default_store_path(self._config)
+
+         # Initialize document provider (needed for most operations)
          registry = get_registry()
-
          self._document_provider: DocumentProvider = registry.create_document(
              self._config.document.name,
              self._config.document.params,
          )
-
-         # Create embedding provider with caching
-         base_embedding_provider = registry.create_embedding(
-             self._config.embedding.name,
-             self._config.embedding.params,
-         )
-         cache_path = self._store_path / "embedding_cache.db"
-         self._embedding_provider: EmbeddingProvider = CachingEmbeddingProvider(
-             base_embedding_provider,
-             cache_path=cache_path,
-         )
-
-         self._summarization_provider: SummarizationProvider = registry.create_summarization(
-             self._config.summarization.name,
-             self._config.summarization.params,
-         )
+
+         # Lazy-loaded providers (created on first use to avoid network access for read-only ops)
+         self._embedding_provider: Optional[EmbeddingProvider] = None
+         self._summarization_provider: Optional[SummarizationProvider] = None
 
          # Initialize pending summary queue
          queue_path = self._store_path / "pending_summaries.db"
          self._pending_queue = PendingSummaryQueue(queue_path)
 
-         # Initialize store
+         # Initialize document store (canonical records)
+         from .document_store import DocumentStore
+         doc_store_path = self._store_path / "documents.db"
+         self._document_store = DocumentStore(doc_store_path)
+
+         # Initialize ChromaDB store (embedding index)
+         # Use dimension from stored identity if available (allows offline read-only access)
+         embedding_dim = None
+         if self._config.embedding_identity:
+             embedding_dim = self._config.embedding_identity.dimension
          self._store = ChromaStore(
              self._store_path,
-             embedding_dimension=self._embedding_provider.dimension,
+             embedding_dimension=embedding_dim,
          )
+
+         # Preload system documents (only if not already present)
+         self._ensure_system_documents()
+
+     def _ensure_system_documents(self) -> None:
+         """
+         Ensure system documents are loaded into the store.
+
+         Scans all .md files in docs/system/. Each file is indexed with its
+         file:// URI as the ID and a `category: system` tag for identification.
+         Content becomes the summary directly (no auto-summarization).
+
+         Called during init. Only loads docs that don't already exist,
+         so user modifications are preserved and no network access occurs
+         if docs are already present.
+         """
+         for path in SYSTEM_DOC_DIR.glob("*.md"):
+             try:
+                 uri = f"file://{path.resolve()}"
+                 if not self.exists(uri):
+                     content, tags = _load_frontmatter(path)
+                     tags["category"] = "system"
+                     self.remember(content, id=uri, tags=tags)
+             except FileNotFoundError:
+                 # System file missing - skip silently
+                 pass
+
+     def _get_embedding_provider(self) -> EmbeddingProvider:
+         """
+         Get embedding provider, creating it lazily on first use.
+
+         This allows read-only operations to work offline without loading
+         the embedding model (which may try to reach HuggingFace).
+         """
+         if self._embedding_provider is None:
+             registry = get_registry()
+             base_provider = registry.create_embedding(
+                 self._config.embedding.name,
+                 self._config.embedding.params,
+             )
+             cache_path = self._store_path / "embedding_cache.db"
+             self._embedding_provider = CachingEmbeddingProvider(
+                 base_provider,
+                 cache_path=cache_path,
+             )
+             # Validate or record embedding identity
+             self._validate_embedding_identity(self._embedding_provider)
+             # Update store's embedding dimension if it wasn't known at init
+             if self._store._embedding_dimension is None:
+                 self._store._embedding_dimension = self._embedding_provider.dimension
+         return self._embedding_provider
+
+     def _get_summarization_provider(self) -> SummarizationProvider:
+         """
+         Get summarization provider, creating it lazily on first use.
+         """
+         if self._summarization_provider is None:
+             registry = get_registry()
+             self._summarization_provider = registry.create_summarization(
+                 self._config.summarization.name,
+                 self._config.summarization.params,
+             )
+         return self._summarization_provider
+
+     def _validate_embedding_identity(self, provider: EmbeddingProvider) -> None:
+         """
+         Validate embedding provider matches stored identity, or record it.
+
+         On first use, records the embedding identity to config.
+         On subsequent uses, validates that the current provider matches.
+
+         Raises:
+             ValueError: If embedding provider changed incompatibly
+         """
+         # Get current provider's identity
+         current = EmbeddingIdentity(
+             provider=self._config.embedding.name,
+             model=getattr(provider, "model_name", "unknown"),
+             dimension=provider.dimension,
+         )
+
+         stored = self._config.embedding_identity
+
+         if stored is None:
+             # First use: record the identity
+             self._config.embedding_identity = current
+             save_config(self._config)
+         else:
+             # Validate compatibility
+             if (stored.provider != current.provider or
+                     stored.model != current.model or
+                     stored.dimension != current.dimension):
+                 raise ValueError(
+                     f"Embedding provider mismatch!\n"
+                     f"  Stored:  {stored.provider}/{stored.model} ({stored.dimension}d)\n"
+                     f"  Current: {current.provider}/{current.model} ({current.dimension}d)\n"
+                     f"\n"
+                     f"Changing embedding providers invalidates existing embeddings.\n"
+                     f"Options:\n"
+                     f"  1. Use the original provider\n"
+                     f"  2. Delete .keep/ and re-index\n"
+                     f"  3. (Future) Run migration to re-embed with new provider"
+                 )
+
+     @property
+     def embedding_identity(self) -> EmbeddingIdentity | None:
+         """Current embedding identity (provider, model, dimension)."""
+         return self._config.embedding_identity
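A sketch of what the identity check protects against (the printed values are illustrative):

```python
from keep.api import Keeper

k = Keeper()
k.query("anything")  # first embedding use records provider/model/dimension

print(k.embedding_identity)
# e.g. EmbeddingIdentity(provider=..., model=..., dimension=384)

# If the configured embedding provider later changes incompatibly, the next
# embedding call raises ValueError("Embedding provider mismatch! ...") instead
# of silently mixing vectors from different models.
```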
 
      def _resolve_collection(self, collection: Optional[str]) -> str:
          """Resolve collection name, validating if provided."""
@@ -135,8 +418,10 @@ class Keeper:
      def update(
          self,
          id: str,
-         source_tags: Optional[dict[str, str]] = None,
+         tags: Optional[dict[str, str]] = None,
          *,
+         summary: Optional[str] = None,
+         source_tags: Optional[dict[str, str]] = None,  # Deprecated alias
          collection: Optional[str] = None,
          lazy: bool = False
      ) -> Item:
@@ -146,84 +431,170 @@ class Keeper:
          Fetches the document, generates embeddings and summary, then stores it.
 
          **Update behavior:**
-         - Summary: Always replaced with newly generated summary
-         - Tags: Merged - existing source tags are preserved, new source_tags override
+         - Summary: Replaced with user-provided or newly generated summary
+         - Tags: Merged - existing tags are preserved, new tags override
            on key collision. System tags (prefixed with _) are always managed by
            the system.
 
          Args:
              id: URI of document to fetch and index
-             source_tags: User-provided tags to merge with existing tags
+             tags: User-provided tags to merge with existing tags
+             summary: User-provided summary (skips auto-summarization if given)
+             source_tags: Deprecated alias for 'tags'
              collection: Target collection (uses default if None)
              lazy: If True, use truncated placeholder summary and queue for
                  background processing. Use `process_pending()` to generate
-                 real summaries later.
+                 real summaries later. Ignored if summary is provided.
 
          Returns:
              The stored Item with merged tags and new summary
          """
+         # Handle deprecated source_tags parameter
+         if source_tags is not None:
+             import warnings
+             warnings.warn(
+                 "source_tags is deprecated, use 'tags' instead",
+                 DeprecationWarning,
+                 stacklevel=2
+             )
+             if tags is None:
+                 tags = source_tags
+
          coll = self._resolve_collection(collection)
 
-         # Get existing item to preserve tags
+         # Get existing item to preserve tags (check document store first, fall back to ChromaDB)
          existing_tags = {}
-         existing = self._store.get(coll, id)
-         if existing:
-             # Extract existing non-system tags
-             existing_tags = filter_non_system_tags(existing.tags)
+         existing_doc = self._document_store.get(coll, id)
+         if existing_doc:
+             existing_tags = filter_non_system_tags(existing_doc.tags)
+         else:
+             # Fall back to ChromaDB for legacy data
+             existing = self._store.get(coll, id)
+             if existing:
+                 existing_tags = filter_non_system_tags(existing.tags)
 
          # Fetch document
          doc = self._document_provider.fetch(id)
 
+         # Compute content hash for change detection
+         new_hash = _content_hash(doc.content)
+
          # Generate embedding
-         embedding = self._embedding_provider.embed(doc.content)
+         embedding = self._get_embedding_provider().embed(doc.content)
 
-         # Generate summary (or queue for later if lazy)
-         if lazy:
-             # Truncated placeholder
-             if len(doc.content) > TRUNCATE_LENGTH:
-                 summary = doc.content[:TRUNCATE_LENGTH] + "..."
+         # Determine summary - skip if content unchanged
+         max_len = self._config.max_summary_length
+         content_unchanged = (
+             existing_doc is not None
+             and existing_doc.content_hash == new_hash
+         )
+
+         if content_unchanged and summary is None:
+             # Content unchanged - preserve existing summary
+             logger.debug("Content unchanged, skipping summarization for %s", id)
+             final_summary = existing_doc.summary
+         elif summary is not None:
+             # User-provided summary - validate length
+             if len(summary) > max_len:
+                 import warnings
+                 warnings.warn(
+                     f"Summary exceeds max_summary_length ({len(summary)} > {max_len}), truncating",
+                     UserWarning,
+                     stacklevel=2
+                 )
+                 summary = summary[:max_len]
+             final_summary = summary
+         elif lazy:
+             # Truncated placeholder for lazy mode
+             if len(doc.content) > max_len:
+                 final_summary = doc.content[:max_len] + "..."
              else:
-                 summary = doc.content
+                 final_summary = doc.content
              # Queue for background processing
              self._pending_queue.enqueue(id, coll, doc.content)
          else:
-             summary = self._summarization_provider.summarize(doc.content)
+             # Auto-generate summary
+             final_summary = self._get_summarization_provider().summarize(doc.content)
+
+         # Build tags: existing → config → env → user (later wins on collision)
+         merged_tags = {**existing_tags}
 
-         # Build tags: existing + new (new overrides on collision)
-         tags = {**existing_tags}
+         # Merge config default tags
+         if self._config.default_tags:
+             merged_tags.update(self._config.default_tags)
 
-         # Merge in new source tags (filtered to prevent system tag override)
-         if source_tags:
-             tags.update(filter_non_system_tags(source_tags))
+         # Merge environment variable tags
+         env_tags = _get_env_tags()
+         merged_tags.update(env_tags)
+
+         # Merge in user-provided tags (filtered to prevent system tag override)
+         if tags:
+             merged_tags.update(filter_non_system_tags(tags))
 
          # Add system tags
-         tags["_source"] = "uri"
+         merged_tags["_source"] = "uri"
          if doc.content_type:
-             tags["_content_type"] = doc.content_type
+             merged_tags["_content_type"] = doc.content_type
 
-         # Store
+         # Get existing doc info for versioning before upsert
+         old_doc = self._document_store.get(coll, id)
+
+         # Dual-write: document store (canonical) + ChromaDB (embedding index)
+         # DocumentStore.upsert now returns (record, content_changed) and archives old version
+         doc_record, content_changed = self._document_store.upsert(
+             collection=coll,
+             id=id,
+             summary=final_summary,
+             tags=merged_tags,
+             content_hash=new_hash,
+         )
+
+         # Store embedding for current version
          self._store.upsert(
              collection=coll,
              id=id,
              embedding=embedding,
-             summary=summary,
-             tags=tags,
+             summary=final_summary,
+             tags=merged_tags,
          )
 
-         # Spawn background processor if lazy
-         if lazy:
+         # If content changed and we archived a version, also store versioned embedding
+         # Skip if content hash is same (only tags/summary changed)
+         if old_doc is not None and content_changed:
+             # Get the version number that was just archived
+             version_count = self._document_store.version_count(coll, id)
+             if version_count > 0:
+                 # Re-embed the old content for the archived version
+                 old_embedding = self._get_embedding_provider().embed(old_doc.summary)
+                 self._store.upsert_version(
+                     collection=coll,
+                     id=id,
+                     version=version_count,
+                     embedding=old_embedding,
+                     summary=old_doc.summary,
+                     tags=old_doc.tags,
+                 )
+
+         # Spawn background processor if lazy (only if summary wasn't user-provided and content changed)
+         if lazy and summary is None and not content_unchanged:
              self._spawn_processor()
 
          # Return the stored item
-         result = self._store.get(coll, id)
-         return result.to_item()
+         doc_record = self._document_store.get(coll, id)
+         return Item(
+             id=doc_record.id,
+             summary=doc_record.summary,
+             tags=doc_record.tags,
+         )
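A hedged sketch of the new update() surface (the URI and tag values are illustrative):

```python
from keep.api import Keeper

k = Keeper()

# New keyword names: 'tags' replaces 'source_tags', and a user-provided
# summary skips auto-summarization entirely.
item = k.update(
    "https://example.com/post",  # illustrative URI
    tags={"project": "keep"},
    summary="Short hand-written summary",
)

# The old name still works but emits a DeprecationWarning:
item = k.update("https://example.com/post", source_tags={"project": "keep"})
```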
 
      def remember(
          self,
          content: str,
          *,
          id: Optional[str] = None,
-         source_tags: Optional[dict[str, str]] = None,
+         summary: Optional[str] = None,
+         tags: Optional[dict[str, str]] = None,
+         source_tags: Optional[dict[str, str]] = None,  # Deprecated alias
          collection: Optional[str] = None,
          lazy: bool = False
      ) -> Item:
@@ -232,24 +603,42 @@ class Keeper:
 
          Use for conversation snippets, notes, insights.
 
+         **Smart summary behavior:**
+         - If summary is provided, use it (skips auto-summarization)
+         - If content is short (≤ max_summary_length), use content verbatim
+         - Otherwise, generate summary via summarization provider
+
          **Update behavior (when id already exists):**
-         - Summary: Replaced with newly generated summary from content
-         - Tags: Merged - existing source tags preserved, new source_tags override
+         - Summary: Replaced with user-provided, content, or generated summary
+         - Tags: Merged - existing tags preserved, new tags override
            on key collision. System tags (prefixed with _) are always managed by
            the system.
 
          Args:
              content: Text to store and index
              id: Optional custom ID (auto-generated if None)
-             source_tags: User-provided tags to merge with existing tags
+             summary: User-provided summary (skips auto-summarization if given)
+             tags: User-provided tags to merge with existing tags
+             source_tags: Deprecated alias for 'tags'
              collection: Target collection (uses default if None)
-             lazy: If True, use truncated placeholder summary and queue for
-                 background processing. Use `process_pending()` to generate
-                 real summaries later.
+             lazy: If True and content is long, use truncated placeholder summary
+                 and queue for background processing. Ignored if content is
+                 short or summary is provided.
 
          Returns:
              The stored Item with merged tags and new summary
          """
+         # Handle deprecated source_tags parameter
+         if source_tags is not None:
+             import warnings
+             warnings.warn(
+                 "source_tags is deprecated, use 'tags' instead",
+                 DeprecationWarning,
+                 stacklevel=2
+             )
+             if tags is None:
+                 tags = source_tags
+
          coll = self._resolve_collection(collection)
 
          # Generate ID if not provided
@@ -257,54 +646,124 @@ class Keeper:
              timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")
              id = f"mem:{timestamp}"
 
-         # Get existing item to preserve tags
+         # Get existing item to preserve tags (check document store first, fall back to ChromaDB)
          existing_tags = {}
-         existing = self._store.get(coll, id)
-         if existing:
-             # Extract existing non-system tags
-             existing_tags = filter_non_system_tags(existing.tags)
+         existing_doc = self._document_store.get(coll, id)
+         if existing_doc:
+             existing_tags = filter_non_system_tags(existing_doc.tags)
+         else:
+             existing = self._store.get(coll, id)
+             if existing:
+                 existing_tags = filter_non_system_tags(existing.tags)
+
+         # Compute content hash for change detection
+         new_hash = _content_hash(content)
 
          # Generate embedding
-         embedding = self._embedding_provider.embed(content)
+         embedding = self._get_embedding_provider().embed(content)
 
-         # Generate summary (or queue for later if lazy)
-         if lazy:
-             # Truncated placeholder
-             if len(content) > TRUNCATE_LENGTH:
-                 summary = content[:TRUNCATE_LENGTH] + "..."
-             else:
-                 summary = content
+         # Determine summary (smart behavior for remember) - skip if content unchanged
+         max_len = self._config.max_summary_length
+         content_unchanged = (
+             existing_doc is not None
+             and existing_doc.content_hash == new_hash
+         )
+
+         if content_unchanged and summary is None:
+             # Content unchanged - preserve existing summary
+             logger.debug("Content unchanged, skipping summarization for %s", id)
+             final_summary = existing_doc.summary
+         elif summary is not None:
+             # User-provided summary - validate length
+             if len(summary) > max_len:
+                 import warnings
+                 warnings.warn(
+                     f"Summary exceeds max_summary_length ({len(summary)} > {max_len}), truncating",
+                     UserWarning,
+                     stacklevel=2
+                 )
+                 summary = summary[:max_len]
+             final_summary = summary
+         elif len(content) <= max_len:
+             # Content is short enough - use verbatim (smart summary)
+             final_summary = content
+         elif lazy:
+             # Content is long and lazy mode - truncated placeholder
+             final_summary = content[:max_len] + "..."
              # Queue for background processing
              self._pending_queue.enqueue(id, coll, content)
          else:
-             summary = self._summarization_provider.summarize(content)
+             # Content is long - generate summary
+             final_summary = self._get_summarization_provider().summarize(content)
+
+         # Build tags: existing → config → env → user (later wins on collision)
+         merged_tags = {**existing_tags}
 
-         # Build tags: existing + new (new overrides on collision)
-         tags = {**existing_tags}
+         # Merge config default tags
+         if self._config.default_tags:
+             merged_tags.update(self._config.default_tags)
 
-         # Merge in new source tags (filtered)
-         if source_tags:
-             tags.update(filter_non_system_tags(source_tags))
+         # Merge environment variable tags
+         env_tags = _get_env_tags()
+         merged_tags.update(env_tags)
+
+         # Merge in user-provided tags (filtered)
+         if tags:
+             merged_tags.update(filter_non_system_tags(tags))
 
          # Add system tags
-         tags["_source"] = "inline"
+         merged_tags["_source"] = "inline"
 
-         # Store
+         # Get existing doc info for versioning before upsert
+         old_doc = self._document_store.get(coll, id)
+
+         # Dual-write: document store (canonical) + ChromaDB (embedding index)
+         # DocumentStore.upsert now returns (record, content_changed) and archives old version
+         doc_record, content_changed = self._document_store.upsert(
+             collection=coll,
+             id=id,
+             summary=final_summary,
+             tags=merged_tags,
+             content_hash=new_hash,
+         )
+
+         # Store embedding for current version
          self._store.upsert(
              collection=coll,
              id=id,
              embedding=embedding,
-             summary=summary,
-             tags=tags,
+             summary=final_summary,
+             tags=merged_tags,
          )
 
-         # Spawn background processor if lazy
-         if lazy:
+         # If content changed and we archived a version, also store versioned embedding
+         # Skip if content hash is same (only tags/summary changed)
+         if old_doc is not None and content_changed:
+             # Get the version number that was just archived
+             version_count = self._document_store.version_count(coll, id)
+             if version_count > 0:
+                 # Re-embed the old content for the archived version
+                 old_embedding = self._get_embedding_provider().embed(old_doc.summary)
+                 self._store.upsert_version(
+                     collection=coll,
+                     id=id,
+                     version=version_count,
+                     embedding=old_embedding,
+                     summary=old_doc.summary,
+                     tags=old_doc.tags,
+                 )
+
+         # Spawn background processor if lazy and content was queued (only if content changed)
+         if lazy and summary is None and len(content) > max_len and not content_unchanged:
              self._spawn_processor()
 
          # Return the stored item
-         result = self._store.get(coll, id)
-         return result.to_item()
+         doc_record = self._document_store.get(coll, id)
+         return Item(
+             id=doc_record.id,
+             summary=doc_record.summary,
+             tags=doc_record.tags,
+         )
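And the corresponding remember() flow; the smart-summary branch means short notes are stored verbatim (ids and contents illustrative):

```python
from keep.api import Keeper

k = Keeper()

# Short content (<= max_summary_length) is stored verbatim as its own summary
note = k.remember("Prefer ISO 8601 durations for --since", tags={"topic": "cli"})

# With a fixed id, re-sending identical content skips re-summarization
# (the content hash matches) while new tags still merge in
k.remember("standup notes ...", id="mem:standup", tags={"day": "mon"})
k.remember("standup notes ...", id="mem:standup", tags={"day": "tue"})

# Long content plus lazy=True stores a truncated placeholder and queues the
# real summary for the background processor:
# k.remember(very_long_text, lazy=True)
```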
 
      # -------------------------------------------------------------------------
      # Query Operations
@@ -361,27 +820,40 @@ class Keeper:
          query: str,
          *,
          limit: int = 10,
+         since: Optional[str] = None,
          collection: Optional[str] = None
      ) -> list[Item]:
          """
          Find items using semantic similarity search.
-
+
          Scores are adjusted by recency decay (ACT-R model) - older items
          have reduced effective relevance unless recently accessed.
+
+         Args:
+             query: Search query text
+             limit: Maximum results to return
+             since: Only include items updated since (ISO duration like P3D, or date)
+             collection: Target collection
          """
          coll = self._resolve_collection(collection)
-
+
          # Embed query
-         embedding = self._embedding_provider.embed(query)
-
-         # Search (fetch extra to account for re-ranking)
+         embedding = self._get_embedding_provider().embed(query)
+
+         # Search (fetch extra to account for re-ranking and date filtering)
          fetch_limit = limit * 2 if self._decay_half_life_days > 0 else limit
+         if since is not None:
+             fetch_limit = max(fetch_limit, limit * 3)  # Fetch more when filtering
          results = self._store.query_embedding(coll, embedding, limit=fetch_limit)
-
+
          # Convert to Items and apply decay
          items = [r.to_item() for r in results]
          items = self._apply_recency_decay(items)
-
+
+         # Apply date filter if specified
+         if since is not None:
+             items = _filter_by_date(items, since)
+
          return items[:limit]
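The `since` parameter accepts the same forms as `_parse_since` (query strings illustrative):

```python
from keep.api import Keeper

k = Keeper()

k.query("embedding cache", limit=5)             # plain semantic search
k.query("embedding cache", since="P3D")         # only items from the last 3 days
k.query("embedding cache", since="2026-01-15")  # only items since a date

# since is a post-filter over the _updated_date tag, so the store over-fetches
# (limit * 3) before trimming back down to `limit`.
```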
 
      def find_similar(
@@ -389,32 +861,46 @@ class Keeper:
          id: str,
          *,
          limit: int = 10,
+         since: Optional[str] = None,
          include_self: bool = False,
          collection: Optional[str] = None
      ) -> list[Item]:
          """
          Find items similar to an existing item.
+
+         Args:
+             id: ID of item to find similar items for
+             limit: Maximum results to return
+             since: Only include items updated since (ISO duration like P3D, or date)
+             include_self: Include the queried item in results
+             collection: Target collection
          """
          coll = self._resolve_collection(collection)
-
+
          # Get the item to find its embedding
          item = self._store.get(coll, id)
          if item is None:
              raise KeyError(f"Item not found: {id}")
-
-         # Search using the summary's embedding
-         embedding = self._embedding_provider.embed(item.summary)
+
+         # Search using the summary's embedding (fetch extra when filtering)
+         embedding = self._get_embedding_provider().embed(item.summary)
          actual_limit = limit + 1 if not include_self else limit
+         if since is not None:
+             actual_limit = max(actual_limit, limit * 3)
          results = self._store.query_embedding(coll, embedding, limit=actual_limit)
-
+
          # Filter self if needed
          if not include_self:
              results = [r for r in results if r.id != id]
-
+
          # Convert to Items and apply decay
          items = [r.to_item() for r in results]
          items = self._apply_recency_decay(items)
-
+
+         # Apply date filter if specified
+         if since is not None:
+             items = _filter_by_date(items, since)
+
          return items[:limit]
 
      def query_fulltext(
@@ -422,14 +908,30 @@ class Keeper:
          query: str,
          *,
          limit: int = 10,
+         since: Optional[str] = None,
          collection: Optional[str] = None
      ) -> list[Item]:
          """
          Search item summaries using full-text search.
+
+         Args:
+             query: Text to search for in summaries
+             limit: Maximum results to return
+             since: Only include items updated since (ISO duration like P3D, or date)
+             collection: Target collection
          """
          coll = self._resolve_collection(collection)
-         results = self._store.query_fulltext(coll, query, limit=limit)
-         return [r.to_item() for r in results]
+
+         # Fetch extra when filtering by date
+         fetch_limit = limit * 3 if since is not None else limit
+         results = self._store.query_fulltext(coll, query, limit=fetch_limit)
+         items = [r.to_item() for r in results]
+
+         # Apply date filter if specified
+         if since is not None:
+             items = _filter_by_date(items, since)
+
+         return items[:limit]
 
      def query_tag(
          self,
@@ -437,6 +939,7 @@ class Keeper:
          value: Optional[str] = None,
          *,
          limit: int = 100,
+         since: Optional[str] = None,
          collection: Optional[str] = None,
          **tags: str
      ) -> list[Item]:
@@ -444,21 +947,39 @@ class Keeper:
          Find items by tag(s).
 
          Usage:
-             # Simple: single key-value pair
+             # Key only: find all docs with this tag key (any value)
+             query_tag("project")
+
+             # Key with value: find docs with specific tag value
              query_tag("project", "myapp")
-             query_tag("tradition", "buddhist")
 
-             # Advanced: multiple tags via kwargs
+             # Multiple tags via kwargs
              query_tag(tradition="buddhist", source="mn22")
+
+         Args:
+             key: Tag key to search for
+             value: Tag value (optional, any value if not provided)
+             limit: Maximum results to return
+             since: Only include items updated since (ISO duration like P3D, or date)
+             collection: Target collection
+             **tags: Additional tag filters as keyword arguments
          """
          coll = self._resolve_collection(collection)
 
+         # Key-only query: find docs that have this tag key (any value)
+         # Uses DocumentStore which supports efficient SQL date filtering
+         if key is not None and value is None and not tags:
+             # Convert since to cutoff date for SQL query
+             since_date = _parse_since(since) if since else None
+             docs = self._document_store.query_by_tag_key(
+                 coll, key, limit=limit, since_date=since_date
+             )
+             return [Item(id=d.id, summary=d.summary, tags=d.tags) for d in docs]
+
          # Build tag filter from positional or keyword args
          tag_filter = {}
 
-         if key is not None:
-             if value is None:
-                 raise ValueError(f"Value required when querying by key '{key}'")
+         if key is not None and value is not None:
              tag_filter[key] = value
 
          if tags:
@@ -467,11 +988,50 @@ class Keeper:
          if not tag_filter:
              raise ValueError("At least one tag must be specified")
 
-         # Build where clause
-         where = {k: v for k, v in tag_filter.items()}
+         # Build where clause for tag filters only
+         # (ChromaDB $gte doesn't support string dates, so date filtering is done post-query)
+         where_conditions = [{k: v} for k, v in tag_filter.items()]
+
+         # Use $and if multiple conditions, otherwise single condition
+         if len(where_conditions) == 1:
+             where = where_conditions[0]
+         else:
+             where = {"$and": where_conditions}
+
+         # Fetch extra when filtering by date
+         fetch_limit = limit * 3 if since is not None else limit
+         results = self._store.query_metadata(coll, where, limit=fetch_limit)
+         items = [r.to_item() for r in results]
+
+         # Apply date filter if specified (post-filter)
+         if since is not None:
+             items = _filter_by_date(items, since)
 
-         results = self._store.query_metadata(coll, where, limit=limit)
-         return [r.to_item() for r in results]
+         return items[:limit]
+
+     def list_tags(
+         self,
+         key: Optional[str] = None,
+         *,
+         collection: Optional[str] = None,
+     ) -> list[str]:
+         """
+         List distinct tag keys or values.
+
+         Args:
+             key: If provided, list distinct values for this key.
+                 If None, list distinct tag keys.
+             collection: Target collection
+
+         Returns:
+             Sorted list of distinct keys or values
+         """
+         coll = self._resolve_collection(collection)
+
+         if key is None:
+             return self._document_store.list_distinct_tag_keys(coll)
+         else:
+             return self._document_store.list_distinct_tag_values(coll, key)
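Tag queries now support key-only lookups and discovery via list_tags (keys and values illustrative):

```python
from keep.api import Keeper

k = Keeper()

k.query_tag("project")                    # any item carrying a 'project' tag
k.query_tag("project", "keep")            # exact key/value match
k.query_tag(project="keep", topic="cli")  # multiple filters -> ChromaDB $and
k.query_tag("project", since="P1W")       # key-only plus date filter (SQL path)

k.list_tags()                             # distinct tag keys in the collection
k.list_tags("project")                    # distinct values for one key
```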
 
      # -------------------------------------------------------------------------
      # Direct Access
@@ -480,29 +1040,273 @@ class Keeper:
      def get(self, id: str, *, collection: Optional[str] = None) -> Optional[Item]:
          """
          Retrieve a specific item by ID.
+
+         Reads from document store (canonical), falls back to ChromaDB for legacy data.
          """
          coll = self._resolve_collection(collection)
+
+         # Try document store first (canonical)
+         doc_record = self._document_store.get(coll, id)
+         if doc_record:
+             return Item(
+                 id=doc_record.id,
+                 summary=doc_record.summary,
+                 tags=doc_record.tags,
+             )
+
+         # Fall back to ChromaDB for legacy data
          result = self._store.get(coll, id)
          if result is None:
              return None
          return result.to_item()
-
+
+     def get_version(
+         self,
+         id: str,
+         offset: int = 0,
+         *,
+         collection: Optional[str] = None,
+     ) -> Optional[Item]:
+         """
+         Get a specific version of a document by offset.
+
+         Offset semantics:
+         - 0 = current version
+         - 1 = previous version
+         - 2 = two versions ago
+         - etc.
+
+         Args:
+             id: Document identifier
+             offset: Version offset (0=current, 1=previous, etc.)
+             collection: Target collection
+
+         Returns:
+             Item if found, None if version doesn't exist
+         """
+         coll = self._resolve_collection(collection)
+
+         if offset == 0:
+             # Current version
+             return self.get(id, collection=collection)
+
+         # Get archived version
+         version_info = self._document_store.get_version(coll, id, offset)
+         if version_info is None:
+             return None
+
+         return Item(
+             id=id,
+             summary=version_info.summary,
+             tags=version_info.tags,
+         )
+
+     def list_versions(
+         self,
+         id: str,
+         limit: int = 10,
+         *,
+         collection: Optional[str] = None,
+     ) -> list[VersionInfo]:
+         """
+         List version history for a document.
+
+         Returns versions in reverse chronological order (newest archived first).
+         Does not include the current version.
+
+         Args:
+             id: Document identifier
+             limit: Maximum versions to return
+             collection: Target collection
+
+         Returns:
+             List of VersionInfo, newest archived first
+         """
+         coll = self._resolve_collection(collection)
+         return self._document_store.list_versions(coll, id, limit)
+
+     def get_version_nav(
+         self,
+         id: str,
+         current_version: Optional[int] = None,
+         limit: int = 3,
+         *,
+         collection: Optional[str] = None,
+     ) -> dict[str, list[VersionInfo]]:
+         """
+         Get version navigation info (prev/next) for display.
+
+         Args:
+             id: Document identifier
+             current_version: The version being viewed (None = current/live version)
+             limit: Max previous versions to return when viewing current
+             collection: Target collection
+
+         Returns:
+             Dict with 'prev' and optionally 'next' lists of VersionInfo.
+         """
+         coll = self._resolve_collection(collection)
+         return self._document_store.get_version_nav(coll, id, current_version, limit)
+
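Version access in practice; offsets count backwards from the live document (ids illustrative):

```python
from keep.api import Keeper

k = Keeper()
k.remember("v1 of my note", id="mem:note")
k.remember("v2 of my note", id="mem:note")  # archives v1

current = k.get_version("mem:note")           # offset 0 -> same as get()
previous = k.get_version("mem:note", 1)       # the archived v1
history = k.list_versions("mem:note")         # VersionInfo list, newest archived first
nav = k.get_version_nav("mem:note", limit=3)  # {'prev': [...]} for display
```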
      def exists(self, id: str, *, collection: Optional[str] = None) -> bool:
          """
          Check if an item exists in the store.
          """
          coll = self._resolve_collection(collection)
-         return self._store.exists(coll, id)
+         # Check document store first, then ChromaDB
+         return self._document_store.exists(coll, id) or self._store.exists(coll, id)
 
-     def delete(self, id: str, *, collection: Optional[str] = None) -> bool:
+     def delete(
+         self,
+         id: str,
+         *,
+         collection: Optional[str] = None,
+         delete_versions: bool = True,
+     ) -> bool:
          """
-         Delete an item from the store.
-
-         Returns True if item existed and was deleted.
+         Delete an item from both stores.
+
+         Args:
+             id: Document identifier
+             collection: Target collection
+             delete_versions: If True, also delete version history
+
+         Returns:
+             True if item existed and was deleted.
          """
          coll = self._resolve_collection(collection)
-         return self._store.delete(coll, id)
-
+         # Delete from both stores (including versions)
+         doc_deleted = self._document_store.delete(coll, id, delete_versions=delete_versions)
+         chroma_deleted = self._store.delete(coll, id, delete_versions=delete_versions)
+         return doc_deleted or chroma_deleted
+
+     # -------------------------------------------------------------------------
+     # Current Working Context (Now)
+     # -------------------------------------------------------------------------
+
+     def get_now(self) -> Item:
+         """
+         Get the current working context.
+
+         A singleton document representing what you're currently working on.
+         If it doesn't exist, creates one with default content and tags from
+         docs/system/now.md.
+
+         Returns:
+             The current context Item (never None - auto-creates if missing)
+         """
+         item = self.get(NOWDOC_ID)
+         if item is None:
+             # First-time initialization with default content and tags
+             try:
+                 default_content, default_tags = _load_frontmatter(SYSTEM_DOC_DIR / "now.md")
+             except FileNotFoundError:
+                 # Fallback if system file is missing
+                 default_content = "# Now\n\nYour working context."
+                 default_tags = {}
+             item = self.set_now(default_content, tags=default_tags)
+         return item
+
+     def set_now(
+         self,
+         content: str,
+         *,
+         tags: Optional[dict[str, str]] = None,
+     ) -> Item:
+         """
+         Set the current working context.
+
+         Updates the singleton context with new content. Uses remember()
+         internally with the fixed NOWDOC_ID.
+
+         Args:
+             content: New content for the current context
+             tags: Optional additional tags to apply
+
+         Returns:
+             The updated context Item
+         """
+         return self.remember(content, id=NOWDOC_ID, tags=tags)
+
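The working-context singleton rides on remember(), so it versions like any other document (contents illustrative):

```python
from keep.api import Keeper

k = Keeper()

now = k.get_now()  # auto-creates from docs/system/now.md on first call
k.set_now("Refactoring query_tag", tags={"focus": "search"})
k.get_version("_now:default", 1)  # previous working context (NOWDOC_ID)
```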
+     def list_system_documents(
+         self,
+         *,
+         collection: Optional[str] = None,
+     ) -> list[Item]:
+         """
+         List all system documents.
+
+         System documents are identified by the `category: system` tag.
+         These are preloaded on init and provide foundational content.
+
+         Args:
+             collection: Target collection (default: default collection)
+
+         Returns:
+             List of system document Items
+         """
+         return self.query_tag("category", "system", collection=collection)
+
+     def tag(
+         self,
+         id: str,
+         tags: Optional[dict[str, str]] = None,
+         *,
+         collection: Optional[str] = None,
+     ) -> Optional[Item]:
+         """
+         Update tags on an existing document without re-processing.
+
+         Does NOT re-fetch, re-embed, or re-summarize. Only updates tags.
+
+         Tag behavior:
+         - Provided tags are merged with existing user tags
+         - Empty string value ("") deletes that tag
+         - System tags (_prefixed) cannot be modified via this method
+
+         Args:
+             id: Document identifier
+             tags: Tags to add/update/delete (empty string = delete)
+             collection: Target collection
+
+         Returns:
+             Updated Item if found, None if document doesn't exist
+         """
+         coll = self._resolve_collection(collection)
+
+         # Get existing item (prefer document store, fall back to ChromaDB)
+         existing = self.get(id, collection=collection)
+         if existing is None:
+             return None
+
+         # Start with existing tags, separate system from user
+         current_tags = dict(existing.tags)
+         system_tags = {k: v for k, v in current_tags.items()
+                        if k.startswith(SYSTEM_TAG_PREFIX)}
+         user_tags = {k: v for k, v in current_tags.items()
+                      if not k.startswith(SYSTEM_TAG_PREFIX)}
+
+         # Apply tag changes (filter out system tags from input)
+         if tags:
+             for key, value in tags.items():
+                 if key.startswith(SYSTEM_TAG_PREFIX):
+                     continue  # Cannot modify system tags
+                 if value == "":
+                     # Empty string = delete
+                     user_tags.pop(key, None)
+                 else:
+                     user_tags[key] = value
+
+         # Merge back: user tags + system tags
+         final_tags = {**user_tags, **system_tags}
+
+         # Dual-write to both stores
+         self._document_store.update_tags(coll, id, final_tags)
+         self._store.update_tags(coll, id, final_tags)
+
+         # Return updated item
+         return self.get(id, collection=collection)
+
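Tag edits without re-embedding, including the empty-string delete convention (ids and tags illustrative):

```python
from keep.api import Keeper

k = Keeper()

k.tag("mem:note", {"status": "done"})     # add or update a user tag
k.tag("mem:note", {"status": ""})         # empty string deletes the tag
k.tag("mem:note", {"_source": "hacked"})  # silently ignored: system tags are protected

assert k.tag("mem:missing", {"x": "y"}) is None  # unknown id -> None
```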
      # -------------------------------------------------------------------------
      # Collection Management
      # -------------------------------------------------------------------------
@@ -511,21 +1315,61 @@ class Keeper:
          """
          List all collections in the store.
          """
-         return self._store.list_collections()
+         # Merge collections from both stores
+         doc_collections = set(self._document_store.list_collections())
+         chroma_collections = set(self._store.list_collections())
+         return sorted(doc_collections | chroma_collections)
 
      def count(self, *, collection: Optional[str] = None) -> int:
          """
          Count items in a collection.
+
+         Returns count from document store if available, else ChromaDB.
          """
          coll = self._resolve_collection(collection)
+         doc_count = self._document_store.count(coll)
+         if doc_count > 0:
+             return doc_count
          return self._store.count(coll)
-
+
+     def list_recent(
+         self,
+         limit: int = 10,
+         *,
+         collection: Optional[str] = None,
+     ) -> list[Item]:
+         """
+         List recent items ordered by update time.
+
+         Args:
+             limit: Maximum number to return (default 10)
+             collection: Collection to query (uses default if not specified)
+
+         Returns:
+             List of Items, most recently updated first
+         """
+         coll = self._resolve_collection(collection)
+         records = self._document_store.list_recent(coll, limit)
+
+         return [
+             Item(
+                 id=rec.id,
+                 summary=rec.summary,
+                 tags=rec.tags,
+                 score=None,
+             )
+             for rec in records
+         ]
+
      def embedding_cache_stats(self) -> dict:
          """
          Get embedding cache statistics.
 
          Returns dict with: entries, hits, misses, hit_rate, cache_path
+         Returns {"loaded": False} if embedding provider hasn't been loaded yet.
          """
+         if self._embedding_provider is None:
+             return {"loaded": False}
          if isinstance(self._embedding_provider, CachingEmbeddingProvider):
              return self._embedding_provider.stats()
          return {"enabled": False}
@@ -563,9 +1407,10 @@ class Keeper:
 
          try:
              # Generate real summary
-             summary = self._summarization_provider.summarize(item.content)
+             summary = self._get_summarization_provider().summarize(item.content)
 
-             # Update the stored item's summary
+             # Update summary in both stores
+             self._document_store.update_summary(item.collection, item.id, summary)
              self._store.update_summary(item.collection, item.id, summary)
 
              # Remove from queue
@@ -652,21 +1497,83 @@ class Keeper:
              subprocess.Popen(cmd, **kwargs)
              return True
 
-         except Exception:
-             # Spawn failed - not critical, queue will be processed later
+         except Exception as e:
+             # Spawn failed - log for debugging, queue will be processed later
+             logger.warning("Failed to spawn background processor: %s", e)
              return False
 
+     def reconcile(
+         self,
+         collection: Optional[str] = None,
+         fix: bool = False,
+     ) -> dict:
+         """
+         Check and optionally fix consistency between DocumentStore and ChromaDB.
+
+         Detects:
+         - Documents in DocumentStore missing from ChromaDB (not searchable)
+         - Documents in ChromaDB missing from DocumentStore (orphaned embeddings)
+
+         Args:
+             collection: Collection to check (None = default collection)
+             fix: If True, re-index documents missing from ChromaDB
+
+         Returns:
+             Dict with 'missing_from_chroma', 'orphaned_in_chroma', 'fixed' counts
+         """
+         coll = self._resolve_collection(collection)
+
+         # Get IDs from both stores
+         doc_ids = set(self._document_store.list_ids(coll))
+         chroma_ids = set(self._store.list_ids(coll))
+
+         missing_from_chroma = doc_ids - chroma_ids
+         orphaned_in_chroma = chroma_ids - doc_ids
+
+         fixed = 0
+         if fix and missing_from_chroma:
+             for doc_id in missing_from_chroma:
+                 try:
+                     # Re-fetch and re-index
+                     doc_record = self._document_store.get(coll, doc_id)
+                     if doc_record:
+                         # Fetch original content
+                         doc = self._document_provider.fetch(doc_id)
+                         embedding = self._get_embedding_provider().embed(doc.content)
+
+                         # Write to ChromaDB
+                         self._store.upsert(
+                             collection=coll,
+                             id=doc_id,
+                             embedding=embedding,
+                             summary=doc_record.summary,
+                             tags=doc_record.tags,
+                         )
+                         fixed += 1
+                         logger.info("Reconciled: %s", doc_id)
+                 except Exception as e:
+                     logger.warning("Failed to reconcile %s: %s", doc_id, e)
+
+         return {
+             "missing_from_chroma": len(missing_from_chroma),
+             "orphaned_in_chroma": len(orphaned_in_chroma),
+             "fixed": fixed,
+             "missing_ids": list(missing_from_chroma) if missing_from_chroma else [],
+             "orphaned_ids": list(orphaned_in_chroma) if orphaned_in_chroma else [],
+         }
+
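The dual-write design makes drift between the two stores possible, so reconcile() reports and optionally repairs it (a usage sketch):

```python
from keep.api import Keeper

k = Keeper()

report = k.reconcile()  # dry run: counts plus the offending ids
if report["missing_from_chroma"]:
    k.reconcile(fix=True)  # re-fetch and re-embed the missing documents

# Orphaned ChromaDB entries are reported but not auto-deleted; remove them
# explicitly with k.delete(id) if they are truly stale.
```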
      def close(self) -> None:
          """
          Close resources (embedding cache connection, pending queue, etc.).
 
          Good practice to call when done, though Python's GC will clean up eventually.
          """
-         # Close embedding cache if it exists
-         if hasattr(self._embedding_provider, '_cache'):
-             cache = self._embedding_provider._cache
-             if hasattr(cache, 'close'):
-                 cache.close()
+         # Close embedding cache if it was loaded
+         if self._embedding_provider is not None:
+             if hasattr(self._embedding_provider, '_cache'):
+                 cache = self._embedding_provider._cache
+                 if hasattr(cache, 'close'):
+                     cache.close()
 
          # Close pending summary queue
          if hasattr(self, '_pending_queue'):