keep-skill 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keep/api.py CHANGED
@@ -8,17 +8,104 @@ This is the minimal working implementation focused on:
8
8
  - get(): retrieve by ID
9
9
  """
10
10
 
11
+ import hashlib
12
+ import logging
11
13
  import re
12
- from datetime import datetime, timezone
14
+ from datetime import datetime, timezone, timedelta
13
15
  from pathlib import Path
14
16
  from typing import Any, Optional
15
17
 
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def _parse_since(since: str) -> str:
22
+ """
23
+ Parse a 'since' string and return a YYYY-MM-DD cutoff date.
24
+
25
+ Accepts:
26
+ - ISO 8601 duration: P3D (3 days), P1W (1 week), PT1H (1 hour), P1DT12H, etc.
27
+ - ISO date: 2026-01-15
28
+ - Date with slashes: 2026/01/15
29
+
30
+ Returns:
31
+ YYYY-MM-DD string for the cutoff date
32
+ """
33
+ since = since.strip()
34
+
35
+ # ISO 8601 duration: P[n]Y[n]M[n]W[n]DT[n]H[n]M[n]S
36
+ if since.upper().startswith("P"):
37
+ duration_str = since.upper()
38
+
39
+ # Parse duration components
40
+ years = months = weeks = days = hours = minutes = seconds = 0
41
+
42
+ # Split on T to separate date and time parts
43
+ if "T" in duration_str:
44
+ date_part, time_part = duration_str.split("T", 1)
45
+ else:
46
+ date_part = duration_str
47
+ time_part = ""
48
+
49
+ # Parse date part (P[n]Y[n]M[n]W[n]D)
50
+ date_part = date_part[1:] # Remove leading P
51
+ for match in re.finditer(r"(\d+)([YMWD])", date_part):
52
+ value, unit = int(match.group(1)), match.group(2)
53
+ if unit == "Y":
54
+ years = value
55
+ elif unit == "M":
56
+ months = value
57
+ elif unit == "W":
58
+ weeks = value
59
+ elif unit == "D":
60
+ days = value
61
+
62
+ # Parse time part ([n]H[n]M[n]S)
63
+ for match in re.finditer(r"(\d+)([HMS])", time_part):
64
+ value, unit = int(match.group(1)), match.group(2)
65
+ if unit == "H":
66
+ hours = value
67
+ elif unit == "M":
68
+ minutes = value
69
+ elif unit == "S":
70
+ seconds = value
71
+
72
+ # Convert to timedelta (approximate months/years)
73
+ total_days = years * 365 + months * 30 + weeks * 7 + days
74
+ delta = timedelta(days=total_days, hours=hours, minutes=minutes, seconds=seconds)
75
+ cutoff = datetime.now(timezone.utc) - delta
76
+ return cutoff.strftime("%Y-%m-%d")
77
+
78
+ # Try parsing as date
79
+ # ISO format: 2026-01-15 or 2026-01-15T...
80
+ # Slash format: 2026/01/15
81
+ date_str = since.replace("/", "-").split("T")[0]
82
+
83
+ try:
84
+ parsed = datetime.strptime(date_str, "%Y-%m-%d")
85
+ return parsed.strftime("%Y-%m-%d")
86
+ except ValueError:
87
+ pass
88
+
89
+ raise ValueError(
90
+ f"Invalid 'since' format: {since}. "
91
+ "Use ISO duration (P3D, PT1H, P1W) or date (2026-01-15)"
92
+ )
93
+
94
+
95
def _filter_by_date(items: list, since: str) -> list:
    """
    Keep only the items updated on or after the cutoff derived from `since`.

    Args:
        items: Objects exposing a `tags` mapping.
        since: Date or ISO duration string, resolved via `_parse_since`.

    Returns:
        New list with the items whose `_updated_date` tag is >= the cutoff.
    """
    threshold = _parse_since(since)
    recent = []
    for entry in items:
        # Plain string comparison; a missing tag falls back to "0000-00-00".
        updated_on = entry.tags.get("_updated_date", "0000-00-00")
        if updated_on >= threshold:
            recent.append(entry)
    return recent
102
+
16
103
  import os
17
104
  import subprocess
18
105
  import sys
19
106
 
20
- from .config import load_or_create_config, StoreConfig
21
- from .paths import get_default_store_path
107
+ from .config import load_or_create_config, save_config, StoreConfig, EmbeddingIdentity
108
+ from .paths import get_config_dir, get_default_store_path
22
109
  from .pending_summaries import PendingSummaryQueue
23
110
  from .providers import get_registry
24
111
  from .providers.base import (
@@ -28,7 +115,7 @@ from .providers.base import (
28
115
  )
29
116
  from .providers.embedding_cache import CachingEmbeddingProvider
30
117
  from .store import ChromaStore
31
- from .types import Item, filter_non_system_tags
118
+ from .types import Item, filter_non_system_tags, SYSTEM_TAG_PREFIX
32
119
 
33
120
 
34
121
  # Default max length for truncated placeholder summaries
@@ -41,6 +128,69 @@ MAX_SUMMARY_ATTEMPTS = 5
41
128
  # Collection name validation: lowercase ASCII and underscores only
42
129
  COLLECTION_NAME_PATTERN = re.compile(r"^[a-z][a-z0-9_]*$")
43
130
 
131
+ # Environment variable prefix for auto-applied tags
132
+ ENV_TAG_PREFIX = "KEEP_TAG_"
133
+
134
+ # Fixed ID for the current working context (singleton)
135
+ NOWDOC_ID = "_now:default"
136
+
137
+ # Path to system documents
138
+ SYSTEM_DOC_DIR = Path(__file__).parent.parent / "docs" / "system"
139
+
140
+
141
+ def _load_frontmatter(path: Path) -> tuple[str, dict[str, str]]:
142
+ """
143
+ Load content and tags from a file with optional YAML frontmatter.
144
+
145
+ Args:
146
+ path: Path to the file
147
+
148
+ Returns:
149
+ (content, tags) tuple. Tags empty if no frontmatter.
150
+
151
+ Raises:
152
+ FileNotFoundError: If the file doesn't exist
153
+ """
154
+ text = path.read_text()
155
+
156
+ # Parse YAML frontmatter if present
157
+ if text.startswith("---"):
158
+ parts = text.split("---", 2)
159
+ if len(parts) >= 3:
160
+ import yaml
161
+ frontmatter = yaml.safe_load(parts[1])
162
+ content = parts[2].lstrip("\n")
163
+ if frontmatter:
164
+ tags = frontmatter.get("tags", {})
165
+ # Ensure all tag values are strings
166
+ tags = {k: str(v) for k, v in tags.items()}
167
+ return content, tags
168
+ return content, {}
169
+
170
+ return text, {}
171
+
172
+
173
def _get_env_tags() -> dict[str, str]:
    """
    Build a tag dict from environment variables named KEEP_TAG_*.

    KEEP_TAG_PROJECT=foo -> {"project": "foo"}
    KEEP_TAG_MyTag=bar -> {"mytag": "bar"}

    The prefix is stripped and the remainder lowercased to form the tag
    key. Variables with an empty value are ignored.
    """
    prefix_len = len(ENV_TAG_PREFIX)
    return {
        name[prefix_len:].lower(): val
        for name, val in os.environ.items()
        if val and name.startswith(ENV_TAG_PREFIX)
    }
188
+
189
+
190
+ def _content_hash(content: str) -> str:
191
+ """SHA256 hash of content for change detection."""
192
+ return hashlib.sha256(content.encode("utf-8")).hexdigest()
193
+
44
194
 
45
195
  class Keeper:
46
196
  """
@@ -60,20 +210,15 @@ class Keeper:
60
210
  ) -> None:
61
211
  """
62
212
  Initialize or open an existing associative memory store.
63
-
213
+
64
214
  Args:
65
215
  store_path: Path to store directory. Uses default if not specified.
216
+ Overrides any store.path setting in config.
66
217
  collection: Default collection name.
67
218
  decay_half_life_days: Memory decay half-life in days (ACT-R model).
68
219
  After this many days, an item's effective relevance is halved.
69
220
  Set to 0 or negative to disable decay.
70
221
  """
71
- # Resolve store path
72
- if store_path is None:
73
- self._store_path = get_default_store_path()
74
- else:
75
- self._store_path = Path(store_path).resolve()
76
-
77
222
  # Validate collection name
78
223
  if not COLLECTION_NAME_PATTERN.match(collection):
79
224
  raise ValueError(
@@ -82,43 +227,161 @@ class Keeper:
82
227
  )
83
228
  self._default_collection = collection
84
229
  self._decay_half_life_days = decay_half_life_days
85
-
230
+
231
+ # Resolve config and store paths
232
+ # If store_path is explicitly provided, use it as both config and store location
233
+ # Otherwise, discover config via tree-walk and let config determine store
234
+ if store_path is not None:
235
+ self._store_path = Path(store_path).resolve()
236
+ config_dir = self._store_path
237
+ else:
238
+ # Discover config directory (tree-walk or envvar)
239
+ config_dir = get_config_dir()
240
+
86
241
  # Load or create configuration
87
- self._config: StoreConfig = load_or_create_config(self._store_path)
88
-
89
- # Initialize providers
242
+ self._config: StoreConfig = load_or_create_config(config_dir)
243
+
244
+ # If store_path wasn't explicit, resolve from config
245
+ if store_path is None:
246
+ self._store_path = get_default_store_path(self._config)
247
+
248
+ # Initialize document provider (needed for most operations)
90
249
  registry = get_registry()
91
-
92
250
  self._document_provider: DocumentProvider = registry.create_document(
93
251
  self._config.document.name,
94
252
  self._config.document.params,
95
253
  )
96
-
97
- # Create embedding provider with caching
98
- base_embedding_provider = registry.create_embedding(
99
- self._config.embedding.name,
100
- self._config.embedding.params,
101
- )
102
- cache_path = self._store_path / "embedding_cache.db"
103
- self._embedding_provider: EmbeddingProvider = CachingEmbeddingProvider(
104
- base_embedding_provider,
105
- cache_path=cache_path,
106
- )
107
-
108
- self._summarization_provider: SummarizationProvider = registry.create_summarization(
109
- self._config.summarization.name,
110
- self._config.summarization.params,
111
- )
254
+
255
+ # Lazy-loaded providers (created on first use to avoid network access for read-only ops)
256
+ self._embedding_provider: Optional[EmbeddingProvider] = None
257
+ self._summarization_provider: Optional[SummarizationProvider] = None
112
258
 
113
259
  # Initialize pending summary queue
114
260
  queue_path = self._store_path / "pending_summaries.db"
115
261
  self._pending_queue = PendingSummaryQueue(queue_path)
116
262
 
117
- # Initialize store
263
+ # Initialize document store (canonical records)
264
+ from .document_store import DocumentStore
265
+ doc_store_path = self._store_path / "documents.db"
266
+ self._document_store = DocumentStore(doc_store_path)
267
+
268
+ # Initialize ChromaDB store (embedding index)
269
+ # Use dimension from stored identity if available (allows offline read-only access)
270
+ embedding_dim = None
271
+ if self._config.embedding_identity:
272
+ embedding_dim = self._config.embedding_identity.dimension
118
273
  self._store = ChromaStore(
119
274
  self._store_path,
120
- embedding_dimension=self._embedding_provider.dimension,
275
+ embedding_dimension=embedding_dim,
276
+ )
277
+
278
+ # Preload system documents (only if not already present)
279
+ self._ensure_system_documents()
280
+
281
def _ensure_system_documents(self) -> None:
    """
    Load bundled system documents that the store does not contain yet.

    Walks the .md files directly inside SYSTEM_DOC_DIR. Each file is
    identified by a file:// URI built from its resolved path. When
    `self.exists(uri)` is false, the file's content and frontmatter tags
    are read, a `category: system` tag is added, and the document is
    stored through `remember()`.

    Documents whose ID already exists are left untouched. A file that
    cannot be found while being read is skipped silently.

    NOTE(review): summary handling is whatever `remember()` does for the
    given content - confirm long system docs are not auto-summarized.
    """
    for doc_path in SYSTEM_DOC_DIR.glob("*.md"):
        try:
            doc_uri = f"file://{doc_path.resolve()}"
            if self.exists(doc_uri):
                # Already indexed - keep whatever is stored.
                continue
            body, doc_tags = _load_frontmatter(doc_path)
            doc_tags["category"] = "system"
            self.remember(body, id=doc_uri, tags=doc_tags)
        except FileNotFoundError:
            # System file missing - skip silently
            continue
303
+
304
def _get_embedding_provider(self) -> EmbeddingProvider:
    """
    Get embedding provider, creating it lazily on first use.

    This allows read-only operations to work offline without loading
    the embedding model (which may try to reach HuggingFace).

    On first use the configured provider is created, wrapped in a
    CachingEmbeddingProvider backed by `embedding_cache.db` in the store
    directory, and checked with `_validate_embedding_identity`. The
    provider is cached on the instance only after that check succeeds, so
    a failed validation is raised again on every later call instead of
    handing out the mismatched provider.

    Returns:
        The cached (or newly created) embedding provider.

    Raises:
        ValueError: Propagated from `_validate_embedding_identity` when
            the provider does not match the stored identity.
    """
    if self._embedding_provider is None:
        registry = get_registry()
        base_provider = registry.create_embedding(
            self._config.embedding.name,
            self._config.embedding.params,
        )
        candidate = CachingEmbeddingProvider(
            base_provider,
            cache_path=self._store_path / "embedding_cache.db",
        )
        # Validate or record embedding identity BEFORE caching the
        # provider; if this raises, nothing is stored on the instance.
        self._validate_embedding_identity(candidate)
        # Update store's embedding dimension if it wasn't known at init
        if self._store._embedding_dimension is None:
            self._store._embedding_dimension = candidate.dimension
        self._embedding_provider = candidate
    return self._embedding_provider
328
+
329
def _get_summarization_provider(self) -> SummarizationProvider:
    """
    Return the summarization provider, building it on the first call.

    The provider is created from the registry using the name and params
    in `self._config.summarization`, then cached on the instance.
    """
    if self._summarization_provider is not None:
        return self._summarization_provider

    provider = get_registry().create_summarization(
        self._config.summarization.name,
        self._config.summarization.params,
    )
    self._summarization_provider = provider
    return provider
340
+
341
def _validate_embedding_identity(self, provider: EmbeddingProvider) -> None:
    """
    Check the provider against the stored embedding identity, or record it.

    The identity is the configured embedding provider name, the
    provider's `model_name` attribute ("unknown" when absent), and its
    dimension. If the config holds no identity yet, the current one is
    stored and the config is saved. Otherwise all three fields must equal
    the stored ones.

    Args:
        provider: Embedding provider whose identity is being checked.

    Raises:
        ValueError: If provider name, model, or dimension differs from
            the stored identity.
    """
    current = EmbeddingIdentity(
        provider=self._config.embedding.name,
        model=getattr(provider, "model_name", "unknown"),
        dimension=provider.dimension,
    )
    stored = self._config.embedding_identity

    if stored is None:
        # First use: persist the identity so later runs can be checked.
        self._config.embedding_identity = current
        save_config(self._config)
        return

    identical = (
        stored.provider == current.provider
        and stored.model == current.model
        and stored.dimension == current.dimension
    )
    if identical:
        return

    raise ValueError(
        f"Embedding provider mismatch!\n"
        f"  Stored: {stored.provider}/{stored.model} ({stored.dimension}d)\n"
        f"  Current: {current.provider}/{current.model} ({current.dimension}d)\n"
        f"\n"
        f"Changing embedding providers invalidates existing embeddings.\n"
        f"Options:\n"
        f"  1. Use the original provider\n"
        f"  2. Delete .keep/ and re-index\n"
        f"  3. (Future) Run migration to re-embed with new provider"
    )
380
+
381
@property
def embedding_identity(self) -> EmbeddingIdentity | None:
    """Embedding identity (provider, model, dimension) held by the loaded config; may be None."""
    config = self._config
    return config.embedding_identity
122
385
 
123
386
  def _resolve_collection(self, collection: Optional[str]) -> str:
124
387
  """Resolve collection name, validating if provided."""
@@ -135,8 +398,10 @@ class Keeper:
135
398
  def update(
136
399
  self,
137
400
  id: str,
138
- source_tags: Optional[dict[str, str]] = None,
401
+ tags: Optional[dict[str, str]] = None,
139
402
  *,
403
+ summary: Optional[str] = None,
404
+ source_tags: Optional[dict[str, str]] = None, # Deprecated alias
140
405
  collection: Optional[str] = None,
141
406
  lazy: bool = False
142
407
  ) -> Item:
@@ -146,84 +411,147 @@ class Keeper:
146
411
  Fetches the document, generates embeddings and summary, then stores it.
147
412
 
148
413
  **Update behavior:**
149
- - Summary: Always replaced with newly generated summary
150
- - Tags: Merged - existing source tags are preserved, new source_tags override
414
+ - Summary: Replaced with user-provided or newly generated summary
415
+ - Tags: Merged - existing tags are preserved, new tags override
151
416
  on key collision. System tags (prefixed with _) are always managed by
152
417
  the system.
153
418
 
154
419
  Args:
155
420
  id: URI of document to fetch and index
156
- source_tags: User-provided tags to merge with existing tags
421
+ tags: User-provided tags to merge with existing tags
422
+ summary: User-provided summary (skips auto-summarization if given)
423
+ source_tags: Deprecated alias for 'tags'
157
424
  collection: Target collection (uses default if None)
158
425
  lazy: If True, use truncated placeholder summary and queue for
159
426
  background processing. Use `process_pending()` to generate
160
- real summaries later.
427
+ real summaries later. Ignored if summary is provided.
161
428
 
162
429
  Returns:
163
430
  The stored Item with merged tags and new summary
164
431
  """
432
+ # Handle deprecated source_tags parameter
433
+ if source_tags is not None:
434
+ import warnings
435
+ warnings.warn(
436
+ "source_tags is deprecated, use 'tags' instead",
437
+ DeprecationWarning,
438
+ stacklevel=2
439
+ )
440
+ if tags is None:
441
+ tags = source_tags
442
+
165
443
  coll = self._resolve_collection(collection)
166
444
 
167
- # Get existing item to preserve tags
445
+ # Get existing item to preserve tags (check document store first, fall back to ChromaDB)
168
446
  existing_tags = {}
169
- existing = self._store.get(coll, id)
170
- if existing:
171
- # Extract existing non-system tags
172
- existing_tags = filter_non_system_tags(existing.tags)
447
+ existing_doc = self._document_store.get(coll, id)
448
+ if existing_doc:
449
+ existing_tags = filter_non_system_tags(existing_doc.tags)
450
+ else:
451
+ # Fall back to ChromaDB for legacy data
452
+ existing = self._store.get(coll, id)
453
+ if existing:
454
+ existing_tags = filter_non_system_tags(existing.tags)
173
455
 
174
456
  # Fetch document
175
457
  doc = self._document_provider.fetch(id)
176
458
 
459
+ # Compute content hash for change detection
460
+ new_hash = _content_hash(doc.content)
461
+
177
462
  # Generate embedding
178
- embedding = self._embedding_provider.embed(doc.content)
463
+ embedding = self._get_embedding_provider().embed(doc.content)
179
464
 
180
- # Generate summary (or queue for later if lazy)
181
- if lazy:
182
- # Truncated placeholder
183
- if len(doc.content) > TRUNCATE_LENGTH:
184
- summary = doc.content[:TRUNCATE_LENGTH] + "..."
465
+ # Determine summary - skip if content unchanged
466
+ max_len = self._config.max_summary_length
467
+ content_unchanged = (
468
+ existing_doc is not None
469
+ and existing_doc.content_hash == new_hash
470
+ )
471
+
472
+ if content_unchanged and summary is None:
473
+ # Content unchanged - preserve existing summary
474
+ logger.debug("Content unchanged, skipping summarization for %s", id)
475
+ final_summary = existing_doc.summary
476
+ elif summary is not None:
477
+ # User-provided summary - validate length
478
+ if len(summary) > max_len:
479
+ import warnings
480
+ warnings.warn(
481
+ f"Summary exceeds max_summary_length ({len(summary)} > {max_len}), truncating",
482
+ UserWarning,
483
+ stacklevel=2
484
+ )
485
+ summary = summary[:max_len]
486
+ final_summary = summary
487
+ elif lazy:
488
+ # Truncated placeholder for lazy mode
489
+ if len(doc.content) > max_len:
490
+ final_summary = doc.content[:max_len] + "..."
185
491
  else:
186
- summary = doc.content
492
+ final_summary = doc.content
187
493
  # Queue for background processing
188
494
  self._pending_queue.enqueue(id, coll, doc.content)
189
495
  else:
190
- summary = self._summarization_provider.summarize(doc.content)
496
+ # Auto-generate summary
497
+ final_summary = self._get_summarization_provider().summarize(doc.content)
191
498
 
192
- # Build tags: existing + new (new overrides on collision)
193
- tags = {**existing_tags}
499
+ # Build tags: existing → config → env → user (later wins on collision)
500
+ merged_tags = {**existing_tags}
194
501
 
195
- # Merge in new source tags (filtered to prevent system tag override)
196
- if source_tags:
197
- tags.update(filter_non_system_tags(source_tags))
502
+ # Merge config default tags
503
+ if self._config.default_tags:
504
+ merged_tags.update(self._config.default_tags)
505
+
506
+ # Merge environment variable tags
507
+ env_tags = _get_env_tags()
508
+ merged_tags.update(env_tags)
509
+
510
+ # Merge in user-provided tags (filtered to prevent system tag override)
511
+ if tags:
512
+ merged_tags.update(filter_non_system_tags(tags))
198
513
 
199
514
  # Add system tags
200
- tags["_source"] = "uri"
515
+ merged_tags["_source"] = "uri"
201
516
  if doc.content_type:
202
- tags["_content_type"] = doc.content_type
517
+ merged_tags["_content_type"] = doc.content_type
203
518
 
204
- # Store
519
+ # Dual-write: document store (canonical) + ChromaDB (embedding index)
520
+ self._document_store.upsert(
521
+ collection=coll,
522
+ id=id,
523
+ summary=final_summary,
524
+ tags=merged_tags,
525
+ content_hash=new_hash,
526
+ )
205
527
  self._store.upsert(
206
528
  collection=coll,
207
529
  id=id,
208
530
  embedding=embedding,
209
- summary=summary,
210
- tags=tags,
531
+ summary=final_summary,
532
+ tags=merged_tags,
211
533
  )
212
534
 
213
- # Spawn background processor if lazy
214
- if lazy:
535
+ # Spawn background processor if lazy (only if summary wasn't user-provided and content changed)
536
+ if lazy and summary is None and not content_unchanged:
215
537
  self._spawn_processor()
216
538
 
217
539
  # Return the stored item
218
- result = self._store.get(coll, id)
219
- return result.to_item()
540
+ doc_record = self._document_store.get(coll, id)
541
+ return Item(
542
+ id=doc_record.id,
543
+ summary=doc_record.summary,
544
+ tags=doc_record.tags,
545
+ )
220
546
 
221
547
  def remember(
222
548
  self,
223
549
  content: str,
224
550
  *,
225
551
  id: Optional[str] = None,
226
- source_tags: Optional[dict[str, str]] = None,
552
+ summary: Optional[str] = None,
553
+ tags: Optional[dict[str, str]] = None,
554
+ source_tags: Optional[dict[str, str]] = None, # Deprecated alias
227
555
  collection: Optional[str] = None,
228
556
  lazy: bool = False
229
557
  ) -> Item:
@@ -232,24 +560,42 @@ class Keeper:
232
560
 
233
561
  Use for conversation snippets, notes, insights.
234
562
 
563
+ **Smart summary behavior:**
564
+ - If summary is provided, use it (skips auto-summarization)
565
+ - If content is short (≤ max_summary_length), use content verbatim
566
+ - Otherwise, generate summary via summarization provider
567
+
235
568
  **Update behavior (when id already exists):**
236
- - Summary: Replaced with newly generated summary from content
237
- - Tags: Merged - existing source tags preserved, new source_tags override
569
+ - Summary: Replaced with user-provided, content, or generated summary
570
+ - Tags: Merged - existing tags preserved, new tags override
238
571
  on key collision. System tags (prefixed with _) are always managed by
239
572
  the system.
240
573
 
241
574
  Args:
242
575
  content: Text to store and index
243
576
  id: Optional custom ID (auto-generated if None)
244
- source_tags: User-provided tags to merge with existing tags
577
+ summary: User-provided summary (skips auto-summarization if given)
578
+ tags: User-provided tags to merge with existing tags
579
+ source_tags: Deprecated alias for 'tags'
245
580
  collection: Target collection (uses default if None)
246
- lazy: If True, use truncated placeholder summary and queue for
247
- background processing. Use `process_pending()` to generate
248
- real summaries later.
581
+ lazy: If True and content is long, use truncated placeholder summary
582
+ and queue for background processing. Ignored if content is
583
+ short or summary is provided.
249
584
 
250
585
  Returns:
251
586
  The stored Item with merged tags and new summary
252
587
  """
588
+ # Handle deprecated source_tags parameter
589
+ if source_tags is not None:
590
+ import warnings
591
+ warnings.warn(
592
+ "source_tags is deprecated, use 'tags' instead",
593
+ DeprecationWarning,
594
+ stacklevel=2
595
+ )
596
+ if tags is None:
597
+ tags = source_tags
598
+
253
599
  coll = self._resolve_collection(collection)
254
600
 
255
601
  # Generate ID if not provided
@@ -257,54 +603,101 @@ class Keeper:
257
603
  timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")
258
604
  id = f"mem:{timestamp}"
259
605
 
260
- # Get existing item to preserve tags
606
+ # Get existing item to preserve tags (check document store first, fall back to ChromaDB)
261
607
  existing_tags = {}
262
- existing = self._store.get(coll, id)
263
- if existing:
264
- # Extract existing non-system tags
265
- existing_tags = filter_non_system_tags(existing.tags)
608
+ existing_doc = self._document_store.get(coll, id)
609
+ if existing_doc:
610
+ existing_tags = filter_non_system_tags(existing_doc.tags)
611
+ else:
612
+ existing = self._store.get(coll, id)
613
+ if existing:
614
+ existing_tags = filter_non_system_tags(existing.tags)
615
+
616
+ # Compute content hash for change detection
617
+ new_hash = _content_hash(content)
266
618
 
267
619
  # Generate embedding
268
- embedding = self._embedding_provider.embed(content)
620
+ embedding = self._get_embedding_provider().embed(content)
269
621
 
270
- # Generate summary (or queue for later if lazy)
271
- if lazy:
272
- # Truncated placeholder
273
- if len(content) > TRUNCATE_LENGTH:
274
- summary = content[:TRUNCATE_LENGTH] + "..."
275
- else:
276
- summary = content
622
+ # Determine summary (smart behavior for remember) - skip if content unchanged
623
+ max_len = self._config.max_summary_length
624
+ content_unchanged = (
625
+ existing_doc is not None
626
+ and existing_doc.content_hash == new_hash
627
+ )
628
+
629
+ if content_unchanged and summary is None:
630
+ # Content unchanged - preserve existing summary
631
+ logger.debug("Content unchanged, skipping summarization for %s", id)
632
+ final_summary = existing_doc.summary
633
+ elif summary is not None:
634
+ # User-provided summary - validate length
635
+ if len(summary) > max_len:
636
+ import warnings
637
+ warnings.warn(
638
+ f"Summary exceeds max_summary_length ({len(summary)} > {max_len}), truncating",
639
+ UserWarning,
640
+ stacklevel=2
641
+ )
642
+ summary = summary[:max_len]
643
+ final_summary = summary
644
+ elif len(content) <= max_len:
645
+ # Content is short enough - use verbatim (smart summary)
646
+ final_summary = content
647
+ elif lazy:
648
+ # Content is long and lazy mode - truncated placeholder
649
+ final_summary = content[:max_len] + "..."
277
650
  # Queue for background processing
278
651
  self._pending_queue.enqueue(id, coll, content)
279
652
  else:
280
- summary = self._summarization_provider.summarize(content)
653
+ # Content is long - generate summary
654
+ final_summary = self._get_summarization_provider().summarize(content)
655
+
656
+ # Build tags: existing → config → env → user (later wins on collision)
657
+ merged_tags = {**existing_tags}
281
658
 
282
- # Build tags: existing + new (new overrides on collision)
283
- tags = {**existing_tags}
659
+ # Merge config default tags
660
+ if self._config.default_tags:
661
+ merged_tags.update(self._config.default_tags)
284
662
 
285
- # Merge in new source tags (filtered)
286
- if source_tags:
287
- tags.update(filter_non_system_tags(source_tags))
663
+ # Merge environment variable tags
664
+ env_tags = _get_env_tags()
665
+ merged_tags.update(env_tags)
666
+
667
+ # Merge in user-provided tags (filtered)
668
+ if tags:
669
+ merged_tags.update(filter_non_system_tags(tags))
288
670
 
289
671
  # Add system tags
290
- tags["_source"] = "inline"
672
+ merged_tags["_source"] = "inline"
291
673
 
292
- # Store
674
+ # Dual-write: document store (canonical) + ChromaDB (embedding index)
675
+ self._document_store.upsert(
676
+ collection=coll,
677
+ id=id,
678
+ summary=final_summary,
679
+ tags=merged_tags,
680
+ content_hash=new_hash,
681
+ )
293
682
  self._store.upsert(
294
683
  collection=coll,
295
684
  id=id,
296
685
  embedding=embedding,
297
- summary=summary,
298
- tags=tags,
686
+ summary=final_summary,
687
+ tags=merged_tags,
299
688
  )
300
689
 
301
- # Spawn background processor if lazy
302
- if lazy:
690
+ # Spawn background processor if lazy and content was queued (only if content changed)
691
+ if lazy and summary is None and len(content) > max_len and not content_unchanged:
303
692
  self._spawn_processor()
304
693
 
305
694
  # Return the stored item
306
- result = self._store.get(coll, id)
307
- return result.to_item()
695
+ doc_record = self._document_store.get(coll, id)
696
+ return Item(
697
+ id=doc_record.id,
698
+ summary=doc_record.summary,
699
+ tags=doc_record.tags,
700
+ )
308
701
 
309
702
  # -------------------------------------------------------------------------
310
703
  # Query Operations
@@ -361,27 +754,40 @@ class Keeper:
361
754
  query: str,
362
755
  *,
363
756
  limit: int = 10,
757
+ since: Optional[str] = None,
364
758
  collection: Optional[str] = None
365
759
  ) -> list[Item]:
366
760
  """
367
761
  Find items using semantic similarity search.
368
-
762
+
369
763
  Scores are adjusted by recency decay (ACT-R model) - older items
370
764
  have reduced effective relevance unless recently accessed.
765
+
766
+ Args:
767
+ query: Search query text
768
+ limit: Maximum results to return
769
+ since: Only include items updated since (ISO duration like P3D, or date)
770
+ collection: Target collection
371
771
  """
372
772
  coll = self._resolve_collection(collection)
373
-
773
+
374
774
  # Embed query
375
- embedding = self._embedding_provider.embed(query)
376
-
377
- # Search (fetch extra to account for re-ranking)
775
+ embedding = self._get_embedding_provider().embed(query)
776
+
777
+ # Search (fetch extra to account for re-ranking and date filtering)
378
778
  fetch_limit = limit * 2 if self._decay_half_life_days > 0 else limit
779
+ if since is not None:
780
+ fetch_limit = max(fetch_limit, limit * 3) # Fetch more when filtering
379
781
  results = self._store.query_embedding(coll, embedding, limit=fetch_limit)
380
-
782
+
381
783
  # Convert to Items and apply decay
382
784
  items = [r.to_item() for r in results]
383
785
  items = self._apply_recency_decay(items)
384
-
786
+
787
+ # Apply date filter if specified
788
+ if since is not None:
789
+ items = _filter_by_date(items, since)
790
+
385
791
  return items[:limit]
386
792
 
387
793
  def find_similar(
@@ -389,32 +795,46 @@ class Keeper:
389
795
  id: str,
390
796
  *,
391
797
  limit: int = 10,
798
+ since: Optional[str] = None,
392
799
  include_self: bool = False,
393
800
  collection: Optional[str] = None
394
801
  ) -> list[Item]:
395
802
  """
396
803
  Find items similar to an existing item.
804
+
805
+ Args:
806
+ id: ID of item to find similar items for
807
+ limit: Maximum results to return
808
+ since: Only include items updated since (ISO duration like P3D, or date)
809
+ include_self: Include the queried item in results
810
+ collection: Target collection
397
811
  """
398
812
  coll = self._resolve_collection(collection)
399
-
813
+
400
814
  # Get the item to find its embedding
401
815
  item = self._store.get(coll, id)
402
816
  if item is None:
403
817
  raise KeyError(f"Item not found: {id}")
404
-
405
- # Search using the summary's embedding
406
- embedding = self._embedding_provider.embed(item.summary)
818
+
819
+ # Search using the summary's embedding (fetch extra when filtering)
820
+ embedding = self._get_embedding_provider().embed(item.summary)
407
821
  actual_limit = limit + 1 if not include_self else limit
822
+ if since is not None:
823
+ actual_limit = max(actual_limit, limit * 3)
408
824
  results = self._store.query_embedding(coll, embedding, limit=actual_limit)
409
-
825
+
410
826
  # Filter self if needed
411
827
  if not include_self:
412
828
  results = [r for r in results if r.id != id]
413
-
829
+
414
830
  # Convert to Items and apply decay
415
831
  items = [r.to_item() for r in results]
416
832
  items = self._apply_recency_decay(items)
417
-
833
+
834
+ # Apply date filter if specified
835
+ if since is not None:
836
+ items = _filter_by_date(items, since)
837
+
418
838
  return items[:limit]
419
839
 
420
840
  def query_fulltext(
@@ -422,14 +842,30 @@ class Keeper:
422
842
  query: str,
423
843
  *,
424
844
  limit: int = 10,
845
+ since: Optional[str] = None,
425
846
  collection: Optional[str] = None
426
847
  ) -> list[Item]:
427
848
  """
428
849
  Search item summaries using full-text search.
850
+
851
+ Args:
852
+ query: Text to search for in summaries
853
+ limit: Maximum results to return
854
+ since: Only include items updated since (ISO duration like P3D, or date)
855
+ collection: Target collection
429
856
  """
430
857
  coll = self._resolve_collection(collection)
431
- results = self._store.query_fulltext(coll, query, limit=limit)
432
- return [r.to_item() for r in results]
858
+
859
+ # Fetch extra when filtering by date
860
+ fetch_limit = limit * 3 if since is not None else limit
861
+ results = self._store.query_fulltext(coll, query, limit=fetch_limit)
862
+ items = [r.to_item() for r in results]
863
+
864
+ # Apply date filter if specified
865
+ if since is not None:
866
+ items = _filter_by_date(items, since)
867
+
868
+ return items[:limit]
433
869
 
434
870
  def query_tag(
435
871
  self,
@@ -437,6 +873,7 @@ class Keeper:
437
873
  value: Optional[str] = None,
438
874
  *,
439
875
  limit: int = 100,
876
+ since: Optional[str] = None,
440
877
  collection: Optional[str] = None,
441
878
  **tags: str
442
879
  ) -> list[Item]:
@@ -444,21 +881,39 @@ class Keeper:
444
881
  Find items by tag(s).
445
882
 
446
883
  Usage:
447
- # Simple: single key-value pair
884
+ # Key only: find all docs with this tag key (any value)
885
+ query_tag("project")
886
+
887
+ # Key with value: find docs with specific tag value
448
888
  query_tag("project", "myapp")
449
- query_tag("tradition", "buddhist")
450
889
 
451
- # Advanced: multiple tags via kwargs
890
+ # Multiple tags via kwargs
452
891
  query_tag(tradition="buddhist", source="mn22")
892
+
893
+ Args:
894
+ key: Tag key to search for
895
+ value: Tag value (optional, any value if not provided)
896
+ limit: Maximum results to return
897
+ since: Only include items updated since (ISO duration like P3D, or date)
898
+ collection: Target collection
899
+ **tags: Additional tag filters as keyword arguments
453
900
  """
454
901
  coll = self._resolve_collection(collection)
455
902
 
903
+ # Key-only query: find docs that have this tag key (any value)
904
+ # Uses DocumentStore which supports efficient SQL date filtering
905
+ if key is not None and value is None and not tags:
906
+ # Convert since to cutoff date for SQL query
907
+ since_date = _parse_since(since) if since else None
908
+ docs = self._document_store.query_by_tag_key(
909
+ coll, key, limit=limit, since_date=since_date
910
+ )
911
+ return [Item(id=d.id, summary=d.summary, tags=d.tags) for d in docs]
912
+
456
913
  # Build tag filter from positional or keyword args
457
914
  tag_filter = {}
458
915
 
459
- if key is not None:
460
- if value is None:
461
- raise ValueError(f"Value required when querying by key '{key}'")
916
+ if key is not None and value is not None:
462
917
  tag_filter[key] = value
463
918
 
464
919
  if tags:
@@ -467,11 +922,50 @@ class Keeper:
467
922
  if not tag_filter:
468
923
  raise ValueError("At least one tag must be specified")
469
924
 
470
- # Build where clause
471
- where = {k: v for k, v in tag_filter.items()}
925
+ # Build where clause for tag filters only
926
+ # (ChromaDB $gte doesn't support string dates, so date filtering is done post-query)
927
+ where_conditions = [{k: v} for k, v in tag_filter.items()]
928
+
929
+ # Use $and if multiple conditions, otherwise single condition
930
+ if len(where_conditions) == 1:
931
+ where = where_conditions[0]
932
+ else:
933
+ where = {"$and": where_conditions}
934
+
935
+ # Fetch extra when filtering by date
936
+ fetch_limit = limit * 3 if since is not None else limit
937
+ results = self._store.query_metadata(coll, where, limit=fetch_limit)
938
+ items = [r.to_item() for r in results]
939
+
940
+ # Apply date filter if specified (post-filter)
941
+ if since is not None:
942
+ items = _filter_by_date(items, since)
943
+
944
+ return items[:limit]
945
+
946
def list_tags(
    self,
    key: Optional[str] = None,
    *,
    collection: Optional[str] = None,
) -> list[str]:
    """
    Enumerate distinct tag keys, or the distinct values of a single key.

    Args:
        key: Tag key whose distinct values should be listed. When None,
            the distinct tag keys themselves are listed instead.
        collection: Target collection

    Returns:
        The listing produced by the document store (documented upstream
        as a sorted list of distinct keys or values)
    """
    coll = self._resolve_collection(collection)
    docs = self._document_store

    # A concrete key narrows the listing to that key's values;
    # without one, the caller gets the tag keys themselves.
    if key is not None:
        return docs.list_distinct_tag_values(coll, key)
    return docs.list_distinct_tag_keys(coll)
475
969
 
476
970
  # -------------------------------------------------------------------------
477
971
  # Direct Access
@@ -480,8 +974,21 @@ class Keeper:
480
974
  def get(self, id: str, *, collection: Optional[str] = None) -> Optional[Item]:
481
975
  """
482
976
  Retrieve a specific item by ID.
977
+
978
+ Reads from document store (canonical), falls back to ChromaDB for legacy data.
483
979
  """
484
980
  coll = self._resolve_collection(collection)
981
+
982
+ # Try document store first (canonical)
983
+ doc_record = self._document_store.get(coll, id)
984
+ if doc_record:
985
+ return Item(
986
+ id=doc_record.id,
987
+ summary=doc_record.summary,
988
+ tags=doc_record.tags,
989
+ )
990
+
991
+ # Fall back to ChromaDB for legacy data
485
992
  result = self._store.get(coll, id)
486
993
  if result is None:
487
994
  return None
@@ -492,17 +999,148 @@ class Keeper:
492
999
  Check if an item exists in the store.
493
1000
  """
494
1001
  coll = self._resolve_collection(collection)
495
- return self._store.exists(coll, id)
1002
+ # Check document store first, then ChromaDB
1003
+ return self._document_store.exists(coll, id) or self._store.exists(coll, id)
496
1004
 
497
1005
  def delete(self, id: str, *, collection: Optional[str] = None) -> bool:
498
1006
  """
499
- Delete an item from the store.
500
-
1007
+ Delete an item from both stores.
1008
+
501
1009
  Returns True if item existed and was deleted.
502
1010
  """
503
1011
  coll = self._resolve_collection(collection)
504
- return self._store.delete(coll, id)
505
-
1012
+ # Delete from both stores
1013
+ doc_deleted = self._document_store.delete(coll, id)
1014
+ chroma_deleted = self._store.delete(coll, id)
1015
+ return doc_deleted or chroma_deleted
1016
+
1017
+ # -------------------------------------------------------------------------
1018
+ # Current Working Context (Now)
1019
+ # -------------------------------------------------------------------------
1020
+
1021
def get_now(self) -> Item:
    """
    Return the current working context, creating it on first use.

    The context is a singleton document stored under NOWDOC_ID. When it is
    missing, it is seeded via set_now() with the content and tags loaded
    from now.md under SYSTEM_DOC_DIR; if that file cannot be found, a
    built-in placeholder with no tags is used instead.

    Returns:
        The current context Item (never None - auto-creates if missing)
    """
    existing = self.get(NOWDOC_ID)
    if existing is not None:
        return existing

    # First-time initialization: prefer the packaged defaults, but do not
    # fail just because the system file is missing.
    try:
        default_content, default_tags = _load_frontmatter(SYSTEM_DOC_DIR / "now.md")
    except FileNotFoundError:
        default_content, default_tags = "# Now\n\nYour working context.", {}

    return self.set_now(default_content, tags=default_tags)
1043
+
1044
def set_now(
    self,
    content: str,
    *,
    tags: Optional[dict[str, str]] = None,
) -> Item:
    """
    Replace the current working context.

    Delegates to remember() using the fixed NOWDOC_ID, so the singleton
    context document is overwritten with the new content.

    Args:
        content: New content for the current context
        tags: Optional additional tags to apply

    Returns:
        The updated context Item
    """
    updated = self.remember(content, id=NOWDOC_ID, tags=tags)
    return updated
1064
+
1065
def list_system_documents(
    self,
    *,
    collection: Optional[str] = None,
) -> list[Item]:
    """
    Return the documents tagged as system documents.

    System documents are identified by the `category: system` tag; the
    lookup is a plain query_tag() call, so query_tag()'s default limit
    applies to the result.

    Args:
        collection: Target collection (default: default collection)

    Returns:
        List of system document Items
    """
    tag_key, tag_value = "category", "system"
    return self.query_tag(tag_key, tag_value, collection=collection)
1083
+
1084
def tag(
    self,
    id: str,
    tags: Optional[dict[str, str]] = None,
    *,
    collection: Optional[str] = None,
) -> Optional[Item]:
    """
    Change the tags of an existing document in place.

    Only tags are touched: the method reads the current item, computes the
    new tag set, and calls update_tags() on each store.

    Tag behavior:
    - Provided tags are merged with existing user tags
    - Empty string value ("") deletes that tag
    - System tags (keys starting with SYSTEM_TAG_PREFIX) are carried over
      unchanged; attempts to set or delete them here are ignored

    Args:
        id: Document identifier
        tags: Tags to add/update/delete (empty string = delete)
        collection: Target collection

    Returns:
        Updated Item if found, None if document doesn't exist
    """
    coll = self._resolve_collection(collection)

    # Look the item up through get() so the document store is preferred
    # and ChromaDB is used as the fallback.
    existing = self.get(id, collection=collection)
    if existing is None:
        return None

    # Partition the current tags into system-owned and user-owned sets,
    # keeping each set in its original order.
    system_tags = {}
    user_tags = {}
    for name, current in dict(existing.tags).items():
        bucket = system_tags if name.startswith(SYSTEM_TAG_PREFIX) else user_tags
        bucket[name] = current

    # Apply the requested changes to the user-owned tags only.
    for name, requested in (tags or {}).items():
        if name.startswith(SYSTEM_TAG_PREFIX):
            continue  # Cannot modify system tags
        if requested == "":
            # Empty string = delete
            user_tags.pop(name, None)
            continue
        user_tags[name] = requested

    # System tags are merged last so they always win on a key clash.
    final_tags = {**user_tags, **system_tags}

    # Dual-write to both stores
    self._document_store.update_tags(coll, id, final_tags)
    self._store.update_tags(coll, id, final_tags)

    # Re-read so the caller sees what was actually stored.
    return self.get(id, collection=collection)
1143
+
506
1144
  # -------------------------------------------------------------------------
507
1145
  # Collection Management
508
1146
  # -------------------------------------------------------------------------
@@ -511,13 +1149,21 @@ class Keeper:
511
1149
  """
512
1150
  List all collections in the store.
513
1151
  """
514
- return self._store.list_collections()
1152
+ # Merge collections from both stores
1153
+ doc_collections = set(self._document_store.list_collections())
1154
+ chroma_collections = set(self._store.list_collections())
1155
+ return sorted(doc_collections | chroma_collections)
515
1156
 
516
1157
  def count(self, *, collection: Optional[str] = None) -> int:
517
1158
  """
518
1159
  Count items in a collection.
1160
+
1161
+ Returns count from document store if available, else ChromaDB.
519
1162
  """
520
1163
  coll = self._resolve_collection(collection)
1164
+ doc_count = self._document_store.count(coll)
1165
+ if doc_count > 0:
1166
+ return doc_count
521
1167
  return self._store.count(coll)
522
1168
 
523
1169
  def embedding_cache_stats(self) -> dict:
@@ -525,7 +1171,10 @@ class Keeper:
525
1171
  Get embedding cache statistics.
526
1172
 
527
1173
  Returns dict with: entries, hits, misses, hit_rate, cache_path
1174
+ Returns {"loaded": False} if embedding provider hasn't been loaded yet.
528
1175
  """
1176
+ if self._embedding_provider is None:
1177
+ return {"loaded": False}
529
1178
  if isinstance(self._embedding_provider, CachingEmbeddingProvider):
530
1179
  return self._embedding_provider.stats()
531
1180
  return {"enabled": False}
@@ -563,9 +1212,10 @@ class Keeper:
563
1212
 
564
1213
  try:
565
1214
  # Generate real summary
566
- summary = self._summarization_provider.summarize(item.content)
1215
+ summary = self._get_summarization_provider().summarize(item.content)
567
1216
 
568
- # Update the stored item's summary
1217
+ # Update summary in both stores
1218
+ self._document_store.update_summary(item.collection, item.id, summary)
569
1219
  self._store.update_summary(item.collection, item.id, summary)
570
1220
 
571
1221
  # Remove from queue
@@ -652,8 +1302,9 @@ class Keeper:
652
1302
  subprocess.Popen(cmd, **kwargs)
653
1303
  return True
654
1304
 
655
- except Exception:
656
- # Spawn failed - not critical, queue will be processed later
1305
+ except Exception as e:
1306
+ # Spawn failed - log for debugging, queue will be processed later
1307
+ logger.warning("Failed to spawn background processor: %s", e)
657
1308
  return False
658
1309
 
659
1310
  def close(self) -> None:
@@ -662,11 +1313,12 @@ class Keeper:
662
1313
 
663
1314
  Good practice to call when done, though Python's GC will clean up eventually.
664
1315
  """
665
- # Close embedding cache if it exists
666
- if hasattr(self._embedding_provider, '_cache'):
667
- cache = self._embedding_provider._cache
668
- if hasattr(cache, 'close'):
669
- cache.close()
1316
+ # Close embedding cache if it was loaded
1317
+ if self._embedding_provider is not None:
1318
+ if hasattr(self._embedding_provider, '_cache'):
1319
+ cache = self._embedding_provider._cache
1320
+ if hasattr(cache, 'close'):
1321
+ cache.close()
670
1322
 
671
1323
  # Close pending summary queue
672
1324
  if hasattr(self, '_pending_queue'):