keep-skill 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keep/__init__.py +3 -6
- keep/api.py +793 -141
- keep/cli.py +467 -129
- keep/config.py +172 -41
- keep/context.py +1 -125
- keep/document_store.py +569 -0
- keep/errors.py +33 -0
- keep/indexing.py +1 -1
- keep/logging_config.py +34 -3
- keep/paths.py +81 -17
- keep/pending_summaries.py +46 -40
- keep/providers/embedding_cache.py +53 -46
- keep/providers/embeddings.py +43 -13
- keep/providers/mlx.py +23 -21
- keep/store.py +58 -14
- {keep_skill-0.1.0.dist-info → keep_skill-0.2.0.dist-info}/METADATA +29 -15
- keep_skill-0.2.0.dist-info/RECORD +28 -0
- keep_skill-0.1.0.dist-info/RECORD +0 -26
- {keep_skill-0.1.0.dist-info → keep_skill-0.2.0.dist-info}/WHEEL +0 -0
- {keep_skill-0.1.0.dist-info → keep_skill-0.2.0.dist-info}/entry_points.txt +0 -0
- {keep_skill-0.1.0.dist-info → keep_skill-0.2.0.dist-info}/licenses/LICENSE +0 -0
keep/api.py
CHANGED
```diff
@@ -8,17 +8,104 @@ This is the minimal working implementation focused on:
 - get(): retrieve by ID
 """
 
+import hashlib
+import logging
 import re
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 from pathlib import Path
 from typing import Any, Optional
 
+logger = logging.getLogger(__name__)
+
+
+def _parse_since(since: str) -> str:
+    """
+    Parse a 'since' string and return a YYYY-MM-DD cutoff date.
+
+    Accepts:
+    - ISO 8601 duration: P3D (3 days), P1W (1 week), PT1H (1 hour), P1DT12H, etc.
+    - ISO date: 2026-01-15
+    - Date with slashes: 2026/01/15
+
+    Returns:
+        YYYY-MM-DD string for the cutoff date
+    """
+    since = since.strip()
+
+    # ISO 8601 duration: P[n]Y[n]M[n]W[n]DT[n]H[n]M[n]S
+    if since.upper().startswith("P"):
+        duration_str = since.upper()
+
+        # Parse duration components
+        years = months = weeks = days = hours = minutes = seconds = 0
+
+        # Split on T to separate date and time parts
+        if "T" in duration_str:
+            date_part, time_part = duration_str.split("T", 1)
+        else:
+            date_part = duration_str
+            time_part = ""
+
+        # Parse date part (P[n]Y[n]M[n]W[n]D)
+        date_part = date_part[1:]  # Remove leading P
+        for match in re.finditer(r"(\d+)([YMWD])", date_part):
+            value, unit = int(match.group(1)), match.group(2)
+            if unit == "Y":
+                years = value
+            elif unit == "M":
+                months = value
+            elif unit == "W":
+                weeks = value
+            elif unit == "D":
+                days = value
+
+        # Parse time part ([n]H[n]M[n]S)
+        for match in re.finditer(r"(\d+)([HMS])", time_part):
+            value, unit = int(match.group(1)), match.group(2)
+            if unit == "H":
+                hours = value
+            elif unit == "M":
+                minutes = value
+            elif unit == "S":
+                seconds = value
+
+        # Convert to timedelta (approximate months/years)
+        total_days = years * 365 + months * 30 + weeks * 7 + days
+        delta = timedelta(days=total_days, hours=hours, minutes=minutes, seconds=seconds)
+        cutoff = datetime.now(timezone.utc) - delta
+        return cutoff.strftime("%Y-%m-%d")
+
+    # Try parsing as date
+    # ISO format: 2026-01-15 or 2026-01-15T...
+    # Slash format: 2026/01/15
+    date_str = since.replace("/", "-").split("T")[0]
+
+    try:
+        parsed = datetime.strptime(date_str, "%Y-%m-%d")
+        return parsed.strftime("%Y-%m-%d")
+    except ValueError:
+        pass
+
+    raise ValueError(
+        f"Invalid 'since' format: {since}. "
+        "Use ISO duration (P3D, PT1H, P1W) or date (2026-01-15)"
+    )
+
+
+def _filter_by_date(items: list, since: str) -> list:
+    """Filter items to only those updated since the given date/duration."""
+    cutoff = _parse_since(since)
+    return [
+        item for item in items
+        if item.tags.get("_updated_date", "0000-00-00") >= cutoff
+    ]
+
 import os
 import subprocess
 import sys
 
-from .config import load_or_create_config, StoreConfig
-from .paths import get_default_store_path
+from .config import load_or_create_config, save_config, StoreConfig, EmbeddingIdentity
+from .paths import get_config_dir, get_default_store_path
 from .pending_summaries import PendingSummaryQueue
 from .providers import get_registry
 from .providers.base import (
```
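The new `_parse_since` helper normalizes both ISO 8601 durations and calendar dates to a `YYYY-MM-DD` cutoff. A minimal standalone sketch of the same grammar, stdlib only; `parse_since` here is an illustrative re-implementation, not the packaged function:

```python
from datetime import datetime, timedelta, timezone
import re

def parse_since(since: str) -> str:
    """Duration (P3D, P1W, PT1H, P1DT12H) or date -> YYYY-MM-DD cutoff."""
    since = since.strip()
    if since.upper().startswith("P"):
        # Split on T so "M" means months before T and minutes after it
        date_part, _, time_part = since.upper()[1:].partition("T")
        d = {"Y": 0, "M": 0, "W": 0, "D": 0}
        for value, unit in re.findall(r"(\d+)([YMWD])", date_part):
            d[unit] = int(value)
        t = {"H": 0, "M": 0, "S": 0}
        for value, unit in re.findall(r"(\d+)([HMS])", time_part):
            t[unit] = int(value)
        # Deliberate approximation: years count as 365 days, months as 30
        total_days = d["Y"] * 365 + d["M"] * 30 + d["W"] * 7 + d["D"]
        cutoff = datetime.now(timezone.utc) - timedelta(
            days=total_days, hours=t["H"], minutes=t["M"], seconds=t["S"])
        return cutoff.strftime("%Y-%m-%d")
    # Dates: 2026-01-15, 2026-01-15T..., or 2026/01/15
    date_str = since.replace("/", "-").split("T")[0]
    return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")

print(parse_since("P3D"))         # three days before today (UTC)
print(parse_since("P1DT12H"))     # a day and a half ago
print(parse_since("2026/01/15"))  # "2026-01-15"
```

Note that `_filter_by_date` then compares the cutoff against each item's `_updated_date` tag lexicographically, which is sound because both sides are zero-padded `YYYY-MM-DD` strings.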
```diff
@@ -28,7 +115,7 @@ from .providers.base import (
 )
 from .providers.embedding_cache import CachingEmbeddingProvider
 from .store import ChromaStore
-from .types import Item, filter_non_system_tags
+from .types import Item, filter_non_system_tags, SYSTEM_TAG_PREFIX
 
 
 # Default max length for truncated placeholder summaries
@@ -41,6 +128,69 @@ MAX_SUMMARY_ATTEMPTS = 5
 # Collection name validation: lowercase ASCII and underscores only
 COLLECTION_NAME_PATTERN = re.compile(r"^[a-z][a-z0-9_]*$")
 
+# Environment variable prefix for auto-applied tags
+ENV_TAG_PREFIX = "KEEP_TAG_"
+
+# Fixed ID for the current working context (singleton)
+NOWDOC_ID = "_now:default"
+
+# Path to system documents
+SYSTEM_DOC_DIR = Path(__file__).parent.parent / "docs" / "system"
+
+
+def _load_frontmatter(path: Path) -> tuple[str, dict[str, str]]:
+    """
+    Load content and tags from a file with optional YAML frontmatter.
+
+    Args:
+        path: Path to the file
+
+    Returns:
+        (content, tags) tuple. Tags empty if no frontmatter.
+
+    Raises:
+        FileNotFoundError: If the file doesn't exist
+    """
+    text = path.read_text()
+
+    # Parse YAML frontmatter if present
+    if text.startswith("---"):
+        parts = text.split("---", 2)
+        if len(parts) >= 3:
+            import yaml
+            frontmatter = yaml.safe_load(parts[1])
+            content = parts[2].lstrip("\n")
+            if frontmatter:
+                tags = frontmatter.get("tags", {})
+                # Ensure all tag values are strings
+                tags = {k: str(v) for k, v in tags.items()}
+                return content, tags
+            return content, {}
+
+    return text, {}
+
+
+def _get_env_tags() -> dict[str, str]:
+    """
+    Collect tags from KEEP_TAG_* environment variables.
+
+    KEEP_TAG_PROJECT=foo -> {"project": "foo"}
+    KEEP_TAG_MyTag=bar -> {"mytag": "bar"}
+
+    Tag keys are lowercased for consistency.
+    """
+    tags = {}
+    for key, value in os.environ.items():
+        if key.startswith(ENV_TAG_PREFIX) and value:
+            tag_key = key[len(ENV_TAG_PREFIX):].lower()
+            tags[tag_key] = value
+    return tags
+
+
+def _content_hash(content: str) -> str:
+    """SHA256 hash of content for change detection."""
+    return hashlib.sha256(content.encode("utf-8")).hexdigest()
+
 
 class Keeper:
     """
```
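The module-level helpers above can be exercised in isolation. A short sketch of the environment-tag and content-hash behavior, re-implemented with the stdlib for illustration (the `KEEP_TAG_Sprint` variable is a made-up example):

```python
import hashlib
import os

ENV_TAG_PREFIX = "KEEP_TAG_"

# Simulate the environment a session might run under
os.environ["KEEP_TAG_PROJECT"] = "myapp"
os.environ["KEEP_TAG_Sprint"] = "2026-q1"

def get_env_tags() -> dict[str, str]:
    # Strip the prefix and lowercase the key, matching _get_env_tags()
    return {
        key[len(ENV_TAG_PREFIX):].lower(): value
        for key, value in os.environ.items()
        if key.startswith(ENV_TAG_PREFIX) and value
    }

print(get_env_tags())  # {'project': 'myapp', 'sprint': '2026-q1'}

# Change detection hashes the raw content, as in _content_hash():
# identical content -> identical digest -> summarization is skipped on update
a = hashlib.sha256("same content".encode("utf-8")).hexdigest()
b = hashlib.sha256("same content".encode("utf-8")).hexdigest()
print(a == b)  # True
```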
```diff
@@ -60,20 +210,15 @@ class Keeper:
     ) -> None:
         """
         Initialize or open an existing associative memory store.
-
+
         Args:
             store_path: Path to store directory. Uses default if not specified.
+                Overrides any store.path setting in config.
             collection: Default collection name.
             decay_half_life_days: Memory decay half-life in days (ACT-R model).
                 After this many days, an item's effective relevance is halved.
                 Set to 0 or negative to disable decay.
         """
-        # Resolve store path
-        if store_path is None:
-            self._store_path = get_default_store_path()
-        else:
-            self._store_path = Path(store_path).resolve()
-
         # Validate collection name
         if not COLLECTION_NAME_PATTERN.match(collection):
             raise ValueError(
@@ -82,43 +227,161 @@ class Keeper:
         )
         self._default_collection = collection
         self._decay_half_life_days = decay_half_life_days
-
+
+        # Resolve config and store paths
+        # If store_path is explicitly provided, use it as both config and store location
+        # Otherwise, discover config via tree-walk and let config determine store
+        if store_path is not None:
+            self._store_path = Path(store_path).resolve()
+            config_dir = self._store_path
+        else:
+            # Discover config directory (tree-walk or envvar)
+            config_dir = get_config_dir()
+
         # Load or create configuration
-        self._config: StoreConfig = load_or_create_config(
-
-        #
+        self._config: StoreConfig = load_or_create_config(config_dir)
+
+        # If store_path wasn't explicit, resolve from config
+        if store_path is None:
+            self._store_path = get_default_store_path(self._config)
+
+        # Initialize document provider (needed for most operations)
         registry = get_registry()
-
         self._document_provider: DocumentProvider = registry.create_document(
             self._config.document.name,
             self._config.document.params,
         )
-
-        #
-
-
-            self._config.embedding.params,
-        )
-        cache_path = self._store_path / "embedding_cache.db"
-        self._embedding_provider: EmbeddingProvider = CachingEmbeddingProvider(
-            base_embedding_provider,
-            cache_path=cache_path,
-        )
-
-        self._summarization_provider: SummarizationProvider = registry.create_summarization(
-            self._config.summarization.name,
-            self._config.summarization.params,
-        )
+
+        # Lazy-loaded providers (created on first use to avoid network access for read-only ops)
+        self._embedding_provider: Optional[EmbeddingProvider] = None
+        self._summarization_provider: Optional[SummarizationProvider] = None
 
         # Initialize pending summary queue
         queue_path = self._store_path / "pending_summaries.db"
         self._pending_queue = PendingSummaryQueue(queue_path)
 
-        # Initialize store
+        # Initialize document store (canonical records)
+        from .document_store import DocumentStore
+        doc_store_path = self._store_path / "documents.db"
+        self._document_store = DocumentStore(doc_store_path)
+
+        # Initialize ChromaDB store (embedding index)
+        # Use dimension from stored identity if available (allows offline read-only access)
+        embedding_dim = None
+        if self._config.embedding_identity:
+            embedding_dim = self._config.embedding_identity.dimension
         self._store = ChromaStore(
             self._store_path,
-            embedding_dimension=
+            embedding_dimension=embedding_dim,
+        )
+
+        # Preload system documents (only if not already present)
+        self._ensure_system_documents()
+
+    def _ensure_system_documents(self) -> None:
+        """
+        Ensure system documents are loaded into the store.
+
+        Scans all .md files in docs/system/. Each file is indexed with its
+        file:// URI as the ID and `_category: system` tag for identification.
+        Content becomes the summary directly (no auto-summarization).
+
+        Called during init. Only loads docs that don't already exist,
+        so user modifications are preserved and no network access occurs
+        if docs are already present.
+        """
+        for path in SYSTEM_DOC_DIR.glob("*.md"):
+            try:
+                uri = f"file://{path.resolve()}"
+                if not self.exists(uri):
+                    content, tags = _load_frontmatter(path)
+                    tags["category"] = "system"
+                    self.remember(content, id=uri, tags=tags)
+            except FileNotFoundError:
+                # System file missing - skip silently
+                pass
+
+    def _get_embedding_provider(self) -> EmbeddingProvider:
+        """
+        Get embedding provider, creating it lazily on first use.
+
+        This allows read-only operations to work offline without loading
+        the embedding model (which may try to reach HuggingFace).
+        """
+        if self._embedding_provider is None:
+            registry = get_registry()
+            base_provider = registry.create_embedding(
+                self._config.embedding.name,
+                self._config.embedding.params,
+            )
+            cache_path = self._store_path / "embedding_cache.db"
+            self._embedding_provider = CachingEmbeddingProvider(
+                base_provider,
+                cache_path=cache_path,
+            )
+            # Validate or record embedding identity
+            self._validate_embedding_identity(self._embedding_provider)
+            # Update store's embedding dimension if it wasn't known at init
+            if self._store._embedding_dimension is None:
+                self._store._embedding_dimension = self._embedding_provider.dimension
+        return self._embedding_provider
+
+    def _get_summarization_provider(self) -> SummarizationProvider:
+        """
+        Get summarization provider, creating it lazily on first use.
+        """
+        if self._summarization_provider is None:
+            registry = get_registry()
+            self._summarization_provider = registry.create_summarization(
+                self._config.summarization.name,
+                self._config.summarization.params,
+            )
+        return self._summarization_provider
+
+    def _validate_embedding_identity(self, provider: EmbeddingProvider) -> None:
+        """
+        Validate embedding provider matches stored identity, or record it.
+
+        On first use, records the embedding identity to config.
+        On subsequent uses, validates that the current provider matches.
+
+        Raises:
+            ValueError: If embedding provider changed incompatibly
+        """
+        # Get current provider's identity
+        current = EmbeddingIdentity(
+            provider=self._config.embedding.name,
+            model=getattr(provider, "model_name", "unknown"),
+            dimension=provider.dimension,
         )
+
+        stored = self._config.embedding_identity
+
+        if stored is None:
+            # First use: record the identity
+            self._config.embedding_identity = current
+            save_config(self._config)
+        else:
+            # Validate compatibility
+            if (stored.provider != current.provider or
+                    stored.model != current.model or
+                    stored.dimension != current.dimension):
+                raise ValueError(
+                    f"Embedding provider mismatch!\n"
+                    f"  Stored: {stored.provider}/{stored.model} ({stored.dimension}d)\n"
+                    f"  Current: {current.provider}/{current.model} ({current.dimension}d)\n"
+                    f"\n"
+                    f"Changing embedding providers invalidates existing embeddings.\n"
+                    f"Options:\n"
+                    f"  1. Use the original provider\n"
+                    f"  2. Delete .keep/ and re-index\n"
+                    f"  3. (Future) Run migration to re-embed with new provider"
+                )
+
+    @property
+    def embedding_identity(self) -> EmbeddingIdentity | None:
+        """Current embedding identity (provider, model, dimension)."""
+        return self._config.embedding_identity
 
     def _resolve_collection(self, collection: Optional[str]) -> str:
         """Resolve collection name, validating if provided."""
```
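The identity check guards a store against being silently queried with vectors from a different model. A sketch of the rule, using a stand-in dataclass with the same three fields the diff gives `EmbeddingIdentity` (the provider and model names below are hypothetical):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class EmbeddingIdentity:  # stand-in for keep.config.EmbeddingIdentity
    provider: str
    model: str
    dimension: int

def validate(stored: Optional[EmbeddingIdentity],
             current: EmbeddingIdentity) -> EmbeddingIdentity:
    """First use records the identity; every later use must match exactly."""
    if stored is None:
        return current  # the real code persists this via save_config()
    if (stored.provider, stored.model, stored.dimension) != (
            current.provider, current.model, current.dimension):
        raise ValueError(
            f"Embedding provider mismatch: "
            f"{stored.provider}/{stored.model} ({stored.dimension}d) vs "
            f"{current.provider}/{current.model} ({current.dimension}d)")
    return stored

ident = EmbeddingIdentity("mlx", "example-embedding-model", 384)
recorded = validate(None, ident)   # first use: record
validate(recorded, ident)          # same provider: passes
# validate(recorded, EmbeddingIdentity("mlx", "other-model", 768))  # raises
```

Making both providers lazy is what lets a read-only `get()` or `query_tag()` run offline: nothing touches the embedding model until an `embed()` call actually needs it.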
```diff
@@ -135,8 +398,10 @@ class Keeper:
     def update(
         self,
         id: str,
-
+        tags: Optional[dict[str, str]] = None,
         *,
+        summary: Optional[str] = None,
+        source_tags: Optional[dict[str, str]] = None,  # Deprecated alias
         collection: Optional[str] = None,
         lazy: bool = False
     ) -> Item:
@@ -146,84 +411,147 @@ class Keeper:
         Fetches the document, generates embeddings and summary, then stores it.
 
         **Update behavior:**
-        - Summary:
-        - Tags: Merged - existing
+        - Summary: Replaced with user-provided or newly generated summary
+        - Tags: Merged - existing tags are preserved, new tags override
           on key collision. System tags (prefixed with _) are always managed by
           the system.
 
         Args:
             id: URI of document to fetch and index
-
+            tags: User-provided tags to merge with existing tags
+            summary: User-provided summary (skips auto-summarization if given)
+            source_tags: Deprecated alias for 'tags'
             collection: Target collection (uses default if None)
             lazy: If True, use truncated placeholder summary and queue for
                 background processing. Use `process_pending()` to generate
-                real summaries later.
+                real summaries later. Ignored if summary is provided.
 
         Returns:
             The stored Item with merged tags and new summary
         """
+        # Handle deprecated source_tags parameter
+        if source_tags is not None:
+            import warnings
+            warnings.warn(
+                "source_tags is deprecated, use 'tags' instead",
+                DeprecationWarning,
+                stacklevel=2
+            )
+            if tags is None:
+                tags = source_tags
+
         coll = self._resolve_collection(collection)
 
-        # Get existing item to preserve tags
+        # Get existing item to preserve tags (check document store first, fall back to ChromaDB)
         existing_tags = {}
-
-        if
-
-
+        existing_doc = self._document_store.get(coll, id)
+        if existing_doc:
+            existing_tags = filter_non_system_tags(existing_doc.tags)
+        else:
+            # Fall back to ChromaDB for legacy data
+            existing = self._store.get(coll, id)
+            if existing:
+                existing_tags = filter_non_system_tags(existing.tags)
 
         # Fetch document
         doc = self._document_provider.fetch(id)
 
+        # Compute content hash for change detection
+        new_hash = _content_hash(doc.content)
+
         # Generate embedding
-        embedding = self.
+        embedding = self._get_embedding_provider().embed(doc.content)
 
-        #
-
-
-
-
+        # Determine summary - skip if content unchanged
+        max_len = self._config.max_summary_length
+        content_unchanged = (
+            existing_doc is not None
+            and existing_doc.content_hash == new_hash
+        )
+
+        if content_unchanged and summary is None:
+            # Content unchanged - preserve existing summary
+            logger.debug("Content unchanged, skipping summarization for %s", id)
+            final_summary = existing_doc.summary
+        elif summary is not None:
+            # User-provided summary - validate length
+            if len(summary) > max_len:
+                import warnings
+                warnings.warn(
+                    f"Summary exceeds max_summary_length ({len(summary)} > {max_len}), truncating",
+                    UserWarning,
+                    stacklevel=2
+                )
+                summary = summary[:max_len]
+            final_summary = summary
+        elif lazy:
+            # Truncated placeholder for lazy mode
+            if len(doc.content) > max_len:
+                final_summary = doc.content[:max_len] + "..."
             else:
-
+                final_summary = doc.content
             # Queue for background processing
             self._pending_queue.enqueue(id, coll, doc.content)
         else:
-
+            # Auto-generate summary
+            final_summary = self._get_summarization_provider().summarize(doc.content)
 
-        # Build tags: existing
-
+        # Build tags: existing → config → env → user (later wins on collision)
+        merged_tags = {**existing_tags}
 
-        # Merge
-        if
-
+        # Merge config default tags
+        if self._config.default_tags:
+            merged_tags.update(self._config.default_tags)
+
+        # Merge environment variable tags
+        env_tags = _get_env_tags()
+        merged_tags.update(env_tags)
+
+        # Merge in user-provided tags (filtered to prevent system tag override)
+        if tags:
+            merged_tags.update(filter_non_system_tags(tags))
 
         # Add system tags
-
+        merged_tags["_source"] = "uri"
         if doc.content_type:
-
+            merged_tags["_content_type"] = doc.content_type
 
-        #
+        # Dual-write: document store (canonical) + ChromaDB (embedding index)
+        self._document_store.upsert(
+            collection=coll,
+            id=id,
+            summary=final_summary,
+            tags=merged_tags,
+            content_hash=new_hash,
+        )
         self._store.upsert(
             collection=coll,
             id=id,
             embedding=embedding,
-            summary=
-            tags=
+            summary=final_summary,
+            tags=merged_tags,
         )
 
-        # Spawn background processor if lazy
-        if lazy:
+        # Spawn background processor if lazy (only if summary wasn't user-provided and content changed)
+        if lazy and summary is None and not content_unchanged:
            self._spawn_processor()
 
         # Return the stored item
-
-        return
+        doc_record = self._document_store.get(coll, id)
+        return Item(
+            id=doc_record.id,
+            summary=doc_record.summary,
+            tags=doc_record.tags,
+        )
```
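The merge order in `update()` is worth spelling out, since four tag sources can collide. A minimal sketch of the precedence with plain dicts (all values invented):

```python
# Precedence: existing -> config default_tags -> KEEP_TAG_* env -> user tags.
# Later sources win on key collision; system (_-prefixed) tags are filtered
# out of user input and re-applied by the library afterwards.
existing_tags   = {"project": "from-last-index", "reviewed": "yes"}
config_defaults = {"project": "from-config", "team": "infra"}
env_tags        = {"project": "from-env"}
user_tags       = {"project": "myapp"}

merged = {**existing_tags}
merged.update(config_defaults)
merged.update(env_tags)
merged.update(user_tags)

print(merged)
# {'project': 'myapp', 'reviewed': 'yes', 'team': 'infra'}
```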
```diff
@@ -232,24 +560,42 @@ class Keeper:
 
         Use for conversation snippets, notes, insights.
 
+        **Smart summary behavior:**
+        - If summary is provided, use it (skips auto-summarization)
+        - If content is short (≤ max_summary_length), use content verbatim
+        - Otherwise, generate summary via summarization provider
+
         **Update behavior (when id already exists):**
-        - Summary: Replaced with
-        - Tags: Merged - existing
+        - Summary: Replaced with user-provided, content, or generated summary
+        - Tags: Merged - existing tags preserved, new tags override
           on key collision. System tags (prefixed with _) are always managed by
           the system.
 
         Args:
             content: Text to store and index
             id: Optional custom ID (auto-generated if None)
-
+            summary: User-provided summary (skips auto-summarization if given)
+            tags: User-provided tags to merge with existing tags
+            source_tags: Deprecated alias for 'tags'
             collection: Target collection (uses default if None)
-            lazy: If True, use truncated placeholder summary
-                background processing.
-
+            lazy: If True and content is long, use truncated placeholder summary
+                and queue for background processing. Ignored if content is
+                short or summary is provided.
 
         Returns:
             The stored Item with merged tags and new summary
         """
+        # Handle deprecated source_tags parameter
+        if source_tags is not None:
+            import warnings
+            warnings.warn(
+                "source_tags is deprecated, use 'tags' instead",
+                DeprecationWarning,
+                stacklevel=2
+            )
+            if tags is None:
+                tags = source_tags
+
         coll = self._resolve_collection(collection)
 
         # Generate ID if not provided
@@ -257,54 +603,101 @@ class Keeper:
             timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")
             id = f"mem:{timestamp}"
 
-        # Get existing item to preserve tags
+        # Get existing item to preserve tags (check document store first, fall back to ChromaDB)
         existing_tags = {}
-
-        if
-
-
+        existing_doc = self._document_store.get(coll, id)
+        if existing_doc:
+            existing_tags = filter_non_system_tags(existing_doc.tags)
+        else:
+            existing = self._store.get(coll, id)
+            if existing:
+                existing_tags = filter_non_system_tags(existing.tags)
+
+        # Compute content hash for change detection
+        new_hash = _content_hash(content)
 
         # Generate embedding
-        embedding = self.
+        embedding = self._get_embedding_provider().embed(content)
 
-        #
-
-
-
-
-
-
+        # Determine summary (smart behavior for remember) - skip if content unchanged
+        max_len = self._config.max_summary_length
+        content_unchanged = (
+            existing_doc is not None
+            and existing_doc.content_hash == new_hash
+        )
+
+        if content_unchanged and summary is None:
+            # Content unchanged - preserve existing summary
+            logger.debug("Content unchanged, skipping summarization for %s", id)
+            final_summary = existing_doc.summary
+        elif summary is not None:
+            # User-provided summary - validate length
+            if len(summary) > max_len:
+                import warnings
+                warnings.warn(
+                    f"Summary exceeds max_summary_length ({len(summary)} > {max_len}), truncating",
+                    UserWarning,
+                    stacklevel=2
+                )
+                summary = summary[:max_len]
+            final_summary = summary
+        elif len(content) <= max_len:
+            # Content is short enough - use verbatim (smart summary)
+            final_summary = content
+        elif lazy:
+            # Content is long and lazy mode - truncated placeholder
+            final_summary = content[:max_len] + "..."
             # Queue for background processing
             self._pending_queue.enqueue(id, coll, content)
         else:
-
+            # Content is long - generate summary
+            final_summary = self._get_summarization_provider().summarize(content)
+
+        # Build tags: existing → config → env → user (later wins on collision)
+        merged_tags = {**existing_tags}
 
-        #
-
+        # Merge config default tags
+        if self._config.default_tags:
+            merged_tags.update(self._config.default_tags)
 
-        # Merge
-
-
+        # Merge environment variable tags
+        env_tags = _get_env_tags()
+        merged_tags.update(env_tags)
+
+        # Merge in user-provided tags (filtered)
+        if tags:
+            merged_tags.update(filter_non_system_tags(tags))
 
         # Add system tags
-
+        merged_tags["_source"] = "inline"
 
-        #
+        # Dual-write: document store (canonical) + ChromaDB (embedding index)
+        self._document_store.upsert(
+            collection=coll,
+            id=id,
+            summary=final_summary,
+            tags=merged_tags,
+            content_hash=new_hash,
+        )
         self._store.upsert(
             collection=coll,
             id=id,
             embedding=embedding,
-            summary=
-            tags=
+            summary=final_summary,
+            tags=merged_tags,
         )
 
-        # Spawn background processor if lazy
-        if lazy:
+        # Spawn background processor if lazy and content was queued (only if content changed)
+        if lazy and summary is None and len(content) > max_len and not content_unchanged:
             self._spawn_processor()
 
         # Return the stored item
-
-        return
+        doc_record = self._document_store.get(coll, id)
+        return Item(
+            id=doc_record.id,
+            summary=doc_record.summary,
+            tags=doc_record.tags,
+        )
 
     # -------------------------------------------------------------------------
     # Query Operations
```
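A usage sketch of the reworked `remember()`, assuming the package is installed and `Keeper` is importable from `keep.api` as the file list shows (the store path is a throwaway):

```python
from keep.api import Keeper

keeper = Keeper(store_path="/tmp/keep-demo")

# Short content: stored verbatim as its own summary, no summarizer call
item = keeper.remember(
    "Prefer tags over collections for scoping.",
    tags={"topic": "design"},
)
print(item.summary)  # the content itself

# Explicit summary: auto-summarization is skipped entirely
keeper.remember("Some longer source text.", id="mem:custom",
                summary="A hand-written summary")

# Long content + lazy=True: truncated placeholder now, queued for a
# background worker that fills in the real summary via process_pending()
keeper.remember("long text " * 5000, lazy=True)

keeper.close()
```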
```diff
@@ -361,27 +754,40 @@ class Keeper:
         query: str,
         *,
         limit: int = 10,
+        since: Optional[str] = None,
         collection: Optional[str] = None
     ) -> list[Item]:
         """
         Find items using semantic similarity search.
-
+
         Scores are adjusted by recency decay (ACT-R model) - older items
         have reduced effective relevance unless recently accessed.
+
+        Args:
+            query: Search query text
+            limit: Maximum results to return
+            since: Only include items updated since (ISO duration like P3D, or date)
+            collection: Target collection
         """
         coll = self._resolve_collection(collection)
-
+
         # Embed query
-        embedding = self.
-
-        # Search (fetch extra to account for re-ranking)
+        embedding = self._get_embedding_provider().embed(query)
+
+        # Search (fetch extra to account for re-ranking and date filtering)
         fetch_limit = limit * 2 if self._decay_half_life_days > 0 else limit
+        if since is not None:
+            fetch_limit = max(fetch_limit, limit * 3)  # Fetch more when filtering
         results = self._store.query_embedding(coll, embedding, limit=fetch_limit)
-
+
         # Convert to Items and apply decay
         items = [r.to_item() for r in results]
         items = self._apply_recency_decay(items)
-
+
+        # Apply date filter if specified
+        if since is not None:
+            items = _filter_by_date(items, since)
+
         return items[:limit]
 
     def find_similar(
```
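With `since` threaded through, recency filtering composes with semantic search. A sketch under the same assumptions as above:

```python
from keep.api import Keeper

keeper = Keeper(store_path="/tmp/keep-demo")

# Only items updated in the last three days
recent = keeper.query("embedding cache bug", limit=5, since="P3D")

# Or everything since a fixed date
this_year = keeper.query("embedding cache bug", since="2026-01-01")

# Unparseable values raise ValueError from _parse_since()
try:
    keeper.query("anything", since="3 days ago")
except ValueError as e:
    print(e)  # Invalid 'since' format: ...
```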
```diff
@@ -389,32 +795,46 @@
         id: str,
         *,
         limit: int = 10,
+        since: Optional[str] = None,
         include_self: bool = False,
         collection: Optional[str] = None
     ) -> list[Item]:
         """
         Find items similar to an existing item.
+
+        Args:
+            id: ID of item to find similar items for
+            limit: Maximum results to return
+            since: Only include items updated since (ISO duration like P3D, or date)
+            include_self: Include the queried item in results
+            collection: Target collection
         """
         coll = self._resolve_collection(collection)
-
+
         # Get the item to find its embedding
         item = self._store.get(coll, id)
         if item is None:
             raise KeyError(f"Item not found: {id}")
-
-        # Search using the summary's embedding
-        embedding = self.
+
+        # Search using the summary's embedding (fetch extra when filtering)
+        embedding = self._get_embedding_provider().embed(item.summary)
         actual_limit = limit + 1 if not include_self else limit
+        if since is not None:
+            actual_limit = max(actual_limit, limit * 3)
         results = self._store.query_embedding(coll, embedding, limit=actual_limit)
-
+
         # Filter self if needed
         if not include_self:
             results = [r for r in results if r.id != id]
-
+
         # Convert to Items and apply decay
         items = [r.to_item() for r in results]
         items = self._apply_recency_decay(items)
-
+
+        # Apply date filter if specified
+        if since is not None:
+            items = _filter_by_date(items, since)
+
         return items[:limit]
 
     def query_fulltext(
@@ -422,14 +842,30 @@
         query: str,
         *,
         limit: int = 10,
+        since: Optional[str] = None,
         collection: Optional[str] = None
     ) -> list[Item]:
         """
         Search item summaries using full-text search.
+
+        Args:
+            query: Text to search for in summaries
+            limit: Maximum results to return
+            since: Only include items updated since (ISO duration like P3D, or date)
+            collection: Target collection
         """
         coll = self._resolve_collection(collection)
-
-
+
+        # Fetch extra when filtering by date
+        fetch_limit = limit * 3 if since is not None else limit
+        results = self._store.query_fulltext(coll, query, limit=fetch_limit)
+        items = [r.to_item() for r in results]
+
+        # Apply date filter if specified
+        if since is not None:
+            items = _filter_by_date(items, since)
+
+        return items[:limit]
 
     def query_tag(
         self,
@@ -437,6 +873,7 @@
         value: Optional[str] = None,
         *,
         limit: int = 100,
+        since: Optional[str] = None,
         collection: Optional[str] = None,
         **tags: str
     ) -> list[Item]:
@@ -444,21 +881,39 @@
         Find items by tag(s).
 
         Usage:
-            #
+            # Key only: find all docs with this tag key (any value)
+            query_tag("project")
+
+            # Key with value: find docs with specific tag value
             query_tag("project", "myapp")
-            query_tag("tradition", "buddhist")
 
-            #
+            # Multiple tags via kwargs
             query_tag(tradition="buddhist", source="mn22")
+
+        Args:
+            key: Tag key to search for
+            value: Tag value (optional, any value if not provided)
+            limit: Maximum results to return
+            since: Only include items updated since (ISO duration like P3D, or date)
+            collection: Target collection
+            **tags: Additional tag filters as keyword arguments
         """
         coll = self._resolve_collection(collection)
 
+        # Key-only query: find docs that have this tag key (any value)
+        # Uses DocumentStore which supports efficient SQL date filtering
+        if key is not None and value is None and not tags:
+            # Convert since to cutoff date for SQL query
+            since_date = _parse_since(since) if since else None
+            docs = self._document_store.query_by_tag_key(
+                coll, key, limit=limit, since_date=since_date
+            )
+            return [Item(id=d.id, summary=d.summary, tags=d.tags) for d in docs]
+
         # Build tag filter from positional or keyword args
         tag_filter = {}
 
-        if key is not None:
-            if value is None:
-                raise ValueError(f"Value required when querying by key '{key}'")
+        if key is not None and value is not None:
             tag_filter[key] = value
 
         if tags:
@@ -467,11 +922,50 @@
         if not tag_filter:
             raise ValueError("At least one tag must be specified")
 
-        # Build where clause
-
+        # Build where clause for tag filters only
+        # (ChromaDB $gte doesn't support string dates, so date filtering is done post-query)
+        where_conditions = [{k: v} for k, v in tag_filter.items()]
+
+        # Use $and if multiple conditions, otherwise single condition
+        if len(where_conditions) == 1:
+            where = where_conditions[0]
+        else:
+            where = {"$and": where_conditions}
+
+        # Fetch extra when filtering by date
+        fetch_limit = limit * 3 if since is not None else limit
+        results = self._store.query_metadata(coll, where, limit=fetch_limit)
+        items = [r.to_item() for r in results]
+
+        # Apply date filter if specified (post-filter)
+        if since is not None:
+            items = _filter_by_date(items, since)
+
+        return items[:limit]
+
+    def list_tags(
+        self,
+        key: Optional[str] = None,
+        *,
+        collection: Optional[str] = None,
+    ) -> list[str]:
+        """
+        List distinct tag keys or values.
 
-
-
+        Args:
+            key: If provided, list distinct values for this key.
+                If None, list distinct tag keys.
+            collection: Target collection
+
+        Returns:
+            Sorted list of distinct keys or values
+        """
+        coll = self._resolve_collection(collection)
+
+        if key is None:
+            return self._document_store.list_distinct_tag_keys(coll)
+        else:
+            return self._document_store.list_distinct_tag_values(coll, key)
 
     # -------------------------------------------------------------------------
     # Direct Access
```
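Tag queries now come in three shapes, plus vocabulary discovery via the new `list_tags()`. A sketch, same assumptions as the earlier examples:

```python
from keep.api import Keeper

keeper = Keeper(store_path="/tmp/keep-demo")

# Key only: served from the SQL document store, any value matches
with_project = keeper.query_tag("project")

# Key + value, optionally recency-filtered (ChromaDB where clause,
# date filtering applied post-query)
mine = keeper.query_tag("project", "myapp", since="P1W")

# Multiple tags ANDed via kwargs
hits = keeper.query_tag(tradition="buddhist", source="mn22")

# Discover the tag vocabulary
print(keeper.list_tags())           # distinct tag keys
print(keeper.list_tags("project"))  # distinct values for one key
```

Note the behavioral change from 0.1.0: a key-only `query_tag()` call used to raise `ValueError`; it now returns every item carrying that key.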
```diff
@@ -480,8 +974,21 @@
     def get(self, id: str, *, collection: Optional[str] = None) -> Optional[Item]:
         """
         Retrieve a specific item by ID.
+
+        Reads from document store (canonical), falls back to ChromaDB for legacy data.
         """
         coll = self._resolve_collection(collection)
+
+        # Try document store first (canonical)
+        doc_record = self._document_store.get(coll, id)
+        if doc_record:
+            return Item(
+                id=doc_record.id,
+                summary=doc_record.summary,
+                tags=doc_record.tags,
+            )
+
+        # Fall back to ChromaDB for legacy data
         result = self._store.get(coll, id)
         if result is None:
             return None
@@ -492,17 +999,148 @@
         Check if an item exists in the store.
         """
         coll = self._resolve_collection(collection)
-
+        # Check document store first, then ChromaDB
+        return self._document_store.exists(coll, id) or self._store.exists(coll, id)
 
     def delete(self, id: str, *, collection: Optional[str] = None) -> bool:
         """
-        Delete an item from
-
+        Delete an item from both stores.
+
         Returns True if item existed and was deleted.
         """
         coll = self._resolve_collection(collection)
-
-
+        # Delete from both stores
+        doc_deleted = self._document_store.delete(coll, id)
+        chroma_deleted = self._store.delete(coll, id)
+        return doc_deleted or chroma_deleted
+
+    # -------------------------------------------------------------------------
+    # Current Working Context (Now)
+    # -------------------------------------------------------------------------
+
+    def get_now(self) -> Item:
+        """
+        Get the current working context.
+
+        A singleton document representing what you're currently working on.
+        If it doesn't exist, creates one with default content and tags from
+        docs/system/now.md.
+
+        Returns:
+            The current context Item (never None - auto-creates if missing)
+        """
+        item = self.get(NOWDOC_ID)
+        if item is None:
+            # First-time initialization with default content and tags
+            try:
+                default_content, default_tags = _load_frontmatter(SYSTEM_DOC_DIR / "now.md")
+            except FileNotFoundError:
+                # Fallback if system file is missing
+                default_content = "# Now\n\nYour working context."
+                default_tags = {}
+            item = self.set_now(default_content, tags=default_tags)
+        return item
+
+    def set_now(
+        self,
+        content: str,
+        *,
+        tags: Optional[dict[str, str]] = None,
+    ) -> Item:
+        """
+        Set the current working context.
+
+        Updates the singleton context with new content. Uses remember()
+        internally with the fixed NOWDOC_ID.
+
+        Args:
+            content: New content for the current context
+            tags: Optional additional tags to apply
+
+        Returns:
+            The updated context Item
+        """
+        return self.remember(content, id=NOWDOC_ID, tags=tags)
+
+    def list_system_documents(
+        self,
+        *,
+        collection: Optional[str] = None,
+    ) -> list[Item]:
+        """
+        List all system documents.
+
+        System documents are identified by the `category: system` tag.
+        These are preloaded on init and provide foundational content.
+
+        Args:
+            collection: Target collection (default: default collection)
+
+        Returns:
+            List of system document Items
+        """
+        return self.query_tag("category", "system", collection=collection)
+
+    def tag(
+        self,
+        id: str,
+        tags: Optional[dict[str, str]] = None,
+        *,
+        collection: Optional[str] = None,
+    ) -> Optional[Item]:
+        """
+        Update tags on an existing document without re-processing.
+
+        Does NOT re-fetch, re-embed, or re-summarize. Only updates tags.
+
+        Tag behavior:
+        - Provided tags are merged with existing user tags
+        - Empty string value ("") deletes that tag
+        - System tags (_prefixed) cannot be modified via this method
+
+        Args:
+            id: Document identifier
+            tags: Tags to add/update/delete (empty string = delete)
+            collection: Target collection
+
+        Returns:
+            Updated Item if found, None if document doesn't exist
+        """
+        coll = self._resolve_collection(collection)
+
+        # Get existing item (prefer document store, fall back to ChromaDB)
+        existing = self.get(id, collection=collection)
+        if existing is None:
+            return None
+
+        # Start with existing tags, separate system from user
+        current_tags = dict(existing.tags)
+        system_tags = {k: v for k, v in current_tags.items()
+                       if k.startswith(SYSTEM_TAG_PREFIX)}
+        user_tags = {k: v for k, v in current_tags.items()
+                     if not k.startswith(SYSTEM_TAG_PREFIX)}
+
+        # Apply tag changes (filter out system tags from input)
+        if tags:
+            for key, value in tags.items():
+                if key.startswith(SYSTEM_TAG_PREFIX):
+                    continue  # Cannot modify system tags
+                if value == "":
+                    # Empty string = delete
+                    user_tags.pop(key, None)
+                else:
+                    user_tags[key] = value
+
+        # Merge back: user tags + system tags
+        final_tags = {**user_tags, **system_tags}
+
+        # Dual-write to both stores
+        self._document_store.update_tags(coll, id, final_tags)
+        self._store.update_tags(coll, id, final_tags)
+
+        # Return updated item
+        return self.get(id, collection=collection)
 
     # -------------------------------------------------------------------------
     # Collection Management
     # -------------------------------------------------------------------------
```
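The new working-context singleton and the in-place `tag()` editor, sketched under the same assumptions:

```python
from keep.api import Keeper

keeper = Keeper(store_path="/tmp/keep-demo")

# get_now() auto-creates the singleton from docs/system/now.md on first call
now = keeper.get_now()
keeper.set_now("Shipping 0.2.0: document store, lazy providers, since filters")

# tag() edits tags without re-fetching, re-embedding, or re-summarizing.
# Empty string deletes a tag; _-prefixed system tags are silently skipped.
updated = keeper.tag(now.id, {"status": "active", "stale": "", "_source": "nope"})
print(updated.tags.get("status"))  # "active"
print(updated.tags.get("_source"))  # still the system-managed value
```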
```diff
@@ -511,13 +1149,21 @@ class Keeper:
         """
         List all collections in the store.
         """
-
+        # Merge collections from both stores
+        doc_collections = set(self._document_store.list_collections())
+        chroma_collections = set(self._store.list_collections())
+        return sorted(doc_collections | chroma_collections)
 
     def count(self, *, collection: Optional[str] = None) -> int:
         """
         Count items in a collection.
+
+        Returns count from document store if available, else ChromaDB.
         """
         coll = self._resolve_collection(collection)
+        doc_count = self._document_store.count(coll)
+        if doc_count > 0:
+            return doc_count
         return self._store.count(coll)
 
     def embedding_cache_stats(self) -> dict:
@@ -525,7 +1171,10 @@
         Get embedding cache statistics.
 
         Returns dict with: entries, hits, misses, hit_rate, cache_path
+        Returns {"loaded": False} if embedding provider hasn't been loaded yet.
         """
+        if self._embedding_provider is None:
+            return {"loaded": False}
         if isinstance(self._embedding_provider, CachingEmbeddingProvider):
            return self._embedding_provider.stats()
         return {"enabled": False}
```
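Because providers are now lazy, `embedding_cache_stats()` distinguishes "never loaded" from "cache disabled". A sketch; the first query forces a model load, so this assumes the configured embedding model is available locally:

```python
from keep.api import Keeper

keeper = Keeper(store_path="/tmp/keep-demo")

# Nothing has needed an embedding yet, so the provider is still unloaded
print(keeper.embedding_cache_stats())  # {'loaded': False}

# The first semantic query lazily builds the CachingEmbeddingProvider
keeper.query("warm up the cache")
print(keeper.embedding_cache_stats())
# entries / hits / misses / hit_rate / cache_path

keeper.close()
```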
```diff
@@ -563,9 +1212,10 @@
 
         try:
             # Generate real summary
-            summary = self.
+            summary = self._get_summarization_provider().summarize(item.content)
 
-            # Update
+            # Update summary in both stores
+            self._document_store.update_summary(item.collection, item.id, summary)
             self._store.update_summary(item.collection, item.id, summary)
 
             # Remove from queue
@@ -652,8 +1302,9 @@
             subprocess.Popen(cmd, **kwargs)
             return True
 
-        except Exception:
-            # Spawn failed -
+        except Exception as e:
+            # Spawn failed - log for debugging, queue will be processed later
+            logger.warning("Failed to spawn background processor: %s", e)
             return False
 
     def close(self) -> None:
@@ -662,11 +1313,12 @@
 
         Good practice to call when done, though Python's GC will clean up eventually.
         """
-        # Close embedding cache if it
-        if
-
-
-        cache
+        # Close embedding cache if it was loaded
+        if self._embedding_provider is not None:
+            if hasattr(self._embedding_provider, '_cache'):
+                cache = self._embedding_provider._cache
+                if hasattr(cache, 'close'):
+                    cache.close()
 
         # Close pending summary queue
         if hasattr(self, '_pending_queue'):
```