keep-skill 0.1.0-py3-none-any.whl → 0.3.0-py3-none-any.whl
- keep/__init__.py +3 -6
- keep/api.py +1052 -145
- keep/cli.py +705 -132
- keep/config.py +172 -41
- keep/context.py +1 -125
- keep/document_store.py +908 -0
- keep/errors.py +33 -0
- keep/indexing.py +1 -1
- keep/logging_config.py +34 -3
- keep/paths.py +81 -17
- keep/pending_summaries.py +52 -40
- keep/providers/embedding_cache.py +59 -46
- keep/providers/embeddings.py +43 -13
- keep/providers/mlx.py +23 -21
- keep/store.py +169 -25
- keep_skill-0.3.0.dist-info/METADATA +218 -0
- keep_skill-0.3.0.dist-info/RECORD +28 -0
- keep_skill-0.1.0.dist-info/METADATA +0 -290
- keep_skill-0.1.0.dist-info/RECORD +0 -26
- {keep_skill-0.1.0.dist-info → keep_skill-0.3.0.dist-info}/WHEEL +0 -0
- {keep_skill-0.1.0.dist-info → keep_skill-0.3.0.dist-info}/entry_points.txt +0 -0
- {keep_skill-0.1.0.dist-info → keep_skill-0.3.0.dist-info}/licenses/LICENSE +0 -0
keep/api.py
CHANGED
@@ -8,17 +8,104 @@ This is the minimal working implementation focused on:
     - get(): retrieve by ID
 """
 
+import hashlib
+import logging
 import re
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 from pathlib import Path
 from typing import Any, Optional
 
+logger = logging.getLogger(__name__)
+
+
+def _parse_since(since: str) -> str:
+    """
+    Parse a 'since' string and return a YYYY-MM-DD cutoff date.
+
+    Accepts:
+    - ISO 8601 duration: P3D (3 days), P1W (1 week), PT1H (1 hour), P1DT12H, etc.
+    - ISO date: 2026-01-15
+    - Date with slashes: 2026/01/15
+
+    Returns:
+        YYYY-MM-DD string for the cutoff date
+    """
+    since = since.strip()
+
+    # ISO 8601 duration: P[n]Y[n]M[n]W[n]DT[n]H[n]M[n]S
+    if since.upper().startswith("P"):
+        duration_str = since.upper()
+
+        # Parse duration components
+        years = months = weeks = days = hours = minutes = seconds = 0
+
+        # Split on T to separate date and time parts
+        if "T" in duration_str:
+            date_part, time_part = duration_str.split("T", 1)
+        else:
+            date_part = duration_str
+            time_part = ""
+
+        # Parse date part (P[n]Y[n]M[n]W[n]D)
+        date_part = date_part[1:]  # Remove leading P
+        for match in re.finditer(r"(\d+)([YMWD])", date_part):
+            value, unit = int(match.group(1)), match.group(2)
+            if unit == "Y":
+                years = value
+            elif unit == "M":
+                months = value
+            elif unit == "W":
+                weeks = value
+            elif unit == "D":
+                days = value
+
+        # Parse time part ([n]H[n]M[n]S)
+        for match in re.finditer(r"(\d+)([HMS])", time_part):
+            value, unit = int(match.group(1)), match.group(2)
+            if unit == "H":
+                hours = value
+            elif unit == "M":
+                minutes = value
+            elif unit == "S":
+                seconds = value
+
+        # Convert to timedelta (approximate months/years)
+        total_days = years * 365 + months * 30 + weeks * 7 + days
+        delta = timedelta(days=total_days, hours=hours, minutes=minutes, seconds=seconds)
+        cutoff = datetime.now(timezone.utc) - delta
+        return cutoff.strftime("%Y-%m-%d")
+
+    # Try parsing as date
+    # ISO format: 2026-01-15 or 2026-01-15T...
+    # Slash format: 2026/01/15
+    date_str = since.replace("/", "-").split("T")[0]
+
+    try:
+        parsed = datetime.strptime(date_str, "%Y-%m-%d")
+        return parsed.strftime("%Y-%m-%d")
+    except ValueError:
+        pass
+
+    raise ValueError(
+        f"Invalid 'since' format: {since}. "
+        "Use ISO duration (P3D, PT1H, P1W) or date (2026-01-15)"
+    )
+
+
+def _filter_by_date(items: list, since: str) -> list:
+    """Filter items to only those updated since the given date/duration."""
+    cutoff = _parse_since(since)
+    return [
+        item for item in items
+        if item.tags.get("_updated_date", "0000-00-00") >= cutoff
+    ]
+
 import os
 import subprocess
 import sys
 
-from .config import load_or_create_config, StoreConfig
-from .paths import get_default_store_path
+from .config import load_or_create_config, save_config, StoreConfig, EmbeddingIdentity
+from .paths import get_config_dir, get_default_store_path
 from .pending_summaries import PendingSummaryQueue
 from .providers import get_registry
 from .providers.base import (
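The `_parse_since` helper collapses both ISO 8601 durations and calendar dates into one YYYY-MM-DD cutoff string, which `_filter_by_date` then compares with a plain `>=` against each item's `_updated_date` tag; that works because zero-padded ISO dates sort lexicographically like dates. A standalone sketch that re-derives the same two branches (rather than importing the underscore-prefixed helper):

```python
import re
from datetime import datetime, timedelta, timezone

def parse_since(since: str) -> str:
    """Mirror of _parse_since: months/years are approximated as 30/365 days."""
    since = since.strip()
    if since.upper().startswith("P"):
        date_part, _, time_part = since.upper().partition("T")
        days = sum(int(v) * {"Y": 365, "M": 30, "W": 7, "D": 1}[u]
                   for v, u in re.findall(r"(\d+)([YMWD])", date_part[1:]))
        secs = sum(int(v) * {"H": 3600, "M": 60, "S": 1}[u]
                   for v, u in re.findall(r"(\d+)([HMS])", time_part))
        cutoff = datetime.now(timezone.utc) - timedelta(days=days, seconds=secs)
        return cutoff.strftime("%Y-%m-%d")
    # Dates: "2026/01/15" and "2026-01-15T10:00" both normalize to "2026-01-15"
    return datetime.strptime(since.replace("/", "-").split("T")[0],
                             "%Y-%m-%d").strftime("%Y-%m-%d")

print(parse_since("P1W"))         # the date one week before today
print(parse_since("2026/01/15"))  # "2026-01-15"
```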
@@ -27,8 +114,9 @@ from .providers.base import (
     SummarizationProvider,
 )
 from .providers.embedding_cache import CachingEmbeddingProvider
+from .document_store import VersionInfo
 from .store import ChromaStore
-from .types import Item, filter_non_system_tags
+from .types import Item, filter_non_system_tags, SYSTEM_TAG_PREFIX
 
 
 # Default max length for truncated placeholder summaries
@@ -41,6 +129,88 @@ MAX_SUMMARY_ATTEMPTS = 5
 # Collection name validation: lowercase ASCII and underscores only
 COLLECTION_NAME_PATTERN = re.compile(r"^[a-z][a-z0-9_]*$")
 
+# Environment variable prefix for auto-applied tags
+ENV_TAG_PREFIX = "KEEP_TAG_"
+
+# Fixed ID for the current working context (singleton)
+NOWDOC_ID = "_now:default"
+
+# Path to system documents
+SYSTEM_DOC_DIR = Path(__file__).parent.parent / "docs" / "system"
+
+
+def _load_frontmatter(path: Path) -> tuple[str, dict[str, str]]:
+    """
+    Load content and tags from a file with optional YAML frontmatter.
+
+    Args:
+        path: Path to the file
+
+    Returns:
+        (content, tags) tuple. Tags empty if no frontmatter.
+
+    Raises:
+        FileNotFoundError: If the file doesn't exist
+    """
+    text = path.read_text()
+
+    # Parse YAML frontmatter if present
+    if text.startswith("---"):
+        parts = text.split("---", 2)
+        if len(parts) >= 3:
+            import yaml
+            frontmatter = yaml.safe_load(parts[1])
+            content = parts[2].lstrip("\n")
+            if frontmatter:
+                tags = frontmatter.get("tags", {})
+                # Ensure all tag values are strings
+                tags = {k: str(v) for k, v in tags.items()}
+                return content, tags
+            return content, {}
+
+    return text, {}
+
+
+def _get_env_tags() -> dict[str, str]:
+    """
+    Collect tags from KEEP_TAG_* environment variables.
+
+    KEEP_TAG_PROJECT=foo -> {"project": "foo"}
+    KEEP_TAG_MyTag=bar -> {"mytag": "bar"}
+
+    Tag keys are lowercased for consistency.
+    """
+    tags = {}
+    for key, value in os.environ.items():
+        if key.startswith(ENV_TAG_PREFIX) and value:
+            tag_key = key[len(ENV_TAG_PREFIX):].lower()
+            tags[tag_key] = value
+    return tags
+
+
+def _content_hash(content: str) -> str:
+    """SHA256 hash of content for change detection."""
+    return hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+
+def _text_content_id(content: str) -> str:
+    """
+    Generate a content-addressed ID for text updates.
+
+    This makes text updates versioned by content:
+    - `keep update "my note"` → ID = _text:{hash[:12]}
+    - `keep update "my note" -t status=done` → same ID, new version
+    - `keep update "different note"` → different ID
+
+    Args:
+        content: The text content
+
+    Returns:
+        Content-addressed ID in format _text:{hash[:12]}
+    """
+    content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()[:12]
+    return f"_text:{content_hash}"
+
 
 class Keeper:
     """
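The two hashing/tagging helpers added here are pure functions, so their documented behavior is easy to check in isolation. A sketch mirroring `_get_env_tags` and `_text_content_id` (re-implemented inline rather than importing the private helpers):

```python
import hashlib
import os

# KEEP_TAG_* environment variables become tags, keys lowercased (per _get_env_tags)
os.environ["KEEP_TAG_PROJECT"] = "keep"
env_tags = {
    k[len("KEEP_TAG_"):].lower(): v
    for k, v in os.environ.items()
    if k.startswith("KEEP_TAG_") and v
}
print(env_tags)  # {'project': 'keep'}

# Content-addressed IDs (per _text_content_id): same text -> same ID, new version
def text_id(content: str) -> str:
    return "_text:" + hashlib.sha256(content.encode("utf-8")).hexdigest()[:12]

assert text_id("my note") == text_id("my note")
assert text_id("my note") != text_id("different note")
print(text_id("my note"))
```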
@@ -60,20 +230,15 @@ class Keeper:
     ) -> None:
         """
         Initialize or open an existing associative memory store.
-
+
         Args:
             store_path: Path to store directory. Uses default if not specified.
+                Overrides any store.path setting in config.
             collection: Default collection name.
             decay_half_life_days: Memory decay half-life in days (ACT-R model).
                 After this many days, an item's effective relevance is halved.
                 Set to 0 or negative to disable decay.
         """
-        # Resolve store path
-        if store_path is None:
-            self._store_path = get_default_store_path()
-        else:
-            self._store_path = Path(store_path).resolve()
-
         # Validate collection name
         if not COLLECTION_NAME_PATTERN.match(collection):
             raise ValueError(
@@ -82,43 +247,161 @@ class Keeper:
         )
         self._default_collection = collection
         self._decay_half_life_days = decay_half_life_days
-
+
+        # Resolve config and store paths
+        # If store_path is explicitly provided, use it as both config and store location
+        # Otherwise, discover config via tree-walk and let config determine store
+        if store_path is not None:
+            self._store_path = Path(store_path).resolve()
+            config_dir = self._store_path
+        else:
+            # Discover config directory (tree-walk or envvar)
+            config_dir = get_config_dir()
+
         # Load or create configuration
-        self._config: StoreConfig = load_or_create_config(
-
-        #
+        self._config: StoreConfig = load_or_create_config(config_dir)
+
+        # If store_path wasn't explicit, resolve from config
+        if store_path is None:
+            self._store_path = get_default_store_path(self._config)
+
+        # Initialize document provider (needed for most operations)
         registry = get_registry()
-
         self._document_provider: DocumentProvider = registry.create_document(
             self._config.document.name,
             self._config.document.params,
         )
-
-        #
-
-
-            self._config.embedding.params,
-        )
-        cache_path = self._store_path / "embedding_cache.db"
-        self._embedding_provider: EmbeddingProvider = CachingEmbeddingProvider(
-            base_embedding_provider,
-            cache_path=cache_path,
-        )
-
-        self._summarization_provider: SummarizationProvider = registry.create_summarization(
-            self._config.summarization.name,
-            self._config.summarization.params,
-        )
+
+        # Lazy-loaded providers (created on first use to avoid network access for read-only ops)
+        self._embedding_provider: Optional[EmbeddingProvider] = None
+        self._summarization_provider: Optional[SummarizationProvider] = None
 
         # Initialize pending summary queue
         queue_path = self._store_path / "pending_summaries.db"
         self._pending_queue = PendingSummaryQueue(queue_path)
 
-        # Initialize store
+        # Initialize document store (canonical records)
+        from .document_store import DocumentStore
+        doc_store_path = self._store_path / "documents.db"
+        self._document_store = DocumentStore(doc_store_path)
+
+        # Initialize ChromaDB store (embedding index)
+        # Use dimension from stored identity if available (allows offline read-only access)
+        embedding_dim = None
+        if self._config.embedding_identity:
+            embedding_dim = self._config.embedding_identity.dimension
         self._store = ChromaStore(
             self._store_path,
-            embedding_dimension=
+            embedding_dimension=embedding_dim,
         )
+
+        # Preload system documents (only if not already present)
+        self._ensure_system_documents()
+
+    def _ensure_system_documents(self) -> None:
+        """
+        Ensure system documents are loaded into the store.
+
+        Scans all .md files in docs/system/. Each file is indexed with its
+        file:// URI as the ID and `_category: system` tag for identification.
+        Content becomes the summary directly (no auto-summarization).
+
+        Called during init. Only loads docs that don't already exist,
+        so user modifications are preserved and no network access occurs
+        if docs are already present.
+        """
+        for path in SYSTEM_DOC_DIR.glob("*.md"):
+            try:
+                uri = f"file://{path.resolve()}"
+                if not self.exists(uri):
+                    content, tags = _load_frontmatter(path)
+                    tags["category"] = "system"
+                    self.remember(content, id=uri, tags=tags)
+            except FileNotFoundError:
+                # System file missing - skip silently
+                pass
+
+    def _get_embedding_provider(self) -> EmbeddingProvider:
+        """
+        Get embedding provider, creating it lazily on first use.
+
+        This allows read-only operations to work offline without loading
+        the embedding model (which may try to reach HuggingFace).
+        """
+        if self._embedding_provider is None:
+            registry = get_registry()
+            base_provider = registry.create_embedding(
+                self._config.embedding.name,
+                self._config.embedding.params,
+            )
+            cache_path = self._store_path / "embedding_cache.db"
+            self._embedding_provider = CachingEmbeddingProvider(
+                base_provider,
+                cache_path=cache_path,
+            )
+            # Validate or record embedding identity
+            self._validate_embedding_identity(self._embedding_provider)
+            # Update store's embedding dimension if it wasn't known at init
+            if self._store._embedding_dimension is None:
+                self._store._embedding_dimension = self._embedding_provider.dimension
+        return self._embedding_provider
+
+    def _get_summarization_provider(self) -> SummarizationProvider:
+        """
+        Get summarization provider, creating it lazily on first use.
+        """
+        if self._summarization_provider is None:
+            registry = get_registry()
+            self._summarization_provider = registry.create_summarization(
+                self._config.summarization.name,
+                self._config.summarization.params,
+            )
+        return self._summarization_provider
+
+    def _validate_embedding_identity(self, provider: EmbeddingProvider) -> None:
+        """
+        Validate embedding provider matches stored identity, or record it.
+
+        On first use, records the embedding identity to config.
+        On subsequent uses, validates that the current provider matches.
+
+        Raises:
+            ValueError: If embedding provider changed incompatibly
+        """
+        # Get current provider's identity
+        current = EmbeddingIdentity(
+            provider=self._config.embedding.name,
+            model=getattr(provider, "model_name", "unknown"),
+            dimension=provider.dimension,
+        )
+
+        stored = self._config.embedding_identity
+
+        if stored is None:
+            # First use: record the identity
+            self._config.embedding_identity = current
+            save_config(self._config)
+        else:
+            # Validate compatibility
+            if (stored.provider != current.provider or
+                    stored.model != current.model or
+                    stored.dimension != current.dimension):
+                raise ValueError(
+                    f"Embedding provider mismatch!\n"
+                    f"  Stored: {stored.provider}/{stored.model} ({stored.dimension}d)\n"
+                    f"  Current: {current.provider}/{current.model} ({current.dimension}d)\n"
+                    f"\n"
+                    f"Changing embedding providers invalidates existing embeddings.\n"
+                    f"Options:\n"
+                    f"  1. Use the original provider\n"
+                    f"  2. Delete .keep/ and re-index\n"
+                    f"  3. (Future) Run migration to re-embed with new provider"
+                )
+
+    @property
+    def embedding_identity(self) -> EmbeddingIdentity | None:
+        """Current embedding identity (provider, model, dimension)."""
+        return self._config.embedding_identity
 
     def _resolve_collection(self, collection: Optional[str]) -> str:
         """Resolve collection name, validating if provided."""
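Moving provider construction into `_get_embedding_provider()` is what lets read-only operations run offline, and the identity check turns a silent embedding-model swap into a hard error rather than a store of mixed, incomparable vectors. A minimal sketch of that invariant, assuming `EmbeddingIdentity` is a plain provider/model/dimension record as the constructor call above suggests (its real definition lives in keep/config.py):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class EmbeddingIdentity:          # assumed shape, mirroring the fields used above
    provider: str
    model: str
    dimension: int

def check(stored: Optional[EmbeddingIdentity],
          current: EmbeddingIdentity) -> EmbeddingIdentity:
    # First use: pin the identity. Later uses: any field mismatch is fatal,
    # because vectors from different models/dimensions are not comparable.
    if stored is None:
        return current
    if stored != current:
        raise ValueError(f"Embedding provider mismatch: {stored} vs {current}")
    return stored

pinned = check(None, EmbeddingIdentity("mlx", "minilm", 384))   # recorded
check(pinned, EmbeddingIdentity("mlx", "minilm", 384))          # ok
# check(pinned, EmbeddingIdentity("mlx", "other", 768))         # would raise
```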
@@ -135,8 +418,10 @@ class Keeper:
     def update(
         self,
         id: str,
-
+        tags: Optional[dict[str, str]] = None,
         *,
+        summary: Optional[str] = None,
+        source_tags: Optional[dict[str, str]] = None,  # Deprecated alias
         collection: Optional[str] = None,
         lazy: bool = False
     ) -> Item:
@@ -146,84 +431,170 @@ class Keeper:
         Fetches the document, generates embeddings and summary, then stores it.
 
         **Update behavior:**
-        - Summary:
-        - Tags: Merged - existing
+        - Summary: Replaced with user-provided or newly generated summary
+        - Tags: Merged - existing tags are preserved, new tags override
           on key collision. System tags (prefixed with _) are always managed by
           the system.
 
         Args:
             id: URI of document to fetch and index
-
+            tags: User-provided tags to merge with existing tags
+            summary: User-provided summary (skips auto-summarization if given)
+            source_tags: Deprecated alias for 'tags'
             collection: Target collection (uses default if None)
             lazy: If True, use truncated placeholder summary and queue for
                 background processing. Use `process_pending()` to generate
-                real summaries later.
+                real summaries later. Ignored if summary is provided.
 
         Returns:
             The stored Item with merged tags and new summary
         """
+        # Handle deprecated source_tags parameter
+        if source_tags is not None:
+            import warnings
+            warnings.warn(
+                "source_tags is deprecated, use 'tags' instead",
+                DeprecationWarning,
+                stacklevel=2
+            )
+            if tags is None:
+                tags = source_tags
+
         coll = self._resolve_collection(collection)
 
-        # Get existing item to preserve tags
+        # Get existing item to preserve tags (check document store first, fall back to ChromaDB)
         existing_tags = {}
-
-        if
-
-
+        existing_doc = self._document_store.get(coll, id)
+        if existing_doc:
+            existing_tags = filter_non_system_tags(existing_doc.tags)
+        else:
+            # Fall back to ChromaDB for legacy data
+            existing = self._store.get(coll, id)
+            if existing:
+                existing_tags = filter_non_system_tags(existing.tags)
 
         # Fetch document
         doc = self._document_provider.fetch(id)
 
+        # Compute content hash for change detection
+        new_hash = _content_hash(doc.content)
+
         # Generate embedding
-        embedding = self.
+        embedding = self._get_embedding_provider().embed(doc.content)
 
-        #
-
-
-
-
+        # Determine summary - skip if content unchanged
+        max_len = self._config.max_summary_length
+        content_unchanged = (
+            existing_doc is not None
+            and existing_doc.content_hash == new_hash
+        )
+
+        if content_unchanged and summary is None:
+            # Content unchanged - preserve existing summary
+            logger.debug("Content unchanged, skipping summarization for %s", id)
+            final_summary = existing_doc.summary
+        elif summary is not None:
+            # User-provided summary - validate length
+            if len(summary) > max_len:
+                import warnings
+                warnings.warn(
+                    f"Summary exceeds max_summary_length ({len(summary)} > {max_len}), truncating",
+                    UserWarning,
+                    stacklevel=2
+                )
+                summary = summary[:max_len]
+            final_summary = summary
+        elif lazy:
+            # Truncated placeholder for lazy mode
+            if len(doc.content) > max_len:
+                final_summary = doc.content[:max_len] + "..."
             else:
-
+                final_summary = doc.content
             # Queue for background processing
            self._pending_queue.enqueue(id, coll, doc.content)
        else:
-
+            # Auto-generate summary
+            final_summary = self._get_summarization_provider().summarize(doc.content)
+
+        # Build tags: existing → config → env → user (later wins on collision)
+        merged_tags = {**existing_tags}
 
-        #
-
+        # Merge config default tags
+        if self._config.default_tags:
+            merged_tags.update(self._config.default_tags)
 
-        # Merge
-
-
+        # Merge environment variable tags
+        env_tags = _get_env_tags()
+        merged_tags.update(env_tags)
+
+        # Merge in user-provided tags (filtered to prevent system tag override)
+        if tags:
+            merged_tags.update(filter_non_system_tags(tags))
 
         # Add system tags
-
+        merged_tags["_source"] = "uri"
         if doc.content_type:
-
+            merged_tags["_content_type"] = doc.content_type
 
-        #
+        # Get existing doc info for versioning before upsert
+        old_doc = self._document_store.get(coll, id)
+
+        # Dual-write: document store (canonical) + ChromaDB (embedding index)
+        # DocumentStore.upsert now returns (record, content_changed) and archives old version
+        doc_record, content_changed = self._document_store.upsert(
+            collection=coll,
+            id=id,
+            summary=final_summary,
+            tags=merged_tags,
+            content_hash=new_hash,
+        )
+
+        # Store embedding for current version
         self._store.upsert(
             collection=coll,
             id=id,
             embedding=embedding,
-            summary=
-            tags=
+            summary=final_summary,
+            tags=merged_tags,
         )
 
-        #
-        if
+        # If content changed and we archived a version, also store versioned embedding
+        # Skip if content hash is same (only tags/summary changed)
+        if old_doc is not None and content_changed:
+            # Get the version number that was just archived
+            version_count = self._document_store.version_count(coll, id)
+            if version_count > 0:
+                # Re-embed the old content for the archived version
+                old_embedding = self._get_embedding_provider().embed(old_doc.summary)
+                self._store.upsert_version(
+                    collection=coll,
+                    id=id,
+                    version=version_count,
+                    embedding=old_embedding,
+                    summary=old_doc.summary,
+                    tags=old_doc.tags,
+                )
+
+        # Spawn background processor if lazy (only if summary wasn't user-provided and content changed)
+        if lazy and summary is None and not content_unchanged:
             self._spawn_processor()
 
         # Return the stored item
-
-        return
+        doc_record = self._document_store.get(coll, id)
+        return Item(
+            id=doc_record.id,
+            summary=doc_record.summary,
+            tags=doc_record.tags,
+        )
 
     def remember(
         self,
         content: str,
         *,
         id: Optional[str] = None,
-
+        summary: Optional[str] = None,
+        tags: Optional[dict[str, str]] = None,
+        source_tags: Optional[dict[str, str]] = None,  # Deprecated alias
         collection: Optional[str] = None,
         lazy: bool = False
     ) -> Item:
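The part of `update()` worth internalizing is the merge order: existing tags form the base, config defaults and `KEEP_TAG_*` environment tags layer on top, explicit user tags win last, and `_`-prefixed system tags are stripped from user input before the system sets its own. A plain dict sketch of that precedence (the tag values here are made up):

```python
# Later dicts win on key collision, mirroring update()'s merge order
existing = {"project": "keep", "status": "draft"}
config_defaults = {"team": "infra"}
env_tags = {"project": "keep-skill"}               # e.g. from KEEP_TAG_PROJECT
user_tags = {"status": "done", "_source": "hack"}  # system keys must not pass through

non_system = {k: v for k, v in user_tags.items() if not k.startswith("_")}
merged = {**existing, **config_defaults, **env_tags, **non_system}
merged["_source"] = "uri"  # system tags are set by the system afterwards

print(merged)
# {'project': 'keep-skill', 'status': 'done', 'team': 'infra', '_source': 'uri'}
```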
@@ -232,24 +603,42 @@ class Keeper:
 
         Use for conversation snippets, notes, insights.
 
+        **Smart summary behavior:**
+        - If summary is provided, use it (skips auto-summarization)
+        - If content is short (≤ max_summary_length), use content verbatim
+        - Otherwise, generate summary via summarization provider
+
         **Update behavior (when id already exists):**
-        - Summary: Replaced with
-        - Tags: Merged - existing
+        - Summary: Replaced with user-provided, content, or generated summary
+        - Tags: Merged - existing tags preserved, new tags override
           on key collision. System tags (prefixed with _) are always managed by
           the system.
 
         Args:
             content: Text to store and index
             id: Optional custom ID (auto-generated if None)
-
+            summary: User-provided summary (skips auto-summarization if given)
+            tags: User-provided tags to merge with existing tags
+            source_tags: Deprecated alias for 'tags'
             collection: Target collection (uses default if None)
-            lazy: If True, use truncated placeholder summary
-                background processing.
-
+            lazy: If True and content is long, use truncated placeholder summary
+                and queue for background processing. Ignored if content is
+                short or summary is provided.
 
         Returns:
             The stored Item with merged tags and new summary
         """
+        # Handle deprecated source_tags parameter
+        if source_tags is not None:
+            import warnings
+            warnings.warn(
+                "source_tags is deprecated, use 'tags' instead",
+                DeprecationWarning,
+                stacklevel=2
+            )
+            if tags is None:
+                tags = source_tags
+
         coll = self._resolve_collection(collection)
 
         # Generate ID if not provided
@@ -257,54 +646,124 @@ class Keeper:
             timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")
             id = f"mem:{timestamp}"
 
-        # Get existing item to preserve tags
+        # Get existing item to preserve tags (check document store first, fall back to ChromaDB)
         existing_tags = {}
-
-        if
-
-
+        existing_doc = self._document_store.get(coll, id)
+        if existing_doc:
+            existing_tags = filter_non_system_tags(existing_doc.tags)
+        else:
+            existing = self._store.get(coll, id)
+            if existing:
+                existing_tags = filter_non_system_tags(existing.tags)
+
+        # Compute content hash for change detection
+        new_hash = _content_hash(content)
 
         # Generate embedding
-        embedding = self.
+        embedding = self._get_embedding_provider().embed(content)
 
-        #
-
-
-
-
-
-
+        # Determine summary (smart behavior for remember) - skip if content unchanged
+        max_len = self._config.max_summary_length
+        content_unchanged = (
+            existing_doc is not None
+            and existing_doc.content_hash == new_hash
+        )
+
+        if content_unchanged and summary is None:
+            # Content unchanged - preserve existing summary
+            logger.debug("Content unchanged, skipping summarization for %s", id)
+            final_summary = existing_doc.summary
+        elif summary is not None:
+            # User-provided summary - validate length
+            if len(summary) > max_len:
+                import warnings
+                warnings.warn(
+                    f"Summary exceeds max_summary_length ({len(summary)} > {max_len}), truncating",
+                    UserWarning,
+                    stacklevel=2
+                )
+                summary = summary[:max_len]
+            final_summary = summary
+        elif len(content) <= max_len:
+            # Content is short enough - use verbatim (smart summary)
+            final_summary = content
+        elif lazy:
+            # Content is long and lazy mode - truncated placeholder
+            final_summary = content[:max_len] + "..."
             # Queue for background processing
             self._pending_queue.enqueue(id, coll, content)
         else:
-
+            # Content is long - generate summary
+            final_summary = self._get_summarization_provider().summarize(content)
+
+        # Build tags: existing → config → env → user (later wins on collision)
+        merged_tags = {**existing_tags}
 
-        #
-
+        # Merge config default tags
+        if self._config.default_tags:
+            merged_tags.update(self._config.default_tags)
 
-        # Merge
-
-
+        # Merge environment variable tags
+        env_tags = _get_env_tags()
+        merged_tags.update(env_tags)
+
+        # Merge in user-provided tags (filtered)
+        if tags:
+            merged_tags.update(filter_non_system_tags(tags))
 
         # Add system tags
-
+        merged_tags["_source"] = "inline"
 
-        #
+        # Get existing doc info for versioning before upsert
+        old_doc = self._document_store.get(coll, id)
+
+        # Dual-write: document store (canonical) + ChromaDB (embedding index)
+        # DocumentStore.upsert now returns (record, content_changed) and archives old version
+        doc_record, content_changed = self._document_store.upsert(
+            collection=coll,
+            id=id,
+            summary=final_summary,
+            tags=merged_tags,
+            content_hash=new_hash,
+        )
+
+        # Store embedding for current version
         self._store.upsert(
             collection=coll,
             id=id,
             embedding=embedding,
-            summary=
-            tags=
+            summary=final_summary,
+            tags=merged_tags,
         )
 
-        #
-        if
+        # If content changed and we archived a version, also store versioned embedding
+        # Skip if content hash is same (only tags/summary changed)
+        if old_doc is not None and content_changed:
+            # Get the version number that was just archived
+            version_count = self._document_store.version_count(coll, id)
+            if version_count > 0:
+                # Re-embed the old content for the archived version
+                old_embedding = self._get_embedding_provider().embed(old_doc.summary)
+                self._store.upsert_version(
+                    collection=coll,
+                    id=id,
+                    version=version_count,
+                    embedding=old_embedding,
+                    summary=old_doc.summary,
+                    tags=old_doc.tags,
+                )
+
+        # Spawn background processor if lazy and content was queued (only if content changed)
+        if lazy and summary is None and len(content) > max_len and not content_unchanged:
             self._spawn_processor()
 
         # Return the stored item
-
-        return
+        doc_record = self._document_store.get(coll, id)
+        return Item(
+            id=doc_record.id,
+            summary=doc_record.summary,
+            tags=doc_record.tags,
+        )
 
     # -------------------------------------------------------------------------
     # Query Operations
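`remember()` picks its summary source with a short branch ladder, and the order matters: unchanged content short-circuits everything, an explicit summary wins next (capped at `max_summary_length`), short content is stored verbatim, and long content is either queued lazily or summarized immediately. A condensed sketch of just that ladder (sentinel return strings stand in for the provider calls):

```python
def choose_summary(content, existing_hash, new_hash, summary, max_len, lazy):
    """Condensed version of remember()'s summary branch ladder (order matters)."""
    if existing_hash == new_hash and summary is None:
        return "keep-existing"                # skip summarization entirely
    if summary is not None:
        return summary[:max_len]              # user summary, capped
    if len(content) <= max_len:
        return content                        # short content verbatim
    if lazy:
        return content[:max_len] + "..."      # placeholder, queued for later
    return "call-summarizer"                  # long content, summarize now

print(choose_summary("short note", None, "abc", None, 120, lazy=False))  # verbatim
print(choose_summary("x" * 500, None, "abc", None, 120, lazy=True))      # placeholder
```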
@@ -361,27 +820,40 @@ class Keeper:
         query: str,
         *,
         limit: int = 10,
+        since: Optional[str] = None,
         collection: Optional[str] = None
     ) -> list[Item]:
         """
         Find items using semantic similarity search.
-
+
         Scores are adjusted by recency decay (ACT-R model) - older items
         have reduced effective relevance unless recently accessed.
+
+        Args:
+            query: Search query text
+            limit: Maximum results to return
+            since: Only include items updated since (ISO duration like P3D, or date)
+            collection: Target collection
         """
         coll = self._resolve_collection(collection)
-
+
         # Embed query
-        embedding = self.
-
-        # Search (fetch extra to account for re-ranking)
+        embedding = self._get_embedding_provider().embed(query)
+
+        # Search (fetch extra to account for re-ranking and date filtering)
         fetch_limit = limit * 2 if self._decay_half_life_days > 0 else limit
+        if since is not None:
+            fetch_limit = max(fetch_limit, limit * 3)  # Fetch more when filtering
         results = self._store.query_embedding(coll, embedding, limit=fetch_limit)
-
+
         # Convert to Items and apply decay
         items = [r.to_item() for r in results]
         items = self._apply_recency_decay(items)
-
+
+        # Apply date filter if specified
+        if since is not None:
+            items = _filter_by_date(items, since)
+
         return items[:limit]
 
     def find_similar(
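With `since` threaded through the query methods, date filtering happens after the vector search (a later hunk notes ChromaDB's `$gte` can't compare string dates), so the code over-fetches by 3x and trims back to `limit`. Usage would look like this, assuming keep is installed and a store has already been initialized and populated:

```python
from keep.api import Keeper

keeper = Keeper()
# Semantic search restricted to the last three days, then to a fixed date
recent = keeper.query("embedding cache", limit=5, since="P3D")
this_year = keeper.query("embedding cache", limit=5, since="2026-01-01")
for item in recent:
    print(item.id, item.tags.get("_updated_date"))
```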
@@ -389,32 +861,46 @@
         id: str,
         *,
         limit: int = 10,
+        since: Optional[str] = None,
         include_self: bool = False,
         collection: Optional[str] = None
     ) -> list[Item]:
         """
         Find items similar to an existing item.
+
+        Args:
+            id: ID of item to find similar items for
+            limit: Maximum results to return
+            since: Only include items updated since (ISO duration like P3D, or date)
+            include_self: Include the queried item in results
+            collection: Target collection
         """
         coll = self._resolve_collection(collection)
-
+
         # Get the item to find its embedding
         item = self._store.get(coll, id)
         if item is None:
             raise KeyError(f"Item not found: {id}")
-
-        # Search using the summary's embedding
-        embedding = self.
+
+        # Search using the summary's embedding (fetch extra when filtering)
+        embedding = self._get_embedding_provider().embed(item.summary)
         actual_limit = limit + 1 if not include_self else limit
+        if since is not None:
+            actual_limit = max(actual_limit, limit * 3)
         results = self._store.query_embedding(coll, embedding, limit=actual_limit)
-
+
         # Filter self if needed
         if not include_self:
             results = [r for r in results if r.id != id]
-
+
         # Convert to Items and apply decay
         items = [r.to_item() for r in results]
         items = self._apply_recency_decay(items)
-
+
+        # Apply date filter if specified
+        if since is not None:
+            items = _filter_by_date(items, since)
+
         return items[:limit]
 
     def query_fulltext(
@@ -422,14 +908,30 @@
         query: str,
         *,
         limit: int = 10,
+        since: Optional[str] = None,
         collection: Optional[str] = None
     ) -> list[Item]:
         """
         Search item summaries using full-text search.
+
+        Args:
+            query: Text to search for in summaries
+            limit: Maximum results to return
+            since: Only include items updated since (ISO duration like P3D, or date)
+            collection: Target collection
         """
         coll = self._resolve_collection(collection)
-
-
+
+        # Fetch extra when filtering by date
+        fetch_limit = limit * 3 if since is not None else limit
+        results = self._store.query_fulltext(coll, query, limit=fetch_limit)
+        items = [r.to_item() for r in results]
+
+        # Apply date filter if specified
+        if since is not None:
+            items = _filter_by_date(items, since)
+
+        return items[:limit]
 
     def query_tag(
         self,
@@ -437,6 +939,7 @@
         value: Optional[str] = None,
         *,
         limit: int = 100,
+        since: Optional[str] = None,
         collection: Optional[str] = None,
         **tags: str
     ) -> list[Item]:
@@ -444,21 +947,39 @@
         Find items by tag(s).
 
         Usage:
-            #
+            # Key only: find all docs with this tag key (any value)
+            query_tag("project")
+
+            # Key with value: find docs with specific tag value
             query_tag("project", "myapp")
-            query_tag("tradition", "buddhist")
 
-            #
+            # Multiple tags via kwargs
             query_tag(tradition="buddhist", source="mn22")
+
+        Args:
+            key: Tag key to search for
+            value: Tag value (optional, any value if not provided)
+            limit: Maximum results to return
+            since: Only include items updated since (ISO duration like P3D, or date)
+            collection: Target collection
+            **tags: Additional tag filters as keyword arguments
         """
         coll = self._resolve_collection(collection)
 
+        # Key-only query: find docs that have this tag key (any value)
+        # Uses DocumentStore which supports efficient SQL date filtering
+        if key is not None and value is None and not tags:
+            # Convert since to cutoff date for SQL query
+            since_date = _parse_since(since) if since else None
+            docs = self._document_store.query_by_tag_key(
+                coll, key, limit=limit, since_date=since_date
+            )
+            return [Item(id=d.id, summary=d.summary, tags=d.tags) for d in docs]
+
         # Build tag filter from positional or keyword args
         tag_filter = {}
 
-        if key is not None:
-            if value is None:
-                raise ValueError(f"Value required when querying by key '{key}'")
+        if key is not None and value is not None:
             tag_filter[key] = value
 
         if tags:
@@ -467,11 +988,50 @@
         if not tag_filter:
             raise ValueError("At least one tag must be specified")
 
-        # Build where clause
-
+        # Build where clause for tag filters only
+        # (ChromaDB $gte doesn't support string dates, so date filtering is done post-query)
+        where_conditions = [{k: v} for k, v in tag_filter.items()]
+
+        # Use $and if multiple conditions, otherwise single condition
+        if len(where_conditions) == 1:
+            where = where_conditions[0]
+        else:
+            where = {"$and": where_conditions}
+
+        # Fetch extra when filtering by date
+        fetch_limit = limit * 3 if since is not None else limit
+        results = self._store.query_metadata(coll, where, limit=fetch_limit)
+        items = [r.to_item() for r in results]
+
+        # Apply date filter if specified (post-filter)
+        if since is not None:
+            items = _filter_by_date(items, since)
 
-
-
+        return items[:limit]
+
+    def list_tags(
+        self,
+        key: Optional[str] = None,
+        *,
+        collection: Optional[str] = None,
+    ) -> list[str]:
+        """
+        List distinct tag keys or values.
+
+        Args:
+            key: If provided, list distinct values for this key.
+                If None, list distinct tag keys.
+            collection: Target collection
+
+        Returns:
+            Sorted list of distinct keys or values
+        """
+        coll = self._resolve_collection(collection)
+
+        if key is None:
+            return self._document_store.list_distinct_tag_keys(coll)
+        else:
+            return self._document_store.list_distinct_tag_values(coll, key)
 
     # -------------------------------------------------------------------------
     # Direct Access
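`list_tags` supplies the discovery side of the tag API: distinct keys when called bare, distinct values when given a key, both served from the DocumentStore rather than ChromaDB. A usage sketch, assuming an initialized store with some tagged items:

```python
from keep.api import Keeper

keeper = Keeper()
keys = keeper.list_tags()             # distinct tag keys in the default collection
values = keeper.list_tags("project")  # distinct values recorded for one key
print(keys)
print(values)
```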
@@ -480,29 +1040,273 @@
     def get(self, id: str, *, collection: Optional[str] = None) -> Optional[Item]:
         """
         Retrieve a specific item by ID.
+
+        Reads from document store (canonical), falls back to ChromaDB for legacy data.
         """
         coll = self._resolve_collection(collection)
+
+        # Try document store first (canonical)
+        doc_record = self._document_store.get(coll, id)
+        if doc_record:
+            return Item(
+                id=doc_record.id,
+                summary=doc_record.summary,
+                tags=doc_record.tags,
+            )
+
+        # Fall back to ChromaDB for legacy data
         result = self._store.get(coll, id)
         if result is None:
             return None
         return result.to_item()
-
+
+    def get_version(
+        self,
+        id: str,
+        offset: int = 0,
+        *,
+        collection: Optional[str] = None,
+    ) -> Optional[Item]:
+        """
+        Get a specific version of a document by offset.
+
+        Offset semantics:
+        - 0 = current version
+        - 1 = previous version
+        - 2 = two versions ago
+        - etc.
+
+        Args:
+            id: Document identifier
+            offset: Version offset (0=current, 1=previous, etc.)
+            collection: Target collection
+
+        Returns:
+            Item if found, None if version doesn't exist
+        """
+        coll = self._resolve_collection(collection)
+
+        if offset == 0:
+            # Current version
+            return self.get(id, collection=collection)
+
+        # Get archived version
+        version_info = self._document_store.get_version(coll, id, offset)
+        if version_info is None:
+            return None
+
+        return Item(
+            id=id,
+            summary=version_info.summary,
+            tags=version_info.tags,
+        )
+
+    def list_versions(
+        self,
+        id: str,
+        limit: int = 10,
+        *,
+        collection: Optional[str] = None,
+    ) -> list[VersionInfo]:
+        """
+        List version history for a document.
+
+        Returns versions in reverse chronological order (newest archived first).
+        Does not include the current version.
+
+        Args:
+            id: Document identifier
+            limit: Maximum versions to return
+            collection: Target collection
+
+        Returns:
+            List of VersionInfo, newest archived first
+        """
+        coll = self._resolve_collection(collection)
+        return self._document_store.list_versions(coll, id, limit)
+
+    def get_version_nav(
+        self,
+        id: str,
+        current_version: Optional[int] = None,
+        limit: int = 3,
+        *,
+        collection: Optional[str] = None,
+    ) -> dict[str, list[VersionInfo]]:
+        """
+        Get version navigation info (prev/next) for display.
+
+        Args:
+            id: Document identifier
+            current_version: The version being viewed (None = current/live version)
+            limit: Max previous versions to return when viewing current
+            collection: Target collection
+
+        Returns:
+            Dict with 'prev' and optionally 'next' lists of VersionInfo.
+        """
+        coll = self._resolve_collection(collection)
+        return self._document_store.get_version_nav(coll, id, current_version, limit)
+
     def exists(self, id: str, *, collection: Optional[str] = None) -> bool:
         """
         Check if an item exists in the store.
         """
         coll = self._resolve_collection(collection)
-
+        # Check document store first, then ChromaDB
+        return self._document_store.exists(coll, id) or self._store.exists(coll, id)
 
-    def delete(
+    def delete(
+        self,
+        id: str,
+        *,
+        collection: Optional[str] = None,
+        delete_versions: bool = True,
+    ) -> bool:
         """
-        Delete an item from
-
-
+        Delete an item from both stores.
+
+        Args:
+            id: Document identifier
+            collection: Target collection
+            delete_versions: If True, also delete version history
+
+        Returns:
+            True if item existed and was deleted.
         """
         coll = self._resolve_collection(collection)
-
-
+        # Delete from both stores (including versions)
+        doc_deleted = self._document_store.delete(coll, id, delete_versions=delete_versions)
+        chroma_deleted = self._store.delete(coll, id, delete_versions=delete_versions)
+        return doc_deleted or chroma_deleted
+
+    # -------------------------------------------------------------------------
+    # Current Working Context (Now)
+    # -------------------------------------------------------------------------
+
+    def get_now(self) -> Item:
+        """
+        Get the current working context.
+
+        A singleton document representing what you're currently working on.
+        If it doesn't exist, creates one with default content and tags from
+        docs/system/now.md.
+
+        Returns:
+            The current context Item (never None - auto-creates if missing)
+        """
+        item = self.get(NOWDOC_ID)
+        if item is None:
+            # First-time initialization with default content and tags
+            try:
+                default_content, default_tags = _load_frontmatter(SYSTEM_DOC_DIR / "now.md")
+            except FileNotFoundError:
+                # Fallback if system file is missing
+                default_content = "# Now\n\nYour working context."
+                default_tags = {}
+            item = self.set_now(default_content, tags=default_tags)
+        return item
+
+    def set_now(
+        self,
+        content: str,
+        *,
+        tags: Optional[dict[str, str]] = None,
+    ) -> Item:
+        """
+        Set the current working context.
+
+        Updates the singleton context with new content. Uses remember()
+        internally with the fixed NOWDOC_ID.
+
+        Args:
+            content: New content for the current context
+            tags: Optional additional tags to apply
+
+        Returns:
+            The updated context Item
+        """
+        return self.remember(content, id=NOWDOC_ID, tags=tags)
+
+    def list_system_documents(
+        self,
+        *,
+        collection: Optional[str] = None,
+    ) -> list[Item]:
+        """
+        List all system documents.
+
+        System documents are identified by the `category: system` tag.
+        These are preloaded on init and provide foundational content.
+
+        Args:
+            collection: Target collection (default: default collection)
+
+        Returns:
+            List of system document Items
+        """
+        return self.query_tag("category", "system", collection=collection)
+
+    def tag(
+        self,
+        id: str,
+        tags: Optional[dict[str, str]] = None,
+        *,
+        collection: Optional[str] = None,
+    ) -> Optional[Item]:
+        """
+        Update tags on an existing document without re-processing.
+
+        Does NOT re-fetch, re-embed, or re-summarize. Only updates tags.
+
+        Tag behavior:
+        - Provided tags are merged with existing user tags
+        - Empty string value ("") deletes that tag
+        - System tags (_prefixed) cannot be modified via this method
+
+        Args:
+            id: Document identifier
+            tags: Tags to add/update/delete (empty string = delete)
+            collection: Target collection
+
+        Returns:
+            Updated Item if found, None if document doesn't exist
+        """
+        coll = self._resolve_collection(collection)
+
+        # Get existing item (prefer document store, fall back to ChromaDB)
+        existing = self.get(id, collection=collection)
+        if existing is None:
+            return None
+
+        # Start with existing tags, separate system from user
+        current_tags = dict(existing.tags)
+        system_tags = {k: v for k, v in current_tags.items()
+                       if k.startswith(SYSTEM_TAG_PREFIX)}
+        user_tags = {k: v for k, v in current_tags.items()
+                     if not k.startswith(SYSTEM_TAG_PREFIX)}
+
+        # Apply tag changes (filter out system tags from input)
+        if tags:
+            for key, value in tags.items():
+                if key.startswith(SYSTEM_TAG_PREFIX):
+                    continue  # Cannot modify system tags
+                if value == "":
+                    # Empty string = delete
+                    user_tags.pop(key, None)
+                else:
+                    user_tags[key] = value
+
+        # Merge back: user tags + system tags
+        final_tags = {**user_tags, **system_tags}
+
+        # Dual-write to both stores
+        self._document_store.update_tags(coll, id, final_tags)
+        self._store.update_tags(coll, id, final_tags)
+
+        # Return updated item
+        return self.get(id, collection=collection)
+
     # -------------------------------------------------------------------------
     # Collection Management
     # -------------------------------------------------------------------------
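The version API reads as offsets back from the live document, with archived history served by the DocumentStore. A usage sketch (the document ID here is hypothetical; it assumes a store where that ID has been updated at least once):

```python
from keep.api import Keeper

keeper = Keeper()
doc_id = "_text:0123456789ab"  # hypothetical content-addressed ID

current = keeper.get_version(doc_id)             # offset 0 = live document
previous = keeper.get_version(doc_id, 1)         # one version back
for v in keeper.list_versions(doc_id, limit=5):  # archived versions, newest first
    print(v)
```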
@@ -511,21 +1315,61 @@
         """
         List all collections in the store.
         """
-
+        # Merge collections from both stores
+        doc_collections = set(self._document_store.list_collections())
+        chroma_collections = set(self._store.list_collections())
+        return sorted(doc_collections | chroma_collections)
 
     def count(self, *, collection: Optional[str] = None) -> int:
         """
         Count items in a collection.
+
+        Returns count from document store if available, else ChromaDB.
         """
         coll = self._resolve_collection(collection)
+        doc_count = self._document_store.count(coll)
+        if doc_count > 0:
+            return doc_count
         return self._store.count(coll)
-
+
+    def list_recent(
+        self,
+        limit: int = 10,
+        *,
+        collection: Optional[str] = None,
+    ) -> list[Item]:
+        """
+        List recent items ordered by update time.
+
+        Args:
+            limit: Maximum number to return (default 10)
+            collection: Collection to query (uses default if not specified)
+
+        Returns:
+            List of Items, most recently updated first
+        """
+        coll = self._resolve_collection(collection)
+        records = self._document_store.list_recent(coll, limit)
+
+        return [
+            Item(
+                id=rec.id,
+                summary=rec.summary,
+                tags=rec.tags,
+                score=None,
+            )
+            for rec in records
+        ]
+
     def embedding_cache_stats(self) -> dict:
         """
         Get embedding cache statistics.
 
         Returns dict with: entries, hits, misses, hit_rate, cache_path
+        Returns {"loaded": False} if embedding provider hasn't been loaded yet.
         """
+        if self._embedding_provider is None:
+            return {"loaded": False}
         if isinstance(self._embedding_provider, CachingEmbeddingProvider):
             return self._embedding_provider.stats()
         return {"enabled": False}
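`embedding_cache_stats()` doubles as a visible probe of the lazy loading introduced earlier: before any embedding work the provider is still `None`. A sketch, assuming an initialized store:

```python
from keep.api import Keeper

keeper = Keeper()
print(keeper.embedding_cache_stats())  # {'loaded': False} before any embedding work
keeper.remember("note about caching")  # forces lazy provider creation via embed()
print(keeper.embedding_cache_stats())  # now real cache stats (entries, hits, ...)
```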
@@ -563,9 +1407,10 @@
 
         try:
             # Generate real summary
-            summary = self.
+            summary = self._get_summarization_provider().summarize(item.content)
 
-            # Update
+            # Update summary in both stores
+            self._document_store.update_summary(item.collection, item.id, summary)
             self._store.update_summary(item.collection, item.id, summary)
 
             # Remove from queue
@@ -652,21 +1497,83 @@
             subprocess.Popen(cmd, **kwargs)
             return True
 
-        except Exception:
-            # Spawn failed -
+        except Exception as e:
+            # Spawn failed - log for debugging, queue will be processed later
+            logger.warning("Failed to spawn background processor: %s", e)
             return False
 
+    def reconcile(
+        self,
+        collection: Optional[str] = None,
+        fix: bool = False,
+    ) -> dict:
+        """
+        Check and optionally fix consistency between DocumentStore and ChromaDB.
+
+        Detects:
+        - Documents in DocumentStore missing from ChromaDB (not searchable)
+        - Documents in ChromaDB missing from DocumentStore (orphaned embeddings)
+
+        Args:
+            collection: Collection to check (None = default collection)
+            fix: If True, re-index documents missing from ChromaDB
+
+        Returns:
+            Dict with 'missing_from_chroma', 'orphaned_in_chroma', 'fixed' counts
+        """
+        coll = self._resolve_collection(collection)
+
+        # Get IDs from both stores
+        doc_ids = set(self._document_store.list_ids(coll))
+        chroma_ids = set(self._store.list_ids(coll))
+
+        missing_from_chroma = doc_ids - chroma_ids
+        orphaned_in_chroma = chroma_ids - doc_ids
+
+        fixed = 0
+        if fix and missing_from_chroma:
+            for doc_id in missing_from_chroma:
+                try:
+                    # Re-fetch and re-index
+                    doc_record = self._document_store.get(coll, doc_id)
+                    if doc_record:
+                        # Fetch original content
+                        doc = self._document_provider.fetch(doc_id)
+                        embedding = self._get_embedding_provider().embed(doc.content)
+
+                        # Write to ChromaDB
+                        self._store.upsert(
+                            collection=coll,
+                            id=doc_id,
+                            embedding=embedding,
+                            summary=doc_record.summary,
+                            tags=doc_record.tags,
+                        )
+                        fixed += 1
+                        logger.info("Reconciled: %s", doc_id)
+                except Exception as e:
+                    logger.warning("Failed to reconcile %s: %s", doc_id, e)
+
+        return {
+            "missing_from_chroma": len(missing_from_chroma),
+            "orphaned_in_chroma": len(orphaned_in_chroma),
+            "fixed": fixed,
+            "missing_ids": list(missing_from_chroma) if missing_from_chroma else [],
+            "orphaned_ids": list(orphaned_in_chroma) if orphaned_in_chroma else [],
+        }
+
     def close(self) -> None:
         """
         Close resources (embedding cache connection, pending queue, etc.).
 
         Good practice to call when done, though Python's GC will clean up eventually.
         """
-        # Close embedding cache if it
-        if
-
-
-        cache
+        # Close embedding cache if it was loaded
+        if self._embedding_provider is not None:
+            if hasattr(self._embedding_provider, '_cache'):
+                cache = self._embedding_provider._cache
+                if hasattr(cache, 'close'):
+                    cache.close()
 
         # Close pending summary queue
         if hasattr(self, '_pending_queue'):
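Because every write now goes to two stores, `reconcile()` is the safety valve for the failure mode where one write lands and the other doesn't. A usage sketch, assuming an initialized store: dry-run first, then fix:

```python
from keep.api import Keeper

keeper = Keeper()
report = keeper.reconcile()          # dry run: counts plus the offending IDs
print(report["missing_from_chroma"], report["orphaned_in_chroma"])
if report["missing_from_chroma"]:
    keeper.reconcile(fix=True)       # re-embed and re-index the missing docs
```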
|