keep-skill 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,107 @@
+ """
+ Default summarization providers.
+
+ Simple, zero-dependency summarizers for getting started.
+ """
+
+ from .base import SummarizationProvider, get_registry
+
+
+ class TruncationSummarizer:
+     """
+     Simple summarizer that truncates content to a max length.
+
+     Zero dependencies. Good enough to get started; replace with
+     LLM-based summarization when quality matters.
+     """
+
+     def __init__(self, max_length: int = 500, suffix: str = "..."):
+         """
+         Args:
+             max_length: Maximum summary length in characters
+             suffix: Suffix to add when truncated
+         """
+         self.max_length = max_length
+         self.suffix = suffix
+
+     def summarize(self, content: str, *, max_length: int | None = None) -> str:
+         """
+         Summarize by taking first N characters.
+
+         Tries to break at word boundaries.
+         """
+         limit = max_length or self.max_length
+
+         if len(content) <= limit:
+             return content.strip()
+
+         # Find last space before limit to break at word boundary
+         truncated = content[:limit]
+         last_space = truncated.rfind(' ')
+
+         if last_space > limit * 0.7:  # Don't break too early
+             truncated = truncated[:last_space]
+
+         return truncated.strip() + self.suffix
+
+
+ class FirstParagraphSummarizer:
+     """
+     Summarizer that extracts the first paragraph or meaningful chunk.
+
+     Better than pure truncation for documents with structure.
+     """
+
+     def __init__(self, max_length: int = 500, suffix: str = "..."):
+         self.max_length = max_length
+         self.suffix = suffix
+
+     def summarize(self, content: str, *, max_length: int | None = None) -> str:
+         """Extract first paragraph, falling back to truncation."""
+         limit = max_length or self.max_length
+
+         # Strip leading whitespace and find first paragraph
+         content = content.strip()
+
+         # Look for paragraph break (double newline)
+         para_end = content.find('\n\n')
+         if para_end > 0 and para_end < limit:
+             first_para = content[:para_end].strip()
+             if len(first_para) > 50:  # Paragraph is meaningful
+                 return first_para
+
+         # Fall back to truncation
+         if len(content) <= limit:
+             return content
+
+         truncated = content[:limit]
+         last_space = truncated.rfind(' ')
+         if last_space > limit * 0.7:
+             truncated = truncated[:last_space]
+
+         return truncated.strip() + self.suffix
+
+
+ class PassthroughSummarizer:
+     """
+     No-op summarizer that returns content as-is (or truncated).
+
+     Useful when you want to store the full content as the summary.
+     """
+
+     def __init__(self, max_length: int = 10000):
+         self.max_length = max_length
+
+     def summarize(self, content: str, *, max_length: int | None = None) -> str:
+         """Return content, possibly truncated to max length."""
+         limit = max_length or self.max_length
+         if len(content) <= limit:
+             return content
+         return content[:limit]
+
+
+ # Register providers
+ _registry = get_registry()
+ _registry.register_summarization("truncate", TruncationSummarizer)
+ _registry.register_summarization("first_paragraph", FirstParagraphSummarizer)
+ _registry.register_summarization("passthrough", PassthroughSummarizer)
keep/store.py ADDED
@@ -0,0 +1,403 @@
+ """
+ Vector store implementation using ChromaDb.
+
+ This is the first concrete store implementation. The interface is designed
+ to be extractable to a Protocol when additional backends are needed.
+
+ For now, ChromaDb is the only implementation — and that's fine.
+ """
+
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Optional
+
+ from .types import Item, SYSTEM_TAG_PREFIX
+
+
+ @dataclass
+ class StoreResult:
+     """Result from a store query with raw data before Item conversion."""
+     id: str
+     summary: str
+     tags: dict[str, str]
+     distance: float | None = None  # Lower is more similar in Chroma
+
+     def to_item(self) -> Item:
+         """Convert to Item, transforming distance to similarity score."""
+         # Chroma uses L2 distance by default; convert to 0-1 similarity
+         # score = 1 / (1 + distance) gives us 1.0 for identical, approaching 0 for distant
+         score = None
+         if self.distance is not None:
+             score = 1.0 / (1.0 + self.distance)
+         return Item(id=self.id, summary=self.summary, tags=self.tags, score=score)
+
+
+ class ChromaStore:
+     """
+     Persistent vector store using ChromaDb.
+
+     Each collection maps to a ChromaDb collection. Items are stored with:
+     - id: The item's URI or custom identifier
+     - embedding: Vector representation for similarity search
+     - document: The item's summary (stored for retrieval, searchable)
+     - metadata: All tags (flattened to strings for Chroma compatibility)
+
+     The store is initialized at a specific path and persists across sessions.
+
+     Future: This class's public interface could become a Protocol for
+     pluggable backends (SQLite+faiss, Postgres+pgvector, etc.)
+     """
+
+     def __init__(self, store_path: Path, embedding_dimension: int):
+         """
+         Initialize or open a ChromaDb store.
+
+         Args:
+             store_path: Directory for persistent storage
+             embedding_dimension: Expected dimension of embeddings (for validation)
+         """
+         try:
+             import chromadb
+             from chromadb.config import Settings
+         except ImportError:
+             raise RuntimeError(
+                 "ChromaStore requires 'chromadb' library. "
+                 "Install with: pip install chromadb"
+             )
+
+         self._store_path = store_path
+         self._embedding_dimension = embedding_dimension
+
+         # Ensure store directory exists
+         store_path.mkdir(parents=True, exist_ok=True)
+
+         # Initialize persistent client
+         self._client = chromadb.PersistentClient(
+             path=str(store_path / "chroma"),
+             settings=Settings(
+                 anonymized_telemetry=False,
+                 allow_reset=True,
+             )
+         )
+
+         # Cache of collection handles
+         self._collections: dict[str, Any] = {}
+
+     def _get_collection(self, name: str) -> Any:
+         """Get or create a collection by name."""
+         if name not in self._collections:
+             # get_or_create handles both cases
+             self._collections[name] = self._client.get_or_create_collection(
+                 name=name,
+                 metadata={"hnsw:space": "l2"},  # L2 distance for similarity
+             )
+         return self._collections[name]
+
+     def _tags_to_metadata(self, tags: dict[str, str]) -> dict[str, Any]:
+         """
+         Convert tags to Chroma metadata format.
+
+         Chroma metadata values must be str, int, float, or bool.
+         We store everything as strings for consistency.
+         """
+         return {k: str(v) for k, v in tags.items()}
+
+     def _metadata_to_tags(self, metadata: dict[str, Any] | None) -> dict[str, str]:
+         """Convert Chroma metadata back to tags."""
+         if metadata is None:
+             return {}
+         return {k: str(v) for k, v in metadata.items()}
+
+     # -------------------------------------------------------------------------
+     # Write Operations
+     # -------------------------------------------------------------------------
+
+     def upsert(
+         self,
+         collection: str,
+         id: str,
+         embedding: list[float],
+         summary: str,
+         tags: dict[str, str],
+     ) -> None:
+         """
+         Insert or update an item in the store.
+
+         Args:
+             collection: Collection name
+             id: Item identifier (URI or custom)
+             embedding: Vector embedding
+             summary: Human-readable summary (stored as document)
+             tags: All tags (source + system + generated)
+         """
+         if len(embedding) != self._embedding_dimension:
+             raise ValueError(
+                 f"Embedding dimension mismatch: expected {self._embedding_dimension}, "
+                 f"got {len(embedding)}"
+             )
+
+         coll = self._get_collection(collection)
+
+         # Add timestamp if not present
+         now = datetime.now(timezone.utc).isoformat()
+         if "_updated" not in tags:
+             tags = {**tags, "_updated": now}
+         if "_created" not in tags:
+             # Check if item exists to preserve original created time
+             existing = coll.get(ids=[id], include=["metadatas"])
+             if existing["ids"]:
+                 old_created = existing["metadatas"][0].get("_created")
+                 if old_created:
+                     tags = {**tags, "_created": old_created}
+                 else:
+                     tags = {**tags, "_created": now}
+             else:
+                 tags = {**tags, "_created": now}
+
+         # Add date portion for easier date queries
+         tags = {**tags, "_updated_date": now[:10]}
+
+         coll.upsert(
+             ids=[id],
+             embeddings=[embedding],
+             documents=[summary],
+             metadatas=[self._tags_to_metadata(tags)],
+         )
+
+     def delete(self, collection: str, id: str) -> bool:
+         """
+         Delete an item from the store.
+
+         Args:
+             collection: Collection name
+             id: Item identifier
+
+         Returns:
+             True if item existed and was deleted, False if not found
+         """
+         coll = self._get_collection(collection)
+
+         # Check existence first
+         existing = coll.get(ids=[id])
+         if not existing["ids"]:
+             return False
+
+         coll.delete(ids=[id])
+         return True
+
+     def update_summary(self, collection: str, id: str, summary: str) -> bool:
+         """
+         Update just the summary of an existing item.
+
+         Used by lazy summarization to replace placeholder summaries
+         with real generated summaries.
+
+         Args:
+             collection: Collection name
+             id: Item identifier
+             summary: New summary text
+
+         Returns:
+             True if item was updated, False if not found
+         """
+         coll = self._get_collection(collection)
+
+         # Get existing item
+         existing = coll.get(ids=[id], include=["metadatas"])
+         if not existing["ids"]:
+             return False
+
+         # Update metadata with new timestamp
+         metadata = existing["metadatas"][0] or {}
+         now = datetime.now(timezone.utc).isoformat()
+         metadata["_updated"] = now
+         metadata["_updated_date"] = now[:10]
+
+         # Update just the document (summary) and metadata
+         coll.update(
+             ids=[id],
+             documents=[summary],
+             metadatas=[metadata],
+         )
+         return True
+
+     # -------------------------------------------------------------------------
+     # Read Operations
+     # -------------------------------------------------------------------------
+
+     def get(self, collection: str, id: str) -> StoreResult | None:
+         """
+         Retrieve a specific item by ID.
+
+         Args:
+             collection: Collection name
+             id: Item identifier
+
+         Returns:
+             StoreResult if found, None otherwise
+         """
+         coll = self._get_collection(collection)
+         result = coll.get(
+             ids=[id],
+             include=["documents", "metadatas"],
+         )
+
+         if not result["ids"]:
+             return None
+
+         return StoreResult(
+             id=result["ids"][0],
+             summary=result["documents"][0] or "",
+             tags=self._metadata_to_tags(result["metadatas"][0]),
+         )
+
+     def exists(self, collection: str, id: str) -> bool:
+         """Check if an item exists in the store."""
+         coll = self._get_collection(collection)
+         result = coll.get(ids=[id], include=[])
+         return bool(result["ids"])
+
+     def query_embedding(
+         self,
+         collection: str,
+         embedding: list[float],
+         limit: int = 10,
+         where: dict[str, Any] | None = None,
+     ) -> list[StoreResult]:
+         """
+         Query by embedding similarity.
+
+         Args:
+             collection: Collection name
+             embedding: Query embedding vector
+             limit: Maximum results to return
+             where: Optional metadata filter (Chroma where clause)
+
+         Returns:
+             List of results ordered by similarity (most similar first)
+         """
+         coll = self._get_collection(collection)
+
+         query_params = {
+             "query_embeddings": [embedding],
+             "n_results": limit,
+             "include": ["documents", "metadatas", "distances"],
+         }
+         if where:
+             query_params["where"] = where
+
+         result = coll.query(**query_params)
+
+         results = []
+         for i, id in enumerate(result["ids"][0]):
+             results.append(StoreResult(
+                 id=id,
+                 summary=result["documents"][0][i] or "",
+                 tags=self._metadata_to_tags(result["metadatas"][0][i]),
+                 distance=result["distances"][0][i] if result["distances"] else None,
+             ))
+
+         return results
+
+     def query_metadata(
+         self,
+         collection: str,
+         where: dict[str, Any],
+         limit: int = 100,
+     ) -> list[StoreResult]:
+         """
+         Query by metadata filter (tag query).
+
+         Args:
+             collection: Collection name
+             where: Chroma where clause for metadata filtering
+             limit: Maximum results to return
+
+         Returns:
+             List of matching results (no particular order)
+         """
+         coll = self._get_collection(collection)
+
+         result = coll.get(
+             where=where,
+             limit=limit,
+             include=["documents", "metadatas"],
+         )
+
+         results = []
+         for i, id in enumerate(result["ids"]):
+             results.append(StoreResult(
+                 id=id,
+                 summary=result["documents"][i] or "",
+                 tags=self._metadata_to_tags(result["metadatas"][i]),
+             ))
+
+         return results
+
+     def query_fulltext(
+         self,
+         collection: str,
+         query: str,
+         limit: int = 10,
+     ) -> list[StoreResult]:
+         """
+         Query by full-text search on document content (summaries).
+
+         Args:
+             collection: Collection name
+             query: Text to search for
+             limit: Maximum results to return
+
+         Returns:
+             List of matching results
+         """
+         coll = self._get_collection(collection)
+
+         # Chroma's where_document does substring matching
+         result = coll.get(
+             where_document={"$contains": query},
+             limit=limit,
+             include=["documents", "metadatas"],
+         )
+
+         results = []
+         for i, id in enumerate(result["ids"]):
+             results.append(StoreResult(
+                 id=id,
+                 summary=result["documents"][i] or "",
+                 tags=self._metadata_to_tags(result["metadatas"][i]),
+             ))
+
+         return results
+
+     # -------------------------------------------------------------------------
+     # Collection Management
+     # -------------------------------------------------------------------------
+
+     def list_collections(self) -> list[str]:
+         """List all collection names in the store."""
+         collections = self._client.list_collections()
+         return [c.name for c in collections]
+
+     def delete_collection(self, name: str) -> bool:
+         """
+         Delete an entire collection.
+
+         Args:
+             name: Collection name
+
+         Returns:
+             True if collection existed and was deleted
+         """
+         try:
+             self._client.delete_collection(name)
+             self._collections.pop(name, None)
+             return True
+         except ValueError:
+             return False
+
+     def count(self, collection: str) -> int:
+         """Return the number of items in a collection."""
+         coll = self._get_collection(collection)
+         return coll.count()
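
To make the store's write/read flow concrete, here is a small usage sketch. It is not part of the diff; it assumes chromadb is installed, and the path, collection name, ids, tag values, and tiny three-dimensional embeddings are placeholders (a real embedding provider determines the dimension).

# Illustrative only: placeholder path, ids, tags, and 3-d embeddings.
from pathlib import Path

store = ChromaStore(store_path=Path("/tmp/keep-demo"), embedding_dimension=3)

store.upsert(
    collection="notes",
    id="note://hello",
    embedding=[0.1, 0.2, 0.3],
    summary="A short note about greetings.",
    tags={"topic": "greetings"},  # _created/_updated/_updated_date are filled in automatically
)

hits = store.query_embedding("notes", embedding=[0.1, 0.2, 0.25], limit=5)
for hit in hits:
    item = hit.to_item()  # L2 distance becomes score = 1 / (1 + distance)
    print(item.id, item.score, item.summary)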
keep/types.py ADDED
@@ -0,0 +1,65 @@
+ """
+ Data types for associative memory.
+ """
+
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+
+ # System tag prefix - tags starting with this are managed by the system
+ SYSTEM_TAG_PREFIX = "_"
+
+
+ def filter_non_system_tags(tags: dict[str, str]) -> dict[str, str]:
+     """
+     Filter out any system tags (those starting with '_').
+
+     Use this to ensure source tags and derived tags cannot
+     overwrite system-managed values.
+     """
+     return {k: v for k, v in tags.items() if not k.startswith(SYSTEM_TAG_PREFIX)}
+
+
+ @dataclass(frozen=True)
+ class Item:
+     """
+     An item retrieved from the associative memory store.
+
+     This is a read-only snapshot. To modify an item, use api.update()
+     which returns a new Item with updated values.
+
+     Timestamps and other system metadata live in tags, not as explicit fields.
+     This follows the "schema as data" principle.
+
+     Attributes:
+         id: URI or custom identifier for the item
+         summary: Generated summary of the content
+         tags: All tags (source, system, and generated combined)
+         score: Similarity score (present only in search results)
+
+     System tags (managed automatically, in tags dict):
+         _created: ISO timestamp when first indexed
+         _updated: ISO timestamp when last indexed
+         _updated_date: Date portion for easier queries
+         _content_type: MIME type if known
+         _source: How content was obtained (uri, inline)
+         _session: Session that last touched this item
+     """
+     id: str
+     summary: str
+     tags: dict[str, str] = field(default_factory=dict)
+     score: Optional[float] = None
+
+     @property
+     def created(self) -> str | None:
+         """ISO timestamp when first indexed (from _created tag)."""
+         return self.tags.get("_created")
+
+     @property
+     def updated(self) -> str | None:
+         """ISO timestamp when last indexed (from _updated tag)."""
+         return self.tags.get("_updated")
+
+     def __str__(self) -> str:
+         score_str = f" [{self.score:.3f}]" if self.score is not None else ""
+         return f"{self.id}{score_str}: {self.summary[:60]}..."