keep_skill-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keep/config.py ADDED
@@ -0,0 +1,323 @@
+ """
+ Configuration management for associative memory stores.
+
+ The configuration is stored as a TOML file in the store directory.
+ It specifies which providers to use and their parameters.
+ """
+
+ import os
+ import platform
+ import tomllib
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any
+
+ # tomli_w for writing TOML (tomllib is read-only)
+ try:
+     import tomli_w
+ except ImportError:
+     tomli_w = None  # type: ignore
+
+
+ CONFIG_FILENAME = "keep.toml"
+ CONFIG_VERSION = 1
+
+
+ @dataclass
+ class ProviderConfig:
+     """Configuration for a single provider."""
+     name: str
+     params: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class StoreConfig:
+     """Complete store configuration."""
+     path: Path
+     version: int = CONFIG_VERSION
+     created: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+
+     # Provider configurations
+     embedding: ProviderConfig = field(default_factory=lambda: ProviderConfig("sentence-transformers"))
+     summarization: ProviderConfig = field(default_factory=lambda: ProviderConfig("truncate"))
+     document: ProviderConfig = field(default_factory=lambda: ProviderConfig("composite"))
+
+     @property
+     def config_path(self) -> Path:
+         """Path to the TOML config file."""
+         return self.path / CONFIG_FILENAME
+
+     def exists(self) -> bool:
+         """Check if the config file exists."""
+         return self.config_path.exists()
+
+
+ def read_openclaw_config() -> dict | None:
+     """
+     Read OpenClaw configuration if available.
+
+     Checks:
+     1. OPENCLAW_CONFIG environment variable
+     2. ~/.openclaw/openclaw.json (default location)
+
+     Returns None if not found or invalid.
+     """
+     import json
+
+     # Try environment variable first
+     config_path_str = os.environ.get("OPENCLAW_CONFIG")
+     if config_path_str:
+         config_file = Path(config_path_str)
+     else:
+         # Default location
+         config_file = Path.home() / ".openclaw" / "openclaw.json"
+
+     if not config_file.exists():
+         return None
+
+     try:
+         with open(config_file) as f:
+             return json.load(f)
+     except (json.JSONDecodeError, IOError):
+         return None
+
+
+ def get_openclaw_memory_search_config(openclaw_config: dict | None) -> dict | None:
+     """
+     Extract the memorySearch config from an OpenClaw config.
+
+     Returns the memorySearch settings or None if not configured.
+
+     Example structure:
+         {
+             "provider": "openai" | "gemini" | "local" | "auto",
+             "model": "text-embedding-3-small",
+             "remote": {
+                 "apiKey": "sk-...",
+                 "baseUrl": "https://..."
+             }
+         }
+     """
+     if not openclaw_config:
+         return None
+
+     return (openclaw_config
+             .get("agents", {})
+             .get("defaults", {})
+             .get("memorySearch", None))
+
+
+ def detect_default_providers() -> dict[str, ProviderConfig]:
+     """
+     Detect the best default providers for the current environment.
+
+     Priority for embeddings:
+     1. OpenClaw memorySearch config (if configured with provider + API key)
+     2. sentence-transformers (local fallback)
+
+     Priority for summarization:
+     1. OpenClaw model config + Anthropic (if configured and ANTHROPIC_API_KEY available)
+     2. MLX (Apple Silicon local-first)
+     3. OpenAI (if API key available)
+     4. Fallback: truncate
+
+     Returns provider configs for: embedding, summarization, document
+     """
+     providers = {}
+
+     # Check for Apple Silicon
+     is_apple_silicon = (
+         platform.system() == "Darwin" and
+         platform.machine() == "arm64"
+     )
+
+     # Check for API keys
+     has_anthropic_key = bool(os.environ.get("ANTHROPIC_API_KEY"))
+     has_openai_key = bool(
+         os.environ.get("KEEP_OPENAI_API_KEY") or
+         os.environ.get("OPENAI_API_KEY")
+     )
+     has_gemini_key = bool(
+         os.environ.get("GEMINI_API_KEY") or
+         os.environ.get("GOOGLE_API_KEY")
+     )
+
+     # Check for OpenClaw config
+     openclaw_config = read_openclaw_config()
+     openclaw_model = None
+     if openclaw_config:
+         model_str = (openclaw_config.get("agents", {})
+                      .get("defaults", {})
+                      .get("model", {})
+                      .get("primary", ""))
+         if model_str:
+             openclaw_model = model_str
+
+     # Get OpenClaw memorySearch config for embeddings
+     memory_search = get_openclaw_memory_search_config(openclaw_config)
+
+     # Embedding: check OpenClaw memorySearch config first, then fall back to local
+     embedding_provider = None
+     if memory_search:
+         ms_provider = memory_search.get("provider", "auto")
+         ms_model = memory_search.get("model")
+         ms_api_key = memory_search.get("remote", {}).get("apiKey")
+
+         if ms_provider == "openai" or (ms_provider == "auto" and has_openai_key):
+             # Use OpenAI embeddings when configured explicitly, or on auto with a key available
+             api_key = ms_api_key or os.environ.get("OPENAI_API_KEY")
+             if api_key:
+                 params = {}
+                 if ms_model:
+                     params["model"] = ms_model
+                 embedding_provider = ProviderConfig("openai", params)
+
+         elif ms_provider == "gemini" or (ms_provider == "auto" and has_gemini_key and not has_openai_key):
+             # Use Gemini embeddings when configured explicitly, or on auto with a key available
+             api_key = ms_api_key or os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
+             if api_key:
+                 params = {}
+                 if ms_model:
+                     params["model"] = ms_model
+                 embedding_provider = ProviderConfig("gemini", params)
+
+     # Fall back to sentence-transformers (local, always works)
+     if embedding_provider is None:
+         embedding_provider = ProviderConfig("sentence-transformers")
+
+     providers["embedding"] = embedding_provider
+
+     # Summarization: priority order based on availability
+     # 1. OpenClaw + Anthropic (if configured and key available)
+     if openclaw_model and openclaw_model.startswith("anthropic/") and has_anthropic_key:
+         # Extract the model name from the "anthropic/claude-sonnet-4-5" format
+         model_name = openclaw_model.split("/", 1)[1] if "/" in openclaw_model else "claude-3-5-haiku-20241022"
+         # Map OpenClaw model names to actual Anthropic model names
+         model_mapping = {
+             "claude-sonnet-4": "claude-sonnet-4-20250514",
+             "claude-sonnet-4-5": "claude-sonnet-4-20250514",
+             "claude-sonnet-3-5": "claude-3-5-sonnet-20241022",
+             "claude-haiku-3-5": "claude-3-5-haiku-20241022",
+         }
+         actual_model = model_mapping.get(model_name, "claude-3-5-haiku-20241022")
+         providers["summarization"] = ProviderConfig("anthropic", {"model": actual_model})
+     # 2. MLX on Apple Silicon (local-first)
+     elif is_apple_silicon:
+         try:
+             import mlx_lm  # noqa
+             providers["summarization"] = ProviderConfig("mlx", {"model": "mlx-community/Llama-3.2-3B-Instruct-4bit"})
+         except ImportError:
+             if has_openai_key:
+                 providers["summarization"] = ProviderConfig("openai")
+             else:
+                 providers["summarization"] = ProviderConfig("truncate")
+     # 3. OpenAI (if key available)
+     elif has_openai_key:
+         providers["summarization"] = ProviderConfig("openai")
+     # 4. Fallback: truncate
+     else:
+         providers["summarization"] = ProviderConfig("truncate")
+
+     # Document provider is always composite
+     providers["document"] = ProviderConfig("composite")
+
+     return providers
+
+
+ def create_default_config(store_path: Path) -> StoreConfig:
+     """Create a new config with auto-detected defaults."""
+     providers = detect_default_providers()
+
+     return StoreConfig(
+         path=store_path,
+         embedding=providers["embedding"],
+         summarization=providers["summarization"],
+         document=providers["document"],
+     )
+
+
+ def load_config(store_path: Path) -> StoreConfig:
+     """
+     Load configuration from a store directory.
+
+     Raises:
+         FileNotFoundError: If config doesn't exist
+         ValueError: If config is invalid
+     """
+     config_path = store_path / CONFIG_FILENAME
+
+     if not config_path.exists():
+         raise FileNotFoundError(f"Config not found: {config_path}")
+
+     with open(config_path, "rb") as f:
+         data = tomllib.load(f)
+
+     # Validate version
+     version = data.get("store", {}).get("version", 1)
+     if version > CONFIG_VERSION:
+         raise ValueError(f"Config version {version} is newer than supported ({CONFIG_VERSION})")
+
+     # Parse provider configs
+     def parse_provider(section: dict) -> ProviderConfig:
+         return ProviderConfig(
+             name=section.get("name", ""),
+             params={k: v for k, v in section.items() if k != "name"},
+         )
+
+     return StoreConfig(
+         path=store_path,
+         version=version,
+         created=data.get("store", {}).get("created", ""),
+         embedding=parse_provider(data.get("embedding", {"name": "sentence-transformers"})),
+         summarization=parse_provider(data.get("summarization", {"name": "truncate"})),
+         document=parse_provider(data.get("document", {"name": "composite"})),
+     )
+
+
+ def save_config(config: StoreConfig) -> None:
+     """
+     Save configuration to the store directory.
+
+     Creates the directory if it doesn't exist.
+     """
+     if tomli_w is None:
+         raise RuntimeError("tomli_w is required to save config. Install with: pip install tomli-w")
+
+     # Ensure directory exists
+     config.path.mkdir(parents=True, exist_ok=True)
+
+     # Build TOML structure
+     def provider_to_dict(p: ProviderConfig) -> dict:
+         d = {"name": p.name}
+         d.update(p.params)
+         return d
+
+     data = {
+         "store": {
+             "version": config.version,
+             "created": config.created,
+         },
+         "embedding": provider_to_dict(config.embedding),
+         "summarization": provider_to_dict(config.summarization),
+         "document": provider_to_dict(config.document),
+     }
+
+     with open(config.config_path, "wb") as f:
+         tomli_w.dump(data, f)
+
+
+ def load_or_create_config(store_path: Path) -> StoreConfig:
+     """
+     Load existing config or create a new one with defaults.
+
+     This is the main entry point for config management.
+     """
+     config_path = store_path / CONFIG_FILENAME
+
+     if config_path.exists():
+         return load_config(store_path)
+     else:
+         config = create_default_config(store_path)
+         save_config(config)
+         return config
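
Taken together, load_or_create_config gives a one-call bootstrap: it loads keep.toml when present, and otherwise detects providers, writes the file, and returns the result. A minimal usage sketch, assuming the wheel is installed together with tomli-w and imports as keep (the store path and the provider names in the comments are illustrative, not guaranteed outputs):

    # Sketch only: bootstrap a store config and read it back.
    from pathlib import Path

    from keep.config import load_config, load_or_create_config

    store = Path.home() / ".keep" / "demo-store"

    # First call: detects providers, writes keep.toml, returns the config.
    config = load_or_create_config(store)
    print(config.embedding.name)      # e.g. "sentence-transformers"
    print(config.summarization.name)  # e.g. "truncate" with no API keys set

    # Later calls load the existing file instead of re-detecting.
    assert load_config(store).version == config.version

    # The keep.toml on disk mirrors StoreConfig, one table per provider:
    #   [store]
    #   version = 1
    #   created = "2025-01-01T00:00:00+00:00"
    #
    #   [embedding]
    #   name = "sentence-transformers"
    #
    #   [summarization]
    #   name = "truncate"
    #
    #   [document]
    #   name = "composite"
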
keep/context.py ADDED
@@ -0,0 +1,127 @@
+ """
+ Working context and top-of-mind retrieval.
+
+ This module provides hierarchical context management for efficient
+ "what are we working on?" queries with O(log(log(N))) retrieval.
+ """
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import Any, Optional
+
+
+ @dataclass
+ class WorkingContext:
+     """
+     The current working context — a high-level summary of active work.
+
+     This is the "Level 3" summary that any agent can read to instantly
+     understand what's being worked on.
+
+     Attributes:
+         summary: Natural language description of current focus
+         active_items: IDs of items currently being worked with
+         topics: Active topic/domain tags
+         updated: When context was last updated
+         session_id: Current session identifier
+         metadata: Additional context-specific data (arbitrary structure)
+     """
+     summary: str
+     active_items: list[str] = field(default_factory=list)
+     topics: list[str] = field(default_factory=list)
+     updated: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+     session_id: Optional[str] = None
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class TopicSummary:
+     """
+     A summary of items within a topic cluster (Level 2).
+
+     Topics aggregate related items and provide a mid-level
+     overview without retrieving all underlying items.
+
+     Attributes:
+         topic: Topic identifier (tag value)
+         summary: Generated summary of topic contents
+         item_count: Number of items in this topic
+         key_items: IDs of the most important items in the topic
+         subtopics: Child topics if hierarchical
+         updated: When topic summary was last regenerated
+     """
+     topic: str
+     summary: str
+     item_count: int
+     key_items: list[str] = field(default_factory=list)
+     subtopics: list[str] = field(default_factory=list)
+     updated: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+
+
+ @dataclass
+ class RoutingContext:
+     """
+     Describes how items are routed between private and shared stores.
+
+     This document lives at a well-known location in the shared store.
+     The facade reads it to make routing decisions. The private store
+     is physically separate and invisible from the shared store.
+
+     Attributes:
+         summary: Natural language description of the privacy model
+         private_patterns: Tag patterns that route to private store (each pattern is dict[str, str])
+         private_store_path: Location of the private store (if local)
+         updated: When routing was last modified
+         metadata: Additional routing configuration
+     """
+     summary: str = "Items tagged for private/draft visibility route to a separate store."
+     private_patterns: list[dict[str, str]] = field(default_factory=lambda: [
+         {"_visibility": "draft"},
+         {"_visibility": "private"},
+         {"_for": "self"},
+     ])
+     private_store_path: Optional[str] = None  # Resolved at init; None = default location
+     updated: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ # Well-known item ID for the routing context document
+ ROUTING_CONTEXT_ID = "_system:routing"
+
+
+ # Reserved system tags for context management (stored with items)
+ CONTEXT_TAGS = {
+     "_session": "Session that last touched this item",
+     "_topic": "Primary topic classification",
+     "_level": "Hierarchy level (0=source, 1=cluster, 2=topic, 3=context)",
+     "_summarizes": "IDs of items this item summarizes (for hierarchy)",
+ }
+
+ # Relevance scoring is computed at query time, NOT stored.
+ # This preserves agility between broad exploration and focused work.
+ # Score factors:
+ # - semantic similarity to query/hint
+ # - recency (time decay)
+ # - topic overlap with current WorkingContext.topics
+ # - session affinity (same session = boost)
+ # The weighting of these factors can vary by retrieval mode.
+
+
+ def generate_session_id() -> str:
+     """Generate a unique session identifier."""
+     import uuid
+     date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+     short_uuid = uuid.uuid4().hex[:8]
+     return f"{date}:{short_uuid}"
+
+
+ def matches_private_pattern(tags: dict[str, str], patterns: list[dict[str, str]]) -> bool:
+     """
+     Check if an item's tags match any private routing pattern.
+
+     A pattern matches if ALL its key-value pairs are present in tags.
+     """
+     for pattern in patterns:
+         if all(tags.get(k) == v for k, v in pattern.items()):
+             return True
+     return False
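
The routing pieces compose directly: the facade reads the RoutingContext document (stored under ROUTING_CONTEXT_ID) and tests each item's tags against its patterns. A short sketch under the default patterns (the tag values are illustrative):

    # Sketch only: route an item by its tags using the default patterns.
    from keep.context import RoutingContext, generate_session_id, matches_private_pattern

    routing = RoutingContext()

    draft_tags = {"_visibility": "draft", "_session": generate_session_id()}
    public_tags = {"_topic": "indexing", "_level": "0"}

    # A pattern matches only if ALL of its key-value pairs appear in the tags;
    # any single matching pattern sends the item to the private store.
    assert matches_private_pattern(draft_tags, routing.private_patterns)
    assert not matches_private_pattern(public_tags, routing.private_patterns)
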
keep/indexing.py ADDED
@@ -0,0 +1,208 @@
+ """
+ Indexing modes for controlling embedding granularity.
+
+ Summarization ALWAYS happens (it's cheap and aids retrieval).
+ The mode controls what gets embedded:
+
+ - DOCUMENT: Embed summary only (1 vector per doc, fast)
+ - CHUNKED: Embed chunks only (N vectors per doc, OpenClaw-compatible)
+ - HYBRID: Embed summary + chunks (best recall, more storage)
+ - BM25_ONLY: Fulltext index only (no embeddings, keyword search)
+ """
+
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Iterator, Protocol
+
+
+ class IndexingMode(Enum):
+     """Controls embedding granularity. Summary is always stored."""
+
+     DOCUMENT = "document"
+     """
+     Embed summary only.
+     - One vector per document
+     - Fast, good for "what is this document about?"
+     - Summary always available for display
+     """
+
+     CHUNKED = "chunked"
+     """
+     Embed chunks only.
+     - N vectors per document (one per ~400-token chunk)
+     - OpenClaw-compatible mode
+     - Good for passage-level retrieval
+     - Summary stored but not embedded
+     """
+
+     HYBRID = "hybrid"
+     """
+     Embed summary AND chunks.
+     - 1+N vectors per document
+     - Best recall (semantic anchor + passage-level)
+     - More storage, more embedding calls
+     """
+
+     BM25_ONLY = "bm25_only"
+     """
+     Fulltext index only.
+     - No embeddings at all
+     - Summary stored for display
+     - Keyword search only (exact token matching)
+     - Fastest, minimal resource usage
+     """
+
+
+ @dataclass
+ class IndexingConfig:
+     """Configuration for the indexing pipeline."""
+
+     mode: IndexingMode = IndexingMode.DOCUMENT
+     """Which embedding strategy to use. Summary always stored."""
+
+     # Chunking settings (for CHUNKED/HYBRID modes)
+     chunk_target_tokens: int = 400
+     """Target tokens per chunk (OpenClaw default: 400)."""
+
+     chunk_overlap_tokens: int = 80
+     """Overlap between chunks (OpenClaw default: 80)."""
+
+     tokens_per_word: float = 1.3
+     """Approximation for token estimation."""
+
+     # Summarization settings (always used)
+     summary_max_chars: int = 500
+     """Maximum summary length in characters."""
+
+     # BM25 settings
+     enable_fulltext: bool = True
+     """Whether to build FTS index alongside vectors."""
+
+     # Hybrid search weights (vector + BM25)
+     vector_weight: float = 0.7
+     """Weight for vector similarity in hybrid search."""
+
+     text_weight: float = 0.3
+     """Weight for BM25 score in hybrid search."""
+
+     @classmethod
+     def document_mode(cls) -> "IndexingConfig":
+         """Fast: embed summary only."""
+         return cls(mode=IndexingMode.DOCUMENT)
+
+     @classmethod
+     def chunked_mode(cls) -> "IndexingConfig":
+         """OpenClaw-compatible: embed chunks."""
+         return cls(
+             mode=IndexingMode.CHUNKED,
+             chunk_target_tokens=400,
+             chunk_overlap_tokens=80,
+             enable_fulltext=True,
+             vector_weight=0.7,
+             text_weight=0.3,
+         )
+
+     @classmethod
+     def hybrid_mode(cls) -> "IndexingConfig":
+         """Best recall: embed summary + chunks."""
+         return cls(
+             mode=IndexingMode.HYBRID,
+             chunk_target_tokens=400,
+             chunk_overlap_tokens=80,
+         )
+
+     @classmethod
+     def bm25_only(cls) -> "IndexingConfig":
+         """Fastest: no embeddings, keyword search only."""
+         return cls(
+             mode=IndexingMode.BM25_ONLY,
+             enable_fulltext=True,
+         )
+
+     def __post_init__(self):
+         # Normalize weights
+         total = self.vector_weight + self.text_weight
+         if total > 0:
+             self.vector_weight = self.vector_weight / total
+             self.text_weight = self.text_weight / total
+
+
+ # --- Chunking ---
+
+ @dataclass(frozen=True)
+ class Chunk:
+     """A chunk of text with position info."""
+     text: str
+     start_char: int
+     end_char: int
+     index: int  # 0-based chunk number
+
+
+ class Chunker(Protocol):
+     """Protocol for text chunking strategies."""
+
+     def chunk(self, text: str) -> Iterator[Chunk]:
+         """Split text into overlapping chunks."""
+         ...
+
+
+ @dataclass
+ class TokenChunker:
+     """Chunk by approximate token count with overlap.
+
+     OpenClaw defaults: ~400 tokens target, 80 token overlap.
+     """
+     target_tokens: int = 400
+     overlap_tokens: int = 80
+     tokens_per_word: float = 1.3
+
+     def chunk(self, text: str) -> Iterator[Chunk]:
+         """Split text into overlapping chunks by token estimate."""
+         if not text.strip():
+             return
+
+         words = text.split()
+         if not words:
+             return
+
+         target_words = int(self.target_tokens / self.tokens_per_word)
+         overlap_words = int(self.overlap_tokens / self.tokens_per_word)
+         step_words = max(1, target_words - overlap_words)
+
+         # Track character positions
+         word_positions: list[tuple[int, int]] = []
+         pos = 0
+         for word in words:
+             start = text.find(word, pos)
+             end = start + len(word)
+             word_positions.append((start, end))
+             pos = end
+
+         chunk_index = 0
+         word_index = 0
+
+         while word_index < len(words):
+             end_word = min(word_index + target_words, len(words))
+             chunk_words = words[word_index:end_word]
+
+             start_char = word_positions[word_index][0]
+             end_char = word_positions[end_word - 1][1]
+
+             yield Chunk(
+                 text=" ".join(chunk_words),
+                 start_char=start_char,
+                 end_char=end_char,
+                 index=chunk_index,
+             )
+
+             chunk_index += 1
+             word_index += step_words
+
+             # Don't create tiny final chunks
+             if word_index < len(words) and len(words) - word_index < overlap_words:
+                 break
+
+
+ def estimate_tokens(text: str, tokens_per_word: float = 1.3) -> int:
+     """Estimate token count from text."""
+     return int(len(text.split()) * tokens_per_word)
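
To make the chunk arithmetic concrete: with the defaults, target_words = int(400 / 1.3) = 307 and overlap_words = int(80 / 1.3) = 61, so successive chunks start 307 - 61 = 246 words apart. A small sketch, assuming keep.indexing imports as shown (the sample text is illustrative):

    # Sketch only: chunk a 600-word text and inspect the overlap.
    from keep.indexing import IndexingConfig, TokenChunker, estimate_tokens

    text = " ".join(f"word{i}" for i in range(600))

    chunker = TokenChunker()  # defaults: 400 target tokens, 80 overlap, 1.3 tokens/word
    chunks = list(chunker.chunk(text))

    # 600 words -> windows over words 0..306, 246..552, and 492..599.
    print(len(chunks))                      # 3
    print(estimate_tokens(chunks[0].text))  # 399, i.e. int(307 words * 1.3)

    # Offsets index back into the original text, and neighbors overlap.
    assert text[chunks[0].start_char:chunks[0].end_char] == chunks[0].text
    assert chunks[1].start_char < chunks[0].end_char

    # chunked_mode() packages these defaults for OpenClaw-compatible indexing.
    config = IndexingConfig.chunked_mode()
    print(config.mode, config.vector_weight, config.text_weight)
    # IndexingMode.CHUNKED 0.7 0.3 (weights already sum to 1.0)
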