keep_skill-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keep/__init__.py +53 -0
- keep/__main__.py +8 -0
- keep/api.py +686 -0
- keep/chunking.py +364 -0
- keep/cli.py +503 -0
- keep/config.py +323 -0
- keep/context.py +127 -0
- keep/indexing.py +208 -0
- keep/logging_config.py +73 -0
- keep/paths.py +67 -0
- keep/pending_summaries.py +166 -0
- keep/providers/__init__.py +40 -0
- keep/providers/base.py +416 -0
- keep/providers/documents.py +250 -0
- keep/providers/embedding_cache.py +260 -0
- keep/providers/embeddings.py +245 -0
- keep/providers/llm.py +371 -0
- keep/providers/mlx.py +256 -0
- keep/providers/summarization.py +107 -0
- keep/store.py +403 -0
- keep/types.py +65 -0
- keep_skill-0.1.0.dist-info/METADATA +290 -0
- keep_skill-0.1.0.dist-info/RECORD +26 -0
- keep_skill-0.1.0.dist-info/WHEEL +4 -0
- keep_skill-0.1.0.dist-info/entry_points.txt +2 -0
- keep_skill-0.1.0.dist-info/licenses/LICENSE +21 -0
keep/config.py
ADDED
@@ -0,0 +1,323 @@
+"""
+Configuration management for associative memory stores.
+
+The configuration is stored as a TOML file in the store directory.
+It specifies which providers to use and their parameters.
+"""
+
+import os
+import platform
+import tomllib
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+# tomli_w for writing TOML (tomllib is read-only)
+try:
+    import tomli_w
+except ImportError:
+    tomli_w = None  # type: ignore
+
+
+CONFIG_FILENAME = "keep.toml"
+CONFIG_VERSION = 1
+
+
+@dataclass
+class ProviderConfig:
+    """Configuration for a single provider."""
+    name: str
+    params: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class StoreConfig:
+    """Complete store configuration."""
+    path: Path
+    version: int = CONFIG_VERSION
+    created: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+
+    # Provider configurations
+    embedding: ProviderConfig = field(default_factory=lambda: ProviderConfig("sentence-transformers"))
+    summarization: ProviderConfig = field(default_factory=lambda: ProviderConfig("truncate"))
+    document: ProviderConfig = field(default_factory=lambda: ProviderConfig("composite"))
+
+    @property
+    def config_path(self) -> Path:
+        """Path to the TOML config file."""
+        return self.path / CONFIG_FILENAME
+
+    def exists(self) -> bool:
+        """Check if config file exists."""
+        return self.config_path.exists()
+
+
+def read_openclaw_config() -> dict | None:
+    """
+    Read OpenClaw configuration if available.
+
+    Checks:
+    1. OPENCLAW_CONFIG environment variable
+    2. ~/.openclaw/openclaw.json (default location)
+
+    Returns None if not found or invalid.
+    """
+    import json
+
+    # Try environment variable first
+    config_path_str = os.environ.get("OPENCLAW_CONFIG")
+    if config_path_str:
+        config_file = Path(config_path_str)
+    else:
+        # Default location
+        config_file = Path.home() / ".openclaw" / "openclaw.json"
+
+    if not config_file.exists():
+        return None
+
+    try:
+        with open(config_file) as f:
+            return json.load(f)
+    except (json.JSONDecodeError, IOError):
+        return None
+
+
+def get_openclaw_memory_search_config(openclaw_config: dict | None) -> dict | None:
+    """
+    Extract memorySearch config from OpenClaw config.
+
+    Returns the memorySearch settings or None if not configured.
+
+    Example structure:
+    {
+        "provider": "openai" | "gemini" | "local" | "auto",
+        "model": "text-embedding-3-small",
+        "remote": {
+            "apiKey": "sk-...",
+            "baseUrl": "https://..."
+        }
+    }
+    """
+    if not openclaw_config:
+        return None
+
+    return (openclaw_config
+            .get("agents", {})
+            .get("defaults", {})
+            .get("memorySearch", None))
+
+
+def detect_default_providers() -> dict[str, ProviderConfig]:
+    """
+    Detect the best default providers for the current environment.
+
+    Priority for embeddings:
+    1. OpenClaw memorySearch config (if configured with provider + API key)
+    2. sentence-transformers (local fallback)
+
+    Priority for summarization:
+    1. OpenClaw model config + Anthropic (if configured and ANTHROPIC_API_KEY available)
+    2. MLX (Apple Silicon local-first)
+    3. OpenAI (if API key available)
+    4. Fallback: truncate
+
+    Returns provider configs for: embedding, summarization, document
+    """
+    providers = {}
+
+    # Check for Apple Silicon
+    is_apple_silicon = (
+        platform.system() == "Darwin" and
+        platform.machine() == "arm64"
+    )
+
+    # Check for API keys
+    has_anthropic_key = bool(os.environ.get("ANTHROPIC_API_KEY"))
+    has_openai_key = bool(
+        os.environ.get("KEEP_OPENAI_API_KEY") or
+        os.environ.get("OPENAI_API_KEY")
+    )
+    has_gemini_key = bool(
+        os.environ.get("GEMINI_API_KEY") or
+        os.environ.get("GOOGLE_API_KEY")
+    )
+
+    # Check for OpenClaw config
+    openclaw_config = read_openclaw_config()
+    openclaw_model = None
+    if openclaw_config:
+        model_str = (openclaw_config.get("agents", {})
+                     .get("defaults", {})
+                     .get("model", {})
+                     .get("primary", ""))
+        if model_str:
+            openclaw_model = model_str
+
+    # Get OpenClaw memorySearch config for embeddings
+    memory_search = get_openclaw_memory_search_config(openclaw_config)
+
+    # Embedding: check OpenClaw memorySearch config first, then fall back to local
+    embedding_provider = None
+    if memory_search:
+        ms_provider = memory_search.get("provider", "auto")
+        ms_model = memory_search.get("model")
+        ms_api_key = memory_search.get("remote", {}).get("apiKey")
+
+        if ms_provider == "openai" or (ms_provider == "auto" and has_openai_key):
+            # Use OpenAI embeddings if configured or auto with key available
+            api_key = ms_api_key or os.environ.get("OPENAI_API_KEY")
+            if api_key:
+                params = {}
+                if ms_model:
+                    params["model"] = ms_model
+                embedding_provider = ProviderConfig("openai", params)
+
+        elif ms_provider == "gemini" or (ms_provider == "auto" and has_gemini_key and not has_openai_key):
+            # Use Gemini embeddings if configured or auto with key available
+            api_key = ms_api_key or os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
+            if api_key:
+                params = {}
+                if ms_model:
+                    params["model"] = ms_model
+                embedding_provider = ProviderConfig("gemini", params)
+
+    # Fall back to sentence-transformers (local, always works)
+    if embedding_provider is None:
+        embedding_provider = ProviderConfig("sentence-transformers")
+
+    providers["embedding"] = embedding_provider
+
+    # Summarization: priority order based on availability
+    # 1. OpenClaw + Anthropic (if configured and key available)
+    if openclaw_model and openclaw_model.startswith("anthropic/") and has_anthropic_key:
+        # Extract model name from "anthropic/claude-sonnet-4-5" format
+        model_name = openclaw_model.split("/", 1)[1] if "/" in openclaw_model else "claude-3-5-haiku-20241022"
+        # Map OpenClaw model names to actual Anthropic model names
+        model_mapping = {
+            "claude-sonnet-4": "claude-sonnet-4-20250514",
+            "claude-sonnet-4-5": "claude-sonnet-4-20250514",
+            "claude-sonnet-3-5": "claude-3-5-sonnet-20241022",
+            "claude-haiku-3-5": "claude-3-5-haiku-20241022",
+        }
+        actual_model = model_mapping.get(model_name, "claude-3-5-haiku-20241022")
+        providers["summarization"] = ProviderConfig("anthropic", {"model": actual_model})
+    # 2. MLX on Apple Silicon (local-first)
+    elif is_apple_silicon:
+        try:
+            import mlx_lm  # noqa
+            providers["summarization"] = ProviderConfig("mlx", {"model": "mlx-community/Llama-3.2-3B-Instruct-4bit"})
+        except ImportError:
+            if has_openai_key:
+                providers["summarization"] = ProviderConfig("openai")
+            else:
+                providers["summarization"] = ProviderConfig("passthrough")
+    # 3. OpenAI (if key available)
+    elif has_openai_key:
+        providers["summarization"] = ProviderConfig("openai")
+    # 4. Fallback: truncate
+    else:
+        providers["summarization"] = ProviderConfig("truncate")
+
+    # Document provider is always composite
+    providers["document"] = ProviderConfig("composite")
+
+    return providers
+
+
+def create_default_config(store_path: Path) -> StoreConfig:
+    """Create a new config with auto-detected defaults."""
+    providers = detect_default_providers()
+
+    return StoreConfig(
+        path=store_path,
+        embedding=providers["embedding"],
+        summarization=providers["summarization"],
+        document=providers["document"],
+    )
+
+
+def load_config(store_path: Path) -> StoreConfig:
+    """
+    Load configuration from a store directory.
+
+    Raises:
+        FileNotFoundError: If config doesn't exist
+        ValueError: If config is invalid
+    """
+    config_path = store_path / CONFIG_FILENAME
+
+    if not config_path.exists():
+        raise FileNotFoundError(f"Config not found: {config_path}")
+
+    with open(config_path, "rb") as f:
+        data = tomllib.load(f)
+
+    # Validate version
+    version = data.get("store", {}).get("version", 1)
+    if version > CONFIG_VERSION:
+        raise ValueError(f"Config version {version} is newer than supported ({CONFIG_VERSION})")
+
+    # Parse provider configs
+    def parse_provider(section: dict) -> ProviderConfig:
+        return ProviderConfig(
+            name=section.get("name", ""),
+            params={k: v for k, v in section.items() if k != "name"},
+        )
+
+    return StoreConfig(
+        path=store_path,
+        version=version,
+        created=data.get("store", {}).get("created", ""),
+        embedding=parse_provider(data.get("embedding", {"name": "sentence-transformers"})),
+        summarization=parse_provider(data.get("summarization", {"name": "truncate"})),
+        document=parse_provider(data.get("document", {"name": "composite"})),
+    )
+
+
+def save_config(config: StoreConfig) -> None:
+    """
+    Save configuration to the store directory.
+
+    Creates the directory if it doesn't exist.
+    """
+    if tomli_w is None:
+        raise RuntimeError("tomli_w is required to save config. Install with: pip install tomli-w")
+
+    # Ensure directory exists
+    config.path.mkdir(parents=True, exist_ok=True)
+
+    # Build TOML structure
+    def provider_to_dict(p: ProviderConfig) -> dict:
+        d = {"name": p.name}
+        d.update(p.params)
+        return d
+
+    data = {
+        "store": {
+            "version": config.version,
+            "created": config.created,
+        },
+        "embedding": provider_to_dict(config.embedding),
+        "summarization": provider_to_dict(config.summarization),
+        "document": provider_to_dict(config.document),
+    }
+
+    with open(config.config_path, "wb") as f:
+        tomli_w.dump(data, f)
+
+
+def load_or_create_config(store_path: Path) -> StoreConfig:
+    """
+    Load existing config or create a new one with defaults.
+
+    This is the main entry point for config management.
+    """
+    config_path = store_path / CONFIG_FILENAME
+
+    if config_path.exists():
+        return load_config(store_path)
+    else:
+        config = create_default_config(store_path)
+        save_config(config)
+        return config
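
For orientation, here is a minimal usage sketch of the config API above. It is editorial, not part of the wheel: the store path is arbitrary, the `max_chars` param is a hypothetical override used only to show how params round-trip, and it assumes the package is importable as `keep` with tomli-w installed.

    from pathlib import Path
    from keep.config import load_or_create_config, save_config

    store = Path("/tmp/demo-store")              # hypothetical location
    config = load_or_create_config(store)        # first run: detects providers, writes keep.toml
    print(config.embedding.name)                 # "sentence-transformers" when no keys/config found

    # Provider params are written as extra TOML keys alongside "name",
    # so keep.toml ends up with tables like [store], [embedding],
    # [summarization], [document].
    config.summarization.params["max_chars"] = 500   # hypothetical param, for illustration only
    save_config(config)

Note that `load_config` folds every non-`name` key of a provider table back into `params`, so arbitrary provider settings survive a save/load round trip without schema changes.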
keep/context.py
ADDED
@@ -0,0 +1,127 @@
+"""
+Working context and top-of-mind retrieval.
+
+This module provides hierarchical context management for efficient
+"what are we working on?" queries with O(log(log(N))) retrieval.
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+
+@dataclass
+class WorkingContext:
+    """
+    The current working context — a high-level summary of active work.
+
+    This is the "Level 3" summary that any agent can read to instantly
+    understand what's being worked on.
+
+    Attributes:
+        summary: Natural language description of current focus
+        active_items: IDs of items currently being worked with
+        topics: Active topic/domain tags
+        updated: When context was last updated
+        session_id: Current session identifier
+        metadata: Additional context-specific data (arbitrary structure)
+    """
+    summary: str
+    active_items: list[str] = field(default_factory=list)
+    topics: list[str] = field(default_factory=list)
+    updated: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+    session_id: Optional[str] = None
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class TopicSummary:
+    """
+    A summary of items within a topic cluster (Level 2).
+
+    Topics aggregate related items and provide a mid-level
+    overview without retrieving all underlying items.
+
+    Attributes:
+        topic: Topic identifier (tag value)
+        summary: Generated summary of topic contents
+        item_count: Number of items in this topic
+        key_items: IDs of the most important items in the topic
+        subtopics: Child topics if hierarchical
+        updated: When topic summary was last regenerated
+    """
+    topic: str
+    summary: str
+    item_count: int
+    key_items: list[str] = field(default_factory=list)
+    subtopics: list[str] = field(default_factory=list)
+    updated: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+
+
+@dataclass
+class RoutingContext:
+    """
+    Describes how items are routed between private and shared stores.
+
+    This document lives at a well-known location in the shared store.
+    The facade reads it to make routing decisions. The private store
+    is physically separate and invisible from the shared store.
+
+    Attributes:
+        summary: Natural language description of the privacy model
+        private_patterns: Tag patterns that route to private store (each pattern is dict[str, str])
+        private_store_path: Location of the private store (if local)
+        updated: When routing was last modified
+        metadata: Additional routing configuration
+    """
+    summary: str = "Items tagged for private/draft visibility route to a separate store."
+    private_patterns: list[dict[str, str]] = field(default_factory=lambda: [
+        {"_visibility": "draft"},
+        {"_visibility": "private"},
+        {"_for": "self"},
+    ])
+    private_store_path: Optional[str] = None  # Resolved at init; None = default location
+    updated: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+# Well-known item ID for the routing context document
+ROUTING_CONTEXT_ID = "_system:routing"
+
+
+# Reserved system tags for context management (stored with items)
+CONTEXT_TAGS = {
+    "_session": "Session that last touched this item",
+    "_topic": "Primary topic classification",
+    "_level": "Hierarchy level (0=source, 1=cluster, 2=topic, 3=context)",
+    "_summarizes": "IDs of items this item summarizes (for hierarchy)",
+}
+
+# Relevance scoring is computed at query time, NOT stored.
+# This preserves agility between broad exploration and focused work.
+# Score factors:
+# - semantic similarity to query/hint
+# - recency (time decay)
+# - topic overlap with current WorkingContext.topics
+# - session affinity (same session = boost)
+# The weighting of these factors can vary by retrieval mode.
+
+
+def generate_session_id() -> str:
+    """Generate a unique session identifier."""
+    import uuid
+    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    short_uuid = uuid.uuid4().hex[:8]
+    return f"{date}:{short_uuid}"
+
+
+def matches_private_pattern(tags: dict[str, str], patterns: list[dict[str, str]]) -> bool:
+    """
+    Check if an item's tags match any private routing pattern.
+
+    A pattern matches if ALL its key-value pairs are present in tags.
+    """
+    for pattern in patterns:
+        if all(tags.get(k) == v for k, v in pattern.items()):
+            return True
+    return False
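
A small sketch of the routing helpers above (editorial, not part of the wheel); the tag values are invented for illustration:

    from keep.context import RoutingContext, generate_session_id, matches_private_pattern

    routing = RoutingContext()   # defaults route _visibility=draft/private and _for=self to private
    tags = {
        "_visibility": "draft",             # matches the first default pattern
        "_topic": "auth",                   # illustrative topic tag
        "_session": generate_session_id(),  # e.g. "2025-01-01:3f9a2c1b"
    }

    destination = "private" if matches_private_pattern(tags, routing.private_patterns) else "shared"
    print(destination)  # "private"

Because a pattern matches only when all of its key-value pairs are present, extra tags on an item (like `_topic` here) never block a match; only a missing or differing pattern key does.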
keep/indexing.py
ADDED
@@ -0,0 +1,208 @@
+"""
+Indexing modes for controlling embedding granularity.
+
+Summarization ALWAYS happens (it's cheap and aids retrieval).
+The mode controls what gets embedded:
+
+- DOCUMENT: Embed summary only (1 vector per doc, fast)
+- CHUNKED: Embed chunks only (N vectors per doc, OpenClaw-compatible)
+- HYBRID: Embed summary + chunks (best recall, more storage)
+- BM25_ONLY: Fulltext index only (no embeddings, keyword search)
+"""
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Iterator, Protocol
+
+
+class IndexingMode(Enum):
+    """Controls embedding granularity. Summary is always stored."""
+
+    DOCUMENT = "document"
+    """
+    Embed summary only.
+    - One vector per document
+    - Fast, good for "what is this document about?"
+    - Summary always available for display
+    """
+
+    CHUNKED = "chunked"
+    """
+    Embed chunks only.
+    - N vectors per document (one per ~400-token chunk)
+    - OpenClaw-compatible mode
+    - Good for passage-level retrieval
+    - Summary stored but not embedded
+    """
+
+    HYBRID = "hybrid"
+    """
+    Embed summary AND chunks.
+    - 1+N vectors per document
+    - Best recall (semantic anchor + passage-level)
+    - More storage, more embedding calls
+    """
+
+    BM25_ONLY = "bm25_only"
+    """
+    Fulltext index only.
+    - No embeddings at all
+    - Summary stored for display
+    - Keyword search only (exact token matching)
+    - Fastest, minimal resource usage
+    """
+
+
+@dataclass
+class IndexingConfig:
+    """Configuration for the indexing pipeline."""
+
+    mode: IndexingMode = IndexingMode.DOCUMENT
+    """Which embedding strategy to use. Summary always stored."""
+
+    # Chunking settings (for CHUNKED/HYBRID modes)
+    chunk_target_tokens: int = 400
+    """Target tokens per chunk (OpenClaw default: 400)."""
+
+    chunk_overlap_tokens: int = 80
+    """Overlap between chunks (OpenClaw default: 80)."""
+
+    tokens_per_word: float = 1.3
+    """Approximation for token estimation."""
+
+    # Summarization settings (always used)
+    summary_max_chars: int = 500
+    """Maximum summary length in characters."""
+
+    # BM25 settings
+    enable_fulltext: bool = True
+    """Whether to build FTS index alongside vectors."""
+
+    # Hybrid search weights (vector + BM25)
+    vector_weight: float = 0.7
+    """Weight for vector similarity in hybrid search."""
+
+    text_weight: float = 0.3
+    """Weight for BM25 score in hybrid search."""
+
+    @classmethod
+    def document_mode(cls) -> "IndexingConfig":
+        """Fast: embed summary only."""
+        return cls(mode=IndexingMode.DOCUMENT)
+
+    @classmethod
+    def chunked_mode(cls) -> "IndexingConfig":
+        """OpenClaw-compatible: embed chunks."""
+        return cls(
+            mode=IndexingMode.CHUNKED,
+            chunk_target_tokens=400,
+            chunk_overlap_tokens=80,
+            enable_fulltext=True,
+            vector_weight=0.7,
+            text_weight=0.3,
+        )
+
+    @classmethod
+    def hybrid_mode(cls) -> "IndexingConfig":
+        """Best recall: embed summary + chunks."""
+        return cls(
+            mode=IndexingMode.HYBRID,
+            chunk_target_tokens=400,
+            chunk_overlap_tokens=80,
+        )
+
+    @classmethod
+    def bm25_only(cls) -> "IndexingConfig":
+        """Fastest: no embeddings, keyword search only."""
+        return cls(
+            mode=IndexingMode.BM25_ONLY,
+            enable_fulltext=True,
+        )
+
+    def __post_init__(self):
+        # Normalize weights
+        total = self.vector_weight + self.text_weight
+        if total > 0:
+            self.vector_weight = self.vector_weight / total
+            self.text_weight = self.text_weight / total
+
+
+# --- Chunking ---
+
+@dataclass(frozen=True)
+class Chunk:
+    """A chunk of text with position info."""
+    text: str
+    start_char: int
+    end_char: int
+    index: int  # 0-based chunk number
+
+
+class Chunker(Protocol):
+    """Protocol for text chunking strategies."""
+
+    def chunk(self, text: str) -> Iterator[Chunk]:
+        """Split text into overlapping chunks."""
+        ...
+
+
+@dataclass
+class TokenChunker:
+    """Chunk by approximate token count with overlap.
+
+    OpenClaw defaults: ~400 tokens target, 80 token overlap.
+    """
+    target_tokens: int = 400
+    overlap_tokens: int = 80
+    tokens_per_word: float = 1.3
+
+    def chunk(self, text: str) -> Iterator[Chunk]:
+        """Split text into overlapping chunks by token estimate."""
+        if not text.strip():
+            return
+
+        words = text.split()
+        if not words:
+            return
+
+        target_words = int(self.target_tokens / self.tokens_per_word)
+        overlap_words = int(self.overlap_tokens / self.tokens_per_word)
+        step_words = max(1, target_words - overlap_words)
+
+        # Track character positions
+        word_positions: list[tuple[int, int]] = []
+        pos = 0
+        for word in words:
+            start = text.find(word, pos)
+            end = start + len(word)
+            word_positions.append((start, end))
+            pos = end
+
+        chunk_index = 0
+        word_index = 0
+
+        while word_index < len(words):
+            end_word = min(word_index + target_words, len(words))
+            chunk_words = words[word_index:end_word]
+
+            start_char = word_positions[word_index][0]
+            end_char = word_positions[end_word - 1][1]
+
+            yield Chunk(
+                text=" ".join(chunk_words),
+                start_char=start_char,
+                end_char=end_char,
+                index=chunk_index,
+            )
+
+            chunk_index += 1
+            word_index += step_words
+
+            # Don't create tiny final chunks
+            if word_index < len(words) and len(words) - word_index < overlap_words:
+                break
+
+
+def estimate_tokens(text: str, tokens_per_word: float = 1.3) -> int:
+    """Estimate token count from text."""
+    return int(len(text.split()) * tokens_per_word)
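
And a sketch of the chunking pipeline above (editorial, not part of the wheel), showing how CHUNKED-mode settings feed the TokenChunker; the input text is synthetic:

    from keep.indexing import IndexingConfig, TokenChunker, estimate_tokens

    cfg = IndexingConfig.chunked_mode()   # 400-token chunks, 80-token overlap
    chunker = TokenChunker(
        target_tokens=cfg.chunk_target_tokens,
        overlap_tokens=cfg.chunk_overlap_tokens,
        tokens_per_word=cfg.tokens_per_word,
    )

    text = " ".join(f"word{i}" for i in range(1000))   # 1000 words
    print(estimate_tokens(text))                       # 1300 (1000 * 1.3)
    for chunk in chunker.chunk(text):
        # 307-word windows advancing 246 words at a time:
        # int(400 / 1.3) = 307 target words, int(80 / 1.3) = 61 overlap words
        print(chunk.index, chunk.start_char, chunk.end_char)

Since tokens are estimated from word counts rather than a real tokenizer, chunk boundaries are approximate; the `start_char`/`end_char` offsets into the original text are what let callers recover exact source spans later.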