keep_skill-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keep/__init__.py +53 -0
- keep/__main__.py +8 -0
- keep/api.py +686 -0
- keep/chunking.py +364 -0
- keep/cli.py +503 -0
- keep/config.py +323 -0
- keep/context.py +127 -0
- keep/indexing.py +208 -0
- keep/logging_config.py +73 -0
- keep/paths.py +67 -0
- keep/pending_summaries.py +166 -0
- keep/providers/__init__.py +40 -0
- keep/providers/base.py +416 -0
- keep/providers/documents.py +250 -0
- keep/providers/embedding_cache.py +260 -0
- keep/providers/embeddings.py +245 -0
- keep/providers/llm.py +371 -0
- keep/providers/mlx.py +256 -0
- keep/providers/summarization.py +107 -0
- keep/store.py +403 -0
- keep/types.py +65 -0
- keep_skill-0.1.0.dist-info/METADATA +290 -0
- keep_skill-0.1.0.dist-info/RECORD +26 -0
- keep_skill-0.1.0.dist-info/WHEEL +4 -0
- keep_skill-0.1.0.dist-info/entry_points.txt +2 -0
- keep_skill-0.1.0.dist-info/licenses/LICENSE +21 -0
keep/chunking.py
ADDED
@@ -0,0 +1,364 @@
"""
Chunking strategies for splitting documents into embeddable pieces.

Unlike summarization (which condenses), chunking splits mechanically
by token count with overlap to preserve context across boundaries.
"""

from dataclasses import dataclass
from typing import Protocol, Iterator
import re


@dataclass(frozen=True)
class Chunk:
    """A piece of a document suitable for embedding."""
    text: str
    start_char: int
    end_char: int
    index: int  # 0-based chunk number within document

    @property
    def char_span(self) -> tuple[int, int]:
        return (self.start_char, self.end_char)


class ChunkingProvider(Protocol):
    """Protocol for document chunking strategies."""

    def chunk(self, text: str) -> Iterator[Chunk]:
        """Split text into overlapping chunks."""
        ...

    @property
    def target_tokens(self) -> int:
        """Target token count per chunk."""
        ...

    @property
    def overlap_tokens(self) -> int:
        """Overlap between adjacent chunks."""
        ...

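# --- Editor's sketch (illustrative, not part of the published file) ---
# ChunkingProvider is a typing.Protocol, so conformance is structural:
# any class exposing a matching chunk() method plus the two properties
# satisfies it with no inheritance needed (the TokenChunker below does).
# It is not @runtime_checkable, so this is a static-typing contract:
#
#     def count_chunks(provider: ChunkingProvider, text: str) -> int:
#         return sum(1 for _ in provider.chunk(text))
# ----------------------------------------------------------------------
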
class TokenChunker:
    """
    Chunk by approximate token count with overlap.

    Uses whitespace splitting as a rough token approximation
    (1 token ≈ 0.75 words for English). For precise tokenization,
    subclass and override _tokenize/_detokenize.

    OpenClaw defaults: target=400 tokens, overlap=80 tokens
    """

    def __init__(
        self,
        target_tokens: int = 400,
        overlap_tokens: int = 80,
        chars_per_token: float = 4.0,  # rough estimate
    ):
        self._target_tokens = target_tokens
        self._overlap_tokens = overlap_tokens
        self._chars_per_token = chars_per_token

    @property
    def target_tokens(self) -> int:
        return self._target_tokens

    @property
    def overlap_tokens(self) -> int:
        return self._overlap_tokens

    @property
    def target_chars(self) -> int:
        return int(self._target_tokens * self._chars_per_token)

    @property
    def overlap_chars(self) -> int:
        return int(self._overlap_tokens * self._chars_per_token)

    def chunk(self, text: str) -> Iterator[Chunk]:
        """Split text into overlapping chunks."""
        if not text.strip():
            return

        # For short texts, return as single chunk
        if len(text) <= self.target_chars:
            yield Chunk(text=text, start_char=0, end_char=len(text), index=0)
            return

        stride = self.target_chars - self.overlap_chars
        start = 0
        index = 0

        while start < len(text):
            end = min(start + self.target_chars, len(text))

            # Try to break at word boundary
            if end < len(text):
                # Look for whitespace near the end
                break_point = self._find_break_point(text, end)
                if break_point > start:
                    end = break_point

            chunk_text = text[start:end].strip()
            if chunk_text:
                yield Chunk(
                    text=chunk_text,
                    start_char=start,
                    end_char=end,
                    index=index,
                )
                index += 1

            # Move forward by stride, not by chunk length
            prev_start = start
            start += stride

            # Ensure progress even if stride is weird (overlap >= target
            # makes it non-positive, which would otherwise loop forever)
            if start <= prev_start:
                start = end

    def _find_break_point(self, text: str, target: int, window: int = 50) -> int:
        """Find a good break point (whitespace) near target position."""
        # Search backwards from target for whitespace
        search_start = max(0, target - window)
        search_region = text[search_start:target]

        # Find last whitespace in region
        match = None
        for m in re.finditer(r'\s+', search_region):
            match = m

        if match:
            return search_start + match.end()
        return target

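# --- Editor's sketch (illustrative, not part of the published file) ---
# With the defaults above (400 target / 80 overlap tokens at 4.0 chars
# per token), target_chars is 1600 and overlap_chars is 320, so the
# window advances by a 1280-char stride and adjacent chunks share
# roughly 320 characters:
#
#     chunker = TokenChunker()
#     text = "word " * 2000  # ~10,000 characters
#     for c in list(chunker.chunk(text))[:3]:
#         print(c.index, c.char_span, len(c.text))
#
# Spans start 1280 chars apart; ends may land slightly early because
# _find_break_point snaps each cut back to nearby whitespace.
# ----------------------------------------------------------------------
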
class SentenceChunker(TokenChunker):
    """
    Chunk by sentences, respecting token limits.

    Tries to keep sentences intact while staying under the token limit.
    Falls back to mid-sentence breaks for very long sentences.
    """

    # Simple sentence boundary pattern
    SENTENCE_END = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')

    def chunk(self, text: str) -> Iterator[Chunk]:
        """Split into sentence-aligned chunks."""
        if not text.strip():
            return

        if len(text) <= self.target_chars:
            yield Chunk(text=text, start_char=0, end_char=len(text), index=0)
            return

        sentences = self.SENTENCE_END.split(text)

        current_chunk = []
        current_len = 0
        chunk_start = 0
        # char_pos assumes a single-space separator between sentences, so
        # char spans are approximate when the source uses wider whitespace
        char_pos = 0
        index = 0

        for sentence in sentences:
            sentence_len = len(sentence)

            # If single sentence exceeds limit, fall back to token chunking
            if sentence_len > self.target_chars:
                # Flush current chunk first
                if current_chunk:
                    chunk_text = ' '.join(current_chunk)
                    yield Chunk(
                        text=chunk_text,
                        start_char=chunk_start,
                        end_char=char_pos,
                        index=index,
                    )
                    index += 1
                    current_chunk = []
                    current_len = 0

                # Chunk the long sentence
                for sub_chunk in super().chunk(sentence):
                    yield Chunk(
                        text=sub_chunk.text,
                        start_char=char_pos + sub_chunk.start_char,
                        end_char=char_pos + sub_chunk.end_char,
                        index=index,
                    )
                    index += 1

                char_pos += sentence_len + 1  # +1 for space
                chunk_start = char_pos
                continue

            # Would adding this sentence exceed the limit?
            if current_len + sentence_len + 1 > self.target_chars and current_chunk:
                # Emit current chunk
                chunk_text = ' '.join(current_chunk)
                yield Chunk(
                    text=chunk_text,
                    start_char=chunk_start,
                    end_char=char_pos,
                    index=index,
                )
                index += 1

                # Start new chunk with overlap:
                # keep the last sentence(s) up to overlap size
                overlap_sents = []
                overlap_len = 0
                for s in reversed(current_chunk):
                    if overlap_len + len(s) > self.overlap_chars:
                        break
                    overlap_sents.insert(0, s)
                    overlap_len += len(s) + 1

                current_chunk = overlap_sents
                current_len = overlap_len
                chunk_start = char_pos - overlap_len

            current_chunk.append(sentence)
            current_len += sentence_len + 1
            char_pos += sentence_len + 1

        # Emit final chunk
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            yield Chunk(
                text=chunk_text,
                start_char=chunk_start,
                end_char=len(text),
                index=index,
            )

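# --- Editor's sketch (illustrative, not part of the published file) ---
# SentenceChunker packs whole sentences until the next one would push
# the chunk past target_chars, then re-seeds the following chunk with
# trailing sentences that fit inside overlap_chars. With small limits
# (target_chars=40, overlap_chars=24 here):
#
#     chunker = SentenceChunker(target_tokens=10, overlap_tokens=6)
#     text = "First point here. Second point follows. Third point ends it."
#     for c in chunker.chunk(text):
#         print(c.index, repr(c.text))
#     # 0 'First point here. Second point follows.'
#     # 1 'Second point follows. Third point ends it.'
#
# The second sentence repeats as overlap. Note SENTENCE_END requires a
# capital letter after the terminator, so "e.g. this" stays unsplit.
# ----------------------------------------------------------------------
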
class MarkdownChunker(TokenChunker):
    """
    Chunk Markdown documents respecting structure.

    Tries to break at:
    1. Heading boundaries (# ## ###)
    2. Paragraph boundaries (blank lines)
    3. Sentence boundaries
    4. Word boundaries (fallback)
    """

    HEADING = re.compile(r'^#{1,6}\s+', re.MULTILINE)
    PARAGRAPH = re.compile(r'\n\n+')

    def chunk(self, text: str) -> Iterator[Chunk]:
        """Split Markdown into structure-aware chunks."""
        if not text.strip():
            return

        if len(text) <= self.target_chars:
            yield Chunk(text=text, start_char=0, end_char=len(text), index=0)
            return

        # Split on headings first; each section runs from one heading
        # to the start of the next
        sections = []
        last_end = 0

        for match in self.HEADING.finditer(text):
            if match.start() > last_end:
                sections.append((last_end, match.start()))
            last_end = match.start()

        if last_end < len(text):
            sections.append((last_end, len(text)))

        # If no headings, fall back to paragraph splitting
        if len(sections) <= 1:
            yield from self._chunk_by_paragraphs(text)
            return

        index = 0
        for start, end in sections:
            section_text = text[start:end]

            if len(section_text) <= self.target_chars:
                if section_text.strip():
                    yield Chunk(
                        text=section_text.strip(),
                        start_char=start,
                        end_char=end,
                        index=index,
                    )
                    index += 1
            else:
                # Section too long, chunk it
                for sub in self._chunk_by_paragraphs(section_text):
                    yield Chunk(
                        text=sub.text,
                        start_char=start + sub.start_char,
                        end_char=start + sub.end_char,
                        index=index,
                    )
                    index += 1

    def _chunk_by_paragraphs(self, text: str) -> Iterator[Chunk]:
        """Fall back to paragraph-based chunking."""
        paragraphs = self.PARAGRAPH.split(text)

        current_chunk = []
        current_len = 0
        chunk_start = 0
        # char_pos assumes exactly two newlines between paragraphs, so
        # char spans drift slightly when blank-line gaps are wider
        char_pos = 0
        index = 0

        for para in paragraphs:
            para_len = len(para)

            if para_len > self.target_chars:
                # Flush and chunk the long paragraph
                if current_chunk:
                    yield Chunk(
                        text='\n\n'.join(current_chunk),
                        start_char=chunk_start,
                        end_char=char_pos,
                        index=index,
                    )
                    index += 1
                    current_chunk = []
                    current_len = 0

                for sub in super().chunk(para):
                    yield Chunk(
                        text=sub.text,
                        start_char=char_pos + sub.start_char,
                        end_char=char_pos + sub.end_char,
                        index=index,
                    )
                    index += 1

                char_pos += para_len + 2
                chunk_start = char_pos
                continue

            if current_len + para_len + 2 > self.target_chars and current_chunk:
                yield Chunk(
                    text='\n\n'.join(current_chunk),
                    start_char=chunk_start,
                    end_char=char_pos,
                    index=index,
                )
                index += 1
                current_chunk = []
                current_len = 0
                chunk_start = char_pos

            current_chunk.append(para)
            current_len += para_len + 2
            char_pos += para_len + 2

        if current_chunk:
            yield Chunk(
                text='\n\n'.join(current_chunk),
                start_char=chunk_start,
                end_char=len(text),
                index=index,
            )