keep_skill-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keep/chunking.py ADDED
@@ -0,0 +1,364 @@
+"""
+Chunking strategies for splitting documents into embeddable pieces.
+
+Unlike summarization (which condenses), chunking splits mechanically
+by token count with overlap to preserve context across boundaries.
+"""
+
+from dataclasses import dataclass
+from typing import Protocol, Iterator
+import re
+
+
+@dataclass(frozen=True)
+class Chunk:
+    """A piece of a document suitable for embedding."""
+    text: str
+    start_char: int
+    end_char: int
+    index: int  # 0-based chunk number within document
+
+    @property
+    def char_span(self) -> tuple[int, int]:
+        return (self.start_char, self.end_char)
+
+
+class ChunkingProvider(Protocol):
+    """Protocol for document chunking strategies."""
+
+    def chunk(self, text: str) -> Iterator[Chunk]:
+        """Split text into overlapping chunks."""
+        ...
+
+    @property
+    def target_tokens(self) -> int:
+        """Target token count per chunk."""
+        ...
+
+    @property
+    def overlap_tokens(self) -> int:
+        """Overlap between adjacent chunks."""
+        ...
+
+
+class TokenChunker:
+    """
+    Chunk by approximate token count with overlap.
+
+    Uses whitespace splitting as a rough token approximation
+    (1 token ≈ 0.75 words for English). For precise tokenization,
+    subclass and override _tokenize/_detokenize.
+
+    OpenClaw defaults: target=400 tokens, overlap=80 tokens
+    """
+
+    def __init__(
+        self,
+        target_tokens: int = 400,
+        overlap_tokens: int = 80,
+        chars_per_token: float = 4.0,  # rough estimate
+    ):
+        self._target_tokens = target_tokens
+        self._overlap_tokens = overlap_tokens
+        self._chars_per_token = chars_per_token
+
+    @property
+    def target_tokens(self) -> int:
+        return self._target_tokens
+
+    @property
+    def overlap_tokens(self) -> int:
+        return self._overlap_tokens
+
+    @property
+    def target_chars(self) -> int:
+        return int(self._target_tokens * self._chars_per_token)
+
+    @property
+    def overlap_chars(self) -> int:
+        return int(self._overlap_tokens * self._chars_per_token)
+
+    def chunk(self, text: str) -> Iterator[Chunk]:
+        """Split text into overlapping chunks."""
+        if not text.strip():
+            return
+
+        # For short texts, return as single chunk
+        if len(text) <= self.target_chars:
+            yield Chunk(text=text, start_char=0, end_char=len(text), index=0)
+            return
+
+        stride = self.target_chars - self.overlap_chars
+        start = 0
+        index = 0
+
+        while start < len(text):
+            end = min(start + self.target_chars, len(text))
+
+            # Try to break at word boundary
+            if end < len(text):
+                # Look for whitespace near the end
+                break_point = self._find_break_point(text, end)
+                if break_point > start:
+                    end = break_point
+
+            chunk_text = text[start:end].strip()
+            if chunk_text:
+                yield Chunk(
+                    text=chunk_text,
+                    start_char=start,
+                    end_char=end,
+                    index=index,
+                )
+                index += 1
+
+            # Move forward by stride, not by chunk length
+            next_start = start + stride
+
+            # Ensure progress even if stride is non-positive
+            if next_start <= start:
+                next_start = end
+            start = next_start
+
+    def _find_break_point(self, text: str, target: int, window: int = 50) -> int:
+        """Find a good break point (whitespace) near target position."""
+        # Search backwards from target for whitespace
+        search_start = max(0, target - window)
+        search_region = text[search_start:target]
+
+        # Find last whitespace in region
+        match = None
+        for m in re.finditer(r'\s+', search_region):
+            match = m
+
+        if match:
+            return search_start + match.end()
+        return target
+
+
+class SentenceChunker(TokenChunker):
+    """
+    Chunk by sentences, respecting token limits.
+
+    Tries to keep sentences intact while staying under token limit.
+    Falls back to mid-sentence breaks for very long sentences.
+    """
+
+    # Simple sentence boundary pattern
+    SENTENCE_END = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
+
+    def chunk(self, text: str) -> Iterator[Chunk]:
+        """Split into sentence-aligned chunks."""
+        if not text.strip():
+            return
+
+        if len(text) <= self.target_chars:
+            yield Chunk(text=text, start_char=0, end_char=len(text), index=0)
+            return
+
+        sentences = self.SENTENCE_END.split(text)
+
+        current_chunk = []
+        current_len = 0
+        chunk_start = 0
+        char_pos = 0
+        index = 0
+
+        for sentence in sentences:
+            sentence_len = len(sentence)
+
+            # If single sentence exceeds limit, fall back to token chunking
+            if sentence_len > self.target_chars:
+                # Flush current chunk first
+                if current_chunk:
+                    chunk_text = ' '.join(current_chunk)
+                    yield Chunk(
+                        text=chunk_text,
+                        start_char=chunk_start,
+                        end_char=char_pos,
+                        index=index,
+                    )
+                    index += 1
+                    current_chunk = []
+                    current_len = 0
+
+                # Chunk the long sentence
+                for sub_chunk in super().chunk(sentence):
+                    yield Chunk(
+                        text=sub_chunk.text,
+                        start_char=char_pos + sub_chunk.start_char,
+                        end_char=char_pos + sub_chunk.end_char,
+                        index=index,
+                    )
+                    index += 1
+
+                char_pos += sentence_len + 1  # +1 for space
+                chunk_start = char_pos
+                continue
+
+            # Would adding this sentence exceed limit?
+            if current_len + sentence_len + 1 > self.target_chars and current_chunk:
+                # Emit current chunk
+                chunk_text = ' '.join(current_chunk)
+                yield Chunk(
+                    text=chunk_text,
+                    start_char=chunk_start,
+                    end_char=char_pos,
+                    index=index,
+                )
+                index += 1
+
+                # Start new chunk with overlap
+                # Keep last sentence(s) up to overlap size
+                overlap_sents = []
+                overlap_len = 0
+                for s in reversed(current_chunk):
+                    if overlap_len + len(s) > self.overlap_chars:
+                        break
+                    overlap_sents.insert(0, s)
+                    overlap_len += len(s) + 1
+
+                current_chunk = overlap_sents
+                current_len = overlap_len
+                chunk_start = char_pos - overlap_len
+
+            current_chunk.append(sentence)
+            current_len += sentence_len + 1
+            char_pos += sentence_len + 1
+
+        # Emit final chunk
+        if current_chunk:
+            chunk_text = ' '.join(current_chunk)
+            yield Chunk(
+                text=chunk_text,
+                start_char=chunk_start,
+                end_char=len(text),
+                index=index,
+            )
+
+
+class MarkdownChunker(TokenChunker):
+    """
+    Chunk Markdown documents respecting structure.
+
+    Tries to break at:
+    1. Heading boundaries (# ## ###)
+    2. Paragraph boundaries (blank lines)
+    3. Sentence boundaries
+    4. Word boundaries (fallback)
+    """
+
+    HEADING = re.compile(r'^#{1,6}\s+', re.MULTILINE)
+    PARAGRAPH = re.compile(r'\n\n+')
+
+    def chunk(self, text: str) -> Iterator[Chunk]:
+        """Split Markdown into structure-aware chunks."""
+        if not text.strip():
+            return
+
+        if len(text) <= self.target_chars:
+            yield Chunk(text=text, start_char=0, end_char=len(text), index=0)
+            return
+
+        # Split on headings first
+        sections = []
+        last_end = 0
+
+        for match in self.HEADING.finditer(text):
+            if match.start() > last_end:
+                sections.append((last_end, match.start()))
+            last_end = match.start()
+
+        if last_end < len(text):
+            sections.append((last_end, len(text)))
+
+        # If no headings, fall back to paragraph splitting
+        if len(sections) <= 1:
+            yield from self._chunk_by_paragraphs(text)
+            return
+
+        index = 0
+        for start, end in sections:
+            section_text = text[start:end]
+
+            if len(section_text) <= self.target_chars:
+                if section_text.strip():
+                    yield Chunk(
+                        text=section_text.strip(),
+                        start_char=start,
+                        end_char=end,
+                        index=index,
+                    )
+                    index += 1
+            else:
+                # Section too long, chunk it
+                for sub in self._chunk_by_paragraphs(section_text):
+                    yield Chunk(
+                        text=sub.text,
+                        start_char=start + sub.start_char,
+                        end_char=start + sub.end_char,
+                        index=index,
+                    )
+                    index += 1
+
+    def _chunk_by_paragraphs(self, text: str) -> Iterator[Chunk]:
+        """Fall back to paragraph-based chunking."""
+        paragraphs = self.PARAGRAPH.split(text)
+
+        current_chunk = []
+        current_len = 0
+        chunk_start = 0
+        char_pos = 0
+        index = 0
+
+        for para in paragraphs:
+            para_len = len(para)
+
+            if para_len > self.target_chars:
+                # Flush and chunk the long paragraph
+                if current_chunk:
+                    yield Chunk(
+                        text='\n\n'.join(current_chunk),
+                        start_char=chunk_start,
+                        end_char=char_pos,
+                        index=index,
+                    )
+                    index += 1
+                    current_chunk = []
+                    current_len = 0
+
+                for sub in super().chunk(para):
+                    yield Chunk(
+                        text=sub.text,
+                        start_char=char_pos + sub.start_char,
+                        end_char=char_pos + sub.end_char,
+                        index=index,
+                    )
+                    index += 1
+
+                char_pos += para_len + 2
+                chunk_start = char_pos
+                continue
+
+            if current_len + para_len + 2 > self.target_chars and current_chunk:
+                yield Chunk(
+                    text='\n\n'.join(current_chunk),
+                    start_char=chunk_start,
+                    end_char=char_pos,
+                    index=index,
+                )
+                index += 1
+                current_chunk = []
+                current_len = 0
+                chunk_start = char_pos
+
+            current_chunk.append(para)
+            current_len += para_len + 2
+            char_pos += para_len + 2
+
+        if current_chunk:
+            yield Chunk(
+                text='\n\n'.join(current_chunk),
+                start_char=chunk_start,
+                end_char=len(text),
+                index=index,
+            )
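
The added file is self-contained, so a short usage sketch may help; it is illustrative only and not part of the package. The sample_text value below is made up, and the import path assumes the wheel installs the keep package exactly as the file path keep/chunking.py suggests. TokenChunker's 400-token target, 80-token overlap, and chars_per_token=4.0 estimate come from the code above, which works out to target_chars = 400 * 4 = 1600, overlap_chars = 80 * 4 = 320, and a stride of 1280 characters between chunk starts.

# Illustrative usage sketch (not part of keep-skill); assumes the package
# installs as the "keep" namespace shown in the diff path.
from keep.chunking import TokenChunker

chunker = TokenChunker()        # defaults: 400-token chunks, 80-token overlap
sample_text = "word " * 2000    # hypothetical input, roughly 10,000 characters

for chunk in chunker.chunk(sample_text):
    # Each Chunk carries its text plus character offsets back into the source.
    print(chunk.index, chunk.char_span, len(chunk.text))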
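
Because TokenChunker, SentenceChunker, and MarkdownChunker all expose the same chunk/target_tokens/overlap_tokens surface, any of them satisfies the ChunkingProvider protocol. The sketch below shows one way a caller might pick a strategy by file type; the pick_chunker helper is hypothetical and not shipped in the wheel.

# Hypothetical helper (not part of keep-skill): choose a strategy per file type.
from keep.chunking import ChunkingProvider, MarkdownChunker, SentenceChunker, TokenChunker

def pick_chunker(filename: str) -> ChunkingProvider:
    if filename.endswith((".md", ".markdown")):
        return MarkdownChunker()    # heading- and paragraph-aware breaks
    if filename.endswith(".txt"):
        return SentenceChunker()    # keeps sentences intact where possible
    return TokenChunker()           # plain sliding window as the fallback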