omni-split 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,519 @@
1
+ """Dataclasses for Chonkie."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING, List, Literal, Optional, Union
5
+
6
+ if TYPE_CHECKING:
7
+ import numpy as np
8
+
9
+
10
@dataclass
class Context:
    """Contextual information attached to a chunk during refinement.

    Holds a piece of text together with its token count, and optionally the
    span it occupies in the original document. Refinery steps use it to add
    surrounding context (before or after) to a chunk.

    Attributes:
        text (str): The context text
        token_count (int): Number of tokens in the context text
        start_index (Optional[int]): Starting position of context in original text
        end_index (Optional[int]): Ending position of context in original text

    Example:
        context = Context(
            text="This is some context.",
            token_count=5,
            start_index=0,
            end_index=20,
        )

    """

    text: str
    token_count: int
    start_index: Optional[int] = None
    end_index: Optional[int] = None

    def __post_init__(self):
        """Validate the Context attributes after initialization."""
        if not isinstance(self.text, str):
            raise ValueError("text must be a string")
        if self.token_count < 0:
            raise ValueError("token_count must be non-negative")
        # Only compare the span endpoints when both are present.
        has_span = self.start_index is not None and self.end_index is not None
        if has_span and self.start_index > self.end_index:
            raise ValueError("start_index must be less than or equal to end_index")

    # Kept trivial on purpose so every chunk type exposes the same API.
    def to_dict(self) -> dict:
        """Return the Context as a dictionary."""
        return dict(self.__dict__)

    @classmethod
    def from_dict(cls, data: dict):
        """Create a Context object from a dictionary."""
        return cls(**data)

    def __len__(self) -> int:
        """Return the length of the context text."""
        return len(self.text)

    def __str__(self) -> str:
        """Return a string representation of the Context."""
        return self.text

    def __repr__(self) -> str:
        """Return a detailed string representation of the Context."""
        return (
            f"Context(text='{self.text}', token_count={self.token_count}, "
            f"start_index={self.start_index}, end_index={self.end_index})"
        )
69
+
70
+
71
@dataclass
class Chunk:
    """Dataclass representing a text chunk with metadata.

    Attributes:
        text: The text content of the chunk
        start_index: The starting index of the chunk in the original text
        end_index: The ending index of the chunk in the original text
        token_count: The number of tokens in the chunk
        context: The context of the chunk, useful for refinery classes

    """

    text: str
    start_index: int
    end_index: int
    token_count: int
    context: Optional[Context] = None

    def to_dict(self) -> dict:
        """Return the Chunk as a dictionary, serializing the nested context."""
        result = self.__dict__.copy()
        result["context"] = self.context.to_dict() if self.context is not None else None
        return result

    @classmethod
    def from_dict(cls, data: dict):
        """Create a Chunk object from a dictionary.

        The input dictionary is not modified, and a missing "context" key is
        treated the same as a None context.
        """
        data = dict(data)  # avoid mutating the caller's dict
        context_repr = data.pop("context", None)
        return cls(
            **data,
            context=Context.from_dict(context_repr) if context_repr is not None else None,
        )

    def __str__(self) -> str:
        """Return string representation of the chunk."""
        return self.text

    def __len__(self) -> int:
        """Return the length of the chunk."""
        return len(self.text)

    def __repr__(self) -> str:
        """Return string representation of the chunk.

        The context is included only when one is attached; the previous
        implementation had the condition inverted.
        """
        if self.context is not None:
            return (
                f"Chunk(text={self.text}, start_index={self.start_index}, "
                f"end_index={self.end_index}, token_count={self.token_count}, "
                f"context={self.context})"
            )
        return (
            f"Chunk(text={self.text}, start_index={self.start_index}, "
            f"end_index={self.end_index}, token_count={self.token_count})"
        )

    def __iter__(self):
        """Return an iterator over the chunk's text."""
        return iter(self.text)

    def __getitem__(self, index: int):
        """Return the character (or slice) of the text at the given index."""
        return self.text[index]

    def copy(self) -> "Chunk":
        """Return a deep copy of the chunk.

        Uses the instance's own class so subclasses round-trip through their
        specialized to_dict/from_dict instead of the base Chunk's.
        """
        return self.__class__.from_dict(self.to_dict())
134
+
135
+
136
@dataclass
class Sentence:
    """Dataclass representing a sentence with metadata.

    Attributes:
        text: The text content of the sentence
        start_index: The starting index of the sentence in the original text
        end_index: The ending index of the sentence in the original text
        token_count: The number of tokens in the sentence

    """

    text: str
    start_index: int
    end_index: int
    token_count: int

    # Trivial serialization helpers, kept for API consistency with the
    # chunk types.
    def to_dict(self) -> dict:
        """Return the Sentence as a dictionary."""
        return dict(self.__dict__)

    @classmethod
    def from_dict(cls, data: dict):
        """Create a Sentence object from a dictionary."""
        return cls(**data)

    def __repr__(self) -> str:
        """Return a string representation of the Sentence."""
        return (
            f"Sentence(text={self.text}, start_index={self.start_index}, "
            f"end_index={self.end_index}, token_count={self.token_count})"
        )
168
+
169
+
170
@dataclass
class SentenceChunk(Chunk):
    """Dataclass representing a sentence chunk with metadata.

    Attributes:
        text: The text content of the chunk
        start_index: The starting index of the chunk in the original text
        end_index: The ending index of the chunk in the original text
        token_count: The number of tokens in the chunk
        sentences: List of Sentence objects in the chunk

    """

    # Inherited fields are not redeclared; only the new one is added.
    sentences: List[Sentence] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the SentenceChunk as a dictionary."""
        result = super().to_dict()
        result["sentences"] = [sentence.to_dict() for sentence in self.sentences]
        return result

    @classmethod
    def from_dict(cls, data: dict) -> "SentenceChunk":
        """Create a SentenceChunk object from a dictionary.

        Rebuilds the nested Sentence objects and — fixing the previous
        implementation — the nested Context, which used to be passed through
        as a raw dict. The input dictionary is not modified.
        """
        data = dict(data)  # avoid mutating the caller's dict
        sentences_repr = data.pop("sentences", None)
        sentences = [Sentence.from_dict(s) for s in sentences_repr] if sentences_repr is not None else []
        context_repr = data.pop("context", None)
        context = Context.from_dict(context_repr) if context_repr is not None else None
        return cls(**data, sentences=sentences, context=context)

    def __repr__(self) -> str:
        """Return a string representation of the SentenceChunk."""
        return (
            f"SentenceChunk(text={self.text}, start_index={self.start_index}, "
            f"end_index={self.end_index}, token_count={self.token_count}, "
            f"sentences={self.sentences})"
        )
204
+
205
+
206
@dataclass
class SemanticSentence(Sentence):
    """Dataclass representing a semantic sentence with metadata.

    A Sentence that additionally carries an embedding vector.

    Attributes:
        text: The text content of the sentence
        start_index: The starting index of the sentence in the original text
        end_index: The ending index of the sentence in the original text
        token_count: The number of tokens in the sentence
        embedding: The sentence embedding

    """

    embedding: Optional["np.ndarray"] = field(default=None)

    def to_dict(self) -> dict:
        """Return the SemanticSentence as a dictionary (embedding as a list)."""
        result = super().to_dict()
        result["embedding"] = None if self.embedding is None else self.embedding.tolist()
        return result

    @classmethod
    def from_dict(cls, data: dict):
        """Create a SemanticSentence object from a dictionary.

        NOTE: numpy is deliberately not imported here to keep the package
        lightweight, so the embedding is restored as a plain list rather
        than an np.ndarray.
        """
        embedding = data.pop("embedding")
        return cls(**data, embedding=embedding)

    def __repr__(self) -> str:
        """Return a string representation of the SemanticSentence."""
        return (
            f"SemanticSentence(text={self.text}, start_index={self.start_index}, "
            f"end_index={self.end_index}, token_count={self.token_count}, "
            f"embedding={self.embedding})"
        )
241
+
242
+
243
@dataclass
class SemanticChunk(SentenceChunk):
    """SemanticChunk dataclass representing a semantic chunk with metadata.

    Attributes:
        text: The text content of the chunk
        start_index: The starting index of the chunk in the original text
        end_index: The ending index of the chunk in the original text
        token_count: The number of tokens in the chunk
        sentences: List of SemanticSentence objects in the chunk

    """

    sentences: List[SemanticSentence] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the SemanticChunk as a dictionary."""
        result = super().to_dict()
        result["sentences"] = [sentence.to_dict() for sentence in self.sentences]
        return result

    @classmethod
    def from_dict(cls, data: dict):
        """Create a SemanticChunk object from a dictionary.

        Rebuilds the nested SemanticSentence objects and — fixing the
        previous implementation — the nested Context, which used to be
        passed through as a raw dict. The input dictionary is not modified
        and a missing "sentences" key no longer raises KeyError.
        """
        data = dict(data)  # avoid mutating the caller's dict
        sentences_repr = data.pop("sentences", None)
        sentences = [SemanticSentence.from_dict(s) for s in sentences_repr] if sentences_repr is not None else []
        context_repr = data.pop("context", None)
        context = Context.from_dict(context_repr) if context_repr is not None else None
        return cls(**data, sentences=sentences, context=context)

    def __repr__(self) -> str:
        """Return a string representation of the SemanticChunk."""
        return (
            f"SemanticChunk(text={self.text}, start_index={self.start_index}, "
            f"end_index={self.end_index}, token_count={self.token_count}, "
            f"sentences={self.sentences})"
        )
274
+
275
+
276
@dataclass
class LateSentence(Sentence):
    """LateSentence dataclass representing a sentence with an embedding.

    Attributes:
        text: The text content of the sentence
        start_index: The starting index of the sentence in the original text
        end_index: The ending index of the sentence in the original text
        token_count: The number of tokens in the sentence
        embedding: The sentence embedding (np.ndarray), if computed

    """

    embedding: Optional["np.ndarray"] = field(default=None)

    def to_dict(self) -> dict:
        """Return the LateSentence as a dictionary (embedding as a list)."""
        result = super().to_dict()
        result["embedding"] = self.embedding.tolist() if self.embedding is not None else None
        return result

    @classmethod
    def from_dict(cls, data: dict):
        """Create a LateSentence object from a dictionary."""
        # numpy is only imported under TYPE_CHECKING at module level, so a
        # runtime import is required here; without it the np.array call
        # below raised NameError whenever an embedding was present.
        import numpy as np

        embedding_list = data.pop("embedding")
        embedding = np.array(embedding_list, dtype=np.float64) if embedding_list is not None else None
        return cls(**data, embedding=embedding)

    def __repr__(self) -> str:
        """Return a string representation of the LateSentence."""
        return (
            f"LateSentence(text={self.text}, start_index={self.start_index}, "
            f"end_index={self.end_index}, token_count={self.token_count}, "
            f"embedding={self.embedding})"
        )
309
+
310
+
311
@dataclass
class LateChunk(Chunk):
    """LateChunk dataclass representing a chunk with an embedding.

    Attributes:
        text: The text content of the chunk
        start_index: The starting index of the chunk in the original text
        end_index: The ending index of the chunk in the original text
        token_count: The number of tokens in the chunk
        sentences: List of LateSentence objects in the chunk
        embedding: The chunk embedding

    """

    sentences: List[LateSentence] = field(default_factory=list)
    embedding: Optional["np.ndarray"] = field(default=None)

    def to_dict(self) -> dict:
        """Return the LateChunk as a dictionary."""
        result = super().to_dict()
        result["sentences"] = [sentence.to_dict() for sentence in self.sentences]
        result["embedding"] = self.embedding.tolist() if self.embedding is not None else None
        return result

    @classmethod
    def from_dict(cls, data: dict):
        """Create a LateChunk object from a dictionary.

        Fixes two defects in the previous implementation: np.array was
        called while numpy was only a TYPE_CHECKING import (NameError at
        runtime), and the nested Context was passed through as a raw dict.
        """
        # numpy must be imported at runtime; the module-level import is
        # guarded by TYPE_CHECKING.
        import numpy as np

        data = dict(data)  # avoid mutating the caller's dict
        sentences_repr = data.pop("sentences")
        sentences = [LateSentence.from_dict(s) for s in sentences_repr]
        embedding_repr = data.pop("embedding")
        embedding = np.array(embedding_repr, dtype=np.float64) if embedding_repr is not None else None
        context_repr = data.pop("context", None)
        context = Context.from_dict(context_repr) if context_repr is not None else None
        return cls(**data, sentences=sentences, embedding=embedding, context=context)

    def __repr__(self) -> str:
        """Return a string representation of the LateChunk."""
        return (
            f"LateChunk(text={self.text}, start_index={self.start_index}, "
            f"end_index={self.end_index}, token_count={self.token_count}, "
            f"sentences={self.sentences}, embedding={self.embedding})"
        )
348
+
349
+
350
@dataclass
class RecursiveLevel:
    """Configuration for a single level of recursive chunking.

    Attributes:
        delimiters: The delimiters to use for the level, as a list of strings
            (or a single string, whose characters are each validated). If
            None, the level uses tokens to determine chunk boundaries.
        whitespace: Whether to use whitespace as a delimiter. Mutually
            exclusive with delimiters.
        include_delim: Whether a matched delimiter is attached to the
            previous piece ("prev"), the next piece ("next"), or dropped
            (None).

    """

    delimiters: Union[List[str], str, None] = None
    whitespace: bool = False
    include_delim: Optional[Literal["prev", "next"]] = "prev"

    def __post_init__(self):
        """Validate the configuration right after construction."""
        self.validate()

    def validate(self):
        """Validate the recursive level.

        Raises:
            ValueError: If both delimiters and whitespace are set, or if any
                delimiter is not a non-empty, non-space string.

        """
        if self.delimiters is not None and self.whitespace:
            raise ValueError(
                "Cannot have both delimiters and whitespace. "
                "Use two separate levels instead, one for whitespace and one for delimiters."
            )
        if self.delimiters is not None:
            for delimiter in self.delimiters:
                if not isinstance(delimiter, str):
                    raise ValueError("All delimiters must be strings")
                if len(delimiter) == 0:
                    raise ValueError("All delimiters must be non-empty strings")
                if delimiter == " ":
                    # Fix: the previous code passed two separate arguments to
                    # ValueError, producing a tuple-valued message instead of
                    # one readable string.
                    raise ValueError("Cannot use whitespace as a delimiter. Use whitespace=True instead.")

    def to_dict(self) -> dict:
        """Return the RecursiveLevel as a dictionary."""
        return self.__dict__.copy()

    @classmethod
    def from_dict(cls, data: dict):
        """Create a RecursiveLevel object from a dictionary."""
        return cls(**data)

    def __repr__(self) -> str:
        """Return a string representation of the RecursiveLevel."""
        return (
            f"RecursiveLevel(delimiters={self.delimiters}, "
            f"whitespace={self.whitespace}, "
            f"include_delim={self.include_delim})"
        )
396
+
397
+
398
@dataclass
class RecursiveRules:
    """Collection of rules for recursive chunking."""

    levels: Union[List[RecursiveLevel], RecursiveLevel, None] = None

    @staticmethod
    def _default_levels() -> List[RecursiveLevel]:
        """Build the default hierarchy: paragraphs, sentences, sub-sentence punctuation, words, tokens."""
        sub_sentence_delims = [
            ",", ";", ":", "...", "-",
            "(", ")", "[", "]", "{", "}", "<", ">",
            "|", "~", "`", "'", '"',
        ]
        return [
            # Paragraph boundaries first.
            RecursiveLevel(delimiters=["\n\n", "\n", "\r\n"], whitespace=False),
            # Then sentence terminators.
            RecursiveLevel(delimiters=[".", "?", "!"], whitespace=False),
            # Then sub-sentence punctuation.
            RecursiveLevel(delimiters=sub_sentence_delims, whitespace=False),
            # Then words (whitespace-separated).
            RecursiveLevel(delimiters=None, whitespace=True),
            # Finally tokens: delimiters=None with whitespace=False means
            # the level uses tokens to determine chunk boundaries.
            RecursiveLevel(delimiters=None, whitespace=False),
        ]

    def __post_init__(self):
        """Fill in the default levels, or validate the supplied ones."""
        if self.levels is None:
            self.levels = self._default_levels()
        elif isinstance(self.levels, RecursiveLevel):
            self.levels.validate()
        elif isinstance(self.levels, list) and all(isinstance(lvl, RecursiveLevel) for lvl in self.levels):
            for lvl in self.levels:
                lvl.validate()

    def __iter__(self):
        """Iterate over the levels."""
        return iter(self.levels)

    def __getitem__(self, index: int) -> RecursiveLevel:
        """Get a level by index."""
        return self.levels[index]

    def __len__(self) -> int:
        """Get the number of levels."""
        return len(self.levels)

    def __repr__(self) -> str:
        """Get a string representation of the recursive rules."""
        return f"RecursiveRules(levels={self.levels})"

    def to_dict(self) -> dict:
        """Return the RecursiveRules as a dictionary."""
        if isinstance(self.levels, RecursiveLevel):
            serialized = self.levels.to_dict()
        elif isinstance(self.levels, list):
            serialized = [lvl.to_dict() for lvl in self.levels]
        else:
            raise ValueError("Invalid levels type")
        return {"levels": serialized}

    @classmethod
    def from_dict(cls, data: dict):
        """Create a RecursiveRules object from a dictionary."""
        levels_repr = data.pop("levels")
        if isinstance(levels_repr, dict):
            levels = RecursiveLevel.from_dict(levels_repr)
        elif isinstance(levels_repr, list):
            levels = [RecursiveLevel.from_dict(lvl) for lvl in levels_repr]
        else:
            # None (or any unrecognized shape) falls back to the defaults
            # via __post_init__.
            levels = None
        return cls(levels=levels)
496
+
497
+
498
@dataclass
class RecursiveChunk(Chunk):
    """A Chunk with a level attribute recording which recursion level produced it."""

    # Depth in the RecursiveRules hierarchy this chunk was split at, if known.
    level: Union[int, None] = None

    def __repr__(self) -> str:
        """Get a string representation of the recursive chunk."""
        return (
            f"RecursiveChunk(text={self.text}, "
            f"start_index={self.start_index}, "
            f"end_index={self.end_index}, "
            f"token_count={self.token_count}, "
            f"level={self.level})"
        )

    def __str__(self) -> str:
        """Get a string representation of the recursive chunk."""
        return self.__repr__()

    def to_dict(self) -> dict:
        """Return the RecursiveChunk as a dictionary.

        Delegates to Chunk.to_dict so the nested context is serialized to a
        dict (the previous implementation copied __dict__ directly, leaving
        a raw Context object in the result).
        """
        return super().to_dict()

    @classmethod
    def from_dict(cls, data: dict):
        """Create a RecursiveChunk object from a dictionary.

        Rebuilds the nested Context (the previous implementation passed it
        through as a raw dict). The input dictionary is not modified.
        """
        data = dict(data)  # avoid mutating the caller's dict
        context_repr = data.pop("context", None)
        context = Context.from_dict(context_repr) if context_repr is not None else None
        return cls(**data, context=context)