rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_systems_vectorstore/MANIFEST.in +26 -0
- rakam_systems_vectorstore/README.md +1071 -0
- rakam_systems_vectorstore/__init__.py +93 -0
- rakam_systems_vectorstore/components/__init__.py +0 -0
- rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
- rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
- rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
- rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
- rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
- rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
- rakam_systems_vectorstore/components/loader/__init__.py +31 -0
- rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
- rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
- rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
- rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
- rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
- rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
- rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
- rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
- rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
- rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
- rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
- rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
- rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
- rakam_systems_vectorstore/config.py +266 -0
- rakam_systems_vectorstore/core.py +8 -0
- rakam_systems_vectorstore/pyproject.toml +113 -0
- rakam_systems_vectorstore/server/README.md +290 -0
- rakam_systems_vectorstore/server/__init__.py +20 -0
- rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
- rakam_systems_vectorstore/setup.py +103 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/chunker/advanced_chunker.py
@@ -0,0 +1,1019 @@
"""
Advanced Chunker with Customizable Serialization

This module provides an advanced chunking system that allows customization of
serialization strategies for different document elements (tables, pictures, etc.)
during the chunking process.

Key Features:
- Hybrid chunking with customizable serialization
- Support for different table serialization formats (triplet, markdown, etc.)
- Configurable picture serialization with annotation support
- Token-aware chunking with contextual information
- Extensible serializer provider pattern

Usage Example:
    ```python
    from advanced_chunker import AdvancedChunker

    # Create chunker with markdown tables
    chunker = AdvancedChunker(strategy="markdown_tables")

    # Chunk documents
    documents = ["document text here"]
    chunks = chunker.run(documents)
    ```
"""

from __future__ import annotations
import re
from typing import Any, Iterable, List, Optional, Type
from abc import abstractmethod

from rakam_systems_core.ai_core.interfaces.chunker import Chunker

try:
    from chonkie import SentenceChunker
    CHONKIE_AVAILABLE = True
except ImportError:
    CHONKIE_AVAILABLE = False

from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.transforms.chunker.base import BaseChunk
from docling_core.transforms.chunker.hierarchical_chunker import (
    DocChunk,
    DocMeta,
    ChunkingDocSerializer,
    ChunkingSerializerProvider,
)
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling_core.transforms.serializer.base import (
    BaseDocSerializer,
    SerializationResult,
)
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.transforms.serializer.markdown import (
    MarkdownTableSerializer,
    MarkdownPictureSerializer,
    MarkdownParams,
)
from docling_core.types.doc.document import (
    DoclingDocument,
    PictureClassificationData,
    PictureDescriptionData,
    PictureMoleculeData,
    PictureItem,
)
from docling_core.types.doc.labels import DocItemLabel
from transformers import AutoTokenizer
from typing_extensions import override


class BaseSerializerProvider(ChunkingSerializerProvider):
    """Base class for serializer providers with common configuration."""

    def __init__(
        self,
        table_serializer: Optional[Any] = None,
        picture_serializer: Optional[Any] = None,
        params: Optional[MarkdownParams] = None,
    ):
        """
        Initialize the serializer provider.

        Args:
            table_serializer: Custom table serializer instance
            picture_serializer: Custom picture serializer instance
            params: Markdown serialization parameters
        """
        self.table_serializer = table_serializer
        self.picture_serializer = picture_serializer
        self.params = params or MarkdownParams()

    @abstractmethod
    def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
        """Get the configured serializer for the document."""
        pass


class DefaultSerializerProvider(BaseSerializerProvider):
    """Default serializer provider with standard settings."""

    def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
        """Get default serializer."""
        kwargs = {"doc": doc, "params": self.params}
        if self.table_serializer:
            kwargs["table_serializer"] = self.table_serializer
        if self.picture_serializer:
            kwargs["picture_serializer"] = self.picture_serializer
        return ChunkingDocSerializer(**kwargs)


class MDTableSerializerProvider(BaseSerializerProvider):
    """
    Serializer provider that uses Markdown format for tables.

    This provider converts tables to Markdown format instead of the default
    triplet notation, making them more human-readable.
    """

    def __init__(self, params: Optional[MarkdownParams] = None):
        """Initialize with Markdown table serializer."""
        super().__init__(
            table_serializer=MarkdownTableSerializer(),
            params=params,
        )

    def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
        """Get serializer with Markdown table formatting."""
        return ChunkingDocSerializer(
            doc=doc,
            table_serializer=self.table_serializer,
            params=self.params,
        )
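# A minimal sketch of the difference, assuming a one-row table with headers
# "City | Population" and row "Paris | 2.1M": the default triplet notation
# reads roughly "Paris, Population = 2.1M", while MDTableSerializerProvider
# yields a Markdown table:
#
#     | City  | Population |
#     |-------|------------|
#     | Paris | 2.1M       |
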
class ImgPlaceholderSerializerProvider(BaseSerializerProvider):
    """
    Serializer provider with customizable image placeholder.

    This provider allows you to specify a custom placeholder text for images
    in the serialized output.
    """

    def __init__(self, image_placeholder: str = "<!-- image -->"):
        """
        Initialize with custom image placeholder.

        Args:
            image_placeholder: Text to use as placeholder for images
        """
        super().__init__(
            params=MarkdownParams(image_placeholder=image_placeholder)
        )

    def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
        """Get serializer with custom image placeholder."""
        return ChunkingDocSerializer(doc=doc, params=self.params)


class AnnotationPictureSerializer(MarkdownPictureSerializer):
    """
    Picture serializer that leverages picture annotations.

    This serializer extracts and includes annotation information such as:
    - Picture classifications (predicted class)
    - Molecule data (SMILES notation)
    - Picture descriptions
    """

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """
        Serialize picture with annotations.

        Args:
            item: Picture item to serialize
            doc_serializer: Document serializer instance
            doc: Parent document
            **kwargs: Additional serialization arguments

        Returns:
            Serialization result with annotation text
        """
        text_parts: list[str] = []

        # Extract annotations
        for annotation in item.annotations:
            if isinstance(annotation, PictureClassificationData):
                predicted_class = (
                    annotation.predicted_classes[0].class_name
                    if annotation.predicted_classes
                    else None
                )
                if predicted_class is not None:
                    text_parts.append(f"Picture type: {predicted_class}")

            elif isinstance(annotation, PictureMoleculeData):
                text_parts.append(f"SMILES: {annotation.smi}")

            elif isinstance(annotation, PictureDescriptionData):
                text_parts.append(f"Picture description: {annotation.text}")

        # Join and post-process
        text_res = "\n".join(text_parts)
        text_res = doc_serializer.post_process(text=text_res)
        return create_ser_result(text=text_res, span_source=item)
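# A minimal sketch of the output above, assuming a PictureItem annotated with
# a classification ("chart"), a molecule (SMILES "C1=CC=CC=C1"), and a
# description; AnnotationPictureSerializer would then emit roughly:
#
#     Picture type: chart
#     SMILES: C1=CC=CC=C1
#     Picture description: Bar chart of quarterly revenue by region.
#
# Chunks built this way carry searchable text for pictures instead of a bare
# image placeholder.
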
class ImgAnnotationSerializerProvider(BaseSerializerProvider):
    """
    Serializer provider that includes picture annotations in output.

    This provider uses the AnnotationPictureSerializer to include rich
    annotation data for pictures in the chunked output.
    """

    def __init__(self):
        """Initialize with annotation picture serializer."""
        super().__init__(picture_serializer=AnnotationPictureSerializer())

    def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
        """Get serializer with picture annotation support."""
        return ChunkingDocSerializer(
            doc=doc,
            picture_serializer=self.picture_serializer,
        )


class AdvancedChunker(Chunker):
    """
    Advanced chunker with customizable serialization strategies.

    This class implements the Chunker interface and wraps the HybridChunker
    to provide customizable serialization strategies for various document elements.

    Attributes:
        tokenizer: Tokenizer instance for token counting
        hybrid_chunker: Underlying HybridChunker instance
        embed_model_id: Model ID for tokenization
        serializer_provider: Provider for custom serialization
        include_heading_markers: Whether to include markdown # markers in headings
        max_tokens: Maximum tokens per chunk
        merge_peers: Whether to merge adjacent small chunks
        min_chunk_tokens: Minimum tokens for a chunk to be kept standalone
    """

    # Default configuration for better chunking quality
    DEFAULT_MAX_TOKENS = 1024  # Larger chunks for better context
    DEFAULT_MERGE_PEERS = True  # Merge small adjacent chunks
    DEFAULT_MIN_CHUNK_TOKENS = 50  # Minimum tokens for standalone chunks

    def __init__(
        self,
        embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
        tokenizer: Optional[BaseTokenizer] = None,
        serializer_provider: Optional[ChunkingSerializerProvider] = None,
        strategy: Optional[str] = None,
        name: str = "advanced_chunker",
        include_heading_markers: bool = True,
        max_tokens: int = DEFAULT_MAX_TOKENS,
        merge_peers: bool = DEFAULT_MERGE_PEERS,
        min_chunk_tokens: int = DEFAULT_MIN_CHUNK_TOKENS,
        filter_toc: bool = True,
        **chunker_kwargs,
    ):
        """
        Initialize the advanced chunker.

        Args:
            embed_model_id: HuggingFace model ID for tokenization
            tokenizer: Optional custom tokenizer (if not provided, will be created)
            serializer_provider: Custom serializer provider for document elements
            strategy: Pre-configured strategy name (default, markdown_tables, etc.)
            name: Component name
            include_heading_markers: If True, adds markdown # markers to headings
                in contextualized output (default: True)
            max_tokens: Maximum tokens per chunk (default: 1024)
            merge_peers: If True, merges adjacent small chunks with same metadata (default: True)
            min_chunk_tokens: Minimum tokens for a chunk to be kept; smaller chunks
                will be merged with neighbors (default: 50)
            filter_toc: If True, filters out Table of Contents entries (default: True)
            **chunker_kwargs: Additional arguments for HybridChunker
        """
        super().__init__(name=name)

        self.embed_model_id = embed_model_id
        self.include_heading_markers = include_heading_markers
        self.max_tokens = max_tokens
        self.merge_peers = merge_peers
        self.min_chunk_tokens = min_chunk_tokens
        self.filter_toc = filter_toc

        # Handle strategy-based provider creation
        if strategy is not None and serializer_provider is None:
            serializer_provider = self._create_provider_from_strategy(
                strategy, **chunker_kwargs
            )

        self.serializer_provider = serializer_provider

        # Initialize tokenizer
        if tokenizer is None:
            self.tokenizer = HuggingFaceTokenizer(
                tokenizer=AutoTokenizer.from_pretrained(embed_model_id)
            )
        else:
            self.tokenizer = tokenizer

        # Initialize chunker with improved settings
        chunker_config = {
            "tokenizer": self.tokenizer,
            "max_tokens": max_tokens,
            "merge_peers": merge_peers,
        }
        if self.serializer_provider is not None:
            chunker_config["serializer_provider"] = self.serializer_provider
        chunker_config.update(chunker_kwargs)

        self.hybrid_chunker = HybridChunker(**chunker_config)

    def _create_provider_from_strategy(
        self, strategy: str, **kwargs
    ) -> ChunkingSerializerProvider:
        """Create a serializer provider from a strategy name."""
        provider_map = {
            "default": DefaultSerializerProvider,
            "markdown_tables": MDTableSerializerProvider,
            "custom_placeholder": ImgPlaceholderSerializerProvider,
            "annotations": ImgAnnotationSerializerProvider,
        }

        if strategy not in provider_map:
            raise ValueError(
                f"Unknown strategy: {strategy}. "
                f"Available: {list(provider_map.keys())}"
            )

        provider_class = provider_map[strategy]

        # Filter kwargs for provider initialization
        import inspect
        provider_sig = inspect.signature(provider_class.__init__)
        provider_params = set(provider_sig.parameters.keys()) - {"self"}
        provider_kwargs = {k: v for k, v in kwargs.items() if k in provider_params}

        return provider_class(**provider_kwargs)

    def run(self, documents: List[str]) -> List[str]:
        """
        Split documents into smaller chunks.

        This implementation expects documents to be already processed by Docling
        or similar tools. For raw text, it falls back to simple chunking.

        Args:
            documents: List of document strings to chunk

        Returns:
            List of chunk strings
        """
        chunks = []

        for doc_str in documents:
            # Try to parse as DoclingDocument JSON
            try:
                import json
                # Check if it's JSON format
                json.loads(doc_str)
                doc = DoclingDocument.model_validate_json(doc_str)
                # Use hybrid chunker for structured documents
                for chunk in self.hybrid_chunker.chunk(dl_doc=doc):
                    ctx_text = self.contextualize(chunk=chunk)
                    chunks.append(ctx_text)
            except Exception:
                # Fall back to simple text chunking for raw text using Chonkie
                chunk_results = self.chunk_text(doc_str)
                chunks.extend([chunk_info["text"] for chunk_info in chunk_results])

        return chunks
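    # A minimal sketch of run()'s two paths, assuming "report.docling.json"
    # holds a DoclingDocument exported as JSON (hypothetical file name):
    #
    #     chunker = AdvancedChunker(strategy="markdown_tables")
    #     docling_json = open("report.docling.json").read()
    #     chunks = chunker.run([docling_json, "Plain text, not JSON."])
    #
    # The first input is chunked structurally via HybridChunker and
    # contextualized; the second fails JSON parsing and falls back to the
    # sentence-based chunk_text() below.
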
    def chunk_text(
        self,
        text: str,
        chunk_size: int = 2048,
        chunk_overlap: int = 128,
        min_sentences_per_chunk: int = 1,
        tokenizer: str = "character",
    ) -> List[dict[str, Any]]:
        """
        Chunk raw text using the Chonkie library's SentenceChunker.

        This method provides a simpler alternative to the Docling-based chunking
        for plain text documents. It uses sentence-based chunking with configurable
        token limits and overlap.

        Args:
            text: Raw text to chunk
            chunk_size: Maximum tokens per chunk (default: 2048)
            chunk_overlap: Overlap between consecutive chunks in tokens (default: 128)
            min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
            tokenizer: Tokenizer to use - "character", "gpt2", or any HuggingFace tokenizer (default: "character")

        Returns:
            List of dictionaries with chunk information:
            - text: The chunk text
            - token_count: Number of tokens in the chunk
            - start_index: Starting character index in original text
            - end_index: Ending character index in original text

        Raises:
            ImportError: If chonkie is not installed

        Example:
            ```python
            chunker = AdvancedChunker()
            chunks = chunker.chunk_text(
                "Your long text here...",
                chunk_size=1024,
                chunk_overlap=64
            )

            for chunk_info in chunks:
                print(f"Text: {chunk_info['text']}")
                print(f"Tokens: {chunk_info['token_count']}")
            ```
        """
        if not CHONKIE_AVAILABLE:
            raise ImportError(
                "chonkie is not installed. Please install it with: "
                "pip install chonkie==1.4.2"
            )

        # Initialize the Chonkie SentenceChunker
        chonkie_chunker = SentenceChunker(
            tokenizer=tokenizer,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            min_sentences_per_chunk=min_sentences_per_chunk,
        )

        # Chunk the text
        chunks = chonkie_chunker(text)

        # Convert Chonkie chunks to our format
        result = []
        for chunk in chunks:
            chunk_info = {
                "text": chunk.text,
                "token_count": chunk.token_count,
                "start_index": chunk.start_index,
                "end_index": chunk.end_index,
            }
            result.append(chunk_info)

        return result

    def chunk_docling_document(
        self,
        dl_doc: DoclingDocument,
        post_process: bool = True
    ) -> Iterable[BaseChunk]:
        """
        Generate chunks from a Docling document.

        This is an advanced method for working directly with DoclingDocument objects.
        For the standard Chunker interface, use the run() method.

        Args:
            dl_doc: DoclingDocument to chunk
            post_process: If True, applies post-processing to filter TOC and merge
                small chunks (default: True)

        Returns:
            Iterable of BaseChunk objects
        """
        chunks = list(self.hybrid_chunker.chunk(dl_doc=dl_doc))

        if post_process:
            chunks = self._post_process_chunks(chunks)

        return chunks

    def _post_process_chunks(self, chunks: List[BaseChunk]) -> List[BaseChunk]:
        """
        Post-process chunks to improve quality.

        This method:
        1. Filters out Table of Contents entries
        2. Merges image-only chunks with adjacent content
        3. Merges incomplete table fragments with adjacent content
        4. Merges very small chunks with their neighbors
        5. Removes duplicate heading-only chunks

        Args:
            chunks: List of chunks to process

        Returns:
            Processed list of chunks
        """
        if not chunks:
            return chunks

        # First pass: Filter TOC and mark chunks for processing
        filtered_chunks = []
        # Chunks waiting to be merged (images, table fragments)
        pending_merge_chunks = []

        for chunk in chunks:
            # Filter TOC entries
            if self.filter_toc and self._is_toc_chunk(chunk):
                continue

            # Check if chunk is image-only or incomplete table fragment
            should_merge = (
                self._is_image_only_chunk(chunk) or
                self._is_incomplete_table_fragment(chunk)
            )

            if should_merge:
                # Accumulate chunks to merge with next content chunk
                pending_merge_chunks.append(chunk)
                continue

            # If we have pending chunks to merge, merge them with this chunk
            if pending_merge_chunks:
                # Prepend all pending chunks to this chunk
                merge_texts = [merge_chunk.text for merge_chunk in pending_merge_chunks]
                chunk.text = "\n".join(merge_texts) + "\n" + chunk.text
                pending_merge_chunks = []

            filtered_chunks.append(chunk)

        # If there are still pending chunks at the end, append to last chunk
        if pending_merge_chunks and filtered_chunks:
            last_chunk = filtered_chunks[-1]
            merge_texts = [merge_chunk.text for merge_chunk in pending_merge_chunks]
            last_chunk.text = last_chunk.text + "\n" + "\n".join(merge_texts)

        # Second pass: Merge small chunks
        processed = []

        for chunk in filtered_chunks:
            # Check if chunk is too small
            token_count = self.count_tokens(chunk.text)

            if token_count < self.min_chunk_tokens and processed:
                # Try to merge with previous chunk
                prev_chunk = processed[-1]
                merged_text = prev_chunk.text + "\n\n" + chunk.text
                merged_tokens = self.count_tokens(merged_text)

                # Only merge if it doesn't exceed max_tokens
                if merged_tokens <= self.max_tokens:
                    # Update the text while preserving the chunk structure
                    prev_chunk.text = merged_text
                    continue

            processed.append(chunk)

        return processed
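    # A minimal sketch of the two passes above, assuming a document that
    # yields five raw chunks:
    #
    #     raw:    [TOC page, "<!-- image -->", "Intro ...", "|---|---|", "Body ..."]
    #     pass 1: the TOC chunk is dropped; the image placeholder is prepended
    #             to "Intro ..." and the bare table separator to "Body ...".
    #     pass 2: any surviving chunk below min_chunk_tokens is folded into its
    #             predecessor, provided the merge stays within max_tokens.
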
    def _is_image_only_chunk(self, chunk: BaseChunk) -> bool:
        """
        Check if a chunk contains only image placeholders.

        Image-only chunks typically contain only:
        - <!-- image --> placeholders
        - Whitespace and newlines
        - No meaningful text content

        Args:
            chunk: Chunk to check

        Returns:
            True if chunk is image-only
        """
        text = chunk.text.strip()

        # Remove all image placeholders
        text_without_images = re.sub(
            r'<!--\s*image\s*-->', '', text, flags=re.IGNORECASE)
        text_without_images = text_without_images.strip()

        # If nothing remains after removing image placeholders, it's image-only
        if not text_without_images:
            return True

        # Also check for very short content that's just whitespace or punctuation
        # This catches cases where there might be a stray character
        if len(text_without_images) < 5 and not any(c.isalnum() for c in text_without_images):
            return True

        return False

    def _is_incomplete_table_fragment(self, chunk: BaseChunk) -> bool:
        """
        Check if a chunk contains an incomplete table fragment.

        Incomplete table fragments typically contain:
        - Only table separator lines (|---|---|)
        - Only table borders without content
        - Very short lines with mostly dashes and pipes
        - Single dash or pipe character lines

        Args:
            chunk: Chunk to check

        Returns:
            True if chunk is an incomplete table fragment
        """
        text = chunk.text.strip()

        # Remove heading markers to get the actual content
        lines = text.split('\n')
        content_lines = []

        for line in lines:
            # Skip heading lines (starting with #)
            stripped = line.strip()
            if not stripped.startswith('#'):
                content_lines.append(stripped)

        # If no content lines, not a table fragment
        if not content_lines:
            return False

        # Join content lines
        content = '\n'.join(content_lines).strip()

        # Check if it's only table separators (lines with |, -, and whitespace)
        # Pattern: lines containing mostly |, -, and spaces
        table_separator_pattern = r'^[\s\|\-]+$'

        # Check each content line
        separator_lines = 0
        total_content_lines = len(content_lines)

        for line in content_lines:
            if not line.strip():
                continue
            # Check if line is mostly table separators
            if re.match(table_separator_pattern, line):
                separator_lines += 1

        # If all non-empty lines are separators, it's an incomplete fragment
        if separator_lines > 0 and separator_lines == total_content_lines:
            return True

        # Check for very short content that's mostly punctuation
        # Remove all whitespace, pipes, and dashes
        content_cleaned = re.sub(r'[\s\|\-]', '', content)

        # If very little actual content remains (less than 10 chars),
        # and original has table markers, it's likely a fragment
        if len(content_cleaned) < 10 and ('|' in content or '---' in content):
            return True

        return False

    def _is_toc_chunk(self, chunk: BaseChunk) -> bool:
        """
        Check if a chunk is a Table of Contents entry.

        TOC entries typically:
        - Have "Table of Contents", "Table des matières", "Contents", "Sommaire" headings
        - Contain many dots (....) or dashes (----) as separators
        - Have page numbers at the end of lines

        Args:
            chunk: Chunk to check

        Returns:
            True if chunk appears to be a TOC entry
        """
        text = chunk.text.lower()

        # Check for TOC heading patterns
        toc_headings = [
            "table of contents",
            "table des matières",
            "contents",
            "sommaire",
            "índice",
            "inhaltsverzeichnis",
        ]

        # Get heading context if available
        doc_chunk = DocChunk.model_validate(chunk)
        headings = doc_chunk.meta.headings or []
        heading_text = " ".join(headings).lower()

        for toc_heading in toc_headings:
            if toc_heading in heading_text or toc_heading in text[:100]:
                # Additional check: TOC entries often have separator patterns
                # Like dots (....) or dashes (---) or pipe tables
                separator_count = (
                    text.count('....') +
                    text.count('----') +
                    text.count('|---')
                )

                # If has TOC heading and separator patterns, it's likely TOC
                if separator_count > 0:
                    return True

                # Also check for page number patterns at end of lines
                page_number_pattern = r'\d+\s*$|\d+\s*\|'
                lines = text.split('\n')
                page_number_lines = sum(
                    1 for line in lines
                    if re.search(page_number_pattern, line.strip())
                )

                # If most lines end with numbers, likely TOC
                if len(lines) > 1 and page_number_lines > len(lines) * 0.5:
                    return True

        return False
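    # A minimal sketch of a chunk the heuristic above flags, assuming dot
    # leaders and trailing page numbers:
    #
    #     Table of Contents
    #     1. Introduction .......... 3
    #     2. Methods ............... 7
    #
    # A chunk that merely mentions "contents" in prose, with no separator runs
    # and no page-number line endings, is kept.
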
    def chunk_from_markdown_file(
        self,
        md_file_path: str,
        contextualize: bool = True,
    ) -> List[dict[str, Any]]:
        """
        Chunk content directly from a markdown file using Docling.

        This method uses the Docling DocumentConverter to convert the markdown file
        and then chunks it using the HybridChunker. It provides a convenient way
        to process markdown files without manual conversion.

        Args:
            md_file_path: Path to the markdown file to chunk
            contextualize: If True, applies contextualization to add hierarchical
                context from headings (default: True)

        Returns:
            List of dictionaries with chunk information:
            - text: The chunk text (contextualized if enabled)
            - num_tokens: Number of tokens in the chunk
            - doc_items: List of document item references
            - chunk: The BaseChunk object

        Raises:
            ImportError: If docling is not installed
            FileNotFoundError: If the markdown file doesn't exist

        Example:
            ```python
            from advanced_chunker import AdvancedChunker

            # Create chunker
            chunker = AdvancedChunker(strategy="markdown_tables")

            # Chunk from markdown file
            chunks = chunker.chunk_from_markdown_file(
                md_file_path="/path/to/document.md",
                contextualize=True
            )

            # Access chunk information
            for chunk_info in chunks:
                print(f"Text: {chunk_info['text']}")
                print(f"Tokens: {chunk_info['num_tokens']}")
            ```
        """
        try:
            from docling.document_converter import DocumentConverter
        except ImportError:
            raise ImportError(
                "docling is not installed. Please install it with: "
                "pip install docling"
            )

        import os
        if not os.path.exists(md_file_path):
            raise FileNotFoundError(f"Markdown file not found: {md_file_path}")

        # Convert markdown file to DoclingDocument
        converter = DocumentConverter()
        result = converter.convert(source=md_file_path)
        dl_doc = result.document

        # Chunk the document
        chunks_list = []
        for chunk in self.hybrid_chunker.chunk(dl_doc=dl_doc):
            # Get contextualized text if requested
            if contextualize:
                chunk_text = self.contextualize(chunk=chunk)
            else:
                chunk_text = chunk.text

            # Get chunk information
            num_tokens = self.count_tokens(text=chunk_text)
            doc_chunk = DocChunk.model_validate(chunk)
            doc_items_refs = [it.self_ref for it in doc_chunk.meta.doc_items]

            chunk_info = {
                "text": chunk_text,
                "num_tokens": num_tokens,
                "doc_items": doc_items_refs,
                "chunk": chunk,
            }
            chunks_list.append(chunk_info)

        return chunks_list

    def count_tokens(self, text: str) -> int:
        """
        Count tokens in text.

        Args:
            text: Text to count tokens for

        Returns:
            Number of tokens
        """
        return self.tokenizer.count_tokens(text=text)

    def get_max_tokens(self) -> int:
        """
        Get maximum token limit for the tokenizer.

        Returns:
            Maximum number of tokens
        """
        return self.tokenizer.get_max_tokens()

    def contextualize(self, chunk: BaseChunk) -> str:
        """
        Contextualize a chunk by adding hierarchical context from headings.

        This method enriches the chunk text with context from parent headings
        and section titles, which improves RAG retrieval quality by providing
        more semantic context.

        If `include_heading_markers` is True, headings will be prefixed with
        markdown-style `#` markers based on their hierarchy level.

        Args:
            chunk: The chunk to contextualize

        Returns:
            Context-enriched text string

        Example:
            >>> for chunk in chunker.chunk_docling_document(dl_doc=doc):
            ...     enriched_text = chunker.contextualize(chunk=chunk)
            ...     # Use enriched_text for embedding
        """
        if not self.include_heading_markers:
            return self.hybrid_chunker.contextualize(chunk=chunk)

        # Custom contextualization with markdown heading markers
        doc_chunk = DocChunk.model_validate(chunk)
        meta = doc_chunk.meta

        items = []

        # Add headings with markdown markers
        if meta.headings:
            for i, heading in enumerate(meta.headings):
                # Level starts at 1 for first heading, increases for nested
                level = i + 1
                items.append(f"{'#' * level} {heading}")

        # Add the chunk text
        items.append(chunk.text)

        return self.hybrid_chunker.delim.join(items)
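    # A minimal sketch of the contextualized output, assuming a chunk nested
    # under the headings "User Guide" and "Installation" with
    # include_heading_markers=True:
    #
    #     # User Guide
    #     ## Installation
    #     <chunk text>
    #
    # With include_heading_markers=False the call delegates to
    # HybridChunker.contextualize, which prepends the same headings without
    # the markdown markers.
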
    @staticmethod
    def find_nth_chunk_with_label(
        chunks: Iterable[BaseChunk],
        n: int,
        label: DocItemLabel,
    ) -> tuple[Optional[int], Optional[DocChunk]]:
        """
        Find the n-th chunk containing a specific document item label.

        Args:
            chunks: Iterable of chunks to search
            n: Zero-based index of the chunk to find
            label: Document item label to search for

        Returns:
            Tuple of (chunk_index, chunk) or (None, None) if not found
        """
        num_found = -1
        for i, chunk in enumerate(chunks):
            doc_chunk = DocChunk.model_validate(chunk)
            for it in doc_chunk.meta.doc_items:
                if it.label == label:
                    num_found += 1
                    if num_found == n:
                        return i, doc_chunk
        return None, None

    def get_chunk_info(self, chunk: BaseChunk) -> dict[str, Any]:
        """
        Get detailed information about a chunk.

        Args:
            chunk: Chunk to analyze

        Returns:
            Dictionary with chunk information including:
            - text: Contextualized text
            - num_tokens: Token count
            - doc_items: List of document item references
        """
        ctx_text = self.contextualize(chunk=chunk)
        num_tokens = self.count_tokens(text=ctx_text)
        doc_chunk = DocChunk.model_validate(chunk)
        doc_items_refs = [it.self_ref for it in doc_chunk.meta.doc_items]

        return {
            "text": ctx_text,
            "num_tokens": num_tokens,
            "doc_items": doc_items_refs,
            "chunk": doc_chunk,
        }


# Convenience function for quick setup
def create_chunker(
    strategy: str = "default",
    embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
    include_heading_markers: bool = True,
    max_tokens: int = AdvancedChunker.DEFAULT_MAX_TOKENS,
    merge_peers: bool = AdvancedChunker.DEFAULT_MERGE_PEERS,
    min_chunk_tokens: int = AdvancedChunker.DEFAULT_MIN_CHUNK_TOKENS,
    filter_toc: bool = True,
    **kwargs,
) -> AdvancedChunker:
    """
    Create a pre-configured advanced chunker.

    Args:
        strategy: Chunking strategy to use:
            - "default": Default serialization
            - "markdown_tables": Markdown table formatting
            - "custom_placeholder": Custom image placeholder
            - "annotations": Include picture annotations
        embed_model_id: HuggingFace model ID for tokenization
        include_heading_markers: If True, adds markdown # markers to headings
            in contextualized output (default: True)
        max_tokens: Maximum tokens per chunk (default: 1024)
        merge_peers: If True, merges adjacent small chunks (default: True)
        min_chunk_tokens: Minimum tokens for standalone chunks (default: 50)
        filter_toc: If True, filters out Table of Contents entries (default: True)
        **kwargs: Additional arguments passed to strategy-specific providers

    Returns:
        Configured AdvancedChunker instance

    Example:
        ```python
        # Create chunker with markdown tables
        chunker = create_chunker(strategy="markdown_tables")

        # Create chunker with custom image placeholder
        chunker = create_chunker(
            strategy="custom_placeholder",
            image_placeholder="[IMAGE]"
        )

        # Create chunker with larger chunks and TOC filtering
        chunker = create_chunker(
            strategy="markdown_tables",
            max_tokens=2048,
            filter_toc=True
        )
        ```
    """
    provider_map = {
        "default": DefaultSerializerProvider,
        "markdown_tables": MDTableSerializerProvider,
        "custom_placeholder": ImgPlaceholderSerializerProvider,
        "annotations": ImgAnnotationSerializerProvider,
    }

    if strategy not in provider_map:
        raise ValueError(
            f"Unknown strategy: {strategy}. "
            f"Available: {list(provider_map.keys())}"
        )

    provider_class = provider_map[strategy]

    # Filter kwargs for provider initialization
    import inspect
    provider_sig = inspect.signature(provider_class.__init__)
    provider_params = set(provider_sig.parameters.keys()) - {"self"}
    provider_kwargs = {k: v for k, v in kwargs.items() if k in provider_params}

    provider = provider_class(**provider_kwargs)

    return AdvancedChunker(
        embed_model_id=embed_model_id,
        serializer_provider=provider,
        include_heading_markers=include_heading_markers,
        max_tokens=max_tokens,
        merge_peers=merge_peers,
        min_chunk_tokens=min_chunk_tokens,
        filter_toc=filter_toc,
    )