rakam_systems_vectorstore-0.1.1rc7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/chunker/advanced_chunker.py
@@ -0,0 +1,1019 @@
1
+ """
2
+ Advanced Chunker with Customizable Serialization
3
+
4
+ This module provides an advanced chunking system that allows customization of
5
+ serialization strategies for different document elements (tables, pictures, etc.)
6
+ during the chunking process.
7
+
8
+ Key Features:
9
+ - Hybrid chunking with customizable serialization
10
+ - Support for different table serialization formats (triplet, markdown, etc.)
11
+ - Configurable picture serialization with annotation support
12
+ - Token-aware chunking with contextual information
13
+ - Extensible serializer provider pattern
14
+
15
+ Usage Example:
16
+ ```python
17
+ from advanced_chunker import AdvancedChunker
18
+
19
+ # Create chunker with markdown tables
20
+ chunker = AdvancedChunker(strategy="markdown_tables")
21
+
22
+ # Chunk documents
23
+ documents = ["document text here"]
24
+ chunks = chunker.run(documents)
25
+ ```
26
+ """
27
+
28
+ from __future__ import annotations
29
+ import re
30
+ from typing import Any, Iterable, List, Optional, Type
31
+ from abc import abstractmethod
32
+
33
+ from rakam_systems_core.ai_core.interfaces.chunker import Chunker
34
+
35
+ try:
36
+ from chonkie import SentenceChunker
37
+ CHONKIE_AVAILABLE = True
38
+ except ImportError:
39
+ CHONKIE_AVAILABLE = False
40
+
41
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
42
+ from docling_core.transforms.chunker.base import BaseChunk
43
+ from docling_core.transforms.chunker.hierarchical_chunker import (
44
+ DocChunk,
45
+ DocMeta,
46
+ ChunkingDocSerializer,
47
+ ChunkingSerializerProvider,
48
+ )
49
+ from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
50
+ from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
51
+ from docling_core.transforms.serializer.base import (
52
+ BaseDocSerializer,
53
+ SerializationResult,
54
+ )
55
+ from docling_core.transforms.serializer.common import create_ser_result
56
+ from docling_core.transforms.serializer.markdown import (
57
+ MarkdownTableSerializer,
58
+ MarkdownPictureSerializer,
59
+ MarkdownParams,
60
+ )
61
+ from docling_core.types.doc.document import (
62
+ DoclingDocument,
63
+ PictureClassificationData,
64
+ PictureDescriptionData,
65
+ PictureMoleculeData,
66
+ PictureItem,
67
+ )
68
+ from docling_core.types.doc.labels import DocItemLabel
69
+ from transformers import AutoTokenizer
70
+ from typing_extensions import override
71
+
72
+
73
+ class BaseSerializerProvider(ChunkingSerializerProvider):
74
+ """Base class for serializer providers with common configuration."""
75
+
76
+ def __init__(
77
+ self,
78
+ table_serializer: Optional[Any] = None,
79
+ picture_serializer: Optional[Any] = None,
80
+ params: Optional[MarkdownParams] = None,
81
+ ):
82
+ """
83
+ Initialize the serializer provider.
84
+
85
+ Args:
86
+ table_serializer: Custom table serializer instance
87
+ picture_serializer: Custom picture serializer instance
88
+ params: Markdown serialization parameters
89
+ """
90
+ self.table_serializer = table_serializer
91
+ self.picture_serializer = picture_serializer
92
+ self.params = params or MarkdownParams()
93
+
94
+ @abstractmethod
95
+ def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
96
+ """Get the configured serializer for the document."""
97
+ pass
98
+
99
+
100
+ class DefaultSerializerProvider(BaseSerializerProvider):
101
+ """Default serializer provider with standard settings."""
102
+
103
+ def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
104
+ """Get default serializer."""
105
+ kwargs = {"doc": doc, "params": self.params}
106
+ if self.table_serializer:
107
+ kwargs["table_serializer"] = self.table_serializer
108
+ if self.picture_serializer:
109
+ kwargs["picture_serializer"] = self.picture_serializer
110
+ return ChunkingDocSerializer(**kwargs)
111
+
112
+
113
+ class MDTableSerializerProvider(BaseSerializerProvider):
114
+ """
115
+ Serializer provider that uses Markdown format for tables.
116
+
117
+ This provider converts tables to Markdown format instead of the default
118
+ triplet notation, making them more human-readable.
119
+ """
120
+
121
+ def __init__(self, params: Optional[MarkdownParams] = None):
122
+ """Initialize with Markdown table serializer."""
123
+ super().__init__(
124
+ table_serializer=MarkdownTableSerializer(),
125
+ params=params,
126
+ )
127
+
128
+ def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
129
+ """Get serializer with Markdown table formatting."""
130
+ return ChunkingDocSerializer(
131
+ doc=doc,
132
+ table_serializer=self.table_serializer,
133
+ params=self.params,
134
+ )
135
+
136
+
137
+ class ImgPlaceholderSerializerProvider(BaseSerializerProvider):
138
+ """
139
+ Serializer provider with customizable image placeholder.
140
+
141
+ This provider allows you to specify a custom placeholder text for images
142
+ in the serialized output.
143
+ """
144
+
145
+ def __init__(self, image_placeholder: str = "<!-- image -->"):
146
+ """
147
+ Initialize with custom image placeholder.
148
+
149
+ Args:
150
+ image_placeholder: Text to use as placeholder for images
151
+ """
152
+ super().__init__(
153
+ params=MarkdownParams(image_placeholder=image_placeholder)
154
+ )
155
+
156
+ def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
157
+ """Get serializer with custom image placeholder."""
158
+ return ChunkingDocSerializer(doc=doc, params=self.params)
159
+
160
+
161
+ class AnnotationPictureSerializer(MarkdownPictureSerializer):
162
+ """
163
+ Picture serializer that leverages picture annotations.
164
+
165
+ This serializer extracts and includes annotation information such as:
166
+ - Picture classifications (predicted class)
167
+ - Molecule data (SMILES notation)
168
+ - Picture descriptions
169
+ """
170
+
171
+ @override
172
+ def serialize(
173
+ self,
174
+ *,
175
+ item: PictureItem,
176
+ doc_serializer: BaseDocSerializer,
177
+ doc: DoclingDocument,
178
+ **kwargs: Any,
179
+ ) -> SerializationResult:
180
+ """
181
+ Serialize picture with annotations.
182
+
183
+ Args:
184
+ item: Picture item to serialize
185
+ doc_serializer: Document serializer instance
186
+ doc: Parent document
187
+ **kwargs: Additional serialization arguments
188
+
189
+ Returns:
190
+ Serialization result with annotation text
191
+ """
192
+ text_parts: list[str] = []
193
+
194
+ # Extract annotations
195
+ for annotation in item.annotations:
196
+ if isinstance(annotation, PictureClassificationData):
197
+ predicted_class = (
198
+ annotation.predicted_classes[0].class_name
199
+ if annotation.predicted_classes
200
+ else None
201
+ )
202
+ if predicted_class is not None:
203
+ text_parts.append(f"Picture type: {predicted_class}")
204
+
205
+ elif isinstance(annotation, PictureMoleculeData):
206
+ text_parts.append(f"SMILES: {annotation.smi}")
207
+
208
+ elif isinstance(annotation, PictureDescriptionData):
209
+ text_parts.append(f"Picture description: {annotation.text}")
210
+
211
+ # Join and post-process
212
+ text_res = "\n".join(text_parts)
213
+ text_res = doc_serializer.post_process(text=text_res)
214
+ return create_ser_result(text=text_res, span_source=item)
215
+
216
+
217
+ class ImgAnnotationSerializerProvider(BaseSerializerProvider):
218
+ """
219
+ Serializer provider that includes picture annotations in output.
220
+
221
+ This provider uses the AnnotationPictureSerializer to include rich
222
+ annotation data for pictures in the chunked output.
223
+ """
224
+
225
+ def __init__(self):
226
+ """Initialize with annotation picture serializer."""
227
+ super().__init__(picture_serializer=AnnotationPictureSerializer())
228
+
229
+ def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
230
+ """Get serializer with picture annotation support."""
231
+ return ChunkingDocSerializer(
232
+ doc=doc,
233
+ picture_serializer=self.picture_serializer,
234
+ )
235
+
236
+
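+ # A minimal sketch of a custom provider (illustrative, not part of the released
+ # API): the providers above each swap out a single serializer, but
+ # BaseSerializerProvider accepts both, so a project-specific subclass can combine
+ # Markdown tables with annotation-aware pictures.
+ #
+ # class TablesAndAnnotationsProvider(BaseSerializerProvider):
+ #     def __init__(self, params: Optional[MarkdownParams] = None):
+ #         super().__init__(
+ #             table_serializer=MarkdownTableSerializer(),
+ #             picture_serializer=AnnotationPictureSerializer(),
+ #             params=params,
+ #         )
+ #
+ #     def get_serializer(self, doc: DoclingDocument) -> ChunkingDocSerializer:
+ #         return ChunkingDocSerializer(
+ #             doc=doc,
+ #             table_serializer=self.table_serializer,
+ #             picture_serializer=self.picture_serializer,
+ #             params=self.params,
+ #         )
+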
237
+ class AdvancedChunker(Chunker):
238
+ """
239
+ Advanced chunker with customizable serialization strategies.
240
+
241
+ This class implements the Chunker interface and wraps the HybridChunker
242
+ to provide customizable serialization strategies for various document elements.
243
+
244
+ Attributes:
245
+ tokenizer: Tokenizer instance for token counting
246
+ hybrid_chunker: Underlying HybridChunker instance
247
+ embed_model_id: Model ID for tokenization
248
+ serializer_provider: Provider for custom serialization
249
+ include_heading_markers: Whether to include markdown # markers in headings
250
+ max_tokens: Maximum tokens per chunk
251
+ merge_peers: Whether to merge adjacent small chunks
252
+ min_chunk_tokens: Minimum tokens for a chunk to be kept standalone
253
+ """
254
+
255
+ # Default configuration for better chunking quality
256
+ DEFAULT_MAX_TOKENS = 1024 # Larger chunks for better context
257
+ DEFAULT_MERGE_PEERS = True # Merge small adjacent chunks
258
+ DEFAULT_MIN_CHUNK_TOKENS = 50 # Minimum tokens for standalone chunks
259
+
260
+ def __init__(
261
+ self,
262
+ embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
263
+ tokenizer: Optional[BaseTokenizer] = None,
264
+ serializer_provider: Optional[ChunkingSerializerProvider] = None,
265
+ strategy: Optional[str] = None,
266
+ name: str = "advanced_chunker",
267
+ include_heading_markers: bool = True,
268
+ max_tokens: int = DEFAULT_MAX_TOKENS,
269
+ merge_peers: bool = DEFAULT_MERGE_PEERS,
270
+ min_chunk_tokens: int = DEFAULT_MIN_CHUNK_TOKENS,
271
+ filter_toc: bool = True,
272
+ **chunker_kwargs,
273
+ ):
274
+ """
275
+ Initialize the advanced chunker.
276
+
277
+ Args:
278
+ embed_model_id: HuggingFace model ID for tokenization
279
+ tokenizer: Optional custom tokenizer (if not provided, will be created)
280
+ serializer_provider: Custom serializer provider for document elements
281
+ strategy: Pre-configured strategy name (default, markdown_tables, etc.)
282
+ name: Component name
283
+ include_heading_markers: If True, adds markdown # markers to headings
284
+ in contextualized output (default: True)
285
+ max_tokens: Maximum tokens per chunk (default: 1024)
286
+ merge_peers: If True, merges adjacent small chunks with the same metadata (default: True)
287
+ min_chunk_tokens: Minimum tokens for a chunk to be kept standalone; smaller
288
+ chunks are merged with neighbors (default: 50)
289
+ filter_toc: If True, filters out Table of Contents entries (default: True)
290
+ **chunker_kwargs: Additional arguments for HybridChunker
291
+ """
292
+ super().__init__(name=name)
293
+
294
+ self.embed_model_id = embed_model_id
295
+ self.include_heading_markers = include_heading_markers
296
+ self.max_tokens = max_tokens
297
+ self.merge_peers = merge_peers
298
+ self.min_chunk_tokens = min_chunk_tokens
299
+ self.filter_toc = filter_toc
300
+
301
+ # Handle strategy-based provider creation
302
+ if strategy is not None and serializer_provider is None:
303
+ serializer_provider = self._create_provider_from_strategy(
304
+ strategy, **chunker_kwargs
305
+ )
306
+
307
+ self.serializer_provider = serializer_provider
308
+
309
+ # Initialize tokenizer
310
+ if tokenizer is None:
311
+ self.tokenizer = HuggingFaceTokenizer(
312
+ tokenizer=AutoTokenizer.from_pretrained(embed_model_id)
313
+ )
314
+ else:
315
+ self.tokenizer = tokenizer
316
+
317
+ # Initialize chunker with improved settings
318
+ chunker_config = {
319
+ "tokenizer": self.tokenizer,
320
+ "max_tokens": max_tokens,
321
+ "merge_peers": merge_peers,
322
+ }
323
+ if self.serializer_provider is not None:
324
+ chunker_config["serializer_provider"] = self.serializer_provider
325
+ chunker_config.update(chunker_kwargs)
326
+
327
+ self.hybrid_chunker = HybridChunker(**chunker_config)
328
+
329
+ def _create_provider_from_strategy(
330
+ self, strategy: str, **kwargs
331
+ ) -> ChunkingSerializerProvider:
332
+ """Create a serializer provider from a strategy name."""
333
+ provider_map = {
334
+ "default": DefaultSerializerProvider,
335
+ "markdown_tables": MDTableSerializerProvider,
336
+ "custom_placeholder": ImgPlaceholderSerializerProvider,
337
+ "annotations": ImgAnnotationSerializerProvider,
338
+ }
339
+
340
+ if strategy not in provider_map:
341
+ raise ValueError(
342
+ f"Unknown strategy: {strategy}. "
343
+ f"Available: {list(provider_map.keys())}"
344
+ )
345
+
346
+ provider_class = provider_map[strategy]
347
+
348
+ # Filter kwargs for provider initialization
349
+ import inspect
350
+ provider_sig = inspect.signature(provider_class.__init__)
351
+ provider_params = set(provider_sig.parameters.keys()) - {"self"}
352
+ provider_kwargs = {
353
+ k: v for k, v in kwargs.items() if k in provider_params}
354
+
355
+ return provider_class(**provider_kwargs)
356
+
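+ # Construction sketch (illustrative): passing a strategy name is equivalent to
+ # passing the corresponding provider explicitly, so
+ #
+ #     chunker = AdvancedChunker(strategy="markdown_tables", max_tokens=512)
+ #
+ # behaves the same as
+ #
+ #     chunker = AdvancedChunker(
+ #         serializer_provider=MDTableSerializerProvider(),
+ #         max_tokens=512,
+ #     )
+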
357
+ def run(self, documents: List[str]) -> List[str]:
358
+ """
359
+ Split documents into smaller chunks.
360
+
361
+ This implementation expects documents to be already processed by Docling
362
+ or similar tools. For raw text, it falls back to simple chunking.
363
+
364
+ Args:
365
+ documents: List of document strings to chunk
366
+
367
+ Returns:
368
+ List of chunk strings
369
+ """
370
+ chunks = []
371
+
372
+ for doc_str in documents:
373
+ # Try to parse as DoclingDocument JSON
374
+ try:
375
+ import json
376
+ # Check if it's JSON format
377
+ json.loads(doc_str)
378
+ doc = DoclingDocument.model_validate_json(doc_str)
379
+ # Use hybrid chunker for structured documents
380
+ for chunk in self.hybrid_chunker.chunk(dl_doc=doc):
381
+ ctx_text = self.contextualize(chunk=chunk)
382
+ chunks.append(ctx_text)
383
+ except Exception:
384
+ # Fall back to simple text chunking for raw text using Chonkie
385
+ chunk_results = self.chunk_text(doc_str)
386
+ chunks.extend(
387
+ [chunk_info["text"] for chunk_info in chunk_results])
388
+
389
+ return chunks
390
+
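+ # Usage sketch for run(): a JSON-serialized DoclingDocument is routed through
+ # the hybrid chunker, while plain text falls back to chunk_text(), which needs
+ # the optional chonkie dependency. Assumes `doc` is a DoclingDocument built
+ # elsewhere (e.g. by one of the package's loaders).
+ #
+ #     chunker = AdvancedChunker(strategy="default")
+ #     structured = doc.model_dump_json()   # DoclingDocument -> JSON string
+ #     chunk_texts = chunker.run([structured, "Plain text is chunked by Chonkie."])
+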
391
+ def chunk_text(
392
+ self,
393
+ text: str,
394
+ chunk_size: int = 2048,
395
+ chunk_overlap: int = 128,
396
+ min_sentences_per_chunk: int = 1,
397
+ tokenizer: str = "character",
398
+ ) -> List[dict[str, Any]]:
399
+ """
400
+ Chunk raw text using the Chonkie library's SentenceChunker.
401
+
402
+ This method provides a simpler alternative to the Docling-based chunking
403
+ for plain text documents. It uses sentence-based chunking with configurable
404
+ token limits and overlap.
405
+
406
+ Args:
407
+ text: Raw text to chunk
408
+ chunk_size: Maximum tokens per chunk (default: 2048)
409
+ chunk_overlap: Overlap between consecutive chunks in tokens (default: 128)
410
+ min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
411
+ tokenizer: Tokenizer to use - "character", "gpt2", or any HuggingFace tokenizer (default: "character")
412
+
413
+ Returns:
414
+ List of dictionaries with chunk information:
415
+ - text: The chunk text
416
+ - token_count: Number of tokens in the chunk
417
+ - start_index: Starting character index in original text
418
+ - end_index: Ending character index in original text
419
+
420
+ Raises:
421
+ ImportError: If chonkie is not installed
422
+
423
+ Example:
424
+ ```python
425
+ chunker = AdvancedChunker()
426
+ chunks = chunker.chunk_text(
427
+ "Your long text here...",
428
+ chunk_size=1024,
429
+ chunk_overlap=64
430
+ )
431
+
432
+ for chunk_info in chunks:
433
+ print(f"Text: {chunk_info['text']}")
434
+ print(f"Tokens: {chunk_info['token_count']}")
435
+ ```
436
+ """
437
+ if not CHONKIE_AVAILABLE:
438
+ raise ImportError(
439
+ "chonkie is not installed. Please install it with: "
440
+ "pip install chonkie==1.4.2"
441
+ )
442
+
443
+ # Initialize the Chonkie SentenceChunker
444
+ chonkie_chunker = SentenceChunker(
445
+ tokenizer=tokenizer,
446
+ chunk_size=chunk_size,
447
+ chunk_overlap=chunk_overlap,
448
+ min_sentences_per_chunk=min_sentences_per_chunk,
449
+ )
450
+
451
+ # Chunk the text
452
+ chunks = chonkie_chunker(text)
453
+
454
+ # Convert Chonkie chunks to our format
455
+ result = []
456
+ for chunk in chunks:
457
+ chunk_info = {
458
+ "text": chunk.text,
459
+ "token_count": chunk.token_count,
460
+ "start_index": chunk.start_index,
461
+ "end_index": chunk.end_index,
462
+ }
463
+ result.append(chunk_info)
464
+
465
+ return result
466
+
467
+ def chunk_docling_document(
468
+ self,
469
+ dl_doc: DoclingDocument,
470
+ post_process: bool = True
471
+ ) -> Iterable[BaseChunk]:
472
+ """
473
+ Generate chunks from a Docling document.
474
+
475
+ This is an advanced method for working directly with DoclingDocument objects.
476
+ For the standard Chunker interface, use the run() method.
477
+
478
+ Args:
479
+ dl_doc: DoclingDocument to chunk
480
+ post_process: If True, applies post-processing to filter TOC and merge
481
+ small chunks (default: True)
482
+
483
+ Returns:
484
+ Iterable of BaseChunk objects
485
+ """
486
+ chunks = list(self.hybrid_chunker.chunk(dl_doc=dl_doc))
487
+
488
+ if post_process:
489
+ chunks = self._post_process_chunks(chunks)
490
+
491
+ return chunks
492
+
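+ # Sketch: converting a file with Docling and chunking it with post-processing
+ # enabled. Assumes the optional docling package is installed; the conversion
+ # step mirrors chunk_from_markdown_file() below.
+ #
+ #     from docling.document_converter import DocumentConverter
+ #
+ #     dl_doc = DocumentConverter().convert(source="report.pdf").document
+ #     for chunk in chunker.chunk_docling_document(dl_doc, post_process=True):
+ #         print(chunker.contextualize(chunk=chunk))
+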
493
+ def _post_process_chunks(self, chunks: List[BaseChunk]) -> List[BaseChunk]:
494
+ """
495
+ Post-process chunks to improve quality.
496
+
497
+ This method:
498
+ 1. Filters out Table of Contents entries
499
+ 2. Merges image-only chunks with adjacent content
500
+ 3. Merges incomplete table fragments with adjacent content
501
+ 4. Merges very small chunks with their neighbors
502
+ 5. Removes duplicate heading-only chunks
503
+
504
+ Args:
505
+ chunks: List of chunks to process
506
+
507
+ Returns:
508
+ Processed list of chunks
509
+ """
510
+ if not chunks:
511
+ return chunks
512
+
513
+ # First pass: Filter TOC and mark chunks for processing
514
+ filtered_chunks = []
515
+ # Chunks waiting to be merged (images, table fragments)
516
+ pending_merge_chunks = []
517
+
518
+ for chunk in chunks:
519
+ # Filter TOC entries
520
+ if self.filter_toc and self._is_toc_chunk(chunk):
521
+ continue
522
+
523
+ # Check if chunk is image-only or incomplete table fragment
524
+ should_merge = (
525
+ self._is_image_only_chunk(chunk) or
526
+ self._is_incomplete_table_fragment(chunk)
527
+ )
528
+
529
+ if should_merge:
530
+ # Accumulate chunks to merge with next content chunk
531
+ pending_merge_chunks.append(chunk)
532
+ continue
533
+
534
+ # If we have pending chunks to merge, merge them with this chunk
535
+ if pending_merge_chunks:
536
+ # Prepend all pending chunks to this chunk
537
+ merge_texts = [
538
+ merge_chunk.text for merge_chunk in pending_merge_chunks]
539
+ chunk.text = "\n".join(merge_texts) + "\n" + chunk.text
540
+ pending_merge_chunks = []
541
+
542
+ filtered_chunks.append(chunk)
543
+
544
+ # If there are still pending chunks at the end, append to last chunk
545
+ if pending_merge_chunks and filtered_chunks:
546
+ last_chunk = filtered_chunks[-1]
547
+ merge_texts = [
548
+ merge_chunk.text for merge_chunk in pending_merge_chunks]
549
+ last_chunk.text = last_chunk.text + "\n" + "\n".join(merge_texts)
550
+
551
+ # Second pass: Merge small chunks
552
+ processed = []
553
+
554
+ for chunk in filtered_chunks:
555
+ # Check if chunk is too small
556
+ token_count = self.count_tokens(chunk.text)
557
+
558
+ if token_count < self.min_chunk_tokens and processed:
559
+ # Try to merge with previous chunk
560
+ prev_chunk = processed[-1]
561
+ merged_text = prev_chunk.text + "\n\n" + chunk.text
562
+ merged_tokens = self.count_tokens(merged_text)
563
+
564
+ # Only merge if it doesn't exceed max_tokens
565
+ if merged_tokens <= self.max_tokens:
566
+ # Update the text while preserving the chunk structure
567
+ prev_chunk.text = merged_text
568
+ continue
569
+
570
+ processed.append(chunk)
571
+
572
+ return processed
573
+
574
+ def _is_image_only_chunk(self, chunk: BaseChunk) -> bool:
575
+ """
576
+ Check if a chunk contains only image placeholders.
577
+
578
+ Image-only chunks typically contain only:
579
+ - <!-- image --> placeholders
580
+ - Whitespace and newlines
581
+ - No meaningful text content
582
+
583
+ Args:
584
+ chunk: Chunk to check
585
+
586
+ Returns:
587
+ True if chunk is image-only
588
+ """
589
+ text = chunk.text.strip()
590
+
591
+ # Remove all image placeholders
592
+ text_without_images = re.sub(
593
+ r'<!--\s*image\s*-->', '', text, flags=re.IGNORECASE)
594
+ text_without_images = text_without_images.strip()
595
+
596
+ # If nothing remains after removing image placeholders, it's image-only
597
+ if not text_without_images:
598
+ return True
599
+
600
+ # Also check for very short content that's just whitespace or punctuation
601
+ # This catches cases where there might be a stray character
602
+ if len(text_without_images) < 5 and not any(c.isalnum() for c in text_without_images):
603
+ return True
604
+
605
+ return False
606
+
607
+ def _is_incomplete_table_fragment(self, chunk: BaseChunk) -> bool:
608
+ """
609
+ Check if a chunk contains an incomplete table fragment.
610
+
611
+ Incomplete table fragments typically contain:
612
+ - Only table separator lines (|---|---|)
613
+ - Only table borders without content
614
+ - Very short lines with mostly dashes and pipes
615
+ - Single dash or pipe character lines
616
+
617
+ Args:
618
+ chunk: Chunk to check
619
+
620
+ Returns:
621
+ True if chunk is an incomplete table fragment
622
+ """
623
+ text = chunk.text.strip()
624
+
625
+ # Remove heading markers to get the actual content
626
+ lines = text.split('\n')
627
+ content_lines = []
628
+
629
+ for line in lines:
630
+ # Skip heading lines (starting with #)
631
+ stripped = line.strip()
632
+ if not stripped.startswith('#'):
633
+ content_lines.append(stripped)
634
+
635
+ # If no content lines, not a table fragment
636
+ if not content_lines:
637
+ return False
638
+
639
+ # Join content lines
640
+ content = '\n'.join(content_lines).strip()
641
+
642
+ # Check if it's only table separators (lines with |, -, and whitespace)
643
+ # Pattern: lines containing mostly |, -, and spaces
644
+ table_separator_pattern = r'^[\s\|\-]+$'
645
+
646
+ # Check each content line
647
+ separator_lines = 0
648
+ total_content_lines = len(content_lines)
649
+
650
+ for line in content_lines:
651
+ if not line.strip():
652
+ continue
653
+ # Check if line is mostly table separators
654
+ if re.match(table_separator_pattern, line):
655
+ separator_lines += 1
656
+
657
+ # If all non-empty lines are separators, it's an incomplete fragment
658
+ if separator_lines > 0 and separator_lines == total_content_lines:
659
+ return True
660
+
661
+ # Check for very short content that's mostly punctuation
662
+ # Remove all whitespace, pipes, and dashes
663
+ content_cleaned = re.sub(r'[\s\|\-]', '', content)
664
+
665
+ # If very little actual content remains (less than 10 chars),
666
+ # and original has table markers, it's likely a fragment
667
+ if len(content_cleaned) < 10 and ('|' in content or '---' in content):
668
+ return True
669
+
670
+ return False
671
+
672
+ def _is_toc_chunk(self, chunk: BaseChunk) -> bool:
673
+ """
674
+ Check if a chunk is a Table of Contents entry.
675
+
676
+ TOC entries typically:
677
+ - Have "Table of Contents", "Table des matières", "Contents", "Sommaire" headings
678
+ - Contain many dots (....) or dashes (----) as separators
679
+ - Have page numbers at the end of lines
680
+
681
+ Args:
682
+ chunk: Chunk to check
683
+
684
+ Returns:
685
+ True if chunk appears to be a TOC entry
686
+ """
687
+ text = chunk.text.lower()
688
+
689
+ # Check for TOC heading patterns
690
+ toc_headings = [
691
+ "table of contents",
692
+ "table des matières",
693
+ "contents",
694
+ "sommaire",
695
+ "índice",
696
+ "inhaltsverzeichnis",
697
+ ]
698
+
699
+ # Get heading context if available
700
+ doc_chunk = DocChunk.model_validate(chunk)
701
+ headings = doc_chunk.meta.headings or []
702
+ heading_text = " ".join(headings).lower()
703
+
704
+ for toc_heading in toc_headings:
705
+ if toc_heading in heading_text or toc_heading in text[:100]:
706
+ # Additional check: TOC entries often have separator patterns
707
+ # Like dots (....) or dashes (---) or pipe tables
708
+ separator_count = (
709
+ text.count('....') +
710
+ text.count('----') +
711
+ text.count('|---')
712
+ )
713
+
714
+ # If has TOC heading and separator patterns, it's likely TOC
715
+ if separator_count > 0:
716
+ return True
717
+
718
+ # Also check for page number patterns at end of lines
719
+ page_number_pattern = r'\d+\s*$|\d+\s*\|'
720
+ lines = text.split('\n')
721
+ page_number_lines = sum(
722
+ 1 for line in lines
723
+ if re.search(page_number_pattern, line.strip())
724
+ )
725
+
726
+ # If most lines end with numbers, likely TOC
727
+ if len(lines) > 1 and page_number_lines > len(lines) * 0.5:
728
+ return True
729
+
730
+ return False
731
+
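+ # Rough illustration of the three heuristics above (chunk.text values only; the
+ # checks themselves receive BaseChunk objects, so this is not runnable as-is):
+ #
+ #     "<!-- image -->\n<!-- image -->"      -> _is_image_only_chunk: True
+ #     "|---|---|\n| - |"                    -> _is_incomplete_table_fragment: True
+ #     "Table of Contents\nIntro ...... 3"   -> _is_toc_chunk: True
+ #
+ # Ordinary prose fails all three checks and is kept unchanged.
+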
732
+ def chunk_from_markdown_file(
733
+ self,
734
+ md_file_path: str,
735
+ contextualize: bool = True,
736
+ ) -> List[dict[str, Any]]:
737
+ """
738
+ Chunk content directly from a markdown file using Docling.
739
+
740
+ This method uses the Docling DocumentConverter to convert the markdown file
741
+ and then chunks it using the HybridChunker. It provides a convenient way
742
+ to process markdown files without manual conversion.
743
+
744
+ Args:
745
+ md_file_path: Path to the markdown file to chunk
746
+ contextualize: If True, applies contextualization to add hierarchical
747
+ context from headings (default: True)
748
+
749
+ Returns:
750
+ List of dictionaries with chunk information:
751
+ - text: The chunk text (contextualized if enabled)
752
+ - num_tokens: Number of tokens in the chunk
753
+ - doc_items: List of document item references
754
+ - chunk: The BaseChunk object
755
+
756
+ Raises:
757
+ ImportError: If docling is not installed
758
+ FileNotFoundError: If the markdown file doesn't exist
759
+
760
+ Example:
761
+ ```python
762
+ from advanced_chunker import AdvancedChunker
763
+
764
+ # Create chunker
765
+ chunker = AdvancedChunker(strategy="markdown_tables")
766
+
767
+ # Chunk from markdown file
768
+ chunks = chunker.chunk_from_markdown_file(
769
+ md_file_path="/path/to/document.md",
770
+ contextualize=True
771
+ )
772
+
773
+ # Access chunk information
774
+ for chunk_info in chunks:
775
+ print(f"Text: {chunk_info['text']}")
776
+ print(f"Tokens: {chunk_info['num_tokens']}")
777
+ ```
778
+ """
779
+ try:
780
+ from docling.document_converter import DocumentConverter
781
+ except ImportError:
782
+ raise ImportError(
783
+ "docling is not installed. Please install it with: "
784
+ "pip install docling"
785
+ )
786
+
787
+ import os
788
+ if not os.path.exists(md_file_path):
789
+ raise FileNotFoundError(f"Markdown file not found: {md_file_path}")
790
+
791
+ # Convert markdown file to DoclingDocument
792
+ converter = DocumentConverter()
793
+ result = converter.convert(source=md_file_path)
794
+ dl_doc = result.document
795
+
796
+ # Chunk the document
797
+ chunks_list = []
798
+ for chunk in self.hybrid_chunker.chunk(dl_doc=dl_doc):
799
+ # Get contextualized text if requested
800
+ if contextualize:
801
+ chunk_text = self.contextualize(chunk=chunk)
802
+ else:
803
+ chunk_text = chunk.text
804
+
805
+ # Get chunk information
806
+ num_tokens = self.count_tokens(text=chunk_text)
807
+ doc_chunk = DocChunk.model_validate(chunk)
808
+ doc_items_refs = [it.self_ref for it in doc_chunk.meta.doc_items]
809
+
810
+ chunk_info = {
811
+ "text": chunk_text,
812
+ "num_tokens": num_tokens,
813
+ "doc_items": doc_items_refs,
814
+ "chunk": chunk,
815
+ }
816
+ chunks_list.append(chunk_info)
817
+
818
+ return chunks_list
819
+
820
+ def count_tokens(self, text: str) -> int:
821
+ """
822
+ Count tokens in text.
823
+
824
+ Args:
825
+ text: Text to count tokens for
826
+
827
+ Returns:
828
+ Number of tokens
829
+ """
830
+ return self.tokenizer.count_tokens(text=text)
831
+
832
+ def get_max_tokens(self) -> int:
833
+ """
834
+ Get maximum token limit for the tokenizer.
835
+
836
+ Returns:
837
+ Maximum number of tokens
838
+ """
839
+ return self.tokenizer.get_max_tokens()
840
+
841
+ def contextualize(self, chunk: BaseChunk) -> str:
842
+ """
843
+ Contextualize a chunk by adding hierarchical context from headings.
844
+
845
+ This method enriches the chunk text with context from parent headings
846
+ and section titles, which improves RAG retrieval quality by providing
847
+ more semantic context.
848
+
849
+ If `include_heading_markers` is True, headings will be prefixed with
850
+ markdown-style `#` markers based on their hierarchy level.
851
+
852
+ Args:
853
+ chunk: The chunk to contextualize
854
+
855
+ Returns:
856
+ Context-enriched text string
857
+
858
+ Example:
859
+ >>> for chunk in chunker.chunk_docling_document(dl_doc=doc):
860
+ ... enriched_text = chunker.contextualize(chunk=chunk)
861
+ ... # Use enriched_text for embedding
862
+ """
863
+ if not self.include_heading_markers:
864
+ return self.hybrid_chunker.contextualize(chunk=chunk)
865
+
866
+ # Custom contextualization with markdown heading markers
867
+ doc_chunk = DocChunk.model_validate(chunk)
868
+ meta = doc_chunk.meta
869
+
870
+ items = []
871
+
872
+ # Add headings with markdown markers
873
+ if meta.headings:
874
+ for i, heading in enumerate(meta.headings):
875
+ # Level starts at 1 for first heading, increases for nested
876
+ level = i + 1
877
+ items.append(f"{'#' * level} {heading}")
878
+
879
+ # Add the chunk text
880
+ items.append(chunk.text)
881
+
882
+ return self.hybrid_chunker.delim.join(items)
883
+
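+ # Shape of the contextualized text (illustrative): for a chunk whose metadata
+ # carries headings ["User Guide", "Installation"] and whose text is
+ # "Run pip install rakam-systems-vectorstore", include_heading_markers=True
+ # produces roughly
+ #
+ #     # User Guide
+ #     ## Installation
+ #     Run pip install rakam-systems-vectorstore
+ #
+ # with the pieces joined by the underlying HybridChunker's delimiter.
+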
884
+ @staticmethod
885
+ def find_nth_chunk_with_label(
886
+ chunks: Iterable[BaseChunk],
887
+ n: int,
888
+ label: DocItemLabel,
889
+ ) -> tuple[Optional[int], Optional[DocChunk]]:
890
+ """
891
+ Find the n-th chunk containing a specific document item label.
892
+
893
+ Args:
894
+ chunks: Iterable of chunks to search
895
+ n: Zero-based index of the chunk to find
896
+ label: Document item label to search for
897
+
898
+ Returns:
899
+ Tuple of (chunk_index, chunk) or (None, None) if not found
900
+ """
901
+ num_found = -1
902
+ for i, chunk in enumerate(chunks):
903
+ doc_chunk = DocChunk.model_validate(chunk)
904
+ for it in doc_chunk.meta.doc_items:
905
+ if it.label == label:
906
+ num_found += 1
907
+ if num_found == n:
908
+ return i, doc_chunk
909
+ return None, None
910
+
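+ # Usage sketch: locate the first chunk that carries a table item. DocItemLabel
+ # is imported at the top of this module; `chunks` would typically come from
+ # chunk_docling_document().
+ #
+ #     idx, table_chunk = AdvancedChunker.find_nth_chunk_with_label(
+ #         chunks, n=0, label=DocItemLabel.TABLE
+ #     )
+ #     if table_chunk is not None:
+ #         print(idx, table_chunk.text[:80])
+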
911
+ def get_chunk_info(self, chunk: BaseChunk) -> dict[str, Any]:
912
+ """
913
+ Get detailed information about a chunk.
914
+
915
+ Args:
916
+ chunk: Chunk to analyze
917
+
918
+ Returns:
919
+ Dictionary with chunk information including:
920
+ - text: Contextualized text
921
+ - num_tokens: Token count
922
+ - doc_items: List of document item references
923
+ """
924
+ ctx_text = self.contextualize(chunk=chunk)
925
+ num_tokens = self.count_tokens(text=ctx_text)
926
+ doc_chunk = DocChunk.model_validate(chunk)
927
+ doc_items_refs = [it.self_ref for it in doc_chunk.meta.doc_items]
928
+
929
+ return {
930
+ "text": ctx_text,
931
+ "num_tokens": num_tokens,
932
+ "doc_items": doc_items_refs,
933
+ "chunk": doc_chunk,
934
+ }
935
+
936
+
937
+ # Convenience function for quick setup
938
+ def create_chunker(
939
+ strategy: str = "default",
940
+ embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
941
+ include_heading_markers: bool = True,
942
+ max_tokens: int = AdvancedChunker.DEFAULT_MAX_TOKENS,
943
+ merge_peers: bool = AdvancedChunker.DEFAULT_MERGE_PEERS,
944
+ min_chunk_tokens: int = AdvancedChunker.DEFAULT_MIN_CHUNK_TOKENS,
945
+ filter_toc: bool = True,
946
+ **kwargs,
947
+ ) -> AdvancedChunker:
948
+ """
949
+ Create a pre-configured advanced chunker.
950
+
951
+ Args:
952
+ strategy: Chunking strategy to use:
953
+ - "default": Default serialization
954
+ - "markdown_tables": Markdown table formatting
955
+ - "custom_placeholder": Custom image placeholder
956
+ - "annotations": Include picture annotations
957
+ embed_model_id: HuggingFace model ID for tokenization
958
+ include_heading_markers: If True, adds markdown # markers to headings
959
+ in contextualized output (default: True)
960
+ max_tokens: Maximum tokens per chunk (default: 1024)
961
+ merge_peers: If True, merges adjacent small chunks (default: True)
962
+ min_chunk_tokens: Minimum tokens for standalone chunks (default: 50)
963
+ filter_toc: If True, filters out Table of Contents entries (default: True)
964
+ **kwargs: Additional arguments passed to strategy-specific providers
965
+
966
+ Returns:
967
+ Configured AdvancedChunker instance
968
+
969
+ Example:
970
+ ```python
971
+ # Create chunker with markdown tables
972
+ chunker = create_chunker(strategy="markdown_tables")
973
+
974
+ # Create chunker with custom image placeholder
975
+ chunker = create_chunker(
976
+ strategy="custom_placeholder",
977
+ image_placeholder="[IMAGE]"
978
+ )
979
+
980
+ # Create chunker with larger chunks and TOC filtering
981
+ chunker = create_chunker(
982
+ strategy="markdown_tables",
983
+ max_tokens=2048,
984
+ filter_toc=True
985
+ )
986
+ ```
987
+ """
988
+ provider_map = {
989
+ "default": DefaultSerializerProvider,
990
+ "markdown_tables": MDTableSerializerProvider,
991
+ "custom_placeholder": ImgPlaceholderSerializerProvider,
992
+ "annotations": ImgAnnotationSerializerProvider,
993
+ }
994
+
995
+ if strategy not in provider_map:
996
+ raise ValueError(
997
+ f"Unknown strategy: {strategy}. "
998
+ f"Available: {list(provider_map.keys())}"
999
+ )
1000
+
1001
+ provider_class = provider_map[strategy]
1002
+
1003
+ # Filter kwargs for provider initialization
1004
+ import inspect
1005
+ provider_sig = inspect.signature(provider_class.__init__)
1006
+ provider_params = set(provider_sig.parameters.keys()) - {"self"}
1007
+ provider_kwargs = {k: v for k, v in kwargs.items() if k in provider_params}
1008
+
1009
+ provider = provider_class(**provider_kwargs)
1010
+
1011
+ return AdvancedChunker(
1012
+ embed_model_id=embed_model_id,
1013
+ serializer_provider=provider,
1014
+ include_heading_markers=include_heading_markers,
1015
+ max_tokens=max_tokens,
1016
+ merge_peers=merge_peers,
1017
+ min_chunk_tokens=min_chunk_tokens,
1018
+ filter_toc=filter_toc,
1019
+ )