rakam_systems_vectorstore-0.1.1rc7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/loader/pdf_loader.py
@@ -0,0 +1,771 @@
+"""
+PDF Loader using Docling library for advanced PDF processing.
+
+This loader uses the Docling library to extract text, images, tables, and figures
+from PDF documents with high quality. It supports:
+- Text extraction with layout preservation
+- Image extraction (page images, figures, tables)
+- Markdown export with embedded or referenced images
+- Configurable image resolution
+
+The loader stores extracted images and markdown in a scratch folder within the data directory.
+"""
+
+from __future__ import annotations
+
+import mimetypes
+import os
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
+
+from rakam_systems_core.ai_utils import logging
+from rakam_systems_core.ai_core.interfaces.loader import Loader
+from rakam_systems_vectorstore.components.chunker import AdvancedChunker
+from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
+
+logger = logging.getLogger(__name__)
+
+
+class PdfLoader(Loader):
+    """
+    PDF loader using Docling for advanced document processing.
+
+    This loader provides high-quality PDF processing with support for:
+    - Text extraction with layout preservation
+    - Image extraction (pages, figures, tables)
+    - Markdown export with images
+    - Configurable processing options
+
+    The extracted content is chunked and returned as text or Node objects.
+    Images and markdown files are saved to a scratch directory for reference.
+    """
+
+    # Default configuration
+    DEFAULT_IMAGE_SCALE = 2.0  # Scale=1 ~ 72 DPI, Scale=2 ~ 144 DPI
+    DEFAULT_EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
+    DEFAULT_CHUNKER_STRATEGY = "markdown_tables"
+    DEFAULT_MAX_TOKENS = 1024  # Larger chunks for better context
+    DEFAULT_MIN_CHUNK_TOKENS = 50  # Minimum tokens for standalone chunks
+
+    def __init__(
+        self,
+        name: str = "pdf_loader",
+        config: Optional[Dict[str, Any]] = None
+    ):
+        """
+        Initialize PDF loader with Docling.
+
+        Args:
+            name: Component name
+            config: Optional configuration with keys:
+                - image_scale: Image resolution scale (default: 2.0)
+                - generate_page_images: Whether to generate page images (default: True)
+                - generate_picture_images: Whether to generate picture images (default: True)
+                - embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
+                - chunker_strategy: Strategy for chunking ("default", "markdown_tables", "annotations"; default: "markdown_tables")
+                - save_images: Whether to save images to disk (default: True)
+                - save_markdown: Whether to save markdown files (default: True)
+                - scratch_folder_name: Name of scratch folder (default: "scratch")
+                - include_images_in_chunks: Whether to include image references in text chunks (default: True)
+                - max_tokens: Maximum tokens per chunk (default: 1024)
+                - merge_peers: Whether to merge adjacent small chunks (default: True)
+                - min_chunk_tokens: Minimum tokens for standalone chunks (default: 50)
+                - filter_toc: Whether to filter out Table of Contents entries (default: True)
+        """
+        super().__init__(name=name, config=config)
+
+        # Extract configuration
+        config = config or {}
+        self._image_scale = config.get('image_scale', self.DEFAULT_IMAGE_SCALE)
+        self._generate_page_images = config.get('generate_page_images', True)
+        self._generate_picture_images = config.get(
+            'generate_picture_images', True)
+        self._save_images = config.get('save_images', True)
+        self._save_markdown = config.get('save_markdown', True)
+        self._scratch_folder_name = config.get(
+            'scratch_folder_name', 'scratch')
+        self._include_images_in_chunks = config.get(
+            'include_images_in_chunks', True)
+
+        # Chunker configuration
+        self._max_tokens = config.get('max_tokens', self.DEFAULT_MAX_TOKENS)
+        self._merge_peers = config.get('merge_peers', True)
+        self._min_chunk_tokens = config.get(
+            'min_chunk_tokens', self.DEFAULT_MIN_CHUNK_TOKENS)
+        self._filter_toc = config.get('filter_toc', True)
+
+        # Initialize advanced chunker with improved settings
+        embed_model_id = config.get(
+            'embed_model_id', self.DEFAULT_EMBED_MODEL_ID)
+        chunker_strategy = config.get(
+            'chunker_strategy', self.DEFAULT_CHUNKER_STRATEGY)
+        self._chunker = AdvancedChunker(
+            embed_model_id=embed_model_id,
+            strategy=chunker_strategy,
+            max_tokens=self._max_tokens,
+            merge_peers=self._merge_peers,
+            min_chunk_tokens=self._min_chunk_tokens,
+            filter_toc=self._filter_toc,
+        )
+
+        # Initialize document converter with pipeline options
+        self._doc_converter = self._create_converter()
+
+        # Store conversion result for image tracking
+        self._last_conv_res = None
+        self._last_scratch_dir = None
+
+        logger.info(
+            f"Initialized PdfLoader with image_scale={self._image_scale}, chunker_strategy={chunker_strategy}, include_images_in_chunks={self._include_images_in_chunks}")
+
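For reference, a hedged configuration sketch that spells out every key documented above; the values shown are the documented defaults, so this dict is equivalent to passing config=None. The import path follows the file layout in the listing above.

from rakam_systems_vectorstore.components.loader.pdf_loader import PdfLoader

config = {
    "image_scale": 2.0,              # ~144 DPI page renders
    "generate_page_images": True,
    "generate_picture_images": True,
    "embed_model_id": "sentence-transformers/all-MiniLM-L6-v2",
    "chunker_strategy": "markdown_tables",
    "save_images": True,
    "save_markdown": True,
    "scratch_folder_name": "scratch",
    "include_images_in_chunks": True,
    "max_tokens": 1024,
    "merge_peers": True,
    "min_chunk_tokens": 50,
    "filter_toc": True,
}
loader = PdfLoader(name="pdf_loader", config=config)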
+    def run(self, source: str) -> List[str]:
+        """
+        Execute the primary operation for the component.
+
+        This method satisfies the BaseComponent abstract method requirement
+        and delegates to load_as_chunks.
+
+        Args:
+            source: Path to PDF file
+
+        Returns:
+            List of text chunks extracted from the PDF
+        """
+        return self.load_as_chunks(source)
+
+    def _create_converter(self) -> DocumentConverter:
+        """Create and configure the Docling document converter."""
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.images_scale = self._image_scale
+        pipeline_options.generate_page_images = self._generate_page_images
+        pipeline_options.generate_picture_images = self._generate_picture_images
+
+        doc_converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=pipeline_options)
+            }
+        )
+
+        return doc_converter
+
+    def load_as_nodes(
+        self,
+        source: Union[str, Path],
+        source_id: Optional[str] = None,
+        custom_metadata: Optional[Dict[str, Any]] = None
+    ) -> List[Node]:
+        """
+        Load PDF and return as Node objects with metadata.
+
+        Args:
+            source: Path to PDF file
+            source_id: Optional source identifier (defaults to file path)
+            custom_metadata: Optional custom metadata to attach to nodes
+
+        Returns:
+            List of Node objects with text chunks and metadata
+        """
+        # Convert Path to string
+        if isinstance(source, Path):
+            source = str(source)
+
+        # Load text chunks
+        chunks = self.load_as_chunks(source)
+
+        # Determine source ID
+        if source_id is None:
+            source_id = source
+
+        # Create nodes with metadata
+        nodes = []
+        for idx, chunk in enumerate(chunks):
+            metadata = NodeMetadata(
+                source_file_uuid=source_id,
+                position=idx,
+                custom=custom_metadata or {}
+            )
+            node = Node(content=chunk, metadata=metadata)
+            nodes.append(node)
+
+        logger.info(f"Created {len(nodes)} nodes from PDF: {source}")
+        return nodes
+
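A hedged usage sketch (the sample path data/document.pdf is invented): each returned node carries its chunk text in content and its ordering in metadata.position, with any custom_metadata attached under metadata.custom.

from rakam_systems_vectorstore.components.loader.pdf_loader import PdfLoader

loader = PdfLoader()
nodes = loader.load_as_nodes(
    "data/document.pdf", custom_metadata={"project": "demo"})
for node in nodes[:3]:
    # Position mirrors chunk order; custom metadata rides along on every node.
    print(node.metadata.position, node.content[:60])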
+    def load_as_text(
+        self,
+        source: Union[str, Path],
+    ) -> str:
+        """
+        Load PDF and return as a single text string.
+
+        This method extracts all text from the PDF and returns it as a single
+        string without chunking. Useful when you need the full document text.
+
+        Args:
+            source: Path to PDF file
+
+        Returns:
+            Full text content of the PDF as a single string
+
+        Raises:
+            FileNotFoundError: If source file doesn't exist
+            ValueError: If source is not a PDF file
+            Exception: If PDF processing fails
+        """
+        # Convert Path to string
+        if isinstance(source, Path):
+            source = str(source)
+
+        # Validate file exists
+        if not os.path.isfile(source):
+            raise FileNotFoundError(f"File not found: {source}")
+
+        # Validate file is a PDF
+        if not self._is_pdf_file(source):
+            raise ValueError(
+                f"File is not a PDF: {source}. MIME type: {mimetypes.guess_type(source)[0]}")
+
+        logger.info(f"Loading PDF as text: {source}")
+        start_time = time.time()
+
+        try:
+            # Convert PDF document
+            conv_res = self._doc_converter.convert(source)
+
+            # Export the full document as markdown text
+            full_text = conv_res.document.export_to_markdown()
+
+            elapsed = time.time() - start_time
+            logger.info(
+                f"PDF loaded as text in {elapsed:.2f}s: {len(conv_res.document.pages)} pages, {len(full_text)} characters")
+
+            return full_text
+
+        except Exception as e:
+            logger.error(f"Error loading PDF as text {source}: {e}")
+            raise
+
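Worth noting before relying on this method: the returned "text" comes from Docling's export_to_markdown, so headings and tables arrive in markdown notation. A minimal sketch with the same invented path:

from rakam_systems_vectorstore.components.loader.pdf_loader import PdfLoader

loader = PdfLoader()
full_text = loader.load_as_text("data/document.pdf")  # markdown-formatted string
print(full_text[:200])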
+    def load_as_chunks(
+        self,
+        source: Union[str, Path],
+    ) -> List[str]:
+        """
+        Load PDF and return as a list of text chunks.
+
+        This method extracts text from the PDF, processes it with the configured
+        chunker strategy, and returns a list of text chunks. Each chunk includes
+        contextualization and optionally image references.
+
+        Args:
+            source: Path to PDF file
+
+        Returns:
+            List of text chunks extracted from the PDF
+
+        Raises:
+            FileNotFoundError: If source file doesn't exist
+            ValueError: If source is not a PDF file
+            Exception: If PDF processing fails
+        """
+        # Convert Path to string
+        if isinstance(source, Path):
+            source = str(source)
+
+        # Validate file exists
+        if not os.path.isfile(source):
+            raise FileNotFoundError(f"File not found: {source}")
+
+        # Validate file is a PDF
+        if not self._is_pdf_file(source):
+            raise ValueError(
+                f"File is not a PDF: {source}. MIME type: {mimetypes.guess_type(source)[0]}")
+
+        logger.info(f"Loading PDF file: {source}")
+        start_time = time.time()
+
+        try:
+            # Convert PDF document
+            conv_res = self._doc_converter.convert(source)
+
+            # Create scratch directory in data folder
+            scratch_dir = self._get_scratch_dir(source)
+
+            # Store for later use in image inclusion
+            self._last_conv_res = conv_res
+            self._last_scratch_dir = scratch_dir
+
+            # Save images and tables if enabled
+            if self._save_images:
+                self._save_page_images(conv_res, scratch_dir)
+                self._save_element_images(conv_res, scratch_dir)
+
+            # Save markdown if enabled
+            if self._save_markdown:
+                self._save_markdown_files(conv_res, scratch_dir)
+
+            # Extract text and chunk it
+            text_chunks = self._extract_and_chunk_text(conv_res, scratch_dir)
+
+            elapsed = time.time() - start_time
+            logger.info(
+                f"PDF processed in {elapsed:.2f}s: {len(conv_res.document.pages)} pages, {len(text_chunks)} chunks")
+
+            return text_chunks
+
+        except Exception as e:
+            logger.error(f"Error processing PDF {source}: {e}")
+            raise
+
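A hedged usage sketch (invented path again). One interplay that follows from the code: _add_images_to_chunk reads the *-with-image-refs.md file that _save_markdown_files writes, so include_images_in_chunks only takes effect while save_markdown stays enabled.

from rakam_systems_vectorstore.components.loader.pdf_loader import PdfLoader

loader = PdfLoader(config={
    "save_markdown": True,            # required for image refs in chunks
    "include_images_in_chunks": True,
})
chunks = loader.load_as_chunks("data/document.pdf")
print(f"{len(chunks)} chunks; first starts: {chunks[0][:80]!r}")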
+    def load_as_vsfile(
+        self,
+        file_path: Union[str, Path],
+        custom_metadata: Optional[Dict[str, Any]] = None
+    ) -> VSFile:
+        """
+        Load PDF and return as VSFile object.
+
+        Args:
+            file_path: Path to PDF file
+            custom_metadata: Optional custom metadata
+
+        Returns:
+            VSFile object with nodes
+
+        Raises:
+            FileNotFoundError: If file doesn't exist
+            ValueError: If file is not a PDF
+        """
+        if isinstance(file_path, Path):
+            file_path = str(file_path)
+
+        if not os.path.isfile(file_path):
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        if not self._is_pdf_file(file_path):
+            raise ValueError(f"File is not a PDF: {file_path}")
+
+        # Create VSFile
+        vsfile = VSFile(file_path)
+
+        # Load and create nodes
+        nodes = self.load_as_nodes(
+            file_path, str(vsfile.uuid), custom_metadata)
+        vsfile.nodes = nodes
+        vsfile.processed = True
+
+        logger.info(
+            f"Created VSFile with {len(nodes)} nodes from: {file_path}")
+        return vsfile
+
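A short sketch of the VSFile path (same invented input); the nodes are tagged with the VSFile's UUID as their source_file_uuid, and processed flips to True once loading succeeds.

from rakam_systems_vectorstore.components.loader.pdf_loader import PdfLoader

loader = PdfLoader()
vsfile = loader.load_as_vsfile("data/document.pdf", custom_metadata={"lang": "en"})
print(vsfile.uuid, vsfile.processed, len(vsfile.nodes))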
+    def _is_pdf_file(self, file_path: str) -> bool:
+        """
+        Check if file is a PDF based on extension and MIME type.
+
+        Args:
+            file_path: Path to file
+
+        Returns:
+            True if file is a PDF, False otherwise
+        """
+        # Check extension
+        path = Path(file_path)
+        if path.suffix.lower() != '.pdf':
+            return False
+
+        # Check MIME type
+        mime_type, _ = mimetypes.guess_type(file_path)
+        if mime_type and mime_type != 'application/pdf':
+            return False
+
+        return True
+
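The check above is pure standard library, so it can be restated stand-alone (the helper name looks_like_pdf is ours, for illustration). Note that guess_type maps a .pdf extension to application/pdf, so for files that pass the extension test the MIME check mostly guards against unusual mimetypes registrations:

import mimetypes
from pathlib import Path


def looks_like_pdf(file_path: str) -> bool:
    # Extension must be .pdf (case-insensitive)...
    if Path(file_path).suffix.lower() != ".pdf":
        return False
    # ...and the guessed MIME type, when one is available, must agree.
    mime_type, _ = mimetypes.guess_type(file_path)
    return mime_type is None or mime_type == "application/pdf"


print(looks_like_pdf("report.pdf"))  # True
print(looks_like_pdf("report.PDF"))  # True
print(looks_like_pdf("notes.txt"))   # False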
+    def _get_scratch_dir(self, source_path: str) -> Path:
+        """
+        Get scratch directory for storing extracted files.
+
+        The scratch directory is created inside the data folder relative to the source file.
+
+        Args:
+            source_path: Path to source PDF file
+
+        Returns:
+            Path to scratch directory
+        """
+        source = Path(source_path)
+
+        # Find data folder - assume it's a parent of the source or sibling
+        if 'data' in source.parts:
+            # Navigate to data folder
+            data_folder = source
+            while data_folder.name != 'data' and data_folder.parent != data_folder:
+                data_folder = data_folder.parent
+        else:
+            # Use parent directory and create/use data folder
+            data_folder = source.parent / 'data'
+
+        # Create scratch directory inside data folder
+        scratch_dir = data_folder / self._scratch_folder_name
+        scratch_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.debug(f"Using scratch directory: {scratch_dir}")
+        return scratch_dir
+
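The directory resolution is plain pathlib and can be traced in isolation; resolve_scratch_dir below is our name for the same walk, with the mkdir left out so nothing touches disk:

from pathlib import Path


def resolve_scratch_dir(source_path: str, scratch_name: str = "scratch") -> Path:
    source = Path(source_path)
    if "data" in source.parts:
        # Walk up until the "data" component itself is reached.
        data_folder = source
        while data_folder.name != "data" and data_folder.parent != data_folder:
            data_folder = data_folder.parent
    else:
        # Otherwise use (or imply) a "data" folder next to the source file.
        data_folder = source.parent / "data"
    return data_folder / scratch_name


print(resolve_scratch_dir("data/reports/q3.pdf"))  # data/scratch
print(resolve_scratch_dir("docs/q3.pdf"))          # docs/data/scratch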
+    def _save_page_images(self, conv_res, scratch_dir: Path) -> None:
+        """Save page images to scratch directory."""
+        doc_filename = conv_res.input.file.stem
+
+        for page_no, page in conv_res.document.pages.items():
+            if not hasattr(page, 'image') or page.image is None:
+                continue
+
+            page_image_filename = scratch_dir / \
+                f"{doc_filename}-page-{page.page_no}.png"
+            try:
+                with page_image_filename.open("wb") as fp:
+                    page.image.pil_image.save(fp, format="PNG")
+                logger.debug(
+                    f"Saved page {page.page_no} image to {page_image_filename}")
+            except Exception as e:
+                logger.warning(
+                    f"Failed to save page {page.page_no} image: {e}")
+
+    def _save_element_images(self, conv_res, scratch_dir: Path) -> None:
+        """Save images of tables and figures to scratch directory."""
+        doc_filename = conv_res.input.file.stem
+        table_counter = 0
+        picture_counter = 0
+
+        for element, _level in conv_res.document.iterate_items():
+            try:
+                if isinstance(element, TableItem):
+                    table_counter += 1
+                    element_image_filename = (
+                        scratch_dir /
+                        f"{doc_filename}-table-{table_counter}.png"
+                    )
+                    with element_image_filename.open("wb") as fp:
+                        element.get_image(conv_res.document).save(fp, "PNG")
+                    logger.debug(
+                        f"Saved table {table_counter} to {element_image_filename}")
+
+                elif isinstance(element, PictureItem):
+                    picture_counter += 1
+                    element_image_filename = (
+                        scratch_dir /
+                        f"{doc_filename}-picture-{picture_counter}.png"
+                    )
+                    with element_image_filename.open("wb") as fp:
+                        element.get_image(conv_res.document).save(fp, "PNG")
+                    logger.debug(
+                        f"Saved picture {picture_counter} to {element_image_filename}")
+
+            except Exception as e:
+                logger.warning(f"Failed to save element image: {e}")
+
+        logger.info(
+            f"Saved {table_counter} tables and {picture_counter} pictures")
+
+    def _save_markdown_files(self, conv_res, scratch_dir: Path) -> None:
+        """Save markdown files with images."""
+        doc_filename = conv_res.input.file.stem
+
+        try:
+            # Save markdown with embedded images
+            md_filename = scratch_dir / f"{doc_filename}-with-images.md"
+            conv_res.document.save_as_markdown(
+                md_filename, image_mode=ImageRefMode.EMBEDDED)
+            logger.debug(
+                f"Saved markdown with embedded images to {md_filename}")
+
+            # Save markdown with referenced images
+            md_filename = scratch_dir / f"{doc_filename}-with-image-refs.md"
+            conv_res.document.save_as_markdown(
+                md_filename, image_mode=ImageRefMode.REFERENCED)
+            logger.debug(
+                f"Saved markdown with image references to {md_filename}")
+
+            # Save HTML with referenced images
+            html_filename = scratch_dir / \
+                f"{doc_filename}-with-image-refs.html"
+            conv_res.document.save_as_html(
+                html_filename, image_mode=ImageRefMode.REFERENCED)
+            logger.debug(
+                f"Saved HTML with image references to {html_filename}")
+
+        except Exception as e:
+            logger.warning(f"Failed to save markdown files: {e}")
+
+    def _extract_and_chunk_text(self, conv_res, scratch_dir: Path) -> List[str]:
+        """
+        Extract text from conversion result and chunk it using AdvancedChunker.
+
+        Args:
+            conv_res: Docling conversion result
+            scratch_dir: Path to scratch directory with images
+
+        Returns:
+            List of text chunks with contextualization and optional image references
+        """
+        text_chunks = []
+
+        try:
+            # Use AdvancedChunker to chunk the DoclingDocument directly
+            # This provides better chunking with table support and contextualization
+            chunk_count = 0
+            for chunk in self._chunker.chunk_docling_document(conv_res.document):
+                # Get contextualized text for each chunk
+                ctx_text = self._chunker.contextualize(chunk=chunk)
+
+                # If enabled, add image references to chunks
+                if self._include_images_in_chunks:
+                    logger.debug(
+                        f"Processing chunk {chunk_count}: has meta={hasattr(chunk, 'meta')}")
+                    if hasattr(chunk, 'meta'):
+                        logger.debug(
+                            f" meta has doc_items={hasattr(chunk.meta, 'doc_items')}")
+                        if hasattr(chunk.meta, 'doc_items'):
+                            logger.debug(
+                                f" doc_items count={len(chunk.meta.doc_items)}")
+
+                    ctx_text = self._add_images_to_chunk(
+                        ctx_text, chunk, conv_res, scratch_dir)
+
+                text_chunks.append(ctx_text)
+                chunk_count += 1
+
+        except Exception as e:
+            logger.warning(
+                f"Failed to chunk document with AdvancedChunker: {e}")
+            # Fall back to simple text extraction if advanced chunking fails
+            logger.info("Falling back to simple text extraction")
+            text_chunks = self._extract_text_fallback(conv_res)
+
+        return text_chunks
+
+    def _add_images_to_chunk(self, chunk_text: str, chunk, conv_res, scratch_dir: Path) -> str:
+        """
+        Add image references to a text chunk based on text content matching.
+        Images are added in the order they appear in the original document.
+
+        This method:
+        1. Gets the full markdown document text
+        2. Finds images (![Image](...)) in the markdown
+        3. Uses fuzzy matching to find which images belong to this chunk
+        4. Appends image references to the chunk
+
+        Args:
+            chunk_text: The contextualized text of the chunk
+            chunk: The chunk object from the chunker
+            conv_res: Docling conversion result
+            scratch_dir: Path to scratch directory with saved images
+
+        Returns:
+            Chunk text with appended image references in document order
+        """
+        doc_filename = conv_res.input.file.stem
+        image_refs = []
+
+        try:
+            # Read the full document markdown text with image references from the saved file
+            md_filename = scratch_dir / f"{doc_filename}-with-image-refs.md"
+            if not md_filename.exists():
+                logger.warning(
+                    f"Markdown file with image references not found: {md_filename}")
+                return chunk_text
+
+            full_doc_text = md_filename.read_text()
+
+            # Find all ![Image](...) markers in the markdown
+            # Note: paths can contain parentheses, so we need to match until .png) or .jpg)
+            import re
+            image_pattern = r'!\[Image\]\((.+?\.(?:png|jpg|jpeg|gif|webp))\)'
+            image_positions = []  # (position, img_path)
+
+            for match in re.finditer(image_pattern, full_doc_text):
+                img_pos = match.start()
+                img_path_in_md = match.group(1)
+                # Use the image path directly from markdown
+                image_positions.append((img_pos, img_path_in_md))
+                logger.debug(f"Found image in document at position {img_pos}")
+
+            logger.debug(
+                f"Found {len(image_positions)} images in full document (length: {len(full_doc_text)})")
+
+            # Now find which images belong to this chunk
+            # Strategy: Look for text snippets from the chunk in the full document
+            # Split chunk into sentences/paragraphs for better matching
+            chunk_lines = [line.strip() for line in chunk_text.split(
+                '\n') if line.strip() and len(line.strip()) > 20]
+
+            if chunk_lines:
+                # Find the position range of this chunk in the full document
+                # Try to match beginning and end of chunk
+                # First 100 chars of first substantial line
+                first_line = chunk_lines[0][:100]
+                last_line = chunk_lines[-1][:100] if len(
+                    chunk_lines) > 1 else first_line
+
+                # Remove title markers that chunker might add
+                first_line_clean = first_line.replace(
+                    '## ', '').replace('# ', '').strip()
+
+                chunk_start_pos = full_doc_text.find(first_line_clean)
+
+                if chunk_start_pos == -1:
+                    # Try with less text
+                    first_line_clean = first_line_clean[:50]
+                    chunk_start_pos = full_doc_text.find(first_line_clean)
+
+                if chunk_start_pos != -1:
+                    # Find chunk end - look for the last line
+                    last_line_clean = last_line.replace(
+                        '## ', '').replace('# ', '').strip()[:50]
+                    chunk_end_search = full_doc_text.find(
+                        last_line_clean, chunk_start_pos)
+
+                    if chunk_end_search != -1:
+                        chunk_end_pos = chunk_end_search + \
+                            len(last_line_clean) + 500  # Add buffer
+                    else:
+                        chunk_end_pos = chunk_start_pos + \
+                            len(chunk_text) + 500  # Estimate
+
+                    logger.debug(
+                        f"Chunk found at position {chunk_start_pos}-{chunk_end_pos}")
+
+                    # Find images that fall within this chunk's range
+                    for img_pos, img_path in sorted(image_positions):
+                        if chunk_start_pos <= img_pos <= chunk_end_pos:
+                            image_refs.append(f"\n![Image]({img_path})")
+                            logger.debug(
+                                f"Added image at position {img_pos} to chunk")
+                else:
+                    logger.debug(
+                        f"Could not find chunk position in full document (tried: '{first_line_clean[:30]}...')")
+
+            # Append image references to the chunk text
+            if image_refs:
+                chunk_text = chunk_text + "".join(image_refs)
+                logger.info(
+                    f"Added {len(image_refs)} image references to chunk")
+
+        except Exception as e:
+            logger.warning(f"Could not add images to chunk: {e}")
+            import traceback
+            logger.debug(f"Traceback: {traceback.format_exc()}")
+
+        return chunk_text
+
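The matching hinges on image_pattern; run against a toy markdown string (file names invented), re.finditer yields offsets and captured paths in reading order, which is exactly what the position-range filter above consumes:

import re

image_pattern = r'!\[Image\]\((.+?\.(?:png|jpg|jpeg|gif|webp))\)'
markdown = (
    "Quarterly results were strong.\n"
    "![Image](scratch/demo-picture-1.png)\n"
    "Costs fell as well.\n"
    "![Image](scratch/demo-table-1.png)\n"
)
for match in re.finditer(image_pattern, markdown):
    # Offset in the document, then the captured image path.
    print(match.start(), match.group(1))
# 31 scratch/demo-picture-1.png
# 88 scratch/demo-table-1.png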
+    def _extract_text_fallback(self, conv_res) -> List[str]:
+        """
+        Fallback method for text extraction if advanced chunking fails.
+
+        Args:
+            conv_res: Docling conversion result
+
+        Returns:
+            List of text chunks
+        """
+        text_parts = []
+
+        # Extract text from each page
+        for page_no, page in conv_res.document.pages.items():
+            try:
+                # Export page as markdown to preserve structure
+                page_text = page.export_to_markdown()
+
+                if page_text and page_text.strip():
+                    text_parts.append(page_text)
+
+            except Exception as e:
+                logger.warning(
+                    f"Failed to extract text from page {page_no}: {e}")
+
+        # Join all text and use AdvancedChunker's raw text chunking
+        full_text = "\n\n".join(text_parts)
+        return self._chunker.run([full_text])
+
+
+def create_pdf_loader(
+    image_scale: float = 2.0,
+    embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
+    chunker_strategy: str = "markdown_tables",
+    save_images: bool = True,
+    save_markdown: bool = True,
+    scratch_folder_name: str = 'scratch',
+    include_images_in_chunks: bool = True,
+    max_tokens: int = 1024,
+    merge_peers: bool = True,
+    min_chunk_tokens: int = 50,
+    filter_toc: bool = True,
+) -> PdfLoader:
+    """
+    Factory function to create a PDF loader.
+
+    Args:
+        image_scale: Image resolution scale (1.0 ~ 72 DPI, 2.0 ~ 144 DPI)
+        embed_model_id: HuggingFace model ID for tokenization
+        chunker_strategy: Strategy for chunking:
+            - "default": Default serialization
+            - "markdown_tables": Markdown table formatting (recommended)
+            - "annotations": Include picture annotations
+            - "custom_placeholder": Custom image placeholders
+        save_images: Whether to save extracted images
+        save_markdown: Whether to save markdown files
+        scratch_folder_name: Name of scratch folder in data directory
+        include_images_in_chunks: Whether to include image references in text chunks (default: True)
+        max_tokens: Maximum tokens per chunk (default: 1024). Larger values create
+            bigger, more contextual chunks. Recommended: 512-2048.
+        merge_peers: Whether to merge adjacent small chunks with same metadata (default: True)
+        min_chunk_tokens: Minimum tokens for a standalone chunk (default: 50).
+            Smaller chunks will be merged with neighbors.
+        filter_toc: Whether to filter out Table of Contents entries (default: True).
+            TOC entries often create noisy, low-value chunks.
+
+    Returns:
+        Configured PDF loader
+
+    Example:
+        >>> # Basic usage with default settings
+        >>> loader = create_pdf_loader()
+        >>> chunks = loader.load_as_chunks("data/document.pdf")
+        >>> print(f"Extracted {len(chunks)} chunks")
+
+        >>> # Create loader with larger chunks and TOC filtering
+        >>> loader = create_pdf_loader(
+        ...     max_tokens=2048,
+        ...     filter_toc=True,
+        ...     min_chunk_tokens=100
+        ... )
+        >>> chunks = loader.load_as_chunks("data/document.pdf")
+
+        >>> # Create loader without image references in chunks
+        >>> loader = create_pdf_loader(include_images_in_chunks=False)
+        >>> chunks = loader.load_as_chunks("data/document.pdf")
+    """
+    config = {
+        'image_scale': image_scale,
+        'embed_model_id': embed_model_id,
+        'chunker_strategy': chunker_strategy,
+        'save_images': save_images,
+        'save_markdown': save_markdown,
+        'scratch_folder_name': scratch_folder_name,
+        'include_images_in_chunks': include_images_in_chunks,
+        'generate_page_images': True,
+        'generate_picture_images': True,
+        'max_tokens': max_tokens,
+        'merge_peers': merge_peers,
+        'min_chunk_tokens': min_chunk_tokens,
+        'filter_toc': filter_toc,
+    }
+
+    return PdfLoader(config=config)
+
+
+__all__ = ["PdfLoader", "create_pdf_loader"]