rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
@@ -0,0 +1,723 @@
1
+ """
2
+ PDF Loader using pymupdf4llm library for lightweight PDF processing.
3
+
4
+ This loader uses the pymupdf4llm library to extract text from PDF documents
5
+ and convert them to markdown format. It provides a lightweight alternative to
6
+ the Docling-based loader with:
7
+ - Fast text extraction with markdown formatting
8
+ - Table support
9
+ - Image references
10
+ - Configurable chunking
11
+
12
+ The loader stores extracted markdown in a scratch folder within the data directory.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import mimetypes
18
+ import os
19
+ import re
20
+ import threading
21
+ import time
22
+ from pathlib import Path
23
+ from typing import Any, Dict, List, Optional, Union
24
+
25
+ import pymupdf4llm
26
+
27
+ from rakam_systems_core.ai_utils import logging
28
+ from rakam_systems_core.ai_core.interfaces.loader import Loader
29
+ from rakam_systems_vectorstore.components.chunker import AdvancedChunker
30
+ from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Global lock for pymupdf4llm operations (library is not thread-safe)
35
+ _pymupdf4llm_lock = threading.Lock()
36
+
37
+
38
class PdfLoaderLight(Loader):
    """
    Lightweight PDF loader using pymupdf4llm for document processing.

    This loader provides fast PDF processing with support for:
    - Text extraction with markdown formatting
    - Table extraction
    - Image references
    - Configurable chunking

    The extracted content is chunked and returned as text or Node objects.
    Markdown files can be saved to a scratch directory for reference.
    """

    # Default configuration
    # Tokenizer model used by AdvancedChunker to count tokens per chunk.
    DEFAULT_EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
    # Chunking strategy passed to AdvancedChunker.
    DEFAULT_CHUNKER_STRATEGY = "markdown_tables"
    DEFAULT_MAX_TOKENS = 1024  # Larger chunks for better context
    DEFAULT_MIN_CHUNK_TOKENS = 50  # Minimum tokens for standalone chunks
    DEFAULT_PAGE_CHUNKS = False  # Whether to chunk by page
    DEFAULT_DPI = 150  # DPI for image extraction
    DEFAULT_IMAGE_PATH = "data/ingestion_image/"  # Default path for extracted images
60
+
61
    def __init__(
        self,
        name: str = "pdf_loader_light",
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize PDF loader with pymupdf4llm.

        Args:
            name: Component name
            config: Optional configuration with keys:
                - embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
                - chunker_strategy: Strategy for chunking ("default", "markdown_tables", "annotations", default: "markdown_tables")
                - save_markdown: Whether to save markdown files (default: True)
                - scratch_folder_name: Name of scratch folder (default: "scratch")
                - max_tokens: Maximum tokens per chunk (default: 1024)
                - merge_peers: Whether to merge adjacent small chunks (default: True)
                - min_chunk_tokens: Minimum tokens for standalone chunks (default: 50)
                - filter_toc: Whether to filter out Table of Contents entries (default: True)
                - page_chunks: Whether to chunk by page (default: False)
                - write_images: Whether to extract images (default: True)
                - image_path: Path to save images (default: None, uses INGESTION_IMAGE_PATH env var or "data/ingestion_image/")
                - image_format: Format for extracted images (default: "png")
                - dpi: DPI for image extraction (default: 150)
                - margins: Margins for text extraction (default: (0, 50, 0, 50))
        """
        super().__init__(name=name, config=config)

        # Extract configuration; config is re-read locally so a None config
        # behaves the same as an empty dict.
        config = config or {}
        self._save_markdown = config.get('save_markdown', True)
        self._scratch_folder_name = config.get(
            'scratch_folder_name', 'scratch')
        self._page_chunks = config.get('page_chunks', self.DEFAULT_PAGE_CHUNKS)
        self._write_images = config.get('write_images', True)
        self._image_format = config.get('image_format', 'png')
        self._dpi = config.get('dpi', self.DEFAULT_DPI)
        self._margins = config.get('margins', (0, 50, 0, 50))

        # Get image path from config, env var, or use default.
        # NOTE: `or` means an explicit empty-string image_path falls through
        # to the env var / default as well.
        self._image_path = config.get('image_path') or os.getenv(
            'INGESTION_IMAGE_PATH', self.DEFAULT_IMAGE_PATH)

        # Chunker configuration
        self._max_tokens = config.get('max_tokens', self.DEFAULT_MAX_TOKENS)
        self._merge_peers = config.get('merge_peers', True)
        self._min_chunk_tokens = config.get(
            'min_chunk_tokens', self.DEFAULT_MIN_CHUNK_TOKENS)
        self._filter_toc = config.get('filter_toc', True)

        # Initialize advanced chunker (shared by all load_* methods).
        embed_model_id = config.get(
            'embed_model_id', self.DEFAULT_EMBED_MODEL_ID)
        chunker_strategy = config.get(
            'chunker_strategy', self.DEFAULT_CHUNKER_STRATEGY)
        self._chunker = AdvancedChunker(
            embed_model_id=embed_model_id,
            strategy=chunker_strategy,
            max_tokens=self._max_tokens,
            merge_peers=self._merge_peers,
            min_chunk_tokens=self._min_chunk_tokens,
            filter_toc=self._filter_toc,
        )

        # Store last conversion result (populated by load_as_text / load_as_chunks).
        self._last_markdown: Optional[Union[str, List[Dict[str, Any]]]] = None
        self._last_scratch_dir: Optional[Path] = None

        # Store image path mapping: {relative_path_in_markdown: absolute_path_on_disk}
        self._image_path_mapping: Dict[str, str] = {}

        logger.info(
            f"Initialized PdfLoaderLight with chunker_strategy={chunker_strategy}, page_chunks={self._page_chunks}, dpi={self._dpi}, image_path={self._image_path}")
134
+
135
+ def run(self, source: str) -> List[str]:
136
+ """
137
+ Execute the primary operation for the component.
138
+
139
+ This method satisfies the BaseComponent abstract method requirement
140
+ and delegates to load_as_chunks.
141
+
142
+ Args:
143
+ source: Path to PDF file
144
+
145
+ Returns:
146
+ List of text chunks extracted from the PDF
147
+ """
148
+ return self.load_as_chunks(source)
149
+
150
+ def load_as_nodes(
151
+ self,
152
+ source: Union[str, Path],
153
+ source_id: Optional[str] = None,
154
+ custom_metadata: Optional[Dict[str, Any]] = None
155
+ ) -> List[Node]:
156
+ """
157
+ Load PDF and return as Node objects with metadata.
158
+
159
+ Args:
160
+ source: Path to PDF file
161
+ source_id: Optional source identifier (defaults to file path)
162
+ custom_metadata: Optional custom metadata to attach to nodes
163
+
164
+ Returns:
165
+ List of Node objects with text chunks and metadata
166
+ """
167
+ # Convert Path to string
168
+ if isinstance(source, Path):
169
+ source = str(source)
170
+
171
+ # Load text chunks
172
+ chunks = self.load_as_chunks(source)
173
+
174
+ # Determine source ID
175
+ if source_id is None:
176
+ source_id = source
177
+
178
+ # Create nodes with metadata
179
+ nodes = []
180
+ for idx, chunk in enumerate(chunks):
181
+ metadata = NodeMetadata(
182
+ source_file_uuid=source_id,
183
+ position=idx,
184
+ custom=custom_metadata or {}
185
+ )
186
+ node = Node(content=chunk, metadata=metadata)
187
+ nodes.append(node)
188
+
189
+ logger.info(f"Created {len(nodes)} nodes from PDF: {source}")
190
+ return nodes
191
+
192
    def load_as_text(
        self,
        source: Union[str, Path],
    ) -> str:
        """
        Load PDF and return as a single text string.

        This method extracts all text from the PDF and returns it as a single
        string without chunking. Useful when you need the full document text.

        Args:
            source: Path to PDF file

        Returns:
            Full text content of the PDF as a single string (markdown)

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not a PDF file
            Exception: If PDF processing fails
        """
        # Convert Path to string
        if isinstance(source, Path):
            source = str(source)

        # Validate file exists
        if not os.path.isfile(source):
            raise FileNotFoundError(f"File not found: {source}")

        # Validate file is a PDF
        if not self._is_pdf_file(source):
            raise ValueError(
                f"File is not a PDF: {source}. MIME type: {mimetypes.guess_type(source)[0]}")

        logger.info(f"Loading PDF as text: {source}")
        start_time = time.time()

        try:
            # Get image extraction path (side effect: creates the directory)
            image_path = self._get_image_path(
                source) if self._write_images else None

            # Convert PDF to markdown using pymupdf4llm
            # Use lock because pymupdf4llm is not thread-safe
            with _pymupdf4llm_lock:
                md_text = pymupdf4llm.to_markdown(
                    source,
                    page_chunks=False,  # Get full document as one string
                    write_images=self._write_images,
                    image_path=image_path,
                    image_format=self._image_format,
                    dpi=self._dpi,
                    margins=self._margins,
                )

            # Build image path mapping (clears/overwrites previous mapping)
            if self._write_images and image_path:
                self._build_image_path_mapping(md_text, image_path)

            # Store for later use by callers inspecting the last conversion
            self._last_markdown = md_text
            scratch_dir = self._get_scratch_dir(source)
            self._last_scratch_dir = scratch_dir

            # Save markdown if enabled (failures are logged, not raised)
            if self._save_markdown:
                self._save_markdown_file(source, md_text, scratch_dir)

            elapsed = time.time() - start_time
            logger.info(
                f"PDF loaded as text in {elapsed:.2f}s: {len(md_text)} characters")

            return md_text

        except Exception as e:
            # Log and re-raise so callers see the original failure.
            logger.error(f"Error loading PDF as text {source}: {e}")
            raise
269
+
270
    def load_as_chunks(
        self,
        source: Union[str, Path],
    ) -> List[str]:
        """
        Load PDF and return as a list of text chunks.

        This method extracts text from the PDF, processes it with the configured
        chunker strategy, and returns a list of text chunks. Each chunk includes
        contextualization.

        Args:
            source: Path to PDF file

        Returns:
            List of text chunks extracted from the PDF

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not a PDF file
            Exception: If PDF processing fails
        """
        # Convert Path to string
        if isinstance(source, Path):
            source = str(source)

        # Validate file exists
        if not os.path.isfile(source):
            raise FileNotFoundError(f"File not found: {source}")

        # Validate file is a PDF
        if not self._is_pdf_file(source):
            raise ValueError(
                f"File is not a PDF: {source}. MIME type: {mimetypes.guess_type(source)[0]}")

        logger.info(f"Loading PDF file: {source}")
        start_time = time.time()

        try:
            # Get image extraction path (side effect: creates the directory)
            image_path = self._get_image_path(
                source) if self._write_images else None

            # Convert PDF to markdown using pymupdf4llm
            # Use lock because pymupdf4llm is not thread-safe
            # NOTE(review): with page_chunks=True, md_result is assumed to be
            # a list of dicts each carrying a 'text' key — confirm against
            # the pymupdf4llm.to_markdown documentation.
            with _pymupdf4llm_lock:
                md_result = pymupdf4llm.to_markdown(
                    source,
                    page_chunks=self._page_chunks,
                    write_images=self._write_images,
                    image_path=image_path,
                    image_format=self._image_format,
                    dpi=self._dpi,
                    margins=self._margins,
                )

            # Build image path mapping (clears/overwrites previous mapping)
            if self._write_images and image_path:
                if isinstance(md_result, list):
                    # Page chunks - combine all text for mapping
                    # NOTE(review): uses page['text'] directly (KeyError if
                    # absent) while _extract_and_chunk_text uses .get — verify
                    # intentional.
                    full_text = "\n\n".join([page['text']
                                             for page in md_result])
                    self._build_image_path_mapping(full_text, image_path)
                else:
                    self._build_image_path_mapping(md_result, image_path)

            # Store for later use by callers inspecting the last conversion
            self._last_markdown = md_result
            scratch_dir = self._get_scratch_dir(source)
            self._last_scratch_dir = scratch_dir

            # Save markdown if enabled (failures are logged, not raised)
            if self._save_markdown:
                if self._page_chunks and isinstance(md_result, list):
                    # Save each page separately (1-based page numbers)
                    for idx, page_md in enumerate(md_result):
                        self._save_markdown_file(
                            source, page_md['text'], scratch_dir, page_num=idx+1)
                    # Also save full document
                    full_text = "\n\n".join([page['text']
                                             for page in md_result])
                    self._save_markdown_file(source, full_text, scratch_dir)
                else:
                    self._save_markdown_file(source, md_result, scratch_dir)

            # Extract and chunk text via AdvancedChunker (with fallback)
            text_chunks = self._extract_and_chunk_text(md_result)

            elapsed = time.time() - start_time
            logger.info(
                f"PDF processed in {elapsed:.2f}s: {len(text_chunks)} chunks")

            return text_chunks

        except Exception as e:
            # Log and re-raise so callers see the original failure.
            logger.error(f"Error processing PDF {source}: {e}")
            raise
367
+
368
+ def load_as_vsfile(
369
+ self,
370
+ file_path: Union[str, Path],
371
+ custom_metadata: Optional[Dict[str, Any]] = None
372
+ ) -> VSFile:
373
+ """
374
+ Load PDF and return as VSFile object.
375
+
376
+ Args:
377
+ file_path: Path to PDF file
378
+ custom_metadata: Optional custom metadata
379
+
380
+ Returns:
381
+ VSFile object with nodes
382
+
383
+ Raises:
384
+ FileNotFoundError: If file doesn't exist
385
+ ValueError: If file is not a PDF
386
+ """
387
+ if isinstance(file_path, Path):
388
+ file_path = str(file_path)
389
+
390
+ if not os.path.isfile(file_path):
391
+ raise FileNotFoundError(f"File not found: {file_path}")
392
+
393
+ if not self._is_pdf_file(file_path):
394
+ raise ValueError(f"File is not a PDF: {file_path}")
395
+
396
+ # Create VSFile
397
+ vsfile = VSFile(file_path)
398
+
399
+ # Load and create nodes
400
+ nodes = self.load_as_nodes(
401
+ file_path, str(vsfile.uuid), custom_metadata)
402
+ vsfile.nodes = nodes
403
+ vsfile.processed = True
404
+
405
+ logger.info(
406
+ f"Created VSFile with {len(nodes)} nodes from: {file_path}")
407
+ return vsfile
408
+
409
+ def _is_pdf_file(self, file_path: str) -> bool:
410
+ """
411
+ Check if file is a PDF based on extension and MIME type.
412
+
413
+ Args:
414
+ file_path: Path to file
415
+
416
+ Returns:
417
+ True if file is a PDF, False otherwise
418
+ """
419
+ # Check extension
420
+ path = Path(file_path)
421
+ if path.suffix.lower() != '.pdf':
422
+ return False
423
+
424
+ # Check MIME type
425
+ mime_type, _ = mimetypes.guess_type(file_path)
426
+ if mime_type and mime_type != 'application/pdf':
427
+ return False
428
+
429
+ return True
430
+
431
+ def _get_image_path(self, source_path: str) -> str:
432
+ """
433
+ Get the path where images should be extracted.
434
+
435
+ Uses the configured image path (from config, env var, or default).
436
+ Creates a subdirectory based on the source PDF filename.
437
+
438
+ Args:
439
+ source_path: Path to source PDF file
440
+
441
+ Returns:
442
+ Absolute path to image extraction directory
443
+ """
444
+ source = Path(source_path)
445
+ doc_filename = source.stem
446
+
447
+ # Create base image path
448
+ base_path = Path(self._image_path)
449
+
450
+ # Create subdirectory for this document
451
+ image_dir = base_path / doc_filename
452
+ image_dir.mkdir(parents=True, exist_ok=True)
453
+
454
+ logger.debug(f"Using image path: {image_dir}")
455
+ return str(image_dir)
456
+
457
    def _build_image_path_mapping(self, markdown_text: str, image_path: str) -> None:
        """
        Build mapping between image references in markdown and actual file paths.

        Extracts image references from markdown (e.g., ![](image.png)) and maps them
        to their absolute paths on disk. Replaces any mapping built by a previous
        load call.

        Args:
            markdown_text: Markdown text containing image references
            image_path: Directory where images were extracted
        """
        # Clear previous mapping (one mapping per load call)
        self._image_path_mapping.clear()

        # Find all image references in markdown: ![alt text](image_path)
        # Use a more robust pattern that handles one level of nested
        # parentheses inside the path, and restrict to known image extensions.
        image_pattern = r'!\[([^\]]*)\]\(([^)]+(?:\([^)]*\)[^)]*)*\.(?:png|jpg|jpeg|gif|bmp|svg|webp))\)'
        matches = re.findall(image_pattern, markdown_text, re.IGNORECASE)

        # alt_text is captured but intentionally unused; only the path matters.
        for alt_text, markdown_path in matches:
            # The markdown_path could be:
            # 1. Relative path: "image.png" or "subdir/image.png"
            # 2. Full path: "data/ingestion_image/doc/image.png"

            # Convert to Path for easier manipulation
            path_obj = Path(markdown_path)

            # Check if the path exists as-is (might be absolute or relative
            # to the current working directory)
            if path_obj.exists():
                absolute_path = path_obj.resolve()
            else:
                # Fall back to the extraction directory + the bare filename
                absolute_path = Path(image_path) / path_obj.name

            # Store mapping only when the file is actually on disk
            if absolute_path.exists():
                self._image_path_mapping[markdown_path] = str(absolute_path)
                logger.debug(
                    f"Mapped image: {markdown_path} -> {absolute_path}")
            else:
                logger.warning(f"Image file not found: {absolute_path}")

        logger.info(
            f"Built image path mapping with {len(self._image_path_mapping)} images")
502
+
503
+ def get_image_path_mapping(self) -> Dict[str, str]:
504
+ """
505
+ Get the mapping of image paths from markdown to actual file paths.
506
+
507
+ Returns:
508
+ Dictionary mapping relative paths in markdown to absolute paths on disk
509
+ """
510
+ return self._image_path_mapping.copy()
511
+
512
+ def get_image_absolute_path(self, markdown_image_path: str) -> Optional[str]:
513
+ """
514
+ Get the absolute file path for an image referenced in the markdown.
515
+
516
+ Args:
517
+ markdown_image_path: The relative path as it appears in the markdown
518
+
519
+ Returns:
520
+ Absolute path to the image file, or None if not found
521
+ """
522
+ return self._image_path_mapping.get(markdown_image_path)
523
+
524
+ def _get_scratch_dir(self, source_path: str) -> Path:
525
+ """
526
+ Get scratch directory for storing extracted files.
527
+
528
+ The scratch directory is created inside the data folder relative to the source file.
529
+
530
+ Args:
531
+ source_path: Path to source PDF file
532
+
533
+ Returns:
534
+ Path to scratch directory
535
+ """
536
+ source = Path(source_path)
537
+
538
+ # Find data folder - assume it's a parent of the source or sibling
539
+ if 'data' in source.parts:
540
+ # Navigate to data folder
541
+ data_folder = source
542
+ while data_folder.name != 'data' and data_folder.parent != data_folder:
543
+ data_folder = data_folder.parent
544
+ else:
545
+ # Use parent directory and create/use data folder
546
+ data_folder = source.parent / 'data'
547
+
548
+ # Create scratch directory inside data folder
549
+ scratch_dir = data_folder / self._scratch_folder_name
550
+ scratch_dir.mkdir(parents=True, exist_ok=True)
551
+
552
+ logger.debug(f"Using scratch directory: {scratch_dir}")
553
+ return scratch_dir
554
+
555
+ def _save_markdown_file(
556
+ self,
557
+ source_path: str,
558
+ markdown_text: str,
559
+ scratch_dir: Path,
560
+ page_num: Optional[int] = None
561
+ ) -> None:
562
+ """
563
+ Save markdown text to file.
564
+
565
+ Args:
566
+ source_path: Path to source PDF
567
+ markdown_text: Markdown text to save
568
+ scratch_dir: Scratch directory path
569
+ page_num: Optional page number for page-specific files
570
+ """
571
+ try:
572
+ doc_filename = Path(source_path).stem
573
+
574
+ if page_num is not None:
575
+ md_filename = scratch_dir / \
576
+ f"{doc_filename}-page-{page_num}.md"
577
+ else:
578
+ md_filename = scratch_dir / f"{doc_filename}.md"
579
+
580
+ md_filename.write_text(markdown_text, encoding='utf-8')
581
+ logger.debug(f"Saved markdown to {md_filename}")
582
+
583
+ except Exception as e:
584
+ logger.warning(f"Failed to save markdown file: {e}")
585
+
586
+ def _extract_and_chunk_text(self, md_result: Union[str, List[Dict[str, Any]]]) -> List[str]:
587
+ """
588
+ Extract text and chunk it using AdvancedChunker.
589
+
590
+ Args:
591
+ md_result: Markdown result from pymupdf4llm (string or list of page dicts)
592
+
593
+ Returns:
594
+ List of text chunks with contextualization
595
+ """
596
+ text_chunks = []
597
+
598
+ try:
599
+ # Handle page_chunks=True case (list of dicts)
600
+ if isinstance(md_result, list):
601
+ # Each item is a dict with 'text' and metadata
602
+ for page_dict in md_result:
603
+ page_text = page_dict.get('text', '')
604
+ if page_text and page_text.strip():
605
+ # Chunk each page separately
606
+ page_chunks = self._chunker.run([page_text])
607
+ text_chunks.extend(page_chunks)
608
+ else:
609
+ # page_chunks=False case (single string)
610
+ text_chunks = self._chunker.run([md_result])
611
+
612
+ except Exception as e:
613
+ logger.warning(
614
+ f"Failed to chunk document with AdvancedChunker: {e}")
615
+ # Fall back to simple text extraction if advanced chunking fails
616
+ logger.info("Falling back to simple text extraction")
617
+ if isinstance(md_result, list):
618
+ text_chunks = [page_dict.get(
619
+ 'text', '') for page_dict in md_result if page_dict.get('text', '').strip()]
620
+ else:
621
+ text_chunks = [md_result]
622
+
623
+ return text_chunks
624
+
625
+
626
def create_pdf_loader_light(
    embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
    chunker_strategy: str = "markdown_tables",
    save_markdown: bool = True,
    scratch_folder_name: str = 'scratch',
    max_tokens: int = 1024,
    merge_peers: bool = True,
    min_chunk_tokens: int = 50,
    filter_toc: bool = True,
    page_chunks: bool = False,
    write_images: bool = True,
    image_path: Optional[str] = None,
    image_format: str = 'png',
    dpi: int = 150,
    margins: tuple = (0, 50, 0, 50),
) -> PdfLoaderLight:
    """
    Factory function to create a lightweight PDF loader.

    Every keyword maps one-to-one onto a PdfLoaderLight config entry; see
    the PdfLoaderLight docstring for the full semantics of each key.

    Args:
        embed_model_id: HuggingFace model ID for tokenization.
        chunker_strategy: Chunking strategy — "default" (plain
            serialization), "markdown_tables" (markdown table formatting,
            recommended), or "annotations" (include picture annotations).
        save_markdown: Whether to save markdown files to the scratch folder.
        scratch_folder_name: Name of scratch folder in the data directory.
        max_tokens: Maximum tokens per chunk (recommended: 512-2048).
        merge_peers: Merge adjacent small chunks with the same metadata.
        min_chunk_tokens: Minimum tokens for a standalone chunk; smaller
            chunks are merged with neighbors.
        filter_toc: Filter out Table of Contents entries, which often make
            noisy, low-value chunks.
        page_chunks: Process each page separately when True.
        write_images: Extract images from the PDF.
        image_path: Where to save extracted images; None falls back to the
            INGESTION_IMAGE_PATH env var or "data/ingestion_image/".
        image_format: Extracted image format ('png', 'jpg', 'ppm', 'pnm').
        dpi: Image extraction DPI; higher = better quality, larger files.
        margins: Text-extraction margins in points (left, top, right,
            bottom); the default excludes 50pt from top and bottom.

    Returns:
        Configured lightweight PDF loader.

    Example:
        >>> loader = create_pdf_loader_light()
        >>> chunks = loader.load_as_chunks("data/document.pdf")
        >>> print(f"Extracted {len(chunks)} chunks")

        >>> # Larger chunks with page-based chunking
        >>> loader = create_pdf_loader_light(max_tokens=2048, page_chunks=True)

        >>> # High-quality image extraction, then inspect the image mapping
        >>> loader = create_pdf_loader_light(write_images=True, dpi=300)
        >>> chunks = loader.load_as_chunks("data/document.pdf")
        >>> mapping = loader.get_image_path_mapping()
        >>> abs_path = loader.get_image_absolute_path('image-1.png')
    """
    loader_config: Dict[str, Any] = dict(
        embed_model_id=embed_model_id,
        chunker_strategy=chunker_strategy,
        save_markdown=save_markdown,
        scratch_folder_name=scratch_folder_name,
        max_tokens=max_tokens,
        merge_peers=merge_peers,
        min_chunk_tokens=min_chunk_tokens,
        filter_toc=filter_toc,
        page_chunks=page_chunks,
        write_images=write_images,
        image_path=image_path,
        image_format=image_format,
        dpi=dpi,
        margins=margins,
    )
    return PdfLoaderLight(config=loader_config)
721
+
722
+
723
# Public API of this module: the loader class and its factory function.
__all__ = ["PdfLoaderLight", "create_pdf_loader_light"]