rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_systems_vectorstore/MANIFEST.in +26 -0
- rakam_systems_vectorstore/README.md +1071 -0
- rakam_systems_vectorstore/__init__.py +93 -0
- rakam_systems_vectorstore/components/__init__.py +0 -0
- rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
- rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
- rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
- rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
- rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
- rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
- rakam_systems_vectorstore/components/loader/__init__.py +31 -0
- rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
- rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
- rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
- rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
- rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
- rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
- rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
- rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
- rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
- rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
- rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
- rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
- rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
- rakam_systems_vectorstore/config.py +266 -0
- rakam_systems_vectorstore/core.py +8 -0
- rakam_systems_vectorstore/pyproject.toml +113 -0
- rakam_systems_vectorstore/server/README.md +290 -0
- rakam_systems_vectorstore/server/__init__.py +20 -0
- rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
- rakam_systems_vectorstore/setup.py +103 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
@@ -0,0 +1,771 @@
+"""
+PDF Loader using Docling library for advanced PDF processing.
+
+This loader uses the Docling library to extract text, images, tables, and figures
+from PDF documents with high quality. It supports:
+- Text extraction with layout preservation
+- Image extraction (page images, figures, tables)
+- Markdown export with embedded or referenced images
+- Configurable image resolution
+
+The loader stores extracted images and markdown in a scratch folder within the data directory.
+"""
+
+from __future__ import annotations
+
+import mimetypes
+import os
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
+
+from rakam_systems_core.ai_utils import logging
+from rakam_systems_core.ai_core.interfaces.loader import Loader
+from rakam_systems_vectorstore.components.chunker import AdvancedChunker
+from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
+
+logger = logging.getLogger(__name__)
+
+
+class PdfLoader(Loader):
+    """
+    PDF loader using Docling for advanced document processing.
+
+    This loader provides high-quality PDF processing with support for:
+    - Text extraction with layout preservation
+    - Image extraction (pages, figures, tables)
+    - Markdown export with images
+    - Configurable processing options
+
+    The extracted content is chunked and returned as text or Node objects.
+    Images and markdown files are saved to a scratch directory for reference.
+    """
+
+    # Default configuration
+    DEFAULT_IMAGE_SCALE = 2.0  # Scale=1 ~ 72 DPI, Scale=2 ~ 144 DPI
+    DEFAULT_EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
+    DEFAULT_CHUNKER_STRATEGY = "markdown_tables"
+    DEFAULT_MAX_TOKENS = 1024  # Larger chunks for better context
+    DEFAULT_MIN_CHUNK_TOKENS = 50  # Minimum tokens for standalone chunks
+
+    def __init__(
+        self,
+        name: str = "pdf_loader",
+        config: Optional[Dict[str, Any]] = None
+    ):
+        """
+        Initialize PDF loader with Docling.
+
+        Args:
+            name: Component name
+            config: Optional configuration with keys:
+                - image_scale: Image resolution scale (default: 2.0)
+                - generate_page_images: Whether to generate page images (default: True)
+                - generate_picture_images: Whether to generate picture images (default: True)
+                - embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
+                - chunker_strategy: Strategy for chunking ("default", "markdown_tables", "annotations", default: "markdown_tables")
+                - save_images: Whether to save images to disk (default: True)
+                - save_markdown: Whether to save markdown files (default: True)
+                - scratch_folder_name: Name of scratch folder (default: "scratch")
+                - include_images_in_chunks: Whether to include image references in text chunks (default: True)
+                - max_tokens: Maximum tokens per chunk (default: 1024)
+                - merge_peers: Whether to merge adjacent small chunks (default: True)
+                - min_chunk_tokens: Minimum tokens for standalone chunks (default: 50)
+                - filter_toc: Whether to filter out Table of Contents entries (default: True)
+        """
+        super().__init__(name=name, config=config)
+
+        # Extract configuration
+        config = config or {}
+        self._image_scale = config.get('image_scale', self.DEFAULT_IMAGE_SCALE)
+        self._generate_page_images = config.get('generate_page_images', True)
+        self._generate_picture_images = config.get(
+            'generate_picture_images', True)
+        self._save_images = config.get('save_images', True)
+        self._save_markdown = config.get('save_markdown', True)
+        self._scratch_folder_name = config.get(
+            'scratch_folder_name', 'scratch')
+        self._include_images_in_chunks = config.get(
+            'include_images_in_chunks', True)
+
+        # Chunker configuration
+        self._max_tokens = config.get('max_tokens', self.DEFAULT_MAX_TOKENS)
+        self._merge_peers = config.get('merge_peers', True)
+        self._min_chunk_tokens = config.get(
+            'min_chunk_tokens', self.DEFAULT_MIN_CHUNK_TOKENS)
+        self._filter_toc = config.get('filter_toc', True)
+
+        # Initialize advanced chunker with improved settings
+        embed_model_id = config.get(
+            'embed_model_id', self.DEFAULT_EMBED_MODEL_ID)
+        chunker_strategy = config.get(
+            'chunker_strategy', self.DEFAULT_CHUNKER_STRATEGY)
+        self._chunker = AdvancedChunker(
+            embed_model_id=embed_model_id,
+            strategy=chunker_strategy,
+            max_tokens=self._max_tokens,
+            merge_peers=self._merge_peers,
+            min_chunk_tokens=self._min_chunk_tokens,
+            filter_toc=self._filter_toc,
+        )
+
+        # Initialize document converter with pipeline options
+        self._doc_converter = self._create_converter()
+
+        # Store conversion result for image tracking
+        self._last_conv_res = None
+        self._last_scratch_dir = None
+
+        logger.info(
+            f"Initialized PdfLoader with image_scale={self._image_scale}, chunker_strategy={chunker_strategy}, include_images_in_chunks={self._include_images_in_chunks}")
+
+    def run(self, source: str) -> List[str]:
+        """
+        Execute the primary operation for the component.
+
+        This method satisfies the BaseComponent abstract method requirement
+        and delegates to load_as_chunks.
+
+        Args:
+            source: Path to PDF file
+
+        Returns:
+            List of text chunks extracted from the PDF
+        """
+        return self.load_as_chunks(source)
+
+    def _create_converter(self) -> DocumentConverter:
+        """Create and configure the Docling document converter."""
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.images_scale = self._image_scale
+        pipeline_options.generate_page_images = self._generate_page_images
+        pipeline_options.generate_picture_images = self._generate_picture_images
+
+        doc_converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=pipeline_options)
+            }
+        )
+
+        return doc_converter
+
+    def load_as_nodes(
+        self,
+        source: Union[str, Path],
+        source_id: Optional[str] = None,
+        custom_metadata: Optional[Dict[str, Any]] = None
+    ) -> List[Node]:
+        """
+        Load PDF and return as Node objects with metadata.
+
+        Args:
+            source: Path to PDF file
+            source_id: Optional source identifier (defaults to file path)
+            custom_metadata: Optional custom metadata to attach to nodes
+
+        Returns:
+            List of Node objects with text chunks and metadata
+        """
+        # Convert Path to string
+        if isinstance(source, Path):
+            source = str(source)
+
+        # Load text chunks
+        chunks = self.load_as_chunks(source)
+
+        # Determine source ID
+        if source_id is None:
+            source_id = source
+
+        # Create nodes with metadata
+        nodes = []
+        for idx, chunk in enumerate(chunks):
+            metadata = NodeMetadata(
+                source_file_uuid=source_id,
+                position=idx,
+                custom=custom_metadata or {}
+            )
+            node = Node(content=chunk, metadata=metadata)
+            nodes.append(node)
+
+        logger.info(f"Created {len(nodes)} nodes from PDF: {source}")
+        return nodes
+
+    def load_as_text(
+        self,
+        source: Union[str, Path],
+    ) -> str:
+        """
+        Load PDF and return as a single text string.
+
+        This method extracts all text from the PDF and returns it as a single
+        string without chunking. Useful when you need the full document text.
+
+        Args:
+            source: Path to PDF file
+
+        Returns:
+            Full text content of the PDF as a single string
+
+        Raises:
+            FileNotFoundError: If source file doesn't exist
+            ValueError: If source is not a PDF file
+            Exception: If PDF processing fails
+        """
+        # Convert Path to string
+        if isinstance(source, Path):
+            source = str(source)
+
+        # Validate file exists
+        if not os.path.isfile(source):
+            raise FileNotFoundError(f"File not found: {source}")
+
+        # Validate file is a PDF
+        if not self._is_pdf_file(source):
+            raise ValueError(
+                f"File is not a PDF: {source}. MIME type: {mimetypes.guess_type(source)[0]}")
+
+        logger.info(f"Loading PDF as text: {source}")
+        start_time = time.time()
+
+        try:
+            # Convert PDF document
+            conv_res = self._doc_converter.convert(source)
+
+            # Export the full document as markdown text
+            full_text = conv_res.document.export_to_markdown()
+
+            elapsed = time.time() - start_time
+            logger.info(
+                f"PDF loaded as text in {elapsed:.2f}s: {len(conv_res.document.pages)} pages, {len(full_text)} characters")
+
+            return full_text
+
+        except Exception as e:
+            logger.error(f"Error loading PDF as text {source}: {e}")
+            raise
+
+    def load_as_chunks(
+        self,
+        source: Union[str, Path],
+    ) -> List[str]:
+        """
+        Load PDF and return as a list of text chunks.
+
+        This method extracts text from the PDF, processes it with the configured
+        chunker strategy, and returns a list of text chunks. Each chunk includes
+        contextualization and optionally image references.
+
+        Args:
+            source: Path to PDF file
+
+        Returns:
+            List of text chunks extracted from the PDF
+
+        Raises:
+            FileNotFoundError: If source file doesn't exist
+            ValueError: If source is not a PDF file
+            Exception: If PDF processing fails
+        """
+        # Convert Path to string
+        if isinstance(source, Path):
+            source = str(source)
+
+        # Validate file exists
+        if not os.path.isfile(source):
+            raise FileNotFoundError(f"File not found: {source}")
+
+        # Validate file is a PDF
+        if not self._is_pdf_file(source):
+            raise ValueError(
+                f"File is not a PDF: {source}. MIME type: {mimetypes.guess_type(source)[0]}")
+
+        logger.info(f"Loading PDF file: {source}")
+        start_time = time.time()
+
+        try:
+            # Convert PDF document
+            conv_res = self._doc_converter.convert(source)
+
+            # Create scratch directory in data folder
+            scratch_dir = self._get_scratch_dir(source)
+
+            # Store for later use in image inclusion
+            self._last_conv_res = conv_res
+            self._last_scratch_dir = scratch_dir
+
+            # Save images and tables if enabled
+            if self._save_images:
+                self._save_page_images(conv_res, scratch_dir)
+                self._save_element_images(conv_res, scratch_dir)
+
+            # Save markdown if enabled
+            if self._save_markdown:
+                self._save_markdown_files(conv_res, scratch_dir)
+
+            # Extract text and chunk it
+            text_chunks = self._extract_and_chunk_text(conv_res, scratch_dir)
+
+            elapsed = time.time() - start_time
+            logger.info(
+                f"PDF processed in {elapsed:.2f}s: {len(conv_res.document.pages)} pages, {len(text_chunks)} chunks")
+
+            return text_chunks
+
+        except Exception as e:
+            logger.error(f"Error processing PDF {source}: {e}")
+            raise
+
+    def load_as_vsfile(
+        self,
+        file_path: Union[str, Path],
+        custom_metadata: Optional[Dict[str, Any]] = None
+    ) -> VSFile:
+        """
+        Load PDF and return as VSFile object.
+
+        Args:
+            file_path: Path to PDF file
+            custom_metadata: Optional custom metadata
+
+        Returns:
+            VSFile object with nodes
+
+        Raises:
+            FileNotFoundError: If file doesn't exist
+            ValueError: If file is not a PDF
+        """
+        if isinstance(file_path, Path):
+            file_path = str(file_path)
+
+        if not os.path.isfile(file_path):
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        if not self._is_pdf_file(file_path):
+            raise ValueError(f"File is not a PDF: {file_path}")
+
+        # Create VSFile
+        vsfile = VSFile(file_path)
+
+        # Load and create nodes
+        nodes = self.load_as_nodes(
+            file_path, str(vsfile.uuid), custom_metadata)
+        vsfile.nodes = nodes
+        vsfile.processed = True
+
+        logger.info(
+            f"Created VSFile with {len(nodes)} nodes from: {file_path}")
+        return vsfile
+
+    def _is_pdf_file(self, file_path: str) -> bool:
+        """
+        Check if file is a PDF based on extension and MIME type.
+
+        Args:
+            file_path: Path to file
+
+        Returns:
+            True if file is a PDF, False otherwise
+        """
+        # Check extension
+        path = Path(file_path)
+        if path.suffix.lower() != '.pdf':
+            return False
+
+        # Check MIME type
+        mime_type, _ = mimetypes.guess_type(file_path)
+        if mime_type and mime_type != 'application/pdf':
+            return False
+
+        return True
+
+    def _get_scratch_dir(self, source_path: str) -> Path:
+        """
+        Get scratch directory for storing extracted files.
+
+        The scratch directory is created inside the data folder relative to the source file.
+
+        Args:
+            source_path: Path to source PDF file
+
+        Returns:
+            Path to scratch directory
+        """
+        source = Path(source_path)
+
+        # Find data folder - assume it's a parent of the source or sibling
+        if 'data' in source.parts:
+            # Navigate to data folder
+            data_folder = source
+            while data_folder.name != 'data' and data_folder.parent != data_folder:
+                data_folder = data_folder.parent
+        else:
+            # Use parent directory and create/use data folder
+            data_folder = source.parent / 'data'
+
+        # Create scratch directory inside data folder
+        scratch_dir = data_folder / self._scratch_folder_name
+        scratch_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.debug(f"Using scratch directory: {scratch_dir}")
+        return scratch_dir
+
+    def _save_page_images(self, conv_res, scratch_dir: Path) -> None:
+        """Save page images to scratch directory."""
+        doc_filename = conv_res.input.file.stem
+
+        for page_no, page in conv_res.document.pages.items():
+            if not hasattr(page, 'image') or page.image is None:
+                continue
+
+            page_image_filename = scratch_dir / \
+                f"{doc_filename}-page-{page.page_no}.png"
+            try:
+                with page_image_filename.open("wb") as fp:
+                    page.image.pil_image.save(fp, format="PNG")
+                logger.debug(
+                    f"Saved page {page.page_no} image to {page_image_filename}")
+            except Exception as e:
+                logger.warning(
+                    f"Failed to save page {page.page_no} image: {e}")
+
+    def _save_element_images(self, conv_res, scratch_dir: Path) -> None:
+        """Save images of tables and figures to scratch directory."""
+        doc_filename = conv_res.input.file.stem
+        table_counter = 0
+        picture_counter = 0
+
+        for element, _level in conv_res.document.iterate_items():
+            try:
+                if isinstance(element, TableItem):
+                    table_counter += 1
+                    element_image_filename = (
+                        scratch_dir /
+                        f"{doc_filename}-table-{table_counter}.png"
+                    )
+                    with element_image_filename.open("wb") as fp:
+                        element.get_image(conv_res.document).save(fp, "PNG")
+                    logger.debug(
+                        f"Saved table {table_counter} to {element_image_filename}")
+
+                elif isinstance(element, PictureItem):
+                    picture_counter += 1
+                    element_image_filename = (
+                        scratch_dir /
+                        f"{doc_filename}-picture-{picture_counter}.png"
+                    )
+                    with element_image_filename.open("wb") as fp:
+                        element.get_image(conv_res.document).save(fp, "PNG")
+                    logger.debug(
+                        f"Saved picture {picture_counter} to {element_image_filename}")
+
+            except Exception as e:
+                logger.warning(f"Failed to save element image: {e}")
+
+        logger.info(
+            f"Saved {table_counter} tables and {picture_counter} pictures")
+
+    def _save_markdown_files(self, conv_res, scratch_dir: Path) -> None:
+        """Save markdown files with images."""
+        doc_filename = conv_res.input.file.stem
+
+        try:
+            # Save markdown with embedded images
+            md_filename = scratch_dir / f"{doc_filename}-with-images.md"
+            conv_res.document.save_as_markdown(
+                md_filename, image_mode=ImageRefMode.EMBEDDED)
+            logger.debug(
+                f"Saved markdown with embedded images to {md_filename}")
+
+            # Save markdown with referenced images
+            md_filename = scratch_dir / f"{doc_filename}-with-image-refs.md"
+            conv_res.document.save_as_markdown(
+                md_filename, image_mode=ImageRefMode.REFERENCED)
+            logger.debug(
+                f"Saved markdown with image references to {md_filename}")
+
+            # Save HTML with referenced images
+            html_filename = scratch_dir / \
+                f"{doc_filename}-with-image-refs.html"
+            conv_res.document.save_as_html(
+                html_filename, image_mode=ImageRefMode.REFERENCED)
+            logger.debug(
+                f"Saved HTML with image references to {html_filename}")
+
+        except Exception as e:
+            logger.warning(f"Failed to save markdown files: {e}")
+
+    def _extract_and_chunk_text(self, conv_res, scratch_dir: Path) -> List[str]:
+        """
+        Extract text from conversion result and chunk it using AdvancedChunker.
+
+        Args:
+            conv_res: Docling conversion result
+            scratch_dir: Path to scratch directory with images
+
+        Returns:
+            List of text chunks with contextualization and optional image references
+        """
+        text_chunks = []
+
+        try:
+            # Use AdvancedChunker to chunk the DoclingDocument directly
+            # This provides better chunking with table support and contextualization
+            chunk_count = 0
+            for chunk in self._chunker.chunk_docling_document(conv_res.document):
+                # Get contextualized text for each chunk
+                ctx_text = self._chunker.contextualize(chunk=chunk)
+
+                # If enabled, add image references to chunks
+                if self._include_images_in_chunks:
+                    logger.debug(
+                        f"Processing chunk {chunk_count}: has meta={hasattr(chunk, 'meta')}")
+                    if hasattr(chunk, 'meta'):
+                        logger.debug(
+                            f"  meta has doc_items={hasattr(chunk.meta, 'doc_items')}")
+                        if hasattr(chunk.meta, 'doc_items'):
+                            logger.debug(
+                                f"  doc_items count={len(chunk.meta.doc_items)}")
+
+                    ctx_text = self._add_images_to_chunk(
+                        ctx_text, chunk, conv_res, scratch_dir)
+
+                text_chunks.append(ctx_text)
+                chunk_count += 1
+
+        except Exception as e:
+            logger.warning(
+                f"Failed to chunk document with AdvancedChunker: {e}")
+            # Fall back to simple text extraction if advanced chunking fails
+            logger.info("Falling back to simple text extraction")
+            text_chunks = self._extract_text_fallback(conv_res)
+
+        return text_chunks
+
+    def _add_images_to_chunk(self, chunk_text: str, chunk, conv_res, scratch_dir: Path) -> str:
+        """
+        Add image references to a text chunk based on text content matching.
+        Images are added in the order they appear in the original document.
+
+        This method:
+        1. Gets the full markdown document text
+        2. Finds image markers (![Image](...)) in the markdown
+        3. Uses fuzzy matching to find which images belong to this chunk
+        4. Appends image references to the chunk
+
+        Args:
+            chunk_text: The contextualized text of the chunk
+            chunk: The chunk object from the chunker
+            conv_res: Docling conversion result
+            scratch_dir: Path to scratch directory with saved images
+
+        Returns:
+            Chunk text with appended image references in document order
+        """
+        doc_filename = conv_res.input.file.stem
+        image_refs = []
+
+        try:
+            # Read the full document markdown text with image references from the saved file
+            md_filename = scratch_dir / f"{doc_filename}-with-image-refs.md"
+            if not md_filename.exists():
+                logger.warning(
+                    f"Markdown file with image references not found: {md_filename}")
+                return chunk_text
+
+            full_doc_text = md_filename.read_text()
+
+            # Find all ![Image](...) markers in the markdown
+            # Note: paths can contain parentheses, so we need to match until .png) or .jpg)
+            import re
+            image_pattern = r'!\[Image\]\((.+?\.(?:png|jpg|jpeg|gif|webp))\)'
+            image_positions = []  # (position, img_path)
+
+            for match in re.finditer(image_pattern, full_doc_text):
+                img_pos = match.start()
+                img_path_in_md = match.group(1)
+                # Use the image path directly from markdown
+                image_positions.append((img_pos, img_path_in_md))
+                logger.debug(f"Found image in document at position {img_pos}")
+
+            logger.debug(
+                f"Found {len(image_positions)} images in full document (length: {len(full_doc_text)})")
+
+            # Now find which images belong to this chunk
+            # Strategy: Look for text snippets from the chunk in the full document
+            # Split chunk into sentences/paragraphs for better matching
+            chunk_lines = [line.strip() for line in chunk_text.split(
+                '\n') if line.strip() and len(line.strip()) > 20]
+
+            if chunk_lines:
+                # Find the position range of this chunk in the full document
+                # Try to match beginning and end of chunk
+                # First 100 chars of first substantial line
+                first_line = chunk_lines[0][:100]
+                last_line = chunk_lines[-1][:100] if len(
+                    chunk_lines) > 1 else first_line
+
+                # Remove title markers that chunker might add
+                first_line_clean = first_line.replace(
+                    '## ', '').replace('# ', '').strip()
+
+                chunk_start_pos = full_doc_text.find(first_line_clean)
+
+                if chunk_start_pos == -1:
+                    # Try with less text
+                    first_line_clean = first_line_clean[:50]
+                    chunk_start_pos = full_doc_text.find(first_line_clean)
+
+                if chunk_start_pos != -1:
+                    # Find chunk end - look for the last line
+                    last_line_clean = last_line.replace(
+                        '## ', '').replace('# ', '').strip()[:50]
+                    chunk_end_search = full_doc_text.find(
+                        last_line_clean, chunk_start_pos)
+
+                    if chunk_end_search != -1:
+                        chunk_end_pos = chunk_end_search + \
+                            len(last_line_clean) + 500  # Add buffer
+                    else:
+                        chunk_end_pos = chunk_start_pos + \
+                            len(chunk_text) + 500  # Estimate
+
+                    logger.debug(
+                        f"Chunk found at position {chunk_start_pos}-{chunk_end_pos}")
+
+                    # Find images that fall within this chunk's range
+                    for img_pos, img_path in sorted(image_positions):
+                        if chunk_start_pos <= img_pos <= chunk_end_pos:
+                            image_refs.append(f"\n![Image]({img_path})")
+                            logger.debug(
+                                f"Added image at position {img_pos} to chunk")
+                else:
+                    logger.debug(
+                        f"Could not find chunk position in full document (tried: '{first_line_clean[:30]}...')")
+
+            # Append image references to the chunk text
+            if image_refs:
+                chunk_text = chunk_text + "".join(image_refs)
+                logger.info(
+                    f"Added {len(image_refs)} image references to chunk")
+
+        except Exception as e:
+            logger.warning(f"Could not add images to chunk: {e}")
+            import traceback
+            logger.debug(f"Traceback: {traceback.format_exc()}")
+
+        return chunk_text
+
+    def _extract_text_fallback(self, conv_res) -> List[str]:
+        """
+        Fallback method for text extraction if advanced chunking fails.
+
+        Args:
+            conv_res: Docling conversion result
+
+        Returns:
+            List of text chunks
+        """
+        text_parts = []
+
+        # Extract text from each page
+        for page_no, page in conv_res.document.pages.items():
+            try:
+                # Export page as markdown to preserve structure
+                page_text = page.export_to_markdown()
+
+                if page_text and page_text.strip():
+                    text_parts.append(page_text)
+
+            except Exception as e:
+                logger.warning(
+                    f"Failed to extract text from page {page_no}: {e}")
+
+        # Join all text and use AdvancedChunker's raw text chunking
+        full_text = "\n\n".join(text_parts)
+        return self._chunker.run([full_text])
+
+
+def create_pdf_loader(
+    image_scale: float = 2.0,
+    embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
+    chunker_strategy: str = "markdown_tables",
+    save_images: bool = True,
+    save_markdown: bool = True,
+    scratch_folder_name: str = 'scratch',
+    include_images_in_chunks: bool = True,
+    max_tokens: int = 1024,
+    merge_peers: bool = True,
+    min_chunk_tokens: int = 50,
+    filter_toc: bool = True,
+) -> PdfLoader:
+    """
+    Factory function to create a PDF loader.
+
+    Args:
+        image_scale: Image resolution scale (1.0 ~ 72 DPI, 2.0 ~ 144 DPI)
+        embed_model_id: HuggingFace model ID for tokenization
+        chunker_strategy: Strategy for chunking:
+            - "default": Default serialization
+            - "markdown_tables": Markdown table formatting (recommended)
+            - "annotations": Include picture annotations
+            - "custom_placeholder": Custom image placeholders
+        save_images: Whether to save extracted images
+        save_markdown: Whether to save markdown files
+        scratch_folder_name: Name of scratch folder in data directory
+        include_images_in_chunks: Whether to include image references in text chunks (default: True)
+        max_tokens: Maximum tokens per chunk (default: 1024). Larger values create
+            bigger, more contextual chunks. Recommended: 512-2048.
+        merge_peers: Whether to merge adjacent small chunks with same metadata (default: True)
+        min_chunk_tokens: Minimum tokens for a standalone chunk (default: 50).
+            Smaller chunks will be merged with neighbors.
+        filter_toc: Whether to filter out Table of Contents entries (default: True).
+            TOC entries often create noisy, low-value chunks.
+
+    Returns:
+        Configured PDF loader
+
+    Example:
+        >>> # Basic usage with default settings
+        >>> loader = create_pdf_loader()
+        >>> chunks = loader.load_as_chunks("data/document.pdf")
+        >>> print(f"Extracted {len(chunks)} chunks")
+
+        >>> # Create loader with larger chunks and TOC filtering
+        >>> loader = create_pdf_loader(
+        ...     max_tokens=2048,
+        ...     filter_toc=True,
+        ...     min_chunk_tokens=100
+        ... )
+        >>> chunks = loader.load_as_chunks("data/document.pdf")
+
+        >>> # Create loader without image references in chunks
+        >>> loader = create_pdf_loader(include_images_in_chunks=False)
+        >>> chunks = loader.load_as_chunks("data/document.pdf")
+    """
+    config = {
+        'image_scale': image_scale,
+        'embed_model_id': embed_model_id,
+        'chunker_strategy': chunker_strategy,
+        'save_images': save_images,
+        'save_markdown': save_markdown,
+        'scratch_folder_name': scratch_folder_name,
+        'include_images_in_chunks': include_images_in_chunks,
+        'generate_page_images': True,
+        'generate_picture_images': True,
+        'max_tokens': max_tokens,
+        'merge_peers': merge_peers,
+        'min_chunk_tokens': min_chunk_tokens,
+        'filter_toc': filter_toc,
+    }
+
+    return PdfLoader(config=config)
+
+
+__all__ = ["PdfLoader", "create_pdf_loader"]
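
For orientation, a minimal usage sketch of the API above, assuming the package is installed; the module path follows the file layout listed at the top of this diff, and the PDF path and metadata values are placeholders:

    from rakam_systems_vectorstore.components.loader.pdf_loader import create_pdf_loader

    # Factory defaults: "markdown_tables" chunking, 1024-token chunks, TOC filtering.
    loader = create_pdf_loader(max_tokens=1024, filter_toc=True)

    # Chunked text for indexing; extracted images and markdown land in the
    # scratch folder that _get_scratch_dir resolves next to the source file.
    chunks = loader.load_as_chunks("data/document.pdf")  # placeholder path

    # Or wrap the result in a VSFile whose nodes carry custom metadata.
    vsfile = loader.load_as_vsfile("data/document.pdf",
                                   custom_metadata={"source": "demo"})
    print(len(vsfile.nodes), "nodes")

The same chunk list is also available through loader.run(...), which simply delegates to load_as_chunks.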
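
One subtlety worth noting in _add_images_to_chunk is the image-matching pattern: it matches lazily up to a known image extension, so parentheses inside a saved image path do not end the match early. A self-contained sketch of that behavior, using the pattern verbatim and a made-up markdown line:

    import re

    # Pattern copied from _add_images_to_chunk above.
    image_pattern = r'!\[Image\]\((.+?\.(?:png|jpg|jpeg|gif|webp))\)'

    # Hypothetical markdown with parentheses inside the image path.
    md = "Intro text ![Image](scratch/report (final)-picture-1.png) and more."

    for m in re.finditer(image_pattern, md):
        print(m.start(), m.group(1))
    # -> 11 scratch/report (final)-picture-1.png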