visual-rag-toolkit 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/README.md +101 -0
- benchmarks/__init__.py +11 -0
- benchmarks/analyze_results.py +187 -0
- benchmarks/benchmark_datasets.txt +105 -0
- benchmarks/prepare_submission.py +205 -0
- benchmarks/quick_test.py +566 -0
- benchmarks/run_vidore.py +513 -0
- benchmarks/vidore_beir_qdrant/run_qdrant_beir.py +1365 -0
- benchmarks/vidore_tatdqa_test/COMMANDS.md +83 -0
- benchmarks/vidore_tatdqa_test/__init__.py +6 -0
- benchmarks/vidore_tatdqa_test/dataset_loader.py +363 -0
- benchmarks/vidore_tatdqa_test/metrics.py +44 -0
- benchmarks/vidore_tatdqa_test/run_qdrant.py +799 -0
- benchmarks/vidore_tatdqa_test/sweep_eval.py +372 -0
- demo/__init__.py +10 -0
- demo/app.py +45 -0
- demo/commands.py +334 -0
- demo/config.py +34 -0
- demo/download_models.py +75 -0
- demo/evaluation.py +602 -0
- demo/example_metadata_mapping_sigir.json +37 -0
- demo/indexing.py +286 -0
- demo/qdrant_utils.py +211 -0
- demo/results.py +35 -0
- demo/test_qdrant_connection.py +119 -0
- demo/ui/__init__.py +15 -0
- demo/ui/benchmark.py +355 -0
- demo/ui/header.py +30 -0
- demo/ui/playground.py +339 -0
- demo/ui/sidebar.py +162 -0
- demo/ui/upload.py +487 -0
- visual_rag/__init__.py +98 -0
- visual_rag/cli/__init__.py +1 -0
- visual_rag/cli/main.py +629 -0
- visual_rag/config.py +230 -0
- visual_rag/demo_runner.py +90 -0
- visual_rag/embedding/__init__.py +26 -0
- visual_rag/embedding/pooling.py +343 -0
- visual_rag/embedding/visual_embedder.py +622 -0
- visual_rag/indexing/__init__.py +21 -0
- visual_rag/indexing/cloudinary_uploader.py +274 -0
- visual_rag/indexing/pdf_processor.py +324 -0
- visual_rag/indexing/pipeline.py +628 -0
- visual_rag/indexing/qdrant_indexer.py +478 -0
- visual_rag/preprocessing/__init__.py +3 -0
- visual_rag/preprocessing/crop_empty.py +120 -0
- visual_rag/qdrant_admin.py +222 -0
- visual_rag/retrieval/__init__.py +19 -0
- visual_rag/retrieval/multi_vector.py +222 -0
- visual_rag/retrieval/single_stage.py +126 -0
- visual_rag/retrieval/three_stage.py +173 -0
- visual_rag/retrieval/two_stage.py +471 -0
- visual_rag/visualization/__init__.py +19 -0
- visual_rag/visualization/saliency.py +335 -0
- visual_rag_toolkit-0.1.1.dist-info/METADATA +305 -0
- visual_rag_toolkit-0.1.1.dist-info/RECORD +59 -0
- visual_rag_toolkit-0.1.1.dist-info/WHEEL +4 -0
- visual_rag_toolkit-0.1.1.dist-info/entry_points.txt +3 -0
- visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE +22 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cloudinary Uploader - Upload images to Cloudinary CDN.
|
|
3
|
+
|
|
4
|
+
Works INDEPENDENTLY of PDF processing and embedding.
|
|
5
|
+
Use it if you just need to upload images to a CDN.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- Retry logic with timeouts
|
|
9
|
+
- Batch uploading
|
|
10
|
+
- Automatic JPEG optimization
|
|
11
|
+
|
|
12
|
+
Environment Variables:
|
|
13
|
+
- VISUAL_RAG_THREAD_SAFE: Set to "1" to use thread-safe timeouts
|
|
14
|
+
(required for Streamlit, Flask, or other threaded contexts)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import io
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import platform
|
|
21
|
+
import signal
|
|
22
|
+
import threading
|
|
23
|
+
import time
|
|
24
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
25
|
+
from concurrent.futures import TimeoutError as FuturesTimeoutError
|
|
26
|
+
from typing import Optional
|
|
27
|
+
|
|
28
|
+
from PIL import Image
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
THREAD_SAFE_MODE = os.getenv("VISUAL_RAG_THREAD_SAFE", "").lower() in ("1", "true", "yes")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class CloudinaryUploader:
    """
    Upload images to Cloudinary CDN.

    Works independently - just needs PIL images.

    Args:
        cloud_name: Cloudinary cloud name
        api_key: Cloudinary API key
        api_secret: Cloudinary API secret
        folder: Base folder for uploads
        max_retries: Number of retry attempts
        timeout_seconds: Timeout per upload
        jpeg_quality: JPEG encoding quality (1-100)

    Example:
        >>> uploader = CloudinaryUploader(
        ...     cloud_name="my-cloud",
        ...     api_key="xxx",
        ...     api_secret="yyy",
        ...     folder="my-project",
        ... )
        >>>
        >>> url = uploader.upload(image, "doc_page_1")
        >>> print(url)  # https://res.cloudinary.com/.../doc_page_1.jpg
    """

    def __init__(
        self,
        cloud_name: Optional[str] = None,
        api_key: Optional[str] = None,
        api_secret: Optional[str] = None,
        folder: str = "visual-rag",
        max_retries: int = 3,
        timeout_seconds: int = 30,
        jpeg_quality: int = 95,
    ):
        # Load from environment if not provided
        self.cloud_name = cloud_name or os.getenv("CLOUDINARY_CLOUD_NAME")
        self.api_key = api_key or os.getenv("CLOUDINARY_API_KEY")
        self.api_secret = api_secret or os.getenv("CLOUDINARY_API_SECRET")

        if not all([self.cloud_name, self.api_key, self.api_secret]):
            raise ValueError(
                "Cloudinary credentials required. Set CLOUDINARY_CLOUD_NAME, "
                "CLOUDINARY_API_KEY, CLOUDINARY_API_SECRET environment variables "
                "or pass them as arguments."
            )

        self.folder = folder
        self.max_retries = max_retries
        self.timeout_seconds = timeout_seconds
        self.jpeg_quality = jpeg_quality

        # Fail fast if the optional dependency is missing.
        try:
            import cloudinary  # noqa
        except ImportError:
            raise ImportError(
                "Cloudinary not installed. "
                "Install with: pip install visual-rag-toolkit[cloudinary]"
            )

        logger.info("☁️ Cloudinary uploader initialized")
        logger.info(f" Folder: {folder}")

    def upload(
        self,
        image: "Image.Image",
        public_id: str,
        subfolder: Optional[str] = None,
    ) -> Optional[str]:
        """
        Upload a single image to Cloudinary.

        Args:
            image: PIL Image to upload
            public_id: Public ID (filename without extension)
            subfolder: Optional subfolder within base folder

        Returns:
            Secure URL of uploaded image, or None if failed
        """
        import cloudinary
        import cloudinary.uploader

        # BUGFIX: JPEG cannot encode RGBA/LA/P images - normalize to RGB
        # first so `Image.save(..., format="JPEG")` cannot raise on mode.
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Encode once; every retry re-reads the same in-memory buffer.
        buffer = io.BytesIO()
        image.save(buffer, format="JPEG", quality=self.jpeg_quality, optimize=True)

        # Configure Cloudinary (global config; cheap to call per upload)
        cloudinary.config(
            cloud_name=self.cloud_name,
            api_key=self.api_key,
            api_secret=self.api_secret,
        )

        # Build folder path
        folder_path = f"{self.folder}/{subfolder}" if subfolder else self.folder

        def do_upload():
            buffer.seek(0)
            result = cloudinary.uploader.upload(
                buffer,
                folder=folder_path,
                overwrite=True,
                public_id=public_id,
                resource_type="image",
                timeout=self.timeout_seconds,
            )
            return result["secure_url"]

        # SIGALRM only works in the main thread, so use the executor-based
        # timeout in threaded contexts (Streamlit/Flask) or when forced via
        # VISUAL_RAG_THREAD_SAFE=1.
        if THREAD_SAFE_MODE or threading.current_thread() is not threading.main_thread():
            return self._upload_with_thread_timeout(do_upload, public_id)
        return self._upload_with_signal_timeout(do_upload, public_id)

    def _backoff(self, attempt: int) -> None:
        """Sleep with exponential backoff (1s, 2s, 4s, ...) unless this was the last attempt."""
        if attempt < self.max_retries - 1:
            time.sleep(2**attempt)

    def _upload_with_thread_timeout(self, do_upload, public_id: str) -> Optional[str]:
        """Thread-safe upload with ThreadPoolExecutor timeout.

        BUGFIX: the executor must NOT be used as a context manager - its
        __exit__ calls shutdown(wait=True), which blocks until the hung
        upload thread finishes and thereby defeats the timeout.  We shut
        down with wait=False and let a stuck worker finish in the background.
        """
        for attempt in range(self.max_retries):
            executor = ThreadPoolExecutor(max_workers=1)
            try:
                future = executor.submit(do_upload)
                return future.result(timeout=self.timeout_seconds)

            except FuturesTimeoutError:
                logger.warning(
                    f"Upload timeout (attempt {attempt + 1}/{self.max_retries}): {public_id}"
                )
                self._backoff(attempt)

            except Exception as e:
                logger.warning(f"Upload failed (attempt {attempt + 1}/{self.max_retries}): {e}")
                self._backoff(attempt)

            finally:
                # Never block on a possibly-hung worker thread.
                executor.shutdown(wait=False)

        logger.error(f"❌ Upload failed after {self.max_retries} attempts: {public_id}")
        return None

    def _upload_with_signal_timeout(self, do_upload, public_id: str) -> Optional[str]:
        """Signal-based upload timeout (main thread only, Unix/macOS).

        SIGALRM is unavailable on Windows, so there the upload runs with no
        hard timeout beyond the one passed to the Cloudinary SDK itself.
        """
        use_timeout = platform.system() != "Windows"

        class SignalTimeoutError(Exception):
            pass

        def timeout_handler(signum, frame):
            raise SignalTimeoutError(f"Upload timed out after {self.timeout_seconds}s")

        for attempt in range(self.max_retries):
            try:
                if use_timeout:
                    old_handler = signal.signal(signal.SIGALRM, timeout_handler)
                    signal.alarm(self.timeout_seconds)

                try:
                    return do_upload()
                finally:
                    # Always cancel the pending alarm and restore the
                    # previous handler, even when the upload raised.
                    if use_timeout:
                        signal.alarm(0)
                        signal.signal(signal.SIGALRM, old_handler)

            except SignalTimeoutError:
                logger.warning(
                    f"Upload timeout (attempt {attempt + 1}/{self.max_retries}): {public_id}"
                )
                self._backoff(attempt)

            except Exception as e:
                logger.warning(f"Upload failed (attempt {attempt + 1}/{self.max_retries}): {e}")
                self._backoff(attempt)

        logger.error(f"❌ Upload failed after {self.max_retries} attempts: {public_id}")
        return None

    def upload_original_and_resized(
        self,
        original_image: "Image.Image",
        resized_image: "Image.Image",
        base_public_id: str,
    ) -> tuple:
        """
        Upload both original and resized versions.

        Args:
            original_image: Original PDF page image
            resized_image: Resized image for ColPali
            base_public_id: Base public ID (e.g., "doc_page_1")

        Returns:
            Tuple of (original_url, resized_url) - either can be None on failure
        """
        original_url = self.upload(
            original_image,
            base_public_id,
            subfolder="original",
        )

        resized_url = self.upload(
            resized_image,
            base_public_id,
            subfolder="resized",
        )

        return original_url, resized_url

    def upload_original_cropped_and_resized(
        self,
        original_image: "Image.Image",
        cropped_image: "Optional[Image.Image]",
        resized_image: "Image.Image",
        base_public_id: str,
    ) -> tuple:
        """
        Upload original, optional cropped, and resized versions.

        Args:
            original_image: Original PDF page image
            cropped_image: Optional whitespace-cropped image (skipped if None)
            resized_image: Resized image for ColPali
            base_public_id: Base public ID (e.g., "doc_page_1")

        Returns:
            Tuple of (original_url, cropped_url, resized_url) - any can be
            None on failure; cropped_url is None when no cropped image given.
        """
        original_url = self.upload(
            original_image,
            base_public_id,
            subfolder="original",
        )

        cropped_url = None
        if cropped_image is not None:
            cropped_url = self.upload(
                cropped_image,
                base_public_id,
                subfolder="cropped",
            )

        resized_url = self.upload(
            resized_image,
            base_public_id,
            subfolder="resized",
        )

        return original_url, cropped_url, resized_url
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF Processor - Convert PDFs to images and extract text.
|
|
3
|
+
|
|
4
|
+
This module works INDEPENDENTLY of embedding and vector storage.
|
|
5
|
+
Use it if you just need PDF → images conversion.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- Batch processing to save memory
|
|
9
|
+
- Text extraction with surrogate character handling
|
|
10
|
+
- Configurable DPI and quality settings
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import gc
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Dict, Generator, List, Optional, Tuple
|
|
18
|
+
|
|
19
|
+
from PIL import Image
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PDFProcessor:
    """
    Process PDFs into images and text for visual retrieval.

    Works independently - no embedding or storage dependencies.

    Args:
        dpi: DPI for image conversion (higher = better quality)
        output_format: Image format (RGB, L, etc.)
        page_batch_size: Pages per batch for memory efficiency

    Example:
        >>> processor = PDFProcessor(dpi=140)
        >>>
        >>> # Convert single PDF
        >>> images, texts = processor.process_pdf(Path("report.pdf"))
        >>>
        >>> # Stream large PDFs
        >>> for images, texts in processor.stream_pdf(Path("large.pdf"), batch_size=10):
        ...     # Process each batch
        ...     pass
    """

    def __init__(
        self,
        dpi: int = 140,
        output_format: str = "RGB",
        page_batch_size: int = 50,
    ):
        self.dpi = dpi
        self.output_format = output_format
        self.page_batch_size = page_batch_size

        # PDF deps are optional: we only require them when calling PDF-specific methods.
        # This keeps the class usable for helper utilities like `resize_for_colpali()`
        # even in minimal installs.
        self._pdf_deps_available = True
        try:
            import pdf2image  # noqa: F401
            import pypdf  # noqa: F401
        except Exception:
            self._pdf_deps_available = False

    def _require_pdf_deps(self) -> None:
        """Raise ImportError with install instructions when pdf2image/pypdf are missing."""
        if not self._pdf_deps_available:
            raise ImportError(
                "PDF processing requires `pdf2image` and `pypdf`.\n"
                'Install with: pip install "visual-rag-toolkit[pdf]"'
            )

    def process_pdf(
        self,
        pdf_path: Path,
        dpi: Optional[int] = None,
    ) -> "Tuple[List[Image.Image], List[str]]":
        """
        Convert PDF to images and extract text.

        Args:
            pdf_path: Path to PDF file
            dpi: Override default DPI

        Returns:
            Tuple of (list of images, list of page texts)

        Raises:
            ImportError: if pdf2image/pypdf are not installed
            ValueError: if image and text page counts diverge
        """
        self._require_pdf_deps()
        from pdf2image import convert_from_path
        from pypdf import PdfReader

        dpi = dpi or self.dpi
        pdf_path = Path(pdf_path)

        logger.info(f"📄 Processing PDF: {pdf_path.name}")

        # Extract text
        reader = PdfReader(str(pdf_path))
        total_pages = len(reader.pages)

        page_texts = []
        for page in reader.pages:
            text = page.extract_text() or ""
            # Handle surrogate characters
            text = self._sanitize_text(text)
            page_texts.append(text)

        # Convert to images in batches (pdf2image pages are 1-indexed)
        all_images = []
        for start_page in range(1, total_pages + 1, self.page_batch_size):
            end_page = min(start_page + self.page_batch_size - 1, total_pages)

            batch_images = convert_from_path(
                str(pdf_path),
                dpi=dpi,
                fmt=self.output_format.lower(),
                first_page=start_page,
                last_page=end_page,
            )

            all_images.extend(batch_images)

            del batch_images
            gc.collect()

        # BUGFIX: was an `assert`, which is silently stripped under `python -O`.
        # An explicit check always enforces the invariant.
        if len(all_images) != len(page_texts):
            raise ValueError(
                f"Mismatch: {len(all_images)} images vs {len(page_texts)} texts"
            )

        logger.info(f"✅ Processed {len(all_images)} pages")
        return all_images, page_texts

    def stream_pdf(
        self,
        pdf_path: Path,
        batch_size: int = 10,
        dpi: Optional[int] = None,
    ) -> "Generator[Tuple[List[Image.Image], List[str], int], None, None]":
        """
        Stream PDF processing for large files.

        Yields batches of (images, texts, start_page) without loading
        entire PDF into memory.

        Args:
            pdf_path: Path to PDF file
            batch_size: Pages per batch
            dpi: Override default DPI

        Yields:
            Tuple of (batch_images, batch_texts, start_page_number)
        """
        self._require_pdf_deps()
        from pdf2image import convert_from_path
        from pypdf import PdfReader

        dpi = dpi or self.dpi
        pdf_path = Path(pdf_path)

        reader = PdfReader(str(pdf_path))
        total_pages = len(reader.pages)

        logger.info(f"📄 Streaming PDF: {pdf_path.name} ({total_pages} pages)")

        for start_idx in range(0, total_pages, batch_size):
            end_idx = min(start_idx + batch_size, total_pages)

            # Extract text for batch
            batch_texts = []
            for page_idx in range(start_idx, end_idx):
                text = reader.pages[page_idx].extract_text() or ""
                text = self._sanitize_text(text)
                batch_texts.append(text)

            # Convert images for batch
            batch_images = convert_from_path(
                str(pdf_path),
                dpi=dpi,
                fmt=self.output_format.lower(),
                first_page=start_idx + 1,  # 1-indexed
                last_page=end_idx,
            )

            yield batch_images, batch_texts, start_idx + 1

            # Free batch memory before converting the next one
            del batch_images
            gc.collect()

    def get_page_count(self, pdf_path: Path) -> int:
        """Get number of pages in PDF without loading images."""
        self._require_pdf_deps()
        from pypdf import PdfReader

        reader = PdfReader(str(pdf_path))
        return len(reader.pages)

    def resize_for_colpali(
        self,
        image: "Image.Image",
        max_edge: int = 2048,
        tile_size: int = 512,
    ) -> "Tuple[Image.Image, int, int]":
        """
        Resize image following ColPali/Idefics3 processor logic.

        Resizes to fit within tile grid without black padding.

        Args:
            image: PIL Image
            max_edge: Maximum edge length
            tile_size: Size of each tile

        Returns:
            Tuple of (resized_image, tile_rows, tile_cols)
        """
        # Ensure consistent mode for downstream processors (and predictable tests)
        if image.mode != "RGB":
            image = image.convert("RGB")

        w, h = image.size

        # Step 1: Resize so longest edge = max_edge
        if w > h:
            new_w = max_edge
            new_h = int(h * (max_edge / w))
        else:
            new_h = max_edge
            new_w = int(w * (max_edge / h))

        # Step 2: Calculate tile grid (ceiling division)
        tile_cols = (new_w + tile_size - 1) // tile_size
        tile_rows = (new_h + tile_size - 1) // tile_size

        # Step 3: Calculate exact dimensions for tiles
        final_w = tile_cols * tile_size
        final_h = tile_rows * tile_size

        # Step 4: Scale to fit within tile grid
        scale_w = final_w / w
        scale_h = final_h / h
        scale = min(scale_w, scale_h)

        # BUGFIX: clamp to >= 1 px - `int()` truncation can produce a zero
        # dimension for extreme aspect ratios, which crashes Image.resize.
        scaled_w = max(1, int(w * scale))
        scaled_h = max(1, int(h * scale))

        resized = image.resize((scaled_w, scaled_h), Image.LANCZOS)

        # Center on white canvas if needed
        if scaled_w != final_w or scaled_h != final_h:
            canvas = Image.new("RGB", (final_w, final_h), (255, 255, 255))
            offset_x = (final_w - scaled_w) // 2
            offset_y = (final_h - scaled_h) // 2
            canvas.paste(resized, (offset_x, offset_y))
            resized = canvas

        return resized, tile_rows, tile_cols

    def _sanitize_text(self, text: str) -> str:
        """Remove invalid Unicode characters (surrogates) from text."""
        if not text:
            return ""

        # Remove surrogate characters (U+D800-U+DFFF): surrogatepass lets
        # them encode, then the lossy decode drops the invalid sequences.
        return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")

    def extract_metadata_from_filename(
        self,
        filename: str,
        mapping: Optional[Dict[str, Dict[str, Any]]] = None,
    ) -> Dict[str, Any]:
        """
        Extract metadata from PDF filename.

        Uses mapping if provided, otherwise falls back to pattern matching.

        Args:
            filename: PDF filename (with or without .pdf extension)
            mapping: Optional mapping dict {filename: metadata}

        Returns:
            Metadata dict with year, source, district, etc.
        """
        # Remove extension
        stem = Path(filename).stem
        stem_lower = stem.lower().strip()

        # Try mapping first (a copy so callers can't mutate the mapping entry)
        if mapping:
            if stem_lower in mapping:
                return mapping[stem_lower].copy()

            # Try without .pdf (covers double extensions like "doc.pdf.pdf")
            stem_no_ext = stem_lower.replace(".pdf", "")
            if stem_no_ext in mapping:
                return mapping[stem_no_ext].copy()

        # Fallback: pattern matching
        metadata = {"filename": filename}

        # Extract year (any 20xx)
        year_match = re.search(r"(20\d{2})", stem)
        if year_match:
            metadata["year"] = int(year_match.group(1))

        # Detect source type
        if "consolidated" in stem_lower or ("annual" in stem_lower and "oag" in stem_lower):
            metadata["source"] = "Consolidated"
        elif "dlg" in stem_lower or "district local government" in stem_lower:
            metadata["source"] = "Local Government"
            # Try to extract district name
            district_match = re.search(r"([a-z]+)\s+(?:dlg|district local government)", stem_lower)
            if district_match:
                metadata["district"] = district_match.group(1).title()
        elif "hospital" in stem_lower or "referral" in stem_lower:
            metadata["source"] = "Hospital"
        elif "ministry" in stem_lower:
            metadata["source"] = "Ministry"
        elif "project" in stem_lower:
            metadata["source"] = "Project"
        else:
            metadata["source"] = "Unknown"

        return metadata
|