visual-rag-toolkit 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/README.md +101 -0
- benchmarks/__init__.py +11 -0
- benchmarks/analyze_results.py +187 -0
- benchmarks/benchmark_datasets.txt +105 -0
- benchmarks/prepare_submission.py +205 -0
- benchmarks/quick_test.py +566 -0
- benchmarks/run_vidore.py +513 -0
- benchmarks/vidore_beir_qdrant/run_qdrant_beir.py +1365 -0
- benchmarks/vidore_tatdqa_test/COMMANDS.md +83 -0
- benchmarks/vidore_tatdqa_test/__init__.py +6 -0
- benchmarks/vidore_tatdqa_test/dataset_loader.py +363 -0
- benchmarks/vidore_tatdqa_test/metrics.py +44 -0
- benchmarks/vidore_tatdqa_test/run_qdrant.py +799 -0
- benchmarks/vidore_tatdqa_test/sweep_eval.py +372 -0
- demo/__init__.py +10 -0
- demo/app.py +45 -0
- demo/commands.py +334 -0
- demo/config.py +34 -0
- demo/download_models.py +75 -0
- demo/evaluation.py +602 -0
- demo/example_metadata_mapping_sigir.json +37 -0
- demo/indexing.py +286 -0
- demo/qdrant_utils.py +211 -0
- demo/results.py +35 -0
- demo/test_qdrant_connection.py +119 -0
- demo/ui/__init__.py +15 -0
- demo/ui/benchmark.py +355 -0
- demo/ui/header.py +30 -0
- demo/ui/playground.py +339 -0
- demo/ui/sidebar.py +162 -0
- demo/ui/upload.py +487 -0
- visual_rag/__init__.py +98 -0
- visual_rag/cli/__init__.py +1 -0
- visual_rag/cli/main.py +629 -0
- visual_rag/config.py +230 -0
- visual_rag/demo_runner.py +90 -0
- visual_rag/embedding/__init__.py +26 -0
- visual_rag/embedding/pooling.py +343 -0
- visual_rag/embedding/visual_embedder.py +622 -0
- visual_rag/indexing/__init__.py +21 -0
- visual_rag/indexing/cloudinary_uploader.py +274 -0
- visual_rag/indexing/pdf_processor.py +324 -0
- visual_rag/indexing/pipeline.py +628 -0
- visual_rag/indexing/qdrant_indexer.py +478 -0
- visual_rag/preprocessing/__init__.py +3 -0
- visual_rag/preprocessing/crop_empty.py +120 -0
- visual_rag/qdrant_admin.py +222 -0
- visual_rag/retrieval/__init__.py +19 -0
- visual_rag/retrieval/multi_vector.py +222 -0
- visual_rag/retrieval/single_stage.py +126 -0
- visual_rag/retrieval/three_stage.py +173 -0
- visual_rag/retrieval/two_stage.py +471 -0
- visual_rag/visualization/__init__.py +19 -0
- visual_rag/visualization/saliency.py +335 -0
- visual_rag_toolkit-0.1.1.dist-info/METADATA +305 -0
- visual_rag_toolkit-0.1.1.dist-info/RECORD +59 -0
- visual_rag_toolkit-0.1.1.dist-info/WHEEL +4 -0
- visual_rag_toolkit-0.1.1.dist-info/entry_points.txt +3 -0
- visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE +22 -0
|
@@ -0,0 +1,628 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Processing Pipeline - Complete PDF → Qdrant pipeline with saliency metadata.
|
|
3
|
+
|
|
4
|
+
This module combines all components for end-to-end processing:
|
|
5
|
+
- PDF → images conversion
|
|
6
|
+
- Image resizing for ColPali
|
|
7
|
+
- Embedding generation with token info
|
|
8
|
+
- Tile-level pooling
|
|
9
|
+
- Cloudinary upload (optional)
|
|
10
|
+
- Qdrant indexing with full metadata for saliency maps
|
|
11
|
+
|
|
12
|
+
The metadata stored includes everything needed for saliency visualization:
|
|
13
|
+
- Tile structure (num_tiles, tile_rows, tile_cols, patches_per_tile)
|
|
14
|
+
- Image dimensions (original and resized)
|
|
15
|
+
- Token info (num_visual_tokens, visual_token_indices)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import gc
|
|
19
|
+
import hashlib
|
|
20
|
+
import logging
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any, Dict, List, Optional, Set
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
import torch
|
|
26
|
+
|
|
27
|
+
# Module-level logger named after this module; used by ProcessingPipeline for progress and error messages.
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ProcessingPipeline:
    """
    End-to-end pipeline for PDF processing and indexing.

    This pipeline:
    1. Converts PDFs to images
    2. Resizes for ColPali processing
    3. Generates embeddings with token info
    4. Computes pooling (strategy-dependent)
    5. Uploads images to Cloudinary (optional)
    6. Stores in Qdrant with full saliency metadata

    Args:
        embedder: VisualEmbedder instance
        indexer: QdrantIndexer instance (optional)
        cloudinary_uploader: CloudinaryUploader instance (optional)
        pdf_processor: PDFProcessor instance (optional, auto-created)
        metadata_mapping: Dict mapping filenames to extra metadata
        config: Configuration dict
        embedding_strategy: How to process embeddings before storing:
            - "pooling" (default): Extract visual tokens only, compute tile-level pooling
              This is our NOVEL contribution - preserves spatial structure while reducing size.
            - "standard": Push ALL tokens as-is (including special tokens, padding)
              This is the baseline approach for comparison.
            - "all": Same primary vectors as "pooling", plus an "extra_vectors"
              entry carrying both representations for comparison.
        crop_empty: When true, each page is passed through ``crop_empty`` before
            embedding and the cropped image is what gets embedded.
        crop_empty_percentage_to_remove: Forwarded to ``CropEmptyConfig``.
        crop_empty_remove_page_number: Forwarded to ``CropEmptyConfig``.
        crop_empty_preserve_border_px: Forwarded to ``CropEmptyConfig``.
        crop_empty_uniform_rowcol_std_threshold: Forwarded to ``CropEmptyConfig``.

    Raises:
        ValueError: If ``embedding_strategy`` is not one of ``STRATEGIES``.

    Example:
        >>> from visual_rag import VisualEmbedder, QdrantIndexer, CloudinaryUploader
        >>> from visual_rag.indexing.pipeline import ProcessingPipeline
        >>>
        >>> # Our novel pooling strategy (default)
        >>> pipeline = ProcessingPipeline(
        ...     embedder=VisualEmbedder(),
        ...     indexer=QdrantIndexer(url, api_key, "my_collection"),
        ...     embedding_strategy="pooling",  # Visual tokens only + tile pooling
        ... )
        >>>
        >>> # Standard baseline (all tokens, no filtering)
        >>> pipeline_baseline = ProcessingPipeline(
        ...     embedder=VisualEmbedder(),
        ...     indexer=QdrantIndexer(url, api_key, "my_collection_baseline"),
        ...     embedding_strategy="standard",  # All tokens as-is
        ... )
        >>>
        >>> pipeline.process_pdf(Path("report.pdf"))
    """

    # Valid embedding strategies
    # - "pooling": Visual tokens only + tile-level pooling (NOVEL)
    # - "standard": All tokens + global mean (BASELINE)
    # - "all": Embed once, push BOTH representations (efficient comparison)
    STRATEGIES = ["pooling", "standard", "all"]

    def __init__(
        self,
        embedder=None,
        indexer=None,
        cloudinary_uploader=None,
        pdf_processor=None,
        metadata_mapping: Optional[Dict[str, Dict[str, Any]]] = None,
        config: Optional[Dict[str, Any]] = None,
        embedding_strategy: str = "pooling",
        crop_empty: bool = False,
        crop_empty_percentage_to_remove: float = 0.9,
        crop_empty_remove_page_number: bool = False,
        crop_empty_preserve_border_px: int = 1,
        crop_empty_uniform_rowcol_std_threshold: float = 0.0,
    ):
        self.embedder = embedder
        self.indexer = indexer
        self.cloudinary_uploader = cloudinary_uploader
        self.metadata_mapping = metadata_mapping or {}
        self.config = config or {}

        # Validate and set embedding strategy
        if embedding_strategy not in self.STRATEGIES:
            raise ValueError(
                f"Invalid embedding_strategy: {embedding_strategy}. "
                f"Must be one of: {self.STRATEGIES}"
            )
        self.embedding_strategy = embedding_strategy

        # Coerce the crop settings once so later code can rely on their types.
        self.crop_empty = bool(crop_empty)
        self.crop_empty_percentage_to_remove = float(crop_empty_percentage_to_remove)
        self.crop_empty_remove_page_number = bool(crop_empty_remove_page_number)
        self.crop_empty_preserve_border_px = int(crop_empty_preserve_border_px)
        self.crop_empty_uniform_rowcol_std_threshold = float(
            crop_empty_uniform_rowcol_std_threshold
        )

        logger.info(f"📊 Embedding strategy: {embedding_strategy}")
        if embedding_strategy == "pooling":
            logger.info(" → Visual tokens only + tile-level mean pooling (NOVEL)")
        elif embedding_strategy == "standard":
            logger.info(" → All tokens as-is (BASELINE)")
        else:
            # "all" stores the pooling vectors as primary and the baseline
            # vectors alongside; it is not the baseline itself.
            logger.info(" → Visual tokens + tile pooling, plus standard baseline vectors (BOTH)")

        # Create PDF processor if not provided
        if pdf_processor is None:
            from visual_rag.indexing.pdf_processor import PDFProcessor

            dpi = self.config.get("processing", {}).get("dpi", 140)
            pdf_processor = PDFProcessor(dpi=dpi)
        self.pdf_processor = pdf_processor

        # Config defaults
        batching = self.config.get("batching", {})
        self.embedding_batch_size = batching.get("embedding_batch_size", 8)
        self.upload_batch_size = batching.get("upload_batch_size", 8)
        self.delay_between_uploads = self.config.get("delays", {}).get("between_uploads", 0.5)

    def process_pdf(
        self,
        pdf_path: Path,
        skip_existing: bool = True,
        upload_to_cloudinary: bool = True,
        upload_to_qdrant: bool = True,
        original_filename: Optional[str] = None,
        progress_callback: Optional[callable] = None,
    ) -> Dict[str, Any]:
        """
        Process a single PDF end-to-end.

        Args:
            pdf_path: Path to PDF file
            skip_existing: Skip pages that already exist in Qdrant
            upload_to_cloudinary: Upload images to Cloudinary
            upload_to_qdrant: Upload embeddings to Qdrant
            original_filename: Original filename (use this instead of pdf_path.name for temp files)
            progress_callback: Optional callback(stage, current, total, message) for progress updates.
                Stages emitted here are "convert", "embed" and "process".

        Returns:
            Dict with processing results:
            {
                "filename": str,
                "total_pages": int,
                "uploaded": int,
                "skipped": int,
                "failed": int,
                "pages": [...],  # Page data with embeddings and metadata
            }
        """
        pdf_path = Path(pdf_path)
        filename = original_filename or pdf_path.name
        logger.info(f"📚 Processing PDF: {filename}")

        # Check existing pages
        existing_ids: Set[str] = set()
        if skip_existing and self.indexer:
            existing_ids = self.indexer.get_existing_ids(filename)
            if existing_ids:
                logger.info(f" Found {len(existing_ids)} existing pages")

        logger.info("🖼️ Converting PDF to images...")
        if progress_callback:
            progress_callback("convert", 0, 0, "Converting PDF to images...")
        images, texts = self.pdf_processor.process_pdf(pdf_path)
        total_pages = len(images)
        logger.info(f" ✅ Converted {total_pages} pages")
        if progress_callback:
            progress_callback("convert", total_pages, total_pages, f"Converted {total_pages} pages")

        extra_metadata = self._get_extra_metadata(filename)
        if extra_metadata:
            logger.info(f" 📋 Found extra metadata: {list(extra_metadata)}")

        # Process in batches
        uploaded = 0
        skipped = 0
        failed = 0
        all_pages = []
        upload_queue = []
        should_index = bool(upload_to_qdrant and self.indexer)

        for batch_start in range(0, total_pages, self.embedding_batch_size):
            batch_end = min(batch_start + self.embedding_batch_size, total_pages)
            batch_images = images[batch_start:batch_end]
            batch_texts = texts[batch_start:batch_end]

            logger.info(f"📦 Processing pages {batch_start + 1}-{batch_end}/{total_pages}")
            if progress_callback:
                progress_callback(
                    "embed",
                    batch_start,
                    total_pages,
                    f"Embedding pages {batch_start + 1}-{batch_end}",
                )

            # Page numbers are 1-based; the chunk id is derived from
            # (filename, page number) so it can be compared to existing ids.
            pages_to_process = []
            for page_num, (img, text) in enumerate(
                zip(batch_images, batch_texts), start=batch_start + 1
            ):
                chunk_id = self.generate_chunk_id(filename, page_num)
                if skip_existing and chunk_id in existing_ids:
                    skipped += 1
                    continue
                pages_to_process.append(
                    {
                        "page_num": page_num,
                        "chunk_id": chunk_id,
                        "raw_image": img,
                        "text": text,
                    }
                )

            if not pages_to_process:
                logger.info(" All pages in batch exist, skipping...")
                continue

            # Generate embeddings with token info
            logger.info(f"🤖 Generating embeddings for {len(pages_to_process)} pages...")
            images_to_embed = []
            for page_info in pages_to_process:
                embed_img, crop_meta = self._crop_for_embedding(page_info["raw_image"])
                page_info["embed_image"] = embed_img
                page_info["crop_meta"] = crop_meta
                images_to_embed.append(embed_img)

            embeddings, token_infos = self.embedder.embed_images(
                images_to_embed,
                batch_size=self.embedding_batch_size,
                return_token_info=True,
                show_progress=False,
            )

            for idx, page_info in enumerate(pages_to_process):
                page_num = page_info["page_num"]

                if progress_callback:
                    progress_callback(
                        "process",
                        page_num,
                        total_pages,
                        f"Processing page {page_num}/{total_pages}",
                    )

                # NOTE(review): the upload call shares this try with page
                # processing, so an exception raised by the indexer is counted
                # against the current page and the queue is kept for the next
                # attempt. Confirm this is the intended accounting.
                try:
                    page_data = self._process_single_page(
                        filename=filename,
                        pdf_stem=pdf_path.stem,
                        page_num=page_num,
                        chunk_id=page_info["chunk_id"],
                        total_pages=total_pages,
                        raw_img=page_info["raw_image"],
                        embed_img=page_info["embed_image"],
                        text=page_info["text"],
                        embedding=embeddings[idx],
                        token_info=token_infos[idx],
                        extra_metadata=extra_metadata,
                        upload_to_cloudinary=upload_to_cloudinary,
                        crop_meta=page_info["crop_meta"],
                    )

                    all_pages.append(page_data)

                    if should_index:
                        upload_queue.append(page_data)

                        # Upload in batches
                        if len(upload_queue) >= self.upload_batch_size:
                            uploaded += self._upload_batch(upload_queue)
                            upload_queue = []

                except Exception as e:
                    # Best-effort per page: record the failure (with traceback)
                    # and carry on with the remaining pages.
                    logger.exception(f" ❌ Failed page {page_num}: {e}")
                    failed += 1

            # Memory cleanup
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Upload remaining pages
        if upload_queue and should_index:
            uploaded += self._upload_batch(upload_queue)

        logger.info(
            f"✅ Completed {filename}: {uploaded} uploaded, {skipped} skipped, {failed} failed"
        )

        return {
            "filename": filename,
            "total_pages": total_pages,
            "uploaded": uploaded,
            "skipped": skipped,
            "failed": failed,
            "pages": all_pages,
        }

    def _crop_for_embedding(self, raw_img):
        """Return ``(image_to_embed, crop_meta)`` for one page.

        With cropping disabled the raw image is returned unchanged and
        ``crop_meta`` is ``None``. Otherwise the image goes through
        ``crop_empty`` configured from this pipeline's ``crop_empty_*`` settings.
        """
        if not self.crop_empty:
            return raw_img, None

        # Imported lazily so the preprocessing module is only loaded when used.
        from visual_rag.preprocessing.crop_empty import CropEmptyConfig, crop_empty

        cropped_img, crop_meta = crop_empty(
            raw_img,
            config=CropEmptyConfig(
                percentage_to_remove=float(self.crop_empty_percentage_to_remove),
                remove_page_number=bool(self.crop_empty_remove_page_number),
                preserve_border_px=int(self.crop_empty_preserve_border_px),
                uniform_rowcol_std_threshold=float(
                    self.crop_empty_uniform_rowcol_std_threshold
                ),
            ),
        )
        return cropped_img, crop_meta

    def _process_single_page(
        self,
        filename: str,
        pdf_stem: str,
        page_num: int,
        chunk_id: str,
        total_pages: int,
        raw_img,
        embed_img,
        text: str,
        embedding: torch.Tensor,
        token_info: Dict[str, Any],
        extra_metadata: Dict[str, Any],
        upload_to_cloudinary: bool = True,
        crop_meta: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Process a single page with full metadata for saliency.

        Args:
            filename: Name stored in the page metadata.
            pdf_stem: Used to build the Cloudinary base filename.
            page_num: 1-based page number.
            chunk_id: Identifier stored as the result's "id".
            total_pages: Page count of the source document.
            raw_img: Original page image (its width/height are recorded).
            embed_img: Image that was embedded (the cropped image when cropping is on).
            text: Page text; only the first 10000 characters are stored.
            embedding: Token embeddings for the page (tensor or array-like).
            token_info: Must contain "visual_token_indices" and
                "num_visual_tokens"; "n_rows"/"n_cols" are used when present.
            extra_metadata: Merged last into the metadata dict, so its keys
                win on collision with the built-in ones.
            upload_to_cloudinary: Upload images when an uploader is configured.
            crop_meta: Optional crop result; its "crop_box" is stored.

        Returns:
            Dict with keys "id", "visual_embedding", "tile_pooled_embedding",
            "experimental_pooled_embedding", "global_pooled_embedding",
            "metadata", "image", "resized_image", and, for the "all"
            strategy only, "extra_vectors".
        """
        from visual_rag.embedding.pooling import global_mean_pooling

        # Resize image for ColPali
        resized_img, tile_rows, tile_cols = self.pdf_processor.resize_for_colpali(embed_img)

        # Use processor's tile info if available (more accurate)
        proc_n_rows = token_info.get("n_rows")
        proc_n_cols = token_info.get("n_cols")
        if proc_n_rows and proc_n_cols:
            tile_rows, tile_cols = proc_n_rows, proc_n_cols

        # Convert embedding to a float32 numpy array. ``.float()`` covers
        # bfloat16 (which numpy cannot represent) and ``.detach()`` keeps
        # ``.numpy()`` from failing on tensors that require grad.
        if isinstance(embedding, torch.Tensor):
            full_embedding = embedding.detach().cpu().float().numpy()
        else:
            full_embedding = np.array(embedding)
        full_embedding = full_embedding.astype(np.float32)

        # Token info for metadata
        visual_indices = token_info["visual_token_indices"]
        num_visual_tokens = token_info["num_visual_tokens"]

        # =========================================================================
        # STRATEGY: "pooling" (NOVEL) vs "standard" (BASELINE) vs "all" (BOTH)
        # =========================================================================

        # Always compute visual-only embedding (needed for pooling and saliency)
        visual_embedding = full_embedding[visual_indices]

        tile_pooled = self.embedder.mean_pool_visual_embedding(
            visual_embedding, token_info, target_vectors=32
        )
        experimental_pooled = self.embedder.experimental_pool_visual_embedding(
            visual_embedding, token_info, target_vectors=32, mean_pool=tile_pooled
        )
        global_pooled = global_mean_pooling(full_embedding)

        num_tiles = int(tile_pooled.shape[0])
        patches_per_tile = int(visual_embedding.shape[0] // num_tiles) if num_tiles else 0

        # Keep the current grid only when rows * cols + 1 equals the number of
        # pooled vectors; otherwise fall back to whatever the processor
        # reported (possibly None).
        grid_matches = bool(
            tile_rows and tile_cols and int(tile_rows) * int(tile_cols) + 1 == num_tiles
        )
        if not grid_matches:
            tile_rows = proc_n_rows or None
            tile_cols = proc_n_cols or None

        if self.embedding_strategy == "standard":
            # BASELINE: All tokens + global mean
            embedding_for_initial = full_embedding
            embedding_for_pooling = global_pooled.reshape(1, -1)
            global_pooling = global_pooled
        else:
            # "pooling" (NOVEL) and "all" share the same primary vectors:
            # visual tokens for search, tile-level vectors for fast prefetch.
            # "all" additionally attaches the baseline vectors further below.
            embedding_for_initial = visual_embedding
            embedding_for_pooling = tile_pooled
            global_pooling = (
                self.embedder.global_pool_from_mean_pool(tile_pooled)
                if tile_pooled.size
                else global_pooled
            )

        # Upload to Cloudinary
        original_url = None
        cropped_url = None
        resized_url = None

        if upload_to_cloudinary and self.cloudinary_uploader:
            base_filename = f"{pdf_stem}_page_{page_num}"
            if self.crop_empty:
                original_url, cropped_url, resized_url = (
                    self.cloudinary_uploader.upload_original_cropped_and_resized(
                        raw_img, embed_img, resized_img, base_filename
                    )
                )
            else:
                original_url, resized_url = self.cloudinary_uploader.upload_original_and_resized(
                    raw_img, resized_img, base_filename
                )

        # Sanitize text
        safe_text = self._sanitize_text(text[:10000]) if text else ""

        metadata = {
            "filename": filename,
            "page_number": page_num,
            "total_pages": total_pages,
            "has_text": bool(text and text.strip()),
            "text": safe_text,
            # Image URLs
            "page": resized_url or "",  # For display
            "original_url": original_url or "",
            "cropped_url": cropped_url or "",
            "resized_url": resized_url or "",
            # Dimensions (needed for saliency overlay)
            "original_width": raw_img.width,
            "original_height": raw_img.height,
            "cropped_width": int(embed_img.width) if self.crop_empty else int(raw_img.width),
            "cropped_height": int(embed_img.height) if self.crop_empty else int(raw_img.height),
            "resized_width": resized_img.width,
            "resized_height": resized_img.height,
            # Tile structure (needed for saliency)
            "num_tiles": num_tiles,
            "tile_rows": tile_rows,
            "tile_cols": tile_cols,
            "patches_per_tile": patches_per_tile,
            # Token info (needed for saliency)
            "num_visual_tokens": num_visual_tokens,
            "visual_token_indices": visual_indices,
            "total_tokens": len(full_embedding),  # Total tokens in raw embedding
            # Strategy used (important for paper comparison)
            "embedding_strategy": self.embedding_strategy,
            "model_name": getattr(self.embedder, "model_name", None),
            "crop_empty_enabled": bool(self.crop_empty),
            "crop_empty_crop_box": (crop_meta or {}).get("crop_box"),
            "crop_empty_remove_page_number": bool(self.crop_empty_remove_page_number),
            "crop_empty_percentage_to_remove": float(self.crop_empty_percentage_to_remove),
            "crop_empty_preserve_border_px": int(self.crop_empty_preserve_border_px),
            "crop_empty_uniform_rowcol_std_threshold": float(
                self.crop_empty_uniform_rowcol_std_threshold
            ),
            # Extra metadata (year, district, etc.) - merged last, so these
            # keys override any built-in key with the same name.
            **extra_metadata,
        }

        result = {
            "id": chunk_id,
            "visual_embedding": embedding_for_initial,  # "initial" vector in Qdrant
            "tile_pooled_embedding": embedding_for_pooling,  # "mean_pooling" vector in Qdrant
            "experimental_pooled_embedding": experimental_pooled,  # "experimental_pooling" vector in Qdrant
            "global_pooled_embedding": global_pooling,  # "global_pooling" vector in Qdrant
            "metadata": metadata,
            "image": raw_img,
            "resized_image": resized_img,
        }

        # For "all" strategy, include BOTH representations for comparison
        if self.embedding_strategy == "all":
            result["extra_vectors"] = {
                # Standard baseline vectors (for comparison)
                "full_embedding": full_embedding,  # All tokens
                "global_pooled": global_pooled,  # Global mean over all tokens
                # Pooling vectors (already in main result)
                "visual_embedding": visual_embedding,  # Visual tokens only
                "tile_pooled": tile_pooled,  # Tile-level pooled vectors
            }

        return result

    def _upload_batch(self, upload_queue: List[Dict[str, Any]]) -> int:
        """Upload batch to Qdrant.

        Returns the count reported by ``indexer.upload_batch``, or 0 when the
        queue is empty or no indexer is configured.
        """
        if not upload_queue or not self.indexer:
            return 0

        logger.info(f"📤 Uploading batch of {len(upload_queue)} pages...")

        return self.indexer.upload_batch(
            upload_queue,
            delay_between_batches=self.delay_between_uploads,
        )

    def _get_extra_metadata(self, filename: str) -> Dict[str, Any]:
        """Get extra metadata for a filename.

        The filename is normalized the same way ``load_metadata_mapping``
        normalizes its keys (lowercase, stripped, ".pdf" removed), so any
        casing of the extension matches. An exact key match is tried first,
        then the closest key by ``SequenceMatcher`` ratio above 0.75.

        Returns:
            A shallow copy of the matched metadata dict, or ``{}`` if nothing
            matches or no mapping is configured.
        """
        if not self.metadata_mapping:
            return {}

        # Normalize filename: lowercase BEFORE removing the extension so that
        # ".PDF", ".Pdf", etc. are all handled.
        filename_clean = filename.lower().strip().replace(".pdf", "")

        # Try exact match
        if filename_clean in self.metadata_mapping:
            return self.metadata_mapping[filename_clean].copy()

        # Try fuzzy match
        from difflib import SequenceMatcher

        best_match = None
        best_score = 0.0

        for known_filename, metadata in self.metadata_mapping.items():
            score = SequenceMatcher(None, filename_clean, known_filename.lower()).ratio()
            if score > best_score and score > 0.75:
                best_score = score
                best_match = metadata

        if best_match:
            logger.debug(f"Fuzzy matched '{filename}' with score {best_score:.2f}")
            return best_match.copy()

        return {}

    def _sanitize_text(self, text: str) -> str:
        """Remove invalid Unicode characters.

        Surrogate code points are encoded with ``surrogatepass`` and then
        dropped by decoding with ``errors="ignore"``; valid text is unchanged.
        """
        if not text:
            return ""
        return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")

    @staticmethod
    def generate_chunk_id(filename: str, page_number: int) -> str:
        """Generate deterministic chunk ID.

        The ID is the first 32 hex digits of SHA-256 over
        ``"{filename}:page:{page_number}"``, grouped 8-4-4-4-12.
        """
        content = f"{filename}:page:{page_number}"
        hex_str = hashlib.sha256(content.encode()).hexdigest()[:32]
        return f"{hex_str[:8]}-{hex_str[8:12]}-{hex_str[12:16]}-{hex_str[16:20]}-{hex_str[20:32]}"

    @staticmethod
    def load_metadata_mapping(json_path: Path) -> Dict[str, Dict[str, Any]]:
        """
        Load metadata mapping from JSON file.

        Expected format:
        {
            "filenames": {
                "Report Name 2023": {"year": 2023, "source": "Local Government", ...},
                ...
            }
        }

        Or simple format:
        {
            "Report Name 2023": {"year": 2023, "source": "Local Government", ...},
            ...
        }

        Returns:
            The mapping with keys lowercased, stripped and with ".pdf" removed.
        """
        import json

        # JSON is read as UTF-8 explicitly so that non-ASCII names do not
        # depend on the platform's default encoding.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Check if nested under "filenames"
        if "filenames" in data and isinstance(data["filenames"], dict):
            mapping = data["filenames"]
        else:
            mapping = data

        # Normalize keys to lowercase
        normalized = {
            name.lower().strip().replace(".pdf", ""): metadata
            for name, metadata in mapping.items()
        }

        logger.info(f"📖 Loaded metadata for {len(normalized)} files")
        return normalized
|