visual-rag-toolkit 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. benchmarks/README.md +101 -0
  2. benchmarks/__init__.py +11 -0
  3. benchmarks/analyze_results.py +187 -0
  4. benchmarks/benchmark_datasets.txt +105 -0
  5. benchmarks/prepare_submission.py +205 -0
  6. benchmarks/quick_test.py +566 -0
  7. benchmarks/run_vidore.py +513 -0
  8. benchmarks/vidore_beir_qdrant/run_qdrant_beir.py +1365 -0
  9. benchmarks/vidore_tatdqa_test/COMMANDS.md +83 -0
  10. benchmarks/vidore_tatdqa_test/__init__.py +6 -0
  11. benchmarks/vidore_tatdqa_test/dataset_loader.py +363 -0
  12. benchmarks/vidore_tatdqa_test/metrics.py +44 -0
  13. benchmarks/vidore_tatdqa_test/run_qdrant.py +799 -0
  14. benchmarks/vidore_tatdqa_test/sweep_eval.py +372 -0
  15. demo/__init__.py +10 -0
  16. demo/app.py +45 -0
  17. demo/commands.py +334 -0
  18. demo/config.py +34 -0
  19. demo/download_models.py +75 -0
  20. demo/evaluation.py +602 -0
  21. demo/example_metadata_mapping_sigir.json +37 -0
  22. demo/indexing.py +286 -0
  23. demo/qdrant_utils.py +211 -0
  24. demo/results.py +35 -0
  25. demo/test_qdrant_connection.py +119 -0
  26. demo/ui/__init__.py +15 -0
  27. demo/ui/benchmark.py +355 -0
  28. demo/ui/header.py +30 -0
  29. demo/ui/playground.py +339 -0
  30. demo/ui/sidebar.py +162 -0
  31. demo/ui/upload.py +487 -0
  32. visual_rag/__init__.py +98 -0
  33. visual_rag/cli/__init__.py +1 -0
  34. visual_rag/cli/main.py +629 -0
  35. visual_rag/config.py +230 -0
  36. visual_rag/demo_runner.py +90 -0
  37. visual_rag/embedding/__init__.py +26 -0
  38. visual_rag/embedding/pooling.py +343 -0
  39. visual_rag/embedding/visual_embedder.py +622 -0
  40. visual_rag/indexing/__init__.py +21 -0
  41. visual_rag/indexing/cloudinary_uploader.py +274 -0
  42. visual_rag/indexing/pdf_processor.py +324 -0
  43. visual_rag/indexing/pipeline.py +628 -0
  44. visual_rag/indexing/qdrant_indexer.py +478 -0
  45. visual_rag/preprocessing/__init__.py +3 -0
  46. visual_rag/preprocessing/crop_empty.py +120 -0
  47. visual_rag/qdrant_admin.py +222 -0
  48. visual_rag/retrieval/__init__.py +19 -0
  49. visual_rag/retrieval/multi_vector.py +222 -0
  50. visual_rag/retrieval/single_stage.py +126 -0
  51. visual_rag/retrieval/three_stage.py +173 -0
  52. visual_rag/retrieval/two_stage.py +471 -0
  53. visual_rag/visualization/__init__.py +19 -0
  54. visual_rag/visualization/saliency.py +335 -0
  55. visual_rag_toolkit-0.1.1.dist-info/METADATA +305 -0
  56. visual_rag_toolkit-0.1.1.dist-info/RECORD +59 -0
  57. visual_rag_toolkit-0.1.1.dist-info/WHEEL +4 -0
  58. visual_rag_toolkit-0.1.1.dist-info/entry_points.txt +3 -0
  59. visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE +22 -0
visual_rag/indexing/pipeline.py
@@ -0,0 +1,628 @@
+ """
+ Processing Pipeline - Complete PDF → Qdrant pipeline with saliency metadata.
+
+ This module combines all components for end-to-end processing:
+ - PDF → images conversion
+ - Image resizing for ColPali
+ - Embedding generation with token info
+ - Tile-level pooling
+ - Cloudinary upload (optional)
+ - Qdrant indexing with full metadata for saliency maps
+
+ The metadata stored includes everything needed for saliency visualization:
+ - Tile structure (num_tiles, tile_rows, tile_cols, patches_per_tile)
+ - Image dimensions (original and resized)
+ - Token info (num_visual_tokens, visual_token_indices)
+ """
+
+ import gc
+ import hashlib
+ import logging
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Set
+
+ import numpy as np
+ import torch
+
+ logger = logging.getLogger(__name__)
+
+
+ class ProcessingPipeline:
+     """
+     End-to-end pipeline for PDF processing and indexing.
+
+     This pipeline:
+     1. Converts PDFs to images
+     2. Resizes for ColPali processing
+     3. Generates embeddings with token info
+     4. Computes pooling (strategy-dependent)
+     5. Uploads images to Cloudinary (optional)
+     6. Stores in Qdrant with full saliency metadata
+
+     Args:
+         embedder: VisualEmbedder instance
+         indexer: QdrantIndexer instance (optional)
+         cloudinary_uploader: CloudinaryUploader instance (optional)
+         pdf_processor: PDFProcessor instance (optional, auto-created)
+         metadata_mapping: Dict mapping filenames to extra metadata
+         config: Configuration dict
+         embedding_strategy: How to process embeddings before storing:
+             - "pooling" (default): Extract visual tokens only, compute tile-level pooling.
+               This is our NOVEL contribution - preserves spatial structure while reducing size.
+             - "standard": Push ALL tokens as-is (including special tokens, padding).
+               This is the baseline approach for comparison.
+             - "all": Embed once, store BOTH representations (efficient comparison).
+         crop_empty: If True, crop mostly-empty margins from pages before embedding
+         crop_empty_*: Tuning parameters forwarded to visual_rag.preprocessing.crop_empty
+
+     Example:
+         >>> from visual_rag import VisualEmbedder, QdrantIndexer, CloudinaryUploader
+         >>> from visual_rag.indexing.pipeline import ProcessingPipeline
+         >>>
+         >>> # Our novel pooling strategy (default)
+         >>> pipeline = ProcessingPipeline(
+         ...     embedder=VisualEmbedder(),
+         ...     indexer=QdrantIndexer(url, api_key, "my_collection"),
+         ...     embedding_strategy="pooling",  # Visual tokens only + tile pooling
+         ... )
+         >>>
+         >>> # Standard baseline (all tokens, no filtering)
+         >>> pipeline_baseline = ProcessingPipeline(
+         ...     embedder=VisualEmbedder(),
+         ...     indexer=QdrantIndexer(url, api_key, "my_collection_baseline"),
+         ...     embedding_strategy="standard",  # All tokens as-is
+         ... )
+         >>>
+         >>> pipeline.process_pdf(Path("report.pdf"))
+     """
+
+     # Valid embedding strategies
+     # - "pooling": Visual tokens only + tile-level pooling (NOVEL)
+     # - "standard": All tokens + global mean (BASELINE)
+     # - "all": Embed once, push BOTH representations (efficient comparison)
+     STRATEGIES = ["pooling", "standard", "all"]
+
+     def __init__(
+         self,
+         embedder=None,
+         indexer=None,
+         cloudinary_uploader=None,
+         pdf_processor=None,
+         metadata_mapping: Optional[Dict[str, Dict[str, Any]]] = None,
+         config: Optional[Dict[str, Any]] = None,
+         embedding_strategy: str = "pooling",
+         crop_empty: bool = False,
+         crop_empty_percentage_to_remove: float = 0.9,
+         crop_empty_remove_page_number: bool = False,
+         crop_empty_preserve_border_px: int = 1,
+         crop_empty_uniform_rowcol_std_threshold: float = 0.0,
+     ):
+         self.embedder = embedder
+         self.indexer = indexer
+         self.cloudinary_uploader = cloudinary_uploader
+         self.metadata_mapping = metadata_mapping or {}
+         self.config = config or {}
+
+         # Validate and set embedding strategy
+         if embedding_strategy not in self.STRATEGIES:
+             raise ValueError(
+                 f"Invalid embedding_strategy: {embedding_strategy}. "
+                 f"Must be one of: {self.STRATEGIES}"
+             )
+         self.embedding_strategy = embedding_strategy
+
+         self.crop_empty = bool(crop_empty)
+         self.crop_empty_percentage_to_remove = float(crop_empty_percentage_to_remove)
+         self.crop_empty_remove_page_number = bool(crop_empty_remove_page_number)
+         self.crop_empty_preserve_border_px = int(crop_empty_preserve_border_px)
+         self.crop_empty_uniform_rowcol_std_threshold = float(
+             crop_empty_uniform_rowcol_std_threshold
+         )
+
+         logger.info(f"📊 Embedding strategy: {embedding_strategy}")
+         if embedding_strategy == "pooling":
+             logger.info(" → Visual tokens only + tile-level mean pooling (NOVEL)")
+         elif embedding_strategy == "standard":
+             logger.info(" → All tokens as-is (BASELINE)")
+         else:  # "all"
+             logger.info(" → Both representations stored (pooling + standard)")
+
+         # Create PDF processor if not provided
+         if pdf_processor is None:
+             from visual_rag.indexing.pdf_processor import PDFProcessor
+
+             dpi = self.config.get("processing", {}).get("dpi", 140)
+             pdf_processor = PDFProcessor(dpi=dpi)
+         self.pdf_processor = pdf_processor
+
+         # Config defaults
+         self.embedding_batch_size = self.config.get("batching", {}).get("embedding_batch_size", 8)
+         self.upload_batch_size = self.config.get("batching", {}).get("upload_batch_size", 8)
+         self.delay_between_uploads = self.config.get("delays", {}).get("between_uploads", 0.5)
+
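Editor's note: the constructor reads its tuning values from a nested config dict via the .get(...) chains above. A minimal sketch of the shape that dict is expected to have, using the code's own fallback values (illustrative only; every key is optional, and the embedder/indexer are omitted here):

from visual_rag.indexing.pipeline import ProcessingPipeline

# Hypothetical config dict matching the keys read in __init__.
config = {
    "processing": {"dpi": 140},           # rasterization DPI used by PDFProcessor
    "batching": {
        "embedding_batch_size": 8,        # pages embedded per batch
        "upload_batch_size": 8,           # pages per Qdrant upload batch
    },
    "delays": {"between_uploads": 0.5},   # seconds to wait between upload batches
}

pipeline = ProcessingPipeline(embedder=None, config=config)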
+     def process_pdf(
+         self,
+         pdf_path: Path,
+         skip_existing: bool = True,
+         upload_to_cloudinary: bool = True,
+         upload_to_qdrant: bool = True,
+         original_filename: Optional[str] = None,
+         progress_callback: Optional[callable] = None,
+     ) -> Dict[str, Any]:
+         """
+         Process a single PDF end-to-end.
+
+         Args:
+             pdf_path: Path to PDF file
+             skip_existing: Skip pages that already exist in Qdrant
+             upload_to_cloudinary: Upload images to Cloudinary
+             upload_to_qdrant: Upload embeddings to Qdrant
+             original_filename: Original filename (use this instead of pdf_path.name for temp files)
+             progress_callback: Optional callback(stage, current, total, message) for progress updates
+
+         Returns:
+             Dict with processing results:
+             {
+                 "filename": str,
+                 "total_pages": int,
+                 "uploaded": int,
+                 "skipped": int,
+                 "failed": int,
+                 "pages": [...],  # Page data with embeddings and metadata
+             }
+         """
+         pdf_path = Path(pdf_path)
+         filename = original_filename or pdf_path.name
+         logger.info(f"📚 Processing PDF: {filename}")
+
+         # Check existing pages
+         existing_ids: Set[str] = set()
+         if skip_existing and self.indexer:
+             existing_ids = self.indexer.get_existing_ids(filename)
+             if existing_ids:
+                 logger.info(f" Found {len(existing_ids)} existing pages")
+
+         logger.info("🖼️ Converting PDF to images...")
+         if progress_callback:
+             progress_callback("convert", 0, 0, "Converting PDF to images...")
+         images, texts = self.pdf_processor.process_pdf(pdf_path)
+         total_pages = len(images)
+         logger.info(f" ✅ Converted {total_pages} pages")
+         if progress_callback:
+             progress_callback("convert", total_pages, total_pages, f"Converted {total_pages} pages")
+
+         extra_metadata = self._get_extra_metadata(filename)
+         if extra_metadata:
+             logger.info(f" 📋 Found extra metadata: {list(extra_metadata.keys())}")
+
+         # Process in batches
+         uploaded = 0
+         skipped = 0
+         failed = 0
+         all_pages = []
+         upload_queue = []
+
+         for batch_start in range(0, total_pages, self.embedding_batch_size):
+             batch_end = min(batch_start + self.embedding_batch_size, total_pages)
+             batch_images = images[batch_start:batch_end]
+             batch_texts = texts[batch_start:batch_end]
+
+             logger.info(f"📦 Processing pages {batch_start + 1}-{batch_end}/{total_pages}")
+             if progress_callback:
+                 progress_callback(
+                     "embed",
+                     batch_start,
+                     total_pages,
+                     f"Embedding pages {batch_start + 1}-{batch_end}",
+                 )
+
+             pages_to_process = []
+             for i, (img, text) in enumerate(zip(batch_images, batch_texts)):
+                 page_num = batch_start + i + 1
+                 chunk_id = self.generate_chunk_id(filename, page_num)
+
+                 if skip_existing and chunk_id in existing_ids:
+                     skipped += 1
+                     continue
+
+                 pages_to_process.append(
+                     {
+                         "index": i,
+                         "page_num": page_num,
+                         "chunk_id": chunk_id,
+                         "raw_image": img,
+                         "text": text,
+                     }
+                 )
+
+             if not pages_to_process:
+                 logger.info(" All pages in batch exist, skipping...")
+                 continue
+
+             # Generate embeddings with token info
+             logger.info(f"🤖 Generating embeddings for {len(pages_to_process)} pages...")
+             from visual_rag.preprocessing.crop_empty import CropEmptyConfig, crop_empty
+
+             images_to_embed = []
+             for p in pages_to_process:
+                 raw_img = p["raw_image"]
+                 if self.crop_empty:
+                     cropped_img, crop_meta = crop_empty(
+                         raw_img,
+                         config=CropEmptyConfig(
+                             percentage_to_remove=float(self.crop_empty_percentage_to_remove),
+                             remove_page_number=bool(self.crop_empty_remove_page_number),
+                             preserve_border_px=int(self.crop_empty_preserve_border_px),
+                             uniform_rowcol_std_threshold=float(
+                                 self.crop_empty_uniform_rowcol_std_threshold
+                             ),
+                         ),
+                     )
+                     p["embed_image"] = cropped_img
+                     p["crop_meta"] = crop_meta
+                     images_to_embed.append(cropped_img)
+                 else:
+                     p["embed_image"] = raw_img
+                     p["crop_meta"] = None
+                     images_to_embed.append(raw_img)
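Editor's note: the crop-empty preprocessing used above can also be exercised on its own. A rough standalone sketch based only on the call signature visible in this hunk (the image path is a placeholder, and crop_meta is assumed to be a plain dict, as the later (crop_meta or {}).get("crop_box") usage suggests):

from PIL import Image

from visual_rag.preprocessing.crop_empty import CropEmptyConfig, crop_empty

page_image = Image.open("page_17.png")  # placeholder page render

# Same keyword arguments the pipeline forwards from its crop_empty_* settings.
cropped_image, crop_meta = crop_empty(
    page_image,
    config=CropEmptyConfig(
        percentage_to_remove=0.9,
        remove_page_number=False,
        preserve_border_px=1,
        uniform_rowcol_std_threshold=0.0,
    ),
)
print(crop_meta.get("crop_box"))  # stored later as "crop_empty_crop_box" in the payload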
+
+             embeddings, token_infos = self.embedder.embed_images(
+                 images_to_embed,
+                 batch_size=self.embedding_batch_size,
+                 return_token_info=True,
+                 show_progress=False,
+             )
+
+             for idx, page_info in enumerate(pages_to_process):
+                 raw_img = page_info["raw_image"]
+                 embed_img = page_info["embed_image"]
+                 crop_meta = page_info["crop_meta"]
+                 page_num = page_info["page_num"]
+                 chunk_id = page_info["chunk_id"]
+                 text = page_info["text"]
+                 embedding = embeddings[idx]
+                 token_info = token_infos[idx]
+
+                 if progress_callback:
+                     progress_callback(
+                         "process",
+                         page_num,
+                         total_pages,
+                         f"Processing page {page_num}/{total_pages}",
+                     )
+
+                 try:
+                     page_data = self._process_single_page(
+                         filename=filename,
+                         pdf_stem=pdf_path.stem,
+                         page_num=page_num,
+                         chunk_id=chunk_id,
+                         total_pages=total_pages,
+                         raw_img=raw_img,
+                         embed_img=embed_img,
+                         text=text,
+                         embedding=embedding,
+                         token_info=token_info,
+                         extra_metadata=extra_metadata,
+                         upload_to_cloudinary=upload_to_cloudinary,
+                         crop_meta=crop_meta,
+                     )
+
+                     all_pages.append(page_data)
+
+                     if upload_to_qdrant and self.indexer:
+                         upload_queue.append(page_data)
+
+                     # Upload in batches
+                     if len(upload_queue) >= self.upload_batch_size:
+                         count = self._upload_batch(upload_queue)
+                         uploaded += count
+                         upload_queue = []
+
+                 except Exception as e:
+                     logger.error(f" ❌ Failed page {page_num}: {e}")
+                     failed += 1
+
+             # Memory cleanup
+             gc.collect()
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+
+         # Upload remaining pages
+         if upload_queue and upload_to_qdrant and self.indexer:
+             count = self._upload_batch(upload_queue)
+             uploaded += count
+
+         logger.info(
+             f"✅ Completed {filename}: {uploaded} uploaded, {skipped} skipped, {failed} failed"
+         )
+
+         return {
+             "filename": filename,
+             "total_pages": total_pages,
+             "uploaded": uploaded,
+             "skipped": skipped,
+             "failed": failed,
+             "pages": all_pages,
+         }
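Editor's note: a usage sketch for process_pdf, tying the callback signature documented above to the returned summary dict. The pipeline object is assumed to have been constructed as in the class docstring; the path is a placeholder:

from pathlib import Path

def on_progress(stage: str, current: int, total: int, message: str) -> None:
    # Stages emitted by this method: "convert", "embed", "process".
    print(f"[{stage}] {current}/{total} {message}")

result = pipeline.process_pdf(
    Path("report.pdf"),
    skip_existing=True,
    upload_to_cloudinary=False,   # keep page images local for this run
    upload_to_qdrant=False,       # embed only; nothing is written to Qdrant
    progress_callback=on_progress,
)
print(result["total_pages"], result["uploaded"], result["skipped"], result["failed"])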
+
+     def _process_single_page(
+         self,
+         filename: str,
+         pdf_stem: str,
+         page_num: int,
+         chunk_id: str,
+         total_pages: int,
+         raw_img,
+         embed_img,
+         text: str,
+         embedding: torch.Tensor,
+         token_info: Dict[str, Any],
+         extra_metadata: Dict[str, Any],
+         upload_to_cloudinary: bool = True,
+         crop_meta: Optional[Dict[str, Any]] = None,
+     ) -> Dict[str, Any]:
+         """Process a single page with full metadata for saliency."""
+         from visual_rag.embedding.pooling import global_mean_pooling
+
+         # Resize image for ColPali
+         resized_img, tile_rows, tile_cols = self.pdf_processor.resize_for_colpali(embed_img)
+
+         # Use processor's tile info if available (more accurate)
+         proc_n_rows = token_info.get("n_rows")
+         proc_n_cols = token_info.get("n_cols")
+         if proc_n_rows and proc_n_cols:
+             tile_rows = proc_n_rows
+             tile_cols = proc_n_cols
+
+         # Convert embedding to numpy
+         if isinstance(embedding, torch.Tensor):
+             if embedding.dtype == torch.bfloat16:
+                 full_embedding = embedding.cpu().float().numpy()
+             else:
+                 full_embedding = embedding.cpu().numpy()
+         else:
+             full_embedding = np.array(embedding)
+         full_embedding = full_embedding.astype(np.float32)
+
+         # Token info for metadata
+         visual_indices = token_info["visual_token_indices"]
+         num_visual_tokens = token_info["num_visual_tokens"]
+
+         # =========================================================================
+         # STRATEGY: "pooling" (NOVEL) vs "standard" (BASELINE) vs "all" (BOTH)
+         # =========================================================================
+
+         # Always compute visual-only embedding (needed for pooling and saliency)
+         visual_embedding = full_embedding[visual_indices]
+
+         tile_pooled = self.embedder.mean_pool_visual_embedding(
+             visual_embedding, token_info, target_vectors=32
+         )
+         experimental_pooled = self.embedder.experimental_pool_visual_embedding(
+             visual_embedding, token_info, target_vectors=32, mean_pool=tile_pooled
+         )
+         global_pooled = global_mean_pooling(full_embedding)
+         global_pooling = (
+             self.embedder.global_pool_from_mean_pool(tile_pooled)
+             if tile_pooled.size
+             else global_pooled
+         )
+
+         num_tiles = int(tile_pooled.shape[0])
+         patches_per_tile = int(visual_embedding.shape[0] // max(num_tiles, 1)) if num_tiles else 0
+         if not (tile_rows and tile_cols and int(tile_rows) * int(tile_cols) + 1 == num_tiles):
+             tile_rows = token_info.get("n_rows") or None
+             tile_cols = token_info.get("n_cols") or None
+
+         if self.embedding_strategy == "pooling":
+             # NOVEL APPROACH: Visual tokens only + tile-level pooling
+             embedding_for_initial = visual_embedding
+             embedding_for_pooling = tile_pooled
+             global_pooling = (
+                 self.embedder.global_pool_from_mean_pool(tile_pooled)
+                 if tile_pooled.size
+                 else global_pooled
+             )
+
+         elif self.embedding_strategy == "standard":
+             # BASELINE: All tokens + global mean
+             embedding_for_initial = full_embedding
+             embedding_for_pooling = global_pooled.reshape(1, -1)
+             global_pooling = global_pooled
+
+         else:  # "all" - Push BOTH representations (efficient for comparison)
+             # Embed once, store multiple vector representations
+             # This allows comparing both strategies without re-embedding
+             embedding_for_initial = visual_embedding  # Use visual for search
+             embedding_for_pooling = tile_pooled  # Use tile-level for fast prefetch
+             global_pooling = (
+                 self.embedder.global_pool_from_mean_pool(tile_pooled)
+                 if tile_pooled.size
+                 else global_pooled
+             )
+
+             # ALSO store standard representations as additional vectors.
+             # Extra vectors are handled in the return dict below.
+
+         # Upload to Cloudinary
+         original_url = None
+         cropped_url = None
+         resized_url = None
+
+         if upload_to_cloudinary and self.cloudinary_uploader:
+             base_filename = f"{pdf_stem}_page_{page_num}"
+             if self.crop_empty:
+                 original_url, cropped_url, resized_url = (
+                     self.cloudinary_uploader.upload_original_cropped_and_resized(
+                         raw_img, embed_img, resized_img, base_filename
+                     )
+                 )
+             else:
+                 original_url, resized_url = self.cloudinary_uploader.upload_original_and_resized(
+                     raw_img, resized_img, base_filename
+                 )
+
+         # Sanitize text
+         safe_text = self._sanitize_text(text[:10000]) if text else ""
+
+         metadata = {
+             "filename": filename,
+             "page_number": page_num,
+             "total_pages": total_pages,
+             "has_text": bool(text and text.strip()),
+             "text": safe_text,
+             # Image URLs
+             "page": resized_url or "",  # For display
+             "original_url": original_url or "",
+             "cropped_url": cropped_url or "",
+             "resized_url": resized_url or "",
+             # Dimensions (needed for saliency overlay)
+             "original_width": raw_img.width,
+             "original_height": raw_img.height,
+             "cropped_width": int(embed_img.width) if self.crop_empty else int(raw_img.width),
+             "cropped_height": int(embed_img.height) if self.crop_empty else int(raw_img.height),
+             "resized_width": resized_img.width,
+             "resized_height": resized_img.height,
+             # Tile structure (needed for saliency)
+             "num_tiles": num_tiles,
+             "tile_rows": tile_rows,
+             "tile_cols": tile_cols,
+             "patches_per_tile": patches_per_tile,
+             # Token info (needed for saliency)
+             "num_visual_tokens": num_visual_tokens,
+             "visual_token_indices": visual_indices,
+             "total_tokens": len(full_embedding),  # Total tokens in raw embedding
+             # Strategy used (important for paper comparison)
+             "embedding_strategy": self.embedding_strategy,
+             "model_name": getattr(self.embedder, "model_name", None),
+             "crop_empty_enabled": bool(self.crop_empty),
+             "crop_empty_crop_box": (crop_meta or {}).get("crop_box"),
+             "crop_empty_remove_page_number": bool(self.crop_empty_remove_page_number),
+             "crop_empty_percentage_to_remove": float(self.crop_empty_percentage_to_remove),
+             "crop_empty_preserve_border_px": int(self.crop_empty_preserve_border_px),
+             "crop_empty_uniform_rowcol_std_threshold": float(
+                 self.crop_empty_uniform_rowcol_std_threshold
+             ),
+             # Extra metadata (year, district, etc.)
+             **extra_metadata,
+         }
+
+         result = {
+             "id": chunk_id,
+             "visual_embedding": embedding_for_initial,  # "initial" vector in Qdrant
+             "tile_pooled_embedding": embedding_for_pooling,  # "mean_pooling" vector in Qdrant
+             "experimental_pooled_embedding": experimental_pooled,  # "experimental_pooling" vector in Qdrant
+             "global_pooled_embedding": global_pooling,  # "global_pooling" vector in Qdrant
+             "metadata": metadata,
+             "image": raw_img,
+             "resized_image": resized_img,
+         }
+
+         # For "all" strategy, include BOTH representations for comparison
+         if self.embedding_strategy == "all":
+             result["extra_vectors"] = {
+                 # Standard baseline vectors (for comparison)
+                 "full_embedding": full_embedding,  # All tokens [total, 128]
+                 "global_pooled": global_pooled,  # Global mean [128]
+                 # Pooling vectors (already in main result)
+                 "visual_embedding": visual_embedding,  # Visual only [visual, 128]
+                 "tile_pooled": tile_pooled,  # Tile-level [tiles, 128]
+             }
+
+         return result
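Editor's note: the tile-level pooling itself is implemented in visual_rag/embedding/pooling.py and VisualEmbedder.mean_pool_visual_embedding; the sketch below only illustrates the idea as described in this file (group the visual patch vectors tile by tile and average each group), under the assumption that visual tokens are laid out in contiguous, equally sized tiles as the num_tiles / patches_per_tile metadata implies. It is not the package's implementation:

import numpy as np

def tile_mean_pool(visual_embedding: np.ndarray, num_tiles: int) -> np.ndarray:
    """Average each tile's patch vectors: [num_visual, dim] -> [num_tiles, dim]."""
    patches_per_tile = visual_embedding.shape[0] // num_tiles
    grouped = visual_embedding[: num_tiles * patches_per_tile]
    grouped = grouped.reshape(num_tiles, patches_per_tile, -1)
    return grouped.mean(axis=1)

# e.g. 4 tiles of 256 patches with 128-dim multivectors -> 4 pooled vectors
pooled = tile_mean_pool(np.random.rand(1024, 128).astype(np.float32), num_tiles=4)
assert pooled.shape == (4, 128)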
+
+     def _upload_batch(self, upload_queue: List[Dict[str, Any]]) -> int:
+         """Upload batch to Qdrant."""
+         if not upload_queue or not self.indexer:
+             return 0
+
+         logger.info(f"📤 Uploading batch of {len(upload_queue)} pages...")
+
+         count = self.indexer.upload_batch(
+             upload_queue,
+             delay_between_batches=self.delay_between_uploads,
+         )
+
+         return count
+
+     def _get_extra_metadata(self, filename: str) -> Dict[str, Any]:
+         """Get extra metadata for a filename."""
+         if not self.metadata_mapping:
+             return {}
+
+         # Normalize filename
+         filename_clean = filename.replace(".pdf", "").replace(".PDF", "").strip().lower()
+
+         # Try exact match
+         if filename_clean in self.metadata_mapping:
+             return self.metadata_mapping[filename_clean].copy()
+
+         # Try fuzzy match
+         from difflib import SequenceMatcher
+
+         best_match = None
+         best_score = 0.0
+
+         for known_filename, metadata in self.metadata_mapping.items():
+             score = SequenceMatcher(None, filename_clean, known_filename.lower()).ratio()
+             if score > best_score and score > 0.75:
+                 best_score = score
+                 best_match = metadata
+
+         if best_match:
+             logger.debug(f"Fuzzy matched '{filename}' with score {best_score:.2f}")
+             return best_match.copy()
+
+         return {}
+
+     def _sanitize_text(self, text: str) -> str:
+         """Remove invalid Unicode characters."""
+         if not text:
+             return ""
+         return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")
+
+     @staticmethod
+     def generate_chunk_id(filename: str, page_number: int) -> str:
+         """Generate deterministic chunk ID."""
+         content = f"{filename}:page:{page_number}"
+         hash_obj = hashlib.sha256(content.encode())
+         hex_str = hash_obj.hexdigest()[:32]
+         return f"{hex_str[:8]}-{hex_str[8:12]}-{hex_str[12:16]}-{hex_str[16:20]}-{hex_str[20:32]}"
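Editor's note: generate_chunk_id hashes "<filename>:page:<page_number>" with SHA-256 and formats the first 32 hex characters as a UUID-shaped string, so re-indexing the same file always yields the same Qdrant point ID. A small check of that property (import path as shown in the class docstring):

import uuid

from visual_rag.indexing.pipeline import ProcessingPipeline

cid = ProcessingPipeline.generate_chunk_id("report.pdf", 3)
assert cid == ProcessingPipeline.generate_chunk_id("report.pdf", 3)  # deterministic
assert str(uuid.UUID(cid)) == cid  # 8-4-4-4-12 hex groups parse as a UUID string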
+
+     @staticmethod
+     def load_metadata_mapping(json_path: Path) -> Dict[str, Dict[str, Any]]:
+         """
+         Load metadata mapping from JSON file.
+
+         Expected format:
+             {
+                 "filenames": {
+                     "Report Name 2023": {"year": 2023, "source": "Local Government", ...},
+                     ...
+                 }
+             }
+
+         Or simple format:
+             {
+                 "Report Name 2023": {"year": 2023, "source": "Local Government", ...},
+                 ...
+             }
+         """
+         import json
+
+         with open(json_path, "r") as f:
+             data = json.load(f)
+
+         # Check if nested under "filenames"
+         if "filenames" in data and isinstance(data["filenames"], dict):
+             mapping = data["filenames"]
+         else:
+             mapping = data
+
+         # Normalize keys to lowercase
+         normalized = {}
+         for filename, metadata in mapping.items():
+             key = filename.lower().strip().replace(".pdf", "")
+             normalized[key] = metadata
+
+         logger.info(f"📖 Loaded metadata for {len(normalized)} files")
+         return normalized
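Editor's note: to close, a sketch of the flat metadata-mapping format accepted by load_metadata_mapping and of how the normalized keys come back. Field names follow the docstring's example; the file path is a placeholder:

import json
from pathlib import Path

from visual_rag.indexing.pipeline import ProcessingPipeline

mapping_path = Path("metadata_mapping.json")
mapping_path.write_text(json.dumps({
    "Report Name 2023.pdf": {"year": 2023, "source": "Local Government"},
}))

mapping = ProcessingPipeline.load_metadata_mapping(mapping_path)
assert "report name 2023" in mapping  # keys are lowercased and ".pdf" is stripped

pipeline = ProcessingPipeline(embedder=None, metadata_mapping=mapping)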