visual_rag_toolkit-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. benchmarks/README.md +101 -0
  2. benchmarks/__init__.py +11 -0
  3. benchmarks/analyze_results.py +187 -0
  4. benchmarks/benchmark_datasets.txt +105 -0
  5. benchmarks/prepare_submission.py +205 -0
  6. benchmarks/quick_test.py +566 -0
  7. benchmarks/run_vidore.py +513 -0
  8. benchmarks/vidore_beir_qdrant/run_qdrant_beir.py +1365 -0
  9. benchmarks/vidore_tatdqa_test/COMMANDS.md +83 -0
  10. benchmarks/vidore_tatdqa_test/__init__.py +6 -0
  11. benchmarks/vidore_tatdqa_test/dataset_loader.py +363 -0
  12. benchmarks/vidore_tatdqa_test/metrics.py +44 -0
  13. benchmarks/vidore_tatdqa_test/run_qdrant.py +799 -0
  14. benchmarks/vidore_tatdqa_test/sweep_eval.py +372 -0
  15. demo/__init__.py +10 -0
  16. demo/app.py +45 -0
  17. demo/commands.py +334 -0
  18. demo/config.py +34 -0
  19. demo/download_models.py +75 -0
  20. demo/evaluation.py +602 -0
  21. demo/example_metadata_mapping_sigir.json +37 -0
  22. demo/indexing.py +286 -0
  23. demo/qdrant_utils.py +211 -0
  24. demo/results.py +35 -0
  25. demo/test_qdrant_connection.py +119 -0
  26. demo/ui/__init__.py +15 -0
  27. demo/ui/benchmark.py +355 -0
  28. demo/ui/header.py +30 -0
  29. demo/ui/playground.py +339 -0
  30. demo/ui/sidebar.py +162 -0
  31. demo/ui/upload.py +487 -0
  32. visual_rag/__init__.py +98 -0
  33. visual_rag/cli/__init__.py +1 -0
  34. visual_rag/cli/main.py +629 -0
  35. visual_rag/config.py +230 -0
  36. visual_rag/demo_runner.py +90 -0
  37. visual_rag/embedding/__init__.py +26 -0
  38. visual_rag/embedding/pooling.py +343 -0
  39. visual_rag/embedding/visual_embedder.py +622 -0
  40. visual_rag/indexing/__init__.py +21 -0
  41. visual_rag/indexing/cloudinary_uploader.py +274 -0
  42. visual_rag/indexing/pdf_processor.py +324 -0
  43. visual_rag/indexing/pipeline.py +628 -0
  44. visual_rag/indexing/qdrant_indexer.py +478 -0
  45. visual_rag/preprocessing/__init__.py +3 -0
  46. visual_rag/preprocessing/crop_empty.py +120 -0
  47. visual_rag/qdrant_admin.py +222 -0
  48. visual_rag/retrieval/__init__.py +19 -0
  49. visual_rag/retrieval/multi_vector.py +222 -0
  50. visual_rag/retrieval/single_stage.py +126 -0
  51. visual_rag/retrieval/three_stage.py +173 -0
  52. visual_rag/retrieval/two_stage.py +471 -0
  53. visual_rag/visualization/__init__.py +19 -0
  54. visual_rag/visualization/saliency.py +335 -0
  55. visual_rag_toolkit-0.1.1.dist-info/METADATA +305 -0
  56. visual_rag_toolkit-0.1.1.dist-info/RECORD +59 -0
  57. visual_rag_toolkit-0.1.1.dist-info/WHEEL +4 -0
  58. visual_rag_toolkit-0.1.1.dist-info/entry_points.txt +3 -0
  59. visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE +22 -0
visual_rag/visualization/saliency.py (new file)
@@ -0,0 +1,335 @@
```python
"""
Saliency Map Generation for Visual Document Retrieval.

Generates attention/saliency maps to visualize which parts of documents
are most relevant to a query.
"""

import logging
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)


def generate_saliency_map(
    query_embedding: np.ndarray,
    doc_embedding: np.ndarray,
    image: Image.Image,
    token_info: Optional[Dict[str, Any]] = None,
    colormap: str = "Reds",
    alpha: float = 0.5,
    threshold_percentile: float = 50.0,
) -> Tuple[Image.Image, np.ndarray]:
    """
    Generate saliency map showing which parts of the image match the query.

    Computes patch-level relevance scores and overlays them on the image.

    Args:
        query_embedding: Query embeddings [num_query_tokens, dim]
        doc_embedding: Document visual embeddings [num_visual_tokens, dim]
        image: Original PIL Image
        token_info: Optional token info with n_rows, n_cols for tile grid
        colormap: Matplotlib colormap name (Reds, viridis, jet, etc.)
        alpha: Overlay transparency (0-1)
        threshold_percentile: Only highlight patches above this percentile

    Returns:
        Tuple of (annotated_image, patch_scores)

    Example:
        >>> query = embedder.embed_query("budget allocation")
        >>> doc = visual_embedding  # From embed_images
        >>> annotated, scores = generate_saliency_map(
        ...     query_embedding=query.numpy(),
        ...     doc_embedding=doc,
        ...     image=page_image,
        ...     token_info=token_info,
        ... )
        >>> annotated.save("saliency.png")
    """
    # Ensure numpy arrays
    if hasattr(query_embedding, "numpy"):
        query_np = query_embedding.numpy()
    elif hasattr(query_embedding, "cpu"):
        query_np = query_embedding.cpu().numpy()
    else:
        query_np = np.array(query_embedding, dtype=np.float32)

    if hasattr(doc_embedding, "numpy"):
        doc_np = doc_embedding.numpy()
    elif hasattr(doc_embedding, "cpu"):
        doc_np = doc_embedding.cpu().numpy()
    else:
        doc_np = np.array(doc_embedding, dtype=np.float32)

    # Normalize embeddings
    query_norm = query_np / (np.linalg.norm(query_np, axis=1, keepdims=True) + 1e-8)
    doc_norm = doc_np / (np.linalg.norm(doc_np, axis=1, keepdims=True) + 1e-8)

    # Compute similarity matrix: [num_query, num_doc]
    similarity_matrix = np.dot(query_norm, doc_norm.T)

    # Get max similarity per document patch (best match from any query token)
    patch_scores = similarity_matrix.max(axis=0)

    # Normalize to [0, 1]
    score_min, score_max = patch_scores.min(), patch_scores.max()
    if score_max - score_min > 1e-8:
        patch_scores_norm = (patch_scores - score_min) / (score_max - score_min)
    else:
        patch_scores_norm = np.zeros_like(patch_scores)

    # Determine grid dimensions
    if token_info and token_info.get("n_rows") and token_info.get("n_cols"):
        n_rows = token_info["n_rows"]
        n_cols = token_info["n_cols"]
        num_tiles = n_rows * n_cols + 1  # +1 for global tile
        patches_per_tile = 64  # ColSmol standard

        # Reshape to tile grid (excluding global tile)
        try:
            # Skip global tile patches at the end
            tile_patches = num_tiles * patches_per_tile
            if len(patch_scores_norm) >= tile_patches:
                grid_patches = patch_scores_norm[: n_rows * n_cols * patches_per_tile]
            else:
                grid_patches = patch_scores_norm

            # Reshape: [tiles * patches_per_tile] -> [tiles, patches_per_tile]
            # Then mean per tile
            num_grid_tiles = n_rows * n_cols
            grid_patches = grid_patches[: num_grid_tiles * patches_per_tile]
            tile_scores = grid_patches.reshape(num_grid_tiles, patches_per_tile).mean(axis=1)
            tile_scores = tile_scores.reshape(n_rows, n_cols)
        except Exception as e:
            logger.warning(f"Could not reshape to tile grid: {e}")
            tile_scores = None
    else:
        tile_scores = None
        n_rows = n_cols = None

    # Create overlay
    annotated = create_saliency_overlay(
        image=image,
        scores=tile_scores if tile_scores is not None else patch_scores_norm,
        colormap=colormap,
        alpha=alpha,
        threshold_percentile=threshold_percentile,
        grid_rows=n_rows,
        grid_cols=n_cols,
    )

    return annotated, patch_scores


def create_saliency_overlay(
    image: Image.Image,
    scores: np.ndarray,
    colormap: str = "Reds",
    alpha: float = 0.5,
    threshold_percentile: float = 50.0,
    grid_rows: Optional[int] = None,
    grid_cols: Optional[int] = None,
) -> Image.Image:
    """
    Create colored overlay on image based on scores.

    Args:
        image: Base PIL Image
        scores: Score array - 1D [num_patches] or 2D [rows, cols]
        colormap: Matplotlib colormap name
        alpha: Overlay transparency
        threshold_percentile: Only color patches above this percentile
        grid_rows, grid_cols: Grid dimensions (auto-detected if not provided)

    Returns:
        Annotated PIL Image
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warning("matplotlib not installed, returning original image")
        return image

    img_array = np.array(image)
    h, w = img_array.shape[:2]

    # Handle 2D scores (tile grid)
    if scores.ndim == 2:
        rows, cols = scores.shape
    elif grid_rows and grid_cols:
        rows, cols = grid_rows, grid_cols
        # Reshape if possible
        if len(scores) == rows * cols:
            scores = scores.reshape(rows, cols)
        else:
            # Fallback: estimate grid from score count
            num_patches = len(scores)
            aspect = w / h
            cols = max(1, int(np.sqrt(num_patches * aspect)))
            rows = max(1, num_patches // cols)
            scores = scores[: rows * cols].reshape(rows, cols)
    else:
        # Auto-estimate grid
        num_patches = len(scores) if scores.ndim == 1 else scores.size
        aspect = w / h
        cols = max(1, int(np.sqrt(num_patches * aspect)))
        rows = max(1, num_patches // cols)

        if rows * cols > (len(scores) if scores.ndim == 1 else scores.size):
            cols = max(1, cols - 1)

        if scores.ndim == 1:
            scores = scores[: rows * cols].reshape(rows, cols)

    # Get colormap (plt.cm.get_cmap was removed in matplotlib 3.9)
    cmap = plt.get_cmap(colormap)

    # Calculate threshold
    threshold = np.percentile(scores, threshold_percentile)

    # Calculate cell dimensions
    cell_h = h // rows
    cell_w = w // cols

    # Create RGBA overlay
    overlay = np.zeros((h, w, 4), dtype=np.uint8)

    for i in range(rows):
        for j in range(cols):
            score = scores[i, j]

            if score >= threshold:
                y1 = i * cell_h
                y2 = min((i + 1) * cell_h, h)
                x1 = j * cell_w
                x2 = min((j + 1) * cell_w, w)

                # Normalize score for coloring (above threshold)
                norm_score = (score - threshold) / (1.0 - threshold + 1e-8)
                norm_score = min(1.0, max(0.0, norm_score))

                # Get color
                color = cmap(norm_score)[:3]
                color_uint8 = (np.array(color) * 255).astype(np.uint8)

                overlay[y1:y2, x1:x2, :3] = color_uint8
                overlay[y1:y2, x1:x2, 3] = int(alpha * 255 * norm_score)

    # Blend with original
    overlay_img = Image.fromarray(overlay, "RGBA")
    result = Image.alpha_composite(image.convert("RGBA"), overlay_img)

    return result.convert("RGB")


def visualize_search_results(
    query: str,
    results: List[Dict[str, Any]],
    query_embedding: Optional[np.ndarray] = None,
    embeddings: Optional[List[np.ndarray]] = None,
    output_path: Optional[str] = None,
    max_results: int = 5,
    show_saliency: bool = False,
) -> Optional[Image.Image]:
    """
    Visualize search results as a grid of images with scores.

    Args:
        query: Original query text
        results: List of search results with 'payload' containing 'page' (image URL/base64)
        query_embedding: Query embedding for saliency (optional)
        embeddings: Document embeddings for saliency (optional)
        output_path: Path to save visualization (optional)
        max_results: Maximum results to show
        show_saliency: Generate saliency overlays (requires query_embedding & embeddings)

    Returns:
        Combined visualization image if successful
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.error("matplotlib required for visualization")
        return None

    results = results[:max_results]
    n = len(results)

    if n == 0:
        logger.warning("No results to visualize")
        return None

    fig, axes = plt.subplots(1, n, figsize=(4 * n, 4))
    if n == 1:
        axes = [axes]

    for idx, (result, ax) in enumerate(zip(results, axes)):
        payload = result.get("payload", {})
        score = result.get("score_final", result.get("score_stage1", 0))

        # Try to load image from payload
        page_data = payload.get("page", "")
        image = None

        if page_data.startswith("data:image"):
            # Base64 encoded
            try:
                import base64
                from io import BytesIO

                b64_data = page_data.split(",")[1]
                image = Image.open(BytesIO(base64.b64decode(b64_data)))
            except Exception as e:
                logger.debug(f"Could not decode base64 image: {e}")
        elif page_data.startswith("http"):
            # URL - try to fetch
            try:
                import urllib.request
                from io import BytesIO

                with urllib.request.urlopen(page_data, timeout=5) as response:
                    image = Image.open(BytesIO(response.read()))
            except Exception as e:
                logger.debug(f"Could not fetch image URL: {e}")

        if image:
            ax.imshow(image)
        else:
            # Show placeholder
            ax.text(0.5, 0.5, "No image", ha="center", va="center", fontsize=12, color="gray")

        # Add title
        title = f"Rank {idx + 1}\nScore: {score:.3f}"
        if payload.get("filename"):
            title += f"\n{payload['filename'][:30]}"
        if payload.get("page_number") is not None:
            title += f" p.{payload['page_number'] + 1}"

        ax.set_title(title, fontsize=9)
        ax.axis("off")

    # Add query as suptitle
    query_display = query[:80] + "..." if len(query) > 80 else query
    plt.suptitle(f"Query: {query_display}", fontsize=11, fontweight="bold")
    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=150, bbox_inches="tight")
        logger.info(f"💾 Saved visualization to: {output_path}")

    # Convert to PIL Image for return
    from io import BytesIO

    buf = BytesIO()
    plt.savefig(buf, format="png", dpi=100, bbox_inches="tight")
    buf.seek(0)
    result_image = Image.open(buf)

    plt.close()

    return result_image
```
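
For orientation, here is a minimal self-contained way to exercise `generate_saliency_map` above. The 3×2 tile grid, 128-dim embeddings, and page size are arbitrary assumptions for this sketch, not values taken from the package:

```python
import numpy as np
from PIL import Image

from visual_rag.visualization.saliency import generate_saliency_map

rng = np.random.default_rng(0)

# Hypothetical shapes: 8 query tokens; a 3x2 tile grid of 64 patches each,
# plus one global tile; 128-dim embeddings.
n_rows, n_cols, patches_per_tile, dim = 3, 2, 64, 128
num_visual_tokens = (n_rows * n_cols + 1) * patches_per_tile  # 448

query = rng.normal(size=(8, dim)).astype(np.float32)
doc = rng.normal(size=(num_visual_tokens, dim)).astype(np.float32)
page = Image.new("RGB", (640, 960), "white")

annotated, patch_scores = generate_saliency_map(
    query_embedding=query,
    doc_embedding=doc,
    image=page,
    token_info={"n_rows": n_rows, "n_cols": n_cols},
)
print(patch_scores.shape)  # (448,) raw per-patch MaxSim scores
annotated.save("saliency_demo.png")  # overlay needs matplotlib; otherwise the page is returned unchanged
```

With real embeddings the interesting part is the `token_info` branch, which averages the 64 patch scores inside each tile so the overlay follows the model's tile grid rather than individual patches.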
visual_rag_toolkit-0.1.1.dist-info/METADATA (new file)
@@ -0,0 +1,305 @@
```
Metadata-Version: 2.4
Name: visual-rag-toolkit
Version: 0.1.1
Summary: End-to-end visual document retrieval with ColPali, featuring two-stage pooling for scalable search
Project-URL: Homepage, https://github.com/Ara-Yeroyan/visual-rag-toolkit
Project-URL: Documentation, https://github.com/Ara-Yeroyan/visual-rag-toolkit#readme
Project-URL: Repository, https://github.com/Ara-Yeroyan/visual-rag-toolkit
Project-URL: Issues, https://github.com/Ara-Yeroyan/visual-rag-toolkit/issues
Author: Visual RAG Team
License: MIT License

    Copyright (c) 2026 Ara Yeroyan

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.

License-File: LICENSE
Keywords: colbert,colpali,document-retrieval,late-interaction,multimodal-rag,pdf-processing,qdrant,visual-rag,visual-search
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Scientific/Engineering :: Image Processing
Requires-Python: >=3.9
Requires-Dist: numpy>=1.21.0
Requires-Dist: pillow>=9.0.0
Requires-Dist: python-dotenv>=0.19.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: torch>=2.0.0
Requires-Dist: tqdm>=4.60.0
Provides-Extra: all
Requires-Dist: altair>=5.0.0; extra == 'all'
Requires-Dist: cloudinary>=1.30.0; extra == 'all'
Requires-Dist: colpali-engine>=0.3.0; extra == 'all'
Requires-Dist: httpx>=0.24.0; extra == 'all'
Requires-Dist: pandas>=2.0.0; extra == 'all'
Requires-Dist: pdf2image>=1.16.0; extra == 'all'
Requires-Dist: pypdf>=3.0.0; extra == 'all'
Requires-Dist: qdrant-client>=1.7.0; extra == 'all'
Requires-Dist: streamlit>=1.25.0; extra == 'all'
Requires-Dist: transformers>=4.35.0; extra == 'all'
Provides-Extra: cloudinary
Requires-Dist: cloudinary>=1.30.0; extra == 'cloudinary'
Provides-Extra: dev
Requires-Dist: black>=23.0.0; extra == 'dev'
Requires-Dist: mypy>=1.0.0; extra == 'dev'
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Requires-Dist: ruff>=0.1.0; extra == 'dev'
Provides-Extra: embedding
Requires-Dist: colpali-engine>=0.3.0; extra == 'embedding'
Requires-Dist: transformers>=4.35.0; extra == 'embedding'
Provides-Extra: pdf
Requires-Dist: pdf2image>=1.16.0; extra == 'pdf'
Requires-Dist: pypdf>=3.0.0; extra == 'pdf'
Provides-Extra: qdrant
Requires-Dist: qdrant-client>=1.7.0; extra == 'qdrant'
Provides-Extra: ui
Requires-Dist: altair>=5.0.0; extra == 'ui'
Requires-Dist: httpx>=0.24.0; extra == 'ui'
Requires-Dist: pandas>=2.0.0; extra == 'ui'
Requires-Dist: streamlit>=1.25.0; extra == 'ui'
Description-Content-Type: text/markdown
```

# Visual RAG Toolkit

[![PyPI version](https://badge.fury.io/py/visual-rag-toolkit.svg)](https://badge.fury.io/py/visual-rag-toolkit)
[![CI](https://github.com/Ara-Yeroyan/visual-rag-toolkit/actions/workflows/ci.yaml/badge.svg)](https://github.com/Ara-Yeroyan/visual-rag-toolkit/actions/workflows/ci.yaml)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)

End-to-end visual document retrieval toolkit featuring **fast multi-stage retrieval** (prefetch with pooled vectors + exact MaxSim reranking).

This repo contains:
- a **Python package** (`visual_rag`)
- a **Streamlit demo app** (`demo/`)
- **benchmark & evaluation scripts** for ViDoRe v2 (`benchmarks/`)

## 🎯 Key Features

- **Modular**: PDF → images, embedding, Qdrant indexing, retrieval can be used independently.
- **Multi-stage retrieval**: two-stage and three-stage retrieval modes built for Qdrant named vectors.
- **Model-aware embedding**: ColSmol + ColPali support behind a single `VisualEmbedder` interface.
- **Token hygiene**: query special-token filtering by default for more stable MaxSim behavior.
- **Practical pipelines**: robust indexing, retries, optional Cloudinary image URLs, evaluation reporting.

## 📦 Installation

```bash
# Core package (minimal dependencies)
pip install visual-rag-toolkit

# With specific features
pip install visual-rag-toolkit[embedding]   # ColSmol/ColPali embedding support
pip install visual-rag-toolkit[pdf]         # PDF processing
pip install visual-rag-toolkit[qdrant]      # Vector database
pip install visual-rag-toolkit[cloudinary]  # Image CDN
pip install visual-rag-toolkit[ui]          # Streamlit demo dependencies

# All dependencies
pip install visual-rag-toolkit[all]
```

### System dependencies (PDF)

`pdf2image` requires Poppler.

- macOS: `brew install poppler`
- Ubuntu/Debian: `sudo apt-get update && sudo apt-get install -y poppler-utils`

## 🚀 Quick Start

### Minimal: embed a query and run two-stage search (server-side)

```python
from qdrant_client import QdrantClient
from visual_rag import VisualEmbedder, TwoStageRetriever

client = QdrantClient(url="https://YOUR_QDRANT", api_key="YOUR_KEY")
collection_name = "your_collection"

# Embed query tokens
embedder = VisualEmbedder(model_name="vidore/colpali-v1.3")
q = embedder.embed_query("What is the budget allocation?")

# Fast path: all stages computed in Qdrant (prefetch + exact rerank)
retriever = TwoStageRetriever(client, collection_name)
results = retriever.search_server_side(
    query_embedding=q,
    top_k=10,
    prefetch_k=256,
    stage1_mode="tokens_vs_experimental",  # or: tokens_vs_tiles / pooled_query_vs_tiles / pooled_query_vs_global
)

for r in results[:3]:
    print(r["id"], r["score_final"])
```

### Process a PDF into images (no embedding, no vector DB)

```python
from pathlib import Path
from visual_rag import PDFProcessor

processor = PDFProcessor(dpi=140)
images, texts = processor.process_pdf(Path("report.pdf"))
print(len(images), "pages")
```

## 🔬 Multi-stage Retrieval (Two-stage / Three-stage)

Traditional ColBERT-style MaxSim scoring compares all query tokens vs all document tokens, which becomes expensive at scale.

**Our approach:**

```
Stage 1: Fast prefetch with tile-level pooled vectors
├── Pool each tile (64 patches) → num_tiles vectors
├── Use HNSW index for O(log N) retrieval
└── Retrieve top-K candidates (e.g., 200)

Stage 2: Exact MaxSim reranking on candidates
├── Load full multi-vector embeddings
├── Compute exact ColBERT MaxSim scores
└── Return top-k results (e.g., 10)
```

Three-stage extends this with an additional “cheap prefetch” stage before stage 2.
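
For intuition, here is a rough NumPy sketch of the scoring math behind both stages. The shapes, the mean-pooling choice, and the small `prefetch_k` below are illustrative assumptions, not the package's exact implementation:

```python
import numpy as np

def maxsim(query_tokens: np.ndarray, doc_tokens: np.ndarray) -> float:
    """ColBERT-style MaxSim: each query token takes its best doc-token match."""
    sim = query_tokens @ doc_tokens.T  # [num_query, num_doc]
    return float(sim.max(axis=1).sum())

def pool_tiles(doc_tokens: np.ndarray, patches_per_tile: int = 64) -> np.ndarray:
    """Stage-1 representation: one mean-pooled vector per tile of patches."""
    n_tiles = len(doc_tokens) // patches_per_tile
    kept = doc_tokens[: n_tiles * patches_per_tile]
    return kept.reshape(n_tiles, patches_per_tile, -1).mean(axis=1)

rng = np.random.default_rng(0)
q = rng.normal(size=(12, 128)).astype(np.float32)  # query tokens (normalized upstream in practice)
corpus = [rng.normal(size=(448, 128)).astype(np.float32) for _ in range(100)]

# Stage 1: cheap prefetch against pooled tiles; Qdrant replaces this
# linear scan with an HNSW index over the pooled vectors.
stage1 = [maxsim(q, pool_tiles(d)) for d in corpus]
candidates = np.argsort(stage1)[::-1][:20]  # prefetch_k = 20

# Stage 2: exact MaxSim rerank over the prefetched candidates only.
top = sorted(candidates, key=lambda i: maxsim(q, corpus[i]), reverse=True)[:10]
print(top)
```

Stage 1 stays cheap because only the pooled tile vectors need an ANN index; the exact, expensive MaxSim touches just the prefetched candidates.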

## 📁 Package Structure

```
visual-rag-toolkit/
├── visual_rag/          # Import as: from visual_rag import ...
│   ├── embedding/       # VisualEmbedder, pooling functions
│   ├── indexing/        # PDFProcessor, QdrantIndexer, CloudinaryUploader
│   ├── retrieval/       # TwoStageRetriever
│   ├── visualization/   # Saliency maps
│   ├── cli/             # Command-line: visual-rag process/search
│   └── config.py        # load_config, get, get_section
│
├── benchmarks/          # ViDoRe evaluation scripts
└── examples/            # Usage examples
```

## ⚙️ Configuration

Configure via environment variables or YAML:

```bash
# Qdrant credentials (preferred names used by the demo + scripts)
export SIGIR_QDRANT_URL="https://your-cluster.qdrant.io"
export SIGIR_QDRANT_KEY="your-api-key"

# Backwards-compatible fallbacks (also supported)
export QDRANT_URL="https://your-cluster.qdrant.io"
export QDRANT_API_KEY="your-api-key"

export VISUALRAG_MODEL="vidore/colSmol-500M"

# Special token handling (default: filter them out)
export VISUALRAG_INCLUDE_SPECIAL_TOKENS=true  # Include special tokens
```

Or use a config file (`visual_rag.yaml`):

```yaml
model:
  name: "vidore/colSmol-500M"
  batch_size: 4

qdrant:
  url: "https://your-cluster.qdrant.io"
  collection: "my_documents"

search:
  strategy: "two_stage"  # or "multi_vector", "pooled"
  prefetch_k: 200
  top_k: 10
```

## 🖥️ Demo (Streamlit)

```bash
pip install "visual-rag-toolkit[ui,qdrant,embedding,pdf]"

# Option A: from Python
python -c "import visual_rag; visual_rag.demo()"

# Option B: CLI launcher
visual-rag-demo
```

## 📊 Benchmark Evaluation

Run ViDoRe benchmark evaluation:

```bash
# Example: evaluate a collection against ViDoRe BEIR datasets in Qdrant
python -m benchmarks.vidore_beir_qdrant.run_qdrant_beir \
    --datasets vidore/esg_reports_v2 vidore/biomedical_lectures_v2 \
    --collection YOUR_COLLECTION \
    --mode two_stage \
    --stage1-mode tokens_vs_experimental \
    --prefetch-k 256 \
    --top-k 100 \
    --evaluation-scope union
```

More commands (including multi-stage variants and cropping configs) live in:
- `benchmarks/vidore_tatdqa_test/COMMANDS.md`

## 🔧 Development

```bash
git clone https://github.com/Ara-Yeroyan/visual-rag-toolkit
cd visual-rag-toolkit
pip install -e ".[dev]"
pytest tests/ -v
```

## 📄 Citation

If you use this toolkit in your research, please cite:

```bibtex
@software{visual_rag_toolkit,
  title = {Visual RAG Toolkit: Scalable Visual Document Retrieval with Two-Stage Pooling},
  author = {Ara Yeroyan},
  year = {2026},
  url = {https://github.com/Ara-Yeroyan/visual-rag-toolkit}
}
```

## 📝 License

MIT License - see [LICENSE](LICENSE) for details.

## 🙏 Acknowledgments

- [Qdrant](https://qdrant.tech/) - Vector database with multi-vector support
- [ColPali](https://github.com/illuin-tech/colpali) - Visual document retrieval models
- [ViDoRe](https://huggingface.co/spaces/vidore/vidore-leaderboard) - Benchmark dataset
visual_rag_toolkit-0.1.1.dist-info/RECORD (new file)
@@ -0,0 +1,59 @@
```
benchmarks/README.md,sha256=_MAC9n308xawKk9wMSyiZ70Ievl5TtGdOiwZKjjd8sQ,2519
benchmarks/__init__.py,sha256=ovkoAwhfqB7b0J7QawUUWd73lk96Njy7c6FOH6lndg4,366
benchmarks/analyze_results.py,sha256=BxgpyXguRNwGv0-VeQBxkF9duzCAd5xc3XAmdNEamxA,5579
benchmarks/benchmark_datasets.txt,sha256=Y87QfusxxSr5wgFaxmp9Ew-EMb2dwq1StrMvjeJU-LA,4831
benchmarks/prepare_submission.py,sha256=wD9sLWDqkQw_OANmVOdwe7OQlv4ZVf4sTQiQs7La4tQ,6002
benchmarks/quick_test.py,sha256=Mdcf2FNYSqWpYVfCmQLQzUVWLG-FiKUnyHyHKnAR3z4,20531
benchmarks/run_vidore.py,sha256=RuDaEJ0wIV-hLHRtcd8PsRGOEEUFYDcrjUlor-HAajc,16373
benchmarks/vidore_beir_qdrant/run_qdrant_beir.py,sha256=0lqIA6Qv53CreJpOg-h48sl4c8m7c_pVoQCp-oscnG0,56715
benchmarks/vidore_tatdqa_test/COMMANDS.md,sha256=lhobkqHLZJjIPE-Lo3VuBuKh5XpbT2WS_sK-6dasPcE,1890
benchmarks/vidore_tatdqa_test/__init__.py,sha256=WZiwKx8BGNuc0-oz1V3yiq8m_gWc5woEWy-WGb4F14E,18
benchmarks/vidore_tatdqa_test/dataset_loader.py,sha256=gCCneGAKWQm0WlJHLvGjoMrAbm5b9cPEflkoMimtA2s,12795
benchmarks/vidore_tatdqa_test/metrics.py,sha256=cLdYbRt5VcxInO1cN79ve6ZLP3kaSxRkdzRX3IbPPMs,1112
benchmarks/vidore_tatdqa_test/run_qdrant.py,sha256=_PikeqIYpWPim-KEQOwvT-aqwYoAWASjqJVisi8PfQg,28681
benchmarks/vidore_tatdqa_test/sweep_eval.py,sha256=d_kbyNTJ1LoFfIVnsZyiRO1nKyMqmRB5jEweZL6kYd4,12688
demo/__init__.py,sha256=jVzjsVKZl5ZZuFxawA8Pxj3yuIKL7llkao3rBpde-aQ,204
demo/app.py,sha256=1GZJ_JhVWvqoBewngc8tHeiuM1fNbxddEO6ZsEdwBfg,1029
demo/commands.py,sha256=qxRE2x610yZvcjwEfSKiR9CyFonX-vRxFqQNJCUKfyA,13690
demo/config.py,sha256=BNkV4NSEEMIV9e6Z-cxds2v247uVmTPCgL-M5ItPzMg,757
demo/download_models.py,sha256=J10qQt2TpEshVOxvCX_ZSbV7YozIBqDATZnt8fUKFHs,2868
demo/evaluation.py,sha256=wiVxzRu3UZ5wAwHlpSKQ6srZjnSR06dgQw3G0OOV2Eg,28954
demo/example_metadata_mapping_sigir.json,sha256=UCgqZtr6Wnq_vS7zxPxpvuokk9gxOVgKydC7f1lauw8,824
demo/indexing.py,sha256=NLtGYnuCCb3uHGCgs8KHlLqKR-FSD6sxW3PlEw9UhYM,12853
demo/qdrant_utils.py,sha256=VWEC7BwhMjjB7iIS5iaVDMGt_CMh9mQG4F94k1Pt0yA,7677
demo/results.py,sha256=dprvxnyHwxJvkAQuh4deaCsiEG1wm0n9svPyxI37vJg,1050
demo/test_qdrant_connection.py,sha256=hkbyl3zGsw_GdBBp5MkW_3SBKTHXbwH3Sr_pUE54_po,3866
demo/ui/__init__.py,sha256=EyBCvnXYfPbdyxJzyp9TjQBeJJUgmOY1yRHkUeC6JFQ,412
demo/ui/benchmark.py,sha256=HiGCN4HrqeOC7L6t2kuzIiyWdcVE_cP2JTxoewrmPSo,14218
demo/ui/header.py,sha256=J2hXr_nNyg1H9rmrd-EGx3WUl7lYo-Ca30ptgzBCfBs,806
demo/ui/playground.py,sha256=Z3OgCWOzzTld1I3eN1IcTadaSzsqDQf7MiHwTbxbvJA,13692
demo/ui/sidebar.py,sha256=muVCnvoeMOm1rHx7UPt68yLXlG3OERdXvJ3QqIXAUoc,7839
demo/ui/upload.py,sha256=BHJmbIQOAYdMF_svxlRSYIe163Y5UX5P_gilJ09YHSA,20372
visual_rag/__init__.py,sha256=UkGFXjPmjbO6Iad8ty1uJOMQsVMpV_s63ihchHltLx8,2555
visual_rag/config.py,sha256=pd48M3j3n8ZV1HhaabMmP_uoEJnqhBC-Bma9vuvc8V4,7368
visual_rag/demo_runner.py,sha256=wi0Wz3gZ39l4aovMd6zURq_CKUSgma4kGjF6hpQHwGY,2793
visual_rag/qdrant_admin.py,sha256=NNczko2S5-K3qATNUxgYn51hNWgWb6boheL7vlCQGpM,7055
visual_rag/cli/__init__.py,sha256=WgBRXm0VACfLltvVlLcSs3FTM1uQ7Uuw3CVD4-zWZwc,46
visual_rag/cli/main.py,sha256=QmpnQ0lbC6Q9lwxaSCDh6paEEzI78IPY1jwc3_9y7VI,21083
visual_rag/embedding/__init__.py,sha256=7QIENmxwRnwnUzsYKRY3VQTyF3HJkRiL1D7Au9XHF0w,682
visual_rag/embedding/pooling.py,sha256=x8uY4VHbxEnsJRM2JeOkzPHDiwOkbi5NK4XW21U1hAc,11401
visual_rag/embedding/visual_embedder.py,sha256=he9JpVHmo_szOiXCwtJdrCseGmf2y5Gi0UEFjwazzVY,23198
visual_rag/indexing/__init__.py,sha256=pMLuinCIERbwWechn176nMrtlmTp0ySfuj8gdkNvRks,679
visual_rag/indexing/cloudinary_uploader.py,sha256=e-G5du4D7z6mWWl2lahMidG-Wdc-baImFFILTojebpA,8826
visual_rag/indexing/pdf_processor.py,sha256=V3RAKpwgIFicqUaXzaaljePxh_oP4UV5W0aiJyfv0BY,10247
visual_rag/indexing/pipeline.py,sha256=1ScpVRlLCq2FWi3IPvlQcIfDCQQ2F64IlRd9ZZHiTaA,25037
visual_rag/indexing/qdrant_indexer.py,sha256=uUOA-6Qkd_vEeP1LdgGyoh1FHu1ZNEyYKuNxJAqetBU,17121
visual_rag/preprocessing/__init__.py,sha256=rCzfBO0jaVKp6MpPRRused_4gasHfobAbG-139Y806E,121
visual_rag/preprocessing/crop_empty.py,sha256=iHXITFkRlF40VPJ4k9d432RUAi_89BhAEvK4wOEn96Q,5211
visual_rag/retrieval/__init__.py,sha256=J9pnbeB83Fqs9n4g3GcIp1VR9dnuyAlcsIDVsf0lSb8,601
visual_rag/retrieval/multi_vector.py,sha256=m5PKjkj0TFeWNccKNmCqghTM5b9ARr43Lq3sRhOxnjw,7381
visual_rag/retrieval/single_stage.py,sha256=TSndnh4Kz9aT_0kKhNyLEvokbDLkgq--lXuyldzP5sU,4105
visual_rag/retrieval/three_stage.py,sha256=YC0CVEohxTT5zhilcQHI7nYAk08E5jC3zkQ3-rNdLMw,5951
visual_rag/retrieval/two_stage.py,sha256=_RnEgIx_qY4yu2iIk0a3w47D7WiKHlmBivm5gLEpyI4,16779
visual_rag/visualization/__init__.py,sha256=SITKNvBEseDp7F3K6UzLPA-6OQFqYfY5azS5nlDdihQ,447
visual_rag/visualization/saliency.py,sha256=F3Plc18Sf3tzWcyncuaruTmENm1IfW5j9NFGEQR93cY,11248
visual_rag_toolkit-0.1.1.dist-info/METADATA,sha256=SL55eEexz2ogZPD5q-gfzpF2TVZ_U1ZwykPlHaggEdU,11070
visual_rag_toolkit-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
visual_rag_toolkit-0.1.1.dist-info/entry_points.txt,sha256=6Tob1GPg_ILGELjYTPsAnNMZ1W0NS939nfI7xyW2DIY,102
visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE,sha256=hEg_weKnHXJakQRR3sw2ygcZ101zCI00zMhBOPb3yfA,1069
visual_rag_toolkit-0.1.1.dist-info/RECORD,,
```
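
Each RECORD row is `path,sha256=<digest>,<size in bytes>`, with the digest encoded as unpadded URL-safe base64 per the wheel spec; the final `RECORD,,` entry carries no digest because the file cannot hash itself. A minimal sketch for verifying one entry against an unpacked wheel (the relative path assumes the wheel was extracted into the current directory):

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: Path) -> str:
    """RECORD-style digest: SHA-256, URL-safe base64, '=' padding stripped."""
    raw = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

# True when the extracted file matches the published entry above
p = Path("visual_rag/visualization/saliency.py")
print(record_digest(p) == "sha256=F3Plc18Sf3tzWcyncuaruTmENm1IfW5j9NFGEQR93cY")
```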