visual_rag_toolkit-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. benchmarks/README.md +101 -0
  2. benchmarks/__init__.py +11 -0
  3. benchmarks/analyze_results.py +187 -0
  4. benchmarks/benchmark_datasets.txt +105 -0
  5. benchmarks/prepare_submission.py +205 -0
  6. benchmarks/quick_test.py +566 -0
  7. benchmarks/run_vidore.py +513 -0
  8. benchmarks/vidore_beir_qdrant/run_qdrant_beir.py +1365 -0
  9. benchmarks/vidore_tatdqa_test/COMMANDS.md +83 -0
  10. benchmarks/vidore_tatdqa_test/__init__.py +6 -0
  11. benchmarks/vidore_tatdqa_test/dataset_loader.py +363 -0
  12. benchmarks/vidore_tatdqa_test/metrics.py +44 -0
  13. benchmarks/vidore_tatdqa_test/run_qdrant.py +799 -0
  14. benchmarks/vidore_tatdqa_test/sweep_eval.py +372 -0
  15. demo/__init__.py +10 -0
  16. demo/app.py +45 -0
  17. demo/commands.py +334 -0
  18. demo/config.py +34 -0
  19. demo/download_models.py +75 -0
  20. demo/evaluation.py +602 -0
  21. demo/example_metadata_mapping_sigir.json +37 -0
  22. demo/indexing.py +286 -0
  23. demo/qdrant_utils.py +211 -0
  24. demo/results.py +35 -0
  25. demo/test_qdrant_connection.py +119 -0
  26. demo/ui/__init__.py +15 -0
  27. demo/ui/benchmark.py +355 -0
  28. demo/ui/header.py +30 -0
  29. demo/ui/playground.py +339 -0
  30. demo/ui/sidebar.py +162 -0
  31. demo/ui/upload.py +487 -0
  32. visual_rag/__init__.py +98 -0
  33. visual_rag/cli/__init__.py +1 -0
  34. visual_rag/cli/main.py +629 -0
  35. visual_rag/config.py +230 -0
  36. visual_rag/demo_runner.py +90 -0
  37. visual_rag/embedding/__init__.py +26 -0
  38. visual_rag/embedding/pooling.py +343 -0
  39. visual_rag/embedding/visual_embedder.py +622 -0
  40. visual_rag/indexing/__init__.py +21 -0
  41. visual_rag/indexing/cloudinary_uploader.py +274 -0
  42. visual_rag/indexing/pdf_processor.py +324 -0
  43. visual_rag/indexing/pipeline.py +628 -0
  44. visual_rag/indexing/qdrant_indexer.py +478 -0
  45. visual_rag/preprocessing/__init__.py +3 -0
  46. visual_rag/preprocessing/crop_empty.py +120 -0
  47. visual_rag/qdrant_admin.py +222 -0
  48. visual_rag/retrieval/__init__.py +19 -0
  49. visual_rag/retrieval/multi_vector.py +222 -0
  50. visual_rag/retrieval/single_stage.py +126 -0
  51. visual_rag/retrieval/three_stage.py +173 -0
  52. visual_rag/retrieval/two_stage.py +471 -0
  53. visual_rag/visualization/__init__.py +19 -0
  54. visual_rag/visualization/saliency.py +335 -0
  55. visual_rag_toolkit-0.1.1.dist-info/METADATA +305 -0
  56. visual_rag_toolkit-0.1.1.dist-info/RECORD +59 -0
  57. visual_rag_toolkit-0.1.1.dist-info/WHEEL +4 -0
  58. visual_rag_toolkit-0.1.1.dist-info/entry_points.txt +3 -0
  59. visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE +22 -0
visual_rag/visualization/saliency.py (new file)
@@ -0,0 +1,335 @@
```python
"""
Saliency Map Generation for Visual Document Retrieval.

Generates attention/saliency maps to visualize which parts of documents
are most relevant to a query.
"""

import logging
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)


def generate_saliency_map(
    query_embedding: np.ndarray,
    doc_embedding: np.ndarray,
    image: Image.Image,
    token_info: Optional[Dict[str, Any]] = None,
    colormap: str = "Reds",
    alpha: float = 0.5,
    threshold_percentile: float = 50.0,
) -> Tuple[Image.Image, np.ndarray]:
    """
    Generate saliency map showing which parts of the image match the query.

    Computes patch-level relevance scores and overlays them on the image.

    Args:
        query_embedding: Query embeddings [num_query_tokens, dim]
        doc_embedding: Document visual embeddings [num_visual_tokens, dim]
        image: Original PIL Image
        token_info: Optional token info with n_rows, n_cols for tile grid
        colormap: Matplotlib colormap name (Reds, viridis, jet, etc.)
        alpha: Overlay transparency (0-1)
        threshold_percentile: Only highlight patches above this percentile

    Returns:
        Tuple of (annotated_image, patch_scores)

    Example:
        >>> query = embedder.embed_query("budget allocation")
        >>> doc = visual_embedding  # From embed_images
        >>> annotated, scores = generate_saliency_map(
        ...     query_embedding=query.numpy(),
        ...     doc_embedding=doc,
        ...     image=page_image,
        ...     token_info=token_info,
        ... )
        >>> annotated.save("saliency.png")
    """
    # Ensure numpy arrays
    if hasattr(query_embedding, "numpy"):
        query_np = query_embedding.numpy()
    elif hasattr(query_embedding, "cpu"):
        query_np = query_embedding.cpu().numpy()
    else:
        query_np = np.array(query_embedding, dtype=np.float32)

    if hasattr(doc_embedding, "numpy"):
        doc_np = doc_embedding.numpy()
    elif hasattr(doc_embedding, "cpu"):
        doc_np = doc_embedding.cpu().numpy()
    else:
        doc_np = np.array(doc_embedding, dtype=np.float32)

    # Normalize embeddings
    query_norm = query_np / (np.linalg.norm(query_np, axis=1, keepdims=True) + 1e-8)
    doc_norm = doc_np / (np.linalg.norm(doc_np, axis=1, keepdims=True) + 1e-8)

    # Compute similarity matrix: [num_query, num_doc]
    similarity_matrix = np.dot(query_norm, doc_norm.T)

    # Get max similarity per document patch (best match from any query token)
    patch_scores = similarity_matrix.max(axis=0)

    # Normalize to [0, 1]
    score_min, score_max = patch_scores.min(), patch_scores.max()
    if score_max - score_min > 1e-8:
        patch_scores_norm = (patch_scores - score_min) / (score_max - score_min)
    else:
        patch_scores_norm = np.zeros_like(patch_scores)

    # Determine grid dimensions
    if token_info and token_info.get("n_rows") and token_info.get("n_cols"):
        n_rows = token_info["n_rows"]
        n_cols = token_info["n_cols"]
        num_tiles = n_rows * n_cols + 1  # +1 for global tile
        patches_per_tile = 64  # ColSmol standard

        # Reshape to tile grid (excluding global tile)
        try:
            # Skip global tile patches at the end
            tile_patches = num_tiles * patches_per_tile
            if len(patch_scores_norm) >= tile_patches:
                grid_patches = patch_scores_norm[: n_rows * n_cols * patches_per_tile]
            else:
                grid_patches = patch_scores_norm

            # Reshape: [tiles * patches_per_tile] -> [tiles, patches_per_tile]
            # Then mean per tile
            num_grid_tiles = n_rows * n_cols
            grid_patches = grid_patches[: num_grid_tiles * patches_per_tile]
            tile_scores = grid_patches.reshape(num_grid_tiles, patches_per_tile).mean(axis=1)
            tile_scores = tile_scores.reshape(n_rows, n_cols)
        except Exception as e:
            logger.warning(f"Could not reshape to tile grid: {e}")
            tile_scores = None
    else:
        tile_scores = None
        n_rows = n_cols = None

    # Create overlay
    annotated = create_saliency_overlay(
        image=image,
        scores=tile_scores if tile_scores is not None else patch_scores_norm,
        colormap=colormap,
        alpha=alpha,
        threshold_percentile=threshold_percentile,
        grid_rows=n_rows,
        grid_cols=n_cols,
    )

    return annotated, patch_scores


def create_saliency_overlay(
    image: Image.Image,
    scores: np.ndarray,
    colormap: str = "Reds",
    alpha: float = 0.5,
    threshold_percentile: float = 50.0,
    grid_rows: Optional[int] = None,
    grid_cols: Optional[int] = None,
) -> Image.Image:
    """
    Create colored overlay on image based on scores.

    Args:
        image: Base PIL Image
        scores: Score array - 1D [num_patches] or 2D [rows, cols]
        colormap: Matplotlib colormap name
        alpha: Overlay transparency
        threshold_percentile: Only color patches above this percentile
        grid_rows, grid_cols: Grid dimensions (auto-detected if not provided)

    Returns:
        Annotated PIL Image
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warning("matplotlib not installed, returning original image")
        return image

    img_array = np.array(image)
    h, w = img_array.shape[:2]

    # Handle 2D scores (tile grid)
    if scores.ndim == 2:
        rows, cols = scores.shape
    elif grid_rows and grid_cols:
        rows, cols = grid_rows, grid_cols
        # Reshape if possible
        if len(scores) == rows * cols:
            scores = scores.reshape(rows, cols)
        else:
            # Fallback: estimate grid from score count
            num_patches = len(scores)
            aspect = w / h
            cols = max(1, int(np.sqrt(num_patches * aspect)))
            rows = max(1, num_patches // cols)
            scores = scores[: rows * cols].reshape(rows, cols)
    else:
        # Auto-estimate grid
        num_patches = len(scores) if scores.ndim == 1 else scores.size
        aspect = w / h
        cols = max(1, int(np.sqrt(num_patches * aspect)))
        rows = max(1, num_patches // cols)

        if rows * cols > (len(scores) if scores.ndim == 1 else scores.size):
            cols = max(1, cols - 1)

        if scores.ndim == 1:
            scores = scores[: rows * cols].reshape(rows, cols)

    # Get colormap (plt.cm.get_cmap was removed in matplotlib 3.9)
    cmap = plt.get_cmap(colormap)

    # Calculate threshold
    threshold = np.percentile(scores, threshold_percentile)

    # Calculate cell dimensions
    cell_h = h // rows
    cell_w = w // cols

    # Create RGBA overlay
    overlay = np.zeros((h, w, 4), dtype=np.uint8)

    for i in range(rows):
        for j in range(cols):
            score = scores[i, j]

            if score >= threshold:
                y1 = i * cell_h
                y2 = min((i + 1) * cell_h, h)
                x1 = j * cell_w
                x2 = min((j + 1) * cell_w, w)

                # Normalize score for coloring (above threshold)
                norm_score = (score - threshold) / (1.0 - threshold + 1e-8)
                norm_score = min(1.0, max(0.0, norm_score))

                # Get color
                color = cmap(norm_score)[:3]
                color_uint8 = (np.array(color) * 255).astype(np.uint8)

                overlay[y1:y2, x1:x2, :3] = color_uint8
                overlay[y1:y2, x1:x2, 3] = int(alpha * 255 * norm_score)

    # Blend with original
    overlay_img = Image.fromarray(overlay, "RGBA")
    result = Image.alpha_composite(image.convert("RGBA"), overlay_img)

    return result.convert("RGB")


def visualize_search_results(
    query: str,
    results: List[Dict[str, Any]],
    query_embedding: Optional[np.ndarray] = None,
    embeddings: Optional[List[np.ndarray]] = None,
    output_path: Optional[str] = None,
    max_results: int = 5,
    show_saliency: bool = False,
) -> Optional[Image.Image]:
    """
    Visualize search results as a grid of images with scores.

    Args:
        query: Original query text
        results: List of search results with 'payload' containing 'page' (image URL/base64)
        query_embedding: Query embedding for saliency (optional)
        embeddings: Document embeddings for saliency (optional)
        output_path: Path to save visualization (optional)
        max_results: Maximum results to show
        show_saliency: Generate saliency overlays (requires query_embedding & embeddings)

    Returns:
        Combined visualization image if successful
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.error("matplotlib required for visualization")
        return None

    results = results[:max_results]
    n = len(results)

    if n == 0:
        logger.warning("No results to visualize")
        return None

    fig, axes = plt.subplots(1, n, figsize=(4 * n, 4))
    if n == 1:
        axes = [axes]

    for idx, (result, ax) in enumerate(zip(results, axes)):
        payload = result.get("payload", {})
        score = result.get("score_final", result.get("score_stage1", 0))

        # Try to load image from payload
        page_data = payload.get("page", "")
        image = None

        if page_data.startswith("data:image"):
            # Base64 encoded
            try:
                import base64
                from io import BytesIO

                b64_data = page_data.split(",")[1]
                image = Image.open(BytesIO(base64.b64decode(b64_data)))
            except Exception as e:
                logger.debug(f"Could not decode base64 image: {e}")
        elif page_data.startswith("http"):
            # URL - try to fetch
            try:
                import urllib.request
                from io import BytesIO

                with urllib.request.urlopen(page_data, timeout=5) as response:
                    image = Image.open(BytesIO(response.read()))
            except Exception as e:
                logger.debug(f"Could not fetch image URL: {e}")

        if image:
            ax.imshow(image)
        else:
            # Show placeholder
            ax.text(0.5, 0.5, "No image", ha="center", va="center", fontsize=12, color="gray")

        # Add title
        title = f"Rank {idx + 1}\nScore: {score:.3f}"
        if payload.get("filename"):
            title += f"\n{payload['filename'][:30]}"
        if payload.get("page_number") is not None:
            title += f" p.{payload['page_number'] + 1}"

        ax.set_title(title, fontsize=9)
        ax.axis("off")

    # Add query as suptitle
    query_display = query[:80] + "..." if len(query) > 80 else query
    plt.suptitle(f"Query: {query_display}", fontsize=11, fontweight="bold")
    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=150, bbox_inches="tight")
        logger.info(f"💾 Saved visualization to: {output_path}")

    # Convert to PIL Image for return
    from io import BytesIO

    buf = BytesIO()
    plt.savefig(buf, format="png", dpi=100, bbox_inches="tight")
    buf.seek(0)
    result_image = Image.open(buf)

    plt.close()

    return result_image
```
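
For orientation, here is a minimal self-contained way to exercise `generate_saliency_map` above. The 3×2 tile grid, 128-dim embeddings, and page size are arbitrary assumptions for this sketch, not values taken from the package:

```python
import numpy as np
from PIL import Image

from visual_rag.visualization.saliency import generate_saliency_map

rng = np.random.default_rng(0)

# Hypothetical shapes: 8 query tokens; a 3x2 tile grid of 64 patches each,
# plus one global tile; 128-dim embeddings.
n_rows, n_cols, patches_per_tile, dim = 3, 2, 64, 128
num_visual_tokens = (n_rows * n_cols + 1) * patches_per_tile  # 448

query = rng.normal(size=(8, dim)).astype(np.float32)
doc = rng.normal(size=(num_visual_tokens, dim)).astype(np.float32)
page = Image.new("RGB", (640, 960), "white")

annotated, patch_scores = generate_saliency_map(
    query_embedding=query,
    doc_embedding=doc,
    image=page,
    token_info={"n_rows": n_rows, "n_cols": n_cols},
)
print(patch_scores.shape)  # (448,) raw per-patch MaxSim scores
annotated.save("saliency_demo.png")  # overlay needs matplotlib; otherwise the page is returned unchanged
```

With real embeddings the interesting part is the `token_info` branch, which averages the 64 patch scores inside each tile so the overlay follows the model's tile grid rather than individual patches.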
visual_rag_toolkit-0.1.1.dist-info/METADATA (new file)
@@ -0,0 +1,305 @@
```
Metadata-Version: 2.4
Name: visual-rag-toolkit
Version: 0.1.1
Summary: End-to-end visual document retrieval with ColPali, featuring two-stage pooling for scalable search
Project-URL: Homepage, https://github.com/Ara-Yeroyan/visual-rag-toolkit
Project-URL: Documentation, https://github.com/Ara-Yeroyan/visual-rag-toolkit#readme
Project-URL: Repository, https://github.com/Ara-Yeroyan/visual-rag-toolkit
Project-URL: Issues, https://github.com/Ara-Yeroyan/visual-rag-toolkit/issues
Author: Visual RAG Team
License: MIT License

    Copyright (c) 2026 Ara Yeroyan

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.

License-File: LICENSE
Keywords: colbert,colpali,document-retrieval,late-interaction,multimodal-rag,pdf-processing,qdrant,visual-rag,visual-search
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Scientific/Engineering :: Image Processing
Requires-Python: >=3.9
Requires-Dist: numpy>=1.21.0
Requires-Dist: pillow>=9.0.0
Requires-Dist: python-dotenv>=0.19.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: torch>=2.0.0
Requires-Dist: tqdm>=4.60.0
Provides-Extra: all
Requires-Dist: altair>=5.0.0; extra == 'all'
Requires-Dist: cloudinary>=1.30.0; extra == 'all'
Requires-Dist: colpali-engine>=0.3.0; extra == 'all'
Requires-Dist: httpx>=0.24.0; extra == 'all'
Requires-Dist: pandas>=2.0.0; extra == 'all'
Requires-Dist: pdf2image>=1.16.0; extra == 'all'
Requires-Dist: pypdf>=3.0.0; extra == 'all'
Requires-Dist: qdrant-client>=1.7.0; extra == 'all'
Requires-Dist: streamlit>=1.25.0; extra == 'all'
Requires-Dist: transformers>=4.35.0; extra == 'all'
Provides-Extra: cloudinary
Requires-Dist: cloudinary>=1.30.0; extra == 'cloudinary'
Provides-Extra: dev
Requires-Dist: black>=23.0.0; extra == 'dev'
Requires-Dist: mypy>=1.0.0; extra == 'dev'
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Requires-Dist: ruff>=0.1.0; extra == 'dev'
Provides-Extra: embedding
Requires-Dist: colpali-engine>=0.3.0; extra == 'embedding'
Requires-Dist: transformers>=4.35.0; extra == 'embedding'
Provides-Extra: pdf
Requires-Dist: pdf2image>=1.16.0; extra == 'pdf'
Requires-Dist: pypdf>=3.0.0; extra == 'pdf'
Provides-Extra: qdrant
Requires-Dist: qdrant-client>=1.7.0; extra == 'qdrant'
Provides-Extra: ui
Requires-Dist: altair>=5.0.0; extra == 'ui'
Requires-Dist: httpx>=0.24.0; extra == 'ui'
Requires-Dist: pandas>=2.0.0; extra == 'ui'
Requires-Dist: streamlit>=1.25.0; extra == 'ui'
Description-Content-Type: text/markdown
```

# Visual RAG Toolkit

[![PyPI version](https://badge.fury.io/py/visual-rag-toolkit.svg)](https://badge.fury.io/py/visual-rag-toolkit)
[![CI](https://github.com/Ara-Yeroyan/visual-rag-toolkit/actions/workflows/ci.yaml/badge.svg)](https://github.com/Ara-Yeroyan/visual-rag-toolkit/actions/workflows/ci.yaml)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)

End-to-end visual document retrieval toolkit featuring **fast multi-stage retrieval** (prefetch with pooled vectors + exact MaxSim reranking).

This repo contains:
- a **Python package** (`visual_rag`)
- a **Streamlit demo app** (`demo/`)
- **benchmark & evaluation scripts** for ViDoRe v2 (`benchmarks/`)

## 🎯 Key Features

- **Modular**: PDF → images, embedding, Qdrant indexing, retrieval can be used independently.
- **Multi-stage retrieval**: two-stage and three-stage retrieval modes built for Qdrant named vectors.
- **Model-aware embedding**: ColSmol + ColPali support behind a single `VisualEmbedder` interface.
- **Token hygiene**: query special-token filtering by default for more stable MaxSim behavior.
- **Practical pipelines**: robust indexing, retries, optional Cloudinary image URLs, evaluation reporting.

## 📦 Installation

```bash
# Core package (minimal dependencies)
pip install visual-rag-toolkit

# With specific features
pip install visual-rag-toolkit[embedding]   # ColSmol/ColPali embedding support
pip install visual-rag-toolkit[pdf]         # PDF processing
pip install visual-rag-toolkit[qdrant]      # Vector database
pip install visual-rag-toolkit[cloudinary]  # Image CDN
pip install visual-rag-toolkit[ui]          # Streamlit demo dependencies

# All dependencies
pip install visual-rag-toolkit[all]
```

### System dependencies (PDF)

`pdf2image` requires Poppler.

- macOS: `brew install poppler`
- Ubuntu/Debian: `sudo apt-get update && sudo apt-get install -y poppler-utils`

## 🚀 Quick Start

### Minimal: embed a query and run two-stage search (server-side)

```python
from qdrant_client import QdrantClient
from visual_rag import VisualEmbedder, TwoStageRetriever

client = QdrantClient(url="https://YOUR_QDRANT", api_key="YOUR_KEY")
collection_name = "your_collection"

# Embed query tokens
embedder = VisualEmbedder(model_name="vidore/colpali-v1.3")
q = embedder.embed_query("What is the budget allocation?")

# Fast path: all stages computed in Qdrant (prefetch + exact rerank)
retriever = TwoStageRetriever(client, collection_name)
results = retriever.search_server_side(
    query_embedding=q,
    top_k=10,
    prefetch_k=256,
    stage1_mode="tokens_vs_experimental",  # or: tokens_vs_tiles / pooled_query_vs_tiles / pooled_query_vs_global
)

for r in results[:3]:
    print(r["id"], r["score_final"])
```

### Process a PDF into images (no embedding, no vector DB)

```python
from pathlib import Path
from visual_rag import PDFProcessor

processor = PDFProcessor(dpi=140)
images, texts = processor.process_pdf(Path("report.pdf"))
print(len(images), "pages")
```

## 🔬 Multi-stage Retrieval (Two-stage / Three-stage)

Traditional ColBERT-style MaxSim scoring compares all query tokens vs all document tokens, which becomes expensive at scale.

**Our approach:**

```
Stage 1: Fast prefetch with tile-level pooled vectors
├── Pool each tile (64 patches) → num_tiles vectors
├── Use HNSW index for O(log N) retrieval
└── Retrieve top-K candidates (e.g., 200)

Stage 2: Exact MaxSim reranking on candidates
├── Load full multi-vector embeddings
├── Compute exact ColBERT MaxSim scores
└── Return top-k results (e.g., 10)
```

Three-stage extends this with an additional “cheap prefetch” stage before stage 2.
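
For intuition, here is a rough NumPy sketch of the scoring math behind both stages. The shapes, the mean-pooling choice, and the small `prefetch_k` below are illustrative assumptions, not the package's exact implementation:

```python
import numpy as np

def maxsim(query_tokens: np.ndarray, doc_tokens: np.ndarray) -> float:
    """ColBERT-style MaxSim: each query token takes its best doc-token match."""
    sim = query_tokens @ doc_tokens.T  # [num_query, num_doc]
    return float(sim.max(axis=1).sum())

def pool_tiles(doc_tokens: np.ndarray, patches_per_tile: int = 64) -> np.ndarray:
    """Stage-1 representation: one mean-pooled vector per tile of patches."""
    n_tiles = len(doc_tokens) // patches_per_tile
    kept = doc_tokens[: n_tiles * patches_per_tile]
    return kept.reshape(n_tiles, patches_per_tile, -1).mean(axis=1)

rng = np.random.default_rng(0)
q = rng.normal(size=(12, 128)).astype(np.float32)  # query tokens (normalized upstream in practice)
corpus = [rng.normal(size=(448, 128)).astype(np.float32) for _ in range(100)]

# Stage 1: cheap prefetch against pooled tiles; Qdrant replaces this
# linear scan with an HNSW index over the pooled vectors.
stage1 = [maxsim(q, pool_tiles(d)) for d in corpus]
candidates = np.argsort(stage1)[::-1][:20]  # prefetch_k = 20

# Stage 2: exact MaxSim rerank over the prefetched candidates only.
top = sorted(candidates, key=lambda i: maxsim(q, corpus[i]), reverse=True)[:10]
print(top)
```

Stage 1 stays cheap because only the pooled tile vectors need an ANN index; the exact, expensive MaxSim touches just the prefetched candidates.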

## 📁 Package Structure

```
visual-rag-toolkit/
├── visual_rag/          # Import as: from visual_rag import ...
│   ├── embedding/       # VisualEmbedder, pooling functions
│   ├── indexing/        # PDFProcessor, QdrantIndexer, CloudinaryUploader
│   ├── retrieval/       # TwoStageRetriever
│   ├── visualization/   # Saliency maps
│   ├── cli/             # Command-line: visual-rag process/search
│   └── config.py        # load_config, get, get_section
│
├── benchmarks/          # ViDoRe evaluation scripts
└── examples/            # Usage examples
```

## ⚙️ Configuration

Configure via environment variables or YAML:

```bash
# Qdrant credentials (preferred names used by the demo + scripts)
export SIGIR_QDRANT_URL="https://your-cluster.qdrant.io"
export SIGIR_QDRANT_KEY="your-api-key"

# Backwards-compatible fallbacks (also supported)
export QDRANT_URL="https://your-cluster.qdrant.io"
export QDRANT_API_KEY="your-api-key"

export VISUALRAG_MODEL="vidore/colSmol-500M"

# Special token handling (default: filter them out)
export VISUALRAG_INCLUDE_SPECIAL_TOKENS=true  # Include special tokens
```

Or use a config file (`visual_rag.yaml`):

```yaml
model:
  name: "vidore/colSmol-500M"
  batch_size: 4

qdrant:
  url: "https://your-cluster.qdrant.io"
  collection: "my_documents"

search:
  strategy: "two_stage"  # or "multi_vector", "pooled"
  prefetch_k: 200
  top_k: 10
```

## 🖥️ Demo (Streamlit)

```bash
pip install "visual-rag-toolkit[ui,qdrant,embedding,pdf]"

# Option A: from Python
python -c "import visual_rag; visual_rag.demo()"

# Option B: CLI launcher
visual-rag-demo
```

## 📊 Benchmark Evaluation

Run ViDoRe benchmark evaluation:

```bash
# Example: evaluate a collection against ViDoRe BEIR datasets in Qdrant
python -m benchmarks.vidore_beir_qdrant.run_qdrant_beir \
    --datasets vidore/esg_reports_v2 vidore/biomedical_lectures_v2 \
    --collection YOUR_COLLECTION \
    --mode two_stage \
    --stage1-mode tokens_vs_experimental \
    --prefetch-k 256 \
    --top-k 100 \
    --evaluation-scope union
```

More commands (including multi-stage variants and cropping configs) live in:
- `benchmarks/vidore_tatdqa_test/COMMANDS.md`

## 🔧 Development

```bash
git clone https://github.com/Ara-Yeroyan/visual-rag-toolkit
cd visual-rag-toolkit
pip install -e ".[dev]"
pytest tests/ -v
```

## 📄 Citation

If you use this toolkit in your research, please cite:

```bibtex
@software{visual_rag_toolkit,
  title = {Visual RAG Toolkit: Scalable Visual Document Retrieval with Two-Stage Pooling},
  author = {Ara Yeroyan},
  year = {2026},
  url = {https://github.com/Ara-Yeroyan/visual-rag-toolkit}
}
```

## 📝 License

MIT License - see [LICENSE](LICENSE) for details.

## 🙏 Acknowledgments

- [Qdrant](https://qdrant.tech/) - Vector database with multi-vector support
- [ColPali](https://github.com/illuin-tech/colpali) - Visual document retrieval models
- [ViDoRe](https://huggingface.co/spaces/vidore/vidore-leaderboard) - Benchmark dataset
visual_rag_toolkit-0.1.1.dist-info/RECORD (new file)
@@ -0,0 +1,59 @@
```
benchmarks/README.md,sha256=_MAC9n308xawKk9wMSyiZ70Ievl5TtGdOiwZKjjd8sQ,2519
benchmarks/__init__.py,sha256=ovkoAwhfqB7b0J7QawUUWd73lk96Njy7c6FOH6lndg4,366
benchmarks/analyze_results.py,sha256=BxgpyXguRNwGv0-VeQBxkF9duzCAd5xc3XAmdNEamxA,5579
benchmarks/benchmark_datasets.txt,sha256=Y87QfusxxSr5wgFaxmp9Ew-EMb2dwq1StrMvjeJU-LA,4831
benchmarks/prepare_submission.py,sha256=wD9sLWDqkQw_OANmVOdwe7OQlv4ZVf4sTQiQs7La4tQ,6002
benchmarks/quick_test.py,sha256=Mdcf2FNYSqWpYVfCmQLQzUVWLG-FiKUnyHyHKnAR3z4,20531
benchmarks/run_vidore.py,sha256=RuDaEJ0wIV-hLHRtcd8PsRGOEEUFYDcrjUlor-HAajc,16373
benchmarks/vidore_beir_qdrant/run_qdrant_beir.py,sha256=0lqIA6Qv53CreJpOg-h48sl4c8m7c_pVoQCp-oscnG0,56715
benchmarks/vidore_tatdqa_test/COMMANDS.md,sha256=lhobkqHLZJjIPE-Lo3VuBuKh5XpbT2WS_sK-6dasPcE,1890
benchmarks/vidore_tatdqa_test/__init__.py,sha256=WZiwKx8BGNuc0-oz1V3yiq8m_gWc5woEWy-WGb4F14E,18
benchmarks/vidore_tatdqa_test/dataset_loader.py,sha256=gCCneGAKWQm0WlJHLvGjoMrAbm5b9cPEflkoMimtA2s,12795
benchmarks/vidore_tatdqa_test/metrics.py,sha256=cLdYbRt5VcxInO1cN79ve6ZLP3kaSxRkdzRX3IbPPMs,1112
benchmarks/vidore_tatdqa_test/run_qdrant.py,sha256=_PikeqIYpWPim-KEQOwvT-aqwYoAWASjqJVisi8PfQg,28681
benchmarks/vidore_tatdqa_test/sweep_eval.py,sha256=d_kbyNTJ1LoFfIVnsZyiRO1nKyMqmRB5jEweZL6kYd4,12688
demo/__init__.py,sha256=jVzjsVKZl5ZZuFxawA8Pxj3yuIKL7llkao3rBpde-aQ,204
demo/app.py,sha256=1GZJ_JhVWvqoBewngc8tHeiuM1fNbxddEO6ZsEdwBfg,1029
demo/commands.py,sha256=qxRE2x610yZvcjwEfSKiR9CyFonX-vRxFqQNJCUKfyA,13690
demo/config.py,sha256=BNkV4NSEEMIV9e6Z-cxds2v247uVmTPCgL-M5ItPzMg,757
demo/download_models.py,sha256=J10qQt2TpEshVOxvCX_ZSbV7YozIBqDATZnt8fUKFHs,2868
demo/evaluation.py,sha256=wiVxzRu3UZ5wAwHlpSKQ6srZjnSR06dgQw3G0OOV2Eg,28954
demo/example_metadata_mapping_sigir.json,sha256=UCgqZtr6Wnq_vS7zxPxpvuokk9gxOVgKydC7f1lauw8,824
demo/indexing.py,sha256=NLtGYnuCCb3uHGCgs8KHlLqKR-FSD6sxW3PlEw9UhYM,12853
demo/qdrant_utils.py,sha256=VWEC7BwhMjjB7iIS5iaVDMGt_CMh9mQG4F94k1Pt0yA,7677
demo/results.py,sha256=dprvxnyHwxJvkAQuh4deaCsiEG1wm0n9svPyxI37vJg,1050
demo/test_qdrant_connection.py,sha256=hkbyl3zGsw_GdBBp5MkW_3SBKTHXbwH3Sr_pUE54_po,3866
demo/ui/__init__.py,sha256=EyBCvnXYfPbdyxJzyp9TjQBeJJUgmOY1yRHkUeC6JFQ,412
demo/ui/benchmark.py,sha256=HiGCN4HrqeOC7L6t2kuzIiyWdcVE_cP2JTxoewrmPSo,14218
demo/ui/header.py,sha256=J2hXr_nNyg1H9rmrd-EGx3WUl7lYo-Ca30ptgzBCfBs,806
demo/ui/playground.py,sha256=Z3OgCWOzzTld1I3eN1IcTadaSzsqDQf7MiHwTbxbvJA,13692
demo/ui/sidebar.py,sha256=muVCnvoeMOm1rHx7UPt68yLXlG3OERdXvJ3QqIXAUoc,7839
demo/ui/upload.py,sha256=BHJmbIQOAYdMF_svxlRSYIe163Y5UX5P_gilJ09YHSA,20372
visual_rag/__init__.py,sha256=UkGFXjPmjbO6Iad8ty1uJOMQsVMpV_s63ihchHltLx8,2555
visual_rag/config.py,sha256=pd48M3j3n8ZV1HhaabMmP_uoEJnqhBC-Bma9vuvc8V4,7368
visual_rag/demo_runner.py,sha256=wi0Wz3gZ39l4aovMd6zURq_CKUSgma4kGjF6hpQHwGY,2793
visual_rag/qdrant_admin.py,sha256=NNczko2S5-K3qATNUxgYn51hNWgWb6boheL7vlCQGpM,7055
visual_rag/cli/__init__.py,sha256=WgBRXm0VACfLltvVlLcSs3FTM1uQ7Uuw3CVD4-zWZwc,46
visual_rag/cli/main.py,sha256=QmpnQ0lbC6Q9lwxaSCDh6paEEzI78IPY1jwc3_9y7VI,21083
visual_rag/embedding/__init__.py,sha256=7QIENmxwRnwnUzsYKRY3VQTyF3HJkRiL1D7Au9XHF0w,682
visual_rag/embedding/pooling.py,sha256=x8uY4VHbxEnsJRM2JeOkzPHDiwOkbi5NK4XW21U1hAc,11401
visual_rag/embedding/visual_embedder.py,sha256=he9JpVHmo_szOiXCwtJdrCseGmf2y5Gi0UEFjwazzVY,23198
visual_rag/indexing/__init__.py,sha256=pMLuinCIERbwWechn176nMrtlmTp0ySfuj8gdkNvRks,679
visual_rag/indexing/cloudinary_uploader.py,sha256=e-G5du4D7z6mWWl2lahMidG-Wdc-baImFFILTojebpA,8826
visual_rag/indexing/pdf_processor.py,sha256=V3RAKpwgIFicqUaXzaaljePxh_oP4UV5W0aiJyfv0BY,10247
visual_rag/indexing/pipeline.py,sha256=1ScpVRlLCq2FWi3IPvlQcIfDCQQ2F64IlRd9ZZHiTaA,25037
visual_rag/indexing/qdrant_indexer.py,sha256=uUOA-6Qkd_vEeP1LdgGyoh1FHu1ZNEyYKuNxJAqetBU,17121
visual_rag/preprocessing/__init__.py,sha256=rCzfBO0jaVKp6MpPRRused_4gasHfobAbG-139Y806E,121
visual_rag/preprocessing/crop_empty.py,sha256=iHXITFkRlF40VPJ4k9d432RUAi_89BhAEvK4wOEn96Q,5211
visual_rag/retrieval/__init__.py,sha256=J9pnbeB83Fqs9n4g3GcIp1VR9dnuyAlcsIDVsf0lSb8,601
visual_rag/retrieval/multi_vector.py,sha256=m5PKjkj0TFeWNccKNmCqghTM5b9ARr43Lq3sRhOxnjw,7381
visual_rag/retrieval/single_stage.py,sha256=TSndnh4Kz9aT_0kKhNyLEvokbDLkgq--lXuyldzP5sU,4105
visual_rag/retrieval/three_stage.py,sha256=YC0CVEohxTT5zhilcQHI7nYAk08E5jC3zkQ3-rNdLMw,5951
visual_rag/retrieval/two_stage.py,sha256=_RnEgIx_qY4yu2iIk0a3w47D7WiKHlmBivm5gLEpyI4,16779
visual_rag/visualization/__init__.py,sha256=SITKNvBEseDp7F3K6UzLPA-6OQFqYfY5azS5nlDdihQ,447
visual_rag/visualization/saliency.py,sha256=F3Plc18Sf3tzWcyncuaruTmENm1IfW5j9NFGEQR93cY,11248
visual_rag_toolkit-0.1.1.dist-info/METADATA,sha256=SL55eEexz2ogZPD5q-gfzpF2TVZ_U1ZwykPlHaggEdU,11070
visual_rag_toolkit-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
visual_rag_toolkit-0.1.1.dist-info/entry_points.txt,sha256=6Tob1GPg_ILGELjYTPsAnNMZ1W0NS939nfI7xyW2DIY,102
visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE,sha256=hEg_weKnHXJakQRR3sw2ygcZ101zCI00zMhBOPb3yfA,1069
visual_rag_toolkit-0.1.1.dist-info/RECORD,,
```
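
Each RECORD row is `path,sha256=<digest>,<size in bytes>`, with the digest encoded as unpadded URL-safe base64 per the wheel spec; the final `RECORD,,` entry carries no digest because the file cannot hash itself. A minimal sketch for verifying one entry against an unpacked wheel (the relative path assumes the wheel was extracted into the current directory):

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: Path) -> str:
    """RECORD-style digest: SHA-256, URL-safe base64, '=' padding stripped."""
    raw = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

# True when the extracted file matches the published entry above
p = Path("visual_rag/visualization/saliency.py")
print(record_digest(p) == "sha256=F3Plc18Sf3tzWcyncuaruTmENm1IfW5j9NFGEQR93cY")
```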