visual_rag_toolkit-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/README.md +101 -0
- benchmarks/__init__.py +11 -0
- benchmarks/analyze_results.py +187 -0
- benchmarks/benchmark_datasets.txt +105 -0
- benchmarks/prepare_submission.py +205 -0
- benchmarks/quick_test.py +566 -0
- benchmarks/run_vidore.py +513 -0
- benchmarks/vidore_beir_qdrant/run_qdrant_beir.py +1365 -0
- benchmarks/vidore_tatdqa_test/COMMANDS.md +83 -0
- benchmarks/vidore_tatdqa_test/__init__.py +6 -0
- benchmarks/vidore_tatdqa_test/dataset_loader.py +363 -0
- benchmarks/vidore_tatdqa_test/metrics.py +44 -0
- benchmarks/vidore_tatdqa_test/run_qdrant.py +799 -0
- benchmarks/vidore_tatdqa_test/sweep_eval.py +372 -0
- demo/__init__.py +10 -0
- demo/app.py +45 -0
- demo/commands.py +334 -0
- demo/config.py +34 -0
- demo/download_models.py +75 -0
- demo/evaluation.py +602 -0
- demo/example_metadata_mapping_sigir.json +37 -0
- demo/indexing.py +286 -0
- demo/qdrant_utils.py +211 -0
- demo/results.py +35 -0
- demo/test_qdrant_connection.py +119 -0
- demo/ui/__init__.py +15 -0
- demo/ui/benchmark.py +355 -0
- demo/ui/header.py +30 -0
- demo/ui/playground.py +339 -0
- demo/ui/sidebar.py +162 -0
- demo/ui/upload.py +487 -0
- visual_rag/__init__.py +98 -0
- visual_rag/cli/__init__.py +1 -0
- visual_rag/cli/main.py +629 -0
- visual_rag/config.py +230 -0
- visual_rag/demo_runner.py +90 -0
- visual_rag/embedding/__init__.py +26 -0
- visual_rag/embedding/pooling.py +343 -0
- visual_rag/embedding/visual_embedder.py +622 -0
- visual_rag/indexing/__init__.py +21 -0
- visual_rag/indexing/cloudinary_uploader.py +274 -0
- visual_rag/indexing/pdf_processor.py +324 -0
- visual_rag/indexing/pipeline.py +628 -0
- visual_rag/indexing/qdrant_indexer.py +478 -0
- visual_rag/preprocessing/__init__.py +3 -0
- visual_rag/preprocessing/crop_empty.py +120 -0
- visual_rag/qdrant_admin.py +222 -0
- visual_rag/retrieval/__init__.py +19 -0
- visual_rag/retrieval/multi_vector.py +222 -0
- visual_rag/retrieval/single_stage.py +126 -0
- visual_rag/retrieval/three_stage.py +173 -0
- visual_rag/retrieval/two_stage.py +471 -0
- visual_rag/visualization/__init__.py +19 -0
- visual_rag/visualization/saliency.py +335 -0
- visual_rag_toolkit-0.1.1.dist-info/METADATA +305 -0
- visual_rag_toolkit-0.1.1.dist-info/RECORD +59 -0
- visual_rag_toolkit-0.1.1.dist-info/WHEEL +4 -0
- visual_rag_toolkit-0.1.1.dist-info/entry_points.txt +3 -0
- visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE +22 -0
benchmarks/README.md
ADDED
@@ -0,0 +1,101 @@
# ViDoRe Benchmark Evaluation

This directory contains scripts for evaluating visual document retrieval on the [ViDoRe benchmark](https://huggingface.co/spaces/vidore/vidore-leaderboard).

## Quick Start

### 1. Install Dependencies

```bash
# Install visual-rag-toolkit with all dependencies
pip install -e ".[all]"

# Install benchmark-specific dependencies
pip install datasets mteb
```

### 2. Run Evaluation

```bash
# Run on single dataset
python benchmarks/run_vidore.py --dataset vidore/docvqa_test_subsampled

# Run on all ViDoRe datasets
python benchmarks/run_vidore.py --all

# With two-stage retrieval (our contribution)
python benchmarks/run_vidore.py --dataset vidore/docvqa_test_subsampled --two-stage
```

### 3. Submit to Leaderboard

```bash
# Generate submission file
python benchmarks/prepare_submission.py --results results/

# Submit to HuggingFace
huggingface-cli login
huggingface-cli upload vidore/results ./submission.json
```

## ViDoRe Datasets

The benchmark includes these datasets (from the leaderboard):

| Dataset | Type | # Queries | # Documents |
|---------|------|-----------|-------------|
| docvqa_test_subsampled | DocVQA | ~500 | ~5,000 |
| infovqa_test_subsampled | InfoVQA | ~500 | ~5,000 |
| tabfquad_test_subsampled | TabFQuAD | ~500 | ~5,000 |
| tatdqa_test | TAT-DQA | ~1,500 | ~2,500 |
| arxivqa_test_subsampled | ArXivQA | ~500 | ~5,000 |
| shiftproject_test | SHIFT | ~500 | ~5,000 |

## Evaluation Metrics

- **NDCG@5**: Normalized Discounted Cumulative Gain at 5
- **NDCG@10**: Normalized Discounted Cumulative Gain at 10
- **MRR@10**: Mean Reciprocal Rank at 10
- **Recall@5**: Recall at 5
- **Recall@10**: Recall at 10
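For reference, the sketch below shows how these metrics are typically computed from a qrels mapping (query to relevant doc ids) and a ranked list of retrieved doc ids, assuming binary relevance. It is an illustrative implementation of ours, not the toolkit's metrics module, and helper names such as `ndcg_at_k` are invented here.

```python
import math
from typing import Dict, List


def dcg_at_k(rels: List[float], k: int) -> float:
    """Discounted cumulative gain over the top-k relevance grades."""
    return sum(rel / math.log2(i + 2) for i, rel in enumerate(rels[:k]))


def ndcg_at_k(qrels: Dict[str, int], ranking: List[str], k: int) -> float:
    """NDCG@k for one query: DCG of the ranking divided by the ideal DCG."""
    gains = [qrels.get(doc_id, 0) for doc_id in ranking]
    idcg = dcg_at_k(sorted(qrels.values(), reverse=True), k)
    return dcg_at_k(gains, k) / idcg if idcg > 0 else 0.0


def mrr_at_k(qrels: Dict[str, int], ranking: List[str], k: int) -> float:
    """Reciprocal rank of the first relevant document within the top-k."""
    for i, doc_id in enumerate(ranking[:k]):
        if qrels.get(doc_id, 0) > 0:
            return 1.0 / (i + 1)
    return 0.0


def recall_at_k(qrels: Dict[str, int], ranking: List[str], k: int) -> float:
    """Fraction of the relevant documents that appear in the top-k."""
    relevant = {d for d, r in qrels.items() if r > 0}
    return len(relevant & set(ranking[:k])) / len(relevant) if relevant else 0.0


# One query with a single relevant page ranked second:
print(ndcg_at_k({"doc_7": 1}, ["doc_3", "doc_7", "doc_1"], k=10))   # ~0.63
print(mrr_at_k({"doc_7": 1}, ["doc_3", "doc_7", "doc_1"], k=10))    # 0.5
print(recall_at_k({"doc_7": 1}, ["doc_3", "doc_7", "doc_1"], k=5))  # 1.0
```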
## Two-Stage Retrieval (Our Contribution)

Our key contribution is efficient two-stage retrieval:

```
Stage 1: Fast prefetch with tile-level pooled vectors
         Uses HNSW index for O(log N) retrieval

Stage 2: Exact MaxSim reranking on top-K candidates
         Full multi-vector scoring for precision
```

This provides:
- **5-10x speedup** over full MaxSim at scale
- **95%+ accuracy** compared to exhaustive search
- **Memory efficient** (don't load all embeddings upfront)

To evaluate with two-stage:

```bash
python benchmarks/run_vidore.py \
    --dataset vidore/docvqa_test_subsampled \
    --two-stage \
    --prefetch-k 200 \
    --top-k 10
```
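Beyond the CLI, the two stages can be approximated directly against a Qdrant collection. The sketch below is ours and makes assumptions about the collection layout: a pooled named vector (assumed `"mean_pooling"`) serves the HNSW prefetch and the token-level multivector (assumed `"initial"`) serves the rerank; the toolkit's actual retriever lives in `visual_rag/retrieval/two_stage.py` and its API may differ.

```python
import numpy as np
from qdrant_client import QdrantClient


def maxsim(query_tokens: np.ndarray, doc_tokens: np.ndarray) -> float:
    """Late-interaction score: each query token takes its best-matching doc
    token (dot product); the per-token maxima are summed."""
    return float((query_tokens @ doc_tokens.T).max(axis=1).sum())


def two_stage_search(
    client: QdrantClient,
    collection: str,
    query_tokens: np.ndarray,   # (num_query_tokens, dim) from the query encoder
    prefetch_k: int = 200,
    top_k: int = 10,
):
    # Stage 1: cheap HNSW prefetch on a single pooled query vector against
    # the pooled document vectors (assumed named vector "mean_pooling").
    pooled_query = query_tokens.mean(axis=0)
    candidates = client.search(
        collection_name=collection,
        query_vector=("mean_pooling", pooled_query.tolist()),
        limit=prefetch_k,
        with_payload=False,
    )

    # Stage 2: fetch the full token-level vectors (assumed named "initial")
    # for the candidates only, then rerank them with exact MaxSim.
    points = client.retrieve(
        collection_name=collection,
        ids=[hit.id for hit in candidates],
        with_vectors=["initial"],
    )
    rescored = [
        (point.id, maxsim(query_tokens, np.asarray(point.vector["initial"])))
        for point in points
    ]
    rescored.sort(key=lambda item: item[1], reverse=True)
    return rescored[:top_k]
```

Because only `prefetch_k` candidates are rescored with MaxSim, the stage-2 cost stays bounded as the corpus grows, which is where the reported speedup at scale comes from.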
## Files

- `run_vidore.py` - Main evaluation script
- `prepare_submission.py` - Generate leaderboard submission
- `analyze_results.py` - Analyze and compare results
benchmarks/__init__.py
ADDED
@@ -0,0 +1,11 @@
"""
Benchmark utilities and dataset loaders used by the demo UI.

Note: The `benchmarks/` folder is primarily for research/evaluation scripts, but the
Streamlit demo imports some loaders/metrics from here. Making this directory a
package (via `__init__.py`) ensures imports like `benchmarks.vidore_tatdqa_test`
work in Docker/Spaces environments.
"""

__all__ = []
benchmarks/analyze_results.py
ADDED
@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Analyze and compare benchmark results.

Usage:
    # Compare exhaustive vs two-stage
    python analyze_results.py --results results/

    # Compare multiple models
    python analyze_results.py --dirs results_colsmol/ results_colpali/
"""

import argparse
import json
from pathlib import Path
from typing import Dict, List, Any

import numpy as np


def load_all_results(results_dir: Path) -> Dict[str, Dict]:
    """Load all result files from directory."""
    results = {}
    for f in results_dir.glob("*.json"):
        with open(f) as fp:
            data = json.load(fp)

        # Key by dataset + method
        dataset = data.get("dataset", f.stem).split("/")[-1]
        method = "two_stage" if data.get("two_stage") else "exhaustive"
        key = f"{dataset}_{method}"

        results[key] = {
            "dataset": dataset,
            "method": method,
            "model": data.get("model", "unknown"),
            **data.get("metrics", {}),
        }
    return results


def compare_methods(results: Dict[str, Dict]) -> None:
    """Compare exhaustive vs two-stage on same datasets."""

    # Group by dataset
    datasets = {}
    for key, data in results.items():
        ds = data["dataset"].replace("_twostage", "")
        if ds not in datasets:
            datasets[ds] = {}
        datasets[ds][data["method"]] = data

    print("\n" + "=" * 80)
    print("EXHAUSTIVE vs TWO-STAGE COMPARISON")
    print("=" * 80)

    print(f"\n{'Dataset':<30} {'Method':<12} {'NDCG@10':>10} {'MRR@10':>10} {'Time(ms)':>10}")
    print("-" * 72)

    improvements = []
    speedups = []

    for dataset, methods in sorted(datasets.items()):
        for method in ["exhaustive", "two_stage"]:
            if method in methods:
                m = methods[method]
                time_ms = m.get("avg_search_time_ms", 0)
                print(f"{dataset:<30} {method:<12} {m.get('ndcg@10', 0):>10.4f} {m.get('mrr@10', 0):>10.4f} {time_ms:>10.2f}")

        # Calculate improvement
        if "exhaustive" in methods and "two_stage" in methods:
            ex = methods["exhaustive"]
            ts = methods["two_stage"]

            ndcg_diff = ts.get("ndcg@10", 0) - ex.get("ndcg@10", 0)
            improvements.append(ndcg_diff)

            ex_time = ex.get("avg_search_time_ms", 1)
            ts_time = ts.get("avg_search_time_ms", 1)
            if ts_time > 0:
                speedups.append(ex_time / ts_time)

        print()

    # Summary
    if improvements:
        print("-" * 72)
        print(f"Average NDCG@10 difference (two_stage - exhaustive): {np.mean(improvements):+.4f}")
        print(f"Retention rate: {100 * (1 + np.mean(improvements)):.1f}%")

    if speedups:
        print(f"Average speedup: {np.mean(speedups):.1f}x")


def analyze_stage1_recall(results: Dict[str, Dict]) -> None:
    """Analyze how well stage 1 preserves relevant documents."""
    print("\n" + "=" * 80)
    print("STAGE 1 RECALL ANALYSIS")
    print("=" * 80)
    print("\n(Stage 1 recall = how often relevant doc is in prefetch candidates)")
    print("This requires detailed results with stage1_rank info - run with --detailed")


def print_leaderboard(results: Dict[str, Dict]) -> None:
    """Print results in leaderboard format."""
    print("\n" + "=" * 80)
    print("LEADERBOARD FORMAT")
    print("=" * 80)

    # Best result per dataset
    best = {}
    for key, data in results.items():
        ds = data["dataset"].replace("_twostage", "")
        ndcg = data.get("ndcg@10", 0)
        if ds not in best or ndcg > best[ds].get("ndcg@10", 0):
            best[ds] = data

    # Compute average
    ndcg_scores = [d.get("ndcg@10", 0) for d in best.values()]
    avg = sum(ndcg_scores) / len(ndcg_scores) if ndcg_scores else 0

    print(f"\nModel: {list(results.values())[0].get('model', 'unknown')}")
    print(f"\n{'Dataset':<35} {'NDCG@10':>10}")
    print("-" * 45)

    for ds, data in sorted(best.items()):
        method_tag = " (2-stage)" if data.get("method") == "two_stage" else ""
        print(f"{ds + method_tag:<35} {data.get('ndcg@10', 0):>10.4f}")

    print("-" * 45)
    print(f"{'AVERAGE':<35} {avg:>10.4f}")


def main():
    parser = argparse.ArgumentParser(description="Analyze benchmark results")
    parser.add_argument(
        "--results", type=str, default="results",
        help="Results directory"
    )
    parser.add_argument(
        "--dirs", nargs="+",
        help="Multiple result directories to compare"
    )
    parser.add_argument(
        "--compare", action="store_true",
        help="Compare exhaustive vs two-stage"
    )
    parser.add_argument(
        "--leaderboard", action="store_true",
        help="Print in leaderboard format"
    )

    args = parser.parse_args()

    if args.dirs:
        # Compare multiple directories
        all_results = {}
        for d in args.dirs:
            results = load_all_results(Path(d))
            for k, v in results.items():
                all_results[f"{d}_{k}"] = v
        results = all_results
    else:
        results = load_all_results(Path(args.results))

    if not results:
        print("❌ No results found")
        return

    print(f"📊 Loaded {len(results)} result files")

    if args.compare or not args.leaderboard:
        compare_methods(results)

    if args.leaderboard or not args.compare:
        print_leaderboard(results)


if __name__ == "__main__":
    main()
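For reference, both `analyze_results.py` above and `prepare_submission.py` further below expect one JSON file per run in the results directory. The sketch below shows the shape inferred from the keys those loaders read; the file name, model string, and metric values are placeholders, not real results.

```python
import json
from pathlib import Path

# Example result file shaped like what load_all_results() / load_results() expect.
# All metric values are zeroed placeholders; real files are produced by run_vidore.py.
example_run = {
    "dataset": "vidore/docvqa_test_subsampled",
    "model": "example-model",      # placeholder model identifier
    "two_stage": True,
    "metrics": {
        "ndcg@5": 0.0,
        "ndcg@10": 0.0,
        "mrr@10": 0.0,
        "recall@5": 0.0,
        "recall@10": 0.0,
        "avg_search_time_ms": 0.0,
    },
}

Path("results").mkdir(exist_ok=True)
with open("results/docvqa_test_subsampled_twostage.json", "w") as f:
    json.dump(example_run, f, indent=2)
```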
benchmarks/benchmark_datasets.txt
ADDED
@@ -0,0 +1,105 @@
Option A vs Option B (Visual RAG Toolkit benchmarking datasets/protocols)

Goal
- Evaluate ColPali/ColSmol-style visual document retrieval with:
  - single-stage full late-interaction MaxSim (query tokens vs doc tokens)
  - two-stage retrieval (stage-1 cheap prefetch, stage-2 full MaxSim rerank)
- Use Qdrant-backed evaluation for "real world" behavior (ANN prefetch + vector fetch costs).


Vocabulary: DocVQA / InfoVQA / TabFQuAD / TAT-DQA / ArXivQA / SHIFT
- These names refer to "task families" / subsets in the ViDoRe benchmark.
- In this repo we currently reference them as HuggingFace datasets like:
  - vidore/docvqa_test_subsampled
  - vidore/infovqa_test_subsampled
  - vidore/tabfquad_test_subsampled
  - vidore/tatdqa_test
  - vidore/arxivqa_test_subsampled
  - vidore/shiftproject_test
- Important: These are still "ViDoRe benchmark datasets" (not completely unrelated external
  datasets), derived from those task domains.


What is "qrels mapping"?
- qrels = query relevance labels used by IR metrics (NDCG/MRR/Recall).
- Concretely: a mapping like:
    qrels[query_id] = {doc_id_1: relevance, doc_id_2: relevance, ...}
- In a correct shared-corpus benchmark, doc_id refers to an item in a corpus (pages/images),
  and query_id refers to a query/question; qrels tells which corpus items are relevant.
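- Illustrative sketch (ours, not code from this repo; the ids are invented) of the two
  shapes a metric consumes:

    # qrels: ground truth -- which corpus pages are relevant to each query.
    qrels = {
        "q_0": {"docvqa_page_17": 1},
        "q_1": {"docvqa_page_42": 1, "docvqa_page_43": 1},
    }

    # run: system output -- ranked page ids per query, best first.
    run = {
        "q_0": ["docvqa_page_03", "docvqa_page_17", "docvqa_page_99"],
        "q_1": ["docvqa_page_42", "docvqa_page_08", "docvqa_page_43"],
    }

    # A metric such as Recall@k just compares the two:
    def recall_at_k(qrels_q, ranking, k):
        relevant = {d for d, r in qrels_q.items() if r > 0}
        return len(relevant & set(ranking[:k])) / len(relevant)

    print(recall_at_k(qrels["q_1"], run["q_1"], k=2))  # 0.5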
Option A: Official ViDoRe protocol (recommended for paper comparability)
What it means
- Evaluate each ViDoRe dataset using the "official" definition of:
  - query set
  - corpus (shared across queries)
  - qrels (relevance mapping from queries to corpus items)
  - metrics (NDCG@K, MRR@K, Recall@K as reported by the benchmark)
- Results are directly comparable to other systems and can be reported as "ViDoRe benchmark results".

Why it matters
- Strongest credibility and easiest to defend in a paper.
- Minimizes reviewer skepticism about custom evaluation.

Notes about THIS repo today
- The current script `benchmarks/run_vidore.py` DOES NOT implement the official shared-corpus protocol.
  It currently constructs an artificial 1:1 regime:
    - query_id = q_{idx}
    - doc_id = d_{idx}
    - qrels[q_{idx}] = {d_{idx}: 1}
  This makes the "corpus size" equal to the number of examples and makes each doc relevant to only one query.
- For Option A, we should later update the benchmark pipeline to:
  1) load the official corpus + query split for each dataset
  2) index the corpus into Qdrant using visual-rag-toolkit indexing (vectors: initial, mean_pooling, global_pooling)
  3) run retrieval against Qdrant (not in-memory NumPy)
  4) compute official metrics using qrels

Implementation checklist for later (Option A)
- Determine the true ViDoRe dataset schema for each subset:
  - Which fields identify the query?
  - Which fields identify the relevant document/page id?
  - Is the corpus provided as a separate split/file/dataset?
- Ensure consistent doc_id keys across:
  - indexing pipeline ids
  - qrels doc ids
- Ensure we don't "leak" by indexing only the query-paired docs; we must have a shared corpus per dataset.


Option B: Custom "scale-stress" protocol (not leaderboard-comparable)
What it means
- Build a larger shared corpus to stress scalability and show the value of two-stage retrieval.
- A practical version using only ViDoRe subsets:
  - Merge corpora from multiple ViDoRe subsets into one larger corpus,
    e.g. corpus = DocVQA corpus + InfoVQA corpus + TabFQuAD corpus + ...
  - Run queries from one subset (or all subsets) against the merged corpus.
  - Keep qrels pointing to the original relevant doc ids (those docs still exist in the merged corpus).

Why it matters
- Produces "latency vs corpus size" curves and "quality retention vs prefetch_k" curves
  that better reflect production deployments (thousands → millions of pages).

Tradeoffs
- Not directly comparable to the official ViDoRe leaderboard.
- Must be explicitly described as a custom scaling experiment in the paper.

Custom qrels mapping in Option B
- If doc IDs are preserved during the corpus merge, then qrels does not need to change:
    qrels[query_id] still points to doc_id(s) that are in the merged corpus.
- If doc IDs collide between subsets, then you must namespace them:
    doc_id := "{subset_name}:{original_doc_id}"
  and update qrels doc ids accordingly.
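- Minimal sketch (ours, not repo code) of that merge + remap step under the assumption that
  per-subset corpora and qrels are already loaded as plain dicts:

    def merge_subsets(per_subset):
        """per_subset maps subset_name -> (corpus_dict, qrels_dict)."""
        merged_corpus, merged_qrels = {}, {}
        for subset, (corpus, qrels) in per_subset.items():
            # Namespace doc ids so collisions across subsets stay distinct.
            for doc_id, page in corpus.items():
                merged_corpus[f"{subset}:{doc_id}"] = page
            # Query ids are namespaced too, in case queries from several
            # subsets are evaluated against the merged corpus together.
            for query_id, rels in qrels.items():
                merged_qrels[f"{subset}:{query_id}"] = {
                    f"{subset}:{doc_id}": rel for doc_id, rel in rels.items()
                }
        return merged_corpus, merged_qrels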
Is ViDoRe "small"?
- The ViDoRe subsets used here are commonly "subsampled" (e.g., ~500 queries) for quick runs.
- Even if ViDoRe is not "tiny", it is still far smaller than a production corpus (hundreds of thousands to millions of pages).
- That's why Option B exists as an additional scaling experiment (often paired with an in-domain corpus).


Where this repo currently defines the subsets
- `visual-rag-toolkit/benchmarks/run_vidore.py` has the `VIDORE_DATASETS` mapping.
benchmarks/prepare_submission.py
ADDED
@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""
Prepare submission for ViDoRe leaderboard.

Reads evaluation results and formats them for HuggingFace submission.

Usage:
    python prepare_submission.py --results results/ --output submission.json
    python prepare_submission.py --results results/ --model-name "MyModel" --upload
"""

import argparse
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, Optional

# ViDoRe leaderboard expected datasets
VIDORE_DATASETS = {
    "docvqa_test_subsampled": "DocVQA",
    "infovqa_test_subsampled": "InfoVQA",
    "tabfquad_test_subsampled": "TabFQuAD",
    "tatdqa_test": "TAT-DQA",
    "arxivqa_test_subsampled": "ArXivQA",
    "shiftproject_test": "SHIFT",
}


def load_results(results_dir: Path) -> Dict[str, Dict[str, float]]:
    """Load all result JSON files from directory."""
    results = {}

    for json_file in results_dir.glob("*.json"):
        with open(json_file) as f:
            data = json.load(f)

        dataset = data.get("dataset", json_file.stem)
        dataset_short = dataset.split("/")[-1].replace("_twostage", "")

        results[dataset_short] = {
            "ndcg@5": data["metrics"].get("ndcg@5", 0),
            "ndcg@10": data["metrics"].get("ndcg@10", 0),
            "mrr@10": data["metrics"].get("mrr@10", 0),
            "recall@5": data["metrics"].get("recall@5", 0),
            "recall@10": data["metrics"].get("recall@10", 0),
            "two_stage": data.get("two_stage", False),
            "model": data.get("model", "unknown"),
        }

    return results


def format_submission(
    results: Dict[str, Dict],
    model_name: str,
    model_url: Optional[str] = None,
    description: Optional[str] = None,
) -> Dict[str, Any]:
    """Format results for ViDoRe leaderboard submission."""

    # Calculate average scores
    ndcg10_scores = [r["ndcg@10"] for r in results.values()]
    avg_ndcg10 = sum(ndcg10_scores) / len(ndcg10_scores) if ndcg10_scores else 0

    submission = {
        "model_name": model_name,
        "model_url": model_url or "",
        "description": description or "Visual RAG Toolkit submission",
        "submitted_at": datetime.now().isoformat(),
        "average_ndcg@10": avg_ndcg10,
        "results": {},
    }

    # Add per-dataset results
    for dataset_short, metrics in results.items():
        display_name = VIDORE_DATASETS.get(dataset_short, dataset_short)
        submission["results"][display_name] = {
            "ndcg@5": metrics["ndcg@5"],
            "ndcg@10": metrics["ndcg@10"],
            "mrr@10": metrics["mrr@10"],
        }

    return submission


def print_summary(results: Dict[str, Dict], submission: Dict[str, Any]):
    """Print summary table."""
    print("\n" + "=" * 70)
    print(f"MODEL: {submission['model_name']}")
    print("=" * 70)

    print(f"\n{'Dataset':<25} {'NDCG@5':>10} {'NDCG@10':>10} {'MRR@10':>10}")
    print("-" * 55)

    for dataset, metrics in results.items():
        display = VIDORE_DATASETS.get(dataset, dataset)[:24]
        print(f"{display:<25} {metrics['ndcg@5']:>10.4f} {metrics['ndcg@10']:>10.4f} {metrics['mrr@10']:>10.4f}")

    print("-" * 55)
    print(f"{'AVERAGE':<25} {'':<10} {submission['average_ndcg@10']:>10.4f}")
    print("=" * 70)


def upload_to_huggingface(submission: Dict[str, Any], repo_id: str = "vidore/results"):
    """Upload submission to HuggingFace."""
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("Install huggingface_hub: pip install huggingface_hub")
        return False

    api = HfApi()

    # Save to temp file
    temp_file = Path(f"/tmp/submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
    with open(temp_file, "w") as f:
        json.dump(submission, f, indent=2)

    try:
        api.upload_file(
            path_or_fileobj=str(temp_file),
            path_in_repo=f"submissions/{submission['model_name']}.json",
            repo_id=repo_id,
            repo_type="space",
        )
        print(f"✅ Uploaded to {repo_id}")
        return True
    except Exception as e:
        print(f"❌ Upload failed: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Prepare ViDoRe submission")
    parser.add_argument(
        "--results", type=str, default="results",
        help="Directory with result JSON files"
    )
    parser.add_argument(
        "--output", type=str, default="submission.json",
        help="Output submission file"
    )
    parser.add_argument(
        "--model-name", type=str, default="visual-rag-toolkit",
        help="Model name for leaderboard"
    )
    parser.add_argument(
        "--model-url", type=str,
        help="URL to model/paper"
    )
    parser.add_argument(
        "--description", type=str,
        help="Model description"
    )
    parser.add_argument(
        "--upload", action="store_true",
        help="Upload to HuggingFace"
    )

    args = parser.parse_args()

    results_dir = Path(args.results)
    if not results_dir.exists():
        print(f"❌ Results directory not found: {results_dir}")
        return

    # Load results
    results = load_results(results_dir)
    if not results:
        print(f"❌ No result files found in {results_dir}")
        return

    print(f"📊 Found {len(results)} dataset results")

    # Format submission
    submission = format_submission(
        results,
        model_name=args.model_name,
        model_url=args.model_url,
        description=args.description,
    )

    # Print summary
    print_summary(results, submission)

    # Save
    output_path = Path(args.output)
    with open(output_path, "w") as f:
        json.dump(submission, f, indent=2)
    print(f"\n💾 Saved to: {output_path}")

    # Upload if requested
    if args.upload:
        upload_to_huggingface(submission)


if __name__ == "__main__":
    main()