visual-rag-toolkit 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. benchmarks/README.md +101 -0
  2. benchmarks/__init__.py +11 -0
  3. benchmarks/analyze_results.py +187 -0
  4. benchmarks/benchmark_datasets.txt +105 -0
  5. benchmarks/prepare_submission.py +205 -0
  6. benchmarks/quick_test.py +566 -0
  7. benchmarks/run_vidore.py +513 -0
  8. benchmarks/vidore_beir_qdrant/run_qdrant_beir.py +1365 -0
  9. benchmarks/vidore_tatdqa_test/COMMANDS.md +83 -0
  10. benchmarks/vidore_tatdqa_test/__init__.py +6 -0
  11. benchmarks/vidore_tatdqa_test/dataset_loader.py +363 -0
  12. benchmarks/vidore_tatdqa_test/metrics.py +44 -0
  13. benchmarks/vidore_tatdqa_test/run_qdrant.py +799 -0
  14. benchmarks/vidore_tatdqa_test/sweep_eval.py +372 -0
  15. demo/__init__.py +10 -0
  16. demo/app.py +45 -0
  17. demo/commands.py +334 -0
  18. demo/config.py +34 -0
  19. demo/download_models.py +75 -0
  20. demo/evaluation.py +602 -0
  21. demo/example_metadata_mapping_sigir.json +37 -0
  22. demo/indexing.py +286 -0
  23. demo/qdrant_utils.py +211 -0
  24. demo/results.py +35 -0
  25. demo/test_qdrant_connection.py +119 -0
  26. demo/ui/__init__.py +15 -0
  27. demo/ui/benchmark.py +355 -0
  28. demo/ui/header.py +30 -0
  29. demo/ui/playground.py +339 -0
  30. demo/ui/sidebar.py +162 -0
  31. demo/ui/upload.py +487 -0
  32. visual_rag/__init__.py +98 -0
  33. visual_rag/cli/__init__.py +1 -0
  34. visual_rag/cli/main.py +629 -0
  35. visual_rag/config.py +230 -0
  36. visual_rag/demo_runner.py +90 -0
  37. visual_rag/embedding/__init__.py +26 -0
  38. visual_rag/embedding/pooling.py +343 -0
  39. visual_rag/embedding/visual_embedder.py +622 -0
  40. visual_rag/indexing/__init__.py +21 -0
  41. visual_rag/indexing/cloudinary_uploader.py +274 -0
  42. visual_rag/indexing/pdf_processor.py +324 -0
  43. visual_rag/indexing/pipeline.py +628 -0
  44. visual_rag/indexing/qdrant_indexer.py +478 -0
  45. visual_rag/preprocessing/__init__.py +3 -0
  46. visual_rag/preprocessing/crop_empty.py +120 -0
  47. visual_rag/qdrant_admin.py +222 -0
  48. visual_rag/retrieval/__init__.py +19 -0
  49. visual_rag/retrieval/multi_vector.py +222 -0
  50. visual_rag/retrieval/single_stage.py +126 -0
  51. visual_rag/retrieval/three_stage.py +173 -0
  52. visual_rag/retrieval/two_stage.py +471 -0
  53. visual_rag/visualization/__init__.py +19 -0
  54. visual_rag/visualization/saliency.py +335 -0
  55. visual_rag_toolkit-0.1.1.dist-info/METADATA +305 -0
  56. visual_rag_toolkit-0.1.1.dist-info/RECORD +59 -0
  57. visual_rag_toolkit-0.1.1.dist-info/WHEEL +4 -0
  58. visual_rag_toolkit-0.1.1.dist-info/entry_points.txt +3 -0
  59. visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE +22 -0
benchmarks/README.md ADDED
@@ -0,0 +1,101 @@
+ # ViDoRe Benchmark Evaluation
+
+ This directory contains scripts for evaluating visual document retrieval on the [ViDoRe benchmark](https://huggingface.co/spaces/vidore/vidore-leaderboard).
+
+ ## Quick Start
+
+ ### 1. Install Dependencies
+
+ ```bash
+ # Install visual-rag-toolkit with all dependencies
+ pip install -e ".[all]"
+
+ # Install benchmark-specific dependencies
+ pip install datasets mteb
+ ```
+
+ ### 2. Run Evaluation
+
+ ```bash
+ # Run on a single dataset
+ python benchmarks/run_vidore.py --dataset vidore/docvqa_test_subsampled
+
+ # Run on all ViDoRe datasets
+ python benchmarks/run_vidore.py --all
+
+ # With two-stage retrieval (our contribution)
+ python benchmarks/run_vidore.py --dataset vidore/docvqa_test_subsampled --two-stage
+ ```
+
+ ### 3. Submit to Leaderboard
+
+ ```bash
+ # Generate submission file
+ python benchmarks/prepare_submission.py --results results/
+
+ # Submit to HuggingFace
+ huggingface-cli login
+ huggingface-cli upload vidore/results ./submission.json
+ ```
+
+ ## ViDoRe Datasets
+
+ The benchmark includes these datasets (from the leaderboard):
+
+ | Dataset | Type | # Queries | # Documents |
+ |---------|------|-----------|-------------|
+ | docvqa_test_subsampled | DocVQA | ~500 | ~5,000 |
+ | infovqa_test_subsampled | InfoVQA | ~500 | ~5,000 |
+ | tabfquad_test_subsampled | TabFQuAD | ~500 | ~5,000 |
+ | tatdqa_test | TAT-DQA | ~1,500 | ~2,500 |
+ | arxivqa_test_subsampled | ArXivQA | ~500 | ~5,000 |
+ | shiftproject_test | SHIFT | ~500 | ~5,000 |
+
+ ## Evaluation Metrics
+
+ - **NDCG@5**: Normalized Discounted Cumulative Gain at 5
+ - **NDCG@10**: Normalized Discounted Cumulative Gain at 10
+ - **MRR@10**: Mean Reciprocal Rank at 10
+ - **Recall@5**: Recall at 5
+ - **Recall@10**: Recall at 10
+
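These are standard IR measures. As an editorial illustration (not the toolkit's own code), here is a minimal pure-Python sketch of NDCG@k and MRR@k for a single query, given a ranked list of doc ids and (binary or graded) qrels; `ndcg_at_k`, `mrr_at_k`, and the example ids are illustrative names.

```python
# Illustrative sketch: per-query NDCG@k and MRR@k from a ranking and qrels.
import math
from typing import Dict, List


def ndcg_at_k(ranked_doc_ids: List[str], qrels: Dict[str, int], k: int = 10) -> float:
    # DCG over the top-k retrieved documents
    dcg = sum(
        qrels.get(doc_id, 0) / math.log2(rank + 2)
        for rank, doc_id in enumerate(ranked_doc_ids[:k])
    )
    # Ideal DCG: relevance labels sorted best-first
    ideal = sorted(qrels.values(), reverse=True)[:k]
    idcg = sum(rel / math.log2(rank + 2) for rank, rel in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0


def mrr_at_k(ranked_doc_ids: List[str], qrels: Dict[str, int], k: int = 10) -> float:
    # Reciprocal rank of the first relevant document within the top k
    for rank, doc_id in enumerate(ranked_doc_ids[:k]):
        if qrels.get(doc_id, 0) > 0:
            return 1.0 / (rank + 1)
    return 0.0


if __name__ == "__main__":
    qrels = {"page_7": 1}                     # one relevant page for this query
    ranked = ["page_3", "page_7", "page_12"]  # system ranking
    print(ndcg_at_k(ranked, qrels), mrr_at_k(ranked, qrels))  # ~0.631, 0.5
```

Benchmark scores are these per-query values averaged over the query set.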
+ ## Two-Stage Retrieval (Our Contribution)
+
+ Our key contribution is efficient two-stage retrieval (a code sketch of the scoring logic follows the example command below):
+
+ ```
+ Stage 1: Fast prefetch with tile-level pooled vectors
+          Uses HNSW index for O(log N) retrieval
+
+ Stage 2: Exact MaxSim reranking on top-K candidates
+          Full multi-vector scoring for precision
+ ```
+
+ This provides:
+ - **5-10x speedup** over full MaxSim at scale
+ - **95%+ accuracy** compared to exhaustive search
+ - **Memory efficient** (avoids loading all embeddings up front)
+
+ To evaluate with two-stage retrieval:
+
+ ```bash
+ python benchmarks/run_vidore.py \
+     --dataset vidore/docvqa_test_subsampled \
+     --two-stage \
+     --prefetch-k 200 \
+     --top-k 10
+ ```
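To make the stage-1/stage-2 split concrete, here is a minimal in-memory NumPy sketch of the scoring logic only. The names (`maxsim`, `prefetch_then_rerank`) and the toy shapes are illustrative, not the toolkit's API; the packaged implementation (see `visual_rag/retrieval/two_stage.py` in the file list) serves stage 1 from a Qdrant HNSW index over pooled vectors rather than a Python dict.

```python
# Minimal sketch (assumed names): pooled-vector prefetch, then exact MaxSim rerank.
import numpy as np


def maxsim(query_tokens: np.ndarray, doc_tokens: np.ndarray) -> float:
    # Late interaction: each query token takes its best-matching doc token,
    # then scores are summed. Shapes: (Q, D) and (T, D), rows L2-normalized.
    return float((query_tokens @ doc_tokens.T).max(axis=1).sum())


def prefetch_then_rerank(query_tokens, pooled_index, token_index, prefetch_k=200, top_k=10):
    # Stage 1: cheap prefetch with one pooled vector per page (stands in for HNSW).
    query_pooled = query_tokens.mean(axis=0)
    query_pooled /= np.linalg.norm(query_pooled)
    scores = {doc_id: float(vec @ query_pooled) for doc_id, vec in pooled_index.items()}
    candidates = sorted(scores, key=scores.get, reverse=True)[:prefetch_k]

    # Stage 2: exact MaxSim over the surviving candidates only.
    reranked = sorted(
        candidates,
        key=lambda doc_id: maxsim(query_tokens, token_index[doc_id]),
        reverse=True,
    )
    return reranked[:top_k]


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    def unit(x): return x / np.linalg.norm(x, axis=-1, keepdims=True)
    # Toy corpus: 50 pages with random token embeddings (shapes are arbitrary here).
    token_index = {f"page_{i}": unit(rng.normal(size=(730, 128))) for i in range(50)}
    pooled_index = {d: unit(t.mean(axis=0)) for d, t in token_index.items()}
    query = unit(rng.normal(size=(16, 128)))
    print(prefetch_then_rerank(query, pooled_index, token_index, prefetch_k=20, top_k=5))
```

Only the `prefetch_k` candidates that survive stage 1 ever have their full token matrices scored, which is where the reported speedup at scale comes from.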
+
+ ## Files
+
+ - `run_vidore.py` - Main evaluation script
+ - `prepare_submission.py` - Generate leaderboard submission
+ - `analyze_results.py` - Analyze and compare results
benchmarks/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """
+ Benchmark utilities and dataset loaders used by the demo UI.
+
+ Note: The `benchmarks/` folder is primarily for research/evaluation scripts, but the
+ Streamlit demo imports some loaders/metrics from here. Making this directory a
+ package (via `__init__.py`) ensures imports like `benchmarks.vidore_tatdqa_test`
+ work in Docker/Spaces environments.
+ """
+
+ __all__ = []
benchmarks/analyze_results.py ADDED
@@ -0,0 +1,187 @@
+ #!/usr/bin/env python3
+ """
+ Analyze and compare benchmark results.
+
+ Usage:
+     # Compare exhaustive vs two-stage
+     python analyze_results.py --results results/
+
+     # Compare multiple models
+     python analyze_results.py --dirs results_colsmol/ results_colpali/
+ """
+
+ import argparse
+ import json
+ from pathlib import Path
+ from typing import Dict, List, Any
+
+ import numpy as np
+
+
+ def load_all_results(results_dir: Path) -> Dict[str, Dict]:
+     """Load all result files from directory."""
+     results = {}
+     for f in results_dir.glob("*.json"):
+         with open(f) as fp:
+             data = json.load(fp)
+
+         # Key by dataset + method
+         dataset = data.get("dataset", f.stem).split("/")[-1]
+         method = "two_stage" if data.get("two_stage") else "exhaustive"
+         key = f"{dataset}_{method}"
+
+         results[key] = {
+             "dataset": dataset,
+             "method": method,
+             "model": data.get("model", "unknown"),
+             **data.get("metrics", {}),
+         }
+     return results
+
+
+ def compare_methods(results: Dict[str, Dict]) -> None:
+     """Compare exhaustive vs two-stage on the same datasets."""
+
+     # Group by dataset
+     datasets = {}
+     for key, data in results.items():
+         ds = data["dataset"].replace("_twostage", "")
+         if ds not in datasets:
+             datasets[ds] = {}
+         datasets[ds][data["method"]] = data
+
+     print("\n" + "=" * 80)
+     print("EXHAUSTIVE vs TWO-STAGE COMPARISON")
+     print("=" * 80)
+
+     print(f"\n{'Dataset':<30} {'Method':<12} {'NDCG@10':>10} {'MRR@10':>10} {'Time(ms)':>10}")
+     print("-" * 72)
+
+     improvements = []
+     speedups = []
+
+     for dataset, methods in sorted(datasets.items()):
+         for method in ["exhaustive", "two_stage"]:
+             if method in methods:
+                 m = methods[method]
+                 time_ms = m.get("avg_search_time_ms", 0)
+                 print(f"{dataset:<30} {method:<12} {m.get('ndcg@10', 0):>10.4f} {m.get('mrr@10', 0):>10.4f} {time_ms:>10.2f}")
+
+         # Calculate improvement
+         if "exhaustive" in methods and "two_stage" in methods:
+             ex = methods["exhaustive"]
+             ts = methods["two_stage"]
+
+             ndcg_diff = ts.get("ndcg@10", 0) - ex.get("ndcg@10", 0)
+             improvements.append(ndcg_diff)
+
+             ex_time = ex.get("avg_search_time_ms", 1)
+             ts_time = ts.get("avg_search_time_ms", 1)
+             if ts_time > 0:
+                 speedups.append(ex_time / ts_time)
+
+         print()
+
+     # Summary
+     if improvements:
+         print("-" * 72)
+         print(f"Average NDCG@10 difference (two_stage - exhaustive): {np.mean(improvements):+.4f}")
+         print(f"Retention rate: {100 * (1 + np.mean(improvements)):.1f}%")
+
+     if speedups:
+         print(f"Average speedup: {np.mean(speedups):.1f}x")
+
+
+ def analyze_stage1_recall(results: Dict[str, Dict]) -> None:
+     """Analyze how well stage 1 preserves relevant documents."""
+     print("\n" + "=" * 80)
+     print("STAGE 1 RECALL ANALYSIS")
+     print("=" * 80)
+     print("\n(Stage 1 recall = how often relevant doc is in prefetch candidates)")
+     print("This requires detailed results with stage1_rank info - run with --detailed")
+
+
+ def print_leaderboard(results: Dict[str, Dict]) -> None:
+     """Print results in leaderboard format."""
+     print("\n" + "=" * 80)
+     print("LEADERBOARD FORMAT")
+     print("=" * 80)
+
+     # Best result per dataset
+     best = {}
+     for key, data in results.items():
+         ds = data["dataset"].replace("_twostage", "")
+         ndcg = data.get("ndcg@10", 0)
+         if ds not in best or ndcg > best[ds].get("ndcg@10", 0):
+             best[ds] = data
+
+     # Compute average
+     ndcg_scores = [d.get("ndcg@10", 0) for d in best.values()]
+     avg = sum(ndcg_scores) / len(ndcg_scores) if ndcg_scores else 0
+
+     print(f"\nModel: {list(results.values())[0].get('model', 'unknown')}")
+     print(f"\n{'Dataset':<35} {'NDCG@10':>10}")
+     print("-" * 45)
+
+     for ds, data in sorted(best.items()):
+         method_tag = " (2-stage)" if data.get("method") == "two_stage" else ""
+         print(f"{ds + method_tag:<35} {data.get('ndcg@10', 0):>10.4f}")
+
+     print("-" * 45)
+     print(f"{'AVERAGE':<35} {avg:>10.4f}")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Analyze benchmark results")
+     parser.add_argument(
+         "--results", type=str, default="results",
+         help="Results directory"
+     )
+     parser.add_argument(
+         "--dirs", nargs="+",
+         help="Multiple result directories to compare"
+     )
+     parser.add_argument(
+         "--compare", action="store_true",
+         help="Compare exhaustive vs two-stage"
+     )
+     parser.add_argument(
+         "--leaderboard", action="store_true",
+         help="Print in leaderboard format"
+     )
+
+     args = parser.parse_args()
+
+     if args.dirs:
+         # Compare multiple directories
+         all_results = {}
+         for d in args.dirs:
+             results = load_all_results(Path(d))
+             for k, v in results.items():
+                 all_results[f"{d}_{k}"] = v
+         results = all_results
+     else:
+         results = load_all_results(Path(args.results))
+
+     if not results:
+         print("āŒ No results found")
+         return
+
+     print(f"šŸ“Š Loaded {len(results)} result files")
+
+     if args.compare or not args.leaderboard:
+         compare_methods(results)
+
+     if args.leaderboard or not args.compare:
+         print_leaderboard(results)
+
+
+ if __name__ == "__main__":
+     main()
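For reference, `load_all_results()` above only reads a few top-level keys from each result JSON. A minimal file the script can consume might look like the following; the model name and all numeric values are placeholders, not real benchmark numbers.

```python
# Illustrative only: the minimal per-file shape analyze_results.py expects.
import json

example_result = {
    "dataset": "vidore/docvqa_test_subsampled",
    "model": "my-visual-retriever",   # placeholder model identifier
    "two_stage": True,                # False (or absent) => treated as "exhaustive"
    "metrics": {
        "ndcg@10": 0.0,               # placeholder values
        "mrr@10": 0.0,
        "avg_search_time_ms": 0.0,
    },
}

print(json.dumps(example_result, indent=2))
```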
benchmarks/benchmark_datasets.txt ADDED
@@ -0,0 +1,105 @@
+ Option A vs Option B (Visual RAG Toolkit benchmarking datasets/protocols)
+
+ Goal
+ - Evaluate ColPali/ColSmol-style visual document retrieval with:
+   - single-stage full late-interaction MaxSim (query tokens vs doc tokens)
+   - two-stage retrieval (stage-1 cheap prefetch, stage-2 full MaxSim rerank)
+ - Use Qdrant-backed evaluation for "real world" behavior (ANN prefetch + vector fetch costs).
+
+
+ Vocabulary: DocVQA / InfoVQA / TabFQuAD / TAT-DQA / ArXivQA / SHIFT
+ - These names refer to "task families" / subsets in the ViDoRe benchmark.
+ - In this repo we currently reference them as HuggingFace datasets like:
+   - vidore/docvqa_test_subsampled
+   - vidore/infovqa_test_subsampled
+   - vidore/tabfquad_test_subsampled
+   - vidore/tatdqa_test
+   - vidore/arxivqa_test_subsampled
+   - vidore/shiftproject_test
+ - Important: these are still ViDoRe benchmark datasets (not unrelated external datasets);
+   they are derived from those task domains.
+
+
+ What is a "qrels mapping"?
+ - qrels = query relevance labels used by IR metrics (NDCG/MRR/Recall).
+ - Concretely, a mapping like:
+     qrels[query_id] = {doc_id_1: relevance, doc_id_2: relevance, ...}
+ - In a correct shared-corpus benchmark, doc_id refers to an item in a corpus (pages/images)
+   and query_id refers to a query/question; qrels tells which corpus items are relevant to each query.
+
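As an editorial illustration of the qrels structure described above (the ids and the `recall_at_k` helper are hypothetical, not part of the toolkit):

```python
# Hypothetical qrels plus a retrieval "run", and Recall@k computed from them.
from typing import Dict, List

# qrels: for each query, which corpus items are relevant (graded relevance, here binary)
qrels: Dict[str, Dict[str, int]] = {
    "q_001": {"doc_017": 1},
    "q_002": {"doc_003": 1, "doc_942": 1},
}

# run: the ranked doc ids the system returned per query
run: Dict[str, List[str]] = {
    "q_001": ["doc_120", "doc_017", "doc_555"],
    "q_002": ["doc_003", "doc_001", "doc_942"],
}


def recall_at_k(run, qrels, k: int = 5) -> float:
    """Fraction of relevant docs appearing in the top-k, averaged over queries."""
    per_query = []
    for qid, relevant in qrels.items():
        retrieved = set(run.get(qid, [])[:k])
        per_query.append(len(retrieved & set(relevant)) / len(relevant))
    return sum(per_query) / len(per_query) if per_query else 0.0


print(recall_at_k(run, qrels, k=2))  # 0.75: q_001 -> 1/1, q_002 -> 1/2
```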
+
+ Option A: Official ViDoRe protocol (recommended for paper comparability)
+ What it means
+ - Evaluate each ViDoRe dataset using the "official" definition of:
+   - query set
+   - corpus (shared across queries)
+   - qrels (relevance mapping from queries to corpus items)
+   - metrics (NDCG@K, MRR@K, Recall@K as reported by the benchmark)
+ - Results are directly comparable to other systems and can be reported as "ViDoRe benchmark results".
+
+ Why it matters
+ - Strongest credibility and easiest to defend in a paper.
+ - Minimizes reviewer skepticism about custom evaluation.
+
+ Notes about THIS repo today
+ - The current script `benchmarks/run_vidore.py` DOES NOT implement the official shared-corpus protocol.
+   It currently constructs an artificial 1:1 regime:
+     - query_id = q_{idx}
+     - doc_id = d_{idx}
+     - qrels[q_{idx}] = {d_{idx}: 1}
+   This makes the "corpus size" equal to the number of examples and makes each doc relevant to exactly one query.
+ - For Option A, we should later update the benchmark pipeline to:
+   1) load the official corpus + query split for each dataset
+   2) index the corpus into Qdrant using visual-rag-toolkit indexing (vectors: initial, mean_pooling, global_pooling)
+   3) run retrieval against Qdrant (not in-memory NumPy)
+   4) compute official metrics using qrels
+
+ Implementation checklist for later (Option A)
+ - Determine the true ViDoRe dataset schema for each subset:
+   - Which fields identify the query?
+   - Which fields identify the relevant document/page id?
+   - Is the corpus provided as a separate split/file/dataset?
+ - Ensure consistent doc_id keys across:
+   - indexing pipeline ids
+   - qrels doc ids
+ - Ensure we don't index only the documents paired with queries ("leakage"); each dataset needs a shared corpus.
+
+
+ Option B: Custom "scale-stress" protocol (not leaderboard-comparable)
+ What it means
+ - Build a larger shared corpus to stress scalability and show the value of two-stage retrieval.
+ - A practical version using only ViDoRe subsets:
+   - Merge corpora from multiple ViDoRe subsets into one larger corpus,
+     e.g. corpus = DocVQA corpus + InfoVQA corpus + TabFQuAD corpus + ...
+   - Run queries from one subset (or all subsets) against the merged corpus.
+   - Keep qrels pointing to the original relevant doc ids (those docs still exist in the merged corpus).
+
+ Why it matters
+ - Produces "latency vs corpus size" curves and "quality retention vs prefetch_k" curves
+   that better reflect production deployments (thousands → millions of pages).
+
+ Tradeoffs
+ - Not directly comparable to the official ViDoRe leaderboard.
+ - Must be explicitly described as a custom scaling experiment in the paper.
+
+ Custom qrels mapping in Option B
+ - If doc IDs are preserved during the corpus merge, qrels does not need to change:
+   qrels[query_id] still points to doc_id(s) that exist in the merged corpus.
+ - If doc IDs collide between subsets, you must namespace them (sketched below):
+     doc_id := "{subset_name}:{original_doc_id}"
+   and update the qrels doc ids accordingly.
+
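A minimal, editorial sketch of the Option B merge with namespaced ids, assuming per-subset corpora and qrels are already loaded as dicts; `merge_subsets` is an illustrative name, not a toolkit function. Query ids are namespaced too, which matters if queries from several subsets are run together:

```python
# Illustrative Option B merge: namespace ids per subset so they cannot collide,
# and rewrite qrels to use the namespaced doc ids.
from typing import Dict, Tuple


def merge_subsets(
    subsets: Dict[str, Tuple[Dict[str, object], Dict[str, Dict[str, int]]]],
) -> Tuple[Dict[str, object], Dict[str, Dict[str, int]]]:
    """subsets maps subset_name -> (corpus, qrels); returns (merged_corpus, merged_qrels)."""
    merged_corpus: Dict[str, object] = {}
    merged_qrels: Dict[str, Dict[str, int]] = {}
    for subset_name, (corpus, qrels) in subsets.items():
        for doc_id, page in corpus.items():
            merged_corpus[f"{subset_name}:{doc_id}"] = page
        for query_id, rels in qrels.items():
            merged_qrels[f"{subset_name}:{query_id}"] = {
                f"{subset_name}:{doc_id}": rel for doc_id, rel in rels.items()
            }
    return merged_corpus, merged_qrels


# Toy example: two subsets whose doc ids would otherwise collide on "d_0".
subsets = {
    "docvqa": ({"d_0": "page-image-A"}, {"q_0": {"d_0": 1}}),
    "infovqa": ({"d_0": "page-image-B"}, {"q_0": {"d_0": 1}}),
}
corpus, qrels = merge_subsets(subsets)
print(sorted(corpus))        # ['docvqa:d_0', 'infovqa:d_0']
print(qrels["docvqa:q_0"])   # {'docvqa:d_0': 1}
```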
+
+ Is ViDoRe "small"?
+ - The ViDoRe subsets used here are commonly "subsampled" (e.g., ~500 queries) for quick runs.
+ - Even if ViDoRe is not "tiny", it is still far smaller than a production corpus (hundreds of thousands to millions of pages).
+ - That's why Option B exists as an additional scaling experiment (often paired with an in-domain corpus).
+
+
+ Where this repo currently defines the subsets
+ - `visual-rag-toolkit/benchmarks/run_vidore.py` has the `VIDORE_DATASETS` mapping.
benchmarks/prepare_submission.py ADDED
@@ -0,0 +1,205 @@
+ #!/usr/bin/env python3
+ """
+ Prepare submission for ViDoRe leaderboard.
+
+ Reads evaluation results and formats them for HuggingFace submission.
+
+ Usage:
+     python prepare_submission.py --results results/ --output submission.json
+     python prepare_submission.py --results results/ --model-name "MyModel" --upload
+ """
+
+ import argparse
+ import json
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Dict, Any, Optional
+
+ # ViDoRe leaderboard expected datasets
+ VIDORE_DATASETS = {
+     "docvqa_test_subsampled": "DocVQA",
+     "infovqa_test_subsampled": "InfoVQA",
+     "tabfquad_test_subsampled": "TabFQuAD",
+     "tatdqa_test": "TAT-DQA",
+     "arxivqa_test_subsampled": "ArXivQA",
+     "shiftproject_test": "SHIFT",
+ }
+
+
+ def load_results(results_dir: Path) -> Dict[str, Dict[str, float]]:
+     """Load all result JSON files from directory."""
+     results = {}
+
+     for json_file in results_dir.glob("*.json"):
+         with open(json_file) as f:
+             data = json.load(f)
+
+         dataset = data.get("dataset", json_file.stem)
+         dataset_short = dataset.split("/")[-1].replace("_twostage", "")
+
+         results[dataset_short] = {
+             "ndcg@5": data["metrics"].get("ndcg@5", 0),
+             "ndcg@10": data["metrics"].get("ndcg@10", 0),
+             "mrr@10": data["metrics"].get("mrr@10", 0),
+             "recall@5": data["metrics"].get("recall@5", 0),
+             "recall@10": data["metrics"].get("recall@10", 0),
+             "two_stage": data.get("two_stage", False),
+             "model": data.get("model", "unknown"),
+         }
+
+     return results
+
+
+ def format_submission(
+     results: Dict[str, Dict],
+     model_name: str,
+     model_url: Optional[str] = None,
+     description: Optional[str] = None,
+ ) -> Dict[str, Any]:
+     """Format results for ViDoRe leaderboard submission."""
+
+     # Calculate average scores
+     ndcg10_scores = [r["ndcg@10"] for r in results.values()]
+     avg_ndcg10 = sum(ndcg10_scores) / len(ndcg10_scores) if ndcg10_scores else 0
+
+     submission = {
+         "model_name": model_name,
+         "model_url": model_url or "",
+         "description": description or "Visual RAG Toolkit submission",
+         "submitted_at": datetime.now().isoformat(),
+         "average_ndcg@10": avg_ndcg10,
+         "results": {},
+     }
+
+     # Add per-dataset results
+     for dataset_short, metrics in results.items():
+         display_name = VIDORE_DATASETS.get(dataset_short, dataset_short)
+         submission["results"][display_name] = {
+             "ndcg@5": metrics["ndcg@5"],
+             "ndcg@10": metrics["ndcg@10"],
+             "mrr@10": metrics["mrr@10"],
+         }
+
+     return submission
+
+
+ def print_summary(results: Dict[str, Dict], submission: Dict[str, Any]):
+     """Print summary table."""
+     print("\n" + "=" * 70)
+     print(f"MODEL: {submission['model_name']}")
+     print("=" * 70)
+
+     print(f"\n{'Dataset':<25} {'NDCG@5':>10} {'NDCG@10':>10} {'MRR@10':>10}")
+     print("-" * 55)
+
+     for dataset, metrics in results.items():
+         display = VIDORE_DATASETS.get(dataset, dataset)[:24]
+         print(f"{display:<25} {metrics['ndcg@5']:>10.4f} {metrics['ndcg@10']:>10.4f} {metrics['mrr@10']:>10.4f}")
+
+     print("-" * 55)
+     print(f"{'AVERAGE':<25} {'':<10} {submission['average_ndcg@10']:>10.4f}")
+     print("=" * 70)
+
+
+ def upload_to_huggingface(submission: Dict[str, Any], repo_id: str = "vidore/results"):
+     """Upload submission to HuggingFace."""
+     try:
+         from huggingface_hub import HfApi
+     except ImportError:
+         print("Install huggingface_hub: pip install huggingface_hub")
+         return False
+
+     api = HfApi()
+
+     # Save to temp file
+     temp_file = Path(f"/tmp/submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
+     with open(temp_file, "w") as f:
+         json.dump(submission, f, indent=2)
+
+     try:
+         api.upload_file(
+             path_or_fileobj=str(temp_file),
+             path_in_repo=f"submissions/{submission['model_name']}.json",
+             repo_id=repo_id,
+             repo_type="space",
+         )
+         print(f"āœ… Uploaded to {repo_id}")
+         return True
+     except Exception as e:
+         print(f"āŒ Upload failed: {e}")
+         return False
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Prepare ViDoRe submission")
+     parser.add_argument(
+         "--results", type=str, default="results",
+         help="Directory with result JSON files"
+     )
+     parser.add_argument(
+         "--output", type=str, default="submission.json",
+         help="Output submission file"
+     )
+     parser.add_argument(
+         "--model-name", type=str, default="visual-rag-toolkit",
+         help="Model name for leaderboard"
+     )
+     parser.add_argument(
+         "--model-url", type=str,
+         help="URL to model/paper"
+     )
+     parser.add_argument(
+         "--description", type=str,
+         help="Model description"
+     )
+     parser.add_argument(
+         "--upload", action="store_true",
+         help="Upload to HuggingFace"
+     )
+
+     args = parser.parse_args()
+
+     results_dir = Path(args.results)
+     if not results_dir.exists():
+         print(f"āŒ Results directory not found: {results_dir}")
+         return
+
+     # Load results
+     results = load_results(results_dir)
+     if not results:
+         print(f"āŒ No result files found in {results_dir}")
+         return
+
+     print(f"šŸ“Š Found {len(results)} dataset results")
+
+     # Format submission
+     submission = format_submission(
+         results,
+         model_name=args.model_name,
+         model_url=args.model_url,
+         description=args.description,
+     )
+
+     # Print summary
+     print_summary(results, submission)
+
+     # Save
+     output_path = Path(args.output)
+     with open(output_path, "w") as f:
+         json.dump(submission, f, indent=2)
+     print(f"\nšŸ’¾ Saved to: {output_path}")
+
+     # Upload if requested
+     if args.upload:
+         upload_to_huggingface(submission)
+
+
+ if __name__ == "__main__":
+     main()
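For orientation, the submission object built by `format_submission()` above has roughly the following shape; the dataset display names come from `VIDORE_DATASETS`, and every number below is a placeholder rather than a reported result.

```python
# Illustrative shape of submission.json as produced by format_submission(); placeholder values only.
example_submission = {
    "model_name": "visual-rag-toolkit",
    "model_url": "",
    "description": "Visual RAG Toolkit submission",
    "submitted_at": "2025-01-01T00:00:00",
    "average_ndcg@10": 0.0,
    "results": {
        "DocVQA": {"ndcg@5": 0.0, "ndcg@10": 0.0, "mrr@10": 0.0},
        "TAT-DQA": {"ndcg@5": 0.0, "ndcg@10": 0.0, "mrr@10": 0.0},
        # ... one entry per dataset found in the results directory
    },
}
```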