mteb 2.6.9__py3-none-any.whl → 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_evaluators/retrieval_metrics.py +1 -1
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/text/reranking.py +1 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +66 -10
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +13 -7
- mteb/models/model_implementations/nvidia_models.py +58 -5
- mteb/models/model_implementations/voyage_models.py +84 -0
- mteb/models/model_implementations/voyage_v.py +5 -3
- mteb/models/model_meta.py +1 -1
- mteb/models/sentence_transformer_wrapper.py +16 -3
- mteb/models/vllm_wrapper.py +327 -0
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- {mteb-2.6.9.dist-info → mteb-2.7.1.dist-info}/METADATA +5 -1
- {mteb-2.6.9.dist-info → mteb-2.7.1.dist-info}/RECORD +25 -20
- {mteb-2.6.9.dist-info → mteb-2.7.1.dist-info}/WHEEL +0 -0
- {mteb-2.6.9.dist-info → mteb-2.7.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.6.9.dist-info → mteb-2.7.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.6.9.dist-info → mteb-2.7.1.dist-info}/top_level.txt +0 -0
mteb/_evaluators/retrieval_metrics.py
CHANGED

@@ -140,7 +140,7 @@ def calculate_pmrr(original_run, new_run, changed_qrels):
     changes = []
     for qid in changed_qrels.keys():
         if qid + "-og" not in original_run or qid + "-changed" not in new_run:
-
+            logger.warning(f"Query {qid} not found in the runs for calculating p-MRR")
             continue
         original_qid_run = original_run[qid + "-og"]
         new_qid_run = new_run[qid + "-changed"]
mteb/abstasks/retrieval_dataset_loaders.py
CHANGED

@@ -136,7 +136,7 @@ class RetrievalDatasetLoader:
             "_id", "id"
         )
         logger.info("Loaded %d %s Documents.", len(corpus_ds), self.split.upper())
-        logger.
+        logger.debug("Doc Example: %s", corpus_ds[0])
         return corpus_ds

     def _load_queries(self) -> QueryDatasetType:
@@ -152,7 +152,7 @@ class RetrievalDatasetLoader:
         )

         logger.info("Loaded %d %s queries.", len(queries_ds), self.split.upper())
-        logger.
+        logger.debug("Query Example: %s", queries_ds[0])

         return queries_ds

mteb/abstasks/text/reranking.py
CHANGED
mteb/benchmarks/benchmarks/__init__.py
CHANGED

@@ -6,6 +6,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     BUILT_MTEB,
     C_MTEB,
     CHEMTEB,
+    CHEMTEB_V1_1,
     CODE_RAG,
     ENCODECHKA,
     FA_MTEB,
@@ -70,6 +71,7 @@ __all__ = [
     "BRIGHT_LONG",
     "BUILT_MTEB",
     "CHEMTEB",
+    "CHEMTEB_V1_1",
     "CODE_RAG",
     "C_MTEB",
     "ENCODECHKA",
mteb/benchmarks/benchmarks/benchmarks.py
CHANGED

@@ -1656,6 +1656,7 @@ FA_MTEB_2 = Benchmark(

 CHEMTEB = Benchmark(
     name="ChemTEB",
+    aliases=["ChemTEB(v1)"],
     display_name="Chemical",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-purge.svg",
     tasks=get_tasks(
@@ -1701,6 +1702,62 @@ CHEMTEB = Benchmark(
     """,
 )

+CHEMTEB_V1_1 = Benchmark(
+    name="ChemTEB(v1.1)",
+    aliases=["ChemTEB(latest)"],
+    display_name="Chemical",
+    icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-purge.svg",
+    tasks=get_tasks(
+        tasks=[
+            "PubChemSMILESBitextMining",
+            "SDSEyeProtectionClassification",
+            "SDSGlovesClassification",
+            "WikipediaBioMetChemClassification",
+            "WikipediaGreenhouseEnantiopureClassification",
+            "WikipediaSolidStateColloidalClassification",
+            "WikipediaOrganicInorganicClassification",
+            "WikipediaCryobiologySeparationClassification",
+            "WikipediaChemistryTopicsClassification",
+            "WikipediaTheoreticalAppliedClassification",
+            "WikipediaChemFieldsClassification",
+            "WikipediaLuminescenceClassification",
+            "WikipediaIsotopesFissionClassification",
+            "WikipediaSaltsSemiconductorsClassification",
+            "WikipediaBiolumNeurochemClassification",
+            "WikipediaCrystallographyAnalyticalClassification",
+            "WikipediaCompChemSpectroscopyClassification",
+            "WikipediaChemEngSpecialtiesClassification",
+            "WikipediaChemistryTopicsClustering",
+            "WikipediaSpecialtiesInChemistryClustering",
+            "PubChemAISentenceParaphrasePC",
+            "PubChemSMILESPC",
+            "PubChemSynonymPC",
+            "PubChemWikiParagraphsPC",
+            "PubChemWikiPairClassification",
+            "ChemNQRetrieval",
+            "ChemHotpotQARetrieval",
+            "ChemRxivRetrieval",
+        ],
+    ),
+    description="ChemTEB evaluates the performance of text embedding models on chemical domain data. This version adds the ChemRxivRetrieval task.",
+    reference="https://arxiv.org/abs/2412.00532",
+    citation=r"""
+@article{kasmaee2024chemteb,
+  author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+  journal = {arXiv preprint arXiv:2412.00532},
+  title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain},
+  year = {2024},
+}
+
+@article{kasmaee2025chembed,
+  author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Astaraki, Mahdi and Saloot, Mohammad Arshi and Sherck, Nicholas and Mahyar, Hamidreza and Samiee, Soheila},
+  journal = {arXiv preprint arXiv:2508.01643},
+  title = {Chembed: Enhancing chemical literature search through domain-specific text embeddings},
+  year = {2025},
+}
+""",
+)
+
 BEIR_NL = Benchmark(
     name="BEIR-NL",
     display_name="BEIR-NL",
@@ -2350,17 +2407,16 @@ VIDORE_V3 = VidoreBenchmark(
         ]
     ),
     description="ViDoRe V3 sets a new industry gold standard for multi-modal, enterprise document visual retrieval evaluation. It addresses a critical challenge in production RAG systems: retrieving accurate information from complex, visually-rich documents. The benchmark includes both open and closed datasets: to submit results on private tasks, please [open an issue](https://github.com/embeddings-benchmark/mteb/issues?template=eval_request.yaml).",
-    reference="https://
+    reference="https://arxiv.org/abs/2601.08620",
     citation=r"""
-@
-
-
-
-
-
-
-
-  year = {2025},
+@article{loison2026vidorev3comprehensiveevaluation,
+  archiveprefix = {arXiv},
+  author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
+  eprint = {2601.08620},
+  primaryclass = {cs.AI},
+  title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
+  url = {https://arxiv.org/abs/2601.08620},
+  year = {2026},
 }
 """,
 )
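The hunks above register ChemTEB(v1.1) as a second chemical benchmark while the original keeps the "ChemTEB(v1)" alias. A minimal usage sketch, assuming the mteb 2.x `get_benchmark`/`get_model`/`evaluate` entry points; the MiniLM model name is only an illustration and is not part of this diff:

    import mteb

    # Select the new benchmark by its registered name (or via its "ChemTEB(latest)" alias).
    benchmark = mteb.get_benchmark("ChemTEB(v1.1)")

    # Any registered model works here; this one is just a placeholder.
    model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
    results = mteb.evaluate(model, tasks=benchmark.tasks)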
mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json
ADDED

@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 74457,
+        "number_of_characters": 76109543,
+        "documents_text_statistics": {
+            "total_text_length": 75549698,
+            "min_text_length": 121,
+            "average_text_length": 1087.7189916063176,
+            "max_text_length": 25438,
+            "unique_texts": 69150
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 559845,
+            "min_text_length": 57,
+            "average_text_length": 111.969,
+            "max_text_length": 224,
+            "unique_texts": 5000
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 5000,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.0,
+            "max_relevant_docs_per_query": 1,
+            "unique_relevant_docs": 5000
+        },
+        "top_ranked_statistics": null
+    }
+}
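The new descriptive-stats file records 74,457 samples for the test split, and the figures are internally consistent if num_samples is read as documents plus queries. A small sketch of that arithmetic (the variable names are mine; only the numbers come from the JSON above):

    num_samples = 74457
    num_queries = 5000
    num_documents = num_samples - num_queries      # 69457 documents

    avg_doc_len = 75549698 / num_documents         # ~1087.72, matching average_text_length
    avg_query_len = 559845 / num_queries           # 111.969, matching average_text_length
    print(num_documents, round(avg_doc_len, 2), avg_query_len)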
mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json
ADDED

@@ -0,0 +1,116 @@
+{
+    "test": {
+        "num_samples": 30300,
+        "number_of_characters": 17320243,
+        "documents_text_statistics": {
+            "total_text_length": 17276572,
+            "min_text_length": 316,
+            "average_text_length": 575.8857333333333,
+            "max_text_length": 1008,
+            "unique_texts": 28361
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 43671,
+            "min_text_length": 67,
+            "average_text_length": 145.57,
+            "max_text_length": 345,
+            "unique_texts": 300
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 300,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.0,
+            "max_relevant_docs_per_query": 1,
+            "unique_relevant_docs": 300
+        },
+        "top_ranked_statistics": null,
+        "hf_subset_descriptive_stats": {
+            "en": {
+                "num_samples": 10100,
+                "number_of_characters": 5517678,
+                "documents_text_statistics": {
+                    "total_text_length": 5503635,
+                    "min_text_length": 316,
+                    "average_text_length": 550.3635,
+                    "max_text_length": 726,
+                    "unique_texts": 9422
+                },
+                "documents_image_statistics": null,
+                "queries_text_statistics": {
+                    "total_text_length": 14043,
+                    "min_text_length": 68,
+                    "average_text_length": 140.43,
+                    "max_text_length": 305,
+                    "unique_texts": 100
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 100,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 100
+                },
+                "top_ranked_statistics": null
+            },
+            "fi": {
+                "num_samples": 10100,
+                "number_of_characters": 5953462,
+                "documents_text_statistics": {
+                    "total_text_length": 5938809,
+                    "min_text_length": 326,
+                    "average_text_length": 593.8809,
+                    "max_text_length": 1008,
+                    "unique_texts": 9422
+                },
+                "documents_image_statistics": null,
+                "queries_text_statistics": {
+                    "total_text_length": 14653,
+                    "min_text_length": 67,
+                    "average_text_length": 146.53,
+                    "max_text_length": 345,
+                    "unique_texts": 100
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 100,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 100
+                },
+                "top_ranked_statistics": null
+            },
+            "pt": {
+                "num_samples": 10100,
+                "number_of_characters": 5849103,
+                "documents_text_statistics": {
+                    "total_text_length": 5834128,
+                    "min_text_length": 325,
+                    "average_text_length": 583.4128,
+                    "max_text_length": 774,
+                    "unique_texts": 9517
+                },
+                "documents_image_statistics": null,
+                "queries_text_statistics": {
+                    "total_text_length": 14975,
+                    "min_text_length": 69,
+                    "average_text_length": 149.75,
+                    "max_text_length": 320,
+                    "unique_texts": 100
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 100,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 100
+                },
+                "top_ranked_statistics": null
+            }
+        }
+    }
+}
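EuroPIRQRetrieval ships with English, Finnish, and Portuguese subsets of 10,100 samples each (10,000 documents plus 100 queries per language), which accounts for the 30,300-sample total above. A hedged sketch of selecting the new task by name, assuming the standard `mteb.get_tasks` API; the printed attribute is the task's declared evaluation languages:

    import mteb

    tasks = mteb.get_tasks(tasks=["EuroPIRQRetrieval"])
    print(tasks[0].metadata.eval_langs)  # expected to cover the en/fi/pt subsets shown above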
mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
CHANGED

@@ -1,17 +1,15 @@
-from typing import
+from typing import Any

 import torch
+from packaging.version import Version
 from torch.utils.data import DataLoader
+from transformers import __version__ as transformers_version

 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta
 from mteb.types import Array, BatchedInput, PromptType

-if TYPE_CHECKING:
-    pass
-
-
 LLAMA_NEMORETRIEVER_CITATION = """@misc{xu2025llamanemoretrievercolembedtopperforming,
     title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
     author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
@@ -34,6 +32,14 @@ class LlamaNemoretrieverColembed(AbsEncoder):
         attn_implementation="flash_attention_2",
         **kwargs,
     ):
+        required_transformers_version = "4.49.0"
+
+        if Version(transformers_version) != Version(required_transformers_version):
+            raise RuntimeError(
+                f"transformers version {transformers_version} is not match with required "
+                f"install version {required_transformers_version} to run `nvidia/llama-nemoretriever-colembed`"
+            )
+
         from transformers import AutoModel

         self.model = AutoModel.from_pretrained(
@@ -148,7 +154,7 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
     name="nvidia/llama-nemoretriever-colembed-1b-v1",
     model_type=["late-interaction"],
     languages=["eng-Latn"],
-    revision="
+    revision="6eade800103413033f260bb55b49fe039fd28a6e",
     release_date="2025-06-27",
     modalities=["image", "text"],
     n_parameters=2_418_000_000,
@@ -175,7 +181,7 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
     name="nvidia/llama-nemoretriever-colembed-3b-v1",
     model_type=["late-interaction"],
     languages=["eng-Latn"],
-    revision="
+    revision="4194bdd2cd2871f220ddba6273ce173ef1217a1e",
     release_date="2025-06-27",
     modalities=["image", "text"],
     n_parameters=4_407_000_000,
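Both colembed model entries above now pin loading to an exact transformers release: any other installed version raises a RuntimeError before the weights are fetched. A hedged sketch of the resulting workflow (the pip command and the `mteb.get_model` call are assumptions based on the check added above, not part of the diff):

    # pip install "transformers==4.49.0"   # the exact version required by the new check
    import mteb

    model = mteb.get_model("nvidia/llama-nemoretriever-colembed-1b-v1")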
mteb/models/model_implementations/nvidia_models.py
CHANGED

@@ -10,8 +10,9 @@ from tqdm import tqdm
 from transformers import AutoModel, AutoTokenizer
 from transformers import __version__ as transformers_version

-from mteb import TaskMetadata
 from mteb._requires_package import requires_package
+from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.models import CrossEncoderWrapper
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
 from mteb.models.model_meta import ModelMeta, ScoringFunction
@@ -20,23 +21,23 @@ from mteb.types import Array, BatchedInput, PromptType
 logger = logging.getLogger(__name__)

 NV_RETRIEVER_CITATION = """@misc{lee2025nvembedimprovedtechniquestraining,
-      title={NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding Models},
+      title={NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding Models},
       author={Chankyu Lee and Rajarshi Roy and Mengyao Xu and Jonathan Raiman and Mohammad Shoeybi and Bryan Catanzaro and Wei Ping},
       year={2025},
       eprint={2405.17428},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2405.17428},
+      url={https://arxiv.org/abs/2405.17428},
 }"""

 LlamaEmbedNemotron_CITATION = """@misc{babakhin2025llamaembednemotron8buniversaltextembedding,
-      title={Llama-Embed-Nemotron-8B: A Universal Text Embedding Model for Multilingual and Cross-Lingual Tasks},
+      title={Llama-Embed-Nemotron-8B: A Universal Text Embedding Model for Multilingual and Cross-Lingual Tasks},
       author={Yauhen Babakhin and Radek Osmulski and Ronay Ak and Gabriel Moreira and Mengyao Xu and Benedikt Schifferer and Bo Liu and Even Oldridge},
       year={2025},
       eprint={2511.07025},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2511.07025},
+      url={https://arxiv.org/abs/2511.07025},
 }"""


@@ -629,3 +630,55 @@ llama_embed_nemotron_8b = ModelMeta(
     contacts=["ybabakhin"],
     citation=LlamaEmbedNemotron_CITATION,
 )
+
+
+def _nemotron_rerank_model(model: str, revision: str, **kwargs) -> CrossEncoderWrapper:
+    required_transformers_version = "4.47.1"
+
+    if Version(transformers_version) != Version(required_transformers_version):
+        raise RuntimeError(
+            f"transformers version {transformers_version} is not match with required "
+            f"install version {required_transformers_version} to run `nvidia/llama-nemotron-rerank-1b-v2`"
+        )
+
+    return CrossEncoderWrapper(
+        model=model,
+        revision=revision,
+        **kwargs,
+    )
+
+
+nemotron_rerank_1b_v2 = ModelMeta(
+    loader=_nemotron_rerank_model,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        query_prefix="question:",
+        passage_prefix=" \n \n passage:",
+        model_kwargs={"torch_dtype": torch.float32},
+    ),
+    name="nvidia/llama-nemotron-rerank-1b-v2",
+    revision="78efcfdc23b53a753f6c73f2d78b18132a34ac4d",
+    release_date="2025-10-16",
+    languages=["eng-Latn"],
+    n_parameters=1235816448,
+    memory_usage_mb=2357.0,
+    max_tokens=4096,
+    embed_dim=2048,
+    license="https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/nvidia/llama-nemotron-rerank-1b-v2",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=None,
+    training_datasets=set(
+        # private
+    ),
+    adapted_from="meta-llama/Llama-3.2-1B",
+    superseded_by=None,
+    modalities=["text"],
+    model_type=["cross-encoder"],
+    citation=None,
+    contacts=None,
+)
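The new `nvidia/llama-nemotron-rerank-1b-v2` entry is a cross-encoder whose loader injects the "question:" and " \n \n passage:" prefixes and enforces transformers 4.47.1. A hedged loading sketch, assuming the standard `mteb.get_model` entry point:

    # pip install "transformers==4.47.1"   # required by _nemotron_rerank_model
    import mteb

    reranker = mteb.get_model("nvidia/llama-nemotron-rerank-1b-v2")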
mteb/models/model_implementations/voyage_models.py
CHANGED

@@ -25,6 +25,9 @@ VOYAGE_DTYPE_TRANSLATION = {

 # Total token limits per model based on VoyageAI documentation
 VOYAGE_TOTAL_TOKEN_LIMITS = {
+    "voyage-4-large": 120_000,
+    "voyage-4": 320_000,
+    "voyage-4-lite": 1_000_000,
     "voyage-3.5-lite": 1_000_000,
     "voyage-3.5": 320_000,
     "voyage-2": 320_000,
@@ -206,6 +209,84 @@ model_prompts = {
     PromptType.document.value: "document",
 }

+voyage_4 = ModelMeta(
+    name="voyageai/voyage-4",
+    model_type=["dense"],
+    revision="1",
+    release_date="2026-01-15",
+    languages=None,  # supported languages not specified
+    loader=VoyageModel,
+    loader_kwargs=dict(
+        max_tokens=32000,
+        model_prompts=model_prompts,
+    ),
+    max_tokens=32000,
+    embed_dim=1024,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage_mb=None,
+    license=None,
+    reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=True,
+    training_datasets=VOYAGE_TRAINING_DATA,
+    public_training_code=None,
+    public_training_data=None,
+)
+
+voyage_4_lite = ModelMeta(
+    name="voyageai/voyage-4-lite",
+    model_type=["dense"],
+    revision="1",
+    release_date="2026-01-15",
+    languages=None,  # supported languages not specified
+    loader=VoyageModel,
+    loader_kwargs=dict(
+        max_tokens=32000,
+        model_prompts=model_prompts,
+    ),
+    max_tokens=32000,
+    embed_dim=1024,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage_mb=None,
+    license=None,
+    reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=True,
+    training_datasets=VOYAGE_TRAINING_DATA,
+    public_training_code=None,
+    public_training_data=None,
+)
+
+voyage_4_large = ModelMeta(
+    name="voyageai/voyage-4-large",
+    model_type=["dense"],
+    revision="1",
+    release_date="2026-01-15",
+    languages=None,  # supported languages not specified
+    loader=VoyageModel,
+    loader_kwargs=dict(
+        max_tokens=32000,
+        model_prompts=model_prompts,
+    ),
+    max_tokens=32000,
+    embed_dim=1024,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage_mb=None,
+    license=None,
+    reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=True,
+    training_datasets=VOYAGE_TRAINING_DATA,
+    public_training_code=None,
+    public_training_data=None,
+)
+
 voyage_3_large = ModelMeta(
     name="voyageai/voyage-3-large",  # Date of publication of this post https://blog.voyageai.com/2025/01/07/voyage-3-large/
     model_type=["dense"],
@@ -230,6 +311,7 @@ voyage_3_large = ModelMeta(
     training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
     public_training_data=None,
+    superseded_by="voyageai/voyage-4-large",
 )

@@ -257,6 +339,7 @@ voyage_3_5 = ModelMeta(
     training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
     public_training_data=None,
+    superseded_by="voyageai/voyage-4",
 )

 voyage_3_5_int8 = ModelMeta(
@@ -571,6 +654,7 @@ voyage_3_lite = ModelMeta(
     training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
     public_training_data=None,
+    superseded_by="voyageai/voyage-4-lite",
 )

 voyage_3_exp = ModelMeta(
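The three voyage-4 entries are API-only models (framework=["API"]) served through VoyageAI, and the corresponding voyage-3 generation models are now marked as superseded by them. A hedged sketch of loading one of them; the VOYAGE_API_KEY environment variable and the client setup are assumptions about the API configuration, not part of this diff:

    import os
    import mteb

    os.environ.setdefault("VOYAGE_API_KEY", "<your-api-key>")  # assumed credential mechanism
    model = mteb.get_model("voyageai/voyage-4")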
mteb/models/model_implementations/voyage_v.py
CHANGED

@@ -16,6 +16,8 @@ from mteb.types import Array, BatchedInput, PromptType
 if TYPE_CHECKING:
     from PIL import Image

+logger = logging.getLogger(__name__)
+

 def _downsample_image(
     image: Image.Image, max_pixels: int = 16000000, target_longest_side: int = 4000
@@ -37,17 +39,17 @@ def _downsample_image(
             new_width = int(width * (target_longest_side / height))

         new_size = (new_width, new_height)
-
+        logger.info(
             f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
         )
         return image.resize(new_size, Image.LANCZOS)
     if width > height:
         if width > 10000:
-
+            logger.error("Processing extremely wide images.")
             return image.resize((10000, height), Image.LANCZOS)
     else:
         if height > 10000:
-
+            logger.error("Processing extremely high images.")
            return image.resize((width, 10000), Image.LANCZOS)
     return image

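The hunk above only swaps bare lines for logger calls, but it exposes the resize arithmetic in `_downsample_image`: images over max_pixels (16 MP) are scaled so the longest side becomes 4000 px while preserving aspect ratio. A worked sketch of the visible formula with an assumed 6000x8000 portrait input (the input size is hypothetical):

    width, height = 6000, 8000                                 # 48 MP, above the 16 MP threshold
    target_longest_side = 4000

    new_height = target_longest_side                           # longest side capped at 4000
    new_width = int(width * (target_longest_side / height))    # 6000 * (4000 / 8000) = 3000
    print((new_width, new_height))                             # (3000, 4000)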
mteb/models/model_meta.py
CHANGED

@@ -331,7 +331,7 @@ class ModelMeta(BaseModel):
         revision = revisions[0].commit_id if revisions else None

         release_date = cls.fetch_release_date(model_name)
-        model_license = card_data.license
+        model_license = card_data.license if card_data.license != "other" else None
         n_parameters = cls._calculate_num_parameters_from_hub(model_name)
         memory_usage_mb = cls._calculate_memory_usage_mb(model_name, n_parameters)
         if model_config and hasattr(model_config, "hidden_size"):
mteb/models/sentence_transformer_wrapper.py
CHANGED

@@ -266,13 +266,24 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrapper):


 class CrossEncoderWrapper:
-    """Wrapper for CrossEncoder models.
+    """Wrapper for CrossEncoder models.
+
+    Args:
+        model: The CrossEncoder model to use. Can be a string (model name) or a CrossEncoder model.
+        revision: The revision of the model to use.
+        device: The device used to load the model.
+        query_prefix: A prefix to add to all queries.
+        passage_prefix: A prefix to add to all passages.
+        **kwargs: Additional arguments to pass to the CrossEncoder model.
+    """

     def __init__(
         self,
         model: CrossEncoder | str,
         revision: str | None = None,
         device: str | None = None,
+        query_prefix: str = "",
+        passage_prefix: str = "",
         **kwargs,
     ) -> None:
         from sentence_transformers import CrossEncoder
@@ -283,6 +294,8 @@ class CrossEncoderWrapper:
         self.model = CrossEncoder(model, revision=revision, device=device, **kwargs)

         self.mteb_model_meta = ModelMeta.from_cross_encoder(self.model)
+        self.query_prefix = query_prefix
+        self.passage_prefix = passage_prefix

     def predict(
         self,
@@ -311,10 +324,10 @@ class CrossEncoderWrapper:
             The predicted relevance scores for each inputs pair.
         """
         all_queries_with_instructions = [
-            text for batch in inputs1 for text in batch["text"]
+            self.query_prefix + text for batch in inputs1 for text in batch["text"]
         ]
         all_corpus_with_instructions = [
-            text for batch in inputs2 for text in batch["text"]
+            self.passage_prefix + text for batch in inputs2 for text in batch["text"]
         ]

         return self.model.predict(
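With `query_prefix` and `passage_prefix`, CrossEncoderWrapper now prepends a fixed marker to every query and passage before scoring; this is the hook the nemotron reranker above uses for its "question:" / "passage:" markers. A minimal construction sketch (the ms-marco model name is illustrative and not part of this diff; the prefixes are the ones registered for the nemotron reranker):

    from mteb.models import CrossEncoderWrapper

    reranker = CrossEncoderWrapper(
        model="cross-encoder/ms-marco-MiniLM-L-6-v2",  # illustrative cross-encoder
        query_prefix="question:",
        passage_prefix=" \n \n passage:",
    )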