mteb 2.2.2-py3-none-any.whl → 2.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. mteb/__init__.py +4 -0
  2. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  3. mteb/evaluate.py +38 -7
  4. mteb/models/__init__.py +4 -1
  5. mteb/models/cache_wrappers/__init__.py +2 -1
  6. mteb/models/model_implementations/colpali_models.py +4 -4
  7. mteb/models/model_implementations/colqwen_models.py +206 -2
  8. mteb/models/model_implementations/eagerworks_models.py +163 -0
  9. mteb/models/model_implementations/euler_models.py +25 -0
  10. mteb/models/model_implementations/google_models.py +1 -1
  11. mteb/models/model_implementations/jina_models.py +203 -5
  12. mteb/models/model_implementations/nb_sbert.py +1 -1
  13. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +10 -11
  14. mteb/models/model_implementations/nvidia_models.py +1 -1
  15. mteb/models/model_implementations/ops_moa_models.py +2 -2
  16. mteb/models/model_implementations/promptriever_models.py +4 -4
  17. mteb/models/model_implementations/qwen3_models.py +3 -3
  18. mteb/models/model_implementations/qzhou_models.py +1 -1
  19. mteb/models/model_implementations/random_baseline.py +8 -18
  20. mteb/models/model_implementations/vdr_models.py +1 -0
  21. mteb/models/model_implementations/yuan_models_en.py +57 -0
  22. mteb/models/search_encoder_index/__init__.py +7 -0
  23. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  24. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  25. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  26. mteb/models/search_wrappers.py +157 -41
  27. mteb/results/model_result.py +2 -1
  28. mteb/results/task_result.py +12 -0
  29. mteb/similarity_functions.py +49 -0
  30. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  31. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  32. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  33. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  34. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +3 -3
  35. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/METADATA +6 -1
  36. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/RECORD +40 -31
  37. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/WHEEL +0 -0
  38. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/entry_points.txt +0 -0
  39. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/licenses/LICENSE +0 -0
  40. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/top_level.txt +0 -0

mteb/models/model_implementations/colqwen_models.py
@@ -1,11 +1,19 @@
 import logging
+from typing import Any

 import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm

 from mteb._requires_package import (
+    requires_image_dependencies,
     requires_package,
 )
-from mteb.models.model_meta import ModelMeta
+from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+from mteb.types import Array, BatchedInput, PromptType

 from .colpali_models import (
     COLPALI_CITATION,
@@ -73,6 +81,132 @@ class ColQwen2_5Wrapper(ColPaliEngineWrapper): # noqa: N801
         )


+class ColQwen3Wrapper(AbsEncoder):
+    """Wrapper for the ColQwen3 vision-language retrieval model."""
+
+    def __init__(
+        self,
+        model_name: str,
+        *,
+        revision: str | None = None,
+        device: str | None = None,
+        dtype: torch.dtype | str | None = torch.bfloat16,
+        **kwargs: Any,
+    ):
+        requires_image_dependencies()
+        requires_package(self, "transformers", model_name, "pip install mteb[colqwen3]")
+        from transformers import AutoModel, AutoProcessor
+
+        self.device = device or (
+            "cuda"
+            if torch.cuda.is_available()
+            else "mps"
+            if torch.backends.mps.is_available()
+            else "cpu"
+        )
+        self.model = AutoModel.from_pretrained(
+            model_name,
+            revision=revision,
+            dtype=dtype,
+            trust_remote_code=True,
+            **kwargs,
+        ).to(self.device)
+        self.model.eval()
+
+        self.processor = AutoProcessor.from_pretrained(
+            model_name,
+            revision=revision,
+            trust_remote_code=True,
+            max_num_visual_tokens=1280,
+        )
+
+    def encode(
+        self,
+        inputs: DataLoader[BatchedInput],
+        *,
+        task_metadata: TaskMetadata,
+        hf_split: str,
+        hf_subset: str,
+        prompt_type: PromptType | None = None,
+        **kwargs: Any,
+    ) -> Array:
+        if (
+            "text" not in inputs.dataset.features
+            and "image" not in inputs.dataset.features
+        ):
+            raise ValueError("No text or image features found in inputs.")
+        return self.get_fused_embeddings(inputs, **kwargs)
+
+    def _encode_inputs(self, encoded_inputs: dict[str, torch.Tensor]) -> torch.Tensor:
+        outputs = self.model(**encoded_inputs)
+        # Avoid boolean casting of tensors when checking for custom attributes.
+        embeddings = getattr(outputs, "embeddings", None)
+        if embeddings is None:
+            embeddings = outputs[0]
+        return embeddings
+
+    def get_fused_embeddings(
+        self,
+        image_texts_pairs: DataLoader[BatchedInput] | None = None,
+        batch_size: int = 32,
+        show_progress_bar: bool = True,
+        fusion_mode="concat",
+        **kwargs: Any,
+    ):
+        import torchvision.transforms.functional as F
+
+        contains_image = "image" in image_texts_pairs.dataset.features
+        contains_text = "text" in image_texts_pairs.dataset.features
+        contains_both = contains_image and contains_text
+
+        if contains_both:
+            progress_desc = "Encoding images+texts"
+        elif contains_image:
+            progress_desc = "Encoding images"
+        elif contains_text:
+            progress_desc = "Encoding texts"
+        else:
+            raise ValueError("No text or image features found in inputs.")
+
+        all_embeds: list[torch.Tensor] = []
+        with torch.no_grad():
+            for batch in tqdm(
+                image_texts_pairs,
+                disable=not show_progress_bar,
+                desc=progress_desc,
+            ):
+                if contains_image:
+                    imgs = [
+                        F.to_pil_image(b.to(self.device))
+                        if not isinstance(b, Image.Image)
+                        else b
+                        for b in batch["image"]
+                    ]
+                else:
+                    imgs = None
+                if contains_text:
+                    texts = batch["text"]
+                else:
+                    texts = None
+                if contains_both:
+                    assert len(imgs) == len(texts), (
+                        f"The number of texts and images must have the same length, got {len(imgs)} and {len(texts)}"
+                    )
+
+                inputs = self.processor(images=imgs, text=texts)
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                outs = self._encode_inputs(inputs)
+                all_embeds.extend(outs.cpu().to(torch.float32))
+
+        padded = torch.nn.utils.rnn.pad_sequence(
+            all_embeds, batch_first=True, padding_value=0
+        )
+        return padded
+
+    def similarity(self, a, b):
+        return self.processor.score_multi_vector(a, b, device=self.device)
+
+
 colqwen2 = ModelMeta(
     loader=ColQwen2Wrapper,
     loader_kwargs=dict(
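ColQwen3Wrapper returns one embedding per token, padded to a common length with pad_sequence, rather than a single pooled vector, and its similarity method delegates to the processor's score_multi_vector, matching the MAX_SIM scoring function declared in the ModelMeta entries added below. For orientation, the ColBERT-style late-interaction score this roughly corresponds to can be sketched as follows (a minimal illustration, not the colpali_engine implementation):

import torch

def maxsim_scores(queries: torch.Tensor, docs: torch.Tensor) -> torch.Tensor:
    # queries: (Q, Tq, D), docs: (N, Td, D) -> (Q, N) late-interaction scores.
    sim = torch.einsum("qtd,nsd->qnts", queries, docs)  # token-level dot products
    # best-matching document token per query token, summed over query tokens
    return sim.max(dim=-1).values.sum(dim=-1)

q, d = torch.randn(2, 5, 128), torch.randn(3, 9, 128)  # dummy padded multi-vector embeddings
print(maxsim_scores(q, d).shape)  # torch.Size([2, 3])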
@@ -125,6 +259,72 @@ colqwen2_5 = ModelMeta(
     citation=COLPALI_CITATION,
 )

+TOMORO_TRAINING_DATA = {
+    "VDRMultilingualRetrieval",
+    # from https://huggingface.co/datasets/vidore/colpali_train_set
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
+    "VisRAG-Ret-Train-Synthetic-data",
+    "VisRAG-Ret-Train-In-domain-data",
+}
+
+TOMORO_CITATION = """
+@misc{huang2025tomoro_colqwen3_embed,
+  title={TomoroAI/tomoro-colqwen3-embed},
+  author={Xin Huang and Kye Min Tan and Albert Phelps},
+  year={2025},
+  url={https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-8b}
+}
+"""
+
+colqwen3_8b = ModelMeta(
+    loader=ColQwen3Wrapper,
+    name="TomoroAI/tomoro-colqwen3-embed-8b",
+    languages=["eng-Latn"],
+    revision="0b9fe28142910e209bbac15b1efe85507c27644f",
+    release_date="2025-11-26",
+    modalities=["image", "text"],
+    n_parameters=8_000_000_000,
+    memory_usage_mb=16724,
+    max_tokens=262144,
+    embed_dim=320,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://github.com/illuin-tech/colpali",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-8b",
+    similarity_fn_name=ScoringFunction.MAX_SIM,
+    use_instructions=True,
+    training_datasets=TOMORO_TRAINING_DATA,
+    citation=TOMORO_CITATION,
+)
+
+colqwen3_4b = ModelMeta(
+    loader=ColQwen3Wrapper,
+    name="TomoroAI/tomoro-colqwen3-embed-4b",
+    languages=["eng-Latn"],
+    revision="6a32fb68598730bf5620fbf18d832c784235c59c",
+    release_date="2025-11-26",
+    modalities=["image", "text"],
+    n_parameters=4_000_000_000,
+    memory_usage_mb=8466,
+    max_tokens=262144,
+    embed_dim=320,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://github.com/illuin-tech/colpali",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-4b",
+    similarity_fn_name=ScoringFunction.MAX_SIM,
+    use_instructions=True,
+    training_datasets=TOMORO_TRAINING_DATA,
+    citation=TOMORO_CITATION,
+)
+
 colnomic_7b = ModelMeta(
     loader=ColQwen2_5Wrapper,
     loader_kwargs=dict(
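Once registered, these checkpoints should be reachable through mteb's normal model registry. A hedged usage sketch, assuming the v2 mteb.get_model / mteb.get_tasks / mteb.evaluate entry points and using VidoreArxivQARetrieval purely as an illustrative task name:

import mteb

model = mteb.get_model("TomoroAI/tomoro-colqwen3-embed-4b")  # or the -8b variant
tasks = mteb.get_tasks(tasks=["VidoreArxivQARetrieval"])
results = mteb.evaluate(model, tasks=tasks)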
@@ -223,7 +423,11 @@ colnomic_7b = ModelMeta(


 EVOQWEN_TRAINING_DATA = {
-    "colpali_train_set",
+    # "colpali_train_set",
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
     "VisRAG-Ret-Train-Synthetic-data",
     "VisRAG-Ret-Train-In-domain-data",
 }

mteb/models/model_implementations/eagerworks_models.py
@@ -0,0 +1,163 @@
+from typing import Any
+
+import torch
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+
+from mteb._requires_package import (
+    requires_image_dependencies,
+    requires_package,
+)
+from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+from mteb.types import Array, BatchedInput, PromptType
+
+
+class EagerEmbedV1Wrapper(AbsEncoder):
+    """Wrapper for EagerEmbed single-vector embedding models."""
+
+    def __init__(
+        self,
+        model_name: str,
+        revision: str | None = None,
+        device: str | None = None,
+        image_size: int = 784,
+        **kwargs,
+    ):
+        requires_image_dependencies()
+        requires_package(
+            self, "qwen_vl_utils", model_name, "pip install mteb[eager_embed]"
+        )
+        from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.image_size = image_size
+
+        # Load model
+        self.mdl = Qwen3VLForConditionalGeneration.from_pretrained(model_name, **kwargs)
+        self.mdl = self.mdl.to(self.device)
+        self.mdl.eval()
+
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(model_name)
+
+    def get_embedding(self, last_hidden_state: torch.Tensor) -> torch.Tensor:
+        """Extract embeddings from last token of last hidden state."""
+        reps = last_hidden_state[:, -1]
+        return reps
+
+    def encode(
+        self,
+        inputs: DataLoader[BatchedInput],
+        *,
+        task_metadata: TaskMetadata,
+        hf_split: str,
+        hf_subset: str,
+        prompt_type: PromptType | None = None,
+        **kwargs: Any,
+    ) -> Array:
+        """Encode inputs (text and/or images) into embeddings."""
+        from qwen_vl_utils import process_vision_info
+
+        all_embeddings: list[torch.Tensor] = []
+
+        with torch.no_grad():
+            for batch in tqdm(inputs, desc="Encoding"):
+                batch_texts = batch.get("text", [])
+                batch_images = batch.get("image", [])
+
+                messages = []
+                for i in range(max(len(batch_texts), len(batch_images))):
+                    text_content = batch_texts[i] if batch_texts else ""
+                    image_content = batch_images[i] if batch_images else None
+
+                    query_prefix = "Query: " if prompt_type == PromptType.query else ""
+                    content = [
+                        {"type": "text", "text": f"{query_prefix}{text_content}"}
+                    ]
+
+                    if image_content is not None:
+                        content.append(
+                            {
+                                "type": "image",
+                                "image": image_content,
+                                "resized_height": self.image_size,
+                                "resized_width": self.image_size,
+                            }
+                        )
+
+                    messages.append([{"role": "user", "content": content}])
+
+                # Prepare inputs
+                texts = [
+                    self.processor.apply_chat_template(
+                        msg, tokenize=False, add_generation_prompt=False
+                    )
+                    + "<|endoftext|>"
+                    for msg in messages
+                ]
+
+                image_inputs = None
+                video_inputs = None
+                if batch_images:
+                    image_inputs, video_inputs = process_vision_info(messages)
+
+                model_inputs = self.processor(
+                    text=texts,
+                    images=image_inputs,
+                    videos=video_inputs,
+                    padding="longest",
+                    return_tensors="pt",
+                ).to(self.device)
+
+                # Get embeddings
+                output = self.mdl(
+                    **model_inputs, return_dict=True, output_hidden_states=True
+                )
+                embeddings = self.get_embedding(output.hidden_states[-1])
+                embeddings = embeddings.cpu().to(torch.float32)
+                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
+
+                all_embeddings.append(embeddings)
+
+        return torch.cat(all_embeddings, dim=0)
+
+
+EAGER_EMBED_V1_CITATION = """@article{EagerEmbed,
+  title={Eager Embed V1: Multimodal Dense Embeddings for Retrieval},
+  author={Juan Pablo Balarini},
+  year={2025},
+  publisher={Eagerworks},
+  url={https://github.com/eagerworks/eager-embed},
+}"""
+
+EAGER_EMBED_V1_TRAINING_DATASETS = {"colpali", "bge-ir", "pixmo-docs", "wiki-ss"}
+
+Eager_Embed_V1 = ModelMeta(
+    loader=EagerEmbedV1Wrapper,
+    loader_kwargs=dict(
+        dtype=torch.float16,
+        image_size=784,
+    ),
+    name="eagerworks/eager-embed-v1",
+    languages=["fra-Latn", "spa-Latn", "eng-Latn", "deu-Latn"],
+    revision="a6bec272729c5056e2c26618ce085205c82a3b3c",
+    release_date="2025-11-20",
+    modalities=["image", "text"],
+    n_parameters=4_000_000_000,
+    memory_usage_mb=16929,
+    max_tokens=262144,
+    embed_dim=2560,
+    license="apache-2.0",
+    open_weights=True,
+    framework=["Tevatron"],
+    reference="https://huggingface.co/eagerworks/eager-embed-v1",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=True,
+    training_datasets=EAGER_EMBED_V1_TRAINING_DATASETS,
+    citation=EAGER_EMBED_V1_CITATION,
+    adapted_from="https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct",
+    public_training_code="https://github.com/eagerworks/eager-embed",
+    public_training_data="https://github.com/eagerworks/eager-embed/blob/main/dataset_config.yaml",
+)
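EagerEmbedV1Wrapper pools one vector per input by taking the hidden state at the final token position and L2-normalizing it, consistent with the COSINE similarity declared in the ModelMeta. A minimal sketch of that pooling step on a dummy batch (illustrative only; the hidden size matches the embed_dim noted above):

import torch

def last_token_pool(last_hidden_state: torch.Tensor) -> torch.Tensor:
    # (batch, seq_len, hidden) -> (batch, hidden): hidden state at the last position, unit-normalized
    reps = last_hidden_state[:, -1]
    return torch.nn.functional.normalize(reps, p=2, dim=-1)

hidden = torch.randn(4, 12, 2560)  # dummy hidden states
print(last_token_pool(hidden).shape)  # torch.Size([4, 2560])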

mteb/models/model_implementations/euler_models.py
@@ -0,0 +1,25 @@
+from mteb.models.model_meta import ModelMeta
+from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+
+Euler_Legal_Embedding_V1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="Mira190/Euler-Legal-Embedding-V1",
+    revision="df607ed9e25e569514a99c27cdaaab16e76b6dd4",
+    release_date="2025-11-06",
+    languages=["eng-Latn"],
+    n_parameters=8000000000,
+    memory_usage_mb=15618,
+    max_tokens=1536,
+    embed_dim=4096,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/Mira190/Euler-Legal-Embedding-V1",
+    similarity_fn_name="cosine",
+    use_instructions=False,
+    training_datasets=set(),  # final-data-new-anonymized-grok4-filtered
+    adapted_from="Qwen/Qwen3-Embedding-8B",
+    superseded_by=None,
+)

mteb/models/model_implementations/google_models.py
@@ -275,5 +275,5 @@ embedding_gemma_300m = ModelMeta(
     public_training_data=None,
     training_datasets=GECKO_TRAINING_DATA,
     similarity_fn_name="cosine",
-    memory_usage_mb=578,
+    memory_usage_mb=1155,
 )

mteb/models/model_implementations/jina_models.py
@@ -1,8 +1,10 @@
 import logging
+from collections import defaultdict
 from typing import Any, ClassVar

 import numpy as np
 import torch
+from sentence_transformers import CrossEncoder
 from torch.utils.data import DataLoader

 from mteb._requires_package import requires_package
@@ -10,13 +12,92 @@ from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.languages import PROGRAMMING_LANGS
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
+from mteb.models.sentence_transformer_wrapper import (
+    CrossEncoderWrapper,
+    SentenceTransformerEncoderWrapper,
+)
 from mteb.types import Array, BatchedInput, PromptType

 logger = logging.getLogger(__name__)

 MIN_SENTENCE_TRANSFORMERS_VERSION = (3, 1, 0)

+multilingual_langs = [
+    "afr-Latn",
+    "ara-Arab",
+    "aze-Latn",
+    "bel-Cyrl",
+    "bul-Cyrl",
+    "ben-Beng",
+    "cat-Latn",
+    "ceb-Latn",
+    "ces-Latn",
+    "cym-Latn",
+    "dan-Latn",
+    "deu-Latn",
+    "ell-Grek",
+    "eng-Latn",
+    "spa-Latn",
+    "est-Latn",
+    "eus-Latn",
+    "fas-Arab",
+    "fin-Latn",
+    "fra-Latn",
+    "glg-Latn",
+    "guj-Gujr",
+    "heb-Hebr",
+    "hin-Deva",
+    "hrv-Latn",
+    "hat-Latn",
+    "hun-Latn",
+    "hye-Armn",
+    "ind-Latn",
+    "isl-Latn",
+    "ita-Latn",
+    "jpn-Jpan",
+    "jav-Latn",
+    "kat-Geor",
+    "kaz-Cyrl",
+    "khm-Khmr",
+    "kan-Knda",
+    "kor-Hang",
+    "kir-Cyrl",
+    "lao-Laoo",
+    "lit-Latn",
+    "lav-Latn",
+    "mkd-Cyrl",
+    "mal-Mlym",
+    "mon-Cyrl",
+    "mar-Deva",
+    "msa-Latn",
+    "mya-Mymr",
+    "nep-Deva",
+    "nld-Latn",
+    "nor-Latn",
+    "nob-Latn",
+    "nno-Latn",
+    "pan-Guru",
+    "pol-Latn",
+    "por-Latn",
+    "que-Latn",
+    "ron-Latn",
+    "rus-Cyrl",
+    "sin-Sinh",
+    "slk-Latn",
+    "slv-Latn",
+    "swa-Latn",
+    "tam-Taml",
+    "tel-Telu",
+    "tha-Thai",
+    "tgl-Latn",
+    "tur-Latn",
+    "ukr-Cyrl",
+    "urd-Arab",
+    "vie-Latn",
+    "yor-Latn",
+    "zho-Hans",
+]
+
 XLMR_LANGUAGES = [
     "afr-Latn",
     "amh-Latn",
@@ -119,6 +200,28 @@ XLMR_LANGUAGES = [
     "zho-Hans",
 ]

+JINARerankerV3_TRAINING_DATA = {
+    "MIRACLRetrieval",
+    "MIRACLRetrievalHardNegatives",
+    "MIRACLReranking",
+    "CMedQAv1-reranking",
+    "CMedQAv2-reranking",
+    "MrTidyRetrieval",
+    "T2Reranking",
+    "MSMARCO",
+    "MSMARCOHardNegatives",
+    "NQ",
+    "NQHardNegatives",
+    "HotpotQA",
+    "HotpotQAHardNegatives",
+    "T2Retrieval",
+    "DuRetrieval",
+    "MMarcoReranking",
+    "CornStack",
+    "MultiLongDocRetrieval",
+    "StackOverflowQA",
+}
+
 JinaV4_TRAINING_DATA = {
     "MSMARCO",
     "MSMARCOHardNegatives",
@@ -139,14 +242,72 @@ JinaV4_TRAINING_DATA = {
     "CornStack",
     "VDRMultilingualRetrieval",
     # from https://huggingface.co/datasets/vidore/colpali_train_set
-    "DocVQA",
-    "InfoVQA",
-    "TATDQA",
-    "arXivQA",
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
     # "other", # inhouse dataset including synthetic datasets
 }


+class JinaRerankerV3Wrapper(CrossEncoderWrapper):
+    """Wrapper integration for MTEB."""
+
+    def __init__(
+        self,
+        model: CrossEncoder | str,
+        revision: str | None = None,
+        trust_remote_code: bool = True,
+        **kwargs: Any,
+    ) -> None:
+        from sentence_transformers.util import get_device_name
+        from transformers import AutoModel
+
+        self.model = AutoModel.from_pretrained(
+            model, trust_remote_code=trust_remote_code, dtype="auto"
+        )
+
+        device = kwargs.get("device", None)
+        if device is None:
+            device = get_device_name()
+            logger.info(f"Use pytorch device: {device}")
+
+        self.model.to(device)
+        self.model.eval()
+
+    def predict(
+        self,
+        inputs1: DataLoader[BatchedInput],
+        inputs2: DataLoader[BatchedInput],
+        *,
+        task_metadata: TaskMetadata,
+        hf_split: str,
+        hf_subset: str,
+        prompt_type: PromptType | None = None,
+        **kwargs: Any,
+    ) -> Array:
+        all_corpus = [text for batch in inputs2 for text in batch["text"]]
+        all_queries = [text for batch in inputs1 for text in batch["text"]]
+
+        sentences_count = len(all_corpus)
+        query_groups: dict[str, list[tuple[int, str]]] = defaultdict(list)
+        for idx, (query, doc) in enumerate(zip(all_queries, all_corpus)):
+            query_groups[query].append((idx, doc))
+
+        results = np.zeros(sentences_count, dtype=np.float32)
+        for query, doc_infos in query_groups.items():
+            original_indices, docs = zip(*doc_infos)
+
+            scores = self.model.rerank(
+                query, list(docs), max_query_length=3072, max_doc_length=2048
+            )
+            for scr in scores:
+                original_idx = original_indices[scr["index"]]
+                results[original_idx] = float(scr["relevance_score"])
+
+        return results
+
+
 class JinaWrapper(SentenceTransformerEncoderWrapper):
     """following the hf model card documentation."""

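The predict method above flattens the two dataloaders into parallel query/document lists, groups documents by query so each unique query is sent to rerank() only once, and then scatters the returned relevance scores back into the original pair order. The same group-and-scatter pattern in isolation, with a stand-in scorer instead of the model call (a sketch, not the wrapper itself):

from collections import defaultdict

import numpy as np

def score_pairs(queries: list[str], docs: list[str]) -> np.ndarray:
    groups: dict[str, list[tuple[int, str]]] = defaultdict(list)
    for idx, (query, doc) in enumerate(zip(queries, docs)):
        groups[query].append((idx, doc))

    results = np.zeros(len(docs), dtype=np.float32)
    for query, doc_infos in groups.items():
        original_indices, grouped_docs = zip(*doc_infos)
        # stand-in for self.model.rerank(query, list(grouped_docs), ...): score by length
        scores = [{"index": i, "relevance_score": float(len(d))} for i, d in enumerate(grouped_docs)]
        for scr in scores:
            results[original_indices[scr["index"]]] = scr["relevance_score"]
    return results

print(score_pairs(["q1", "q1", "q2"], ["short doc", "a much longer document", "another doc"]))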
@@ -553,6 +714,43 @@ def get_programming_task_override(
     return current_task_name


+jina_reranker_v3 = ModelMeta(
+    loader=JinaRerankerV3Wrapper,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+    ),
+    name="jinaai/jina-reranker-v3",
+    languages=multilingual_langs,
+    open_weights=True,
+    revision="050e171c4f75dfec5b648ed8470a2475e5a30f30",
+    release_date="2025-09-18",  # official release date
+    modalities=["text"],
+    n_parameters=int(0.6 * 1e9),
+    memory_usage_mb=1138,
+    max_tokens=131072,
+    embed_dim=None,
+    license="cc-by-nc-4.0",
+    similarity_fn_name=None,
+    framework=["PyTorch"],
+    use_instructions=None,
+    reference="https://huggingface.co/jinaai/jina-reranker-v3",
+    is_cross_encoder=True,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=JINARerankerV3_TRAINING_DATA,
+    adapted_from="Qwen/Qwen3-0.6B",
+    citation="""@misc{wang2025jinarerankerv3lateinteractionlistwise,
+      title={jina-reranker-v3: Last but Not Late Interaction for Listwise Document Reranking},
+      author={Feng Wang and Yuqing Li and Han Xiao},
+      year={2025},
+      eprint={2509.25085},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2509.25085},}
+    """,
+)
+
+
 jina_embeddings_v4 = ModelMeta(
     loader=JinaV4Wrapper,
     loader_kwargs=dict(

mteb/models/model_implementations/nb_sbert.py
@@ -11,7 +11,7 @@ nb_sbert = ModelMeta(
     revision="b95656350a076aeafd2d23763660f80655408cc6",
     release_date="2022-11-23",
     n_parameters=1_780_000_000,
-    memory_usage_mb=197,
+    memory_usage_mb=678,
     embed_dim=4096,
     license="apache-2.0",
     max_tokens=75,