mteb 2.3.10__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +57 -0
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/evaluate.py +2 -33
- mteb/leaderboard/figures.py +1 -1
- mteb/leaderboard/table.py +1 -11
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/get_model_meta.py +3 -123
- mteb/models/instruct_wrapper.py +2 -1
- mteb/models/model_implementations/bica_model.py +34 -0
- mteb/models/model_implementations/colpali_models.py +7 -2
- mteb/models/model_implementations/colqwen_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +9 -5
- mteb/models/model_implementations/google_models.py +10 -0
- mteb/models/model_implementations/granite_vision_embedding_models.py +6 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/mod_models.py +204 -0
- mteb/models/model_implementations/nomic_models.py +142 -4
- mteb/models/model_implementations/nomic_models_vision.py +6 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +6 -2
- mteb/models/model_implementations/pylate_models.py +1 -4
- mteb/models/model_implementations/random_baseline.py +6 -2
- mteb/models/model_implementations/seed_1_6_embedding_models.py +7 -2
- mteb/models/model_implementations/voyage_v.py +6 -2
- mteb/models/model_meta.py +396 -19
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/METADATA +2 -1
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/RECORD +53 -39
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/WHEEL +0 -0
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/top_level.txt +0 -0
mteb/models/get_model_meta.py
CHANGED
|
@@ -1,26 +1,15 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
1
|
import difflib
|
|
4
2
|
import logging
|
|
5
|
-
import warnings
|
|
6
3
|
from collections.abc import Iterable
|
|
7
|
-
from typing import TYPE_CHECKING, Any
|
|
8
|
-
|
|
9
|
-
from huggingface_hub import ModelCard
|
|
10
|
-
from huggingface_hub.errors import RepositoryNotFoundError
|
|
4
|
+
from typing import Any
|
|
11
5
|
|
|
12
6
|
from mteb.abstasks import AbsTask
|
|
13
7
|
from mteb.models import (
|
|
14
|
-
CrossEncoderWrapper,
|
|
15
8
|
ModelMeta,
|
|
16
9
|
MTEBModels,
|
|
17
|
-
sentence_transformers_loader,
|
|
18
10
|
)
|
|
19
11
|
from mteb.models.model_implementations import MODEL_REGISTRY
|
|
20
12
|
|
|
21
|
-
if TYPE_CHECKING:
|
|
22
|
-
from sentence_transformers import CrossEncoder, SentenceTransformer
|
|
23
|
-
|
|
24
13
|
logger = logging.getLogger(__name__)
|
|
25
14
|
|
|
26
15
|
|
|
@@ -101,24 +90,9 @@ def get_model(
|
|
|
101
90
|
Returns:
|
|
102
91
|
A model object
|
|
103
92
|
"""
|
|
104
|
-
from sentence_transformers import CrossEncoder, SentenceTransformer
|
|
105
|
-
|
|
106
93
|
meta = get_model_meta(model_name, revision)
|
|
107
94
|
model = meta.load_model(**kwargs)
|
|
108
95
|
|
|
109
|
-
# If revision not available in the modelmeta, try to extract it from sentence-transformers
|
|
110
|
-
if hasattr(model, "model") and isinstance(model.model, SentenceTransformer): # type: ignore
|
|
111
|
-
_meta = _model_meta_from_sentence_transformers(model.model) # type: ignore
|
|
112
|
-
if meta.revision is None:
|
|
113
|
-
meta.revision = _meta.revision if _meta.revision else meta.revision
|
|
114
|
-
if not meta.similarity_fn_name:
|
|
115
|
-
meta.similarity_fn_name = _meta.similarity_fn_name
|
|
116
|
-
|
|
117
|
-
elif isinstance(model, CrossEncoder):
|
|
118
|
-
_meta = _model_meta_from_cross_encoder(model.model)
|
|
119
|
-
if meta.revision is None:
|
|
120
|
-
meta.revision = _meta.revision if _meta.revision else meta.revision
|
|
121
|
-
|
|
122
96
|
model.mteb_model_meta = meta # type: ignore
|
|
123
97
|
return model
|
|
124
98
|
|
|
@@ -148,12 +122,8 @@ def get_model_meta(
|
|
|
148
122
|
logger.info(
|
|
149
123
|
"Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
|
|
150
124
|
)
|
|
151
|
-
try:
|
|
152
|
-
meta = _model_meta_from_hf_hub(model_name)
|
|
153
|
-
meta.revision = revision
|
|
154
|
-
return meta
|
|
155
|
-
except RepositoryNotFoundError:
|
|
156
|
-
pass
|
|
125
|
+
meta = ModelMeta.from_hub(model_name, revision)
|
|
126
|
+
return meta
|
|
157
127
|
|
|
158
128
|
not_found_msg = f"Model '{model_name}' not found in MTEB registry"
|
|
159
129
|
not_found_msg += " nor on the Huggingface Hub." if fetch_from_hf else "."
|
|
@@ -171,93 +141,3 @@ def get_model_meta(
|
|
|
171
141
|
suggestion = f" Did you mean: '{close_matches[0]}'?"
|
|
172
142
|
|
|
173
143
|
raise KeyError(not_found_msg + suggestion)
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
def _model_meta_from_hf_hub(model_name: str) -> ModelMeta:
|
|
177
|
-
card = ModelCard.load(model_name)
|
|
178
|
-
card_data = card.data.to_dict()
|
|
179
|
-
frameworks = ["PyTorch"]
|
|
180
|
-
loader = None
|
|
181
|
-
if card_data.get("library_name", None) == "sentence-transformers":
|
|
182
|
-
frameworks.append("Sentence Transformers")
|
|
183
|
-
loader = sentence_transformers_loader
|
|
184
|
-
else:
|
|
185
|
-
msg = (
|
|
186
|
-
"Model library not recognized, defaulting to Sentence Transformers loader."
|
|
187
|
-
)
|
|
188
|
-
logger.warning(msg)
|
|
189
|
-
warnings.warn(msg)
|
|
190
|
-
loader = sentence_transformers_loader
|
|
191
|
-
|
|
192
|
-
revision = card_data.get("base_model_revision", None)
|
|
193
|
-
license = card_data.get("license", None)
|
|
194
|
-
return ModelMeta(
|
|
195
|
-
loader=loader,
|
|
196
|
-
name=model_name,
|
|
197
|
-
revision=revision,
|
|
198
|
-
release_date=None,
|
|
199
|
-
languages=None,
|
|
200
|
-
license=license,
|
|
201
|
-
framework=frameworks, # type: ignore
|
|
202
|
-
training_datasets=None,
|
|
203
|
-
similarity_fn_name=None,
|
|
204
|
-
n_parameters=None,
|
|
205
|
-
memory_usage_mb=None,
|
|
206
|
-
max_tokens=None,
|
|
207
|
-
embed_dim=None,
|
|
208
|
-
open_weights=True,
|
|
209
|
-
public_training_code=None,
|
|
210
|
-
public_training_data=None,
|
|
211
|
-
use_instructions=None,
|
|
212
|
-
)
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
def _model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta:
|
|
216
|
-
return ModelMeta(
|
|
217
|
-
loader=CrossEncoderWrapper,
|
|
218
|
-
name=model.model.name_or_path,
|
|
219
|
-
revision=model.config._commit_hash,
|
|
220
|
-
release_date=None,
|
|
221
|
-
languages=None,
|
|
222
|
-
framework=["Sentence Transformers"],
|
|
223
|
-
similarity_fn_name=None,
|
|
224
|
-
n_parameters=None,
|
|
225
|
-
memory_usage_mb=None,
|
|
226
|
-
max_tokens=None,
|
|
227
|
-
embed_dim=None,
|
|
228
|
-
license=None,
|
|
229
|
-
open_weights=True,
|
|
230
|
-
public_training_code=None,
|
|
231
|
-
public_training_data=None,
|
|
232
|
-
use_instructions=None,
|
|
233
|
-
training_datasets=None,
|
|
234
|
-
)
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
def _model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta:
|
|
238
|
-
name: str | None = (
|
|
239
|
-
model.model_card_data.model_name
|
|
240
|
-
if model.model_card_data.model_name
|
|
241
|
-
else model.model_card_data.base_model
|
|
242
|
-
)
|
|
243
|
-
embeddings_dim = model.get_sentence_embedding_dimension()
|
|
244
|
-
meta = ModelMeta(
|
|
245
|
-
loader=sentence_transformers_loader,
|
|
246
|
-
name=name,
|
|
247
|
-
revision=model.model_card_data.base_model_revision,
|
|
248
|
-
release_date=None,
|
|
249
|
-
languages=None,
|
|
250
|
-
framework=["Sentence Transformers"],
|
|
251
|
-
similarity_fn_name=None,
|
|
252
|
-
n_parameters=None,
|
|
253
|
-
memory_usage_mb=None,
|
|
254
|
-
max_tokens=None,
|
|
255
|
-
embed_dim=embeddings_dim,
|
|
256
|
-
license=None,
|
|
257
|
-
open_weights=True,
|
|
258
|
-
public_training_code=None,
|
|
259
|
-
public_training_data=None,
|
|
260
|
-
use_instructions=None,
|
|
261
|
-
training_datasets=None,
|
|
262
|
-
)
|
|
263
|
-
return meta
|
mteb/models/instruct_wrapper.py
CHANGED
|
@@ -122,7 +122,8 @@ class InstructSentenceTransformerModel(AbsEncoder):
|
|
|
122
122
|
apply_instruction_to_passages: Whether to apply the instruction template to the passages.
|
|
123
123
|
padding_side: Padding side. If None, the padding side will be read from the model config.
|
|
124
124
|
add_eos_token: Whether to add the eos token to each input example.
|
|
125
|
-
prompts_dict: Dictionary of task names to prompt names. If
|
|
125
|
+
prompts_dict: Dictionary of task names to prompt names. If task name is missing in the dict or prompts dict is None, prompt from task metadata or
|
|
126
|
+
AbsTask.abstask_prompt will be used.
|
|
126
127
|
**kwargs: Kwargs for Sentence Transformer model.
|
|
127
128
|
"""
|
|
128
129
|
from sentence_transformers import SentenceTransformer
|
|
mteb/models/model_implementations/bica_model.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from mteb.models import ModelMeta, sentence_transformers_loader
|
|
2
|
+
|
|
3
|
+
bica_base = ModelMeta(
|
|
4
|
+
name="bisectgroup/BiCA-base",
|
|
5
|
+
loader=sentence_transformers_loader,
|
|
6
|
+
languages=["eng-Latn"],
|
|
7
|
+
open_weights=True,
|
|
8
|
+
revision="31237a836e5ae908c308a256573e5f0986498574",
|
|
9
|
+
release_date="2025-11-14",
|
|
10
|
+
n_parameters=110_000_000,
|
|
11
|
+
memory_usage_mb=418,
|
|
12
|
+
embed_dim=768,
|
|
13
|
+
license="mit",
|
|
14
|
+
max_tokens=512,
|
|
15
|
+
reference="https://huggingface.co/bisectgroup/BiCA-base",
|
|
16
|
+
similarity_fn_name="cosine",
|
|
17
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
18
|
+
use_instructions=False,
|
|
19
|
+
public_training_code="https://github.com/NiravBhattLab/BiCA",
|
|
20
|
+
public_training_data="https://huggingface.co/datasets/bisectgroup/hard-negatives-traversal",
|
|
21
|
+
adapted_from="thenlper/gte-base",
|
|
22
|
+
citation="""
|
|
23
|
+
@misc{sinha2025bicaeffectivebiomedicaldense,
|
|
24
|
+
title={BiCA: Effective Biomedical Dense Retrieval with Citation-Aware Hard Negatives},
|
|
25
|
+
author={Aarush Sinha and Pavan Kumar S and Roshan Balaji and Nirav Pravinbhai Bhatt},
|
|
26
|
+
year={2025},
|
|
27
|
+
eprint={2511.08029},
|
|
28
|
+
archivePrefix={arXiv},
|
|
29
|
+
primaryClass={cs.IR},
|
|
30
|
+
url={https://arxiv.org/abs/2511.08029},
|
|
31
|
+
}
|
|
32
|
+
""",
|
|
33
|
+
training_datasets=set(),
|
|
34
|
+
)
|
|
mteb/models/model_implementations/colpali_models.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from PIL import Image
|
|
6
7
|
from torch.utils.data import DataLoader
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
@@ -15,6 +16,9 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
15
16
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
16
17
|
from mteb.types import Array, BatchedInput, PromptType
|
|
17
18
|
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from PIL import Image
|
|
21
|
+
|
|
18
22
|
logger = logging.getLogger(__name__)
|
|
19
23
|
|
|
20
24
|
|
|
@@ -89,6 +93,7 @@ class ColPaliEngineWrapper(AbsEncoder):
|
|
|
89
93
|
**kwargs,
|
|
90
94
|
):
|
|
91
95
|
import torchvision.transforms.functional as F
|
|
96
|
+
from PIL import Image
|
|
92
97
|
|
|
93
98
|
all_embeds = []
|
|
94
99
|
|
|
mteb/models/model_implementations/colqwen_models.py
CHANGED
|
@@ -2,7 +2,6 @@ import logging
|
|
|
2
2
|
from typing import Any
|
|
3
3
|
|
|
4
4
|
import torch
|
|
5
|
-
from PIL import Image
|
|
6
5
|
from torch.utils.data import DataLoader
|
|
7
6
|
from tqdm.auto import tqdm
|
|
8
7
|
|
|
@@ -154,6 +153,7 @@ class ColQwen3Wrapper(AbsEncoder):
|
|
|
154
153
|
**kwargs: Any,
|
|
155
154
|
):
|
|
156
155
|
import torchvision.transforms.functional as F
|
|
156
|
+
from PIL import Image
|
|
157
157
|
|
|
158
158
|
contains_image = "image" in image_texts_pairs.dataset.features
|
|
159
159
|
contains_text = "text" in image_texts_pairs.dataset.features
|
|
mteb/models/model_implementations/gme_v_models.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import math
|
|
3
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
4
6
|
|
|
5
7
|
import torch
|
|
6
|
-
from PIL import Image
|
|
7
8
|
from torch.utils.data import DataLoader
|
|
8
9
|
from tqdm.autonotebook import tqdm
|
|
9
10
|
|
|
@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
12
13
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
14
|
from mteb.types import Array, BatchedInput, PromptType
|
|
14
15
|
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from PIL import Image
|
|
18
|
+
|
|
15
19
|
logger = logging.getLogger(__name__)
|
|
16
20
|
|
|
17
21
|
GME_CITATION = """@misc{zhang2024gme,
|
|
@@ -267,9 +271,9 @@ def smart_resize(
|
|
|
267
271
|
return h_bar, w_bar
|
|
268
272
|
|
|
269
273
|
|
|
270
|
-
def fetch_image(
|
|
271
|
-
|
|
272
|
-
|
|
274
|
+
def fetch_image(image: Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
|
|
275
|
+
from PIL import Image
|
|
276
|
+
|
|
273
277
|
image_obj = None
|
|
274
278
|
if isinstance(image, Image.Image):
|
|
275
279
|
image_obj = image
|
|
mteb/models/model_implementations/google_models.py
CHANGED
|
@@ -272,4 +272,14 @@ embedding_gemma_300m = ModelMeta(
|
|
|
272
272
|
training_datasets=GECKO_TRAINING_DATA,
|
|
273
273
|
similarity_fn_name="cosine",
|
|
274
274
|
memory_usage_mb=1155,
|
|
275
|
+
citation="""
|
|
276
|
+
@misc{vera2025embeddinggemmapowerfullightweighttext,
|
|
277
|
+
title={EmbeddingGemma: Powerful and Lightweight Text Representations},
|
|
278
|
+
author={Henrique Schechter Vera and Sahil Dua and Biao Zhang and Daniel Salz and Ryan Mullins and Sindhu Raghuram Panyam and Sara Smoot and Iftekhar Naim and Joe Zou and Feiyang Chen and Daniel Cer and Alice Lisak and Min Choi and Lucas Gonzalez and Omar Sanseviero and Glenn Cameron and Ian Ballantyne and Kat Black and Kaifeng Chen and Weiyi Wang and Zhe Li and Gus Martins and Jinhyuk Lee and Mark Sherwood and Juyeong Ji and Renjie Wu and Jingxiao Zheng and Jyotinder Singh and Abheesht Sharma and Divyashree Sreepathihalli and Aashi Jain and Adham Elarabawy and AJ Co and Andreas Doumanoglou and Babak Samari and Ben Hora and Brian Potetz and Dahun Kim and Enrique Alfonseca and Fedor Moiseev and Feng Han and Frank Palma Gomez and Gustavo Hernández Ábrego and Hesen Zhang and Hui Hui and Jay Han and Karan Gill and Ke Chen and Koert Chen and Madhuri Shanbhogue and Michael Boratko and Paul Suganthan and Sai Meher Karthik Duddu and Sandeep Mariserla and Setareh Ariafar and Shanfeng Zhang and Shijie Zhang and Simon Baumgartner and Sonam Goenka and Steve Qiu and Tanmaya Dabral and Trevor Walker and Vikram Rao and Waleed Khawaja and Wenlei Zhou and Xiaoqi Ren and Ye Xia and Yichang Chen and Yi-Ting Chen and Zhe Dong and Zhongli Ding and Francesco Visin and Gaël Liu and Jiageng Zhang and Kathleen Kenealy and Michelle Casbon and Ravin Kumar and Thomas Mesnard and Zach Gleicher and Cormac Brick and Olivier Lacombe and Adam Roberts and Qin Yin and Yunhsuan Sung and Raphael Hoffmann and Tris Warkentin and Armand Joulin and Tom Duerig and Mojtaba Seyedhosseini},
|
|
279
|
+
year={2025},
|
|
280
|
+
eprint={2509.20354},
|
|
281
|
+
archivePrefix={arXiv},
|
|
282
|
+
primaryClass={cs.CL},
|
|
283
|
+
url={https://arxiv.org/abs/2509.20354},
|
|
284
|
+
}""",
|
|
275
285
|
)
|
|
mteb/models/model_implementations/granite_vision_embedding_models.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from PIL import Image
|
|
6
7
|
from torch.utils.data import DataLoader
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
@@ -15,6 +16,9 @@ from mteb.types import Array, BatchedInput, PromptType
|
|
|
15
16
|
|
|
16
17
|
logger = logging.getLogger(__name__)
|
|
17
18
|
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from PIL import Image
|
|
21
|
+
|
|
18
22
|
|
|
19
23
|
class GraniteVisionEmbeddingWrapper:
|
|
20
24
|
def __init__(
|
|
mteb/models/model_implementations/jasper_models.py
CHANGED
|
@@ -355,13 +355,13 @@ Jasper_Token_Compression_600M = ModelMeta(
|
|
|
355
355
|
| qzhou_training_data,
|
|
356
356
|
citation="""
|
|
357
357
|
@misc{zhang2025jaspertokencompression600mtechnicalreport,
|
|
358
|
-
title={Jasper-Token-Compression-600M Technical Report},
|
|
358
|
+
title={Jasper-Token-Compression-600M Technical Report},
|
|
359
359
|
author={Dun Zhang and Ziyang Zeng and Yudong Zhou and Shuyang Lu},
|
|
360
360
|
year={2025},
|
|
361
361
|
eprint={2511.14405},
|
|
362
362
|
archivePrefix={arXiv},
|
|
363
363
|
primaryClass={cs.IR},
|
|
364
|
-
url={https://arxiv.org/abs/2511.14405},
|
|
364
|
+
url={https://arxiv.org/abs/2511.14405},
|
|
365
365
|
}
|
|
366
366
|
""",
|
|
367
367
|
)
|
|
mteb/models/model_implementations/jina_models.py
CHANGED
|
@@ -740,7 +740,7 @@ jina_reranker_v3 = ModelMeta(
|
|
|
740
740
|
training_datasets=JINARerankerV3_TRAINING_DATA,
|
|
741
741
|
adapted_from="Qwen/Qwen3-0.6B",
|
|
742
742
|
citation="""@misc{wang2025jinarerankerv3lateinteractionlistwise,
|
|
743
|
-
title={jina-reranker-v3: Last but Not Late Interaction for Listwise Document Reranking},
|
|
743
|
+
title={jina-reranker-v3: Last but Not Late Interaction for Listwise Document Reranking},
|
|
744
744
|
author={Feng Wang and Yuqing Li and Han Xiao},
|
|
745
745
|
year={2025},
|
|
746
746
|
eprint={2509.25085},
|
|
mteb/models/model_implementations/mod_models.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
2
|
+
from mteb.models.model_meta import ModelMeta
|
|
3
|
+
from mteb.models.models_protocols import EncoderProtocol, PromptType
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def instruction_template(
|
|
7
|
+
instruction: str, prompt_type: PromptType | None = None
|
|
8
|
+
) -> str:
|
|
9
|
+
if not instruction or prompt_type == PromptType.document:
|
|
10
|
+
return ""
|
|
11
|
+
if isinstance(instruction, dict):
|
|
12
|
+
if prompt_type is None:
|
|
13
|
+
instruction = next(iter(instruction.values())) # TODO
|
|
14
|
+
else:
|
|
15
|
+
instruction = instruction[prompt_type]
|
|
16
|
+
return f"Instruct: {instruction}\nQuery:"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
multilingual_langs = [
|
|
20
|
+
"afr-Latn",
|
|
21
|
+
"ara-Arab",
|
|
22
|
+
"aze-Latn",
|
|
23
|
+
"bel-Cyrl",
|
|
24
|
+
"bul-Cyrl",
|
|
25
|
+
"ben-Beng",
|
|
26
|
+
"cat-Latn",
|
|
27
|
+
"ceb-Latn",
|
|
28
|
+
"ces-Latn",
|
|
29
|
+
"cym-Latn",
|
|
30
|
+
"dan-Latn",
|
|
31
|
+
"deu-Latn",
|
|
32
|
+
"ell-Grek",
|
|
33
|
+
"eng-Latn",
|
|
34
|
+
"spa-Latn",
|
|
35
|
+
"est-Latn",
|
|
36
|
+
"eus-Latn",
|
|
37
|
+
"fas-Arab",
|
|
38
|
+
"fin-Latn",
|
|
39
|
+
"fra-Latn",
|
|
40
|
+
"glg-Latn",
|
|
41
|
+
"guj-Gujr",
|
|
42
|
+
"heb-Hebr",
|
|
43
|
+
"hin-Deva",
|
|
44
|
+
"hrv-Latn",
|
|
45
|
+
"hat-Latn",
|
|
46
|
+
"hun-Latn",
|
|
47
|
+
"hye-Armn",
|
|
48
|
+
"ind-Latn",
|
|
49
|
+
"isl-Latn",
|
|
50
|
+
"ita-Latn",
|
|
51
|
+
"jpn-Jpan",
|
|
52
|
+
"jav-Latn",
|
|
53
|
+
"kat-Geor",
|
|
54
|
+
"kaz-Cyrl",
|
|
55
|
+
"khm-Khmr",
|
|
56
|
+
"kan-Knda",
|
|
57
|
+
"kor-Hang",
|
|
58
|
+
"kir-Cyrl",
|
|
59
|
+
"lao-Laoo",
|
|
60
|
+
"lit-Latn",
|
|
61
|
+
"lav-Latn",
|
|
62
|
+
"mkd-Cyrl",
|
|
63
|
+
"mal-Mlym",
|
|
64
|
+
"mon-Cyrl",
|
|
65
|
+
"mar-Deva",
|
|
66
|
+
"msa-Latn",
|
|
67
|
+
"mya-Mymr",
|
|
68
|
+
"nep-Deva",
|
|
69
|
+
"nld-Latn",
|
|
70
|
+
"nor-Latn",
|
|
71
|
+
"nob-Latn",
|
|
72
|
+
"nno-Latn",
|
|
73
|
+
"pan-Guru",
|
|
74
|
+
"pol-Latn",
|
|
75
|
+
"por-Latn",
|
|
76
|
+
"que-Latn",
|
|
77
|
+
"ron-Latn",
|
|
78
|
+
"rus-Cyrl",
|
|
79
|
+
"sin-Sinh",
|
|
80
|
+
"slk-Latn",
|
|
81
|
+
"slv-Latn",
|
|
82
|
+
"swa-Latn",
|
|
83
|
+
"tam-Taml",
|
|
84
|
+
"tel-Telu",
|
|
85
|
+
"tha-Thai",
|
|
86
|
+
"tgl-Latn",
|
|
87
|
+
"tur-Latn",
|
|
88
|
+
"ukr-Cyrl",
|
|
89
|
+
"urd-Arab",
|
|
90
|
+
"vie-Latn",
|
|
91
|
+
"yor-Latn",
|
|
92
|
+
"zho-Hans",
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
MOD_CITATION = """@misc{mod-embedding-2025,
|
|
96
|
+
title={MoD-Embedding: A Fine-tuned Multilingual Text Embedding Model},
|
|
97
|
+
author={MoD Team},
|
|
98
|
+
year={2025},
|
|
99
|
+
url={https://huggingface.co/bflhc/MoD-Embedding}
|
|
100
|
+
}"""
|
|
101
|
+
|
|
102
|
+
training_data = {
|
|
103
|
+
"T2Retrieval",
|
|
104
|
+
"DuRetrieval",
|
|
105
|
+
"MMarcoReranking",
|
|
106
|
+
"CMedQAv2-reranking",
|
|
107
|
+
"NQ",
|
|
108
|
+
"MSMARCO",
|
|
109
|
+
"HotpotQA",
|
|
110
|
+
"FEVER",
|
|
111
|
+
"MrTidyRetrieval",
|
|
112
|
+
"MIRACLRetrieval",
|
|
113
|
+
"CodeSearchNet",
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
# Predefined prompts for various RTEB tasks
|
|
117
|
+
PREDEFINED_PROMPTS = {
|
|
118
|
+
# ========== Open Datasets ==========
|
|
119
|
+
# Legal domain
|
|
120
|
+
"AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
|
|
121
|
+
"AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
|
|
122
|
+
"LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
|
|
123
|
+
"LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
|
|
124
|
+
# Code domain
|
|
125
|
+
"AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
|
|
126
|
+
"HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
|
|
127
|
+
"MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
|
|
128
|
+
"DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
|
|
129
|
+
"FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
|
|
130
|
+
# Finance domain
|
|
131
|
+
"FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
|
|
132
|
+
"FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
|
|
133
|
+
"HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
|
|
134
|
+
# Medical domain
|
|
135
|
+
"CUREv1": "Given a medical query, retrieve relevant clinical documents",
|
|
136
|
+
"ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
|
|
137
|
+
# SQL domain
|
|
138
|
+
"WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
|
|
139
|
+
# Multilingual
|
|
140
|
+
"MIRACLRetrievalHardNegatives": "Given a query, retrieve relevant passages",
|
|
141
|
+
# ========== Private/Closed Datasets ==========
|
|
142
|
+
# Code domain (Private)
|
|
143
|
+
"Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
|
|
144
|
+
"JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
|
|
145
|
+
# Finance domain (Private)
|
|
146
|
+
"EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
|
|
147
|
+
"EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
|
|
148
|
+
"EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
|
|
149
|
+
"EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
|
|
150
|
+
# Healthcare domain (Private)
|
|
151
|
+
"EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
|
|
152
|
+
"GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
|
|
153
|
+
# Legal domain (Private)
|
|
154
|
+
"FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
|
|
155
|
+
"GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
|
|
156
|
+
"JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
|
|
157
|
+
# General/Multilingual (Private)
|
|
158
|
+
"French1Retrieval": "Given a query, retrieve relevant passages",
|
|
159
|
+
"German1Retrieval": "Given a query, retrieve relevant passages",
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def mod_instruct_loader(
|
|
164
|
+
model_name_or_path: str, revision: str, **kwargs
|
|
165
|
+
) -> EncoderProtocol:
|
|
166
|
+
# Set default prompts_dict if not provided
|
|
167
|
+
|
|
168
|
+
model = InstructSentenceTransformerModel(
|
|
169
|
+
model_name_or_path,
|
|
170
|
+
revision=revision,
|
|
171
|
+
instruction_template=instruction_template,
|
|
172
|
+
apply_instruction_to_passages=False,
|
|
173
|
+
prompt_dicts=PREDEFINED_PROMPTS,
|
|
174
|
+
**kwargs,
|
|
175
|
+
)
|
|
176
|
+
encoder = model.model._first_module()
|
|
177
|
+
if encoder.auto_model.config._attn_implementation == "flash_attention_2":
|
|
178
|
+
# The Qwen3 code only use left padding in flash_attention_2 mode.
|
|
179
|
+
encoder.tokenizer.padding_side = "left"
|
|
180
|
+
return model
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
MoD_Embedding = ModelMeta(
|
|
184
|
+
loader=mod_instruct_loader,
|
|
185
|
+
name="bflhc/MoD-Embedding",
|
|
186
|
+
languages=multilingual_langs,
|
|
187
|
+
open_weights=True,
|
|
188
|
+
revision="acbb5b70fdab262226a6af2bc62001de8021b05c",
|
|
189
|
+
release_date="2025-12-14",
|
|
190
|
+
n_parameters=4021774336,
|
|
191
|
+
memory_usage_mb=7671,
|
|
192
|
+
embed_dim=2560,
|
|
193
|
+
max_tokens=32768,
|
|
194
|
+
license="apache-2.0",
|
|
195
|
+
reference="https://huggingface.co/bflhc/MoD-Embedding",
|
|
196
|
+
similarity_fn_name="cosine",
|
|
197
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
198
|
+
use_instructions=True,
|
|
199
|
+
public_training_code=None,
|
|
200
|
+
public_training_data=None,
|
|
201
|
+
training_datasets=training_data,
|
|
202
|
+
citation=MOD_CITATION,
|
|
203
|
+
adapted_from="Qwen/Qwen3-Embedding-4B",
|
|
204
|
+
)
|