mteb 2.3.10__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +7 -2
  2. mteb/abstasks/_statistics_calculation.py +6 -2
  3. mteb/abstasks/classification.py +0 -2
  4. mteb/benchmarks/benchmarks/__init__.py +2 -0
  5. mteb/benchmarks/benchmarks/benchmarks.py +57 -0
  6. mteb/deprecated_evaluator.py +8 -13
  7. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  8. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  9. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  10. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  11. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  12. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  13. mteb/evaluate.py +2 -33
  14. mteb/leaderboard/figures.py +1 -1
  15. mteb/leaderboard/table.py +1 -11
  16. mteb/models/abs_encoder.py +21 -17
  17. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  18. mteb/models/get_model_meta.py +3 -123
  19. mteb/models/instruct_wrapper.py +2 -1
  20. mteb/models/model_implementations/bica_model.py +34 -0
  21. mteb/models/model_implementations/colpali_models.py +7 -2
  22. mteb/models/model_implementations/colqwen_models.py +1 -1
  23. mteb/models/model_implementations/gme_v_models.py +9 -5
  24. mteb/models/model_implementations/google_models.py +10 -0
  25. mteb/models/model_implementations/granite_vision_embedding_models.py +6 -2
  26. mteb/models/model_implementations/jasper_models.py +2 -2
  27. mteb/models/model_implementations/jina_models.py +1 -1
  28. mteb/models/model_implementations/mod_models.py +204 -0
  29. mteb/models/model_implementations/nomic_models.py +142 -4
  30. mteb/models/model_implementations/nomic_models_vision.py +6 -2
  31. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +6 -2
  32. mteb/models/model_implementations/pylate_models.py +1 -4
  33. mteb/models/model_implementations/random_baseline.py +6 -2
  34. mteb/models/model_implementations/seed_1_6_embedding_models.py +7 -2
  35. mteb/models/model_implementations/voyage_v.py +6 -2
  36. mteb/models/model_meta.py +396 -19
  37. mteb/models/sentence_transformer_wrapper.py +2 -7
  38. mteb/tasks/reranking/jpn/__init__.py +9 -1
  39. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  40. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  41. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  42. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  43. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  44. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  45. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  46. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  47. mteb/types/_encoder_io.py +7 -2
  48. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/METADATA +2 -1
  49. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/RECORD +53 -39
  50. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/WHEEL +0 -0
  51. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/entry_points.txt +0 -0
  52. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/licenses/LICENSE +0 -0
  53. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/top_level.txt +0 -0
--- a/mteb/models/get_model_meta.py
+++ b/mteb/models/get_model_meta.py
@@ -1,26 +1,15 @@
- from __future__ import annotations
-
  import difflib
  import logging
- import warnings
  from collections.abc import Iterable
- from typing import TYPE_CHECKING, Any
-
- from huggingface_hub import ModelCard
- from huggingface_hub.errors import RepositoryNotFoundError
+ from typing import Any

  from mteb.abstasks import AbsTask
  from mteb.models import (
-     CrossEncoderWrapper,
      ModelMeta,
      MTEBModels,
-     sentence_transformers_loader,
  )
  from mteb.models.model_implementations import MODEL_REGISTRY

- if TYPE_CHECKING:
-     from sentence_transformers import CrossEncoder, SentenceTransformer
-
  logger = logging.getLogger(__name__)


@@ -101,24 +90,9 @@ def get_model(
      Returns:
          A model object
      """
-     from sentence_transformers import CrossEncoder, SentenceTransformer
-
      meta = get_model_meta(model_name, revision)
      model = meta.load_model(**kwargs)

-     # If revision not available in the modelmeta, try to extract it from sentence-transformers
-     if hasattr(model, "model") and isinstance(model.model, SentenceTransformer):  # type: ignore
-         _meta = _model_meta_from_sentence_transformers(model.model)  # type: ignore
-         if meta.revision is None:
-             meta.revision = _meta.revision if _meta.revision else meta.revision
-         if not meta.similarity_fn_name:
-             meta.similarity_fn_name = _meta.similarity_fn_name
-
-     elif isinstance(model, CrossEncoder):
-         _meta = _model_meta_from_cross_encoder(model.model)
-         if meta.revision is None:
-             meta.revision = _meta.revision if _meta.revision else meta.revision
-
      model.mteb_model_meta = meta  # type: ignore
      return model

@@ -148,12 +122,8 @@ def get_model_meta(
          logger.info(
              "Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
          )
-         try:
-             meta = _model_meta_from_hf_hub(model_name)
-             meta.revision = revision
-             return meta
-         except RepositoryNotFoundError:
-             pass
+         meta = ModelMeta.from_hub(model_name, revision)
+         return meta

      not_found_msg = f"Model '{model_name}' not found in MTEB registry"
      not_found_msg += " nor on the Huggingface Hub." if fetch_from_hf else "."
@@ -171,93 +141,3 @@ def get_model_meta(
          suggestion = f" Did you mean: '{close_matches[0]}'?"

      raise KeyError(not_found_msg + suggestion)
-
-
- def _model_meta_from_hf_hub(model_name: str) -> ModelMeta:
-     card = ModelCard.load(model_name)
-     card_data = card.data.to_dict()
-     frameworks = ["PyTorch"]
-     loader = None
-     if card_data.get("library_name", None) == "sentence-transformers":
-         frameworks.append("Sentence Transformers")
-         loader = sentence_transformers_loader
-     else:
-         msg = (
-             "Model library not recognized, defaulting to Sentence Transformers loader."
-         )
-         logger.warning(msg)
-         warnings.warn(msg)
-         loader = sentence_transformers_loader
-
-     revision = card_data.get("base_model_revision", None)
-     license = card_data.get("license", None)
-     return ModelMeta(
-         loader=loader,
-         name=model_name,
-         revision=revision,
-         release_date=None,
-         languages=None,
-         license=license,
-         framework=frameworks,  # type: ignore
-         training_datasets=None,
-         similarity_fn_name=None,
-         n_parameters=None,
-         memory_usage_mb=None,
-         max_tokens=None,
-         embed_dim=None,
-         open_weights=True,
-         public_training_code=None,
-         public_training_data=None,
-         use_instructions=None,
-     )
-
-
- def _model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta:
-     return ModelMeta(
-         loader=CrossEncoderWrapper,
-         name=model.model.name_or_path,
-         revision=model.config._commit_hash,
-         release_date=None,
-         languages=None,
-         framework=["Sentence Transformers"],
-         similarity_fn_name=None,
-         n_parameters=None,
-         memory_usage_mb=None,
-         max_tokens=None,
-         embed_dim=None,
-         license=None,
-         open_weights=True,
-         public_training_code=None,
-         public_training_data=None,
-         use_instructions=None,
-         training_datasets=None,
-     )
-
-
- def _model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta:
-     name: str | None = (
-         model.model_card_data.model_name
-         if model.model_card_data.model_name
-         else model.model_card_data.base_model
-     )
-     embeddings_dim = model.get_sentence_embedding_dimension()
-     meta = ModelMeta(
-         loader=sentence_transformers_loader,
-         name=name,
-         revision=model.model_card_data.base_model_revision,
-         release_date=None,
-         languages=None,
-         framework=["Sentence Transformers"],
-         similarity_fn_name=None,
-         n_parameters=None,
-         memory_usage_mb=None,
-         max_tokens=None,
-         embed_dim=embeddings_dim,
-         license=None,
-         open_weights=True,
-         public_training_code=None,
-         public_training_data=None,
-         use_instructions=None,
-         training_datasets=None,
-     )
-     return meta
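Note (not part of the diff): the removed Hub-metadata helpers above are superseded by ModelMeta.from_hub, which the new code calls directly and which presumably lives in the heavily expanded mteb/models/model_meta.py listed above. A minimal usage sketch of the simplified lookup, assuming network access to the Hugging Face Hub and an illustrative model name:

    import mteb

    # Registered models still resolve from the MTEB registry; unregistered Hub
    # models now go through ModelMeta.from_hub(model_name, revision) instead of
    # the removed _model_meta_from_hf_hub helper.
    meta = mteb.get_model_meta("sentence-transformers/all-MiniLM-L6-v2")
    print(meta.name, meta.revision, meta.similarity_fn_name)

    # get_model() attaches the resolved metadata to the returned model object.
    model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
    print(model.mteb_model_meta.name)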
--- a/mteb/models/instruct_wrapper.py
+++ b/mteb/models/instruct_wrapper.py
@@ -122,7 +122,8 @@ class InstructSentenceTransformerModel(AbsEncoder):
              apply_instruction_to_passages: Whether to apply the instruction template to the passages.
              padding_side: Padding side. If None, the padding side will be read from the model config.
              add_eos_token: Whether to add the eos token to each input example.
-             prompts_dict: Dictionary of task names to prompt names. If None, the prompts will be read from the model config.
+             prompts_dict: Dictionary of task names to prompt names. If task name is missing in the dict or prompts dict is None, prompt from task metadata or
+                 AbsTask.abstask_prompt will be used.
              **kwargs: Kwargs for Sentence Transformer model.
          """
          from sentence_transformers import SentenceTransformer
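Not part of the diff: a sketch of the documented fallback. Tasks present in prompts_dict use the supplied prompt; tasks missing from it (or a prompts_dict of None) fall back to the prompt in the task metadata or to AbsTask.abstask_prompt. Parameter names follow the docstring above; the checkpoint, revision, and prompt text are illustrative:

    from mteb.models.instruct_wrapper import InstructSentenceTransformerModel


    def template(instruction: str, prompt_type=None) -> str:
        # Illustrative template: prefix queries with the instruction, if any.
        return f"Instruct: {instruction}\nQuery: " if instruction else ""


    model = InstructSentenceTransformerModel(
        "intfloat/e5-mistral-7b-instruct",  # illustrative checkpoint
        revision="main",                    # illustrative revision
        instruction_template=template,
        apply_instruction_to_passages=False,
        # Only NFCorpus is overridden; every other task keeps its metadata prompt
        # or falls back to AbsTask.abstask_prompt.
        prompts_dict={"NFCorpus": "Given a question, retrieve relevant documents"},
    )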
--- /dev/null
+++ b/mteb/models/model_implementations/bica_model.py
@@ -0,0 +1,34 @@
+ from mteb.models import ModelMeta, sentence_transformers_loader
+
+ bica_base = ModelMeta(
+     name="bisectgroup/BiCA-base",
+     loader=sentence_transformers_loader,
+     languages=["eng-Latn"],
+     open_weights=True,
+     revision="31237a836e5ae908c308a256573e5f0986498574",
+     release_date="2025-11-14",
+     n_parameters=110_000_000,
+     memory_usage_mb=418,
+     embed_dim=768,
+     license="mit",
+     max_tokens=512,
+     reference="https://huggingface.co/bisectgroup/BiCA-base",
+     similarity_fn_name="cosine",
+     framework=["Sentence Transformers", "PyTorch"],
+     use_instructions=False,
+     public_training_code="https://github.com/NiravBhattLab/BiCA",
+     public_training_data="https://huggingface.co/datasets/bisectgroup/hard-negatives-traversal",
+     adapted_from="thenlper/gte-base",
+     citation="""
+ @misc{sinha2025bicaeffectivebiomedicaldense,
+ title={BiCA: Effective Biomedical Dense Retrieval with Citation-Aware Hard Negatives},
+ author={Aarush Sinha and Pavan Kumar S and Roshan Balaji and Nirav Pravinbhai Bhatt},
+ year={2025},
+ eprint={2511.08029},
+ archivePrefix={arXiv},
+ primaryClass={cs.IR},
+ url={https://arxiv.org/abs/2511.08029},
+ }
+ """,
+     training_datasets=set(),
+ )
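Not part of the diff: with this registry entry in place, the model should be loadable through mteb's standard entry point. A minimal sketch, assuming the Hugging Face checkpoint is reachable:

    import mteb

    # The registry entry above points at the bisectgroup/BiCA-base checkpoint
    # and uses the plain sentence-transformers loader.
    model = mteb.get_model("bisectgroup/BiCA-base")
    print(model.mteb_model_meta.revision)  # pinned revision from the entry above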
--- a/mteb/models/model_implementations/colpali_models.py
+++ b/mteb/models/model_implementations/colpali_models.py
@@ -1,8 +1,9 @@
+ from __future__ import annotations
+
  import logging
- from typing import Any
+ from typing import TYPE_CHECKING, Any

  import torch
- from PIL import Image
  from torch.utils.data import DataLoader
  from tqdm.auto import tqdm

@@ -15,6 +16,9 @@ from mteb.models.abs_encoder import AbsEncoder
  from mteb.models.model_meta import ModelMeta, ScoringFunction
  from mteb.types import Array, BatchedInput, PromptType

+ if TYPE_CHECKING:
+     from PIL import Image
+
  logger = logging.getLogger(__name__)


@@ -89,6 +93,7 @@ class ColPaliEngineWrapper(AbsEncoder):
          **kwargs,
      ):
          import torchvision.transforms.functional as F
+         from PIL import Image

          all_embeds = []

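This file (and colqwen_models.py, gme_v_models.py, and granite_vision_embedding_models.py below) moves the PIL import behind TYPE_CHECKING and re-imports it inside the methods that actually handle images, so merely importing the module does not require Pillow. A generic sketch of the pattern, with illustrative names not taken from the diff:

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen only by type checkers; no import of Pillow at module load time.
        from PIL import Image


    def load_thumbnail(path: str) -> Image.Image:
        # Deferred import: Pillow is required only when this function is called.
        from PIL import Image

        return Image.open(path)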
--- a/mteb/models/model_implementations/colqwen_models.py
+++ b/mteb/models/model_implementations/colqwen_models.py
@@ -2,7 +2,6 @@ import logging
  from typing import Any

  import torch
- from PIL import Image
  from torch.utils.data import DataLoader
  from tqdm.auto import tqdm

@@ -154,6 +153,7 @@ class ColQwen3Wrapper(AbsEncoder):
          **kwargs: Any,
      ):
          import torchvision.transforms.functional as F
+         from PIL import Image
          contains_image = "image" in image_texts_pairs.dataset.features
          contains_text = "text" in image_texts_pairs.dataset.features
  contains_text = "text" in image_texts_pairs.dataset.features
--- a/mteb/models/model_implementations/gme_v_models.py
+++ b/mteb/models/model_implementations/gme_v_models.py
@@ -1,9 +1,10 @@
+ from __future__ import annotations
+
  import logging
  import math
- from typing import Any
+ from typing import TYPE_CHECKING, Any

  import torch
- from PIL import Image
  from torch.utils.data import DataLoader
  from tqdm.autonotebook import tqdm

@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
  from mteb.models.model_meta import ModelMeta, ScoringFunction
  from mteb.types import Array, BatchedInput, PromptType

+ if TYPE_CHECKING:
+     from PIL import Image
+
  logger = logging.getLogger(__name__)

  GME_CITATION = """@misc{zhang2024gme,
@@ -267,9 +271,9 @@ def smart_resize(
      return h_bar, w_bar


- def fetch_image(
-     image: str | Image.Image, size_factor: int = IMAGE_FACTOR
- ) -> Image.Image:
+ def fetch_image(image: Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
+     from PIL import Image
+
      image_obj = None
      if isinstance(image, Image.Image):
          image_obj = image
--- a/mteb/models/model_implementations/google_models.py
+++ b/mteb/models/model_implementations/google_models.py
@@ -272,4 +272,14 @@ embedding_gemma_300m = ModelMeta(
      training_datasets=GECKO_TRAINING_DATA,
      similarity_fn_name="cosine",
      memory_usage_mb=1155,
+     citation="""
+ @misc{vera2025embeddinggemmapowerfullightweighttext,
+ title={EmbeddingGemma: Powerful and Lightweight Text Representations},
+ author={Henrique Schechter Vera and Sahil Dua and Biao Zhang and Daniel Salz and Ryan Mullins and Sindhu Raghuram Panyam and Sara Smoot and Iftekhar Naim and Joe Zou and Feiyang Chen and Daniel Cer and Alice Lisak and Min Choi and Lucas Gonzalez and Omar Sanseviero and Glenn Cameron and Ian Ballantyne and Kat Black and Kaifeng Chen and Weiyi Wang and Zhe Li and Gus Martins and Jinhyuk Lee and Mark Sherwood and Juyeong Ji and Renjie Wu and Jingxiao Zheng and Jyotinder Singh and Abheesht Sharma and Divyashree Sreepathihalli and Aashi Jain and Adham Elarabawy and AJ Co and Andreas Doumanoglou and Babak Samari and Ben Hora and Brian Potetz and Dahun Kim and Enrique Alfonseca and Fedor Moiseev and Feng Han and Frank Palma Gomez and Gustavo Hernández Ábrego and Hesen Zhang and Hui Hui and Jay Han and Karan Gill and Ke Chen and Koert Chen and Madhuri Shanbhogue and Michael Boratko and Paul Suganthan and Sai Meher Karthik Duddu and Sandeep Mariserla and Setareh Ariafar and Shanfeng Zhang and Shijie Zhang and Simon Baumgartner and Sonam Goenka and Steve Qiu and Tanmaya Dabral and Trevor Walker and Vikram Rao and Waleed Khawaja and Wenlei Zhou and Xiaoqi Ren and Ye Xia and Yichang Chen and Yi-Ting Chen and Zhe Dong and Zhongli Ding and Francesco Visin and Gaël Liu and Jiageng Zhang and Kathleen Kenealy and Michelle Casbon and Ravin Kumar and Thomas Mesnard and Zach Gleicher and Cormac Brick and Olivier Lacombe and Adam Roberts and Qin Yin and Yunhsuan Sung and Raphael Hoffmann and Tris Warkentin and Armand Joulin and Tom Duerig and Mojtaba Seyedhosseini},
+ year={2025},
+ eprint={2509.20354},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2509.20354},
+ }""",
  )
--- a/mteb/models/model_implementations/granite_vision_embedding_models.py
+++ b/mteb/models/model_implementations/granite_vision_embedding_models.py
@@ -1,8 +1,9 @@
+ from __future__ import annotations
+
  import logging
- from typing import Any
+ from typing import TYPE_CHECKING, Any

  import torch
- from PIL import Image
  from torch.utils.data import DataLoader
  from tqdm.auto import tqdm

@@ -15,6 +16,9 @@ from mteb.types import Array, BatchedInput, PromptType

  logger = logging.getLogger(__name__)

+ if TYPE_CHECKING:
+     from PIL import Image
+

  class GraniteVisionEmbeddingWrapper:
      def __init__(
--- a/mteb/models/model_implementations/jasper_models.py
+++ b/mteb/models/model_implementations/jasper_models.py
@@ -355,13 +355,13 @@ Jasper_Token_Compression_600M = ModelMeta(
      | qzhou_training_data,
      citation="""
  @misc{zhang2025jaspertokencompression600mtechnicalreport,
- title={Jasper-Token-Compression-600M Technical Report},
+ title={Jasper-Token-Compression-600M Technical Report},
  author={Dun Zhang and Ziyang Zeng and Yudong Zhou and Shuyang Lu},
  year={2025},
  eprint={2511.14405},
  archivePrefix={arXiv},
  primaryClass={cs.IR},
- url={https://arxiv.org/abs/2511.14405},
+ url={https://arxiv.org/abs/2511.14405},
  }
  """,
  )
--- a/mteb/models/model_implementations/jina_models.py
+++ b/mteb/models/model_implementations/jina_models.py
@@ -740,7 +740,7 @@ jina_reranker_v3 = ModelMeta(
      training_datasets=JINARerankerV3_TRAINING_DATA,
      adapted_from="Qwen/Qwen3-0.6B",
      citation="""@misc{wang2025jinarerankerv3lateinteractionlistwise,
- title={jina-reranker-v3: Last but Not Late Interaction for Listwise Document Reranking},
+ title={jina-reranker-v3: Last but Not Late Interaction for Listwise Document Reranking},
  author={Feng Wang and Yuqing Li and Han Xiao},
  year={2025},
  eprint={2509.25085},
--- /dev/null
+++ b/mteb/models/model_implementations/mod_models.py
@@ -0,0 +1,204 @@
+ from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+ from mteb.models.model_meta import ModelMeta
+ from mteb.models.models_protocols import EncoderProtocol, PromptType
+
+
+ def instruction_template(
+     instruction: str, prompt_type: PromptType | None = None
+ ) -> str:
+     if not instruction or prompt_type == PromptType.document:
+         return ""
+     if isinstance(instruction, dict):
+         if prompt_type is None:
+             instruction = next(iter(instruction.values()))  # TODO
+         else:
+             instruction = instruction[prompt_type]
+     return f"Instruct: {instruction}\nQuery:"
+
+
+ multilingual_langs = [
+     "afr-Latn",
+     "ara-Arab",
+     "aze-Latn",
+     "bel-Cyrl",
+     "bul-Cyrl",
+     "ben-Beng",
+     "cat-Latn",
+     "ceb-Latn",
+     "ces-Latn",
+     "cym-Latn",
+     "dan-Latn",
+     "deu-Latn",
+     "ell-Grek",
+     "eng-Latn",
+     "spa-Latn",
+     "est-Latn",
+     "eus-Latn",
+     "fas-Arab",
+     "fin-Latn",
+     "fra-Latn",
+     "glg-Latn",
+     "guj-Gujr",
+     "heb-Hebr",
+     "hin-Deva",
+     "hrv-Latn",
+     "hat-Latn",
+     "hun-Latn",
+     "hye-Armn",
+     "ind-Latn",
+     "isl-Latn",
+     "ita-Latn",
+     "jpn-Jpan",
+     "jav-Latn",
+     "kat-Geor",
+     "kaz-Cyrl",
+     "khm-Khmr",
+     "kan-Knda",
+     "kor-Hang",
+     "kir-Cyrl",
+     "lao-Laoo",
+     "lit-Latn",
+     "lav-Latn",
+     "mkd-Cyrl",
+     "mal-Mlym",
+     "mon-Cyrl",
+     "mar-Deva",
+     "msa-Latn",
+     "mya-Mymr",
+     "nep-Deva",
+     "nld-Latn",
+     "nor-Latn",
+     "nob-Latn",
+     "nno-Latn",
+     "pan-Guru",
+     "pol-Latn",
+     "por-Latn",
+     "que-Latn",
+     "ron-Latn",
+     "rus-Cyrl",
+     "sin-Sinh",
+     "slk-Latn",
+     "slv-Latn",
+     "swa-Latn",
+     "tam-Taml",
+     "tel-Telu",
+     "tha-Thai",
+     "tgl-Latn",
+     "tur-Latn",
+     "ukr-Cyrl",
+     "urd-Arab",
+     "vie-Latn",
+     "yor-Latn",
+     "zho-Hans",
+ ]
+
+ MOD_CITATION = """@misc{mod-embedding-2025,
+ title={MoD-Embedding: A Fine-tuned Multilingual Text Embedding Model},
+ author={MoD Team},
+ year={2025},
+ url={https://huggingface.co/bflhc/MoD-Embedding}
+ }"""
+
+ training_data = {
+     "T2Retrieval",
+     "DuRetrieval",
+     "MMarcoReranking",
+     "CMedQAv2-reranking",
+     "NQ",
+     "MSMARCO",
+     "HotpotQA",
+     "FEVER",
+     "MrTidyRetrieval",
+     "MIRACLRetrieval",
+     "CodeSearchNet",
+ }
+
+ # Predefined prompts for various RTEB tasks
+ PREDEFINED_PROMPTS = {
+     # ========== Open Datasets ==========
+     # Legal domain
+     "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
+     "AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
+     "LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
+     "LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
+     # Code domain
+     "AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
+     "HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
+     "MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
+     "DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
+     "FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
+     # Finance domain
+     "FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
+     "FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
+     "HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
+     # Medical domain
+     "CUREv1": "Given a medical query, retrieve relevant clinical documents",
+     "ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
+     # SQL domain
+     "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
+     # Multilingual
+     "MIRACLRetrievalHardNegatives": "Given a query, retrieve relevant passages",
+     # ========== Private/Closed Datasets ==========
+     # Code domain (Private)
+     "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
+     "JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
+     # Finance domain (Private)
+     "EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
+     "EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
+     "EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
+     "EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
+     # Healthcare domain (Private)
+     "EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
+     "GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
+     # Legal domain (Private)
+     "FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+     "GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+     "JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+     # General/Multilingual (Private)
+     "French1Retrieval": "Given a query, retrieve relevant passages",
+     "German1Retrieval": "Given a query, retrieve relevant passages",
+ }
+
+
+ def mod_instruct_loader(
+     model_name_or_path: str, revision: str, **kwargs
+ ) -> EncoderProtocol:
+     # Set default prompts_dict if not provided
+
+     model = InstructSentenceTransformerModel(
+         model_name_or_path,
+         revision=revision,
+         instruction_template=instruction_template,
+         apply_instruction_to_passages=False,
+         prompt_dicts=PREDEFINED_PROMPTS,
+         **kwargs,
+     )
+     encoder = model.model._first_module()
+     if encoder.auto_model.config._attn_implementation == "flash_attention_2":
+         # The Qwen3 code only use left padding in flash_attention_2 mode.
+         encoder.tokenizer.padding_side = "left"
+     return model
+
+
+ MoD_Embedding = ModelMeta(
+     loader=mod_instruct_loader,
+     name="bflhc/MoD-Embedding",
+     languages=multilingual_langs,
+     open_weights=True,
+     revision="acbb5b70fdab262226a6af2bc62001de8021b05c",
+     release_date="2025-12-14",
+     n_parameters=4021774336,
+     memory_usage_mb=7671,
+     embed_dim=2560,
+     max_tokens=32768,
+     license="apache-2.0",
+     reference="https://huggingface.co/bflhc/MoD-Embedding",
+     similarity_fn_name="cosine",
+     framework=["Sentence Transformers", "PyTorch"],
+     use_instructions=True,
+     public_training_code=None,
+     public_training_data=None,
+     training_datasets=training_data,
+     citation=MOD_CITATION,
+     adapted_from="Qwen/Qwen3-Embedding-4B",
+ )
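Not part of the diff: a small illustration of the template defined above. Query-side prompts receive the "Instruct: ...\nQuery:" prefix, while document-side prompts are suppressed entirely. The import path assumes the module location shown in the file list:

    from mteb.models.model_implementations.mod_models import instruction_template
    from mteb.models.models_protocols import PromptType

    task_prompt = "Given a financial question, retrieve relevant financial documents"

    # Queries are prefixed with the instruction...
    print(instruction_template(task_prompt, PromptType.query))
    # Instruct: Given a financial question, retrieve relevant financial documents
    # Query:

    # ...while documents/passages get no instruction at all.
    print(repr(instruction_template(task_prompt, PromptType.document)))
    # ''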