mteb 2.3.11-py3-none-any.whl → 2.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. mteb/benchmarks/benchmarks/__init__.py +2 -0
  2. mteb/benchmarks/benchmarks/benchmarks.py +57 -0
  3. mteb/deprecated_evaluator.py +8 -13
  4. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  5. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  6. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  7. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  8. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  9. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  10. mteb/evaluate.py +2 -33
  11. mteb/leaderboard/figures.py +1 -1
  12. mteb/leaderboard/table.py +1 -11
  13. mteb/models/abs_encoder.py +21 -17
  14. mteb/models/get_model_meta.py +3 -123
  15. mteb/models/instruct_wrapper.py +2 -1
  16. mteb/models/model_implementations/bica_model.py +34 -0
  17. mteb/models/model_implementations/google_models.py +10 -0
  18. mteb/models/model_implementations/mod_models.py +204 -0
  19. mteb/models/model_implementations/nomic_models.py +142 -4
  20. mteb/models/model_meta.py +396 -19
  21. mteb/models/sentence_transformer_wrapper.py +2 -7
  22. mteb/tasks/reranking/jpn/__init__.py +9 -1
  23. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  24. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  25. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  26. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  27. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  28. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  29. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  30. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  31. {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/METADATA +1 -1
  32. {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/RECORD +36 -22
  33. {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/WHEEL +0 -0
  34. {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/entry_points.txt +0 -0
  35. {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/licenses/LICENSE +0 -0
  36. {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/top_level.txt +0 -0
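Among the changes above are six new Japanese "lite" retrieval and reranking tasks. As a rough sketch of how they could be exercised once this version is installed (task names inferred from the new file names; get_tasks, get_model, and evaluate are the mteb 2.x entry points, and the chosen encoder is only an illustration):

import mteb

# Hypothetical smoke test for the newly added Japanese lite tasks.
tasks = mteb.get_tasks(tasks=["JaqketRetrievalLite", "JaCWIRRetrievalLite", "JQaRARerankingLite"])
model = mteb.get_model("intfloat/multilingual-e5-small")
results = mteb.evaluate(model, tasks)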
mteb/models/model_implementations/bica_model.py
@@ -0,0 +1,34 @@
+ from mteb.models import ModelMeta, sentence_transformers_loader
+
+ bica_base = ModelMeta(
+     name="bisectgroup/BiCA-base",
+     loader=sentence_transformers_loader,
+     languages=["eng-Latn"],
+     open_weights=True,
+     revision="31237a836e5ae908c308a256573e5f0986498574",
+     release_date="2025-11-14",
+     n_parameters=110_000_000,
+     memory_usage_mb=418,
+     embed_dim=768,
+     license="mit",
+     max_tokens=512,
+     reference="https://huggingface.co/bisectgroup/BiCA-base",
+     similarity_fn_name="cosine",
+     framework=["Sentence Transformers", "PyTorch"],
+     use_instructions=False,
+     public_training_code="https://github.com/NiravBhattLab/BiCA",
+     public_training_data="https://huggingface.co/datasets/bisectgroup/hard-negatives-traversal",
+     adapted_from="thenlper/gte-base",
+     citation="""
+ @misc{sinha2025bicaeffectivebiomedicaldense,
+ title={BiCA: Effective Biomedical Dense Retrieval with Citation-Aware Hard Negatives},
+ author={Aarush Sinha and Pavan Kumar S and Roshan Balaji and Nirav Pravinbhai Bhatt},
+ year={2025},
+ eprint={2511.08029},
+ archivePrefix={arXiv},
+ primaryClass={cs.IR},
+ url={https://arxiv.org/abs/2511.08029},
+ }
+ """,
+     training_datasets=set(),
+ )
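Because the new entry points loader at sentence_transformers_loader, the checkpoint should also load directly with sentence-transformers. A minimal sketch, with the revision pinned from the metadata above and an illustrative biomedical query (the expected shape follows from embed_dim=768):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "bisectgroup/BiCA-base",
    revision="31237a836e5ae908c308a256573e5f0986498574",
)
embeddings = model.encode(["Which genes are most strongly implicated in cystic fibrosis?"])
print(embeddings.shape)  # expected (1, 768)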
mteb/models/model_implementations/google_models.py
@@ -272,4 +272,14 @@ embedding_gemma_300m = ModelMeta(
      training_datasets=GECKO_TRAINING_DATA,
      similarity_fn_name="cosine",
      memory_usage_mb=1155,
+     citation="""
+ @misc{vera2025embeddinggemmapowerfullightweighttext,
+ title={EmbeddingGemma: Powerful and Lightweight Text Representations},
+ author={Henrique Schechter Vera and Sahil Dua and Biao Zhang and Daniel Salz and Ryan Mullins and Sindhu Raghuram Panyam and Sara Smoot and Iftekhar Naim and Joe Zou and Feiyang Chen and Daniel Cer and Alice Lisak and Min Choi and Lucas Gonzalez and Omar Sanseviero and Glenn Cameron and Ian Ballantyne and Kat Black and Kaifeng Chen and Weiyi Wang and Zhe Li and Gus Martins and Jinhyuk Lee and Mark Sherwood and Juyeong Ji and Renjie Wu and Jingxiao Zheng and Jyotinder Singh and Abheesht Sharma and Divyashree Sreepathihalli and Aashi Jain and Adham Elarabawy and AJ Co and Andreas Doumanoglou and Babak Samari and Ben Hora and Brian Potetz and Dahun Kim and Enrique Alfonseca and Fedor Moiseev and Feng Han and Frank Palma Gomez and Gustavo Hernández Ábrego and Hesen Zhang and Hui Hui and Jay Han and Karan Gill and Ke Chen and Koert Chen and Madhuri Shanbhogue and Michael Boratko and Paul Suganthan and Sai Meher Karthik Duddu and Sandeep Mariserla and Setareh Ariafar and Shanfeng Zhang and Shijie Zhang and Simon Baumgartner and Sonam Goenka and Steve Qiu and Tanmaya Dabral and Trevor Walker and Vikram Rao and Waleed Khawaja and Wenlei Zhou and Xiaoqi Ren and Ye Xia and Yichang Chen and Yi-Ting Chen and Zhe Dong and Zhongli Ding and Francesco Visin and Gaël Liu and Jiageng Zhang and Kathleen Kenealy and Michelle Casbon and Ravin Kumar and Thomas Mesnard and Zach Gleicher and Cormac Brick and Olivier Lacombe and Adam Roberts and Qin Yin and Yunhsuan Sung and Raphael Hoffmann and Tris Warkentin and Armand Joulin and Tom Duerig and Mojtaba Seyedhosseini},
+ year={2025},
+ eprint={2509.20354},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2509.20354},
+ }""",
  )
mteb/models/model_implementations/mod_models.py
@@ -0,0 +1,204 @@
+ from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+ from mteb.models.model_meta import ModelMeta
+ from mteb.models.models_protocols import EncoderProtocol, PromptType
+
+
+ def instruction_template(
+     instruction: str, prompt_type: PromptType | None = None
+ ) -> str:
+     if not instruction or prompt_type == PromptType.document:
+         return ""
+     if isinstance(instruction, dict):
+         if prompt_type is None:
+             instruction = next(iter(instruction.values())) # TODO
+         else:
+             instruction = instruction[prompt_type]
+     return f"Instruct: {instruction}\nQuery:"
+
+
+ multilingual_langs = [
+     "afr-Latn",
+     "ara-Arab",
+     "aze-Latn",
+     "bel-Cyrl",
+     "bul-Cyrl",
+     "ben-Beng",
+     "cat-Latn",
+     "ceb-Latn",
+     "ces-Latn",
+     "cym-Latn",
+     "dan-Latn",
+     "deu-Latn",
+     "ell-Grek",
+     "eng-Latn",
+     "spa-Latn",
+     "est-Latn",
+     "eus-Latn",
+     "fas-Arab",
+     "fin-Latn",
+     "fra-Latn",
+     "glg-Latn",
+     "guj-Gujr",
+     "heb-Hebr",
+     "hin-Deva",
+     "hrv-Latn",
+     "hat-Latn",
+     "hun-Latn",
+     "hye-Armn",
+     "ind-Latn",
+     "isl-Latn",
+     "ita-Latn",
+     "jpn-Jpan",
+     "jav-Latn",
+     "kat-Geor",
+     "kaz-Cyrl",
+     "khm-Khmr",
+     "kan-Knda",
+     "kor-Hang",
+     "kir-Cyrl",
+     "lao-Laoo",
+     "lit-Latn",
+     "lav-Latn",
+     "mkd-Cyrl",
+     "mal-Mlym",
+     "mon-Cyrl",
+     "mar-Deva",
+     "msa-Latn",
+     "mya-Mymr",
+     "nep-Deva",
+     "nld-Latn",
+     "nor-Latn",
+     "nob-Latn",
+     "nno-Latn",
+     "pan-Guru",
+     "pol-Latn",
+     "por-Latn",
+     "que-Latn",
+     "ron-Latn",
+     "rus-Cyrl",
+     "sin-Sinh",
+     "slk-Latn",
+     "slv-Latn",
+     "swa-Latn",
+     "tam-Taml",
+     "tel-Telu",
+     "tha-Thai",
+     "tgl-Latn",
+     "tur-Latn",
+     "ukr-Cyrl",
+     "urd-Arab",
+     "vie-Latn",
+     "yor-Latn",
+     "zho-Hans",
+ ]
+
+ MOD_CITATION = """@misc{mod-embedding-2025,
+ title={MoD-Embedding: A Fine-tuned Multilingual Text Embedding Model},
+ author={MoD Team},
+ year={2025},
+ url={https://huggingface.co/bflhc/MoD-Embedding}
+ }"""
+
+ training_data = {
+     "T2Retrieval",
+     "DuRetrieval",
+     "MMarcoReranking",
+     "CMedQAv2-reranking",
+     "NQ",
+     "MSMARCO",
+     "HotpotQA",
+     "FEVER",
+     "MrTidyRetrieval",
+     "MIRACLRetrieval",
+     "CodeSearchNet",
+ }
+
+ # Predefined prompts for various RTEB tasks
+ PREDEFINED_PROMPTS = {
+     # ========== Open Datasets ==========
+     # Legal domain
+     "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
+     "AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
+     "LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
+     "LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
+     # Code domain
+     "AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
+     "HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
+     "MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
+     "DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
+     "FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
+     # Finance domain
+     "FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
+     "FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
+     "HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
+     # Medical domain
+     "CUREv1": "Given a medical query, retrieve relevant clinical documents",
+     "ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
+     # SQL domain
+     "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
+     # Multilingual
+     "MIRACLRetrievalHardNegatives": "Given a query, retrieve relevant passages",
+     # ========== Private/Closed Datasets ==========
+     # Code domain (Private)
+     "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
+     "JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
+     # Finance domain (Private)
+     "EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
+     "EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
+     "EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
+     "EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
+     # Healthcare domain (Private)
+     "EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
+     "GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
+     # Legal domain (Private)
+     "FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+     "GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+     "JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+     # General/Multilingual (Private)
+     "French1Retrieval": "Given a query, retrieve relevant passages",
+     "German1Retrieval": "Given a query, retrieve relevant passages",
+ }
+
+
+ def mod_instruct_loader(
+     model_name_or_path: str, revision: str, **kwargs
+ ) -> EncoderProtocol:
+     # Set default prompts_dict if not provided
+
+     model = InstructSentenceTransformerModel(
+         model_name_or_path,
+         revision=revision,
+         instruction_template=instruction_template,
+         apply_instruction_to_passages=False,
+         prompt_dicts=PREDEFINED_PROMPTS,
+         **kwargs,
+     )
+     encoder = model.model._first_module()
+     if encoder.auto_model.config._attn_implementation == "flash_attention_2":
+         # The Qwen3 code only use left padding in flash_attention_2 mode.
+         encoder.tokenizer.padding_side = "left"
+     return model
+
+
+ MoD_Embedding = ModelMeta(
+     loader=mod_instruct_loader,
+     name="bflhc/MoD-Embedding",
+     languages=multilingual_langs,
+     open_weights=True,
+     revision="acbb5b70fdab262226a6af2bc62001de8021b05c",
+     release_date="2025-12-14",
+     n_parameters=4021774336,
+     memory_usage_mb=7671,
+     embed_dim=2560,
+     max_tokens=32768,
+     license="apache-2.0",
+     reference="https://huggingface.co/bflhc/MoD-Embedding",
+     similarity_fn_name="cosine",
+     framework=["Sentence Transformers", "PyTorch"],
+     use_instructions=True,
+     public_training_code=None,
+     public_training_data=None,
+     training_datasets=training_data,
+     citation=MOD_CITATION,
+     adapted_from="Qwen/Qwen3-Embedding-4B",
+ )
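The template above only decorates query-side text: empty instructions and document prompts yield an empty string, which is consistent with apply_instruction_to_passages=False in the loader. A small sketch of the rendered prompts, assuming the module is importable under the file path shown in the listing:

from mteb.models.model_implementations.mod_models import instruction_template
from mteb.models.models_protocols import PromptType

# Query side: the task instruction is wrapped in the "Instruct: ...\nQuery:" template.
print(instruction_template("Given a query, retrieve relevant passages", PromptType.query))
# Document side: passages are embedded without any instruction prefix.
print(repr(instruction_template("Given a query, retrieve relevant passages", PromptType.document)))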
mteb/models/model_implementations/nomic_models.py
@@ -193,7 +193,7 @@ NOMIC_CITATION = """
  """

  nomic_embed_v1_5 = ModelMeta(
-     loader=NomicWrapper,
+     loader=NomicWrapper, # type: ignore
      loader_kwargs=dict(
          trust_remote_code=True,
          model_prompts=model_prompts,
@@ -221,7 +221,7 @@ nomic_embed_v1_5 = ModelMeta(
  )

  nomic_embed_v1 = ModelMeta(
-     loader=NomicWrapper,
+     loader=NomicWrapper, # type: ignore
      loader_kwargs=dict(
          trust_remote_code=True,
          model_prompts=model_prompts,
@@ -249,7 +249,7 @@ nomic_embed_v1 = ModelMeta(
  )

  nomic_embed_v1_ablated = ModelMeta(
-     loader=NomicWrapper,
+     loader=NomicWrapper, # type: ignore
      loader_kwargs=dict(
          trust_remote_code=True,
          model_prompts=model_prompts,
@@ -276,7 +276,7 @@ nomic_embed_v1_ablated = ModelMeta(
  )

  nomic_embed_v1_unsupervised = ModelMeta(
-     loader=NomicWrapper,
+     loader=NomicWrapper, # type: ignore
      loader_kwargs=dict(
          trust_remote_code=True,
          model_prompts=model_prompts,
@@ -329,3 +329,141 @@ nomic_modern_bert_embed = ModelMeta(
      training_datasets=nomic_training_data,
      public_training_data=None,
  )
+
+
+ m_languages = [
+     "eng-Latn",
+     "spa-Latn",
+     "fra-Latn",
+     "deu-Latn",
+     "ita-Latn",
+     "por-Latn",
+     "pol-Latn",
+     "nld-Latn",
+     "tur-Latn",
+     "jpn-Jpan",
+     "vie-Latn",
+     "rus-Cyrl",
+     "ind-Latn",
+     "arb-Arab",
+     "ces-Latn",
+     "ron-Latn",
+     "swe-Latn",
+     "ell-Grek",
+     "ukr-Cyrl",
+     "zho-Hans",
+     "hun-Latn",
+     "dan-Latn",
+     "nor-Latn",
+     "hin-Deva",
+     "fin-Latn",
+     "bul-Cyrl",
+     "kor-Hang",
+     "slk-Latn",
+     "tha-Thai",
+     "heb-Hebr",
+     "cat-Latn",
+     "lit-Latn",
+     "fas-Arab",
+     "msa-Latn",
+     "slv-Latn",
+     "lav-Latn",
+     "mar-Deva",
+     "ben-Beng",
+     "sqi-Latn",
+     "cym-Latn",
+     "bel-Cyrl",
+     "mal-Mlym",
+     "kan-Knda",
+     "mkd-Cyrl",
+     "urd-Arab",
+     "fry-Latn",
+     "fil-Latn",
+     "tel-Telu",
+     "eus-Latn",
+     "swh-Latn",
+     "som-Latn",
+     "snd-Arab",
+     "uzb-Latn",
+     "cos-Latn",
+     "hrv-Latn",
+     "guj-Gujr",
+     "hin-Latn",
+     "ceb-Latn",
+     "epo-Latn",
+     "jav-Latn",
+     "lat-Latn",
+     "zul-Latn",
+     "mon-Cyrl",
+     "sin-Sinh",
+     "ell-Latn",
+     "gle-Latn",
+     "kir-Cyrl",
+     "tgk-Cyrl",
+     "mya-Mymr",
+     "khm-Khmr",
+     "mlg-Latn",
+     "pan-Guru",
+     "rus-Latn",
+     "sna-Latn",
+     "zho-Latn",
+     "hau-Latn",
+     "heb-Latn",
+     "hmn-Latn",
+     "hat-Latn",
+     "jpn-Latn",
+     "sun-Latn",
+     "bul-Latn",
+     "gla-Latn",
+     "nya-Latn",
+     "pus-Arab",
+     "kur-Latn",
+     "hbs-Latn",
+     "amh-Ethi",
+     "ibo-Latn",
+     "lao-Laoo",
+     "mri-Latn",
+     "nno-Latn",
+     "smo-Latn",
+     "yid-Hebr",
+     "sot-Latn",
+     "tgl-Latn",
+     "xho-Latn",
+     "yor-Latn",
+ ]
+
+ nomic_embed_text_v2_moe = ModelMeta(
+     loader=NomicWrapper, # type: ignore
+     loader_kwargs=dict(
+         trust_remote_code=True,
+         model_prompts=model_prompts,
+     ),
+     name="nomic-ai/nomic-embed-text-v2-moe",
+     languages=m_languages,
+     open_weights=True,
+     revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
+     release_date="2025-02-07",
+     n_parameters=475292928,
+     memory_usage_mb=1813,
+     max_tokens=512,
+     embed_dim=768,
+     license="apache-2.0",
+     reference="https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
+     similarity_fn_name=ScoringFunction.COSINE,
+     framework=["Sentence Transformers", "PyTorch"],
+     use_instructions=True,
+     adapted_from="nomic-ai/nomic-xlm-2048",
+     public_training_data="https://github.com/nomic-ai/contrastors?tab=readme-ov-file#data-access",
+     public_training_code="https://github.com/nomic-ai/contrastors/blob/613ddfd37309e538cceadb05b1e6423e7b09f603/src/contrastors/configs/train/contrastive_finetune_moe.yaml",
+     training_datasets=None, # did not look into this further
+     superseded_by=None,
+     citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
+ title={Training Sparse Mixture Of Experts Text Embedding Models},
+ author={Zach Nussbaum and Brandon Duderstadt},
+ year={2025},
+ eprint={2502.07972},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2502.07972},
+ }""",
+ )
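Once this release is installed, the new registry entry can be inspected without downloading the model. A short sketch using mteb's public metadata lookup (the printed values simply echo the fields above):

import mteb

meta = mteb.get_model_meta("nomic-ai/nomic-embed-text-v2-moe")
print(meta.n_parameters, meta.embed_dim, meta.max_tokens)  # 475292928 768 512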