mteb 2.3.11-py3-none-any.whl → 2.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. mteb/benchmarks/benchmarks/__init__.py +2 -0
  2. mteb/benchmarks/benchmarks/benchmarks.py +57 -0
  3. mteb/deprecated_evaluator.py +8 -13
  4. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  5. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  6. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  7. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  8. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  9. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  10. mteb/evaluate.py +2 -33
  11. mteb/leaderboard/figures.py +1 -1
  12. mteb/leaderboard/table.py +1 -11
  13. mteb/models/abs_encoder.py +21 -17
  14. mteb/models/get_model_meta.py +3 -123
  15. mteb/models/instruct_wrapper.py +2 -1
  16. mteb/models/model_implementations/andersborges.py +12 -0
  17. mteb/models/model_implementations/bge_models.py +43 -0
  18. mteb/models/model_implementations/bica_model.py +34 -0
  19. mteb/models/model_implementations/dino_models.py +152 -0
  20. mteb/models/model_implementations/emillykkejensen_models.py +18 -0
  21. mteb/models/model_implementations/euler_models.py +6 -0
  22. mteb/models/model_implementations/fa_models.py +50 -0
  23. mteb/models/model_implementations/facebookai.py +44 -0
  24. mteb/models/model_implementations/google_models.py +10 -0
  25. mteb/models/model_implementations/gte_models.py +69 -0
  26. mteb/models/model_implementations/kalm_models.py +38 -0
  27. mteb/models/model_implementations/kblab.py +6 -0
  28. mteb/models/model_implementations/kowshik24_models.py +9 -0
  29. mteb/models/model_implementations/misc_models.py +293 -0
  30. mteb/models/model_implementations/mod_models.py +189 -0
  31. mteb/models/model_implementations/mxbai_models.py +6 -0
  32. mteb/models/model_implementations/nomic_models.py +150 -4
  33. mteb/models/model_implementations/pylate_models.py +33 -0
  34. mteb/models/model_implementations/ru_sentence_models.py +22 -0
  35. mteb/models/model_implementations/sentence_transformers_models.py +39 -0
  36. mteb/models/model_implementations/spartan8806_atles_champion.py +7 -0
  37. mteb/models/model_implementations/ua_sentence_models.py +9 -0
  38. mteb/models/model_implementations/vi_vn_models.py +33 -0
  39. mteb/models/model_meta.py +396 -19
  40. mteb/models/sentence_transformer_wrapper.py +2 -7
  41. mteb/tasks/reranking/jpn/__init__.py +9 -1
  42. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  43. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  44. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  45. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  46. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  47. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  48. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  49. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  50. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/METADATA +1 -1
  51. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/RECORD +55 -41
  52. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/WHEEL +0 -0
  53. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/entry_points.txt +0 -0
  54. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/licenses/LICENSE +0 -0
  55. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/top_level.txt +0 -0
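Most of the per-model hunks below add a citation field (a BibTeX string) to existing ModelMeta entries in mteb/models/model_implementations/. As a minimal sketch of how that metadata can be read back after upgrading, assuming the new field is exposed as a plain attribute on the returned ModelMeta (the attribute name follows the diff; it may be None for models without a citation):

    import mteb

    # Fetch the registered metadata for one of the models touched in this diff.
    meta = mteb.get_model_meta("thenlper/gte-base")

    # In 2.4.x the metadata can carry a BibTeX citation string (None if unset).
    print(meta.name)
    print(meta.citation)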
mteb/models/model_implementations/misc_models.py
@@ -127,6 +127,15 @@ Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta(
  },
  adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )

  Hum_Works__lodestone_base_4096_v1 = ModelMeta(
@@ -250,6 +259,29 @@ Lajavaness__bilingual_embedding_base = ModelMeta(
  training_datasets=bilingual_embedding_training_data,
  adapted_from="dangvantuan/bilingual_impl",
  superseded_by=None,
+ citation="""
+ @article{conneau2019unsupervised,
+ title={Unsupervised cross-lingual representation learning at scale},
+ author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
+ journal={arXiv preprint arXiv:1911.02116},
+ year={2019}
+ }
+
+ @article{reimers2019sentence,
+ title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
+ author={Nils Reimers, Iryna Gurevych},
+ journal={https://arxiv.org/abs/1908.10084},
+ year={2019}
+ }
+
+ @article{thakur2020augmented,
+ title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
+ author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
+ journal={arXiv e-prints},
+ pages={arXiv--2010},
+ year={2020}
+ }
+ """,
  )
  Lajavaness__bilingual_embedding_large = ModelMeta(
  name="Lajavaness/bilingual-embedding-large",
@@ -275,6 +307,29 @@ Lajavaness__bilingual_embedding_large = ModelMeta(
  training_datasets=bilingual_embedding_training_data,
  adapted_from="dangvantuan/bilingual_impl",
  superseded_by=None,
+ citation="""
+ @article{conneau2019unsupervised,
+ title={Unsupervised cross-lingual representation learning at scale},
+ author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
+ journal={arXiv preprint arXiv:1911.02116},
+ year={2019}
+ }
+
+ @article{reimers2019sentence,
+ title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
+ author={Nils Reimers, Iryna Gurevych},
+ journal={https://arxiv.org/abs/1908.10084},
+ year={2019}
+ }
+
+ @article{thakur2020augmented,
+ title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
+ author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
+ journal={arXiv e-prints},
+ pages={arXiv--2010},
+ year={2020}
+ }
+ """,
  )
  Lajavaness__bilingual_embedding_small = ModelMeta(
  name="Lajavaness/bilingual-embedding-small",
@@ -300,6 +355,29 @@ Lajavaness__bilingual_embedding_small = ModelMeta(
  training_datasets=bilingual_embedding_training_data,
  adapted_from="dangvantuan/bilingual_impl",
  superseded_by=None,
+ citation="""
+ @article{conneau2019unsupervised,
+ title={Unsupervised cross-lingual representation learning at scale},
+ author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
+ journal={arXiv preprint arXiv:1911.02116},
+ year={2019}
+ }
+
+ @article{reimers2019sentence,
+ title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
+ author={Nils Reimers, Iryna Gurevych},
+ journal={https://arxiv.org/abs/1908.10084},
+ year={2019}
+ }
+
+ @article{thakur2020augmented,
+ title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
+ author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
+ journal={arXiv e-prints},
+ pages={arXiv--2010},
+ year={2020}
+ }
+ """,
  )
  Mihaiii__Bulbasaur = ModelMeta(
  name="Mihaiii/Bulbasaur",
@@ -503,6 +581,15 @@ Omartificial_Intelligence_Space__Arabert_all_nli_triplet_Matryoshka = ModelMeta(
  training_datasets=set(), # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="aubmindlab/bert-base-arabertv02",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Arabic_MiniLM_L12_v2_all_nli_triplet = ModelMeta(
  name="Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
@@ -551,6 +638,15 @@ Omartificial_Intelligence_Space__Arabic_all_nli_triplet_Matryoshka = ModelMeta(
  # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
  name="Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
@@ -575,6 +671,15 @@ Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
  # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="sentence-transformers/LaBSE",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
  name="Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
@@ -599,6 +704,15 @@ Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
  # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="tomaarsen/mpnet-base-all-nli-triplet",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
  name="Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka",
@@ -621,6 +735,15 @@ Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
  training_datasets=set(), # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="UBC-NLP/MARBERTv2",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  consciousai__cai_lunaris_text_embeddings = ModelMeta(
  name="consciousAI/cai-lunaris-text-embeddings",
@@ -763,6 +886,12 @@ thenlper__gte_base = ModelMeta(
  training_datasets=None,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{li2023towards,
+ title={Towards general text embeddings with multi-stage contrastive learning},
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
+ journal={arXiv preprint arXiv:2308.03281},
+ year={2023}
+ }""",
  )
  thenlper__gte_large = ModelMeta(
  name="thenlper/gte-large",
@@ -785,6 +914,12 @@ thenlper__gte_large = ModelMeta(
  training_datasets=None,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{li2023towards,
+ title={Towards general text embeddings with multi-stage contrastive learning},
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
+ journal={arXiv preprint arXiv:2308.03281},
+ year={2023}
+ }""",
  )
  thenlper__gte_small = ModelMeta(
  name="thenlper/gte-small",
@@ -807,6 +942,12 @@ thenlper__gte_small = ModelMeta(
  training_datasets=None,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{li2023towards,
+ title={Towards general text embeddings with multi-stage contrastive learning},
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
+ journal={arXiv preprint arXiv:2308.03281},
+ year={2023}
+ }""",
  )
  OrlikB__KartonBERT_USE_base_v1 = ModelMeta(
  name="OrlikB/KartonBERT-USE-base-v1",
@@ -873,6 +1014,14 @@ sdadas__mmlw_e5_base = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/multilingual-e5-base",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  dwzhu__e5_base_4k = ModelMeta(
  name="dwzhu/e5-base-4k",
@@ -895,6 +1044,12 @@ dwzhu__e5_base_4k = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/e5-base-v2",
  superseded_by=None,
+ citation="""@article{zhu2024longembed,
+ title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
+ author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian},
+ journal={arXiv preprint arXiv:2404.12096},
+ year={2024}
+ }""",
  )
  sdadas__mmlw_e5_large = ModelMeta(
  name="sdadas/mmlw-e5-large",
@@ -917,6 +1072,14 @@ sdadas__mmlw_e5_large = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/multilingual-e5-large",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  sdadas__mmlw_e5_small = ModelMeta(
  name="sdadas/mmlw-e5-small",
@@ -939,6 +1102,14 @@ sdadas__mmlw_e5_small = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/multilingual-e5-small",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  sdadas__mmlw_roberta_base = ModelMeta(
  name="sdadas/mmlw-roberta-base",
@@ -961,6 +1132,14 @@ sdadas__mmlw_roberta_base = ModelMeta(
  training_datasets={"MSMARCO"},
  adapted_from="sdadas/polish-roberta-base-v2",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  sdadas__mmlw_roberta_large = ModelMeta(
  name="sdadas/mmlw-roberta-large",
@@ -983,6 +1162,14 @@ sdadas__mmlw_roberta_large = ModelMeta(
  training_datasets={"MSMARCO"},
  adapted_from="sdadas/polish-roberta-large-v2",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )

  udever_dataset = { # discussed here: https://github.com/embeddings-benchmark/mteb/issues/2193
@@ -1060,6 +1247,12 @@ izhx__udever_bloom_1b1 = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-1b1",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  izhx__udever_bloom_3b = ModelMeta(
  name="izhx/udever-bloom-3b",
@@ -1082,6 +1275,12 @@ izhx__udever_bloom_3b = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-3b",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  izhx__udever_bloom_560m = ModelMeta(
  name="izhx/udever-bloom-560m",
@@ -1104,6 +1303,12 @@ izhx__udever_bloom_560m = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-560m",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  izhx__udever_bloom_7b1 = ModelMeta(
  name="izhx/udever-bloom-7b1",
@@ -1126,6 +1331,12 @@ izhx__udever_bloom_7b1 = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-7b1",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  avsolatorio__gist_embedding_v0 = ModelMeta(
  name="avsolatorio/GIST-Embedding-v0",
@@ -1165,6 +1376,16 @@ avsolatorio__gist_embedding_v0 = ModelMeta(
  | bge_training_data,
  adapted_from="BAAI/bge-large-en-v1.5",
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
  name="avsolatorio/GIST-all-MiniLM-L6-v2",
@@ -1204,6 +1425,16 @@ avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
  | bge_training_data,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  avsolatorio__gist_large_embedding_v0 = ModelMeta(
  name="avsolatorio/GIST-large-Embedding-v0",
@@ -1243,6 +1474,16 @@ avsolatorio__gist_large_embedding_v0 = ModelMeta(
  | bge_training_data,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  avsolatorio__gist_small_embedding_v0 = ModelMeta(
  name="avsolatorio/GIST-small-Embedding-v0",
@@ -1282,6 +1523,16 @@ avsolatorio__gist_small_embedding_v0 = ModelMeta(
  | bge_training_data,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
  name="bigscience/sgpt-bloom-7b1-msmarco",
@@ -1304,6 +1555,12 @@ bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
  training_datasets=None,
  adapted_from="/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3/bloom-7b1",
  superseded_by=None,
+ citation="""@article{muennighoff2022sgpt,
+ title={SGPT: GPT Sentence Embeddings for Semantic Search},
+ author={Muennighoff, Niklas},
+ journal={arXiv preprint arXiv:2202.08904},
+ year={2022}
+ }""",
  )
  aari1995__german_semantic_sts_v2 = ModelMeta(
  name="aari1995/German_Semantic_STS_V2",
@@ -1358,6 +1615,12 @@ abhinand__medembed_small_v0_1 = ModelMeta(
  },
  adapted_from="BAAI/bge-base-en-v1.5",
  superseded_by=None,
+ citation="""@software{balachandran2024medembed,
+ author = {Balachandran, Abhinand},
+ title = {MedEmbed: Medical-Focused Embedding Models},
+ year = {2024},
+ url = {https://github.com/abhinand5/MedEmbed}
+ }""",
  )
  avsolatorio__noinstruct_small_embedding_v0 = ModelMeta(
  name="avsolatorio/NoInstruct-small-Embedding-v0",
@@ -1490,6 +1753,15 @@ omarelshehy__arabic_english_sts_matryoshka = ModelMeta(
  training_datasets=None,
  adapted_from="FacebookAI/xlm-roberta-large",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  openbmb__minicpm_embedding = ModelMeta(
  loader=sentence_transformers_loader,
@@ -1543,6 +1815,13 @@ silma_ai__silma_embedding_matryoshka_v0_1 = ModelMeta(
  training_datasets=None,
  adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250",
  superseded_by=None,
+ citation="""@misc{silma2024embedding,
+ author = {Abu Bakr Soliman, Karim Ouda, SILMA AI},
+ title = {SILMA Embedding Matryoshka 0.1},
+ year = {2024},
+ publisher = {Hugging Face},
+ howpublished = {https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1},
+ }""",
  )

  sbert_chinese_general_v1 = ModelMeta(
@@ -1683,6 +1962,15 @@ conan_embedding = ModelMeta(
  # source: https://arxiv.org/pdf/2408.15710
  training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage
  superseded_by=None,
+ citation="""@misc{li2024conanembeddinggeneraltextembedding,
+ title={Conan-embedding: General Text Embedding with More and Better Negative Samples},
+ author={Shiyu Li and Yang Tang and Shizhe Chen and Xi Chen},
+ year={2024},
+ eprint={2408.15710},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2408.15710},
+ }""",
  )

  ember_v1 = ModelMeta(
@@ -1705,4 +1993,9 @@ ember_v1 = ModelMeta(
  use_instructions=None,
  training_datasets=None,
  superseded_by=None,
+ citation="""@misc{nur2024emberv1,
+ title={ember-v1: SOTA embedding model},
+ author={Enrike Nur and Anar Aliyev},
+ year={2023},
+ }""",
  )
mteb/models/model_implementations/mod_models.py (new file)
@@ -0,0 +1,189 @@
+ from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+ from mteb.models.model_meta import ModelMeta
+ from mteb.models.models_protocols import PromptType
+
+
+ def instruction_template(
+ instruction: str, prompt_type: PromptType | None = None
+ ) -> str:
+ if not instruction or prompt_type == PromptType.document:
+ return ""
+ if isinstance(instruction, dict):
+ if prompt_type is None:
+ instruction = next(iter(instruction.values())) # TODO
+ else:
+ instruction = instruction[prompt_type]
+ return f"Instruct: {instruction}\nQuery:"
+
+
+ multilingual_langs = [
+ "afr-Latn",
+ "ara-Arab",
+ "aze-Latn",
+ "bel-Cyrl",
+ "bul-Cyrl",
+ "ben-Beng",
+ "cat-Latn",
+ "ceb-Latn",
+ "ces-Latn",
+ "cym-Latn",
+ "dan-Latn",
+ "deu-Latn",
+ "ell-Grek",
+ "eng-Latn",
+ "spa-Latn",
+ "est-Latn",
+ "eus-Latn",
+ "fas-Arab",
+ "fin-Latn",
+ "fra-Latn",
+ "glg-Latn",
+ "guj-Gujr",
+ "heb-Hebr",
+ "hin-Deva",
+ "hrv-Latn",
+ "hat-Latn",
+ "hun-Latn",
+ "hye-Armn",
+ "ind-Latn",
+ "isl-Latn",
+ "ita-Latn",
+ "jpn-Jpan",
+ "jav-Latn",
+ "kat-Geor",
+ "kaz-Cyrl",
+ "khm-Khmr",
+ "kan-Knda",
+ "kor-Hang",
+ "kir-Cyrl",
+ "lao-Laoo",
+ "lit-Latn",
+ "lav-Latn",
+ "mkd-Cyrl",
+ "mal-Mlym",
+ "mon-Cyrl",
+ "mar-Deva",
+ "msa-Latn",
+ "mya-Mymr",
+ "nep-Deva",
+ "nld-Latn",
+ "nor-Latn",
+ "nob-Latn",
+ "nno-Latn",
+ "pan-Guru",
+ "pol-Latn",
+ "por-Latn",
+ "que-Latn",
+ "ron-Latn",
+ "rus-Cyrl",
+ "sin-Sinh",
+ "slk-Latn",
+ "slv-Latn",
+ "swa-Latn",
+ "tam-Taml",
+ "tel-Telu",
+ "tha-Thai",
+ "tgl-Latn",
+ "tur-Latn",
+ "ukr-Cyrl",
+ "urd-Arab",
+ "vie-Latn",
+ "yor-Latn",
+ "zho-Hans",
+ ]
+
+ MOD_CITATION = """@misc{mod-embedding-2025,
+ title={MoD-Embedding: A Fine-tuned Multilingual Text Embedding Model},
+ author={MoD Team},
+ year={2025},
+ url={https://huggingface.co/bflhc/MoD-Embedding}
+ }"""
+
+ training_data = {
+ "T2Retrieval",
+ "DuRetrieval",
+ "MMarcoReranking",
+ "CMedQAv2-reranking",
+ "NQ",
+ "MSMARCO",
+ "HotpotQA",
+ "FEVER",
+ "MrTidyRetrieval",
+ "MIRACLRetrieval",
+ "CodeSearchNet",
+ }
+
+ # Predefined prompts for various RTEB tasks
+ _PREDEFINED_PROMPTS = {
+ # ========== Open Datasets ==========
+ # Legal domain
+ "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
+ "AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
+ "LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
+ "LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
+ # Code domain
+ "AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
+ "HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
+ "MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
+ "DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
+ "FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
+ # Finance domain
+ "FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
+ "FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
+ "HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
+ # Medical domain
+ "CUREv1": "Given a medical query, retrieve relevant clinical documents",
+ "ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
+ # SQL domain
+ "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
+ # Multilingual
+ "MIRACLRetrievalHardNegatives": "Given a query, retrieve relevant passages",
+ # ========== Private/Closed Datasets ==========
+ # Code domain (Private)
+ "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
+ "JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
+ # Finance domain (Private)
+ "EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
+ "EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
+ "EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
+ "EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
+ # Healthcare domain (Private)
+ "EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
+ "GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
+ # Legal domain (Private)
+ "FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+ "GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+ "JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+ # General/Multilingual (Private)
+ "French1Retrieval": "Given a query, retrieve relevant passages",
+ "German1Retrieval": "Given a query, retrieve relevant passages",
+ }
+
+
+ MoD_Embedding = ModelMeta(
+ loader=InstructSentenceTransformerModel,
+ loader_kwargs=dict(
+ instruction_template=instruction_template,
+ apply_instruction_to_passages=False,
+ prompts_dict=_PREDEFINED_PROMPTS,
+ ),
+ name="bflhc/MoD-Embedding",
+ languages=multilingual_langs,
+ open_weights=True,
+ revision="acbb5b70fdab262226a6af2bc62001de8021b05c",
+ release_date="2025-12-14",
+ n_parameters=4021774336,
+ memory_usage_mb=7671,
+ embed_dim=2560,
+ max_tokens=32768,
+ license="apache-2.0",
+ reference="https://huggingface.co/bflhc/MoD-Embedding",
+ similarity_fn_name="cosine",
+ framework=["Sentence Transformers", "PyTorch"],
+ use_instructions=True,
+ public_training_code=None,
+ public_training_data=None,
+ training_datasets=training_data,
+ citation=MOD_CITATION,
+ adapted_from="Qwen/Qwen3-Embedding-4B",
+ )
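The new mod_models.py registers bflhc/MoD-Embedding as an instruction-following Sentence Transformers model. A minimal, untested sketch of running it through mteb's standard entry points, assuming the checkpoint resolves from the Hugging Face Hub and the 2.x evaluate API is used (the task chosen here is only illustrative):

    import mteb

    # Load the model via its registered ModelMeta (downloads the checkpoint on first use).
    model = mteb.get_model("bflhc/MoD-Embedding")

    # Evaluate on a small retrieval task; results land in the default results folder.
    tasks = mteb.get_tasks(tasks=["NFCorpus"])
    results = mteb.evaluate(model, tasks)
    print(results)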