mteb 2.4.1__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff shows the changes between two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (40)
  1. mteb/benchmarks/benchmark.py +31 -13
  2. mteb/benchmarks/benchmarks/benchmarks.py +2 -2
  3. mteb/cache.py +36 -7
  4. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  5. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  6. mteb/models/model_implementations/andersborges.py +12 -0
  7. mteb/models/model_implementations/bge_models.py +43 -0
  8. mteb/models/model_implementations/codefuse_models.py +144 -0
  9. mteb/models/model_implementations/dino_models.py +152 -0
  10. mteb/models/model_implementations/emillykkejensen_models.py +18 -0
  11. mteb/models/model_implementations/euler_models.py +6 -0
  12. mteb/models/model_implementations/fa_models.py +50 -0
  13. mteb/models/model_implementations/facebookai.py +44 -0
  14. mteb/models/model_implementations/gte_models.py +69 -0
  15. mteb/models/model_implementations/kalm_models.py +38 -0
  16. mteb/models/model_implementations/kblab.py +6 -0
  17. mteb/models/model_implementations/kowshik24_models.py +9 -0
  18. mteb/models/model_implementations/misc_models.py +293 -0
  19. mteb/models/model_implementations/mod_models.py +10 -23
  20. mteb/models/model_implementations/mxbai_models.py +6 -0
  21. mteb/models/model_implementations/nomic_models.py +8 -0
  22. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +5 -3
  23. mteb/models/model_implementations/pylate_models.py +33 -0
  24. mteb/models/model_implementations/ru_sentence_models.py +22 -0
  25. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  26. mteb/models/model_implementations/sentence_transformers_models.py +39 -0
  27. mteb/models/model_implementations/spartan8806_atles_champion.py +7 -0
  28. mteb/models/model_implementations/ua_sentence_models.py +9 -0
  29. mteb/models/model_implementations/vi_vn_models.py +33 -0
  30. mteb/results/benchmark_results.py +22 -4
  31. mteb/tasks/classification/tur/__init__.py +4 -0
  32. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  33. mteb/tasks/retrieval/kor/__init__.py +2 -1
  34. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  35. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/METADATA +1 -1
  36. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/RECORD +40 -35
  37. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/WHEEL +0 -0
  38. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/entry_points.txt +0 -0
  39. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/licenses/LICENSE +0 -0
  40. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/top_level.txt +0 -0
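The bulk of this release is metadata-only: dozens of ModelMeta entries in misc_models.py (and a handful of other model files below) gain a `citation` keyword holding a BibTeX string. A minimal sketch of reading that field back through mteb's metadata lookup; it assumes `mteb.get_model_meta()` is exposed as in earlier 2.x releases and does not download any weights:

import mteb

# Fetch the registry entry for one of the models touched in this diff
# and print the BibTeX string added in 2.5.0.
meta = mteb.get_model_meta("thenlper/gte-base")
print(meta.citation)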
mteb/models/model_implementations/misc_models.py
@@ -127,6 +127,15 @@ Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta(
  },
  adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )

  Hum_Works__lodestone_base_4096_v1 = ModelMeta(
@@ -250,6 +259,29 @@ Lajavaness__bilingual_embedding_base = ModelMeta(
  training_datasets=bilingual_embedding_training_data,
  adapted_from="dangvantuan/bilingual_impl",
  superseded_by=None,
+ citation="""
+ @article{conneau2019unsupervised,
+ title={Unsupervised cross-lingual representation learning at scale},
+ author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
+ journal={arXiv preprint arXiv:1911.02116},
+ year={2019}
+ }
+
+ @article{reimers2019sentence,
+ title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
+ author={Nils Reimers, Iryna Gurevych},
+ journal={https://arxiv.org/abs/1908.10084},
+ year={2019}
+ }
+
+ @article{thakur2020augmented,
+ title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
+ author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
+ journal={arXiv e-prints},
+ pages={arXiv--2010},
+ year={2020}
+ }
+ """,
  )
  Lajavaness__bilingual_embedding_large = ModelMeta(
  name="Lajavaness/bilingual-embedding-large",
@@ -275,6 +307,29 @@ Lajavaness__bilingual_embedding_large = ModelMeta(
  training_datasets=bilingual_embedding_training_data,
  adapted_from="dangvantuan/bilingual_impl",
  superseded_by=None,
+ citation="""
+ @article{conneau2019unsupervised,
+ title={Unsupervised cross-lingual representation learning at scale},
+ author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
+ journal={arXiv preprint arXiv:1911.02116},
+ year={2019}
+ }
+
+ @article{reimers2019sentence,
+ title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
+ author={Nils Reimers, Iryna Gurevych},
+ journal={https://arxiv.org/abs/1908.10084},
+ year={2019}
+ }
+
+ @article{thakur2020augmented,
+ title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
+ author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
+ journal={arXiv e-prints},
+ pages={arXiv--2010},
+ year={2020}
+ }
+ """,
  )
  Lajavaness__bilingual_embedding_small = ModelMeta(
  name="Lajavaness/bilingual-embedding-small",
@@ -300,6 +355,29 @@ Lajavaness__bilingual_embedding_small = ModelMeta(
  training_datasets=bilingual_embedding_training_data,
  adapted_from="dangvantuan/bilingual_impl",
  superseded_by=None,
+ citation="""
+ @article{conneau2019unsupervised,
+ title={Unsupervised cross-lingual representation learning at scale},
+ author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
+ journal={arXiv preprint arXiv:1911.02116},
+ year={2019}
+ }
+
+ @article{reimers2019sentence,
+ title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
+ author={Nils Reimers, Iryna Gurevych},
+ journal={https://arxiv.org/abs/1908.10084},
+ year={2019}
+ }
+
+ @article{thakur2020augmented,
+ title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
+ author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
+ journal={arXiv e-prints},
+ pages={arXiv--2010},
+ year={2020}
+ }
+ """,
  )
  Mihaiii__Bulbasaur = ModelMeta(
  name="Mihaiii/Bulbasaur",
@@ -503,6 +581,15 @@ Omartificial_Intelligence_Space__Arabert_all_nli_triplet_Matryoshka = ModelMeta(
  training_datasets=set(), # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="aubmindlab/bert-base-arabertv02",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Arabic_MiniLM_L12_v2_all_nli_triplet = ModelMeta(
  name="Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
@@ -551,6 +638,15 @@ Omartificial_Intelligence_Space__Arabic_all_nli_triplet_Matryoshka = ModelMeta(
  # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
  name="Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
@@ -575,6 +671,15 @@ Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
  # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="sentence-transformers/LaBSE",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
  name="Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
@@ -599,6 +704,15 @@ Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
  # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="tomaarsen/mpnet-base-all-nli-triplet",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
  name="Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka",
@@ -621,6 +735,15 @@ Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
  training_datasets=set(), # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="UBC-NLP/MARBERTv2",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  consciousai__cai_lunaris_text_embeddings = ModelMeta(
  name="consciousAI/cai-lunaris-text-embeddings",
@@ -763,6 +886,12 @@ thenlper__gte_base = ModelMeta(
  training_datasets=None,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{li2023towards,
+ title={Towards general text embeddings with multi-stage contrastive learning},
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
+ journal={arXiv preprint arXiv:2308.03281},
+ year={2023}
+ }""",
  )
  thenlper__gte_large = ModelMeta(
  name="thenlper/gte-large",
@@ -785,6 +914,12 @@ thenlper__gte_large = ModelMeta(
  training_datasets=None,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{li2023towards,
+ title={Towards general text embeddings with multi-stage contrastive learning},
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
+ journal={arXiv preprint arXiv:2308.03281},
+ year={2023}
+ }""",
  )
  thenlper__gte_small = ModelMeta(
  name="thenlper/gte-small",
@@ -807,6 +942,12 @@ thenlper__gte_small = ModelMeta(
  training_datasets=None,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{li2023towards,
+ title={Towards general text embeddings with multi-stage contrastive learning},
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
+ journal={arXiv preprint arXiv:2308.03281},
+ year={2023}
+ }""",
  )
  OrlikB__KartonBERT_USE_base_v1 = ModelMeta(
  name="OrlikB/KartonBERT-USE-base-v1",
@@ -873,6 +1014,14 @@ sdadas__mmlw_e5_base = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/multilingual-e5-base",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  dwzhu__e5_base_4k = ModelMeta(
  name="dwzhu/e5-base-4k",
@@ -895,6 +1044,12 @@ dwzhu__e5_base_4k = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/e5-base-v2",
  superseded_by=None,
+ citation="""@article{zhu2024longembed,
+ title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
+ author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian},
+ journal={arXiv preprint arXiv:2404.12096},
+ year={2024}
+ }""",
  )
  sdadas__mmlw_e5_large = ModelMeta(
  name="sdadas/mmlw-e5-large",
@@ -917,6 +1072,14 @@ sdadas__mmlw_e5_large = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/multilingual-e5-large",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  sdadas__mmlw_e5_small = ModelMeta(
  name="sdadas/mmlw-e5-small",
@@ -939,6 +1102,14 @@ sdadas__mmlw_e5_small = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/multilingual-e5-small",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  sdadas__mmlw_roberta_base = ModelMeta(
  name="sdadas/mmlw-roberta-base",
@@ -961,6 +1132,14 @@ sdadas__mmlw_roberta_base = ModelMeta(
  training_datasets={"MSMARCO"},
  adapted_from="sdadas/polish-roberta-base-v2",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  sdadas__mmlw_roberta_large = ModelMeta(
  name="sdadas/mmlw-roberta-large",
@@ -983,6 +1162,14 @@ sdadas__mmlw_roberta_large = ModelMeta(
  training_datasets={"MSMARCO"},
  adapted_from="sdadas/polish-roberta-large-v2",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )

  udever_dataset = { # discussed here: https://github.com/embeddings-benchmark/mteb/issues/2193
@@ -1060,6 +1247,12 @@ izhx__udever_bloom_1b1 = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-1b1",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  izhx__udever_bloom_3b = ModelMeta(
  name="izhx/udever-bloom-3b",
@@ -1082,6 +1275,12 @@ izhx__udever_bloom_3b = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-3b",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  izhx__udever_bloom_560m = ModelMeta(
  name="izhx/udever-bloom-560m",
@@ -1104,6 +1303,12 @@ izhx__udever_bloom_560m = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-560m",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  izhx__udever_bloom_7b1 = ModelMeta(
  name="izhx/udever-bloom-7b1",
@@ -1126,6 +1331,12 @@ izhx__udever_bloom_7b1 = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-7b1",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  avsolatorio__gist_embedding_v0 = ModelMeta(
  name="avsolatorio/GIST-Embedding-v0",
@@ -1165,6 +1376,16 @@ avsolatorio__gist_embedding_v0 = ModelMeta(
  | bge_training_data,
  adapted_from="BAAI/bge-large-en-v1.5",
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
  name="avsolatorio/GIST-all-MiniLM-L6-v2",
@@ -1204,6 +1425,16 @@ avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
  | bge_training_data,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  avsolatorio__gist_large_embedding_v0 = ModelMeta(
  name="avsolatorio/GIST-large-Embedding-v0",
@@ -1243,6 +1474,16 @@ avsolatorio__gist_large_embedding_v0 = ModelMeta(
  | bge_training_data,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  avsolatorio__gist_small_embedding_v0 = ModelMeta(
  name="avsolatorio/GIST-small-Embedding-v0",
@@ -1282,6 +1523,16 @@ avsolatorio__gist_small_embedding_v0 = ModelMeta(
  | bge_training_data,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
  name="bigscience/sgpt-bloom-7b1-msmarco",
@@ -1304,6 +1555,12 @@ bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
  training_datasets=None,
  adapted_from="/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3/bloom-7b1",
  superseded_by=None,
+ citation="""@article{muennighoff2022sgpt,
+ title={SGPT: GPT Sentence Embeddings for Semantic Search},
+ author={Muennighoff, Niklas},
+ journal={arXiv preprint arXiv:2202.08904},
+ year={2022}
+ }""",
  )
  aari1995__german_semantic_sts_v2 = ModelMeta(
  name="aari1995/German_Semantic_STS_V2",
@@ -1358,6 +1615,12 @@ abhinand__medembed_small_v0_1 = ModelMeta(
  },
  adapted_from="BAAI/bge-base-en-v1.5",
  superseded_by=None,
+ citation="""@software{balachandran2024medembed,
+ author = {Balachandran, Abhinand},
+ title = {MedEmbed: Medical-Focused Embedding Models},
+ year = {2024},
+ url = {https://github.com/abhinand5/MedEmbed}
+ }""",
  )
  avsolatorio__noinstruct_small_embedding_v0 = ModelMeta(
  name="avsolatorio/NoInstruct-small-Embedding-v0",
@@ -1490,6 +1753,15 @@ omarelshehy__arabic_english_sts_matryoshka = ModelMeta(
  training_datasets=None,
  adapted_from="FacebookAI/xlm-roberta-large",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  openbmb__minicpm_embedding = ModelMeta(
  loader=sentence_transformers_loader,
@@ -1543,6 +1815,13 @@ silma_ai__silma_embedding_matryoshka_v0_1 = ModelMeta(
  training_datasets=None,
  adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250",
  superseded_by=None,
+ citation="""@misc{silma2024embedding,
+ author = {Abu Bakr Soliman, Karim Ouda, SILMA AI},
+ title = {SILMA Embedding Matryoshka 0.1},
+ year = {2024},
+ publisher = {Hugging Face},
+ howpublished = {https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1},
+ }""",
  )

  sbert_chinese_general_v1 = ModelMeta(
@@ -1683,6 +1962,15 @@ conan_embedding = ModelMeta(
  # source: https://arxiv.org/pdf/2408.15710
  training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage
  superseded_by=None,
+ citation="""@misc{li2024conanembeddinggeneraltextembedding,
+ title={Conan-embedding: General Text Embedding with More and Better Negative Samples},
+ author={Shiyu Li and Yang Tang and Shizhe Chen and Xi Chen},
+ year={2024},
+ eprint={2408.15710},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2408.15710},
+ }""",
  )

  ember_v1 = ModelMeta(
@@ -1705,4 +1993,9 @@ ember_v1 = ModelMeta(
  use_instructions=None,
  training_datasets=None,
  superseded_by=None,
+ citation="""@misc{nur2024emberv1,
+ title={ember-v1: SOTA embedding model},
+ author={Enrike Nur and Anar Aliyev},
+ year={2023},
+ }""",
  )
mteb/models/model_implementations/mod_models.py
@@ -1,6 +1,6 @@
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
  from mteb.models.model_meta import ModelMeta
- from mteb.models.models_protocols import EncoderProtocol, PromptType
+ from mteb.models.models_protocols import PromptType


  def instruction_template(
@@ -114,7 +114,7 @@ training_data = {
  }

  # Predefined prompts for various RTEB tasks
- PREDEFINED_PROMPTS = {
+ _PREDEFINED_PROMPTS = {
  # ========== Open Datasets ==========
  # Legal domain
  "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
@@ -137,7 +137,7 @@ PREDEFINED_PROMPTS = {
  # SQL domain
  "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
  # Multilingual
- "MIRACLRetrievalHardNegatives": "Given a query, retrieve relevant passages",
+ "MIRACLRetrievalHardNegatives": "Given a question, retrieve Wikipedia passages that answer the question",
  # ========== Private/Closed Datasets ==========
  # Code domain (Private)
  "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
@@ -160,28 +160,15 @@ PREDEFINED_PROMPTS = {
  }


- def mod_instruct_loader(
- model_name_or_path: str, revision: str, **kwargs
- ) -> EncoderProtocol:
- # Set default prompts_dict if not provided
-
- model = InstructSentenceTransformerModel(
- model_name_or_path,
- revision=revision,
+ MoD_Embedding = ModelMeta(
+ loader=InstructSentenceTransformerModel,
+ loader_kwargs=dict(
  instruction_template=instruction_template,
  apply_instruction_to_passages=False,
- prompt_dicts=PREDEFINED_PROMPTS,
- **kwargs,
- )
- encoder = model.model._first_module()
- if encoder.auto_model.config._attn_implementation == "flash_attention_2":
- # The Qwen3 code only use left padding in flash_attention_2 mode.
- encoder.tokenizer.padding_side = "left"
- return model
-
-
- MoD_Embedding = ModelMeta(
- loader=mod_instruct_loader,
+ prompts_dict=_PREDEFINED_PROMPTS,
+ max_seq_length=18480,
+ model_kwargs={"torch_dtype": "bfloat16"},
+ ),
  name="bflhc/MoD-Embedding",
  languages=multilingual_langs,
  open_weights=True,
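The mod_models.py change replaces the hand-written `mod_instruct_loader` wrapper with a declarative `loader=InstructSentenceTransformerModel` plus `loader_kwargs`, moving prompt selection, maximum sequence length, and dtype into the ModelMeta itself. A minimal sketch of obtaining the refactored model through the public API, assuming `mteb.get_model()` resolves the loader and its kwargs as in earlier 2.x releases (this downloads the checkpoint):

import mteb

# The registry entry now instantiates InstructSentenceTransformerModel with the
# loader_kwargs shown above (prompts_dict, instruction_template,
# bfloat16 weights, max_seq_length=18480).
model = mteb.get_model("bflhc/MoD-Embedding")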
mteb/models/model_implementations/mxbai_models.py
@@ -99,4 +99,10 @@ mxbai_embed_xsmall_v1 = ModelMeta(
  public_training_code=None,
  public_training_data=None,
  training_datasets=mixedbread_training_data,
+ citation="""@online{xsmall2024mxbai,
+ title={Every Byte Matters: Introducing mxbai-embed-xsmall-v1},
+ author={Sean Lee and Julius Lipp and Rui Huang and Darius Koenig},
+ year={2024},
+ url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
+ }""",
  )
mteb/models/model_implementations/nomic_models.py
@@ -328,6 +328,14 @@ nomic_modern_bert_embed = ModelMeta(
  superseded_by=None,
  training_datasets=nomic_training_data,
  public_training_data=None,
+ citation="""@misc{nussbaum2024nomic,
+ title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
+ author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
+ year={2024},
+ eprint={2402.01613},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )

mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
@@ -65,14 +65,16 @@ class LlamaNemoretrieverColembed(AbsEncoder):
  iterator = DataLoader(images, batch_size=batch_size)

  for batch in iterator:
- for b in batch:
+ for image in batch["image"]:
  pil_img = (
- F.to_pil_image(b.to("cpu")) if not isinstance(b, Image.Image) else b
+ image
+ if isinstance(image, Image.Image)
+ else F.to_pil_image(image.to("cpu"))
  )
  all_images.append(pil_img)

  batch_size = 1
- return self.model.forward_passages(all_images, batch_size=batch_size)
+ return self.model.forward_images(all_images, batch_size=batch_size)

  def calculate_probs(self, text_embeddings, image_embeddings):
  scores = self.similarity(text_embeddings, image_embeddings)
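This fix iterates the "image" column of each dataloader batch, keeps PIL images unchanged, converts CHW tensors to PIL, and routes the result to `forward_images` instead of `forward_passages`. A standalone sketch of that conversion step, using the same torchvision call as the patched code (the helper name `to_pil` is hypothetical):

from PIL import Image
import torch
from torchvision.transforms import functional as F

def to_pil(image):
    # Keep PIL images as-is; move tensors to CPU and convert CHW -> PIL,
    # mirroring the branch added in the patched encode loop above.
    if isinstance(image, Image.Image):
        return image
    return F.to_pil_image(image.to("cpu"))

# Example: a random 3x32x32 float tensor becomes a 32x32 RGB PIL image.
print(to_pil(torch.rand(3, 32, 32)).size)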
mteb/models/model_implementations/pylate_models.py
@@ -415,6 +415,30 @@ jina_colbert_v2 = ModelMeta(
  "DuRetrieval",
  "MIRACL",
  },
+ citation="""@inproceedings{xiao-etal-2024-jina,
+ title = "{J}ina-{C}ol{BERT}-v2: A General-Purpose Multilingual Late Interaction Retriever",
+ author = {Jha, Rohan and
+ Wang, Bo and
+ G{\"u}nther, Michael and
+ Mastrapas, Georgios and
+ Sturua, Saba and
+ Mohr, Isabelle and
+ Koukounas, Andreas and
+ Wang, Mohammad Kalim and
+ Wang, Nan and
+ Xiao, Han},
+ editor = {S{\"a}lev{\"a}, Jonne and
+ Owodunni, Abraham},
+ booktitle = "Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024)",
+ month = nov,
+ year = "2024",
+ address = "Miami, Florida, USA",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2024.mrl-1.11/",
+ doi = "10.18653/v1/2024.mrl-1.11",
+ pages = "159--166",
+ abstract = "Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT`s late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context window and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that the reducing the embedding dimensionality from 128 to 64 has insignificant impact on the model`s retrieval performance and cut storage requirements by up to 50{\%}. Our new model, Jina-ColBERT-v2, demonstrates strong performance across a range of English and multilingual retrieval tasks,"
+ }""",
  )


@@ -444,4 +468,13 @@ lightonai__gte_moderncolbert_v1 = ModelMeta(
  "MSMARCO",
  "mMARCO-NL",
  },
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084"
+ }""",
  )