mteb 2.3.11__py3-none-any.whl → 2.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +57 -0
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/evaluate.py +2 -33
- mteb/leaderboard/figures.py +1 -1
- mteb/leaderboard/table.py +1 -11
- mteb/models/abs_encoder.py +21 -17
- mteb/models/get_model_meta.py +3 -123
- mteb/models/instruct_wrapper.py +2 -1
- mteb/models/model_implementations/andersborges.py +12 -0
- mteb/models/model_implementations/bge_models.py +43 -0
- mteb/models/model_implementations/bica_model.py +34 -0
- mteb/models/model_implementations/dino_models.py +152 -0
- mteb/models/model_implementations/emillykkejensen_models.py +18 -0
- mteb/models/model_implementations/euler_models.py +6 -0
- mteb/models/model_implementations/fa_models.py +50 -0
- mteb/models/model_implementations/facebookai.py +44 -0
- mteb/models/model_implementations/google_models.py +10 -0
- mteb/models/model_implementations/gte_models.py +69 -0
- mteb/models/model_implementations/kalm_models.py +38 -0
- mteb/models/model_implementations/kblab.py +6 -0
- mteb/models/model_implementations/kowshik24_models.py +9 -0
- mteb/models/model_implementations/misc_models.py +293 -0
- mteb/models/model_implementations/mod_models.py +189 -0
- mteb/models/model_implementations/mxbai_models.py +6 -0
- mteb/models/model_implementations/nomic_models.py +150 -4
- mteb/models/model_implementations/pylate_models.py +33 -0
- mteb/models/model_implementations/ru_sentence_models.py +22 -0
- mteb/models/model_implementations/sentence_transformers_models.py +39 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +7 -0
- mteb/models/model_implementations/ua_sentence_models.py +9 -0
- mteb/models/model_implementations/vi_vn_models.py +33 -0
- mteb/models/model_meta.py +396 -19
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/METADATA +1 -1
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/RECORD +55 -41
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/WHEEL +0 -0
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/top_level.txt +0 -0
|
@@ -127,6 +127,15 @@ Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta(
|
|
|
127
127
|
},
|
|
128
128
|
adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
|
|
129
129
|
superseded_by=None,
|
|
130
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
131
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
132
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
133
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
134
|
+
month = "11",
|
|
135
|
+
year = "2019",
|
|
136
|
+
publisher = "Association for Computational Linguistics",
|
|
137
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
138
|
+
}""",
|
|
130
139
|
)
|
|
131
140
|
|
|
132
141
|
Hum_Works__lodestone_base_4096_v1 = ModelMeta(
|
|
@@ -250,6 +259,29 @@ Lajavaness__bilingual_embedding_base = ModelMeta(
|
|
|
250
259
|
training_datasets=bilingual_embedding_training_data,
|
|
251
260
|
adapted_from="dangvantuan/bilingual_impl",
|
|
252
261
|
superseded_by=None,
|
|
262
|
+
citation="""
|
|
263
|
+
@article{conneau2019unsupervised,
|
|
264
|
+
title={Unsupervised cross-lingual representation learning at scale},
|
|
265
|
+
author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
|
|
266
|
+
journal={arXiv preprint arXiv:1911.02116},
|
|
267
|
+
year={2019}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
@article{reimers2019sentence,
|
|
271
|
+
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
|
|
272
|
+
author={Nils Reimers, Iryna Gurevych},
|
|
273
|
+
journal={https://arxiv.org/abs/1908.10084},
|
|
274
|
+
year={2019}
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
@article{thakur2020augmented,
|
|
278
|
+
title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
|
|
279
|
+
author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
|
|
280
|
+
journal={arXiv e-prints},
|
|
281
|
+
pages={arXiv--2010},
|
|
282
|
+
year={2020}
|
|
283
|
+
}
|
|
284
|
+
""",
|
|
253
285
|
)
|
|
254
286
|
Lajavaness__bilingual_embedding_large = ModelMeta(
|
|
255
287
|
name="Lajavaness/bilingual-embedding-large",
|
|
@@ -275,6 +307,29 @@ Lajavaness__bilingual_embedding_large = ModelMeta(
|
|
|
275
307
|
training_datasets=bilingual_embedding_training_data,
|
|
276
308
|
adapted_from="dangvantuan/bilingual_impl",
|
|
277
309
|
superseded_by=None,
|
|
310
|
+
citation="""
|
|
311
|
+
@article{conneau2019unsupervised,
|
|
312
|
+
title={Unsupervised cross-lingual representation learning at scale},
|
|
313
|
+
author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
|
|
314
|
+
journal={arXiv preprint arXiv:1911.02116},
|
|
315
|
+
year={2019}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
@article{reimers2019sentence,
|
|
319
|
+
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
|
|
320
|
+
author={Nils Reimers, Iryna Gurevych},
|
|
321
|
+
journal={https://arxiv.org/abs/1908.10084},
|
|
322
|
+
year={2019}
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
@article{thakur2020augmented,
|
|
326
|
+
title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
|
|
327
|
+
author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
|
|
328
|
+
journal={arXiv e-prints},
|
|
329
|
+
pages={arXiv--2010},
|
|
330
|
+
year={2020}
|
|
331
|
+
}
|
|
332
|
+
""",
|
|
278
333
|
)
|
|
279
334
|
Lajavaness__bilingual_embedding_small = ModelMeta(
|
|
280
335
|
name="Lajavaness/bilingual-embedding-small",
|
|
@@ -300,6 +355,29 @@ Lajavaness__bilingual_embedding_small = ModelMeta(
|
|
|
300
355
|
training_datasets=bilingual_embedding_training_data,
|
|
301
356
|
adapted_from="dangvantuan/bilingual_impl",
|
|
302
357
|
superseded_by=None,
|
|
358
|
+
citation="""
|
|
359
|
+
@article{conneau2019unsupervised,
|
|
360
|
+
title={Unsupervised cross-lingual representation learning at scale},
|
|
361
|
+
author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
|
|
362
|
+
journal={arXiv preprint arXiv:1911.02116},
|
|
363
|
+
year={2019}
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
@article{reimers2019sentence,
|
|
367
|
+
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
|
|
368
|
+
author={Nils Reimers, Iryna Gurevych},
|
|
369
|
+
journal={https://arxiv.org/abs/1908.10084},
|
|
370
|
+
year={2019}
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
@article{thakur2020augmented,
|
|
374
|
+
title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
|
|
375
|
+
author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
|
|
376
|
+
journal={arXiv e-prints},
|
|
377
|
+
pages={arXiv--2010},
|
|
378
|
+
year={2020}
|
|
379
|
+
}
|
|
380
|
+
""",
|
|
303
381
|
)
|
|
304
382
|
Mihaiii__Bulbasaur = ModelMeta(
|
|
305
383
|
name="Mihaiii/Bulbasaur",
|
|
@@ -503,6 +581,15 @@ Omartificial_Intelligence_Space__Arabert_all_nli_triplet_Matryoshka = ModelMeta(
|
|
|
503
581
|
training_datasets=set(), # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
504
582
|
adapted_from="aubmindlab/bert-base-arabertv02",
|
|
505
583
|
superseded_by=None,
|
|
584
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
585
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
586
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
587
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
588
|
+
month = "11",
|
|
589
|
+
year = "2019",
|
|
590
|
+
publisher = "Association for Computational Linguistics",
|
|
591
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
592
|
+
}""",
|
|
506
593
|
)
|
|
507
594
|
Omartificial_Intelligence_Space__Arabic_MiniLM_L12_v2_all_nli_triplet = ModelMeta(
|
|
508
595
|
name="Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
|
|
@@ -551,6 +638,15 @@ Omartificial_Intelligence_Space__Arabic_all_nli_triplet_Matryoshka = ModelMeta(
|
|
|
551
638
|
# {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
552
639
|
adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
|
|
553
640
|
superseded_by=None,
|
|
641
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
642
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
643
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
644
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
645
|
+
month = "11",
|
|
646
|
+
year = "2019",
|
|
647
|
+
publisher = "Association for Computational Linguistics",
|
|
648
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
649
|
+
}""",
|
|
554
650
|
)
|
|
555
651
|
Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
|
|
556
652
|
name="Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
|
|
@@ -575,6 +671,15 @@ Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
|
|
|
575
671
|
# {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
576
672
|
adapted_from="sentence-transformers/LaBSE",
|
|
577
673
|
superseded_by=None,
|
|
674
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
675
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
676
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
677
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
678
|
+
month = "11",
|
|
679
|
+
year = "2019",
|
|
680
|
+
publisher = "Association for Computational Linguistics",
|
|
681
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
682
|
+
}""",
|
|
578
683
|
)
|
|
579
684
|
Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
|
|
580
685
|
name="Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
|
|
@@ -599,6 +704,15 @@ Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
|
|
|
599
704
|
# {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
600
705
|
adapted_from="tomaarsen/mpnet-base-all-nli-triplet",
|
|
601
706
|
superseded_by=None,
|
|
707
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
708
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
709
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
710
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
711
|
+
month = "11",
|
|
712
|
+
year = "2019",
|
|
713
|
+
publisher = "Association for Computational Linguistics",
|
|
714
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
715
|
+
}""",
|
|
602
716
|
)
|
|
603
717
|
Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
|
|
604
718
|
name="Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka",
|
|
@@ -621,6 +735,15 @@ Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
|
|
|
621
735
|
training_datasets=set(), # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
622
736
|
adapted_from="UBC-NLP/MARBERTv2",
|
|
623
737
|
superseded_by=None,
|
|
738
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
739
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
740
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
741
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
742
|
+
month = "11",
|
|
743
|
+
year = "2019",
|
|
744
|
+
publisher = "Association for Computational Linguistics",
|
|
745
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
746
|
+
}""",
|
|
624
747
|
)
|
|
625
748
|
consciousai__cai_lunaris_text_embeddings = ModelMeta(
|
|
626
749
|
name="consciousAI/cai-lunaris-text-embeddings",
|
|
@@ -763,6 +886,12 @@ thenlper__gte_base = ModelMeta(
|
|
|
763
886
|
training_datasets=None,
|
|
764
887
|
adapted_from=None,
|
|
765
888
|
superseded_by=None,
|
|
889
|
+
citation="""@article{li2023towards,
|
|
890
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
891
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
892
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
893
|
+
year={2023}
|
|
894
|
+
}""",
|
|
766
895
|
)
|
|
767
896
|
thenlper__gte_large = ModelMeta(
|
|
768
897
|
name="thenlper/gte-large",
|
|
@@ -785,6 +914,12 @@ thenlper__gte_large = ModelMeta(
|
|
|
785
914
|
training_datasets=None,
|
|
786
915
|
adapted_from=None,
|
|
787
916
|
superseded_by=None,
|
|
917
|
+
citation="""@article{li2023towards,
|
|
918
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
919
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
920
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
921
|
+
year={2023}
|
|
922
|
+
}""",
|
|
788
923
|
)
|
|
789
924
|
thenlper__gte_small = ModelMeta(
|
|
790
925
|
name="thenlper/gte-small",
|
|
@@ -807,6 +942,12 @@ thenlper__gte_small = ModelMeta(
|
|
|
807
942
|
training_datasets=None,
|
|
808
943
|
adapted_from=None,
|
|
809
944
|
superseded_by=None,
|
|
945
|
+
citation="""@article{li2023towards,
|
|
946
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
947
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
948
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
949
|
+
year={2023}
|
|
950
|
+
}""",
|
|
810
951
|
)
|
|
811
952
|
OrlikB__KartonBERT_USE_base_v1 = ModelMeta(
|
|
812
953
|
name="OrlikB/KartonBERT-USE-base-v1",
|
|
@@ -873,6 +1014,14 @@ sdadas__mmlw_e5_base = ModelMeta(
|
|
|
873
1014
|
training_datasets=E5_TRAINING_DATA,
|
|
874
1015
|
adapted_from="intfloat/multilingual-e5-base",
|
|
875
1016
|
superseded_by=None,
|
|
1017
|
+
citation="""@article{dadas2024pirb,
|
|
1018
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1019
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1020
|
+
year={2024},
|
|
1021
|
+
eprint={2402.13350},
|
|
1022
|
+
archivePrefix={arXiv},
|
|
1023
|
+
primaryClass={cs.CL}
|
|
1024
|
+
}""",
|
|
876
1025
|
)
|
|
877
1026
|
dwzhu__e5_base_4k = ModelMeta(
|
|
878
1027
|
name="dwzhu/e5-base-4k",
|
|
@@ -895,6 +1044,12 @@ dwzhu__e5_base_4k = ModelMeta(
|
|
|
895
1044
|
training_datasets=E5_TRAINING_DATA,
|
|
896
1045
|
adapted_from="intfloat/e5-base-v2",
|
|
897
1046
|
superseded_by=None,
|
|
1047
|
+
citation="""@article{zhu2024longembed,
|
|
1048
|
+
title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
|
|
1049
|
+
author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian},
|
|
1050
|
+
journal={arXiv preprint arXiv:2404.12096},
|
|
1051
|
+
year={2024}
|
|
1052
|
+
}""",
|
|
898
1053
|
)
|
|
899
1054
|
sdadas__mmlw_e5_large = ModelMeta(
|
|
900
1055
|
name="sdadas/mmlw-e5-large",
|
|
@@ -917,6 +1072,14 @@ sdadas__mmlw_e5_large = ModelMeta(
|
|
|
917
1072
|
training_datasets=E5_TRAINING_DATA,
|
|
918
1073
|
adapted_from="intfloat/multilingual-e5-large",
|
|
919
1074
|
superseded_by=None,
|
|
1075
|
+
citation="""@article{dadas2024pirb,
|
|
1076
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1077
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1078
|
+
year={2024},
|
|
1079
|
+
eprint={2402.13350},
|
|
1080
|
+
archivePrefix={arXiv},
|
|
1081
|
+
primaryClass={cs.CL}
|
|
1082
|
+
}""",
|
|
920
1083
|
)
|
|
921
1084
|
sdadas__mmlw_e5_small = ModelMeta(
|
|
922
1085
|
name="sdadas/mmlw-e5-small",
|
|
@@ -939,6 +1102,14 @@ sdadas__mmlw_e5_small = ModelMeta(
|
|
|
939
1102
|
training_datasets=E5_TRAINING_DATA,
|
|
940
1103
|
adapted_from="intfloat/multilingual-e5-small",
|
|
941
1104
|
superseded_by=None,
|
|
1105
|
+
citation="""@article{dadas2024pirb,
|
|
1106
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1107
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1108
|
+
year={2024},
|
|
1109
|
+
eprint={2402.13350},
|
|
1110
|
+
archivePrefix={arXiv},
|
|
1111
|
+
primaryClass={cs.CL}
|
|
1112
|
+
}""",
|
|
942
1113
|
)
|
|
943
1114
|
sdadas__mmlw_roberta_base = ModelMeta(
|
|
944
1115
|
name="sdadas/mmlw-roberta-base",
|
|
@@ -961,6 +1132,14 @@ sdadas__mmlw_roberta_base = ModelMeta(
|
|
|
961
1132
|
training_datasets={"MSMARCO"},
|
|
962
1133
|
adapted_from="sdadas/polish-roberta-base-v2",
|
|
963
1134
|
superseded_by=None,
|
|
1135
|
+
citation="""@article{dadas2024pirb,
|
|
1136
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1137
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1138
|
+
year={2024},
|
|
1139
|
+
eprint={2402.13350},
|
|
1140
|
+
archivePrefix={arXiv},
|
|
1141
|
+
primaryClass={cs.CL}
|
|
1142
|
+
}""",
|
|
964
1143
|
)
|
|
965
1144
|
sdadas__mmlw_roberta_large = ModelMeta(
|
|
966
1145
|
name="sdadas/mmlw-roberta-large",
|
|
@@ -983,6 +1162,14 @@ sdadas__mmlw_roberta_large = ModelMeta(
|
|
|
983
1162
|
training_datasets={"MSMARCO"},
|
|
984
1163
|
adapted_from="sdadas/polish-roberta-large-v2",
|
|
985
1164
|
superseded_by=None,
|
|
1165
|
+
citation="""@article{dadas2024pirb,
|
|
1166
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1167
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1168
|
+
year={2024},
|
|
1169
|
+
eprint={2402.13350},
|
|
1170
|
+
archivePrefix={arXiv},
|
|
1171
|
+
primaryClass={cs.CL}
|
|
1172
|
+
}""",
|
|
986
1173
|
)
|
|
987
1174
|
|
|
988
1175
|
udever_dataset = { # discussed here: https://github.com/embeddings-benchmark/mteb/issues/2193
|
|
@@ -1060,6 +1247,12 @@ izhx__udever_bloom_1b1 = ModelMeta(
|
|
|
1060
1247
|
training_datasets=udever_dataset,
|
|
1061
1248
|
adapted_from="bigscience/bloom-1b1",
|
|
1062
1249
|
superseded_by=None,
|
|
1250
|
+
citation="""@article{zhang2023language,
|
|
1251
|
+
title={Language Models are Universal Embedders},
|
|
1252
|
+
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
|
|
1253
|
+
journal={arXiv preprint arXiv:2310.08232},
|
|
1254
|
+
year={2023}
|
|
1255
|
+
}""",
|
|
1063
1256
|
)
|
|
1064
1257
|
izhx__udever_bloom_3b = ModelMeta(
|
|
1065
1258
|
name="izhx/udever-bloom-3b",
|
|
@@ -1082,6 +1275,12 @@ izhx__udever_bloom_3b = ModelMeta(
|
|
|
1082
1275
|
training_datasets=udever_dataset,
|
|
1083
1276
|
adapted_from="bigscience/bloom-3b",
|
|
1084
1277
|
superseded_by=None,
|
|
1278
|
+
citation="""@article{zhang2023language,
|
|
1279
|
+
title={Language Models are Universal Embedders},
|
|
1280
|
+
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
|
|
1281
|
+
journal={arXiv preprint arXiv:2310.08232},
|
|
1282
|
+
year={2023}
|
|
1283
|
+
}""",
|
|
1085
1284
|
)
|
|
1086
1285
|
izhx__udever_bloom_560m = ModelMeta(
|
|
1087
1286
|
name="izhx/udever-bloom-560m",
|
|
@@ -1104,6 +1303,12 @@ izhx__udever_bloom_560m = ModelMeta(
|
|
|
1104
1303
|
training_datasets=udever_dataset,
|
|
1105
1304
|
adapted_from="bigscience/bloom-560m",
|
|
1106
1305
|
superseded_by=None,
|
|
1306
|
+
citation="""@article{zhang2023language,
|
|
1307
|
+
title={Language Models are Universal Embedders},
|
|
1308
|
+
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
|
|
1309
|
+
journal={arXiv preprint arXiv:2310.08232},
|
|
1310
|
+
year={2023}
|
|
1311
|
+
}""",
|
|
1107
1312
|
)
|
|
1108
1313
|
izhx__udever_bloom_7b1 = ModelMeta(
|
|
1109
1314
|
name="izhx/udever-bloom-7b1",
|
|
@@ -1126,6 +1331,12 @@ izhx__udever_bloom_7b1 = ModelMeta(
|
|
|
1126
1331
|
training_datasets=udever_dataset,
|
|
1127
1332
|
adapted_from="bigscience/bloom-7b1",
|
|
1128
1333
|
superseded_by=None,
|
|
1334
|
+
citation="""@article{zhang2023language,
|
|
1335
|
+
title={Language Models are Universal Embedders},
|
|
1336
|
+
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
|
|
1337
|
+
journal={arXiv preprint arXiv:2310.08232},
|
|
1338
|
+
year={2023}
|
|
1339
|
+
}""",
|
|
1129
1340
|
)
|
|
1130
1341
|
avsolatorio__gist_embedding_v0 = ModelMeta(
|
|
1131
1342
|
name="avsolatorio/GIST-Embedding-v0",
|
|
@@ -1165,6 +1376,16 @@ avsolatorio__gist_embedding_v0 = ModelMeta(
|
|
|
1165
1376
|
| bge_training_data,
|
|
1166
1377
|
adapted_from="BAAI/bge-large-en-v1.5",
|
|
1167
1378
|
superseded_by=None,
|
|
1379
|
+
citation="""@article{solatorio2024gistembed,
|
|
1380
|
+
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
|
|
1381
|
+
author={Aivin V. Solatorio},
|
|
1382
|
+
journal={arXiv preprint arXiv:2402.16829},
|
|
1383
|
+
year={2024},
|
|
1384
|
+
URL={https://arxiv.org/abs/2402.16829}
|
|
1385
|
+
eprint={2402.16829},
|
|
1386
|
+
archivePrefix={arXiv},
|
|
1387
|
+
primaryClass={cs.LG}
|
|
1388
|
+
}""",
|
|
1168
1389
|
)
|
|
1169
1390
|
avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
|
|
1170
1391
|
name="avsolatorio/GIST-all-MiniLM-L6-v2",
|
|
@@ -1204,6 +1425,16 @@ avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
|
|
|
1204
1425
|
| bge_training_data,
|
|
1205
1426
|
adapted_from=None,
|
|
1206
1427
|
superseded_by=None,
|
|
1428
|
+
citation="""@article{solatorio2024gistembed,
|
|
1429
|
+
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
|
|
1430
|
+
author={Aivin V. Solatorio},
|
|
1431
|
+
journal={arXiv preprint arXiv:2402.16829},
|
|
1432
|
+
year={2024},
|
|
1433
|
+
URL={https://arxiv.org/abs/2402.16829}
|
|
1434
|
+
eprint={2402.16829},
|
|
1435
|
+
archivePrefix={arXiv},
|
|
1436
|
+
primaryClass={cs.LG}
|
|
1437
|
+
}""",
|
|
1207
1438
|
)
|
|
1208
1439
|
avsolatorio__gist_large_embedding_v0 = ModelMeta(
|
|
1209
1440
|
name="avsolatorio/GIST-large-Embedding-v0",
|
|
@@ -1243,6 +1474,16 @@ avsolatorio__gist_large_embedding_v0 = ModelMeta(
|
|
|
1243
1474
|
| bge_training_data,
|
|
1244
1475
|
adapted_from=None,
|
|
1245
1476
|
superseded_by=None,
|
|
1477
|
+
citation="""@article{solatorio2024gistembed,
|
|
1478
|
+
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
|
|
1479
|
+
author={Aivin V. Solatorio},
|
|
1480
|
+
journal={arXiv preprint arXiv:2402.16829},
|
|
1481
|
+
year={2024},
|
|
1482
|
+
URL={https://arxiv.org/abs/2402.16829}
|
|
1483
|
+
eprint={2402.16829},
|
|
1484
|
+
archivePrefix={arXiv},
|
|
1485
|
+
primaryClass={cs.LG}
|
|
1486
|
+
}""",
|
|
1246
1487
|
)
|
|
1247
1488
|
avsolatorio__gist_small_embedding_v0 = ModelMeta(
|
|
1248
1489
|
name="avsolatorio/GIST-small-Embedding-v0",
|
|
@@ -1282,6 +1523,16 @@ avsolatorio__gist_small_embedding_v0 = ModelMeta(
|
|
|
1282
1523
|
| bge_training_data,
|
|
1283
1524
|
adapted_from=None,
|
|
1284
1525
|
superseded_by=None,
|
|
1526
|
+
citation="""@article{solatorio2024gistembed,
|
|
1527
|
+
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
|
|
1528
|
+
author={Aivin V. Solatorio},
|
|
1529
|
+
journal={arXiv preprint arXiv:2402.16829},
|
|
1530
|
+
year={2024},
|
|
1531
|
+
URL={https://arxiv.org/abs/2402.16829}
|
|
1532
|
+
eprint={2402.16829},
|
|
1533
|
+
archivePrefix={arXiv},
|
|
1534
|
+
primaryClass={cs.LG}
|
|
1535
|
+
}""",
|
|
1285
1536
|
)
|
|
1286
1537
|
bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
|
|
1287
1538
|
name="bigscience/sgpt-bloom-7b1-msmarco",
|
|
@@ -1304,6 +1555,12 @@ bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
|
|
|
1304
1555
|
training_datasets=None,
|
|
1305
1556
|
adapted_from="/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3/bloom-7b1",
|
|
1306
1557
|
superseded_by=None,
|
|
1558
|
+
citation="""@article{muennighoff2022sgpt,
|
|
1559
|
+
title={SGPT: GPT Sentence Embeddings for Semantic Search},
|
|
1560
|
+
author={Muennighoff, Niklas},
|
|
1561
|
+
journal={arXiv preprint arXiv:2202.08904},
|
|
1562
|
+
year={2022}
|
|
1563
|
+
}""",
|
|
1307
1564
|
)
|
|
1308
1565
|
aari1995__german_semantic_sts_v2 = ModelMeta(
|
|
1309
1566
|
name="aari1995/German_Semantic_STS_V2",
|
|
@@ -1358,6 +1615,12 @@ abhinand__medembed_small_v0_1 = ModelMeta(
|
|
|
1358
1615
|
},
|
|
1359
1616
|
adapted_from="BAAI/bge-base-en-v1.5",
|
|
1360
1617
|
superseded_by=None,
|
|
1618
|
+
citation="""@software{balachandran2024medembed,
|
|
1619
|
+
author = {Balachandran, Abhinand},
|
|
1620
|
+
title = {MedEmbed: Medical-Focused Embedding Models},
|
|
1621
|
+
year = {2024},
|
|
1622
|
+
url = {https://github.com/abhinand5/MedEmbed}
|
|
1623
|
+
}""",
|
|
1361
1624
|
)
|
|
1362
1625
|
avsolatorio__noinstruct_small_embedding_v0 = ModelMeta(
|
|
1363
1626
|
name="avsolatorio/NoInstruct-small-Embedding-v0",
|
|
@@ -1490,6 +1753,15 @@ omarelshehy__arabic_english_sts_matryoshka = ModelMeta(
|
|
|
1490
1753
|
training_datasets=None,
|
|
1491
1754
|
adapted_from="FacebookAI/xlm-roberta-large",
|
|
1492
1755
|
superseded_by=None,
|
|
1756
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
1757
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
1758
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
1759
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
1760
|
+
month = "11",
|
|
1761
|
+
year = "2019",
|
|
1762
|
+
publisher = "Association for Computational Linguistics",
|
|
1763
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
1764
|
+
}""",
|
|
1493
1765
|
)
|
|
1494
1766
|
openbmb__minicpm_embedding = ModelMeta(
|
|
1495
1767
|
loader=sentence_transformers_loader,
|
|
@@ -1543,6 +1815,13 @@ silma_ai__silma_embedding_matryoshka_v0_1 = ModelMeta(
|
|
|
1543
1815
|
training_datasets=None,
|
|
1544
1816
|
adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250",
|
|
1545
1817
|
superseded_by=None,
|
|
1818
|
+
citation="""@misc{silma2024embedding,
|
|
1819
|
+
author = {Abu Bakr Soliman, Karim Ouda, SILMA AI},
|
|
1820
|
+
title = {SILMA Embedding Matryoshka 0.1},
|
|
1821
|
+
year = {2024},
|
|
1822
|
+
publisher = {Hugging Face},
|
|
1823
|
+
howpublished = {https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1},
|
|
1824
|
+
}""",
|
|
1546
1825
|
)
|
|
1547
1826
|
|
|
1548
1827
|
sbert_chinese_general_v1 = ModelMeta(
|
|
@@ -1683,6 +1962,15 @@ conan_embedding = ModelMeta(
|
|
|
1683
1962
|
# source: https://arxiv.org/pdf/2408.15710
|
|
1684
1963
|
training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage
|
|
1685
1964
|
superseded_by=None,
|
|
1965
|
+
citation="""@misc{li2024conanembeddinggeneraltextembedding,
|
|
1966
|
+
title={Conan-embedding: General Text Embedding with More and Better Negative Samples},
|
|
1967
|
+
author={Shiyu Li and Yang Tang and Shizhe Chen and Xi Chen},
|
|
1968
|
+
year={2024},
|
|
1969
|
+
eprint={2408.15710},
|
|
1970
|
+
archivePrefix={arXiv},
|
|
1971
|
+
primaryClass={cs.CL},
|
|
1972
|
+
url={https://arxiv.org/abs/2408.15710},
|
|
1973
|
+
}""",
|
|
1686
1974
|
)
|
|
1687
1975
|
|
|
1688
1976
|
ember_v1 = ModelMeta(
|
|
@@ -1705,4 +1993,9 @@ ember_v1 = ModelMeta(
|
|
|
1705
1993
|
use_instructions=None,
|
|
1706
1994
|
training_datasets=None,
|
|
1707
1995
|
superseded_by=None,
|
|
1996
|
+
citation="""@misc{nur2024emberv1,
|
|
1997
|
+
title={ember-v1: SOTA embedding model},
|
|
1998
|
+
author={Enrike Nur and Anar Aliyev},
|
|
1999
|
+
year={2023},
|
|
2000
|
+
}""",
|
|
1708
2001
|
)
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
2
|
+
from mteb.models.model_meta import ModelMeta
|
|
3
|
+
from mteb.models.models_protocols import PromptType
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def instruction_template(
    instruction: str | dict[PromptType, str], prompt_type: PromptType | None = None
) -> str:
    """Build the query-side instruction prefix for MoD-Embedding.

    Args:
        instruction: The task instruction, either as a plain string or as a
            mapping from prompt type to instruction text. (The original
            annotation said ``str`` only, but the body explicitly handles a
            dict — the union reflects actual accepted inputs.)
        prompt_type: Which side of the retrieval pair is being encoded.

    Returns:
        ``"Instruct: {instruction}"`` followed by a newline and ``"Query:"``,
        or the empty string when the instruction is empty/falsy or when a
        document is being encoded (documents get no instruction prefix).
    """
    # Documents and empty instructions are embedded without a prefix.
    if not instruction or prompt_type == PromptType.document:
        return ""
    if isinstance(instruction, dict):
        if prompt_type is None:
            # No prompt type given: fall back to an arbitrary (first) entry.
            # TODO: choose a deterministic, documented default instead.
            instruction = next(iter(instruction.values()))
        else:
            instruction = instruction[prompt_type]
    return f"Instruct: {instruction}\nQuery:"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Languages supported by MoD-Embedding, as mteb-style "iso639_3-iso15924"
# language-script tags (e.g. "eng-Latn" = English in Latin script).
multilingual_langs = [
    "afr-Latn",
    "ara-Arab",
    "aze-Latn",
    "bel-Cyrl",
    "bul-Cyrl",
    "ben-Beng",
    "cat-Latn",
    "ceb-Latn",
    "ces-Latn",
    "cym-Latn",
    "dan-Latn",
    "deu-Latn",
    "ell-Grek",
    "eng-Latn",
    "spa-Latn",
    "est-Latn",
    "eus-Latn",
    "fas-Arab",
    "fin-Latn",
    "fra-Latn",
    "glg-Latn",
    "guj-Gujr",
    "heb-Hebr",
    "hin-Deva",
    "hrv-Latn",
    "hat-Latn",
    "hun-Latn",
    "hye-Armn",
    "ind-Latn",
    "isl-Latn",
    "ita-Latn",
    "jpn-Jpan",
    "jav-Latn",
    "kat-Geor",
    "kaz-Cyrl",
    "khm-Khmr",
    "kan-Knda",
    "kor-Hang",
    "kir-Cyrl",
    "lao-Laoo",
    "lit-Latn",
    "lav-Latn",
    "mkd-Cyrl",
    "mal-Mlym",
    "mon-Cyrl",
    "mar-Deva",
    "msa-Latn",
    "mya-Mymr",
    "nep-Deva",
    "nld-Latn",
    "nor-Latn",
    "nob-Latn",
    "nno-Latn",
    "pan-Guru",
    "pol-Latn",
    "por-Latn",
    "que-Latn",
    "ron-Latn",
    "rus-Cyrl",
    "sin-Sinh",
    "slk-Latn",
    "slv-Latn",
    "swa-Latn",
    "tam-Taml",
    "tel-Telu",
    "tha-Thai",
    "tgl-Latn",
    "tur-Latn",
    "ukr-Cyrl",
    "urd-Arab",
    "vie-Latn",
    "yor-Latn",
    "zho-Hans",
]
|
|
94
|
+
|
|
95
|
+
MOD_CITATION = """@misc{mod-embedding-2025,
|
|
96
|
+
title={MoD-Embedding: A Fine-tuned Multilingual Text Embedding Model},
|
|
97
|
+
author={MoD Team},
|
|
98
|
+
year={2025},
|
|
99
|
+
url={https://huggingface.co/bflhc/MoD-Embedding}
|
|
100
|
+
}"""
|
|
101
|
+
|
|
102
|
+
# Set of MTEB dataset names the model was fine-tuned on; recorded in
# ModelMeta(training_datasets=...) so evaluations can flag potential
# train/test overlap. Kept sorted for readability — set value is unchanged.
training_data = {
    "CMedQAv2-reranking",
    "CodeSearchNet",
    "DuRetrieval",
    "FEVER",
    "HotpotQA",
    "MIRACLRetrieval",
    "MMarcoReranking",
    "MSMARCO",
    "MrTidyRetrieval",
    "NQ",
    "T2Retrieval",
}
|
|
115
|
+
|
|
116
|
+
# Predefined prompts for various RTEB tasks, keyed by MTEB task name.
# Passed to the loader as ``prompts_dict``; each value is the query-side
# instruction that gets formatted by ``instruction_template`` at encode time.
# NOTE: keys and values are runtime strings — do not edit cosmetically.
_PREDEFINED_PROMPTS = {
    # ========== Open Datasets ==========
    # Legal domain
    "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
    "AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
    "LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
    "LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
    # Code domain
    "AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
    "HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
    "MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
    "DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
    "FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
    # Finance domain
    "FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
    "FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
    "HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
    # Medical domain
    "CUREv1": "Given a medical query, retrieve relevant clinical documents",
    "ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
    # SQL domain
    "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
    # Multilingual
    "MIRACLRetrievalHardNegatives": "Given a query, retrieve relevant passages",
    # ========== Private/Closed Datasets ==========
    # Code domain (Private)
    "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
    "JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
    # Finance domain (Private)
    "EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
    "EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
    "EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
    "EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
    # Healthcare domain (Private)
    "EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
    "GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
    # Legal domain (Private)
    "FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
    "GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
    "JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
    # General/Multilingual (Private)
    "French1Retrieval": "Given a query, retrieve relevant passages",
    "German1Retrieval": "Given a query, retrieve relevant passages",
}
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# Model registration for bflhc/MoD-Embedding, a fine-tune of
# Qwen/Qwen3-Embedding-4B loaded through the instruct-wrapper.
MoD_Embedding = ModelMeta(
    loader=InstructSentenceTransformerModel,
    loader_kwargs=dict(
        instruction_template=instruction_template,
        # Instructions prefix queries only; passages are embedded as-is.
        apply_instruction_to_passages=False,
        prompts_dict=_PREDEFINED_PROMPTS,
    ),
    name="bflhc/MoD-Embedding",
    languages=multilingual_langs,
    open_weights=True,
    # Pinned HF commit for reproducible evaluation.
    revision="acbb5b70fdab262226a6af2bc62001de8021b05c",
    # NOTE(review): release date looks future-dated — confirm against the HF model card.
    release_date="2025-12-14",
    n_parameters=4021774336,  # ~4B, consistent with the Qwen3-Embedding-4B base
    memory_usage_mb=7671,
    embed_dim=2560,
    max_tokens=32768,
    license="apache-2.0",
    reference="https://huggingface.co/bflhc/MoD-Embedding",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
    # No public training code or data released; datasets below are self-reported.
    public_training_code=None,
    public_training_data=None,
    training_datasets=training_data,
    citation=MOD_CITATION,
    adapted_from="Qwen/Qwen3-Embedding-4B",
)
|