mteb 2.4.1__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff shows the changes between two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (40)
  1. mteb/benchmarks/benchmark.py +31 -13
  2. mteb/benchmarks/benchmarks/benchmarks.py +2 -2
  3. mteb/cache.py +36 -7
  4. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  5. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  6. mteb/models/model_implementations/andersborges.py +12 -0
  7. mteb/models/model_implementations/bge_models.py +43 -0
  8. mteb/models/model_implementations/codefuse_models.py +144 -0
  9. mteb/models/model_implementations/dino_models.py +152 -0
  10. mteb/models/model_implementations/emillykkejensen_models.py +18 -0
  11. mteb/models/model_implementations/euler_models.py +6 -0
  12. mteb/models/model_implementations/fa_models.py +50 -0
  13. mteb/models/model_implementations/facebookai.py +44 -0
  14. mteb/models/model_implementations/gte_models.py +69 -0
  15. mteb/models/model_implementations/kalm_models.py +38 -0
  16. mteb/models/model_implementations/kblab.py +6 -0
  17. mteb/models/model_implementations/kowshik24_models.py +9 -0
  18. mteb/models/model_implementations/misc_models.py +293 -0
  19. mteb/models/model_implementations/mod_models.py +10 -23
  20. mteb/models/model_implementations/mxbai_models.py +6 -0
  21. mteb/models/model_implementations/nomic_models.py +8 -0
  22. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +5 -3
  23. mteb/models/model_implementations/pylate_models.py +33 -0
  24. mteb/models/model_implementations/ru_sentence_models.py +22 -0
  25. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  26. mteb/models/model_implementations/sentence_transformers_models.py +39 -0
  27. mteb/models/model_implementations/spartan8806_atles_champion.py +7 -0
  28. mteb/models/model_implementations/ua_sentence_models.py +9 -0
  29. mteb/models/model_implementations/vi_vn_models.py +33 -0
  30. mteb/results/benchmark_results.py +22 -4
  31. mteb/tasks/classification/tur/__init__.py +4 -0
  32. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  33. mteb/tasks/retrieval/kor/__init__.py +2 -1
  34. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  35. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/METADATA +1 -1
  36. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/RECORD +40 -35
  37. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/WHEEL +0 -0
  38. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/entry_points.txt +0 -0
  39. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/licenses/LICENSE +0 -0
  40. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/top_level.txt +0 -0
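The bulk of this release is metadata-only: dozens of ModelMeta entries in misc_models.py (and a handful of other model files below) gain a `citation` keyword holding a BibTeX string. A minimal sketch of reading that field back through mteb's metadata lookup; it assumes `mteb.get_model_meta()` is exposed as in earlier 2.x releases and does not download any weights:

import mteb

# Fetch the registry entry for one of the models touched in this diff
# and print the BibTeX string added in 2.5.0.
meta = mteb.get_model_meta("thenlper/gte-base")
print(meta.citation)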
mteb/models/model_implementations/misc_models.py
@@ -127,6 +127,15 @@ Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta(
  },
  adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )

  Hum_Works__lodestone_base_4096_v1 = ModelMeta(
@@ -250,6 +259,29 @@ Lajavaness__bilingual_embedding_base = ModelMeta(
  training_datasets=bilingual_embedding_training_data,
  adapted_from="dangvantuan/bilingual_impl",
  superseded_by=None,
+ citation="""
+ @article{conneau2019unsupervised,
+ title={Unsupervised cross-lingual representation learning at scale},
+ author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
+ journal={arXiv preprint arXiv:1911.02116},
+ year={2019}
+ }
+
+ @article{reimers2019sentence,
+ title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
+ author={Nils Reimers, Iryna Gurevych},
+ journal={https://arxiv.org/abs/1908.10084},
+ year={2019}
+ }
+
+ @article{thakur2020augmented,
+ title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
+ author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
+ journal={arXiv e-prints},
+ pages={arXiv--2010},
+ year={2020}
+ }
+ """,
  )
  Lajavaness__bilingual_embedding_large = ModelMeta(
  name="Lajavaness/bilingual-embedding-large",
@@ -275,6 +307,29 @@ Lajavaness__bilingual_embedding_large = ModelMeta(
  training_datasets=bilingual_embedding_training_data,
  adapted_from="dangvantuan/bilingual_impl",
  superseded_by=None,
+ citation="""
+ @article{conneau2019unsupervised,
+ title={Unsupervised cross-lingual representation learning at scale},
+ author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
+ journal={arXiv preprint arXiv:1911.02116},
+ year={2019}
+ }
+
+ @article{reimers2019sentence,
+ title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
+ author={Nils Reimers, Iryna Gurevych},
+ journal={https://arxiv.org/abs/1908.10084},
+ year={2019}
+ }
+
+ @article{thakur2020augmented,
+ title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
+ author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
+ journal={arXiv e-prints},
+ pages={arXiv--2010},
+ year={2020}
+ }
+ """,
  )
  Lajavaness__bilingual_embedding_small = ModelMeta(
  name="Lajavaness/bilingual-embedding-small",
@@ -300,6 +355,29 @@ Lajavaness__bilingual_embedding_small = ModelMeta(
  training_datasets=bilingual_embedding_training_data,
  adapted_from="dangvantuan/bilingual_impl",
  superseded_by=None,
+ citation="""
+ @article{conneau2019unsupervised,
+ title={Unsupervised cross-lingual representation learning at scale},
+ author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
+ journal={arXiv preprint arXiv:1911.02116},
+ year={2019}
+ }
+
+ @article{reimers2019sentence,
+ title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
+ author={Nils Reimers, Iryna Gurevych},
+ journal={https://arxiv.org/abs/1908.10084},
+ year={2019}
+ }
+
+ @article{thakur2020augmented,
+ title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
+ author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
+ journal={arXiv e-prints},
+ pages={arXiv--2010},
+ year={2020}
+ }
+ """,
  )
  Mihaiii__Bulbasaur = ModelMeta(
  name="Mihaiii/Bulbasaur",
@@ -503,6 +581,15 @@ Omartificial_Intelligence_Space__Arabert_all_nli_triplet_Matryoshka = ModelMeta(
  training_datasets=set(), # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="aubmindlab/bert-base-arabertv02",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Arabic_MiniLM_L12_v2_all_nli_triplet = ModelMeta(
  name="Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
@@ -551,6 +638,15 @@ Omartificial_Intelligence_Space__Arabic_all_nli_triplet_Matryoshka = ModelMeta(
  # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
  name="Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
@@ -575,6 +671,15 @@ Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
  # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="sentence-transformers/LaBSE",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
  name="Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
@@ -599,6 +704,15 @@ Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
  # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="tomaarsen/mpnet-base-all-nli-triplet",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
  name="Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka",
@@ -621,6 +735,15 @@ Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
  training_datasets=set(), # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
  adapted_from="UBC-NLP/MARBERTv2",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  consciousai__cai_lunaris_text_embeddings = ModelMeta(
  name="consciousAI/cai-lunaris-text-embeddings",
@@ -763,6 +886,12 @@ thenlper__gte_base = ModelMeta(
  training_datasets=None,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{li2023towards,
+ title={Towards general text embeddings with multi-stage contrastive learning},
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
+ journal={arXiv preprint arXiv:2308.03281},
+ year={2023}
+ }""",
  )
  thenlper__gte_large = ModelMeta(
  name="thenlper/gte-large",
@@ -785,6 +914,12 @@ thenlper__gte_large = ModelMeta(
  training_datasets=None,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{li2023towards,
+ title={Towards general text embeddings with multi-stage contrastive learning},
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
+ journal={arXiv preprint arXiv:2308.03281},
+ year={2023}
+ }""",
  )
  thenlper__gte_small = ModelMeta(
  name="thenlper/gte-small",
@@ -807,6 +942,12 @@ thenlper__gte_small = ModelMeta(
  training_datasets=None,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{li2023towards,
+ title={Towards general text embeddings with multi-stage contrastive learning},
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
+ journal={arXiv preprint arXiv:2308.03281},
+ year={2023}
+ }""",
  )
  OrlikB__KartonBERT_USE_base_v1 = ModelMeta(
  name="OrlikB/KartonBERT-USE-base-v1",
@@ -873,6 +1014,14 @@ sdadas__mmlw_e5_base = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/multilingual-e5-base",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  dwzhu__e5_base_4k = ModelMeta(
  name="dwzhu/e5-base-4k",
@@ -895,6 +1044,12 @@ dwzhu__e5_base_4k = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/e5-base-v2",
  superseded_by=None,
+ citation="""@article{zhu2024longembed,
+ title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
+ author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian},
+ journal={arXiv preprint arXiv:2404.12096},
+ year={2024}
+ }""",
  )
  sdadas__mmlw_e5_large = ModelMeta(
  name="sdadas/mmlw-e5-large",
@@ -917,6 +1072,14 @@ sdadas__mmlw_e5_large = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/multilingual-e5-large",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  sdadas__mmlw_e5_small = ModelMeta(
  name="sdadas/mmlw-e5-small",
@@ -939,6 +1102,14 @@ sdadas__mmlw_e5_small = ModelMeta(
  training_datasets=E5_TRAINING_DATA,
  adapted_from="intfloat/multilingual-e5-small",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  sdadas__mmlw_roberta_base = ModelMeta(
  name="sdadas/mmlw-roberta-base",
@@ -961,6 +1132,14 @@ sdadas__mmlw_roberta_base = ModelMeta(
  training_datasets={"MSMARCO"},
  adapted_from="sdadas/polish-roberta-base-v2",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )
  sdadas__mmlw_roberta_large = ModelMeta(
  name="sdadas/mmlw-roberta-large",
@@ -983,6 +1162,14 @@ sdadas__mmlw_roberta_large = ModelMeta(
  training_datasets={"MSMARCO"},
  adapted_from="sdadas/polish-roberta-large-v2",
  superseded_by=None,
+ citation="""@article{dadas2024pirb,
+ title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
+ author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
+ year={2024},
+ eprint={2402.13350},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )

  udever_dataset = { # discussed here: https://github.com/embeddings-benchmark/mteb/issues/2193
@@ -1060,6 +1247,12 @@ izhx__udever_bloom_1b1 = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-1b1",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  izhx__udever_bloom_3b = ModelMeta(
  name="izhx/udever-bloom-3b",
@@ -1082,6 +1275,12 @@ izhx__udever_bloom_3b = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-3b",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  izhx__udever_bloom_560m = ModelMeta(
  name="izhx/udever-bloom-560m",
@@ -1104,6 +1303,12 @@ izhx__udever_bloom_560m = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-560m",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  izhx__udever_bloom_7b1 = ModelMeta(
  name="izhx/udever-bloom-7b1",
@@ -1126,6 +1331,12 @@ izhx__udever_bloom_7b1 = ModelMeta(
  training_datasets=udever_dataset,
  adapted_from="bigscience/bloom-7b1",
  superseded_by=None,
+ citation="""@article{zhang2023language,
+ title={Language Models are Universal Embedders},
+ author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
+ journal={arXiv preprint arXiv:2310.08232},
+ year={2023}
+ }""",
  )
  avsolatorio__gist_embedding_v0 = ModelMeta(
  name="avsolatorio/GIST-Embedding-v0",
@@ -1165,6 +1376,16 @@ avsolatorio__gist_embedding_v0 = ModelMeta(
  | bge_training_data,
  adapted_from="BAAI/bge-large-en-v1.5",
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
  name="avsolatorio/GIST-all-MiniLM-L6-v2",
@@ -1204,6 +1425,16 @@ avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
  | bge_training_data,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  avsolatorio__gist_large_embedding_v0 = ModelMeta(
  name="avsolatorio/GIST-large-Embedding-v0",
@@ -1243,6 +1474,16 @@ avsolatorio__gist_large_embedding_v0 = ModelMeta(
  | bge_training_data,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  avsolatorio__gist_small_embedding_v0 = ModelMeta(
  name="avsolatorio/GIST-small-Embedding-v0",
@@ -1282,6 +1523,16 @@ avsolatorio__gist_small_embedding_v0 = ModelMeta(
  | bge_training_data,
  adapted_from=None,
  superseded_by=None,
+ citation="""@article{solatorio2024gistembed,
+ title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
+ author={Aivin V. Solatorio},
+ journal={arXiv preprint arXiv:2402.16829},
+ year={2024},
+ URL={https://arxiv.org/abs/2402.16829}
+ eprint={2402.16829},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )
  bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
  name="bigscience/sgpt-bloom-7b1-msmarco",
@@ -1304,6 +1555,12 @@ bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
  training_datasets=None,
  adapted_from="/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3/bloom-7b1",
  superseded_by=None,
+ citation="""@article{muennighoff2022sgpt,
+ title={SGPT: GPT Sentence Embeddings for Semantic Search},
+ author={Muennighoff, Niklas},
+ journal={arXiv preprint arXiv:2202.08904},
+ year={2022}
+ }""",
  )
  aari1995__german_semantic_sts_v2 = ModelMeta(
  name="aari1995/German_Semantic_STS_V2",
@@ -1358,6 +1615,12 @@ abhinand__medembed_small_v0_1 = ModelMeta(
  },
  adapted_from="BAAI/bge-base-en-v1.5",
  superseded_by=None,
+ citation="""@software{balachandran2024medembed,
+ author = {Balachandran, Abhinand},
+ title = {MedEmbed: Medical-Focused Embedding Models},
+ year = {2024},
+ url = {https://github.com/abhinand5/MedEmbed}
+ }""",
  )
  avsolatorio__noinstruct_small_embedding_v0 = ModelMeta(
  name="avsolatorio/NoInstruct-small-Embedding-v0",
@@ -1490,6 +1753,15 @@ omarelshehy__arabic_english_sts_matryoshka = ModelMeta(
  training_datasets=None,
  adapted_from="FacebookAI/xlm-roberta-large",
  superseded_by=None,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
  openbmb__minicpm_embedding = ModelMeta(
  loader=sentence_transformers_loader,
@@ -1543,6 +1815,13 @@ silma_ai__silma_embedding_matryoshka_v0_1 = ModelMeta(
  training_datasets=None,
  adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250",
  superseded_by=None,
+ citation="""@misc{silma2024embedding,
+ author = {Abu Bakr Soliman, Karim Ouda, SILMA AI},
+ title = {SILMA Embedding Matryoshka 0.1},
+ year = {2024},
+ publisher = {Hugging Face},
+ howpublished = {https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1},
+ }""",
  )

  sbert_chinese_general_v1 = ModelMeta(
@@ -1683,6 +1962,15 @@ conan_embedding = ModelMeta(
  # source: https://arxiv.org/pdf/2408.15710
  training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage
  superseded_by=None,
+ citation="""@misc{li2024conanembeddinggeneraltextembedding,
+ title={Conan-embedding: General Text Embedding with More and Better Negative Samples},
+ author={Shiyu Li and Yang Tang and Shizhe Chen and Xi Chen},
+ year={2024},
+ eprint={2408.15710},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2408.15710},
+ }""",
  )

  ember_v1 = ModelMeta(
@@ -1705,4 +1993,9 @@ ember_v1 = ModelMeta(
  use_instructions=None,
  training_datasets=None,
  superseded_by=None,
+ citation="""@misc{nur2024emberv1,
+ title={ember-v1: SOTA embedding model},
+ author={Enrike Nur and Anar Aliyev},
+ year={2023},
+ }""",
  )
mteb/models/model_implementations/mod_models.py
@@ -1,6 +1,6 @@
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
  from mteb.models.model_meta import ModelMeta
- from mteb.models.models_protocols import EncoderProtocol, PromptType
+ from mteb.models.models_protocols import PromptType


  def instruction_template(
@@ -114,7 +114,7 @@ training_data = {
  }

  # Predefined prompts for various RTEB tasks
- PREDEFINED_PROMPTS = {
+ _PREDEFINED_PROMPTS = {
  # ========== Open Datasets ==========
  # Legal domain
  "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
@@ -137,7 +137,7 @@ PREDEFINED_PROMPTS = {
  # SQL domain
  "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
  # Multilingual
- "MIRACLRetrievalHardNegatives": "Given a query, retrieve relevant passages",
+ "MIRACLRetrievalHardNegatives": "Given a question, retrieve Wikipedia passages that answer the question",
  # ========== Private/Closed Datasets ==========
  # Code domain (Private)
  "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
@@ -160,28 +160,15 @@ PREDEFINED_PROMPTS = {
  }


- def mod_instruct_loader(
- model_name_or_path: str, revision: str, **kwargs
- ) -> EncoderProtocol:
- # Set default prompts_dict if not provided
-
- model = InstructSentenceTransformerModel(
- model_name_or_path,
- revision=revision,
+ MoD_Embedding = ModelMeta(
+ loader=InstructSentenceTransformerModel,
+ loader_kwargs=dict(
  instruction_template=instruction_template,
  apply_instruction_to_passages=False,
- prompt_dicts=PREDEFINED_PROMPTS,
- **kwargs,
- )
- encoder = model.model._first_module()
- if encoder.auto_model.config._attn_implementation == "flash_attention_2":
- # The Qwen3 code only use left padding in flash_attention_2 mode.
- encoder.tokenizer.padding_side = "left"
- return model
-
-
- MoD_Embedding = ModelMeta(
- loader=mod_instruct_loader,
+ prompts_dict=_PREDEFINED_PROMPTS,
+ max_seq_length=18480,
+ model_kwargs={"torch_dtype": "bfloat16"},
+ ),
  name="bflhc/MoD-Embedding",
  languages=multilingual_langs,
  open_weights=True,
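The mod_models.py change replaces the hand-written `mod_instruct_loader` wrapper with a declarative `loader=InstructSentenceTransformerModel` plus `loader_kwargs`, moving prompt selection, maximum sequence length, and dtype into the ModelMeta itself. A minimal sketch of obtaining the refactored model through the public API, assuming `mteb.get_model()` resolves the loader and its kwargs as in earlier 2.x releases (this downloads the checkpoint):

import mteb

# The registry entry now instantiates InstructSentenceTransformerModel with the
# loader_kwargs shown above (prompts_dict, instruction_template,
# bfloat16 weights, max_seq_length=18480).
model = mteb.get_model("bflhc/MoD-Embedding")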
mteb/models/model_implementations/mxbai_models.py
@@ -99,4 +99,10 @@ mxbai_embed_xsmall_v1 = ModelMeta(
  public_training_code=None,
  public_training_data=None,
  training_datasets=mixedbread_training_data,
+ citation="""@online{xsmall2024mxbai,
+ title={Every Byte Matters: Introducing mxbai-embed-xsmall-v1},
+ author={Sean Lee and Julius Lipp and Rui Huang and Darius Koenig},
+ year={2024},
+ url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
+ }""",
  )
mteb/models/model_implementations/nomic_models.py
@@ -328,6 +328,14 @@ nomic_modern_bert_embed = ModelMeta(
  superseded_by=None,
  training_datasets=nomic_training_data,
  public_training_data=None,
+ citation="""@misc{nussbaum2024nomic,
+ title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
+ author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
+ year={2024},
+ eprint={2402.01613},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  )

mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
@@ -65,14 +65,16 @@ class LlamaNemoretrieverColembed(AbsEncoder):
  iterator = DataLoader(images, batch_size=batch_size)

  for batch in iterator:
- for b in batch:
+ for image in batch["image"]:
  pil_img = (
- F.to_pil_image(b.to("cpu")) if not isinstance(b, Image.Image) else b
+ image
+ if isinstance(image, Image.Image)
+ else F.to_pil_image(image.to("cpu"))
  )
  all_images.append(pil_img)

  batch_size = 1
- return self.model.forward_passages(all_images, batch_size=batch_size)
+ return self.model.forward_images(all_images, batch_size=batch_size)

  def calculate_probs(self, text_embeddings, image_embeddings):
  scores = self.similarity(text_embeddings, image_embeddings)
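This fix iterates the "image" column of each dataloader batch, keeps PIL images unchanged, converts CHW tensors to PIL, and routes the result to `forward_images` instead of `forward_passages`. A standalone sketch of that conversion step, using the same torchvision call as the patched code (the helper name `to_pil` is hypothetical):

from PIL import Image
import torch
from torchvision.transforms import functional as F

def to_pil(image):
    # Keep PIL images as-is; move tensors to CPU and convert CHW -> PIL,
    # mirroring the branch added in the patched encode loop above.
    if isinstance(image, Image.Image):
        return image
    return F.to_pil_image(image.to("cpu"))

# Example: a random 3x32x32 float tensor becomes a 32x32 RGB PIL image.
print(to_pil(torch.rand(3, 32, 32)).size)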
mteb/models/model_implementations/pylate_models.py
@@ -415,6 +415,30 @@ jina_colbert_v2 = ModelMeta(
  "DuRetrieval",
  "MIRACL",
  },
+ citation="""@inproceedings{xiao-etal-2024-jina,
+ title = "{J}ina-{C}ol{BERT}-v2: A General-Purpose Multilingual Late Interaction Retriever",
+ author = {Jha, Rohan and
+ Wang, Bo and
+ G{\"u}nther, Michael and
+ Mastrapas, Georgios and
+ Sturua, Saba and
+ Mohr, Isabelle and
+ Koukounas, Andreas and
+ Wang, Mohammad Kalim and
+ Wang, Nan and
+ Xiao, Han},
+ editor = {S{\"a}lev{\"a}, Jonne and
+ Owodunni, Abraham},
+ booktitle = "Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024)",
+ month = nov,
+ year = "2024",
+ address = "Miami, Florida, USA",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2024.mrl-1.11/",
+ doi = "10.18653/v1/2024.mrl-1.11",
+ pages = "159--166",
+ abstract = "Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT`s late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context window and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that the reducing the embedding dimensionality from 128 to 64 has insignificant impact on the model`s retrieval performance and cut storage requirements by up to 50{\%}. Our new model, Jina-ColBERT-v2, demonstrates strong performance across a range of English and multilingual retrieval tasks,"
+ }""",
  )


@@ -444,4 +468,13 @@ lightonai__gte_moderncolbert_v1 = ModelMeta(
  "MSMARCO",
  "mMARCO-NL",
  },
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084"
+ }""",
  )