mteb 2.4.1__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. mteb/benchmarks/benchmark.py +31 -13
  2. mteb/benchmarks/benchmarks/benchmarks.py +2 -2
  3. mteb/cache.py +36 -7
  4. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  5. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  6. mteb/models/model_implementations/andersborges.py +12 -0
  7. mteb/models/model_implementations/bge_models.py +43 -0
  8. mteb/models/model_implementations/codefuse_models.py +144 -0
  9. mteb/models/model_implementations/dino_models.py +152 -0
  10. mteb/models/model_implementations/emillykkejensen_models.py +18 -0
  11. mteb/models/model_implementations/euler_models.py +6 -0
  12. mteb/models/model_implementations/fa_models.py +50 -0
  13. mteb/models/model_implementations/facebookai.py +44 -0
  14. mteb/models/model_implementations/gte_models.py +69 -0
  15. mteb/models/model_implementations/kalm_models.py +38 -0
  16. mteb/models/model_implementations/kblab.py +6 -0
  17. mteb/models/model_implementations/kowshik24_models.py +9 -0
  18. mteb/models/model_implementations/misc_models.py +293 -0
  19. mteb/models/model_implementations/mod_models.py +10 -23
  20. mteb/models/model_implementations/mxbai_models.py +6 -0
  21. mteb/models/model_implementations/nomic_models.py +8 -0
  22. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +5 -3
  23. mteb/models/model_implementations/pylate_models.py +33 -0
  24. mteb/models/model_implementations/ru_sentence_models.py +22 -0
  25. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  26. mteb/models/model_implementations/sentence_transformers_models.py +39 -0
  27. mteb/models/model_implementations/spartan8806_atles_champion.py +7 -0
  28. mteb/models/model_implementations/ua_sentence_models.py +9 -0
  29. mteb/models/model_implementations/vi_vn_models.py +33 -0
  30. mteb/results/benchmark_results.py +22 -4
  31. mteb/tasks/classification/tur/__init__.py +4 -0
  32. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  33. mteb/tasks/retrieval/kor/__init__.py +2 -1
  34. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  35. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/METADATA +1 -1
  36. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/RECORD +40 -35
  37. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/WHEEL +0 -0
  38. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/entry_points.txt +0 -0
  39. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/licenses/LICENSE +0 -0
  40. {mteb-2.4.1.dist-info → mteb-2.5.0.dist-info}/top_level.txt +0 -0
@@ -123,6 +123,14 @@ dinov2_small = ModelMeta(
123
123
  similarity_fn_name=ScoringFunction.COSINE,
124
124
  use_instructions=False,
125
125
  training_datasets=dinov2_training_datasets,
126
+ citation="""@misc{oquab2023dinov2,
127
+ title={DINOv2: Learning Robust Visual Features without Supervision},
128
+ author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
129
+ year={2023},
130
+ eprint={2304.07193},
131
+ archivePrefix={arXiv},
132
+ primaryClass={cs.CV}
133
+ }""",
126
134
  )
127
135
 
128
136
  dinov2_base = ModelMeta(
@@ -145,6 +153,14 @@ dinov2_base = ModelMeta(
145
153
  similarity_fn_name=ScoringFunction.COSINE,
146
154
  use_instructions=False,
147
155
  training_datasets=dinov2_training_datasets,
156
+ citation="""@misc{oquab2023dinov2,
157
+ title={DINOv2: Learning Robust Visual Features without Supervision},
158
+ author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
159
+ year={2023},
160
+ eprint={2304.07193},
161
+ archivePrefix={arXiv},
162
+ primaryClass={cs.CV}
163
+ }""",
148
164
  )
149
165
 
150
166
  dinov2_large = ModelMeta(
@@ -167,6 +183,14 @@ dinov2_large = ModelMeta(
167
183
  similarity_fn_name=ScoringFunction.COSINE,
168
184
  use_instructions=False,
169
185
  training_datasets=dinov2_training_datasets,
186
+ citation="""@misc{oquab2023dinov2,
187
+ title={DINOv2: Learning Robust Visual Features without Supervision},
188
+ author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
189
+ year={2023},
190
+ eprint={2304.07193},
191
+ archivePrefix={arXiv},
192
+ primaryClass={cs.CV}
193
+ }""",
170
194
  )
171
195
 
172
196
  dinov2_giant = ModelMeta(
@@ -189,6 +213,14 @@ dinov2_giant = ModelMeta(
189
213
  similarity_fn_name=ScoringFunction.COSINE,
190
214
  use_instructions=False,
191
215
  training_datasets=dinov2_training_datasets,
216
+ citation="""@misc{oquab2023dinov2,
217
+ title={DINOv2: Learning Robust Visual Features without Supervision},
218
+ author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
219
+ year={2023},
220
+ eprint={2304.07193},
221
+ archivePrefix={arXiv},
222
+ primaryClass={cs.CV}
223
+ }""",
192
224
  )
193
225
 
194
226
  webssl_dino_training_datasets = set(
@@ -215,6 +247,14 @@ webssl_dino300m_full2b = ModelMeta(
215
247
  similarity_fn_name=None,
216
248
  use_instructions=False,
217
249
  training_datasets=webssl_dino_training_datasets,
250
+ citation="""@article{fan2025scaling,
251
+ title={Scaling Language-Free Visual Representation Learning},
252
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
253
+ year={2025},
254
+ eprint={2504.01017},
255
+ archivePrefix={arXiv},
256
+ primaryClass={cs.CV}
257
+ }""",
218
258
  )
219
259
 
220
260
  webssl_dino1b_full2b = ModelMeta(
@@ -237,6 +277,14 @@ webssl_dino1b_full2b = ModelMeta(
237
277
  similarity_fn_name=None,
238
278
  use_instructions=False,
239
279
  training_datasets=webssl_dino_training_datasets,
280
+ citation="""@article{fan2025scaling,
281
+ title={Scaling Language-Free Visual Representation Learning},
282
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
283
+ year={2025},
284
+ eprint={2504.01017},
285
+ archivePrefix={arXiv},
286
+ primaryClass={cs.CV}
287
+ }""",
240
288
  )
241
289
 
242
290
  webssl_dino2b_full2b = ModelMeta(
@@ -259,6 +307,14 @@ webssl_dino2b_full2b = ModelMeta(
259
307
  similarity_fn_name=None,
260
308
  use_instructions=False,
261
309
  training_datasets=webssl_dino_training_datasets,
310
+ citation="""@article{fan2025scaling,
311
+ title={Scaling Language-Free Visual Representation Learning},
312
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
313
+ year={2025},
314
+ eprint={2504.01017},
315
+ archivePrefix={arXiv},
316
+ primaryClass={cs.CV}
317
+ }""",
262
318
  )
263
319
 
264
320
  webssl_dino3b_full2b = ModelMeta(
@@ -281,6 +337,14 @@ webssl_dino3b_full2b = ModelMeta(
281
337
  similarity_fn_name=None,
282
338
  use_instructions=False,
283
339
  training_datasets=webssl_dino_training_datasets,
340
+ citation="""@article{fan2025scaling,
341
+ title={Scaling Language-Free Visual Representation Learning},
342
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
343
+ year={2025},
344
+ eprint={2504.01017},
345
+ archivePrefix={arXiv},
346
+ primaryClass={cs.CV}
347
+ }""",
284
348
  )
285
349
 
286
350
  webssl_dino5b_full2b = ModelMeta(
@@ -303,6 +367,14 @@ webssl_dino5b_full2b = ModelMeta(
303
367
  similarity_fn_name=None,
304
368
  use_instructions=False,
305
369
  training_datasets=webssl_dino_training_datasets,
370
+ citation="""@article{fan2025scaling,
371
+ title={Scaling Language-Free Visual Representation Learning},
372
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
373
+ year={2025},
374
+ eprint={2504.01017},
375
+ archivePrefix={arXiv},
376
+ primaryClass={cs.CV}
377
+ }""",
306
378
  )
307
379
 
308
380
  webssl_dino7b_full8b_224 = ModelMeta(
@@ -325,6 +397,14 @@ webssl_dino7b_full8b_224 = ModelMeta(
325
397
  similarity_fn_name=None,
326
398
  use_instructions=False,
327
399
  training_datasets=webssl_dino_training_datasets,
400
+ citation="""@article{fan2025scaling,
401
+ title={Scaling Language-Free Visual Representation Learning},
402
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
403
+ year={2025},
404
+ eprint={2504.01017},
405
+ archivePrefix={arXiv},
406
+ primaryClass={cs.CV}
407
+ }""",
328
408
  )
329
409
 
330
410
  webssl_dino7b_full8b_378 = ModelMeta(
@@ -347,6 +427,14 @@ webssl_dino7b_full8b_378 = ModelMeta(
347
427
  similarity_fn_name=None,
348
428
  use_instructions=False,
349
429
  training_datasets=webssl_dino_training_datasets,
430
+ citation="""@article{fan2025scaling,
431
+ title={Scaling Language-Free Visual Representation Learning},
432
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
433
+ year={2025},
434
+ eprint={2504.01017},
435
+ archivePrefix={arXiv},
436
+ primaryClass={cs.CV}
437
+ }""",
350
438
  )
351
439
 
352
440
  webssl_dino7b_full8b_518 = ModelMeta(
@@ -369,6 +457,14 @@ webssl_dino7b_full8b_518 = ModelMeta(
369
457
  similarity_fn_name=None,
370
458
  use_instructions=False,
371
459
  training_datasets=webssl_dino_training_datasets,
460
+ citation="""@article{fan2025scaling,
461
+ title={Scaling Language-Free Visual Representation Learning},
462
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
463
+ year={2025},
464
+ eprint={2504.01017},
465
+ archivePrefix={arXiv},
466
+ primaryClass={cs.CV}
467
+ }""",
372
468
  )
373
469
 
374
470
 
@@ -392,6 +488,14 @@ webssl_dino2b_light2b = ModelMeta(
392
488
  similarity_fn_name=None,
393
489
  use_instructions=False,
394
490
  training_datasets=webssl_dino_training_datasets,
491
+ citation="""@article{fan2025scaling,
492
+ title={Scaling Language-Free Visual Representation Learning},
493
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
494
+ year={2025},
495
+ eprint={2504.01017},
496
+ archivePrefix={arXiv},
497
+ primaryClass={cs.CV}
498
+ }""",
395
499
  )
396
500
 
397
501
  webssl_dino2b_heavy2b = ModelMeta(
@@ -414,6 +518,14 @@ webssl_dino2b_heavy2b = ModelMeta(
414
518
  similarity_fn_name=None,
415
519
  use_instructions=False,
416
520
  training_datasets=webssl_dino_training_datasets,
521
+ citation="""@article{fan2025scaling,
522
+ title={Scaling Language-Free Visual Representation Learning},
523
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
524
+ year={2025},
525
+ eprint={2504.01017},
526
+ archivePrefix={arXiv},
527
+ primaryClass={cs.CV}
528
+ }""",
417
529
  )
418
530
 
419
531
  webssl_dino3b_light2b = ModelMeta(
@@ -436,6 +548,14 @@ webssl_dino3b_light2b = ModelMeta(
436
548
  similarity_fn_name=None,
437
549
  use_instructions=False,
438
550
  training_datasets=webssl_dino_training_datasets,
551
+ citation="""@article{fan2025scaling,
552
+ title={Scaling Language-Free Visual Representation Learning},
553
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
554
+ year={2025},
555
+ eprint={2504.01017},
556
+ archivePrefix={arXiv},
557
+ primaryClass={cs.CV}
558
+ }""",
439
559
  )
440
560
 
441
561
  webssl_dino3b_heavy2b = ModelMeta(
@@ -458,6 +578,14 @@ webssl_dino3b_heavy2b = ModelMeta(
458
578
  similarity_fn_name=None,
459
579
  use_instructions=False,
460
580
  training_datasets=webssl_dino_training_datasets,
581
+ citation="""@article{fan2025scaling,
582
+ title={Scaling Language-Free Visual Representation Learning},
583
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
584
+ year={2025},
585
+ eprint={2504.01017},
586
+ archivePrefix={arXiv},
587
+ primaryClass={cs.CV}
588
+ }""",
461
589
  )
462
590
 
463
591
  webssl_mae300m_full2b = ModelMeta(
@@ -480,6 +608,14 @@ webssl_mae300m_full2b = ModelMeta(
480
608
  similarity_fn_name=None,
481
609
  use_instructions=False,
482
610
  training_datasets=webssl_dino_training_datasets,
611
+ citation="""@article{fan2025scaling,
612
+ title={Scaling Language-Free Visual Representation Learning},
613
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
614
+ year={2025},
615
+ eprint={2504.01017},
616
+ archivePrefix={arXiv},
617
+ primaryClass={cs.CV}
618
+ }""",
483
619
  )
484
620
 
485
621
  webssl_mae700m_full2b = ModelMeta(
@@ -502,6 +638,14 @@ webssl_mae700m_full2b = ModelMeta(
502
638
  similarity_fn_name=None,
503
639
  use_instructions=False,
504
640
  training_datasets=webssl_dino_training_datasets,
641
+ citation="""@article{fan2025scaling,
642
+ title={Scaling Language-Free Visual Representation Learning},
643
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
644
+ year={2025},
645
+ eprint={2504.01017},
646
+ archivePrefix={arXiv},
647
+ primaryClass={cs.CV}
648
+ }""",
505
649
  )
506
650
 
507
651
  webssl_mae1b_full2b = ModelMeta(
@@ -524,4 +668,12 @@ webssl_mae1b_full2b = ModelMeta(
524
668
  similarity_fn_name=None,
525
669
  use_instructions=False,
526
670
  training_datasets=webssl_dino_training_datasets,
671
+ citation="""@article{fan2025scaling,
672
+ title={Scaling Language-Free Visual Representation Learning},
673
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
674
+ year={2025},
675
+ eprint={2504.01017},
676
+ archivePrefix={arXiv},
677
+ primaryClass={cs.CV}
678
+ }""",
527
679
  )
@@ -21,6 +21,15 @@ embedding_gemma_300m_scandi = ModelMeta(
21
21
  similarity_fn_name="cosine", # type: ignore[arg-type]
22
22
  adapted_from="google/embeddinggemma-300m",
23
23
  memory_usage_mb=578,
24
+ citation="""@inproceedings{reimers-2019-sentence-bert,
25
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
26
+ author = "Reimers, Nils and Gurevych, Iryna",
27
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
28
+ month = "11",
29
+ year = "2019",
30
+ publisher = "Association for Computational Linguistics",
31
+ url = "https://arxiv.org/abs/1908.10084",
32
+ }""",
24
33
  )
25
34
 
26
35
 
@@ -67,4 +76,13 @@ mmbert_scandi = ModelMeta(
67
76
  training_datasets=set(),
68
77
  similarity_fn_name="cosine", # type: ignore[arg-type]
69
78
  adapted_from="jonasaise/scandmmBERT-base-scandinavian",
79
+ citation="""@inproceedings{reimers-2019-sentence-bert,
80
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
81
+ author = "Reimers, Nils and Gurevych, Iryna",
82
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
83
+ month = "11",
84
+ year = "2019",
85
+ publisher = "Association for Computational Linguistics",
86
+ url = "https://arxiv.org/abs/1908.10084",
87
+ }""",
70
88
  )
@@ -22,4 +22,10 @@ Euler_Legal_Embedding_V1 = ModelMeta(
22
22
  training_datasets=set(), # final-data-new-anonymized-grok4-filtered
23
23
  adapted_from="Qwen/Qwen3-Embedding-8B",
24
24
  superseded_by=None,
25
+ citation="""@misc{euler2025legal,
26
+ title={Euler-Legal-Embedding: Advanced Legal Representation Learning},
27
+ author={LawRank Team},
28
+ year={2025},
29
+ publisher={Hugging Face}
30
+ }""",
25
31
  )
@@ -156,6 +156,15 @@ tooka_sbert = ModelMeta(
156
156
  public_training_code=None,
157
157
  public_training_data=None,
158
158
  training_datasets=None,
159
+ citation="""@inproceedings{reimers-2019-sentence-bert,
160
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
161
+ author = "Reimers, Nils and Gurevych, Iryna",
162
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
163
+ month = "11",
164
+ year = "2019",
165
+ publisher = "Association for Computational Linguistics",
166
+ url = "https://arxiv.org/abs/1908.10084",
167
+ }""",
159
168
  )
160
169
 
161
170
  fa_bert = ModelMeta(
@@ -180,6 +189,29 @@ fa_bert = ModelMeta(
180
189
  # It's just a base model
181
190
  # https://huggingface.co/datasets/sbunlp/hmblogs-v3
182
191
  ),
192
+ citation="""@inproceedings{masumi-etal-2025-fabert,
193
+ title = "{F}a{BERT}: Pre-training {BERT} on {P}ersian Blogs",
194
+ author = "Masumi, Mostafa and
195
+ Majd, Seyed Soroush and
196
+ Shamsfard, Mehrnoush and
197
+ Beigy, Hamid",
198
+ editor = "Bak, JinYeong and
199
+ Goot, Rob van der and
200
+ Jang, Hyeju and
201
+ Buaphet, Weerayut and
202
+ Ramponi, Alan and
203
+ Xu, Wei and
204
+ Ritter, Alan",
205
+ booktitle = "Proceedings of the Tenth Workshop on Noisy and User-generated Text",
206
+ month = may,
207
+ year = "2025",
208
+ address = "Albuquerque, New Mexico, USA",
209
+ publisher = "Association for Computational Linguistics",
210
+ url = "https://aclanthology.org/2025.wnut-1.10/",
211
+ doi = "10.18653/v1/2025.wnut-1.10",
212
+ pages = "85--96",
213
+ ISBN = "979-8-89176-232-9",
214
+ }""",
183
215
  )
184
216
 
185
217
  tooka_sbert_v2_small = ModelMeta(
@@ -201,6 +233,15 @@ tooka_sbert_v2_small = ModelMeta(
201
233
  public_training_code=None,
202
234
  public_training_data=None,
203
235
  training_datasets=None,
236
+ citation="""@inproceedings{reimers-2019-sentence-bert,
237
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
238
+ author = "Reimers, Nils and Gurevych, Iryna",
239
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
240
+ month = "11",
241
+ year = "2019",
242
+ publisher = "Association for Computational Linguistics",
243
+ url = "https://arxiv.org/abs/1908.10084",
244
+ }""",
204
245
  )
205
246
 
206
247
  tooka_sbert_v2_large = ModelMeta(
@@ -222,4 +263,13 @@ tooka_sbert_v2_large = ModelMeta(
222
263
  public_training_code=None,
223
264
  public_training_data=None,
224
265
  training_datasets=None,
266
+ citation="""@inproceedings{reimers-2019-sentence-bert,
267
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
268
+ author = "Reimers, Nils and Gurevych, Iryna",
269
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
270
+ month = "11",
271
+ year = "2019",
272
+ publisher = "Association for Computational Linguistics",
273
+ url = "https://arxiv.org/abs/1908.10084",
274
+ }""",
225
275
  )
@@ -123,6 +123,28 @@ xlmr_base = ModelMeta(
123
123
  public_training_code=None,
124
124
  public_training_data=None,
125
125
  training_datasets=set(),
126
+ citation="""@article{DBLP:journals/corr/abs-1911-02116,
127
+ author = {Alexis Conneau and
128
+ Kartikay Khandelwal and
129
+ Naman Goyal and
130
+ Vishrav Chaudhary and
131
+ Guillaume Wenzek and
132
+ Francisco Guzm{\'{a}}n and
133
+ Edouard Grave and
134
+ Myle Ott and
135
+ Luke Zettlemoyer and
136
+ Veselin Stoyanov},
137
+ title = {Unsupervised Cross-lingual Representation Learning at Scale},
138
+ journal = {CoRR},
139
+ volume = {abs/1911.02116},
140
+ year = {2019},
141
+ url = {http://arxiv.org/abs/1911.02116},
142
+ eprinttype = {arXiv},
143
+ eprint = {1911.02116},
144
+ timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
145
+ biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
146
+ bibsource = {dblp computer science bibliography, https://dblp.org}
147
+ }""",
126
148
  )
127
149
 
128
150
  xlmr_large = ModelMeta(
@@ -144,4 +166,26 @@ xlmr_large = ModelMeta(
144
166
  public_training_code=None,
145
167
  public_training_data=None,
146
168
  training_datasets=set(),
169
+ citation="""@article{DBLP:journals/corr/abs-1911-02116,
170
+ author = {Alexis Conneau and
171
+ Kartikay Khandelwal and
172
+ Naman Goyal and
173
+ Vishrav Chaudhary and
174
+ Guillaume Wenzek and
175
+ Francisco Guzm{\'{a}}n and
176
+ Edouard Grave and
177
+ Myle Ott and
178
+ Luke Zettlemoyer and
179
+ Veselin Stoyanov},
180
+ title = {Unsupervised Cross-lingual Representation Learning at Scale},
181
+ journal = {CoRR},
182
+ volume = {abs/1911.02116},
183
+ year = {2019},
184
+ url = {http://arxiv.org/abs/1911.02116},
185
+ eprinttype = {arXiv},
186
+ eprint = {1911.02116},
187
+ timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
188
+ biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
189
+ bibsource = {dblp computer science bibliography, https://dblp.org}
190
+ }""",
147
191
  )
@@ -89,6 +89,12 @@ gte_qwen1_5_7b_instruct = ModelMeta(
89
89
  public_training_code=None,
90
90
  public_training_data=None,
91
91
  training_datasets=None,
92
+ citation="""@article{li2023towards,
93
+ title={Towards general text embeddings with multi-stage contrastive learning},
94
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
95
+ journal={arXiv preprint arXiv:2308.03281},
96
+ year={2023}
97
+ }""",
92
98
  )
93
99
 
94
100
  gte_qwen2_1_5b_instruct = ModelMeta(
@@ -119,6 +125,12 @@ gte_qwen2_1_5b_instruct = ModelMeta(
119
125
  public_training_code=None,
120
126
  public_training_data=None,
121
127
  training_datasets=None,
128
+ citation="""@article{li2023towards,
129
+ title={Towards general text embeddings with multi-stage contrastive learning},
130
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
131
+ journal={arXiv preprint arXiv:2308.03281},
132
+ year={2023}
133
+ }""",
122
134
  )
123
135
 
124
136
  gte_small_zh = ModelMeta(
@@ -140,6 +152,12 @@ gte_small_zh = ModelMeta(
140
152
  public_training_code=None,
141
153
  public_training_data=None,
142
154
  training_datasets=None, # Not disclosed
155
+ citation="""@article{li2023towards,
156
+ title={Towards general text embeddings with multi-stage contrastive learning},
157
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
158
+ journal={arXiv preprint arXiv:2308.03281},
159
+ year={2023}
160
+ }""",
143
161
  )
144
162
 
145
163
  gte_base_zh = ModelMeta(
@@ -161,6 +179,12 @@ gte_base_zh = ModelMeta(
161
179
  public_training_code=None,
162
180
  public_training_data=None,
163
181
  training_datasets=None, # Not disclosed
182
+ citation="""@article{li2023towards,
183
+ title={Towards general text embeddings with multi-stage contrastive learning},
184
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
185
+ journal={arXiv preprint arXiv:2308.03281},
186
+ year={2023}
187
+ }""",
164
188
  )
165
189
 
166
190
  gte_large_zh = ModelMeta(
@@ -182,6 +206,12 @@ gte_large_zh = ModelMeta(
182
206
  public_training_code=None,
183
207
  public_training_data=None,
184
208
  training_datasets=None, # Not disclosed
209
+ citation="""@article{li2023towards,
210
+ title={Towards general text embeddings with multi-stage contrastive learning},
211
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
212
+ journal={arXiv preprint arXiv:2308.03281},
213
+ year={2023}
214
+ }""",
185
215
  )
186
216
 
187
217
  gte_multilingual_langs = [
@@ -304,6 +334,13 @@ gte_multilingual_base = ModelMeta(
304
334
  public_training_code=None,
305
335
  public_training_data=None, # couldn't find
306
336
  training_datasets=gte_multi_training_data,
337
+ citation="""@inproceedings{zhang2024mgte,
338
+ title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
339
+ author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
340
+ booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
341
+ pages={1393--1412},
342
+ year={2024}
343
+ }""",
307
344
  )
308
345
 
309
346
  gte_modernbert_base = ModelMeta(
@@ -325,6 +362,20 @@ gte_modernbert_base = ModelMeta(
325
362
  public_training_code=None, # couldn't find
326
363
  public_training_data=None,
327
364
  training_datasets=gte_multi_training_data, # English part of gte_multi_training_data,
365
+ citation="""@inproceedings{zhang2024mgte,
366
+ title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
367
+ author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
368
+ booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
369
+ pages={1393--1412},
370
+ year={2024}
371
+ }
372
+
373
+ @article{li2023towards,
374
+ title={Towards general text embeddings with multi-stage contrastive learning},
375
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
376
+ journal={arXiv preprint arXiv:2308.03281},
377
+ year={2023}
378
+ }""",
328
379
  )
329
380
 
330
381
 
@@ -349,4 +400,22 @@ gte_base_en_v15 = ModelMeta(
349
400
  public_training_code=None,
350
401
  public_training_data=None,
351
402
  training_datasets=None,
403
+ citation="""@misc{zhang2024mgte,
404
+ title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
405
+ author={Xin Zhang and Yanzhao Zhang and Dingkun Long and Wen Xie and Ziqi Dai and Jialong Tang and Huan Lin and Baosong Yang and Pengjun Xie and Fei Huang and Meishan Zhang and Wenjie Li and Min Zhang},
406
+ year={2024},
407
+ eprint={2407.19669},
408
+ archivePrefix={arXiv},
409
+ primaryClass={cs.CL},
410
+ url={https://arxiv.org/abs/2407.19669},
411
+ }
412
+ @misc{li2023gte,
413
+ title={Towards General Text Embeddings with Multi-stage Contrastive Learning},
414
+ author={Zehan Li and Xin Zhang and Yanzhao Zhang and Dingkun Long and Pengjun Xie and Meishan Zhang},
415
+ year={2023},
416
+ eprint={2308.03281},
417
+ archivePrefix={arXiv},
418
+ primaryClass={cs.CL},
419
+ url={https://arxiv.org/abs/2308.03281},
420
+ }""",
352
421
  )
@@ -901,6 +901,25 @@ KaLM_Embedding_KaLM_embedding_multilingual_mini_instruct_v2_5 = ModelMeta(
901
901
  training_datasets=kalm_v2_training_data,
902
902
  adapted_from="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2",
903
903
  superseded_by=None,
904
+ citation="""@misc{zhao2025kalmembeddingv2,
905
+ title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
906
+ author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
907
+ year={2025},
908
+ eprint={2506.20923},
909
+ archivePrefix={arXiv},
910
+ primaryClass={cs.CL},
911
+ url={https://arxiv.org/abs/2506.20923},
912
+ }
913
+
914
+ @misc{hu2025kalmembedding,
915
+ title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
916
+ author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
917
+ year={2025},
918
+ eprint={2501.01028},
919
+ archivePrefix={arXiv},
920
+ primaryClass={cs.CL},
921
+ url={https://arxiv.org/abs/2501.01028},
922
+ }""",
904
923
  )
905
924
 
906
925
  KaLM_Embedding_gemma_3_12b_2511 = ModelMeta(
@@ -928,4 +947,23 @@ KaLM_Embedding_gemma_3_12b_2511 = ModelMeta(
928
947
  public_training_code="https://github.com/HITsz-TMG/KaLM-Embedding",
929
948
  public_training_data=None,
930
949
  training_datasets=KaLM_Embedding_gemma_3_12b_training_data,
950
+ citation="""@misc{zhao2025kalmembeddingv2,
951
+ title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
952
+ author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
953
+ year={2025},
954
+ eprint={2506.20923},
955
+ archivePrefix={arXiv},
956
+ primaryClass={cs.CL},
957
+ url={https://arxiv.org/abs/2506.20923},
958
+ }
959
+
960
+ @misc{hu2025kalmembedding,
961
+ title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
962
+ author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
963
+ year={2025},
964
+ eprint={2501.01028},
965
+ archivePrefix={arXiv},
966
+ primaryClass={cs.CL},
967
+ url={https://arxiv.org/abs/2501.01028},
968
+ }""",
931
969
  )
@@ -21,4 +21,10 @@ sbert_swedish = ModelMeta(
21
21
  public_training_data=None,
22
22
  training_datasets=None,
23
23
  adapted_from="sentence-transformers/all-mpnet-base-v2",
24
+ citation="""@misc{rekathati2021introducing,
25
+ author = {Rekathati, Faton},
26
+ title = {The KBLab Blog: Introducing a Swedish Sentence Transformer},
27
+ url = {https://kb-labb.github.io/posts/2021-08-23-a-swedish-sentence-transformer/},
28
+ year = {2021}
29
+ }""",
24
30
  )
@@ -19,4 +19,13 @@ kowshik24_bangla_embedding_model = ModelMeta(
19
19
  public_training_code="https://github.com/kowshik24/Bangla-Embedding",
20
20
  public_training_data="https://huggingface.co/datasets/sartajekram/BanglaRQA",
21
21
  training_datasets=set(),
22
+ citation="""@inproceedings{reimers-2019-sentence-bert,
23
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
24
+ author = "Reimers, Nils and Gurevych, Iryna",
25
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
26
+ month = "11",
27
+ year = "2019",
28
+ publisher = "Association for Computational Linguistics",
29
+ url = "https://arxiv.org/abs/1908.10084",
30
+ }""",
22
31
  )