mteb 2.3.11-py3-none-any.whl → 2.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. mteb/benchmarks/benchmarks/__init__.py +2 -0
  2. mteb/benchmarks/benchmarks/benchmarks.py +57 -0
  3. mteb/deprecated_evaluator.py +8 -13
  4. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  5. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  6. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  7. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  8. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  9. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  10. mteb/evaluate.py +2 -33
  11. mteb/leaderboard/figures.py +1 -1
  12. mteb/leaderboard/table.py +1 -11
  13. mteb/models/abs_encoder.py +21 -17
  14. mteb/models/get_model_meta.py +3 -123
  15. mteb/models/instruct_wrapper.py +2 -1
  16. mteb/models/model_implementations/andersborges.py +12 -0
  17. mteb/models/model_implementations/bge_models.py +43 -0
  18. mteb/models/model_implementations/bica_model.py +34 -0
  19. mteb/models/model_implementations/dino_models.py +152 -0
  20. mteb/models/model_implementations/emillykkejensen_models.py +18 -0
  21. mteb/models/model_implementations/euler_models.py +6 -0
  22. mteb/models/model_implementations/fa_models.py +50 -0
  23. mteb/models/model_implementations/facebookai.py +44 -0
  24. mteb/models/model_implementations/google_models.py +10 -0
  25. mteb/models/model_implementations/gte_models.py +69 -0
  26. mteb/models/model_implementations/kalm_models.py +38 -0
  27. mteb/models/model_implementations/kblab.py +6 -0
  28. mteb/models/model_implementations/kowshik24_models.py +9 -0
  29. mteb/models/model_implementations/misc_models.py +293 -0
  30. mteb/models/model_implementations/mod_models.py +189 -0
  31. mteb/models/model_implementations/mxbai_models.py +6 -0
  32. mteb/models/model_implementations/nomic_models.py +150 -4
  33. mteb/models/model_implementations/pylate_models.py +33 -0
  34. mteb/models/model_implementations/ru_sentence_models.py +22 -0
  35. mteb/models/model_implementations/sentence_transformers_models.py +39 -0
  36. mteb/models/model_implementations/spartan8806_atles_champion.py +7 -0
  37. mteb/models/model_implementations/ua_sentence_models.py +9 -0
  38. mteb/models/model_implementations/vi_vn_models.py +33 -0
  39. mteb/models/model_meta.py +396 -19
  40. mteb/models/sentence_transformer_wrapper.py +2 -7
  41. mteb/tasks/reranking/jpn/__init__.py +9 -1
  42. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  43. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  44. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  45. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  46. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  47. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  48. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  49. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  50. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/METADATA +1 -1
  51. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/RECORD +55 -41
  52. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/WHEEL +0 -0
  53. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/entry_points.txt +0 -0
  54. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/licenses/LICENSE +0 -0
  55. {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/top_level.txt +0 -0
mteb/models/model_implementations/mxbai_models.py
@@ -99,4 +99,10 @@ mxbai_embed_xsmall_v1 = ModelMeta(
  public_training_code=None,
  public_training_data=None,
  training_datasets=mixedbread_training_data,
+ citation="""@online{xsmall2024mxbai,
+ title={Every Byte Matters: Introducing mxbai-embed-xsmall-v1},
+ author={Sean Lee and Julius Lipp and Rui Huang and Darius Koenig},
+ year={2024},
+ url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
+ }""",
  )
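
The pattern above repeats throughout this release: ModelMeta entries gain a citation field carrying a BibTeX string. A minimal sketch of reading that metadata back, assuming mteb.get_model_meta is exported at the package level and that the entry above corresponds to the Hugging Face id "mixedbread-ai/mxbai-embed-xsmall-v1" (the name= field sits outside this hunk, so the id is an assumption):

# Hypothetical usage sketch, not part of the diff.
import mteb

meta = mteb.get_model_meta("mixedbread-ai/mxbai-embed-xsmall-v1")  # assumed model id
print(meta.name)      # id as registered in mteb
print(meta.citation)  # BibTeX string added in 2.4.x; may be None for other models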
mteb/models/model_implementations/nomic_models.py
@@ -193,7 +193,7 @@ NOMIC_CITATION = """
  """

  nomic_embed_v1_5 = ModelMeta(
- loader=NomicWrapper,
+ loader=NomicWrapper, # type: ignore
  loader_kwargs=dict(
  trust_remote_code=True,
  model_prompts=model_prompts,
@@ -221,7 +221,7 @@ nomic_embed_v1_5 = ModelMeta(
  )

  nomic_embed_v1 = ModelMeta(
- loader=NomicWrapper,
+ loader=NomicWrapper, # type: ignore
  loader_kwargs=dict(
  trust_remote_code=True,
  model_prompts=model_prompts,
@@ -249,7 +249,7 @@ nomic_embed_v1 = ModelMeta(
  )

  nomic_embed_v1_ablated = ModelMeta(
- loader=NomicWrapper,
+ loader=NomicWrapper, # type: ignore
  loader_kwargs=dict(
  trust_remote_code=True,
  model_prompts=model_prompts,
@@ -276,7 +276,7 @@ nomic_embed_v1_ablated = ModelMeta(
  )

  nomic_embed_v1_unsupervised = ModelMeta(
- loader=NomicWrapper,
+ loader=NomicWrapper, # type: ignore
  loader_kwargs=dict(
  trust_remote_code=True,
  model_prompts=model_prompts,
@@ -328,4 +328,150 @@ nomic_modern_bert_embed = ModelMeta(
  superseded_by=None,
  training_datasets=nomic_training_data,
  public_training_data=None,
+ citation="""@misc{nussbaum2024nomic,
+ title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
+ author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
+ year={2024},
+ eprint={2402.01613},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
+ )
+
+
+ m_languages = [
+ "eng-Latn",
+ "spa-Latn",
+ "fra-Latn",
+ "deu-Latn",
+ "ita-Latn",
+ "por-Latn",
+ "pol-Latn",
+ "nld-Latn",
+ "tur-Latn",
+ "jpn-Jpan",
+ "vie-Latn",
+ "rus-Cyrl",
+ "ind-Latn",
+ "arb-Arab",
+ "ces-Latn",
+ "ron-Latn",
+ "swe-Latn",
+ "ell-Grek",
+ "ukr-Cyrl",
+ "zho-Hans",
+ "hun-Latn",
+ "dan-Latn",
+ "nor-Latn",
+ "hin-Deva",
+ "fin-Latn",
+ "bul-Cyrl",
+ "kor-Hang",
+ "slk-Latn",
+ "tha-Thai",
+ "heb-Hebr",
+ "cat-Latn",
+ "lit-Latn",
+ "fas-Arab",
+ "msa-Latn",
+ "slv-Latn",
+ "lav-Latn",
+ "mar-Deva",
+ "ben-Beng",
+ "sqi-Latn",
+ "cym-Latn",
+ "bel-Cyrl",
+ "mal-Mlym",
+ "kan-Knda",
+ "mkd-Cyrl",
+ "urd-Arab",
+ "fry-Latn",
+ "fil-Latn",
+ "tel-Telu",
+ "eus-Latn",
+ "swh-Latn",
+ "som-Latn",
+ "snd-Arab",
+ "uzb-Latn",
+ "cos-Latn",
+ "hrv-Latn",
+ "guj-Gujr",
+ "hin-Latn",
+ "ceb-Latn",
+ "epo-Latn",
+ "jav-Latn",
+ "lat-Latn",
+ "zul-Latn",
+ "mon-Cyrl",
+ "sin-Sinh",
+ "ell-Latn",
+ "gle-Latn",
+ "kir-Cyrl",
+ "tgk-Cyrl",
+ "mya-Mymr",
+ "khm-Khmr",
+ "mlg-Latn",
+ "pan-Guru",
+ "rus-Latn",
+ "sna-Latn",
+ "zho-Latn",
+ "hau-Latn",
+ "heb-Latn",
+ "hmn-Latn",
+ "hat-Latn",
+ "jpn-Latn",
+ "sun-Latn",
+ "bul-Latn",
+ "gla-Latn",
+ "nya-Latn",
+ "pus-Arab",
+ "kur-Latn",
+ "hbs-Latn",
+ "amh-Ethi",
+ "ibo-Latn",
+ "lao-Laoo",
+ "mri-Latn",
+ "nno-Latn",
+ "smo-Latn",
+ "yid-Hebr",
+ "sot-Latn",
+ "tgl-Latn",
+ "xho-Latn",
+ "yor-Latn",
+ ]
+
+ nomic_embed_text_v2_moe = ModelMeta(
+ loader=NomicWrapper, # type: ignore
+ loader_kwargs=dict(
+ trust_remote_code=True,
+ model_prompts=model_prompts,
+ ),
+ name="nomic-ai/nomic-embed-text-v2-moe",
+ languages=m_languages,
+ open_weights=True,
+ revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
+ release_date="2025-02-07",
+ n_parameters=475292928,
+ memory_usage_mb=1813,
+ max_tokens=512,
+ embed_dim=768,
+ license="apache-2.0",
+ reference="https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
+ similarity_fn_name=ScoringFunction.COSINE,
+ framework=["Sentence Transformers", "PyTorch"],
+ use_instructions=True,
+ adapted_from="nomic-ai/nomic-xlm-2048",
+ public_training_data="https://github.com/nomic-ai/contrastors?tab=readme-ov-file#data-access",
+ public_training_code="https://github.com/nomic-ai/contrastors/blob/613ddfd37309e538cceadb05b1e6423e7b09f603/src/contrastors/configs/train/contrastive_finetune_moe.yaml",
+ training_datasets=None, # did not look into this further
+ superseded_by=None,
+ citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
+ title={Training Sparse Mixture Of Experts Text Embedding Models},
+ author={Zach Nussbaum and Brandon Duderstadt},
+ year={2025},
+ eprint={2502.07972},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2502.07972},
+ }""",
  )
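
The largest addition registers nomic-ai/nomic-embed-text-v2-moe (768-dim embeddings, 512-token context, 98 language-script codes). A minimal sketch of picking it up through mteb, assuming the 2.x get_model / get_model_meta entry points and that trusting the model's remote code is acceptable (the registered loader sets trust_remote_code=True):

# Hypothetical usage sketch, not part of the diff.
import mteb

meta = mteb.get_model_meta("nomic-ai/nomic-embed-text-v2-moe")
print(meta.revision, meta.embed_dim, meta.max_tokens)  # pinned revision, 768, 512
print(len(meta.languages))                             # 98 language-script codes

# get_model_meta only reads the registry entry; get_model downloads and wraps the checkpoint.
model = mteb.get_model("nomic-ai/nomic-embed-text-v2-moe")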
mteb/models/model_implementations/pylate_models.py
@@ -415,6 +415,30 @@ jina_colbert_v2 = ModelMeta(
  "DuRetrieval",
  "MIRACL",
  },
+ citation="""@inproceedings{xiao-etal-2024-jina,
+ title = "{J}ina-{C}ol{BERT}-v2: A General-Purpose Multilingual Late Interaction Retriever",
+ author = {Jha, Rohan and
+ Wang, Bo and
+ G{\"u}nther, Michael and
+ Mastrapas, Georgios and
+ Sturua, Saba and
+ Mohr, Isabelle and
+ Koukounas, Andreas and
+ Wang, Mohammad Kalim and
+ Wang, Nan and
+ Xiao, Han},
+ editor = {S{\"a}lev{\"a}, Jonne and
+ Owodunni, Abraham},
+ booktitle = "Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024)",
+ month = nov,
+ year = "2024",
+ address = "Miami, Florida, USA",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2024.mrl-1.11/",
+ doi = "10.18653/v1/2024.mrl-1.11",
+ pages = "159--166",
+ abstract = "Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT`s late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context window and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that the reducing the embedding dimensionality from 128 to 64 has insignificant impact on the model`s retrieval performance and cut storage requirements by up to 50{\%}. Our new model, Jina-ColBERT-v2, demonstrates strong performance across a range of English and multilingual retrieval tasks,"
+ }""",
  )


@@ -444,4 +468,13 @@ lightonai__gte_moderncolbert_v1 = ModelMeta(
  "MSMARCO",
  "mMARCO-NL",
  },
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084"
+ }""",
  )
mteb/models/model_implementations/ru_sentence_models.py
@@ -439,6 +439,13 @@ user_bge_m3 = ModelMeta(
  },
  public_training_code=None,
  public_training_data=None,
+ citation="""@misc{deepvk2024user,
+ title={USER: Universal Sentence Encoder for Russian},
+ author={Malashenko, Boris and Zemerov, Anton and Spirin, Egor},
+ url={https://huggingface.co/datasets/deepvk/USER-base},
+ publisher={Hugging Face},
+ year={2024},
+ }""",
  )

  deberta_v1_ru = ModelMeta(
@@ -873,6 +880,7 @@ frida = ModelMeta(
  public_training_data=None,
  public_training_code=None,
  framework=["Sentence Transformers", "PyTorch"],
+ citation=None,
  )

  giga_embeddings = ModelMeta(
@@ -1008,6 +1016,13 @@ user2_small = ModelMeta(
  public_training_data=None,
  public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e",
  framework=["Sentence Transformers", "PyTorch"],
+ citation="""@misc{deepvk2025user,
+ title={USER2},
+ author={Malashenko, Boris and Spirin, Egor and Sokolov Andrey},
+ url={https://huggingface.co/deepvk/USER2-small},
+ publisher={Hugging Face},
+ year={2025},
+ }""",
  )

  user2_base = ModelMeta(
@@ -1033,4 +1048,11 @@ user2_base = ModelMeta(
  public_training_data=None,
  public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e",
  framework=["Sentence Transformers", "PyTorch"],
+ citation="""@misc{deepvk2025user,
+ title={USER2},
+ author={Malashenko, Boris and Spirin, Egor and Sokolov Andrey},
+ url={https://huggingface.co/deepvk/USER2-base},
+ publisher={Hugging Face},
+ year={2025},
+ }""",
  )
mteb/models/model_implementations/sentence_transformers_models.py
@@ -402,6 +402,15 @@ static_similarity_mrl_multilingual_v1 = ModelMeta(
  training_datasets=static_multi_datasets,
  public_training_code="https://huggingface.co/blog/static-embeddings",
  public_training_data="https://huggingface.co/collections/sentence-transformers/embedding-model-datasets-6644d7a3673a511914aa7552",
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )

  contriever = ModelMeta(
@@ -467,6 +476,17 @@ microllama_text_embedding = ModelMeta(
  public_training_data=None,
  )

+ SENTENCE_T5_CITATION = """
+ @misc{ni2021sentencet5scalablesentenceencoders,
+ title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models},
+ author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
+ year={2021},
+ eprint={2108.08877},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2108.08877},
+ }
+ """
  sentence_t5_base = ModelMeta(
  loader=sentence_transformers_loader,
  name="sentence-transformers/sentence-t5-base",
@@ -486,6 +506,7 @@ sentence_t5_base = ModelMeta(
  public_training_code=None,
  public_training_data=None,
  training_datasets={"SNLI", "Community QA"},
+ citation=SENTENCE_T5_CITATION,
  )

  sentence_t5_large = ModelMeta(
@@ -507,6 +528,7 @@ sentence_t5_large = ModelMeta(
  public_training_code=None,
  public_training_data=None,
  training_datasets={"SNLI", "Community QA"},
+ citation=SENTENCE_T5_CITATION,
  )

  sentence_t5_xl = ModelMeta(
@@ -528,6 +550,7 @@ sentence_t5_xl = ModelMeta(
  public_training_code=None,
  public_training_data=None,
  training_datasets={"SNLI", "Community QA"},
+ citation=SENTENCE_T5_CITATION,
  )

  sentence_t5_xxl = ModelMeta(
@@ -549,7 +572,19 @@ sentence_t5_xxl = ModelMeta(
  public_training_code=None,
  public_training_data=None,
  training_datasets={"SNLI", "Community QA"},
+ citation=SENTENCE_T5_CITATION,
  )
+ GTR_CITATION = """
+ @misc{ni2021largedualencodersgeneralizable,
+ title={Large Dual Encoders Are Generalizable Retrievers},
+ author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
+ year={2021},
+ eprint={2112.07899},
+ archivePrefix={arXiv},
+ primaryClass={cs.IR},
+ url={https://arxiv.org/abs/2112.07899},
+ }
+ """
  gtr_t5_large = ModelMeta(
  loader=sentence_transformers_loader,
  name="sentence-transformers/gtr-t5-large",
@@ -581,6 +616,7 @@ gtr_t5_large = ModelMeta(
  "NQ-PL", # translation not trained on
  "Community QA",
  },
+ citation=GTR_CITATION,
  )

  gtr_t5_xl = ModelMeta(
@@ -614,6 +650,7 @@ gtr_t5_xl = ModelMeta(
  "NQ-PL", # translation not trained on
  "Community QA",
  },
+ citation=GTR_CITATION,
  )
  gtr_t5_xxl = ModelMeta(
  loader=sentence_transformers_loader,
@@ -646,6 +683,7 @@ gtr_t5_xxl = ModelMeta(
  "NQ-PL", # translation not trained on
  "Community QA",
  },
+ citation=GTR_CITATION,
  )

  gtr_t5_base = ModelMeta(
@@ -679,4 +717,5 @@ gtr_t5_base = ModelMeta(
  "NQ-PL", # translation not trained on
  "Community QA",
  },
+ citation=GTR_CITATION,
  )
mteb/models/model_implementations/spartan8806_atles_champion.py
@@ -23,4 +23,11 @@ spartan8806_atles_champion_embedding = ModelMeta(
  adapted_from="sentence-transformers/all-mpnet-base-v2",
  public_training_code=None,
  public_training_data=None,
+ citation="""@article{conner2025epistemic,
+ title={The Epistemic Barrier: How RLHF Makes AI Consciousness Empirically Undecidable},
+ author={Conner (spartan8806)},
+ journal={ATLES Research Papers},
+ year={2025},
+ note={Cross-model validation study (Phoenix, Grok, Gemini, Claude)}
+ }""",
  )
mteb/models/model_implementations/ua_sentence_models.py
@@ -28,4 +28,13 @@ xlm_roberta_ua_distilled = ModelMeta(
  modalities=["text"],
  public_training_data=None,
  use_instructions=False,
+ citation="""@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }""",
  )
mteb/models/model_implementations/vi_vn_models.py
@@ -75,6 +75,12 @@ aiteamvn_vietnamese_embeddings = ModelMeta(
  public_training_data=None,
  training_datasets=None,
  adapted_from="BAAI/bge-m3",
+ citation="""@misc{Vietnamese_Embedding,
+ title={Vietnamese_Embedding: Embedding model in Vietnamese language.},
+ author={Nguyen Nho Trung, Nguyen Nhat Quang, Nguyen Van Huy},
+ year={2025},
+ publisher={Huggingface},
+ }""",
  )

  hiieu_halong_embedding = ModelMeta(
@@ -99,6 +105,12 @@ hiieu_halong_embedding = ModelMeta(
  public_training_data=None,
  training_datasets=None,
  adapted_from="intfloat/multilingual-e5-base",
+ citation="""@misc{HalongEmbedding,
+ title={HalongEmbedding: A Vietnamese Text Embedding},
+ author={Ngo Hieu},
+ year={2024},
+ publisher={Huggingface},
+ }""",
  )

  sup_simcse_vietnamese_phobert_base_ = ModelMeta(
@@ -122,6 +134,20 @@ sup_simcse_vietnamese_phobert_base_ = ModelMeta(
  reference="https://huggingface.co/VoVanPhuc/sup-SimCSE-VietNamese-phobert-base",
  similarity_fn_name="cosine",
  training_datasets=None,
+ citation="""@article{gao2021simcse,
+ title={{SimCSE}: Simple Contrastive Learning of Sentence Embeddings},
+ author={Gao, Tianyu and Yao, Xingcheng and Chen, Danqi},
+ journal={arXiv preprint arXiv:2104.08821},
+ year={2021}
+ }
+
+ @inproceedings{phobert,
+ title = {{PhoBERT: Pre-trained language models for Vietnamese}},
+ author = {Dat Quoc Nguyen and Anh Tuan Nguyen},
+ booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
+ year = {2020},
+ pages = {1037--1042}
+ }""",
  )

  bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
@@ -145,4 +171,11 @@ bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
  reference="https://huggingface.co/bkai-foundation-models/vietnamese-bi-encoder",
  similarity_fn_name="cosine",
  training_datasets=None,
+ citation="""
+ @article{duc2024towards,
+ title={Towards Comprehensive Vietnamese Retrieval-Augmented Generation and Large Language Models},
+ author={Nguyen Quang Duc, Le Hai Son, Nguyen Duc Nhan, Nguyen Dich Nhat Minh, Le Thanh Huong, Dinh Viet Sang},
+ journal={arXiv preprint arXiv:2403.01616},
+ year={2024}
+ }""",
  )
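
Taken together, these hunks thread citation metadata through most registered models, which makes it possible to assemble a bibliography programmatically. A minimal sketch under the assumption that get_model_metas() is still exported at the package level and that citation defaults to None where no entry was added (as with frida above):

# Hypothetical usage sketch, not part of the diff.
import mteb

bibliography = {
    meta.name: meta.citation
    for meta in mteb.get_model_metas()
    if meta.citation  # skip entries whose citation is still None
}
print(len(bibliography), "registered models now carry a BibTeX citation")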