mteb-2.3.11-py3-none-any.whl → mteb-2.4.2-py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +57 -0
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/evaluate.py +2 -33
- mteb/leaderboard/figures.py +1 -1
- mteb/leaderboard/table.py +1 -11
- mteb/models/abs_encoder.py +21 -17
- mteb/models/get_model_meta.py +3 -123
- mteb/models/instruct_wrapper.py +2 -1
- mteb/models/model_implementations/andersborges.py +12 -0
- mteb/models/model_implementations/bge_models.py +43 -0
- mteb/models/model_implementations/bica_model.py +34 -0
- mteb/models/model_implementations/dino_models.py +152 -0
- mteb/models/model_implementations/emillykkejensen_models.py +18 -0
- mteb/models/model_implementations/euler_models.py +6 -0
- mteb/models/model_implementations/fa_models.py +50 -0
- mteb/models/model_implementations/facebookai.py +44 -0
- mteb/models/model_implementations/google_models.py +10 -0
- mteb/models/model_implementations/gte_models.py +69 -0
- mteb/models/model_implementations/kalm_models.py +38 -0
- mteb/models/model_implementations/kblab.py +6 -0
- mteb/models/model_implementations/kowshik24_models.py +9 -0
- mteb/models/model_implementations/misc_models.py +293 -0
- mteb/models/model_implementations/mod_models.py +189 -0
- mteb/models/model_implementations/mxbai_models.py +6 -0
- mteb/models/model_implementations/nomic_models.py +150 -4
- mteb/models/model_implementations/pylate_models.py +33 -0
- mteb/models/model_implementations/ru_sentence_models.py +22 -0
- mteb/models/model_implementations/sentence_transformers_models.py +39 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +7 -0
- mteb/models/model_implementations/ua_sentence_models.py +9 -0
- mteb/models/model_implementations/vi_vn_models.py +33 -0
- mteb/models/model_meta.py +396 -19
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/METADATA +1 -1
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/RECORD +55 -41
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/WHEEL +0 -0
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.11.dist-info → mteb-2.4.2.dist-info}/top_level.txt +0 -0
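
Most of the model_implementations changes below add a `citation` field to existing `ModelMeta` entries, plus a new `nomic-embed-text-v2-moe` entry. A minimal sketch of reading the new field follows; it assumes an installed mteb 2.4.2 and that `mteb.get_model_meta` (backed by mteb/models/get_model_meta.py listed above) is the public accessor, as in previous releases.

import mteb

# Hypothetical usage sketch: look up the metadata record for one of the models
# touched in this release and read the newly added `citation` attribute.
meta = mteb.get_model_meta("nomic-ai/nomic-embed-text-v2-moe")
print(meta.citation)  # BibTeX string, or None when no citation is recorded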
@@ -99,4 +99,10 @@ mxbai_embed_xsmall_v1 = ModelMeta(
     public_training_code=None,
     public_training_data=None,
     training_datasets=mixedbread_training_data,
+    citation="""@online{xsmall2024mxbai,
+    title={Every Byte Matters: Introducing mxbai-embed-xsmall-v1},
+    author={Sean Lee and Julius Lipp and Rui Huang and Darius Koenig},
+    year={2024},
+    url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
+    }""",
 )
@@ -193,7 +193,7 @@ NOMIC_CITATION = """
 """
 
 nomic_embed_v1_5 = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -221,7 +221,7 @@ nomic_embed_v1_5 = ModelMeta(
 )
 
 nomic_embed_v1 = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -249,7 +249,7 @@ nomic_embed_v1 = ModelMeta(
 )
 
 nomic_embed_v1_ablated = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
    loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -276,7 +276,7 @@ nomic_embed_v1_ablated = ModelMeta(
 )
 
 nomic_embed_v1_unsupervised = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -328,4 +328,150 @@ nomic_modern_bert_embed = ModelMeta(
     superseded_by=None,
     training_datasets=nomic_training_data,
     public_training_data=None,
+    citation="""@misc{nussbaum2024nomic,
+    title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
+    author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
+    year={2024},
+    eprint={2402.01613},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+    }""",
+)
+
+
+m_languages = [
+    "eng-Latn",
+    "spa-Latn",
+    "fra-Latn",
+    "deu-Latn",
+    "ita-Latn",
+    "por-Latn",
+    "pol-Latn",
+    "nld-Latn",
+    "tur-Latn",
+    "jpn-Jpan",
+    "vie-Latn",
+    "rus-Cyrl",
+    "ind-Latn",
+    "arb-Arab",
+    "ces-Latn",
+    "ron-Latn",
+    "swe-Latn",
+    "ell-Grek",
+    "ukr-Cyrl",
+    "zho-Hans",
+    "hun-Latn",
+    "dan-Latn",
+    "nor-Latn",
+    "hin-Deva",
+    "fin-Latn",
+    "bul-Cyrl",
+    "kor-Hang",
+    "slk-Latn",
+    "tha-Thai",
+    "heb-Hebr",
+    "cat-Latn",
+    "lit-Latn",
+    "fas-Arab",
+    "msa-Latn",
+    "slv-Latn",
+    "lav-Latn",
+    "mar-Deva",
+    "ben-Beng",
+    "sqi-Latn",
+    "cym-Latn",
+    "bel-Cyrl",
+    "mal-Mlym",
+    "kan-Knda",
+    "mkd-Cyrl",
+    "urd-Arab",
+    "fry-Latn",
+    "fil-Latn",
+    "tel-Telu",
+    "eus-Latn",
+    "swh-Latn",
+    "som-Latn",
+    "snd-Arab",
+    "uzb-Latn",
+    "cos-Latn",
+    "hrv-Latn",
+    "guj-Gujr",
+    "hin-Latn",
+    "ceb-Latn",
+    "epo-Latn",
+    "jav-Latn",
+    "lat-Latn",
+    "zul-Latn",
+    "mon-Cyrl",
+    "sin-Sinh",
+    "ell-Latn",
+    "gle-Latn",
+    "kir-Cyrl",
+    "tgk-Cyrl",
+    "mya-Mymr",
+    "khm-Khmr",
+    "mlg-Latn",
+    "pan-Guru",
+    "rus-Latn",
+    "sna-Latn",
+    "zho-Latn",
+    "hau-Latn",
+    "heb-Latn",
+    "hmn-Latn",
+    "hat-Latn",
+    "jpn-Latn",
+    "sun-Latn",
+    "bul-Latn",
+    "gla-Latn",
+    "nya-Latn",
+    "pus-Arab",
+    "kur-Latn",
+    "hbs-Latn",
+    "amh-Ethi",
+    "ibo-Latn",
+    "lao-Laoo",
+    "mri-Latn",
+    "nno-Latn",
+    "smo-Latn",
+    "yid-Hebr",
+    "sot-Latn",
+    "tgl-Latn",
+    "xho-Latn",
+    "yor-Latn",
+]
+
+nomic_embed_text_v2_moe = ModelMeta(
+    loader=NomicWrapper,  # type: ignore
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        model_prompts=model_prompts,
+    ),
+    name="nomic-ai/nomic-embed-text-v2-moe",
+    languages=m_languages,
+    open_weights=True,
+    revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
+    release_date="2025-02-07",
+    n_parameters=475292928,
+    memory_usage_mb=1813,
+    max_tokens=512,
+    embed_dim=768,
+    license="apache-2.0",
+    reference="https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    adapted_from="nomic-ai/nomic-xlm-2048",
+    public_training_data="https://github.com/nomic-ai/contrastors?tab=readme-ov-file#data-access",
+    public_training_code="https://github.com/nomic-ai/contrastors/blob/613ddfd37309e538cceadb05b1e6423e7b09f603/src/contrastors/configs/train/contrastive_finetune_moe.yaml",
+    training_datasets=None,  # did not look into this further
+    superseded_by=None,
+    citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
+    title={Training Sparse Mixture Of Experts Text Embedding Models},
+    author={Zach Nussbaum and Brandon Duderstadt},
+    year={2025},
+    eprint={2502.07972},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2502.07972},
+    }""",
 )
@@ -415,6 +415,30 @@ jina_colbert_v2 = ModelMeta(
         "DuRetrieval",
         "MIRACL",
     },
+    citation="""@inproceedings{xiao-etal-2024-jina,
+    title = "{J}ina-{C}ol{BERT}-v2: A General-Purpose Multilingual Late Interaction Retriever",
+    author = {Jha, Rohan and
+      Wang, Bo and
+      G{\"u}nther, Michael and
+      Mastrapas, Georgios and
+      Sturua, Saba and
+      Mohr, Isabelle and
+      Koukounas, Andreas and
+      Wang, Mohammad Kalim and
+      Wang, Nan and
+      Xiao, Han},
+    editor = {S{\"a}lev{\"a}, Jonne and
+      Owodunni, Abraham},
+    booktitle = "Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024)",
+    month = nov,
+    year = "2024",
+    address = "Miami, Florida, USA",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2024.mrl-1.11/",
+    doi = "10.18653/v1/2024.mrl-1.11",
+    pages = "159--166",
+    abstract = "Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT`s late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context window and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that the reducing the embedding dimensionality from 128 to 64 has insignificant impact on the model`s retrieval performance and cut storage requirements by up to 50{\%}. Our new model, Jina-ColBERT-v2, demonstrates strong performance across a range of English and multilingual retrieval tasks,"
+    }""",
 )
 
 
@@ -444,4 +468,13 @@ lightonai__gte_moderncolbert_v1 = ModelMeta(
         "MSMARCO",
         "mMARCO-NL",
     },
+    citation="""@inproceedings{reimers-2019-sentence-bert,
+    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+    author = "Reimers, Nils and Gurevych, Iryna",
+    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+    month = "11",
+    year = "2019",
+    publisher = "Association for Computational Linguistics",
+    url = "https://arxiv.org/abs/1908.10084"
+    }""",
 )
@@ -439,6 +439,13 @@ user_bge_m3 = ModelMeta(
     },
     public_training_code=None,
     public_training_data=None,
+    citation="""@misc{deepvk2024user,
+    title={USER: Universal Sentence Encoder for Russian},
+    author={Malashenko, Boris and Zemerov, Anton and Spirin, Egor},
+    url={https://huggingface.co/datasets/deepvk/USER-base},
+    publisher={Hugging Face},
+    year={2024},
+    }""",
 )
 
 deberta_v1_ru = ModelMeta(
@@ -873,6 +880,7 @@ frida = ModelMeta(
     public_training_data=None,
     public_training_code=None,
     framework=["Sentence Transformers", "PyTorch"],
+    citation=None,
 )
 
 giga_embeddings = ModelMeta(
@@ -1008,6 +1016,13 @@ user2_small = ModelMeta(
     public_training_data=None,
     public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e",
     framework=["Sentence Transformers", "PyTorch"],
+    citation="""@misc{deepvk2025user,
+    title={USER2},
+    author={Malashenko, Boris and Spirin, Egor and Sokolov Andrey},
+    url={https://huggingface.co/deepvk/USER2-small},
+    publisher={Hugging Face},
+    year={2025},
+    }""",
 )
 
 user2_base = ModelMeta(
@@ -1033,4 +1048,11 @@ user2_base = ModelMeta(
     public_training_data=None,
     public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e",
     framework=["Sentence Transformers", "PyTorch"],
+    citation="""@misc{deepvk2025user,
+    title={USER2},
+    author={Malashenko, Boris and Spirin, Egor and Sokolov Andrey},
+    url={https://huggingface.co/deepvk/USER2-base},
+    publisher={Hugging Face},
+    year={2025},
+    }""",
 )
@@ -402,6 +402,15 @@ static_similarity_mrl_multilingual_v1 = ModelMeta(
     training_datasets=static_multi_datasets,
     public_training_code="https://huggingface.co/blog/static-embeddings",
     public_training_data="https://huggingface.co/collections/sentence-transformers/embedding-model-datasets-6644d7a3673a511914aa7552",
+    citation="""@inproceedings{reimers-2019-sentence-bert,
+    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+    author = "Reimers, Nils and Gurevych, Iryna",
+    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+    month = "11",
+    year = "2019",
+    publisher = "Association for Computational Linguistics",
+    url = "https://arxiv.org/abs/1908.10084",
+    }""",
 )
 
 contriever = ModelMeta(
@@ -467,6 +476,17 @@ microllama_text_embedding = ModelMeta(
     public_training_data=None,
 )
 
+SENTENCE_T5_CITATION = """
+@misc{ni2021sentencet5scalablesentenceencoders,
+  title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models},
+  author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
+  year={2021},
+  eprint={2108.08877},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2108.08877},
+}
+"""
 sentence_t5_base = ModelMeta(
     loader=sentence_transformers_loader,
     name="sentence-transformers/sentence-t5-base",
@@ -486,6 +506,7 @@ sentence_t5_base = ModelMeta(
     public_training_code=None,
     public_training_data=None,
     training_datasets={"SNLI", "Community QA"},
+    citation=SENTENCE_T5_CITATION,
 )
 
 sentence_t5_large = ModelMeta(
@@ -507,6 +528,7 @@ sentence_t5_large = ModelMeta(
     public_training_code=None,
     public_training_data=None,
     training_datasets={"SNLI", "Community QA"},
+    citation=SENTENCE_T5_CITATION,
 )
 
 sentence_t5_xl = ModelMeta(
@@ -528,6 +550,7 @@ sentence_t5_xl = ModelMeta(
     public_training_code=None,
     public_training_data=None,
     training_datasets={"SNLI", "Community QA"},
+    citation=SENTENCE_T5_CITATION,
 )
 
 sentence_t5_xxl = ModelMeta(
@@ -549,7 +572,19 @@ sentence_t5_xxl = ModelMeta(
     public_training_code=None,
     public_training_data=None,
     training_datasets={"SNLI", "Community QA"},
+    citation=SENTENCE_T5_CITATION,
 )
+GTR_CITATION = """
+@misc{ni2021largedualencodersgeneralizable,
+  title={Large Dual Encoders Are Generalizable Retrievers},
+  author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
+  year={2021},
+  eprint={2112.07899},
+  archivePrefix={arXiv},
+  primaryClass={cs.IR},
+  url={https://arxiv.org/abs/2112.07899},
+}
+"""
 gtr_t5_large = ModelMeta(
     loader=sentence_transformers_loader,
     name="sentence-transformers/gtr-t5-large",
@@ -581,6 +616,7 @@ gtr_t5_large = ModelMeta(
         "NQ-PL",  # translation not trained on
         "Community QA",
     },
+    citation=GTR_CITATION,
 )
 
 gtr_t5_xl = ModelMeta(
@@ -614,6 +650,7 @@ gtr_t5_xl = ModelMeta(
         "NQ-PL",  # translation not trained on
         "Community QA",
     },
+    citation=GTR_CITATION,
 )
 gtr_t5_xxl = ModelMeta(
     loader=sentence_transformers_loader,
@@ -646,6 +683,7 @@ gtr_t5_xxl = ModelMeta(
         "NQ-PL",  # translation not trained on
         "Community QA",
     },
+    citation=GTR_CITATION,
 )
 
 gtr_t5_base = ModelMeta(
@@ -679,4 +717,5 @@ gtr_t5_base = ModelMeta(
         "NQ-PL",  # translation not trained on
         "Community QA",
     },
+    citation=GTR_CITATION,
 )
@@ -23,4 +23,11 @@ spartan8806_atles_champion_embedding = ModelMeta(
     adapted_from="sentence-transformers/all-mpnet-base-v2",
     public_training_code=None,
     public_training_data=None,
+    citation="""@article{conner2025epistemic,
+    title={The Epistemic Barrier: How RLHF Makes AI Consciousness Empirically Undecidable},
+    author={Conner (spartan8806)},
+    journal={ATLES Research Papers},
+    year={2025},
+    note={Cross-model validation study (Phoenix, Grok, Gemini, Claude)}
+    }""",
 )
@@ -28,4 +28,13 @@ xlm_roberta_ua_distilled = ModelMeta(
     modalities=["text"],
     public_training_data=None,
     use_instructions=False,
+    citation="""@inproceedings{reimers-2019-sentence-bert,
+    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+    author = "Reimers, Nils and Gurevych, Iryna",
+    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+    month = "11",
+    year = "2019",
+    publisher = "Association for Computational Linguistics",
+    url = "https://arxiv.org/abs/1908.10084",
+    }""",
 )
@@ -75,6 +75,12 @@ aiteamvn_vietnamese_embeddings = ModelMeta(
     public_training_data=None,
     training_datasets=None,
     adapted_from="BAAI/bge-m3",
+    citation="""@misc{Vietnamese_Embedding,
+    title={Vietnamese_Embedding: Embedding model in Vietnamese language.},
+    author={Nguyen Nho Trung, Nguyen Nhat Quang, Nguyen Van Huy},
+    year={2025},
+    publisher={Huggingface},
+    }""",
 )
 
 hiieu_halong_embedding = ModelMeta(
@@ -99,6 +105,12 @@ hiieu_halong_embedding = ModelMeta(
     public_training_data=None,
     training_datasets=None,
     adapted_from="intfloat/multilingual-e5-base",
+    citation="""@misc{HalongEmbedding,
+    title={HalongEmbedding: A Vietnamese Text Embedding},
+    author={Ngo Hieu},
+    year={2024},
+    publisher={Huggingface},
+    }""",
 )
 
 sup_simcse_vietnamese_phobert_base_ = ModelMeta(
@@ -122,6 +134,20 @@ sup_simcse_vietnamese_phobert_base_ = ModelMeta(
     reference="https://huggingface.co/VoVanPhuc/sup-SimCSE-VietNamese-phobert-base",
     similarity_fn_name="cosine",
     training_datasets=None,
+    citation="""@article{gao2021simcse,
+    title={{SimCSE}: Simple Contrastive Learning of Sentence Embeddings},
+    author={Gao, Tianyu and Yao, Xingcheng and Chen, Danqi},
+    journal={arXiv preprint arXiv:2104.08821},
+    year={2021}
+    }
+
+    @inproceedings{phobert,
+    title = {{PhoBERT: Pre-trained language models for Vietnamese}},
+    author = {Dat Quoc Nguyen and Anh Tuan Nguyen},
+    booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
+    year = {2020},
+    pages = {1037--1042}
+    }""",
 )
 
 bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
@@ -145,4 +171,11 @@ bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
     reference="https://huggingface.co/bkai-foundation-models/vietnamese-bi-encoder",
     similarity_fn_name="cosine",
     training_datasets=None,
+    citation="""
+    @article{duc2024towards,
+    title={Towards Comprehensive Vietnamese Retrieval-Augmented Generation and Large Language Models},
+    author={Nguyen Quang Duc, Le Hai Son, Nguyen Duc Nhan, Nguyen Dich Nhat Minh, Le Thanh Huong, Dinh Viet Sang},
+    journal={arXiv preprint arXiv:2403.01616},
+    year={2024}
+    }""",
 )