mteb 2.3.11__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +57 -0
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/evaluate.py +2 -33
- mteb/leaderboard/figures.py +1 -1
- mteb/leaderboard/table.py +1 -11
- mteb/models/abs_encoder.py +21 -17
- mteb/models/get_model_meta.py +3 -123
- mteb/models/instruct_wrapper.py +2 -1
- mteb/models/model_implementations/bica_model.py +34 -0
- mteb/models/model_implementations/google_models.py +10 -0
- mteb/models/model_implementations/mod_models.py +204 -0
- mteb/models/model_implementations/nomic_models.py +142 -4
- mteb/models/model_meta.py +396 -19
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/METADATA +1 -1
- {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/RECORD +36 -22
- {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/WHEEL +0 -0
- {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.11.dist-info → mteb-2.4.1.dist-info}/top_level.txt +0 -0
mteb/models/model_implementations/bica_model.py (new file)
@@ -0,0 +1,34 @@
+from mteb.models import ModelMeta, sentence_transformers_loader
+
+bica_base = ModelMeta(
+    name="bisectgroup/BiCA-base",
+    loader=sentence_transformers_loader,
+    languages=["eng-Latn"],
+    open_weights=True,
+    revision="31237a836e5ae908c308a256573e5f0986498574",
+    release_date="2025-11-14",
+    n_parameters=110_000_000,
+    memory_usage_mb=418,
+    embed_dim=768,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/bisectgroup/BiCA-base",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code="https://github.com/NiravBhattLab/BiCA",
+    public_training_data="https://huggingface.co/datasets/bisectgroup/hard-negatives-traversal",
+    adapted_from="thenlper/gte-base",
+    citation="""
+@misc{sinha2025bicaeffectivebiomedicaldense,
+      title={BiCA: Effective Biomedical Dense Retrieval with Citation-Aware Hard Negatives},
+      author={Aarush Sinha and Pavan Kumar S and Roshan Balaji and Nirav Pravinbhai Bhatt},
+      year={2025},
+      eprint={2511.08029},
+      archivePrefix={arXiv},
+      primaryClass={cs.IR},
+      url={https://arxiv.org/abs/2511.08029},
+}
+""",
+    training_datasets=set(),
+)
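The new module registers BiCA-base through the standard sentence_transformers_loader, so it can be resolved by name like any other registered model. A minimal sketch, assuming mteb's top-level get_model_meta/get_model helpers:

import mteb

# Metadata lookup for the entry added above (no checkpoint download needed).
meta = mteb.get_model_meta("bisectgroup/BiCA-base")
print(meta.revision, meta.embed_dim, meta.adapted_from)

# Instantiate the encoder via its registered loader (downloads the
# sentence-transformers checkpoint from the Hugging Face Hub).
model = mteb.get_model("bisectgroup/BiCA-base")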
mteb/models/model_implementations/google_models.py
@@ -272,4 +272,14 @@ embedding_gemma_300m = ModelMeta(
     training_datasets=GECKO_TRAINING_DATA,
     similarity_fn_name="cosine",
     memory_usage_mb=1155,
+    citation="""
+@misc{vera2025embeddinggemmapowerfullightweighttext,
+      title={EmbeddingGemma: Powerful and Lightweight Text Representations},
+      author={Henrique Schechter Vera and Sahil Dua and Biao Zhang and Daniel Salz and Ryan Mullins and Sindhu Raghuram Panyam and Sara Smoot and Iftekhar Naim and Joe Zou and Feiyang Chen and Daniel Cer and Alice Lisak and Min Choi and Lucas Gonzalez and Omar Sanseviero and Glenn Cameron and Ian Ballantyne and Kat Black and Kaifeng Chen and Weiyi Wang and Zhe Li and Gus Martins and Jinhyuk Lee and Mark Sherwood and Juyeong Ji and Renjie Wu and Jingxiao Zheng and Jyotinder Singh and Abheesht Sharma and Divyashree Sreepathihalli and Aashi Jain and Adham Elarabawy and AJ Co and Andreas Doumanoglou and Babak Samari and Ben Hora and Brian Potetz and Dahun Kim and Enrique Alfonseca and Fedor Moiseev and Feng Han and Frank Palma Gomez and Gustavo Hernández Ábrego and Hesen Zhang and Hui Hui and Jay Han and Karan Gill and Ke Chen and Koert Chen and Madhuri Shanbhogue and Michael Boratko and Paul Suganthan and Sai Meher Karthik Duddu and Sandeep Mariserla and Setareh Ariafar and Shanfeng Zhang and Shijie Zhang and Simon Baumgartner and Sonam Goenka and Steve Qiu and Tanmaya Dabral and Trevor Walker and Vikram Rao and Waleed Khawaja and Wenlei Zhou and Xiaoqi Ren and Ye Xia and Yichang Chen and Yi-Ting Chen and Zhe Dong and Zhongli Ding and Francesco Visin and Gaël Liu and Jiageng Zhang and Kathleen Kenealy and Michelle Casbon and Ravin Kumar and Thomas Mesnard and Zach Gleicher and Cormac Brick and Olivier Lacombe and Adam Roberts and Qin Yin and Yunhsuan Sung and Raphael Hoffmann and Tris Warkentin and Armand Joulin and Tom Duerig and Mojtaba Seyedhosseini},
+      year={2025},
+      eprint={2509.20354},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2509.20354},
+}""",
 )
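This hunk only attaches the EmbeddingGemma BibTeX entry to the existing ModelMeta, which makes the citation available programmatically. A small sketch, assuming the entry is registered under the Hugging Face ID google/embeddinggemma-300m (the registered name is not visible in this hunk):

import mteb

# The citation added above is stored as a plain BibTeX string on the metadata.
meta = mteb.get_model_meta("google/embeddinggemma-300m")  # name assumed from the HF model ID
print(meta.citation)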
mteb/models/model_implementations/mod_models.py (new file)
@@ -0,0 +1,204 @@
+from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_meta import ModelMeta
+from mteb.models.models_protocols import EncoderProtocol, PromptType
+
+
+def instruction_template(
+    instruction: str, prompt_type: PromptType | None = None
+) -> str:
+    if not instruction or prompt_type == PromptType.document:
+        return ""
+    if isinstance(instruction, dict):
+        if prompt_type is None:
+            instruction = next(iter(instruction.values()))  # TODO
+        else:
+            instruction = instruction[prompt_type]
+    return f"Instruct: {instruction}\nQuery:"
+
+
+multilingual_langs = [
+    "afr-Latn",
+    "ara-Arab",
+    "aze-Latn",
+    "bel-Cyrl",
+    "bul-Cyrl",
+    "ben-Beng",
+    "cat-Latn",
+    "ceb-Latn",
+    "ces-Latn",
+    "cym-Latn",
+    "dan-Latn",
+    "deu-Latn",
+    "ell-Grek",
+    "eng-Latn",
+    "spa-Latn",
+    "est-Latn",
+    "eus-Latn",
+    "fas-Arab",
+    "fin-Latn",
+    "fra-Latn",
+    "glg-Latn",
+    "guj-Gujr",
+    "heb-Hebr",
+    "hin-Deva",
+    "hrv-Latn",
+    "hat-Latn",
+    "hun-Latn",
+    "hye-Armn",
+    "ind-Latn",
+    "isl-Latn",
+    "ita-Latn",
+    "jpn-Jpan",
+    "jav-Latn",
+    "kat-Geor",
+    "kaz-Cyrl",
+    "khm-Khmr",
+    "kan-Knda",
+    "kor-Hang",
+    "kir-Cyrl",
+    "lao-Laoo",
+    "lit-Latn",
+    "lav-Latn",
+    "mkd-Cyrl",
+    "mal-Mlym",
+    "mon-Cyrl",
+    "mar-Deva",
+    "msa-Latn",
+    "mya-Mymr",
+    "nep-Deva",
+    "nld-Latn",
+    "nor-Latn",
+    "nob-Latn",
+    "nno-Latn",
+    "pan-Guru",
+    "pol-Latn",
+    "por-Latn",
+    "que-Latn",
+    "ron-Latn",
+    "rus-Cyrl",
+    "sin-Sinh",
+    "slk-Latn",
+    "slv-Latn",
+    "swa-Latn",
+    "tam-Taml",
+    "tel-Telu",
+    "tha-Thai",
+    "tgl-Latn",
+    "tur-Latn",
+    "ukr-Cyrl",
+    "urd-Arab",
+    "vie-Latn",
+    "yor-Latn",
+    "zho-Hans",
+]
+
+MOD_CITATION = """@misc{mod-embedding-2025,
+    title={MoD-Embedding: A Fine-tuned Multilingual Text Embedding Model},
+    author={MoD Team},
+    year={2025},
+    url={https://huggingface.co/bflhc/MoD-Embedding}
+}"""
+
+training_data = {
+    "T2Retrieval",
+    "DuRetrieval",
+    "MMarcoReranking",
+    "CMedQAv2-reranking",
+    "NQ",
+    "MSMARCO",
+    "HotpotQA",
+    "FEVER",
+    "MrTidyRetrieval",
+    "MIRACLRetrieval",
+    "CodeSearchNet",
+}
+
+# Predefined prompts for various RTEB tasks
+PREDEFINED_PROMPTS = {
+    # ========== Open Datasets ==========
+    # Legal domain
+    "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
+    "AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
+    "LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
+    "LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
+    # Code domain
+    "AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
+    "HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
+    "MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
+    "DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
+    "FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
+    # Finance domain
+    "FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
+    "FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
+    "HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
+    # Medical domain
+    "CUREv1": "Given a medical query, retrieve relevant clinical documents",
+    "ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
+    # SQL domain
+    "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
+    # Multilingual
+    "MIRACLRetrievalHardNegatives": "Given a query, retrieve relevant passages",
+    # ========== Private/Closed Datasets ==========
+    # Code domain (Private)
+    "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
+    "JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
+    # Finance domain (Private)
+    "EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
+    "EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
+    "EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
+    "EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
+    # Healthcare domain (Private)
+    "EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
+    "GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
+    # Legal domain (Private)
+    "FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+    "GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+    "JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
+    # General/Multilingual (Private)
+    "French1Retrieval": "Given a query, retrieve relevant passages",
+    "German1Retrieval": "Given a query, retrieve relevant passages",
+}
+
+
+def mod_instruct_loader(
+    model_name_or_path: str, revision: str, **kwargs
+) -> EncoderProtocol:
+    # Set default prompts_dict if not provided
+
+    model = InstructSentenceTransformerModel(
+        model_name_or_path,
+        revision=revision,
+        instruction_template=instruction_template,
+        apply_instruction_to_passages=False,
+        prompt_dicts=PREDEFINED_PROMPTS,
+        **kwargs,
+    )
+    encoder = model.model._first_module()
+    if encoder.auto_model.config._attn_implementation == "flash_attention_2":
+        # The Qwen3 code only use left padding in flash_attention_2 mode.
+        encoder.tokenizer.padding_side = "left"
+    return model
+
+
+MoD_Embedding = ModelMeta(
+    loader=mod_instruct_loader,
+    name="bflhc/MoD-Embedding",
+    languages=multilingual_langs,
+    open_weights=True,
+    revision="acbb5b70fdab262226a6af2bc62001de8021b05c",
+    release_date="2025-12-14",
+    n_parameters=4021774336,
+    memory_usage_mb=7671,
+    embed_dim=2560,
+    max_tokens=32768,
+    license="apache-2.0",
+    reference="https://huggingface.co/bflhc/MoD-Embedding",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=training_data,
+    citation=MOD_CITATION,
+    adapted_from="Qwen/Qwen3-Embedding-4B",
+)
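Because apply_instruction_to_passages=False and the template returns an empty string for document prompts, only queries receive the Instruct:/Query: prefix. A quick sketch of that behavior, assuming PromptType exposes query and document members as the code above implies:

from mteb.models.model_implementations.mod_models import instruction_template
from mteb.models.models_protocols import PromptType

task_instruction = "Given a query, retrieve relevant passages"

# Queries are wrapped in the Qwen3-style instruction template.
print(instruction_template(task_instruction, PromptType.query))
# Instruct: Given a query, retrieve relevant passages
# Query:

# Documents are encoded with no instruction prefix at all.
print(repr(instruction_template(task_instruction, PromptType.document)))
# ''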
mteb/models/model_implementations/nomic_models.py
@@ -193,7 +193,7 @@ NOMIC_CITATION = """
 """
 
 nomic_embed_v1_5 = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -221,7 +221,7 @@ nomic_embed_v1_5 = ModelMeta(
 )
 
 nomic_embed_v1 = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -249,7 +249,7 @@ nomic_embed_v1 = ModelMeta(
 )
 
 nomic_embed_v1_ablated = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -276,7 +276,7 @@ nomic_embed_v1_ablated = ModelMeta(
 )
 
 nomic_embed_v1_unsupervised = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -329,3 +329,141 @@ nomic_modern_bert_embed = ModelMeta(
     training_datasets=nomic_training_data,
     public_training_data=None,
 )
+
+
+m_languages = [
+    "eng-Latn",
+    "spa-Latn",
+    "fra-Latn",
+    "deu-Latn",
+    "ita-Latn",
+    "por-Latn",
+    "pol-Latn",
+    "nld-Latn",
+    "tur-Latn",
+    "jpn-Jpan",
+    "vie-Latn",
+    "rus-Cyrl",
+    "ind-Latn",
+    "arb-Arab",
+    "ces-Latn",
+    "ron-Latn",
+    "swe-Latn",
+    "ell-Grek",
+    "ukr-Cyrl",
+    "zho-Hans",
+    "hun-Latn",
+    "dan-Latn",
+    "nor-Latn",
+    "hin-Deva",
+    "fin-Latn",
+    "bul-Cyrl",
+    "kor-Hang",
+    "slk-Latn",
+    "tha-Thai",
+    "heb-Hebr",
+    "cat-Latn",
+    "lit-Latn",
+    "fas-Arab",
+    "msa-Latn",
+    "slv-Latn",
+    "lav-Latn",
+    "mar-Deva",
+    "ben-Beng",
+    "sqi-Latn",
+    "cym-Latn",
+    "bel-Cyrl",
+    "mal-Mlym",
+    "kan-Knda",
+    "mkd-Cyrl",
+    "urd-Arab",
+    "fry-Latn",
+    "fil-Latn",
+    "tel-Telu",
+    "eus-Latn",
+    "swh-Latn",
+    "som-Latn",
+    "snd-Arab",
+    "uzb-Latn",
+    "cos-Latn",
+    "hrv-Latn",
+    "guj-Gujr",
+    "hin-Latn",
+    "ceb-Latn",
+    "epo-Latn",
+    "jav-Latn",
+    "lat-Latn",
+    "zul-Latn",
+    "mon-Cyrl",
+    "sin-Sinh",
+    "ell-Latn",
+    "gle-Latn",
+    "kir-Cyrl",
+    "tgk-Cyrl",
+    "mya-Mymr",
+    "khm-Khmr",
+    "mlg-Latn",
+    "pan-Guru",
+    "rus-Latn",
+    "sna-Latn",
+    "zho-Latn",
+    "hau-Latn",
+    "heb-Latn",
+    "hmn-Latn",
+    "hat-Latn",
+    "jpn-Latn",
+    "sun-Latn",
+    "bul-Latn",
+    "gla-Latn",
+    "nya-Latn",
+    "pus-Arab",
+    "kur-Latn",
+    "hbs-Latn",
+    "amh-Ethi",
+    "ibo-Latn",
+    "lao-Laoo",
+    "mri-Latn",
+    "nno-Latn",
+    "smo-Latn",
+    "yid-Hebr",
+    "sot-Latn",
+    "tgl-Latn",
+    "xho-Latn",
+    "yor-Latn",
+]
+
+nomic_embed_text_v2_moe = ModelMeta(
+    loader=NomicWrapper,  # type: ignore
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        model_prompts=model_prompts,
+    ),
+    name="nomic-ai/nomic-embed-text-v2-moe",
+    languages=m_languages,
+    open_weights=True,
+    revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
+    release_date="2025-02-07",
+    n_parameters=475292928,
+    memory_usage_mb=1813,
+    max_tokens=512,
+    embed_dim=768,
+    license="apache-2.0",
+    reference="https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    adapted_from="nomic-ai/nomic-xlm-2048",
+    public_training_data="https://github.com/nomic-ai/contrastors?tab=readme-ov-file#data-access",
+    public_training_code="https://github.com/nomic-ai/contrastors/blob/613ddfd37309e538cceadb05b1e6423e7b09f603/src/contrastors/configs/train/contrastive_finetune_moe.yaml",
+    training_datasets=None,  # did not look into this further
+    superseded_by=None,
+    citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
+      title={Training Sparse Mixture Of Experts Text Embedding Models},
+      author={Zach Nussbaum and Brandon Duderstadt},
+      year={2025},
+      eprint={2502.07972},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2502.07972},
+}""",
+)