mteb 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +33 -27
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +7 -26
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +22 -19
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +2 -2
- mteb/cache.py +27 -22
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +15 -10
- mteb/cli/generate_model_card.py +10 -7
- mteb/deprecated_evaluator.py +60 -46
- mteb/evaluate.py +39 -30
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +7 -5
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/gme_v_models.py +4 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +14 -13
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +26 -12
- mteb/models/sentence_transformer_wrapper.py +19 -14
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +28 -20
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +55 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/RECORD +104 -103
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ from mteb.models.sentence_transformer_wrapper import (
|
|
|
4
4
|
)
|
|
5
5
|
|
|
6
6
|
dfm_enc_large = ModelMeta(
|
|
7
|
-
loader=sentence_transformers_loader,
|
|
7
|
+
loader=sentence_transformers_loader,
|
|
8
8
|
name="KennethEnevoldsen/dfm-sentence-encoder-large",
|
|
9
9
|
model_type=["dense"],
|
|
10
10
|
languages=["dan-Latn"],
|
|
@@ -39,7 +39,7 @@ dfm_enc_large = ModelMeta(
|
|
|
39
39
|
)
|
|
40
40
|
|
|
41
41
|
dfm_enc_med = ModelMeta(
|
|
42
|
-
loader=sentence_transformers_loader,
|
|
42
|
+
loader=sentence_transformers_loader,
|
|
43
43
|
name="KennethEnevoldsen/dfm-sentence-encoder-medium",
|
|
44
44
|
model_type=["dense"],
|
|
45
45
|
languages=["dan-Latn"],
|
|
@@ -181,7 +181,7 @@ llm2clip_training_sets = set(
|
|
|
181
181
|
)
|
|
182
182
|
|
|
183
183
|
llm2clip_openai_l_14_336 = ModelMeta(
|
|
184
|
-
loader=llm2clip_loader,
|
|
184
|
+
loader=llm2clip_loader,
|
|
185
185
|
name="microsoft/LLM2CLIP-Openai-L-14-336",
|
|
186
186
|
model_type=["dense"],
|
|
187
187
|
languages=["eng-Latn"],
|
|
@@ -206,7 +206,7 @@ llm2clip_openai_l_14_336 = ModelMeta(
|
|
|
206
206
|
|
|
207
207
|
# NOTE: https://huggingface.co/microsoft/LLM2CLIP-Openai-L-14-224/discussions/1
|
|
208
208
|
llm2clip_openai_l_14_224 = ModelMeta(
|
|
209
|
-
loader=llm2clip_loader,
|
|
209
|
+
loader=llm2clip_loader,
|
|
210
210
|
name="microsoft/LLM2CLIP-Openai-L-14-224",
|
|
211
211
|
model_type=["dense"],
|
|
212
212
|
languages=["eng-Latn"],
|
|
@@ -230,7 +230,7 @@ llm2clip_openai_l_14_224 = ModelMeta(
|
|
|
230
230
|
)
|
|
231
231
|
|
|
232
232
|
llm2clip_openai_b_16 = ModelMeta(
|
|
233
|
-
loader=llm2clip_loader,
|
|
233
|
+
loader=llm2clip_loader,
|
|
234
234
|
name="microsoft/LLM2CLIP-Openai-B-16",
|
|
235
235
|
model_type=["dense"],
|
|
236
236
|
languages=["eng-Latn"],
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import os
|
|
3
3
|
import time
|
|
4
|
+
import warnings
|
|
4
5
|
from typing import Any
|
|
5
6
|
|
|
6
7
|
import numpy as np
|
|
@@ -246,7 +247,9 @@ class HakimModelWrapper(AbsEncoder):
|
|
|
246
247
|
task_prompt, task_id = DATASET_TASKS.get(task_name, (None, None))
|
|
247
248
|
|
|
248
249
|
if not task_prompt:
|
|
249
|
-
|
|
250
|
+
msg = f"Unknown dataset: {task_name}, no preprocessing applied."
|
|
251
|
+
logger.warning(msg)
|
|
252
|
+
warnings.warn(msg)
|
|
250
253
|
return sample
|
|
251
254
|
|
|
252
255
|
task_prompt = f"مسئله : {task_prompt}"
|
|
@@ -117,7 +117,7 @@ mocov3_training_datasets = set(
|
|
|
117
117
|
)
|
|
118
118
|
|
|
119
119
|
mocov3_vit_base = ModelMeta(
|
|
120
|
-
loader=mocov3_loader,
|
|
120
|
+
loader=mocov3_loader,
|
|
121
121
|
name="nyu-visionx/moco-v3-vit-b",
|
|
122
122
|
model_type=["dense"],
|
|
123
123
|
languages=["eng-Latn"],
|
|
@@ -141,7 +141,7 @@ mocov3_vit_base = ModelMeta(
|
|
|
141
141
|
)
|
|
142
142
|
|
|
143
143
|
mocov3_vit_large = ModelMeta(
|
|
144
|
-
loader=mocov3_loader,
|
|
144
|
+
loader=mocov3_loader,
|
|
145
145
|
name="nyu-visionx/moco-v3-vit-l",
|
|
146
146
|
model_type=["dense"],
|
|
147
147
|
languages=["eng-Latn"],
|
|
@@ -139,7 +139,7 @@ class Model2VecModel(AbsEncoder):
|
|
|
139
139
|
**kwargs: Additional arguments to pass to the wrapper.
|
|
140
140
|
"""
|
|
141
141
|
requires_package(self, "model2vec", model_name, "pip install 'mteb[model2vec]'")
|
|
142
|
-
from model2vec import StaticModel
|
|
142
|
+
from model2vec import StaticModel
|
|
143
143
|
|
|
144
144
|
self.model_name = model_name
|
|
145
145
|
self.model = StaticModel.from_pretrained(self.model_name)
|
|
@@ -193,7 +193,7 @@ NOMIC_CITATION = """
|
|
|
193
193
|
"""
|
|
194
194
|
|
|
195
195
|
nomic_embed_v1_5 = ModelMeta(
|
|
196
|
-
loader=NomicWrapper,
|
|
196
|
+
loader=NomicWrapper,
|
|
197
197
|
loader_kwargs=dict(
|
|
198
198
|
trust_remote_code=True,
|
|
199
199
|
model_prompts=model_prompts,
|
|
@@ -222,7 +222,7 @@ nomic_embed_v1_5 = ModelMeta(
|
|
|
222
222
|
)
|
|
223
223
|
|
|
224
224
|
nomic_embed_v1 = ModelMeta(
|
|
225
|
-
loader=NomicWrapper,
|
|
225
|
+
loader=NomicWrapper,
|
|
226
226
|
loader_kwargs=dict(
|
|
227
227
|
trust_remote_code=True,
|
|
228
228
|
model_prompts=model_prompts,
|
|
@@ -251,7 +251,7 @@ nomic_embed_v1 = ModelMeta(
|
|
|
251
251
|
)
|
|
252
252
|
|
|
253
253
|
nomic_embed_v1_ablated = ModelMeta(
|
|
254
|
-
loader=NomicWrapper,
|
|
254
|
+
loader=NomicWrapper,
|
|
255
255
|
loader_kwargs=dict(
|
|
256
256
|
trust_remote_code=True,
|
|
257
257
|
model_prompts=model_prompts,
|
|
@@ -279,7 +279,7 @@ nomic_embed_v1_ablated = ModelMeta(
|
|
|
279
279
|
)
|
|
280
280
|
|
|
281
281
|
nomic_embed_v1_unsupervised = ModelMeta(
|
|
282
|
-
loader=NomicWrapper,
|
|
282
|
+
loader=NomicWrapper,
|
|
283
283
|
loader_kwargs=dict(
|
|
284
284
|
trust_remote_code=True,
|
|
285
285
|
model_prompts=model_prompts,
|
|
@@ -334,7 +334,7 @@ nomic_modern_bert_embed = ModelMeta(
|
|
|
334
334
|
training_datasets=nomic_training_data,
|
|
335
335
|
public_training_data=None,
|
|
336
336
|
citation="""@misc{nussbaum2024nomic,
|
|
337
|
-
title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
|
|
337
|
+
title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
|
|
338
338
|
author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
|
|
339
339
|
year={2024},
|
|
340
340
|
eprint={2402.01613},
|
|
@@ -446,7 +446,7 @@ m_languages = [
|
|
|
446
446
|
]
|
|
447
447
|
|
|
448
448
|
nomic_embed_text_v2_moe = ModelMeta(
|
|
449
|
-
loader=NomicWrapper,
|
|
449
|
+
loader=NomicWrapper,
|
|
450
450
|
loader_kwargs=dict(
|
|
451
451
|
trust_remote_code=True,
|
|
452
452
|
model_prompts=model_prompts,
|
|
@@ -472,12 +472,12 @@ nomic_embed_text_v2_moe = ModelMeta(
|
|
|
472
472
|
training_datasets=None, # did not look into this further
|
|
473
473
|
superseded_by=None,
|
|
474
474
|
citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
|
|
475
|
-
title={Training Sparse Mixture Of Experts Text Embedding Models},
|
|
475
|
+
title={Training Sparse Mixture Of Experts Text Embedding Models},
|
|
476
476
|
author={Zach Nussbaum and Brandon Duderstadt},
|
|
477
477
|
year={2025},
|
|
478
478
|
eprint={2502.07972},
|
|
479
479
|
archivePrefix={arXiv},
|
|
480
480
|
primaryClass={cs.CL},
|
|
481
|
-
url={https://arxiv.org/abs/2502.07972},
|
|
481
|
+
url={https://arxiv.org/abs/2502.07972},
|
|
482
482
|
}""",
|
|
483
483
|
)
|
|
@@ -120,7 +120,7 @@ def openclip_loader(model_name, **kwargs):
|
|
|
120
120
|
|
|
121
121
|
|
|
122
122
|
CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
|
|
123
|
-
loader=openclip_loader,
|
|
123
|
+
loader=openclip_loader,
|
|
124
124
|
name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
|
|
125
125
|
model_type=["dense"],
|
|
126
126
|
languages=["eng-Latn"],
|
|
@@ -146,7 +146,7 @@ CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
|
|
|
146
146
|
)
|
|
147
147
|
|
|
148
148
|
CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
|
|
149
|
-
loader=openclip_loader,
|
|
149
|
+
loader=openclip_loader,
|
|
150
150
|
name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
|
|
151
151
|
model_type=["dense"],
|
|
152
152
|
languages=["eng-Latn"],
|
|
@@ -172,7 +172,7 @@ CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
|
|
|
172
172
|
)
|
|
173
173
|
|
|
174
174
|
CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
|
|
175
|
-
loader=openclip_loader,
|
|
175
|
+
loader=openclip_loader,
|
|
176
176
|
name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
|
|
177
177
|
model_type=["dense"],
|
|
178
178
|
languages=["eng-Latn"],
|
|
@@ -198,7 +198,7 @@ CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
|
|
|
198
198
|
)
|
|
199
199
|
|
|
200
200
|
CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
|
|
201
|
-
loader=openclip_loader,
|
|
201
|
+
loader=openclip_loader,
|
|
202
202
|
name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
|
|
203
203
|
model_type=["dense"],
|
|
204
204
|
languages=["eng-Latn"],
|
|
@@ -224,7 +224,7 @@ CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
|
|
|
224
224
|
)
|
|
225
225
|
|
|
226
226
|
CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
|
|
227
|
-
loader=openclip_loader,
|
|
227
|
+
loader=openclip_loader,
|
|
228
228
|
name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
|
|
229
229
|
model_type=["dense"],
|
|
230
230
|
languages=["eng-Latn"],
|
|
@@ -250,7 +250,7 @@ CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
|
|
|
250
250
|
)
|
|
251
251
|
|
|
252
252
|
CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
|
|
253
|
-
loader=openclip_loader,
|
|
253
|
+
loader=openclip_loader,
|
|
254
254
|
name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
|
|
255
255
|
model_type=["dense"],
|
|
256
256
|
languages=["eng-Latn"],
|
|
@@ -276,7 +276,7 @@ CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
|
|
|
276
276
|
)
|
|
277
277
|
|
|
278
278
|
CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
|
|
279
|
-
loader=openclip_loader,
|
|
279
|
+
loader=openclip_loader,
|
|
280
280
|
name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
|
|
281
281
|
model_type=["dense"],
|
|
282
282
|
languages=["eng-Latn"],
|
|
@@ -68,7 +68,7 @@ _common_mock_metadata = dict(
|
|
|
68
68
|
license="mit",
|
|
69
69
|
max_tokens=np.inf,
|
|
70
70
|
reference=None,
|
|
71
|
-
similarity_fn_name="cosine",
|
|
71
|
+
similarity_fn_name="cosine",
|
|
72
72
|
framework=[],
|
|
73
73
|
use_instructions=False,
|
|
74
74
|
public_training_code=None, # No training code, as this is a random baseline
|
|
@@ -187,7 +187,7 @@ class RandomEncoderBaseline:
|
|
|
187
187
|
|
|
188
188
|
|
|
189
189
|
random_encoder_baseline = ModelMeta(
|
|
190
|
-
loader=RandomEncoderBaseline,
|
|
190
|
+
loader=RandomEncoderBaseline,
|
|
191
191
|
name="baseline/random-encoder-baseline",
|
|
192
192
|
model_type=["dense"],
|
|
193
193
|
modalities=["text", "image"],
|
|
@@ -232,7 +232,7 @@ class RandomCrossEncoderBaseline:
|
|
|
232
232
|
|
|
233
233
|
|
|
234
234
|
random_cross_encoder_baseline = ModelMeta(
|
|
235
|
-
loader=RandomCrossEncoderBaseline,
|
|
235
|
+
loader=RandomCrossEncoderBaseline,
|
|
236
236
|
name="baseline/random-cross-encoder-baseline",
|
|
237
237
|
model_type=["cross-encoder"],
|
|
238
238
|
modalities=["text", "image"],
|
|
@@ -4,7 +4,7 @@ from mteb.models.model_implementations.model2vec_models import Model2VecModel
|
|
|
4
4
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
5
5
|
|
|
6
6
|
potion_base_8m = ModelMeta(
|
|
7
|
-
loader=Model2VecModel,
|
|
7
|
+
loader=Model2VecModel,
|
|
8
8
|
name="rasgaard/m2v-dfm-large",
|
|
9
9
|
model_type=["dense"],
|
|
10
10
|
languages=["dan-Latn"],
|
|
@@ -154,7 +154,7 @@ REPLLAMA_CITATION = """
|
|
|
154
154
|
"""
|
|
155
155
|
|
|
156
156
|
repllama_llama2_original = ModelMeta(
|
|
157
|
-
loader=RepLLaMAModel,
|
|
157
|
+
loader=RepLLaMAModel,
|
|
158
158
|
loader_kwargs=dict(
|
|
159
159
|
base_model_name_or_path="meta-llama/Llama-2-7b-hf",
|
|
160
160
|
device_map="auto",
|
|
@@ -187,7 +187,7 @@ repllama_llama2_original = ModelMeta(
|
|
|
187
187
|
|
|
188
188
|
|
|
189
189
|
repllama_llama2_reproduced = ModelMeta(
|
|
190
|
-
loader=RepLLaMAModel,
|
|
190
|
+
loader=RepLLaMAModel,
|
|
191
191
|
loader_kwargs=dict(
|
|
192
192
|
base_model_name_or_path="meta-llama/Llama-2-7b-hf",
|
|
193
193
|
device_map="auto",
|
|
@@ -214,7 +214,7 @@ class JinaReranker(RerankerWrapper):
|
|
|
214
214
|
|
|
215
215
|
|
|
216
216
|
monobert_large = ModelMeta(
|
|
217
|
-
loader=MonoBERTReranker,
|
|
217
|
+
loader=MonoBERTReranker,
|
|
218
218
|
loader_kwargs=dict(
|
|
219
219
|
fp_options="float16",
|
|
220
220
|
),
|
|
@@ -239,7 +239,7 @@ monobert_large = ModelMeta(
|
|
|
239
239
|
|
|
240
240
|
# languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28
|
|
241
241
|
jina_reranker_multilingual = ModelMeta(
|
|
242
|
-
loader=JinaReranker,
|
|
242
|
+
loader=JinaReranker,
|
|
243
243
|
loader_kwargs=dict(
|
|
244
244
|
fp_options="float16",
|
|
245
245
|
),
|
|
@@ -263,7 +263,7 @@ jina_reranker_multilingual = ModelMeta(
|
|
|
263
263
|
)
|
|
264
264
|
|
|
265
265
|
bge_reranker_v2_m3 = ModelMeta(
|
|
266
|
-
loader=BGEReranker,
|
|
266
|
+
loader=BGEReranker,
|
|
267
267
|
loader_kwargs=dict(
|
|
268
268
|
fp_options="float16",
|
|
269
269
|
),
|
|
@@ -343,7 +343,7 @@ monot5_small = ModelMeta(
|
|
|
343
343
|
)
|
|
344
344
|
|
|
345
345
|
monot5_base = ModelMeta(
|
|
346
|
-
loader=MonoT5Reranker,
|
|
346
|
+
loader=MonoT5Reranker,
|
|
347
347
|
loader_kwargs=dict(
|
|
348
348
|
fp_options="float16",
|
|
349
349
|
),
|
|
@@ -442,7 +442,7 @@ monot5_3b = ModelMeta(
|
|
|
442
442
|
)
|
|
443
443
|
|
|
444
444
|
flant5_base = ModelMeta(
|
|
445
|
-
loader=FLANT5Reranker,
|
|
445
|
+
loader=FLANT5Reranker,
|
|
446
446
|
loader_kwargs=dict(
|
|
447
447
|
fp_options="float16",
|
|
448
448
|
),
|
|
@@ -902,7 +902,7 @@ mt5_base_mmarco_v2 = ModelMeta(
|
|
|
902
902
|
)
|
|
903
903
|
|
|
904
904
|
mt5_13b_mmarco_100k = ModelMeta(
|
|
905
|
-
loader=MonoT5Reranker,
|
|
905
|
+
loader=MonoT5Reranker,
|
|
906
906
|
loader_kwargs=dict(
|
|
907
907
|
fp_options="float16",
|
|
908
908
|
),
|
|
@@ -123,7 +123,7 @@ siglip_training_datasets = set(
|
|
|
123
123
|
)
|
|
124
124
|
|
|
125
125
|
siglip_so400m_patch14_224 = ModelMeta(
|
|
126
|
-
loader=SiglipModelWrapper,
|
|
126
|
+
loader=SiglipModelWrapper,
|
|
127
127
|
name="google/siglip-so400m-patch14-224",
|
|
128
128
|
model_type=["dense"],
|
|
129
129
|
languages=["eng-Latn"],
|
|
@@ -147,7 +147,7 @@ siglip_so400m_patch14_224 = ModelMeta(
|
|
|
147
147
|
)
|
|
148
148
|
|
|
149
149
|
siglip_so400m_patch14_384 = ModelMeta(
|
|
150
|
-
loader=SiglipModelWrapper,
|
|
150
|
+
loader=SiglipModelWrapper,
|
|
151
151
|
name="google/siglip-so400m-patch14-384",
|
|
152
152
|
model_type=["dense"],
|
|
153
153
|
languages=["eng-Latn"],
|
|
@@ -171,7 +171,7 @@ siglip_so400m_patch14_384 = ModelMeta(
|
|
|
171
171
|
)
|
|
172
172
|
|
|
173
173
|
siglip_so400m_patch16_256_i18n = ModelMeta(
|
|
174
|
-
loader=SiglipModelWrapper,
|
|
174
|
+
loader=SiglipModelWrapper,
|
|
175
175
|
name="google/siglip-so400m-patch16-256-i18n",
|
|
176
176
|
model_type=["dense"],
|
|
177
177
|
languages=["eng-Latn"],
|
|
@@ -195,7 +195,7 @@ siglip_so400m_patch16_256_i18n = ModelMeta(
|
|
|
195
195
|
)
|
|
196
196
|
|
|
197
197
|
siglip_base_patch16_256_multilingual = ModelMeta(
|
|
198
|
-
loader=SiglipModelWrapper,
|
|
198
|
+
loader=SiglipModelWrapper,
|
|
199
199
|
name="google/siglip-base-patch16-256-multilingual",
|
|
200
200
|
model_type=["dense"],
|
|
201
201
|
languages=["eng-Latn"],
|
|
@@ -219,7 +219,7 @@ siglip_base_patch16_256_multilingual = ModelMeta(
|
|
|
219
219
|
)
|
|
220
220
|
|
|
221
221
|
siglip_base_patch16_256 = ModelMeta(
|
|
222
|
-
loader=SiglipModelWrapper,
|
|
222
|
+
loader=SiglipModelWrapper,
|
|
223
223
|
name="google/siglip-base-patch16-256",
|
|
224
224
|
model_type=["dense"],
|
|
225
225
|
languages=["eng-Latn"],
|
|
@@ -243,7 +243,7 @@ siglip_base_patch16_256 = ModelMeta(
|
|
|
243
243
|
)
|
|
244
244
|
|
|
245
245
|
siglip_base_patch16_512 = ModelMeta(
|
|
246
|
-
loader=SiglipModelWrapper,
|
|
246
|
+
loader=SiglipModelWrapper,
|
|
247
247
|
name="google/siglip-base-patch16-512",
|
|
248
248
|
model_type=["dense"],
|
|
249
249
|
languages=["eng-Latn"],
|
|
@@ -267,7 +267,7 @@ siglip_base_patch16_512 = ModelMeta(
|
|
|
267
267
|
)
|
|
268
268
|
|
|
269
269
|
siglip_base_patch16_384 = ModelMeta(
|
|
270
|
-
loader=SiglipModelWrapper,
|
|
270
|
+
loader=SiglipModelWrapper,
|
|
271
271
|
name="google/siglip-base-patch16-384",
|
|
272
272
|
model_type=["dense"],
|
|
273
273
|
languages=["eng-Latn"],
|
|
@@ -291,7 +291,7 @@ siglip_base_patch16_384 = ModelMeta(
|
|
|
291
291
|
)
|
|
292
292
|
|
|
293
293
|
siglip_base_patch16_224 = ModelMeta(
|
|
294
|
-
loader=SiglipModelWrapper,
|
|
294
|
+
loader=SiglipModelWrapper,
|
|
295
295
|
name="google/siglip-base-patch16-224",
|
|
296
296
|
model_type=["dense"],
|
|
297
297
|
languages=["eng-Latn"],
|
|
@@ -315,7 +315,7 @@ siglip_base_patch16_224 = ModelMeta(
|
|
|
315
315
|
)
|
|
316
316
|
|
|
317
317
|
siglip_large_patch16_256 = ModelMeta(
|
|
318
|
-
loader=SiglipModelWrapper,
|
|
318
|
+
loader=SiglipModelWrapper,
|
|
319
319
|
name="google/siglip-large-patch16-256",
|
|
320
320
|
model_type=["dense"],
|
|
321
321
|
languages=["eng-Latn"],
|
|
@@ -339,7 +339,7 @@ siglip_large_patch16_256 = ModelMeta(
|
|
|
339
339
|
)
|
|
340
340
|
|
|
341
341
|
siglip_large_patch16_384 = ModelMeta(
|
|
342
|
-
loader=SiglipModelWrapper,
|
|
342
|
+
loader=SiglipModelWrapper,
|
|
343
343
|
name="google/siglip-large-patch16-384",
|
|
344
344
|
model_type=["dense"],
|
|
345
345
|
languages=["eng-Latn"],
|
|
@@ -40,15 +40,15 @@ def _downsample_image(
|
|
|
40
40
|
logging.info(
|
|
41
41
|
f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
|
|
42
42
|
)
|
|
43
|
-
return image.resize(new_size, Image.LANCZOS)
|
|
43
|
+
return image.resize(new_size, Image.LANCZOS)
|
|
44
44
|
if width > height:
|
|
45
45
|
if width > 10000:
|
|
46
46
|
logging.error("Processing extremely wide images.")
|
|
47
|
-
return image.resize((10000, height), Image.LANCZOS)
|
|
47
|
+
return image.resize((10000, height), Image.LANCZOS)
|
|
48
48
|
else:
|
|
49
49
|
if height > 10000:
|
|
50
50
|
logging.error("Processing extremely high images.")
|
|
51
|
-
return image.resize((width, 10000), Image.LANCZOS)
|
|
51
|
+
return image.resize((width, 10000), Image.LANCZOS)
|
|
52
52
|
return image
|
|
53
53
|
|
|
54
54
|
|
|
@@ -202,7 +202,7 @@ def voyage_v_loader(model_name, **kwargs):
|
|
|
202
202
|
|
|
203
203
|
|
|
204
204
|
voyage_v = ModelMeta(
|
|
205
|
-
loader=voyage_v_loader,
|
|
205
|
+
loader=voyage_v_loader,
|
|
206
206
|
name="voyageai/voyage-multimodal-3",
|
|
207
207
|
model_type=["dense"],
|
|
208
208
|
languages=[], # Unknown
|
mteb/models/model_meta.py
CHANGED
|
@@ -81,7 +81,7 @@ def _get_loader_name(
|
|
|
81
81
|
return loader.__name__
|
|
82
82
|
|
|
83
83
|
|
|
84
|
-
_SENTENCE_TRANSFORMER_LIB_NAME = "Sentence Transformers"
|
|
84
|
+
_SENTENCE_TRANSFORMER_LIB_NAME: FRAMEWORKS = "Sentence Transformers"
|
|
85
85
|
|
|
86
86
|
|
|
87
87
|
class ModelMeta(BaseModel):
|
|
@@ -263,10 +263,8 @@ class ModelMeta(BaseModel):
|
|
|
263
263
|
_kwargs = self.loader_kwargs.copy()
|
|
264
264
|
_kwargs.update(kwargs)
|
|
265
265
|
|
|
266
|
-
model:
|
|
267
|
-
|
|
268
|
-
)
|
|
269
|
-
model.mteb_model_meta = self # type: ignore
|
|
266
|
+
model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs)
|
|
267
|
+
model.mteb_model_meta = self # type: ignore[misc]
|
|
270
268
|
return model
|
|
271
269
|
|
|
272
270
|
def model_name_as_path(self) -> str:
|
|
@@ -318,9 +316,8 @@ class ModelMeta(BaseModel):
|
|
|
318
316
|
model_config = None
|
|
319
317
|
logger.warning(f"Can't get configuration for {model_name}. Error: {e}")
|
|
320
318
|
|
|
321
|
-
if (
|
|
322
|
-
card_data.
|
|
323
|
-
or _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
|
|
319
|
+
if card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME or (
|
|
320
|
+
card_data.tags and _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
|
|
324
321
|
):
|
|
325
322
|
frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME)
|
|
326
323
|
else:
|
|
@@ -435,7 +432,7 @@ class ModelMeta(BaseModel):
|
|
|
435
432
|
and config_sbert.get("similarity_fn_name") is not None
|
|
436
433
|
):
|
|
437
434
|
meta.similarity_fn_name = ScoringFunction.from_str(
|
|
438
|
-
config_sbert
|
|
435
|
+
config_sbert["similarity_fn_name"]
|
|
439
436
|
)
|
|
440
437
|
else:
|
|
441
438
|
meta.similarity_fn_name = ScoringFunction.COSINE
|
|
@@ -511,10 +508,12 @@ class ModelMeta(BaseModel):
|
|
|
511
508
|
if adapted_training_datasets is not None:
|
|
512
509
|
training_datasets |= adapted_training_datasets
|
|
513
510
|
except (ValueError, KeyError) as e:
|
|
514
|
-
|
|
511
|
+
msg = f"Could not get source model: {e} in MTEB"
|
|
512
|
+
logger.warning(msg)
|
|
513
|
+
warnings.warn(msg)
|
|
515
514
|
|
|
516
515
|
return_dataset = training_datasets.copy()
|
|
517
|
-
visited = set()
|
|
516
|
+
visited: set[str] = set()
|
|
518
517
|
|
|
519
518
|
for dataset in training_datasets:
|
|
520
519
|
similar_tasks = _collect_similar_tasks(dataset, visited)
|
|
@@ -548,6 +547,8 @@ class ModelMeta(BaseModel):
|
|
|
548
547
|
|
|
549
548
|
@staticmethod
|
|
550
549
|
def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None:
|
|
550
|
+
if not model_name:
|
|
551
|
+
return None
|
|
551
552
|
try:
|
|
552
553
|
safetensors_metadata = get_safetensors_metadata(model_name)
|
|
553
554
|
if len(safetensors_metadata.parameter_count) >= 0:
|
|
@@ -561,7 +562,7 @@ class ModelMeta(BaseModel):
|
|
|
561
562
|
logger.warning(
|
|
562
563
|
f"Can't calculate number of parameters for {model_name}. Got error {e}"
|
|
563
564
|
)
|
|
564
|
-
|
|
565
|
+
return None
|
|
565
566
|
|
|
566
567
|
def calculate_num_parameters_from_hub(self) -> int | None:
|
|
567
568
|
"""Calculates the number of parameters in the model.
|
|
@@ -624,7 +625,7 @@ class ModelMeta(BaseModel):
|
|
|
624
625
|
if "API" in self.framework or self.name is None:
|
|
625
626
|
return None
|
|
626
627
|
|
|
627
|
-
return self._calculate_memory_usage_mb(self.
|
|
628
|
+
return self._calculate_memory_usage_mb(self.name, self.n_parameters)
|
|
628
629
|
|
|
629
630
|
@staticmethod
|
|
630
631
|
def fetch_release_date(model_name: str) -> StrDate | None:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import warnings
|
|
2
3
|
from collections.abc import Callable
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
@@ -108,7 +109,7 @@ class FaissSearchIndex:
|
|
|
108
109
|
ids = ids.tolist()
|
|
109
110
|
|
|
110
111
|
if issubclass(self.index_type, faiss.IndexFlatL2):
|
|
111
|
-
similarities = -np.sqrt(np.maximum(similarities, 0))
|
|
112
|
+
similarities = (-np.sqrt(np.maximum(similarities, 0))).tolist()
|
|
112
113
|
|
|
113
114
|
return similarities, ids
|
|
114
115
|
|
|
@@ -116,8 +117,8 @@ class FaissSearchIndex:
|
|
|
116
117
|
self,
|
|
117
118
|
embeddings: Array,
|
|
118
119
|
top_k: int,
|
|
119
|
-
top_ranked: TopRankedDocumentsType
|
|
120
|
-
query_idx_to_id: dict[int, str]
|
|
120
|
+
top_ranked: TopRankedDocumentsType,
|
|
121
|
+
query_idx_to_id: dict[int, str],
|
|
121
122
|
) -> tuple[list[list[float]], list[list[int]]]:
|
|
122
123
|
doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)}
|
|
123
124
|
scores_all: list[list[float]] = []
|
|
@@ -127,15 +128,17 @@ class FaissSearchIndex:
|
|
|
127
128
|
query_id = query_idx_to_id[query_idx]
|
|
128
129
|
ranked_ids = top_ranked.get(query_id)
|
|
129
130
|
if not ranked_ids:
|
|
130
|
-
|
|
131
|
+
msg = f"No top-ranked documents for query {query_id}"
|
|
132
|
+
logger.warning(msg)
|
|
133
|
+
warnings.warn(msg)
|
|
131
134
|
scores_all.append([])
|
|
132
135
|
idxs_all.append([])
|
|
133
136
|
continue
|
|
134
137
|
|
|
135
138
|
candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids]
|
|
136
|
-
d = self.index.d
|
|
139
|
+
d = self.index.d # type: ignore[union-attr]
|
|
137
140
|
candidate_embs = np.vstack(
|
|
138
|
-
[self.index.reconstruct(idx) for idx in candidate_indices]
|
|
141
|
+
[self.index.reconstruct(idx) for idx in candidate_indices] # type: ignore[union-attr]
|
|
139
142
|
)
|
|
140
143
|
sub_reranking_index = self.index_type(d)
|
|
141
144
|
sub_reranking_index.add(candidate_embs)
|