mteb 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +33 -27
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +7 -26
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +22 -19
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +2 -2
  32. mteb/cache.py +27 -22
  33. mteb/cli/_display_tasks.py +2 -2
  34. mteb/cli/build_cli.py +15 -10
  35. mteb/cli/generate_model_card.py +10 -7
  36. mteb/deprecated_evaluator.py +60 -46
  37. mteb/evaluate.py +39 -30
  38. mteb/filter_tasks.py +25 -26
  39. mteb/get_tasks.py +29 -30
  40. mteb/languages/language_scripts.py +5 -3
  41. mteb/leaderboard/app.py +1 -1
  42. mteb/load_results.py +12 -12
  43. mteb/models/abs_encoder.py +7 -5
  44. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  45. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  46. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  47. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  48. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  49. mteb/models/get_model_meta.py +8 -1
  50. mteb/models/instruct_wrapper.py +11 -5
  51. mteb/models/model_implementations/andersborges.py +2 -2
  52. mteb/models/model_implementations/blip_models.py +8 -8
  53. mteb/models/model_implementations/bm25.py +1 -1
  54. mteb/models/model_implementations/clip_models.py +3 -3
  55. mteb/models/model_implementations/cohere_models.py +1 -1
  56. mteb/models/model_implementations/cohere_v.py +2 -2
  57. mteb/models/model_implementations/dino_models.py +23 -23
  58. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  59. mteb/models/model_implementations/gme_v_models.py +4 -3
  60. mteb/models/model_implementations/jina_clip.py +1 -1
  61. mteb/models/model_implementations/jina_models.py +1 -1
  62. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  63. mteb/models/model_implementations/llm2clip_models.py +3 -3
  64. mteb/models/model_implementations/mcinext_models.py +4 -1
  65. mteb/models/model_implementations/moco_models.py +2 -2
  66. mteb/models/model_implementations/model2vec_models.py +1 -1
  67. mteb/models/model_implementations/nomic_models.py +8 -8
  68. mteb/models/model_implementations/openclip_models.py +7 -7
  69. mteb/models/model_implementations/random_baseline.py +3 -3
  70. mteb/models/model_implementations/rasgaard_models.py +1 -1
  71. mteb/models/model_implementations/repllama_models.py +2 -2
  72. mteb/models/model_implementations/rerankers_custom.py +3 -3
  73. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  74. mteb/models/model_implementations/siglip_models.py +10 -10
  75. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  76. mteb/models/model_implementations/voyage_v.py +4 -4
  77. mteb/models/model_meta.py +14 -13
  78. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  79. mteb/models/search_wrappers.py +26 -12
  80. mteb/models/sentence_transformer_wrapper.py +19 -14
  81. mteb/py.typed +0 -0
  82. mteb/results/benchmark_results.py +28 -20
  83. mteb/results/model_result.py +52 -22
  84. mteb/results/task_result.py +55 -58
  85. mteb/similarity_functions.py +11 -7
  86. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  87. mteb/tasks/classification/est/estonian_valence.py +1 -1
  88. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  89. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  90. mteb/tasks/retrieval/code/code_rag.py +12 -12
  91. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  92. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  93. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  94. mteb/tasks/retrieval/nob/norquad.py +2 -2
  95. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  96. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  97. mteb/types/_result.py +2 -1
  98. mteb/types/statistics.py +9 -3
  99. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
  100. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/RECORD +104 -103
  101. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
  102. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
  103. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
  104. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/models/model_implementations/kennethenevoldsen_models.py CHANGED
@@ -4,7 +4,7 @@ from mteb.models.sentence_transformer_wrapper import (
  )
 
  dfm_enc_large = ModelMeta(
- loader=sentence_transformers_loader, # type: ignore
+ loader=sentence_transformers_loader,
  name="KennethEnevoldsen/dfm-sentence-encoder-large",
  model_type=["dense"],
  languages=["dan-Latn"],
@@ -39,7 +39,7 @@ dfm_enc_large = ModelMeta(
  )
 
  dfm_enc_med = ModelMeta(
- loader=sentence_transformers_loader, # type: ignore
+ loader=sentence_transformers_loader,
  name="KennethEnevoldsen/dfm-sentence-encoder-medium",
  model_type=["dense"],
  languages=["dan-Latn"],
mteb/models/model_implementations/llm2clip_models.py CHANGED
@@ -181,7 +181,7 @@ llm2clip_training_sets = set(
  )
 
  llm2clip_openai_l_14_336 = ModelMeta(
- loader=llm2clip_loader, # type: ignore
+ loader=llm2clip_loader,
  name="microsoft/LLM2CLIP-Openai-L-14-336",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -206,7 +206,7 @@ llm2clip_openai_l_14_336 = ModelMeta(
 
  # NOTE: https://huggingface.co/microsoft/LLM2CLIP-Openai-L-14-224/discussions/1
  llm2clip_openai_l_14_224 = ModelMeta(
- loader=llm2clip_loader, # type: ignore
+ loader=llm2clip_loader,
  name="microsoft/LLM2CLIP-Openai-L-14-224",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -230,7 +230,7 @@ llm2clip_openai_l_14_224 = ModelMeta(
  )
 
  llm2clip_openai_b_16 = ModelMeta(
- loader=llm2clip_loader, # type: ignore
+ loader=llm2clip_loader,
  name="microsoft/LLM2CLIP-Openai-B-16",
  model_type=["dense"],
  languages=["eng-Latn"],
mteb/models/model_implementations/mcinext_models.py CHANGED
@@ -1,6 +1,7 @@
  import logging
  import os
  import time
+ import warnings
  from typing import Any
 
  import numpy as np
@@ -246,7 +247,9 @@ class HakimModelWrapper(AbsEncoder):
  task_prompt, task_id = DATASET_TASKS.get(task_name, (None, None))
 
  if not task_prompt:
- logger.warning(f"Unknown dataset: {task_name}, no preprocessing applied.")
+ msg = f"Unknown dataset: {task_name}, no preprocessing applied."
+ logger.warning(msg)
+ warnings.warn(msg)
  return sample
 
  task_prompt = f"مسئله : {task_prompt}"
mteb/models/model_implementations/moco_models.py CHANGED
@@ -117,7 +117,7 @@ mocov3_training_datasets = set(
  )
 
  mocov3_vit_base = ModelMeta(
- loader=mocov3_loader, # type: ignore
+ loader=mocov3_loader,
  name="nyu-visionx/moco-v3-vit-b",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -141,7 +141,7 @@ mocov3_vit_base = ModelMeta(
  )
 
  mocov3_vit_large = ModelMeta(
- loader=mocov3_loader, # type: ignore
+ loader=mocov3_loader,
  name="nyu-visionx/moco-v3-vit-l",
  model_type=["dense"],
  languages=["eng-Latn"],
mteb/models/model_implementations/model2vec_models.py CHANGED
@@ -139,7 +139,7 @@ class Model2VecModel(AbsEncoder):
  **kwargs: Additional arguments to pass to the wrapper.
  """
  requires_package(self, "model2vec", model_name, "pip install 'mteb[model2vec]'")
- from model2vec import StaticModel # type: ignore
+ from model2vec import StaticModel
 
  self.model_name = model_name
  self.model = StaticModel.from_pretrained(self.model_name)
mteb/models/model_implementations/nomic_models.py CHANGED
@@ -193,7 +193,7 @@ NOMIC_CITATION = """
  """
 
  nomic_embed_v1_5 = ModelMeta(
- loader=NomicWrapper, # type: ignore
+ loader=NomicWrapper,
  loader_kwargs=dict(
  trust_remote_code=True,
  model_prompts=model_prompts,
@@ -222,7 +222,7 @@ nomic_embed_v1_5 = ModelMeta(
  )
 
  nomic_embed_v1 = ModelMeta(
- loader=NomicWrapper, # type: ignore
+ loader=NomicWrapper,
  loader_kwargs=dict(
  trust_remote_code=True,
  model_prompts=model_prompts,
@@ -251,7 +251,7 @@ nomic_embed_v1 = ModelMeta(
  )
 
  nomic_embed_v1_ablated = ModelMeta(
- loader=NomicWrapper, # type: ignore
+ loader=NomicWrapper,
  loader_kwargs=dict(
  trust_remote_code=True,
  model_prompts=model_prompts,
@@ -279,7 +279,7 @@ nomic_embed_v1_ablated = ModelMeta(
  )
 
  nomic_embed_v1_unsupervised = ModelMeta(
- loader=NomicWrapper, # type: ignore
+ loader=NomicWrapper,
  loader_kwargs=dict(
  trust_remote_code=True,
  model_prompts=model_prompts,
@@ -334,7 +334,7 @@ nomic_modern_bert_embed = ModelMeta(
  training_datasets=nomic_training_data,
  public_training_data=None,
  citation="""@misc{nussbaum2024nomic,
- title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
+ title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
  author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
  year={2024},
  eprint={2402.01613},
@@ -446,7 +446,7 @@ m_languages = [
  ]
 
  nomic_embed_text_v2_moe = ModelMeta(
- loader=NomicWrapper, # type: ignore
+ loader=NomicWrapper,
  loader_kwargs=dict(
  trust_remote_code=True,
  model_prompts=model_prompts,
@@ -472,12 +472,12 @@ nomic_embed_text_v2_moe = ModelMeta(
  training_datasets=None, # did not look into this further
  superseded_by=None,
  citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
- title={Training Sparse Mixture Of Experts Text Embedding Models},
+ title={Training Sparse Mixture Of Experts Text Embedding Models},
  author={Zach Nussbaum and Brandon Duderstadt},
  year={2025},
  eprint={2502.07972},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
- url={https://arxiv.org/abs/2502.07972},
+ url={https://arxiv.org/abs/2502.07972},
  }""",
  )
mteb/models/model_implementations/openclip_models.py CHANGED
@@ -120,7 +120,7 @@ def openclip_loader(model_name, **kwargs):
 
 
  CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
- loader=openclip_loader, # type: ignore
+ loader=openclip_loader,
  name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -146,7 +146,7 @@ CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
  )
 
  CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
- loader=openclip_loader, # type: ignore
+ loader=openclip_loader,
  name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -172,7 +172,7 @@ CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
  )
 
  CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
- loader=openclip_loader, # type: ignore
+ loader=openclip_loader,
  name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -198,7 +198,7 @@ CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
  )
 
  CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
- loader=openclip_loader, # type: ignore
+ loader=openclip_loader,
  name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -224,7 +224,7 @@ CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
  )
 
  CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
- loader=openclip_loader, # type: ignore
+ loader=openclip_loader,
  name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -250,7 +250,7 @@ CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
  )
 
  CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
- loader=openclip_loader, # type: ignore
+ loader=openclip_loader,
  name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -276,7 +276,7 @@ CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
  )
 
  CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
- loader=openclip_loader, # type: ignore
+ loader=openclip_loader,
  name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
  model_type=["dense"],
  languages=["eng-Latn"],
mteb/models/model_implementations/random_baseline.py CHANGED
@@ -68,7 +68,7 @@ _common_mock_metadata = dict(
  license="mit",
  max_tokens=np.inf,
  reference=None,
- similarity_fn_name="cosine", # type: ignore
+ similarity_fn_name="cosine",
  framework=[],
  use_instructions=False,
  public_training_code=None, # No training code, as this is a random baseline
@@ -187,7 +187,7 @@ class RandomEncoderBaseline:
 
 
  random_encoder_baseline = ModelMeta(
- loader=RandomEncoderBaseline, # type: ignore
+ loader=RandomEncoderBaseline,
  name="baseline/random-encoder-baseline",
  model_type=["dense"],
  modalities=["text", "image"],
@@ -232,7 +232,7 @@ class RandomCrossEncoderBaseline:
 
 
  random_cross_encoder_baseline = ModelMeta(
- loader=RandomCrossEncoderBaseline, # type: ignore
+ loader=RandomCrossEncoderBaseline,
  name="baseline/random-cross-encoder-baseline",
  model_type=["cross-encoder"],
  modalities=["text", "image"],
mteb/models/model_implementations/rasgaard_models.py CHANGED
@@ -4,7 +4,7 @@ from mteb.models.model_implementations.model2vec_models import Model2VecModel
  from mteb.models.model_meta import ModelMeta, ScoringFunction
 
  potion_base_8m = ModelMeta(
- loader=Model2VecModel, # type: ignore
+ loader=Model2VecModel,
  name="rasgaard/m2v-dfm-large",
  model_type=["dense"],
  languages=["dan-Latn"],
mteb/models/model_implementations/repllama_models.py CHANGED
@@ -154,7 +154,7 @@ REPLLAMA_CITATION = """
  """
 
  repllama_llama2_original = ModelMeta(
- loader=RepLLaMAModel, # type: ignore
+ loader=RepLLaMAModel,
  loader_kwargs=dict(
  base_model_name_or_path="meta-llama/Llama-2-7b-hf",
  device_map="auto",
@@ -187,7 +187,7 @@ repllama_llama2_original = ModelMeta(
 
 
  repllama_llama2_reproduced = ModelMeta(
- loader=RepLLaMAModel, # type: ignore
+ loader=RepLLaMAModel,
  loader_kwargs=dict(
  base_model_name_or_path="meta-llama/Llama-2-7b-hf",
  device_map="auto",
mteb/models/model_implementations/rerankers_custom.py CHANGED
@@ -214,7 +214,7 @@ class JinaReranker(RerankerWrapper):
 
 
  monobert_large = ModelMeta(
- loader=MonoBERTReranker, # type: ignore
+ loader=MonoBERTReranker,
  loader_kwargs=dict(
  fp_options="float16",
  ),
@@ -239,7 +239,7 @@ monobert_large = ModelMeta(
 
  # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28
  jina_reranker_multilingual = ModelMeta(
- loader=JinaReranker, # type: ignore
+ loader=JinaReranker,
  loader_kwargs=dict(
  fp_options="float16",
  ),
@@ -263,7 +263,7 @@ jina_reranker_multilingual = ModelMeta(
  )
 
  bge_reranker_v2_m3 = ModelMeta(
- loader=BGEReranker, # type: ignore
+ loader=BGEReranker,
  loader_kwargs=dict(
  fp_options="float16",
  ),
mteb/models/model_implementations/rerankers_monot5_based.py CHANGED
@@ -343,7 +343,7 @@ monot5_small = ModelMeta(
  )
 
  monot5_base = ModelMeta(
- loader=MonoT5Reranker, # type: ignore
+ loader=MonoT5Reranker,
  loader_kwargs=dict(
  fp_options="float16",
  ),
@@ -442,7 +442,7 @@ monot5_3b = ModelMeta(
  )
 
  flant5_base = ModelMeta(
- loader=FLANT5Reranker, # type: ignore
+ loader=FLANT5Reranker,
  loader_kwargs=dict(
  fp_options="float16",
  ),
@@ -902,7 +902,7 @@ mt5_base_mmarco_v2 = ModelMeta(
  )
 
  mt5_13b_mmarco_100k = ModelMeta(
- loader=MonoT5Reranker, # type: ignore
+ loader=MonoT5Reranker,
  loader_kwargs=dict(
  fp_options="float16",
  ),
mteb/models/model_implementations/siglip_models.py CHANGED
@@ -123,7 +123,7 @@ siglip_training_datasets = set(
  )
 
  siglip_so400m_patch14_224 = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-so400m-patch14-224",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -147,7 +147,7 @@ siglip_so400m_patch14_224 = ModelMeta(
  )
 
  siglip_so400m_patch14_384 = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-so400m-patch14-384",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -171,7 +171,7 @@ siglip_so400m_patch14_384 = ModelMeta(
  )
 
  siglip_so400m_patch16_256_i18n = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-so400m-patch16-256-i18n",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -195,7 +195,7 @@ siglip_so400m_patch16_256_i18n = ModelMeta(
  )
 
  siglip_base_patch16_256_multilingual = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-base-patch16-256-multilingual",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -219,7 +219,7 @@ siglip_base_patch16_256_multilingual = ModelMeta(
  )
 
  siglip_base_patch16_256 = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-base-patch16-256",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -243,7 +243,7 @@ siglip_base_patch16_256 = ModelMeta(
  )
 
  siglip_base_patch16_512 = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-base-patch16-512",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -267,7 +267,7 @@ siglip_base_patch16_512 = ModelMeta(
  )
 
  siglip_base_patch16_384 = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-base-patch16-384",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -291,7 +291,7 @@ siglip_base_patch16_384 = ModelMeta(
  )
 
  siglip_base_patch16_224 = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-base-patch16-224",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -315,7 +315,7 @@ siglip_base_patch16_224 = ModelMeta(
  )
 
  siglip_large_patch16_256 = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-large-patch16-256",
  model_type=["dense"],
  languages=["eng-Latn"],
@@ -339,7 +339,7 @@ siglip_large_patch16_256 = ModelMeta(
  )
 
  siglip_large_patch16_384 = ModelMeta(
- loader=SiglipModelWrapper, # type: ignore
+ loader=SiglipModelWrapper,
  name="google/siglip-large-patch16-384",
  model_type=["dense"],
  languages=["eng-Latn"],
mteb/models/model_implementations/vlm2vec_models.py CHANGED
@@ -41,7 +41,7 @@ class VLM2VecWrapper(AbsEncoder):
  model_name,
  "pip install flash-attn --no-build-isolation",
  ):
- import flash_attn # noqa
+ pass
 
  requires_package(self, "peft", model_name, "pip install 'mteb[peft]'")
  from peft import LoraConfig, PeftModel
mteb/models/model_implementations/voyage_v.py CHANGED
@@ -40,15 +40,15 @@ def _downsample_image(
  logging.info(
  f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
  )
- return image.resize(new_size, Image.LANCZOS) # type: ignore
+ return image.resize(new_size, Image.LANCZOS)
  if width > height:
  if width > 10000:
  logging.error("Processing extremely wide images.")
- return image.resize((10000, height), Image.LANCZOS) # type: ignore
+ return image.resize((10000, height), Image.LANCZOS)
  else:
  if height > 10000:
  logging.error("Processing extremely high images.")
- return image.resize((width, 10000), Image.LANCZOS) # type: ignore
+ return image.resize((width, 10000), Image.LANCZOS)
  return image
 
 
@@ -202,7 +202,7 @@ def voyage_v_loader(model_name, **kwargs):
 
 
  voyage_v = ModelMeta(
- loader=voyage_v_loader, # type: ignore
+ loader=voyage_v_loader,
  name="voyageai/voyage-multimodal-3",
  model_type=["dense"],
  languages=[], # Unknown
mteb/models/model_meta.py CHANGED
@@ -81,7 +81,7 @@ def _get_loader_name(
  return loader.__name__
 
 
- _SENTENCE_TRANSFORMER_LIB_NAME = "Sentence Transformers"
+ _SENTENCE_TRANSFORMER_LIB_NAME: FRAMEWORKS = "Sentence Transformers"
 
 
  class ModelMeta(BaseModel):
@@ -263,10 +263,8 @@ class ModelMeta(BaseModel):
  _kwargs = self.loader_kwargs.copy()
  _kwargs.update(kwargs)
 
- model: EncoderProtocol = self.loader(
- self.name, revision=self.revision, **_kwargs
- )
- model.mteb_model_meta = self # type: ignore
+ model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs)
+ model.mteb_model_meta = self # type: ignore[misc]
  return model
 
  def model_name_as_path(self) -> str:
@@ -318,9 +316,8 @@ class ModelMeta(BaseModel):
  model_config = None
  logger.warning(f"Can't get configuration for {model_name}. Error: {e}")
 
- if (
- card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME
- or _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
+ if card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME or (
+ card_data.tags and _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
  ):
  frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME)
  else:
@@ -435,7 +432,7 @@ class ModelMeta(BaseModel):
  and config_sbert.get("similarity_fn_name") is not None
  ):
  meta.similarity_fn_name = ScoringFunction.from_str(
- config_sbert.get("similarity_fn_name")
+ config_sbert["similarity_fn_name"]
  )
  else:
  meta.similarity_fn_name = ScoringFunction.COSINE
@@ -511,10 +508,12 @@ class ModelMeta(BaseModel):
  if adapted_training_datasets is not None:
  training_datasets |= adapted_training_datasets
  except (ValueError, KeyError) as e:
- logger.warning(f"Could not get source model: {e} in MTEB")
+ msg = f"Could not get source model: {e} in MTEB"
+ logger.warning(msg)
+ warnings.warn(msg)
 
  return_dataset = training_datasets.copy()
- visited = set()
+ visited: set[str] = set()
 
  for dataset in training_datasets:
  similar_tasks = _collect_similar_tasks(dataset, visited)
@@ -548,6 +547,8 @@ class ModelMeta(BaseModel):
 
  @staticmethod
  def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None:
+ if not model_name:
+ return None
  try:
  safetensors_metadata = get_safetensors_metadata(model_name)
  if len(safetensors_metadata.parameter_count) >= 0:
@@ -561,7 +562,7 @@ class ModelMeta(BaseModel):
  logger.warning(
  f"Can't calculate number of parameters for {model_name}. Got error {e}"
  )
- return None
+ return None
 
  def calculate_num_parameters_from_hub(self) -> int | None:
  """Calculates the number of parameters in the model.
@@ -624,7 +625,7 @@ class ModelMeta(BaseModel):
  if "API" in self.framework or self.name is None:
  return None
 
- return self._calculate_memory_usage_mb(self.model_name, self.n_parameters)
+ return self._calculate_memory_usage_mb(self.name, self.n_parameters)
 
  @staticmethod
  def fetch_release_date(model_name: str) -> StrDate | None:
mteb/models/search_encoder_index/search_indexes/faiss_search_index.py CHANGED
@@ -1,4 +1,5 @@
  import logging
+ import warnings
  from collections.abc import Callable
 
  import numpy as np
@@ -108,7 +109,7 @@ class FaissSearchIndex:
  ids = ids.tolist()
 
  if issubclass(self.index_type, faiss.IndexFlatL2):
- similarities = -np.sqrt(np.maximum(similarities, 0))
+ similarities = (-np.sqrt(np.maximum(similarities, 0))).tolist()
 
  return similarities, ids
 
@@ -116,8 +117,8 @@ class FaissSearchIndex:
  self,
  embeddings: Array,
  top_k: int,
- top_ranked: TopRankedDocumentsType | None = None,
- query_idx_to_id: dict[int, str] | None = None,
+ top_ranked: TopRankedDocumentsType,
+ query_idx_to_id: dict[int, str],
  ) -> tuple[list[list[float]], list[list[int]]]:
  doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)}
  scores_all: list[list[float]] = []
@@ -127,15 +128,17 @@ class FaissSearchIndex:
  query_id = query_idx_to_id[query_idx]
  ranked_ids = top_ranked.get(query_id)
  if not ranked_ids:
- logger.warning(f"No top-ranked documents for query {query_id}")
+ msg = f"No top-ranked documents for query {query_id}"
+ logger.warning(msg)
+ warnings.warn(msg)
  scores_all.append([])
  idxs_all.append([])
  continue
 
  candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids]
- d = self.index.d
+ d = self.index.d # type: ignore[union-attr]
  candidate_embs = np.vstack(
- [self.index.reconstruct(idx) for idx in candidate_indices]
+ [self.index.reconstruct(idx) for idx in candidate_indices] # type: ignore[union-attr]
  )
  sub_reranking_index = self.index_type(d)
  sub_reranking_index.add(candidate_embs)