mteb-2.5.3-py3-none-any.whl → mteb-2.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +27 -21
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +3 -16
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +20 -16
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +2 -2
  32. mteb/cache.py +20 -18
  33. mteb/cli/_display_tasks.py +2 -2
  34. mteb/cli/build_cli.py +5 -5
  35. mteb/cli/generate_model_card.py +6 -4
  36. mteb/deprecated_evaluator.py +56 -43
  37. mteb/evaluate.py +35 -29
  38. mteb/filter_tasks.py +25 -26
  39. mteb/get_tasks.py +25 -27
  40. mteb/languages/language_scripts.py +5 -3
  41. mteb/leaderboard/app.py +1 -1
  42. mteb/load_results.py +12 -12
  43. mteb/models/abs_encoder.py +2 -2
  44. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  45. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  46. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
  47. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
  48. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  49. mteb/models/get_model_meta.py +8 -1
  50. mteb/models/instruct_wrapper.py +11 -5
  51. mteb/models/model_implementations/andersborges.py +2 -2
  52. mteb/models/model_implementations/blip_models.py +8 -8
  53. mteb/models/model_implementations/bm25.py +1 -1
  54. mteb/models/model_implementations/clip_models.py +3 -3
  55. mteb/models/model_implementations/cohere_models.py +1 -1
  56. mteb/models/model_implementations/cohere_v.py +2 -2
  57. mteb/models/model_implementations/dino_models.py +23 -23
  58. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  59. mteb/models/model_implementations/jina_clip.py +1 -1
  60. mteb/models/model_implementations/jina_models.py +1 -1
  61. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  62. mteb/models/model_implementations/llm2clip_models.py +3 -3
  63. mteb/models/model_implementations/moco_models.py +2 -2
  64. mteb/models/model_implementations/model2vec_models.py +1 -1
  65. mteb/models/model_implementations/nomic_models.py +8 -8
  66. mteb/models/model_implementations/openclip_models.py +7 -7
  67. mteb/models/model_implementations/random_baseline.py +3 -3
  68. mteb/models/model_implementations/rasgaard_models.py +1 -1
  69. mteb/models/model_implementations/repllama_models.py +2 -2
  70. mteb/models/model_implementations/rerankers_custom.py +3 -3
  71. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  72. mteb/models/model_implementations/siglip_models.py +10 -10
  73. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  74. mteb/models/model_implementations/voyage_v.py +4 -4
  75. mteb/models/model_meta.py +11 -12
  76. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
  77. mteb/models/search_wrappers.py +22 -10
  78. mteb/models/sentence_transformer_wrapper.py +9 -4
  79. mteb/py.typed +0 -0
  80. mteb/results/benchmark_results.py +25 -19
  81. mteb/results/model_result.py +49 -21
  82. mteb/results/task_result.py +45 -51
  83. mteb/similarity_functions.py +11 -7
  84. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  85. mteb/tasks/classification/est/estonian_valence.py +1 -1
  86. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  87. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  88. mteb/tasks/retrieval/code/code_rag.py +12 -12
  89. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  90. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  91. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  92. mteb/tasks/retrieval/nob/norquad.py +2 -2
  93. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  94. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  95. mteb/types/_result.py +2 -1
  96. mteb/types/statistics.py +9 -3
  97. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
  98. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/RECORD +102 -101
  99. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
  100. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
  101. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
  102. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/models/model_implementations/voyage_v.py CHANGED
@@ -40,15 +40,15 @@ def _downsample_image(
         logging.info(
             f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
         )
-        return image.resize(new_size, Image.LANCZOS)  # type: ignore
+        return image.resize(new_size, Image.LANCZOS)
     if width > height:
         if width > 10000:
             logging.error("Processing extremely wide images.")
-            return image.resize((10000, height), Image.LANCZOS)  # type: ignore
+            return image.resize((10000, height), Image.LANCZOS)
     else:
         if height > 10000:
             logging.error("Processing extremely high images.")
-            return image.resize((width, 10000), Image.LANCZOS)  # type: ignore
+            return image.resize((width, 10000), Image.LANCZOS)
     return image


@@ -202,7 +202,7 @@ def voyage_v_loader(model_name, **kwargs):


 voyage_v = ModelMeta(
-    loader=voyage_v_loader,  # type: ignore
+    loader=voyage_v_loader,
     name="voyageai/voyage-multimodal-3",
     model_type=["dense"],
     languages=[],  # Unknown
mteb/models/model_meta.py CHANGED
@@ -81,7 +81,7 @@ def _get_loader_name(
     return loader.__name__


-_SENTENCE_TRANSFORMER_LIB_NAME = "Sentence Transformers"
+_SENTENCE_TRANSFORMER_LIB_NAME: FRAMEWORKS = "Sentence Transformers"


 class ModelMeta(BaseModel):
@@ -263,10 +263,8 @@ class ModelMeta(BaseModel):
         _kwargs = self.loader_kwargs.copy()
         _kwargs.update(kwargs)

-        model: EncoderProtocol = self.loader(
-            self.name, revision=self.revision, **_kwargs
-        )
-        model.mteb_model_meta = self  # type: ignore
+        model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs)
+        model.mteb_model_meta = self  # type: ignore[misc]
         return model

     def model_name_as_path(self) -> str:
@@ -318,9 +316,8 @@ class ModelMeta(BaseModel):
             model_config = None
             logger.warning(f"Can't get configuration for {model_name}. Error: {e}")

-        if (
-            card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME
-            or _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
+        if card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME or (
+            card_data.tags and _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
         ):
             frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME)
         else:
@@ -435,7 +432,7 @@ class ModelMeta(BaseModel):
             and config_sbert.get("similarity_fn_name") is not None
         ):
             meta.similarity_fn_name = ScoringFunction.from_str(
-                config_sbert.get("similarity_fn_name")
+                config_sbert["similarity_fn_name"]
             )
         else:
             meta.similarity_fn_name = ScoringFunction.COSINE
@@ -516,7 +513,7 @@ class ModelMeta(BaseModel):
             warnings.warn(msg)

         return_dataset = training_datasets.copy()
-        visited = set()
+        visited: set[str] = set()

         for dataset in training_datasets:
             similar_tasks = _collect_similar_tasks(dataset, visited)
@@ -550,6 +547,8 @@ class ModelMeta(BaseModel):

     @staticmethod
     def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None:
+        if not model_name:
+            return None
         try:
             safetensors_metadata = get_safetensors_metadata(model_name)
             if len(safetensors_metadata.parameter_count) >= 0:
@@ -563,7 +562,7 @@ class ModelMeta(BaseModel):
             logger.warning(
                 f"Can't calculate number of parameters for {model_name}. Got error {e}"
             )
-            return None
+        return None

     def calculate_num_parameters_from_hub(self) -> int | None:
         """Calculates the number of parameters in the model.
@@ -626,7 +625,7 @@ class ModelMeta(BaseModel):
         if "API" in self.framework or self.name is None:
             return None

-        return self._calculate_memory_usage_mb(self.model_name, self.n_parameters)
+        return self._calculate_memory_usage_mb(self.name, self.n_parameters)

     @staticmethod
     def fetch_release_date(model_name: str) -> StrDate | None:
mteb/models/search_encoder_index/search_indexes/faiss_search_index.py CHANGED
@@ -109,7 +109,7 @@ class FaissSearchIndex:
         ids = ids.tolist()

         if issubclass(self.index_type, faiss.IndexFlatL2):
-            similarities = -np.sqrt(np.maximum(similarities, 0))
+            similarities = (-np.sqrt(np.maximum(similarities, 0))).tolist()

         return similarities, ids

@@ -117,8 +117,8 @@
         self,
         embeddings: Array,
         top_k: int,
-        top_ranked: TopRankedDocumentsType | None = None,
-        query_idx_to_id: dict[int, str] | None = None,
+        top_ranked: TopRankedDocumentsType,
+        query_idx_to_id: dict[int, str],
     ) -> tuple[list[list[float]], list[list[int]]]:
         doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)}
         scores_all: list[list[float]] = []
@@ -136,9 +136,9 @@
                 continue

             candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids]
-            d = self.index.d
+            d = self.index.d  # type: ignore[union-attr]
             candidate_embs = np.vstack(
-                [self.index.reconstruct(idx) for idx in candidate_indices]
+                [self.index.reconstruct(idx) for idx in candidate_indices]  # type: ignore[union-attr]
             )
             sub_reranking_index = self.index_type(d)
             sub_reranking_index.add(candidate_embs)
mteb/models/search_wrappers.py CHANGED
@@ -200,7 +200,7 @@ class SearchEncoderWrapper:
         # Reset the task corpus dataloader to None to free up memory
         self.task_corpus = None

-        results = {qid: {} for qid in query_idx_to_id.values()}
+        results: RetrievalOutputType = {qid: {} for qid in query_idx_to_id.values()}
         for qid in result_heaps:
             for score, corpus_id in result_heaps[qid]:
                 results[qid][corpus_id] = score
@@ -218,13 +218,19 @@
         encode_kwargs: dict[str, Any],
     ) -> dict[str, list[tuple[float, str]]]:
         logger.info("Encoding Corpus in batches (this might take a while)...")
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")
+
         itr = range(0, len(self.task_corpus), self.corpus_chunk_size)

-        result_heaps = {qid: [] for qid in query_idx_to_id.values()}
+        result_heaps: dict[str, list[tuple[float, str]]] = {
+            qid: [] for qid in query_idx_to_id.values()
+        }
         for batch_num, corpus_start_idx in enumerate(itr):
             logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...")
             corpus_end_idx = min(
-                corpus_start_idx + self.corpus_chunk_size, len(self.task_corpus)
+                corpus_start_idx + self.corpus_chunk_size,
+                len(self.task_corpus),
             )
             sub_corpus = self.task_corpus.select(
                 range(corpus_start_idx, corpus_end_idx)
@@ -249,7 +255,7 @@
             scores = self.model.similarity(query_embeddings, sub_corpus_embeddings)

             # get top-k values
-            cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
+            cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = torch.topk(
                 torch.as_tensor(scores),
                 min(
                     top_k + 1,
@@ -258,8 +264,8 @@
                 dim=1,
                 largest=True,
             )
-            cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
-            cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
+            cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()
+            cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()

             sub_corpus_ids = list(sub_corpus_ids)
             result_heaps = self._sort_full_corpus_results(
@@ -319,7 +325,11 @@
         Returns:
             A dictionary mapping query IDs to a list of tuples, each containing a relevance score and a document ID.
         """
-        result_heaps = {qid: [] for qid in query_idx_to_id.values()}
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")
+        result_heaps: dict[str, list[tuple[float, str]]] = {
+            qid: [] for qid in query_idx_to_id.values()
+        }
         doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)}

         all_doc_embeddings = self.model.encode(
@@ -387,12 +397,12 @@

     def _rerank_sort_results(
         self,
-        result_heaps: list[tuple[float, str]],
+        result_heaps: dict[str, list[tuple[float, str]]],
         query_id: str,
         ranked_ids: list[str],
         scores_top_k_idx: torch.Tensor,
         scores_top_k_values: torch.Tensor,
-    ) -> list[tuple[float, str]]:
+    ) -> dict[str, list[tuple[float, str]]]:
         """Sort the heap into descending order list.

         Returns:
@@ -503,6 +513,8 @@ class SearchCrossEncoderWrapper:
             raise ValueError(
                 "CrossEncoder search requires top_ranked documents for reranking."
             )
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")

         query_id_to_idx = {row["id"]: i for i, row in enumerate(queries)}
         doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)}
@@ -542,7 +554,7 @@
                 hf_subset=hf_subset,
             )

-        results = {qid: {} for qid in queries["id"]}
+        results: RetrievalOutputType = {qid: {} for qid in queries["id"]}
         for (query_id, corpus_id), score in zip(doc_pairs_ids, predictions):
             results[query_id][corpus_id] = float(score)

mteb/models/sentence_transformer_wrapper.py CHANGED
@@ -103,8 +103,11 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
             logger.warning(msg)
             warnings.warn(msg)

+    def similarity(self, embeddings1: Array, embeddings2: Array) -> Array:
+        """Compute the similarity between two collections of embeddings."""
         if hasattr(self.model, "similarity") and callable(self.model.similarity):
-            self.similarity = self.model.similarity
+            return self.model.similarity(embeddings1, embeddings2)
+        return super().similarity(embeddings1, embeddings2)

     def encode(
         self,
@@ -150,7 +153,7 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
         prompt_name = None
         if self.model_prompts is not None:
             prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-            prompt = self.model_prompts.get(prompt_name, None)
+            prompt = self.model_prompts.get(prompt_name, None)  # type: ignore[arg-type]
         if prompt_name:
             prompt_log = f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}"
         else:
@@ -221,7 +224,7 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrap
         prompt_name = None
         if self.model_prompts is not None:
             prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-            prompt = self.model_prompts.get(prompt_name, None)
+            prompt = self.model_prompts.get(prompt_name, None)  # type: ignore[arg-type]
         if prompt_name:
             logger.info(
                 f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}"
@@ -234,7 +237,9 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrap
         all_embeddings = []
         for batch in inputs:
             batch_column = next(iter(batch.keys()))
-            batched_input = [dict() for _ in range(len(batch[batch_column]))]
+            batched_input: list[dict[str, Any]] = [
+                dict() for _ in range(len(batch[batch_column]))
+            ]

             # transform from {"text": [text1, text2], "image": [image1, image2]} to
             # [{"text": text1, "image": image1}, {"text": text2, "image": image2}]
mteb/py.typed ADDED
File without changes
mteb/results/benchmark_results.py CHANGED
@@ -1,10 +1,12 @@
+from __future__ import annotations
+
 import functools
 import json
 import logging
 import warnings
-from collections.abc import Callable, Iterable, Iterator, Sequence
+from collections.abc import Callable, Iterable, Iterator
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast

 import pandas as pd
 from packaging.version import InvalidVersion, Version
@@ -33,11 +35,12 @@ from .model_result import ModelResult, _aggregate_and_pivot
 logger = logging.getLogger(__name__)


-# Global cache for model metas and version parsing
 @functools.lru_cache
 def _get_cached_model_metas() -> dict[str, str | None]:
     """Cache model metas to avoid repeated calls."""
-    return {meta.name: meta.revision for meta in get_model_metas()}
+    return {
+        meta.name: meta.revision for meta in get_model_metas() if meta.name is not None
+    }


 @functools.lru_cache(maxsize=10000)
@@ -77,10 +80,10 @@ class BenchmarkResults(BaseModel):
         task_names: list[str] | None = None,
         languages: list[str] | None = None,
         domains: list[TaskDomain] | None = None,
-        task_types: list[TaskType] | None = None,  # type: ignore
+        task_types: list[TaskType] | None = None,
         modalities: list[Modalities] | None = None,
         is_public: bool | None = None,
-    ) -> Self:
+    ) -> BenchmarkResults:
         # TODO: Same as filter_models
         model_results = [
             res._filter_tasks(
@@ -97,7 +100,7 @@
             model_results=[res for res in model_results if res.task_results]
         )

-    def select_tasks(self, tasks: Sequence[AbsTask]) -> Self:
+    def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults:
         """Select tasks from the benchmark results.

         Args:
@@ -115,7 +118,7 @@
         self,
         names: list[str] | list[ModelMeta],
         revisions: list[str | None] | None = None,
-    ) -> Self:
+    ) -> BenchmarkResults:
         """Get models by name and revision.

         Args:
@@ -128,7 +131,7 @@
         models_res = []
         _revisions = revisions if revisions is not None else [None] * len(names)

-        name_rev = {}
+        name_rev: dict[str, str | None] = {}

         if len(names) != len(_revisions):
             raise ValueError(
@@ -137,9 +140,12 @@

         for name, revision in zip(names, _revisions):
             if isinstance(name, ModelMeta):
+                if name.name is None:
+                    raise ValueError("name in ModelMeta is None. It must be a string.")
                 name_rev[name.name] = name.revision
             else:
-                name_rev[name] = revision
+                name_ = cast(str, name)
+                name_rev[name_] = revision

         for model_res in self.model_results:
             model_name = model_res.model_name
@@ -159,7 +165,7 @@
         n_parameters_range: tuple[int | None, int | None] = (None, None),
         use_instructions: bool | None = None,
         zero_shot_on: list[AbsTask] | None = None,
-    ) -> Self:
+    ) -> BenchmarkResults:
         # mostly a utility function for the leaderboard app.
         # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter.
         # interface would then be the same as the get_models function
@@ -182,7 +188,7 @@

         return type(self).model_construct(model_results=new_model_results)

-    def join_revisions(self) -> Self:
+    def join_revisions(self) -> BenchmarkResults:
         """Join revisions of the same model.

         In case of conflicts, the following rules are applied:
@@ -212,10 +218,10 @@

         # Use cached model metas
         model_to_main_revision = _get_cached_model_metas()
-        task_df["main_revision"] = task_df["model"].map(model_to_main_revision)  # type: ignore
+        task_df["main_revision"] = task_df["model"].map(model_to_main_revision)

         # Use cached version parsing
-        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)  # type: ignore
+        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)

         # Filter out rows without scores first
         task_df = task_df[task_df["has_scores"]]
@@ -259,8 +265,8 @@
         # so grouping by original revision ensures consistent ModelResult creation
         for (model, model_revision), group in task_df.groupby(["model", "revision"]):
             model_result = ModelResult.model_construct(
-                model_name=model,
-                model_revision=model_revision,
+                model_name=model,  # type: ignore[arg-type]
+                model_revision=model_revision,  # type: ignore[arg-type]
                 task_results=list(group["task_result"]),
             )
             model_results.append(model_result)
@@ -291,7 +297,7 @@
                     {
                         "model": model_res.model_name,
                         "revision": model_res.model_revision,
-                        **model_scores,  # type: ignore
+                        **model_scores,
                     }
                 )
             except Exception as e:
@@ -404,7 +410,7 @@

         return self.benchmark._create_summary_table(self)

-    def __iter__(self) -> Iterator[ModelResult]:
+    def __iter__(self) -> Iterator[ModelResult]:  # type: ignore[override]
         return iter(self.model_results)

     def __getitem__(self, index: int) -> ModelResult:
@@ -426,7 +432,7 @@
             out_file.write(self.model_dump_json(indent=2))

     @classmethod
-    def from_validated(cls, **data) -> Self:
+    def from_validated(cls, **data) -> BenchmarkResults:
         """Create BenchmarkResults from validated data.

         Args:
mteb/results/model_result.py CHANGED
@@ -1,12 +1,14 @@
+from __future__ import annotations
+
 import logging
 import warnings
-from collections.abc import Callable, Iterable, Sequence
-from typing import Any, Literal
+from collections.abc import Callable, Iterable
+from typing import Any, Literal, cast

 import numpy as np
 import pandas as pd
 from pydantic import BaseModel, ConfigDict, Field
-from typing_extensions import Self
+from typing_extensions import overload

 from mteb.abstasks.abstask import AbsTask
 from mteb.abstasks.task_metadata import (
@@ -58,7 +60,7 @@ def _aggregate_and_pivot(
             index=index_columns,
             columns=columns,
             values="score",
-            aggfunc=aggregation_fn,
+            aggfunc=aggregation_fn,  # type: ignore[arg-type]
         ).reset_index()
     elif format == "long":
         return (
@@ -81,7 +83,7 @@ class ModelResult(BaseModel):
     model_revision: str | None
     task_results: list[TaskResult]
     default_modalities: list[Modalities] = Field(
-        default_factory=lambda: ["text"], alias="modalities"
+        default_factory=lambda: [cast(Modalities, "text")], alias="modalities"
     )
     model_config = (
         ConfigDict(  # to free up the name model_* which is otherwise protected
@@ -95,16 +97,17 @@ class ModelResult(BaseModel):
         return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))"

     @classmethod
-    def from_validated(cls, **data: dict[str, Any]) -> Self:
+    def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
         """Create a ModelResult from validated data.

         Args:
             data: The validated data.
         """
-        data["task_results"] = [
-            TaskResult.from_validated(**res) for res in data["task_results"]
+        data["task_results"] = [  # type: ignore[assignment]
+            TaskResult.from_validated(**res)  # type: ignore[arg-type]
+            for res in data["task_results"]
         ]
-        return cls.model_construct(**data)
+        return cls.model_construct(**data)  # type: ignore[arg-type]

     def _filter_tasks(
         self,
@@ -114,7 +117,7 @@
         task_types: list[TaskType] | None = None,
         modalities: list[Modalities] | None = None,
         is_public: bool | None = None,
-    ) -> Self:
+    ) -> ModelResult:
         new_task_results = []
         for task_result in self.task_results:
             if (task_names is not None) and (task_result.task_name not in task_names):
@@ -142,7 +145,7 @@
             task_results=new_task_results,
         )

-    def select_tasks(self, tasks: Sequence[AbsTask]) -> Self:
+    def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
         """Select tasks from the ModelResult based on a list of AbsTask objects.

         Args:
@@ -160,6 +163,28 @@
             task_results=new_task_results,
         )

+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["wide"] = "wide",
+    ) -> dict: ...
+
+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["long"] = "long",
+    ) -> list: ...
+
     def _get_scores(
         self,
         splits: list[SplitName] | None = None,
@@ -177,21 +202,24 @@
             aggregation = aggregation if aggregation is not None else np.mean
         else:
             use_fast = True
+        aggregation = cast(Callable[[list[Score]], Any], aggregation)
+        getter = cast(Callable[[ScoresDict], Score], getter)
+
         if format == "wide":
             scores = {}
             for res in self.task_results:
                 try:
                     if use_fast:
                         scores[res.task_name] = res._get_score_fast(
-                            splits=splits,  # type: ignore
-                            languages=languages,  # type: ignore
+                            splits=splits,
+                            languages=languages,
                         )
                     else:
                         scores[res.task_name] = res.get_score(
                             splits=splits,
                             languages=languages,
-                            aggregation=aggregation,  # type: ignore
-                            getter=getter,  # type: ignore
+                            aggregation=aggregation,
+                            getter=getter,
                             scripts=scripts,
                         )
                 except Exception as e:
@@ -206,14 +234,14 @@
                 if use_fast:
                     score = task_res._get_score_fast(
                         splits=splits,
-                        languages=languages,  # type: ignore
+                        languages=languages,
                     )
                 else:
                     score = task_res.get_score(
                         splits=splits,
                         languages=languages,
-                        aggregation=aggregation,  # type: ignore
-                        getter=getter,  # type: ignore
+                        aggregation=aggregation,
+                        getter=getter,
                         scripts=scripts,
                     )
                 entry = dict(
@@ -317,7 +345,7 @@
     def __hash__(self) -> int:
         return id(self)

-    def __iter__(self) -> Iterable[TaskResult]:
+    def __iter__(self) -> Iterable[TaskResult]:  # type: ignore[override]
         return iter(self.task_results)

     def __getitem__(self, index) -> TaskResult:
@@ -370,13 +398,13 @@
         return [task_res.task_name for task_res in self.task_results]

     @property
-    def modalities(self) -> list[str]:
+    def modalities(self) -> list[Modalities]:
         """Get all modalities in the task results.

         Returns:
             A list of modalities in the task results.
         """
-        mods = []
+        mods: list[Modalities] = []
         for task_res in self.task_results:
             task_modalities = getattr(task_res, "modalities", [])
             mods.extend(task_modalities)