mteb 2.5.3__py3-none-any.whl → 2.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +27 -21
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +3 -16
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +20 -16
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +4 -2
  32. mteb/benchmarks/benchmarks/benchmarks.py +22 -1
  33. mteb/benchmarks/get_benchmark.py +14 -55
  34. mteb/cache.py +21 -18
  35. mteb/cli/_display_tasks.py +2 -2
  36. mteb/cli/build_cli.py +8 -8
  37. mteb/cli/generate_model_card.py +39 -20
  38. mteb/deprecated_evaluator.py +56 -43
  39. mteb/evaluate.py +35 -29
  40. mteb/filter_tasks.py +25 -26
  41. mteb/get_tasks.py +25 -27
  42. mteb/languages/language_scripts.py +5 -3
  43. mteb/leaderboard/app.py +1 -1
  44. mteb/load_results.py +12 -12
  45. mteb/models/abs_encoder.py +2 -2
  46. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  47. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  48. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
  49. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
  50. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  51. mteb/models/get_model_meta.py +8 -1
  52. mteb/models/instruct_wrapper.py +11 -5
  53. mteb/models/model_implementations/andersborges.py +2 -2
  54. mteb/models/model_implementations/blip_models.py +8 -8
  55. mteb/models/model_implementations/bm25.py +1 -1
  56. mteb/models/model_implementations/clip_models.py +3 -3
  57. mteb/models/model_implementations/cohere_models.py +1 -1
  58. mteb/models/model_implementations/cohere_v.py +2 -2
  59. mteb/models/model_implementations/dino_models.py +23 -23
  60. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  61. mteb/models/model_implementations/jina_clip.py +1 -1
  62. mteb/models/model_implementations/jina_models.py +1 -1
  63. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  64. mteb/models/model_implementations/llm2clip_models.py +3 -3
  65. mteb/models/model_implementations/moco_models.py +2 -2
  66. mteb/models/model_implementations/model2vec_models.py +1 -1
  67. mteb/models/model_implementations/nomic_models.py +8 -8
  68. mteb/models/model_implementations/openclip_models.py +7 -7
  69. mteb/models/model_implementations/random_baseline.py +3 -3
  70. mteb/models/model_implementations/rasgaard_models.py +1 -1
  71. mteb/models/model_implementations/repllama_models.py +2 -2
  72. mteb/models/model_implementations/rerankers_custom.py +3 -3
  73. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  74. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  75. mteb/models/model_implementations/siglip_models.py +10 -10
  76. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  77. mteb/models/model_implementations/voyage_v.py +4 -4
  78. mteb/models/model_meta.py +30 -14
  79. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
  80. mteb/models/search_wrappers.py +22 -10
  81. mteb/models/sentence_transformer_wrapper.py +9 -4
  82. mteb/py.typed +0 -0
  83. mteb/results/benchmark_results.py +25 -19
  84. mteb/results/model_result.py +49 -21
  85. mteb/results/task_result.py +45 -51
  86. mteb/similarity_functions.py +11 -7
  87. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  88. mteb/tasks/classification/est/estonian_valence.py +1 -1
  89. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  90. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  91. mteb/tasks/retrieval/code/code_rag.py +12 -12
  92. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  93. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  94. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  95. mteb/tasks/retrieval/nob/norquad.py +2 -2
  96. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  97. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  98. mteb/types/_result.py +2 -1
  99. mteb/types/statistics.py +9 -3
  100. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/METADATA +1 -1
  101. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/RECORD +105 -104
  102. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/WHEEL +0 -0
  103. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/entry_points.txt +0 -0
  104. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/licenses/LICENSE +0 -0
  105. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/top_level.txt +0 -0
mteb/abstasks/regression.py CHANGED
@@ -87,7 +87,7 @@ class AbsTaskRegression(AbsTaskClassification):
     Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
     """

-    evaluator: type[SklearnModelProtocol] = SklearnEvaluator
+    evaluator: type[SklearnEvaluator] = SklearnEvaluator
     evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1)

     train_split: str = "train"
@@ -113,7 +113,7 @@ class AbsTaskRegression(AbsTaskClassification):
         )["train"]
         return train_split_sampled, []

-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray | list[int],
         y_pred: np.ndarray,
@@ -183,7 +183,7 @@ class AbsTaskRegression(AbsTaskClassification):

         return dataset_dict

-    def _calculate_descriptive_statistics_from_split(
+    def _calculate_descriptive_statistics_from_split(  # type: ignore[override]
         self, split: str, hf_subset: str | None = None, compute_overall: bool = False
     ) -> RegressionDescriptiveStatistics:
         train_text = []
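The two `# type: ignore[override]` comments above (and the similar ones in the files below) silence mypy's override check when a subclass narrows an inherited signature. A minimal, self-contained illustration of the pattern; the class and method names here are invented for the example and are not taken from mteb:

```python
from __future__ import annotations


class BaseTask:
    def _calculate_scores(self, y_test: list[int], y_pred: list[int]) -> dict[str, float]:
        return {"accuracy": sum(t == p for t, p in zip(y_test, y_pred)) / len(y_test)}


class RegressionTask(BaseTask):
    # The override accepts floats rather than ints, so mypy would flag it as an
    # unsafe (Liskov-violating) override; the comment acknowledges that on purpose.
    def _calculate_scores(  # type: ignore[override]
        self, y_test: list[float], y_pred: list[float]
    ) -> dict[str, float]:
        mae = sum(abs(t - p) for t, p in zip(y_test, y_pred)) / len(y_test)
        return {"mae": mae}


print(RegressionTask()._calculate_scores([1.0, 2.0], [1.5, 1.5]))  # {'mae': 0.5}
```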
mteb/abstasks/retrieval.py CHANGED
@@ -1,7 +1,7 @@
 import json
 import logging
 from collections import defaultdict
-from collections.abc import Callable, Sequence
+from collections.abc import Callable, Mapping, Sequence
 from pathlib import Path
 from time import time
 from typing import Any, Literal
@@ -286,7 +286,7 @@ class AbsTaskRetrieval(AbsTask):
         encode_kwargs: dict[str, Any],
         prediction_folder: Path | None = None,
         **kwargs,
-    ) -> dict[HFSubset, ScoresDict]:
+    ) -> Mapping[HFSubset, ScoresDict]:
         """Evaluate the model on the retrieval task.

         Args:
@@ -357,6 +357,8 @@ class AbsTaskRetrieval(AbsTask):
             **kwargs,
         )

+        search_model: SearchProtocol
+
         if isinstance(model, EncoderProtocol) and not isinstance(model, SearchProtocol):
             search_model = SearchEncoderWrapper(model)
         elif isinstance(model, CrossEncoderProtocol):
@@ -578,11 +580,12 @@ class AbsTaskRetrieval(AbsTask):
             if isinstance(data[split][subset_item], Dataset):
                 sections[split] = data[split][subset_item]
             elif converter is not None:
+                subset_data = data[split][subset_item]
+                if subset_data is None:
+                    continue
+
                 sections[split] = Dataset.from_list(
-                    [
-                        converter(idx, item)
-                        for idx, item in data[split][subset_item].items()
-                    ]
+                    [converter(idx, item) for idx, item in subset_data.items()]
                 )
             else:
                 raise ValueError(
@@ -680,7 +683,7 @@ class AbsTaskRetrieval(AbsTask):

        top_k_sorted = defaultdict(list)
        for query_id, values in top_ranked.items():
-            sorted_keys = sorted(values, key=values.get, reverse=True)
+            sorted_keys = sorted(values, key=lambda k: values[k], reverse=True)
            top_k_sorted[query_id] = sorted_keys[: self._top_k]

        self.dataset[subset][split]["top_ranked"] = top_k_sorted
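The `key=lambda k: values[k]` change keeps the same behaviour as `key=values.get` (sort candidate document ids by score, highest first), presumably just to satisfy the type checker, since `Mapping.get` is typed as possibly returning `None`. A standalone sketch of that truncation step; the ids and the `top_k` variable (standing in for `self._top_k`) are made up for the example:

```python
top_ranked = {"q1": {"d1": 0.2, "d2": 0.9, "d3": 0.5}}
top_k = 2

top_k_sorted: dict[str, list[str]] = {}
for query_id, values in top_ranked.items():
    # Sort candidate document ids by their score, descending, and keep only the top_k.
    sorted_keys = sorted(values, key=lambda k: values[k], reverse=True)
    top_k_sorted[query_id] = sorted_keys[:top_k]

print(top_k_sorted)  # {'q1': ['d2', 'd3']}
```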
@@ -688,10 +691,10 @@


 def _process_relevant_docs(
-    collection: dict[str, dict[str, float]],
+    collection: Mapping[str, Mapping[str, int]],
     hf_subset: str,
     split: str,
-) -> dict[str, dict[str, float]]:
+) -> dict[str, dict[str, int]]:
     """Collections can contain overlapping ids in different splits. Prepend split and subset to avoid this

     Returns:
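The `_process_relevant_docs` change only narrows the accepted and returned mapping types; the docstring explains the underlying idea, namely that query/document ids can repeat across splits, so the subset and split are prepended to keep them unique. The actual prefix format is not visible in this hunk, so the `f"{hf_subset}_{split}_{id}"` scheme below is an assumption used purely for illustration:

```python
from collections.abc import Mapping


def prefix_relevant_docs(
    collection: Mapping[str, Mapping[str, int]],
    hf_subset: str,
    split: str,
) -> dict[str, dict[str, int]]:
    # Hypothetical prefixing scheme; the real implementation's format is not shown in the diff.
    return {
        f"{hf_subset}_{split}_{query_id}": {
            f"{hf_subset}_{split}_{doc_id}": score for doc_id, score in docs.items()
        }
        for query_id, docs in collection.items()
    }


print(prefix_relevant_docs({"q1": {"d1": 1}}, "default", "test"))
# {'default_test_q1': {'default_test_d1': 1}}
```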
mteb/abstasks/sts.py CHANGED
@@ -7,7 +7,7 @@ from scipy.stats import pearsonr, spearmanr

 from mteb._evaluators import AnySTSEvaluator
 from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
 from mteb.types import PromptType
 from mteb.types.statistics import (
     ImageStatistics,
@@ -103,7 +103,7 @@ class AbsTaskSTS(AbsTask):

     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         encode_kwargs: dict[str, Any],
         hf_split: str,
@@ -111,6 +111,9 @@ class AbsTaskSTS(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> STSMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         normalized_scores = list(map(self._normalize, data_split["score"]))
         data_split = data_split.select_columns(list(self.column_names))

@@ -142,7 +145,7 @@ class AbsTaskSTS(AbsTask):
     ) -> STSMetrics:
         def compute_corr(x: list[float], y: list[float]) -> tuple[float, float]:
             """Return (pearson, spearman) correlations between x and y."""
-            return pearsonr(x, y)[0], spearmanr(x, y)[0]
+            return float(pearsonr(x, y)[0]), float(spearmanr(x, y)[0])

         cosine_pearson, cosine_spearman = compute_corr(
             normalized_scores, scores["cosine_scores"]
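The `compute_corr` change only wraps the scipy results in `float(...)` so the declared `tuple[float, float]` return type holds; the values are unchanged. For reference, index `[0]` on the `pearsonr`/`spearmanr` results is the correlation statistic and `[1]` would be the p-value. The scores below are invented example data:

```python
from scipy.stats import pearsonr, spearmanr

gold = [0.0, 0.25, 0.5, 1.0]          # normalised gold similarity scores
cosine_scores = [0.1, 0.3, 0.4, 0.9]  # model cosine similarities

# [0] selects the correlation statistic; [1] would be the p-value.
pearson = float(pearsonr(gold, cosine_scores)[0])
spearman = float(spearmanr(gold, cosine_scores)[0])
print(round(pearson, 3), round(spearman, 3))
```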
mteb/abstasks/task_metadata.py CHANGED
@@ -2,9 +2,10 @@ import json
 import logging
 from collections.abc import Sequence
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast

 from huggingface_hub import (
+    CardData,
     DatasetCard,
     DatasetCardData,
     constants,
@@ -150,7 +151,7 @@ _TASK_TYPE = (
     "InstructionReranking",
 ) + MIEB_TASK_TYPE

-TaskType = Literal[_TASK_TYPE]
+TaskType = Literal[_TASK_TYPE]  # type: ignore[valid-type]
 """The type of the task. E.g. includes "Classification", "Retrieval" and "Clustering"."""


@@ -192,8 +193,10 @@ AnnotatorType = Literal[
 """The type of the annotators. Is often important for understanding the quality of a dataset."""


-PromptDict = TypedDict(
-    "PromptDict", {prompt_type.value: str for prompt_type in PromptType}, total=False
+PromptDict = TypedDict(  # type: ignore[misc]
+    "PromptDict",
+    {prompt_type.value: str for prompt_type in PromptType},
+    total=False,
 )
 """A dictionary containing the prompt used for the task.

@@ -365,7 +368,7 @@ class TaskMetadata(BaseModel):
         """Return a dictionary mapping huggingface subsets to languages."""
         if isinstance(self.eval_langs, dict):
             return self.eval_langs
-        return {"default": self.eval_langs}  # type: ignore
+        return {"default": cast(list[str], self.eval_langs)}

     @property
     def intext_citation(self, include_cite: bool = True) -> str:
@@ -413,7 +416,7 @@ class TaskMetadata(BaseModel):
         for subset, subset_value in stats.items():
             if subset == "hf_subset_descriptive_stats":
                 continue
-            n_samples[subset] = subset_value["num_samples"]  # type: ignore
+            n_samples[subset] = subset_value["num_samples"]
         return n_samples

     @property
@@ -446,7 +449,7 @@ class TaskMetadata(BaseModel):
         Raises:
             ValueError: If the prompt type is not recognized.
         """
-        if prompt_type is None:
+        if prompt_type is None or self.category is None:
             return self.modalities
         query_modalities, doc_modalities = self.category.split("2")
         category_to_modality: dict[str, Modalities] = {
@@ -466,7 +469,7 @@ class TaskMetadata(BaseModel):

     def _create_dataset_card_data(
         self,
-        existing_dataset_card_data: DatasetCardData | None = None,
+        existing_dataset_card_data: CardData | None = None,
     ) -> tuple[DatasetCardData, dict[str, Any]]:
         """Create a DatasetCardData object from the task metadata.

@@ -501,12 +504,13 @@ class TaskMetadata(BaseModel):

         tags = ["mteb"] + self.modalities

-        descriptive_stats = self.descriptive_stats
-        if descriptive_stats is not None:
-            for split, split_stat in descriptive_stats.items():
+        descriptive_stats = ""
+        if self.descriptive_stats is not None:
+            descriptive_stats_ = self.descriptive_stats
+            for split, split_stat in descriptive_stats_.items():
                 if len(split_stat.get("hf_subset_descriptive_stats", {})) > 10:
                     split_stat.pop("hf_subset_descriptive_stats", {})
-            descriptive_stats = json.dumps(descriptive_stats, indent=4)
+            descriptive_stats = json.dumps(descriptive_stats_, indent=4)

         dataset_card_data_params = existing_dataset_card_data.to_dict()
         # override the existing values
@@ -694,11 +698,11 @@ class TaskMetadata(BaseModel):

     def _hf_languages(self) -> list[str]:
         languages: list[str] = []
-        if self.is_multilingual:
-            for val in list(self.eval_langs.values()):
+        if self.is_multilingual and isinstance(self.eval_langs, dict):
+            for val in self.eval_langs.values():
                 languages.extend(val)
         else:
-            languages = self.eval_langs
+            languages = cast(list[str], self.eval_langs)
         # value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters),
         # or a special value like "code", "multilingual".
         readme_langs = []
@@ -710,7 +714,7 @@ class TaskMetadata(BaseModel):
                 readme_langs.append(lang_name)
         return sorted(set(readme_langs))

-    def _hf_license(self) -> str:
+    def _hf_license(self) -> str | None:
         dataset_license = self.license
         if dataset_license:
             license_mapping = {
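Earlier in this file, `PromptDict` is built with the functional `TypedDict(...)` form whose keys are taken from the `PromptType` enum at runtime; mypy cannot verify dynamically built keys, hence the added `# type: ignore[misc]`. A minimal reproduction of the pattern with an invented stand-in enum (its member values are assumptions, not mteb's actual `PromptType` members):

```python
from enum import Enum
from typing import TypedDict


class PromptKind(Enum):  # stand-in for mteb's PromptType; member values are invented
    query = "query"
    document = "document"


# mypy only accepts literal string keys in the functional TypedDict form, so building
# the field dict from an enum triggers a `misc` error that must be ignored explicitly.
PromptDict = TypedDict(  # type: ignore[misc]
    "PromptDict",
    {kind.value: str for kind in PromptKind},
    total=False,
)

prompts: PromptDict = {"query": "Represent the question for retrieval:"}
print(prompts)
```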
mteb/abstasks/text/bitext_mining.py CHANGED
@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Any, ClassVar, TypedDict
+from typing import Any, ClassVar, TypedDict, cast

 from datasets import Dataset, DatasetDict
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
@@ -78,6 +78,9 @@ class AbsTaskBitextMining(AbsTask):
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
         """Added load for "parallel" datasets"""
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         if not self.data_loaded:
             self.load_data()

@@ -87,11 +90,16 @@ class AbsTaskBitextMining(AbsTask):
         if subsets_to_run is not None:
             hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

-        scores = {}
+        encoder_model = cast(EncoderProtocol, model)
+
+        if self.dataset is None:
+            raise ValueError("Dataset is not loaded.")
+
+        scores: dict[str, BitextMiningMetrics] = {}
         if self.parallel_subsets:
-            scores = self._evaluate_subset(
-                model,
-                self.dataset[split],  # type: ignore
+            scores = self._evaluate_subset(  # type: ignore[assignment]
+                encoder_model,
+                self.dataset[split],
                 parallel=True,
                 hf_split=split,
                 hf_subset="parallel",
@@ -109,8 +117,8 @@ class AbsTaskBitextMining(AbsTask):
                 data_split = self.dataset[split]
             else:
                 data_split = self.dataset[hf_subset][split]
-            scores[hf_subset] = self._evaluate_subset(
-                model,
+            scores[hf_subset] = self._evaluate_subset(  # type: ignore[assignment]
+                encoder_model,
                 data_split,
                 hf_split=split,
                 hf_subset=hf_subset,
@@ -119,32 +127,32 @@ class AbsTaskBitextMining(AbsTask):
                 **kwargs,
             )

-        return scores
+        return cast(dict[HFSubset, ScoresDict], scores)

     def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]:
         pairs = self._DEFAULT_PAIR
         if parallel:
-            pairs = [langpair.split("-") for langpair in self.hf_subsets]
+            pairs = [langpair.split("-") for langpair in self.hf_subsets]  # type: ignore[misc]
         return pairs

-    def _evaluate_subset(
+    def _evaluate_subset(  # type: ignore[override]
         self,
         model: EncoderProtocol,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        parallel: bool = False,
         encode_kwargs: dict[str, Any],
         prediction_folder: Path | None = None,
+        parallel: bool = False,
         **kwargs,
-    ) -> ScoresDict:
+    ) -> BitextMiningMetrics | dict[str, BitextMiningMetrics]:
         pairs = self._get_pairs(parallel)

         evaluator = BitextMiningEvaluator(
             data_split,
             task_metadata=self.metadata,
-            pair_columns=pairs,  # type: ignore
+            pair_columns=pairs,
             hf_split=hf_split,
             hf_subset=hf_subset,
             **kwargs,
@@ -168,16 +176,16 @@ class AbsTaskBitextMining(AbsTask):
         )

         if parallel:
-            metrics = {}
+            parallel_metrics = {}
             for keys, nearest_neighbors in neighbours.items():
-                metrics[keys] = self._compute_metrics(nearest_neighbors, gold)
+                parallel_metrics[keys] = self._compute_metrics(nearest_neighbors, gold)

-            for v in metrics.values():
+            for v in parallel_metrics.values():
                 self._add_main_score(v)
-        else:
-            def_pair_str = "-".join(self._DEFAULT_PAIR[0])
-            metrics = self._compute_metrics(neighbours[def_pair_str], gold)
-            self._add_main_score(metrics)
+            return parallel_metrics
+        def_pair_str = "-".join(self._DEFAULT_PAIR[0])
+        metrics = self._compute_metrics(neighbours[def_pair_str], gold)
+        self._add_main_score(metrics)
         return metrics

     def _compute_metrics(
@@ -250,8 +258,11 @@ class AbsTaskBitextMining(AbsTask):
         )

     def _push_dataset_to_hub(self, repo_name: str) -> None:
+        if self.dataset is None:
+            raise ValueError("Dataset is not loaded.")
+
         if self.metadata.is_multilingual:
-            dataset = defaultdict(dict)
+            dataset: dict[str, dict[str, list[str]]] = defaultdict(dict)
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")
@@ -266,10 +277,10 @@ class AbsTaskBitextMining(AbsTask):
                 for split in self.dataset[config]:
                     dataset[split][lang_1] = self.dataset[config][split][sent_1]
                     dataset[split][lang_2] = self.dataset[config][split][sent_2]
-            for split in dataset:
-                dataset[split] = Dataset.from_dict(dataset[split])
-            dataset = DatasetDict(dataset)
-            dataset.push_to_hub(repo_name)
+            dataset_dict = DatasetDict(
+                {split: Dataset.from_dict(dataset[split]) for split in dataset}
+            )
+            dataset_dict.push_to_hub(repo_name)
         else:
             sentences = {}
             for split in self.dataset:
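The bitext, STS, summarization, and zero-shot classification hunks all add the same guard: raise `TypeError` unless the passed model implements `EncoderProtocol`, then (where the annotations need it) `cast` to the narrowed type. A self-contained sketch of that guard-then-cast pattern, assuming a runtime-checkable protocol; the `EncoderLike`/`DummyEncoder` names and the `encode` signature are stand-ins, not mteb's actual definitions:

```python
from typing import Protocol, cast, runtime_checkable


@runtime_checkable
class EncoderLike(Protocol):  # stand-in for mteb's EncoderProtocol
    def encode(self, sentences: list[str]) -> list[list[float]]: ...


class DummyEncoder:
    def encode(self, sentences: list[str]) -> list[list[float]]:
        return [[float(len(s))] for s in sentences]


def evaluate(model: object) -> list[list[float]]:
    # Reject anything that does not structurally satisfy the protocol.
    if not isinstance(model, EncoderLike):
        raise TypeError("Expected model to be an instance of EncoderProtocol")
    encoder = cast(EncoderLike, model)  # narrow the type for the checker
    return encoder.encode(["hello", "world"])


print(evaluate(DummyEncoder()))  # [[5.0], [5.0]]
```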
mteb/abstasks/text/reranking.py CHANGED
@@ -16,7 +16,7 @@ else:

 logger = logging.getLogger(__name__)

-OLD_FORMAT_RERANKING_TASKS = []
+OLD_FORMAT_RERANKING_TASKS: list[str] = []


 @deprecated(
@@ -105,7 +105,9 @@ class AbsTaskReranking(AbsTaskRetrieval):
         )

         given_dataset = copy(given_dataset)
-        self.dataset = defaultdict(lambda: defaultdict(dict))
+        self.dataset: dict[str, dict[str, RetrievalSplitData]] = defaultdict(
+            lambda: defaultdict(dict)  # type: ignore[arg-type]
+        )

         hf_subsets = self.hf_subsets

@@ -115,19 +117,19 @@ class AbsTaskReranking(AbsTaskRetrieval):
            if hf_subset in cur_dataset:
                cur_dataset = cur_dataset[hf_subset]
            elif "name" in self.metadata.dataset:
-                cur_dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+                cur_dataset = datasets.load_dataset(**self.metadata.dataset)
                assert hf_subset == "default", (
                    f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata."
                )
            else:
                cur_dataset = datasets.load_dataset(
                    **self.metadata.dataset, name=hf_subset
-                )  # type: ignore
+                )

            for split in cur_dataset:
                corpus = []
                queries = []
-                relevant_docs = defaultdict(dict)
+                relevant_docs: dict[str, dict[str, int]] = defaultdict(dict)
                top_ranked = defaultdict(list)

                # Create an enumerated dataset to pass indices
mteb/abstasks/text/summarization.py CHANGED
@@ -12,7 +12,7 @@ from mteb.abstasks._statistics_calculation import (
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
 from mteb.types.statistics import (
     ScoreStatistics,
     SplitDescriptiveStatistics,
@@ -77,7 +77,7 @@ class AbsTaskSummarization(AbsTask):

     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
@@ -86,8 +86,13 @@ class AbsTaskSummarization(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> SummarizationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         normalized_scores = [
-            (np.array(x) - self.min_score) / (self.max_score - self.min_score)
+            (
+                (np.array(x) - self.min_score) / (self.max_score - self.min_score)
+            ).tolist()
             for x in data_split[self.relevancy_column_name]
         ]
         evaluator = self.evaluator(
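The summarization change wraps the existing min-max normalisation in `(...).tolist()` so each entry becomes a plain Python list rather than a numpy array; the formula `(x - min) / (max - min)` is unchanged. For example, with an assumed 1-5 relevance scale and made-up judgements:

```python
import numpy as np

min_score, max_score = 1, 5
relevance = [[1, 3, 5], [2, 4, 4]]  # example human relevance judgements per summary

# Scale each list of scores into [0, 1] and convert back to plain Python lists.
normalized_scores = [
    ((np.array(x) - min_score) / (max_score - min_score)).tolist() for x in relevance
]
print(normalized_scores)  # [[0.0, 0.5, 1.0], [0.25, 0.75, 0.75]]
```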
mteb/abstasks/zeroshot_classification.py CHANGED
@@ -7,7 +7,7 @@ from datasets import Dataset
 from sklearn import metrics

 from mteb._evaluators import ZeroShotClassificationEvaluator
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -111,7 +111,7 @@ class AbsTaskZeroShotClassification(AbsTask):

     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
@@ -120,6 +120,9 @@ class AbsTaskZeroShotClassification(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> ZeroShotClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         candidate_labels = self.get_candidate_labels()
         data_split = data_split.select_columns(
             [self.input_column_name, self.label_column_name]
mteb/benchmarks/benchmark.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from collections.abc import Iterable, Sequence
+from collections.abc import Iterator, Sequence
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Literal
@@ -19,6 +19,7 @@ class Benchmark:

     Args:
         name: The name of the benchmark
+        aliases: Alternative names for the benchmark
         tasks: The tasks within the benchmark.
         description: A description of the benchmark, should include its intended goal and potentially a description of its construction
         reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
@@ -38,6 +39,7 @@ class Benchmark:

     name: str
     tasks: Sequence[AbsTask]
+    aliases: Sequence[str] = field(default_factory=tuple)
     description: str | None = None
     reference: StrURL | None = None
     citation: str | None = None
@@ -47,7 +49,7 @@ class Benchmark:
     display_name: str | None = None
     language_view: list[str] | Literal["all"] = field(default_factory=list)

-    def __iter__(self) -> Iterable[AbsTask]:
+    def __iter__(self) -> Iterator[AbsTask]:
         return iter(self.tasks)

     def __len__(self) -> int:
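The new `aliases` field, together with the alias lists added to the benchmark definitions below, presumably lets older benchmark names keep resolving; `mteb/benchmarks/get_benchmark.py` also shrinks by roughly 40 lines in this release, consistent with alias data replacing a hard-coded name map. The lookup below is an assumption sketched for illustration, not the actual `get_benchmark` implementation:

```python
from collections.abc import Sequence
from dataclasses import dataclass, field


@dataclass
class Benchmark:  # trimmed stand-in for mteb.benchmarks.Benchmark
    name: str
    aliases: Sequence[str] = field(default_factory=tuple)


BENCHMARKS = [
    Benchmark(name="MTEB(eng, v2)", aliases=["MTEB(eng)"]),
    Benchmark(name="MTEB(eng, v1)", aliases=["MTEB(eng, classic)", "MTEB"]),
]


def resolve(name: str) -> Benchmark:
    # Hypothetical lookup: match the canonical name first, then any alias.
    for benchmark in BENCHMARKS:
        if name == benchmark.name or name in benchmark.aliases:
            return benchmark
    raise KeyError(f"Unknown benchmark: {name}")


print(resolve("MTEB(eng)").name)  # MTEB(eng, v2)
```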
mteb/benchmarks/benchmarks/benchmarks.py CHANGED
@@ -18,6 +18,7 @@ MMTEB_CITATION = r"""@article{enevoldsen2025mmtebmassivemultilingualtext,

 MTEB_EN = Benchmark(
     name="MTEB(eng, v2)",
+    aliases=["MTEB(eng)"],
     display_name="English",
     icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg",
     tasks=MTEBTasks(
@@ -89,6 +90,7 @@ The original MTEB leaderboard is available under the [MTEB(eng, v1)](http://mteb

 MTEB_ENG_CLASSIC = Benchmark(
     name="MTEB(eng, v1)",
+    aliases=["MTEB(eng, classic)", "MTEB"],
     display_name="English Legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/gb.svg",
     tasks=MTEBTasks(
@@ -185,6 +187,7 @@ We recommend that you use [MTEB(eng, v2)](http://mteb-leaderboard.hf.space/?benc

 MTEB_MAIN_RU = Benchmark(
     name="MTEB(rus, v1)",
+    aliases=["MTEB(rus)"],
     display_name="Russian legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg",
     tasks=MTEBTasks(
@@ -344,6 +347,7 @@ RU_SCI_BENCH = Benchmark(

 MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark(
     name="FollowIR",
+    aliases=["MTEB(Retrieval w/Instructions)"],
     display_name="Instruction Following",
     tasks=get_tasks(
         tasks=[
@@ -394,7 +398,9 @@ MTEB_RETRIEVAL_WITH_DOMAIN_INSTRUCTIONS = Benchmark(
 )

 MTEB_RETRIEVAL_LAW = Benchmark(
-    name="MTEB(Law, v1)",  # This benchmark is likely in the need of an update
+    # This benchmark is likely in the need of an update
+    name="MTEB(Law, v1)",
+    aliases=["MTEB(law)"],
     display_name="Legal",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg",
     tasks=get_tasks(
@@ -416,6 +422,7 @@ MTEB_RETRIEVAL_LAW = Benchmark(

 MTEB_RETRIEVAL_MEDICAL = Benchmark(
     name="MTEB(Medical, v1)",
+    aliases=["MTEB(Medical)"],
     display_name="Medical",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg",
     tasks=get_tasks(
@@ -469,6 +476,7 @@ MTEB_MINERS_BITEXT_MINING = Benchmark(

 SEB = Benchmark(
     name="MTEB(Scandinavian, v1)",
+    aliases=["MTEB(Scandinavian)", "SEB"],
     display_name="Scandinavian",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/dk.svg",
     language_view=["dan-Latn", "swe-Latn", "nno-Latn", "nob-Latn"],
@@ -595,6 +603,7 @@ RAR_b = Benchmark(

 MTEB_FRA = Benchmark(
     name="MTEB(fra, v1)",
+    aliases=["MTEB(fra)"],
     display_name="French",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg",
     tasks=MTEBTasks(
@@ -653,6 +662,7 @@ MTEB_FRA = Benchmark(

 MTEB_DEU = Benchmark(
     name="MTEB(deu, v1)",
+    aliases=["MTEB(deu)"],
     display_name="German",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg",
     tasks=get_tasks(
@@ -704,6 +714,7 @@ MTEB_DEU = Benchmark(

 MTEB_KOR = Benchmark(
     name="MTEB(kor, v1)",
+    aliases=["MTEB(kor)"],
     display_name="Korean",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
     tasks=get_tasks(
@@ -728,6 +739,7 @@ MTEB_KOR = Benchmark(

 MTEB_POL = Benchmark(
     name="MTEB(pol, v1)",
+    aliases=["MTEB(pol)"],
     display_name="Polish",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/pl.svg",
     tasks=MTEBTasks(
@@ -777,6 +789,7 @@ two novel clustering tasks.""", # Rephrased from the abstract

 MTEB_code = Benchmark(
     name="MTEB(Code, v1)",
+    aliases=["MTEB(code)"],
     display_name="Code",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg",
     tasks=get_tasks(
@@ -953,6 +966,7 @@ MTEB_multilingual_v1 = Benchmark(

 MTEB_multilingual_v2 = Benchmark(
     name="MTEB(Multilingual, v2)",
+    aliases=["MTEB(Multilingual)", "MMTEB"],
     display_name="Multilingual",
     language_view=[
         "eng-Latn",  # English
@@ -986,6 +1000,7 @@ MTEB_multilingual_v2 = Benchmark(

 MTEB_JPN = Benchmark(
     name="MTEB(jpn, v1)",
+    aliases=["MTEB(jpn)"],
     display_name="Japanese Legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
     tasks=get_tasks(
@@ -1056,6 +1071,7 @@ indic_languages = [

 MTEB_INDIC = Benchmark(
     name="MTEB(Indic, v1)",
+    aliases=["MTEB(Indic)"],
     display_name="Indic",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/in.svg",
     tasks=MTEBTasks(
@@ -1146,6 +1162,7 @@ eu_languages = [

 MTEB_EU = Benchmark(
     name="MTEB(Europe, v1)",
+    aliases=["MTEB(Europe)"],
     display_name="European",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/eu.svg",
     tasks=get_tasks(
@@ -1285,6 +1302,7 @@ BRIGHT = Benchmark(

 BRIGHT_LONG = Benchmark(
     name="BRIGHT (long)",
+    aliases=["BRIGHT(long)"],
     tasks=MTEBTasks(
         (
             get_task(
@@ -1400,6 +1418,7 @@ NANOBEIR = Benchmark(

 C_MTEB = Benchmark(
     name="MTEB(cmn, v1)",
+    aliases=["MTEB(Chinese)", "CMTEB"],
     display_name="Chinese",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/cn.svg",
     tasks=MTEBTasks(
@@ -1466,6 +1485,7 @@ C_MTEB = Benchmark(

 FA_MTEB = Benchmark(
     name="MTEB(fas, v1)",
+    aliases=["FaMTEB(fas, beta)"],
     display_name="Farsi Legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ir.svg",
     tasks=get_tasks(
@@ -2347,6 +2367,7 @@ VIDORE_V3 = VidoreBenchmark(

 VISUAL_DOCUMENT_RETRIEVAL = VidoreBenchmark(
     name="ViDoRe(v1&v2)",
+    aliases=["VisualDocumentRetrieval"],
     display_name="ViDoRe (V1&V2)",
     tasks=get_tasks(
         tasks=[