mteb 2.5.3__py3-none-any.whl → 2.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +27 -21
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +3 -16
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +20 -16
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +2 -2
  32. mteb/cache.py +20 -18
  33. mteb/cli/_display_tasks.py +2 -2
  34. mteb/cli/build_cli.py +5 -5
  35. mteb/cli/generate_model_card.py +6 -4
  36. mteb/deprecated_evaluator.py +56 -43
  37. mteb/evaluate.py +35 -29
  38. mteb/filter_tasks.py +25 -26
  39. mteb/get_tasks.py +25 -27
  40. mteb/languages/language_scripts.py +5 -3
  41. mteb/leaderboard/app.py +1 -1
  42. mteb/load_results.py +12 -12
  43. mteb/models/abs_encoder.py +2 -2
  44. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  45. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  46. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
  47. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
  48. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  49. mteb/models/get_model_meta.py +8 -1
  50. mteb/models/instruct_wrapper.py +11 -5
  51. mteb/models/model_implementations/andersborges.py +2 -2
  52. mteb/models/model_implementations/blip_models.py +8 -8
  53. mteb/models/model_implementations/bm25.py +1 -1
  54. mteb/models/model_implementations/clip_models.py +3 -3
  55. mteb/models/model_implementations/cohere_models.py +1 -1
  56. mteb/models/model_implementations/cohere_v.py +2 -2
  57. mteb/models/model_implementations/dino_models.py +23 -23
  58. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  59. mteb/models/model_implementations/jina_clip.py +1 -1
  60. mteb/models/model_implementations/jina_models.py +1 -1
  61. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  62. mteb/models/model_implementations/llm2clip_models.py +3 -3
  63. mteb/models/model_implementations/moco_models.py +2 -2
  64. mteb/models/model_implementations/model2vec_models.py +1 -1
  65. mteb/models/model_implementations/nomic_models.py +8 -8
  66. mteb/models/model_implementations/openclip_models.py +7 -7
  67. mteb/models/model_implementations/random_baseline.py +3 -3
  68. mteb/models/model_implementations/rasgaard_models.py +1 -1
  69. mteb/models/model_implementations/repllama_models.py +2 -2
  70. mteb/models/model_implementations/rerankers_custom.py +3 -3
  71. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  72. mteb/models/model_implementations/siglip_models.py +10 -10
  73. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  74. mteb/models/model_implementations/voyage_v.py +4 -4
  75. mteb/models/model_meta.py +11 -12
  76. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
  77. mteb/models/search_wrappers.py +22 -10
  78. mteb/models/sentence_transformer_wrapper.py +9 -4
  79. mteb/py.typed +0 -0
  80. mteb/results/benchmark_results.py +25 -19
  81. mteb/results/model_result.py +49 -21
  82. mteb/results/task_result.py +45 -51
  83. mteb/similarity_functions.py +11 -7
  84. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  85. mteb/tasks/classification/est/estonian_valence.py +1 -1
  86. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  87. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  88. mteb/tasks/retrieval/code/code_rag.py +12 -12
  89. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  90. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  91. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  92. mteb/tasks/retrieval/nob/norquad.py +2 -2
  93. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  94. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  95. mteb/types/_result.py +2 -1
  96. mteb/types/statistics.py +9 -3
  97. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
  98. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/RECORD +102 -101
  99. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
  100. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
  101. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
  102. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/results/task_result.py CHANGED
@@ -3,9 +3,8 @@ from __future__ import annotations
  import json
  import logging
  import warnings
- from argparse import Namespace
  from collections import defaultdict
- from collections.abc import Callable, Iterable
+ from collections.abc import Callable, Iterable, Mapping
  from functools import cached_property
  from importlib.metadata import version
  from pathlib import Path
@@ -17,8 +16,11 @@ from packaging.version import Version
  from pydantic import BaseModel, field_validator
  from typing_extensions import Self

+ from mteb import TaskMetadata
  from mteb._helpful_enum import HelpfulStrEnum
+ from mteb.abstasks import AbsTaskClassification
  from mteb.abstasks.abstask import AbsTask
+ from mteb.abstasks.task_metadata import TaskDomain
  from mteb.languages import LanguageScripts
  from mteb.models.model_meta import ScoringFunction
  from mteb.types import (
@@ -40,67 +42,59 @@ class Criteria(HelpfulStrEnum):
      DATASET_REVISION = "dataset_revision"


- class ScalaNbClassificationDummy:
+ class ScalaNbClassificationDummy(AbsTaskClassification):
      """A dummy task for loading historic results from before v1.11.0"""

-     metadata = Namespace(  # type: ignore
+     metadata = TaskMetadata(
          name="ScalaNbClassification",
+         description="A dummy",
          main_score="accuracy",
          type="Classification",
-         hf_subsets_to_langscripts={
-             "default": ["nob-Latn"],
-         },
-         dataset={"revision": "revision_not_applicable"},
-         revision="revision_not_applicable",
+         eval_langs=["nob-Latn"],
+         dataset={"path": "not/exists", "revision": "revision_not_applicable"},
      )


- class ScalaNnClassificationDummy:
+ class ScalaNnClassificationDummy(AbsTaskClassification):
      """A dummy task for loading historic results from before v1.11.0"""

-     metadata = Namespace(  # type: ignore
+     metadata = TaskMetadata(
          name="ScalaNnClassification",
+         description="A dummy",
          main_score="accuracy",
          type="Classification",
-         hf_subsets_to_langscripts={
-             "default": ["nno-Latn"],
-         },
-         dataset={"revision": "revision_not_applicable"},
-         revision="revision_not_applicable",
+         eval_langs=["nob-Latn"],
+         dataset={"path": "not/exists", "revision": "revision_not_applicable"},
      )


- class ScalaDaClassificationDummy:
+ class ScalaDaClassificationDummy(AbsTaskClassification):
      """A dummy task for loading historic results from before v1.11.0"""

-     metadata = Namespace(  # type: ignore
+     metadata = TaskMetadata(
          name="ScalaDaClassification",
+         description="A dummy",
          main_score="accuracy",
          type="Classification",
-         hf_subsets_to_langscripts={
-             "default": ["dan-Latn"],
-         },
-         dataset={"revision": "revision_not_applicable"},
-         revision="revision_not_applicable",
+         eval_langs=["dan-Latn"],
+         dataset={"path": "not/exists", "revision": "revision_not_applicable"},
      )


- class ScalaSvClassificationDummy:
+ class ScalaSvClassificationDummy(AbsTaskClassification):
      """A dummy task for loading historic results from before v1.11.0"""

-     metadata = Namespace(  # type: ignore
+     metadata = TaskMetadata(
          name="ScalaSvClassification",
+         description="A dummy",
          main_score="accuracy",
          type="Classification",
-         hf_subsets_to_langscripts={
-             "default": ["swe-Latn"],
-         },
-         dataset={"revision": "revision_not_applicable"},
-         revision="revision_not_applicable",
+         eval_langs=["swe-Latn"],
+         dataset={"path": "not/exists", "revision": "revision_not_applicable"},
      )


- outdated_tasks = {
+ outdated_tasks: dict[str, type[AbsTask]] = {
      "ScalaNbClassification": ScalaNbClassificationDummy,
      "ScalaNnClassification": ScalaNnClassificationDummy,
      "ScalaDaClassification": ScalaDaClassificationDummy,
@@ -167,10 +161,10 @@ class TaskResult(BaseModel):
      def from_task_results(
          cls,
          task: AbsTask | type[AbsTask],
-         scores: dict[SplitName, dict[HFSubset, ScoresDict]],
+         scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
          evaluation_time: float,
          kg_co2_emissions: float | None = None,
-     ) -> Self:
+     ) -> TaskResult:
          """Create a TaskResult from the task and scores.

          Args:
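Several signatures in this file also swap the Self return annotation for the concrete TaskResult. A short sketch of the general typing distinction (plain Python, not mteb code): Self promises "an instance of whatever class this was called on", which is only sound when the method constructs cls; naming the base class is the honest annotation when the method always builds the base class.

from typing_extensions import Self

class Base:
    @classmethod
    def build_self(cls) -> Self:
        return cls()  # constructs cls, so subclasses get subclass instances

    @classmethod
    def build_base(cls) -> "Base":
        return Base()  # always a Base, even when called as Sub.build_base()

class Sub(Base):
    pass

a = Sub.build_self()  # type checkers infer Sub
b = Sub.build_base()  # type checkers infer Base
print(type(a).__name__, type(b).__name__)  # Sub Base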
@@ -247,12 +241,12 @@ class TaskResult(BaseModel):
          return get_task(self.task_name)

      @property
-     def domains(self) -> list[str]:
+     def domains(self) -> list[TaskDomain]:
          """Get the domains of the task."""
          doms = self.task.metadata.domains
          if doms is None:
              doms = []
-         return doms  # type: ignore
+         return doms

      @property
      def task_type(self) -> str:
@@ -308,7 +302,7 @@ class TaskResult(BaseModel):
              if isinstance(v, dict):
                  self._round_scores(v, n)
              elif isinstance(v, float):
-                 value[i] = round(v, n)
+                 value[i] = round(v, n)  # type: ignore[call-overload]

          elif isinstance(value, float):
              scores[key] = round(value, n)
@@ -326,7 +320,7 @@ class TaskResult(BaseModel):
              json.dump(json_obj, f, indent=2)

      @classmethod
-     def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self:  # type: ignore
+     def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
          """Load TaskResult from disk.

          Args:
@@ -357,7 +351,7 @@ class TaskResult(BaseModel):
          )  # assume it is before 1.11.0 if the version is not present

          try:
-             obj = cls.model_validate(data)
+             obj: TaskResult = cls.model_validate(data)
          except Exception as e:
              if not pre_1_11_load:
                  raise e
@@ -382,6 +376,7 @@ class TaskResult(BaseModel):
          from mteb import get_task

          task_name = obj.task_name
+         task: AbsTask | type[AbsTask]
          if task_name in outdated_tasks:
              task = outdated_tasks[task_name]
          else:
@@ -394,11 +389,11 @@ class TaskResult(BaseModel):
          for key in list(hf_subset_scores.keys()):
              if isinstance(hf_subset_scores[key], dict):
                  for k, v in hf_subset_scores[key].items():
-                     hf_subset_scores[f"{key}_{k}"] = v
-                 hf_subset_scores.pop(key)
+                     hf_subset_scores[f"{key}_{k}"] = v  # type: ignore[index]
+                 hf_subset_scores.pop(key)  # type: ignore[attr-defined]

      @classmethod
-     def _convert_from_before_v1_11_0(cls, data: dict) -> Self:
+     def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
          from mteb.get_tasks import _TASKS_REGISTRY

          # in case the task name is not found in the registry, try to find a lower case version
@@ -484,7 +479,7 @@ class TaskResult(BaseModel):
              scores["test"]["fra-fra"] = scores["test"].pop("fr")

          result: TaskResult = TaskResult.from_task_results(
-             task,  # type: ignore
+             task,
              scores,
              evaluation_time,
              kg_co2_emissions=None,
@@ -535,7 +530,7 @@ class TaskResult(BaseModel):
      def _get_score_fast(
          self,
          splits: Iterable[str] | None = None,
-         languages: str | None = None,
+         languages: list[ISOLanguage | ISOLanguageScript] | None = None,
          subsets: Iterable[str] | None = None,
      ) -> float:
          """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.
@@ -584,7 +579,7 @@ class TaskResult(BaseModel):
          return val_sum / n_val

      @classmethod
-     def from_validated(cls, **data) -> Self:
+     def from_validated(cls, **data) -> TaskResult:
          """Create a TaskResult from validated data.

          Returns:
@@ -595,13 +590,13 @@ class TaskResult(BaseModel):
      def __repr__(self) -> str:
          return f"TaskResult(task_name={self.task_name}, scores=...)"

-     def only_main_score(self) -> Self:
+     def only_main_score(self) -> TaskResult:
          """Return a new TaskResult object with only the main score.

          Returns:
              A new TaskResult object with only the main score.
          """
-         new_scores = {}
+         new_scores: dict[str, list[Score]] = {}
          for split in self.scores:
              new_scores[split] = []
              for subset_scores in self.scores[split]:
@@ -613,10 +608,9 @@ class TaskResult(BaseModel):
                  }
              )
          new_res = {**self.to_dict(), "scores": new_scores}
-         new_res = TaskResult.from_validated(**new_res)
-         return new_res
+         return TaskResult.from_validated(**new_res)

-     def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self:
+     def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult:
          """Validate and filter the scores against the task metadata.

          This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata.
@@ -638,7 +632,7 @@ class TaskResult(BaseModel):
          splits = task.eval_splits
          hf_subsets = set(task.hf_subsets)  # Convert to set once

-         new_scores = {}
+         new_scores: dict[str, list[Score]] = {}
          seen_splits = set()
          for split in self.scores:
              if split not in splits:
@@ -739,7 +733,7 @@ class TaskResult(BaseModel):
              "mteb_version",
              "dataset_revision",
          ],
-     ) -> Self:
+     ) -> TaskResult:
          """Merges two TaskResult objects.

          Args:
mteb/similarity_functions.py CHANGED
@@ -186,7 +186,7 @@ def max_sim(a: Array, b: Array) -> torch.Tensor:
          b,
      )

-     return scores.max(axis=-1).values.sum(axis=-1)
+     return scores.max(axis=-1).values.sum(axis=-1)  # type: ignore[call-overload]


  # https://github.com/lightonai/pylate/blob/2d094a724866d6e15701781528368438081c0157/pylate/scores/scores.py#L67C1-L122C38
@@ -217,7 +217,7 @@ def pairwise_max_sim(
              document_embedding,
          )

-         scores.append(query_document_score.max(axis=-1).values.sum())
+         scores.append(query_document_score.max(axis=-1).values.sum())  # type: ignore[call-overload]

      return torch.stack(scores, dim=0)
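Both helpers follow the MaxSim ("late interaction") scoring pattern from the pylate link above: for each query token, take the best-matching document token and sum over query tokens. The new type-ignore comments are likely needed because the torch stubs declare Tensor.max with dim= while the code uses the numpy-style axis= alias, which works at runtime. A self-contained sketch of the scoring pattern, with illustrative shapes:

import torch

def max_sim_score(query: torch.Tensor, doc: torch.Tensor) -> torch.Tensor:
    # query: (q_tokens, dim), doc: (d_tokens, dim) -> scalar score
    sims = query @ doc.T            # pairwise token similarities (q_tokens, d_tokens)
    best = sims.max(dim=-1).values  # best-matching document token per query token
    return best.sum()               # sum over query tokens

q, d = torch.randn(4, 8), torch.randn(6, 8)
print(max_sim_score(q, d))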
@@ -317,11 +317,15 @@ def similarity(text_embeddings: Array, input_embeddings: Array) -> Array:
      Returns:
          Matrix with similarities
      """
-     text_embeddings = _convert_to_tensor(text_embeddings)
-     input_embeddings = _convert_to_tensor(input_embeddings)
+     text_embeddings_tensor = _convert_to_tensor(text_embeddings)
+     input_embeddings_tensor = _convert_to_tensor(input_embeddings)

-     text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
-     input_embeddings = input_embeddings / input_embeddings.norm(dim=-1, keepdim=True)
-     logits = torch.matmul(input_embeddings, text_embeddings.T)
+     text_embeddings_tensor = text_embeddings_tensor / text_embeddings_tensor.norm(
+         dim=-1, keepdim=True
+     )
+     input_embeddings_tensor = input_embeddings_tensor / input_embeddings_tensor.norm(
+         dim=-1, keepdim=True
+     )
+     logits = torch.matmul(input_embeddings_tensor, text_embeddings_tensor.T)
      probs = (logits * 100).softmax(dim=-1)
      return probs
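The rewrite above stops rebinding the text_embeddings and input_embeddings parameters after _convert_to_tensor. A sketch of the underlying mypy issue (general typing, not mteb code): reassigning a name to a value of a different type is flagged under strict checking, and introducing fresh names for the converted values avoids the error without a type-ignore.

import numpy as np
import torch

def normalize(embeddings: np.ndarray) -> torch.Tensor:
    # embeddings = torch.from_numpy(embeddings)  # mypy: ndarray rebound to Tensor
    embeddings_tensor = torch.from_numpy(embeddings)  # new name, cleanly typed
    return embeddings_tensor / embeddings_tensor.norm(dim=-1, keepdim=True)

print(normalize(np.ones((2, 3))))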
mteb/tasks/classification/dan/dk_hate_classification.py CHANGED
@@ -62,7 +62,7 @@ Piperidis, Stelios},

      def dataset_transform(self):
          # convert label to a 0/1 label
-         labels = self.dataset["train"]["label"]  # type: ignore
+         labels = self.dataset["train"]["label"]
          lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
          self.dataset = self.dataset.map(
              lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
mteb/tasks/classification/est/estonian_valence.py CHANGED
@@ -45,7 +45,7 @@ class EstonianValenceClassification(AbsTaskClassification):
              "valence", "label"
          )
          # convert label to a numbers
-         labels = self.dataset["train"]["label"]  # type: ignore
+         labels = self.dataset["train"]["label"]
          lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
          self.dataset = self.dataset.map(
              lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
mteb/tasks/classification/multilingual/scala_classification.py CHANGED
@@ -57,7 +57,7 @@ Fishel, Mark},
      def dataset_transform(self):
          for lang in self.dataset.keys():
              # convert label to a 0/1 label
-             labels = self.dataset[lang]["train"]["label"]  # type: ignore
+             labels = self.dataset[lang]["train"]["label"]
              lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
              self.dataset[lang] = self.dataset[lang].map(
                  lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
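The three classification tasks above all drop the same type-ignore on the label column lookup; the transform itself is unchanged. A runnable sketch of that label-encoding pattern on a toy in-memory dataset:

from datasets import Dataset

ds = Dataset.from_dict({"text": ["a", "b", "c"], "label": ["pos", "neg", "pos"]})
lab2idx = {lab: idx for idx, lab in enumerate(set(ds["label"]))}
# Replace the string label column with integer ids.
ds = ds.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
print(ds["label"])  # e.g. [0, 1, 0] (set() ordering is not deterministic)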
mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py CHANGED
@@ -49,7 +49,7 @@ class SugarCrepe(AbsTaskImageTextPairClassification):
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset = datasets.DatasetDict({"test": self.dataset["train"]})
          self.dataset_transform()
          self.data_loaded = True
mteb/tasks/retrieval/code/code_rag.py CHANGED
@@ -48,14 +48,14 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
              "path": "code-rag-bench/programming-solutions",
              "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6",
          },
-         **common_args,  # type: ignore
+         **common_args,
      )

      def load_data(self) -> None:
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset_transform()
          self.data_loaded = True

@@ -71,7 +71,7 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
          self.queries = {}

          split = self.metadata.eval_splits[0]
-         ds: datasets.Dataset = self.dataset[split]  # type: ignore
+         ds: datasets.Dataset = self.dataset[split]
          ds = ds.shuffle(seed=42)

          self.queries[split] = {}
@@ -105,14 +105,14 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
              "path": "code-rag-bench/online-tutorials",
              "revision": "095bb77130082e4690d6c3a031997b03487bf6e2",
          },
-         **common_args,  # type: ignore
+         **common_args,
      )

      def load_data(self) -> None:
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset_transform()
          self.data_loaded = True

@@ -128,7 +128,7 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
          self.queries = {}

          split = self.metadata.eval_splits[0]
-         ds: datasets.Dataset = self.dataset[split]  # type: ignore
+         ds: datasets.Dataset = self.dataset[split]
          ds = ds.shuffle(seed=42)

          self.queries[split] = {}
@@ -165,14 +165,14 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
              "path": "code-rag-bench/library-documentation",
              "revision": "b530d3b5a25087d2074e731b76232db85b9e9107",
          },
-         **common_args,  # type: ignore
+         **common_args,
      )

      def load_data(self) -> None:
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset_transform()
          self.data_loaded = True

@@ -188,7 +188,7 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
          self.queries = {}

          split = self.metadata.eval_splits[0]
-         ds: datasets.Dataset = self.dataset[split]  # type: ignore
+         ds: datasets.Dataset = self.dataset[split]
          ds = ds.shuffle(seed=42)

          self.queries[split] = {}
@@ -222,14 +222,14 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
              "path": "code-rag-bench/stackoverflow-posts",
              "revision": "04e05d86cb0ac467b29a5d87f4c56eac99dfc0a4",
          },
-         **common_args,  # type: ignore
+         **common_args,
      )

      def load_data(self) -> None:
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset_transform()
          self.data_loaded = True

@@ -245,7 +245,7 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
          self.queries = {}

          split = self.metadata.eval_splits[0]
-         ds: datasets.Dataset = self.dataset[split]  # type: ignore
+         ds: datasets.Dataset = self.dataset[split]
          ds = ds.shuffle(seed=42)

          self.queries[split] = {}
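Every CodeRAG task sheds the type-ignore on datasets.load_dataset(**self.metadata.dataset), presumably because metadata.dataset now carries a type the checker understands. A sketch of the idea with a TypedDict standing in for that field (the TypedDict is hypothetical; the path and revision come from the diff, and running this downloads from the Hub):

from typing import TypedDict

import datasets

class DatasetArgs(TypedDict):
    path: str
    revision: str

args: DatasetArgs = {
    "path": "code-rag-bench/programming-solutions",
    "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6",
}
ds = datasets.load_dataset(**args)  # unpacks as load_dataset(path=..., revision=...)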
mteb/tasks/retrieval/dan/dan_fever_retrieval.py CHANGED
@@ -51,7 +51,7 @@ Derczynski, Leon},
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset_transform()
          self.data_loaded = True

mteb/tasks/retrieval/dan/tv2_nordretrieval.py CHANGED
@@ -64,7 +64,7 @@ Piperidis, Stelios},
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset_transform()
          self.data_loaded = True

@@ -81,7 +81,7 @@ Piperidis, Stelios},
          text2id = {}

          for split in self.dataset:
-             ds: datasets.Dataset = self.dataset[split]  # type: ignore
+             ds: datasets.Dataset = self.dataset[split]
              ds = ds.shuffle(seed=42)
              ds = ds.select(
                  range(2048)
mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py CHANGED
@@ -40,7 +40,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset_transform()
          self.data_loaded = True

@@ -57,7 +57,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
          text2id = {}

          for split in self.dataset:
-             ds: datasets.Dataset = self.dataset[split]  # type: ignore
+             ds: datasets.Dataset = self.dataset[split]
              ds = ds.map(answers_to_list)

              self.queries[split] = {}
mteb/tasks/retrieval/nob/norquad.py CHANGED
@@ -54,7 +54,7 @@ Fishel, Mark},
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset_transform()
          self.data_loaded = True

@@ -71,7 +71,7 @@ Fishel, Mark},
          text2id = {}

          for split in self.dataset:
-             ds: datasets.Dataset = self.dataset[split]  # type: ignore
+             ds: datasets.Dataset = self.dataset[split]
              ds = ds.shuffle(seed=42)
              max_samples = min(1024, len(ds))
              ds = ds.select(
mteb/tasks/retrieval/nob/snl_retrieval.py CHANGED
@@ -41,7 +41,7 @@ class SNLRetrieval(AbsTaskRetrieval):
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
-         self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+         self.dataset = datasets.load_dataset(**self.metadata.dataset)
          self.dataset_transform()
          self.data_loaded = True

@@ -58,7 +58,7 @@ class SNLRetrieval(AbsTaskRetrieval):
          text2id = {}

          for split in self.dataset:
-             ds: datasets.Dataset = self.dataset[split]  # type: ignore
+             ds: datasets.Dataset = self.dataset[split]
              ds = ds.shuffle(seed=42)

              self.queries[split] = {}
mteb/tasks/retrieval/tur/tur_hist_quad.py CHANGED
@@ -59,7 +59,7 @@ class TurHistQuadRetrieval(AbsTaskRetrieval):
          text2id = {}

          for split in self.metadata.eval_splits:
-             ds: datasets.Dataset = self.dataset[split]  # type: ignore
+             ds: datasets.Dataset = self.dataset[split]
              ds = ds.shuffle(seed=42)
              max_samples = min(1024, len(ds))
              ds = ds.select(
mteb/types/_result.py CHANGED
@@ -1,3 +1,4 @@
+ from collections.abc import Mapping
  from typing import Any, NamedTuple

  HFSubset = str
@@ -8,7 +9,7 @@ SplitName = str
  Score = Any
  """A score value, could e.g. be accuracy. Normally it is a float or int, but it can take on any value. Should be json serializable."""

- ScoresDict = dict[str, Score]
+ ScoresDict = Mapping[str, Score]
  """A dictionary of scores, typically also include metadata, e.g {'main_score': 0.5, 'accuracy': 0.5, 'f1': 0.6, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']}"""
mteb/types/statistics.py CHANGED
@@ -10,8 +10,14 @@ class SplitDescriptiveStatistics(TypedDict):


  class DescriptiveStatistics(TypedDict, SplitDescriptiveStatistics):
-     """Class for descriptive statistics for the full task."""
+     """Class for descriptive statistics for the full task.

+     Attributes:
+         num_samples: Total number of samples
+         hf_subset_descriptive_stats: HFSubset descriptive statistics (only for multilingual datasets)
+     """
+
+     num_samples: int
      hf_subset_descriptive_stats: NotRequired[dict[HFSubset, SplitDescriptiveStatistics]]


@@ -88,9 +94,9 @@ class ScoreStatistics(TypedDict):
          max_score: Maximum score
      """

-     min_score: int
+     min_score: int | float
      avg_score: float
-     max_score: int
+     max_score: int | float


  class TopRankedStatistics(TypedDict):
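In the hunk above, num_samples becomes an explicit required key while hf_subset_descriptive_stats remains NotRequired. A short sketch of how the two kinds of TypedDict keys behave (Stats and per_subset are hypothetical names):

from typing_extensions import NotRequired, TypedDict

class Stats(TypedDict):
    num_samples: int                         # required on every instance
    per_subset: NotRequired[dict[str, int]]  # may be omitted

ok: Stats = {"num_samples": 10}
full: Stats = {"num_samples": 10, "per_subset": {"en": 5}}
print(ok, full)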
{mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mteb
- Version: 2.5.3
+ Version: 2.5.4
  Summary: Massive Text Embedding Benchmark
  Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
  Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>