mteb 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +33 -27
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +7 -26
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +22 -19
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +2 -2
  32. mteb/cache.py +27 -22
  33. mteb/cli/_display_tasks.py +2 -2
  34. mteb/cli/build_cli.py +15 -10
  35. mteb/cli/generate_model_card.py +10 -7
  36. mteb/deprecated_evaluator.py +60 -46
  37. mteb/evaluate.py +39 -30
  38. mteb/filter_tasks.py +25 -26
  39. mteb/get_tasks.py +29 -30
  40. mteb/languages/language_scripts.py +5 -3
  41. mteb/leaderboard/app.py +1 -1
  42. mteb/load_results.py +12 -12
  43. mteb/models/abs_encoder.py +7 -5
  44. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  45. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  46. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  47. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  48. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  49. mteb/models/get_model_meta.py +8 -1
  50. mteb/models/instruct_wrapper.py +11 -5
  51. mteb/models/model_implementations/andersborges.py +2 -2
  52. mteb/models/model_implementations/blip_models.py +8 -8
  53. mteb/models/model_implementations/bm25.py +1 -1
  54. mteb/models/model_implementations/clip_models.py +3 -3
  55. mteb/models/model_implementations/cohere_models.py +1 -1
  56. mteb/models/model_implementations/cohere_v.py +2 -2
  57. mteb/models/model_implementations/dino_models.py +23 -23
  58. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  59. mteb/models/model_implementations/gme_v_models.py +4 -3
  60. mteb/models/model_implementations/jina_clip.py +1 -1
  61. mteb/models/model_implementations/jina_models.py +1 -1
  62. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  63. mteb/models/model_implementations/llm2clip_models.py +3 -3
  64. mteb/models/model_implementations/mcinext_models.py +4 -1
  65. mteb/models/model_implementations/moco_models.py +2 -2
  66. mteb/models/model_implementations/model2vec_models.py +1 -1
  67. mteb/models/model_implementations/nomic_models.py +8 -8
  68. mteb/models/model_implementations/openclip_models.py +7 -7
  69. mteb/models/model_implementations/random_baseline.py +3 -3
  70. mteb/models/model_implementations/rasgaard_models.py +1 -1
  71. mteb/models/model_implementations/repllama_models.py +2 -2
  72. mteb/models/model_implementations/rerankers_custom.py +3 -3
  73. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  74. mteb/models/model_implementations/siglip_models.py +10 -10
  75. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  76. mteb/models/model_implementations/voyage_v.py +4 -4
  77. mteb/models/model_meta.py +14 -13
  78. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  79. mteb/models/search_wrappers.py +26 -12
  80. mteb/models/sentence_transformer_wrapper.py +19 -14
  81. mteb/py.typed +0 -0
  82. mteb/results/benchmark_results.py +28 -20
  83. mteb/results/model_result.py +52 -22
  84. mteb/results/task_result.py +55 -58
  85. mteb/similarity_functions.py +11 -7
  86. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  87. mteb/tasks/classification/est/estonian_valence.py +1 -1
  88. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  89. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  90. mteb/tasks/retrieval/code/code_rag.py +12 -12
  91. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  92. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  93. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  94. mteb/tasks/retrieval/nob/norquad.py +2 -2
  95. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  96. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  97. mteb/types/_result.py +2 -1
  98. mteb/types/statistics.py +9 -3
  99. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
  100. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/RECORD +104 -103
  101. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
  102. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
  103. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
  104. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/results/task_result.py CHANGED
@@ -2,9 +2,9 @@ from __future__ import annotations
 
 import json
 import logging
-from argparse import Namespace
+import warnings
 from collections import defaultdict
-from collections.abc import Callable, Iterable
+from collections.abc import Callable, Iterable, Mapping
 from functools import cached_property
 from importlib.metadata import version
 from pathlib import Path
@@ -16,8 +16,11 @@ from packaging.version import Version
 from pydantic import BaseModel, field_validator
 from typing_extensions import Self
 
+from mteb import TaskMetadata
 from mteb._helpful_enum import HelpfulStrEnum
+from mteb.abstasks import AbsTaskClassification
 from mteb.abstasks.abstask import AbsTask
+from mteb.abstasks.task_metadata import TaskDomain
 from mteb.languages import LanguageScripts
 from mteb.models.model_meta import ScoringFunction
 from mteb.types import (
@@ -39,67 +42,59 @@ class Criteria(HelpfulStrEnum):
     DATASET_REVISION = "dataset_revision"
 
 
-class ScalaNbClassificationDummy:
+class ScalaNbClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""
 
-    metadata = Namespace(  # type: ignore
+    metadata = TaskMetadata(
         name="ScalaNbClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-        hf_subsets_to_langscripts={
-            "default": ["nob-Latn"],
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["nob-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )
 
 
-class ScalaNnClassificationDummy:
+class ScalaNnClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""
 
-    metadata = Namespace(  # type: ignore
+    metadata = TaskMetadata(
         name="ScalaNnClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-        hf_subsets_to_langscripts={
-            "default": ["nno-Latn"],
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["nob-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )
 
 
-class ScalaDaClassificationDummy:
+class ScalaDaClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""
 
-    metadata = Namespace(  # type: ignore
+    metadata = TaskMetadata(
         name="ScalaDaClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-        hf_subsets_to_langscripts={
-            "default": ["dan-Latn"],
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["dan-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
    )
 
 
-class ScalaSvClassificationDummy:
+class ScalaSvClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""
 
-    metadata = Namespace(  # type: ignore
+    metadata = TaskMetadata(
         name="ScalaSvClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-        hf_subsets_to_langscripts={
-            "default": ["swe-Latn"],
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["swe-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )
 
 
-outdated_tasks = {
+outdated_tasks: dict[str, type[AbsTask]] = {
     "ScalaNbClassification": ScalaNbClassificationDummy,
     "ScalaNnClassification": ScalaNnClassificationDummy,
     "ScalaDaClassification": ScalaDaClassificationDummy,
@@ -166,10 +161,10 @@ class TaskResult(BaseModel):
     def from_task_results(
         cls,
         task: AbsTask | type[AbsTask],
-        scores: dict[SplitName, dict[HFSubset, ScoresDict]],
+        scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
         evaluation_time: float,
         kg_co2_emissions: float | None = None,
-    ) -> Self:
+    ) -> TaskResult:
         """Create a TaskResult from the task and scores.
 
         Args:
@@ -246,12 +241,12 @@ class TaskResult(BaseModel):
         return get_task(self.task_name)
 
     @property
-    def domains(self) -> list[str]:
+    def domains(self) -> list[TaskDomain]:
         """Get the domains of the task."""
         doms = self.task.metadata.domains
         if doms is None:
             doms = []
-        return doms  # type: ignore
+        return doms
 
     @property
     def task_type(self) -> str:
@@ -307,7 +302,7 @@ class TaskResult(BaseModel):
                 if isinstance(v, dict):
                     self._round_scores(v, n)
                 elif isinstance(v, float):
-                    value[i] = round(v, n)
+                    value[i] = round(v, n)  # type: ignore[call-overload]
 
         elif isinstance(value, float):
             scores[key] = round(value, n)
@@ -325,7 +320,7 @@ class TaskResult(BaseModel):
             json.dump(json_obj, f, indent=2)
 
     @classmethod
-    def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self:  # type: ignore
+    def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
         """Load TaskResult from disk.
 
         Args:
@@ -356,7 +351,7 @@ class TaskResult(BaseModel):
             )  # assume it is before 1.11.0 if the version is not present
 
         try:
-            obj = cls.model_validate(data)
+            obj: TaskResult = cls.model_validate(data)
         except Exception as e:
             if not pre_1_11_load:
                 raise e
@@ -381,6 +376,7 @@ class TaskResult(BaseModel):
         from mteb import get_task
 
         task_name = obj.task_name
+        task: AbsTask | type[AbsTask]
         if task_name in outdated_tasks:
             task = outdated_tasks[task_name]
         else:
@@ -393,11 +389,11 @@ class TaskResult(BaseModel):
             for key in list(hf_subset_scores.keys()):
                 if isinstance(hf_subset_scores[key], dict):
                     for k, v in hf_subset_scores[key].items():
-                        hf_subset_scores[f"{key}_{k}"] = v
-                    hf_subset_scores.pop(key)
+                        hf_subset_scores[f"{key}_{k}"] = v  # type: ignore[index]
+                    hf_subset_scores.pop(key)  # type: ignore[attr-defined]
 
     @classmethod
-    def _convert_from_before_v1_11_0(cls, data: dict) -> Self:
+    def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
         from mteb.get_tasks import _TASKS_REGISTRY
 
         # in case the task name is not found in the registry, try to find a lower case version
@@ -462,7 +458,9 @@ class TaskResult(BaseModel):
             if main_score in hf_subset_scores:
                 hf_subset_scores["main_score"] = hf_subset_scores[main_score]
             else:
-                logger.warning(f"Main score {main_score} not found in scores")
+                msg = f"Main score {main_score} not found in scores"
+                logger.warning(msg)
+                warnings.warn(msg)
                 hf_subset_scores["main_score"] = None
 
         # specific fixes:
@@ -481,7 +479,7 @@ class TaskResult(BaseModel):
             scores["test"]["fra-fra"] = scores["test"].pop("fr")
 
         result: TaskResult = TaskResult.from_task_results(
-            task,  # type: ignore
+            task,
             scores,
             evaluation_time,
             kg_co2_emissions=None,
@@ -532,7 +530,7 @@ class TaskResult(BaseModel):
     def _get_score_fast(
         self,
         splits: Iterable[str] | None = None,
-        languages: str | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
         subsets: Iterable[str] | None = None,
     ) -> float:
         """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.
@@ -581,7 +579,7 @@ class TaskResult(BaseModel):
         return val_sum / n_val
 
     @classmethod
-    def from_validated(cls, **data) -> Self:
+    def from_validated(cls, **data) -> TaskResult:
         """Create a TaskResult from validated data.
 
         Returns:
@@ -592,13 +590,13 @@ class TaskResult(BaseModel):
     def __repr__(self) -> str:
         return f"TaskResult(task_name={self.task_name}, scores=...)"
 
-    def only_main_score(self) -> Self:
+    def only_main_score(self) -> TaskResult:
         """Return a new TaskResult object with only the main score.
 
         Returns:
             A new TaskResult object with only the main score.
         """
-        new_scores = {}
+        new_scores: dict[str, list[Score]] = {}
         for split in self.scores:
             new_scores[split] = []
             for subset_scores in self.scores[split]:
@@ -610,10 +608,9 @@ class TaskResult(BaseModel):
                 }
             )
         new_res = {**self.to_dict(), "scores": new_scores}
-        new_res = TaskResult.from_validated(**new_res)
-        return new_res
+        return TaskResult.from_validated(**new_res)
 
-    def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self:
+    def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult:
         """Validate and filter the scores against the task metadata.
 
         This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata.
@@ -635,7 +632,7 @@ class TaskResult(BaseModel):
         splits = task.eval_splits
         hf_subsets = set(task.hf_subsets)  # Convert to set once
 
-        new_scores = {}
+        new_scores: dict[str, list[Score]] = {}
         seen_splits = set()
         for split in self.scores:
             if split not in splits:
@@ -658,14 +655,14 @@ class TaskResult(BaseModel):
             else:
                 missing_subsets_str = str(missing_subsets)
 
-            logger.warning(
-                f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
-            )
+            msg = f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
+            logger.warning(msg)
+            warnings.warn(msg)
             seen_splits.add(split)
         if seen_splits != set(splits):
-            logger.warning(
-                f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
-            )
+            msg = f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
+            logger.warning(msg)
+            warnings.warn(msg)
         data = self.model_dump()
         data["scores"] = new_scores
         return type(self).model_construct(**data)
@@ -736,7 +733,7 @@ class TaskResult(BaseModel):
             "mteb_version",
             "dataset_revision",
         ],
-    ) -> Self:
+    ) -> TaskResult:
         """Merges two TaskResult objects.
 
         Args:
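
A side effect of the task_result.py changes above is that the "missing main score / subsets / splits" messages are now emitted through warnings.warn as well as logger.warning, so the standard warnings machinery applies to them. A minimal sketch of catching such a warning; the task name and message text are illustrative, not taken from mteb:

```python
import warnings

# Because the messages are now also raised via warnings.warn(), callers can
# capture or filter them with the standard warnings machinery.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Stand-in for a call such as TaskResult.validate_and_filter_scores(task)
    # that would emit one of the new warnings:
    warnings.warn("ExampleTask: Missing subsets ['default'] for split 'test'")

for w in caught:
    print(w.category.__name__, w.message)
```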
mteb/similarity_functions.py CHANGED
@@ -186,7 +186,7 @@ def max_sim(a: Array, b: Array) -> torch.Tensor:
         b,
     )
 
-    return scores.max(axis=-1).values.sum(axis=-1)
+    return scores.max(axis=-1).values.sum(axis=-1)  # type: ignore[call-overload]
 
 
 # https://github.com/lightonai/pylate/blob/2d094a724866d6e15701781528368438081c0157/pylate/scores/scores.py#L67C1-L122C38
@@ -217,7 +217,7 @@ def pairwise_max_sim(
             document_embedding,
         )
 
-        scores.append(query_document_score.max(axis=-1).values.sum())
+        scores.append(query_document_score.max(axis=-1).values.sum())  # type: ignore[call-overload]
 
     return torch.stack(scores, dim=0)
 
@@ -317,11 +317,15 @@ def similarity(text_embeddings: Array, input_embeddings: Array) -> Array:
     Returns:
         Matrix with similarities
     """
-    text_embeddings = _convert_to_tensor(text_embeddings)
-    input_embeddings = _convert_to_tensor(input_embeddings)
+    text_embeddings_tensor = _convert_to_tensor(text_embeddings)
+    input_embeddings_tensor = _convert_to_tensor(input_embeddings)
 
-    text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
-    input_embeddings = input_embeddings / input_embeddings.norm(dim=-1, keepdim=True)
-    logits = torch.matmul(input_embeddings, text_embeddings.T)
+    text_embeddings_tensor = text_embeddings_tensor / text_embeddings_tensor.norm(
+        dim=-1, keepdim=True
+    )
+    input_embeddings_tensor = input_embeddings_tensor / input_embeddings_tensor.norm(
+        dim=-1, keepdim=True
+    )
+    logits = torch.matmul(input_embeddings_tensor, text_embeddings_tensor.T)
     probs = (logits * 100).softmax(dim=-1)
     return probs
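
For context on what max_sim and pairwise_max_sim compute, here is a minimal late-interaction (MaxSim) sketch in plain PyTorch. It assumes token-level embeddings of shape (num_tokens, dim) and is only an illustration, not the mteb implementation:

```python
import torch

def max_sim_score(query_tokens: torch.Tensor, doc_tokens: torch.Tensor) -> torch.Tensor:
    """ColBERT-style MaxSim: best-matching document token per query token, summed."""
    sims = query_tokens @ doc_tokens.T    # (q_tokens, d_tokens) similarity matrix
    return sims.max(dim=-1).values.sum()  # max over doc tokens, sum over query tokens

# Toy example with unit-normalised random token embeddings.
q = torch.nn.functional.normalize(torch.randn(4, 8), dim=-1)
d = torch.nn.functional.normalize(torch.randn(6, 8), dim=-1)
print(max_sim_score(q, d))
```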
mteb/tasks/classification/dan/dk_hate_classification.py CHANGED
@@ -62,7 +62,7 @@ Piperidis, Stelios},
 
     def dataset_transform(self):
         # convert label to a 0/1 label
-        labels = self.dataset["train"]["label"]  # type: ignore
+        labels = self.dataset["train"]["label"]
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
         self.dataset = self.dataset.map(
             lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
mteb/tasks/classification/est/estonian_valence.py CHANGED
@@ -45,7 +45,7 @@ class EstonianValenceClassification(AbsTaskClassification):
             "valence", "label"
         )
         # convert label to a numbers
-        labels = self.dataset["train"]["label"]  # type: ignore
+        labels = self.dataset["train"]["label"]
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
         self.dataset = self.dataset.map(
             lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
mteb/tasks/classification/multilingual/scala_classification.py CHANGED
@@ -57,7 +57,7 @@ Fishel, Mark},
     def dataset_transform(self):
         for lang in self.dataset.keys():
             # convert label to a 0/1 label
-            labels = self.dataset[lang]["train"]["label"]  # type: ignore
+            labels = self.dataset[lang]["train"]["label"]
             lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
             self.dataset[lang] = self.dataset[lang].map(
                 lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py CHANGED
@@ -49,7 +49,7 @@ class SugarCrepe(AbsTaskImageTextPairClassification):
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset = datasets.DatasetDict({"test": self.dataset["train"]})
         self.dataset_transform()
         self.data_loaded = True
mteb/tasks/retrieval/code/code_rag.py CHANGED
@@ -48,14 +48,14 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
             "path": "code-rag-bench/programming-solutions",
             "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6",
         },
-        **common_args,  # type: ignore
+        **common_args,
     )
 
     def load_data(self) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
 
@@ -71,7 +71,7 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
         self.queries = {}
 
         split = self.metadata.eval_splits[0]
-        ds: datasets.Dataset = self.dataset[split]  # type: ignore
+        ds: datasets.Dataset = self.dataset[split]
         ds = ds.shuffle(seed=42)
 
         self.queries[split] = {}
@@ -105,14 +105,14 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
             "path": "code-rag-bench/online-tutorials",
             "revision": "095bb77130082e4690d6c3a031997b03487bf6e2",
         },
-        **common_args,  # type: ignore
+        **common_args,
     )
 
     def load_data(self) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
 
@@ -128,7 +128,7 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
         self.queries = {}
 
         split = self.metadata.eval_splits[0]
-        ds: datasets.Dataset = self.dataset[split]  # type: ignore
+        ds: datasets.Dataset = self.dataset[split]
         ds = ds.shuffle(seed=42)
 
         self.queries[split] = {}
@@ -165,14 +165,14 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
             "path": "code-rag-bench/library-documentation",
             "revision": "b530d3b5a25087d2074e731b76232db85b9e9107",
         },
-        **common_args,  # type: ignore
+        **common_args,
     )
 
     def load_data(self) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
 
@@ -188,7 +188,7 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
         self.queries = {}
 
         split = self.metadata.eval_splits[0]
-        ds: datasets.Dataset = self.dataset[split]  # type: ignore
+        ds: datasets.Dataset = self.dataset[split]
         ds = ds.shuffle(seed=42)
 
         self.queries[split] = {}
@@ -222,14 +222,14 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
             "path": "code-rag-bench/stackoverflow-posts",
             "revision": "04e05d86cb0ac467b29a5d87f4c56eac99dfc0a4",
         },
-        **common_args,  # type: ignore
+        **common_args,
     )
 
     def load_data(self) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
 
@@ -245,7 +245,7 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
         self.queries = {}
 
         split = self.metadata.eval_splits[0]
-        ds: datasets.Dataset = self.dataset[split]  # type: ignore
+        ds: datasets.Dataset = self.dataset[split]
         ds = ds.shuffle(seed=42)
 
         self.queries[split] = {}
mteb/tasks/retrieval/dan/dan_fever_retrieval.py CHANGED
@@ -51,7 +51,7 @@ Derczynski, Leon},
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
 
mteb/tasks/retrieval/dan/tv2_nordretrieval.py CHANGED
@@ -64,7 +64,7 @@ Piperidis, Stelios},
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
 
@@ -81,7 +81,7 @@ Piperidis, Stelios},
         text2id = {}
 
         for split in self.dataset:
-            ds: datasets.Dataset = self.dataset[split]  # type: ignore
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.shuffle(seed=42)
             ds = ds.select(
                 range(2048)
mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py CHANGED
@@ -40,7 +40,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
 
@@ -57,7 +57,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
         text2id = {}
 
         for split in self.dataset:
-            ds: datasets.Dataset = self.dataset[split]  # type: ignore
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.map(answers_to_list)
 
             self.queries[split] = {}
mteb/tasks/retrieval/nob/norquad.py CHANGED
@@ -54,7 +54,7 @@ Fishel, Mark},
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
 
@@ -71,7 +71,7 @@ Fishel, Mark},
         text2id = {}
 
         for split in self.dataset:
-            ds: datasets.Dataset = self.dataset[split]  # type: ignore
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.shuffle(seed=42)
             max_samples = min(1024, len(ds))
             ds = ds.select(
mteb/tasks/retrieval/nob/snl_retrieval.py CHANGED
@@ -41,7 +41,7 @@ class SNLRetrieval(AbsTaskRetrieval):
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
 
@@ -58,7 +58,7 @@ class SNLRetrieval(AbsTaskRetrieval):
         text2id = {}
 
         for split in self.dataset:
-            ds: datasets.Dataset = self.dataset[split]  # type: ignore
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.shuffle(seed=42)
 
             self.queries[split] = {}
mteb/tasks/retrieval/tur/tur_hist_quad.py CHANGED
@@ -59,7 +59,7 @@ class TurHistQuadRetrieval(AbsTaskRetrieval):
         text2id = {}
 
         for split in self.metadata.eval_splits:
-            ds: datasets.Dataset = self.dataset[split]  # type: ignore
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.shuffle(seed=42)
             max_samples = min(1024, len(ds))
             ds = ds.select(
mteb/types/_result.py CHANGED
@@ -1,3 +1,4 @@
+from collections.abc import Mapping
 from typing import Any, NamedTuple
 
 HFSubset = str
@@ -8,7 +9,7 @@ SplitName = str
 Score = Any
 """A score value, could e.g. be accuracy. Normally it is a float or int, but it can take on any value. Should be json serializable."""
 
-ScoresDict = dict[str, Score]
+ScoresDict = Mapping[str, Score]
 """A dictionary of scores, typically also include metadata, e.g {'main_score': 0.5, 'accuracy': 0.5, 'f1': 0.6, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']}"""
 
 
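
The ScoresDict change above widens the alias from a concrete dict to the read-only Mapping protocol, so any mapping type satisfies the annotation. A small sketch of the effect; the helper function is illustrative, not part of mteb:

```python
from collections.abc import Mapping
from types import MappingProxyType
from typing import Any

ScoresDict = Mapping[str, Any]  # mirrors the widened alias above

def get_main_score(scores: ScoresDict) -> Any:
    """Read-only access works for any mapping, not just a plain dict."""
    return scores["main_score"]

plain = {"main_score": 0.5, "accuracy": 0.5}
frozen = MappingProxyType(plain)  # an immutable view is also a Mapping
print(get_main_score(plain), get_main_score(frozen))
```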
 
mteb/types/statistics.py CHANGED
@@ -10,8 +10,14 @@ class SplitDescriptiveStatistics(TypedDict):
 
 
 class DescriptiveStatistics(TypedDict, SplitDescriptiveStatistics):
-    """Class for descriptive statistics for the full task."""
+    """Class for descriptive statistics for the full task.
 
+    Attributes:
+        num_samples: Total number of samples
+        hf_subset_descriptive_stats: HFSubset descriptive statistics (only for multilingual datasets)
+    """
+
+    num_samples: int
     hf_subset_descriptive_stats: NotRequired[dict[HFSubset, SplitDescriptiveStatistics]]
 
 
@@ -88,9 +94,9 @@ class ScoreStatistics(TypedDict):
         max_score: Maximum score
     """
 
-    min_score: int
+    min_score: int | float
     avg_score: float
-    max_score: int
+    max_score: int | float
 
 
 class TopRankedStatistics(TypedDict):
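
Since DescriptiveStatistics now declares num_samples alongside the optional hf_subset_descriptive_stats, here is a trimmed-down sketch of that TypedDict shape; the field names follow the hunks above, but the class names and values are stand-ins:

```python
from typing import TypedDict
from typing_extensions import NotRequired

class SplitStats(TypedDict):
    """Stand-in for SplitDescriptiveStatistics."""
    num_samples: int

class TaskStats(SplitStats):
    """Stand-in for DescriptiveStatistics: subset stats only appear for multilingual tasks."""
    hf_subset_descriptive_stats: NotRequired[dict[str, SplitStats]]

monolingual: TaskStats = {"num_samples": 2048}
multilingual: TaskStats = {
    "num_samples": 4096,
    "hf_subset_descriptive_stats": {"en-de": {"num_samples": 2048}},
}
print(monolingual, multilingual)
```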
{mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.5.2
+Version: 2.5.4
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>