mteb 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +33 -27
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +7 -26
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +22 -19
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +2 -2
  32. mteb/cache.py +27 -22
  33. mteb/cli/_display_tasks.py +2 -2
  34. mteb/cli/build_cli.py +15 -10
  35. mteb/cli/generate_model_card.py +10 -7
  36. mteb/deprecated_evaluator.py +60 -46
  37. mteb/evaluate.py +39 -30
  38. mteb/filter_tasks.py +25 -26
  39. mteb/get_tasks.py +29 -30
  40. mteb/languages/language_scripts.py +5 -3
  41. mteb/leaderboard/app.py +1 -1
  42. mteb/load_results.py +12 -12
  43. mteb/models/abs_encoder.py +7 -5
  44. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  45. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  46. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  47. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  48. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  49. mteb/models/get_model_meta.py +8 -1
  50. mteb/models/instruct_wrapper.py +11 -5
  51. mteb/models/model_implementations/andersborges.py +2 -2
  52. mteb/models/model_implementations/blip_models.py +8 -8
  53. mteb/models/model_implementations/bm25.py +1 -1
  54. mteb/models/model_implementations/clip_models.py +3 -3
  55. mteb/models/model_implementations/cohere_models.py +1 -1
  56. mteb/models/model_implementations/cohere_v.py +2 -2
  57. mteb/models/model_implementations/dino_models.py +23 -23
  58. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  59. mteb/models/model_implementations/gme_v_models.py +4 -3
  60. mteb/models/model_implementations/jina_clip.py +1 -1
  61. mteb/models/model_implementations/jina_models.py +1 -1
  62. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  63. mteb/models/model_implementations/llm2clip_models.py +3 -3
  64. mteb/models/model_implementations/mcinext_models.py +4 -1
  65. mteb/models/model_implementations/moco_models.py +2 -2
  66. mteb/models/model_implementations/model2vec_models.py +1 -1
  67. mteb/models/model_implementations/nomic_models.py +8 -8
  68. mteb/models/model_implementations/openclip_models.py +7 -7
  69. mteb/models/model_implementations/random_baseline.py +3 -3
  70. mteb/models/model_implementations/rasgaard_models.py +1 -1
  71. mteb/models/model_implementations/repllama_models.py +2 -2
  72. mteb/models/model_implementations/rerankers_custom.py +3 -3
  73. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  74. mteb/models/model_implementations/siglip_models.py +10 -10
  75. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  76. mteb/models/model_implementations/voyage_v.py +4 -4
  77. mteb/models/model_meta.py +14 -13
  78. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  79. mteb/models/search_wrappers.py +26 -12
  80. mteb/models/sentence_transformer_wrapper.py +19 -14
  81. mteb/py.typed +0 -0
  82. mteb/results/benchmark_results.py +28 -20
  83. mteb/results/model_result.py +52 -22
  84. mteb/results/task_result.py +55 -58
  85. mteb/similarity_functions.py +11 -7
  86. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  87. mteb/tasks/classification/est/estonian_valence.py +1 -1
  88. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  89. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  90. mteb/tasks/retrieval/code/code_rag.py +12 -12
  91. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  92. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  93. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  94. mteb/tasks/retrieval/nob/norquad.py +2 -2
  95. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  96. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  97. mteb/types/_result.py +2 -1
  98. mteb/types/statistics.py +9 -3
  99. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
  100. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/RECORD +104 -103
  101. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
  102. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
  103. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
  104. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/cache.py CHANGED
@@ -3,8 +3,9 @@ import logging
  import os
  import shutil
  import subprocess
+ import warnings
  from collections import defaultdict
- from collections.abc import Sequence
+ from collections.abc import Iterable, Sequence
  from pathlib import Path
  from typing import cast

@@ -83,9 +84,9 @@ class ResultCache:
  model_path = results_folder / model_name

  if model_revision is None:
- logger.warning(
- "model_revision is not specified, attempting to load the latest revision. To disable this behavior, specify model_revision explicitly."
- )
+ msg = "`model_revision` is not specified, attempting to load the latest revision. To disable this behavior, specify the 'model_revision` explicitly."
+ logger.warning(msg)
+ warnings.warn(msg)
  # get revs from paths
  revisions = [p for p in model_path.glob("*") if p.is_dir()]
  if not revisions:
@@ -281,15 +282,17 @@ class ResultCache:
  shutil.rmtree(self.cache_path)
  logger.info(f"Cache directory {self.cache_path} cleared.")
  else:
- logger.warning(f"Cache directory {self.cache_path} does not exist.")
+ msg = f"Cache directory `{self.cache_path}` does not exist."
+ logger.warning(msg)
+ warnings.warn(msg)

  def __repr__(self) -> str:
  return f"ResultCache(cache_path={self.cache_path})"

  def get_cache_paths(
  self,
- models: Sequence[str] | Sequence[ModelMeta] | None = None,
- tasks: Sequence[str] | Sequence[AbsTask] | None = None,
+ models: Sequence[str] | Iterable[ModelMeta] | None = None,
+ tasks: Sequence[str] | Iterable[AbsTask] | None = None,
  require_model_meta: bool = True,
  include_remote: bool = True,
  ) -> list[Path]:
@@ -422,7 +425,7 @@ class ResultCache:
  @staticmethod
  def _filter_paths_by_model_and_revision(
  paths: list[Path],
- models: Sequence[str] | Sequence[ModelMeta] | None = None,
+ models: Sequence[str] | Iterable[ModelMeta] | None = None,
  ) -> list[Path]:
  """Filter a list of paths by model name and optional revision.

@@ -432,8 +435,9 @@
  if not models:
  return paths

- if isinstance(models[0], ModelMeta):
- models = cast(list[ModelMeta], models)
+ first_model = next(iter(models))
+ if isinstance(first_model, ModelMeta):
+ models = cast(Iterable[ModelMeta], models)
  name_and_revision = {
  (m.model_name_as_path(), m.revision or "no_revision_available")
  for m in models
@@ -444,13 +448,14 @@
  if (p.parent.parent.name, p.parent.name) in name_and_revision
  ]

- model_names = {m.replace("/", "__").replace(" ", "_") for m in models}
+ str_models = cast(Sequence[str], models)
+ model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
  return [p for p in paths if p.parent.parent.name in model_names]

  @staticmethod
  def _filter_paths_by_task(
  paths: list[Path],
- tasks: Sequence[str] | Sequence[AbsTask] | None = None,
+ tasks: Sequence[str] | Iterable[AbsTask] | None = None,
  ) -> list[Path]:
  if tasks is not None:
  task_names = set()
@@ -466,8 +471,8 @@

  def load_results(
  self,
- models: Sequence[str] | Sequence[ModelMeta] | None = None,
- tasks: Sequence[str] | Sequence[AbsTask] | Benchmark | str | None = None,
+ models: Sequence[str] | Iterable[ModelMeta] | None = None,
+ tasks: Sequence[str] | Iterable[AbsTask] | str | None = None,
  require_model_meta: bool = True,
  include_remote: bool = True,
  validate_and_filter: bool = False,
@@ -511,7 +516,7 @@
  )
  models_results = defaultdict(list)

- task_names = {}
+ task_names: dict[str, AbsTask | None] = {}
  if tasks is not None:
  for task in tasks:
  if isinstance(task, AbsTask):
@@ -529,9 +534,11 @@
  )

  if validate_and_filter:
- task = task_names[task_result.task_name]
+ task_instance = task_names[task_result.task_name]
  try:
- task_result = task_result.validate_and_filter_scores(task=task)
+ task_result = task_result.validate_and_filter_scores(
+ task=task_instance
+ )
  except Exception as e:
  logger.info(
  f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
@@ -541,7 +548,7 @@
  models_results[(model_name, revision)].append(task_result)

  # create BenchmarkResults object
- models_results = [
+ models_results_object = [
  ModelResult(
  model_name=model_name,
  model_revision=revision,
@@ -550,9 +557,7 @@
  for (model_name, revision), task_results in models_results.items()
  ]

- benchmark_results = BenchmarkResults(
- model_results=models_results,
+ return BenchmarkResults(
+ model_results=models_results_object,
  benchmark=tasks if isinstance(tasks, Benchmark) else None,
  )
-
- return benchmark_results
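The `get_cache_paths` and `load_results` signatures above now accept any iterable of `ModelMeta`/`AbsTask` rather than only sequences. A minimal usage sketch (the model and task names below are placeholders, not values taken from this diff):

    import mteb
    from mteb.cache import ResultCache

    cache = ResultCache()
    # Tasks and model identifiers can be passed as any iterable (tuple, generator, ...),
    # since the code above now only makes a single pass over them.
    tasks = mteb.get_tasks(["Banking77Classification"])
    results = cache.load_results(
        models=["sentence-transformers/all-MiniLM-L6-v2"],
        tasks=tasks,
    )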
mteb/cli/_display_tasks.py CHANGED
@@ -1,4 +1,4 @@
- from collections.abc import Sequence
+ from collections.abc import Iterable, Sequence

  from mteb.abstasks import AbsTask
  from mteb.benchmarks import Benchmark
@@ -31,7 +31,7 @@ def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
  _display_tasks(benchmark.tasks, name=name)


- def _display_tasks(task_list: Sequence[AbsTask], name: str | None = None) -> None:
+ def _display_tasks(task_list: Iterable[AbsTask], name: str | None = None) -> None:
  from rich.console import Console

  console = Console()
mteb/cli/build_cli.py CHANGED
@@ -1,18 +1,19 @@
  import argparse
  import logging
  import os
+ import warnings
  from pathlib import Path

  import torch
  from rich.logging import RichHandler

  import mteb
+ from mteb.abstasks.abstask import AbsTask
  from mteb.cache import ResultCache
+ from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
  from mteb.cli.generate_model_card import generate_model_card
  from mteb.evaluate import OverwriteStrategy

- from ._display_tasks import _display_benchmarks, _display_tasks
-
  logger = logging.getLogger(__name__)


@@ -53,7 +54,7 @@ def run(args: argparse.Namespace) -> None:

  if args.benchmarks:
  benchmarks = mteb.get_benchmarks(names=args.benchmarks)
- tasks = [t for b in benchmarks for t in b.tasks]
+ tasks = tuple(t for b in benchmarks for t in b.tasks)
  else:
  tasks = mteb.get_tasks(
  categories=args.categories,
@@ -69,15 +70,17 @@

  overwrite_strategy = args.overwrite_strategy
  if args.overwrite:
- logger.warning(
- "`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead."
+ warnings.warn(
+ "`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead.",
+ DeprecationWarning,
  )
  overwrite_strategy = OverwriteStrategy.ALWAYS.value

  prediction_folder = args.prediction_folder
  if args.save_predictions:
- logger.warning(
- "`--save_predictions` is deprecated, please use `--prediction-folder` instead."
+ warnings.warn(
+ "`--save_predictions` is deprecated, please use `--prediction-folder` instead.",
+ DeprecationWarning,
  )
  prediction_folder = args.output_folder

@@ -279,15 +282,17 @@ def _create_meta(args: argparse.Namespace) -> None:
  from_existing = Path(from_existing)

  if output_path.exists() and overwrite:
- logger.warning("Output path already exists, overwriting.")
+ msg = "Output path already exists, overwriting."
+ logger.warning(msg)
+ warnings.warn(msg)
  elif output_path.exists():
  raise FileExistsError(
  "Output path already exists, use --overwrite to overwrite."
  )

- tasks = []
+ tasks: list[AbsTask] = []
  if tasks_names is not None:
- tasks = mteb.get_tasks(tasks_names)
+ tasks = list(mteb.get_tasks(tasks_names))
  if benchmarks is not None:
  benchmarks = mteb.get_benchmarks(benchmarks)
  for benchmark in benchmarks:
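The deprecated `--overwrite` and `--save_predictions` flags now emit a `DeprecationWarning` via `warnings.warn` instead of only logging. A minimal sketch (not part of the diff) of how a caller or test suite can surface that with the standard library:

    import warnings

    # With the change above, deprecated-flag usage goes through the standard
    # warnings machinery, so it can be escalated to an error:
    with warnings.catch_warnings():
        warnings.simplefilter("error", DeprecationWarning)
        try:
            warnings.warn("`--overwrite` is deprecated, ...", DeprecationWarning)
        except DeprecationWarning as exc:
            print(f"caught: {exc}")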
mteb/cli/generate_model_card.py CHANGED
@@ -1,4 +1,6 @@
  import logging
+ import warnings
+ from collections.abc import Sequence
  from pathlib import Path

  from huggingface_hub import ModelCard, ModelCardData, repo_exists
@@ -12,7 +14,7 @@ logger = logging.getLogger(__name__)

  def generate_model_card(
  model_name: str,
- tasks: list[AbsTask] | None = None,
+ tasks: Sequence[AbsTask] | None = None,
  existing_model_card_id_or_path: str | Path | None = None,
  results_cache: ResultCache = ResultCache(),
  output_path: Path = Path("model_card.md"),
@@ -47,8 +49,8 @@ def generate_model_card(
  for task_result in models_results.task_results:
  eval_results.extend(task_result.get_hf_eval_results())

- existing_model_card_data = (
- existing_model_card.data if existing_model_card else ModelCardData()
+ existing_model_card_data: ModelCardData = (
+ existing_model_card.data if existing_model_card else ModelCardData() # type: ignore[assignment]
  )

  if existing_model_card_data.eval_results is None:
@@ -88,13 +90,14 @@
  benchmark_results, existing_model_card
  )

- if push_to_hub:
+ if push_to_hub and existing_model_card_id_or_path:
+ existing_model_card_id_or_path = str(existing_model_card_id_or_path)
  if repo_exists(existing_model_card_id_or_path):
  existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
  else:
- logger.warning(
- f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
- )
+ msg = f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
+ logger.warning(msg)
+ warnings.warn(msg)
  existing_model_card.save(output_path)

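`generate_model_card` now accepts any `Sequence[AbsTask]` and only attempts the Hub push when `existing_model_card_id_or_path` is set. A minimal call sketch using only parameters visible in this diff (the model id and task name are placeholders):

    from pathlib import Path

    import mteb
    from mteb.cli.generate_model_card import generate_model_card

    generate_model_card(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        tasks=mteb.get_tasks(["Banking77Classification"]),
        output_path=Path("model_card.md"),
    )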
mteb/deprecated_evaluator.py CHANGED
@@ -5,23 +5,24 @@ import logging
  import os
  import sys
  import traceback
- from collections.abc import Iterable
+ import warnings
+ from collections.abc import Iterable, Sequence
  from copy import deepcopy
  from datetime import datetime
  from itertools import chain
  from pathlib import Path
  from time import time
- from typing import TYPE_CHECKING, Any
+ from typing import TYPE_CHECKING, Any, cast

  import datasets

  import mteb
  from mteb.abstasks import AbsTask
+ from mteb.abstasks.aggregated_task import AbsTaskAggregate
  from mteb.abstasks.task_metadata import TaskCategory, TaskType
  from mteb.benchmarks import Benchmark
  from mteb.models import (
  CrossEncoderWrapper,
- EncoderProtocol,
  ModelMeta,
  MTEBModels,
  SentenceTransformerEncoderWrapper,
@@ -52,7 +53,7 @@ class MTEB:
  )
  def __init__(
  self,
- tasks: Iterable[AbsTask | Benchmark],
+ tasks: Iterable[AbsTask] | Iterable[Benchmark],
  *,
  err_logs_path: str = "error_logs.txt",
  ) -> None:
@@ -63,15 +64,14 @@
  `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)").
  err_logs_path: Path to save error logs.
  """
- from mteb.benchmarks import Benchmark
-
- self.tasks = list(tasks)
- if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
+ if isinstance(next(iter(tasks)), Benchmark):
  self.benchmarks = tasks
- self.tasks = list(chain.from_iterable(self.tasks))
+ self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+ elif isinstance(next(iter(tasks)), AbsTask):
+ self.tasks = list(cast(Iterable[AbsTask], tasks))

  self.err_logs_path = Path(err_logs_path)
- self.last_evaluated_splits = {}
+ self._last_evaluated_splits: dict[str, list[str]] = {}

  @property
  def available_tasks(self) -> list[str]:
@@ -84,7 +84,7 @@
  return sorted({x.metadata.type for x in self.tasks})

  @property
- def available_task_categories(self) -> set[TaskCategory]:
+ def available_task_categories(self) -> set[TaskCategory | None]:
  """Set of available task categories."""
  return {x.metadata.category for x in self.tasks}

@@ -231,13 +231,14 @@
  merged_kg_co2_emissions = None
  if existing_kg_co2_emissions and new_kg_co2_emissions:
  merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions
+ existing_evaluation_time = existing_results.evaluation_time or 0
+ new_evaluation_time = new_results.evaluation_time or 0
  merged_results = TaskResult(
  dataset_revision=new_results.dataset_revision,
  task_name=new_results.task_name,
  mteb_version=new_results.mteb_version,
  scores=merged_scores,
- evaluation_time=existing_results.evaluation_time
- + new_results.evaluation_time,
+ evaluation_time=existing_evaluation_time + new_evaluation_time,
  kg_co2_emissions=merged_kg_co2_emissions,
  )

@@ -306,13 +307,16 @@
  elif verbosity == 3:
  datasets.logging.set_verbosity(logging.DEBUG)

- meta = self.create_model_meta(model)
- output_path = self._create_output_folder(meta, output_folder)
-
+ mteb_model: MTEBModels
  if isinstance(model, SentenceTransformer):
- model = SentenceTransformerEncoderWrapper(model)
+ mteb_model = SentenceTransformerEncoderWrapper(model)
  elif isinstance(model, CrossEncoder):
- model = CrossEncoderWrapper(model)
+ mteb_model = CrossEncoderWrapper(model)
+ else:
+ mteb_model = cast(MTEBModels, model)
+
+ meta = self.create_model_meta(mteb_model)
+ output_path = self._create_output_folder(meta, output_folder)

  # Disable co2_tracker for API models
  if "API" in meta.framework:
@@ -333,7 +337,7 @@
  ) # save them in case we re-use the object (e.g. for reranking)

  # To evaluate missing splits, we keep track of the task name and the corresponding splits.
- self.last_evaluated_splits = {}
+ self._last_evaluated_splits = {}

  while len(self.tasks) > 0:
  task = self.tasks[0]
@@ -342,9 +346,10 @@
  )

  if task.is_aggregate:
- self_ = MTEB(tasks=task.metadata.tasks)
- task_results = self_.run(
- model,
+ aggregated_task = cast(AbsTaskAggregate, task)
+ self_ = MTEB(tasks=aggregated_task.metadata.tasks)
+ aggregated_task_results = self_.run(
+ mteb_model,
  verbosity=verbosity - 1,
  output_folder=output_folder,
  eval_splits=eval_splits,
@@ -355,12 +360,15 @@
  encode_kwargs=encode_kwargs,
  **kwargs,
  )
- new_results = task.combine_task_results(task_results)
+ new_results = aggregated_task.combine_task_results(
+ aggregated_task_results
+ )
  evaluation_results.append(new_results)

  if output_path:
- save_path = output_path / f"{task.metadata.name}.json"
- new_results.to_disk(save_path)
+ new_results.to_disk(
+ output_path / f"{aggregated_task.metadata.name}.json"
+ )
  del self.tasks[0]
  continue

@@ -382,7 +390,7 @@
  task_subsets = task.hf_subsets

  existing_results = None
- save_path = None
+ save_path: Path | None = None
  final_splits_to_run = task_eval_splits
  missing_evaluations = self._get_missing_evaluations(
  existing_results,
@@ -432,7 +440,7 @@
  logger.info(
  f"No splits to evaluate for {task.metadata.name}. Skipping evaluation."
  )
- self.last_evaluated_splits[task.metadata.name] = []
+ self._last_evaluated_splits[task.metadata.name] = []
  del self.tasks[0]
  continue

@@ -440,11 +448,11 @@
  task.check_if_dataset_is_superseded()
  task.load_data()

- task_results = {}
+ task_results: dict[str, dict[str, dict[str, Any]]] = {}
  evaluation_time = 0
  kg_co2_emissions: int | None = 0 if co2_tracker else None

- self.last_evaluated_splits[task.metadata.name] = []
+ self._last_evaluated_splits[task.metadata.name] = []

  for split in final_splits_to_run:
  info = missing_evaluations[split]
@@ -465,14 +473,16 @@

  if co2_tracker:
  try:
- from codecarbon import EmissionsTracker
+ from codecarbon import ( # type: ignore[import-untyped]
+ EmissionsTracker,
+ )
  except ImportError:
  raise ImportError(
  "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions."
  )
- logger.warning(
- "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
- )
+ msg = "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
+ logger.warning(msg)
+ warnings.warn(msg)
  with EmissionsTracker(
  save_to_file=False,
  save_to_api=False,
@@ -481,7 +491,7 @@
  ) as tracker:
  results, tick, tock = self._run_eval(
  task,
- model,
+ mteb_model,
  split,
  encode_kwargs=encode_kwargs,
  subsets_to_run=subsets_to_run,
@@ -494,7 +504,7 @@
  else:
  results, tick, tock = self._run_eval(
  task,
- model,
+ mteb_model,
  split,
  subsets_to_run=subsets_to_run,
  encode_kwargs=encode_kwargs,
@@ -510,25 +520,25 @@
  if verbosity >= 1:
  logger.info(f"Scores: {task_results[split]}")

- self.last_evaluated_splits[task.metadata.name].append(split)
+ self._last_evaluated_splits[task.metadata.name].append(split)

  # Create new TaskResult
  new_results = TaskResult.from_task_results(
  task,
- task_results,
+ task_results, # type: ignore[arg-type]
  evaluation_time=evaluation_time,
  kg_co2_emissions=kg_co2_emissions,
  )

  # Merge with existing if needed
- if output_path and save_path.exists():
+ if output_path and save_path and save_path.exists():
  existing_results = TaskResult.from_disk(save_path)
  if existing_results:
  merged_results = self._merge_results(existing_results, new_results)
  else:
  merged_results = new_results

- if output_path:
+ if output_path and save_path:
  merged_results.to_disk(save_path)

  evaluation_results.append(merged_results)
@@ -555,7 +565,7 @@
  def create_model_meta(model: MTEBModels) -> ModelMeta:
  """Create a ModelMeta object for the given model."""
  if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None:
- meta = model.mteb_model_meta # type: ignore
+ meta = model.mteb_model_meta
  else:
  meta = MTEB._get_model_meta(model)

@@ -581,7 +591,11 @@
  if output_folder is None:
  return None

- model_revision: str = model_meta.revision # type: ignore
+ model_revision: str = (
+ model_meta.revision
+ if model_meta.revision is not None
+ else "no_revision_available"
+ )
  model_path_name = model_meta.model_name_as_path()

  output_path = Path(output_folder) / model_path_name / model_revision
@@ -603,15 +617,15 @@
  Tasks with empty lists indicate that results already existed and no splits were evaluated.
  """
  return deepcopy(
- {task: list(splits) for task, splits in self.last_evaluated_splits.items()}
+ {task: list(splits) for task, splits in self._last_evaluated_splits.items()}
  )

  @staticmethod
  def _get_missing_evaluations(
  existing_results: TaskResult | None,
- task_eval_splits: list[str],
- task_eval_langs: list[str],
- eval_subsets: list[str] | None,
+ task_eval_splits: Sequence[str],
+ task_eval_langs: Sequence[str],
+ eval_subsets: Sequence[str] | None,
  ) -> dict[str, dict[str, Any]]:
  """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing."""
  missing_evaluations = {
@@ -660,7 +674,7 @@
  return missing_evaluations

  @staticmethod
- def _get_model_meta(model: EncoderProtocol) -> ModelMeta:
+ def _get_model_meta(model: MTEBModels) -> ModelMeta:
  from sentence_transformers import CrossEncoder, SentenceTransformer

  if isinstance(model, CrossEncoder):