mteb 2.5.3__py3-none-any.whl → 2.5.5__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +27 -21
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +3 -16
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +20 -16
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +4 -2
  32. mteb/benchmarks/benchmarks/benchmarks.py +22 -1
  33. mteb/benchmarks/get_benchmark.py +14 -55
  34. mteb/cache.py +21 -18
  35. mteb/cli/_display_tasks.py +2 -2
  36. mteb/cli/build_cli.py +8 -8
  37. mteb/cli/generate_model_card.py +39 -20
  38. mteb/deprecated_evaluator.py +56 -43
  39. mteb/evaluate.py +35 -29
  40. mteb/filter_tasks.py +25 -26
  41. mteb/get_tasks.py +25 -27
  42. mteb/languages/language_scripts.py +5 -3
  43. mteb/leaderboard/app.py +1 -1
  44. mteb/load_results.py +12 -12
  45. mteb/models/abs_encoder.py +2 -2
  46. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  47. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  48. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
  49. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
  50. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  51. mteb/models/get_model_meta.py +8 -1
  52. mteb/models/instruct_wrapper.py +11 -5
  53. mteb/models/model_implementations/andersborges.py +2 -2
  54. mteb/models/model_implementations/blip_models.py +8 -8
  55. mteb/models/model_implementations/bm25.py +1 -1
  56. mteb/models/model_implementations/clip_models.py +3 -3
  57. mteb/models/model_implementations/cohere_models.py +1 -1
  58. mteb/models/model_implementations/cohere_v.py +2 -2
  59. mteb/models/model_implementations/dino_models.py +23 -23
  60. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  61. mteb/models/model_implementations/jina_clip.py +1 -1
  62. mteb/models/model_implementations/jina_models.py +1 -1
  63. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  64. mteb/models/model_implementations/llm2clip_models.py +3 -3
  65. mteb/models/model_implementations/moco_models.py +2 -2
  66. mteb/models/model_implementations/model2vec_models.py +1 -1
  67. mteb/models/model_implementations/nomic_models.py +8 -8
  68. mteb/models/model_implementations/openclip_models.py +7 -7
  69. mteb/models/model_implementations/random_baseline.py +3 -3
  70. mteb/models/model_implementations/rasgaard_models.py +1 -1
  71. mteb/models/model_implementations/repllama_models.py +2 -2
  72. mteb/models/model_implementations/rerankers_custom.py +3 -3
  73. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  74. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  75. mteb/models/model_implementations/siglip_models.py +10 -10
  76. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  77. mteb/models/model_implementations/voyage_v.py +4 -4
  78. mteb/models/model_meta.py +30 -14
  79. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
  80. mteb/models/search_wrappers.py +22 -10
  81. mteb/models/sentence_transformer_wrapper.py +9 -4
  82. mteb/py.typed +0 -0
  83. mteb/results/benchmark_results.py +25 -19
  84. mteb/results/model_result.py +49 -21
  85. mteb/results/task_result.py +45 -51
  86. mteb/similarity_functions.py +11 -7
  87. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  88. mteb/tasks/classification/est/estonian_valence.py +1 -1
  89. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  90. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  91. mteb/tasks/retrieval/code/code_rag.py +12 -12
  92. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  93. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  94. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  95. mteb/tasks/retrieval/nob/norquad.py +2 -2
  96. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  97. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  98. mteb/types/_result.py +2 -1
  99. mteb/types/statistics.py +9 -3
  100. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/METADATA +1 -1
  101. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/RECORD +105 -104
  102. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/WHEEL +0 -0
  103. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/entry_points.txt +0 -0
  104. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/licenses/LICENSE +0 -0
  105. {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/top_level.txt +0 -0
mteb/benchmarks/get_benchmark.py CHANGED
@@ -1,6 +1,5 @@
  import difflib
  import logging
- import warnings
  from functools import lru_cache

  from .benchmark import Benchmark
@@ -20,53 +19,16 @@ def _build_registry() -> dict[str, Benchmark]:
  return benchmark_registry


- def _get_previous_benchmark_names() -> dict[str, str]:
- from .benchmarks import (
- BRIGHT_LONG,
- C_MTEB,
- FA_MTEB,
- MTEB_DEU,
- MTEB_EN,
- MTEB_ENG_CLASSIC,
- MTEB_EU,
- MTEB_FRA,
- MTEB_INDIC,
- MTEB_JPN,
- MTEB_KOR,
- MTEB_MAIN_RU,
- MTEB_POL,
- MTEB_RETRIEVAL_LAW,
- MTEB_RETRIEVAL_MEDICAL,
- MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
- SEB,
- VISUAL_DOCUMENT_RETRIEVAL,
- MTEB_code,
- MTEB_multilingual_v2,
- )
-
- previous_benchmark_names = {
- "MTEB(eng)": MTEB_EN.name,
- "MTEB(eng, classic)": MTEB_ENG_CLASSIC.name,
- "MTEB(rus)": MTEB_MAIN_RU.name,
- "MTEB(Retrieval w/Instructions)": MTEB_RETRIEVAL_WITH_INSTRUCTIONS.name,
- "MTEB(law)": MTEB_RETRIEVAL_LAW.name,
- "MTEB(Medical)": MTEB_RETRIEVAL_MEDICAL.name,
- "MTEB(Scandinavian)": SEB.name,
- "MTEB(fra)": MTEB_FRA.name,
- "MTEB(deu)": MTEB_DEU.name,
- "MTEB(kor)": MTEB_KOR.name,
- "MTEB(pol)": MTEB_POL.name,
- "MTEB(code)": MTEB_code.name,
- "MTEB(Multilingual)": MTEB_multilingual_v2.name,
- "MTEB(jpn)": MTEB_JPN.name,
- "MTEB(Indic)": MTEB_INDIC.name,
- "MTEB(Europe)": MTEB_EU.name,
- "MTEB(Chinese)": C_MTEB.name,
- "FaMTEB(fas, beta)": FA_MTEB.name,
- "BRIGHT(long)": BRIGHT_LONG.name,
- "VisualDocumentRetrieval": VISUAL_DOCUMENT_RETRIEVAL.name,
- }
- return previous_benchmark_names
+ @lru_cache
+ def _build_aliases_registry() -> dict[str, Benchmark]:
+ import mteb.benchmarks.benchmarks as benchmark_module
+
+ aliases: dict[str, Benchmark] = {}
+ for _, inst in benchmark_module.__dict__.items():
+ if isinstance(inst, Benchmark) and inst.aliases is not None:
+ for alias in inst.aliases:
+ aliases[alias] = inst
+ return aliases


  def get_benchmark(
@@ -80,14 +42,11 @@ def get_benchmark(
  Returns:
  The Benchmark instance corresponding to the given name.
  """
- previous_benchmark_names = _get_previous_benchmark_names()
  benchmark_registry = _build_registry()
- if benchmark_name in previous_benchmark_names:
- warnings.warn(
- f"Using the previous benchmark name '{benchmark_name}' is deprecated. Please use '{previous_benchmark_names[benchmark_name]}' instead.",
- DeprecationWarning,
- )
- benchmark_name = previous_benchmark_names[benchmark_name]
+ aliases_registry = _build_aliases_registry()
+
+ if benchmark_name in aliases_registry:
+ return aliases_registry[benchmark_name]
  if benchmark_name not in benchmark_registry:
  close_matches = difflib.get_close_matches(
  benchmark_name, benchmark_registry.keys()
mteb/cache.py CHANGED
@@ -5,7 +5,7 @@ import shutil
  import subprocess
  import warnings
  from collections import defaultdict
- from collections.abc import Sequence
+ from collections.abc import Iterable, Sequence
  from pathlib import Path
  from typing import cast

@@ -291,8 +291,8 @@ class ResultCache:

  def get_cache_paths(
  self,
- models: Sequence[str] | Sequence[ModelMeta] | None = None,
- tasks: Sequence[str] | Sequence[AbsTask] | None = None,
+ models: Sequence[str] | Iterable[ModelMeta] | None = None,
+ tasks: Sequence[str] | Iterable[AbsTask] | None = None,
  require_model_meta: bool = True,
  include_remote: bool = True,
  ) -> list[Path]:
@@ -425,7 +425,7 @@ class ResultCache:
  @staticmethod
  def _filter_paths_by_model_and_revision(
  paths: list[Path],
- models: Sequence[str] | Sequence[ModelMeta] | None = None,
+ models: Sequence[str] | Iterable[ModelMeta] | None = None,
  ) -> list[Path]:
  """Filter a list of paths by model name and optional revision.

@@ -435,8 +435,9 @@
  if not models:
  return paths

- if isinstance(models[0], ModelMeta):
- models = cast(list[ModelMeta], models)
+ first_model = next(iter(models))
+ if isinstance(first_model, ModelMeta):
+ models = cast(Iterable[ModelMeta], models)
  name_and_revision = {
  (m.model_name_as_path(), m.revision or "no_revision_available")
  for m in models
@@ -447,13 +448,14 @@
  if (p.parent.parent.name, p.parent.name) in name_and_revision
  ]

- model_names = {m.replace("/", "__").replace(" ", "_") for m in models}
+ str_models = cast(Sequence[str], models)
+ model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
  return [p for p in paths if p.parent.parent.name in model_names]

  @staticmethod
  def _filter_paths_by_task(
  paths: list[Path],
- tasks: Sequence[str] | Sequence[AbsTask] | None = None,
+ tasks: Sequence[str] | Iterable[AbsTask] | None = None,
  ) -> list[Path]:
  if tasks is not None:
  task_names = set()
@@ -469,8 +471,8 @@

  def load_results(
  self,
- models: Sequence[str] | Sequence[ModelMeta] | None = None,
- tasks: Sequence[str] | Sequence[AbsTask] | Benchmark | str | None = None,
+ models: Sequence[str] | Iterable[ModelMeta] | None = None,
+ tasks: Sequence[str] | Iterable[AbsTask] | Benchmark | str | None = None,
  require_model_meta: bool = True,
  include_remote: bool = True,
  validate_and_filter: bool = False,
@@ -481,6 +483,7 @@
  Args:
  models: A list of model names to load the results for. If None it will load the results for all models.
  tasks: A list of task names to load the results for. If str is passed, then benchmark will be loaded.
+ If Benchmark is passed, then all tasks in the benchmark will be loaded.
  If None it will load the results for all tasks.
  require_model_meta: If True it will ignore results that do not have a model_meta.json file. If false it attempt to
  extract the model name and revision from the path.
@@ -514,7 +517,7 @@
  )
  models_results = defaultdict(list)

- task_names = {}
+ task_names: dict[str, AbsTask | None] = {}
  if tasks is not None:
  for task in tasks:
  if isinstance(task, AbsTask):
@@ -532,9 +535,11 @@
  )

  if validate_and_filter:
- task = task_names[task_result.task_name]
+ task_instance = task_names[task_result.task_name]
  try:
- task_result = task_result.validate_and_filter_scores(task=task)
+ task_result = task_result.validate_and_filter_scores(
+ task=task_instance
+ )
  except Exception as e:
  logger.info(
  f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
@@ -544,7 +549,7 @@
  models_results[(model_name, revision)].append(task_result)

  # create BenchmarkResults object
- models_results = [
+ models_results_object = [
  ModelResult(
  model_name=model_name,
  model_revision=revision,
@@ -553,9 +558,7 @@
  for (model_name, revision), task_results in models_results.items()
  ]

- benchmark_results = BenchmarkResults(
- model_results=models_results,
+ return BenchmarkResults(
+ model_results=models_results_object,
  benchmark=tasks if isinstance(tasks, Benchmark) else None,
  )
-
- return benchmark_results
mteb/cli/_display_tasks.py CHANGED
@@ -1,4 +1,4 @@
- from collections.abc import Sequence
+ from collections.abc import Iterable, Sequence

  from mteb.abstasks import AbsTask
  from mteb.benchmarks import Benchmark
@@ -31,7 +31,7 @@ def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
  _display_tasks(benchmark.tasks, name=name)


- def _display_tasks(task_list: Sequence[AbsTask], name: str | None = None) -> None:
+ def _display_tasks(task_list: Iterable[AbsTask], name: str | None = None) -> None:
  from rich.console import Console

  console = Console()
mteb/cli/build_cli.py CHANGED
@@ -8,12 +8,12 @@ import torch
  from rich.logging import RichHandler

  import mteb
+ from mteb.abstasks.abstask import AbsTask
  from mteb.cache import ResultCache
+ from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
  from mteb.cli.generate_model_card import generate_model_card
  from mteb.evaluate import OverwriteStrategy

- from ._display_tasks import _display_benchmarks, _display_tasks
-
  logger = logging.getLogger(__name__)


@@ -54,7 +54,7 @@ def run(args: argparse.Namespace) -> None:

  if args.benchmarks:
  benchmarks = mteb.get_benchmarks(names=args.benchmarks)
- tasks = [t for b in benchmarks for t in b.tasks]
+ tasks = tuple(t for b in benchmarks for t in b.tasks)
  else:
  tasks = mteb.get_tasks(
  categories=args.categories,
@@ -290,17 +290,17 @@ def _create_meta(args: argparse.Namespace) -> None:
  "Output path already exists, use --overwrite to overwrite."
  )

- tasks = []
+ benchmarks = None
+ tasks: list[AbsTask] = []
  if tasks_names is not None:
- tasks = mteb.get_tasks(tasks_names)
+ tasks = list(mteb.get_tasks(tasks_names))
  if benchmarks is not None:
  benchmarks = mteb.get_benchmarks(benchmarks)
- for benchmark in benchmarks:
- tasks.extend(benchmark.tasks)

  generate_model_card(
  model_name,
- tasks if len(tasks) > 0 else None,
+ tasks,
+ benchmarks,
  existing_model_card_id_or_path=from_existing,
  results_cache=ResultCache(results_folder),
  output_path=output_path,
mteb/cli/generate_model_card.py CHANGED
@@ -1,11 +1,12 @@
  import logging
  import warnings
+ from collections.abc import Sequence
  from pathlib import Path

  from huggingface_hub import ModelCard, ModelCardData, repo_exists

- from mteb import BenchmarkResults
  from mteb.abstasks.abstask import AbsTask
+ from mteb.benchmarks.benchmark import Benchmark
  from mteb.cache import ResultCache

  logger = logging.getLogger(__name__)
@@ -13,12 +14,13 @@ logger = logging.getLogger(__name__)

  def generate_model_card(
  model_name: str,
- tasks: list[AbsTask] | None = None,
+ tasks: Sequence[AbsTask] | None = None,
+ benchmarks: Sequence[Benchmark] | None = None,
  existing_model_card_id_or_path: str | Path | None = None,
  results_cache: ResultCache = ResultCache(),
  output_path: Path = Path("model_card.md"),
  add_table_to_model_card: bool = False,
- models_to_compare: list[str] | None = None,
+ models_to_compare: Sequence[str] | None = None,
  token: str | None = None,
  push_to_hub: bool = False,
  ) -> None:
@@ -27,6 +29,7 @@ def generate_model_card(
  Args:
  model_name: Name of the model.
  tasks: List of tasks to generate results for.
+ benchmarks: A Benchmark or list of benchmarks to generate results for.
  existing_model_card_id_or_path: Path or ID of an existing model card to update.
  results_cache: Instance of ResultCache to load results from.
  output_path: Path to save the generated model card.
@@ -40,16 +43,24 @@ def generate_model_card(
  if existing_model_card_id_or_path:
  existing_model_card = ModelCard.load(existing_model_card_id_or_path)

+ all_tasks: list[AbsTask] = []
+ if tasks is not None:
+ all_tasks.extend(tasks)
+
+ if benchmarks is not None:
+ for b in benchmarks:
+ all_tasks.extend(b.tasks)
+
  benchmark_results = results_cache.load_results(
- [model_name], tasks, only_main_score=True
+ [model_name], all_tasks if all_tasks else None, only_main_score=True
  )
  eval_results = []
  for models_results in benchmark_results.model_results:
  for task_result in models_results.task_results:
  eval_results.extend(task_result.get_hf_eval_results())

- existing_model_card_data = (
- existing_model_card.data if existing_model_card else ModelCardData()
+ existing_model_card_data: ModelCardData = (
+ existing_model_card.data if existing_model_card else ModelCardData() # type: ignore[assignment]
  )

  if existing_model_card_data.eval_results is None:
@@ -79,17 +90,16 @@
  card_data=existing_model_card_data
  )

- if models_to_compare:
- benchmark_results = results_cache.load_results(
- [model_name, *models_to_compare], tasks, only_main_score=True
- )
-
  if add_table_to_model_card:
  existing_model_card = _add_table_to_model_card(
- benchmark_results, existing_model_card
+ results_cache,
+ existing_model_card,
+ (model_name, *models_to_compare) if models_to_compare else (model_name,),
+ benchmarks or [],
  )

- if push_to_hub:
+ if push_to_hub and existing_model_card_id_or_path:
+ existing_model_card_id_or_path = str(existing_model_card_id_or_path)
  if repo_exists(existing_model_card_id_or_path):
  existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
  else:
@@ -100,14 +110,23 @@


  def _add_table_to_model_card(
- results: BenchmarkResults, model_card: ModelCard
+ results_cache: ResultCache,
+ model_card: ModelCard,
+ models: Sequence[str],
+ benchmarks: Sequence[Benchmark],
  ) -> ModelCard:
  original_content = model_card.content
- results_df = results.to_dataframe()
- results_df = results_df.set_index("task_name")
- mteb_content = f"""
- # MTEB results
- {results_df.to_markdown()}
- """
+ mteb_content = "# MTEB Results\n\n"
+
+ for benchmark in benchmarks:
+ mteb_content += f"## Benchmark: {benchmark.name}\n\n"
+ benchmark_results = results_cache.load_results(
+ tasks=benchmark,
+ models=models,
+ only_main_score=True,
+ )
+ df_results = benchmark_results.get_benchmark_result()
+ mteb_content += df_results.to_markdown(index=True) + "\n\n"
+
  model_card.content = original_content + "\n\n" + mteb_content
  return model_card
mteb/deprecated_evaluator.py CHANGED
@@ -6,23 +6,23 @@ import os
  import sys
  import traceback
  import warnings
- from collections.abc import Iterable
+ from collections.abc import Iterable, Sequence
  from copy import deepcopy
  from datetime import datetime
  from itertools import chain
  from pathlib import Path
  from time import time
- from typing import TYPE_CHECKING, Any
+ from typing import TYPE_CHECKING, Any, cast

  import datasets

  import mteb
  from mteb.abstasks import AbsTask
+ from mteb.abstasks.aggregated_task import AbsTaskAggregate
  from mteb.abstasks.task_metadata import TaskCategory, TaskType
  from mteb.benchmarks import Benchmark
  from mteb.models import (
  CrossEncoderWrapper,
- EncoderProtocol,
  ModelMeta,
  MTEBModels,
  SentenceTransformerEncoderWrapper,
@@ -53,7 +53,7 @@ class MTEB:
  )
  def __init__(
  self,
- tasks: Iterable[AbsTask | Benchmark],
+ tasks: Iterable[AbsTask] | Iterable[Benchmark],
  *,
  err_logs_path: str = "error_logs.txt",
  ) -> None:
@@ -64,15 +64,14 @@
  `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)").
  err_logs_path: Path to save error logs.
  """
- from mteb.benchmarks import Benchmark
-
- self.tasks = list(tasks)
- if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
+ if isinstance(next(iter(tasks)), Benchmark):
  self.benchmarks = tasks
- self.tasks = list(chain.from_iterable(self.tasks))
+ self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+ elif isinstance(next(iter(tasks)), AbsTask):
+ self.tasks = list(cast(Iterable[AbsTask], tasks))

  self.err_logs_path = Path(err_logs_path)
- self.last_evaluated_splits = {}
+ self._last_evaluated_splits: dict[str, list[str]] = {}

  @property
  def available_tasks(self) -> list[str]:
@@ -85,7 +84,7 @@
  return sorted({x.metadata.type for x in self.tasks})

  @property
- def available_task_categories(self) -> set[TaskCategory]:
+ def available_task_categories(self) -> set[TaskCategory | None]:
  """Set of available task categories."""
  return {x.metadata.category for x in self.tasks}

@@ -232,13 +231,14 @@
  merged_kg_co2_emissions = None
  if existing_kg_co2_emissions and new_kg_co2_emissions:
  merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions
+ existing_evaluation_time = existing_results.evaluation_time or 0
+ new_evaluation_time = new_results.evaluation_time or 0
  merged_results = TaskResult(
  dataset_revision=new_results.dataset_revision,
  task_name=new_results.task_name,
  mteb_version=new_results.mteb_version,
  scores=merged_scores,
- evaluation_time=existing_results.evaluation_time
- + new_results.evaluation_time,
+ evaluation_time=existing_evaluation_time + new_evaluation_time,
  kg_co2_emissions=merged_kg_co2_emissions,
  )

@@ -307,13 +307,16 @@
  elif verbosity == 3:
  datasets.logging.set_verbosity(logging.DEBUG)

- meta = self.create_model_meta(model)
- output_path = self._create_output_folder(meta, output_folder)
-
+ mteb_model: MTEBModels
  if isinstance(model, SentenceTransformer):
- model = SentenceTransformerEncoderWrapper(model)
+ mteb_model = SentenceTransformerEncoderWrapper(model)
  elif isinstance(model, CrossEncoder):
- model = CrossEncoderWrapper(model)
+ mteb_model = CrossEncoderWrapper(model)
+ else:
+ mteb_model = cast(MTEBModels, model)
+
+ meta = self.create_model_meta(mteb_model)
+ output_path = self._create_output_folder(meta, output_folder)

  # Disable co2_tracker for API models
  if "API" in meta.framework:
@@ -334,7 +337,7 @@
  ) # save them in case we re-use the object (e.g. for reranking)

  # To evaluate missing splits, we keep track of the task name and the corresponding splits.
- self.last_evaluated_splits = {}
+ self._last_evaluated_splits = {}

  while len(self.tasks) > 0:
  task = self.tasks[0]
@@ -343,9 +346,10 @@
  )

  if task.is_aggregate:
- self_ = MTEB(tasks=task.metadata.tasks)
- task_results = self_.run(
- model,
+ aggregated_task = cast(AbsTaskAggregate, task)
+ self_ = MTEB(tasks=aggregated_task.metadata.tasks)
+ aggregated_task_results = self_.run(
+ mteb_model,
  verbosity=verbosity - 1,
  output_folder=output_folder,
  eval_splits=eval_splits,
@@ -356,12 +360,15 @@
  encode_kwargs=encode_kwargs,
  **kwargs,
  )
- new_results = task.combine_task_results(task_results)
+ new_results = aggregated_task.combine_task_results(
+ aggregated_task_results
+ )
  evaluation_results.append(new_results)

  if output_path:
- save_path = output_path / f"{task.metadata.name}.json"
- new_results.to_disk(save_path)
+ new_results.to_disk(
+ output_path / f"{aggregated_task.metadata.name}.json"
+ )
  del self.tasks[0]
  continue

@@ -383,7 +390,7 @@
  task_subsets = task.hf_subsets

  existing_results = None
- save_path = None
+ save_path: Path | None = None
  final_splits_to_run = task_eval_splits
  missing_evaluations = self._get_missing_evaluations(
  existing_results,
@@ -433,7 +440,7 @@
  logger.info(
  f"No splits to evaluate for {task.metadata.name}. Skipping evaluation."
  )
- self.last_evaluated_splits[task.metadata.name] = []
+ self._last_evaluated_splits[task.metadata.name] = []
  del self.tasks[0]
  continue

@@ -441,11 +448,11 @@
  task.check_if_dataset_is_superseded()
  task.load_data()

- task_results = {}
+ task_results: dict[str, dict[str, dict[str, Any]]] = {}
  evaluation_time = 0
  kg_co2_emissions: int | None = 0 if co2_tracker else None

- self.last_evaluated_splits[task.metadata.name] = []
+ self._last_evaluated_splits[task.metadata.name] = []

  for split in final_splits_to_run:
  info = missing_evaluations[split]
@@ -466,7 +473,9 @@

  if co2_tracker:
  try:
- from codecarbon import EmissionsTracker
+ from codecarbon import ( # type: ignore[import-untyped]
+ EmissionsTracker,
+ )
  except ImportError:
  raise ImportError(
  "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions."
@@ -482,7 +491,7 @@
  ) as tracker:
  results, tick, tock = self._run_eval(
  task,
- model,
+ mteb_model,
  split,
  encode_kwargs=encode_kwargs,
  subsets_to_run=subsets_to_run,
@@ -495,7 +504,7 @@
  else:
  results, tick, tock = self._run_eval(
  task,
- model,
+ mteb_model,
  split,
  subsets_to_run=subsets_to_run,
  encode_kwargs=encode_kwargs,
@@ -511,25 +520,25 @@
  if verbosity >= 1:
  logger.info(f"Scores: {task_results[split]}")

- self.last_evaluated_splits[task.metadata.name].append(split)
+ self._last_evaluated_splits[task.metadata.name].append(split)

  # Create new TaskResult
  new_results = TaskResult.from_task_results(
  task,
- task_results,
+ task_results, # type: ignore[arg-type]
  evaluation_time=evaluation_time,
  kg_co2_emissions=kg_co2_emissions,
  )

  # Merge with existing if needed
- if output_path and save_path.exists():
+ if output_path and save_path and save_path.exists():
  existing_results = TaskResult.from_disk(save_path)
  if existing_results:
  merged_results = self._merge_results(existing_results, new_results)
  else:
  merged_results = new_results

- if output_path:
+ if output_path and save_path:
  merged_results.to_disk(save_path)

  evaluation_results.append(merged_results)
@@ -556,7 +565,7 @@
  def create_model_meta(model: MTEBModels) -> ModelMeta:
  """Create a ModelMeta object for the given model."""
  if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None:
- meta = model.mteb_model_meta # type: ignore
+ meta = model.mteb_model_meta
  else:
  meta = MTEB._get_model_meta(model)

@@ -582,7 +591,11 @@
  if output_folder is None:
  return None

- model_revision: str = model_meta.revision # type: ignore
+ model_revision: str = (
+ model_meta.revision
+ if model_meta.revision is not None
+ else "no_revision_available"
+ )
  model_path_name = model_meta.model_name_as_path()

  output_path = Path(output_folder) / model_path_name / model_revision
@@ -604,15 +617,15 @@
  Tasks with empty lists indicate that results already existed and no splits were evaluated.
  """
  return deepcopy(
- {task: list(splits) for task, splits in self.last_evaluated_splits.items()}
+ {task: list(splits) for task, splits in self._last_evaluated_splits.items()}
  )

  @staticmethod
  def _get_missing_evaluations(
  existing_results: TaskResult | None,
- task_eval_splits: list[str],
- task_eval_langs: list[str],
- eval_subsets: list[str] | None,
+ task_eval_splits: Sequence[str],
+ task_eval_langs: Sequence[str],
+ eval_subsets: Sequence[str] | None,
  ) -> dict[str, dict[str, Any]]:
  """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing."""
  missing_evaluations = {
@@ -661,7 +674,7 @@
  return missing_evaluations

  @staticmethod
- def _get_model_meta(model: EncoderProtocol) -> ModelMeta:
+ def _get_model_meta(model: MTEBModels) -> ModelMeta:
  from sentence_transformers import CrossEncoder, SentenceTransformer

  if isinstance(model, CrossEncoder):