mteb 2.5.3__py3-none-any.whl → 2.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +27 -21
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +3 -16
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +20 -16
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/benchmarks.py +22 -1
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +21 -18
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +8 -8
- mteb/cli/generate_model_card.py +39 -20
- mteb/deprecated_evaluator.py +56 -43
- mteb/evaluate.py +35 -29
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +25 -27
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +2 -2
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +30 -14
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
- mteb/models/search_wrappers.py +22 -10
- mteb/models/sentence_transformer_wrapper.py +9 -4
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +25 -19
- mteb/results/model_result.py +49 -21
- mteb/results/task_result.py +45 -51
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/METADATA +1 -1
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/RECORD +105 -104
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/WHEEL +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/top_level.txt +0 -0
mteb/benchmarks/get_benchmark.py
CHANGED
@@ -1,6 +1,5 @@
 import difflib
 import logging
-import warnings
 from functools import lru_cache
 
 from .benchmark import Benchmark
@@ -20,53 +19,16 @@ def _build_registry() -> dict[str, Benchmark]:
     return benchmark_registry
 
 
-
-
-
-
-
-
-
-
-
-
-        MTEB_INDIC,
-        MTEB_JPN,
-        MTEB_KOR,
-        MTEB_MAIN_RU,
-        MTEB_POL,
-        MTEB_RETRIEVAL_LAW,
-        MTEB_RETRIEVAL_MEDICAL,
-        MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
-        SEB,
-        VISUAL_DOCUMENT_RETRIEVAL,
-        MTEB_code,
-        MTEB_multilingual_v2,
-    )
-
-    previous_benchmark_names = {
-        "MTEB(eng)": MTEB_EN.name,
-        "MTEB(eng, classic)": MTEB_ENG_CLASSIC.name,
-        "MTEB(rus)": MTEB_MAIN_RU.name,
-        "MTEB(Retrieval w/Instructions)": MTEB_RETRIEVAL_WITH_INSTRUCTIONS.name,
-        "MTEB(law)": MTEB_RETRIEVAL_LAW.name,
-        "MTEB(Medical)": MTEB_RETRIEVAL_MEDICAL.name,
-        "MTEB(Scandinavian)": SEB.name,
-        "MTEB(fra)": MTEB_FRA.name,
-        "MTEB(deu)": MTEB_DEU.name,
-        "MTEB(kor)": MTEB_KOR.name,
-        "MTEB(pol)": MTEB_POL.name,
-        "MTEB(code)": MTEB_code.name,
-        "MTEB(Multilingual)": MTEB_multilingual_v2.name,
-        "MTEB(jpn)": MTEB_JPN.name,
-        "MTEB(Indic)": MTEB_INDIC.name,
-        "MTEB(Europe)": MTEB_EU.name,
-        "MTEB(Chinese)": C_MTEB.name,
-        "FaMTEB(fas, beta)": FA_MTEB.name,
-        "BRIGHT(long)": BRIGHT_LONG.name,
-        "VisualDocumentRetrieval": VISUAL_DOCUMENT_RETRIEVAL.name,
-    }
-    return previous_benchmark_names
+@lru_cache
+def _build_aliases_registry() -> dict[str, Benchmark]:
+    import mteb.benchmarks.benchmarks as benchmark_module
+
+    aliases: dict[str, Benchmark] = {}
+    for _, inst in benchmark_module.__dict__.items():
+        if isinstance(inst, Benchmark) and inst.aliases is not None:
+            for alias in inst.aliases:
+                aliases[alias] = inst
+    return aliases
 
 
 def get_benchmark(
@@ -80,14 +42,11 @@ def get_benchmark(
     Returns:
         The Benchmark instance corresponding to the given name.
     """
-    previous_benchmark_names = _get_previous_benchmark_names()
     benchmark_registry = _build_registry()
-
-
-
-
-        )
-        benchmark_name = previous_benchmark_names[benchmark_name]
+    aliases_registry = _build_aliases_registry()
+
+    if benchmark_name in aliases_registry:
+        return aliases_registry[benchmark_name]
     if benchmark_name not in benchmark_registry:
         close_matches = difflib.get_close_matches(
             benchmark_name, benchmark_registry.keys()
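Note: the change above removes the hard-coded previous_benchmark_names mapping and resolves legacy names through Benchmark.aliases via the cached _build_aliases_registry(). A minimal usage sketch, assuming the legacy name below is still registered as an alias in 2.5.5:

    import mteb

    # Legacy benchmark names now resolve through the alias registry built from
    # Benchmark.aliases, rather than a hand-maintained dict in get_benchmark.py.
    benchmark = mteb.get_benchmark("MTEB(eng, classic)")  # assumed alias
    print(benchmark.name)     # canonical name of the resolved Benchmark
    print(benchmark.aliases)  # aliases declared on the Benchmark instance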
mteb/cache.py
CHANGED
@@ -5,7 +5,7 @@ import shutil
 import subprocess
 import warnings
 from collections import defaultdict
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from pathlib import Path
 from typing import cast
 
@@ -291,8 +291,8 @@ class ResultCache:
 
     def get_cache_paths(
         self,
-        models: Sequence[str] |
-        tasks: Sequence[str] |
+        models: Sequence[str] | Iterable[ModelMeta] | None = None,
+        tasks: Sequence[str] | Iterable[AbsTask] | None = None,
         require_model_meta: bool = True,
         include_remote: bool = True,
     ) -> list[Path]:
@@ -425,7 +425,7 @@ class ResultCache:
     @staticmethod
     def _filter_paths_by_model_and_revision(
         paths: list[Path],
-        models: Sequence[str] |
+        models: Sequence[str] | Iterable[ModelMeta] | None = None,
     ) -> list[Path]:
         """Filter a list of paths by model name and optional revision.
 
@@ -435,8 +435,9 @@ class ResultCache:
         if not models:
             return paths
 
-
-
+        first_model = next(iter(models))
+        if isinstance(first_model, ModelMeta):
+            models = cast(Iterable[ModelMeta], models)
             name_and_revision = {
                 (m.model_name_as_path(), m.revision or "no_revision_available")
                 for m in models
@@ -447,13 +448,14 @@ class ResultCache:
                 if (p.parent.parent.name, p.parent.name) in name_and_revision
             ]
 
-
+        str_models = cast(Sequence[str], models)
+        model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
         return [p for p in paths if p.parent.parent.name in model_names]
 
     @staticmethod
     def _filter_paths_by_task(
         paths: list[Path],
-        tasks: Sequence[str] |
+        tasks: Sequence[str] | Iterable[AbsTask] | None = None,
     ) -> list[Path]:
         if tasks is not None:
             task_names = set()
@@ -469,8 +471,8 @@ class ResultCache:
 
     def load_results(
         self,
-        models: Sequence[str] |
-        tasks: Sequence[str] |
+        models: Sequence[str] | Iterable[ModelMeta] | None = None,
+        tasks: Sequence[str] | Iterable[AbsTask] | Benchmark | str | None = None,
         require_model_meta: bool = True,
         include_remote: bool = True,
         validate_and_filter: bool = False,
@@ -481,6 +483,7 @@ class ResultCache:
         Args:
             models: A list of model names to load the results for. If None it will load the results for all models.
             tasks: A list of task names to load the results for. If str is passed, then benchmark will be loaded.
+                If Benchmark is passed, then all tasks in the benchmark will be loaded.
                 If None it will load the results for all tasks.
             require_model_meta: If True it will ignore results that do not have a model_meta.json file. If false it attempt to
                 extract the model name and revision from the path.
@@ -514,7 +517,7 @@ class ResultCache:
         )
         models_results = defaultdict(list)
 
-        task_names = {}
+        task_names: dict[str, AbsTask | None] = {}
         if tasks is not None:
             for task in tasks:
                 if isinstance(task, AbsTask):
@@ -532,9 +535,11 @@ class ResultCache:
                     )
 
                 if validate_and_filter:
-
+                    task_instance = task_names[task_result.task_name]
                     try:
-                        task_result = task_result.validate_and_filter_scores(
+                        task_result = task_result.validate_and_filter_scores(
+                            task=task_instance
+                        )
                     except Exception as e:
                         logger.info(
                             f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
@@ -544,7 +549,7 @@ class ResultCache:
                 models_results[(model_name, revision)].append(task_result)
 
         # create BenchmarkResults object
-
+        models_results_object = [
             ModelResult(
                 model_name=model_name,
                 model_revision=revision,
@@ -553,9 +558,7 @@ class ResultCache:
             for (model_name, revision), task_results in models_results.items()
         ]
 
-
-            model_results=
+        return BenchmarkResults(
+            model_results=models_results_object,
             benchmark=tasks if isinstance(tasks, Benchmark) else None,
         )
-
-        return benchmark_results
mteb/cli/_display_tasks.py
CHANGED
@@ -1,4 +1,4 @@
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 
 from mteb.abstasks import AbsTask
 from mteb.benchmarks import Benchmark
@@ -31,7 +31,7 @@ def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
         _display_tasks(benchmark.tasks, name=name)
 
 
-def _display_tasks(task_list:
+def _display_tasks(task_list: Iterable[AbsTask], name: str | None = None) -> None:
     from rich.console import Console
 
     console = Console()
mteb/cli/build_cli.py
CHANGED
@@ -8,12 +8,12 @@ import torch
 from rich.logging import RichHandler
 
 import mteb
+from mteb.abstasks.abstask import AbsTask
 from mteb.cache import ResultCache
+from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
 from mteb.cli.generate_model_card import generate_model_card
 from mteb.evaluate import OverwriteStrategy
 
-from ._display_tasks import _display_benchmarks, _display_tasks
-
 logger = logging.getLogger(__name__)
 
 
@@ -54,7 +54,7 @@ def run(args: argparse.Namespace) -> None:
 
     if args.benchmarks:
         benchmarks = mteb.get_benchmarks(names=args.benchmarks)
-        tasks =
+        tasks = tuple(t for b in benchmarks for t in b.tasks)
     else:
         tasks = mteb.get_tasks(
             categories=args.categories,
@@ -290,17 +290,17 @@ def _create_meta(args: argparse.Namespace) -> None:
             "Output path already exists, use --overwrite to overwrite."
         )
 
-
+    benchmarks = None
+    tasks: list[AbsTask] = []
     if tasks_names is not None:
-        tasks = mteb.get_tasks(tasks_names)
+        tasks = list(mteb.get_tasks(tasks_names))
     if benchmarks is not None:
         benchmarks = mteb.get_benchmarks(benchmarks)
-        for benchmark in benchmarks:
-            tasks.extend(benchmark.tasks)
 
     generate_model_card(
         model_name,
-        tasks
+        tasks,
+        benchmarks,
         existing_model_card_id_or_path=from_existing,
         results_cache=ResultCache(results_folder),
         output_path=output_path,
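Note: the run command now flattens benchmark objects into their task lists itself. The equivalent library-level calls, mirroring the hunk above (the benchmark name is illustrative):

    import mteb

    benchmarks = mteb.get_benchmarks(names=["MTEB(Multilingual, v2)"])  # illustrative name
    tasks = tuple(t for b in benchmarks for t in b.tasks)  # same flattening as the CLI
    print(len(tasks))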
mteb/cli/generate_model_card.py
CHANGED
@@ -1,11 +1,12 @@
 import logging
 import warnings
+from collections.abc import Sequence
 from pathlib import Path
 
 from huggingface_hub import ModelCard, ModelCardData, repo_exists
 
-from mteb import BenchmarkResults
 from mteb.abstasks.abstask import AbsTask
+from mteb.benchmarks.benchmark import Benchmark
 from mteb.cache import ResultCache
 
 logger = logging.getLogger(__name__)
@@ -13,12 +14,13 @@ logger = logging.getLogger(__name__)
 
 def generate_model_card(
     model_name: str,
-    tasks:
+    tasks: Sequence[AbsTask] | None = None,
+    benchmarks: Sequence[Benchmark] | None = None,
     existing_model_card_id_or_path: str | Path | None = None,
     results_cache: ResultCache = ResultCache(),
     output_path: Path = Path("model_card.md"),
     add_table_to_model_card: bool = False,
-    models_to_compare:
+    models_to_compare: Sequence[str] | None = None,
     token: str | None = None,
     push_to_hub: bool = False,
 ) -> None:
@@ -27,6 +29,7 @@ def generate_model_card(
     Args:
         model_name: Name of the model.
        tasks: List of tasks to generate results for.
+        benchmarks: A Benchmark or list of benchmarks to generate results for.
        existing_model_card_id_or_path: Path or ID of an existing model card to update.
        results_cache: Instance of ResultCache to load results from.
        output_path: Path to save the generated model card.
@@ -40,16 +43,24 @@ def generate_model_card(
     if existing_model_card_id_or_path:
         existing_model_card = ModelCard.load(existing_model_card_id_or_path)
 
+    all_tasks: list[AbsTask] = []
+    if tasks is not None:
+        all_tasks.extend(tasks)
+
+    if benchmarks is not None:
+        for b in benchmarks:
+            all_tasks.extend(b.tasks)
+
     benchmark_results = results_cache.load_results(
-        [model_name],
+        [model_name], all_tasks if all_tasks else None, only_main_score=True
     )
     eval_results = []
     for models_results in benchmark_results.model_results:
         for task_result in models_results.task_results:
             eval_results.extend(task_result.get_hf_eval_results())
 
-    existing_model_card_data = (
-        existing_model_card.data if existing_model_card else ModelCardData()
+    existing_model_card_data: ModelCardData = (
+        existing_model_card.data if existing_model_card else ModelCardData()  # type: ignore[assignment]
     )
 
     if existing_model_card_data.eval_results is None:
@@ -79,17 +90,16 @@ def generate_model_card(
         card_data=existing_model_card_data
     )
 
-    if models_to_compare:
-        benchmark_results = results_cache.load_results(
-            [model_name, *models_to_compare], tasks, only_main_score=True
-        )
-
     if add_table_to_model_card:
         existing_model_card = _add_table_to_model_card(
-
+            results_cache,
+            existing_model_card,
+            (model_name, *models_to_compare) if models_to_compare else (model_name,),
+            benchmarks or [],
         )
 
-    if push_to_hub:
+    if push_to_hub and existing_model_card_id_or_path:
+        existing_model_card_id_or_path = str(existing_model_card_id_or_path)
         if repo_exists(existing_model_card_id_or_path):
             existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
         else:
@@ -100,14 +110,23 @@ def generate_model_card(
 
 
 def _add_table_to_model_card(
-
+    results_cache: ResultCache,
+    model_card: ModelCard,
+    models: Sequence[str],
+    benchmarks: Sequence[Benchmark],
 ) -> ModelCard:
     original_content = model_card.content
-
-
-
-
-
-
+    mteb_content = "# MTEB Results\n\n"
+
+    for benchmark in benchmarks:
+        mteb_content += f"## Benchmark: {benchmark.name}\n\n"
+        benchmark_results = results_cache.load_results(
+            tasks=benchmark,
+            models=models,
+            only_main_score=True,
+        )
+        df_results = benchmark_results.get_benchmark_result()
+        mteb_content += df_results.to_markdown(index=True) + "\n\n"
+
     model_card.content = original_content + "\n\n" + mteb_content
     return model_card
mteb/deprecated_evaluator.py
CHANGED
@@ -6,23 +6,23 @@ import os
 import sys
 import traceback
 import warnings
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
 from copy import deepcopy
 from datetime import datetime
 from itertools import chain
 from pathlib import Path
 from time import time
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
 import datasets
 
 import mteb
 from mteb.abstasks import AbsTask
+from mteb.abstasks.aggregated_task import AbsTaskAggregate
 from mteb.abstasks.task_metadata import TaskCategory, TaskType
 from mteb.benchmarks import Benchmark
 from mteb.models import (
     CrossEncoderWrapper,
-    EncoderProtocol,
     ModelMeta,
     MTEBModels,
     SentenceTransformerEncoderWrapper,
@@ -53,7 +53,7 @@ class MTEB:
     )
     def __init__(
         self,
-        tasks: Iterable[AbsTask | Benchmark],
+        tasks: Iterable[AbsTask] | Iterable[Benchmark],
         *,
         err_logs_path: str = "error_logs.txt",
     ) -> None:
@@ -64,15 +64,14 @@ class MTEB:
                 `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)").
             err_logs_path: Path to save error logs.
         """
-
-
-        self.tasks = list(tasks)
-        if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
+        if isinstance(next(iter(tasks)), Benchmark):
             self.benchmarks = tasks
-            self.tasks = list(chain.from_iterable(
+            self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+        elif isinstance(next(iter(tasks)), AbsTask):
+            self.tasks = list(cast(Iterable[AbsTask], tasks))
 
         self.err_logs_path = Path(err_logs_path)
-        self.
+        self._last_evaluated_splits: dict[str, list[str]] = {}
 
     @property
     def available_tasks(self) -> list[str]:
@@ -85,7 +84,7 @@ class MTEB:
         return sorted({x.metadata.type for x in self.tasks})
 
     @property
-    def available_task_categories(self) -> set[TaskCategory]:
+    def available_task_categories(self) -> set[TaskCategory | None]:
         """Set of available task categories."""
         return {x.metadata.category for x in self.tasks}
 
@@ -232,13 +231,14 @@ class MTEB:
         merged_kg_co2_emissions = None
         if existing_kg_co2_emissions and new_kg_co2_emissions:
             merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions
+        existing_evaluation_time = existing_results.evaluation_time or 0
+        new_evaluation_time = new_results.evaluation_time or 0
         merged_results = TaskResult(
             dataset_revision=new_results.dataset_revision,
             task_name=new_results.task_name,
             mteb_version=new_results.mteb_version,
             scores=merged_scores,
-            evaluation_time=
-            + new_results.evaluation_time,
+            evaluation_time=existing_evaluation_time + new_evaluation_time,
             kg_co2_emissions=merged_kg_co2_emissions,
         )
 
@@ -307,13 +307,16 @@ class MTEB:
         elif verbosity == 3:
             datasets.logging.set_verbosity(logging.DEBUG)
 
-
-        output_path = self._create_output_folder(meta, output_folder)
-
+        mteb_model: MTEBModels
         if isinstance(model, SentenceTransformer):
-
+            mteb_model = SentenceTransformerEncoderWrapper(model)
         elif isinstance(model, CrossEncoder):
-
+            mteb_model = CrossEncoderWrapper(model)
+        else:
+            mteb_model = cast(MTEBModels, model)
+
+        meta = self.create_model_meta(mteb_model)
+        output_path = self._create_output_folder(meta, output_folder)
 
         # Disable co2_tracker for API models
         if "API" in meta.framework:
@@ -334,7 +337,7 @@ class MTEB:
         ) # save them in case we re-use the object (e.g. for reranking)
 
         # To evaluate missing splits, we keep track of the task name and the corresponding splits.
-        self.
+        self._last_evaluated_splits = {}
 
         while len(self.tasks) > 0:
             task = self.tasks[0]
@@ -343,9 +346,10 @@ class MTEB:
             )
 
             if task.is_aggregate:
-
-
-
+                aggregated_task = cast(AbsTaskAggregate, task)
+                self_ = MTEB(tasks=aggregated_task.metadata.tasks)
+                aggregated_task_results = self_.run(
+                    mteb_model,
                     verbosity=verbosity - 1,
                     output_folder=output_folder,
                     eval_splits=eval_splits,
@@ -356,12 +360,15 @@ class MTEB:
                     encode_kwargs=encode_kwargs,
                     **kwargs,
                 )
-                new_results =
+                new_results = aggregated_task.combine_task_results(
+                    aggregated_task_results
+                )
                 evaluation_results.append(new_results)
 
                 if output_path:
-
-
+                    new_results.to_disk(
+                        output_path / f"{aggregated_task.metadata.name}.json"
+                    )
                 del self.tasks[0]
                 continue
 
@@ -383,7 +390,7 @@ class MTEB:
             task_subsets = task.hf_subsets
 
             existing_results = None
-            save_path = None
+            save_path: Path | None = None
             final_splits_to_run = task_eval_splits
             missing_evaluations = self._get_missing_evaluations(
                 existing_results,
@@ -433,7 +440,7 @@ class MTEB:
                 logger.info(
                     f"No splits to evaluate for {task.metadata.name}. Skipping evaluation."
                 )
-                self.
+                self._last_evaluated_splits[task.metadata.name] = []
                 del self.tasks[0]
                 continue
 
@@ -441,11 +448,11 @@ class MTEB:
             task.check_if_dataset_is_superseded()
             task.load_data()
 
-            task_results = {}
+            task_results: dict[str, dict[str, dict[str, Any]]] = {}
             evaluation_time = 0
             kg_co2_emissions: int | None = 0 if co2_tracker else None
 
-            self.
+            self._last_evaluated_splits[task.metadata.name] = []
 
             for split in final_splits_to_run:
                 info = missing_evaluations[split]
@@ -466,7 +473,9 @@ class MTEB:
 
                 if co2_tracker:
                     try:
-                        from codecarbon import
+                        from codecarbon import (  # type: ignore[import-untyped]
+                            EmissionsTracker,
+                        )
                     except ImportError:
                         raise ImportError(
                             "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions."
@@ -482,7 +491,7 @@ class MTEB:
                     ) as tracker:
                         results, tick, tock = self._run_eval(
                             task,
-
+                            mteb_model,
                             split,
                             encode_kwargs=encode_kwargs,
                             subsets_to_run=subsets_to_run,
@@ -495,7 +504,7 @@ class MTEB:
                 else:
                     results, tick, tock = self._run_eval(
                         task,
-
+                        mteb_model,
                         split,
                         subsets_to_run=subsets_to_run,
                         encode_kwargs=encode_kwargs,
@@ -511,25 +520,25 @@ class MTEB:
                 if verbosity >= 1:
                     logger.info(f"Scores: {task_results[split]}")
 
-                self.
+                self._last_evaluated_splits[task.metadata.name].append(split)
 
             # Create new TaskResult
             new_results = TaskResult.from_task_results(
                 task,
-                task_results,
+                task_results,  # type: ignore[arg-type]
                 evaluation_time=evaluation_time,
                 kg_co2_emissions=kg_co2_emissions,
             )
 
             # Merge with existing if needed
-            if output_path and save_path.exists():
+            if output_path and save_path and save_path.exists():
                 existing_results = TaskResult.from_disk(save_path)
                 if existing_results:
                     merged_results = self._merge_results(existing_results, new_results)
             else:
                 merged_results = new_results
 
-            if output_path:
+            if output_path and save_path:
                 merged_results.to_disk(save_path)
 
             evaluation_results.append(merged_results)
@@ -556,7 +565,7 @@ class MTEB:
     def create_model_meta(model: MTEBModels) -> ModelMeta:
         """Create a ModelMeta object for the given model."""
         if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None:
-            meta = model.mteb_model_meta
+            meta = model.mteb_model_meta
         else:
             meta = MTEB._get_model_meta(model)
 
@@ -582,7 +591,11 @@ class MTEB:
         if output_folder is None:
             return None
 
-        model_revision: str =
+        model_revision: str = (
+            model_meta.revision
+            if model_meta.revision is not None
+            else "no_revision_available"
+        )
         model_path_name = model_meta.model_name_as_path()
 
         output_path = Path(output_folder) / model_path_name / model_revision
@@ -604,15 +617,15 @@ class MTEB:
             Tasks with empty lists indicate that results already existed and no splits were evaluated.
         """
         return deepcopy(
-            {task: list(splits) for task, splits in self.
+            {task: list(splits) for task, splits in self._last_evaluated_splits.items()}
        )
 
     @staticmethod
     def _get_missing_evaluations(
         existing_results: TaskResult | None,
-        task_eval_splits:
-        task_eval_langs:
-        eval_subsets:
+        task_eval_splits: Sequence[str],
+        task_eval_langs: Sequence[str],
+        eval_subsets: Sequence[str] | None,
     ) -> dict[str, dict[str, Any]]:
         """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing."""
         missing_evaluations = {
@@ -661,7 +674,7 @@ class MTEB:
         return missing_evaluations
 
     @staticmethod
-    def _get_model_meta(model:
+    def _get_model_meta(model: MTEBModels) -> ModelMeta:
         from sentence_transformers import CrossEncoder, SentenceTransformer
 
         if isinstance(model, CrossEncoder):