PyPI - mteb - Versions diffs - 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl - Mend

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (527) hide show

mteb/results/model_result.py CHANGED Viewed

@@ -1,12 +1,14 @@
+from __future__ import annotations
 import logging
 import warnings
-from collections.abc import Callable, Iterable, Sequence
-from typing import Any, Literal
+from collections.abc import Callable, Iterable
+from typing import Any, Literal, cast
 import numpy as np
 import pandas as pd
 from pydantic import BaseModel, ConfigDict, Field
-from typing_extensions import Self
+from typing_extensions import overload
 from mteb.abstasks.abstask import AbsTask
 from mteb.abstasks.task_metadata import (
@@ -22,7 +24,7 @@ from mteb.types import (
     SplitName,
 )
-from .task_result import TaskResult
+from .task_result import TaskError, TaskResult
 logger = logging.getLogger(__name__)
@@ -30,7 +32,7 @@ logger = logging.getLogger(__name__)
 def _aggregate_and_pivot(
     df: pd.DataFrame,
     columns: list[str],
-    aggregation_level: Literal["subset", "split", "task"],
+    aggregation_level: Literal["subset", "split", "task", "language"],
     format: Literal["wide", "long"],
     aggregation_fn: Callable[[list[Score]], Any] | None,
 ) -> pd.DataFrame:
@@ -43,6 +45,12 @@ def _aggregate_and_pivot(
     elif aggregation_level == "task":
         index_columns = ["task_name"]
+    elif aggregation_level == "language":
+        index_columns = ["language"]
+        df = df.explode("language").reset_index(
+            drop=True
+        )  # each language in its own row before aggregation
     # perform aggregation
     if aggregation_fn is None:
         aggregation_fn = np.mean
@@ -52,7 +60,7 @@ def _aggregate_and_pivot(
             index=index_columns,
             columns=columns,
             values="score",
-            aggfunc=aggregation_fn,
+            aggfunc=aggregation_fn,  # type: ignore[arg-type]
         ).reset_index()
     elif format == "long":
         return (
@@ -75,29 +83,31 @@ class ModelResult(BaseModel):
     model_revision: str | None
     task_results: list[TaskResult]
     default_modalities: list[Modalities] = Field(
-        default_factory=lambda: ["text"], alias="modalities"
+        default_factory=lambda: [cast(Modalities, "text")], alias="modalities"
     )
     model_config = (
         ConfigDict(  # to free up the name model_* which is otherwise protected
             protected_namespaces=(),
         )
     )
+    exceptions: list[TaskError] | None = None
     def __repr__(self) -> str:
         n_entries = len(self.task_results)
         return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))"
     @classmethod
-    def from_validated(cls, **data: dict[str, Any]) -> Self:
+    def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
         """Create a ModelResult from validated data.
         Args:
             data: The validated data.
         """
-        data["task_results"] = [
-            TaskResult.from_validated(**res) for res in data["task_results"]
+        data["task_results"] = [  # type: ignore[assignment]
+            TaskResult.from_validated(**res)  # type: ignore[arg-type]
+            for res in data["task_results"]
         ]
-        return cls.model_construct(**data)
+        return cls.model_construct(**data)  # type: ignore[arg-type]
     def _filter_tasks(
         self,
@@ -107,7 +117,7 @@ class ModelResult(BaseModel):
         task_types: list[TaskType] | None = None,
         modalities: list[Modalities] | None = None,
         is_public: bool | None = None,
-    ) -> Self:
+    ) -> ModelResult:
         new_task_results = []
         for task_result in self.task_results:
             if (task_names is not None) and (task_result.task_name not in task_names):
@@ -135,7 +145,7 @@ class ModelResult(BaseModel):
             task_results=new_task_results,
         )
-    def select_tasks(self, tasks: Sequence[AbsTask]) -> Self:
+    def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
         """Select tasks from the ModelResult based on a list of AbsTask objects.
         Args:
@@ -153,6 +163,28 @@ class ModelResult(BaseModel):
             task_results=new_task_results,
         )
+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["wide"] = "wide",
+    ) -> dict: ...
+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["long"] = "long",
+    ) -> list: ...
     def _get_scores(
         self,
         splits: list[SplitName] | None = None,
@@ -170,21 +202,24 @@ class ModelResult(BaseModel):
             aggregation = aggregation if aggregation is not None else np.mean
         else:
             use_fast = True
+        aggregation = cast(Callable[[list[Score]], Any], aggregation)
+        getter = cast(Callable[[ScoresDict], Score], getter)
         if format == "wide":
             scores = {}
             for res in self.task_results:
                 try:
                     if use_fast:
                         scores[res.task_name] = res._get_score_fast(
-                            splits=splits,  # type: ignore
-                            languages=languages,  # type: ignore
+                            splits=splits,
+                            languages=languages,
                         )
                     else:
                         scores[res.task_name] = res.get_score(
                             splits=splits,
                             languages=languages,
-                            aggregation=aggregation,  # type: ignore
-                            getter=getter,  # type: ignore
+                            aggregation=aggregation,
+                            getter=getter,
                             scripts=scripts,
                         )
                 except Exception as e:
@@ -199,14 +234,14 @@ class ModelResult(BaseModel):
                     if use_fast:
                         score = task_res._get_score_fast(
                             splits=splits,
-                            languages=languages,  # type: ignore
+                            languages=languages,
                         )
                     else:
                         score = task_res.get_score(
                             splits=splits,
                             languages=languages,
-                            aggregation=aggregation,  # type: ignore
-                            getter=getter,  # type: ignore
+                            aggregation=aggregation,
+                            getter=getter,
                             scripts=scripts,
                         )
                     entry = dict(
@@ -226,7 +261,7 @@ class ModelResult(BaseModel):
                     )
             return entries
-    def _get_score_for_table(self) -> list[dict[str, str | float]]:
+    def _get_score_for_table(self) -> list[dict[str, str | float | list[str]]]:
         scores_data = []
         model_name = self.model_name
         for task_result in self.task_results:
@@ -238,10 +273,10 @@ class ModelResult(BaseModel):
                         "model_revision": self.model_revision,
                         "task_name": task_name,
                         "split": split,
+                        "language": score_item.get("languages", ["Unknown"]),
                         "subset": score_item.get("hf_subset", "default"),
                         "score": score_item.get("main_score", None),
                     }
                     scores_data.append(row)
         return scores_data
@@ -285,7 +320,9 @@ class ModelResult(BaseModel):
         scores_data = self._get_score_for_table()
         if not scores_data:
-            logger.warning("No scores data available. Returning empty DataFrame.")
+            msg = "No scores data available. Returning empty DataFrame."
+            logger.warning(msg)
+            warnings.warn(msg)
             return pd.DataFrame()
         # Create DataFrame
@@ -308,7 +345,7 @@ class ModelResult(BaseModel):
     def __hash__(self) -> int:
         return id(self)
-    def __iter__(self) -> Iterable[TaskResult]:
+    def __iter__(self) -> Iterable[TaskResult]:  # type: ignore[override]
         return iter(self.task_results)
     def __getitem__(self, index) -> TaskResult:
@@ -361,13 +398,13 @@ class ModelResult(BaseModel):
         return [task_res.task_name for task_res in self.task_results]
     @property
-    def modalities(self) -> list[str]:
+    def modalities(self) -> list[Modalities]:
         """Get all modalities in the task results.
         Returns:
             A list of modalities in the task results.
         """
-        mods = []
+        mods: list[Modalities] = []
         for task_res in self.task_results:
             task_modalities = getattr(task_res, "modalities", [])
             mods.extend(task_modalities)

mteb/results/task_result.py CHANGED Viewed

@@ -2,9 +2,9 @@ from __future__ import annotations
 import json
 import logging
-from argparse import Namespace
+import warnings
 from collections import defaultdict
-from collections.abc import Callable, Iterable
+from collections.abc import Callable, Iterable, Mapping
 from functools import cached_property
 from importlib.metadata import version
 from pathlib import Path
@@ -16,8 +16,11 @@ from packaging.version import Version
 from pydantic import BaseModel, field_validator
 from typing_extensions import Self
+from mteb import TaskMetadata
 from mteb._helpful_enum import HelpfulStrEnum
+from mteb.abstasks import AbsTaskClassification
 from mteb.abstasks.abstask import AbsTask
+from mteb.abstasks.task_metadata import TaskDomain
 from mteb.languages import LanguageScripts
 from mteb.models.model_meta import ScoringFunction
 from mteb.types import (
@@ -39,67 +42,59 @@ class Criteria(HelpfulStrEnum):
     DATASET_REVISION = "dataset_revision"
-class ScalaNbClassificationDummy:
+class ScalaNbClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""
-    metadata = Namespace(  # type: ignore
+    metadata = TaskMetadata(
         name="ScalaNbClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-        hf_subsets_to_langscripts={
-            "default": ["nob-Latn"],
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["nob-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )
-class ScalaNnClassificationDummy:
+class ScalaNnClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""
-    metadata = Namespace(  # type: ignore
+    metadata = TaskMetadata(
         name="ScalaNnClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-        hf_subsets_to_langscripts={
-            "default": ["nno-Latn"],
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["nob-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )
-class ScalaDaClassificationDummy:
+class ScalaDaClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""
-    metadata = Namespace(  # type: ignore
+    metadata = TaskMetadata(
         name="ScalaDaClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-        hf_subsets_to_langscripts={
-            "default": ["dan-Latn"],
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["dan-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )
-class ScalaSvClassificationDummy:
+class ScalaSvClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""
-    metadata = Namespace(  # type: ignore
+    metadata = TaskMetadata(
         name="ScalaSvClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-        hf_subsets_to_langscripts={
-            "default": ["swe-Latn"],
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["swe-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )
-outdated_tasks = {
+outdated_tasks: dict[str, type[AbsTask]] = {
     "ScalaNbClassification": ScalaNbClassificationDummy,
     "ScalaNnClassification": ScalaNnClassificationDummy,
     "ScalaDaClassification": ScalaDaClassificationDummy,
@@ -166,10 +161,10 @@ class TaskResult(BaseModel):
     def from_task_results(
         cls,
         task: AbsTask | type[AbsTask],
-        scores: dict[SplitName, dict[HFSubset, ScoresDict]],
+        scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
         evaluation_time: float,
         kg_co2_emissions: float | None = None,
-    ) -> Self:
+    ) -> TaskResult:
         """Create a TaskResult from the task and scores.
         Args:
@@ -246,12 +241,12 @@ class TaskResult(BaseModel):
         return get_task(self.task_name)
     @property
-    def domains(self) -> list[str]:
+    def domains(self) -> list[TaskDomain]:
         """Get the domains of the task."""
         doms = self.task.metadata.domains
         if doms is None:
             doms = []
-        return doms  # type: ignore
+        return doms
     @property
     def task_type(self) -> str:
@@ -307,7 +302,7 @@ class TaskResult(BaseModel):
                     if isinstance(v, dict):
                         self._round_scores(v, n)
                     elif isinstance(v, float):
-                        value[i] = round(v, n)
+                        value[i] = round(v, n)  # type: ignore[call-overload]
             elif isinstance(value, float):
                 scores[key] = round(value, n)
@@ -325,7 +320,7 @@ class TaskResult(BaseModel):
             json.dump(json_obj, f, indent=2)
     @classmethod
-    def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self:  # type: ignore
+    def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
         """Load TaskResult from disk.
         Args:
@@ -356,7 +351,7 @@ class TaskResult(BaseModel):
         )  # assume it is before 1.11.0 if the version is not present
         try:
-            obj = cls.model_validate(data)
+            obj: TaskResult = cls.model_validate(data)
         except Exception as e:
             if not pre_1_11_load:
                 raise e
@@ -381,6 +376,7 @@ class TaskResult(BaseModel):
         from mteb import get_task
         task_name = obj.task_name
+        task: AbsTask | type[AbsTask]
         if task_name in outdated_tasks:
             task = outdated_tasks[task_name]
         else:
@@ -393,11 +389,11 @@ class TaskResult(BaseModel):
                     for key in list(hf_subset_scores.keys()):
                         if isinstance(hf_subset_scores[key], dict):
                             for k, v in hf_subset_scores[key].items():
-                                hf_subset_scores[f"{key}_{k}"] = v
-                            hf_subset_scores.pop(key)
+                                hf_subset_scores[f"{key}_{k}"] = v  # type: ignore[index]
+                            hf_subset_scores.pop(key)  # type: ignore[attr-defined]
     @classmethod
-    def _convert_from_before_v1_11_0(cls, data: dict) -> Self:
+    def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
         from mteb.get_tasks import _TASKS_REGISTRY
         # in case the task name is not found in the registry, try to find a lower case version
@@ -462,7 +458,9 @@ class TaskResult(BaseModel):
                     if main_score in hf_subset_scores:
                         hf_subset_scores["main_score"] = hf_subset_scores[main_score]
                     else:
-                        logger.warning(f"Main score {main_score} not found in scores")
+                        msg = f"Main score {main_score} not found in scores"
+                        logger.warning(msg)
+                        warnings.warn(msg)
                         hf_subset_scores["main_score"] = None
         # specific fixes:
@@ -481,7 +479,7 @@ class TaskResult(BaseModel):
                 scores["test"]["fra-fra"] = scores["test"].pop("fr")
         result: TaskResult = TaskResult.from_task_results(
-            task,  # type: ignore
+            task,
             scores,
             evaluation_time,
             kg_co2_emissions=None,
@@ -532,7 +530,7 @@ class TaskResult(BaseModel):
     def _get_score_fast(
         self,
         splits: Iterable[str] | None = None,
-        languages: str | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
         subsets: Iterable[str] | None = None,
     ) -> float:
         """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.
@@ -581,7 +579,7 @@ class TaskResult(BaseModel):
         return val_sum / n_val
     @classmethod
-    def from_validated(cls, **data) -> Self:
+    def from_validated(cls, **data) -> TaskResult:
         """Create a TaskResult from validated data.
         Returns:
@@ -592,13 +590,13 @@ class TaskResult(BaseModel):
     def __repr__(self) -> str:
         return f"TaskResult(task_name={self.task_name}, scores=...)"
-    def only_main_score(self) -> Self:
+    def only_main_score(self) -> TaskResult:
         """Return a new TaskResult object with only the main score.
         Returns:
             A new TaskResult object with only the main score.
         """
-        new_scores = {}
+        new_scores: dict[str, list[Score]] = {}
         for split in self.scores:
             new_scores[split] = []
             for subset_scores in self.scores[split]:
@@ -610,10 +608,12 @@ class TaskResult(BaseModel):
                     }
                 )
         new_res = {**self.to_dict(), "scores": new_scores}
-        new_res = TaskResult.from_validated(**new_res)
-        return new_res
+        return TaskResult.from_validated(**new_res)
-    def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self:
+    def validate_and_filter_scores(
+        self,
+        task: AbsTask | None = None,
+    ) -> TaskResult:
         """Validate and filter the scores against the task metadata.
         This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata.
@@ -633,21 +633,23 @@ class TaskResult(BaseModel):
             task = get_task(self.task_name)
         splits = task.eval_splits
-        hf_subsets = task.hf_subsets
-        hf_subsets = set(hf_subsets)
+        hf_subsets = set(task.hf_subsets)  # Convert to set once
-        new_scores = {}
+        new_scores: dict[str, list[Score]] = {}
         seen_splits = set()
         for split in self.scores:
             if split not in splits:
                 continue
-            new_scores[split] = []
             seen_subsets = set()
-            for _scores in self.scores[split]:
-                if _scores["hf_subset"] not in hf_subsets:
-                    continue
-                new_scores[split].append(_scores)
+            # Use list comprehension for better performance
+            new_scores[split] = [
+                _scores
+                for _scores in self.scores[split]
+                if _scores["hf_subset"] in hf_subsets
+            ]
+            for _scores in new_scores[split]:
                 seen_subsets.add(_scores["hf_subset"])
             if seen_subsets != hf_subsets:
                 missing_subsets = hf_subsets - seen_subsets
                 if len(missing_subsets) > 2:
@@ -656,17 +658,39 @@ class TaskResult(BaseModel):
                 else:
                     missing_subsets_str = str(missing_subsets)
-                logger.warning(
-                    f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
-                )
+                msg = f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
+                logger.warning(msg)
+                warnings.warn(msg)
+                for missing_subset in missing_subsets:
+                    new_scores[split].append(
+                        {
+                            "hf_subset": missing_subset,
+                            "main_score": np.nan,
+                            "languages": task.metadata.hf_subsets_to_langscripts.get(
+                                missing_subset, []
+                            ),
+                        }
+                    )
             seen_splits.add(split)
         if seen_splits != set(splits):
-            logger.warning(
-                f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
-            )
-        new_res = {**self.to_dict(), "scores": new_scores}
-        new_res = TaskResult.from_validated(**new_res)
-        return new_res
+            msg = f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
+            logger.warning(msg)
+            warnings.warn(msg)
+            for missing_split in set(splits) - seen_splits:
+                new_scores[missing_split] = []
+                for missing_subset in hf_subsets:
+                    new_scores[missing_split].append(
+                        {
+                            "hf_subset": missing_subset,
+                            "main_score": np.nan,
+                            "languages": task.metadata.hf_subsets_to_langscripts.get(
+                                missing_subset, []
+                            ),
+                        }
+                    )
+        data = self.model_dump()
+        data["scores"] = new_scores
+        return type(self).model_construct(**data)
     def is_mergeable(
         self,
@@ -698,27 +722,31 @@ class TaskResult(BaseModel):
             name = result.metadata.name
             revision = result.metadata.revision
         else:
+            msg = "result must be a TaskResult or AbsTask object"
+            if raise_error:
+                raise ValueError(msg)
+            logger.debug(msg)
             return False
         if self.task_name != name:
+            msg = f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
             if raise_error:
-                raise ValueError(
-                    f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False
         if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
+            msg = f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
             if raise_error:
-                raise ValueError(
-                    f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} and {mteb_version})"
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False
         if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
+            msg = f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
             if raise_error:
-                raise ValueError(
-                    f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False
         return True
@@ -730,7 +758,7 @@ class TaskResult(BaseModel):
             "mteb_version",
             "dataset_revision",
         ],
-    ) -> Self:
+    ) -> TaskResult:
         """Merges two TaskResult objects.
         Args:
@@ -836,3 +864,15 @@ class TaskResult(BaseModel):
                     )
                 )
         return results
+class TaskError(BaseModel):
+    """A class to represent an error that occurred during the evaluation of a task.
+    Attributes:
+        task_name: The name of the MTEB task.
+        exception: The error message that occurred during the evaluation.
+    """
+    task_name: str
+    exception: str

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl