PyPI - mteb - Versions diffs - 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl - Mend

mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (529) hide show

mteb/abstasks/text/bitext_mining.py CHANGED Viewed

@@ -1,7 +1,8 @@
+from __future__ import annotations
 import logging
 from collections import defaultdict
-from pathlib import Path
-from typing import Any, ClassVar, TypedDict
+from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, cast
 from datasets import Dataset, DatasetDict
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
@@ -9,9 +10,15 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_sc
 from mteb._evaluators import BitextMiningEvaluator
 from mteb.abstasks._statistics_calculation import calculate_text_statistics
 from mteb.abstasks.abstask import AbsTask
-from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import HFSubset, ScoresDict
-from mteb.types.statistics import SplitDescriptiveStatistics, TextStatistics
+from mteb.models import EncoderProtocol
+from mteb.types.statistics import SplitDescriptiveStatistics
+if TYPE_CHECKING:
+    from pathlib import Path
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs, HFSubset, ScoresDict
+    from mteb.types.statistics import TextStatistics
 logger = logging.getLogger(__name__)
@@ -73,13 +80,17 @@ class AbsTaskBitextMining(AbsTask):
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
         """Added load for "parallel" datasets"""
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         if not self.data_loaded:
-            self.load_data()
+            self.load_data(num_proc=num_proc)
         hf_subsets = self.hf_subsets
@@ -87,16 +98,22 @@ class AbsTaskBitextMining(AbsTask):
         if subsets_to_run is not None:
             hf_subsets = [s for s in hf_subsets if s in subsets_to_run]
-        scores = {}
+        encoder_model = cast("EncoderProtocol", model)
+        if self.dataset is None:
+            raise ValueError("Dataset is not loaded.")
+        scores: dict[str, BitextMiningMetrics] = {}
         if self.parallel_subsets:
-            scores = self._evaluate_subset(
-                model,
-                self.dataset[split],  # type: ignore
+            scores = self._evaluate_subset(  # type: ignore[assignment]
+                encoder_model,
+                self.dataset[split],
                 parallel=True,
                 hf_split=split,
                 hf_subset="parallel",
                 encode_kwargs=encode_kwargs,
                 prediction_folder=prediction_folder,
+                num_proc=num_proc,
                 **kwargs,
             )
         else:
@@ -109,42 +126,44 @@ class AbsTaskBitextMining(AbsTask):
                     data_split = self.dataset[split]
                 else:
                     data_split = self.dataset[hf_subset][split]
-                scores[hf_subset] = self._evaluate_subset(
-                    model,
+                scores[hf_subset] = self._evaluate_subset(  # type: ignore[assignment]
+                    encoder_model,
                     data_split,
                     hf_split=split,
                     hf_subset=hf_subset,
                     encode_kwargs=encode_kwargs,
                     prediction_folder=prediction_folder,
+                    num_proc=num_proc,
                     **kwargs,
                 )
-        return scores
+        return cast("dict[HFSubset, ScoresDict]", scores)
     def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]:
         pairs = self._DEFAULT_PAIR
         if parallel:
-            pairs = [langpair.split("-") for langpair in self.hf_subsets]
+            pairs = [langpair.split("-") for langpair in self.hf_subsets]  # type: ignore[misc]
         return pairs
-    def _evaluate_subset(
+    def _evaluate_subset(  # type: ignore[override]
         self,
         model: EncoderProtocol,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        parallel: bool = False,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        parallel: bool = False,
+        num_proc: int = 1,
         **kwargs,
-    ) -> ScoresDict:
+    ) -> BitextMiningMetrics | dict[str, BitextMiningMetrics]:
         pairs = self._get_pairs(parallel)
         evaluator = BitextMiningEvaluator(
             data_split,
             task_metadata=self.metadata,
-            pair_columns=pairs,  # type: ignore
+            pair_columns=pairs,
             hf_split=hf_split,
             hf_subset=hf_subset,
             **kwargs,
@@ -156,7 +175,7 @@ class AbsTaskBitextMining(AbsTask):
             else data_split["gold"]
         )
-        neighbours = evaluator(model, encode_kwargs=encode_kwargs)
+        neighbours = evaluator(model, encode_kwargs=encode_kwargs, num_proc=num_proc)
         if prediction_folder:
             self._save_task_predictions(
@@ -168,16 +187,16 @@ class AbsTaskBitextMining(AbsTask):
             )
         if parallel:
-            metrics = {}
+            parallel_metrics = {}
             for keys, nearest_neighbors in neighbours.items():
-                metrics[keys] = self._compute_metrics(nearest_neighbors, gold)
+                parallel_metrics[keys] = self._compute_metrics(nearest_neighbors, gold)
-            for v in metrics.values():
+            for v in parallel_metrics.values():
                 self._add_main_score(v)
-        else:
-            def_pair_str = "-".join(self._DEFAULT_PAIR[0])
-            metrics = self._compute_metrics(neighbours[def_pair_str], gold)
-            self._add_main_score(metrics)
+            return parallel_metrics
+        def_pair_str = "-".join(self._DEFAULT_PAIR[0])
+        metrics = self._compute_metrics(neighbours[def_pair_str], gold)
+        self._add_main_score(metrics)
         return metrics
     def _compute_metrics(
@@ -249,9 +268,12 @@ class AbsTaskBitextMining(AbsTask):
             sentence2_statistics=text2_statistics,
         )
-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
+        if self.dataset is None:
+            raise ValueError("Dataset is not loaded.")
         if self.metadata.is_multilingual:
-            dataset = defaultdict(dict)
+            dataset: dict[str, dict[str, list[str]]] = defaultdict(dict)
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")
@@ -266,10 +288,10 @@ class AbsTaskBitextMining(AbsTask):
                     for split in self.dataset[config]:
                         dataset[split][lang_1] = self.dataset[config][split][sent_1]
                         dataset[split][lang_2] = self.dataset[config][split][sent_2]
-            for split in dataset:
-                dataset[split] = Dataset.from_dict(dataset[split])
-            dataset = DatasetDict(dataset)
-            dataset.push_to_hub(repo_name)
+            dataset_dict = DatasetDict(
+                {split: Dataset.from_dict(dataset[split]) for split in dataset}
+            )
+            dataset_dict.push_to_hub(repo_name, num_proc=num_proc)
         else:
             sentences = {}
             for split in self.dataset:
@@ -281,4 +303,4 @@ class AbsTaskBitextMining(AbsTask):
                     }
                 )
             sentences = DatasetDict(sentences)
-            sentences.push_to_hub(repo_name)
+            sentences.push_to_hub(repo_name, num_proc=num_proc)

mteb/abstasks/text/reranking.py CHANGED Viewed

@@ -16,7 +16,7 @@ else:
 logger = logging.getLogger(__name__)
-OLD_FORMAT_RERANKING_TASKS = []
+OLD_FORMAT_RERANKING_TASKS: list[str] = []
 @deprecated(
@@ -34,7 +34,7 @@ class AbsTaskReranking(AbsTaskRetrieval):
         For dataformat and other information, see [AbsTaskRetrieval][mteb.abstasks.retrieval.AbsTaskRetrieval].
     """
-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs) -> None:
         """Load the dataset."""
         if self.data_loaded:
             return
@@ -43,7 +43,7 @@ class AbsTaskReranking(AbsTaskRetrieval):
             self.transform_old_dataset_format()
         else:
             # use AbsTaskRetrieval default to load the data
-            return super().load_data()
+            return super().load_data(num_proc=num_proc)
     def _process_example(self, example: dict, split: str, query_idx: int) -> dict:
         """Process a single example from the dataset.
@@ -100,12 +100,14 @@ class AbsTaskReranking(AbsTaskRetrieval):
         if self.metadata.name not in OLD_FORMAT_RERANKING_TASKS:
             return
-        logging.info(
+        logger.info(
             f"Transforming old format to standard format for {self.metadata.name}"
         )
         given_dataset = copy(given_dataset)
-        self.dataset = defaultdict(lambda: defaultdict(dict))
+        self.dataset: dict[str, dict[str, RetrievalSplitData]] = defaultdict(
+            lambda: defaultdict(dict)  # type: ignore[arg-type]
+        )
         hf_subsets = self.hf_subsets
@@ -115,19 +117,19 @@ class AbsTaskReranking(AbsTaskRetrieval):
                 if hf_subset in cur_dataset:
                     cur_dataset = cur_dataset[hf_subset]
             elif "name" in self.metadata.dataset:
-                cur_dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+                cur_dataset = datasets.load_dataset(**self.metadata.dataset)
                 assert hf_subset == "default", (
                     f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata."
                 )
             else:
                 cur_dataset = datasets.load_dataset(
                     **self.metadata.dataset, name=hf_subset
-                )  # type: ignore
+                )
             for split in cur_dataset:
                 corpus = []
                 queries = []
-                relevant_docs = defaultdict(dict)
+                relevant_docs: dict[str, dict[str, int]] = defaultdict(dict)
                 top_ranked = defaultdict(list)
                 # Create an enumerated dataset to pass indices

mteb/abstasks/text/summarization.py CHANGED Viewed

@@ -1,12 +1,11 @@
+from __future__ import annotations
 import logging
-from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING
 import numpy as np
-from datasets import Dataset
 from mteb._evaluators import SummarizationEvaluator
-from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
 from mteb.abstasks._statistics_calculation import (
     calculate_score_statistics,
     calculate_text_statistics,
@@ -14,11 +13,22 @@ from mteb.abstasks._statistics_calculation import (
 from mteb.abstasks.abstask import AbsTask
 from mteb.models import EncoderProtocol
 from mteb.types.statistics import (
-    ScoreStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
+if TYPE_CHECKING:
+    from pathlib import Path
+    from datasets import Dataset
+    from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ScoreStatistics,
+        TextStatistics,
+    )
 logger = logging.getLogger(__name__)
@@ -77,17 +87,23 @@ class AbsTaskSummarization(AbsTask):
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs,
     ) -> SummarizationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         normalized_scores = [
-            (np.array(x) - self.min_score) / (self.max_score - self.min_score)
+            (
+                (np.array(x) - self.min_score) / (self.max_score - self.min_score)
+            ).tolist()
             for x in data_split[self.relevancy_column_name]
         ]
         evaluator = self.evaluator(
@@ -100,7 +116,7 @@ class AbsTaskSummarization(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        scores = evaluator(model, encode_kwargs=encode_kwargs)
+        scores = evaluator(model, encode_kwargs=encode_kwargs, num_proc=num_proc)
         if prediction_folder:
             self._save_task_predictions(
                 scores,

mteb/abstasks/zeroshot_classification.py CHANGED Viewed

@@ -1,6 +1,7 @@
+from __future__ import annotations
 import logging
-from pathlib import Path
-from typing import Any, TypedDict
+from typing import TYPE_CHECKING, TypedDict
 import torch
 from datasets import Dataset
@@ -9,10 +10,7 @@ from sklearn import metrics
 from mteb._evaluators import ZeroShotClassificationEvaluator
 from mteb.models import EncoderProtocol
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 from ._statistics_calculation import (
@@ -22,6 +20,17 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
+if TYPE_CHECKING:
+    from pathlib import Path
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
 logger = logging.getLogger(__name__)
@@ -111,15 +120,19 @@ class AbsTaskZeroShotClassification(AbsTask):
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs,
     ) -> ZeroShotClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         candidate_labels = self.get_candidate_labels()
         data_split = data_split.select_columns(
             [self.input_column_name, self.label_column_name]
@@ -133,7 +146,11 @@ class AbsTaskZeroShotClassification(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        probs = evaluator(model, encode_kwargs=encode_kwargs)
+        probs = evaluator(
+            model,
+            encode_kwargs=encode_kwargs,
+            num_proc=num_proc,
+        )
         if prediction_folder:
             self._save_task_predictions(
@@ -158,13 +175,14 @@ class AbsTaskZeroShotClassification(AbsTask):
             accuracy=metrics.accuracy_score(labels, predictions),
         )
-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         self._upload_dataset_to_hub(
             repo_name,
             [
                 self.input_column_name,
                 self.label_column_name,
             ],
+            num_proc=num_proc,
         )
         labels_dataset = Dataset.from_dict({"labels": self.get_candidate_labels()})
         labels_dataset.push_to_hub(repo_name, config_name="labels")

mteb/benchmarks/_create_table.py CHANGED Viewed

@@ -1,13 +1,17 @@
+from __future__ import annotations
 import re
 from collections import defaultdict
-from typing import Literal
+from typing import TYPE_CHECKING, Literal
 import numpy as np
 import pandas as pd
 import mteb
 from mteb.get_tasks import get_task, get_tasks
-from mteb.results.benchmark_results import BenchmarkResults
+if TYPE_CHECKING:
+    from mteb.results.benchmark_results import BenchmarkResults
 def _borda_count(scores: pd.Series) -> pd.Series:
@@ -115,7 +119,6 @@ def _create_summary_table_from_benchmark_results(
     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean", overall_mean)
     joint_table.insert(1, "mean_by_task_type", typed_mean)
     joint_table["borda_rank"] = _get_borda_rank(per_task)
@@ -303,6 +306,7 @@ def _create_per_language_table_from_benchmark_results(
 def _create_summary_table_mean_public_private(
     benchmark_results: BenchmarkResults,
+    exclude_private_from_borda: bool = False,
 ) -> pd.DataFrame:
     """Create summary table from BenchmarkResults.
@@ -311,6 +315,7 @@ def _create_summary_table_mean_public_private(
     Args:
         benchmark_results: BenchmarkResults object containing model results
+        exclude_private_from_borda: If True, calculate Borda rank using only public tasks
     Returns:
         DataFrame with model summaries, ready for styling in the leaderboard
@@ -353,10 +358,13 @@ def _create_summary_table_mean_public_private(
     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean(public)", public_mean)
     joint_table.insert(1, "mean(private)", private_mean)
-    joint_table["borda_rank"] = _get_borda_rank(per_task)
+    if exclude_private_from_borda:
+        borda_per_task = per_task[public_task_name]
+    else:
+        borda_per_task = per_task
+    joint_table["borda_rank"] = _get_borda_rank(borda_per_task)
     joint_table = joint_table.sort_values("borda_rank", ascending=True)
     joint_table = joint_table.reset_index()
@@ -476,7 +484,6 @@ def _create_summary_table_mean_subset(
     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean(subset)", overall_subset_mean)
     joint_table["borda_rank"] = _get_borda_rank(per_subset)
     joint_table = joint_table.sort_values("mean(subset)", ascending=False)
@@ -595,7 +602,6 @@ def _create_summary_table_mean_task_type(
     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean_by_task_type", typed_mean)
     joint_table = joint_table.sort_values("mean_by_task_type", ascending=False)
     joint_table["borda_rank"] = _get_borda_rank(per_task)

mteb/benchmarks/benchmark.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-from collections.abc import Iterable, Sequence
+from collections.abc import Iterator, Sequence
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Literal
@@ -19,6 +19,7 @@ class Benchmark:
     Args:
         name: The name of the benchmark
+        aliases: Alternative names for the benchmark
         tasks: The tasks within the benchmark.
         description: A description of the benchmark, should include its intended goal and potentially a description of its construction
         reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
@@ -38,6 +39,7 @@ class Benchmark:
     name: str
     tasks: Sequence[AbsTask]
+    aliases: Sequence[str] = field(default_factory=tuple)
     description: str | None = None
     reference: StrURL | None = None
     citation: str | None = None
@@ -47,7 +49,7 @@ class Benchmark:
     display_name: str | None = None
     language_view: list[str] | Literal["all"] = field(default_factory=list)
-    def __iter__(self) -> Iterable[AbsTask]:
+    def __iter__(self) -> Iterator[AbsTask]:
         return iter(self.tasks)
     def __len__(self) -> int:
@@ -121,9 +123,19 @@ class RtebBenchmark(Benchmark):
             _create_summary_table_mean_public_private,
         )
-        joint_table = _create_summary_table_mean_public_private(benchmark_results)
+        joint_table = _create_summary_table_mean_public_private(
+            benchmark_results, exclude_private_from_borda=True
+        )
+        # issue 3902: temporary remove the private column from RTEB summary table
+        if "Mean (Private)" in joint_table.columns:
+            joint_table = joint_table.drop(columns=["Mean (Private)"])
         # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
+        # but due to 3902, if Private column existed, Mean (Task) was the mean of Public and Private so instead we drop Mean (Task) and rename Mean (Public) to Mean (Task)
         joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
+        if "Mean (Task)" in joint_table.columns:
+            joint_table = joint_table.drop(columns=["Mean (Task)"])
+        joint_table = joint_table.rename(columns={"Mean (Public)": "Mean (Task)"})
         return joint_table

mteb/benchmarks/benchmarks/__init__.py CHANGED Viewed

@@ -3,9 +3,11 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     BEIR_NL,
     BRIGHT,
     BRIGHT_LONG,
+    BRIGHT_V1_1,
     BUILT_MTEB,
     C_MTEB,
     CHEMTEB,
+    CHEMTEB_V1_1,
     CODE_RAG,
     ENCODECHKA,
     FA_MTEB,
@@ -14,6 +16,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     JINA_VDR,
     JMTEB_LITE_V1,
     JMTEB_V2,
+    KOVIDORE_V2,
     LONG_EMBED,
     MIEB_ENG,
     MIEB_IMG,
@@ -67,8 +70,10 @@ __all__ = [
     "BEIR_NL",
     "BRIGHT",
     "BRIGHT_LONG",
+    "BRIGHT_V1_1",
     "BUILT_MTEB",
     "CHEMTEB",
+    "CHEMTEB_V1_1",
     "CODE_RAG",
     "C_MTEB",
     "ENCODECHKA",
@@ -79,6 +84,7 @@ __all__ = [
     "JINA_VDR",
     "JMTEB_LITE_V1",
     "JMTEB_V2",
+    "KOVIDORE_V2",
     "LONG_EMBED",
     "MIEB_ENG",
     "MIEB_IMG",

mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl