PyPI - mteb - Versions diffs - 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl - Mend

mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (529) hide show

mteb/abstasks/classification.py CHANGED Viewed

@@ -1,7 +1,8 @@
+from __future__ import annotations
 import logging
 from collections import defaultdict
-from pathlib import Path
-from typing import Any, TypedDict
+from typing import TYPE_CHECKING, Any, TypedDict
 import numpy as np
 from datasets import Dataset, DatasetDict
@@ -16,12 +17,8 @@ from sklearn.metrics import (
 from mteb._evaluators.sklearn_evaluator import SklearnEvaluator, SklearnModelProtocol
 from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import HFSubset, ScoresDict
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 from ._statistics_calculation import (
@@ -31,6 +28,18 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
+if TYPE_CHECKING:
+    from pathlib import Path
+    from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs, HFSubset, ScoresDict
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
 logger = logging.getLogger(__name__)
@@ -98,9 +107,8 @@ class AbsTaskClassification(AbsTask):
             text: str (for text) or PIL.Image (for image). Column name can be changed via `input_column_name` attribute.
             label: int. Column name can be changed via `label_column_name` attribute.
         evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LogisticRegression`.
-            Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
-        samples_per_label: Number of samples per label to use for training the evaluator model. Default is 8.
-        n_experiments: Number of experiments to run. Default is 10.
+       samples_per_label: Number of samples per label to use for training the evaluator model. Default is 8.
+       n_experiments: Number of experiments to run. Default is 10.
         train_split: Name of the split to use for training the evaluator model. Default is "train".
         label_column_name: Name of the column containing the labels. Default is "label".
         input_column_name: Name of the column containing the input data. Default is "text".
@@ -126,8 +134,9 @@ class AbsTaskClassification(AbsTask):
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
         """Evaluate a model on the classification task.
@@ -141,7 +150,10 @@ class AbsTaskClassification(AbsTask):
             )
         if not self.data_loaded:
-            self.load_data()
+            self.load_data(num_proc=num_proc)
+        if self.dataset is None:
+            raise RuntimeError("Dataset not loaded.")
         if "random_state" in self.evaluator_model.get_params():
             self.evaluator_model = self.evaluator_model.set_params(
@@ -171,23 +183,28 @@ class AbsTaskClassification(AbsTask):
                 hf_subset=hf_subset,
                 encode_kwargs=encode_kwargs,
                 prediction_folder=prediction_folder,
+                num_proc=num_proc,
                 **kwargs,
             )
             self._add_main_score(scores[hf_subset])
-        return scores
+        return scores  # type: ignore[return-value]
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: DatasetDict,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
     ) -> FullClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         train_split = data_split[self.train_split]
         eval_split = data_split[hf_split]
@@ -216,7 +233,10 @@ class AbsTaskClassification(AbsTask):
                 evaluator_model=self.evaluator_model,
             )
             y_pred, test_cache = evaluator(
-                model, encode_kwargs=encode_kwargs, test_cache=test_cache
+                model,
+                encode_kwargs=encode_kwargs,
+                test_cache=test_cache,
+                num_proc=num_proc,
             )
             if prediction_folder:
                 all_predictions.append(y_pred.tolist())
@@ -237,7 +257,7 @@ class AbsTaskClassification(AbsTask):
             # ap will be none for non binary classification tasks
             k: (
                 float(np.mean(values))
-                if (values := [s[k] for s in scores if s[k] is not None])
+                if (values := [s[k] for s in scores if s[k] is not None])  # type: ignore[literal-required]
                 else np.nan
             )
             for k in scores[0].keys()
@@ -245,7 +265,7 @@ class AbsTaskClassification(AbsTask):
         logger.info(f"Running {self.metadata.name} - Finished.")
         return FullClassificationMetrics(
             scores_per_experiment=scores,
-            **avg_scores,
+            **avg_scores,  # type: ignore[typeddict-item]
         )
     def _calculate_scores(
@@ -358,11 +378,12 @@ class AbsTaskClassification(AbsTask):
             label_statistics=label_statistics,
         )
-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         self._upload_dataset_to_hub(
             repo_name,
             [
                 self.input_column_name,
                 self.label_column_name,
             ],
+            num_proc=num_proc,
         )

mteb/abstasks/clustering.py CHANGED Viewed

@@ -1,9 +1,10 @@
+from __future__ import annotations
 import itertools
 import logging
 import random
 from collections import defaultdict
-from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any, cast
 import numpy as np
 from datasets import Dataset, DatasetDict
@@ -12,12 +13,9 @@ from sklearn.metrics.cluster import v_measure_score
 from mteb._create_dataloaders import create_dataloader
 from mteb.models import EncoderProtocol
-from mteb.types import HFSubset, ScoresDict
+from mteb.types import Array, HFSubset
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 from ._statistics_calculation import (
@@ -27,6 +25,17 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
+if TYPE_CHECKING:
+    from pathlib import Path
+    from mteb.models import MTEBModels
+    from mteb.types import Array, EncodeKwargs, ScoresDict
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
 logger = logging.getLogger(__name__)
@@ -34,7 +43,7 @@ MultilingualDataset = dict[HFSubset, DatasetDict]
 def _evaluate_clustering_bootstrapped(
-    embeddings: np.ndarray,
+    embeddings: Array,
     labels: list[list[str]],
     n_clusters: int,
     cluster_size: int,
@@ -61,21 +70,21 @@ def _evaluate_clustering_bootstrapped(
         max_depth = max(map(len, labels))
     # Evaluate on each level til max depth
     for i_level in range(max_depth):
-        level_labels = []
+        level_labels: list[str | int] = []
         # Assign -1 to gold label if the level is not there
         for label in labels:
             if len(label) > i_level:
                 level_labels.append(label[i_level])
             else:
                 level_labels.append(-1)
-        level_labels = np.array(level_labels)
+        np_level_labels = np.array(level_labels)
         valid_idx = np.array(
-            [level_label != -1 for level_label in level_labels]
+            [level_label != -1 for level_label in np_level_labels]
         )  # Could be level_labels != -1 but fails with FutureWarning: elementwise comparison failed
-        level_labels = level_labels[valid_idx]
+        np_level_labels = np_level_labels[valid_idx]
         level_embeddings = embeddings[valid_idx]
         clustering_model = MiniBatchKMeans(
-            n_clusters=np.unique(level_labels).size,
+            n_clusters=np.unique(np_level_labels).size,
             batch_size=kmean_batch_size,
             init="k-means++",
             n_init=1,  # default when kmeans++ is used
@@ -87,7 +96,7 @@ def _evaluate_clustering_bootstrapped(
             cluster_indices = rng_state.choices(range(n_embeddings), k=cluster_size)
             _embeddings = level_embeddings[cluster_indices]
-            _labels = level_labels[cluster_indices]
+            _labels = np_level_labels[cluster_indices]
             cluster_assignment = clustering_model.fit_predict(_embeddings)
             v_measure = v_measure_score(_labels, cluster_assignment)
             v_measures[f"Level {i_level}"].append(v_measure)
@@ -153,15 +162,20 @@ class AbsTaskClustering(AbsTask):
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError(
+                "Expected encoder model to be an instance of EncoderProtocol."
+            )
         if (
             self.max_document_to_embed is not None
             and self.max_fraction_of_documents_to_embed is not None
@@ -182,13 +196,13 @@ class AbsTaskClustering(AbsTask):
                     self.max_fraction_of_documents_to_embed * len(data_split)
                 )
             else:
-                max_documents_to_embed = self.max_document_to_embed
+                max_documents_to_embed = cast("int", self.max_document_to_embed)
-            max_documents_to_embed = min(len(data_split), max_documents_to_embed)  # type: ignore
+            max_documents_to_embed = min(len(data_split), max_documents_to_embed)
             example_indices = self.rng_state.sample(
                 range(len(data_split)), k=max_documents_to_embed
             )
-            downsampled_dataset = data_split.select(example_indices)  # type: ignore
+            downsampled_dataset = data_split.select(example_indices)
         downsampled_dataset = downsampled_dataset.select_columns(
             [self.input_column_name, self.label_column_name]
@@ -200,6 +214,7 @@ class AbsTaskClustering(AbsTask):
                 downsampled_dataset,
                 self.metadata,
                 input_column=self.input_column_name,
+                num_proc=num_proc,
                 **encode_kwargs,
             ),
             task_metadata=self.metadata,
@@ -283,9 +298,11 @@ class AbsTaskClustering(AbsTask):
             labels_statistics=label_statistics,
         )
-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         self._upload_dataset_to_hub(
-            repo_name, [self.input_column_name, self.label_column_name]
+            repo_name,
+            [self.input_column_name, self.label_column_name],
+            num_proc=num_proc,
         )

mteb/abstasks/clustering_legacy.py CHANGED Viewed

@@ -1,6 +1,7 @@
+from __future__ import annotations
 import logging
-from pathlib import Path
-from typing import Any, TypedDict
+from typing import TYPE_CHECKING, Any, TypedDict
 import numpy as np
 from datasets import Dataset
@@ -8,13 +9,9 @@ from scipy.optimize import linear_sum_assignment
 from sklearn import metrics
 from mteb._evaluators import ClusteringEvaluator
-from mteb.models import EncoderProtocol
-from mteb.types import ScoresDict
+from mteb.models import EncoderProtocol, MTEBModels
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 from ._statistics_calculation import (
@@ -24,6 +21,17 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
+if TYPE_CHECKING:
+    from pathlib import Path
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs, ScoresDict
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
 logger = logging.getLogger(__name__)
@@ -80,15 +88,19 @@ class AbsTaskClusteringLegacy(AbsTask):
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         data_split = data_split.select_columns(
             [self.input_column_name, self.label_column_name]
         )
@@ -139,9 +151,6 @@ class AbsTaskClusteringLegacy(AbsTask):
             }
             return scores
-        data_split = data_split.select_columns(
-            [self.input_column_name, self.label_column_name]
-        )
         evaluator = self.evaluator(
             data_split,
             input_column_name=self.input_column_name,
@@ -151,10 +160,14 @@ class AbsTaskClusteringLegacy(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        clusters = evaluator(model, encode_kwargs=encode_kwargs)
+        evaluate_clusters = evaluator(
+            model,
+            encode_kwargs=encode_kwargs,
+            num_proc=num_proc,
+        )
         if prediction_folder:
             self._save_task_predictions(
-                clusters,
+                evaluate_clusters,
                 model,
                 prediction_folder,
                 hf_subset=hf_subset,
@@ -163,7 +176,7 @@ class AbsTaskClusteringLegacy(AbsTask):
         return self._compute_metrics(
             data_split[self.label_column_name],
-            clusters,
+            evaluate_clusters,
         )
     def _compute_metrics(
@@ -230,11 +243,12 @@ class AbsTaskClusteringLegacy(AbsTask):
             label_statistics=label_statistics,
         )
-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         self._upload_dataset_to_hub(
             repo_name,
             [
                 self.input_column_name,
                 self.label_column_name,
             ],
+            num_proc=num_proc,
         )

mteb/abstasks/image/image_text_pair_classification.py CHANGED Viewed

@@ -1,10 +1,11 @@
+from __future__ import annotations
 import logging
 from collections.abc import Sequence
-from pathlib import Path
-from typing import Any, TypedDict
+from typing import TYPE_CHECKING, Any, TypedDict
 import torch
-from datasets import Dataset, concatenate_datasets
+from datasets import concatenate_datasets
 from mteb._evaluators import ImageTextPairClassificationEvaluator
 from mteb.abstasks._statistics_calculation import (
@@ -14,11 +15,21 @@ from mteb.abstasks._statistics_calculation import (
 from mteb.abstasks.abstask import AbsTask
 from mteb.models.models_protocols import EncoderProtocol
 from mteb.types.statistics import (
-    ImageStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
+if TYPE_CHECKING:
+    from pathlib import Path
+    from datasets import Dataset
+    from mteb.models.models_protocols import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ImageStatistics,
+        TextStatistics,
+    )
 logger = logging.getLogger(__name__)
@@ -116,15 +127,18 @@ class AbsTaskImageTextPairClassification(AbsTask):
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
     ) -> ImageTextPairClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         select_columns = []
         for columns in (self.images_column_names, self.texts_column_names):
             if isinstance(columns, str):
@@ -154,7 +168,9 @@ class AbsTaskImageTextPairClassification(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        scores = evaluator(model, encode_kwargs=encode_kwargs)
+        scores: list[torch.Tensor] = evaluator(
+            model, encode_kwargs=encode_kwargs, num_proc=num_proc
+        )  # type: ignore[assignment]
         if prediction_folder:
             self._save_task_predictions(
                 [score.tolist() for score in scores],
@@ -202,7 +218,7 @@ class AbsTaskImageTextPairClassification(AbsTask):
             accuracy=torch.Tensor(all_correct_scores).float().mean().item(),
         )
-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         text_columns = (
             [self.texts_column_names]
             if isinstance(self.texts_column_names, str)
@@ -217,4 +233,5 @@ class AbsTaskImageTextPairClassification(AbsTask):
         self._upload_dataset_to_hub(
             repo_name,
             [*text_columns, *image_columns],
+            num_proc=num_proc,
         )

mteb/abstasks/multilabel_classification.py CHANGED Viewed

@@ -1,8 +1,9 @@
+from __future__ import annotations
 import itertools
 import logging
 from collections import defaultdict
-from pathlib import Path
-from typing import Any, TypedDict
+from typing import TYPE_CHECKING, Any, TypedDict
 import numpy as np
 from datasets import DatasetDict
@@ -15,23 +16,29 @@ from typing_extensions import override
 from mteb._create_dataloaders import create_dataloader
 from mteb._evaluators.classification_metrics import hamming_score
-from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
 from mteb.models import EncoderProtocol
 from .classification import AbsTaskClassification
+if TYPE_CHECKING:
+    from pathlib import Path
+    from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
+    from mteb.models import MTEBModels
+    from mteb.types import Array, EncodeKwargs
 logger = logging.getLogger(__name__)
 def _evaluate_classifier(
-    embeddings_train: np.ndarray,
+    embeddings_train: Array,
     y_train: np.ndarray,
-    embeddings_test: np.ndarray,
+    embeddings_test: Array,
     classifier: SklearnModelProtocol,
 ) -> tuple[np.ndarray, SklearnModelProtocol]:
-    classifier: SklearnModelProtocol = clone(classifier)
-    classifier.fit(embeddings_train, y_train)
-    return classifier.predict(embeddings_test), classifier
+    classifier_copy: SklearnModelProtocol = clone(classifier)
+    classifier_copy.fit(embeddings_train, y_train)
+    return classifier_copy.predict(embeddings_test), classifier_copy
 class MultilabelClassificationMetrics(TypedDict):
@@ -69,25 +76,29 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         input_column_name: Name of the column containing the input text.
         label_column_name: Name of the column containing the labels.
         samples_per_label: Number of samples to use pr. label. These samples are embedded and a classifier is fit using the labels and samples.
-        evaluator: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
+        evaluator_model: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
     """
-    evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
+    evaluator_model: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
     input_column_name: str = "text"
     label_column_name: str = "label"
     @override
-    def _evaluate_subset(
+    def _evaluate_subset(  # type: ignore[override]
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: DatasetDict,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
     ) -> FullMultilabelClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         if isinstance(data_split, DatasetDict):
             data_split = data_split.select_columns(
                 [self.input_column_name, self.label_column_name]
@@ -115,6 +126,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
             unique_train_dataset,
             self.metadata,
             input_column=self.input_column_name,
+            num_proc=num_proc,
             **encode_kwargs,
         )
@@ -165,7 +177,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
             y_train = train_split.select(sample_indices)[self.label_column_name]
             y_train = binarizer.transform(y_train)
             y_pred, current_classifier = _evaluate_classifier(
-                X_train, y_train, X_test, self.evaluator
+                X_train, y_train, X_test, self.evaluator_model
             )
             if prediction_folder:
                 all_predictions.append(y_pred.tolist())
@@ -185,19 +197,20 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
             )
         avg_scores: dict[str, Any] = {
-            k: np.mean([s[k] for s in scores]) for k in scores[0].keys()
+            k: np.mean([s[k] for s in scores])  # type: ignore[literal-required]
+            for k in scores[0].keys()
         }
         logger.info("Running multilabel classification - Finished.")
         return FullMultilabelClassificationMetrics(
             scores_per_experiment=scores,
-            **avg_scores,
+            **avg_scores,  # type: ignore[typeddict-item]
         )
-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray,
         y_pred: np.ndarray,
-        x_test_embedding: np.ndarray,
+        x_test_embedding: Array,
         current_classifier: SklearnModelProtocol,
     ) -> MultilabelClassificationMetrics:
         accuracy = current_classifier.score(x_test_embedding, y_test)
@@ -232,10 +245,9 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         """
         sample_indices = []
         if idxs is None:
-            idxs = np.arange(len(y))
+            idxs = list(np.arange(len(y)))
         self.np_rng.shuffle(idxs)
-        idxs = idxs.tolist()
-        label_counter = defaultdict(int)
+        label_counter: dict[int, int] = defaultdict(int)
         for i in idxs:
             if any((label_counter[label] < samples_per_label) for label in y[i]):
                 sample_indices.append(i)

mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl