PyPI - mteb - Versions diffs - 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl - Mend

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (527) hide show

mteb/abstasks/clustering.py CHANGED Viewed

@@ -3,7 +3,7 @@ import logging
 import random
 from collections import defaultdict
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
 import numpy as np
 from datasets import Dataset, DatasetDict
@@ -11,8 +11,8 @@ from sklearn.cluster import MiniBatchKMeans
 from sklearn.metrics.cluster import v_measure_score
 from mteb._create_dataloaders import create_dataloader
-from mteb.models import EncoderProtocol
-from mteb.types import HFSubset, ScoresDict
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import Array, EncodeKwargs, HFSubset, ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -34,7 +34,7 @@ MultilingualDataset = dict[HFSubset, DatasetDict]
 def _evaluate_clustering_bootstrapped(
-    embeddings: np.ndarray,
+    embeddings: Array,
     labels: list[list[str]],
     n_clusters: int,
     cluster_size: int,
@@ -61,21 +61,21 @@ def _evaluate_clustering_bootstrapped(
         max_depth = max(map(len, labels))
     # Evaluate on each level til max depth
     for i_level in range(max_depth):
-        level_labels = []
+        level_labels: list[str | int] = []
         # Assign -1 to gold label if the level is not there
         for label in labels:
             if len(label) > i_level:
                 level_labels.append(label[i_level])
             else:
                 level_labels.append(-1)
-        level_labels = np.array(level_labels)
+        np_level_labels = np.array(level_labels)
         valid_idx = np.array(
-            [level_label != -1 for level_label in level_labels]
+            [level_label != -1 for level_label in np_level_labels]
         )  # Could be level_labels != -1 but fails with FutureWarning: elementwise comparison failed
-        level_labels = level_labels[valid_idx]
+        np_level_labels = np_level_labels[valid_idx]
         level_embeddings = embeddings[valid_idx]
         clustering_model = MiniBatchKMeans(
-            n_clusters=np.unique(level_labels).size,
+            n_clusters=np.unique(np_level_labels).size,
             batch_size=kmean_batch_size,
             init="k-means++",
             n_init=1,  # default when kmeans++ is used
@@ -87,7 +87,7 @@ def _evaluate_clustering_bootstrapped(
             cluster_indices = rng_state.choices(range(n_embeddings), k=cluster_size)
             _embeddings = level_embeddings[cluster_indices]
-            _labels = level_labels[cluster_indices]
+            _labels = np_level_labels[cluster_indices]
             cluster_assignment = clustering_model.fit_predict(_embeddings)
             v_measure = v_measure_score(_labels, cluster_assignment)
             v_measures[f"Level {i_level}"].append(v_measure)
@@ -153,15 +153,19 @@ class AbsTaskClustering(AbsTask):
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError(
+                "Expected encoder model to be an instance of EncoderProtocol."
+            )
         if (
             self.max_document_to_embed is not None
             and self.max_fraction_of_documents_to_embed is not None
@@ -182,13 +186,13 @@ class AbsTaskClustering(AbsTask):
                     self.max_fraction_of_documents_to_embed * len(data_split)
                 )
             else:
-                max_documents_to_embed = self.max_document_to_embed
+                max_documents_to_embed = cast(int, self.max_document_to_embed)
-            max_documents_to_embed = min(len(data_split), max_documents_to_embed)  # type: ignore
+            max_documents_to_embed = min(len(data_split), max_documents_to_embed)
             example_indices = self.rng_state.sample(
                 range(len(data_split)), k=max_documents_to_embed
             )
-            downsampled_dataset = data_split.select(example_indices)  # type: ignore
+            downsampled_dataset = data_split.select(example_indices)
         downsampled_dataset = downsampled_dataset.select_columns(
             [self.input_column_name, self.label_column_name]
@@ -200,7 +204,7 @@ class AbsTaskClustering(AbsTask):
                 downsampled_dataset,
                 self.metadata,
                 input_column=self.input_column_name,
-                batch_size=encode_kwargs["batch_size"],
+                **encode_kwargs,
             ),
             task_metadata=self.metadata,
             hf_subset=hf_subset,

mteb/abstasks/clustering_legacy.py CHANGED Viewed

@@ -8,8 +8,8 @@ from scipy.optimize import linear_sum_assignment
 from sklearn import metrics
 from mteb._evaluators import ClusteringEvaluator
-from mteb.models import EncoderProtocol
-from mteb.types import ScoresDict
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs, ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -80,15 +80,21 @@ class AbsTaskClusteringLegacy(AbsTask):
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+        data_split = data_split.select_columns(
+            [self.input_column_name, self.label_column_name]
+        )
         # MTEB text clustering requires renaming and eval per subset.
         if self.metadata.modalities == ["text"]:
             all_metrics = []
@@ -136,9 +142,6 @@ class AbsTaskClusteringLegacy(AbsTask):
             }
             return scores
-        data_split = data_split.select_columns(
-            [self.input_column_name, self.label_column_name]
-        )
         evaluator = self.evaluator(
             data_split,
             input_column_name=self.input_column_name,
@@ -148,10 +151,10 @@ class AbsTaskClusteringLegacy(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        clusters = evaluator(model, encode_kwargs=encode_kwargs)
+        evaluate_clusters = evaluator(model, encode_kwargs=encode_kwargs)
         if prediction_folder:
             self._save_task_predictions(
-                clusters,
+                evaluate_clusters,
                 model,
                 prediction_folder,
                 hf_subset=hf_subset,
@@ -160,7 +163,7 @@ class AbsTaskClusteringLegacy(AbsTask):
         return self._compute_metrics(
             data_split[self.label_column_name],
-            clusters,
+            evaluate_clusters,
         )
     def _compute_metrics(

mteb/abstasks/image/image_text_pair_classification.py CHANGED Viewed

@@ -12,7 +12,8 @@ from mteb.abstasks._statistics_calculation import (
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models.models_protocols import EncoderProtocol
+from mteb.models.models_protocols import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs
 from mteb.types.statistics import (
     ImageStatistics,
     SplitDescriptiveStatistics,
@@ -116,15 +117,17 @@ class AbsTaskImageTextPairClassification(AbsTask):
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ImageTextPairClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         select_columns = []
         for columns in (self.images_column_names, self.texts_column_names):
             if isinstance(columns, str):
@@ -154,7 +157,7 @@ class AbsTaskImageTextPairClassification(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        scores = evaluator(model, encode_kwargs=encode_kwargs)
+        scores: list[torch.Tensor] = evaluator(model, encode_kwargs=encode_kwargs)  # type: ignore[assignment]
         if prediction_folder:
             self._save_task_predictions(
                 [score.tolist() for score in scores],

mteb/abstasks/multilabel_classification.py CHANGED Viewed

@@ -14,8 +14,10 @@ from sklearn.preprocessing import MultiLabelBinarizer
 from typing_extensions import override
 from mteb._create_dataloaders import create_dataloader
+from mteb._evaluators.classification_metrics import hamming_score
 from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import Array, EncodeKwargs
 from .classification import AbsTaskClassification
@@ -23,14 +25,14 @@ logger = logging.getLogger(__name__)
 def _evaluate_classifier(
-    embeddings_train: np.ndarray,
+    embeddings_train: Array,
     y_train: np.ndarray,
-    embeddings_test: np.ndarray,
+    embeddings_test: Array,
     classifier: SklearnModelProtocol,
 ) -> tuple[np.ndarray, SklearnModelProtocol]:
-    classifier: SklearnModelProtocol = clone(classifier)
-    classifier.fit(embeddings_train, y_train)
-    return classifier.predict(embeddings_test), classifier
+    classifier_copy: SklearnModelProtocol = clone(classifier)
+    classifier_copy.fit(embeddings_train, y_train)
+    return classifier_copy.predict(embeddings_test), classifier_copy
 class MultilabelClassificationMetrics(TypedDict):
@@ -40,11 +42,13 @@ class MultilabelClassificationMetrics(TypedDict):
         accuracy: Accuracy of the classifier.
         lrap: Label Ranking Average Precision (LRAP) score.
         f1: Macro F1 score.
+        hamming: Hamming score (label-based accuracy).
     """
     accuracy: float
     lrap: float
     f1: float
+    hamming: float
 class FullMultilabelClassificationMetrics(MultilabelClassificationMetrics):
@@ -66,25 +70,28 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         input_column_name: Name of the column containing the input text.
         label_column_name: Name of the column containing the labels.
         samples_per_label: Number of samples to use pr. label. These samples are embedded and a classifier is fit using the labels and samples.
-        evaluator: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
+        evaluator_model: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
     """
-    evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
+    evaluator_model: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
     input_column_name: str = "text"
     label_column_name: str = "label"
     @override
-    def _evaluate_subset(
+    def _evaluate_subset(  # type: ignore[override]
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: DatasetDict,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> FullMultilabelClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         if isinstance(data_split, DatasetDict):
             data_split = data_split.select_columns(
                 [self.input_column_name, self.label_column_name]
@@ -112,7 +119,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
             unique_train_dataset,
             self.metadata,
             input_column=self.input_column_name,
-            batch_size=encode_kwargs["batch_size"],
+            **encode_kwargs,
         )
         logger.info("Running multilabel classification - Encoding training set...")
@@ -141,7 +148,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
             test_dataset.select_columns(self.input_column_name),
             self.metadata,
             input_column=self.input_column_name,
-            batch_size=encode_kwargs["batch_size"],
+            **encode_kwargs,
         )
         logger.info("Running multilabel classification - Encoding test set...")
@@ -157,12 +164,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         logger.info("Running multilabel classification - Evaluating classifiers...")
         all_predictions = []
-        for i_experiment, sample_indices in enumerate(train_samples):
+        for _, sample_indices in enumerate(train_samples):
             X_train = np.stack([unique_train_embeddings[idx] for idx in sample_indices])
             y_train = train_split.select(sample_indices)[self.label_column_name]
             y_train = binarizer.transform(y_train)
             y_pred, current_classifier = _evaluate_classifier(
-                X_train, y_train, X_test, self.evaluator
+                X_train, y_train, X_test, self.evaluator_model
             )
             if prediction_folder:
                 all_predictions.append(y_pred.tolist())
@@ -182,19 +189,20 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
             )
         avg_scores: dict[str, Any] = {
-            k: np.mean([s[k] for s in scores]) for k in scores[0].keys()
+            k: np.mean([s[k] for s in scores])  # type: ignore[literal-required]
+            for k in scores[0].keys()
         }
         logger.info("Running multilabel classification - Finished.")
         return FullMultilabelClassificationMetrics(
             scores_per_experiment=scores,
-            **avg_scores,
+            **avg_scores,  # type: ignore[typeddict-item]
         )
-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray,
         y_pred: np.ndarray,
-        x_test_embedding: np.ndarray,
+        x_test_embedding: Array,
         current_classifier: SklearnModelProtocol,
     ) -> MultilabelClassificationMetrics:
         accuracy = current_classifier.score(x_test_embedding, y_test)
@@ -207,10 +215,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         else:
             lrap = label_ranking_average_precision_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average="macro")
+        hamming = hamming_score(y_test, y_pred)
         return MultilabelClassificationMetrics(
             accuracy=accuracy,
             lrap=lrap,
             f1=f1,
+            hamming=hamming,
         )
     def _undersample_data_indices(
@@ -218,6 +228,8 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
     ) -> tuple[list[int], list[int]]:
         """Undersample data to have samples_per_label samples of each label.
+        Currently ensures that each label has at least samples_per_label samples.
         Returns:
             A tuple containing:
                 - List of sampled indices.
@@ -225,10 +237,9 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         """
         sample_indices = []
         if idxs is None:
-            idxs = np.arange(len(y))
+            idxs = list(np.arange(len(y)))
         self.np_rng.shuffle(idxs)
-        idxs = idxs.tolist()
-        label_counter = defaultdict(int)
+        label_counter: dict[int, int] = defaultdict(int)
         for i in idxs:
             if any((label_counter[label] < samples_per_label) for label in y[i]):
                 sample_indices.append(i)

mteb/abstasks/pair_classification.py CHANGED Viewed

@@ -18,7 +18,8 @@ from mteb.abstasks._statistics_calculation import (
 )
 from mteb.abstasks.abstask import AbsTask
 from mteb.models.model_meta import ScoringFunction
-from mteb.models.models_protocols import EncoderProtocol
+from mteb.models.models_protocols import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs, PromptType
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -35,7 +36,7 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
     Attributes:
         num_samples: number of samples in the dataset.
         number_of_characters: Total number of symbols in the dataset.
-        unique_text_pairs: Number of unique pairs
+        unique_pairs: Number of unique pairs
         text1_statistics: Statistics for sentence1
         text2_statistics: Statistics for sentence2
@@ -43,8 +44,8 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
     """
     num_samples: int
-    number_of_characters: int
-    unique_pairs: int
+    number_of_characters: int | None
+    unique_pairs: int | None
     text1_statistics: TextStatistics | None
     image1_statistics: ImageStatistics | None
@@ -65,24 +66,31 @@ class AbsTaskPairClassification(AbsTask):
         input2_column_name: The name of the column containing the second sentence in the pair.
         label_column_name: The name of the column containing the labels for the pairs. Labels should be 0 or 1.
         abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
+        input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
+        input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
     """
     abstask_prompt = "Retrieve text that are semantically similar to the given text."
     input1_column_name: str = "sentence1"
     input2_column_name: str = "sentence2"
     label_column_name: str = "labels"
+    input1_prompt_type: PromptType | None = None
+    input2_prompt_type: PromptType | None = None
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        encode_kwargs: dict[str, str],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> dict[str, float]:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         if self.metadata.modalities == ["text"]:
             # for compatibility with v1 version where datasets were stored in a single row
             data_split = data_split[0] if len(data_split) == 1 else data_split
@@ -93,6 +101,8 @@ class AbsTaskPairClassification(AbsTask):
             task_metadata=self.metadata,
             hf_split=hf_split,
             hf_subset=hf_subset,
+            input1_prompt_type=self.input1_prompt_type,
+            input2_prompt_type=self.input2_prompt_type,
             **kwargs,
         )
         similarity_scores = evaluator(model, encode_kwargs=encode_kwargs)
@@ -113,7 +123,7 @@ class AbsTaskPairClassification(AbsTask):
         self, similarity_scores: PairClassificationDistances, labels: list[int]
     ) -> dict[str, float]:
         logger.info("Computing metrics...")
-        labels = np.asarray(labels)
+        np_labels = np.asarray(labels)
         output_scores = {}
         max_scores = defaultdict(list)
         for short_name, scores, reverse in [
@@ -135,7 +145,7 @@ class AbsTaskPairClassification(AbsTask):
             ],
             [ScoringFunction.DOT_PRODUCT.value, similarity_scores["dot_scores"], True],
         ]:
-            metrics = self._compute_metrics_values(scores, labels, reverse)
+            metrics = self._compute_metrics_values(scores, np_labels, reverse)  # type: ignore[arg-type]
             for metric_name, metric_value in metrics.items():
                 output_scores[f"{short_name}_{metric_name}"] = metric_value
                 max_scores[metric_name].append(metric_value)
@@ -230,6 +240,12 @@ class AbsTaskPairClassification(AbsTask):
     def _push_dataset_to_hub(self, repo_name: str) -> None:
         # previously pair classification datasets were stored in a single row
+        if self.dataset is None:
+            # overall this shouldn't happen as we check for dataset before pushing to hub
+            # added here for type checking purposes
+            raise RuntimeError(
+                "Dataset not loaded. To load dataset run `task.load_data()`."
+            )
         if self.metadata.is_multilingual:
             for subset in self.dataset:
                 for split in self.dataset[subset]:
@@ -283,13 +299,13 @@ class AbsTaskPairClassification(AbsTask):
         )
     def _find_best_acc_and_threshold(
-        self, scores: np.ndarray, labels: np.ndarray, high_score_more_similar: bool
+        self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool
     ) -> tuple[float, float]:
         rows = list(zip(scores, labels))
         rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
         max_acc = 0
-        best_threshold = -1
+        best_threshold = -1.0
         positive_so_far = 0
         remaining_negatives = sum(np.array(labels) == 0)
@@ -316,7 +332,7 @@ class AbsTaskPairClassification(AbsTask):
         rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
-        best_f1 = best_precision = best_recall = 0
+        best_f1 = best_precision = best_recall = 0.0
         threshold = 0
         nextract = 0
         ncorrect = 0

mteb/abstasks/regression.py CHANGED Viewed

@@ -84,10 +84,10 @@ class AbsTaskRegression(AbsTaskClassification):
         n_samples: Number of samples to use for training the regression model. If the dataset has fewer samples than n_samples, all samples are used.
         abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
         evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LinearRegression`.
-            Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
     """
-    evaluator: type[SklearnModelProtocol] = SklearnEvaluator
+    evaluator: type[SklearnEvaluator] = SklearnEvaluator
     evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1)
     train_split: str = "train"
@@ -113,7 +113,7 @@ class AbsTaskRegression(AbsTaskClassification):
             )["train"]
         return train_split_sampled, []
-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray | list[int],
         y_pred: np.ndarray,
@@ -183,7 +183,7 @@ class AbsTaskRegression(AbsTaskClassification):
         return dataset_dict
-    def _calculate_descriptive_statistics_from_split(
+    def _calculate_descriptive_statistics_from_split(  # type: ignore[override]
         self, split: str, hf_subset: str | None = None, compute_overall: bool = False
     ) -> RegressionDescriptiveStatistics:
         train_text = []

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl