mteb 2.5.3__py3-none-any.whl → 2.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +27 -21
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +3 -16
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +20 -16
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +2 -2
  32. mteb/cache.py +20 -18
  33. mteb/cli/_display_tasks.py +2 -2
  34. mteb/cli/build_cli.py +5 -5
  35. mteb/cli/generate_model_card.py +6 -4
  36. mteb/deprecated_evaluator.py +56 -43
  37. mteb/evaluate.py +35 -29
  38. mteb/filter_tasks.py +25 -26
  39. mteb/get_tasks.py +25 -27
  40. mteb/languages/language_scripts.py +5 -3
  41. mteb/leaderboard/app.py +1 -1
  42. mteb/load_results.py +12 -12
  43. mteb/models/abs_encoder.py +2 -2
  44. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  45. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  46. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
  47. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
  48. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  49. mteb/models/get_model_meta.py +8 -1
  50. mteb/models/instruct_wrapper.py +11 -5
  51. mteb/models/model_implementations/andersborges.py +2 -2
  52. mteb/models/model_implementations/blip_models.py +8 -8
  53. mteb/models/model_implementations/bm25.py +1 -1
  54. mteb/models/model_implementations/clip_models.py +3 -3
  55. mteb/models/model_implementations/cohere_models.py +1 -1
  56. mteb/models/model_implementations/cohere_v.py +2 -2
  57. mteb/models/model_implementations/dino_models.py +23 -23
  58. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  59. mteb/models/model_implementations/jina_clip.py +1 -1
  60. mteb/models/model_implementations/jina_models.py +1 -1
  61. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  62. mteb/models/model_implementations/llm2clip_models.py +3 -3
  63. mteb/models/model_implementations/moco_models.py +2 -2
  64. mteb/models/model_implementations/model2vec_models.py +1 -1
  65. mteb/models/model_implementations/nomic_models.py +8 -8
  66. mteb/models/model_implementations/openclip_models.py +7 -7
  67. mteb/models/model_implementations/random_baseline.py +3 -3
  68. mteb/models/model_implementations/rasgaard_models.py +1 -1
  69. mteb/models/model_implementations/repllama_models.py +2 -2
  70. mteb/models/model_implementations/rerankers_custom.py +3 -3
  71. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  72. mteb/models/model_implementations/siglip_models.py +10 -10
  73. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  74. mteb/models/model_implementations/voyage_v.py +4 -4
  75. mteb/models/model_meta.py +11 -12
  76. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
  77. mteb/models/search_wrappers.py +22 -10
  78. mteb/models/sentence_transformer_wrapper.py +9 -4
  79. mteb/py.typed +0 -0
  80. mteb/results/benchmark_results.py +25 -19
  81. mteb/results/model_result.py +49 -21
  82. mteb/results/task_result.py +45 -51
  83. mteb/similarity_functions.py +11 -7
  84. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  85. mteb/tasks/classification/est/estonian_valence.py +1 -1
  86. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  87. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  88. mteb/tasks/retrieval/code/code_rag.py +12 -12
  89. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  90. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  91. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  92. mteb/tasks/retrieval/nob/norquad.py +2 -2
  93. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  94. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  95. mteb/types/_result.py +2 -1
  96. mteb/types/statistics.py +9 -3
  97. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
  98. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/RECORD +102 -101
  99. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
  100. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
  101. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
  102. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/_create_dataloaders.py
@@ -1,4 +1,5 @@
 import logging
+import warnings
 from collections.abc import Callable
 from typing import Any, cast

@@ -113,11 +114,8 @@ def _create_text_dataloader_for_queries(
 )


-_warned_about_user_role = False
-
-
 def _convert_conv_history_to_query(
-    row: dict[str, list[str] | Conversation],
+    row: dict[str, str | list[str] | Conversation],
 ) -> dict[str, str | Conversation]:
     """Convert a conversation history to a single query string.

@@ -127,21 +125,18 @@ def _convert_conv_history_to_query(
     Returns:
         The updated row with the "query" and "text" fields set to the conversation string, and the "conversation" field set to the list of ConversationTurn.
     """
-    global _warned_about_user_role
-
     conversation = row["text"]
     # if it's a list of strings, just join them
     if isinstance(conversation, list) and isinstance(conversation[0], str):
-        conversation = cast(list[str], conversation)
-        conv_str = "; ".join(conversation)
+        conversation_ = cast(list[str], conversation)
+        conv_str = "; ".join(conversation_)
         current_conversation = [
-            ConversationTurn(role="user", content=message) for message in conversation
+            ConversationTurn(role="user", content=message) for message in conversation_
         ]
-        if not _warned_about_user_role:
-            logger.warning(
-                "Conversations are a list of strings. Used 'user' role for all turns."
-            )
-            _warned_about_user_role = True
+        warnings.warn(
+            "Conversations are a list of strings. Used 'user' role for all turns.",
+            category=UserWarning,
+        )
     # otherwise, it's a list of dictionaries, which we need to convert to strings
     elif isinstance(conversation, list) and isinstance(conversation[0], dict):
         conv = []
@@ -178,7 +173,7 @@ def _convert_conv_history_to_query(

     row["text"] = conv_str
     row["conversation"] = current_conversation
-    return row
+    return cast(dict[str, str | list[ConversationTurn]], row)


 def _create_dataloader_for_queries_conversation(
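
Note on the change above: the module-level `_warned_about_user_role` flag is replaced with `warnings.warn`, which delegates de-duplication to Python's warning filters (the default filter reports a given warning only once per call site). The sketch below only illustrates that standard-library behaviour; `join_turns` is a hypothetical helper, not part of mteb.

    import warnings


    def join_turns(turns: list[str]) -> str:
        # Under the default warning filter this UserWarning is shown only once
        # per call site (message, category, module, lineno), so repeated calls
        # do not flood the output.
        warnings.warn(
            "Conversations are a list of strings. Used 'user' role for all turns.",
            category=UserWarning,
        )
        return "; ".join(turns)


    for query in (["hi", "how are you?"], ["find the docs"]):
        print(join_turns(query))

Callers that accept the all-'user' fallback can silence it with `warnings.filterwarnings("ignore", message="Conversations are a list of strings.*")`.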
mteb/_evaluators/any_sts_evaluator.py
@@ -57,10 +57,7 @@ class AnySTSEvaluator(Evaluator):
         self.input2_prompt_type = input2_prompt_type

     def __call__(
-        self,
-        model: EncoderProtocol,
-        *,
-        encode_kwargs: dict[str, Any],
+        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
     ) -> STSEvaluatorScores:
         logger.info("Running semantic similarity - Encoding samples (1/2)")
         embeddings1 = model.encode(
mteb/_evaluators/evaluator.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable, Mapping
 from typing import Any

 from mteb.abstasks.abstask import _set_seed
@@ -18,7 +19,7 @@ class Evaluator(ABC):
     @abstractmethod
     def __call__(
         self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
-    ) -> dict[str, float]:
+    ) -> Mapping[str, float] | Iterable[Any]:
         """This is called during training to evaluate the model.

         It returns scores.
mteb/_evaluators/image/imagetext_pairclassification_evaluator.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any

 import torch
@@ -61,8 +62,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
     def __init__(
         self,
         dataset,
-        images_column_names: str | list[str],
-        texts_column_names: str | list[str],
+        images_column_names: str | Sequence[str],
+        texts_column_names: str | Sequence[str],
         num_images_per_sample: int,
         num_texts_per_sample: int,
         task_metadata: TaskMetadata,
@@ -82,10 +83,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
         self.hf_split = hf_split
         self.hf_subset = hf_subset

-    def __call__(
-        self,
-        model: EncoderProtocol,
-        encode_kwargs: dict[str, Any],
+    def __call__(  # type: ignore[override]
+        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
     ) -> list[torch.Tensor]:
         images = []
         if isinstance(self.images_column_names, str):
mteb/_evaluators/pair_classification_evaluator.py
@@ -148,7 +148,9 @@ class PairClassificationEvaluator(Evaluator):
         hf_subset: str,
         **encode_kwargs: Any,
     ) -> np.ndarray:
-        index_map, all_unique_texts, all_texts_indexes = {}, [], []
+        index_map = {}
+        all_unique_texts: list[str] = []
+        all_texts_indexes = []
         for text in all_texts:
             text_hash = hash(text)
             if text_hash not in index_map:
mteb/_evaluators/retrieval_metrics.py
@@ -1,5 +1,6 @@
 import logging
 from collections import defaultdict
+from collections.abc import Mapping
 from typing import Any

 import numpy as np
@@ -15,7 +16,7 @@ logger = logging.getLogger(__name__)

 def mrr(
     qrels: RelevantDocumentsType,
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     k_values: list[int],
 ) -> dict[str, list[float]]:
     mrr_metrics = defaultdict(list)
@@ -32,7 +33,7 @@ def mrr(
             doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0
         }
         for k in k_values:
-            rr = 0
+            rr = 0.0
            for rank, hit in enumerate(top_hits[query_id][0:k]):
                if hit[0] in query_relevant_docs:
                    rr = 1.0 / (rank + 1)
@@ -45,8 +46,8 @@ def recall_cap(
     qrels: RelevantDocumentsType,
     results: dict[str, dict[str, float]],
     k_values: list[int],
-) -> dict[str, list[float]]:
-    capped_recall = defaultdict(list)
+) -> dict[str, list[float | None]]:
+    capped_recall: dict[str, list[float | None]] = defaultdict(list)

     k_max = max(k_values)

@@ -188,7 +189,7 @@ def evaluate_p_mrr_change(
     Returns:
         A dictionary with the scores, including "p-MRR", "og" and "changed" keys.
     """
-    followir_scores = defaultdict(dict)
+    followir_scores: dict[str, float | dict[str, float]] = defaultdict(dict)

     qrels_sep = {
         "og": {k: v for k, v in qrels.items() if k.endswith("-og")},
@@ -227,7 +228,7 @@ def evaluate_p_mrr_change(
             ndcg, _map, recall, precision, naucs, avg_mrr, naucs_mrr, cv_recall, {}
         )
         for key, value in scores_dict.items():
-            followir_scores[name][key] = value
+            followir_scores[name][key] = value  # type: ignore[index]

     return followir_scores

@@ -254,8 +255,8 @@ def confidence_scores(sim_scores: list[float]) -> dict[str, float]:
     sim_scores_sorted = sorted(sim_scores)[::-1]

     cs_max = sim_scores_sorted[0]
-    cs_std = np.std(sim_scores)
-    cs_diff1 = None
+    cs_std = float(np.std(sim_scores))
+    cs_diff1 = 0.0
     if len(sim_scores) > 1:
         cs_diff1 = sim_scores_sorted[0] - sim_scores_sorted[1]
     elif len(sim_scores) == 1:
@@ -410,7 +411,7 @@ def make_score_dict(
     cv_recall: dict[str, float],
     task_scores: dict[str, float],
     previous_results_model_meta: dict[str, Any] | None = None,
-) -> dict[str, float]:
+) -> dict[str, Any]:
     return {
         **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
         **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
@@ -528,7 +529,7 @@ def max_over_subqueries(


 def calculate_retrieval_scores(
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     qrels: RelevantDocumentsType,
     k_values: list[int],
     skip_first_result: bool = False,
@@ -576,7 +577,7 @@ def calculate_retrieval_scores(


 def evaluate_abstention(
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     metric_scores: dict[str, list[float]],
 ) -> dict[str, float]:
     """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997
@@ -591,21 +592,21 @@ def evaluate_abstention(
     all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())]
     all_conf_scores = [confidence_scores(sim_scores) for sim_scores in all_sim_scores]
     conf_fcts = list(all_conf_scores[0].keys())
-    all_conf_scores = {
+    all_conf_scores_ = {
         fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts
     }
-    metric_scores = {k: np.array(v) for k, v in metric_scores.items()}
+    metric_scores_ = {k: np.array(v) for k, v in metric_scores.items()}
     naucs = {}

-    for metric_name, scores in metric_scores.items():
-        for fct, conf_scores in all_conf_scores.items():
+    for metric_name, scores in metric_scores_.items():
+        for fct, conf_scores in all_conf_scores_.items():
             naucs[f"nAUC_{metric_name}_{fct}"] = nauc(conf_scores, scores)

     return naucs


 def calculate_cv_recall(
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     qrels: RelevantDocumentsType,
     k_values: list[int],
     skip_first_result: bool = False,
mteb/_evaluators/sklearn_evaluator.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Protocol
+from typing import Any, Protocol, cast

 import numpy as np
 from datasets import Dataset
@@ -9,7 +9,7 @@ from typing_extensions import Self
 from mteb._create_dataloaders import create_dataloader
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
-from mteb.types import BatchedInput
+from mteb.types import Array, BatchedInput

 from .evaluator import Evaluator

@@ -17,11 +17,11 @@ logger = logging.getLogger(__name__)


 class SklearnModelProtocol(Protocol):
-    def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None: ...  # noqa: N803
-    def predict(self, X: np.ndarray) -> np.ndarray: ...  # noqa: N803
+    def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ...  # noqa: N803
+    def predict(self, X: Array) -> np.ndarray: ...  # noqa: N803
     def get_params(self) -> dict[str, Any]: ...
-    def set_params(self, **kwargs: dict[str, Any]) -> Self: ...
-    def score(self, X: np.ndarray, y: np.ndarray | list[int]) -> float: ...  # noqa: N803
+    def set_params(self, random_state: int, **kwargs: dict[str, Any]) -> Self: ...
+    def score(self, X: Array, y: np.ndarray | list[int]) -> float: ...  # noqa: N803


 class SklearnEvaluator(Evaluator):
@@ -71,8 +71,8 @@ class SklearnEvaluator(Evaluator):
         model: EncoderProtocol,
         *,
         encode_kwargs: dict[str, Any],
-        test_cache: np.ndarray | None = None,
-    ) -> tuple[np.ndarray, np.ndarray]:
+        test_cache: Array | None = None,
+    ) -> tuple[np.ndarray, Array]:
         """Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set.

         Args:
@@ -104,6 +104,7 @@ class SklearnEvaluator(Evaluator):
             hf_subset=self.hf_subset,
             **encode_kwargs,
         )
+        test_cache = cast(Array, test_cache)

         logger.info("Running - Fitting classifier...")
         y_train = self.train_dataset[self.label_column_name]
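
For readers unfamiliar with `typing.Protocol`: `SklearnModelProtocol` is matched structurally, so any estimator-like object with compatible method signatures satisfies it without inheriting from it. Below is a minimal sketch with a trimmed two-method protocol; `MajorityClassifier` is a made-up class, not an mteb or scikit-learn one.

    from __future__ import annotations

    from typing import Protocol

    import numpy as np


    class TrimmedModelProtocol(Protocol):
        def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None: ...  # noqa: N803
        def predict(self, X: np.ndarray) -> np.ndarray: ...  # noqa: N803


    class MajorityClassifier:
        """Toy classifier that always predicts the most frequent training label."""

        def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None:  # noqa: N803
            values, counts = np.unique(np.asarray(y), return_counts=True)
            self._majority = values[np.argmax(counts)]

        def predict(self, X: np.ndarray) -> np.ndarray:  # noqa: N803
            return np.full(len(X), self._majority)


    # Accepted by a type checker purely because the method signatures match.
    clf: TrimmedModelProtocol = MajorityClassifier()
    clf.fit(np.zeros((4, 2)), [1, 0, 1, 1])
    print(clf.predict(np.zeros((3, 2))))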
mteb/_evaluators/text/bitext_mining_evaluator.py
@@ -1,7 +1,6 @@
 import logging
 from typing import Any

-import numpy as np
 import torch
 from datasets import Dataset
 from tqdm.auto import tqdm
@@ -10,6 +9,7 @@ from mteb._create_dataloaders import _create_dataloader_from_texts
 from mteb._evaluators.evaluator import Evaluator
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
+from mteb.types import Array

 logger = logging.getLogger(__name__)

@@ -69,11 +69,11 @@ class BitextMiningEvaluator(Evaluator):

     def _similarity_search(
         self,
-        query_embeddings: np.ndarray,
-        corpus_embeddings: np.ndarray,
+        query_embeddings: Array,
+        corpus_embeddings: Array,
         model: EncoderProtocol,
         query_chunk_size: int = 100,
-        corpus_chunk_size: int = 500000,
+        corpus_chunk_size: int = 500_000,
     ) -> list[dict[str, float]]:
         """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.

@@ -104,13 +104,15 @@ class BitextMiningEvaluator(Evaluator):
         ):
             query_embeddings = query_embeddings.to(corpus_embeddings.device)

-        queries_result_list = [[] for _ in range(len(query_embeddings))]
+        queries_result_list: list[list[dict[str, float]]] = [
+            [] for _ in range(len(query_embeddings))
+        ]

         for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
             # Iterate over chunks of the corpus
             for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
                 # Compute cosine similarities
-                similarity_scores = model.similarity(  # type: ignore
+                similarity_scores = model.similarity(
                     query_embeddings[
                         query_start_idx : query_start_idx + query_chunk_size
                     ],
@@ -120,15 +122,17 @@ class BitextMiningEvaluator(Evaluator):
                 )

                 # Get top-k scores
-                cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
-                    torch.tensor(similarity_scores),
-                    1,
-                    dim=1,
-                    largest=True,
-                    sorted=False,
+                cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = (
+                    torch.topk(
+                        torch.tensor(similarity_scores),
+                        1,
+                        dim=1,
+                        largest=True,
+                        sorted=False,
+                    )
                 )
-                cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
-                cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
+                cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()
+                cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()

                 for query_itr in range(len(similarity_scores)):
                     for sub_corpus_id, score in zip(
@@ -141,11 +145,14 @@
                             {"corpus_id": corpus_id, "score": score}
                         )

+        result_queries_list: list[dict[str, float]] = [
+            {} for _ in range(len(query_embeddings))
+        ]
         # Sort and strip to top_k results
         for idx in range(len(queries_result_list)):
             queries_result_list[idx] = sorted(
                 queries_result_list[idx], key=lambda x: x["score"], reverse=True
             )
-            queries_result_list[idx] = queries_result_list[idx][0]
+            result_queries_list[idx] = queries_result_list[idx][0]

-        return queries_result_list
+        return result_queries_list
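
The reshaped `torch.topk` call above returns a `(values, indices)` pair; with `k=1` along `dim=1` it keeps the single best-scoring corpus entry per query row. A standalone sketch with made-up scores:

    import torch

    # Made-up 2 x 3 similarity matrix: rows are queries, columns are corpus entries.
    scores = torch.tensor([[0.1, 0.9, 0.3],
                           [0.7, 0.2, 0.4]])

    # topk returns (values, indices); k=1 keeps only the best column per row.
    values, indices = torch.topk(scores, 1, dim=1, largest=True, sorted=False)
    print(values.squeeze(1).tolist())   # highest score in each row
    print(indices.squeeze(1).tolist())  # column index of that score, e.g. [1, 0]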
mteb/_evaluators/text/summarization_evaluator.py
@@ -135,10 +135,10 @@
         )

         # Split the embeddings into the original human & machine summaries
-        embs_human_summaries_all = np.split(
+        embs_human_summaries_all_split = np.split(
             embs_human_summaries_all, np.cumsum(human_lens)[:-1]
         )
-        embs_machine_summaries_all = np.split(
+        embs_machine_summaries_all_split = np.split(
             embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]
         )

@@ -148,7 +148,9 @@ class SummarizationEvaluator(Evaluator):
         all_human_scores = []

         for i, (embs_human_summaries, embs_machine_summaries) in tqdm(
-            enumerate(zip(embs_human_summaries_all, embs_machine_summaries_all)),
+            enumerate(
+                zip(embs_human_summaries_all_split, embs_machine_summaries_all_split)
+            ),
             desc="Scoring",
             total=len(self.human_summaries),
         ):
@@ -164,7 +166,7 @@ class SummarizationEvaluator(Evaluator):
                 dot_scores = dot_score(emb_machine_summary, embs_human_summaries)

                 _sim_score = [
-                    float(model.similarity(emb_machine_summary, emb_human_summary))  # type: ignore
+                    float(model.similarity(emb_machine_summary, emb_human_summary))
                     for emb_human_summary in embs_human_summaries
                 ]
                 sim_score = torch.tensor(_sim_score)
@@ -216,17 +218,19 @@ class SummarizationEvaluator(Evaluator):
             strict=True,
         ):
             cosine_spearman_scores.append(
-                spearmanr(human_scores, cosine_pred_scores).statistic
+                float(spearmanr(human_scores, cosine_pred_scores).statistic)
             )
             cosine_pearson_scores.append(
-                pearsonr(human_scores, cosine_pred_scores).statistic
+                float(pearsonr(human_scores, cosine_pred_scores).statistic)
             )
             dot_spearman_scores.append(
-                spearmanr(human_scores, dot_pred_scores).statistic
+                float(spearmanr(human_scores, dot_pred_scores).statistic)
+            )
+            dot_pearson_scores.append(
+                float(pearsonr(human_scores, dot_pred_scores).statistic)
             )
-            dot_pearson_scores.append(pearsonr(human_scores, dot_pred_scores).statistic)
-            spearman_scores.append(spearmanr(human_scores, sim_scores).statistic)
-            pearson_scores.append(pearsonr(human_scores, sim_scores).statistic)
+            spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic))
+            pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic))

         return SummarizationMetrics(
             pearson=float(np.mean(pearson_scores)),
@@ -273,10 +277,10 @@ class DeprecatedSummarizationEvaluator(SummarizationEvaluator):
             pearson_scores.append(pearsonr(human_scores, sim_scores))

         return SummarizationMetrics(
-            pearson=float(np.mean(pearson_scores)),
-            spearman=float(np.mean(spearman_scores)),
-            cosine_spearman=float(np.mean(cosine_spearman_scores)),
-            cosine_pearson=float(np.mean(cosine_pearson_scores)),
-            dot_pearson=float(np.mean(dot_pearson_scores)),
-            dot_spearman=float(np.mean(dot_spearman_scores)),
+            pearson=float(np.mean(pearson_scores)),  # type: ignore[arg-type]
+            spearman=float(np.mean(spearman_scores)),  # type: ignore[arg-type]
+            cosine_spearman=float(np.mean(cosine_spearman_scores)),  # type: ignore[arg-type]
+            cosine_pearson=float(np.mean(cosine_pearson_scores)),  # type: ignore[arg-type]
+            dot_pearson=float(np.mean(dot_pearson_scores)),  # type: ignore[arg-type]
+            dot_spearman=float(np.mean(dot_spearman_scores)),  # type: ignore[arg-type]
         )
mteb/abstasks/_data_filter/filters.py
@@ -61,7 +61,7 @@ def filter_unclear_label(
     for text, label in zip(ds[input_column], ds[label_column]):
         key = text.strip().lower()
         normalized.setdefault(key, set()).add(
-            label if isinstance(label, (str, int, float)) else tuple(label)
+            label if isinstance(label, (str, int, float)) else tuple(label)  # type: ignore[arg-type]
         )

     bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
mteb/abstasks/_data_filter/task_pipelines.py
@@ -89,6 +89,9 @@ def process_classification(
         subset=None,
     )

+    if task.dataset is None:
+        raise ValueError("Task dataset is None.")
+
     new_ds = {}
     for subset in task.dataset:
         new_ds[subset] = clean_dataset(
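
The added `task.dataset is None` guard is the usual fail-fast way to narrow an optional attribute before iterating over it. A minimal sketch under assumed types (`Task` here is a stand-in dataclass, not the mteb class):

    from __future__ import annotations

    from dataclasses import dataclass


    @dataclass
    class Task:
        dataset: dict[str, list[str]] | None = None


    def subset_names(task: Task) -> list[str]:
        # Raising when the attribute is None fails fast and lets a type checker
        # treat task.dataset as a plain dict for the rest of the function.
        if task.dataset is None:
            raise ValueError("Task dataset is None.")
        return list(task.dataset)


    print(subset_names(Task(dataset={"train": ["a"], "test": ["b"]})))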
mteb/abstasks/_statistics_calculation.py
@@ -2,7 +2,8 @@ from __future__ import annotations

 import hashlib
 from collections import Counter
-from typing import TYPE_CHECKING
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, cast

 from mteb.types import TopRankedDocumentsType
 from mteb.types.statistics import (
@@ -52,7 +53,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics:
     seen_hashes: set[str] = set()

     for img in images:
-        width, height = img.size  # type: ignore
+        width, height = img.size
         img_heights.append(height)
         img_widths.append(width)

@@ -82,17 +83,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics
         LabelStatistics: A dictionary containing the descriptive statistics.

     """
+    total_labels: list[int | None] = []
+
     if not isinstance(labels[0], list):
-        label_len = [1] * len(labels)
-        total_label_len = len(labels)
-        total_labels = labels
+        # single label classification
+        single_label = cast(list[int], labels)
+        label_len = [1] * len(single_label)
+        total_label_len = len(single_label)
+        total_labels.extend(single_label)
     elif isinstance(labels[0], list):
         # multilabel classification
-        label_len = [len(l) for l in labels]
+        multilabel_labels = cast(list[list[int]], labels)
+        label_len = [len(l) for l in multilabel_labels]
         total_label_len = sum(label_len)
-        total_labels = []
-        for l in labels:
-            total_labels.extend(l if len(l) > 0 else [None])
+        for l in multilabel_labels:
+            if l and len(l) > 0:
+                total_labels.extend(l)
+            else:
+                total_labels.append(None)
     else:
         raise ValueError(
             "Labels must be a list of integers or a list of lists of integers."
@@ -159,7 +167,7 @@ def calculate_top_ranked_statistics(


 def calculate_relevant_docs_statistics(
-    relevant_docs: dict[str, dict[str, float]],
+    relevant_docs: Mapping[str, Mapping[str, int]],
 ) -> RelevantDocsStatistics:
     qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs]
     unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})
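
The reworked `calculate_label_statistics` above relies on `typing.cast` to tell the type checker which branch of the `list[int | list[int]]` union applies after the `isinstance` check; `cast` is a no-op at runtime. A minimal sketch (`label_lengths` is a hypothetical helper, not the mteb function):

    from typing import cast


    def label_lengths(labels: list[int | list[int]]) -> list[int]:
        # cast() does nothing at runtime; it only narrows the union for the type
        # checker after isinstance() has already distinguished the two cases.
        if isinstance(labels[0], list):
            multilabel = cast(list[list[int]], labels)
            return [len(row) for row in multilabel]
        single = cast(list[int], labels)
        return [1] * len(single)


    print(label_lengths([1, 0, 2]))          # [1, 1, 1]
    print(label_lengths([[1, 2], [], [3]]))  # [2, 0, 1]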
mteb/abstasks/_stratification.py
@@ -39,6 +39,7 @@ Bibtex:
 """

 import itertools
+from typing import Any

 import numpy as np
 import scipy.sparse as sp
@@ -119,8 +120,10 @@ def _get_most_desired_combination(samples_with_combination: dict):
         if support_size == 0:
             continue
         if currently_chosen is None or (
-            best_number_of_combinations < number_of_combinations  # type: ignore
-            and best_support_size > support_size  # type: ignore
+            best_number_of_combinations is not None
+            and best_support_size is not None
+            and best_number_of_combinations < number_of_combinations
+            and best_support_size > support_size
         ):
             currently_chosen = combination
             best_number_of_combinations, best_support_size = (
@@ -162,7 +165,7 @@ class IterativeStratification(_BaseKFold):
         self._rng_state = check_random_state(random_state)
         need_shuffle = shuffle or random_state is not None
         self.order = order
-        super().__init__(  # type: ignore
+        super().__init__(
             n_splits,
             shuffle=need_shuffle,
             random_state=self._rng_state if need_shuffle else None,
@@ -172,8 +175,7 @@ class IterativeStratification(_BaseKFold):
             self.percentage_per_fold = sample_distribution_per_fold
         else:
             self.percentage_per_fold = [
-                1 / float(self.n_splits)
-                for _ in range(self.n_splits)  # type: ignore
+                1 / float(self.n_splits) for _ in range(self.n_splits)
             ]

     def _prepare_stratification(
@@ -182,9 +184,9 @@
         list[list[int]],
         dict[int, bool],
         list[list[int]],
-        list[list[list[int]]],
-        dict[tuple[int, ...], list[int]],
-        list[list[int]],
+        list[list[Any]],
+        dict[str, list[Any]],
+        list[list[Any]],
     ]:
         """Prepares variables for performing stratification

@@ -206,14 +208,14 @@
         """
         self.n_samples, self.n_labels = y.shape
         self.desired_samples_per_fold = np.array(
-            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]  # type: ignore
+            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
         )
         rows = sp.lil_matrix(y).rows
         rows_used = dict.fromkeys(range(self.n_samples), False)
         all_combinations = []
-        per_row_combinations = [[] for i in range(self.n_samples)]
-        samples_with_combination = {}
-        folds = [[] for _ in range(self.n_splits)]  # type: ignore
+        per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)]
+        samples_with_combination: dict[str, list[Any]] = {}
+        folds: list[list[int]] = [[] for _ in range(self.n_splits)]

         # for every row
         for sample_index, label_assignment in enumerate(rows):
@@ -229,21 +231,19 @@
                 all_combinations.append(combination)
                 per_row_combinations[sample_index].append(combination)

-        all_combinations = [list(x) for x in set(all_combinations)]
-
         self.desired_samples_per_combination_per_fold = {
             combination: np.array(
                 [
                     len(evidence_for_combination) * self.percentage_per_fold[j]
-                    for j in range(self.n_splits)  # type: ignore
+                    for j in range(self.n_splits)
                 ]
             )
             for combination, evidence_for_combination in samples_with_combination.items()
         }
         return (
-            rows,
+            rows.tolist(),
             rows_used,
-            all_combinations,
+            [list(x) for x in set(all_combinations)],
             per_row_combinations,
             samples_with_combination,
             folds,
@@ -328,7 +328,7 @@
             per_row_combinations,
             samples_with_combination,
             folds,
-        ) = self._prepare_stratification(y)  # type: ignore
+        ) = self._prepare_stratification(y)

         self._distribute_positive_evidence(
             rows_used, folds, samples_with_combination, per_row_combinations