mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/_evaluators/text/summarization_evaluator.py CHANGED
@@ -1,6 +1,6 @@
 import logging
 import sys
-from typing import Any, TypedDict
+from typing import TypedDict

 import numpy as np
 import torch
@@ -12,6 +12,7 @@ from mteb._evaluators.evaluator import Evaluator
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import cos_sim, dot_score
+from mteb.types import EncodeKwargs

 # if later than python 3.13 use typing module
 if sys.version_info >= (3, 13):
@@ -94,7 +95,7 @@ class SummarizationEvaluator(Evaluator):
         self,
         model: EncoderProtocol,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
     ) -> SummarizationDistances:
         # Get the human & machine summaries for the text in one go for all
         human_lens = [len(human_summaries) for human_summaries in self.human_summaries]
@@ -135,10 +136,10 @@ class SummarizationEvaluator(Evaluator):
         )

         # Split the embeddings into the original human & machine summaries
-        embs_human_summaries_all = np.split(
+        embs_human_summaries_all_split = np.split(
             embs_human_summaries_all, np.cumsum(human_lens)[:-1]
         )
-        embs_machine_summaries_all = np.split(
+        embs_machine_summaries_all_split = np.split(
             embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]
         )

@@ -148,7 +149,9 @@ class SummarizationEvaluator(Evaluator):
         all_human_scores = []

         for i, (embs_human_summaries, embs_machine_summaries) in tqdm(
-            enumerate(zip(embs_human_summaries_all, embs_machine_summaries_all)),
+            enumerate(
+                zip(embs_human_summaries_all_split, embs_machine_summaries_all_split)
+            ),
             desc="Scoring",
             total=len(self.human_summaries),
         ):
@@ -164,7 +167,7 @@ class SummarizationEvaluator(Evaluator):
             dot_scores = dot_score(emb_machine_summary, embs_human_summaries)

             _sim_score = [
-                float(model.similarity(emb_machine_summary, emb_human_summary))  # type: ignore
+                float(model.similarity(emb_machine_summary, emb_human_summary))
                 for emb_human_summary in embs_human_summaries
             ]
             sim_score = torch.tensor(_sim_score)
@@ -216,17 +219,19 @@ class SummarizationEvaluator(Evaluator):
             strict=True,
         ):
             cosine_spearman_scores.append(
-                spearmanr(human_scores, cosine_pred_scores).statistic
+                float(spearmanr(human_scores, cosine_pred_scores).statistic)
             )
             cosine_pearson_scores.append(
-                pearsonr(human_scores, cosine_pred_scores).statistic
+                float(pearsonr(human_scores, cosine_pred_scores).statistic)
             )
             dot_spearman_scores.append(
-                spearmanr(human_scores, dot_pred_scores).statistic
+                float(spearmanr(human_scores, dot_pred_scores).statistic)
+            )
+            dot_pearson_scores.append(
+                float(pearsonr(human_scores, dot_pred_scores).statistic)
             )
-            dot_pearson_scores.append(pearsonr(human_scores, dot_pred_scores).statistic)
-            spearman_scores.append(spearmanr(human_scores, sim_scores).statistic)
-            pearson_scores.append(pearsonr(human_scores, sim_scores).statistic)
+            spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic))
+            pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic))

         return SummarizationMetrics(
             pearson=float(np.mean(pearson_scores)),
@@ -273,10 +278,10 @@ class DeprecatedSummarizationEvaluator(SummarizationEvaluator):
             pearson_scores.append(pearsonr(human_scores, sim_scores))

         return SummarizationMetrics(
-            pearson=float(np.mean(pearson_scores)),
-            spearman=float(np.mean(spearman_scores)),
-            cosine_spearman=float(np.mean(cosine_spearman_scores)),
-            cosine_pearson=float(np.mean(cosine_pearson_scores)),
-            dot_pearson=float(np.mean(dot_pearson_scores)),
-            dot_spearman=float(np.mean(dot_spearman_scores)),
+            pearson=float(np.mean(pearson_scores)),  # type: ignore[arg-type]
+            spearman=float(np.mean(spearman_scores)),  # type: ignore[arg-type]
+            cosine_spearman=float(np.mean(cosine_spearman_scores)),  # type: ignore[arg-type]
+            cosine_pearson=float(np.mean(cosine_pearson_scores)),  # type: ignore[arg-type]
+            dot_pearson=float(np.mean(dot_pearson_scores)),  # type: ignore[arg-type]
+            dot_spearman=float(np.mean(dot_spearman_scores)),  # type: ignore[arg-type]
         )
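
A note on the float(...) wrappers above: scipy's spearmanr and pearsonr return result objects whose .statistic attribute (scipy >= 1.9) is typed as a numpy scalar, so the wrappers keep the score lists as plain Python floats for type checkers and serialized results. A standalone sketch of the pattern (not mteb code):

    from scipy.stats import pearsonr, spearmanr

    human_scores = [1.0, 2.0, 3.0, 4.0]
    pred_scores = [1.1, 1.9, 3.2, 3.8]

    # .statistic is a numpy scalar; float() converts to a builtin float so
    # downstream np.mean aggregation and JSON output stay type-clean.
    spearman = float(spearmanr(human_scores, pred_scores).statistic)
    pearson = float(pearsonr(human_scores, pred_scores).statistic)
    print(spearman, pearson)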
mteb/_evaluators/zeroshot_classification_evaluator.py CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Any

 from datasets import Dataset

@@ -10,7 +9,7 @@ from mteb._create_dataloaders import (
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import similarity
-from mteb.types import Array
+from mteb.types import Array, EncodeKwargs

 from .evaluator import Evaluator

@@ -38,7 +37,10 @@ class ZeroShotClassificationEvaluator(Evaluator):
         self.hf_subset = hf_subset

     def __call__(
-        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
+        self,
+        model: EncoderProtocol,
+        *,
+        encode_kwargs: EncodeKwargs,
     ) -> Array:
         dataloader = create_dataloader(
             self.dataset,
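
Across this release, `encode_kwargs: dict[str, Any]` becomes `encode_kwargs: EncodeKwargs`, imported from `mteb.types` (defined in `mteb.types._encoder_io`, per the abstask.py hunk below). A hypothetical TypedDict of the same flavor shows what the narrower type buys; the field names here are illustrative assumptions, not mteb's actual definition:

    from typing import TypedDict

    class EncodeKwargsSketch(TypedDict, total=False):
        # Hypothetical fields for illustration only; mteb's real EncodeKwargs
        # may declare different keys.
        batch_size: int
        show_progress_bar: bool

    def run_eval(*, encode_kwargs: EncodeKwargsSketch) -> None:
        batch_size = encode_kwargs.get("batch_size", 32)
        print(f"encoding with batch_size={batch_size}")

    run_eval(encode_kwargs={"batch_size": 16})
    # run_eval(encode_kwargs={"batch_szie": 16})  # a type checker flags this typo

Unlike dict[str, Any], a TypedDict lets static checkers validate the keys and value types at every call site.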
mteb/abstasks/_data_filter/filters.py CHANGED
@@ -61,7 +61,7 @@ def filter_unclear_label(
     for text, label in zip(ds[input_column], ds[label_column]):
         key = text.strip().lower()
         normalized.setdefault(key, set()).add(
-            label if isinstance(label, (str, int, float)) else tuple(label)
+            label if isinstance(label, (str, int, float)) else tuple(label)  # type: ignore[arg-type]
         )

     bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
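
The only change above is the `# type: ignore[arg-type]` annotation, but the surrounding pattern is worth spelling out: texts are normalized, and any text that maps to more than one distinct label is treated as unclear and dropped. A self-contained sketch of that idea, with hypothetical data rather than mteb's implementation:

    texts = ["Good movie", "good movie ", "Bad movie"]
    labels = [1, 0, 0]

    normalized: dict[str, set[int]] = {}
    for text, label in zip(texts, labels):
        key = text.strip().lower()
        normalized.setdefault(key, set()).add(label)

    # "good movie" received both 0 and 1, so it is an unclear example.
    bad_texts = {t for t, seen in normalized.items() if len(seen) > 1}
    kept = [(t, l) for t, l in zip(texts, labels) if t.strip().lower() not in bad_texts]
    print(kept)  # [('Bad movie', 0)]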
mteb/abstasks/_data_filter/task_pipelines.py CHANGED
@@ -89,6 +89,9 @@ def process_classification(
         subset=None,
     )

+    if task.dataset is None:
+        raise ValueError("Task dataset is None.")
+
     new_ds = {}
     for subset in task.dataset:
         new_ds[subset] = clean_dataset(
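
The added None guard is as much for the type checker as for runtime: after the raise, `task.dataset` is narrowed from an Optional to a concrete dict, so the following iteration type-checks. A minimal sketch of the same narrowing:

    def subsets(dataset: dict[str, list[int]] | None) -> list[str]:
        if dataset is None:
            raise ValueError("Task dataset is None.")
        # Here `dataset` is narrowed to dict[str, list[int]].
        return list(dataset)

    print(subsets({"default": [1, 2]}))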
mteb/abstasks/_statistics_calculation.py CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations

 import hashlib
 from collections import Counter
-from typing import TYPE_CHECKING
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, cast

 from mteb.types import TopRankedDocumentsType
 from mteb.types.statistics import (
@@ -52,7 +53,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics:
     seen_hashes: set[str] = set()

     for img in images:
-        width, height = img.size  # type: ignore
+        width, height = img.size
         img_heights.append(height)
         img_widths.append(width)

@@ -82,17 +83,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics
         LabelStatistics: A dictionary containing the descriptive statistics.

     """
+    total_labels: list[int | None] = []
+
     if not isinstance(labels[0], list):
-        label_len = [1] * len(labels)
-        total_label_len = len(labels)
-        total_labels = labels
+        # single label classification
+        single_label = cast(list[int], labels)
+        label_len = [1] * len(single_label)
+        total_label_len = len(single_label)
+        total_labels.extend(single_label)
     elif isinstance(labels[0], list):
         # multilabel classification
-        label_len = [len(l) for l in labels]
+        multilabel_labels = cast(list[list[int]], labels)
+        label_len = [len(l) for l in multilabel_labels]
         total_label_len = sum(label_len)
-        total_labels = []
-        for l in labels:
-            total_labels.extend(l if len(l) > 0 else [None])
+        for l in multilabel_labels:
+            if l and len(l) > 0:
+                total_labels.extend(l)
+            else:
+                total_labels.append(None)
     else:
         raise ValueError(
             "Labels must be a list of integers or a list of lists of integers."
@@ -159,7 +167,7 @@ def calculate_top_ranked_statistics(


 def calculate_relevant_docs_statistics(
-    relevant_docs: dict[str, dict[str, float]],
+    relevant_docs: Mapping[str, Mapping[str, int]],
 ) -> RelevantDocsStatistics:
     qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs]
     unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})
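
calculate_label_statistics now uses typing.cast because `isinstance(labels[0], list)` narrows only the element, not the list type itself, so `list[int | list[int]]` must be asserted into the branch-specific type. A standalone sketch of the same technique:

    from typing import cast

    def label_lengths(labels: list[int | list[int]]) -> list[int]:
        if not isinstance(labels[0], list):
            # Checking labels[0] does not narrow the list type, so we
            # assert it with cast for the type checker.
            single = cast(list[int], labels)
            return [1] * len(single)
        multi = cast(list[list[int]], labels)
        return [len(item) for item in multi]

    print(label_lengths([1, 2, 3]), label_lengths([[1], [2, 3]]))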
mteb/abstasks/_stratification.py CHANGED
@@ -39,6 +39,7 @@ Bibtex:
 """

 import itertools
+from typing import Any

 import numpy as np
 import scipy.sparse as sp
@@ -119,8 +120,10 @@ def _get_most_desired_combination(samples_with_combination: dict):
         if support_size == 0:
             continue
         if currently_chosen is None or (
-            best_number_of_combinations < number_of_combinations  # type: ignore
-            and best_support_size > support_size  # type: ignore
+            best_number_of_combinations is not None
+            and best_support_size is not None
+            and best_number_of_combinations < number_of_combinations
+            and best_support_size > support_size
         ):
             currently_chosen = combination
             best_number_of_combinations, best_support_size = (
@@ -162,7 +165,7 @@ class IterativeStratification(_BaseKFold):
         self._rng_state = check_random_state(random_state)
         need_shuffle = shuffle or random_state is not None
         self.order = order
-        super().__init__(  # type: ignore
+        super().__init__(
             n_splits,
             shuffle=need_shuffle,
             random_state=self._rng_state if need_shuffle else None,
@@ -172,8 +175,7 @@
             self.percentage_per_fold = sample_distribution_per_fold
         else:
             self.percentage_per_fold = [
-                1 / float(self.n_splits)
-                for _ in range(self.n_splits)  # type: ignore
+                1 / float(self.n_splits) for _ in range(self.n_splits)
             ]

     def _prepare_stratification(
@@ -182,9 +184,9 @@
         list[list[int]],
         dict[int, bool],
         list[list[int]],
-        list[list[list[int]]],
-        dict[tuple[int, ...], list[int]],
-        list[list[int]],
+        list[list[Any]],
+        dict[str, list[Any]],
+        list[list[Any]],
     ]:
         """Prepares variables for performing stratification

@@ -206,14 +208,14 @@
         """
         self.n_samples, self.n_labels = y.shape
         self.desired_samples_per_fold = np.array(
-            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]  # type: ignore
+            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
         )
         rows = sp.lil_matrix(y).rows
         rows_used = dict.fromkeys(range(self.n_samples), False)
         all_combinations = []
-        per_row_combinations = [[] for i in range(self.n_samples)]
-        samples_with_combination = {}
-        folds = [[] for _ in range(self.n_splits)]  # type: ignore
+        per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)]
+        samples_with_combination: dict[str, list[Any]] = {}
+        folds: list[list[int]] = [[] for _ in range(self.n_splits)]

         # for every row
         for sample_index, label_assignment in enumerate(rows):
@@ -229,21 +231,19 @@
                 all_combinations.append(combination)
                 per_row_combinations[sample_index].append(combination)

-        all_combinations = [list(x) for x in set(all_combinations)]
-
         self.desired_samples_per_combination_per_fold = {
             combination: np.array(
                 [
                     len(evidence_for_combination) * self.percentage_per_fold[j]
-                    for j in range(self.n_splits)  # type: ignore
+                    for j in range(self.n_splits)
                 ]
             )
             for combination, evidence_for_combination in samples_with_combination.items()
         }
         return (
-            rows,
+            rows.tolist(),
             rows_used,
-            all_combinations,
+            [list(x) for x in set(all_combinations)],
             per_row_combinations,
             samples_with_combination,
             folds,
@@ -328,7 +328,7 @@
             per_row_combinations,
             samples_with_combination,
             folds,
-        ) = self._prepare_stratification(y)  # type: ignore
+        ) = self._prepare_stratification(y)

         self._distribute_positive_evidence(
             rows_used, folds, samples_with_combination, per_row_combinations
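
The recurring move in this file is replacing `# type: ignore` comments with explicit `is not None` guards, so that comparisons against best-so-far values type-check honestly instead of being suppressed. A minimal sketch of that refactor under hypothetical data:

    currently_chosen: str | None = None
    best_count: int | None = None
    best_support: int | None = None

    for name, count, support in [("a", 3, 10), ("b", 5, 4), ("c", 4, 8)]:
        # The explicit None checks let the type checker narrow the Optionals
        # before the < and > comparisons, with no ignore comments needed.
        if currently_chosen is None or (
            best_count is not None
            and best_support is not None
            and best_count < count
            and best_support > support
        ):
            currently_chosen, best_count, best_support = name, count, support

    print(currently_chosen)  # "b"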
mteb/abstasks/abstask.py CHANGED
@@ -1,10 +1,11 @@
 import json
 import logging
+import warnings
 from abc import ABC, abstractmethod
-from collections.abc import Sequence
+from collections.abc import Mapping, Sequence
 from copy import copy
 from pathlib import Path
-from typing import Any, cast
+from typing import Any, Literal, cast

 import numpy as np
 from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
@@ -22,6 +23,7 @@ from mteb.models import (
     SearchProtocol,
 )
 from mteb.types import HFSubset, Modalities, ScoresDict
+from mteb.types._encoder_io import EncodeKwargs
 from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics

 logger = logging.getLogger(__name__)
@@ -78,8 +80,8 @@ class AbsTask(ABC):
     """

     metadata: TaskMetadata
-    abstask_prompt: str | None = None
-    _eval_splits: list[str] | None = None
+    abstask_prompt: str
+    _eval_splits: Sequence[str] | None = None
     dataset: dict[HFSubset, DatasetDict] | None = None
     data_loaded: bool = False
     hf_subsets: list[HFSubset]
@@ -102,9 +104,9 @@ class AbsTask(ABC):
     def check_if_dataset_is_superseded(self) -> None:
         """Check if the dataset is superseded by a newer version."""
         if self.superseded_by:
-            logger.warning(
-                f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset."
-            )
+            msg = f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}'. We recommend using the newer version of the dataset unless you are running a specific benchmark. See `get_task('{self.superseded_by}').metadata.description` to get a description of the task and changes."
+            logger.warning(msg)
+            warnings.warn(msg)

     def dataset_transform(self):
         """A transform operations applied to the dataset after loading.
@@ -120,10 +122,10 @@ class AbsTask(ABC):
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
-    ) -> dict[HFSubset, ScoresDict]:
+    ) -> Mapping[HFSubset, ScoresDict]:
         """Evaluates an MTEB compatible model on the task.

         Args:
@@ -195,12 +197,12 @@ class AbsTask(ABC):
     @abstractmethod
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
         hf_split: str,
         hf_subset: str,
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
@@ -210,7 +212,7 @@ class AbsTask(ABC):

     def _save_task_predictions(
         self,
-        predictions: dict[str, Any] | list[Any],
+        predictions: Mapping[str, Any] | list[Any],
         model: MTEBModels,
         prediction_folder: Path,
         hf_split: str,
@@ -226,7 +228,7 @@ class AbsTask(ABC):
             hf_subset: The subset of the dataset (e.g. "en").
         """
         predictions_path = self._predictions_path(prediction_folder)
-        existing_results = {
+        existing_results: dict[str, Any] = {
             "mteb_model_meta": {
                 "model_name": model.mteb_model_meta.name,
                 "revision": model.mteb_model_meta.revision,
@@ -326,7 +328,7 @@ class AbsTask(ABC):
             )
         else:
             # some of monolingual datasets explicitly adding the split name to the dataset name
-            self.dataset = load_dataset(**self.metadata.dataset)  # type: ignore
+            self.dataset = load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True

@@ -362,15 +364,19 @@ class AbsTask(ABC):
         """
         from mteb.abstasks import AbsTaskClassification

-        if self.metadata.descriptive_stat_path.exists() and not overwrite_results:
+        existing_stats = self.metadata.descriptive_stats
+
+        if existing_stats is not None and not overwrite_results:
             logger.info("Loading metadata descriptive statistics from cache.")
-            return self.metadata.descriptive_stats
+            return existing_stats

         if not self.data_loaded:
             self.load_data()

         descriptive_stats: dict[str, DescriptiveStatistics] = {}
-        hf_subset_stat = "hf_subset_descriptive_stats"
+        hf_subset_stat: Literal["hf_subset_descriptive_stats"] = (
+            "hf_subset_descriptive_stats"
+        )
         eval_splits = self.metadata.eval_splits
         if isinstance(self, AbsTaskClassification):
             eval_splits.append(self.train_split)
@@ -381,7 +387,7 @@ class AbsTask(ABC):
             logger.info(f"Processing metadata for split {split}")
             if self.metadata.is_multilingual:
                 descriptive_stats[split] = (
-                    self._calculate_descriptive_statistics_from_split(
+                    self._calculate_descriptive_statistics_from_split(  # type: ignore[assignment]
                         split, compute_overall=True
                     )
                 )
@@ -400,7 +406,7 @@ class AbsTask(ABC):
                 descriptive_stats[split][hf_subset_stat][hf_subset] = split_details
         else:
             split_details = self._calculate_descriptive_statistics_from_split(split)
-            descriptive_stats[split] = split_details
+            descriptive_stats[split] = split_details  # type: ignore[assignment]

         with self.metadata.descriptive_stat_path.open("w") as f:
             json.dump(descriptive_stats, f, indent=4)
@@ -437,7 +443,7 @@ class AbsTask(ABC):

         return self.metadata.languages

-    def filter_eval_splits(self, eval_splits: list[str] | None) -> Self:
+    def filter_eval_splits(self, eval_splits: Sequence[str] | None) -> Self:
         """Filter the evaluation splits of the task.

         Args:
@@ -451,9 +457,9 @@ class AbsTask(ABC):

     def filter_languages(
         self,
-        languages: list[str] | None,
-        script: list[str] | None = None,
-        hf_subsets: list[HFSubset] | None = None,
+        languages: Sequence[str] | None,
+        script: Sequence[str] | None = None,
+        hf_subsets: Sequence[HFSubset] | None = None,
         exclusive_language_filter: bool = False,
     ) -> Self:
         """Filter the languages of the task.
@@ -499,12 +505,14 @@ class AbsTask(ABC):
         self.hf_subsets = subsets_to_keep
         return self

-    def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
+    def _add_main_score(self, scores: ScoresDict) -> None:
         scores["main_score"] = scores[self.metadata.main_score]

     def _upload_dataset_to_hub(
         self, repo_name: str, fields: list[str] | dict[str, str]
     ) -> None:
+        if self.dataset is None:
+            raise ValueError("Dataset not loaded")
         if self.metadata.is_multilingual:
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")
@@ -574,7 +582,7 @@ class AbsTask(ABC):
         return False

     @property
-    def eval_splits(self) -> list[str]:
+    def eval_splits(self) -> Sequence[str]:
         """Returns the evaluation splits of the task."""
         if self._eval_splits:
             return self._eval_splits
@@ -607,9 +615,8 @@ class AbsTask(ABC):
             self.data_loaded = False
             logger.info(f"Unloaded dataset {self.metadata.name} from memory.")
         else:
-            logger.warning(
-                f"Dataset {self.metadata.name} is not loaded, cannot unload it."
-            )
+            msg = f"Dataset `{self.metadata.name}` is not loaded, cannot unload it."
+            logger.warning(msg)

     @property
     def superseded_by(self) -> str | None:
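
check_if_dataset_is_superseded now emits a warnings.warn alongside logger.warning, so the message reaches users even when logging is not configured. A standalone sketch of that dual-channel pattern (names are illustrative, not mteb's API):

    import logging
    import warnings

    logger = logging.getLogger(__name__)

    def check_superseded(name: str, superseded_by: str | None) -> None:
        if superseded_by:
            msg = f"Dataset '{name}' is superseded by '{superseded_by}'."
            logger.warning(msg)   # reaches configured log handlers
            warnings.warn(msg)    # surfaces even without logging setup

    check_superseded("OldTask", "NewTask.v2")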
mteb/abstasks/aggregate_task_metadata.py CHANGED
@@ -5,7 +5,6 @@ from pydantic import ConfigDict, Field, model_validator
 from typing_extensions import Self

 from mteb.types import (
-    HFSubset,
     ISOLanguageScript,
     Languages,
     Licenses,
@@ -60,14 +59,7 @@ class AggregateTaskMetadata(TaskMetadata):
     reference: str | None = None
     bibtex_citation: str | None = None

-    @property
-    def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]:
-        """Return a dictionary mapping huggingface subsets to languages."""
-        if isinstance(self.eval_langs, dict):
-            return self.eval_langs
-        return {"default": self.eval_langs}  # type: ignore
-
-    @model_validator(mode="after")  # type: ignore
+    @model_validator(mode="after")
     def _compute_unfilled_cases(self) -> Self:
         if not self.eval_langs:
             self.eval_langs = self._compute_eval_langs()
mteb/abstasks/aggregated_task.py CHANGED
@@ -1,14 +1,15 @@
 import logging
+import warnings
+from collections.abc import Mapping
 from pathlib import Path
 from typing import Any

 import numpy as np
 from datasets import Dataset, DatasetDict
-from typing_extensions import Self

 from mteb.models.models_protocols import MTEBModels
 from mteb.results.task_result import TaskResult
-from mteb.types import HFSubset, ScoresDict
+from mteb.types import EncodeKwargs, HFSubset, ScoresDict
 from mteb.types.statistics import DescriptiveStatistics

 from .abstask import AbsTask
@@ -32,7 +33,7 @@ class AbsTaskAggregate(AbsTask):

     def task_results_to_scores(
         self, task_results: list[TaskResult]
-    ) -> dict[str, dict[HFSubset, ScoresDict]]:
+    ) -> dict[str, Mapping[HFSubset, ScoresDict]]:
         """The function that aggregated scores. Can be redefined to allow for custom aggregations.

         Args:
@@ -41,7 +42,7 @@ class AbsTaskAggregate(AbsTask):
         Returns:
             A dictionary with the aggregated scores.
         """
-        scores = {}
+        scores: dict[str, Mapping[HFSubset, ScoresDict]] = {}
         subsets = (
             self.metadata.eval_langs.keys()
             if isinstance(self.metadata.eval_langs, dict)
@@ -113,40 +114,20 @@ class AbsTaskAggregate(AbsTask):
         )
         mteb_versions = {tr.mteb_version for tr in task_results}
         if len(mteb_versions) != 1:
-            logger.warning(
-                f"All tasks of {self.metadata.name} is not run using the same version."
-            )
+            msg = f"All tasks of {self.metadata.name} is not run using the same version. different versions found are: {mteb_versions}"
+            logger.warning(msg)
+            warnings.warn(msg)
             task_res.mteb_version = None
         task_res.mteb_version = task_results[0].mteb_version
         return task_res

-    def check_if_dataset_is_superseded(self) -> None:
-        """Check if the dataset is superseded by a newer version"""
-        if self.superseded_by:
-            logger.warning(
-                f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset."
-            )
-
-    def filter_eval_splits(self, eval_splits: list[str] | None) -> Self:
-        """Filter the evaluation splits of the task.
-
-        Args:
-            eval_splits: List of splits to evaluate on. If None, all splits in metadata
-            are used.
-
-        Returns:
-            The task with filtered evaluation splits.
-        """
-        self._eval_splits = eval_splits
-        return self
-
     def evaluate(
         self,
         model: MTEBModels,
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
@@ -160,7 +141,7 @@ class AbsTaskAggregate(AbsTask):
         self,
         model: MTEBModels,
         data_split: DatasetDict | Dataset,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         **kwargs: Any,
     ) -> ScoresDict:
         raise NotImplementedError(
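
Many signatures in this release also widen dict[...] parameters and return types to collections.abc.Mapping[...] (for example _save_task_predictions, calculate_relevant_docs_statistics, and the return type of evaluate). Mapping is read-only and covariant in its value type, so callers can pass more specific dict types without variance errors. A small sketch:

    from collections.abc import Mapping

    def count_qrels(relevant_docs: Mapping[str, Mapping[str, int]]) -> int:
        # The function only reads, so the weaker Mapping contract suffices.
        return sum(len(docs) for docs in relevant_docs.values())

    # A plain dict[str, dict[str, int]] is accepted where Mapping is expected;
    # annotating the parameter as dict would reject other mapping types.
    print(count_qrels({"q1": {"d1": 1, "d2": 0}, "q2": {"d3": 1}}))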