mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/__init__.py CHANGED
@@ -3,6 +3,7 @@ from importlib.metadata import version
 from mteb import types
 from mteb.abstasks import AbsTask
 from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.cache import ResultCache
 from mteb.deprecated_evaluator import MTEB
 from mteb.evaluate import evaluate
 from mteb.filter_tasks import filter_tasks
@@ -33,6 +34,7 @@ __all__ = [
     "CrossEncoderProtocol",
     "EncoderProtocol",
     "IndexEncoderSearchProtocol",
+    "ResultCache",
     "SearchProtocol",
     "SentenceTransformerEncoderWrapper",
     "TaskMetadata",
mteb/_create_dataloaders.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+import warnings
 from collections.abc import Callable
 from typing import Any, cast
 
@@ -22,7 +23,7 @@ logger = logging.getLogger(__name__)
 def _create_dataloader_from_texts(
     text: list[str],
     batch_size: int = 32,
-    **kwargs: dict[str, Any],
+    **kwargs: Any,
 ) -> DataLoader[TextInput]:
     """Create a dataloader from a list of text.
 
@@ -113,11 +114,8 @@ def _create_text_dataloader_for_queries(
     )
 
 
-_warned_about_user_role = False
-
-
 def _convert_conv_history_to_query(
-    row: dict[str, list[str] | Conversation],
+    row: dict[str, str | list[str] | Conversation],
 ) -> dict[str, str | Conversation]:
     """Convert a conversation history to a single query string.
 
@@ -127,21 +125,18 @@ def _convert_conv_history_to_query(
     Returns:
         The updated row with the "query" and "text" fields set to the conversation string, and the "conversation" field set to the list of ConversationTurn.
     """
-    global _warned_about_user_role
-
     conversation = row["text"]
     # if it's a list of strings, just join them
     if isinstance(conversation, list) and isinstance(conversation[0], str):
-        conversation = cast(list[str], conversation)
-        conv_str = "; ".join(conversation)
+        conversation_ = cast(list[str], conversation)
+        conv_str = "; ".join(conversation_)
         current_conversation = [
-            ConversationTurn(role="user", content=message) for message in conversation
+            ConversationTurn(role="user", content=message) for message in conversation_
        ]
-        if not _warned_about_user_role:
-            logger.warning(
-                "Conversations are a list of strings. Used 'user' role for all turns."
-            )
-            _warned_about_user_role = True
+        warnings.warn(
+            "Conversations are a list of strings. Used 'user' role for all turns.",
+            category=UserWarning,
+        )
     # otherwise, it's a list of dictionaries, which we need to convert to strings
     elif isinstance(conversation, list) and isinstance(conversation[0], dict):
         conv = []
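The module-level `_warned_about_user_role` flag becomes unnecessary with `warnings.warn`: under Python's default warning filter, a `UserWarning` emitted from the same code location is shown only once per session. A self-contained sketch of that behaviour:

```python
import warnings

def convert(items: list[str]) -> str:
    # Mirrors the new code path: warn, then join the turns.
    warnings.warn(
        "Conversations are a list of strings. Used 'user' role for all turns.",
        category=UserWarning,
    )
    return "; ".join(items)

# With the default filter, the warning prints only for the first call,
# so the hand-rolled once-only flag from 2.5.2 is no longer needed.
for _ in range(3):
    convert(["hi", "how are you?"])
```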
@@ -178,7 +173,7 @@ def _convert_conv_history_to_query(
 
     row["text"] = conv_str
     row["conversation"] = current_conversation
-    return row
+    return cast(dict[str, str | list[ConversationTurn]], row)
 
 
 def _create_dataloader_for_queries_conversation(
@@ -196,7 +191,8 @@ def _create_dataloader_for_queries_conversation(
     """
     return DataLoader(
         queries.map(
-            _convert_conv_history_to_query, desc="Converting conversations to queries"
+            _convert_conv_history_to_query,
+            desc="Converting conversations to queries",
         ),
         collate_fn=_custom_collate_fn,
         batch_size=batch_size,
@@ -366,6 +362,9 @@
         task_metadata: Metadata of the task to determine the document type.
         input_column: The column to use as input. If None, it will use the first column that matches the modality.
         batch_size: Batch size for the dataloader.
+
+    Returns:
+        A dataloader for the documents.
     """
     document_type = task_metadata.get_modalities(PromptType.document)
     if document_type == ["text"]:  # text only
@@ -388,7 +387,7 @@
     prompt_type: PromptType | None = None,
     input_column: str | None = None,
     batch_size: int = 32,
-    **kwargs: dict[str, Any],
+    **kwargs: Any,
 ) -> DataLoader[BatchedInput]:
     """Create a dataloader from a dataset.
 
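Both `_create_dataloader_from_texts` and `create_dataloader` change `**kwargs: dict[str, Any]` to `**kwargs: Any`. The annotation on `**kwargs` describes each individual keyword value, not the whole dict, so the old form wrongly required every keyword argument to be a dict. A sketch of the difference:

```python
from typing import Any

def old_style(**kwargs: dict[str, Any]) -> None:
    # Each keyword value must itself be a dict under this annotation,
    # so a strict checker rejects old_style(batch_size=32).
    ...

def new_style(**kwargs: Any) -> None:
    # kwargs is inferred as dict[str, Any]; values of any type pass.
    ...

new_style(batch_size=32, show_progress_bar=True)  # accepted
```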
mteb/_evaluators/any_sts_evaluator.py CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, TypedDict
+from typing import TypedDict
 
 from datasets import Dataset
 from sklearn.metrics.pairwise import (
@@ -12,7 +12,7 @@ from mteb._create_dataloaders import create_dataloader
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import compute_pairwise_similarity
-from mteb.types import PromptType
+from mteb.types import EncodeKwargs, PromptType
 
 from .evaluator import Evaluator
 
@@ -60,7 +60,7 @@ class AnySTSEvaluator(Evaluator):
         self,
         model: EncoderProtocol,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
     ) -> STSEvaluatorScores:
         logger.info("Running semantic similarity - Encoding samples (1/2)")
         embeddings1 = model.encode(
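This release systematically swaps loose `encode_kwargs: dict[str, Any]` annotations for the `EncodeKwargs` type from `mteb.types` (the file list shows `mteb/types/_encoder_io.py` gaining 12 lines, presumably this definition). A hypothetical sketch of the pattern; the field names below are assumptions, not mteb's actual definition:

```python
from typing import TypedDict

class EncodeKwargsSketch(TypedDict, total=False):
    # Hypothetical fields for illustration; see mteb/types/_encoder_io.py
    # for the real EncodeKwargs definition.
    batch_size: int
    show_progress_bar: bool

def run(*, encode_kwargs: EncodeKwargsSketch) -> None:
    batch_size = encode_kwargs.get("batch_size", 32)
    print(batch_size)

run(encode_kwargs={"batch_size": 16})  # keys and value types are now checked
```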
mteb/_evaluators/clustering_evaluator.py CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Any
 
 from datasets import Dataset
 from sklearn import cluster
@@ -7,6 +6,7 @@ from sklearn import cluster
 from mteb._create_dataloaders import create_dataloader
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
+from mteb.types import EncodeKwargs
 
 from .evaluator import Evaluator
 
@@ -38,7 +38,7 @@ class ClusteringEvaluator(Evaluator):
         self,
         model: EncoderProtocol,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
     ) -> list[int]:
         data_loader = create_dataloader(
             self.dataset,
mteb/_evaluators/evaluator.py CHANGED
@@ -1,8 +1,10 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable, Mapping
 from typing import Any
 
 from mteb.abstasks.abstask import _set_seed
 from mteb.models import EncoderProtocol
+from mteb.types import EncodeKwargs
 
 
 class Evaluator(ABC):
@@ -17,8 +19,8 @@ class Evaluator(ABC):
 
     @abstractmethod
     def __call__(
-        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
-    ) -> dict[str, float]:
+        self, model: EncoderProtocol, *, encode_kwargs: EncodeKwargs
+    ) -> Mapping[str, float] | Iterable[Any]:
         """This is called during training to evaluate the model.
 
         It returns scores.
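The abstract `__call__` also widens its return annotation from `dict[str, float]` to `Mapping[str, float] | Iterable[Any]`, which lets subclasses such as `ClusteringEvaluator` (which returns `list[int]`) satisfy the contract without a cast. A stand-in sketch of the signature shape, including what the keyword-only `*` marker enforces:

```python
from collections.abc import Iterable, Mapping
from typing import Any

class Scorer:
    # Mirrors the new Evaluator.__call__ shape: encode_kwargs is
    # keyword-only, and the return type admits mappings or sequences.
    def __call__(
        self, model: Any, *, encode_kwargs: dict
    ) -> Mapping[str, float] | Iterable[Any]:
        return {"accuracy": 1.0}

scorer = Scorer()
scorer("model", encode_kwargs={})  # OK
# scorer("model", {})              # TypeError: encode_kwargs is keyword-only
```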
mteb/_evaluators/image/imagetext_pairclassification_evaluator.py CHANGED
@@ -1,20 +1,22 @@
 from __future__ import annotations
 
 import logging
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn.functional as F
-from datasets import Dataset
 from torch.utils.data import DataLoader
 
 from mteb._create_dataloaders import (
+    _create_dataloader_from_texts,
     _transform_image_to_rgb,
 )
 from mteb._evaluators.evaluator import Evaluator
 from mteb._requires_package import requires_image_dependencies
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.models_protocols import EncoderProtocol
+from mteb.types import EncodeKwargs
 
 if TYPE_CHECKING:
     from PIL.Image import Image
@@ -61,8 +63,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
     def __init__(
         self,
         dataset,
-        images_column_names: str | list[str],
-        texts_column_names: str | list[str],
+        images_column_names: str | Sequence[str],
+        texts_column_names: str | Sequence[str],
         num_images_per_sample: int,
         num_texts_per_sample: int,
         task_metadata: TaskMetadata,
@@ -82,10 +84,11 @@ class ImageTextPairClassificationEvaluator(Evaluator):
         self.hf_split = hf_split
         self.hf_subset = hf_subset
 
-    def __call__(
+    def __call__(  # type: ignore[override]
         self,
         model: EncoderProtocol,
-        encode_kwargs: dict[str, Any],
+        *,
+        encode_kwargs: EncodeKwargs,
     ) -> list[torch.Tensor]:
         images = []
         if isinstance(self.images_column_names, str):
@@ -106,8 +109,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
                 texts.append(row[col])
 
         text_embeddings = model.encode(
-            DataLoader(
-                Dataset.from_dict({"text": texts}),
+            _create_dataloader_from_texts(
+                texts,
                 **encode_kwargs,
             ),
             task_metadata=self.task_metadata,
@@ -128,7 +131,6 @@ class ImageTextPairClassificationEvaluator(Evaluator):
             DataLoader(
                 CustomImageDataset(images),
                 collate_fn=lambda x: {"image": [item["image"] for item in x]},
-                **encode_kwargs,
             ),
             task_metadata=self.task_metadata,
             hf_subset=self.hf_subset,
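Dropping `**encode_kwargs` from the image `DataLoader` call reads like a bug fix: `torch.utils.data.DataLoader` has a fixed constructor signature, so forwarding arbitrary encode options into it fails as soon as a key it does not recognise appears. A sketch (the `show_progress_bar` key is a hypothetical encode option):

```python
from torch.utils.data import DataLoader

data = list(range(4))
DataLoader(data, batch_size=2)  # fine: batch_size is a DataLoader parameter

try:
    # Any non-DataLoader key forwarded from encode_kwargs would blow up:
    DataLoader(data, show_progress_bar=True)
except TypeError as err:
    print(err)  # unexpected keyword argument 'show_progress_bar'
```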
mteb/_evaluators/pair_classification_evaluator.py CHANGED
@@ -14,7 +14,7 @@ from mteb._evaluators.evaluator import Evaluator
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import compute_pairwise_similarity
-from mteb.types import PromptType
+from mteb.types import EncodeKwargs, PromptType
 
 logger = logging.getLogger(__name__)
 
@@ -85,7 +85,7 @@ class PairClassificationEvaluator(Evaluator):
     def __call__(
         self,
         model: EncoderProtocol,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
     ) -> PairClassificationDistances:
         logger.info("Running pair classification - Encoding samples (1/2)")
         embeddings1 = model.encode(
@@ -148,7 +148,9 @@
         hf_subset: str,
         **encode_kwargs: Any,
     ) -> np.ndarray:
-        index_map, all_unique_texts, all_texts_indexes = {}, [], []
+        index_map = {}
+        all_unique_texts: list[str] = []
+        all_texts_indexes = []
         for text in all_texts:
             text_hash = hash(text)
             if text_hash not in index_map:
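Splitting the chained assignment is the standard way to attach a type annotation, since annotations are not allowed on tuple-unpacking targets:

```python
# Invalid: Python rejects annotations inside tuple unpacking, so
# "index_map, all_unique_texts, all_texts_indexes = {}, [], []"
# cannot carry a type for all_unique_texts.

# Valid: one statement per target, each annotatable where it helps.
index_map = {}
all_unique_texts: list[str] = []
all_texts_indexes = []
```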
mteb/_evaluators/retrieval_evaluator.py CHANGED
@@ -1,11 +1,11 @@
 import logging
 from collections.abc import Sequence
-from typing import Any
 
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import SearchProtocol
 from mteb.types import (
     CorpusDatasetType,
+    EncodeKwargs,
     QueryDatasetType,
     RelevantDocumentsType,
     RetrievalEvaluationResult,
@@ -48,7 +48,7 @@ class RetrievalEvaluator(Evaluator):
     def __call__(  # type: ignore[override]
         self,
         search_model: SearchProtocol,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
     ) -> RetrievalOutputType:
         logger.info("Running retrieval task - Indexing corpus...")
         search_model.index(
mteb/_evaluators/retrieval_metrics.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 from collections import defaultdict
+from collections.abc import Mapping
 from typing import Any
 
 import numpy as np
@@ -15,7 +16,7 @@ logger = logging.getLogger(__name__)
 
 def mrr(
     qrels: RelevantDocumentsType,
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     k_values: list[int],
 ) -> dict[str, list[float]]:
     mrr_metrics = defaultdict(list)
@@ -32,7 +33,7 @@
             doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0
         }
         for k in k_values:
-            rr = 0
+            rr = 0.0
             for rank, hit in enumerate(top_hits[query_id][0:k]):
                 if hit[0] in query_relevant_docs:
                     rr = 1.0 / (rank + 1)
@@ -45,8 +46,8 @@ def recall_cap(
     qrels: RelevantDocumentsType,
     results: dict[str, dict[str, float]],
     k_values: list[int],
-) -> dict[str, list[float]]:
-    capped_recall = defaultdict(list)
+) -> dict[str, list[float | None]]:
+    capped_recall: dict[str, list[float | None]] = defaultdict(list)
 
     k_max = max(k_values)
 
@@ -139,7 +140,7 @@ def calculate_pmrr(original_run, new_run, changed_qrels):
     changes = []
     for qid in changed_qrels.keys():
         if qid + "-og" not in original_run or qid + "-changed" not in new_run:
-            logging.warning(f"Query {qid} not found in the runs for calculating p-MRR")
+            logger.warning(f"Query {qid} not found in the runs for calculating p-MRR")
             continue
         original_qid_run = original_run[qid + "-og"]
         new_qid_run = new_run[qid + "-changed"]
@@ -188,7 +189,7 @@ def evaluate_p_mrr_change(
     Returns:
         A dictionary with the scores, including "p-MRR", "og" and "changed" keys.
     """
-    followir_scores = defaultdict(dict)
+    followir_scores: dict[str, float | dict[str, float]] = defaultdict(dict)
 
     qrels_sep = {
         "og": {k: v for k, v in qrels.items() if k.endswith("-og")},
@@ -227,7 +228,7 @@
             ndcg, _map, recall, precision, naucs, avg_mrr, naucs_mrr, cv_recall, {}
         )
         for key, value in scores_dict.items():
-            followir_scores[name][key] = value
+            followir_scores[name][key] = value  # type: ignore[index]
 
     return followir_scores
 
@@ -254,8 +255,8 @@ def confidence_scores(sim_scores: list[float]) -> dict[str, float]:
     sim_scores_sorted = sorted(sim_scores)[::-1]
 
     cs_max = sim_scores_sorted[0]
-    cs_std = np.std(sim_scores)
-    cs_diff1 = None
+    cs_std = float(np.std(sim_scores))
+    cs_diff1 = 0.0
     if len(sim_scores) > 1:
         cs_diff1 = sim_scores_sorted[0] - sim_scores_sorted[1]
     elif len(sim_scores) == 1:
@@ -410,7 +411,7 @@ def make_score_dict(
     cv_recall: dict[str, float],
     task_scores: dict[str, float],
     previous_results_model_meta: dict[str, Any] | None = None,
-) -> dict[str, float]:
+) -> dict[str, Any]:
     return {
         **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
         **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
@@ -528,7 +529,7 @@ def max_over_subqueries(
 
 
 def calculate_retrieval_scores(
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     qrels: RelevantDocumentsType,
     k_values: list[int],
     skip_first_result: bool = False,
@@ -576,7 +577,7 @@
 
 
 def evaluate_abstention(
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     metric_scores: dict[str, list[float]],
 ) -> dict[str, float]:
     """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997
@@ -591,21 +592,21 @@
     all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())]
     all_conf_scores = [confidence_scores(sim_scores) for sim_scores in all_sim_scores]
     conf_fcts = list(all_conf_scores[0].keys())
-    all_conf_scores = {
+    all_conf_scores_ = {
         fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts
     }
-    metric_scores = {k: np.array(v) for k, v in metric_scores.items()}
+    metric_scores_ = {k: np.array(v) for k, v in metric_scores.items()}
     naucs = {}
 
-    for metric_name, scores in metric_scores.items():
-        for fct, conf_scores in all_conf_scores.items():
+    for metric_name, scores in metric_scores_.items():
+        for fct, conf_scores in all_conf_scores_.items():
             naucs[f"nAUC_{metric_name}_{fct}"] = nauc(conf_scores, scores)
 
     return naucs
 
 
 def calculate_cv_recall(
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     qrels: RelevantDocumentsType,
     k_values: list[int],
     skip_first_result: bool = False,
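The recurring widening of `results` from `dict[str, dict[str, float]]` to `Mapping[str, Mapping[str, float]]` follows standard variance rules: `dict` is invariant in its value type, while `Mapping` is covariant, so the `Mapping` form accepts more caller-side types at no runtime cost. A sketch:

```python
from collections import defaultdict
from collections.abc import Mapping

def as_dict(results: dict[str, dict[str, float]]) -> None: ...
def as_mapping(results: Mapping[str, Mapping[str, float]]) -> None: ...

run: dict[str, defaultdict[str, float]] = {"q1": defaultdict(float, {"d1": 1.0})}

as_mapping(run)  # accepted: Mapping is covariant in its value type
as_dict(run)     # works at runtime, but type checkers reject it: dict is invariant
```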
mteb/_evaluators/sklearn_evaluator.py CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Protocol
+from typing import Any, Protocol, cast
 
 import numpy as np
 from datasets import Dataset
@@ -9,7 +9,7 @@ from typing_extensions import Self
 from mteb._create_dataloaders import create_dataloader
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
-from mteb.types import BatchedInput
+from mteb.types import Array, BatchedInput, EncodeKwargs
 
 from .evaluator import Evaluator
 
@@ -17,11 +17,11 @@ logger = logging.getLogger(__name__)
 
 
 class SklearnModelProtocol(Protocol):
-    def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None: ...  # noqa: N803
-    def predict(self, X: np.ndarray) -> np.ndarray: ...  # noqa: N803
+    def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ...  # noqa: N803
+    def predict(self, X: Array) -> np.ndarray: ...  # noqa: N803
     def get_params(self) -> dict[str, Any]: ...
-    def set_params(self, **kwargs: dict[str, Any]) -> Self: ...
-    def score(self, X: np.ndarray, y: np.ndarray | list[int]) -> float: ...  # noqa: N803
+    def set_params(self, random_state: int, **kwargs: dict[str, Any]) -> Self: ...
+    def score(self, X: Array, y: np.ndarray | list[int]) -> float: ...  # noqa: N803
 
 
 class SklearnEvaluator(Evaluator):
@@ -50,7 +50,7 @@ class SklearnEvaluator(Evaluator):
         self.evaluator_model = evaluator_model
 
     def create_dataloaders(
-        self, encode_kwargs: dict[str, Any]
+        self, encode_kwargs: EncodeKwargs
     ) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
         dataloader_train = create_dataloader(
             self.train_dataset,
@@ -70,9 +70,9 @@
         self,
         model: EncoderProtocol,
         *,
-        encode_kwargs: dict[str, Any],
-        test_cache: np.ndarray | None = None,
-    ) -> tuple[np.ndarray, np.ndarray]:
+        encode_kwargs: EncodeKwargs,
+        test_cache: Array | None = None,
+    ) -> tuple[np.ndarray, Array]:
         """Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set.
 
         Args:
@@ -104,6 +104,7 @@
             hf_subset=self.hf_subset,
             **encode_kwargs,
         )
+        test_cache = cast(Array, test_cache)
 
         logger.info("Running - Fitting classifier...")
         y_train = self.train_dataset[self.label_column_name]
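`SklearnModelProtocol` is structural, so any scikit-learn estimator exposing these five methods can be passed without inheriting from anything; the new explicit `random_state` parameter on `set_params` documents that the evaluator reseeds its classifier. A small runtime sketch:

```python
from sklearn.linear_model import LogisticRegression

# LogisticRegression duck-types the protocol: it provides
# fit/predict/get_params/set_params/score, and its set_params accepts
# random_state, matching the tightened signature.
clf = LogisticRegression(max_iter=1000)
clf.set_params(random_state=42)
print(clf.get_params()["random_state"])  # 42
```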
mteb/_evaluators/text/bitext_mining_evaluator.py CHANGED
@@ -1,7 +1,5 @@
 import logging
-from typing import Any
 
-import numpy as np
 import torch
 from datasets import Dataset
 from tqdm.auto import tqdm
@@ -10,6 +8,7 @@ from mteb._create_dataloaders import _create_dataloader_from_texts
 from mteb._evaluators.evaluator import Evaluator
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
+from mteb.types import Array, EncodeKwargs
 
 logger = logging.getLogger(__name__)
 
@@ -33,7 +32,10 @@ class BitextMiningEvaluator(Evaluator):
         self.task_metadata = task_metadata
 
     def __call__(
-        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
+        self,
+        model: EncoderProtocol,
+        *,
+        encode_kwargs: EncodeKwargs,
     ) -> dict[str, list[dict[str, float]]]:
         pair_elements = {p for pair in self.pairs for p in pair}
         if isinstance(self.sentences, Dataset):
@@ -69,11 +71,11 @@
 
     def _similarity_search(
         self,
-        query_embeddings: np.ndarray,
-        corpus_embeddings: np.ndarray,
+        query_embeddings: Array,
+        corpus_embeddings: Array,
         model: EncoderProtocol,
         query_chunk_size: int = 100,
-        corpus_chunk_size: int = 500000,
+        corpus_chunk_size: int = 500_000,
     ) -> list[dict[str, float]]:
         """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.
 
@@ -104,13 +106,15 @@
         ):
             query_embeddings = query_embeddings.to(corpus_embeddings.device)
 
-        queries_result_list = [[] for _ in range(len(query_embeddings))]
+        queries_result_list: list[list[dict[str, float]]] = [
+            [] for _ in range(len(query_embeddings))
+        ]
 
         for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
            # Iterate over chunks of the corpus
            for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
                # Compute cosine similarities
-                similarity_scores = model.similarity(  # type: ignore
+                similarity_scores = model.similarity(
                    query_embeddings[
                        query_start_idx : query_start_idx + query_chunk_size
                    ],
@@ -120,15 +124,17 @@
                )
 
                # Get top-k scores
-                cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
-                    torch.tensor(similarity_scores),
-                    1,
-                    dim=1,
-                    largest=True,
-                    sorted=False,
+                cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = (
+                    torch.topk(
+                        torch.tensor(similarity_scores),
+                        1,
+                        dim=1,
+                        largest=True,
+                        sorted=False,
+                    )
                )
-                cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
-                cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
+                cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()
+                cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()
 
                for query_itr in range(len(similarity_scores)):
                    for sub_corpus_id, score in zip(
@@ -141,11 +147,14 @@
                            {"corpus_id": corpus_id, "score": score}
                        )
 
+        result_queries_list: list[dict[str, float]] = [
+            {} for _ in range(len(query_embeddings))
+        ]
        # Sort and strip to top_k results
        for idx in range(len(queries_result_list)):
            queries_result_list[idx] = sorted(
                queries_result_list[idx], key=lambda x: x["score"], reverse=True
            )
-            queries_result_list[idx] = queries_result_list[idx][0]
+            result_queries_list[idx] = queries_result_list[idx][0]
 
-        return queries_result_list
+        return result_queries_list
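The `torch.topk` call at the heart of `_similarity_search` keeps only the single best-scoring corpus entry per query row; the `*_tensor` renames simply keep the tensors and their `.tolist()` results under separate names for the type checker. A self-contained sketch of the `k=1` lookup:

```python
import torch

# Two queries scored against three corpus entries, as in _similarity_search.
similarity_scores = torch.tensor([[0.25, 0.75, 0.5],
                                  [0.875, 0.125, 0.5]])

# k=1: best match per query row; sorted=False skips unnecessary ordering.
values_tensor, idx_tensor = torch.topk(
    similarity_scores, 1, dim=1, largest=True, sorted=False
)
print(values_tensor.cpu().tolist())  # [[0.75], [0.875]]
print(idx_tensor.cpu().tolist())     # [[1], [0]]
```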