PyPI - mteb - Versions diffs - 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl - Mend

mteb 2.1.4-py3-none-any.whl → 2.7.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (527)

mteb/leaderboard/benchmark_selector.py CHANGED Viewed

@@ -71,18 +71,26 @@ GP_BENCHMARK_ENTRIES = [
                         "MTEB(cmn, v1)",
                         "MTEB(deu, v1)",
                         "MTEB(fra, v1)",
-                        "MTEB(jpn, v1)",
+                        "JMTEB(v2)",
                         "MTEB(kor, v1)",
                         "MTEB(nld, v1)",
                         "MTEB(pol, v1)",
-                        "MTEB(rus, v1)",
+                        "MTEB(rus, v1.1)",
                         "MTEB(fas, v2)",
                         "VN-MTEB (vie, v1)",
                     ]
                 )
                 + [
                     MenuEntry(
-                        "Other", mteb.get_benchmarks(["MTEB(eng, v1)", "MTEB(fas, v1)"])
+                        "Other",
+                        mteb.get_benchmarks(
+                            [
+                                "MTEB(eng, v1)",
+                                "MTEB(fas, v1)",
+                                "MTEB(rus, v1)",
+                                "MTEB(jpn, v1)",
+                            ]
+                        ),
                     )
                 ],
             ),
@@ -110,10 +118,11 @@ R_BENCHMARK_ENTRIES = [
             MenuEntry(
                 "Image",
                 description=None,
-                open=False,
+                open=True,
                 benchmarks=[
-                    mteb.get_benchmark("VisualDocumentRetrieval"),
+                    mteb.get_benchmark("ViDoRe(v3)"),
                     mteb.get_benchmark("JinaVDR"),
+                    MenuEntry("Other", [mteb.get_benchmark("ViDoRe(v1&v2)")]),
                 ],
             ),
             MenuEntry(

mteb/leaderboard/figures.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import logging
 from typing import get_args
 import numpy as np
@@ -7,6 +8,8 @@ import plotly.graph_objects as go
 from mteb.abstasks.task_metadata import TaskType
+logger = logging.getLogger(__name__)
 def _text_plot(text: str):
     """Returns empty scatter plot with text added, this can be great for error messages."""
@@ -29,16 +32,17 @@ def _failsafe_plot(fun):
         try:
             return fun(*args, **kwargs)
         except Exception as e:
+            logger.error(f"Plot generation failed: {e}")
             return _text_plot(f"Couldn't produce plot. Reason: {e}")
     return wrapper
-def _parse_n_params(text: str) -> int:
-    if text.endswith("M"):
-        return float(text[:-1]) * 1e6
-    if text.endswith("B"):
-        return float(text[:-1]) * 1e9
+def _parse_n_params(params: float | None) -> int | float:
+    """Specified in billions."""
+    if params is None or np.isnan(params):
+        return None
+    return int(params * 1e9)
 def _parse_model_name(name: str) -> str:
@@ -51,20 +55,14 @@ def _parse_model_name(name: str) -> str:
 def _parse_float(value) -> float:
-    try:
-        if value == "Infinite":
-            return np.inf
-        else:
-            return float(value)
-    except ValueError:
+    if value is None or np.isnan(value):
         return np.nan
+    return float(value)
 def _process_max_tokens(x):
-    if pd.isna(x):
+    if pd.isna(x) or x is None or np.isinf(x):
         return "Unknown"
-    if np.isinf(x):
-        return "Infinite"
     return str(int(x))
@@ -112,7 +110,7 @@ def _add_size_guide(fig: go.Figure):
 @_failsafe_plot
 def _performance_size_plot(df: pd.DataFrame) -> go.Figure:
     df = df.copy()
-    df["Number of Parameters"] = df["Number of Parameters"].map(_parse_n_params)
+    df["Number of Parameters"] = df["Number of Parameters (B)"].map(_parse_n_params)
     df["Model"] = df["Model"].map(_parse_model_name)
     df["model_text"] = df["Model"].where(df["Model"].isin(models_to_annotate), "")
     df["Embedding Dimensions"] = df["Embedding Dimensions"].map(_parse_float)

mteb/leaderboard/table.py CHANGED Viewed

@@ -26,16 +26,6 @@ def _format_scores(score: float) -> float:
     return round(score * 100, 2)
-def _get_column_types(df: pd.DataFrame) -> list[str]:
-    types = []
-    for column_name in df.columns:
-        if is_numeric_dtype(df[column_name]):
-            types.append("number")
-        else:
-            types.append("str")
-    return types
 def _get_column_widths(df: pd.DataFrame) -> list[str]:
     # Please do not remove this function when refactoring.
     # Column width calculation seeminlgy changes regularly with Gradio releases,
@@ -120,6 +110,39 @@ def apply_per_task_styling_from_benchmark(
     return _apply_per_task_table_styling(per_task_df)
+def apply_per_language_styling_from_benchmark(
+    benchmark_instance: Benchmark, benchmark_results: BenchmarkResults
+) -> gr.DataFrame:
+    """Apply styling to per-language table created by the benchmark instance's _create_per_language_table method.
+    This supports polymorphism - different benchmark classes can have different table generation logic.
+    Args:
+        benchmark_instance: The benchmark instance
+        benchmark_results: BenchmarkResults object containing model results (may be pre-filtered)
+    Returns:
+        Styled gr.DataFrame ready for display in the leaderboard
+    """
+    # Use the instance method to support polymorphism
+    per_language_df = benchmark_instance._create_per_language_table(benchmark_results)
+    # If it's a no-results DataFrame, return it as-is
+    if "No results" in per_language_df.columns:
+        return gr.DataFrame(per_language_df)
+    # Apply the styling
+    return _apply_per_language_table_styling(per_language_df)
+def _style_number_of_parameters(num_params: float) -> str:
+    """Anything bigger than 1B is shown in billions with 1 decimal (e.g. 1.712 > 1.7) while anything smaller as 0.xxx B (e.g. 0.345 remains 0.345)"""
+    if num_params >= 1:
+        return f"{num_params:.1f}"
+    else:
+        return f"{num_params:.3f}"
 def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
     """Apply styling to a raw summary DataFrame
@@ -130,7 +153,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
         "Rank (Borda)",
         "Rank",
         "Model",
-        "Number of Parameters",
+        "Number of Parameters (B)",
         "Embedding Dimensions",
         "Max Tokens",
         "Memory Usage (MB)",
@@ -156,7 +179,14 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
     joint_table[score_columns] = joint_table[score_columns].map(_format_scores)
     joint_table_style = joint_table.style.format(
-        {**dict.fromkeys(score_columns, "{:.2f}"), "Rank (Borda)": "{:.0f}"},
+        {
+            **dict.fromkeys(score_columns, "{:.2f}"),
+            "Rank (Borda)": "{:.0f}",
+            "Memory Usage (MB)": "{:.0f}",
+            "Embedding Dimensions": "{:.0f}",
+            "Max Tokens": "{:.0f}",
+            "Number of Parameters (B)": lambda x: _style_number_of_parameters(x),
+        },
         na_rep="",
     )
     joint_table_style = joint_table_style.highlight_min(
@@ -186,7 +216,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
                     gmap=gmap_values.loc[mask],
                 )
-    column_types = _get_column_types(joint_table_style.data)
+    column_types = ["auto" for _ in joint_table_style.data.columns]
     # setting model name column to markdown
     if len(column_types) > 1:
         column_types[1] = "markdown"
@@ -204,8 +234,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
         pinned_columns=2,
         column_widths=column_widths,
         wrap=True,
-        show_fullscreen_button=True,
-        show_copy_button=True,
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
@@ -223,11 +252,47 @@ def _apply_per_task_table_styling(per_task: pd.DataFrame) -> gr.DataFrame:
         "{:.2f}", subset=task_score_columns, na_rep=""
     ).highlight_max(subset=task_score_columns, props="font-weight: bold")
+    # setting task name column width to 250px
+    column_widths = _get_column_widths(per_task_style.data)
+    if len(column_widths) > 0:
+        column_widths[0] = "250px"
     return gr.DataFrame(
         per_task_style,
         interactive=False,
         pinned_columns=1,
-        show_fullscreen_button=True,
-        show_copy_button=True,
+        column_widths=column_widths,
+        buttons=["copy", "fullscreen"],
+        show_search="filter",
+    )
+def _apply_per_language_table_styling(per_language: pd.DataFrame) -> gr.DataFrame:
+    """Apply styling to a raw per-task DataFrame
+    Returns:
+        Styled gr.DataFrame ready for display in the leaderboard
+    """
+    language_score_columns = per_language.select_dtypes("number").columns
+    per_language[language_score_columns] *= 100
+    if len(per_language.columns) > 100:  # Avoid gradio error on very wide tables
+        per_language_style = per_language.round(2)
+    else:
+        per_language_style = per_language.style.format(
+            "{:.2f}", subset=language_score_columns, na_rep=""
+        ).highlight_max(subset=language_score_columns, props="font-weight: bold")
+    # setting task name column width to 250px
+    column_widths = _get_column_widths(per_language_style.data)
+    if len(column_widths) > 0:
+        column_widths[0] = "250px"
+    return gr.DataFrame(
+        per_language_style,
+        interactive=False,
+        pinned_columns=1,
+        column_widths=column_widths,
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )

mteb/load_results.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import json
 import logging
 import sys
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from pathlib import Path
 from mteb.abstasks.abstask import AbsTask
@@ -45,8 +45,8 @@ def _model_name_and_revision(
 def load_results(
     results_repo: str = "https://github.com/embeddings-benchmark/results",
     download_latest: bool = True,
-    models: Sequence[ModelMeta] | Sequence[str] | None = None,
-    tasks: Sequence[AbsTask] | Sequence[str] | None = None,
+    models: Iterable[ModelMeta] | Sequence[str] | None = None,
+    tasks: Iterable[AbsTask] | Sequence[str] | None = None,
     validate_and_filter: bool = True,
     require_model_meta: bool = True,
     only_main_score: bool = False,
@@ -83,21 +83,21 @@ def load_results(
     if models is not None:
         models_to_keep = {}
-        for model_path in models:
-            if isinstance(model_path, ModelMeta):
-                models_to_keep[model_path.name] = model_path.revision
+        for model in models:
+            if isinstance(model, ModelMeta):
+                models_to_keep[model.name] = model.revision
             else:
-                models_to_keep[model_path] = None
+                models_to_keep[model] = None
     else:
         models_to_keep = None
-    task_names = {}
+    task_names: dict[str, AbsTask | None] = {}
     if tasks is not None:
-        for task in tasks:
-            if isinstance(task, AbsTask):
-                task_names[task.metadata.name] = task
+        for task_ in tasks:
+            if isinstance(task_, AbsTask):
+                task_names[task_.metadata.name] = task_
             else:
-                task_names[task] = None
+                task_names[task_] = None
     model_results = []
     for model_path in model_paths:

mteb/models/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .cache_wrappers import CachedEmbeddingWrapper
+from .cache_wrappers import CacheBackendProtocol, CachedEmbeddingWrapper
 from .model_meta import ModelMeta
 from .models_protocols import (
     CrossEncoderProtocol,
@@ -6,6 +6,7 @@ from .models_protocols import (
     MTEBModels,
     SearchProtocol,
 )
+from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol
 from .search_wrappers import SearchCrossEncoderWrapper, SearchEncoderWrapper
 from .sentence_transformer_wrapper import (
     CrossEncoderWrapper,
@@ -14,10 +15,12 @@ from .sentence_transformer_wrapper import (
 )
 __all__ = [
+    "CacheBackendProtocol",
     "CachedEmbeddingWrapper",
     "CrossEncoderProtocol",
     "CrossEncoderWrapper",
     "EncoderProtocol",
+    "IndexEncoderSearchProtocol",
     "MTEBModels",
     "ModelMeta",
     "SearchCrossEncoderWrapper",

mteb/models/abs_encoder.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import logging
+import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Sequence
 from typing import Any, Literal, cast, get_args, overload
 from torch.utils.data import DataLoader
+from typing_extensions import Unpack
 import mteb
 from mteb.abstasks.task_metadata import TaskMetadata, TaskType
@@ -18,6 +20,7 @@ from mteb.similarity_functions import (
 from mteb.types import (
     Array,
     BatchedInput,
+    EncodeKwargs,
     PromptType,
 )
@@ -43,7 +46,7 @@ class AbsEncoder(ABC):
     model: Any
     mteb_model_meta: ModelMeta | None = None
     model_prompts: dict[str, str] | None = None
-    instruction_template: str | Callable[[str, PromptType], str] | None = None
+    instruction_template: str | Callable[[str, PromptType | None], str] | None = None
     prompts_dict: dict[str, str] | None = None
     def get_prompt_name(
@@ -54,11 +57,11 @@ class AbsEncoder(ABC):
         """A wrapper function around the model.encode method that handles the prompt_name argument and standardizes the output to a numpy array.
         The order of priorities for prompt selection are:
-            1. Composed prompt of task name + prompt type (query or passage)
+            1. Composed prompt of task name + prompt type
             2. Specific task prompt
-            3. Composed prompt of task type + prompt type (query or passage)
+            3. Composed prompt of task type + prompt type
             4. Specific task type prompt
-            5. Specific prompt type (query or passage)
+            5. Specific prompt type
         Args:
             task_metadata: The task name to use for building the encoding prompt
@@ -105,12 +108,12 @@ class AbsEncoder(ABC):
         Args:
             task_metadata: The metadata of the task.
-            prompt_type: The name type of prompt. (query or passage)
+            prompt_type: The name type of prompt.
         """
         if not self.model_prompts:
             return None
         prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-        return self.model_prompts.get(prompt_name)
+        return self.model_prompts.get(prompt_name) if prompt_name else None
     @staticmethod
     @overload
@@ -187,6 +190,7 @@ class AbsEncoder(ABC):
                 except KeyError:
                     msg = f"Task name {task_name} is not valid. {valid_keys_msg}"
                     logger.warning(msg)
+                    warnings.warn(msg)
                     invalid_task_messages.add(msg)
                     invalid_keys.add(task_key)
@@ -210,13 +214,11 @@ class AbsEncoder(ABC):
             task_metadata: The metadata of the task. Sentence-transformers uses this to
                 determine which prompt to use from a specified dictionary.
                 The order of priorities for prompt selection are:
-                    1. Composed prompt of task name + prompt type (query or passage)
-                    2. Specific task prompt
-                    3. Composed prompt of task type + prompt type (query or passage)
-                    4. Specific task type prompt
-                    5. Specific prompt type (query or passage)
-                    6. Default prompt from the task definition
-            prompt_type: The name type of prompt. (query or passage)
+                    1. Specific task prompt
+                    2. Specific task type prompt
+                    3. Specific prompt type
+                    4. Default prompt from the task definition
+            prompt_type: The name type of prompt.
         Returns:
             The instruction/prompt to be used for encoding sentences.
@@ -224,13 +226,19 @@ class AbsEncoder(ABC):
         prompt = task_metadata.prompt
         if self.prompts_dict and task_metadata.name in self.prompts_dict:
             prompt = self.prompts_dict[task_metadata.name]
+        elif self.prompts_dict and task_metadata.type in self.prompts_dict:
+            prompt = self.prompts_dict[task_metadata.type]
+        elif (
+            self.prompts_dict and prompt_type and prompt_type.value in self.prompts_dict
+        ):
+            prompt = self.prompts_dict[prompt_type.value]
         if isinstance(prompt, dict) and prompt_type:
             if prompt.get(prompt_type.value):
                 return prompt[prompt_type.value]
-            logger.warning(
-                f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
-            )
+            msg = f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
+            logger.warning(msg)
+            warnings.warn(msg)
             return ""
         if prompt:
@@ -246,7 +254,7 @@ class AbsEncoder(ABC):
         Args:
             instruction: The instruction to be formatted.
-            prompt_type: The name type of prompt. (query or passage)
+            prompt_type: The name type of prompt.
         """
         if self.instruction_template is None:
             raise ValueError(
@@ -269,7 +277,7 @@ class AbsEncoder(ABC):
         Args:
             task_metadata: The metadata of the task
-            prompt_type: The name type of prompt. (query or passage)
+            prompt_type: The name type of prompt.
         Returns:
             The instruction to be used for encoding sentences.
@@ -364,7 +372,7 @@ class AbsEncoder(ABC):
         hf_split: str,
         hf_subset: str,
         prompt_type: PromptType | None = None,
-        **kwargs: Any,
+        **kwargs: Unpack[EncodeKwargs],
     ) -> Array:
         """Encodes the given sentences using the encoder.
@@ -373,14 +381,14 @@ class AbsEncoder(ABC):
             task_metadata: The metadata of the task. Sentence-transformers uses this to
                 determine which prompt to use from a specified dictionary.
                 The order of priorities for prompt selection are:
-                    1. Composed prompt of task name + prompt type (query or passage)
+                    1. Composed prompt of task name + prompt type
                     2. Specific task prompt
-                    3. Composed prompt of task type + prompt type (query or passage)
+                    3. Composed prompt of task type + prompt type
                     4. Specific task type prompt
-                    5. Specific prompt type (query or passage)
+                    5. Specific prompt type
             hf_split: Split of current task
             hf_subset: Subset of current task
-            prompt_type: The name type of prompt. (query or passage)
+            prompt_type: The name type of prompt.
             **kwargs: Additional arguments to pass to the encoder.
         Returns:

mteb/models/cache_wrappers/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from .cache_backend_protocol import CacheBackendProtocol
 from .cache_wrapper import CachedEmbeddingWrapper
-__all__ = ["CachedEmbeddingWrapper"]
+__all__ = ["CacheBackendProtocol", "CachedEmbeddingWrapper"]

mteb/models/cache_wrappers/cache_backend_protocol.py CHANGED Viewed

@@ -5,8 +5,6 @@ from typing import Any, Protocol, runtime_checkable
 import numpy as np
-from mteb.types import BatchedInput
 @runtime_checkable
 class CacheBackendProtocol(Protocol):
@@ -26,7 +24,7 @@ class CacheBackendProtocol(Protocol):
             **kwargs: Additional backend-specific arguments.
         """
-    def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None:
+    def add(self, item: list[dict[str, Any]], vectors: np.ndarray) -> None:
         """Add a vector to the cache.
         Args:
@@ -34,7 +32,7 @@ class CacheBackendProtocol(Protocol):
             vectors: Embedding vector of shape (dim,) or (1, dim).
         """
-    def get_vector(self, item: BatchedInput) -> np.ndarray | None:
+    def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
         """Retrieve the cached vector for the given item.
         Args:
@@ -53,5 +51,5 @@ class CacheBackendProtocol(Protocol):
     def close(self) -> None:
         """Release resources or flush data."""
-    def __contains__(self, item: BatchedInput) -> bool:
+    def __contains__(self, item: dict[str, Any]) -> bool:
         """Check whether the cache contains an item."""

mteb/models/cache_wrappers/cache_backends/_hash_utils.py CHANGED Viewed

@@ -1,16 +1,17 @@
 import hashlib
+from collections.abc import Mapping
+from typing import Any
-from PIL import Image
-from mteb.types import BatchedInput
-def _hash_item(item: BatchedInput) -> str:
+def _hash_item(item: Mapping[str, Any]) -> str:
     item_hash = ""
     if "text" in item:
-        item_hash = hashlib.sha256(item["text"].encode()).hexdigest()
+        item_text: str = item["text"]
+        item_hash = hashlib.sha256(item_text.encode()).hexdigest()
     if "image" in item:
+        from PIL import Image
         image: Image.Image = item["image"]
         item_hash += hashlib.sha256(image.tobytes()).hexdigest()

mteb/models/cache_wrappers/cache_backends/faiss_cache.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import json
 import logging
+import warnings
 from pathlib import Path
+from typing import Any
 import numpy as np
@@ -36,7 +38,7 @@ class FaissCache:
         logger.info(f"Initialized FAISS VectorCacheMap in {self.directory}")
         self.load()
-    def add(self, items: list[BatchedInput], vectors: np.ndarray) -> None:
+    def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
         """Add vector to FAISS index."""
         import faiss
@@ -71,7 +73,9 @@ class FaissCache:
         try:
             return self.index.reconstruct(idx)
         except Exception:
-            logger.warning(f"Vector id {idx} missing for hash {item_hash}")
+            msg = f"Vector id {idx} missing for hash {item_hash}"
+            logger.warning(msg)
+            warnings.warn(msg)
             return None
     def save(self) -> None:

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

mteb 2.1.4-py3-none-any.whl → 2.7.2-py3-none-any.whl