PyPI - mteb - Versions diffs - 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl - Mend

mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (486) hide show

mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json ADDED Viewed

@@ -0,0 +1,35 @@
+{
+    "standard": {
+        "num_samples": 23904,
+        "number_of_characters": 20825122,
+        "documents_text_statistics": {
+            "total_text_length": 20797224,
+            "min_text_length": 74,
+            "average_text_length": 872.4033726246906,
+            "max_text_length": 19104,
+            "unique_texts": 23839
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 27898,
+            "min_text_length": 13,
+            "average_text_length": 429.2,
+            "max_text_length": 1255,
+            "unique_texts": 65
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 126,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.9384615384615385,
+            "max_relevant_docs_per_query": 6,
+            "unique_relevant_docs": 95
+        },
+        "top_ranked_statistics": {
+            "num_top_ranked": 1549535,
+            "min_top_ranked_per_query": 23839,
+            "average_top_ranked_per_query": 23839.0,
+            "max_top_ranked_per_query": 23839
+        }
+    }
+}

mteb/evaluate.py CHANGED Viewed

@@ -2,7 +2,6 @@ from __future__ import annotations
 import logging
 import warnings
-from collections.abc import Iterable
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, cast
@@ -17,22 +16,25 @@ from mteb.abstasks.aggregated_task import AbsTaskAggregate
 from mteb.benchmarks.benchmark import Benchmark
 from mteb.cache import ResultCache
 from mteb.models.model_meta import ModelMeta
-from mteb.models.models_protocols import (
-    MTEBModels,
-)
 from mteb.models.sentence_transformer_wrapper import (
     CrossEncoderWrapper,
     SentenceTransformerEncoderWrapper,
 )
 from mteb.results import ModelResult, TaskResult
 from mteb.results.task_result import TaskError
-from mteb.types import HFSubset, PromptType, SplitName
-from mteb.types._encoder_io import EncodeKwargs
-from mteb.types._metadata import ModelName, Revision
+from mteb.types import PromptType
 if TYPE_CHECKING:
+    from collections.abc import Iterable
     from sentence_transformers import CrossEncoder, SentenceTransformer
+    from mteb.models.models_protocols import (
+        MTEBModels,
+    )
+    from mteb.types import EncodeKwargs, HFSubset, SplitName
+    from mteb.types._metadata import ModelName, Revision
 logger = logging.getLogger(__name__)
@@ -69,13 +71,13 @@ def _sanitize_model(
         meta = getattr(model, "mteb_model_meta")
         if not isinstance(meta, ModelMeta):
             meta = ModelMeta._from_hub(None)
-        wrapped_model = cast(MTEBModels | ModelMeta, model)
+        wrapped_model = cast("MTEBModels | ModelMeta", model)
     else:
         meta = ModelMeta._from_hub(None) if not isinstance(model, ModelMeta) else model
         wrapped_model = meta
-    model_name = cast(str, meta.name)
-    model_revision = cast(str, meta.revision)
+    model_name = cast("str", meta.name)
+    model_revision = cast("str", meta.revision)
     return wrapped_model, meta, model_name, model_revision
@@ -123,6 +125,7 @@ def _evaluate_task(
                 co2_tracker=False,
                 prediction_folder=prediction_folder,
                 public_only=public_only,
+                num_proc=num_proc,
             )
         if isinstance(result, TaskResult):
             result.kg_co2_emissions = tracker.final_emissions
@@ -132,10 +135,10 @@ def _evaluate_task(
     task.check_if_dataset_is_superseded()
-    data_loaded = task.data_loaded
-    if not data_loaded:
+    data_preloaded = task.data_loaded
+    if not data_preloaded:
         try:
-            task.load_data()
+            task.load_data(num_proc=num_proc)
         except DatasetNotFoundError as e:
             if not task.metadata.is_public and public_only is None:
                 msg = (
@@ -161,6 +164,7 @@ def _evaluate_task(
             subsets_to_run=hf_subsets,
             encode_kwargs=encode_kwargs,
             prediction_folder=prediction_folder,
+            num_proc=num_proc,
         )
         tock = time()
@@ -176,7 +180,7 @@ def _evaluate_task(
         kg_co2_emissions=None,
     )
-    if data_loaded:  # only unload if we loaded the data
+    if not data_preloaded:  # only unload if we loaded the data
         task.unload_data()
     return result
@@ -202,10 +206,10 @@ def _check_model_modalities(
     if isinstance(tasks, AbsTask):
         check_tasks = [tasks]
     elif isinstance(tasks, Benchmark):
-        benchmark = cast(Benchmark, tasks)
+        benchmark = cast("Benchmark", tasks)
         check_tasks = benchmark.tasks
     else:
-        check_tasks = cast(Iterable[AbsTask], tasks)
+        check_tasks = cast("Iterable[AbsTask]", tasks)
     warnings, errors = [], []
@@ -278,6 +282,7 @@ def evaluate(
     prediction_folder: Path | str | None = None,
     show_progress_bar: bool = True,
     public_only: bool | None = None,
+    num_proc: int = 1,
 ) -> ModelResult:
     """This function runs a model on a given task and returns the results.
@@ -286,7 +291,7 @@ def evaluate(
         tasks: A task to run.
         co2_tracker: If True, track the CO₂ emissions of the evaluation, required codecarbon to be installed, which can be installed using
             `pip install mteb[codecarbon]`. If none is passed co2 tracking will only be run if codecarbon is installed.
-        encode_kwargs: Additional keyword arguments passed to the models `encode` method.
+        encode_kwargs: Additional keyword arguments passed to the models `encode` and `load_data` methods;
         raise_error: If True, raise an error if the task fails. If False, return an empty list.
         cache: The cache to use for loading the results. If None, then no cache will be used. The default cache saved the cache in the
             `~/.cache/mteb` directory. It can be overridden by setting the `MTEB_CACHE` environment variable to a different directory or by directly
@@ -298,10 +303,11 @@ def evaluate(
                 changed.
             - "only-cache": Only load the results from the cache folder and do not run the task. Useful if you just want to load the results from the
                 cache.
-        prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be sabed in `prediction_folder/{task_name}_predictions.json`
+        prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be saved in `prediction_folder/{task_name}_predictions.json`
         show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
             `encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
         public_only: Run only public tasks. If None, it will attempt to run the private task.
+        num_proc: Number of processes to use during data loading and transformation. Defaults to 1.
     Returns:
         The results of the evaluation.
@@ -342,7 +348,7 @@ def evaluate(
     # AbsTaskAggregate is a special case where we have to run multiple tasks and combine the results
     if isinstance(tasks, AbsTaskAggregate):
-        aggregated_task = cast(AbsTaskAggregate, tasks)
+        aggregated_task = cast("AbsTaskAggregate", tasks)
         results = evaluate(
             model,
             aggregated_task.metadata.tasks,
@@ -354,8 +360,12 @@ def evaluate(
             prediction_folder=prediction_folder,
             show_progress_bar=show_progress_bar,
             public_only=public_only,
+            num_proc=num_proc,
         )
         combined_results = aggregated_task.combine_task_results(results.task_results)
+        if cache:
+            cache.save_to_cache(combined_results, meta)
         return ModelResult(
             model_name=results.model_name,
             model_revision=results.model_revision,
@@ -365,7 +375,7 @@ def evaluate(
     if isinstance(tasks, AbsTask):
         task = tasks
     else:
-        tasks = cast(Iterable[AbsTask], tasks)
+        tasks = cast("Iterable[AbsTask]", tasks)
         evaluate_results = []
         exceptions = []
         tasks_tqdm = tqdm(
@@ -386,6 +396,7 @@ def evaluate(
                 prediction_folder=prediction_folder,
                 show_progress_bar=False,
                 public_only=public_only,
+                num_proc=num_proc,
             )
             evaluate_results.extend(_res.task_results)
             if _res.exceptions:
@@ -465,6 +476,7 @@ def evaluate(
                 encode_kwargs=encode_kwargs,
                 prediction_folder=prediction_folder,
                 public_only=public_only,
+                num_proc=num_proc,
             )
         except Exception as e:
             logger.error(
@@ -480,6 +492,7 @@ def evaluate(
             encode_kwargs=encode_kwargs,
             prediction_folder=prediction_folder,
             public_only=public_only,
+            num_proc=num_proc,
         )
     logger.info(f"✓ Finished evaluation for {task.metadata.name}")

mteb/filter_tasks.py CHANGED Viewed

@@ -1,19 +1,24 @@
 """This script contains functions that are used to get an overview of the MTEB benchmark."""
+from __future__ import annotations
 import logging
-from collections.abc import Iterable, Sequence
-from typing import overload
+from typing import TYPE_CHECKING, overload
-from mteb.abstasks import (
-    AbsTask,
-)
 from mteb.abstasks.aggregated_task import AbsTaskAggregate
-from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
 from mteb.languages import (
     ISO_TO_LANGUAGE,
     ISO_TO_SCRIPT,
 )
-from mteb.types import Modalities
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+    from mteb.abstasks import (
+        AbsTask,
+    )
+    from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
+    from mteb.types import Modalities
 logger = logging.getLogger(__name__)

mteb/get_tasks.py CHANGED Viewed

@@ -1,20 +1,25 @@
 """This script contains functions that are used to get an overview of the MTEB benchmark."""
+from __future__ import annotations
 import difflib
 import logging
 import warnings
 from collections import Counter, defaultdict
-from collections.abc import Iterable, Sequence
-from typing import Any
+from typing import TYPE_CHECKING, Any
 import pandas as pd
 from mteb.abstasks import (
     AbsTask,
 )
-from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
 from mteb.filter_tasks import filter_tasks
-from mteb.types import Modalities
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+    from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
+    from mteb.types import Modalities
 logger = logging.getLogger(__name__)

mteb/languages/language_scripts.py CHANGED Viewed

@@ -1,10 +1,15 @@
-from collections.abc import Iterable, Sequence
-from dataclasses import dataclass
+from __future__ import annotations
-from typing_extensions import Self
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
 from mteb.languages.check_language_code import check_language_code
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+    from typing_extensions import Self
 @dataclass
 class LanguageScripts:

mteb/leaderboard/app.py CHANGED Viewed

@@ -1,3 +1,5 @@
+from __future__ import annotations
 import itertools
 import json
 import logging
@@ -5,15 +7,14 @@ import tempfile
 import time
 import warnings
 from pathlib import Path
-from typing import Literal, get_args
+from typing import TYPE_CHECKING, Literal, get_args
 from urllib.parse import urlencode
 import cachetools
 import gradio as gr
-import pandas as pd
+import pandas as pd  # noqa: TC002 # gradio tries to validate typehints
 import mteb
-from mteb import BenchmarkResults
 from mteb.benchmarks.benchmark import RtebBenchmark
 from mteb.cache import ResultCache
 from mteb.leaderboard.benchmark_selector import (
@@ -31,6 +32,9 @@ from mteb.leaderboard.table import (
 from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
 from mteb.models.model_meta import MODEL_TYPES
+if TYPE_CHECKING:
+    from mteb import BenchmarkResults
 logger = logging.getLogger(__name__)
@@ -546,7 +550,10 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
     logger.info("Step 7/7: Building Gradio interface and callbacks...")
     interface_start = time.time()
-    with gr.Blocks(fill_width=True) as demo:
+    with gr.Blocks(
+        title="MTEB Leaderboard",
+        fill_width=True,
+    ) as demo:
         with gr.Sidebar(
             position="left",
             label="Benchmark Selection and Customization",

mteb/leaderboard/table.py CHANGED Viewed

@@ -1,3 +1,7 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
@@ -5,8 +9,9 @@ import pandas as pd
 from matplotlib.colors import LinearSegmentedColormap
 from pandas.api.types import is_numeric_dtype
-from mteb.benchmarks.benchmark import Benchmark
-from mteb.results.benchmark_results import BenchmarkResults
+if TYPE_CHECKING:
+    from mteb.benchmarks.benchmark import Benchmark
+    from mteb.results.benchmark_results import BenchmarkResults
 def _borda_count(scores: pd.Series) -> pd.Series:

mteb/load_results.py CHANGED Viewed

@@ -1,13 +1,19 @@
+from __future__ import annotations
 import json
 import logging
 import sys
-from collections.abc import Iterable, Sequence
-from pathlib import Path
+from typing import TYPE_CHECKING
 from mteb.abstasks.abstask import AbsTask
 from mteb.models.model_meta import ModelMeta
 from mteb.results import BenchmarkResults, ModelResult, TaskResult
-from mteb.types import ModelName, Revision
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+    from pathlib import Path
+    from mteb.types import ModelName, Revision
 if sys.version_info >= (3, 13):
     from warnings import deprecated

mteb/models/abs_encoder.py CHANGED Viewed

@@ -1,14 +1,12 @@
+from __future__ import annotations
 import logging
 import warnings
 from abc import ABC, abstractmethod
-from collections.abc import Callable, Sequence
-from typing import Any, Literal, cast, get_args, overload
-from torch.utils.data import DataLoader
-from typing_extensions import Unpack
+from typing import TYPE_CHECKING, Any, Literal, cast, get_args, overload
 import mteb
-from mteb.abstasks.task_metadata import TaskMetadata, TaskType
+from mteb.abstasks.task_metadata import TaskType
 from mteb.similarity_functions import (
     cos_sim,
     dot_score,
@@ -18,13 +16,25 @@ from mteb.similarity_functions import (
     pairwise_max_sim,
 )
 from mteb.types import (
-    Array,
-    BatchedInput,
-    EncodeKwargs,
     PromptType,
 )
-from .model_meta import ModelMeta, ScoringFunction
+from .model_meta import ScoringFunction
+if TYPE_CHECKING:
+    from collections.abc import Callable, Sequence
+    from torch.utils.data import DataLoader
+    from typing_extensions import Unpack
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import (
+        Array,
+        BatchedInput,
+        EncodeKwargs,
+    )
+    from .model_meta import ModelMeta
 logger = logging.getLogger(__name__)
@@ -314,7 +324,7 @@ class AbsEncoder(ABC):
             ):
                 arr = self.model.similarity(embeddings1, embeddings2)
                 # We assume that the model returns an Array-like object:
-                arr = cast(Array, arr)
+                arr = cast("Array", arr)
                 return arr
             return cos_sim(embeddings1, embeddings2)
         if self.mteb_model_meta.similarity_fn_name is ScoringFunction.COSINE:
@@ -352,7 +362,7 @@ class AbsEncoder(ABC):
             ):
                 arr = self.model.similarity_pairwise(embeddings1, embeddings2)
                 # We assume that the model returns an Array-like object:
-                arr = cast(Array, arr)
+                arr = cast("Array", arr)
                 return arr
             return pairwise_cos_sim(embeddings1, embeddings2)
         if self.mteb_model_meta.similarity_fn_name is ScoringFunction.COSINE:

mteb/models/cache_wrappers/cache_backend_protocol.py CHANGED Viewed

@@ -1,9 +1,11 @@
 from __future__ import annotations
-from pathlib import Path
-from typing import Any, Protocol, runtime_checkable
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
-import numpy as np
+if TYPE_CHECKING:
+    from pathlib import Path
+    import numpy as np
 @runtime_checkable

mteb/models/cache_wrappers/cache_backends/_hash_utils.py CHANGED Viewed

@@ -1,6 +1,12 @@
+from __future__ import annotations
 import hashlib
-from collections.abc import Mapping
-from typing import Any
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+    from PIL import Image
 def _hash_item(item: Mapping[str, Any]) -> str:
@@ -10,8 +16,6 @@ def _hash_item(item: Mapping[str, Any]) -> str:
         item_hash = hashlib.sha256(item_text.encode()).hexdigest()
     if "image" in item:
-        from PIL import Image
         image: Image.Image = item["image"]
         item_hash += hashlib.sha256(image.tobytes()).hexdigest()

mteb/models/cache_wrappers/cache_backends/faiss_cache.py CHANGED Viewed

@@ -1,16 +1,22 @@
+from __future__ import annotations
 import json
 import logging
 import warnings
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 import numpy as np
 from mteb._requires_package import requires_package
-from mteb.types import BatchedInput
 from ._hash_utils import _hash_item
+if TYPE_CHECKING:
+    import faiss
+    from mteb.types import BatchedInput
 logger = logging.getLogger(__name__)
@@ -24,7 +30,6 @@ class FaissCache:
             "FAISS-based vector cache",
             install_instruction="pip install mteb[faiss-cpu]",
         )
-        import faiss
         self.directory = Path(directory)
         self.directory.mkdir(parents=True, exist_ok=True)

mteb/models/cache_wrappers/cache_wrapper.py CHANGED Viewed

@@ -1,21 +1,26 @@
+from __future__ import annotations
 import logging
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 import numpy as np
 import torch
 from datasets import Dataset
-from torch.utils.data import DataLoader
 from mteb._create_dataloaders import create_dataloader
-from mteb.abstasks.task_metadata import TaskMetadata
-from mteb.models.cache_wrappers.cache_backend_protocol import (
-    CacheBackendProtocol,
-)
 from mteb.models.cache_wrappers.cache_backends.numpy_cache import NumpyCache
-from mteb.models.model_meta import ModelMeta
-from mteb.models.models_protocols import EncoderProtocol
-from mteb.types import Array, BatchedInput, PromptType
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.models.cache_wrappers.cache_backend_protocol import (
+        CacheBackendProtocol,
+    )
+    from mteb.models.model_meta import ModelMeta
+    from mteb.models.models_protocols import EncoderProtocol
+    from mteb.types import Array, BatchedInput, PromptType
 logger = logging.getLogger(__name__)

mteb/models/get_model_meta.py CHANGED Viewed

@@ -1,15 +1,22 @@
+from __future__ import annotations
 import difflib
 import logging
-from collections.abc import Iterable
-from typing import Any
+from typing import TYPE_CHECKING, Any
-from mteb.abstasks import AbsTask
 from mteb.models import (
     ModelMeta,
-    MTEBModels,
 )
 from mteb.models.model_implementations import MODEL_REGISTRY
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+    from mteb.abstasks import AbsTask
+    from mteb.models import (
+        MTEBModels,
+    )
 logger = logging.getLogger(__name__)
@@ -116,7 +123,10 @@ def get_model(
 def get_model_meta(
-    model_name: str, revision: str | None = None, fetch_from_hf: bool = True
+    model_name: str,
+    revision: str | None = None,
+    fetch_from_hf: bool = True,
+    fill_missing: bool = False,
 ) -> ModelMeta:
     """A function to fetch a model metadata object by name.
@@ -124,6 +134,7 @@ def get_model_meta(
         model_name: Name of the model to fetch
         revision: Revision of the model to fetch
         fetch_from_hf: Whether to fetch the model from HuggingFace Hub if not found in the registry
+        fill_missing: Computes missing attributes from the metadata including number of parameters and memory usage.
     Returns:
         A model metadata object
@@ -135,10 +146,25 @@ def get_model_meta(
             raise ValueError(
                 f"Model revision {revision} not found for model {model_name}. Expected {model_meta.revision}."
             )
+        if fill_missing and fetch_from_hf:
+            original_meta_dict = model_meta.model_dump()
+            new_meta = ModelMeta.from_hub(model_name)
+            new_meta_dict = new_meta.model_dump(exclude_none=True)
+            updates = {
+                k: v
+                for k, v in new_meta_dict.items()
+                if original_meta_dict.get(k) is None
+            }
+            if updates:
+                return model_meta.model_copy(update=updates)
         return model_meta
     if fetch_from_hf:
         logger.info(
-            "Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
+            f"Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
         )
         meta = ModelMeta.from_hub(model_name, revision)
         return meta

mteb/models/instruct_wrapper.py CHANGED Viewed

@@ -1,16 +1,24 @@
+from __future__ import annotations
 import logging
-from collections.abc import Callable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 import torch
-from torch.utils.data import DataLoader
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
-from mteb.types import Array, BatchedInput, PromptType
+from mteb.types import PromptType
 from .abs_encoder import AbsEncoder
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from torch.utils.data import DataLoader
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
 logger = logging.getLogger(__name__)

mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl