mteb 2.7.1__py3-none-any.whl → 2.7.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (155)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +16 -9
  3. mteb/_evaluators/any_sts_evaluator.py +10 -5
  4. mteb/_evaluators/clustering_evaluator.py +10 -4
  5. mteb/_evaluators/evaluator.py +9 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +10 -5
  8. mteb/_evaluators/retrieval_evaluator.py +19 -13
  9. mteb/_evaluators/retrieval_metrics.py +9 -3
  10. mteb/_evaluators/sklearn_evaluator.py +14 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
  12. mteb/_evaluators/text/summarization_evaluator.py +8 -4
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +8 -2
  16. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  17. mteb/abstasks/_statistics_calculation.py +6 -4
  18. mteb/abstasks/abstask.py +17 -9
  19. mteb/abstasks/aggregate_task_metadata.py +20 -9
  20. mteb/abstasks/aggregated_task.py +15 -8
  21. mteb/abstasks/classification.py +15 -6
  22. mteb/abstasks/clustering.py +17 -8
  23. mteb/abstasks/clustering_legacy.py +14 -6
  24. mteb/abstasks/image/image_text_pair_classification.py +17 -7
  25. mteb/abstasks/multilabel_classification.py +11 -5
  26. mteb/abstasks/pair_classification.py +19 -9
  27. mteb/abstasks/regression.py +14 -6
  28. mteb/abstasks/retrieval.py +27 -16
  29. mteb/abstasks/retrieval_dataset_loaders.py +11 -8
  30. mteb/abstasks/sts.py +19 -10
  31. mteb/abstasks/task_metadata.py +17 -8
  32. mteb/abstasks/text/bitext_mining.py +14 -7
  33. mteb/abstasks/text/summarization.py +17 -7
  34. mteb/abstasks/zeroshot_classification.py +15 -7
  35. mteb/benchmarks/_create_table.py +13 -3
  36. mteb/benchmarks/benchmark.py +11 -1
  37. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  38. mteb/cache.py +20 -14
  39. mteb/cli/_display_tasks.py +9 -3
  40. mteb/cli/build_cli.py +5 -2
  41. mteb/cli/generate_model_card.py +9 -2
  42. mteb/deprecated_evaluator.py +16 -12
  43. mteb/evaluate.py +20 -18
  44. mteb/filter_tasks.py +12 -7
  45. mteb/get_tasks.py +9 -4
  46. mteb/languages/language_scripts.py +8 -3
  47. mteb/leaderboard/app.py +7 -3
  48. mteb/leaderboard/table.py +7 -2
  49. mteb/load_results.py +9 -3
  50. mteb/models/abs_encoder.py +22 -12
  51. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  52. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  53. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  54. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  55. mteb/models/get_model_meta.py +11 -4
  56. mteb/models/instruct_wrapper.py +13 -5
  57. mteb/models/model_implementations/align_models.py +9 -4
  58. mteb/models/model_implementations/bedrock_models.py +16 -6
  59. mteb/models/model_implementations/blip2_models.py +9 -4
  60. mteb/models/model_implementations/blip_models.py +9 -4
  61. mteb/models/model_implementations/bm25.py +15 -10
  62. mteb/models/model_implementations/bmretriever_models.py +6 -2
  63. mteb/models/model_implementations/cde_models.py +9 -5
  64. mteb/models/model_implementations/clip_models.py +9 -4
  65. mteb/models/model_implementations/cohere_models.py +10 -4
  66. mteb/models/model_implementations/cohere_v.py +9 -4
  67. mteb/models/model_implementations/colpali_models.py +4 -3
  68. mteb/models/model_implementations/colqwen_models.py +10 -31
  69. mteb/models/model_implementations/colsmol_models.py +1 -1
  70. mteb/models/model_implementations/conan_models.py +10 -4
  71. mteb/models/model_implementations/dino_models.py +9 -4
  72. mteb/models/model_implementations/e5_v.py +9 -4
  73. mteb/models/model_implementations/eagerworks_models.py +10 -4
  74. mteb/models/model_implementations/evaclip_models.py +9 -4
  75. mteb/models/model_implementations/gme_v_models.py +5 -3
  76. mteb/models/model_implementations/google_models.py +10 -4
  77. mteb/models/model_implementations/granite_vision_embedding_models.py +6 -5
  78. mteb/models/model_implementations/hinvec_models.py +5 -1
  79. mteb/models/model_implementations/jasper_models.py +12 -5
  80. mteb/models/model_implementations/jina_clip.py +9 -4
  81. mteb/models/model_implementations/jina_models.py +10 -5
  82. mteb/models/model_implementations/kalm_models.py +18 -12
  83. mteb/models/model_implementations/linq_models.py +6 -1
  84. mteb/models/model_implementations/listconranker.py +9 -4
  85. mteb/models/model_implementations/llm2clip_models.py +9 -4
  86. mteb/models/model_implementations/llm2vec_models.py +12 -6
  87. mteb/models/model_implementations/mcinext_models.py +5 -2
  88. mteb/models/model_implementations/mdbr_models.py +3 -1
  89. mteb/models/model_implementations/{mxbai_models.py → mixedbread_ai_models.py} +91 -0
  90. mteb/models/model_implementations/moco_models.py +9 -4
  91. mteb/models/model_implementations/mod_models.py +1 -1
  92. mteb/models/model_implementations/model2vec_models.py +10 -4
  93. mteb/models/model_implementations/no_instruct_sentence_models.py +12 -5
  94. mteb/models/model_implementations/nomic_models.py +10 -4
  95. mteb/models/model_implementations/nomic_models_vision.py +4 -3
  96. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +7 -3
  97. mteb/models/model_implementations/nvidia_models.py +12 -4
  98. mteb/models/model_implementations/octen_models.py +1 -1
  99. mteb/models/model_implementations/openai_models.py +9 -4
  100. mteb/models/model_implementations/openclip_models.py +9 -4
  101. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -4
  102. mteb/models/model_implementations/ops_moa_models.py +7 -2
  103. mteb/models/model_implementations/pixie_models.py +56 -0
  104. mteb/models/model_implementations/promptriever_models.py +12 -6
  105. mteb/models/model_implementations/pylate_models.py +19 -13
  106. mteb/models/model_implementations/qwen3_models.py +8 -1
  107. mteb/models/model_implementations/random_baseline.py +4 -3
  108. mteb/models/model_implementations/repllama_models.py +13 -6
  109. mteb/models/model_implementations/rerankers_custom.py +10 -4
  110. mteb/models/model_implementations/rerankers_monot5_based.py +10 -4
  111. mteb/models/model_implementations/salesforce_models.py +7 -1
  112. mteb/models/model_implementations/seed_1_6_embedding_models.py +4 -2
  113. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +5 -2
  114. mteb/models/model_implementations/seed_models.py +1 -1
  115. mteb/models/model_implementations/siglip_models.py +9 -4
  116. mteb/models/model_implementations/slm_models.py +7 -4
  117. mteb/models/model_implementations/uae_models.py +9 -4
  118. mteb/models/model_implementations/vdr_models.py +7 -1
  119. mteb/models/model_implementations/vista_models.py +9 -4
  120. mteb/models/model_implementations/vlm2vec_models.py +9 -4
  121. mteb/models/model_implementations/voyage_models.py +10 -4
  122. mteb/models/model_implementations/voyage_v.py +10 -6
  123. mteb/models/model_implementations/yuan_models_en.py +1 -1
  124. mteb/models/model_meta.py +12 -7
  125. mteb/models/models_protocols.py +19 -18
  126. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  127. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  128. mteb/models/search_wrappers.py +19 -12
  129. mteb/models/sentence_transformer_wrapper.py +4 -3
  130. mteb/models/vllm_wrapper.py +8 -6
  131. mteb/results/benchmark_results.py +22 -17
  132. mteb/results/model_result.py +21 -15
  133. mteb/results/task_result.py +41 -10
  134. mteb/similarity_functions.py +8 -2
  135. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  136. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  137. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  138. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  139. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  140. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  141. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  142. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  143. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  144. mteb/tasks/clustering/nob/snl_clustering.py +7 -2
  145. mteb/tasks/clustering/nob/vg_clustering.py +7 -2
  146. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  147. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
  148. mteb/types/_encoder_io.py +1 -1
  149. mteb/types/statistics.py +9 -2
  150. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/METADATA +1 -1
  151. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/RECORD +155 -154
  152. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/WHEEL +0 -0
  153. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/entry_points.txt +0 -0
  154. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/licenses/LICENSE +0 -0
  155. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/top_level.txt +0 -0
mteb/benchmarks/benchmarks/rteb_benchmarks.py CHANGED
@@ -10,6 +10,8 @@ RTEB_CITATION = r"""@article{rteb2025,
  year = {2025},
  }"""

+ removal_note = "\n\nNote: We have temporarily removed the 'Private' column to read more about this decision out the [announcement](https://github.com/embeddings-benchmark/mteb/issues/3934)."
+
  RTEB_MAIN = RtebBenchmark(
  name="RTEB(beta)",
  display_name="RTEB Multilingual",
@@ -48,7 +50,8 @@ RTEB_MAIN = RtebBenchmark(
  "JapaneseLegal1Retrieval",
  ],
  ),
- description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -83,7 +86,8 @@ RTEB_ENGLISH = RtebBenchmark(
  ],
  languages=["eng"],
  ),
- description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -101,7 +105,8 @@ RTEB_FRENCH = RtebBenchmark(
  ],
  languages=["fra"],
  ),
- description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -119,7 +124,8 @@ RTEB_GERMAN = RtebBenchmark(
  "GermanLegal1Retrieval",
  ],
  ),
- description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -135,7 +141,8 @@ RTEB_JAPANESE = RtebBenchmark(
  "JapaneseLegal1Retrieval",
  ],
  ),
- description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -156,7 +163,8 @@ RTEB_FINANCE = RtebBenchmark(
  "EnglishFinance4Retrieval",
  ],
  ),
- description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -177,7 +185,8 @@ RTEB_LEGAL = RtebBenchmark(
  "JapaneseLegal1Retrieval",
  ],
  ),
- description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -199,7 +208,8 @@ RTEB_CODE = RtebBenchmark(
  "JapaneseCode1Retrieval",
  ],
  ),
- description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -217,7 +227,8 @@ RTEB_HEALTHCARE = RtebBenchmark(
  "GermanHealthcare1Retrieval",
  ],
  ),
- description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
mteb/cache.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  import gzip
  import io
  import json
@@ -7,18 +9,22 @@ import shutil
  import subprocess
  import warnings
  from collections import defaultdict
- from collections.abc import Iterable, Sequence
  from pathlib import Path
- from typing import cast
+ from typing import TYPE_CHECKING, cast

  import requests
+ from pydantic import ValidationError

  import mteb
  from mteb.abstasks import AbsTask
  from mteb.benchmarks.benchmark import Benchmark
  from mteb.models import ModelMeta
  from mteb.results import BenchmarkResults, ModelResult, TaskResult
- from mteb.types import ModelName, Revision
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from mteb.types import ModelName, Revision

  logger = logging.getLogger(__name__)

@@ -27,8 +33,8 @@ class ResultCache:
  """Class to handle the local cache of MTEB results.

  Examples:
- >>> from mteb.cache import ResultCache
- >>> cache = ResultCache(cache_path="~/.cache/mteb") # default
+ >>> import mteb
+ >>> cache = mteb.ResultCache(cache_path="~/.cache/mteb") # default
  >>> cache.download_from_remote() # download the latest results from the remote repository
  >>> result = cache.load_results("task_name", "model_name")
  """
@@ -320,8 +326,8 @@ class ResultCache:
  OSError: On other file system errors

  Examples:
- >>> from mteb.cache import ResultCache
- >>> cache = ResultCache()
+ >>> import mteb
+ >>> cache = mteb.ResultCache()
  >>> # Download optimized cached results
  >>> cache_file = cache._download_cached_results_from_branch()
  >>> # Use custom output path
@@ -460,8 +466,8 @@ class ResultCache:
  A list of paths in the cache directory.

  Examples:
- >>> from mteb.cache import ResultCache
- >>> cache = ResultCache()
+ >>> import mteb
+ >>> cache = mteb.ResultCache()
  >>>
  >>> # Get all cache paths
  >>> paths = cache.get_cache_paths()
@@ -583,7 +589,7 @@ class ResultCache:

  first_model = next(iter(models))
  if isinstance(first_model, ModelMeta):
- models = cast(Iterable[ModelMeta], models)
+ models = cast("Iterable[ModelMeta]", models)
  name_and_revision = {
  (m.model_name_as_path(), m.revision or "no_revision_available")
  for m in models
@@ -594,7 +600,7 @@ class ResultCache:
  if (p.parent.parent.name, p.parent.name) in name_and_revision
  ]

- str_models = cast(Sequence[str], models)
+ str_models = cast("Sequence[str]", models)
  model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
  return [p for p in paths if p.parent.parent.name in model_names]

@@ -642,8 +648,8 @@ class ResultCache:
  A BenchmarkResults object containing the results for the specified models and tasks.

  Examples:
- >>> from mteb.cache import ResultCache
- >>> cache = ResultCache()
+ >>> import mteb
+ >>> cache = mteb.ResultCache()
  >>>
  >>> # Load results for specific models and tasks
  >>> results = cache.load_results(
@@ -686,7 +692,7 @@ class ResultCache:
  task_result = task_result.validate_and_filter_scores(
  task=task_instance
  )
- except Exception as e:
+ except ValidationError as e:
  logger.info(
  f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
  )
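The cache.py hunks above are representative of most changes in this release: typing-only imports move behind `TYPE_CHECKING` (with `from __future__ import annotations` added where needed), and `cast()` targets are quoted so the deferred names never have to exist at runtime. A minimal sketch of the pattern, using illustrative module and type names rather than mteb's own:

```python
from __future__ import annotations

from typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
    # Only evaluated by static type checkers, never at runtime, so these
    # imports add no import-time cost and cannot create import cycles.
    from collections.abc import Iterable

    from mypackage.models import ModelMeta  # hypothetical module, for illustration


def first_model_name(models: object) -> str:
    # The cast target is a string, so "Iterable[ModelMeta]" does not need to
    # be importable when this line executes.
    typed = cast("Iterable[ModelMeta]", models)
    return next(iter(typed)).name
```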
mteb/cli/_display_tasks.py CHANGED
@@ -1,9 +1,15 @@
- from collections.abc import Iterable, Sequence
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING

- from mteb.abstasks import AbsTask
- from mteb.benchmarks import Benchmark
  from mteb.get_tasks import MTEBTasks

+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from mteb.abstasks import AbsTask
+ from mteb.benchmarks import Benchmark
+

  def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
  """Get all benchmarks available in the MTEB."""
mteb/cli/build_cli.py CHANGED
@@ -3,17 +3,20 @@ import logging
  import os
  import warnings
  from pathlib import Path
+ from typing import TYPE_CHECKING

  import torch
  from rich.logging import RichHandler

  import mteb
- from mteb.abstasks.abstask import AbsTask
  from mteb.cache import ResultCache
  from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
  from mteb.cli.generate_model_card import generate_model_card
  from mteb.evaluate import OverwriteStrategy
- from mteb.types._encoder_io import EncodeKwargs
+
+ if TYPE_CHECKING:
+ from mteb.abstasks.abstask import AbsTask
+ from mteb.types import EncodeKwargs

  logger = logging.getLogger(__name__)

mteb/cli/generate_model_card.py CHANGED
@@ -1,14 +1,21 @@
+ from __future__ import annotations
+
  import logging
  import warnings
- from collections.abc import Sequence
  from pathlib import Path
+ from typing import TYPE_CHECKING

  from huggingface_hub import ModelCard, ModelCardData, repo_exists

  from mteb.abstasks.abstask import AbsTask
- from mteb.benchmarks.benchmark import Benchmark
  from mteb.cache import ResultCache

+ if TYPE_CHECKING:
+ from collections.abc import Sequence
+
+ from mteb.abstasks.abstask import AbsTask
+ from mteb.benchmarks.benchmark import Benchmark
+
  logger = logging.getLogger(__name__)


mteb/deprecated_evaluator.py CHANGED
@@ -6,7 +6,6 @@ import os
  import sys
  import traceback
  import warnings
- from collections.abc import Iterable, Sequence
  from copy import deepcopy
  from datetime import datetime
  from itertools import chain
@@ -18,26 +17,31 @@ import datasets

  import mteb
  from mteb.abstasks import AbsTask
- from mteb.abstasks.aggregated_task import AbsTaskAggregate
- from mteb.abstasks.task_metadata import TaskCategory, TaskType
  from mteb.benchmarks import Benchmark
  from mteb.models import (
  CrossEncoderWrapper,
  ModelMeta,
- MTEBModels,
  SentenceTransformerEncoderWrapper,
  )
  from mteb.results import TaskResult
- from mteb.types import EncodeKwargs, ScoresDict
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from sentence_transformers import CrossEncoder, SentenceTransformer
+
+ from mteb.abstasks.aggregated_task import AbsTaskAggregate
+ from mteb.abstasks.task_metadata import TaskCategory, TaskType
+ from mteb.models import (
+ MTEBModels,
+ )
+ from mteb.types import EncodeKwargs, ScoresDict

  if sys.version_info >= (3, 13):
  from warnings import deprecated
  else:
  from typing_extensions import deprecated

- if TYPE_CHECKING:
- from sentence_transformers import CrossEncoder, SentenceTransformer
-
  logger = logging.getLogger(__name__)


@@ -66,9 +70,9 @@ class MTEB:
  """
  if isinstance(next(iter(tasks)), Benchmark):
  self.benchmarks = tasks
- self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+ self.tasks = list(chain.from_iterable(cast("Iterable[Benchmark]", tasks)))
  elif isinstance(next(iter(tasks)), AbsTask):
- self.tasks = list(cast(Iterable[AbsTask], tasks))
+ self.tasks = list(cast("Iterable[AbsTask]", tasks))

  self.err_logs_path = Path(err_logs_path)
  self._last_evaluated_splits: dict[str, list[str]] = {}
@@ -313,7 +317,7 @@ class MTEB:
  elif isinstance(model, CrossEncoder):
  mteb_model = CrossEncoderWrapper(model)
  else:
- mteb_model = cast(MTEBModels, model)
+ mteb_model = cast("MTEBModels", model)

  meta = self.create_model_meta(mteb_model)
  output_path = self._create_output_folder(meta, output_folder)
@@ -346,7 +350,7 @@ class MTEB:
  )

  if task.is_aggregate:
- aggregated_task = cast(AbsTaskAggregate, task)
+ aggregated_task = cast("AbsTaskAggregate", task)
  self_ = MTEB(tasks=aggregated_task.metadata.tasks)
  aggregated_task_results = self_.run(
  mteb_model,
mteb/evaluate.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations

  import logging
  import warnings
- from collections.abc import Iterable
  from pathlib import Path
  from time import time
  from typing import TYPE_CHECKING, cast
@@ -17,22 +16,25 @@ from mteb.abstasks.aggregated_task import AbsTaskAggregate
  from mteb.benchmarks.benchmark import Benchmark
  from mteb.cache import ResultCache
  from mteb.models.model_meta import ModelMeta
- from mteb.models.models_protocols import (
- MTEBModels,
- )
  from mteb.models.sentence_transformer_wrapper import (
  CrossEncoderWrapper,
  SentenceTransformerEncoderWrapper,
  )
  from mteb.results import ModelResult, TaskResult
  from mteb.results.task_result import TaskError
- from mteb.types import HFSubset, PromptType, SplitName
- from mteb.types._encoder_io import EncodeKwargs
- from mteb.types._metadata import ModelName, Revision
+ from mteb.types import PromptType

  if TYPE_CHECKING:
+ from collections.abc import Iterable
+
  from sentence_transformers import CrossEncoder, SentenceTransformer

+ from mteb.models.models_protocols import (
+ MTEBModels,
+ )
+ from mteb.types import EncodeKwargs, HFSubset, SplitName
+ from mteb.types._metadata import ModelName, Revision
+
  logger = logging.getLogger(__name__)


@@ -69,13 +71,13 @@ def _sanitize_model(
  """
  meta = getattr(model, "mteb_model_meta")
  if not isinstance(meta, ModelMeta):
  meta = ModelMeta._from_hub(None)
- wrapped_model = cast(MTEBModels | ModelMeta, model)
+ wrapped_model = cast("MTEBModels | ModelMeta", model)
  else:
  meta = ModelMeta._from_hub(None) if not isinstance(model, ModelMeta) else model
  wrapped_model = meta

- model_name = cast(str, meta.name)
- model_revision = cast(str, meta.revision)
+ model_name = cast("str", meta.name)
+ model_revision = cast("str", meta.revision)

  return wrapped_model, meta, model_name, model_revision

@@ -132,8 +134,8 @@

  task.check_if_dataset_is_superseded()

- data_loaded = task.data_loaded
- if not data_loaded:
+ data_preloaded = task.data_loaded
+ if not data_preloaded:
  try:
  task.load_data()
  except DatasetNotFoundError as e:
@@ -176,7 +178,7 @@
  kg_co2_emissions=None,
  )

- if data_loaded: # only unload if we loaded the data
+ if not data_preloaded: # only unload if we loaded the data
  task.unload_data()

  return result
@@ -202,10 +204,10 @@
  if isinstance(tasks, AbsTask):
  check_tasks = [tasks]
  elif isinstance(tasks, Benchmark):
- benchmark = cast(Benchmark, tasks)
+ benchmark = cast("Benchmark", tasks)
  check_tasks = benchmark.tasks
  else:
- check_tasks = cast(Iterable[AbsTask], tasks)
+ check_tasks = cast("Iterable[AbsTask]", tasks)

  warnings, errors = [], []

@@ -298,7 +300,7 @@
  changed.
  - "only-cache": Only load the results from the cache folder and do not run the task. Useful if you just want to load the results from the
  cache.
- prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be sabed in `prediction_folder/{task_name}_predictions.json`
+ prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be saved in `prediction_folder/{task_name}_predictions.json`
  show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
  `encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
  public_only: Run only public tasks. If None, it will attempt to run the private task.
@@ -342,7 +344,7 @@

  # AbsTaskAggregate is a special case where we have to run multiple tasks and combine the results
  if isinstance(tasks, AbsTaskAggregate):
- aggregated_task = cast(AbsTaskAggregate, tasks)
+ aggregated_task = cast("AbsTaskAggregate", tasks)
  results = evaluate(
  model,
  aggregated_task.metadata.tasks,
@@ -365,7 +367,7 @@
  if isinstance(tasks, AbsTask):
  task = tasks
  else:
- tasks = cast(Iterable[AbsTask], tasks)
+ tasks = cast("Iterable[AbsTask]", tasks)
  evaluate_results = []
  exceptions = []
  tasks_tqdm = tqdm(
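Besides the import shuffle, the evaluate.py diff carries a small behavioural fix: 2.7.1 unloaded the task's dataset when it had already been loaded by the caller, whereas 2.7.3 unloads it only when `_evaluate_task` loaded it itself. A simplified sketch of the corrected guard; the `Task` class here is an illustrative stand-in, not mteb's actual `AbsTask`:

```python
class Task:
    """Illustrative stand-in for an mteb task with lazy dataset loading."""

    def __init__(self) -> None:
        self.data_loaded = False

    def load_data(self) -> None:
        self.data_loaded = True

    def unload_data(self) -> None:
        self.data_loaded = False


def evaluate_task(task: Task) -> None:
    data_preloaded = task.data_loaded  # was the data loaded before we got here?
    if not data_preloaded:
        task.load_data()

    ...  # run the actual evaluation

    if not data_preloaded:  # only unload if we loaded the data ourselves
        task.unload_data()
```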
mteb/filter_tasks.py CHANGED
@@ -1,19 +1,24 @@
  """This script contains functions that are used to get an overview of the MTEB benchmark."""

+ from __future__ import annotations
+
  import logging
- from collections.abc import Iterable, Sequence
- from typing import overload
+ from typing import TYPE_CHECKING, overload

- from mteb.abstasks import (
- AbsTask,
- )
  from mteb.abstasks.aggregated_task import AbsTaskAggregate
- from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
  from mteb.languages import (
  ISO_TO_LANGUAGE,
  ISO_TO_SCRIPT,
  )
- from mteb.types import Modalities
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from mteb.abstasks import (
+ AbsTask,
+ )
+ from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
+ from mteb.types import Modalities

  logger = logging.getLogger(__name__)

mteb/get_tasks.py CHANGED
@@ -1,20 +1,25 @@
  """This script contains functions that are used to get an overview of the MTEB benchmark."""

+ from __future__ import annotations
+
  import difflib
  import logging
  import warnings
  from collections import Counter, defaultdict
- from collections.abc import Iterable, Sequence
- from typing import Any
+ from typing import TYPE_CHECKING, Any

  import pandas as pd

  from mteb.abstasks import (
  AbsTask,
  )
- from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
  from mteb.filter_tasks import filter_tasks
- from mteb.types import Modalities
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
+ from mteb.types import Modalities

  logger = logging.getLogger(__name__)

mteb/languages/language_scripts.py CHANGED
@@ -1,10 +1,15 @@
- from collections.abc import Iterable, Sequence
- from dataclasses import dataclass
+ from __future__ import annotations

- from typing_extensions import Self
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING

  from mteb.languages.check_language_code import check_language_code

+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from typing_extensions import Self
+

  @dataclass
  class LanguageScripts:
mteb/leaderboard/app.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  import itertools
  import json
  import logging
@@ -5,15 +7,14 @@ import tempfile
  import time
  import warnings
  from pathlib import Path
- from typing import Literal, get_args
+ from typing import TYPE_CHECKING, Literal, get_args
  from urllib.parse import urlencode

  import cachetools
  import gradio as gr
- import pandas as pd
+ import pandas as pd # noqa: TC002 # gradio tries to validate typehints

  import mteb
- from mteb import BenchmarkResults
  from mteb.benchmarks.benchmark import RtebBenchmark
  from mteb.cache import ResultCache
  from mteb.leaderboard.benchmark_selector import (
@@ -31,6 +32,9 @@ from mteb.leaderboard.table import (
  from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
  from mteb.models.model_meta import MODEL_TYPES

+ if TYPE_CHECKING:
+ from mteb import BenchmarkResults
+
  logger = logging.getLogger(__name__)


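The `# noqa: TC002` on the pandas import in app.py is deliberate: per the comment in the diff, Gradio tries to validate type hints at runtime, so `pd` must still be importable when the (now string-valued) annotations are evaluated; moving it into the `TYPE_CHECKING` block would raise a `NameError`. A hedged illustration with a toy handler rather than the leaderboard's real callbacks:

```python
from __future__ import annotations

from typing import get_type_hints

import pandas as pd  # noqa: TC002  # must stay a runtime import: hints are resolved later


def update_table(scores: pd.DataFrame) -> pd.DataFrame:
    """Toy callback; Gradio-style frameworks introspect its annotations."""
    return scores.sort_values(scores.columns[0], ascending=False)


# Resolving the string annotations (roughly what such frameworks do) would
# fail with a NameError if `pd` were only imported under TYPE_CHECKING.
print(get_type_hints(update_table))
```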
mteb/leaderboard/table.py CHANGED
@@ -1,3 +1,7 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
  import gradio as gr
  import matplotlib.pyplot as plt
  import numpy as np
@@ -5,8 +9,9 @@ import pandas as pd
  from matplotlib.colors import LinearSegmentedColormap
  from pandas.api.types import is_numeric_dtype

- from mteb.benchmarks.benchmark import Benchmark
- from mteb.results.benchmark_results import BenchmarkResults
+ if TYPE_CHECKING:
+ from mteb.benchmarks.benchmark import Benchmark
+ from mteb.results.benchmark_results import BenchmarkResults


  def _borda_count(scores: pd.Series) -> pd.Series:
mteb/load_results.py CHANGED
@@ -1,13 +1,19 @@
+ from __future__ import annotations
+
  import json
  import logging
  import sys
- from collections.abc import Iterable, Sequence
- from pathlib import Path
+ from typing import TYPE_CHECKING

  from mteb.abstasks.abstask import AbsTask
  from mteb.models.model_meta import ModelMeta
  from mteb.results import BenchmarkResults, ModelResult, TaskResult
- from mteb.types import ModelName, Revision
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+ from pathlib import Path
+
+ from mteb.types import ModelName, Revision

  if sys.version_info >= (3, 13):
  from warnings import deprecated