mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +63 -14
- mteb/_evaluators/any_sts_evaluator.py +12 -5
- mteb/_evaluators/clustering_evaluator.py +12 -4
- mteb/_evaluators/evaluator.py +11 -5
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +14 -5
- mteb/_evaluators/pair_classification_evaluator.py +13 -5
- mteb/_evaluators/retrieval_evaluator.py +22 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +20 -11
- mteb/_evaluators/text/bitext_mining_evaluator.py +10 -3
- mteb/_evaluators/text/summarization_evaluator.py +10 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +12 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +48 -21
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +25 -9
- mteb/abstasks/clustering.py +23 -10
- mteb/abstasks/clustering_legacy.py +22 -8
- mteb/abstasks/image/image_text_pair_classification.py +23 -9
- mteb/abstasks/multilabel_classification.py +13 -5
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +56 -30
- mteb/abstasks/retrieval_dataset_loaders.py +48 -37
- mteb/abstasks/sts.py +29 -13
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +23 -12
- mteb/abstasks/text/reranking.py +2 -2
- mteb/abstasks/text/summarization.py +19 -8
- mteb/abstasks/zeroshot_classification.py +23 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +10 -5
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +33 -20
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +11 -4
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +32 -6
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +10 -4
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +11 -4
- mteb/models/model_implementations/blip_models.py +17 -4
- mteb/models/model_implementations/bm25.py +24 -14
- mteb/models/model_implementations/bmretriever_models.py +10 -2
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +11 -5
- mteb/models/model_implementations/clip_models.py +12 -4
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +14 -4
- mteb/models/model_implementations/cohere_v.py +14 -4
- mteb/models/model_implementations/colpali_models.py +7 -3
- mteb/models/model_implementations/colqwen_models.py +17 -31
- mteb/models/model_implementations/colsmol_models.py +3 -1
- mteb/models/model_implementations/conan_models.py +11 -4
- mteb/models/model_implementations/dino_models.py +28 -4
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +10 -4
- mteb/models/model_implementations/eagerworks_models.py +11 -4
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +7 -3
- mteb/models/model_implementations/google_models.py +15 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
- mteb/models/model_implementations/gritlm_models.py +3 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +6 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +14 -5
- mteb/models/model_implementations/jina_clip.py +10 -4
- mteb/models/model_implementations/jina_models.py +17 -5
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +7 -1
- mteb/models/model_implementations/listconranker.py +10 -4
- mteb/models/model_implementations/llm2clip_models.py +12 -4
- mteb/models/model_implementations/llm2vec_models.py +20 -6
- mteb/models/model_implementations/mcinext_models.py +8 -2
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +11 -4
- mteb/models/model_implementations/mod_models.py +2 -1
- mteb/models/model_implementations/model2vec_models.py +23 -4
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
- mteb/models/model_implementations/nomic_models.py +17 -4
- mteb/models/model_implementations/nomic_models_vision.py +5 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
- mteb/models/model_implementations/nvidia_models.py +15 -4
- mteb/models/model_implementations/octen_models.py +3 -1
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +17 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
- mteb/models/model_implementations/ops_moa_models.py +9 -2
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +16 -6
- mteb/models/model_implementations/pylate_models.py +32 -13
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +11 -1
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +15 -6
- mteb/models/model_implementations/rerankers_custom.py +13 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +10 -1
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +19 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +10 -4
- mteb/models/model_implementations/vdr_models.py +8 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +11 -4
- mteb/models/model_implementations/voyage_models.py +52 -4
- mteb/models/model_implementations/voyage_v.py +11 -6
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +2 -1
- mteb/models/model_meta.py +47 -9
- mteb/models/models_protocols.py +23 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +31 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +32 -16
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/METADATA +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/RECORD +486 -465
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
mteb/abstasks/text/reranking.py
CHANGED
@@ -34,7 +34,7 @@ class AbsTaskReranking(AbsTaskRetrieval):
     For dataformat and other information, see [AbsTaskRetrieval][mteb.abstasks.retrieval.AbsTaskRetrieval].
     """

-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs) -> None:
         """Load the dataset."""
         if self.data_loaded:
             return
@@ -43,7 +43,7 @@ class AbsTaskReranking(AbsTaskRetrieval):
             self.transform_old_dataset_format()
         else:
             # use AbsTaskRetrieval default to load the data
-            return super().load_data()
+            return super().load_data(num_proc=num_proc)

     def _process_example(self, example: dict, split: str, query_idx: int) -> dict:
         """Process a single example from the dataset.
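Both hunks thread a new `num_proc` argument through `load_data`, so dataset loading can use multiple worker processes. A minimal usage sketch, assuming some reranking task (the task name below is only an illustrative example, not taken from this diff):

    import mteb

    # Hypothetical usage: load a reranking task's data with 4 worker processes.
    task = mteb.get_task("AskUbuntuDupQuestions")
    task.load_data(num_proc=4)  # forwarded on to super().load_data(num_proc=...)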
mteb/abstasks/text/summarization.py
CHANGED

@@ -1,24 +1,34 @@
+from __future__ import annotations
+
 import logging
-from
+from typing import TYPE_CHECKING

 import numpy as np
-from datasets import Dataset

 from mteb._evaluators import SummarizationEvaluator
-from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
 from mteb.abstasks._statistics_calculation import (
     calculate_score_statistics,
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models import EncoderProtocol
-from mteb.types import EncodeKwargs
+from mteb.models import EncoderProtocol
 from mteb.types.statistics import (
-    ScoreStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )

+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from datasets import Dataset
+
+    from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ScoreStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
@@ -84,6 +94,7 @@ class AbsTaskSummarization(AbsTask):
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs,
     ) -> SummarizationMetrics:
         if not isinstance(model, EncoderProtocol):
@@ -105,7 +116,7 @@ class AbsTaskSummarization(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        scores = evaluator(model, encode_kwargs=encode_kwargs)
+        scores = evaluator(model, encode_kwargs=encode_kwargs, num_proc=num_proc)
         if prediction_folder:
             self._save_task_predictions(
                 scores,
mteb/abstasks/zeroshot_classification.py
CHANGED

@@ -1,19 +1,16 @@
+from __future__ import annotations
+
 import logging
-from
-from typing import TypedDict
+from typing import TYPE_CHECKING, TypedDict

 import torch
 from datasets import Dataset
 from sklearn import metrics

 from mteb._evaluators import ZeroShotClassificationEvaluator
-from mteb.models import EncoderProtocol
-from mteb.types import EncodeKwargs
+from mteb.models import EncoderProtocol
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )

 from ._statistics_calculation import (
@@ -23,6 +20,17 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask

+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
@@ -119,6 +127,7 @@ class AbsTaskZeroShotClassification(AbsTask):
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs,
     ) -> ZeroShotClassificationMetrics:
         if not isinstance(model, EncoderProtocol):
@@ -137,7 +146,11 @@ class AbsTaskZeroShotClassification(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        probs = evaluator(
+        probs = evaluator(
+            model,
+            encode_kwargs=encode_kwargs,
+            num_proc=num_proc,
+        )

         if prediction_folder:
             self._save_task_predictions(
@@ -162,13 +175,14 @@ class AbsTaskZeroShotClassification(AbsTask):
             accuracy=metrics.accuracy_score(labels, predictions),
         )

-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         self._upload_dataset_to_hub(
             repo_name,
             [
                 self.input_column_name,
                 self.label_column_name,
             ],
+            num_proc=num_proc,
         )
         labels_dataset = Dataset.from_dict({"labels": self.get_candidate_labels()})
         labels_dataset.push_to_hub(repo_name, config_name="labels")
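The import shuffles in these files all apply the same deferred-typing pattern: with `from __future__ import annotations`, annotations are never evaluated at runtime, so type-only imports can move under `typing.TYPE_CHECKING` and drop out of the runtime import graph. A minimal sketch of the pattern (not mteb code):

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Visible to type checkers only; never imported at runtime,
        # which avoids import cycles and reduces startup cost.
        from datasets import Dataset

    def first_row(ds: Dataset) -> dict:
        # Fine at runtime: the annotation stays an unevaluated string.
        return ds[0]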
mteb/benchmarks/_create_table.py
CHANGED
@@ -1,13 +1,17 @@
+from __future__ import annotations
+
 import re
 from collections import defaultdict
-from typing import Literal
+from typing import TYPE_CHECKING, Literal

 import numpy as np
 import pandas as pd

 import mteb
 from mteb.get_tasks import get_task, get_tasks
-
+
+if TYPE_CHECKING:
+    from mteb.results.benchmark_results import BenchmarkResults


 def _borda_count(scores: pd.Series) -> pd.Series:
@@ -115,7 +119,6 @@ def _create_summary_table_from_benchmark_results(

     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean", overall_mean)
     joint_table.insert(1, "mean_by_task_type", typed_mean)
     joint_table["borda_rank"] = _get_borda_rank(per_task)
@@ -303,6 +306,7 @@ def _create_per_language_table_from_benchmark_results(

 def _create_summary_table_mean_public_private(
     benchmark_results: BenchmarkResults,
+    exclude_private_from_borda: bool = False,
 ) -> pd.DataFrame:
     """Create summary table from BenchmarkResults.

@@ -311,6 +315,7 @@ def _create_summary_table_mean_public_private(

     Args:
         benchmark_results: BenchmarkResults object containing model results
+        exclude_private_from_borda: If True, calculate Borda rank using only public tasks

     Returns:
         DataFrame with model summaries, ready for styling in the leaderboard
@@ -353,10 +358,13 @@ def _create_summary_table_mean_public_private(

     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean(public)", public_mean)
     joint_table.insert(1, "mean(private)", private_mean)
-
+    if exclude_private_from_borda:
+        borda_per_task = per_task[public_task_name]
+    else:
+        borda_per_task = per_task
+
     joint_table["borda_rank"] = _get_borda_rank(borda_per_task)
     joint_table = joint_table.sort_values("borda_rank", ascending=True)
     joint_table = joint_table.reset_index()
@@ -476,7 +484,6 @@ def _create_summary_table_mean_subset(

     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean(subset)", overall_subset_mean)
     joint_table["borda_rank"] = _get_borda_rank(per_subset)
     joint_table = joint_table.sort_values("mean(subset)", ascending=False)
@@ -595,7 +602,6 @@ def _create_summary_table_mean_task_type(

     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean_by_task_type", typed_mean)
     joint_table = joint_table.sort_values("mean_by_task_type", ascending=False)
     joint_table["borda_rank"] = _get_borda_rank(per_task)
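The `borda_rank` column produced here is a Borda-count ranking: every task (or subset) ranks the models, ranks become points, and points are summed across tasks. A rough illustration of the idea; mteb's actual `_borda_count`/`_get_borda_rank` may differ in details such as tie handling:

    import pandas as pd

    # Illustrative models x tasks score table (made-up numbers).
    per_task = pd.DataFrame(
        {"taskA": [0.9, 0.8, 0.7], "taskB": [0.6, 0.9, 0.8]},
        index=["model_1", "model_2", "model_3"],
    )
    # Each task awards points by rank (higher score -> more points);
    # summing across tasks and re-ranking gives the Borda rank.
    points = per_task.rank(axis=0, ascending=True).sum(axis=1)
    borda_rank = points.rank(ascending=False).astype(int)
    print(borda_rank.sort_values())  # model_2 comes out first here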
mteb/benchmarks/benchmark.py
CHANGED
@@ -123,9 +123,19 @@ class RtebBenchmark(Benchmark):
             _create_summary_table_mean_public_private,
         )

-        joint_table = _create_summary_table_mean_public_private(
+        joint_table = _create_summary_table_mean_public_private(
+            benchmark_results, exclude_private_from_borda=True
+        )
+        # issue 3902: temporary remove the private column from RTEB summary table
+        if "Mean (Private)" in joint_table.columns:
+            joint_table = joint_table.drop(columns=["Mean (Private)"])
         # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
+        # but due to 3902, if Private column existed, Mean (Task) was the mean of Public and Private so instead we drop Mean (Task) and rename Mean (Public) to Mean (Task)
         joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
+        if "Mean (Task)" in joint_table.columns:
+            joint_table = joint_table.drop(columns=["Mean (Task)"])
+            joint_table = joint_table.rename(columns={"Mean (Public)": "Mean (Task)"})
+
         return joint_table
mteb/benchmarks/benchmarks/__init__.py
CHANGED

@@ -3,6 +3,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     BEIR_NL,
     BRIGHT,
     BRIGHT_LONG,
+    BRIGHT_V1_1,
     BUILT_MTEB,
     C_MTEB,
     CHEMTEB,
@@ -69,6 +70,7 @@ __all__ = [
     "BEIR_NL",
     "BRIGHT",
     "BRIGHT_LONG",
+    "BRIGHT_V1_1",
     "BUILT_MTEB",
     "CHEMTEB",
     "CHEMTEB_V1_1",
mteb/benchmarks/benchmarks/benchmarks.py
CHANGED

@@ -1330,6 +1330,46 @@ This is the long version of the benchmark, which only filter longer documents.
 """,
 )

+BRIGHT_V1_1 = Benchmark(
+    name="BRIGHT(v1.1)",
+    display_name="Reasoning Retrieval",
+    tasks=get_tasks(
+        tasks=[
+            "BrightBiologyRetrieval",
+            "BrightEarthScienceRetrieval",
+            "BrightEconomicsRetrieval",
+            "BrightPsychologyRetrieval",
+            "BrightRoboticsRetrieval",
+            "BrightStackoverflowRetrieval",
+            "BrightSustainableLivingRetrieval",
+            "BrightPonyRetrieval",
+            "BrightLeetcodeRetrieval",
+            "BrightAopsRetrieval",
+            "BrightTheoremQATheoremsRetrieval",
+            "BrightTheoremQAQuestionsRetrieval",
+            "BrightBiologyLongRetrieval",
+            "BrightEarthScienceLongRetrieval",
+            "BrightEconomicsLongRetrieval",
+            "BrightPsychologyLongRetrieval",
+            "BrightRoboticsLongRetrieval",
+            "BrightStackoverflowLongRetrieval",
+            "BrightSustainableLivingLongRetrieval",
+            "BrightPonyLongRetrieval",
+        ],
+    ),
+    description="v1.1 refactors the BRIGHT into a different tasks and added prompt to individual tasks.",
+    reference="https://brightbenchmark.github.io/",
+    citation=r"""
+@article{su2024bright,
+  author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
+  journal = {arXiv preprint arXiv:2407.12883},
+  title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
+  year = {2024},
+}
+""",
+)
+
+
 CODE_RAG = Benchmark(
     name="CodeRAG",
     tasks=get_tasks(
@@ -1781,8 +1821,7 @@ BEIR_NL = Benchmark(
         "TRECCOVID-NL",
     ],
     ),
-    description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated "
-    "translation.",
+    description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.",
     reference="https://arxiv.org/abs/2412.08329",
     contacts=["nikolay-banar"],
     citation=r"""
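Once exported from the benchmarks package, the new benchmark should be resolvable by name like any other; a sketch assuming the public `mteb.get_benchmark` lookup:

    import mteb

    # Look up the newly registered benchmark by the name in its definition.
    benchmark = mteb.get_benchmark("BRIGHT(v1.1)")
    print(len(benchmark.tasks))  # 20 Bright* tasks per the definition above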
@@ -10,6 +10,8 @@ RTEB_CITATION = r"""@article{rteb2025,
|
|
|
10
10
|
year = {2025},
|
|
11
11
|
}"""
|
|
12
12
|
|
|
13
|
+
removal_note = "\n\nNote: We have temporarily removed the 'Private' column to read more about this decision out the [announcement](https://github.com/embeddings-benchmark/mteb/issues/3934)."
|
|
14
|
+
|
|
13
15
|
RTEB_MAIN = RtebBenchmark(
|
|
14
16
|
name="RTEB(beta)",
|
|
15
17
|
display_name="RTEB Multilingual",
|
|
@@ -48,7 +50,8 @@ RTEB_MAIN = RtebBenchmark(
|
|
|
48
50
|
"JapaneseLegal1Retrieval",
|
|
49
51
|
],
|
|
50
52
|
),
|
|
51
|
-
description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
53
|
+
description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
54
|
+
+ removal_note,
|
|
52
55
|
citation=RTEB_CITATION,
|
|
53
56
|
contacts=["fzowl"],
|
|
54
57
|
)
|
|
@@ -83,7 +86,8 @@ RTEB_ENGLISH = RtebBenchmark(
|
|
|
83
86
|
],
|
|
84
87
|
languages=["eng"],
|
|
85
88
|
),
|
|
86
|
-
description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
89
|
+
description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
90
|
+
+ removal_note,
|
|
87
91
|
citation=RTEB_CITATION,
|
|
88
92
|
contacts=["fzowl"],
|
|
89
93
|
)
|
|
@@ -101,7 +105,8 @@ RTEB_FRENCH = RtebBenchmark(
|
|
|
101
105
|
],
|
|
102
106
|
languages=["fra"],
|
|
103
107
|
),
|
|
104
|
-
description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
108
|
+
description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
109
|
+
+ removal_note,
|
|
105
110
|
citation=RTEB_CITATION,
|
|
106
111
|
contacts=["fzowl"],
|
|
107
112
|
)
|
|
@@ -119,7 +124,8 @@ RTEB_GERMAN = RtebBenchmark(
|
|
|
119
124
|
"GermanLegal1Retrieval",
|
|
120
125
|
],
|
|
121
126
|
),
|
|
122
|
-
description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
127
|
+
description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
128
|
+
+ removal_note,
|
|
123
129
|
citation=RTEB_CITATION,
|
|
124
130
|
contacts=["fzowl"],
|
|
125
131
|
)
|
|
@@ -135,7 +141,8 @@ RTEB_JAPANESE = RtebBenchmark(
|
|
|
135
141
|
"JapaneseLegal1Retrieval",
|
|
136
142
|
],
|
|
137
143
|
),
|
|
138
|
-
description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
144
|
+
description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
145
|
+
+ removal_note,
|
|
139
146
|
citation=RTEB_CITATION,
|
|
140
147
|
contacts=["fzowl"],
|
|
141
148
|
)
|
|
@@ -156,7 +163,8 @@ RTEB_FINANCE = RtebBenchmark(
|
|
|
156
163
|
"EnglishFinance4Retrieval",
|
|
157
164
|
],
|
|
158
165
|
),
|
|
159
|
-
description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
166
|
+
description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
167
|
+
+ removal_note,
|
|
160
168
|
citation=RTEB_CITATION,
|
|
161
169
|
contacts=["fzowl"],
|
|
162
170
|
)
|
|
@@ -177,7 +185,8 @@ RTEB_LEGAL = RtebBenchmark(
|
|
|
177
185
|
"JapaneseLegal1Retrieval",
|
|
178
186
|
],
|
|
179
187
|
),
|
|
180
|
-
description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
188
|
+
description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
189
|
+
+ removal_note,
|
|
181
190
|
citation=RTEB_CITATION,
|
|
182
191
|
contacts=["fzowl"],
|
|
183
192
|
)
|
|
@@ -199,7 +208,8 @@ RTEB_CODE = RtebBenchmark(
|
|
|
199
208
|
"JapaneseCode1Retrieval",
|
|
200
209
|
],
|
|
201
210
|
),
|
|
202
|
-
description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
211
|
+
description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
212
|
+
+ removal_note,
|
|
203
213
|
citation=RTEB_CITATION,
|
|
204
214
|
contacts=["fzowl"],
|
|
205
215
|
)
|
|
@@ -217,7 +227,8 @@ RTEB_HEALTHCARE = RtebBenchmark(
|
|
|
217
227
|
"GermanHealthcare1Retrieval",
|
|
218
228
|
],
|
|
219
229
|
),
|
|
220
|
-
description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
230
|
+
description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
231
|
+
+ removal_note,
|
|
221
232
|
citation=RTEB_CITATION,
|
|
222
233
|
contacts=["fzowl"],
|
|
223
234
|
)
|
mteb/cache.py
CHANGED
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import gzip
 import io
 import json
@@ -7,9 +9,8 @@ import shutil
 import subprocess
 import warnings
 from collections import defaultdict
-from collections.abc import Iterable, Sequence
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast

 import requests
 from pydantic import ValidationError
@@ -19,7 +20,11 @@ from mteb.abstasks import AbsTask
 from mteb.benchmarks.benchmark import Benchmark
 from mteb.models import ModelMeta
 from mteb.results import BenchmarkResults, ModelResult, TaskResult
-
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+
+    from mteb.types import ModelName, Revision

 logger = logging.getLogger(__name__)

@@ -584,7 +589,7 @@ class ResultCache:

         first_model = next(iter(models))
         if isinstance(first_model, ModelMeta):
-            models = cast(Iterable[ModelMeta], models)
+            models = cast("Iterable[ModelMeta]", models)
             name_and_revision = {
                 (m.model_name_as_path(), m.revision or "no_revision_available")
                 for m in models
@@ -595,7 +600,7 @@ class ResultCache:
             if (p.parent.parent.name, p.parent.name) in name_and_revision
         ]

-        str_models = cast(Sequence[str], models)
+        str_models = cast("Sequence[str]", models)
         model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
         return [p for p in paths if p.parent.parent.name in model_names]
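The `cast(Iterable[ModelMeta], ...)` to `cast("Iterable[ModelMeta]", ...)` edits follow from the import moves: unlike annotations, the first argument of `typing.cast` is an ordinary runtime expression, but `cast` also accepts the type as a string, which it never evaluates. A small sketch of why this works (not mteb code):

    from __future__ import annotations

    from typing import TYPE_CHECKING, cast

    if TYPE_CHECKING:
        from collections.abc import Sequence

    def first(items: object) -> str:
        # The quoted type is never evaluated, so this is safe even though
        # Sequence is only imported for the type checker.
        seq = cast("Sequence[str]", items)
        return seq[0]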
mteb/cli/_display_tasks.py
CHANGED
@@ -1,9 +1,15 @@
-from
+from __future__ import annotations
+
+from typing import TYPE_CHECKING

-from mteb.abstasks import AbsTask
-from mteb.benchmarks import Benchmark
 from mteb.get_tasks import MTEBTasks

+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+
+    from mteb.abstasks import AbsTask
+    from mteb.benchmarks import Benchmark
+

 def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
     """Get all benchmarks available in the MTEB."""
mteb/cli/build_cli.py
CHANGED
@@ -3,17 +3,20 @@ import logging
 import os
 import warnings
 from pathlib import Path
+from typing import TYPE_CHECKING

 import torch
 from rich.logging import RichHandler

 import mteb
-from mteb.abstasks.abstask import AbsTask
 from mteb.cache import ResultCache
 from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
 from mteb.cli.generate_model_card import generate_model_card
 from mteb.evaluate import OverwriteStrategy
-
+
+if TYPE_CHECKING:
+    from mteb.abstasks.abstask import AbsTask
+    from mteb.types import EncodeKwargs

 logger = logging.getLogger(__name__)
mteb/cli/generate_model_card.py
CHANGED
@@ -1,14 +1,21 @@
+from __future__ import annotations
+
 import logging
 import warnings
-from collections.abc import Sequence
 from pathlib import Path
+from typing import TYPE_CHECKING

 from huggingface_hub import ModelCard, ModelCardData, repo_exists

 from mteb.abstasks.abstask import AbsTask
-from mteb.benchmarks.benchmark import Benchmark
 from mteb.cache import ResultCache

+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from mteb.abstasks.abstask import AbsTask
+    from mteb.benchmarks.benchmark import Benchmark
+
 logger = logging.getLogger(__name__)
mteb/deprecated_evaluator.py
CHANGED
@@ -6,7 +6,6 @@ import os
 import sys
 import traceback
 import warnings
-from collections.abc import Iterable, Sequence
 from copy import deepcopy
 from datetime import datetime
 from itertools import chain
@@ -18,26 +17,31 @@ import datasets

 import mteb
 from mteb.abstasks import AbsTask
-from mteb.abstasks.aggregated_task import AbsTaskAggregate
-from mteb.abstasks.task_metadata import TaskCategory, TaskType
 from mteb.benchmarks import Benchmark
 from mteb.models import (
     CrossEncoderWrapper,
     ModelMeta,
-    MTEBModels,
     SentenceTransformerEncoderWrapper,
 )
 from mteb.results import TaskResult
-
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+
+    from sentence_transformers import CrossEncoder, SentenceTransformer
+
+    from mteb.abstasks.aggregated_task import AbsTaskAggregate
+    from mteb.abstasks.task_metadata import TaskCategory, TaskType
+    from mteb.models import (
+        MTEBModels,
+    )
+    from mteb.types import EncodeKwargs, ScoresDict

 if sys.version_info >= (3, 13):
     from warnings import deprecated
 else:
     from typing_extensions import deprecated

-if TYPE_CHECKING:
-    from sentence_transformers import CrossEncoder, SentenceTransformer
-
 logger = logging.getLogger(__name__)

@@ -66,9 +70,9 @@ class MTEB:
         """
         if isinstance(next(iter(tasks)), Benchmark):
             self.benchmarks = tasks
-            self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+            self.tasks = list(chain.from_iterable(cast("Iterable[Benchmark]", tasks)))
         elif isinstance(next(iter(tasks)), AbsTask):
-            self.tasks = list(cast(Iterable[AbsTask], tasks))
+            self.tasks = list(cast("Iterable[AbsTask]", tasks))

         self.err_logs_path = Path(err_logs_path)
         self._last_evaluated_splits: dict[str, list[str]] = {}
@@ -313,7 +317,7 @@ class MTEB:
         elif isinstance(model, CrossEncoder):
             mteb_model = CrossEncoderWrapper(model)
         else:
-            mteb_model = cast(MTEBModels, model)
+            mteb_model = cast("MTEBModels", model)

         meta = self.create_model_meta(mteb_model)
         output_path = self._create_output_folder(meta, output_folder)
@@ -346,7 +350,7 @@ class MTEB:
         )

         if task.is_aggregate:
-            aggregated_task = cast(AbsTaskAggregate, task)
+            aggregated_task = cast("AbsTaskAggregate", task)
             self_ = MTEB(tasks=aggregated_task.metadata.tasks)
             aggregated_task_results = self_.run(
                 mteb_model,
mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json
ADDED

@@ -0,0 +1,35 @@
+{
+    "standard": {
+        "num_samples": 188113,
+        "number_of_characters": 141769714,
+        "documents_text_statistics": {
+            "total_text_length": 141734227,
+            "min_text_length": 58,
+            "average_text_length": 753.8974425803981,
+            "max_text_length": 7334,
+            "unique_texts": 176508
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 35487,
+            "min_text_length": 85,
+            "average_text_length": 319.7027027027027,
+            "max_text_length": 1167,
+            "unique_texts": 111
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 524,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 4.7207207207207205,
+            "max_relevant_docs_per_query": 8,
+            "unique_relevant_docs": 111
+        },
+        "top_ranked_statistics": {
+            "num_top_ranked": 20264921,
+            "min_top_ranked_per_query": 176954,
+            "average_top_ranked_per_query": 182566.85585585586,
+            "max_top_ranked_per_query": 186176
+        }
+    }
+}