mteb-2.4.2-py3-none-any.whl → mteb-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/benchmarks/benchmark.py +31 -13
- mteb/benchmarks/benchmarks/benchmarks.py +2 -2
- mteb/cache.py +36 -7
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/models/model_implementations/codefuse_models.py +144 -0
- mteb/models/model_implementations/mod_models.py +3 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +5 -3
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
- mteb/results/benchmark_results.py +22 -4
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/retrieval/kor/__init__.py +2 -1
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- {mteb-2.4.2.dist-info → mteb-2.5.0.dist-info}/METADATA +1 -1
- {mteb-2.4.2.dist-info → mteb-2.5.0.dist-info}/RECORD +20 -15
- {mteb-2.4.2.dist-info → mteb-2.5.0.dist-info}/WHEEL +0 -0
- {mteb-2.4.2.dist-info → mteb-2.5.0.dist-info}/entry_points.txt +0 -0
- {mteb-2.4.2.dist-info → mteb-2.5.0.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.4.2.dist-info → mteb-2.5.0.dist-info}/top_level.txt +0 -0
mteb/benchmarks/benchmark.py
CHANGED
@@ -1,22 +1,16 @@
+from __future__ import annotations
+
 from collections.abc import Iterable, Sequence
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Literal
 
 import pandas as pd
 
-from mteb.
-    _create_per_language_table_from_benchmark_results,
-    _create_per_task_table_from_benchmark_results,
-    _create_summary_table_from_benchmark_results,
-    _create_summary_table_mean_public_private,
-    _create_summary_table_mean_subset,
-    _create_summary_table_mean_task_type,
-)
-from mteb.results import BenchmarkResults
+from mteb.abstasks.abstask import AbsTask
 from mteb.types import StrURL
 
 if TYPE_CHECKING:
-    from mteb.
+    from mteb.results import BenchmarkResults
 
 
 @dataclass
@@ -43,7 +37,7 @@ class Benchmark:
     """
 
     name: str
-    tasks: Sequence[
+    tasks: Sequence[AbsTask]
     description: str | None = None
     reference: StrURL | None = None
     citation: str | None = None
@@ -53,13 +47,13 @@ class Benchmark:
     display_name: str | None = None
     language_view: list[str] | Literal["all"] = field(default_factory=list)
 
-    def __iter__(self) -> Iterable[
+    def __iter__(self) -> Iterable[AbsTask]:
         return iter(self.tasks)
 
     def __len__(self) -> int:
         return len(self.tasks)
 
-    def __getitem__(self, index: int) ->
+    def __getitem__(self, index: int) -> AbsTask:
         return self.tasks[index]
 
     def _create_summary_table(
@@ -70,6 +64,10 @@ class Benchmark:
         Returns:
             A pandas DataFrame representing the summary results.
         """
+        from mteb.benchmarks._create_table import (
+            _create_summary_table_from_benchmark_results,
+        )
+
         return _create_summary_table_from_benchmark_results(benchmark_results)
 
     def _create_per_task_table(
@@ -80,6 +78,10 @@
         Returns:
            A pandas DataFrame representing the per-task results.
         """
+        from mteb.benchmarks._create_table import (
+            _create_per_task_table_from_benchmark_results,
+        )
+
         return _create_per_task_table_from_benchmark_results(benchmark_results)
 
     def _create_per_language_table(
@@ -90,6 +92,10 @@
         Returns:
            A pandas DataFrame representing the per-language results.
         """
+        from mteb.benchmarks._create_table import (
+            _create_per_language_table_from_benchmark_results,
+        )
+
         if self.language_view == "all" or len(self.language_view) > 0:
             return _create_per_language_table_from_benchmark_results(
                 benchmark_results, self.language_view
@@ -111,6 +117,10 @@ class RtebBenchmark(Benchmark):
     def _create_summary_table(
         self, benchmark_results: BenchmarkResults
     ) -> pd.DataFrame:
+        from mteb.benchmarks._create_table import (
+            _create_summary_table_mean_public_private,
+        )
+
         joint_table = _create_summary_table_mean_public_private(benchmark_results)
         # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
         joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
@@ -123,6 +133,8 @@ class HUMEBenchmark(Benchmark):
     def _create_summary_table(
         self, benchmark_results: BenchmarkResults
     ) -> pd.DataFrame:
+        from mteb.benchmarks._create_table import _create_summary_table_mean_subset
+
         return _create_summary_table_mean_subset(benchmark_results)
 
 
@@ -132,6 +144,8 @@ class MIEBBenchmark(Benchmark):
     def _create_summary_table(
         self, benchmark_results: BenchmarkResults
     ) -> pd.DataFrame:
+        from mteb.benchmarks._create_table import _create_summary_table_mean_task_type
+
         return _create_summary_table_mean_task_type(benchmark_results)
 
 
@@ -141,6 +155,10 @@ class VidoreBenchmark(Benchmark):
     def _create_summary_table(
         self, benchmark_results: BenchmarkResults
     ) -> pd.DataFrame:
+        from mteb.benchmarks._create_table import (
+            _create_summary_table_mean_public_private,
+        )
+
         joint_table = _create_summary_table_mean_public_private(benchmark_results)
         # For ViDoRe (V1, V2, V3): all tasks are Document Understanding type, so Document Understanding column = Mean (Task)
         joint_table = joint_table.rename(
@@ -435,7 +435,7 @@ MTEB_RETRIEVAL_MEDICAL = Benchmark(
         ],
     ),
     description="A curated set of MTEB tasks designed to evaluate systems in the context of medical information retrieval.",
-    reference=
+    reference=None,
     citation=None,
 )
 
@@ -2589,7 +2589,7 @@ HUME = HUMEBenchmark(
         ],
     ),
     description="The HUME benchmark is designed to evaluate the performance of text embedding models and humans on a comparable set of tasks. This captures areas where models perform better than human annotators and the reverse. In the paper, we go further into the analysis and what conclusions can be drawn.",
-    reference=
+    reference=None,
     citation=None,
     contacts=["AdnanElAssadi56", "KennethEnevoldsen", "isaac-chung", "Samoed"],
 )
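The benchmark.py change swaps the module-level imports of the _create_table helpers for function-local imports inside each _create_summary_table / _create_per_*_table method, imports AbsTask at runtime, and keeps BenchmarkResults under TYPE_CHECKING. Function-local imports like this are typically used to defer a heavy import or to break an import cycle between modules that only need each other at call time. A minimal sketch of the pattern, with hypothetical module names rather than mteb's real layout:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers, so it cannot create a
    # circular import at runtime.
    from results_module import Results  # hypothetical module


class Report:
    def build_table(self, results: Results):
        # Deferred (function-local) import: resolved on the first call,
        # after both modules have finished loading, so a cycle between
        # this module and table_helpers never bites at import time.
        from table_helpers import make_table  # hypothetical module

        return make_table(results)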
mteb/cache.py
CHANGED
@@ -8,7 +8,9 @@ from collections.abc import Sequence
 from pathlib import Path
 from typing import cast
 
+import mteb
 from mteb.abstasks import AbsTask
+from mteb.benchmarks.benchmark import Benchmark
 from mteb.models import ModelMeta
 from mteb.results import BenchmarkResults, ModelResult, TaskResult
 from mteb.types import ModelName, Revision
@@ -195,12 +197,14 @@ class ResultCache:
         self,
         remote: str = "https://github.com/embeddings-benchmark/results",
         download_latest: bool = True,
+        revision: str | None = None,
     ) -> Path:
         """Downloads the latest version of the results repository from GitHub to a local cache directory. Required git to be installed.
 
         Args:
             remote: The URL of the results repository on GitHub.
             download_latest: If True it will download the latest version of the repository, otherwise it will only update the existing repository.
+            revision: If specified, it will checkout the given revision after cloning or pulling the repository.
 
         Returns:
             The path to the local cache directory.
@@ -228,14 +232,27 @@ class ResultCache:
             )
             raise ValueError(msg)
 
-            if download_latest:
+            if revision or download_latest:
                 logger.info(
-                    f"remote repository already exists in {results_directory},
+                    f"remote repository already exists in {results_directory}, fetching updates"
+                )
+                subprocess.run(
+                    ["git", "fetch", "--all", "--tags"],
+                    cwd=results_directory,
+                    check=True,
                 )
-                subprocess.run(["git", "pull"], cwd=results_directory)
             else:
                 logger.debug(
-                    f"Results repository already exists in {results_directory}, skipping update,
+                    f"Results repository already exists in {results_directory}, skipping update, "
+                    f"set download_latest=True to update it"
+                )
+
+            if revision:
+                logger.info(f"Checking out revision '{revision}'")
+                subprocess.run(
+                    ["git", "checkout", revision],
+                    cwd=results_directory,
+                    check=True,
                 )
             return results_directory
 
@@ -243,8 +260,15 @@ class ResultCache:
             f"No results repository found in {results_directory}, cloning it from {remote}"
         )
 
+        clone_cmd = ["git", "clone", "--depth", "1"]
+
+        if revision:
+            logger.info(f"Cloning repository at revision '{revision}'")
+            clone_cmd.append(f"--revision={revision}")
+        clone_cmd.extend([remote, "remote"])
+
         subprocess.run(
-
+            clone_cmd,
             cwd=self.cache_path,
             check=True,
         )
@@ -443,7 +467,7 @@ class ResultCache:
     def load_results(
         self,
         models: Sequence[str] | Sequence[ModelMeta] | None = None,
-        tasks: Sequence[str] | Sequence[AbsTask] | None = None,
+        tasks: Sequence[str] | Sequence[AbsTask] | Benchmark | str | None = None,
         require_model_meta: bool = True,
         include_remote: bool = True,
         validate_and_filter: bool = False,
@@ -453,7 +477,8 @@
 
         Args:
             models: A list of model names to load the results for. If None it will load the results for all models.
-            tasks: A list of task names to load the results for. If
+            tasks: A list of task names to load the results for. If str is passed, then benchmark will be loaded.
+                If None it will load the results for all tasks.
             require_model_meta: If True it will ignore results that do not have a model_meta.json file. If false it attempt to
                 extract the model name and revision from the path.
             include_remote: If True, it will include results from the remote repository.
@@ -475,6 +500,9 @@
         ...     require_model_meta=True,
         ... )
         """
+        if isinstance(tasks, str):
+            tasks = mteb.get_benchmark(tasks)
+
         paths = self.get_cache_paths(
             models=models,
             tasks=tasks,
@@ -524,6 +552,7 @@
 
         benchmark_results = BenchmarkResults(
             model_results=models_results,
+            benchmark=tasks if isinstance(tasks, Benchmark) else None,
         )
 
         return benchmark_results
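Two additions to cache.py change behaviour: download_from_remote() gains a revision argument (git fetch --all --tags followed by git checkout of the given ref), and load_results() now also accepts a benchmark name as a plain string, resolving it through mteb.get_benchmark() and attaching the resulting Benchmark to the returned BenchmarkResults. A hedged usage sketch based on the signatures above, assuming ResultCache's default constructor; the git ref and benchmark name are illustrative:

from mteb.cache import ResultCache

cache = ResultCache()

# New in 2.5.0: pin the downloaded results repository to a specific git ref
# (tag, branch, or commit) instead of whatever the default branch points to.
cache.download_from_remote(
    remote="https://github.com/embeddings-benchmark/results",
    download_latest=True,
    revision="main",  # illustrative ref; any ref that exists in the repo
)

# Also new: a benchmark name as a plain string. load_results() resolves it
# via mteb.get_benchmark() and keeps the Benchmark on the returned object.
results = cache.load_results(
    tasks="MTEB(Multilingual, v1)",  # illustrative benchmark name
    require_model_meta=True,
)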
mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json
ADDED
@@ -0,0 +1,54 @@
+{
+  "test": {
+    "num_samples": 193,
+    "number_texts_intersect_with_train": 0,
+    "text_statistics": {
+      "total_text_length": 1543015,
+      "min_text_length": 492,
+      "average_text_length": 7994.896373056995,
+      "max_text_length": 49510,
+      "unique_texts": 193
+    },
+    "image_statistics": null,
+    "label_statistics": {
+      "min_labels_per_text": 1,
+      "average_label_per_text": 1.0,
+      "max_labels_per_text": 1,
+      "unique_labels": 2,
+      "labels": {
+        "1": {
+          "count": 177
+        },
+        "0": {
+          "count": 16
+        }
+      }
+    }
+  },
+  "train": {
+    "num_samples": 870,
+    "number_texts_intersect_with_train": null,
+    "text_statistics": {
+      "total_text_length": 6968132,
+      "min_text_length": 259,
+      "average_text_length": 8009.347126436782,
+      "max_text_length": 74490,
+      "unique_texts": 870
+    },
+    "image_statistics": null,
+    "label_statistics": {
+      "min_labels_per_text": 1,
+      "average_label_per_text": 1.0,
+      "max_labels_per_text": 1,
+      "unique_labels": 2,
+      "labels": {
+        "1": {
+          "count": 755
+        },
+        "0": {
+          "count": 115
+        }
+      }
+    }
+  }
+}
mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 6734,
+    "number_of_characters": 718835,
+    "documents_text_statistics": {
+      "total_text_length": 523388,
+      "min_text_length": 352,
+      "average_text_length": 545.1958333333333,
+      "max_text_length": 2952,
+      "unique_texts": 960
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 195447,
+      "min_text_length": 5,
+      "average_text_length": 33.84949774852788,
+      "max_text_length": 110,
+      "unique_texts": 5764
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 5774,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 1.0,
+      "max_relevant_docs_per_query": 1,
+      "unique_relevant_docs": 960
+    },
+    "top_ranked_statistics": null
+  }
+}
mteb/models/model_implementations/codefuse_models.py
CHANGED
@@ -1,5 +1,6 @@
 from mteb.models import ModelMeta
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_meta import ScoringFunction
 from mteb.types import PromptType
 
 F2LLM_CITATION = """@article{2025F2LLM,
@@ -74,6 +75,22 @@ training_datasets = {
     "TwentyNewsgroupsClustering",
 }
 
+c2llm_training_datasets = {
+    "CodeSearchNet",
+    "CodeSearchNetRetrieval",
+    "CodeSearchNetCCRetrieval",
+    "CodeEditSearchRetrieval",
+    "CodeFeedbackMT",
+    "CodeFeedbackST",
+    "CodeTransOceanContest",
+    "CodeTransOceanDL",
+    "COIRCodeSearchNetRetrieval",
+    "CosQA",
+    "StackOverflowQA",
+    "SyntheticText2SQL",
+    "AdvTrain",
+}
+
 prompts_dict = {
     "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not counterfactual.",
     "Banking77Classification": "Given an online banking query, find the corresponding intents.",
@@ -119,6 +136,77 @@ prompts_dict = {
 }
 
 
+c2llm_prompts_dict = {
+    "CodeEditSearchRetrieval": {
+        "query": "Retrieve the diff code that relevant the following query:\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeSearchNetRetrieval": {
+        "query": "Retrieve the code that solves the following query:\n",
+        "document": "Retrieved Answer:",
+    },
+    "AppsRetrieval": {
+        "query": "Given a problem description from a programming contest, retrieve code examples that can assist in solving it.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeFeedbackMT": {
+        "query": "Given a multi-turn conversation history that includes both text and code, retrieve relevant multi-modal answers composed of text and code that address the ongoing discussion.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeFeedbackST": {
+        "query": "Given a single-turn question composed of text and code, retrieve suitable answers that also mix text and code to provide helpful feedback.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeSearchNetCCRetrieval": {
+        "query": "Given an initial code segment, retrieve the subsequent segment that continues the code.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeTransOceanContest": {
+        "query": "Given a Python code snippet, retrieve its semantically equivalent version written in C++.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeTransOceanDL": {
+        "query": "Given a Python code snippet, retrieve its semantically equivalent version written in C++.\n",
+        "document": "Retrieved Answer:",
+    },
+    "COIRCodeSearchNetRetrieval": {
+        "query": "Given a code snippet, retrieve its corresponding document string that summarizes its functionality.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CosQA": {
+        "query": "Given a query from a web search, retrieve code that is helpful in addressing the query.\n",
+        "document": "Retrieved Answer:",
+    },
+    "StackOverflowQA": {
+        "query": "Given a question combining text and code, retrieve relevant answers that also contain both text and code snippets and can address the question.\n",
+        "document": "Retrieved Answer:",
+    },
+    "SyntheticText2SQL": {
+        "query": "Given a natural language question, retrieve SQL queries that serve as appropriate responses.\n",
+        "document": "Retrieved Answer:",
+    },
+}
+
+c2llm_languages = [
+    "eng-Latn",
+    "zho-Hans",
+    "python-Code",
+    "javascript-Code",
+    "go-Code",
+    "ruby-Code",
+    "java-Code",
+    "php-Code",
+]
+
+c2llm_loader_kwargs = dict(
+    trust_remote_code=True,
+    prompts_dict=c2llm_prompts_dict,
+    apply_instruction_to_passages=True,
+    max_seq_length=2048,
+    padding_side="left",
+)
+
+
 def instruction_template(
     instruction: str, prompt_type: PromptType | None = None
 ) -> str:
@@ -218,3 +306,59 @@ F2LLM_4B = ModelMeta(
     training_datasets=training_datasets,
     citation=F2LLM_CITATION,
 )
+
+C2LLM_0B5 = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=c2llm_loader_kwargs,
+    name="codefuse-ai/C2LLM-0.5B",
+    revision="f08c18be03de42c6e388948a1804d4b271a953a2",
+    release_date="2025-12-22",
+    languages=c2llm_languages,
+    n_parameters=497252096,
+    memory_usage_mb=948.0,
+    max_tokens=32768,
+    embed_dim=896,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/codefuse-ai/C2LLM-0.5B",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=True,
+    training_datasets=c2llm_training_datasets,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    is_cross_encoder=None,
+    citation=None,
+    contacts=None,
+)
+
+C2LLM_7B = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=c2llm_loader_kwargs,
+    name="codefuse-ai/C2LLM-7B",
+    revision="c1dc16d6d64eb962c783bfb36a6d9c2f24a86dca",
+    release_date="2025-12-22",
+    languages=c2llm_languages,
+    n_parameters=7667028992,
+    memory_usage_mb=14624.0,
+    max_tokens=32768,
+    embed_dim=3584,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/codefuse-ai/C2LLM-7B",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=True,
+    training_datasets=c2llm_training_datasets,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    is_cross_encoder=None,
+    citation=None,
+    contacts=None,
+)
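codefuse_models.py registers two new code-embedding checkpoints, codefuse-ai/C2LLM-0.5B and codefuse-ai/C2LLM-7B, both wired through InstructSentenceTransformerModel with per-task query/document prompts. A hedged sketch of picking one up by name: get_model() resolves registered ModelMeta entries, while the evaluate() call is an assumption about the 2.x evaluation entry point rather than something shown in this diff:

import mteb

# Resolve the newly registered ModelMeta by its Hugging Face name.
model = mteb.get_model("codefuse-ai/C2LLM-0.5B")  # or "codefuse-ai/C2LLM-7B"

# CosQA is one of the tasks the model ships a dedicated prompt for
# (see c2llm_prompts_dict above).
tasks = mteb.get_tasks(tasks=["CosQA"])
results = mteb.evaluate(model, tasks)  # assumption: mteb 2.x evaluate() API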
mteb/models/model_implementations/mod_models.py
CHANGED
@@ -137,7 +137,7 @@ _PREDEFINED_PROMPTS = {
     # SQL domain
     "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
     # Multilingual
-    "MIRACLRetrievalHardNegatives": "Given a
+    "MIRACLRetrievalHardNegatives": "Given a question, retrieve Wikipedia passages that answer the question",
     # ========== Private/Closed Datasets ==========
     # Code domain (Private)
     "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
@@ -166,6 +166,8 @@ MoD_Embedding = ModelMeta(
         instruction_template=instruction_template,
         apply_instruction_to_passages=False,
         prompts_dict=_PREDEFINED_PROMPTS,
+        max_seq_length=18480,
+        model_kwargs={"torch_dtype": "bfloat16"},
     ),
     name="bflhc/MoD-Embedding",
     languages=multilingual_langs,
mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
CHANGED
@@ -65,14 +65,16 @@ class LlamaNemoretrieverColembed(AbsEncoder):
         iterator = DataLoader(images, batch_size=batch_size)
 
         for batch in iterator:
-            for
+            for image in batch["image"]:
                 pil_img = (
-
+                    image
+                    if isinstance(image, Image.Image)
+                    else F.to_pil_image(image.to("cpu"))
                 )
                 all_images.append(pil_img)
 
         batch_size = 1
-        return self.model.
+        return self.model.forward_images(all_images, batch_size=batch_size)
 
     def calculate_probs(self, text_embeddings, image_embeddings):
         scores = self.similarity(text_embeddings, image_embeddings)