PyPI - mteb - Versions diffs - 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl - Mend

mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (458) hide show

mteb/models/model_implementations/clip_models.py CHANGED Viewed

@@ -117,6 +117,7 @@ CLIP_CITATION = """
 clip_vit_large_patch14 = ModelMeta(
     loader=CLIPModel,  # type: ignore
     name="openai/clip-vit-large-patch14",
+    model_type=["dense"],
     languages=["eng-Latn"],
     revision="32bd64288804d66eefd0ccbe215aa642df71cc41",
     release_date="2021-02-26",
@@ -140,6 +141,7 @@ clip_vit_large_patch14 = ModelMeta(
 clip_vit_base_patch32 = ModelMeta(
     loader=CLIPModel,  # type: ignore
     name="openai/clip-vit-base-patch32",
+    model_type=["dense"],
     languages=["eng-Latn"],
     revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268",
     release_date="2021-02-26",
@@ -163,6 +165,7 @@ clip_vit_base_patch32 = ModelMeta(
 clip_vit_base_patch16 = ModelMeta(
     loader=CLIPModel,  # type: ignore
     name="openai/clip-vit-base-patch16",
+    model_type=["dense"],
     languages=["eng-Latn"],
     revision="57c216476eefef5ab752ec549e440a49ae4ae5f3",
     release_date="2021-02-26",

mteb/models/model_implementations/clips_models.py ADDED Viewed

@@ -0,0 +1,100 @@
+from mteb.models.model_meta import (
+    ModelMeta,
+    ScoringFunction,
+)
+from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+from .e5_models import ME5_TRAINING_DATA, model_prompts
+E5_NL_CITATION = """
+@misc{banar2025mtebnle5nlembeddingbenchmark,
+  archiveprefix = {arXiv},
+  author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
+  eprint = {2509.12340},
+  primaryclass = {cs.CL},
+  title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
+  url = {https://arxiv.org/abs/2509.12340},
+  year = {2025},
+}
+"""
+e5_nl_small = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=model_prompts,
+    ),
+    name="clips/e5-small-trm-nl",
+    model_type=["dense"],
+    languages=["nld-Latn"],
+    open_weights=True,
+    revision="0243664a6c5e12eef854b091eb283e51833c3e9f",
+    release_date="2025-09-23",
+    n_parameters=40_800_000,
+    memory_usage_mb=78,
+    embed_dim=384,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/clips/e5-small-trm-nl",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code="https://github.com/ELotfi/e5-nl",
+    public_training_data="https://huggingface.co/collections/clips/beir-nl",
+    training_datasets=ME5_TRAINING_DATA,  # mMARCO-NL, HotpotQA-NL, FEVER-NL, and LLM generated data
+    adapted_from="intfloat/multilingual-e5-small",
+    citation=E5_NL_CITATION,
+)
+e5_nl_base = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=model_prompts,
+    ),
+    name="clips/e5-base-trm-nl",
+    model_type=["dense"],
+    languages=["nld-Latn"],
+    open_weights=True,
+    revision="6bd5722f236da48b4b8bcb28cc1fc478f7089956",
+    release_date="2025-09-23",
+    n_parameters=124_400_000,
+    memory_usage_mb=237,
+    embed_dim=768,
+    license="mit",
+    max_tokens=514,
+    reference="https://huggingface.co/clips/e5-base-trm-nl",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code="https://github.com/ELotfi/e5-nl",
+    public_training_data="https://huggingface.co/collections/clips/beir-nl",
+    adapted_from="intfloat/multilingual-e5-base",
+    training_datasets=ME5_TRAINING_DATA,  # mMARCO-NL, HotpotQA-NL, FEVER-NL, and LLM generated data
+    citation=E5_NL_CITATION,
+)
+e5_nl_large = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=model_prompts,
+    ),
+    name="clips/e5-large-trm-nl",
+    model_type=["dense"],
+    languages=["nld-Latn"],
+    open_weights=True,
+    revision="683333f86ed9eb3699b5567f0fdabeb958d412b0",
+    release_date="2025-09-23",
+    n_parameters=355_000_000,
+    memory_usage_mb=1355,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=514,
+    reference="https://huggingface.co/clips/e5-large-trm-nl",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code="https://github.com/ELotfi/e5-nl",
+    public_training_data="https://huggingface.co/collections/clips/beir-nl",
+    training_datasets=ME5_TRAINING_DATA,  # mMARCO-NL, HotpotQA-NL, FEVER-NL, and LLM generated data
+    adapted_from="intfloat/multilingual-e5-large",
+    citation=E5_NL_CITATION,
+)

mteb/models/model_implementations/codefuse_models.py CHANGED Viewed

@@ -1,7 +1,20 @@
 from mteb.models import ModelMeta
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_meta import ScoringFunction
 from mteb.types import PromptType
+F2LLM_CITATION = """@article{2025F2LLM,
+    title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
+    author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
+    journal={CoRR},
+    volume={abs/2510.02294},
+    year={2025},
+    url={https://doi.org/10.48550/arXiv.2510.02294},
+    doi={10.48550/ARXIV.2510.02294},
+    eprinttype={arXiv},
+    eprint={2510.02294}
+}"""
 training_datasets = {
     "MSMARCO",
     "ArguAna",
@@ -62,6 +75,22 @@ training_datasets = {
     "TwentyNewsgroupsClustering",
 }
+c2llm_training_datasets = {
+    "CodeSearchNet",
+    "CodeSearchNetRetrieval",
+    "CodeSearchNetCCRetrieval",
+    "CodeEditSearchRetrieval",
+    "CodeFeedbackMT",
+    "CodeFeedbackST",
+    "CodeTransOceanContest",
+    "CodeTransOceanDL",
+    "COIRCodeSearchNetRetrieval",
+    "CosQA",
+    "StackOverflowQA",
+    "SyntheticText2SQL",
+    "AdvTrain",
+}
 prompts_dict = {
     "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not counterfactual.",
     "Banking77Classification": "Given an online banking query, find the corresponding intents.",
@@ -107,6 +136,77 @@ prompts_dict = {
 }
+c2llm_prompts_dict = {
+    "CodeEditSearchRetrieval": {
+        "query": "Retrieve the diff code that relevant the following query:\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeSearchNetRetrieval": {
+        "query": "Retrieve the code that solves the following query:\n",
+        "document": "Retrieved Answer:",
+    },
+    "AppsRetrieval": {
+        "query": "Given a problem description from a programming contest, retrieve code examples that can assist in solving it.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeFeedbackMT": {
+        "query": "Given a multi-turn conversation history that includes both text and code, retrieve relevant multi-modal answers composed of text and code that address the ongoing discussion.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeFeedbackST": {
+        "query": "Given a single-turn question composed of text and code, retrieve suitable answers that also mix text and code to provide helpful feedback.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeSearchNetCCRetrieval": {
+        "query": "Given an initial code segment, retrieve the subsequent segment that continues the code.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeTransOceanContest": {
+        "query": "Given a Python code snippet, retrieve its semantically equivalent version written in C++.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CodeTransOceanDL": {
+        "query": "Given a Python code snippet, retrieve its semantically equivalent version written in C++.\n",
+        "document": "Retrieved Answer:",
+    },
+    "COIRCodeSearchNetRetrieval": {
+        "query": "Given a code snippet, retrieve its corresponding document string that summarizes its functionality.\n",
+        "document": "Retrieved Answer:",
+    },
+    "CosQA": {
+        "query": "Given a query from a web search, retrieve code that is helpful in addressing the query.\n",
+        "document": "Retrieved Answer:",
+    },
+    "StackOverflowQA": {
+        "query": "Given a question combining text and code, retrieve relevant answers that also contain both text and code snippets and can address the question.\n",
+        "document": "Retrieved Answer:",
+    },
+    "SyntheticText2SQL": {
+        "query": "Given a natural language question, retrieve SQL queries that serve as appropriate responses.\n",
+        "document": "Retrieved Answer:",
+    },
+}
+c2llm_languages = [
+    "eng-Latn",
+    "zho-Hans",
+    "python-Code",
+    "javascript-Code",
+    "go-Code",
+    "ruby-Code",
+    "java-Code",
+    "php-Code",
+]
+c2llm_loader_kwargs = dict(
+    trust_remote_code=True,
+    prompts_dict=c2llm_prompts_dict,
+    apply_instruction_to_passages=True,
+    max_seq_length=2048,
+    padding_side="left",
+)
 def instruction_template(
     instruction: str, prompt_type: PromptType | None = None
 ) -> str:
@@ -130,6 +230,7 @@ F2LLM_0B6 = ModelMeta(
         max_seq_length=8192,
     ),
     name="codefuse-ai/F2LLM-0.6B",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=True,
     revision="36416618b83d4bd84a8ca30c2ee01ed518f9f2e7",
@@ -146,6 +247,7 @@ F2LLM_0B6 = ModelMeta(
     public_training_code="https://github.com/codefuse-ai/F2LLM",
     public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
     training_datasets=training_datasets,
+    citation=F2LLM_CITATION,
 )
 F2LLM_1B7 = ModelMeta(
@@ -158,6 +260,7 @@ F2LLM_1B7 = ModelMeta(
         max_seq_length=8192,
     ),
     name="codefuse-ai/F2LLM-1.7B",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=True,
     revision="fdce0e09655f42cea26f7f66f5a70cd4507ea45c",
@@ -174,6 +277,7 @@ F2LLM_1B7 = ModelMeta(
     public_training_code="https://github.com/codefuse-ai/F2LLM",
     public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
     training_datasets=training_datasets,
+    citation=F2LLM_CITATION,
 )
 F2LLM_4B = ModelMeta(
@@ -186,6 +290,7 @@ F2LLM_4B = ModelMeta(
         max_seq_length=8192,
     ),
     name="codefuse-ai/F2LLM-4B",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=True,
     revision="9fe95901ed2b6b59dd7673d6e93c9d76766a1e25",
@@ -202,4 +307,61 @@ F2LLM_4B = ModelMeta(
     public_training_code="https://github.com/codefuse-ai/F2LLM",
     public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
     training_datasets=training_datasets,
+    citation=F2LLM_CITATION,
+)
+C2LLM_0B5 = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=c2llm_loader_kwargs,
+    name="codefuse-ai/C2LLM-0.5B",
+    revision="f08c18be03de42c6e388948a1804d4b271a953a2",
+    release_date="2025-12-22",
+    languages=c2llm_languages,
+    n_parameters=497252096,
+    memory_usage_mb=948.0,
+    max_tokens=32768,
+    embed_dim=896,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/codefuse-ai/C2LLM-0.5B",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=True,
+    training_datasets=c2llm_training_datasets,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    is_cross_encoder=None,
+    citation=None,
+    contacts=None,
+)
+C2LLM_7B = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=c2llm_loader_kwargs,
+    name="codefuse-ai/C2LLM-7B",
+    revision="c1dc16d6d64eb962c783bfb36a6d9c2f24a86dca",
+    release_date="2025-12-22",
+    languages=c2llm_languages,
+    n_parameters=7667028992,
+    memory_usage_mb=14624.0,
+    max_tokens=32768,
+    embed_dim=3584,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/codefuse-ai/C2LLM-7B",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=True,
+    training_datasets=c2llm_training_datasets,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    is_cross_encoder=None,
+    citation=None,
+    contacts=None,
 )

mteb/models/model_implementations/codesage_models.py CHANGED Viewed

@@ -1,6 +1,15 @@
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+CODESAGE_CITATION = """@inproceedings{
+    zhang2024code,
+    title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
+    author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
+    booktitle={The Twelfth International Conference on Learning Representations},
+    year={2024},
+    url={https://openreview.net/forum?id=vfzRRjumpX}
+}"""
 codesage_languages = [
     "python-Code",
     "javascript-Code",
@@ -13,6 +22,7 @@ codesage_languages = [
 codesage_large = ModelMeta(
     loader=sentence_transformers_loader,
     name="codesage/codesage-large-v2",
+    model_type=["dense"],
     languages=codesage_languages,
     revision="6e5d6dc15db3e310c37c6dbac072409f95ffa5c5",
     release_date="2024-02-03",
@@ -33,11 +43,13 @@ codesage_large = ModelMeta(
         "CodeSearchNetRetrieval",
         "CodeSearchNetCCRetrieval",
     },
+    citation=CODESAGE_CITATION,
 )
 codesage_base = ModelMeta(
     loader=sentence_transformers_loader,
     name="codesage/codesage-base-v2",
+    model_type=["dense"],
     languages=codesage_languages,
     revision="92eac4f44c8674638f039f1b0d8280f2539cb4c7",
     release_date="2024-02-03",
@@ -58,11 +70,13 @@ codesage_base = ModelMeta(
         "CodeSearchNetRetrieval",
         "CodeSearchNetCCRetrieval",
     },
+    citation=CODESAGE_CITATION,
 )
 codesage_small = ModelMeta(
     loader=sentence_transformers_loader,
     name="codesage/codesage-small-v2",
+    model_type=["dense"],
     languages=codesage_languages,
     revision="4844c2f24b25e181aa43ca058cc73dd2622565c1",
     release_date="2024-02-03",
@@ -83,4 +97,5 @@ codesage_small = ModelMeta(
         "CodeSearchNetRetrieval",
         "CodeSearchNetCCRetrieval",
     },
+    citation=CODESAGE_CITATION,
 )

mteb/models/model_implementations/cohere_models.py CHANGED Viewed

@@ -8,6 +8,7 @@ import torch
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
+from mteb._requires_package import requires_package
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
@@ -219,9 +220,11 @@ class CohereTextEmbeddingModel(AbsEncoder):
         output_dimension: int | None = None,
         **kwargs,
     ) -> None:
+        requires_package(self, "cohere", model_name, "pip install 'mteb[cohere]'")
         import cohere  # type: ignore
-        self.model_name = model_name.lstrip("Cohere/Cohere-")
+        self.model_name = model_name.removeprefix("Cohere/Cohere-")
         self.sep = sep
         self.model_prompts = self.validate_task_to_prompt_name(model_prompts)
         if embedding_type not in get_args(EmbeddingType):
@@ -377,6 +380,7 @@ cohere_mult_3 = ModelMeta(
         model_prompts=model_prompts,
     ),
     name="Cohere/Cohere-embed-multilingual-v3.0",
+    model_type=["dense"],
     languages=supported_languages,
     open_weights=False,
     revision="1",
@@ -401,6 +405,7 @@ cohere_eng_3 = ModelMeta(
         model_prompts=model_prompts,
     ),
     name="Cohere/Cohere-embed-english-v3.0",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=False,
     reference="https://cohere.com/blog/introducing-embed-v3",
@@ -425,6 +430,7 @@ cohere_mult_light_3 = ModelMeta(
         model_prompts=model_prompts,
     ),
     name="Cohere/Cohere-embed-multilingual-light-v3.0",
+    model_type=["dense"],
     languages=supported_languages,
     open_weights=False,
     revision="1",
@@ -449,6 +455,7 @@ cohere_eng_light_3 = ModelMeta(
         model_prompts=model_prompts,
     ),
     name="Cohere/Cohere-embed-english-light-v3.0",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=False,
     reference="https://cohere.com/blog/introducing-embed-v3",

mteb/models/model_implementations/cohere_v.py CHANGED Viewed

@@ -381,6 +381,7 @@ cohere_mult_3 = ModelMeta(
     loader=cohere_v_loader,  # type: ignore
     loader_kwargs={"model_name": "embed-multilingual-v3.0"},
     name="cohere/embed-multilingual-v3.0",
+    model_type=["dense"],
     languages=[],  # Unknown, but support >100 languages
     revision="1",
     release_date="2024-10-24",
@@ -404,6 +405,7 @@ cohere_eng_3 = ModelMeta(
     loader=cohere_v_loader,  # type: ignore
     loader_kwargs={"model_name": "embed-english-v3.0"},
     name="cohere/embed-english-v3.0",
+    model_type=["dense"],
     languages=["eng-Latn"],
     revision="1",
     release_date="2024-10-24",
@@ -426,6 +428,7 @@ cohere_eng_3 = ModelMeta(
 cohere_embed_v4_multimodal = ModelMeta(
     loader=cohere_v_loader,
     loader_kwargs=dict(model_name="embed-v4.0"),
+    model_type=["dense"],
     name="Cohere/Cohere-embed-v4.0",
     languages=all_languages,
     revision="1",
@@ -450,6 +453,7 @@ cohere_embed_v4_multimodal_binary = ModelMeta(
     loader=cohere_v_loader,
     loader_kwargs=dict(embedding_type="binary"),
     name="Cohere/Cohere-embed-v4.0 (output_dtype=binary)",
+    model_type=["dense"],
     languages=all_languages,
     revision="1",
     release_date="2024-12-01",
@@ -474,6 +478,7 @@ cohere_embed_v4_multimodal_int8 = ModelMeta(
     loader=cohere_v_loader,
     loader_kwargs=dict(embedding_type="int8"),
     name="Cohere/Cohere-embed-v4.0 (output_dtype=int8)",
+    model_type=["dense"],
     languages=all_languages,
     revision="1",
     release_date="2024-12-01",

mteb/models/model_implementations/colpali_models.py CHANGED Viewed

@@ -1,8 +1,9 @@
+from __future__ import annotations
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 import torch
-from PIL import Image
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
@@ -15,6 +16,9 @@ from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
+if TYPE_CHECKING:
+    from PIL import Image
 logger = logging.getLogger(__name__)
@@ -89,6 +93,7 @@ class ColPaliEngineWrapper(AbsEncoder):
         **kwargs,
     ):
         import torchvision.transforms.functional as F
+        from PIL import Image
         all_embeds = []
@@ -196,10 +201,10 @@ COLPALI_CITATION = """
 COLPALI_TRAINING_DATA = {
     # from https://huggingface.co/datasets/vidore/colpali_train_set
-    "DocVQA",
-    "InfoVQA",
-    "TATDQA",
-    "arXivQA",
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
 }
 colpali_v1_1 = ModelMeta(
@@ -208,6 +213,7 @@ colpali_v1_1 = ModelMeta(
         torch_dtype=torch.float16,
     ),
     name="vidore/colpali-v1.1",
+    model_type=["late-interaction"],
     languages=["eng-Latn"],
     revision="a0f15e3bcf97110e7ac1bb4be4bcd30eeb31992a",
     release_date="2024-08-21",
@@ -234,6 +240,7 @@ colpali_v1_2 = ModelMeta(
         torch_dtype=torch.float16,
     ),
     name="vidore/colpali-v1.2",
+    model_type=["late-interaction"],
     languages=["eng-Latn"],
     revision="6b89bc63c16809af4d111bfe412e2ac6bc3c9451",
     release_date="2024-08-26",
@@ -260,6 +267,7 @@ colpali_v1_3 = ModelMeta(
         torch_dtype=torch.float16,
     ),
     name="vidore/colpali-v1.3",
+    model_type=["late-interaction"],
     languages=["eng-Latn"],
     revision="1b5c8929330df1a66de441a9b5409a878f0de5b0",
     release_date="2024-11-01",

mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl