mteb 2.6.6__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/mteb/models/model_implementations/jina_clip.py
+++ b/mteb/models/model_implementations/jina_clip.py
@@ -7,6 +7,7 @@ from tqdm.auto import tqdm
 from mteb._requires_package import requires_image_dependencies
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_implementations.colpali_models import COLPALI_TRAINING_DATA
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
 
@@ -120,6 +121,15 @@ class JinaCLIPModel(AbsEncoder):
         raise ValueError
 
 
+_JINA_CLIP_TRAIN_DATASETS_V1 = {
+    # LAION400M
+    # ShareGPT4V
+    "MSMARCO",
+    "NQ",
+    "HotpotQA",
+    # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
+}
+
 jina_clip_v1 = ModelMeta(
     loader=JinaCLIPModel,
     name="jinaai/jina-clip-v1",
@@ -140,13 +150,41 @@ jina_clip_v1 = ModelMeta(
     reference="https://huggingface.co/jinaai/jina-clip-v1",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=True,
-    training_datasets={
-        # LAION400M
-        # ShareGPT4V
-        "MSMARCO",
-        # NQ
-        # HotpotQA
-        # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
-    },
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1,
     citation=JINA_CLIP_CITATION,
+    superseded_by="jinaai/jina-clip-v2",
+)
+
+jina_clip_v2 = ModelMeta(
+    loader=JinaCLIPModel,
+    name="jinaai/jina-clip-v2",
+    revision="344d954da76eb8ad47a7aaff42d012e30c15b8fe",
+    release_date="2024-10-09",
+    languages=["eng-Latn"],
+    n_parameters=865278477,
+    memory_usage_mb=1650.0,
+    max_tokens=8192,
+    embed_dim=1024,
+    license="cc-by-nc-4.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/jinaai/jina-clip-v2",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=False,
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA,
+    modalities=["text", "image"],
+    model_type=["dense"],
+    citation="""
+@misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
+    title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
+    author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
+    year={2024},
+    eprint={2412.08802},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2412.08802},
+}
+""",
 )
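
The net effect of this hunk is that `jinaai/jina-clip-v2` is now registered, with its reported training data being the shared v1 set merged with `COLPALI_TRAINING_DATA`. A minimal sketch of inspecting the new entry through mteb's public lookup helper (assuming `mteb.get_model_meta` resolves the name exactly as registered above):

    import mteb

    # Look up the registered metadata without downloading any weights.
    meta = mteb.get_model_meta("jinaai/jina-clip-v2")
    print(meta.revision)    # "344d954da76eb8ad47a7aaff42d012e30c15b8fe"
    print(meta.modalities)  # ["text", "image"]
    # Training data = _JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA
    print("MSMARCO" in meta.training_datasets)  # True
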
--- a/mteb/models/model_implementations/nvidia_models.py
+++ b/mteb/models/model_implementations/nvidia_models.py
@@ -1,4 +1,5 @@
 import logging
+from collections.abc import Callable
 from typing import Any
 
 import torch
@@ -29,7 +30,7 @@ NV_RETRIEVER_CITATION = """@misc{moreira2025nvretrieverimprovingtextembedding,
 }"""
 
 
-def instruction_template(
+def _instruction_template(
     instruction: str, prompt_type: PromptType | None = None
 ) -> str:
     return f"Instruct: {instruction}\nQuery: " if instruction else ""
@@ -100,10 +101,77 @@ nvidia_training_datasets = {
     "MrTidyRetrieval",
 }
 
+
+class _NVEmbedWrapper(InstructSentenceTransformerModel):
+    """Subclassed because NV-Embed requires `sbert==2`, which does not support tokenizer kwargs."""
+
+    def __init__(
+        self,
+        model_name: str,
+        revision: str,
+        instruction_template: str
+        | Callable[[str, PromptType | None], str]
+        | None = None,
+        max_seq_length: int | None = None,
+        apply_instruction_to_passages: bool = True,
+        padding_side: str | None = None,
+        add_eos_token: bool = False,
+        prompts_dict: dict[str, str] | None = None,
+        **kwargs: Any,
+    ):
+        from sentence_transformers import __version__ as sbert_version
+
+        required_transformers_version = "4.42.4"
+        required_sbert_version = "2.7.0"
+
+        if Version(transformers_version) != Version(required_transformers_version):
+            raise RuntimeError(
+                f"transformers version {transformers_version} does not match the required "
+                f"version; install {required_transformers_version} to run `nvidia/NV-Embed-v2`"
+            )
+
+        if Version(sbert_version) != Version(required_sbert_version):
+            raise RuntimeError(
+                f"sbert version {sbert_version} does not match the required "
+                f"version; install {required_sbert_version} to run `nvidia/NV-Embed-v2`"
+            )
+
+        requires_package(
+            self, "flash_attn", model_name, "pip install 'mteb[flash_attention]'"
+        )
+
+        from sentence_transformers import SentenceTransformer
+
+        if (
+            isinstance(instruction_template, str)
+            and "{instruction}" not in instruction_template
+        ):
+            raise ValueError(
+                "Instruction template must contain the string '{instruction}'."
+            )
+        if instruction_template is None:
+            logger.warning(
+                "No instruction template provided. Instructions will be used as-is."
+            )
+
+        self.instruction_template = instruction_template
+
+        self.model_name = model_name
+        self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
+        self.model.tokenizer.padding_side = padding_side
+        self.model.tokenizer.add_eos_token = add_eos_token
+
+        if max_seq_length:
+            # https://github.com/huggingface/sentence-transformers/issues/3575
+            self.model.max_seq_length = max_seq_length
+        self.apply_instruction_to_passages = apply_instruction_to_passages
+        self.prompts_dict = prompts_dict
+
+
 NV_embed_v2 = ModelMeta(
-    loader=InstructSentenceTransformerModel,
+    loader=_NVEmbedWrapper,
     loader_kwargs=dict(
-        instruction_template=instruction_template,
+        instruction_template=_instruction_template,
         trust_remote_code=True,
         max_seq_length=32768,
         padding_side="right",
@@ -132,9 +200,9 @@ NV_embed_v2 = ModelMeta(
 )
 
 NV_embed_v1 = ModelMeta(
-    loader=InstructSentenceTransformerModel,
+    loader=_NVEmbedWrapper,
     loader_kwargs=dict(
-        instruction_template=instruction_template,
+        instruction_template=_instruction_template,
         trust_remote_code=True,
         max_seq_length=32768,
         padding_side="right",
--- a/mteb/models/model_implementations/sentence_transformers_models.py
+++ b/mteb/models/model_implementations/sentence_transformers_models.py
@@ -1,5 +1,7 @@
 """Implementation of Sentence Transformers model validated in MTEB."""
 
+import numpy as np
+
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import (
     SentenceTransformerEncoderWrapper,
@@ -773,3 +775,67 @@ gtr_t5_base = ModelMeta(
     },
     citation=GTR_CITATION,
 )
+
+static_retrieval_mrl_en_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="sentence-transformers/static-retrieval-mrl-en-v1",
+    revision="f60985c706f192d45d218078e49e5a8b6f15283a",
+    release_date="2024-10-24",
+    languages=["eng-Latn"],
+    n_parameters=31_254_528,
+    memory_usage_mb=119,
+    max_tokens=np.inf,
+    embed_dim=1024,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1/blob/main/train.py",
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=False,
+    training_datasets={
+        "MSMARCO",
+        # gooaq
+        # s2orc
+        # allnli
+        # paq
+        # trivia-qa
+        # swim-ir-monolingual
+        # PubMedQA
+        # swim
+        "MIRACLRetrieval",
+        "MultiLongDocRetrieval",
+        "MrTidyRetrieval",
+    },
+    modalities=["text"],
+    model_type=["dense"],
+)
+
+multi_qa_mpnet_base_dot_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+    revision="3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f",
+    release_date="2021-08-23",
+    languages=["eng-Latn"],
+    n_parameters=109486978,
+    memory_usage_mb=418.0,
+    max_tokens=512,
+    embed_dim=768,
+    license=None,
+    open_weights=True,
+    public_training_code="https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/train_script.py",
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1",
+    similarity_fn_name=ScoringFunction.DOT_PRODUCT,
+    use_instructions=False,
+    training_datasets={
+        "MSMARCO",
+        "YahooAnswersTopicsClassification",
+        "NQ",
+    },
+    adapted_from="microsoft/mpnet-base",
+    modalities=["text"],
+    model_type=["dense"],
+)
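
`static-retrieval-mrl-en-v1` is a static embedding model, so its metadata uses `max_tokens=np.inf` (hence the new numpy import); downstream code that expects an integer limit has to handle the non-finite case. A small sketch of one way a consumer might do that (assuming `mteb.get_model_meta` resolves the name as registered above; the `max_length` variable is illustrative):

    import math

    import mteb

    meta = mteb.get_model_meta("sentence-transformers/static-retrieval-mrl-en-v1")
    # np.inf means "no positional limit"; map it to None before handing the value
    # to a tokenizer that expects an int max_length.
    max_length = None if math.isinf(meta.max_tokens) else int(meta.max_tokens)
    print(max_length)  # None
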
--- a/mteb-2.6.6.dist-info/METADATA
+++ b/mteb-2.6.7.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.6.6
+Version: 2.6.7
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
--- a/mteb-2.6.6.dist-info/RECORD
+++ b/mteb-2.6.7.dist-info/RECORD
@@ -1514,7 +1514,7 @@ mteb/models/model_implementations/human.py,sha256=EtYa8G7Dc8fDcelBVw0xTpxGGx1YKK
 mteb/models/model_implementations/ibm_granite_models.py,sha256=ljHjuPuBkIwJvp5WZ3csjTOIb14nLh1h3OYkW-CEeHY,8464
 mteb/models/model_implementations/inf_models.py,sha256=SXXs3s9PWo08fzrxG_WOXGc_gvbpmkt-Blt7YoGcPRo,3020
 mteb/models/model_implementations/jasper_models.py,sha256=buJgllGIeyi7LsxDJY3UYJs_YzdDBkU3QpuQyU6VoTc,16293
-mteb/models/model_implementations/jina_clip.py,sha256=QZUe7fm0otnnPHAIYnxcRwE1VHpNt3Xs-FGlUV6Itwc,5167
+mteb/models/model_implementations/jina_clip.py,sha256=0XhRSWTPR3ERAsOoVOxhB1yV6v1pEY8EQcTy1ChtSoU,6595
 mteb/models/model_implementations/jina_models.py,sha256=kFmkAWUFoJpq_1tRQIspk54lsik2vIoQcy5DS7YKgQ0,35198
 mteb/models/model_implementations/kalm_models.py,sha256=SHqkw5p7HzmQrb_bIFjRp1rsuv2v531nXIk390h_ojY,62115
 mteb/models/model_implementations/kblab.py,sha256=EisTJXijICN2pyfWT_89qUnNO7TH95t1LxCxjzJnzQo,1237
@@ -1541,7 +1541,7 @@ mteb/models/model_implementations/no_instruct_sentence_models.py,sha256=qLiMok_O
 mteb/models/model_implementations/nomic_models.py,sha256=dmQC_cWg6hAmiBHK7fXoXEiGBJnJvrq0RsnCcJ2qe1Q,15137
 mteb/models/model_implementations/nomic_models_vision.py,sha256=usCKfZCR7aEi_DnNmVAYjH-lXx_ipQkBVtUAmhJ90QI,6870
 mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py,sha256=6dTGtK1GiaYdpJ4IQFgCCOkGyHQyuEUatKs-Uv-1YmE,6450
-mteb/models/model_implementations/nvidia_models.py,sha256=_lLfFl4-uSKpZdj_SDpdKiI2Gb5C1GgPqWSS-QdlYMM,21768
+mteb/models/model_implementations/nvidia_models.py,sha256=JMy0x7EWGrAxZ9s63F2vSPdPS-9yF3RIS4uj3N2UrVI,24315
 mteb/models/model_implementations/octen_models.py,sha256=FwQAcB_z6bFohpFlNQK2ugLBEOQUu533auOhrNqMxaM,7511
 mteb/models/model_implementations/openai_models.py,sha256=905BajYi_XyOZgqU3AeKpwIttLoUitaAyc48sTWI6Jg,9482
 mteb/models/model_implementations/openclip_models.py,sha256=MyosgeYSrgBXGuGFtI2Tyxksxpb7bADFJVSYFCLweVA,11622
@@ -1572,7 +1572,7 @@ mteb/models/model_implementations/searchmap_models.py,sha256=xVQPkO7aLp_kBFiMDAm
 mteb/models/model_implementations/seed_1_6_embedding_models.py,sha256=gcGKEY-n7DWGPlXYhO_kcNJ3lkBEnbw8NUxADNs3siM,18635
 mteb/models/model_implementations/seed_1_6_embedding_models_1215.py,sha256=OoTHcDRQGOuSzf08V62EXrSEdRsXhnMv2ZN9feJWs9s,36443
 mteb/models/model_implementations/seed_models.py,sha256=9UF2AQ0Uue8DD73SjYhHn2hLxey_7Iq9ii9TkRaA3CM,14168
-mteb/models/model_implementations/sentence_transformers_models.py,sha256=WFWB7SPY9WS9b-SWiSAWSszQ7lJO-QGBxnIN8bU3kWE,23969
+mteb/models/model_implementations/sentence_transformers_models.py,sha256=6oULaf2mTyVe7vy9oS_QoKuxXXPaAqjQgSooMTG0xow,26071
 mteb/models/model_implementations/shuu_model.py,sha256=1jDFFPAfbfrSzC4vbHczO4yqy3Xh4tWiDAd3FS9-T6M,1177
 mteb/models/model_implementations/siglip_models.py,sha256=SOSyp-B7w6Vvqas_10D_1rvpJcKSQuJmXGy7Wdtsw7o,13012
 mteb/models/model_implementations/slm_models.py,sha256=JXjBio-9NFHLefU4Ny1Z-fFkyvvIz0U2kQ6t5s-PzlQ,13427
@@ -2612,9 +2612,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=UKNokV9pu3G74MGebocU512aU_fFU9I9nPKnrG9Q0iE,1035
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=GwkBPmAr18Onu-vHtzHs0PFrhCozdOMiT13HwnWL4ZM,3961
-mteb-2.6.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mteb-2.6.6.dist-info/METADATA,sha256=s0uH9FABmjhyRn2bwsWVFFxjRtJWEYbQaqEuavtj_mY,14281
-mteb-2.6.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mteb-2.6.6.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
-mteb-2.6.6.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
-mteb-2.6.6.dist-info/RECORD,,
+mteb-2.6.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.6.7.dist-info/METADATA,sha256=p99o5hSYjMeWfoMLwNljk7_mDzsRjVXBbwPzsobuyWA,14281
+mteb-2.6.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mteb-2.6.7.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.6.7.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.6.7.dist-info/RECORD,,