mteb 2.6.6__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/mteb/models/model_implementations/jina_clip.py
+++ b/mteb/models/model_implementations/jina_clip.py
@@ -7,6 +7,7 @@ from tqdm.auto import tqdm
 from mteb._requires_package import requires_image_dependencies
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_implementations.colpali_models import COLPALI_TRAINING_DATA
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
 
@@ -120,6 +121,15 @@ class JinaCLIPModel(AbsEncoder):
         raise ValueError
 
 
+_JINA_CLIP_TRAIN_DATASETS_V1 = {
+    # LAION400M
+    # ShareGPT4V
+    "MSMARCO",
+    "NQ",
+    "HotpotQA",
+    # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
+}
+
 jina_clip_v1 = ModelMeta(
     loader=JinaCLIPModel,
     name="jinaai/jina-clip-v1",
@@ -140,13 +150,41 @@ jina_clip_v1 = ModelMeta(
     reference="https://huggingface.co/jinaai/jina-clip-v1",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=True,
-    training_datasets={
-        # LAION400M
-        # ShareGPT4V
-        "MSMARCO",
-        # NQ
-        # HotpotQA
-        # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
-    },
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1,
     citation=JINA_CLIP_CITATION,
+    superseded_by="jinaai/jina-clip-v2",
+)
+
+jina_clip_v2 = ModelMeta(
+    loader=JinaCLIPModel,
+    name="jinaai/jina-clip-v2",
+    revision="344d954da76eb8ad47a7aaff42d012e30c15b8fe",
+    release_date="2024-10-09",
+    languages=["eng-Latn"],
+    n_parameters=865278477,
+    memory_usage_mb=1650.0,
+    max_tokens=8192,
+    embed_dim=1024,
+    license="cc-by-nc-4.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/jinaai/jina-clip-v2",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=False,
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA,
+    modalities=["text", "image"],
+    model_type=["dense"],
+    citation="""
+@misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
+    title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
+    author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
+    year={2024},
+    eprint={2412.08802},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2412.08802},
+}
+""",
 )
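
The net effect of this hunk is that `jinaai/jina-clip-v2` is now registered, with its reported training data being the shared v1 set merged with `COLPALI_TRAINING_DATA`. A minimal sketch of inspecting the new entry through mteb's public lookup helper (assuming `mteb.get_model_meta` resolves the name exactly as registered above):

    import mteb

    # Look up the registered metadata without downloading any weights.
    meta = mteb.get_model_meta("jinaai/jina-clip-v2")
    print(meta.revision)    # "344d954da76eb8ad47a7aaff42d012e30c15b8fe"
    print(meta.modalities)  # ["text", "image"]
    # Training data = _JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA
    print("MSMARCO" in meta.training_datasets)  # True
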
--- a/mteb/models/model_implementations/nvidia_models.py
+++ b/mteb/models/model_implementations/nvidia_models.py
@@ -1,4 +1,5 @@
 import logging
+from collections.abc import Callable
 from typing import Any
 
 import torch
@@ -29,7 +30,7 @@ NV_RETRIEVER_CITATION = """@misc{moreira2025nvretrieverimprovingtextembedding,
 }"""
 
 
-def instruction_template(
+def _instruction_template(
     instruction: str, prompt_type: PromptType | None = None
 ) -> str:
     return f"Instruct: {instruction}\nQuery: " if instruction else ""
@@ -100,10 +101,77 @@ nvidia_training_datasets = {
     "MrTidyRetrieval",
 }
 
+
+class _NVEmbedWrapper(InstructSentenceTransformerModel):
+    """Subclassed because NV-Embed requires `sbert==2`, which does not support tokenizer kwargs."""
+
+    def __init__(
+        self,
+        model_name: str,
+        revision: str,
+        instruction_template: str
+        | Callable[[str, PromptType | None], str]
+        | None = None,
+        max_seq_length: int | None = None,
+        apply_instruction_to_passages: bool = True,
+        padding_side: str | None = None,
+        add_eos_token: bool = False,
+        prompts_dict: dict[str, str] | None = None,
+        **kwargs: Any,
+    ):
+        from sentence_transformers import __version__ as sbert_version
+
+        required_transformers_version = "4.42.4"
+        required_sbert_version = "2.7.0"
+
+        if Version(transformers_version) != Version(required_transformers_version):
+            raise RuntimeError(
+                f"transformers version {transformers_version} does not match the required "
+                f"version; install {required_transformers_version} to run `nvidia/NV-Embed-v2`"
+            )
+
+        if Version(sbert_version) != Version(required_sbert_version):
+            raise RuntimeError(
+                f"sbert version {sbert_version} does not match the required "
+                f"version; install {required_sbert_version} to run `nvidia/NV-Embed-v2`"
+            )
+
+        requires_package(
+            self, "flash_attn", model_name, "pip install 'mteb[flash_attention]'"
+        )
+
+        from sentence_transformers import SentenceTransformer
+
+        if (
+            isinstance(instruction_template, str)
+            and "{instruction}" not in instruction_template
+        ):
+            raise ValueError(
+                "Instruction template must contain the string '{instruction}'."
+            )
+        if instruction_template is None:
+            logger.warning(
+                "No instruction template provided. Instructions will be used as-is."
+            )
+
+        self.instruction_template = instruction_template
+
+        self.model_name = model_name
+        self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
+        self.model.tokenizer.padding_side = padding_side
+        self.model.tokenizer.add_eos_token = add_eos_token
+
+        if max_seq_length:
+            # https://github.com/huggingface/sentence-transformers/issues/3575
+            self.model.max_seq_length = max_seq_length
+        self.apply_instruction_to_passages = apply_instruction_to_passages
+        self.prompts_dict = prompts_dict
+
+
 NV_embed_v2 = ModelMeta(
-    loader=InstructSentenceTransformerModel,
+    loader=_NVEmbedWrapper,
     loader_kwargs=dict(
-        instruction_template=instruction_template,
+        instruction_template=_instruction_template,
         trust_remote_code=True,
         max_seq_length=32768,
         padding_side="right",
@@ -132,9 +200,9 @@ NV_embed_v2 = ModelMeta(
 )
 
 NV_embed_v1 = ModelMeta(
-    loader=InstructSentenceTransformerModel,
+    loader=_NVEmbedWrapper,
     loader_kwargs=dict(
-        instruction_template=instruction_template,
+        instruction_template=_instruction_template,
         trust_remote_code=True,
         max_seq_length=32768,
         padding_side="right",
--- a/mteb/models/model_implementations/sentence_transformers_models.py
+++ b/mteb/models/model_implementations/sentence_transformers_models.py
@@ -1,5 +1,7 @@
 """Implementation of Sentence Transformers model validated in MTEB."""
 
+import numpy as np
+
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import (
     SentenceTransformerEncoderWrapper,
@@ -773,3 +775,67 @@ gtr_t5_base = ModelMeta(
     },
     citation=GTR_CITATION,
 )
+
+static_retrieval_mrl_en_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="sentence-transformers/static-retrieval-mrl-en-v1",
+    revision="f60985c706f192d45d218078e49e5a8b6f15283a",
+    release_date="2024-10-24",
+    languages=["eng-Latn"],
+    n_parameters=31_254_528,
+    memory_usage_mb=119,
+    max_tokens=np.inf,
+    embed_dim=1024,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1/blob/main/train.py",
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=False,
+    training_datasets={
+        "MSMARCO",
+        # gooaq
+        # s2orc
+        # allnli
+        # paq
+        # trivia-qa
+        # swim-ir-monolingual
+        # PubMedQA
+        # swim
+        "MIRACLRetrieval",
+        "MultiLongDocRetrieval",
+        "MrTidyRetrieval",
+    },
+    modalities=["text"],
+    model_type=["dense"],
+)
+
+multi_qa_mpnet_base_dot_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+    revision="3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f",
+    release_date="2021-08-23",
+    languages=["eng-Latn"],
+    n_parameters=109486978,
+    memory_usage_mb=418.0,
+    max_tokens=512,
+    embed_dim=768,
+    license=None,
+    open_weights=True,
+    public_training_code="https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/train_script.py",
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1",
+    similarity_fn_name=ScoringFunction.DOT_PRODUCT,
+    use_instructions=False,
+    training_datasets={
+        "MSMARCO",
+        "YahooAnswersTopicsClassification",
+        "NQ",
+    },
+    adapted_from="microsoft/mpnet-base",
+    modalities=["text"],
+    model_type=["dense"],
+)
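
`static-retrieval-mrl-en-v1` is a static embedding model, so its metadata uses `max_tokens=np.inf` (hence the new numpy import); downstream code that expects an integer limit has to handle the non-finite case. A small sketch of one way a consumer might do that (assuming `mteb.get_model_meta` resolves the name as registered above; the `max_length` variable is illustrative):

    import math

    import mteb

    meta = mteb.get_model_meta("sentence-transformers/static-retrieval-mrl-en-v1")
    # np.inf means "no positional limit"; map it to None before handing the value
    # to a tokenizer that expects an int max_length.
    max_length = None if math.isinf(meta.max_tokens) else int(meta.max_tokens)
    print(max_length)  # None
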
--- a/mteb-2.6.6.dist-info/METADATA
+++ b/mteb-2.6.7.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.6.6
+Version: 2.6.7
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
--- a/mteb-2.6.6.dist-info/RECORD
+++ b/mteb-2.6.7.dist-info/RECORD
@@ -1514,7 +1514,7 @@ mteb/models/model_implementations/human.py,sha256=EtYa8G7Dc8fDcelBVw0xTpxGGx1YKK
 mteb/models/model_implementations/ibm_granite_models.py,sha256=ljHjuPuBkIwJvp5WZ3csjTOIb14nLh1h3OYkW-CEeHY,8464
 mteb/models/model_implementations/inf_models.py,sha256=SXXs3s9PWo08fzrxG_WOXGc_gvbpmkt-Blt7YoGcPRo,3020
 mteb/models/model_implementations/jasper_models.py,sha256=buJgllGIeyi7LsxDJY3UYJs_YzdDBkU3QpuQyU6VoTc,16293
-mteb/models/model_implementations/jina_clip.py,sha256=QZUe7fm0otnnPHAIYnxcRwE1VHpNt3Xs-FGlUV6Itwc,5167
+mteb/models/model_implementations/jina_clip.py,sha256=0XhRSWTPR3ERAsOoVOxhB1yV6v1pEY8EQcTy1ChtSoU,6595
 mteb/models/model_implementations/jina_models.py,sha256=kFmkAWUFoJpq_1tRQIspk54lsik2vIoQcy5DS7YKgQ0,35198
 mteb/models/model_implementations/kalm_models.py,sha256=SHqkw5p7HzmQrb_bIFjRp1rsuv2v531nXIk390h_ojY,62115
 mteb/models/model_implementations/kblab.py,sha256=EisTJXijICN2pyfWT_89qUnNO7TH95t1LxCxjzJnzQo,1237
@@ -1541,7 +1541,7 @@ mteb/models/model_implementations/no_instruct_sentence_models.py,sha256=qLiMok_O
 mteb/models/model_implementations/nomic_models.py,sha256=dmQC_cWg6hAmiBHK7fXoXEiGBJnJvrq0RsnCcJ2qe1Q,15137
 mteb/models/model_implementations/nomic_models_vision.py,sha256=usCKfZCR7aEi_DnNmVAYjH-lXx_ipQkBVtUAmhJ90QI,6870
 mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py,sha256=6dTGtK1GiaYdpJ4IQFgCCOkGyHQyuEUatKs-Uv-1YmE,6450
-mteb/models/model_implementations/nvidia_models.py,sha256=_lLfFl4-uSKpZdj_SDpdKiI2Gb5C1GgPqWSS-QdlYMM,21768
+mteb/models/model_implementations/nvidia_models.py,sha256=JMy0x7EWGrAxZ9s63F2vSPdPS-9yF3RIS4uj3N2UrVI,24315
 mteb/models/model_implementations/octen_models.py,sha256=FwQAcB_z6bFohpFlNQK2ugLBEOQUu533auOhrNqMxaM,7511
 mteb/models/model_implementations/openai_models.py,sha256=905BajYi_XyOZgqU3AeKpwIttLoUitaAyc48sTWI6Jg,9482
 mteb/models/model_implementations/openclip_models.py,sha256=MyosgeYSrgBXGuGFtI2Tyxksxpb7bADFJVSYFCLweVA,11622
@@ -1572,7 +1572,7 @@ mteb/models/model_implementations/searchmap_models.py,sha256=xVQPkO7aLp_kBFiMDAm
 mteb/models/model_implementations/seed_1_6_embedding_models.py,sha256=gcGKEY-n7DWGPlXYhO_kcNJ3lkBEnbw8NUxADNs3siM,18635
 mteb/models/model_implementations/seed_1_6_embedding_models_1215.py,sha256=OoTHcDRQGOuSzf08V62EXrSEdRsXhnMv2ZN9feJWs9s,36443
 mteb/models/model_implementations/seed_models.py,sha256=9UF2AQ0Uue8DD73SjYhHn2hLxey_7Iq9ii9TkRaA3CM,14168
-mteb/models/model_implementations/sentence_transformers_models.py,sha256=WFWB7SPY9WS9b-SWiSAWSszQ7lJO-QGBxnIN8bU3kWE,23969
+mteb/models/model_implementations/sentence_transformers_models.py,sha256=6oULaf2mTyVe7vy9oS_QoKuxXXPaAqjQgSooMTG0xow,26071
 mteb/models/model_implementations/shuu_model.py,sha256=1jDFFPAfbfrSzC4vbHczO4yqy3Xh4tWiDAd3FS9-T6M,1177
 mteb/models/model_implementations/siglip_models.py,sha256=SOSyp-B7w6Vvqas_10D_1rvpJcKSQuJmXGy7Wdtsw7o,13012
 mteb/models/model_implementations/slm_models.py,sha256=JXjBio-9NFHLefU4Ny1Z-fFkyvvIz0U2kQ6t5s-PzlQ,13427
@@ -2612,9 +2612,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=UKNokV9pu3G74MGebocU512aU_fFU9I9nPKnrG9Q0iE,1035
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=GwkBPmAr18Onu-vHtzHs0PFrhCozdOMiT13HwnWL4ZM,3961
-mteb-2.6.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mteb-2.6.6.dist-info/METADATA,sha256=s0uH9FABmjhyRn2bwsWVFFxjRtJWEYbQaqEuavtj_mY,14281
-mteb-2.6.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mteb-2.6.6.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
-mteb-2.6.6.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
-mteb-2.6.6.dist-info/RECORD,,
+mteb-2.6.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.6.7.dist-info/METADATA,sha256=p99o5hSYjMeWfoMLwNljk7_mDzsRjVXBbwPzsobuyWA,14281
+mteb-2.6.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mteb-2.6.7.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.6.7.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.6.7.dist-info/RECORD,,