mteb 2.6.6__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/models/model_implementations/jina_clip.py +46 -8
- mteb/models/model_implementations/nvidia_models.py +73 -5
- mteb/models/model_implementations/sentence_transformers_models.py +66 -0
- {mteb-2.6.6.dist-info → mteb-2.6.7.dist-info}/METADATA +1 -1
- {mteb-2.6.6.dist-info → mteb-2.6.7.dist-info}/RECORD +9 -9
- {mteb-2.6.6.dist-info → mteb-2.6.7.dist-info}/WHEEL +0 -0
- {mteb-2.6.6.dist-info → mteb-2.6.7.dist-info}/entry_points.txt +0 -0
- {mteb-2.6.6.dist-info → mteb-2.6.7.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.6.6.dist-info → mteb-2.6.7.dist-info}/top_level.txt +0 -0
|
@@ -7,6 +7,7 @@ from tqdm.auto import tqdm
|
|
|
7
7
|
from mteb._requires_package import requires_image_dependencies
|
|
8
8
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
9
|
from mteb.models.abs_encoder import AbsEncoder
|
|
10
|
+
from mteb.models.model_implementations.colpali_models import COLPALI_TRAINING_DATA
|
|
10
11
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
12
|
from mteb.types import Array, BatchedInput, PromptType
|
|
12
13
|
|
|
@@ -120,6 +121,15 @@ class JinaCLIPModel(AbsEncoder):
|
|
|
120
121
|
raise ValueError
|
|
121
122
|
|
|
122
123
|
|
|
124
|
+
_JINA_CLIP_TRAIN_DATASETS_V1 = {
|
|
125
|
+
# LAION400M
|
|
126
|
+
# ShareGPT4V
|
|
127
|
+
"MSMARCO",
|
|
128
|
+
"NQ",
|
|
129
|
+
"HotpotQA",
|
|
130
|
+
# Natural Language Inference (NLI) dataset (Bowman et al., 2015)
|
|
131
|
+
}
|
|
132
|
+
|
|
123
133
|
jina_clip_v1 = ModelMeta(
|
|
124
134
|
loader=JinaCLIPModel,
|
|
125
135
|
name="jinaai/jina-clip-v1",
|
|
@@ -140,13 +150,41 @@ jina_clip_v1 = ModelMeta(
|
|
|
140
150
|
reference="https://huggingface.co/jinaai/jina-clip-v1",
|
|
141
151
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
142
152
|
use_instructions=True,
|
|
143
|
-
training_datasets=
|
|
144
|
-
# LAION400M
|
|
145
|
-
# ShareGPT4V
|
|
146
|
-
"MSMARCO",
|
|
147
|
-
# NQ
|
|
148
|
-
# HotpotQA
|
|
149
|
-
# Natural Language Inference (NLI) dataset (Bowman et al., 2015)
|
|
150
|
-
},
|
|
153
|
+
training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1,
|
|
151
154
|
citation=JINA_CLIP_CITATION,
|
|
155
|
+
superseded_by="jinaai/jina-clip-v2",
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# Registry entry for jinaai/jina-clip-v2, loaded through the same
# JinaCLIPModel loader as jina-clip-v1.
jina_clip_v2 = ModelMeta(
    # --- identity -----------------------------------------------------------
    name="jinaai/jina-clip-v2",
    revision="344d954da76eb8ad47a7aaff42d012e30c15b8fe",
    release_date="2024-10-09",
    reference="https://huggingface.co/jinaai/jina-clip-v2",
    license="cc-by-nc-4.0",
    # --- loading ------------------------------------------------------------
    loader=JinaCLIPModel,
    framework=["PyTorch", "Sentence Transformers"],
    open_weights=True,
    # --- capabilities -------------------------------------------------------
    languages=["eng-Latn"],
    modalities=["text", "image"],
    model_type=["dense"],
    similarity_fn_name=ScoringFunction.COSINE,
    use_instructions=False,
    # --- size ---------------------------------------------------------------
    n_parameters=865278477,
    memory_usage_mb=1650.0,
    max_tokens=8192,
    embed_dim=1024,
    # --- training provenance ------------------------------------------------
    public_training_code=None,
    public_training_data=None,
    # Union of the v1 dataset set and the ColPali training data.
    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA,
    # NOTE(review): whitespace inside the BibTeX entry is reproduced exactly
    # as it appears in this view - confirm against the packaged file.
    citation="""
@misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
year={2024},
eprint={2412.08802},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.08802},
}
""",
)
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from collections.abc import Callable
|
|
2
3
|
from typing import Any
|
|
3
4
|
|
|
4
5
|
import torch
|
|
@@ -29,7 +30,7 @@ NV_RETRIEVER_CITATION = """@misc{moreira2025nvretrieverimprovingtextembedding,
|
|
|
29
30
|
}"""
|
|
30
31
|
|
|
31
32
|
|
|
32
|
-
def
|
|
33
|
+
def _instruction_template(
    instruction: str, prompt_type: PromptType | None = None
) -> str:
    """Format an instruction as the ``Instruct: ... / Query: `` prompt prefix.

    Args:
        instruction: Instruction text to embed in the prefix.
        prompt_type: Accepted for signature compatibility; not used here.

    Returns:
        ``"Instruct: <instruction>"`` followed by a newline and ``"Query: "``,
        or an empty string when ``instruction`` is empty.
    """
    # Guard clause: no instruction means no prefix at all.
    if not instruction:
        return ""
    return f"Instruct: {instruction}\nQuery: "
|
|
@@ -100,10 +101,77 @@ nvidia_training_datasets = {
|
|
|
100
101
|
"MrTidyRetrieval",
|
|
101
102
|
}
|
|
102
103
|
|
|
104
|
+
|
|
105
|
+
class _NVEmbedWrapper(InstructSentenceTransformerModel):
    """Instruct-style loader for the NV-Embed models with pinned dependencies.

    Per the original author's note, NV-Embed requires an old
    sentence-transformers release that has no tokenizer kwargs. This subclass
    therefore performs its own initialisation: it verifies the pinned library
    versions, loads the model, and sets the tokenizer options directly on the
    loaded tokenizer.
    """

    def __init__(
        self,
        model_name: str,
        revision: str,
        instruction_template: str
        | Callable[[str, PromptType | None], str]
        | None = None,
        max_seq_length: int | None = None,
        apply_instruction_to_passages: bool = True,
        padding_side: str | None = None,
        add_eos_token: bool = False,
        prompts_dict: dict[str, str] | None = None,
        **kwargs: Any,
    ):
        """Validate the environment, load the model and configure its tokenizer.

        The parent ``__init__`` is not called; every attribute is set here.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            revision: Model revision passed to ``SentenceTransformer``.
            instruction_template: A format string containing ``{instruction}``,
                a callable building the prompt, or ``None`` (a warning is
                logged and instructions are used as-is).
            max_seq_length: If truthy, assigned to ``model.max_seq_length``.
            apply_instruction_to_passages: Stored on the instance.
            padding_side: Tokenizer padding side; when ``None`` the
                tokenizer's own setting is left untouched.
            add_eos_token: Assigned to ``tokenizer.add_eos_token``.
            prompts_dict: Stored on the instance.
            **kwargs: Forwarded to ``SentenceTransformer``.

        Raises:
            RuntimeError: If the installed transformers or
                sentence-transformers version differs from the pinned one.
            ValueError: If ``instruction_template`` is a string that does not
                contain ``{instruction}``.
        """
        from sentence_transformers import SentenceTransformer
        from sentence_transformers import __version__ as sbert_version

        required_transformers_version = "4.42.4"
        required_sbert_version = "2.7.0"

        # Exact pins: any other version is rejected up front. The message
        # names the model actually being loaded, since this loader serves
        # more than one NV-Embed checkpoint.
        if Version(transformers_version) != Version(required_transformers_version):
            raise RuntimeError(
                f"transformers version {transformers_version} does not match the "
                f"required version; install "
                f"transformers=={required_transformers_version} to run `{model_name}`"
            )

        if Version(sbert_version) != Version(required_sbert_version):
            raise RuntimeError(
                f"sentence-transformers version {sbert_version} does not match the "
                f"required version; install "
                f"sentence-transformers=={required_sbert_version} to run `{model_name}`"
            )

        requires_package(
            self, "flash_attn", model_name, "pip install 'mteb[flash_attention]'"
        )

        if (
            isinstance(instruction_template, str)
            and "{instruction}" not in instruction_template
        ):
            raise ValueError(
                "Instruction template must contain the string '{instruction}'."
            )
        if instruction_template is None:
            logger.warning(
                "No instruction template provided. Instructions will be used as-is."
            )

        self.instruction_template = instruction_template

        self.model_name = model_name
        self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
        # Only override the padding side when one was requested; assigning
        # None would replace the tokenizer's valid default with an invalid
        # value.
        if padding_side is not None:
            self.model.tokenizer.padding_side = padding_side
        self.model.tokenizer.add_eos_token = add_eos_token

        if max_seq_length:
            # https://github.com/huggingface/sentence-transformers/issues/3575
            self.model.max_seq_length = max_seq_length
        self.apply_instruction_to_passages = apply_instruction_to_passages
        self.prompts_dict = prompts_dict
|
|
169
|
+
|
|
170
|
+
|
|
103
171
|
NV_embed_v2 = ModelMeta(
|
|
104
|
-
loader=
|
|
172
|
+
loader=_NVEmbedWrapper,
|
|
105
173
|
loader_kwargs=dict(
|
|
106
|
-
instruction_template=
|
|
174
|
+
instruction_template=_instruction_template,
|
|
107
175
|
trust_remote_code=True,
|
|
108
176
|
max_seq_length=32768,
|
|
109
177
|
padding_side="right",
|
|
@@ -132,9 +200,9 @@ NV_embed_v2 = ModelMeta(
|
|
|
132
200
|
)
|
|
133
201
|
|
|
134
202
|
NV_embed_v1 = ModelMeta(
|
|
135
|
-
loader=
|
|
203
|
+
loader=_NVEmbedWrapper,
|
|
136
204
|
loader_kwargs=dict(
|
|
137
|
-
instruction_template=
|
|
205
|
+
instruction_template=_instruction_template,
|
|
138
206
|
trust_remote_code=True,
|
|
139
207
|
max_seq_length=32768,
|
|
140
208
|
padding_side="right",
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""Implementation of Sentence Transformers model validated in MTEB."""
|
|
2
2
|
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
3
5
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
4
6
|
from mteb.models.sentence_transformer_wrapper import (
|
|
5
7
|
SentenceTransformerEncoderWrapper,
|
|
@@ -773,3 +775,67 @@ gtr_t5_base = ModelMeta(
|
|
|
773
775
|
},
|
|
774
776
|
citation=GTR_CITATION,
|
|
775
777
|
)
|
|
778
|
+
|
|
779
|
+
# Registry entry for sentence-transformers/static-retrieval-mrl-en-v1.
static_retrieval_mrl_en_v1 = ModelMeta(
    loader=sentence_transformers_loader,
    name="sentence-transformers/static-retrieval-mrl-en-v1",
    revision="f60985c706f192d45d218078e49e5a8b6f15283a",
    release_date="2024-10-24",
    languages=["eng-Latn"],
    # Same value as before (31254528); digit separators regrouped in
    # thousands so the magnitude is readable.
    n_parameters=31_254_528,
    memory_usage_mb=119,
    # No finite token limit is declared for this model.
    max_tokens=np.inf,
    embed_dim=1024,
    license="apache-2.0",
    open_weights=True,
    public_training_code="https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1/blob/main/train.py",
    public_training_data=None,
    framework=["PyTorch", "Sentence Transformers"],
    reference="https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1",
    similarity_fn_name=ScoringFunction.COSINE,
    use_instructions=False,
    # Only the quoted names are members of the set; commented names are
    # training sources mentioned for reference.
    training_datasets={
        "MSMARCO",
        # gooaq
        # s2orc
        # allnli
        # paq
        # trivia-qa
        # swim-ir-monolingual
        # PubMedQA
        # swim
        "MIRACLRetrieval",
        "MultiLongDocRetrieval",
        "MrTidyRetrieval",
    },
    modalities=["text"],
    model_type=["dense"],
)
|
|
814
|
+
|
|
815
|
+
# Registry entry for sentence-transformers/multi-qa-mpnet-base-dot-v1.
multi_qa_mpnet_base_dot_v1 = ModelMeta(
    # --- identity -----------------------------------------------------------
    name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    revision="3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f",
    release_date="2021-08-23",
    reference="https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1",
    adapted_from="microsoft/mpnet-base",
    license=None,
    # --- loading ------------------------------------------------------------
    loader=sentence_transformers_loader,
    framework=["PyTorch", "Sentence Transformers"],
    open_weights=True,
    # --- capabilities -------------------------------------------------------
    languages=["eng-Latn"],
    modalities=["text"],
    model_type=["dense"],
    # Scored with dot product rather than cosine similarity.
    similarity_fn_name=ScoringFunction.DOT_PRODUCT,
    use_instructions=False,
    # --- size ---------------------------------------------------------------
    n_parameters=109486978,
    memory_usage_mb=418.0,
    max_tokens=512,
    embed_dim=768,
    # --- training provenance ------------------------------------------------
    public_training_code="https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/train_script.py",
    public_training_data=None,
    training_datasets={
        "MSMARCO",
        "NQ",
        "YahooAnswersTopicsClassification",
    },
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mteb
|
|
3
|
-
Version: 2.6.
|
|
3
|
+
Version: 2.6.7
|
|
4
4
|
Summary: Massive Text Embedding Benchmark
|
|
5
5
|
Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
|
|
6
6
|
Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
|
|
@@ -1514,7 +1514,7 @@ mteb/models/model_implementations/human.py,sha256=EtYa8G7Dc8fDcelBVw0xTpxGGx1YKK
|
|
|
1514
1514
|
mteb/models/model_implementations/ibm_granite_models.py,sha256=ljHjuPuBkIwJvp5WZ3csjTOIb14nLh1h3OYkW-CEeHY,8464
|
|
1515
1515
|
mteb/models/model_implementations/inf_models.py,sha256=SXXs3s9PWo08fzrxG_WOXGc_gvbpmkt-Blt7YoGcPRo,3020
|
|
1516
1516
|
mteb/models/model_implementations/jasper_models.py,sha256=buJgllGIeyi7LsxDJY3UYJs_YzdDBkU3QpuQyU6VoTc,16293
|
|
1517
|
-
mteb/models/model_implementations/jina_clip.py,sha256=
|
|
1517
|
+
mteb/models/model_implementations/jina_clip.py,sha256=0XhRSWTPR3ERAsOoVOxhB1yV6v1pEY8EQcTy1ChtSoU,6595
|
|
1518
1518
|
mteb/models/model_implementations/jina_models.py,sha256=kFmkAWUFoJpq_1tRQIspk54lsik2vIoQcy5DS7YKgQ0,35198
|
|
1519
1519
|
mteb/models/model_implementations/kalm_models.py,sha256=SHqkw5p7HzmQrb_bIFjRp1rsuv2v531nXIk390h_ojY,62115
|
|
1520
1520
|
mteb/models/model_implementations/kblab.py,sha256=EisTJXijICN2pyfWT_89qUnNO7TH95t1LxCxjzJnzQo,1237
|
|
@@ -1541,7 +1541,7 @@ mteb/models/model_implementations/no_instruct_sentence_models.py,sha256=qLiMok_O
|
|
|
1541
1541
|
mteb/models/model_implementations/nomic_models.py,sha256=dmQC_cWg6hAmiBHK7fXoXEiGBJnJvrq0RsnCcJ2qe1Q,15137
|
|
1542
1542
|
mteb/models/model_implementations/nomic_models_vision.py,sha256=usCKfZCR7aEi_DnNmVAYjH-lXx_ipQkBVtUAmhJ90QI,6870
|
|
1543
1543
|
mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py,sha256=6dTGtK1GiaYdpJ4IQFgCCOkGyHQyuEUatKs-Uv-1YmE,6450
|
|
1544
|
-
mteb/models/model_implementations/nvidia_models.py,sha256=
|
|
1544
|
+
mteb/models/model_implementations/nvidia_models.py,sha256=JMy0x7EWGrAxZ9s63F2vSPdPS-9yF3RIS4uj3N2UrVI,24315
|
|
1545
1545
|
mteb/models/model_implementations/octen_models.py,sha256=FwQAcB_z6bFohpFlNQK2ugLBEOQUu533auOhrNqMxaM,7511
|
|
1546
1546
|
mteb/models/model_implementations/openai_models.py,sha256=905BajYi_XyOZgqU3AeKpwIttLoUitaAyc48sTWI6Jg,9482
|
|
1547
1547
|
mteb/models/model_implementations/openclip_models.py,sha256=MyosgeYSrgBXGuGFtI2Tyxksxpb7bADFJVSYFCLweVA,11622
|
|
@@ -1572,7 +1572,7 @@ mteb/models/model_implementations/searchmap_models.py,sha256=xVQPkO7aLp_kBFiMDAm
|
|
|
1572
1572
|
mteb/models/model_implementations/seed_1_6_embedding_models.py,sha256=gcGKEY-n7DWGPlXYhO_kcNJ3lkBEnbw8NUxADNs3siM,18635
|
|
1573
1573
|
mteb/models/model_implementations/seed_1_6_embedding_models_1215.py,sha256=OoTHcDRQGOuSzf08V62EXrSEdRsXhnMv2ZN9feJWs9s,36443
|
|
1574
1574
|
mteb/models/model_implementations/seed_models.py,sha256=9UF2AQ0Uue8DD73SjYhHn2hLxey_7Iq9ii9TkRaA3CM,14168
|
|
1575
|
-
mteb/models/model_implementations/sentence_transformers_models.py,sha256=
|
|
1575
|
+
mteb/models/model_implementations/sentence_transformers_models.py,sha256=6oULaf2mTyVe7vy9oS_QoKuxXXPaAqjQgSooMTG0xow,26071
|
|
1576
1576
|
mteb/models/model_implementations/shuu_model.py,sha256=1jDFFPAfbfrSzC4vbHczO4yqy3Xh4tWiDAd3FS9-T6M,1177
|
|
1577
1577
|
mteb/models/model_implementations/siglip_models.py,sha256=SOSyp-B7w6Vvqas_10D_1rvpJcKSQuJmXGy7Wdtsw7o,13012
|
|
1578
1578
|
mteb/models/model_implementations/slm_models.py,sha256=JXjBio-9NFHLefU4Ny1Z-fFkyvvIz0U2kQ6t5s-PzlQ,13427
|
|
@@ -2612,9 +2612,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
|
|
|
2612
2612
|
mteb/types/_result.py,sha256=UKNokV9pu3G74MGebocU512aU_fFU9I9nPKnrG9Q0iE,1035
|
|
2613
2613
|
mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
|
|
2614
2614
|
mteb/types/statistics.py,sha256=GwkBPmAr18Onu-vHtzHs0PFrhCozdOMiT13HwnWL4ZM,3961
|
|
2615
|
-
mteb-2.6.
|
|
2616
|
-
mteb-2.6.
|
|
2617
|
-
mteb-2.6.
|
|
2618
|
-
mteb-2.6.
|
|
2619
|
-
mteb-2.6.
|
|
2620
|
-
mteb-2.6.
|
|
2615
|
+
mteb-2.6.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
2616
|
+
mteb-2.6.7.dist-info/METADATA,sha256=p99o5hSYjMeWfoMLwNljk7_mDzsRjVXBbwPzsobuyWA,14281
|
|
2617
|
+
mteb-2.6.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
2618
|
+
mteb-2.6.7.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
|
|
2619
|
+
mteb-2.6.7.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
|
|
2620
|
+
mteb-2.6.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|