mteb 2.7.11__py3-none-any.whl → 2.7.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/abstasks/abstask.py +2 -1
- mteb/models/model_implementations/nomic_models.py +40 -0
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +110 -10
- mteb/models/model_implementations/ops_colqwen3_models.py +267 -0
- mteb/models/model_implementations/querit_models.py +245 -0
- mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py +1 -1
- mteb/tasks/classification/ben/bengali_document_classification.py +2 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/hin_dialect_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/language_classification.py +1 -1
- mteb/tasks/classification/multilingual/south_african_lang_classification.py +1 -1
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +2 -2
- mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/vg_hierarchical_clustering.py +2 -2
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +1 -1
- mteb/tasks/pair_classification/multilingual/pub_chem_wiki_pair_classification.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/sts/multilingual/sem_rel24_sts.py +1 -1
- mteb/tasks/sts/multilingual/sts_benchmark_multilingual_sts.py +1 -1
- mteb/tasks/sts/por/assin2_sts.py +1 -1
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/METADATA +1 -3
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/RECORD +36 -34
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/WHEEL +1 -1
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/top_level.txt +0 -0
mteb/abstasks/abstask.py
CHANGED
@@ -116,7 +116,7 @@ class AbsTask(ABC):
             logger.warning(msg)
             warnings.warn(msg)
 
-    def dataset_transform(self, num_proc: int = 1):
+    def dataset_transform(self, num_proc: int = 1, **kwargs: Any) -> None:
         """A transform operations applied to the dataset after loading.
 
         This method is useful when the dataset from Huggingface is not in an `mteb` compatible format.
@@ -124,6 +124,7 @@ class AbsTask(ABC):
 
         Args:
             num_proc: Number of processes to use for the transformation.
+            kwargs: Additional keyword arguments passed to the load_dataset function. Keep for forward compatibility.
         """
         pass
 
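The hunk above widens dataset_transform to accept arbitrary keyword arguments and adds an explicit None return type. A minimal sketch of how a task subclass might adopt the new signature (the MyTask class and its filter step are hypothetical illustrations, not part of this release):

from typing import Any

from mteb.abstasks.abstask import AbsTask


class MyTask(AbsTask):  # hypothetical subclass, for illustration only
    def dataset_transform(self, num_proc: int = 1, **kwargs: Any) -> None:
        # Extra keyword arguments (e.g. ones forwarded from load_dataset) are accepted
        # but ignored here, which keeps the override forward compatible.
        self.dataset = self.dataset.filter(
            lambda row: len(row["text"]) > 0, num_proc=num_proc
        )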
mteb/models/model_implementations/nomic_models.py
CHANGED
@@ -7,6 +7,7 @@ import torch
 import torch.nn.functional as F
 from packaging.version import Version
 
+from mteb.models import sentence_transformers_loader
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
 from mteb.types import PromptType
@@ -509,3 +510,42 @@ nomic_embed_text_v2_moe = ModelMeta(
     url={https://arxiv.org/abs/2502.07972},
 }""",
 )
+
+nomic_embed_code = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs={
+        "trust_remote_code": True,
+        "model_prompts": model_prompts,
+    },
+    name="nomic-ai/nomic-embed-code",
+    revision="11114029805cee545ef111d5144b623787462a52",
+    release_date="2025-03-24",
+    languages=["eng-Latn"],
+    n_parameters=7_070_619_136,
+    n_embedding_parameters=None,
+    memory_usage_mb=26972.0,
+    max_tokens=32768,
+    embed_dim=3584,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://github.com/gangiswag/cornstack/",
+    public_training_data="https://huggingface.co/collections/nomic-ai/cornstack",
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
+    reference="https://huggingface.co/nomic-ai/nomic-embed-code",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=True,
+    training_datasets={"CoRNStack"},
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    model_type=["dense"],
+    citation="""@misc{suresh2025cornstackhighqualitycontrastivedata,
+    title={CoRNStack: High-Quality Contrastive Data for Better Code Retrieval and Reranking},
+    author={Tarun Suresh and Revanth Gangi Reddy and Yifei Xu and Zach Nussbaum and Andriy Mulyar and Brandon Duderstadt and Heng Ji},
+    year={2025},
+    eprint={2412.01007},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2412.01007},
+}""",
+)
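With the nomic_embed_code entry registered above, the model should be resolvable through mteb's model registry by its name. A minimal sketch using the public get_model_meta / get_model helpers (illustrative usage; loading the 7B checkpoint requires the sentence-transformers dependencies and substantial memory):

import mteb

# Assumes the registry name matches the ModelMeta `name` field added in this release.
meta = mteb.get_model_meta("nomic-ai/nomic-embed-code")
print(meta.embed_dim, meta.max_tokens)  # 3584, 32768 per the metadata above

model = mteb.get_model("nomic-ai/nomic-embed-code")  # downloads and instantiates the encoder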
mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Any
 
 import torch
-from packaging.
+from packaging.specifiers import SpecifierSet
 from torch.utils.data import DataLoader
 from transformers import __version__ as transformers_version
 
@@ -31,18 +31,20 @@ class LlamaNemoretrieverColembed(AbsEncoder):
         model_name_or_path: str,
         revision: str,
         trust_remote_code: bool,
+        transformers_version_constraint: str | None = None,
         device_map="cuda",
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
         **kwargs,
     ):
-
-
-
-
-
-
-
+        if transformers_version_constraint is not None:
+            spec = SpecifierSet(transformers_version_constraint)
+            if transformers_version not in spec:
+                raise RuntimeError(
+                    f"Model `{model_name_or_path}` requires transformers{transformers_version_constraint}, "
+                    f"but {transformers_version} is installed. "
+                    f"Run: pip install 'transformers{transformers_version_constraint}'"
+                )
 
         from transformers import AutoModel
 
@@ -150,10 +152,24 @@ TRAINING_DATA = {
     "wiki-ss-nq",
 }
 
+
+TRAINING_DATA_v2 = {
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
+    "docmatix-ir",
+    "VDRMultilingualRetrieval",
+    "VisRAG-Ret-Train-Synthetic-data",
+    "VisRAG-Ret-Train-In-domain-data",
+    "wiki-ss-nq",
+}
+
 llama_nemoretriever_colembed_1b_v1 = ModelMeta(
     loader=LlamaNemoretrieverColembed,
     loader_kwargs=dict(
         trust_remote_code=True,
+        transformers_version_constraint="==4.49.0",
     ),
     name="nvidia/llama-nemoretriever-colembed-1b-v1",
     model_type=["late-interaction"],
@@ -168,7 +184,7 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
     embed_dim=2048,
     license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
     open_weights=True,
-    public_training_code=
+    public_training_code=None,
     public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
     framework=["PyTorch", "Transformers", "safetensors"],
     reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1",
@@ -182,6 +198,7 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
     loader=LlamaNemoretrieverColembed,
     loader_kwargs=dict(
         trust_remote_code=True,
+        transformers_version_constraint="==4.49.0",
     ),
     name="nvidia/llama-nemoretriever-colembed-3b-v1",
     model_type=["late-interaction"],
@@ -196,7 +213,7 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
     embed_dim=3072,
     license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
     open_weights=True,
-    public_training_code=
+    public_training_code=None,
     public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
     framework=["PyTorch", "Transformers", "safetensors"],
     reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1",
@@ -205,3 +222,86 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
     training_datasets=TRAINING_DATA,
     citation=LLAMA_NEMORETRIEVER_CITATION,
 )
+
+llama_nemotron_colembed_vl_3b_v2 = ModelMeta(
+    loader=LlamaNemoretrieverColembed,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        transformers_version_constraint="==4.49.0",
+    ),
+    name="nvidia/llama-nemotron-colembed-vl-3b-v2",
+    model_type=["late-interaction"],
+    languages=["eng-Latn"],
+    revision="75f03c712cb3a252e062295f9a0966e5d95d6156",
+    release_date="2026-01-21",
+    modalities=["image", "text"],
+    n_parameters=4_407_000_000,
+    memory_usage_mb=8403,
+    max_tokens=8192,
+    embed_dim=3072,
+    license="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2/blob/main/LICENSE",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2#training-dataset",
+    framework=["PyTorch", "Transformers", "safetensors"],
+    reference="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2",
+    similarity_fn_name="MaxSim",
+    use_instructions=True,
+    training_datasets=TRAINING_DATA,
+    citation=LLAMA_NEMORETRIEVER_CITATION,
+)
+
+nemotron_colembed_vl_4b_v2 = ModelMeta(
+    loader=LlamaNemoretrieverColembed,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        transformers_version_constraint="==5.0.0rc0",
+    ),
+    name="nvidia/nemotron-colembed-vl-4b-v2",
+    revision="823b1625c15fe3da73fa094205e538a7a2301a2a",
+    languages=["eng-Latn"],
+    release_date="2026-01-07",
+    modalities=["image", "text"],
+    n_parameters=4_800_000_000,
+    memory_usage_mb=9206,
+    max_tokens=262144,
+    embed_dim=2560,
+    license="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2/blob/main/LICENSE",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2#training-dataset",
+    framework=["PyTorch", "Transformers"],
+    reference="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2",
+    similarity_fn_name="MaxSim",
+    use_instructions=True,
+    training_datasets=TRAINING_DATA_v2,
+    citation=LLAMA_NEMORETRIEVER_CITATION,
+)
+
+
+nemotron_colembed_vl_8b_v2 = ModelMeta(
+    loader=LlamaNemoretrieverColembed,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        transformers_version_constraint="==5.0.0rc0",
+    ),
+    name="nvidia/nemotron-colembed-vl-8b-v2",
+    revision="6cbe43579dda6237768fc373768ad372cc5cdfec",
+    languages=["eng-Latn"],
+    release_date="2026-01-07",
+    modalities=["image", "text"],
+    n_parameters=8_700_000_000,
+    memory_usage_mb=16722,
+    max_tokens=262144,
+    embed_dim=4096,
+    license="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2/blob/main/LICENSE",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2#training-dataset",
+    framework=["PyTorch", "Transformers"],
+    reference="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2",
+    similarity_fn_name="MaxSim",
+    use_instructions=True,
+    training_datasets=TRAINING_DATA_v2,
+    citation=LLAMA_NEMORETRIEVER_CITATION,
+)
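The new transformers_version_constraint argument relies on packaging.specifiers.SpecifierSet, as shown in the constructor hunk above. A small standalone illustration of that check (version strings here are examples only, not recommendations):

from packaging.specifiers import SpecifierSet
from transformers import __version__ as transformers_version

constraint = "==4.49.0"  # the value wired into the 1b/3b loader_kwargs above
spec = SpecifierSet(constraint)

# The wrapper raises RuntimeError when the installed version falls outside the constraint.
print("4.49.0" in spec)              # True
print(transformers_version in spec)  # depends on the installed transformers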
mteb/models/model_implementations/ops_colqwen3_models.py
ADDED
@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import torch
+from tqdm.auto import tqdm
+from transformers import AutoModel, AutoProcessor
+
+from mteb._requires_package import requires_image_dependencies
+from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
+
+
+class OpsColQwen3Wrapper(AbsEncoder):
+    """Wrapper for OpsColQwen3 model."""
+
+    def __init__(
+        self,
+        model_name: str = "OpenSearch-AI/Ops-Colqwen3-4B",
+        revision: str | None = None,
+        device: str | None = None,
+        attn_implementation: str | None = None,
+        **kwargs,
+    ):
+        requires_image_dependencies()
+        from transformers.utils.import_utils import is_flash_attn_2_available
+
+        if attn_implementation is None:
+            attn_implementation = (
+                "flash_attention_2" if is_flash_attn_2_available() else None
+            )
+
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+
+        self.mdl = AutoModel.from_pretrained(
+            model_name,
+            device_map=self.device,
+            attn_implementation=attn_implementation,
+            trust_remote_code=True,
+            revision=revision,
+            **kwargs,
+        )
+        self.mdl.eval()
+
+        self.processor = AutoProcessor.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+        )
+
+    def encode(
+        self,
+        inputs: DataLoader[BatchedInput],
+        *,
+        task_metadata: TaskMetadata,
+        hf_split: str,
+        hf_subset: str,
+        prompt_type: PromptType | None = None,
+        **kwargs: Any,
+    ) -> Array:
+        text_embeddings = None
+        image_embeddings = None
+
+        if "text" in inputs.dataset.features:
+            text_embeddings = self.get_text_embeddings(inputs, **kwargs)
+        if "image" in inputs.dataset.features:
+            image_embeddings = self.get_image_embeddings(inputs, **kwargs)
+
+        if text_embeddings is not None and image_embeddings is not None:
+            if len(text_embeddings) != len(image_embeddings):
+                raise ValueError(
+                    "The number of texts and images must have the same length"
+                )
+            fused_embeddings = text_embeddings + image_embeddings
+            return fused_embeddings
+        elif text_embeddings is not None:
+            return text_embeddings
+        elif image_embeddings is not None:
+            return image_embeddings
+        raise ValueError("No text or image inputs found")
+
+    def encode_input(self, inputs):
+        return self.mdl(**inputs)
+
+    def get_image_embeddings(
+        self,
+        images: DataLoader,
+        batch_size: int = 32,
+        **kwargs,
+    ) -> torch.Tensor:
+        import torchvision.transforms.functional as F
+        from PIL import Image
+
+        all_embeds = []
+
+        with torch.no_grad():
+            for batch in tqdm(images, desc="Encoding images"):
+                # batch may be list of tensors or PIL
+                imgs = [
+                    F.to_pil_image(b.to(self.device))
+                    if not isinstance(b, Image.Image)
+                    else b
+                    for b in batch["image"]
+                ]
+                inputs = self.processor.process_images(imgs)
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                outs = self.encode_input(inputs)
+                all_embeds.extend(outs.cpu().to(torch.float32))
+
+        padded = torch.nn.utils.rnn.pad_sequence(
+            all_embeds, batch_first=True, padding_value=0
+        )
+        return padded
+
+    def get_text_embeddings(
+        self,
+        texts: DataLoader,
+        batch_size: int = 32,
+        **kwargs,
+    ) -> torch.Tensor:
+        all_embeds = []
+
+        with torch.no_grad():
+            for batch in tqdm(texts, desc="Encoding texts"):
+                batch_texts = batch["text"]
+                inputs = self.processor.process_queries(batch_texts)
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                outs = self.encode_input(inputs)
+                all_embeds.extend(outs.cpu().to(torch.float32))
+
+        padded = torch.nn.utils.rnn.pad_sequence(
+            all_embeds, batch_first=True, padding_value=0
+        )
+        return padded
+
+    def similarity(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+        return self.processor.score_multi_vector(a, b, device=self.device)
+
+
+OPS_COLQWEN3_TRAINING_DATA = {
+    "VDRMultilingualRetrieval",
+    # from https://huggingface.co/datasets/vidore/colpali_train_set
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
+    "docmatix-ir",
+    "HotpotQA",
+    "FEVER",
+    "NQ",
+    "MIRACLRetrieval",
+    "WebInstructSub",  # MathStackExchange and ScienceStackExchange only
+    "MrTyDi",
+}
+
+multilingual_langs = [
+    "afr-Latn",
+    "ara-Arab",
+    "aze-Latn",
+    "bel-Cyrl",
+    "bul-Cyrl",
+    "ben-Beng",
+    "cat-Latn",
+    "ceb-Latn",
+    "ces-Latn",
+    "cym-Latn",
+    "dan-Latn",
+    "deu-Latn",
+    "ell-Grek",
+    "eng-Latn",
+    "spa-Latn",
+    "est-Latn",
+    "eus-Latn",
+    "fas-Arab",
+    "fin-Latn",
+    "fra-Latn",
+    "glg-Latn",
+    "guj-Gujr",
+    "heb-Hebr",
+    "hin-Deva",
+    "hrv-Latn",
+    "hat-Latn",
+    "hun-Latn",
+    "hye-Armn",
+    "ind-Latn",
+    "isl-Latn",
+    "ita-Latn",
+    "jpn-Jpan",
+    "jav-Latn",
+    "kat-Geor",
+    "kaz-Cyrl",
+    "khm-Khmr",
+    "kan-Knda",
+    "kor-Hang",
+    "kir-Cyrl",
+    "lao-Laoo",
+    "lit-Latn",
+    "lav-Latn",
+    "mkd-Cyrl",
+    "mal-Mlym",
+    "mon-Cyrl",
+    "mar-Deva",
+    "msa-Latn",
+    "mya-Mymr",
+    "nep-Deva",
+    "nld-Latn",
+    "nor-Latn",
+    "nob-Latn",
+    "nno-Latn",
+    "pan-Guru",
+    "pol-Latn",
+    "por-Latn",
+    "que-Latn",
+    "ron-Latn",
+    "rus-Cyrl",
+    "sin-Sinh",
+    "slk-Latn",
+    "slv-Latn",
+    "swa-Latn",
+    "tam-Taml",
+    "tel-Telu",
+    "tha-Thai",
+    "tgl-Latn",
+    "tur-Latn",
+    "ukr-Cyrl",
+    "urd-Arab",
+    "vie-Latn",
+    "yor-Latn",
+    "zho-Hans",
+]
+
+OPS_COLQWEN3_CITATION = """
+@misc{ops_colqwen3_4b,
+    author = {OpenSearch-AI},
+    title = {Ops-ColQwen3: State-of-the-Art Multimodal Embedding Model for Visual Document Retrieval},
+    year = {2026},
+    url = {https://huggingface.co/OpenSearch-AI/Ops-ColQwen3-4B},
+}"""
+
+ops_colqwen3_4b = ModelMeta(
+    loader=OpsColQwen3Wrapper,
+    name="OpenSearch-AI/Ops-Colqwen3-4B",
+    loader_kwargs=dict(dtype=torch.float16, trust_remote_code=True),
+    languages=multilingual_langs,
+    revision="4894b7d451ff33981650acc693bb482dbef302d3",
+    release_date="2026-01-24",
+    modalities=["image", "text"],
+    n_parameters=4_800_000_000,
+    memory_usage_mb=9206,
+    max_tokens=32768,
+    embed_dim=2560,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://github.com/illuin-tech/colpali",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/OpenSearch-AI/Ops-Colqwen3-4B",
+    similarity_fn_name=ScoringFunction.MAX_SIM,
+    use_instructions=True,
+    training_datasets=OPS_COLQWEN3_TRAINING_DATA,
+    citation=OPS_COLQWEN3_CITATION,
+)
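Both the new NVIDIA entries and OpsColQwen3Wrapper declare MaxSim (late-interaction) similarity, with scoring delegated to the processor's score_multi_vector. Conceptually, MaxSim matches each query token embedding against its best document token and sums those maxima; a rough single-pair sketch in plain PyTorch (a conceptual illustration, not the library implementation):

import torch


def maxsim_score(query_tokens: torch.Tensor, doc_tokens: torch.Tensor) -> torch.Tensor:
    """MaxSim between one query (n_q, dim) and one document (n_d, dim)."""
    sims = query_tokens @ doc_tokens.T      # (n_q, n_d) token-level similarities
    return sims.max(dim=-1).values.sum()    # best doc token per query token, summed


q = torch.randn(8, 2560)    # e.g. 8 query tokens at embed_dim=2560
d = torch.randn(300, 2560)  # e.g. 300 document/patch tokens
print(maxsim_score(q, d))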