mteb 2.7.12__py3-none-any.whl → 2.7.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/abstasks/pair_classification.py +13 -7
- mteb/models/get_model_meta.py +12 -0
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/human.py +1 -1
- mteb/models/model_implementations/nomic_models.py +40 -0
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +110 -10
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
- mteb/models/model_implementations/ops_colqwen3_models.py +267 -0
- mteb/models/model_implementations/querit_models.py +245 -0
- mteb/models/model_meta.py +122 -4
- mteb/results/model_result.py +23 -0
- mteb/results/task_result.py +4 -4
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -34
- {mteb-2.7.12.dist-info → mteb-2.7.14.dist-info}/METADATA +1 -3
- {mteb-2.7.12.dist-info → mteb-2.7.14.dist-info}/RECORD +19 -17
- {mteb-2.7.12.dist-info → mteb-2.7.14.dist-info}/WHEEL +0 -0
- {mteb-2.7.12.dist-info → mteb-2.7.14.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.12.dist-info → mteb-2.7.14.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.12.dist-info → mteb-2.7.14.dist-info}/top_level.txt +0 -0
|
@@ -25,6 +25,8 @@ from mteb.types.statistics import (
|
|
|
25
25
|
if TYPE_CHECKING:
|
|
26
26
|
from pathlib import Path
|
|
27
27
|
|
|
28
|
+
from numpy.typing import NDArray
|
|
29
|
+
|
|
28
30
|
from mteb._evaluators.pair_classification_evaluator import (
|
|
29
31
|
PairClassificationDistances,
|
|
30
32
|
)
|
|
@@ -36,7 +38,6 @@ if TYPE_CHECKING:
|
|
|
36
38
|
TextStatistics,
|
|
37
39
|
)
|
|
38
40
|
|
|
39
|
-
|
|
40
41
|
logger = logging.getLogger(__name__)
|
|
41
42
|
|
|
42
43
|
|
|
@@ -138,7 +139,7 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
138
139
|
self, similarity_scores: PairClassificationDistances, labels: list[int]
|
|
139
140
|
) -> dict[str, float]:
|
|
140
141
|
logger.info("Computing metrics...")
|
|
141
|
-
np_labels = np.asarray(labels)
|
|
142
|
+
np_labels: NDArray[np.int64] = np.asarray(labels, dtype=np.int64)
|
|
142
143
|
output_scores = {}
|
|
143
144
|
max_scores = defaultdict(list)
|
|
144
145
|
for short_name, scores, reverse in [
|
|
@@ -281,7 +282,10 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
281
282
|
)
|
|
282
283
|
|
|
283
284
|
def _compute_metrics_values(
|
|
284
|
-
self,
|
|
285
|
+
self,
|
|
286
|
+
scores: list[float],
|
|
287
|
+
labels: NDArray[np.int64],
|
|
288
|
+
high_score_more_similar: bool,
|
|
285
289
|
) -> dict[str, float]:
|
|
286
290
|
"""Compute the metrics for the given scores and labels.
|
|
287
291
|
|
|
@@ -315,7 +319,10 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
315
319
|
)
|
|
316
320
|
|
|
317
321
|
def _find_best_acc_and_threshold(
|
|
318
|
-
self,
|
|
322
|
+
self,
|
|
323
|
+
scores: list[float],
|
|
324
|
+
labels: NDArray[np.int64],
|
|
325
|
+
high_score_more_similar: bool,
|
|
319
326
|
) -> tuple[float, float]:
|
|
320
327
|
rows = list(zip(scores, labels))
|
|
321
328
|
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
|
|
@@ -323,7 +330,7 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
323
330
|
max_acc = 0
|
|
324
331
|
best_threshold = -1.0
|
|
325
332
|
positive_so_far = 0
|
|
326
|
-
remaining_negatives = sum(
|
|
333
|
+
remaining_negatives = sum(labels == 0)
|
|
327
334
|
|
|
328
335
|
for i in range(len(rows) - 1):
|
|
329
336
|
score, label = rows[i]
|
|
@@ -339,10 +346,9 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
339
346
|
return max_acc, best_threshold
|
|
340
347
|
|
|
341
348
|
def _find_best_f1_and_threshold(
|
|
342
|
-
self, scores, labels, high_score_more_similar: bool
|
|
349
|
+
self, scores, labels: NDArray[np.int64], high_score_more_similar: bool
|
|
343
350
|
) -> tuple[float, float, float, float]:
|
|
344
351
|
scores = np.asarray(scores)
|
|
345
|
-
labels = np.asarray(labels)
|
|
346
352
|
|
|
347
353
|
rows = list(zip(scores, labels))
|
|
348
354
|
|
mteb/models/get_model_meta.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import difflib
|
|
4
4
|
import logging
|
|
5
|
+
import warnings
|
|
5
6
|
from typing import TYPE_CHECKING, Any
|
|
6
7
|
|
|
7
8
|
from mteb.models import (
|
|
@@ -122,6 +123,11 @@ def get_model(
|
|
|
122
123
|
return model
|
|
123
124
|
|
|
124
125
|
|
|
126
|
+
_MODEL_RENAMES: dict[str, str] = {
|
|
127
|
+
"bm25s": "baseline/bm25s",
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
|
|
125
131
|
def get_model_meta(
|
|
126
132
|
model_name: str,
|
|
127
133
|
revision: str | None = None,
|
|
@@ -139,6 +145,12 @@ def get_model_meta(
|
|
|
139
145
|
Returns:
|
|
140
146
|
A model metadata object
|
|
141
147
|
"""
|
|
148
|
+
if model_name in _MODEL_RENAMES:
|
|
149
|
+
new_name = _MODEL_RENAMES[model_name]
|
|
150
|
+
msg = f"The model '{model_name}' has been renamed to '{new_name}'. To prevent this warning use the new name."
|
|
151
|
+
warnings.warn(msg, DeprecationWarning, stacklevel=2)
|
|
152
|
+
model_name = new_name
|
|
153
|
+
|
|
142
154
|
if model_name in MODEL_REGISTRY:
|
|
143
155
|
model_meta = MODEL_REGISTRY[model_name]
|
|
144
156
|
|
|
@@ -7,6 +7,7 @@ import torch
|
|
|
7
7
|
import torch.nn.functional as F
|
|
8
8
|
from packaging.version import Version
|
|
9
9
|
|
|
10
|
+
from mteb.models import sentence_transformers_loader
|
|
10
11
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
12
|
from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
|
|
12
13
|
from mteb.types import PromptType
|
|
@@ -509,3 +510,42 @@ nomic_embed_text_v2_moe = ModelMeta(
|
|
|
509
510
|
url={https://arxiv.org/abs/2502.07972},
|
|
510
511
|
}""",
|
|
511
512
|
)
|
|
513
|
+
|
|
514
|
+
nomic_embed_code = ModelMeta(
|
|
515
|
+
loader=sentence_transformers_loader,
|
|
516
|
+
loader_kwargs={
|
|
517
|
+
"trust_remote_code": True,
|
|
518
|
+
"model_prompts": model_prompts,
|
|
519
|
+
},
|
|
520
|
+
name="nomic-ai/nomic-embed-code",
|
|
521
|
+
revision="11114029805cee545ef111d5144b623787462a52",
|
|
522
|
+
release_date="2025-03-24",
|
|
523
|
+
languages=["eng-Latn"],
|
|
524
|
+
n_parameters=7_070_619_136,
|
|
525
|
+
n_embedding_parameters=None,
|
|
526
|
+
memory_usage_mb=26972.0,
|
|
527
|
+
max_tokens=32768,
|
|
528
|
+
embed_dim=3584,
|
|
529
|
+
license="apache-2.0",
|
|
530
|
+
open_weights=True,
|
|
531
|
+
public_training_code="https://github.com/gangiswag/cornstack/",
|
|
532
|
+
public_training_data="https://huggingface.co/collections/nomic-ai/cornstack",
|
|
533
|
+
framework=["PyTorch", "Sentence Transformers", "safetensors"],
|
|
534
|
+
reference="https://huggingface.co/nomic-ai/nomic-embed-code",
|
|
535
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
536
|
+
use_instructions=True,
|
|
537
|
+
training_datasets={"CoRNStack"},
|
|
538
|
+
adapted_from=None,
|
|
539
|
+
superseded_by=None,
|
|
540
|
+
modalities=["text"],
|
|
541
|
+
model_type=["dense"],
|
|
542
|
+
citation="""@misc{suresh2025cornstackhighqualitycontrastivedata,
|
|
543
|
+
title={CoRNStack: High-Quality Contrastive Data for Better Code Retrieval and Reranking},
|
|
544
|
+
author={Tarun Suresh and Revanth Gangi Reddy and Yifei Xu and Zach Nussbaum and Andriy Mulyar and Brandon Duderstadt and Heng Ji},
|
|
545
|
+
year={2025},
|
|
546
|
+
eprint={2412.01007},
|
|
547
|
+
archivePrefix={arXiv},
|
|
548
|
+
primaryClass={cs.CL},
|
|
549
|
+
url={https://arxiv.org/abs/2412.01007},
|
|
550
|
+
}""",
|
|
551
|
+
)
|
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
from typing import TYPE_CHECKING, Any
|
|
4
4
|
|
|
5
5
|
import torch
|
|
6
|
-
from packaging.
|
|
6
|
+
from packaging.specifiers import SpecifierSet
|
|
7
7
|
from torch.utils.data import DataLoader
|
|
8
8
|
from transformers import __version__ as transformers_version
|
|
9
9
|
|
|
@@ -31,18 +31,20 @@ class LlamaNemoretrieverColembed(AbsEncoder):
|
|
|
31
31
|
model_name_or_path: str,
|
|
32
32
|
revision: str,
|
|
33
33
|
trust_remote_code: bool,
|
|
34
|
+
transformers_version_constraint: str | None = None,
|
|
34
35
|
device_map="cuda",
|
|
35
36
|
torch_dtype=torch.bfloat16,
|
|
36
37
|
attn_implementation="flash_attention_2",
|
|
37
38
|
**kwargs,
|
|
38
39
|
):
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
40
|
+
if transformers_version_constraint is not None:
|
|
41
|
+
spec = SpecifierSet(transformers_version_constraint)
|
|
42
|
+
if transformers_version not in spec:
|
|
43
|
+
raise RuntimeError(
|
|
44
|
+
f"Model `{model_name_or_path}` requires transformers{transformers_version_constraint}, "
|
|
45
|
+
f"but {transformers_version} is installed. "
|
|
46
|
+
f"Run: pip install 'transformers{transformers_version_constraint}'"
|
|
47
|
+
)
|
|
46
48
|
|
|
47
49
|
from transformers import AutoModel
|
|
48
50
|
|
|
@@ -150,10 +152,24 @@ TRAINING_DATA = {
|
|
|
150
152
|
"wiki-ss-nq",
|
|
151
153
|
}
|
|
152
154
|
|
|
155
|
+
|
|
156
|
+
TRAINING_DATA_v2 = {
|
|
157
|
+
"VidoreDocVQARetrieval",
|
|
158
|
+
"VidoreInfoVQARetrieval",
|
|
159
|
+
"VidoreTatdqaRetrieval",
|
|
160
|
+
"VidoreArxivQARetrieval",
|
|
161
|
+
"docmatix-ir",
|
|
162
|
+
"VDRMultilingualRetrieval",
|
|
163
|
+
"VisRAG-Ret-Train-Synthetic-data",
|
|
164
|
+
"VisRAG-Ret-Train-In-domain-data",
|
|
165
|
+
"wiki-ss-nq",
|
|
166
|
+
}
|
|
167
|
+
|
|
153
168
|
llama_nemoretriever_colembed_1b_v1 = ModelMeta(
|
|
154
169
|
loader=LlamaNemoretrieverColembed,
|
|
155
170
|
loader_kwargs=dict(
|
|
156
171
|
trust_remote_code=True,
|
|
172
|
+
transformers_version_constraint="==4.49.0",
|
|
157
173
|
),
|
|
158
174
|
name="nvidia/llama-nemoretriever-colembed-1b-v1",
|
|
159
175
|
model_type=["late-interaction"],
|
|
@@ -168,7 +184,7 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
|
|
|
168
184
|
embed_dim=2048,
|
|
169
185
|
license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
|
|
170
186
|
open_weights=True,
|
|
171
|
-
public_training_code=
|
|
187
|
+
public_training_code=None,
|
|
172
188
|
public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
|
|
173
189
|
framework=["PyTorch", "Transformers", "safetensors"],
|
|
174
190
|
reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1",
|
|
@@ -182,6 +198,7 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
|
|
|
182
198
|
loader=LlamaNemoretrieverColembed,
|
|
183
199
|
loader_kwargs=dict(
|
|
184
200
|
trust_remote_code=True,
|
|
201
|
+
transformers_version_constraint="==4.49.0",
|
|
185
202
|
),
|
|
186
203
|
name="nvidia/llama-nemoretriever-colembed-3b-v1",
|
|
187
204
|
model_type=["late-interaction"],
|
|
@@ -196,7 +213,7 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
|
|
|
196
213
|
embed_dim=3072,
|
|
197
214
|
license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
|
|
198
215
|
open_weights=True,
|
|
199
|
-
public_training_code=
|
|
216
|
+
public_training_code=None,
|
|
200
217
|
public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
|
|
201
218
|
framework=["PyTorch", "Transformers", "safetensors"],
|
|
202
219
|
reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1",
|
|
@@ -205,3 +222,86 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
|
|
|
205
222
|
training_datasets=TRAINING_DATA,
|
|
206
223
|
citation=LLAMA_NEMORETRIEVER_CITATION,
|
|
207
224
|
)
|
|
225
|
+
|
|
226
|
+
llama_nemotron_colembed_vl_3b_v2 = ModelMeta(
|
|
227
|
+
loader=LlamaNemoretrieverColembed,
|
|
228
|
+
loader_kwargs=dict(
|
|
229
|
+
trust_remote_code=True,
|
|
230
|
+
transformers_version_constraint="==4.49.0",
|
|
231
|
+
),
|
|
232
|
+
name="nvidia/llama-nemotron-colembed-vl-3b-v2",
|
|
233
|
+
model_type=["late-interaction"],
|
|
234
|
+
languages=["eng-Latn"],
|
|
235
|
+
revision="75f03c712cb3a252e062295f9a0966e5d95d6156",
|
|
236
|
+
release_date="2026-01-21",
|
|
237
|
+
modalities=["image", "text"],
|
|
238
|
+
n_parameters=4_407_000_000,
|
|
239
|
+
memory_usage_mb=8403,
|
|
240
|
+
max_tokens=8192,
|
|
241
|
+
embed_dim=3072,
|
|
242
|
+
license="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2/blob/main/LICENSE",
|
|
243
|
+
open_weights=True,
|
|
244
|
+
public_training_code=None,
|
|
245
|
+
public_training_data="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2#training-dataset",
|
|
246
|
+
framework=["PyTorch", "Transformers", "safetensors"],
|
|
247
|
+
reference="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2",
|
|
248
|
+
similarity_fn_name="MaxSim",
|
|
249
|
+
use_instructions=True,
|
|
250
|
+
training_datasets=TRAINING_DATA,
|
|
251
|
+
citation=LLAMA_NEMORETRIEVER_CITATION,
|
|
252
|
+
)
|
|
253
|
+
|
|
254
|
+
nemotron_colembed_vl_4b_v2 = ModelMeta(
|
|
255
|
+
loader=LlamaNemoretrieverColembed,
|
|
256
|
+
loader_kwargs=dict(
|
|
257
|
+
trust_remote_code=True,
|
|
258
|
+
transformers_version_constraint="==5.0.0rc0",
|
|
259
|
+
),
|
|
260
|
+
name="nvidia/nemotron-colembed-vl-4b-v2",
|
|
261
|
+
revision="823b1625c15fe3da73fa094205e538a7a2301a2a",
|
|
262
|
+
languages=["eng-Latn"],
|
|
263
|
+
release_date="2026-01-07",
|
|
264
|
+
modalities=["image", "text"],
|
|
265
|
+
n_parameters=4_800_000_000,
|
|
266
|
+
memory_usage_mb=9206,
|
|
267
|
+
max_tokens=262144,
|
|
268
|
+
embed_dim=2560,
|
|
269
|
+
license="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2/blob/main/LICENSE",
|
|
270
|
+
open_weights=True,
|
|
271
|
+
public_training_code=None,
|
|
272
|
+
public_training_data="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2#training-dataset",
|
|
273
|
+
framework=["PyTorch", "Transformers"],
|
|
274
|
+
reference="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2",
|
|
275
|
+
similarity_fn_name="MaxSim",
|
|
276
|
+
use_instructions=True,
|
|
277
|
+
training_datasets=TRAINING_DATA_v2,
|
|
278
|
+
citation=LLAMA_NEMORETRIEVER_CITATION,
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
nemotron_colembed_vl_8b_v2 = ModelMeta(
|
|
283
|
+
loader=LlamaNemoretrieverColembed,
|
|
284
|
+
loader_kwargs=dict(
|
|
285
|
+
trust_remote_code=True,
|
|
286
|
+
transformers_version_constraint="==5.0.0rc0",
|
|
287
|
+
),
|
|
288
|
+
name="nvidia/nemotron-colembed-vl-8b-v2",
|
|
289
|
+
revision="6cbe43579dda6237768fc373768ad372cc5cdfec",
|
|
290
|
+
languages=["eng-Latn"],
|
|
291
|
+
release_date="2026-01-07",
|
|
292
|
+
modalities=["image", "text"],
|
|
293
|
+
n_parameters=8_700_000_000,
|
|
294
|
+
memory_usage_mb=16722,
|
|
295
|
+
max_tokens=262144,
|
|
296
|
+
embed_dim=4096,
|
|
297
|
+
license="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2/blob/main/LICENSE",
|
|
298
|
+
open_weights=True,
|
|
299
|
+
public_training_code=None,
|
|
300
|
+
public_training_data="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2#training-dataset",
|
|
301
|
+
framework=["PyTorch", "Transformers"],
|
|
302
|
+
reference="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2",
|
|
303
|
+
similarity_fn_name="MaxSim",
|
|
304
|
+
use_instructions=True,
|
|
305
|
+
training_datasets=TRAINING_DATA_v2,
|
|
306
|
+
citation=LLAMA_NEMORETRIEVER_CITATION,
|
|
307
|
+
)
|
|
@@ -134,7 +134,7 @@ class SparseEncoderWrapper(AbsEncoder):
|
|
|
134
134
|
|
|
135
135
|
opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta(
|
|
136
136
|
name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte",
|
|
137
|
-
model_type=["
|
|
137
|
+
model_type=["sparse"],
|
|
138
138
|
languages=["eng-Latn"],
|
|
139
139
|
open_weights=True,
|
|
140
140
|
revision="a8abaa916125ee512a7a8f4d706d07eb0128a8e6",
|
|
@@ -161,7 +161,7 @@ opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta(
|
|
|
161
161
|
|
|
162
162
|
opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta(
|
|
163
163
|
name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
|
|
164
|
-
model_type=["
|
|
164
|
+
model_type=["sparse"],
|
|
165
165
|
languages=["eng-Latn"],
|
|
166
166
|
open_weights=True,
|
|
167
167
|
revision="babf71f3c48695e2e53a978208e8aba48335e3c0",
|
|
@@ -184,7 +184,7 @@ opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta(
|
|
|
184
184
|
|
|
185
185
|
opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta(
|
|
186
186
|
name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill",
|
|
187
|
-
model_type=["
|
|
187
|
+
model_type=["sparse"],
|
|
188
188
|
languages=["eng-Latn"],
|
|
189
189
|
open_weights=True,
|
|
190
190
|
revision="8921a26c78b8559d6604eb1f5c0b74c079bee38f",
|
|
@@ -208,7 +208,7 @@ opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta(
|
|
|
208
208
|
|
|
209
209
|
opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta(
|
|
210
210
|
name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
|
|
211
|
-
model_type=["
|
|
211
|
+
model_type=["sparse"],
|
|
212
212
|
languages=["eng-Latn"],
|
|
213
213
|
open_weights=True,
|
|
214
214
|
revision="4af867a426867dfdd744097531046f4289a32fdd",
|
|
@@ -231,7 +231,7 @@ opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta(
|
|
|
231
231
|
|
|
232
232
|
opensearch_neural_sparse_encoding_doc_v1 = ModelMeta(
|
|
233
233
|
name="opensearch-project/opensearch-neural-sparse-encoding-doc-v1",
|
|
234
|
-
model_type=["
|
|
234
|
+
model_type=["sparse"],
|
|
235
235
|
languages=["eng-Latn"],
|
|
236
236
|
open_weights=True,
|
|
237
237
|
revision="98cdcbd72867c547f72f2b7b7bed9cdf9f09922d",
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
import torch
|
|
6
|
+
from tqdm.auto import tqdm
|
|
7
|
+
from transformers import AutoModel, AutoProcessor
|
|
8
|
+
|
|
9
|
+
from mteb._requires_package import requires_image_dependencies
|
|
10
|
+
from mteb.models.abs_encoder import AbsEncoder
|
|
11
|
+
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from torch.utils.data import DataLoader
|
|
15
|
+
|
|
16
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
17
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class OpsColQwen3Wrapper(AbsEncoder):
|
|
21
|
+
"""Wrapper for OpsColQwen3 model."""
|
|
22
|
+
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
model_name: str = "OpenSearch-AI/Ops-Colqwen3-4B",
|
|
26
|
+
revision: str | None = None,
|
|
27
|
+
device: str | None = None,
|
|
28
|
+
attn_implementation: str | None = None,
|
|
29
|
+
**kwargs,
|
|
30
|
+
):
|
|
31
|
+
requires_image_dependencies()
|
|
32
|
+
from transformers.utils.import_utils import is_flash_attn_2_available
|
|
33
|
+
|
|
34
|
+
if attn_implementation is None:
|
|
35
|
+
attn_implementation = (
|
|
36
|
+
"flash_attention_2" if is_flash_attn_2_available() else None
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
|
40
|
+
|
|
41
|
+
self.mdl = AutoModel.from_pretrained(
|
|
42
|
+
model_name,
|
|
43
|
+
device_map=self.device,
|
|
44
|
+
attn_implementation=attn_implementation,
|
|
45
|
+
trust_remote_code=True,
|
|
46
|
+
revision=revision,
|
|
47
|
+
**kwargs,
|
|
48
|
+
)
|
|
49
|
+
self.mdl.eval()
|
|
50
|
+
|
|
51
|
+
self.processor = AutoProcessor.from_pretrained(
|
|
52
|
+
model_name,
|
|
53
|
+
trust_remote_code=True,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
def encode(
|
|
57
|
+
self,
|
|
58
|
+
inputs: DataLoader[BatchedInput],
|
|
59
|
+
*,
|
|
60
|
+
task_metadata: TaskMetadata,
|
|
61
|
+
hf_split: str,
|
|
62
|
+
hf_subset: str,
|
|
63
|
+
prompt_type: PromptType | None = None,
|
|
64
|
+
**kwargs: Any,
|
|
65
|
+
) -> Array:
|
|
66
|
+
text_embeddings = None
|
|
67
|
+
image_embeddings = None
|
|
68
|
+
|
|
69
|
+
if "text" in inputs.dataset.features:
|
|
70
|
+
text_embeddings = self.get_text_embeddings(inputs, **kwargs)
|
|
71
|
+
if "image" in inputs.dataset.features:
|
|
72
|
+
image_embeddings = self.get_image_embeddings(inputs, **kwargs)
|
|
73
|
+
|
|
74
|
+
if text_embeddings is not None and image_embeddings is not None:
|
|
75
|
+
if len(text_embeddings) != len(image_embeddings):
|
|
76
|
+
raise ValueError(
|
|
77
|
+
"The number of texts and images must have the same length"
|
|
78
|
+
)
|
|
79
|
+
fused_embeddings = text_embeddings + image_embeddings
|
|
80
|
+
return fused_embeddings
|
|
81
|
+
elif text_embeddings is not None:
|
|
82
|
+
return text_embeddings
|
|
83
|
+
elif image_embeddings is not None:
|
|
84
|
+
return image_embeddings
|
|
85
|
+
raise ValueError("No text or image inputs found")
|
|
86
|
+
|
|
87
|
+
def encode_input(self, inputs):
|
|
88
|
+
return self.mdl(**inputs)
|
|
89
|
+
|
|
90
|
+
def get_image_embeddings(
|
|
91
|
+
self,
|
|
92
|
+
images: DataLoader,
|
|
93
|
+
batch_size: int = 32,
|
|
94
|
+
**kwargs,
|
|
95
|
+
) -> torch.Tensor:
|
|
96
|
+
import torchvision.transforms.functional as F
|
|
97
|
+
from PIL import Image
|
|
98
|
+
|
|
99
|
+
all_embeds = []
|
|
100
|
+
|
|
101
|
+
with torch.no_grad():
|
|
102
|
+
for batch in tqdm(images, desc="Encoding images"):
|
|
103
|
+
# batch may be list of tensors or PIL
|
|
104
|
+
imgs = [
|
|
105
|
+
F.to_pil_image(b.to(self.device))
|
|
106
|
+
if not isinstance(b, Image.Image)
|
|
107
|
+
else b
|
|
108
|
+
for b in batch["image"]
|
|
109
|
+
]
|
|
110
|
+
inputs = self.processor.process_images(imgs)
|
|
111
|
+
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
|
112
|
+
outs = self.encode_input(inputs)
|
|
113
|
+
all_embeds.extend(outs.cpu().to(torch.float32))
|
|
114
|
+
|
|
115
|
+
padded = torch.nn.utils.rnn.pad_sequence(
|
|
116
|
+
all_embeds, batch_first=True, padding_value=0
|
|
117
|
+
)
|
|
118
|
+
return padded
|
|
119
|
+
|
|
120
|
+
def get_text_embeddings(
|
|
121
|
+
self,
|
|
122
|
+
texts: DataLoader,
|
|
123
|
+
batch_size: int = 32,
|
|
124
|
+
**kwargs,
|
|
125
|
+
) -> torch.Tensor:
|
|
126
|
+
all_embeds = []
|
|
127
|
+
|
|
128
|
+
with torch.no_grad():
|
|
129
|
+
for batch in tqdm(texts, desc="Encoding texts"):
|
|
130
|
+
batch_texts = batch["text"]
|
|
131
|
+
inputs = self.processor.process_queries(batch_texts)
|
|
132
|
+
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
|
133
|
+
outs = self.encode_input(inputs)
|
|
134
|
+
all_embeds.extend(outs.cpu().to(torch.float32))
|
|
135
|
+
|
|
136
|
+
padded = torch.nn.utils.rnn.pad_sequence(
|
|
137
|
+
all_embeds, batch_first=True, padding_value=0
|
|
138
|
+
)
|
|
139
|
+
return padded
|
|
140
|
+
|
|
141
|
+
def similarity(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
|
|
142
|
+
return self.processor.score_multi_vector(a, b, device=self.device)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
OPS_COLQWEN3_TRAINING_DATA = {
|
|
146
|
+
"VDRMultilingualRetrieval",
|
|
147
|
+
# from https://huggingface.co/datasets/vidore/colpali_train_set
|
|
148
|
+
"VidoreDocVQARetrieval",
|
|
149
|
+
"VidoreInfoVQARetrieval",
|
|
150
|
+
"VidoreTatdqaRetrieval",
|
|
151
|
+
"VidoreArxivQARetrieval",
|
|
152
|
+
"docmatix-ir",
|
|
153
|
+
"HotpotQA",
|
|
154
|
+
"FEVER",
|
|
155
|
+
"NQ",
|
|
156
|
+
"MIRACLRetrieval",
|
|
157
|
+
"WebInstructSub", # MathStackExchange and ScienceStackExchange only
|
|
158
|
+
"MrTyDi",
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
multilingual_langs = [
|
|
162
|
+
"afr-Latn",
|
|
163
|
+
"ara-Arab",
|
|
164
|
+
"aze-Latn",
|
|
165
|
+
"bel-Cyrl",
|
|
166
|
+
"bul-Cyrl",
|
|
167
|
+
"ben-Beng",
|
|
168
|
+
"cat-Latn",
|
|
169
|
+
"ceb-Latn",
|
|
170
|
+
"ces-Latn",
|
|
171
|
+
"cym-Latn",
|
|
172
|
+
"dan-Latn",
|
|
173
|
+
"deu-Latn",
|
|
174
|
+
"ell-Grek",
|
|
175
|
+
"eng-Latn",
|
|
176
|
+
"spa-Latn",
|
|
177
|
+
"est-Latn",
|
|
178
|
+
"eus-Latn",
|
|
179
|
+
"fas-Arab",
|
|
180
|
+
"fin-Latn",
|
|
181
|
+
"fra-Latn",
|
|
182
|
+
"glg-Latn",
|
|
183
|
+
"guj-Gujr",
|
|
184
|
+
"heb-Hebr",
|
|
185
|
+
"hin-Deva",
|
|
186
|
+
"hrv-Latn",
|
|
187
|
+
"hat-Latn",
|
|
188
|
+
"hun-Latn",
|
|
189
|
+
"hye-Armn",
|
|
190
|
+
"ind-Latn",
|
|
191
|
+
"isl-Latn",
|
|
192
|
+
"ita-Latn",
|
|
193
|
+
"jpn-Jpan",
|
|
194
|
+
"jav-Latn",
|
|
195
|
+
"kat-Geor",
|
|
196
|
+
"kaz-Cyrl",
|
|
197
|
+
"khm-Khmr",
|
|
198
|
+
"kan-Knda",
|
|
199
|
+
"kor-Hang",
|
|
200
|
+
"kir-Cyrl",
|
|
201
|
+
"lao-Laoo",
|
|
202
|
+
"lit-Latn",
|
|
203
|
+
"lav-Latn",
|
|
204
|
+
"mkd-Cyrl",
|
|
205
|
+
"mal-Mlym",
|
|
206
|
+
"mon-Cyrl",
|
|
207
|
+
"mar-Deva",
|
|
208
|
+
"msa-Latn",
|
|
209
|
+
"mya-Mymr",
|
|
210
|
+
"nep-Deva",
|
|
211
|
+
"nld-Latn",
|
|
212
|
+
"nor-Latn",
|
|
213
|
+
"nob-Latn",
|
|
214
|
+
"nno-Latn",
|
|
215
|
+
"pan-Guru",
|
|
216
|
+
"pol-Latn",
|
|
217
|
+
"por-Latn",
|
|
218
|
+
"que-Latn",
|
|
219
|
+
"ron-Latn",
|
|
220
|
+
"rus-Cyrl",
|
|
221
|
+
"sin-Sinh",
|
|
222
|
+
"slk-Latn",
|
|
223
|
+
"slv-Latn",
|
|
224
|
+
"swa-Latn",
|
|
225
|
+
"tam-Taml",
|
|
226
|
+
"tel-Telu",
|
|
227
|
+
"tha-Thai",
|
|
228
|
+
"tgl-Latn",
|
|
229
|
+
"tur-Latn",
|
|
230
|
+
"ukr-Cyrl",
|
|
231
|
+
"urd-Arab",
|
|
232
|
+
"vie-Latn",
|
|
233
|
+
"yor-Latn",
|
|
234
|
+
"zho-Hans",
|
|
235
|
+
]
|
|
236
|
+
|
|
237
|
+
OPS_COLQWEN3_CITATION = """
|
|
238
|
+
@misc{ops_colqwen3_4b,
|
|
239
|
+
author = {OpenSearch-AI},
|
|
240
|
+
title = {Ops-ColQwen3: State-of-the-Art Multimodal Embedding Model for Visual Document Retrieval},
|
|
241
|
+
year = {2026},
|
|
242
|
+
url = {https://huggingface.co/OpenSearch-AI/Ops-ColQwen3-4B},
|
|
243
|
+
}"""
|
|
244
|
+
|
|
245
|
+
ops_colqwen3_4b = ModelMeta(
|
|
246
|
+
loader=OpsColQwen3Wrapper,
|
|
247
|
+
name="OpenSearch-AI/Ops-Colqwen3-4B",
|
|
248
|
+
loader_kwargs=dict(dtype=torch.float16, trust_remote_code=True),
|
|
249
|
+
languages=multilingual_langs,
|
|
250
|
+
revision="4894b7d451ff33981650acc693bb482dbef302d3",
|
|
251
|
+
release_date="2026-01-24",
|
|
252
|
+
modalities=["image", "text"],
|
|
253
|
+
n_parameters=4_800_000_000,
|
|
254
|
+
memory_usage_mb=9206,
|
|
255
|
+
max_tokens=32768,
|
|
256
|
+
embed_dim=2560,
|
|
257
|
+
license="apache-2.0",
|
|
258
|
+
open_weights=True,
|
|
259
|
+
public_training_code="https://github.com/illuin-tech/colpali",
|
|
260
|
+
public_training_data=None,
|
|
261
|
+
framework=["PyTorch"],
|
|
262
|
+
reference="https://huggingface.co/OpenSearch-AI/Ops-Colqwen3-4B",
|
|
263
|
+
similarity_fn_name=ScoringFunction.MAX_SIM,
|
|
264
|
+
use_instructions=True,
|
|
265
|
+
training_datasets=OPS_COLQWEN3_TRAINING_DATA,
|
|
266
|
+
citation=OPS_COLQWEN3_CITATION,
|
|
267
|
+
)
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
import torch
|
|
7
|
+
from tqdm.auto import tqdm
|
|
8
|
+
|
|
9
|
+
from mteb.models.model_meta import ModelMeta
|
|
10
|
+
|
|
11
|
+
from .rerankers_custom import RerankerWrapper
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from torch.utils.data import DataLoader
|
|
15
|
+
|
|
16
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
17
|
+
from mteb.types import BatchedInput, PromptType
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class QueritWrapper(RerankerWrapper):
|
|
23
|
+
"""
|
|
24
|
+
Multi-GPU / multi-process reranker wrapper for mteb.mteb evaluation.
|
|
25
|
+
Supports flattening all query-passage pairs without explicit grouping.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(
|
|
29
|
+
self,
|
|
30
|
+
model_name: str,
|
|
31
|
+
**kwargs: Any,
|
|
32
|
+
) -> None:
|
|
33
|
+
super().__init__(model_name, **kwargs)
|
|
34
|
+
from transformers import AutoModel, AutoTokenizer
|
|
35
|
+
|
|
36
|
+
if not self.device:
|
|
37
|
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
38
|
+
model_args = {}
|
|
39
|
+
if self.fp_options:
|
|
40
|
+
model_args["torch_dtype"] = self.fp_options
|
|
41
|
+
self.model = AutoModel.from_pretrained(
|
|
42
|
+
model_name, trust_remote_code=True, **model_args
|
|
43
|
+
)
|
|
44
|
+
logger.info(f"Using model {model_name}")
|
|
45
|
+
|
|
46
|
+
self.model.to(self.device)
|
|
47
|
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
|
48
|
+
model_name, trust_remote_code=True
|
|
49
|
+
)
|
|
50
|
+
if "[CLS]" not in self.tokenizer.get_vocab():
|
|
51
|
+
raise ValueError("Tokenizer missing required special token '[CLS]'")
|
|
52
|
+
self.cls_token_id = self.tokenizer.convert_tokens_to_ids("[CLS]")
|
|
53
|
+
self.pad_token_id = self.tokenizer.pad_token_id or 0
|
|
54
|
+
|
|
55
|
+
self.max_length = (
|
|
56
|
+
min(kwargs.get("max_length", 4096), self.tokenizer.model_max_length) - 1
|
|
57
|
+
) # sometimes it's a v large number/max int
|
|
58
|
+
logger.info(f"Using max_length of {self.max_length}, 1 token for [CLS]")
|
|
59
|
+
self.model.eval()
|
|
60
|
+
|
|
61
|
+
def process_inputs(
|
|
62
|
+
self,
|
|
63
|
+
pairs: list[str],
|
|
64
|
+
) -> dict[str, torch.Tensor]:
|
|
65
|
+
"""
|
|
66
|
+
Encode a batch of (query, document) pairs:
|
|
67
|
+
- Concatenate prompt + Query + Content
|
|
68
|
+
- Append [CLS] at the end
|
|
69
|
+
- Left-pad to max_length
|
|
70
|
+
- Generate custom attention mask based on block types
|
|
71
|
+
"""
|
|
72
|
+
# Construct input texts
|
|
73
|
+
enc = self.tokenizer(
|
|
74
|
+
pairs,
|
|
75
|
+
add_special_tokens=False,
|
|
76
|
+
truncation=True,
|
|
77
|
+
max_length=self.max_length,
|
|
78
|
+
padding=False,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
input_ids_list: list[list[int]] = []
|
|
82
|
+
attn_mask_list: list[torch.Tensor] = []
|
|
83
|
+
|
|
84
|
+
for ids in enc["input_ids"]:
|
|
85
|
+
# Append [CLS] token
|
|
86
|
+
ids = ids + [self.cls_token_id]
|
|
87
|
+
block_types = [1] * (len(ids) - 1) + [2] # content + CLS
|
|
88
|
+
|
|
89
|
+
# Pad or truncate
|
|
90
|
+
if len(ids) < self.max_length:
|
|
91
|
+
pad_len = self.max_length - len(ids)
|
|
92
|
+
ids = [self.pad_token_id] * pad_len + ids
|
|
93
|
+
block_types = [0] * pad_len + block_types
|
|
94
|
+
else:
|
|
95
|
+
ids = ids[-self.max_length :]
|
|
96
|
+
block_types = block_types[-self.max_length :]
|
|
97
|
+
|
|
98
|
+
attn = self.compute_mask_content_cls(block_types)
|
|
99
|
+
input_ids_list.append(ids)
|
|
100
|
+
attn_mask_list.append(attn)
|
|
101
|
+
|
|
102
|
+
input_ids = torch.tensor(input_ids_list, dtype=torch.long, device=self.device)
|
|
103
|
+
attention_mask = torch.stack(attn_mask_list, dim=0).to(self.device)
|
|
104
|
+
|
|
105
|
+
return {"input_ids": input_ids, "attention_mask": attention_mask}
|
|
106
|
+
|
|
107
|
+
@torch.inference_mode()
|
|
108
|
+
def predict(
|
|
109
|
+
self,
|
|
110
|
+
inputs1: DataLoader[BatchedInput],
|
|
111
|
+
inputs2: DataLoader[BatchedInput],
|
|
112
|
+
*,
|
|
113
|
+
task_metadata: TaskMetadata,
|
|
114
|
+
hf_split: str,
|
|
115
|
+
hf_subset: str,
|
|
116
|
+
prompt_type: PromptType | None = None,
|
|
117
|
+
**kwargs: Any,
|
|
118
|
+
) -> list[float]:
|
|
119
|
+
"""
|
|
120
|
+
Predict relevance scores for query-passage pairs.
|
|
121
|
+
Supports both single-process and multi-process/multi-GPU modes.
|
|
122
|
+
"""
|
|
123
|
+
# Flatten all pairs from mteb.mteb DataLoaders
|
|
124
|
+
queries = [text for batch in inputs1 for text in batch["text"]]
|
|
125
|
+
passages = [text for batch in inputs2 for text in batch["text"]]
|
|
126
|
+
|
|
127
|
+
instructions = None
|
|
128
|
+
if "instruction" in inputs2.dataset.features:
|
|
129
|
+
instructions = [text for batch in inputs1 for text in batch["instruction"]]
|
|
130
|
+
|
|
131
|
+
num_pairs = len(queries)
|
|
132
|
+
if num_pairs == 0:
|
|
133
|
+
return []
|
|
134
|
+
final_scores: list[float] = []
|
|
135
|
+
|
|
136
|
+
batch_size = kwargs.get("batch_size", self.batch_size)
|
|
137
|
+
with tqdm(total=num_pairs, desc="Scoring", ncols=100) as pbar:
|
|
138
|
+
for start in range(0, num_pairs, batch_size):
|
|
139
|
+
end = min(start + batch_size, num_pairs)
|
|
140
|
+
batch_q = queries[start:end]
|
|
141
|
+
batch_d = passages[start:end]
|
|
142
|
+
|
|
143
|
+
batch_instructions = (
|
|
144
|
+
instructions[start:end]
|
|
145
|
+
if instructions is not None
|
|
146
|
+
else [None] * len(batch_q)
|
|
147
|
+
)
|
|
148
|
+
pairs = [
|
|
149
|
+
self.format_instruction(instr, query, doc)
|
|
150
|
+
for instr, query, doc in zip(batch_instructions, batch_q, batch_d)
|
|
151
|
+
]
|
|
152
|
+
enc = self.process_inputs(pairs)
|
|
153
|
+
out = self.model(**enc)
|
|
154
|
+
scores = out["score"].squeeze(-1).detach().float().cpu().tolist()
|
|
155
|
+
|
|
156
|
+
if not isinstance(scores, list):
|
|
157
|
+
scores = [scores]
|
|
158
|
+
|
|
159
|
+
final_scores.extend(scores)
|
|
160
|
+
pbar.update(len(scores))
|
|
161
|
+
|
|
162
|
+
return final_scores
|
|
163
|
+
|
|
164
|
+
@staticmethod
|
|
165
|
+
def format_instruction(instruction: str | None, query: str, doc: str) -> str:
|
|
166
|
+
if instruction is None:
|
|
167
|
+
output = f"Judge whether the Content meets the requirements based on the Query. Query: {query}; Content: {doc}"
|
|
168
|
+
else:
|
|
169
|
+
output = f"{instruction} Query: {query}; Content: {doc}"
|
|
170
|
+
return output
|
|
171
|
+
|
|
172
|
+
@staticmethod
|
|
173
|
+
def compute_mask_content_cls(block_types: list[int]) -> torch.Tensor:
|
|
174
|
+
"""
|
|
175
|
+
Create custom attention mask based on token block types:
|
|
176
|
+
- 0: padding → ignored
|
|
177
|
+
- 1: content → causal attention to previous content only
|
|
178
|
+
- 2: [CLS] → causal attention to all non-padding tokens
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
block_types: List of token types for one sequence
|
|
182
|
+
|
|
183
|
+
Returns:
|
|
184
|
+
[1, seq_len, seq_len] boolean attention mask (True = allowed to attend)
|
|
185
|
+
"""
|
|
186
|
+
pos = torch.tensor(block_types, dtype=torch.long)
|
|
187
|
+
n = pos.shape[0]
|
|
188
|
+
if n == 0:
|
|
189
|
+
return torch.empty((0, 0), dtype=torch.bool, device=pos.device)
|
|
190
|
+
|
|
191
|
+
row_types = pos.view(n, 1)
|
|
192
|
+
col_types = pos.view(1, n)
|
|
193
|
+
|
|
194
|
+
row_idx = torch.arange(n, device=pos.device).view(n, 1)
|
|
195
|
+
col_idx = torch.arange(n, device=pos.device).view(1, n)
|
|
196
|
+
causal_mask = col_idx <= row_idx
|
|
197
|
+
|
|
198
|
+
# Content tokens only attend to previous content
|
|
199
|
+
mask_content = (row_types == 1) & (col_types == 1) & causal_mask
|
|
200
|
+
|
|
201
|
+
# [CLS] attends to all non-pad tokens (causal)
|
|
202
|
+
mask_cls = (row_types == 2) & (col_types != 0) & causal_mask
|
|
203
|
+
|
|
204
|
+
type_mask = mask_content | mask_cls
|
|
205
|
+
return type_mask.unsqueeze(0)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# Identifiers of the datasets used to train the Querit reranker; this set is
# passed as `training_datasets` to the `ModelMeta` defined further down.
# The trailing comment on each entry gives the source of the data.
# NOTE(review): the first entry is spelled "MIRACLRanking" while the linked
# dataset is named "MIRACLReranking" - confirm which identifier is intended.
querit_reranker_training_data = {
    "MIRACLRanking",  # https://huggingface.co/datasets/mteb/MIRACLReranking
    "MrTidyRetrieval",  # https://huggingface.co/datasets/mteb/mrtidy
    "ruri-v3-dataset-reranker",  # https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-reranker
    "MultiLongDocReranking",  # https://huggingface.co/datasets/Shitao/MLDR
    "MindSmallReranking",  # https://huggingface.co/datasets/mteb/MindSmallReranking
    "MSMARCO",  # https://huggingface.co/datasets/mteb/msmarco
    "CQADupStack",  # https://huggingface.co/datasets/mteb/cqadupstack-*
    "AskUbuntuDupQuestions",  # https://github.com/taolei87/askubuntu & The corpus and queries that overlap with mteb/askubuntudupquestions-reranking have been removed.
    "T2Reranking",  # https://huggingface.co/datasets/THUIR/T2Ranking & The corpus and queries that overlap with mteb/T2Reranking have been removed.
}
|
|
219
|
+
|
|
220
|
+
# Registry entry for the Querit/Querit cross-encoder reranker, loaded through
# `QueritWrapper` with bfloat16 weights (`fp_options`).
model_meta = ModelMeta(
    loader=QueritWrapper,
    loader_kwargs={
        "fp_options": "bfloat16",
    },
    name="Querit/Querit",
    model_type=["cross-encoder"],
    languages=["eng-Latn"],
    open_weights=True,
    # Pinned Hub commit so evaluations are reproducible.
    revision="5ad2649cc4defb7e1361262260e9a781f14b08bc",
    release_date="2026-01-24",
    n_parameters=4919636992,
    n_embedding_parameters=131907584,
    embed_dim=1024,
    # ~ n_parameters * 2 bytes / 2**20, i.e. consistent with bfloat16 weights.
    memory_usage_mb=9383.0,
    max_tokens=4096,
    reference="https://huggingface.co/Querit/Querit",
    # A cross-encoder scores pairs directly, so no embedding similarity applies.
    similarity_fn_name=None,
    training_datasets=querit_reranker_training_data,
    license="apache-2.0",
    framework=["PyTorch"],
    # NOTE(review): the wrapper's `format_instruction` accepts an instruction;
    # confirm whether `use_instructions` should stay unset.
    use_instructions=None,
    public_training_code=None,
    public_training_data=None,
    citation=None,
)
|
mteb/models/model_meta.py
CHANGED
|
@@ -71,7 +71,7 @@ FRAMEWORKS = Literal[
|
|
|
71
71
|
"Transformers",
|
|
72
72
|
]
|
|
73
73
|
|
|
74
|
-
MODEL_TYPES = Literal["dense", "cross-encoder", "late-interaction"]
|
|
74
|
+
MODEL_TYPES = Literal["dense", "cross-encoder", "late-interaction", "sparse"]
|
|
75
75
|
|
|
76
76
|
|
|
77
77
|
class ScoringFunction(HelpfulStrEnum):
|
|
@@ -266,7 +266,7 @@ class ModelMeta(BaseModel):
|
|
|
266
266
|
@field_validator("name")
|
|
267
267
|
@classmethod
|
|
268
268
|
def _check_name(cls, v: str | None) -> str | None:
|
|
269
|
-
if v is None
|
|
269
|
+
if v is None:
|
|
270
270
|
return v
|
|
271
271
|
if "/" not in v:
|
|
272
272
|
raise ValueError(
|
|
@@ -302,6 +302,121 @@ class ModelMeta(BaseModel):
|
|
|
302
302
|
raise ValueError("Model name is not set")
|
|
303
303
|
return self.name.replace("/", "__").replace(" ", "_")
|
|
304
304
|
|
|
305
|
+
@classmethod
def _detect_cross_encoder_or_dense(
    cls,
    model_name: str,
    revision: str | None,
    sentence_transformers_loader: Callable[..., MTEBModels],
    cross_encoder_loader: Callable[..., MTEBModels],
) -> tuple[Callable[..., MTEBModels] | None, MODEL_TYPES]:
    """Decide between the cross-encoder and the dense loader from ``config.json``.

    Args:
        model_name: Hub repository name of the model.
        revision: Model revision passed through to the Hub lookup.
        sentence_transformers_loader: Loader returned for dense models.
        cross_encoder_loader: Loader returned for cross-encoder models.

    Returns:
        ``(cross_encoder_loader, "cross-encoder")`` when an architecture ends
        with ``ForSequenceClassification`` or the model is recognised by
        ``_is_causal_lm_reranker``; otherwise
        ``(sentence_transformers_loader, "dense")``, which is also the result
        when ``config.json`` could not be loaded.
    """
    config = _get_json_from_hub(
        model_name, "config.json", "model", revision=revision
    )

    if not config:
        logger.warning(
            f"Could not load config.json for {model_name}. "
            "Defaulting to SentenceTransformer loader."
        )
        return sentence_transformers_loader, "dense"

    # `architectures` may be missing or explicitly null in config.json;
    # `or []` covers both so the checks below never iterate over None.
    architectures = config.get("architectures") or []

    is_sequence_classifier = any(
        arch.endswith("ForSequenceClassification") for arch in architectures
    )
    if is_sequence_classifier or cls._is_causal_lm_reranker(
        architectures, config, model_name
    ):
        return cross_encoder_loader, "cross-encoder"

    logger.info(
        f"Model {model_name} does not have modules.json or recognized architecture. "
        "Defaulting to SentenceTransformer loader."
    )
    return sentence_transformers_loader, "dense"
|
|
341
|
+
|
|
342
|
+
@staticmethod
|
|
343
|
+
def _is_causal_lm_reranker(
|
|
344
|
+
architectures: list[str], config: dict[str, Any], model_name: str
|
|
345
|
+
) -> bool:
|
|
346
|
+
"""Check if model is a CausalLM-style reranker."""
|
|
347
|
+
is_causal_lm = any(arch.endswith("ForCausalLM") for arch in architectures)
|
|
348
|
+
|
|
349
|
+
if not is_causal_lm:
|
|
350
|
+
return False
|
|
351
|
+
|
|
352
|
+
num_labels = config.get("num_labels", 0)
|
|
353
|
+
model_name_lower = model_name.lower()
|
|
354
|
+
|
|
355
|
+
return (
|
|
356
|
+
num_labels > 0
|
|
357
|
+
or "rerank" in model_name_lower
|
|
358
|
+
or "cross-encoder" in model_name_lower
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
@classmethod
def _detect_model_type_and_loader(
    cls,
    model_name: str | None,
    revision: str | None = None,
) -> tuple[Callable[..., MTEBModels] | None, MODEL_TYPES]:
    """Detect the model type and appropriate loader based on HuggingFace Hub configuration files.

    This follows the Sentence Transformers architecture detection logic:
    1. Check for modules.json - If present, model is treated as a
       SentenceTransformer (dense encoder)
    2. If no modules.json, check config.json for architecture:
        - ForSequenceClassification → CrossEncoder
        - CausalLM with reranking indicators → CrossEncoder
    3. Default to dense (SentenceTransformer) if no clear indicators are found

    Detection for CausalLM-style rerankers:
    - Model has ForCausalLM architecture AND
        - Has num_labels > 0 in config, OR
        - Model name contains "rerank" or "cross-encoder"

    Args:
        model_name: The HuggingFace model name (can be None)
        revision: The model revision

    Returns:
        A tuple of (loader_function, model_type) where:
        - loader_function: A callable that returns MTEBModels. The
          SentenceTransformer loader is the fallback when the name is empty,
          the repository does not exist, or detection raises.
        - model_type: "dense" or "cross-encoder" (the only values this
          detection produces)
    """
    # Imported at call time rather than module level.
    # NOTE(review): presumably to avoid a circular import with mteb.models - confirm.
    from mteb.models import CrossEncoderWrapper, sentence_transformers_loader

    if not model_name or not _repo_exists(model_name):
        return sentence_transformers_loader, "dense"

    try:
        modules_config = _get_json_from_hub(
            model_name, "modules.json", "model", revision=revision
        )

        # modules.json marks a SentenceTransformer/SparseEncoder repository;
        # sparse encoders are not supported for now, so both map to dense.
        if modules_config:
            return sentence_transformers_loader, "dense"

        return cls._detect_cross_encoder_or_dense(
            model_name,
            revision,
            sentence_transformers_loader,
            cross_encoder_loader=CrossEncoderWrapper,
        )
    except Exception as e:
        # Best-effort detection: any failure falls back to the dense loader.
        logger.warning(
            f"Error detecting model type for {model_name}: {e}. "
            "Defaulting to SentenceTransformer loader."
        )
        return sentence_transformers_loader, "dense"
|
|
419
|
+
|
|
305
420
|
@classmethod
|
|
306
421
|
def _from_hub(
|
|
307
422
|
cls,
|
|
@@ -319,9 +434,11 @@ class ModelMeta(BaseModel):
|
|
|
319
434
|
Returns:
|
|
320
435
|
The generated ModelMeta.
|
|
321
436
|
"""
|
|
322
|
-
|
|
437
|
+
loader: Callable[..., MTEBModels] | None
|
|
438
|
+
model_type: MODEL_TYPES
|
|
439
|
+
|
|
440
|
+
loader, model_type = cls._detect_model_type_and_loader(model_name, revision)
|
|
323
441
|
|
|
324
|
-
loader = sentence_transformers_loader
|
|
325
442
|
frameworks: list[FRAMEWORKS] = ["PyTorch"]
|
|
326
443
|
model_license = None
|
|
327
444
|
reference = None
|
|
@@ -363,6 +480,7 @@ class ModelMeta(BaseModel):
|
|
|
363
480
|
return cls(
|
|
364
481
|
loader=loader,
|
|
365
482
|
name=model_name or "no_model_name/available",
|
|
483
|
+
model_type=[model_type],
|
|
366
484
|
revision=revision or "no_revision_available",
|
|
367
485
|
reference=reference,
|
|
368
486
|
release_date=release_date,
|
mteb/results/model_result.py
CHANGED
|
@@ -17,6 +17,7 @@ from .task_result import TaskError, TaskResult
|
|
|
17
17
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
19
|
from collections.abc import Callable, Iterable
|
|
20
|
+
from pathlib import Path
|
|
20
21
|
|
|
21
22
|
from mteb.abstasks.abstask import AbsTask
|
|
22
23
|
from mteb.abstasks.task_metadata import (
|
|
@@ -417,3 +418,25 @@ class ModelResult(BaseModel):
|
|
|
417
418
|
if not mods:
|
|
418
419
|
mods = self.default_modalities
|
|
419
420
|
return list(set(mods))
|
|
421
|
+
|
|
422
|
+
def to_disk(self, path: Path) -> None:
    """Save ModelResult to disk as JSON.

    The file is written as UTF-8 so that it round-trips with ``from_disk``,
    which reads UTF-8, regardless of the platform's default encoding.

    Args:
        path: The path to the file to save.
    """
    path.write_text(self.model_dump_json(indent=2), encoding="utf-8")
|
|
430
|
+
|
|
431
|
+
@classmethod
def from_disk(cls, path: Path) -> ModelResult:
    """Read a JSON file and validate it into a ModelResult.

    Args:
        path: Location of the JSON file; it is decoded as UTF-8.

    Returns:
        The ModelResult built from the file contents.
    """
    raw_json = path.read_text(encoding="utf-8")
    return cls.model_validate_json(raw_json)
|
mteb/results/task_result.py
CHANGED
|
@@ -337,16 +337,16 @@ class TaskResult(BaseModel):
|
|
|
337
337
|
The loaded TaskResult object.
|
|
338
338
|
"""
|
|
339
339
|
with path.open("r", encoding="utf-8") as f:
|
|
340
|
-
|
|
340
|
+
json_str = f.read()
|
|
341
341
|
|
|
342
342
|
if not load_historic_data:
|
|
343
343
|
try:
|
|
344
|
-
return cls.
|
|
344
|
+
return cls.model_validate_json(json_str)
|
|
345
345
|
except Exception as e:
|
|
346
346
|
raise ValueError(
|
|
347
347
|
f"Error loading TaskResult from disk. You can try to load historic data by setting `load_historic_data=True`. Error: {e}"
|
|
348
348
|
)
|
|
349
|
-
|
|
349
|
+
data = json.loads(json_str)
|
|
350
350
|
pre_1_11_load = (
|
|
351
351
|
(
|
|
352
352
|
"mteb_version" in data
|
|
@@ -357,7 +357,7 @@ class TaskResult(BaseModel):
|
|
|
357
357
|
) # assume it is before 1.11.0 if the version is not present
|
|
358
358
|
|
|
359
359
|
try:
|
|
360
|
-
obj: TaskResult = cls.
|
|
360
|
+
obj: TaskResult = cls.model_validate_json(json_str)
|
|
361
361
|
except Exception as e:
|
|
362
362
|
if not pre_1_11_load:
|
|
363
363
|
raise e
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import datasets
|
|
2
|
-
|
|
3
1
|
from mteb.abstasks.pair_classification import AbsTaskPairClassification
|
|
4
2
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
5
3
|
|
|
@@ -8,8 +6,8 @@ class FarsTail(AbsTaskPairClassification):
|
|
|
8
6
|
metadata = TaskMetadata(
|
|
9
7
|
name="FarsTail",
|
|
10
8
|
dataset={
|
|
11
|
-
"path": "
|
|
12
|
-
"revision": "
|
|
9
|
+
"path": "mteb/FarsTail",
|
|
10
|
+
"revision": "0fa0863dc160869b5a2d78803b4440ea3c671ff5",
|
|
13
11
|
},
|
|
14
12
|
description="This dataset, named FarsTail, includes 10,367 samples which are provided in both the Persian language as well as the indexed format to be useful for non-Persian researchers. The samples are generated from 3,539 multiple-choice questions with the least amount of annotator interventions in a way similar to the SciTail dataset",
|
|
15
13
|
reference="https://link.springer.com/article/10.1007/s00500-023-08959-3",
|
|
@@ -37,33 +35,3 @@ class FarsTail(AbsTaskPairClassification):
|
|
|
37
35
|
}
|
|
38
36
|
""", # after removing neutral
|
|
39
37
|
)
|
|
40
|
-
|
|
41
|
-
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
42
|
-
if self.data_loaded:
|
|
43
|
-
return
|
|
44
|
-
path = self.metadata.dataset["path"]
|
|
45
|
-
revision = self.metadata.dataset["revision"]
|
|
46
|
-
data_files = {
|
|
47
|
-
"test": f"https://huggingface.co/datasets/{path}/resolve/{revision}/data/Test-word.csv"
|
|
48
|
-
}
|
|
49
|
-
self.dataset = datasets.load_dataset(
|
|
50
|
-
"csv", data_files=data_files, delimiter="\t"
|
|
51
|
-
)
|
|
52
|
-
self.dataset_transform()
|
|
53
|
-
self.data_loaded = True
|
|
54
|
-
|
|
55
|
-
def dataset_transform(self, num_proc: int = 1):
|
|
56
|
-
_dataset = {}
|
|
57
|
-
self.dataset = self.dataset.filter(lambda x: x["label"] != "n")
|
|
58
|
-
self.dataset = self.dataset.map(
|
|
59
|
-
lambda example: {"label": 1 if example["label"] == "e" else 0}
|
|
60
|
-
)
|
|
61
|
-
for split in self.metadata.eval_splits:
|
|
62
|
-
_dataset[split] = [
|
|
63
|
-
{
|
|
64
|
-
"sentence1": self.dataset[split]["premise"],
|
|
65
|
-
"sentence2": self.dataset[split]["hypothesis"],
|
|
66
|
-
"labels": self.dataset[split]["label"],
|
|
67
|
-
}
|
|
68
|
-
]
|
|
69
|
-
self.dataset = _dataset
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mteb
|
|
3
|
-
Version: 2.7.
|
|
3
|
+
Version: 2.7.14
|
|
4
4
|
Summary: Massive Text Embedding Benchmark
|
|
5
5
|
Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
|
|
6
6
|
Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
|
|
@@ -32,8 +32,6 @@ Requires-Dist: rich>=0.0.0
|
|
|
32
32
|
Requires-Dist: pytrec-eval-terrier>=0.5.6
|
|
33
33
|
Requires-Dist: pydantic>=2.0.0
|
|
34
34
|
Requires-Dist: polars>=0.20.22
|
|
35
|
-
Requires-Dist: torch; python_full_version < "3.14"
|
|
36
|
-
Requires-Dist: torch>=2.9.0; python_full_version >= "3.14"
|
|
37
35
|
Provides-Extra: image
|
|
38
36
|
Requires-Dist: torchvision>0.2.1; extra == "image"
|
|
39
37
|
Requires-Dist: transformers[torch-vision,vision]; extra == "image"
|
|
@@ -40,7 +40,7 @@ mteb/abstasks/clustering.py,sha256=I8vre2f2FJFagzJEYf6hKDo3Y28xU29J_O-MhfqWqSI,1
|
|
|
40
40
|
mteb/abstasks/clustering_legacy.py,sha256=sbx8K6paccvzDPnmhgNE_UJE83orAJnQm3NGr-Ktjfs,9184
|
|
41
41
|
mteb/abstasks/dataset_card_template.md,sha256=aD6l8qc3_jxwoIGJNYLzse-jpRa8hu92AxpnUtNgges,5122
|
|
42
42
|
mteb/abstasks/multilabel_classification.py,sha256=rFa_Pw2OsUzqhZS-jh2zFD7I-TNl8bVNJ-DW7EpPapU,9708
|
|
43
|
-
mteb/abstasks/pair_classification.py,sha256=
|
|
43
|
+
mteb/abstasks/pair_classification.py,sha256=RVV5WUjs18N5PbWpyxakDNEd1UlRc4ON9I0OjD26Z78,14231
|
|
44
44
|
mteb/abstasks/regression.py,sha256=ZuMZfOwU3G4hr__eHsgdagKKdrbN4-wQMLz45jr9YUc,8946
|
|
45
45
|
mteb/abstasks/retrieval.py,sha256=BPyRibStAD70JfR0Z1x-VVVfzJDRVSmbOS6uREfpmok,27743
|
|
46
46
|
mteb/abstasks/retrieval_dataset_loaders.py,sha256=p0y1nrWlUrt_aeoR4ocDLEQMLuD_SlMH0gBiUsOwrww,9983
|
|
@@ -1479,9 +1479,9 @@ mteb/leaderboard/table.py,sha256=U5mWtrVUTk_6t8T4KAp5qlbFgKh1PD0iKICqNMfhsoY,104
|
|
|
1479
1479
|
mteb/leaderboard/text_segments.py,sha256=iMIkS04QQjPbT-SkU0x6fOcS8xRbUYevryu9HydipKM,6570
|
|
1480
1480
|
mteb/models/__init__.py,sha256=ABTuoqiBjBtBWW3LYY7ItBHdylR6jWoy06HH0g6j6fU,910
|
|
1481
1481
|
mteb/models/abs_encoder.py,sha256=We9HlwWP61P4cMyZ080gywvDErA1eVsU9t46PtcNrCM,16830
|
|
1482
|
-
mteb/models/get_model_meta.py,sha256=
|
|
1482
|
+
mteb/models/get_model_meta.py,sha256=WRWnVIT1n7i63BYlBRB-8BpYNtHxn7KMJOm5mzlJ8xI,7211
|
|
1483
1483
|
mteb/models/instruct_wrapper.py,sha256=XAvvbPnXiTxKhFbmusm2uS8E9BMq8QXRSzQQI1jqKzE,9781
|
|
1484
|
-
mteb/models/model_meta.py,sha256=
|
|
1484
|
+
mteb/models/model_meta.py,sha256=E6mBB_inz9kMO8z3ixgGuB9QKWUYYzW44gSZwnY3ZbI,37316
|
|
1485
1485
|
mteb/models/models_protocols.py,sha256=HTB4-SYa3SeJXMMSA8o05lHTiLBbq314VW60K_PfcZY,9509
|
|
1486
1486
|
mteb/models/search_wrappers.py,sha256=PXE1VVDWUd0LgTPJ-FxqIbGpIDWLRKo5CjrwIuu5nzw,21567
|
|
1487
1487
|
mteb/models/sentence_transformer_wrapper.py,sha256=RsOxj-b7qzeYcxUTVJyb-lZDY4bINl4jEAEkPvKYB10,13578
|
|
@@ -1505,7 +1505,7 @@ mteb/models/model_implementations/bge_models.py,sha256=JuO1FRWrsqlsM_jslQ96oVsD3
|
|
|
1505
1505
|
mteb/models/model_implementations/bica_model.py,sha256=Yx3iZrXF6ZMJS9SH5lbzNHoUWGNH3dypRtZ7dX5o7rA,1305
|
|
1506
1506
|
mteb/models/model_implementations/blip2_models.py,sha256=C6egwozJthHmv92I0SWID3-sQCPROPJP0TzfQVKNzlo,7898
|
|
1507
1507
|
mteb/models/model_implementations/blip_models.py,sha256=D_9e7C8GXGST8k7dMJL20x984vMeqbITu36XASi-iUU,12149
|
|
1508
|
-
mteb/models/model_implementations/bm25.py,sha256
|
|
1508
|
+
mteb/models/model_implementations/bm25.py,sha256=IAKU8syYesN7seRQLII-c1ACq6BRz5Ql6nEQEXYWLwQ,5226
|
|
1509
1509
|
mteb/models/model_implementations/bmretriever_models.py,sha256=rijCIzX6nO5kNXqxEFbZrV7bsZtmKs8RIkMqa5cPWTk,7078
|
|
1510
1510
|
mteb/models/model_implementations/cadet_models.py,sha256=gXIfW9MkGYFhOhsrq5a_tQcPuth13Dh1dO1KySwVxyo,2305
|
|
1511
1511
|
mteb/models/model_implementations/cde_models.py,sha256=l4E6h1hcsNY1GTXoCgQDoeG5dRcEl7JTOiiWmp6FYqg,9373
|
|
@@ -1537,7 +1537,7 @@ mteb/models/model_implementations/granite_vision_embedding_models.py,sha256=jxyR
|
|
|
1537
1537
|
mteb/models/model_implementations/gritlm_models.py,sha256=756vgZGADy5FhKlFuzuD6huevC_AYD5b88V1Y5yFht8,3241
|
|
1538
1538
|
mteb/models/model_implementations/gte_models.py,sha256=-ASkoAuAiVytVtsYMtuKonUf39i0U69HSEnJy_-PwXA,14574
|
|
1539
1539
|
mteb/models/model_implementations/hinvec_models.py,sha256=SYWGFr8XALmM7B9tIHEQnrqq9kZOZIBkW7m7QpzerHI,1756
|
|
1540
|
-
mteb/models/model_implementations/human.py,sha256=
|
|
1540
|
+
mteb/models/model_implementations/human.py,sha256=k7vN6WTcSWyWS9wnluzr6yCOjuMi5LupQnT-4cfzNOk,600
|
|
1541
1541
|
mteb/models/model_implementations/ibm_granite_models.py,sha256=ipLRDBerTQiL5NaoaDho410Fzy7eNFlF3jB54hGZrwI,8687
|
|
1542
1542
|
mteb/models/model_implementations/inf_models.py,sha256=q_hNNhzMjAxbnJnAT0N6KaNegX_3XZlmz-LXY5C891I,3093
|
|
1543
1543
|
mteb/models/model_implementations/jasper_models.py,sha256=ourAMx1_L6b2AxX046wQcxDqvYzY1Mx3gaHww0WaMA8,16476
|
|
@@ -1565,14 +1565,15 @@ mteb/models/model_implementations/model2vec_models.py,sha256=qXcPhV0hGRFBsvRBrb8
|
|
|
1565
1565
|
mteb/models/model_implementations/moka_models.py,sha256=4Esujv_fVJjHuX1nRH6sGtmrmF04A90F4Xo2uN0YTzs,5205
|
|
1566
1566
|
mteb/models/model_implementations/nbailab.py,sha256=iv2xdqVM5HoTAlBR6e_UdzJu6rSPujqWXFYwyCv69hU,2684
|
|
1567
1567
|
mteb/models/model_implementations/no_instruct_sentence_models.py,sha256=DTb-eHZYSY6lGJkkdkC0tZ_n0GHLQwVlUehVg59T5N4,4198
|
|
1568
|
-
mteb/models/model_implementations/nomic_models.py,sha256=
|
|
1568
|
+
mteb/models/model_implementations/nomic_models.py,sha256=BO6XQbX4PFa5By0opAYkxz95CcHmjxbG5DYcklxJ1l8,16986
|
|
1569
1569
|
mteb/models/model_implementations/nomic_models_vision.py,sha256=AzTCWbXBonUAVub0TTxWCsBtg4WYex3vPiLlz3ULdHc,6916
|
|
1570
|
-
mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py,sha256
|
|
1570
|
+
mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py,sha256=-5_kd9jeDcgVv9gdwWuvX_-bNQdhAxInf1Mqo8_BdS8,10653
|
|
1571
1571
|
mteb/models/model_implementations/nvidia_models.py,sha256=r-AW1dVQbteWjexjvZgFEt_90OHNRYer_5GLuqSXRS0,26924
|
|
1572
1572
|
mteb/models/model_implementations/octen_models.py,sha256=5z-t2O-iIFiOOLdZ_AK9f7GrVRg-9_vx3JNAG9dJNPE,8562
|
|
1573
1573
|
mteb/models/model_implementations/openai_models.py,sha256=y1wMknrrcu1L5CNwniG0mFThPVMON1c2Fj22jkKsw7Y,9730
|
|
1574
1574
|
mteb/models/model_implementations/openclip_models.py,sha256=z2gQum16O0QhJPyxqKor3oO-_uWfnep6wSXqOFQQ2Q8,11969
|
|
1575
|
-
mteb/models/model_implementations/opensearch_neural_sparse_models.py,sha256=
|
|
1575
|
+
mteb/models/model_implementations/opensearch_neural_sparse_models.py,sha256=J5FEvKWQUiBusL6PHcrRuRRJOQ-iMwOSu1fX0pblXhk,8941
|
|
1576
|
+
mteb/models/model_implementations/ops_colqwen3_models.py,sha256=5vg5d1_WfVGMgtIwkh6zf2-Paum6V35XcKEvLfRyRzs,7437
|
|
1576
1577
|
mteb/models/model_implementations/ops_moa_models.py,sha256=Ah7L78mqC9pH8t6sf1OWXOLjouVUpAutt6lZ0np7eMM,2655
|
|
1577
1578
|
mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py,sha256=xv1ftJeMND4lpeKYC3RLQB4nhdiYy0wCxrzEjUj4gSg,1114
|
|
1578
1579
|
mteb/models/model_implementations/pawan_models.py,sha256=iyzh6NSPZKU9znJYEDPjJNIqvkyuKPAol5TcILuq1Is,1225
|
|
@@ -1582,6 +1583,7 @@ mteb/models/model_implementations/promptriever_models.py,sha256=tDXysEKbvoDNtidV
|
|
|
1582
1583
|
mteb/models/model_implementations/pylate_models.py,sha256=EwpDJf5yjONmmiv9zHSEbc3e7kMRJUHYv7K59QTrNAE,17525
|
|
1583
1584
|
mteb/models/model_implementations/qodo_models.py,sha256=AwYRGctJbjEkcIh1pcSeeEXYiYiizAbfTyw6CaZBJfU,2224
|
|
1584
1585
|
mteb/models/model_implementations/qtack_models.py,sha256=vw_2O4ZABR-_nYV4g1Ud8bW1DTP-wwtQS2eGqN70vT4,1304
|
|
1586
|
+
mteb/models/model_implementations/querit_models.py,sha256=P7lAw5IDe47DA_5srMwGPqxjMIFuvOW0BJ7xwB4GOro,8917
|
|
1585
1587
|
mteb/models/model_implementations/qwen3_models.py,sha256=857UnUEil9o8xcw7vSr2fMRlEegyE2Q86e5yLeRL_mQ,5517
|
|
1586
1588
|
mteb/models/model_implementations/qzhou_models.py,sha256=mfG70JrNJCo-s3MykRn6lg9gFPcKMeMI7Y8VrBhNo7I,3684
|
|
1587
1589
|
mteb/models/model_implementations/random_baseline.py,sha256=YsITQoLbea_Iz2X84WNGBGkhlsQ3hB7yx1oJwXghimE,7561
|
|
@@ -1627,8 +1629,8 @@ mteb/models/search_encoder_index/search_indexes/__init__.py,sha256=Wm60_oUemUpFs
|
|
|
1627
1629
|
mteb/models/search_encoder_index/search_indexes/faiss_search_index.py,sha256=jwC-3swhnILZnVHUrMR7Ts78TuYtVRxPusF02UV1g6E,5770
|
|
1628
1630
|
mteb/results/__init__.py,sha256=EXQqK4Am5eIYzD52dpcGAFSdqnC38oE6JHN302oidHc,158
|
|
1629
1631
|
mteb/results/benchmark_results.py,sha256=unBUBJ92ud0UXlkZJLn71WVcf-oUlF6XcITTccz5OBA,20318
|
|
1630
|
-
mteb/results/model_result.py,sha256=
|
|
1631
|
-
mteb/results/task_result.py,sha256=
|
|
1632
|
+
mteb/results/model_result.py,sha256=h894O5-RSCOF8XNpXMuhBCqnj43T-1K5Y1el_fyrzP4,15954
|
|
1633
|
+
mteb/results/task_result.py,sha256=Wi5MRQBkb0Qddhc4nLzrrjm1nGlCh8aq4_VCQoxGmNg,34300
|
|
1632
1634
|
mteb/tasks/__init__.py,sha256=izAxU0ip1F_YUwx0dFCuN35BaktdmePh6vlDiHC0kLo,503
|
|
1633
1635
|
mteb/tasks/aggregated_tasks/__init__.py,sha256=Ufgbh1AirxCQkojO3AUhUFWM8zQG10cfdVTkj_PeyLI,104
|
|
1634
1636
|
mteb/tasks/aggregated_tasks/eng/__init__.py,sha256=HgaSyAX8Is5CGE006RgJkLQQVxrx2FmMnm6NHQBDi-4,358
|
|
@@ -2111,7 +2113,7 @@ mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py,sha256=0YjKK4C47Uu
|
|
|
2111
2113
|
mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py,sha256=M9B3JCFt4L6yEnd8S-o-W-FtCSMdl0h1KST3fqApEVA,1796
|
|
2112
2114
|
mteb/tasks/pair_classification/fas/__init__.py,sha256=1Bbr5ZKSjpPuJb9zvk7OSd2Krdh1bpxJjVNLNPFT4Ck,440
|
|
2113
2115
|
mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py,sha256=1i8phewQffpIxzWtBWQFUisHu3XhBpk9Sf-IkwM8jNg,10932
|
|
2114
|
-
mteb/tasks/pair_classification/fas/fars_tail.py,sha256=
|
|
2116
|
+
mteb/tasks/pair_classification/fas/fars_tail.py,sha256=jb-6UW0Lk7YxdMMCZsMavY6CRiv3T6MFrbvlPd0vPPk,1676
|
|
2115
2117
|
mteb/tasks/pair_classification/hye/__init__.py,sha256=hU4xSf6kyKhD4o4CuNMQNE1w9FKv8tkkqvYvhpMV5Kg,93
|
|
2116
2118
|
mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py,sha256=Ezi604W-cHOLDm8O9j3yq9z-GzDt9OWI9jgyqVjY9M4,1437
|
|
2117
2119
|
mteb/tasks/pair_classification/ind/__init__.py,sha256=iXGvZ6eNgGhyD2wgbkvV-bpPPCJNxlE5eq_qvF2Y_UI,53
|
|
@@ -2644,9 +2646,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
|
|
|
2644
2646
|
mteb/types/_result.py,sha256=UKNokV9pu3G74MGebocU512aU_fFU9I9nPKnrG9Q0iE,1035
|
|
2645
2647
|
mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
|
|
2646
2648
|
mteb/types/statistics.py,sha256=gElgSShKBXpfcqaZHhU_d2UHln1CyzUj8FN8KFun_UA,4087
|
|
2647
|
-
mteb-2.7.
|
|
2648
|
-
mteb-2.7.
|
|
2649
|
-
mteb-2.7.
|
|
2650
|
-
mteb-2.7.
|
|
2651
|
-
mteb-2.7.
|
|
2652
|
-
mteb-2.7.
|
|
2649
|
+
mteb-2.7.14.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
2650
|
+
mteb-2.7.14.dist-info/METADATA,sha256=ZTD9D9Fuy9OCRxIXSZzh1bObP0PKSXUMqI4j3XVNR_c,14348
|
|
2651
|
+
mteb-2.7.14.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
2652
|
+
mteb-2.7.14.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
|
|
2653
|
+
mteb-2.7.14.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
|
|
2654
|
+
mteb-2.7.14.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|