mteb-2.3.10-py3-none-any.whl → mteb-2.4.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +57 -0
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/evaluate.py +2 -33
- mteb/leaderboard/figures.py +1 -1
- mteb/leaderboard/table.py +1 -11
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/get_model_meta.py +3 -123
- mteb/models/instruct_wrapper.py +2 -1
- mteb/models/model_implementations/bica_model.py +34 -0
- mteb/models/model_implementations/colpali_models.py +7 -2
- mteb/models/model_implementations/colqwen_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +9 -5
- mteb/models/model_implementations/google_models.py +10 -0
- mteb/models/model_implementations/granite_vision_embedding_models.py +6 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/mod_models.py +204 -0
- mteb/models/model_implementations/nomic_models.py +142 -4
- mteb/models/model_implementations/nomic_models_vision.py +6 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +6 -2
- mteb/models/model_implementations/pylate_models.py +1 -4
- mteb/models/model_implementations/random_baseline.py +6 -2
- mteb/models/model_implementations/seed_1_6_embedding_models.py +7 -2
- mteb/models/model_implementations/voyage_v.py +6 -2
- mteb/models/model_meta.py +396 -19
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/METADATA +2 -1
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/RECORD +53 -39
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/WHEEL +0 -0
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/top_level.txt +0 -0
|
@@ -193,7 +193,7 @@ NOMIC_CITATION = """
|
|
|
193
193
|
"""
|
|
194
194
|
|
|
195
195
|
nomic_embed_v1_5 = ModelMeta(
|
|
196
|
-
loader=NomicWrapper,
|
|
196
|
+
loader=NomicWrapper, # type: ignore
|
|
197
197
|
loader_kwargs=dict(
|
|
198
198
|
trust_remote_code=True,
|
|
199
199
|
model_prompts=model_prompts,
|
|
@@ -221,7 +221,7 @@ nomic_embed_v1_5 = ModelMeta(
|
|
|
221
221
|
)
|
|
222
222
|
|
|
223
223
|
nomic_embed_v1 = ModelMeta(
|
|
224
|
-
loader=NomicWrapper,
|
|
224
|
+
loader=NomicWrapper, # type: ignore
|
|
225
225
|
loader_kwargs=dict(
|
|
226
226
|
trust_remote_code=True,
|
|
227
227
|
model_prompts=model_prompts,
|
|
@@ -249,7 +249,7 @@ nomic_embed_v1 = ModelMeta(
|
|
|
249
249
|
)
|
|
250
250
|
|
|
251
251
|
nomic_embed_v1_ablated = ModelMeta(
|
|
252
|
-
loader=NomicWrapper,
|
|
252
|
+
loader=NomicWrapper, # type: ignore
|
|
253
253
|
loader_kwargs=dict(
|
|
254
254
|
trust_remote_code=True,
|
|
255
255
|
model_prompts=model_prompts,
|
|
@@ -276,7 +276,7 @@ nomic_embed_v1_ablated = ModelMeta(
|
|
|
276
276
|
)
|
|
277
277
|
|
|
278
278
|
nomic_embed_v1_unsupervised = ModelMeta(
|
|
279
|
-
loader=NomicWrapper,
|
|
279
|
+
loader=NomicWrapper, # type: ignore
|
|
280
280
|
loader_kwargs=dict(
|
|
281
281
|
trust_remote_code=True,
|
|
282
282
|
model_prompts=model_prompts,
|
|
@@ -329,3 +329,141 @@ nomic_modern_bert_embed = ModelMeta(
|
|
|
329
329
|
training_datasets=nomic_training_data,
|
|
330
330
|
public_training_data=None,
|
|
331
331
|
)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
m_languages = [
|
|
335
|
+
"eng-Latn",
|
|
336
|
+
"spa-Latn",
|
|
337
|
+
"fra-Latn",
|
|
338
|
+
"deu-Latn",
|
|
339
|
+
"ita-Latn",
|
|
340
|
+
"por-Latn",
|
|
341
|
+
"pol-Latn",
|
|
342
|
+
"nld-Latn",
|
|
343
|
+
"tur-Latn",
|
|
344
|
+
"jpn-Jpan",
|
|
345
|
+
"vie-Latn",
|
|
346
|
+
"rus-Cyrl",
|
|
347
|
+
"ind-Latn",
|
|
348
|
+
"arb-Arab",
|
|
349
|
+
"ces-Latn",
|
|
350
|
+
"ron-Latn",
|
|
351
|
+
"swe-Latn",
|
|
352
|
+
"ell-Grek",
|
|
353
|
+
"ukr-Cyrl",
|
|
354
|
+
"zho-Hans",
|
|
355
|
+
"hun-Latn",
|
|
356
|
+
"dan-Latn",
|
|
357
|
+
"nor-Latn",
|
|
358
|
+
"hin-Deva",
|
|
359
|
+
"fin-Latn",
|
|
360
|
+
"bul-Cyrl",
|
|
361
|
+
"kor-Hang",
|
|
362
|
+
"slk-Latn",
|
|
363
|
+
"tha-Thai",
|
|
364
|
+
"heb-Hebr",
|
|
365
|
+
"cat-Latn",
|
|
366
|
+
"lit-Latn",
|
|
367
|
+
"fas-Arab",
|
|
368
|
+
"msa-Latn",
|
|
369
|
+
"slv-Latn",
|
|
370
|
+
"lav-Latn",
|
|
371
|
+
"mar-Deva",
|
|
372
|
+
"ben-Beng",
|
|
373
|
+
"sqi-Latn",
|
|
374
|
+
"cym-Latn",
|
|
375
|
+
"bel-Cyrl",
|
|
376
|
+
"mal-Mlym",
|
|
377
|
+
"kan-Knda",
|
|
378
|
+
"mkd-Cyrl",
|
|
379
|
+
"urd-Arab",
|
|
380
|
+
"fry-Latn",
|
|
381
|
+
"fil-Latn",
|
|
382
|
+
"tel-Telu",
|
|
383
|
+
"eus-Latn",
|
|
384
|
+
"swh-Latn",
|
|
385
|
+
"som-Latn",
|
|
386
|
+
"snd-Arab",
|
|
387
|
+
"uzb-Latn",
|
|
388
|
+
"cos-Latn",
|
|
389
|
+
"hrv-Latn",
|
|
390
|
+
"guj-Gujr",
|
|
391
|
+
"hin-Latn",
|
|
392
|
+
"ceb-Latn",
|
|
393
|
+
"epo-Latn",
|
|
394
|
+
"jav-Latn",
|
|
395
|
+
"lat-Latn",
|
|
396
|
+
"zul-Latn",
|
|
397
|
+
"mon-Cyrl",
|
|
398
|
+
"sin-Sinh",
|
|
399
|
+
"ell-Latn",
|
|
400
|
+
"gle-Latn",
|
|
401
|
+
"kir-Cyrl",
|
|
402
|
+
"tgk-Cyrl",
|
|
403
|
+
"mya-Mymr",
|
|
404
|
+
"khm-Khmr",
|
|
405
|
+
"mlg-Latn",
|
|
406
|
+
"pan-Guru",
|
|
407
|
+
"rus-Latn",
|
|
408
|
+
"sna-Latn",
|
|
409
|
+
"zho-Latn",
|
|
410
|
+
"hau-Latn",
|
|
411
|
+
"heb-Latn",
|
|
412
|
+
"hmn-Latn",
|
|
413
|
+
"hat-Latn",
|
|
414
|
+
"jpn-Latn",
|
|
415
|
+
"sun-Latn",
|
|
416
|
+
"bul-Latn",
|
|
417
|
+
"gla-Latn",
|
|
418
|
+
"nya-Latn",
|
|
419
|
+
"pus-Arab",
|
|
420
|
+
"kur-Latn",
|
|
421
|
+
"hbs-Latn",
|
|
422
|
+
"amh-Ethi",
|
|
423
|
+
"ibo-Latn",
|
|
424
|
+
"lao-Laoo",
|
|
425
|
+
"mri-Latn",
|
|
426
|
+
"nno-Latn",
|
|
427
|
+
"smo-Latn",
|
|
428
|
+
"yid-Hebr",
|
|
429
|
+
"sot-Latn",
|
|
430
|
+
"tgl-Latn",
|
|
431
|
+
"xho-Latn",
|
|
432
|
+
"yor-Latn",
|
|
433
|
+
]
|
|
434
|
+
|
|
435
|
+
nomic_embed_text_v2_moe = ModelMeta(
|
|
436
|
+
loader=NomicWrapper, # type: ignore
|
|
437
|
+
loader_kwargs=dict(
|
|
438
|
+
trust_remote_code=True,
|
|
439
|
+
model_prompts=model_prompts,
|
|
440
|
+
),
|
|
441
|
+
name="nomic-ai/nomic-embed-text-v2-moe",
|
|
442
|
+
languages=m_languages,
|
|
443
|
+
open_weights=True,
|
|
444
|
+
revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
|
|
445
|
+
release_date="2025-02-07",
|
|
446
|
+
n_parameters=475292928,
|
|
447
|
+
memory_usage_mb=1813,
|
|
448
|
+
max_tokens=512,
|
|
449
|
+
embed_dim=768,
|
|
450
|
+
license="apache-2.0",
|
|
451
|
+
reference="https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
|
|
452
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
453
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
454
|
+
use_instructions=True,
|
|
455
|
+
adapted_from="nomic-ai/nomic-xlm-2048",
|
|
456
|
+
public_training_data="https://github.com/nomic-ai/contrastors?tab=readme-ov-file#data-access",
|
|
457
|
+
public_training_code="https://github.com/nomic-ai/contrastors/blob/613ddfd37309e538cceadb05b1e6423e7b09f603/src/contrastors/configs/train/contrastive_finetune_moe.yaml",
|
|
458
|
+
training_datasets=None, # did not look into this further
|
|
459
|
+
superseded_by=None,
|
|
460
|
+
citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
|
|
461
|
+
title={Training Sparse Mixture Of Experts Text Embedding Models},
|
|
462
|
+
author={Zach Nussbaum and Brandon Duderstadt},
|
|
463
|
+
year={2025},
|
|
464
|
+
eprint={2502.07972},
|
|
465
|
+
archivePrefix={arXiv},
|
|
466
|
+
primaryClass={cs.CL},
|
|
467
|
+
url={https://arxiv.org/abs/2502.07972},
|
|
468
|
+
}""",
|
|
469
|
+
)
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
6
|
import torch.nn.functional as F
|
|
5
|
-
from PIL import Image
|
|
6
7
|
from torch.utils.data import DataLoader
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
12
13
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
14
|
from mteb.types import Array, BatchedInput, PromptType
|
|
14
15
|
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from PIL import Image
|
|
18
|
+
|
|
15
19
|
NOMIC_EMBED_VISION_CITATION = """@article{nussbaum2024nomicembedvision,
|
|
16
20
|
title={Nomic Embed Vision: Expanding the Latent Space},
|
|
17
21
|
author={Nussbaum, Zach and Duderstadt, Brandon and Mulyar, Andriy},
|
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import TYPE_CHECKING, Any
|
|
2
2
|
|
|
3
3
|
import torch
|
|
4
|
-
from PIL import Image
|
|
5
4
|
from torch.utils.data import DataLoader
|
|
6
5
|
|
|
7
6
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
@@ -9,6 +8,10 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
9
8
|
from mteb.models.model_meta import ModelMeta
|
|
10
9
|
from mteb.types import Array, BatchedInput, PromptType
|
|
11
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
pass
|
|
13
|
+
|
|
14
|
+
|
|
12
15
|
LLAMA_NEMORETRIEVER_CITATION = """@misc{xu2025llamanemoretrievercolembedtopperforming,
|
|
13
16
|
title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
|
|
14
17
|
author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
|
|
@@ -53,6 +56,7 @@ class LlamaNemoretrieverColembed(AbsEncoder):
|
|
|
53
56
|
**kwargs,
|
|
54
57
|
):
|
|
55
58
|
import torchvision.transforms.functional as F
|
|
59
|
+
from PIL import Image
|
|
56
60
|
|
|
57
61
|
all_images = []
|
|
58
62
|
if isinstance(images, DataLoader):
|
|
@@ -328,13 +328,10 @@ class MultiVectorModel(AbsEncoder, PylateSearchEncoder):
|
|
|
328
328
|
inputs,
|
|
329
329
|
prompt_name=prompt_name,
|
|
330
330
|
is_query=prompt_type == PromptType.query,
|
|
331
|
-
convert_to_tensor=True,
|
|
332
331
|
**kwargs,
|
|
333
332
|
)
|
|
334
333
|
|
|
335
|
-
|
|
336
|
-
pred = torch.nn.utils.rnn.pad_sequence(pred, batch_first=True, padding_value=0)
|
|
337
|
-
return pred.cpu().numpy()
|
|
334
|
+
return pred
|
|
338
335
|
|
|
339
336
|
|
|
340
337
|
colbert_v2 = ModelMeta(
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import hashlib
|
|
2
|
-
from typing import Any, Literal
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
3
5
|
|
|
4
6
|
import numpy as np
|
|
5
7
|
import torch
|
|
6
|
-
from PIL import Image
|
|
7
8
|
from torch.utils.data import DataLoader
|
|
8
9
|
|
|
9
10
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
@@ -14,6 +15,9 @@ from mteb.similarity_functions import (
|
|
|
14
15
|
)
|
|
15
16
|
from mteb.types._encoder_io import Array, BatchedInput, PromptType
|
|
16
17
|
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from PIL import Image
|
|
20
|
+
|
|
17
21
|
|
|
18
22
|
def _string_to_vector(text: str | None, size: int) -> np.ndarray:
|
|
19
23
|
"""Generate a deterministic random vector based on a string.
|
|
@@ -1,14 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import base64
|
|
2
4
|
import logging
|
|
3
5
|
import os
|
|
4
6
|
import time
|
|
5
7
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
8
|
from io import BytesIO
|
|
7
|
-
from typing import Any
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
8
10
|
|
|
9
11
|
import requests
|
|
10
12
|
import torch
|
|
11
|
-
from PIL import Image
|
|
12
13
|
from torch.utils.data import DataLoader
|
|
13
14
|
|
|
14
15
|
from mteb._requires_package import requires_package
|
|
@@ -19,6 +20,10 @@ from mteb.models.model_implementations.nvidia_models import nvidia_training_data
|
|
|
19
20
|
from mteb.models.model_meta import ModelMeta
|
|
20
21
|
from mteb.types import Array, BatchedInput, PromptType
|
|
21
22
|
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from PIL import Image
|
|
25
|
+
|
|
26
|
+
|
|
22
27
|
logger = logging.getLogger(__name__)
|
|
23
28
|
|
|
24
29
|
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any, Literal
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from PIL import Image
|
|
6
7
|
from torch.utils.data import DataLoader
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
12
13
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
14
|
from mteb.types import Array, BatchedInput, PromptType
|
|
14
15
|
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from PIL import Image
|
|
18
|
+
|
|
15
19
|
|
|
16
20
|
def _downsample_image(
|
|
17
21
|
image: Image.Image, max_pixels: int = 16000000, target_longest_side: int = 4000
|