mteb 2.3.10__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +7 -2
  2. mteb/abstasks/_statistics_calculation.py +6 -2
  3. mteb/abstasks/classification.py +0 -2
  4. mteb/benchmarks/benchmarks/__init__.py +2 -0
  5. mteb/benchmarks/benchmarks/benchmarks.py +57 -0
  6. mteb/deprecated_evaluator.py +8 -13
  7. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  8. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  9. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  10. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  11. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  12. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  13. mteb/evaluate.py +2 -33
  14. mteb/leaderboard/figures.py +1 -1
  15. mteb/leaderboard/table.py +1 -11
  16. mteb/models/abs_encoder.py +21 -17
  17. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  18. mteb/models/get_model_meta.py +3 -123
  19. mteb/models/instruct_wrapper.py +2 -1
  20. mteb/models/model_implementations/bica_model.py +34 -0
  21. mteb/models/model_implementations/colpali_models.py +7 -2
  22. mteb/models/model_implementations/colqwen_models.py +1 -1
  23. mteb/models/model_implementations/gme_v_models.py +9 -5
  24. mteb/models/model_implementations/google_models.py +10 -0
  25. mteb/models/model_implementations/granite_vision_embedding_models.py +6 -2
  26. mteb/models/model_implementations/jasper_models.py +2 -2
  27. mteb/models/model_implementations/jina_models.py +1 -1
  28. mteb/models/model_implementations/mod_models.py +204 -0
  29. mteb/models/model_implementations/nomic_models.py +142 -4
  30. mteb/models/model_implementations/nomic_models_vision.py +6 -2
  31. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +6 -2
  32. mteb/models/model_implementations/pylate_models.py +1 -4
  33. mteb/models/model_implementations/random_baseline.py +6 -2
  34. mteb/models/model_implementations/seed_1_6_embedding_models.py +7 -2
  35. mteb/models/model_implementations/voyage_v.py +6 -2
  36. mteb/models/model_meta.py +396 -19
  37. mteb/models/sentence_transformer_wrapper.py +2 -7
  38. mteb/tasks/reranking/jpn/__init__.py +9 -1
  39. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  40. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  41. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  42. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  43. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  44. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  45. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  46. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  47. mteb/types/_encoder_io.py +7 -2
  48. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/METADATA +2 -1
  49. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/RECORD +53 -39
  50. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/WHEEL +0 -0
  51. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/entry_points.txt +0 -0
  52. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/licenses/LICENSE +0 -0
  53. {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/top_level.txt +0 -0
@@ -193,7 +193,7 @@ NOMIC_CITATION = """
 """

 nomic_embed_v1_5 = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -221,7 +221,7 @@ nomic_embed_v1_5 = ModelMeta(
 )

 nomic_embed_v1 = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -249,7 +249,7 @@ nomic_embed_v1 = ModelMeta(
 )

 nomic_embed_v1_ablated = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -276,7 +276,7 @@ nomic_embed_v1_ablated = ModelMeta(
 )

 nomic_embed_v1_unsupervised = ModelMeta(
-    loader=NomicWrapper,
+    loader=NomicWrapper,  # type: ignore
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts=model_prompts,
@@ -329,3 +329,141 @@ nomic_modern_bert_embed = ModelMeta(
     training_datasets=nomic_training_data,
     public_training_data=None,
 )
+
+
+m_languages = [
+    "eng-Latn",
+    "spa-Latn",
+    "fra-Latn",
+    "deu-Latn",
+    "ita-Latn",
+    "por-Latn",
+    "pol-Latn",
+    "nld-Latn",
+    "tur-Latn",
+    "jpn-Jpan",
+    "vie-Latn",
+    "rus-Cyrl",
+    "ind-Latn",
+    "arb-Arab",
+    "ces-Latn",
+    "ron-Latn",
+    "swe-Latn",
+    "ell-Grek",
+    "ukr-Cyrl",
+    "zho-Hans",
+    "hun-Latn",
+    "dan-Latn",
+    "nor-Latn",
+    "hin-Deva",
+    "fin-Latn",
+    "bul-Cyrl",
+    "kor-Hang",
+    "slk-Latn",
+    "tha-Thai",
+    "heb-Hebr",
+    "cat-Latn",
+    "lit-Latn",
+    "fas-Arab",
+    "msa-Latn",
+    "slv-Latn",
+    "lav-Latn",
+    "mar-Deva",
+    "ben-Beng",
+    "sqi-Latn",
+    "cym-Latn",
+    "bel-Cyrl",
+    "mal-Mlym",
+    "kan-Knda",
+    "mkd-Cyrl",
+    "urd-Arab",
+    "fry-Latn",
+    "fil-Latn",
+    "tel-Telu",
+    "eus-Latn",
+    "swh-Latn",
+    "som-Latn",
+    "snd-Arab",
+    "uzb-Latn",
+    "cos-Latn",
+    "hrv-Latn",
+    "guj-Gujr",
+    "hin-Latn",
+    "ceb-Latn",
+    "epo-Latn",
+    "jav-Latn",
+    "lat-Latn",
+    "zul-Latn",
+    "mon-Cyrl",
+    "sin-Sinh",
+    "ell-Latn",
+    "gle-Latn",
+    "kir-Cyrl",
+    "tgk-Cyrl",
+    "mya-Mymr",
+    "khm-Khmr",
+    "mlg-Latn",
+    "pan-Guru",
+    "rus-Latn",
+    "sna-Latn",
+    "zho-Latn",
+    "hau-Latn",
+    "heb-Latn",
+    "hmn-Latn",
+    "hat-Latn",
+    "jpn-Latn",
+    "sun-Latn",
+    "bul-Latn",
+    "gla-Latn",
+    "nya-Latn",
+    "pus-Arab",
+    "kur-Latn",
+    "hbs-Latn",
+    "amh-Ethi",
+    "ibo-Latn",
+    "lao-Laoo",
+    "mri-Latn",
+    "nno-Latn",
+    "smo-Latn",
+    "yid-Hebr",
+    "sot-Latn",
+    "tgl-Latn",
+    "xho-Latn",
+    "yor-Latn",
+]
+
+nomic_embed_text_v2_moe = ModelMeta(
+    loader=NomicWrapper,  # type: ignore
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        model_prompts=model_prompts,
+    ),
+    name="nomic-ai/nomic-embed-text-v2-moe",
+    languages=m_languages,
+    open_weights=True,
+    revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
+    release_date="2025-02-07",
+    n_parameters=475292928,
+    memory_usage_mb=1813,
+    max_tokens=512,
+    embed_dim=768,
+    license="apache-2.0",
+    reference="https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    adapted_from="nomic-ai/nomic-xlm-2048",
+    public_training_data="https://github.com/nomic-ai/contrastors?tab=readme-ov-file#data-access",
+    public_training_code="https://github.com/nomic-ai/contrastors/blob/613ddfd37309e538cceadb05b1e6423e7b09f603/src/contrastors/configs/train/contrastive_finetune_moe.yaml",
+    training_datasets=None,  # did not look into this further
+    superseded_by=None,
+    citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
+      title={Training Sparse Mixture Of Experts Text Embedding Models},
+      author={Zach Nussbaum and Brandon Duderstadt},
+      year={2025},
+      eprint={2502.07972},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2502.07972},
+}""",
+)
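
Note: the hunk above registers nomic-embed-text-v2-moe in the model registry. A minimal usage sketch, assuming the standard mteb 2.x API (get_model / get_tasks / evaluate) and using one of the new Japanese lite tasks listed in the files changed above:

    import mteb

    # Load the model registered by the ModelMeta entry added above.
    model = mteb.get_model("nomic-ai/nomic-embed-text-v2-moe")

    # One of the new lite retrieval tasks shipped in 2.4.1.
    tasks = mteb.get_tasks(tasks=["JaqketRetrievalLite"])
    results = mteb.evaluate(model, tasks)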
@@ -1,8 +1,9 @@
-from typing import Any
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any

 import torch
 import torch.nn.functional as F
-from PIL import Image
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm

@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType

+if TYPE_CHECKING:
+    from PIL import Image
+
 NOMIC_EMBED_VISION_CITATION = """@article{nussbaum2024nomicembedvision,
   title={Nomic Embed Vision: Expanding the Latent Space},
   author={Nussbaum, Zach and Duderstadt, Brandon and Mulyar, Andriy},
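
The import hunks in this release repeatedly apply the same deferral: PIL is only needed for type annotations at module scope, so it moves under typing.TYPE_CHECKING and stops being a hard import at load time. A minimal, self-contained sketch of the pattern (the function here is illustrative, not from mteb):

    from __future__ import annotations  # annotations are not evaluated at runtime

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen only by static type checkers; PIL is never imported at runtime.
        from PIL import Image


    def half_size(image: Image.Image) -> tuple[int, int]:
        # The annotation works because it is never evaluated; code that truly
        # needs PIL at runtime imports it locally instead.
        return (image.width // 2, image.height // 2)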
@@ -1,7 +1,6 @@
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import torch
-from PIL import Image
 from torch.utils.data import DataLoader

 from mteb.abstasks.task_metadata import TaskMetadata
@@ -9,6 +8,10 @@ from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta
 from mteb.types import Array, BatchedInput, PromptType

+if TYPE_CHECKING:
+    pass
+
+
 LLAMA_NEMORETRIEVER_CITATION = """@misc{xu2025llamanemoretrievercolembedtopperforming,
   title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
   author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
@@ -53,6 +56,7 @@ class LlamaNemoretrieverColembed(AbsEncoder):
         **kwargs,
     ):
         import torchvision.transforms.functional as F
+        from PIL import Image

         all_images = []
         if isinstance(images, DataLoader):
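
Here the runtime import moves into the method body instead: PIL is pulled in only when images are actually encoded, which keeps module import cheap without relying on TYPE_CHECKING. A tiny illustrative sketch of the same idea (helper name is hypothetical):

    def load_rgb(path: str):
        # Deferred import: the dependency is only required when this path runs.
        from PIL import Image

        return Image.open(path).convert("RGB")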
@@ -328,13 +328,10 @@ class MultiVectorModel(AbsEncoder, PylateSearchEncoder):
             inputs,
             prompt_name=prompt_name,
             is_query=prompt_type == PromptType.query,
-            convert_to_tensor=True,
             **kwargs,
         )

-        # encode returns a list of tensors shaped (x, token_dim), pad to uniform length
-        pred = torch.nn.utils.rnn.pad_sequence(pred, batch_first=True, padding_value=0)
-        return pred.cpu().numpy()
+        return pred


 colbert_v2 = ModelMeta(
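
The removed lines padded the ragged multi-vector output into a dense numpy array; after this change encode's native output (a list of variable-length tensors) is returned directly, so callers that need a uniform batch must pad it themselves. A sketch of that padding, using the same pad_sequence call the old code used:

    import torch

    # Multi-vector models yield one (num_tokens, dim) tensor per input.
    preds = [torch.randn(5, 128), torch.randn(9, 128), torch.randn(3, 128)]

    # Zero-pad to the longest sequence for a dense (batch, max_tokens, dim) tensor.
    dense = torch.nn.utils.rnn.pad_sequence(preds, batch_first=True, padding_value=0)
    assert dense.shape == (3, 9, 128)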
@@ -1,9 +1,10 @@
+from __future__ import annotations
+
 import hashlib
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal

 import numpy as np
 import torch
-from PIL import Image
 from torch.utils.data import DataLoader

 from mteb.abstasks.task_metadata import TaskMetadata
@@ -14,6 +15,9 @@ from mteb.similarity_functions import (
 )
 from mteb.types._encoder_io import Array, BatchedInput, PromptType

+if TYPE_CHECKING:
+    from PIL import Image
+

 def _string_to_vector(text: str | None, size: int) -> np.ndarray:
     """Generate a deterministic random vector based on a string.
@@ -1,14 +1,15 @@
+from __future__ import annotations
+
 import base64
 import logging
 import os
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from io import BytesIO
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import requests
 import torch
-from PIL import Image
 from torch.utils.data import DataLoader

 from mteb._requires_package import requires_package
@@ -19,6 +20,10 @@ from mteb.models.model_implementations.nvidia_models import nvidia_training_data
 from mteb.models.model_meta import ModelMeta
 from mteb.types import Array, BatchedInput, PromptType

+if TYPE_CHECKING:
+    from PIL import Image
+
+
 logger = logging.getLogger(__name__)

@@ -1,8 +1,9 @@
+from __future__ import annotations
+
 import logging
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal

 import torch
-from PIL import Image
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm

@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType

+if TYPE_CHECKING:
+    from PIL import Image
+

 def _downsample_image(
     image: Image.Image, max_pixels: int = 16000000, target_longest_side: int = 4000
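
The body of _downsample_image is not part of this diff; going by its signature (a max_pixels cap and a target for the longest side), a plausible sketch looks like this (illustrative only):

    from PIL import Image


    def downsample_image(
        image: Image.Image, max_pixels: int = 16_000_000, target_longest_side: int = 4000
    ) -> Image.Image:
        # Leave images within the pixel budget untouched.
        if image.width * image.height <= max_pixels:
            return image
        # Shrink so the longest side hits the target, preserving aspect ratio.
        scale = target_longest_side / max(image.width, image.height)
        new_size = (max(1, round(image.width * scale)), max(1, round(image.height * scale)))
        return image.resize(new_size, Image.Resampling.LANCZOS)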