mteb 2.7.1__py3-none-any.whl → 2.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155):
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +16 -9
  3. mteb/_evaluators/any_sts_evaluator.py +10 -5
  4. mteb/_evaluators/clustering_evaluator.py +10 -4
  5. mteb/_evaluators/evaluator.py +9 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +10 -5
  8. mteb/_evaluators/retrieval_evaluator.py +19 -13
  9. mteb/_evaluators/retrieval_metrics.py +9 -3
  10. mteb/_evaluators/sklearn_evaluator.py +14 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
  12. mteb/_evaluators/text/summarization_evaluator.py +8 -4
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +8 -2
  16. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  17. mteb/abstasks/_statistics_calculation.py +6 -4
  18. mteb/abstasks/abstask.py +17 -9
  19. mteb/abstasks/aggregate_task_metadata.py +20 -9
  20. mteb/abstasks/aggregated_task.py +15 -8
  21. mteb/abstasks/classification.py +15 -6
  22. mteb/abstasks/clustering.py +17 -8
  23. mteb/abstasks/clustering_legacy.py +14 -6
  24. mteb/abstasks/image/image_text_pair_classification.py +17 -7
  25. mteb/abstasks/multilabel_classification.py +11 -5
  26. mteb/abstasks/pair_classification.py +19 -9
  27. mteb/abstasks/regression.py +14 -6
  28. mteb/abstasks/retrieval.py +27 -16
  29. mteb/abstasks/retrieval_dataset_loaders.py +11 -8
  30. mteb/abstasks/sts.py +19 -10
  31. mteb/abstasks/task_metadata.py +17 -8
  32. mteb/abstasks/text/bitext_mining.py +14 -7
  33. mteb/abstasks/text/summarization.py +17 -7
  34. mteb/abstasks/zeroshot_classification.py +15 -7
  35. mteb/benchmarks/_create_table.py +13 -3
  36. mteb/benchmarks/benchmark.py +11 -1
  37. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  38. mteb/cache.py +20 -14
  39. mteb/cli/_display_tasks.py +9 -3
  40. mteb/cli/build_cli.py +5 -2
  41. mteb/cli/generate_model_card.py +9 -2
  42. mteb/deprecated_evaluator.py +16 -12
  43. mteb/evaluate.py +20 -18
  44. mteb/filter_tasks.py +12 -7
  45. mteb/get_tasks.py +9 -4
  46. mteb/languages/language_scripts.py +8 -3
  47. mteb/leaderboard/app.py +7 -3
  48. mteb/leaderboard/table.py +7 -2
  49. mteb/load_results.py +9 -3
  50. mteb/models/abs_encoder.py +22 -12
  51. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  52. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  53. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  54. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  55. mteb/models/get_model_meta.py +11 -4
  56. mteb/models/instruct_wrapper.py +13 -5
  57. mteb/models/model_implementations/align_models.py +9 -4
  58. mteb/models/model_implementations/bedrock_models.py +16 -6
  59. mteb/models/model_implementations/blip2_models.py +9 -4
  60. mteb/models/model_implementations/blip_models.py +9 -4
  61. mteb/models/model_implementations/bm25.py +15 -10
  62. mteb/models/model_implementations/bmretriever_models.py +6 -2
  63. mteb/models/model_implementations/cde_models.py +9 -5
  64. mteb/models/model_implementations/clip_models.py +9 -4
  65. mteb/models/model_implementations/cohere_models.py +10 -4
  66. mteb/models/model_implementations/cohere_v.py +9 -4
  67. mteb/models/model_implementations/colpali_models.py +4 -3
  68. mteb/models/model_implementations/colqwen_models.py +10 -31
  69. mteb/models/model_implementations/colsmol_models.py +1 -1
  70. mteb/models/model_implementations/conan_models.py +10 -4
  71. mteb/models/model_implementations/dino_models.py +9 -4
  72. mteb/models/model_implementations/e5_v.py +9 -4
  73. mteb/models/model_implementations/eagerworks_models.py +10 -4
  74. mteb/models/model_implementations/evaclip_models.py +9 -4
  75. mteb/models/model_implementations/gme_v_models.py +5 -3
  76. mteb/models/model_implementations/google_models.py +10 -4
  77. mteb/models/model_implementations/granite_vision_embedding_models.py +6 -5
  78. mteb/models/model_implementations/hinvec_models.py +5 -1
  79. mteb/models/model_implementations/jasper_models.py +12 -5
  80. mteb/models/model_implementations/jina_clip.py +9 -4
  81. mteb/models/model_implementations/jina_models.py +10 -5
  82. mteb/models/model_implementations/kalm_models.py +18 -12
  83. mteb/models/model_implementations/linq_models.py +6 -1
  84. mteb/models/model_implementations/listconranker.py +9 -4
  85. mteb/models/model_implementations/llm2clip_models.py +9 -4
  86. mteb/models/model_implementations/llm2vec_models.py +12 -6
  87. mteb/models/model_implementations/mcinext_models.py +5 -2
  88. mteb/models/model_implementations/mdbr_models.py +3 -1
  89. mteb/models/model_implementations/{mxbai_models.py → mixedbread_ai_models.py} +91 -0
  90. mteb/models/model_implementations/moco_models.py +9 -4
  91. mteb/models/model_implementations/mod_models.py +1 -1
  92. mteb/models/model_implementations/model2vec_models.py +10 -4
  93. mteb/models/model_implementations/no_instruct_sentence_models.py +12 -5
  94. mteb/models/model_implementations/nomic_models.py +10 -4
  95. mteb/models/model_implementations/nomic_models_vision.py +4 -3
  96. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +7 -3
  97. mteb/models/model_implementations/nvidia_models.py +12 -4
  98. mteb/models/model_implementations/octen_models.py +1 -1
  99. mteb/models/model_implementations/openai_models.py +9 -4
  100. mteb/models/model_implementations/openclip_models.py +9 -4
  101. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -4
  102. mteb/models/model_implementations/ops_moa_models.py +7 -2
  103. mteb/models/model_implementations/pixie_models.py +56 -0
  104. mteb/models/model_implementations/promptriever_models.py +12 -6
  105. mteb/models/model_implementations/pylate_models.py +19 -13
  106. mteb/models/model_implementations/qwen3_models.py +8 -1
  107. mteb/models/model_implementations/random_baseline.py +4 -3
  108. mteb/models/model_implementations/repllama_models.py +13 -6
  109. mteb/models/model_implementations/rerankers_custom.py +10 -4
  110. mteb/models/model_implementations/rerankers_monot5_based.py +10 -4
  111. mteb/models/model_implementations/salesforce_models.py +7 -1
  112. mteb/models/model_implementations/seed_1_6_embedding_models.py +4 -2
  113. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +5 -2
  114. mteb/models/model_implementations/seed_models.py +1 -1
  115. mteb/models/model_implementations/siglip_models.py +9 -4
  116. mteb/models/model_implementations/slm_models.py +7 -4
  117. mteb/models/model_implementations/uae_models.py +9 -4
  118. mteb/models/model_implementations/vdr_models.py +7 -1
  119. mteb/models/model_implementations/vista_models.py +9 -4
  120. mteb/models/model_implementations/vlm2vec_models.py +9 -4
  121. mteb/models/model_implementations/voyage_models.py +10 -4
  122. mteb/models/model_implementations/voyage_v.py +10 -6
  123. mteb/models/model_implementations/yuan_models_en.py +1 -1
  124. mteb/models/model_meta.py +12 -7
  125. mteb/models/models_protocols.py +19 -18
  126. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  127. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  128. mteb/models/search_wrappers.py +19 -12
  129. mteb/models/sentence_transformer_wrapper.py +4 -3
  130. mteb/models/vllm_wrapper.py +8 -6
  131. mteb/results/benchmark_results.py +22 -17
  132. mteb/results/model_result.py +21 -15
  133. mteb/results/task_result.py +41 -10
  134. mteb/similarity_functions.py +8 -2
  135. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  136. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  137. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  138. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  139. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  140. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  141. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  142. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  143. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  144. mteb/tasks/clustering/nob/snl_clustering.py +7 -2
  145. mteb/tasks/clustering/nob/vg_clustering.py +7 -2
  146. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  147. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
  148. mteb/types/_encoder_io.py +1 -1
  149. mteb/types/statistics.py +9 -2
  150. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/METADATA +1 -1
  151. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/RECORD +155 -154
  152. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/WHEEL +0 -0
  153. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/entry_points.txt +0 -0
  154. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/licenses/LICENSE +0 -0
  155. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,18 @@
1
- from typing import Any, Literal
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, Literal
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
  from tqdm.auto import tqdm
6
7
 
7
- from mteb.abstasks.task_metadata import TaskMetadata
8
8
  from mteb.models.abs_encoder import AbsEncoder
9
9
  from mteb.models.model_meta import ModelMeta, ScoringFunction
10
- from mteb.types import Array, BatchedInput, PromptType
10
+
11
+ if TYPE_CHECKING:
12
+ from torch.utils.data import DataLoader
13
+
14
+ from mteb.abstasks.task_metadata import TaskMetadata
15
+ from mteb.types import Array, BatchedInput, PromptType
11
16
 
12
17
 
13
18
  class DINOModel(AbsEncoder):
@@ -1,14 +1,19 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
6
  from packaging import version
5
- from torch.utils.data import DataLoader
6
7
  from tqdm.auto import tqdm
7
8
 
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
- from mteb.types import Array, BatchedInput, PromptType
11
+
12
+ if TYPE_CHECKING:
13
+ from torch.utils.data import DataLoader
14
+
15
+ from mteb.abstasks.task_metadata import TaskMetadata
16
+ from mteb.types import Array, BatchedInput, PromptType
12
17
 
13
18
  E5_V_TRANSFORMERS_VERSION = (
14
19
  "4.44.2" # Issue 1647: Only works with transformers==4.44.2.
@@ -1,17 +1,23 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
  from tqdm.auto import tqdm
6
7
 
7
8
  from mteb._requires_package import (
8
9
  requires_image_dependencies,
9
10
  requires_package,
10
11
  )
11
- from mteb.abstasks.task_metadata import TaskMetadata
12
12
  from mteb.models.abs_encoder import AbsEncoder
13
13
  from mteb.models.model_meta import ModelMeta, ScoringFunction
14
- from mteb.types import Array, BatchedInput, PromptType
14
+ from mteb.types import PromptType
15
+
16
+ if TYPE_CHECKING:
17
+ from torch.utils.data import DataLoader
18
+
19
+ from mteb.abstasks.task_metadata import TaskMetadata
20
+ from mteb.types import Array, BatchedInput
15
21
 
16
22
 
17
23
  class EagerEmbedV1Wrapper(AbsEncoder):
@@ -1,15 +1,20 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from torch.utils.data import DataLoader
6
7
  from tqdm.auto import tqdm
7
8
 
8
9
  from mteb._requires_package import requires_image_dependencies
9
- from mteb.abstasks.task_metadata import TaskMetadata
10
10
  from mteb.models.abs_encoder import AbsEncoder
11
11
  from mteb.models.model_meta import ModelMeta, ScoringFunction
12
- from mteb.types import Array, BatchedInput, PromptType
12
+
13
+ if TYPE_CHECKING:
14
+ from torch.utils.data import DataLoader
15
+
16
+ from mteb.abstasks.task_metadata import TaskMetadata
17
+ from mteb.types import Array, BatchedInput, PromptType
13
18
 
14
19
  EVA_CLIP_CITATION = """@article{EVA-CLIP,
15
20
  title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
@@ -6,16 +6,18 @@ import warnings
6
6
  from typing import TYPE_CHECKING, Any
7
7
 
8
8
  import torch
9
- from torch.utils.data import DataLoader
10
9
  from tqdm.autonotebook import tqdm
11
10
 
12
- from mteb.abstasks.task_metadata import TaskMetadata
13
11
  from mteb.models.abs_encoder import AbsEncoder
14
12
  from mteb.models.model_meta import ModelMeta, ScoringFunction
15
- from mteb.types import Array, BatchedInput, PromptType
13
+ from mteb.types import PromptType
16
14
 
17
15
  if TYPE_CHECKING:
18
16
  from PIL import Image
17
+ from torch.utils.data import DataLoader
18
+
19
+ from mteb.abstasks.task_metadata import TaskMetadata
20
+ from mteb.types import Array, BatchedInput
19
21
 
20
22
  logger = logging.getLogger(__name__)
21
23
 
@@ -1,17 +1,23 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import numpy as np
4
6
  from packaging.version import Version
5
- from torch.utils.data import DataLoader
6
7
  from tqdm.auto import tqdm
7
8
  from transformers import __version__ as transformers_version
8
9
 
9
10
  from mteb._requires_package import requires_package
10
- from mteb.abstasks.task_metadata import TaskMetadata
11
11
  from mteb.models import sentence_transformers_loader
12
12
  from mteb.models.abs_encoder import AbsEncoder
13
13
  from mteb.models.model_meta import ModelMeta, ScoringFunction
14
- from mteb.types import Array, BatchedInput, PromptType
14
+ from mteb.types import PromptType
15
+
16
+ if TYPE_CHECKING:
17
+ from torch.utils.data import DataLoader
18
+
19
+ from mteb.abstasks.task_metadata import TaskMetadata
20
+ from mteb.types import Array, BatchedInput
15
21
 
16
22
  MULTILINGUAL_EVALUATED_LANGUAGES = [
17
23
  "arb-Arab",
@@ -4,20 +4,21 @@ import logging
4
4
  from typing import TYPE_CHECKING, Any
5
5
 
6
6
  import torch
7
- from torch.utils.data import DataLoader
8
7
  from tqdm.auto import tqdm
9
8
 
10
9
  from mteb._requires_package import (
11
10
  requires_image_dependencies,
12
11
  )
13
- from mteb.abstasks.task_metadata import TaskMetadata
14
12
  from mteb.models.model_meta import ModelMeta
15
- from mteb.types import Array, BatchedInput, PromptType
16
-
17
- logger = logging.getLogger(__name__)
18
13
 
19
14
  if TYPE_CHECKING:
20
15
  from PIL import Image
16
+ from torch.utils.data import DataLoader
17
+
18
+ from mteb.abstasks.task_metadata import TaskMetadata
19
+ from mteb.types import Array, BatchedInput, PromptType
20
+
21
+ logger = logging.getLogger(__name__)
21
22
 
22
23
 
23
24
  class GraniteVisionEmbeddingWrapper:
@@ -1,9 +1,13 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
4
+ from typing import TYPE_CHECKING
2
5
 
3
6
  from mteb.models.model_meta import ModelMeta
4
7
  from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
5
- from mteb.types import PromptType
6
8
 
9
+ if TYPE_CHECKING:
10
+ from mteb.types import PromptType
7
11
  logger = logging.getLogger(__name__)
8
12
 
9
13
 
@@ -1,11 +1,10 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from collections.abc import Callable
3
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
4
5
 
5
6
  import torch
6
- from torch.utils.data import DataLoader
7
7
 
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
8
  from mteb.models.abs_encoder import AbsEncoder
10
9
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
11
10
  from mteb.models.model_implementations.bge_models import (
@@ -17,7 +16,15 @@ from mteb.models.model_implementations.e5_instruct import E5_MISTRAL_TRAINING_DA
17
16
  from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
18
17
  from mteb.models.model_implementations.qzhou_models import qzhou_training_data
19
18
  from mteb.models.model_meta import ModelMeta, ScoringFunction
20
- from mteb.types import Array, BatchedInput, PromptType
19
+ from mteb.types import PromptType
20
+
21
+ if TYPE_CHECKING:
22
+ from collections.abc import Callable
23
+
24
+ from torch.utils.data import DataLoader
25
+
26
+ from mteb.abstasks.task_metadata import TaskMetadata
27
+ from mteb.types import Array, BatchedInput
21
28
 
22
29
  logger = logging.getLogger(__name__)
23
30
 
@@ -1,15 +1,20 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
  from tqdm.auto import tqdm
6
7
 
7
8
  from mteb._requires_package import requires_image_dependencies
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_implementations.colpali_models import COLPALI_TRAINING_DATA
11
11
  from mteb.models.model_meta import ModelMeta, ScoringFunction
12
- from mteb.types import Array, BatchedInput, PromptType
12
+
13
+ if TYPE_CHECKING:
14
+ from torch.utils.data import DataLoader
15
+
16
+ from mteb.abstasks.task_metadata import TaskMetadata
17
+ from mteb.types import Array, BatchedInput, PromptType
13
18
 
14
19
  JINA_CLIP_CITATION = """@article{koukounas2024jinaclip,
15
20
  title={Jina CLIP: Your CLIP Model Is Also Your Text Retriever},
@@ -1,14 +1,13 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  from collections import defaultdict
3
- from typing import Any, ClassVar
5
+ from typing import TYPE_CHECKING, Any, ClassVar
4
6
 
5
7
  import numpy as np
6
8
  import torch
7
- from sentence_transformers import CrossEncoder
8
- from torch.utils.data import DataLoader
9
9
 
10
10
  from mteb._requires_package import requires_package
11
- from mteb.abstasks.task_metadata import TaskMetadata
12
11
  from mteb.languages import PROGRAMMING_LANGS
13
12
  from mteb.models.abs_encoder import AbsEncoder
14
13
  from mteb.models.model_meta import ModelMeta, ScoringFunction
@@ -16,7 +15,13 @@ from mteb.models.sentence_transformer_wrapper import (
16
15
  CrossEncoderWrapper,
17
16
  SentenceTransformerEncoderWrapper,
18
17
  )
19
- from mteb.types import Array, BatchedInput, PromptType
18
+
19
+ if TYPE_CHECKING:
20
+ from sentence_transformers import CrossEncoder
21
+ from torch.utils.data import DataLoader
22
+
23
+ from mteb.abstasks.task_metadata import TaskMetadata
24
+ from mteb.types import Array, BatchedInput, PromptType
20
25
 
21
26
  logger = logging.getLogger(__name__)
22
27
 
@@ -1,14 +1,20 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from torch.utils.data import DataLoader
6
7
 
7
- from mteb.abstasks.task_metadata import TaskMetadata
8
8
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
9
9
  from mteb.models.model_meta import ModelMeta
10
10
  from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
11
- from mteb.types import Array, BatchedInput, PromptType
11
+ from mteb.types import PromptType
12
+
13
+ if TYPE_CHECKING:
14
+ from torch.utils.data import DataLoader
15
+
16
+ from mteb.abstasks.task_metadata import TaskMetadata
17
+ from mteb.types import Array, BatchedInput
12
18
 
13
19
  logger = logging.getLogger(__name__)
14
20
 
@@ -907,23 +913,23 @@ KaLM_Embedding_KaLM_embedding_multilingual_mini_instruct_v2_5 = ModelMeta(
907
913
  adapted_from="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2",
908
914
  superseded_by=None,
909
915
  citation="""@misc{zhao2025kalmembeddingv2,
910
- title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
916
+ title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
911
917
  author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
912
918
  year={2025},
913
919
  eprint={2506.20923},
914
920
  archivePrefix={arXiv},
915
921
  primaryClass={cs.CL},
916
- url={https://arxiv.org/abs/2506.20923},
922
+ url={https://arxiv.org/abs/2506.20923},
917
923
  }
918
924
 
919
925
  @misc{hu2025kalmembedding,
920
- title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
926
+ title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
921
927
  author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
922
928
  year={2025},
923
929
  eprint={2501.01028},
924
930
  archivePrefix={arXiv},
925
931
  primaryClass={cs.CL},
926
- url={https://arxiv.org/abs/2501.01028},
932
+ url={https://arxiv.org/abs/2501.01028},
927
933
  }""",
928
934
  )
929
935
 
@@ -954,22 +960,22 @@ KaLM_Embedding_gemma_3_12b_2511 = ModelMeta(
954
960
  public_training_data=None,
955
961
  training_datasets=KaLM_Embedding_gemma_3_12b_training_data,
956
962
  citation="""@misc{zhao2025kalmembeddingv2,
957
- title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
963
+ title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
958
964
  author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
959
965
  year={2025},
960
966
  eprint={2506.20923},
961
967
  archivePrefix={arXiv},
962
968
  primaryClass={cs.CL},
963
- url={https://arxiv.org/abs/2506.20923},
969
+ url={https://arxiv.org/abs/2506.20923},
964
970
  }
965
971
 
966
972
  @misc{hu2025kalmembedding,
967
- title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
973
+ title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
968
974
  author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
969
975
  year={2025},
970
976
  eprint={2501.01028},
971
977
  archivePrefix={arXiv},
972
978
  primaryClass={cs.CL},
973
- url={https://arxiv.org/abs/2501.01028},
979
+ url={https://arxiv.org/abs/2501.01028},
974
980
  }""",
975
981
  )
@@ -1,11 +1,16 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
1
5
  import torch
2
6
 
3
7
  from mteb.models.instruct_wrapper import instruct_wrapper
4
8
  from mteb.models.model_meta import ModelMeta, ScoringFunction
5
- from mteb.types import PromptType
6
9
 
7
10
  from .e5_instruct import E5_MISTRAL_TRAINING_DATA
8
11
 
12
+ if TYPE_CHECKING:
13
+ from mteb.types import PromptType
9
14
  LINQ_EMBED_MISTRAL_CITATION = """@misc{LinqAIResearch2024,
10
15
  title={Linq-Embed-Mistral:Elevating Text Retrieval with Improved GPT Data Through Task-Specific Control and Quality Refinement},
11
16
  author={Junseong Kim and Seolhwa Lee and Jihoon Kwon and Sangmo Gu and Yejin Kim and Minkyung Cho and Jy-yong Sohn and Chanyeol Choi},
@@ -1,14 +1,19 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
 
6
- from mteb.abstasks.task_metadata import TaskMetadata
7
7
  from mteb.models.model_meta import ModelMeta
8
- from mteb.types import BatchedInput, PromptType
9
8
 
10
9
  from .rerankers_custom import RerankerWrapper
11
10
 
11
+ if TYPE_CHECKING:
12
+ from torch.utils.data import DataLoader
13
+
14
+ from mteb.abstasks.task_metadata import TaskMetadata
15
+ from mteb.types import BatchedInput, PromptType
16
+
12
17
  LISTCONRANKER_CITATION = """@article{liu2025listconranker,
13
18
  title={ListConRanker: A Contrastive Text Reranker with Listwise Encoding},
14
19
  author={Liu, Junlong and Ma, Yue and Zhao, Ruihui and Zheng, Junhao and Ma, Qianli and Kang, Yangyang},
@@ -1,15 +1,20 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from torch.utils.data import DataLoader
6
7
  from tqdm.auto import tqdm
7
8
 
8
9
  from mteb._requires_package import requires_image_dependencies, requires_package
9
- from mteb.abstasks.task_metadata import TaskMetadata
10
10
  from mteb.models.abs_encoder import AbsEncoder
11
11
  from mteb.models.model_meta import ModelMeta, ScoringFunction
12
- from mteb.types import Array, BatchedInput, PromptType
12
+
13
+ if TYPE_CHECKING:
14
+ from torch.utils.data import DataLoader
15
+
16
+ from mteb.abstasks.task_metadata import TaskMetadata
17
+ from mteb.types import Array, BatchedInput, PromptType
13
18
 
14
19
  LLM2CLIP_CITATION = """@misc{huang2024llm2clippowerfullanguagemodel,
15
20
  title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
@@ -1,16 +1,22 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from collections.abc import Callable
3
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
4
5
 
5
6
  import torch
6
- from torch.utils.data import DataLoader
7
7
 
8
8
  from mteb._requires_package import requires_package, suggest_package
9
- from mteb.abstasks.task_metadata import TaskMetadata
10
9
  from mteb.models.abs_encoder import AbsEncoder
11
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
12
- from mteb.models.models_protocols import EncoderProtocol
13
- from mteb.types import Array, BatchedInput, PromptType
11
+
12
+ if TYPE_CHECKING:
13
+ from collections.abc import Callable
14
+
15
+ from torch.utils.data import DataLoader
16
+
17
+ from mteb.abstasks.task_metadata import TaskMetadata
18
+ from mteb.models.models_protocols import EncoderProtocol
19
+ from mteb.types import Array, BatchedInput, PromptType
14
20
 
15
21
  logger = logging.getLogger(__name__)
16
22
 
@@ -1,16 +1,19 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  import os
3
5
  import time
4
6
  import warnings
5
- from typing import Any
7
+ from typing import TYPE_CHECKING, Any
6
8
 
7
9
  import numpy as np
8
10
  import requests
9
11
 
10
12
  from mteb.models.abs_encoder import AbsEncoder
11
13
  from mteb.models.model_meta import ModelMeta
12
- from mteb.types import PromptType
13
14
 
15
+ if TYPE_CHECKING:
16
+ from mteb.types import PromptType
14
17
  logger = logging.getLogger(__name__)
15
18
 
16
19
  HAKIM_CITATION = """@article{sarmadi2025hakim,
@@ -1,5 +1,7 @@
1
1
  from mteb.models.model_implementations.arctic_models import arctic_v1_training_datasets
2
- from mteb.models.model_implementations.mxbai_models import mixedbread_training_data
2
+ from mteb.models.model_implementations.mixedbread_ai_models import (
3
+ mixedbread_training_data,
4
+ )
3
5
  from mteb.models.model_meta import ModelMeta
4
6
  from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
5
7
 
@@ -1,3 +1,4 @@
1
+ from mteb.models.model_implementations.pylate_models import MultiVectorModel
1
2
  from mteb.models.model_meta import (
2
3
  ModelMeta,
3
4
  ScoringFunction,
@@ -239,3 +240,93 @@ mxbai_rerank_large_v1 = ModelMeta(
239
240
  }""",
240
241
  contacts=None,
241
242
  )
243
+
244
+ mxbai_edge_colbert_v0_17m = ModelMeta(
245
+ loader=MultiVectorModel,
246
+ name="mixedbread-ai/mxbai-edge-colbert-v0-17m",
247
+ model_type=["late-interaction"],
248
+ languages=["eng-Latn"],
249
+ open_weights=True,
250
+ revision="23ae07f5bf028bc0d1f80c82e6e2dd2311f13a46",
251
+ public_training_code=None,
252
+ public_training_data=None,
253
+ release_date="2025-10-16",
254
+ n_parameters=int(17 * 1e6),
255
+ memory_usage_mb=64,
256
+ max_tokens=7999,
257
+ embed_dim=None,
258
+ license="apache-2.0",
259
+ similarity_fn_name=ScoringFunction.MAX_SIM,
260
+ framework=["PyLate", "ColBERT", "Transformers", "safetensors"],
261
+ reference="https://huggingface.co/mixedbread-ai/mxbai-edge-colbert-v0-17m",
262
+ use_instructions=False,
263
+ adapted_from="https://huggingface.co/jhu-clsp/ettin-encoder-17m",
264
+ superseded_by=None,
265
+ training_datasets={
266
+ "CornStack",
267
+ "MSMARCO",
268
+ "NQ",
269
+ "HotpotQA",
270
+ "AmazonQA",
271
+ "LoTTE",
272
+ "MultiLongDocRetrieval",
273
+ # "FineWeb",
274
+ # "PubMedQA",
275
+ # "TriviaQA",
276
+ },
277
+ citation="""@misc{takehi2025fantasticsmallretrieverstrain,
278
+ title={Fantastic (small) Retrievers and How to Train Them: mxbai-edge-colbert-v0 Tech Report},
279
+ author={Rikiya Takehi and Benjamin Clavié and Sean Lee and Aamir Shakir},
280
+ year={2025},
281
+ eprint={2510.14880},
282
+ archivePrefix={arXiv},
283
+ primaryClass={cs.IR},
284
+ url={https://arxiv.org/abs/2510.14880},
285
+ }""",
286
+ contacts=None,
287
+ )
288
+
289
+ mxbai_edge_colbert_v0_32m = ModelMeta(
290
+ loader=MultiVectorModel,
291
+ name="mixedbread-ai/mxbai-edge-colbert-v0-32m",
292
+ model_type=["late-interaction"],
293
+ languages=["eng-Latn"],
294
+ open_weights=True,
295
+ revision="2f12870a85dae80680b9babc59992c9a2bc59e4a",
296
+ public_training_code=None,
297
+ public_training_data=None,
298
+ release_date="2025-10-16",
299
+ n_parameters=int(32 * 1e6),
300
+ memory_usage_mb=122,
301
+ max_tokens=511,
302
+ embed_dim=None,
303
+ license="apache-2.0",
304
+ similarity_fn_name=ScoringFunction.MAX_SIM,
305
+ framework=["PyLate", "ColBERT", "Transformers", "safetensors"],
306
+ reference="https://huggingface.co/mixedbread-ai/mxbai-edge-colbert-v0-32m",
307
+ use_instructions=False,
308
+ adapted_from="https://huggingface.co/jhu-clsp/ettin-encoder-32m",
309
+ superseded_by=None,
310
+ training_datasets={
311
+ "CornStack",
312
+ "MSMARCO",
313
+ "NQ",
314
+ "HotpotQA",
315
+ "AmazonQA",
316
+ "LoTTE",
317
+ "MultiLongDocRetrieval",
318
+ # "FineWeb",
319
+ # "PubMedQA",
320
+ # "TriviaQA",
321
+ },
322
+ citation="""@misc{takehi2025fantasticsmallretrieverstrain,
323
+ title={Fantastic (small) Retrievers and How to Train Them: mxbai-edge-colbert-v0 Tech Report},
324
+ author={Rikiya Takehi and Benjamin Clavié and Sean Lee and Aamir Shakir},
325
+ year={2025},
326
+ eprint={2510.14880},
327
+ archivePrefix={arXiv},
328
+ primaryClass={cs.IR},
329
+ url={https://arxiv.org/abs/2510.14880},
330
+ }""",
331
+ contacts=None,
332
+ )
@@ -1,14 +1,19 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
  from tqdm.auto import tqdm
6
7
 
7
8
  from mteb._requires_package import requires_image_dependencies, requires_package
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
- from mteb.types import Array, BatchedInput, PromptType
11
+
12
+ if TYPE_CHECKING:
13
+ from torch.utils.data import DataLoader
14
+
15
+ from mteb.abstasks.task_metadata import TaskMetadata
16
+ from mteb.types import Array, BatchedInput, PromptType
12
17
 
13
18
  MOCOV3_CITATION = """@Article{chen2021mocov3,
14
19
  author = {Xinlei Chen* and Saining Xie* and Kaiming He},
@@ -1,6 +1,6 @@
1
1
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
2
2
  from mteb.models.model_meta import ModelMeta
3
- from mteb.models.models_protocols import PromptType
3
+ from mteb.types import PromptType
4
4
 
5
5
 
6
6
  def instruction_template(
@@ -1,17 +1,23 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import numpy as np
5
- from torch.utils.data import DataLoader
6
7
 
7
8
  from mteb._requires_package import requires_package
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
- from mteb.types import Array, BatchedInput, PromptType
12
11
 
13
12
  from .bge_models import bge_training_data
14
13
 
14
+ if TYPE_CHECKING:
15
+ from torch.utils.data import DataLoader
16
+
17
+ from mteb.abstasks.task_metadata import TaskMetadata
18
+ from mteb.types import Array, BatchedInput, PromptType
19
+
20
+
15
21
  logger = logging.getLogger(__name__)
16
22
 
17
23
  MODEL2VEC_CITATION = """@software{minishlab2024model2vec,