mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/models/model_implementations/mixedbread_ai_models.py
@@ -0,0 +1,332 @@
+from mteb.models.model_implementations.pylate_models import MultiVectorModel
+from mteb.models.model_meta import (
+    ModelMeta,
+    ScoringFunction,
+)
+from mteb.models.sentence_transformer_wrapper import (
+    CrossEncoderWrapper,
+    sentence_transformers_loader,
+)
+
+mixedbread_training_data = {
+    # from correspondence:
+    # as mentioned in our blog post
+    # (https://www.mixedbread.com/blog/mxbai-embed-large-v1#built-for-rag-and-real-world-use-cases:~:text=During%20the%20whole,related%20use%20cases.)
+    # We do not train on any data (except the MSMarco training split) of MTEB. We have a strong filtering process to ensure the OOD setting. That's true
+    # for all of our models. Keep up the good work and let me know if you have any questions.
+    "MSMARCO",
+}
+
+mxbai_embed_large_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts={
+            "query": "Represent this sentence for searching relevant passages: "
+        },
+    ),
+    name="mixedbread-ai/mxbai-embed-large-v1",
+    model_type=["dense"],
+    languages=["eng-Latn"],
+    open_weights=True,
+    revision="990580e27d329c7408b3741ecff85876e128e203",
+    release_date="2024-03-07",  # initial commit of hf model.
+    n_parameters=335_000_000,
+    memory_usage_mb=639,
+    max_tokens=512,
+    embed_dim=1024,
+    license="apache-2.0",
+    reference="https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=[
+        "Sentence Transformers",
+        "PyTorch",
+        "ONNX",
+        "safetensors",
+        "GGUF",
+        "Transformers",
+    ],
+    use_instructions=True,
+    citation="""
+@online{emb2024mxbai,
+  title={Open Source Strikes Bread - New Fluffy Embeddings Model},
+  author={Sean Lee and Aamir Shakir and Darius Koenig and Julius Lipp},
+  year={2024},
+  url={https://www.mixedbread.ai/blog/mxbai-embed-large-v1},
+}
+
+@article{li2023angle,
+  title={AnglE-optimized Text Embeddings},
+  author={Li, Xianming and Li, Jing},
+  journal={arXiv preprint arXiv:2309.12871},
+  year={2023}
+}
+""",
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=mixedbread_training_data,
+)
+
+mxbai_embed_2d_large_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="mixedbread-ai/mxbai-embed-2d-large-v1",
+    model_type=["dense"],
+    languages=["eng-Latn"],
+    open_weights=True,
+    revision="7e639ca8e344af398876ead3b19ec3c0b9068f49",
+    release_date="2024-03-04",  # initial commit of hf model.
+    n_parameters=335_000_000,
+    memory_usage_mb=None,
+    max_tokens=512,
+    embed_dim=768,
+    license="apache-2.0",
+    reference="https://huggingface.co/mixedbread-ai/mxbai-embed-2d-large-v1",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=[
+        "Sentence Transformers",
+        "PyTorch",
+        "ONNX",
+        "safetensors",
+        "Transformers",
+    ],
+    use_instructions=True,
+    adapted_from=None,
+    superseded_by=None,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+)
+
+
+mxbai_embed_xsmall_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="mixedbread-ai/mxbai-embed-xsmall-v1",
+    model_type=["dense"],
+    languages=["eng-Latn"],
+    open_weights=True,
+    revision="2f741ec33328bb57e4704e1238fc59a4a5745705",
+    release_date="2024-08-13",  # initial commit of hf model.
+    n_parameters=24_100_000,
+    memory_usage_mb=None,
+    max_tokens=512,
+    embed_dim=384,
+    license="apache-2.0",
+    reference="https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors", "GGUF"],
+    use_instructions=True,
+    adapted_from="sentence-transformers/all-MiniLM-L6-v2",
+    superseded_by=None,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=mixedbread_training_data,
+    citation="""@online{xsmall2024mxbai,
+  title={Every Byte Matters: Introducing mxbai-embed-xsmall-v1},
+  author={Sean Lee and Julius Lipp and Rui Huang and Darius Koenig},
+  year={2024},
+  url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
+}""",
+)
+
+mxbai_rerank_xsmall_v1 = ModelMeta(
+    loader=CrossEncoderWrapper,
+    name="mixedbread-ai/mxbai-rerank-xsmall-v1",
+    revision="b5c6e9da73abc3711f593f705371cdbe9e0fe422",
+    release_date="2024-02-29",
+    languages=["eng-Latn"],
+    n_parameters=70830337,
+    memory_usage_mb=135.0,
+    max_tokens=512,
+    embed_dim=None,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=[
+        "PyTorch",
+        "Sentence Transformers",
+        "Transformers",
+        "ONNX",
+        "safetensors",
+    ],
+    reference="https://huggingface.co/mixedbread-ai/mxbai-rerank-xsmall-v1",
+    similarity_fn_name=None,
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    model_type=["cross-encoder"],
+    citation="""@online{rerank2024mxbai,
+  title={Boost Your Search With The Crispy Mixedbread Rerank Models},
+  author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
+  year={2024},
+  url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
+}""",
+    contacts=None,
+)
+
+mxbai_rerank_base_v1 = ModelMeta(
+    loader=CrossEncoderWrapper,
+    name="mixedbread-ai/mxbai-rerank-base-v1",
+    revision="800f24c113213a187e65bde9db00c15a2bb12738",
+    release_date="2024-02-29",
+    languages=["eng-Latn"],
+    n_parameters=184422913,
+    memory_usage_mb=352.0,
+    max_tokens=512,
+    embed_dim=None,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=[
+        "PyTorch",
+        "Sentence Transformers",
+        "Transformers",
+        "ONNX",
+        "safetensors",
+    ],
+    reference="https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1",
+    similarity_fn_name=None,
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    model_type=["cross-encoder"],
+    citation="""@online{rerank2024mxbai,
+  title={Boost Your Search With The Crispy Mixedbread Rerank Models},
+  author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
+  year={2024},
+  url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
+}""",
+    contacts=None,
+)
+
+mxbai_rerank_large_v1 = ModelMeta(
+    loader=CrossEncoderWrapper,
+    name="mixedbread-ai/mxbai-rerank-large-v1",
+    revision="98f655841d5caf0b16eaff79c2b4ca109d920d17",
+    release_date="2024-02-29",
+    languages=["eng-Latn"],
+    n_parameters=435062785,
+    memory_usage_mb=830.0,
+    max_tokens=512,
+    embed_dim=None,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=[
+        "PyTorch",
+        "Sentence Transformers",
+        "Transformers",
+        "ONNX",
+        "safetensors",
+    ],
+    reference="https://huggingface.co/mixedbread-ai/mxbai-rerank-large-v1",
+    similarity_fn_name=None,
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    model_type=["cross-encoder"],
+    citation="""@online{rerank2024mxbai,
+  title={Boost Your Search With The Crispy Mixedbread Rerank Models},
+  author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
+  year={2024},
+  url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
+}""",
+    contacts=None,
+)
+
+mxbai_edge_colbert_v0_17m = ModelMeta(
+    loader=MultiVectorModel,
+    name="mixedbread-ai/mxbai-edge-colbert-v0-17m",
+    model_type=["late-interaction"],
+    languages=["eng-Latn"],
+    open_weights=True,
+    revision="23ae07f5bf028bc0d1f80c82e6e2dd2311f13a46",
+    public_training_code=None,
+    public_training_data=None,
+    release_date="2025-10-16",
+    n_parameters=int(17 * 1e6),
+    memory_usage_mb=64,
+    max_tokens=7999,
+    embed_dim=None,
+    license="apache-2.0",
+    similarity_fn_name=ScoringFunction.MAX_SIM,
+    framework=["PyLate", "ColBERT", "Transformers", "safetensors"],
+    reference="https://huggingface.co/mixedbread-ai/mxbai-edge-colbert-v0-17m",
+    use_instructions=False,
+    adapted_from="https://huggingface.co/jhu-clsp/ettin-encoder-17m",
+    superseded_by=None,
+    training_datasets={
+        "CornStack",
+        "MSMARCO",
+        "NQ",
+        "HotpotQA",
+        "AmazonQA",
+        "LoTTE",
+        "MultiLongDocRetrieval",
+        # "FineWeb",
+        # "PubMedQA",
+        # "TriviaQA",
+    },
+    citation="""@misc{takehi2025fantasticsmallretrieverstrain,
+  title={Fantastic (small) Retrievers and How to Train Them: mxbai-edge-colbert-v0 Tech Report},
+  author={Rikiya Takehi and Benjamin Clavié and Sean Lee and Aamir Shakir},
+  year={2025},
+  eprint={2510.14880},
+  archivePrefix={arXiv},
+  primaryClass={cs.IR},
+  url={https://arxiv.org/abs/2510.14880},
+}""",
+    contacts=None,
+)
+
+mxbai_edge_colbert_v0_32m = ModelMeta(
+    loader=MultiVectorModel,
+    name="mixedbread-ai/mxbai-edge-colbert-v0-32m",
+    model_type=["late-interaction"],
+    languages=["eng-Latn"],
+    open_weights=True,
+    revision="2f12870a85dae80680b9babc59992c9a2bc59e4a",
+    public_training_code=None,
+    public_training_data=None,
+    release_date="2025-10-16",
+    n_parameters=int(32 * 1e6),
+    memory_usage_mb=122,
+    max_tokens=511,
+    embed_dim=None,
+    license="apache-2.0",
+    similarity_fn_name=ScoringFunction.MAX_SIM,
+    framework=["PyLate", "ColBERT", "Transformers", "safetensors"],
+    reference="https://huggingface.co/mixedbread-ai/mxbai-edge-colbert-v0-32m",
+    use_instructions=False,
+    adapted_from="https://huggingface.co/jhu-clsp/ettin-encoder-32m",
+    superseded_by=None,
+    training_datasets={
+        "CornStack",
+        "MSMARCO",
+        "NQ",
+        "HotpotQA",
+        "AmazonQA",
+        "LoTTE",
+        "MultiLongDocRetrieval",
+        # "FineWeb",
+        # "PubMedQA",
+        # "TriviaQA",
+    },
+    citation="""@misc{takehi2025fantasticsmallretrieverstrain,
+  title={Fantastic (small) Retrievers and How to Train Them: mxbai-edge-colbert-v0 Tech Report},
+  author={Rikiya Takehi and Benjamin Clavié and Sean Lee and Aamir Shakir},
+  year={2025},
+  eprint={2510.14880},
+  archivePrefix={arXiv},
+  primaryClass={cs.IR},
+  url={https://arxiv.org/abs/2510.14880},
+}""",
+    contacts=None,
+)
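
For orientation only (not part of the diff): a minimal sketch of how the entries registered above are typically consumed, assuming mteb's top-level get_model_meta/get_model helpers (get_model_meta lives in mteb/models/get_model_meta.py, which this release also touches).

import mteb

# Resolve the registered metadata by model name.
meta = mteb.get_model_meta("mixedbread-ai/mxbai-embed-large-v1")
print(meta.revision)   # pinned revision from the ModelMeta entry above
print(meta.embed_dim)  # 1024

# Instantiating the encoder dispatches to the configured loader
# (sentence_transformers_loader here), so sentence-transformers must be installed.
model = mteb.get_model("mixedbread-ai/mxbai-embed-large-v1")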
mteb/models/model_implementations/mme5_models.py
@@ -25,7 +25,7 @@ mme5_mllama = ModelMeta(
     open_weights=True,
     public_training_code=None,
     public_training_data="https://huggingface.co/datasets/intfloat/mmE5-MMEB-hardneg, https://huggingface.co/datasets/intfloat/mmE5-synthetic",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     reference="https://huggingface.co/intfloat/mmE5-mllama-11b-instruct",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=True,
mteb/models/model_implementations/moco_models.py
@@ -117,7 +117,7 @@ mocov3_training_datasets = set(
 )
 
 mocov3_vit_base = ModelMeta(
-    loader=mocov3_loader,  # type: ignore
+    loader=mocov3_loader,
     name="nyu-visionx/moco-v3-vit-b",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -132,7 +132,7 @@ mocov3_vit_base = ModelMeta(
     open_weights=True,
     public_training_code="https://github.com/facebookresearch/moco-v3",
     public_training_data=None,
-    framework=["PyTorch"],
+    framework=["PyTorch", "Transformers", "safetensors"],
     reference="https://github.com/facebookresearch/moco-v3",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=False,
@@ -141,7 +141,7 @@ mocov3_vit_base = ModelMeta(
 )
 
 mocov3_vit_large = ModelMeta(
-    loader=mocov3_loader,  # type: ignore
+    loader=mocov3_loader,
     name="nyu-visionx/moco-v3-vit-l",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -156,7 +156,7 @@ mocov3_vit_large = ModelMeta(
     open_weights=True,
     public_training_code="https://github.com/facebookresearch/moco-v3",
     public_training_data=None,
-    framework=["PyTorch"],
+    framework=["PyTorch", "Transformers", "safetensors"],
     reference="https://github.com/facebookresearch/moco-v3",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=False,
mteb/models/model_implementations/mod_models.py
@@ -181,7 +181,7 @@ MoD_Embedding = ModelMeta(
     license="apache-2.0",
     reference="https://huggingface.co/bflhc/MoD-Embedding",
     similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
mteb/models/model_implementations/model2vec_models.py
@@ -139,7 +139,7 @@ class Model2VecModel(AbsEncoder):
             **kwargs: Additional arguments to pass to the wrapper.
         """
         requires_package(self, "model2vec", model_name, "pip install 'mteb[model2vec]'")
-        from model2vec import StaticModel  # type: ignore
+        from model2vec import StaticModel
 
         self.model_name = model_name
         self.model = StaticModel.from_pretrained(self.model_name)
@@ -172,7 +172,7 @@ m2v_base_glove_subword = ModelMeta(
     embed_dim=256,
     license="mit",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["NumPy", "Sentence Transformers"],
+    framework=["NumPy", "Sentence Transformers", "ONNX", "safetensors"],
     reference="https://huggingface.co/minishlab/M2V_base_glove_subword",
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
@@ -198,7 +198,7 @@ m2v_base_glove = ModelMeta(
     embed_dim=256,
     license="mit",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["NumPy", "Sentence Transformers"],
+    framework=["NumPy", "Sentence Transformers", "safetensors"],
     reference="https://huggingface.co/minishlab/M2V_base_glove",
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
@@ -223,7 +223,7 @@ m2v_base_output = ModelMeta(
     embed_dim=256,
     license="mit",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["NumPy", "Sentence Transformers"],
+    framework=["NumPy", "Sentence Transformers", "ONNX", "safetensors"],
     reference="https://huggingface.co/minishlab/M2V_base_output",
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
@@ -248,7 +248,7 @@ m2v_multilingual_output = ModelMeta(
     embed_dim=256,
     license="mit",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["NumPy", "Sentence Transformers"],
+    framework=["NumPy", "Sentence Transformers", "ONNX", "safetensors"],
     reference="https://huggingface.co/minishlab/M2V_multilingual_output",
     use_instructions=False,
     adapted_from="sentence-transformers/LaBSE",
@@ -273,7 +273,7 @@ potion_base_2m = ModelMeta(
     embed_dim=64,
     license="mit",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["NumPy", "Sentence Transformers"],
+    framework=["NumPy", "Sentence Transformers", "ONNX", "safetensors"],
     reference="https://huggingface.co/minishlab/potion-base-2M",
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
@@ -298,7 +298,7 @@ potion_base_4m = ModelMeta(
     embed_dim=128,
     license="mit",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["NumPy", "Sentence Transformers"],
+    framework=["NumPy", "Sentence Transformers", "ONNX", "safetensors"],
     reference="https://huggingface.co/minishlab/potion-base-4M",
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
@@ -323,7 +323,7 @@ potion_base_8m = ModelMeta(
     embed_dim=256,
     license="mit",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["NumPy", "Sentence Transformers"],
+    framework=["NumPy", "Sentence Transformers", "ONNX", "safetensors"],
     reference="https://huggingface.co/minishlab/potion-base-8M",
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
@@ -348,7 +348,7 @@ potion_multilingual_128m = ModelMeta(
     embed_dim=256,
     license="mit",
     similarity_fn_name="cosine",
-    framework=["NumPy"],
+    framework=["NumPy", "ONNX", "safetensors", "Sentence Transformers"],
     reference="https://huggingface.co/minishlab/potion-multilingual-128M",
     use_instructions=False,
     adapted_from="BAAI/bge-m3",
@@ -373,7 +373,7 @@ pubmed_bert_100k = ModelMeta(
     embed_dim=64,
     license="apache-2.0",
     similarity_fn_name="cosine",
-    framework=["NumPy"],
+    framework=["NumPy", "Sentence Transformers", "safetensors", "Transformers"],
     reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-100K",
     use_instructions=False,
     adapted_from="NeuML/pubmedbert-base-embeddings",
@@ -397,7 +397,7 @@ pubmed_bert_500k = ModelMeta(
     embed_dim=64,
     license="apache-2.0",
     similarity_fn_name="cosine",
-    framework=["NumPy"],
+    framework=["NumPy", "Sentence Transformers", "safetensors", "Transformers"],
     reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-500K",
     use_instructions=False,
     adapted_from="NeuML/pubmedbert-base-embeddings",
@@ -421,7 +421,7 @@ pubmed_bert_1m = ModelMeta(
     embed_dim=64,
     license="apache-2.0",
     similarity_fn_name="cosine",
-    framework=["NumPy"],
+    framework=["NumPy", "Sentence Transformers", "safetensors", "Transformers"],
     reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-1M",
     use_instructions=False,
     adapted_from="NeuML/pubmedbert-base-embeddings",
@@ -445,7 +445,7 @@ pubmed_bert_2m = ModelMeta(
     embed_dim=64,
     license="apache-2.0",
     similarity_fn_name="cosine",
-    framework=["NumPy"],
+    framework=["NumPy", "Sentence Transformers", "safetensors", "Transformers"],
     reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-2M",
     use_instructions=False,
     adapted_from="NeuML/pubmedbert-base-embeddings",
@@ -469,7 +469,7 @@ pubmed_bert_8m = ModelMeta(
     embed_dim=256,
     license="apache-2.0",
     similarity_fn_name="cosine",
-    framework=["NumPy"],
+    framework=["NumPy", "Sentence Transformers", "safetensors", "Transformers"],
     reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-8M",
     use_instructions=False,
     adapted_from="NeuML/pubmedbert-base-embeddings",
mteb/models/model_implementations/moka_models.py
@@ -104,7 +104,7 @@ m3e_base = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/moka-ai/m3e-base",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
mteb/models/model_implementations/nbailab.py
@@ -18,7 +18,7 @@ nb_sbert = ModelMeta(
     max_tokens=75,
     reference="https://huggingface.co/NbAiLab/nb-sbert-base",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=False,
     public_training_code=None,
     public_training_data="https://huggingface.co/datasets/NbAiLab/mnli-norwegian",
@@ -40,7 +40,7 @@ nb_bert_large = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/NbAiLab/nb-bert-large",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     use_instructions=False,
     public_training_code=None,
     public_training_data="https://huggingface.co/NbAiLab/nb-bert-large#training-data",
@@ -62,7 +62,7 @@ nb_bert_base = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/NbAiLab/nb-bert-base",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     use_instructions=False,
     public_training_code=None,
     public_training_data="https://huggingface.co/NbAiLab/nb-bert-base#training-data",
mteb/models/model_implementations/no_instruct_sentence_models.py
@@ -30,13 +30,13 @@ class NoInstructModel(AbsEncoder):
         self,
         model_name: str,
         revision: str,
+        device: str | None = None,
         model_prompts: dict[str, str] | None = None,
         **kwargs: Any,
     ):
         from transformers import AutoModel, AutoTokenizer
 
         self.model_name = model_name
-        device = kwargs.pop("device", None)
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.model = AutoModel.from_pretrained(
             model_name, revision=revision, **kwargs
@@ -109,7 +109,7 @@ no_instruct_small_v0 = ModelMeta(
     license="mit",
     reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["PyTorch"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors", "Transformers"],
     use_instructions=False,
     adapted_from=None,
     superseded_by=None,
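
The NoInstructModel hunk above makes device an explicit keyword argument instead of popping it out of **kwargs, while keeping the same CUDA-else-CPU fallback. A minimal call sketch under that assumption (the revision below is a placeholder, not taken from the diff):

# device is now part of the documented signature; passing None keeps the
# fallback: "cuda" when available, otherwise "cpu".
model = NoInstructModel(
    model_name="avsolatorio/NoInstruct-small-Embedding-v0",
    revision="<pinned-revision>",  # placeholder
    device=None,
)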