mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241) hide show
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -298,7 +298,7 @@ jasper_en_v1 = ModelMeta(
298
298
  license="apache-2.0",
299
299
  reference="https://huggingface.co/infgrad/jasper_en_vision_language_v1",
300
300
  similarity_fn_name=ScoringFunction.COSINE,
301
- framework=["Sentence Transformers", "PyTorch"],
301
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
302
302
  use_instructions=True,
303
303
  adapted_from=None,
304
304
  superseded_by=None,
@@ -345,7 +345,7 @@ Jasper_Token_Compression_600M = ModelMeta(
345
345
  max_tokens=32768,
346
346
  reference="https://huggingface.co/infgrad/Jasper-Token-Compression-600M",
347
347
  similarity_fn_name="cosine",
348
- framework=["Sentence Transformers", "PyTorch"],
348
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
349
349
  use_instructions=True,
350
350
  public_training_code="https://github.com/DunZhang/Jasper-Token-Compression-Training",
351
351
  # public_training_data: unsupervised data for distillation
@@ -7,6 +7,7 @@ from tqdm.auto import tqdm
7
7
  from mteb._requires_package import requires_image_dependencies
8
8
  from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
+ from mteb.models.model_implementations.colpali_models import COLPALI_TRAINING_DATA
10
11
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
12
  from mteb.types import Array, BatchedInput, PromptType
12
13
 
@@ -120,8 +121,17 @@ class JinaCLIPModel(AbsEncoder):
120
121
  raise ValueError
121
122
 
122
123
 
124
+ _JINA_CLIP_TRAIN_DATASETS_V1 = {
125
+ # LAION400M
126
+ # ShareGPT4V
127
+ "MSMARCO",
128
+ "NQ",
129
+ "HotpotQA",
130
+ # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
131
+ }
132
+
123
133
  jina_clip_v1 = ModelMeta(
124
- loader=JinaCLIPModel, # type: ignore
134
+ loader=JinaCLIPModel,
125
135
  name="jinaai/jina-clip-v1",
126
136
  model_type=["dense"],
127
137
  languages=["eng-Latn"],
@@ -136,17 +146,45 @@ jina_clip_v1 = ModelMeta(
136
146
  open_weights=True,
137
147
  public_training_code=None,
138
148
  public_training_data=None,
139
- framework=["PyTorch"],
149
+ framework=["PyTorch", "Transformers", "ONNX", "safetensors"],
140
150
  reference="https://huggingface.co/jinaai/jina-clip-v1",
141
151
  similarity_fn_name=ScoringFunction.COSINE,
142
152
  use_instructions=True,
143
- training_datasets={
144
- # LAION400M
145
- # ShareGPT4V
146
- "MSMARCO",
147
- # NQ
148
- # HotpotQA
149
- # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
150
- },
153
+ training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1,
151
154
  citation=JINA_CLIP_CITATION,
155
+ superseded_by="jinaai/jina-clip-v2",
156
+ )
157
+
158
+ jina_clip_v2 = ModelMeta(
159
+ loader=JinaCLIPModel,
160
+ name="jinaai/jina-clip-v2",
161
+ revision="344d954da76eb8ad47a7aaff42d012e30c15b8fe",
162
+ release_date="2024-10-09",
163
+ languages=["eng-Latn"],
164
+ n_parameters=865278477,
165
+ memory_usage_mb=1650.0,
166
+ max_tokens=8192,
167
+ embed_dim=1024,
168
+ license="cc-by-nc-4.0",
169
+ open_weights=True,
170
+ public_training_code=None,
171
+ public_training_data=None,
172
+ framework=["PyTorch", "Sentence Transformers"],
173
+ reference="https://huggingface.co/jinaai/jina-clip-v2",
174
+ similarity_fn_name=ScoringFunction.COSINE,
175
+ use_instructions=False,
176
+ training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA,
177
+ modalities=["text", "image"],
178
+ model_type=["dense"],
179
+ citation="""
180
+ @misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
181
+ title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
182
+ author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
183
+ year={2024},
184
+ eprint={2412.08802},
185
+ archivePrefix={arXiv},
186
+ primaryClass={cs.CL},
187
+ url={https://arxiv.org/abs/2412.08802},
188
+ }
189
+ """,
152
190
  )
@@ -257,6 +257,7 @@ class JinaRerankerV3Wrapper(CrossEncoderWrapper):
257
257
  self,
258
258
  model: CrossEncoder | str,
259
259
  revision: str | None = None,
260
+ device: str | None = None,
260
261
  trust_remote_code: bool = True,
261
262
  **kwargs: Any,
262
263
  ) -> None:
@@ -267,10 +268,7 @@ class JinaRerankerV3Wrapper(CrossEncoderWrapper):
267
268
  model, trust_remote_code=trust_remote_code, dtype="auto"
268
269
  )
269
270
 
270
- device = kwargs.get("device", None)
271
- if device is None:
272
- device = get_device_name()
273
- logger.info(f"Use pytorch device: {device}")
271
+ device = device or get_device_name()
274
272
 
275
273
  self.model.to(device)
276
274
  self.model.eval()
@@ -320,6 +318,7 @@ class JinaWrapper(SentenceTransformerEncoderWrapper):
320
318
  self,
321
319
  model: str,
322
320
  revision: str,
321
+ device: str | None = None,
323
322
  model_prompts: dict[str, str] | None = None,
324
323
  **kwargs,
325
324
  ) -> None:
@@ -339,7 +338,9 @@ class JinaWrapper(SentenceTransformerEncoderWrapper):
339
338
  )
340
339
  import flash_attn # noqa: F401
341
340
 
342
- super().__init__(model, revision, model_prompts, **kwargs)
341
+ super().__init__(
342
+ model, revision, device=device, model_prompts=model_prompts, **kwargs
343
+ )
343
344
 
344
345
  def encode(
345
346
  self,
@@ -732,7 +733,7 @@ jina_reranker_v3 = ModelMeta(
732
733
  embed_dim=None,
733
734
  license="cc-by-nc-4.0",
734
735
  similarity_fn_name=None,
735
- framework=["PyTorch"],
736
+ framework=["PyTorch", "Transformers", "safetensors"],
736
737
  use_instructions=None,
737
738
  reference="https://huggingface.co/jinaai/jina-reranker-v3",
738
739
  public_training_code=None,
@@ -775,7 +776,7 @@ jina_embeddings_v4 = ModelMeta(
775
776
  embed_dim=2048,
776
777
  license="cc-by-nc-4.0",
777
778
  similarity_fn_name="cosine",
778
- framework=["Sentence Transformers", "PyTorch"],
779
+ framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
779
780
  use_instructions=True,
780
781
  reference="https://huggingface.co/jinaai/jina-embeddings-v4",
781
782
  public_training_code=None,
@@ -795,7 +796,7 @@ jina_embeddings_v4 = ModelMeta(
795
796
 
796
797
 
797
798
  jina_embeddings_v3 = ModelMeta(
798
- loader=JinaWrapper, # type: ignore
799
+ loader=JinaWrapper,
799
800
  loader_kwargs=dict(
800
801
  trust_remote_code=True,
801
802
  model_prompts={
@@ -823,7 +824,13 @@ jina_embeddings_v3 = ModelMeta(
823
824
  embed_dim=1024,
824
825
  license="cc-by-nc-4.0",
825
826
  similarity_fn_name=ScoringFunction.COSINE,
826
- framework=["Sentence Transformers", "PyTorch"],
827
+ framework=[
828
+ "Sentence Transformers",
829
+ "PyTorch",
830
+ "Transformers",
831
+ "ONNX",
832
+ "safetensors",
833
+ ],
827
834
  use_instructions=True,
828
835
  reference="https://huggingface.co/jinaai/jina-embeddings-v3",
829
836
  public_training_code=None,
@@ -878,7 +885,7 @@ jina_embeddings_v2_base_en = ModelMeta(
878
885
  max_tokens=8192,
879
886
  reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en",
880
887
  similarity_fn_name=ScoringFunction.COSINE,
881
- framework=["Sentence Transformers", "PyTorch"],
888
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
882
889
  use_instructions=False,
883
890
  superseded_by=None,
884
891
  adapted_from="jina-bert-base-en-v1", # pretrained on C4 with Alibi to support longer context.
@@ -942,7 +949,7 @@ jina_embeddings_v2_small_en = ModelMeta(
942
949
  max_tokens=8192,
943
950
  reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en",
944
951
  similarity_fn_name=ScoringFunction.COSINE,
945
- framework=["Sentence Transformers", "PyTorch"],
952
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
946
953
  use_instructions=False,
947
954
  superseded_by=None,
948
955
  adapted_from="jina-bert-smalll-en-v1", # pretrained on C4 with Alibi to support longer context
@@ -16,16 +16,22 @@ sbert_swedish = ModelMeta(
16
16
  max_tokens=384,
17
17
  reference="https://huggingface.co/KBLab/sentence-bert-swedish-cased",
18
18
  similarity_fn_name=ScoringFunction.COSINE,
19
- framework=["Sentence Transformers", "PyTorch"],
19
+ framework=[
20
+ "Sentence Transformers",
21
+ "PyTorch",
22
+ "safetensors",
23
+ "GGUF",
24
+ "Transformers",
25
+ ],
20
26
  use_instructions=False,
21
27
  public_training_code=None,
22
28
  public_training_data=None,
23
29
  training_datasets=None,
24
30
  adapted_from="sentence-transformers/all-mpnet-base-v2",
25
- citation="""@misc{rekathati2021introducing,
26
- author = {Rekathati, Faton},
27
- title = {The KBLab Blog: Introducing a Swedish Sentence Transformer},
28
- url = {https://kb-labb.github.io/posts/2021-08-23-a-swedish-sentence-transformer/},
29
- year = {2021}
31
+ citation="""@misc{rekathati2021introducing,
32
+ author = {Rekathati, Faton},
33
+ title = {The KBLab Blog: Introducing a Swedish Sentence Transformer},
34
+ url = {https://kb-labb.github.io/posts/2021-08-23-a-swedish-sentence-transformer/},
35
+ year = {2021}
30
36
  }""",
31
37
  )
@@ -4,7 +4,7 @@ from mteb.models.sentence_transformer_wrapper import (
4
4
  )
5
5
 
6
6
  dfm_enc_large = ModelMeta(
7
- loader=sentence_transformers_loader, # type: ignore
7
+ loader=sentence_transformers_loader,
8
8
  name="KennethEnevoldsen/dfm-sentence-encoder-large",
9
9
  model_type=["dense"],
10
10
  languages=["dan-Latn"],
@@ -18,7 +18,7 @@ dfm_enc_large = ModelMeta(
18
18
  max_tokens=512,
19
19
  reference="https://huggingface.co/KennethEnevoldsen/dfm-sentence-encoder-large",
20
20
  similarity_fn_name=ScoringFunction.COSINE,
21
- framework=["Sentence Transformers", "PyTorch"],
21
+ framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
22
22
  use_instructions=False,
23
23
  superseded_by=None,
24
24
  adapted_from="chcaa/dfm-encoder-large-v1",
@@ -39,7 +39,7 @@ dfm_enc_large = ModelMeta(
39
39
  )
40
40
 
41
41
  dfm_enc_med = ModelMeta(
42
- loader=sentence_transformers_loader, # type: ignore
42
+ loader=sentence_transformers_loader,
43
43
  name="KennethEnevoldsen/dfm-sentence-encoder-medium",
44
44
  model_type=["dense"],
45
45
  languages=["dan-Latn"],
@@ -53,7 +53,7 @@ dfm_enc_med = ModelMeta(
53
53
  max_tokens=512,
54
54
  reference="https://huggingface.co/KennethEnevoldsen/dfm-sentence-encoder-medium",
55
55
  similarity_fn_name=ScoringFunction.COSINE,
56
- framework=["Sentence Transformers", "PyTorch"],
56
+ framework=["Sentence Transformers", "PyTorch", "Transformers"],
57
57
  use_instructions=False,
58
58
  superseded_by=None,
59
59
  adapted_from=None,
@@ -16,7 +16,7 @@ xlmr_scandi = ModelMeta(
16
16
  max_tokens=512,
17
17
  reference="https://huggingface.co/KFST/XLMRoberta-en-da-sv-nb",
18
18
  similarity_fn_name=ScoringFunction.COSINE,
19
- framework=["Sentence Transformers", "PyTorch"],
19
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
20
20
  use_instructions=False,
21
21
  public_training_code=None,
22
22
  public_training_data=None,
@@ -15,7 +15,7 @@ kowshik24_bangla_embedding_model = ModelMeta(
15
15
  max_tokens=128,
16
16
  reference="https://huggingface.co/Kowshik24/bangla-sentence-transformer-ft-matryoshka-paraphrase-multilingual-mpnet-base-v2",
17
17
  similarity_fn_name="cosine",
18
- framework=["Sentence Transformers", "PyTorch"],
18
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
19
19
  use_instructions=False,
20
20
  public_training_code="https://github.com/kowshik24/Bangla-Embedding",
21
21
  public_training_data="https://huggingface.co/datasets/sartajekram/BanglaRQA",
@@ -58,7 +58,7 @@ lgai_embedding_en = ModelMeta(
58
58
  max_tokens=32768,
59
59
  reference="https://huggingface.co/annamodels/LGAI-Embedding-Preview",
60
60
  similarity_fn_name="cosine",
61
- framework=["Sentence Transformers", "PyTorch"],
61
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
62
62
  use_instructions=True,
63
63
  public_training_code=None,
64
64
  public_training_data=None,
@@ -44,7 +44,7 @@ Linq_Embed_Mistral = ModelMeta(
44
44
  max_tokens=32768,
45
45
  reference="https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral",
46
46
  similarity_fn_name=ScoringFunction.COSINE,
47
- framework=["Sentence Transformers", "PyTorch"],
47
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
48
48
  use_instructions=True,
49
49
  public_training_code=None,
50
50
  public_training_data=None,
@@ -125,7 +125,7 @@ listconranker = ModelMeta(
125
125
  license="mit",
126
126
  max_tokens=512,
127
127
  reference="https://huggingface.co/ByteDance/ListConRanker",
128
- framework=["PyTorch"],
128
+ framework=["PyTorch", "Sentence Transformers", "safetensors", "Transformers"],
129
129
  use_instructions=False,
130
130
  public_training_code=None,
131
131
  public_training_data=None,
@@ -181,7 +181,7 @@ llm2clip_training_sets = set(
181
181
  )
182
182
 
183
183
  llm2clip_openai_l_14_336 = ModelMeta(
184
- loader=llm2clip_loader, # type: ignore
184
+ loader=llm2clip_loader,
185
185
  name="microsoft/LLM2CLIP-Openai-L-14-336",
186
186
  model_type=["dense"],
187
187
  languages=["eng-Latn"],
@@ -196,7 +196,7 @@ llm2clip_openai_l_14_336 = ModelMeta(
196
196
  open_weights=True,
197
197
  public_training_code="https://github.com/microsoft/LLM2CLIP",
198
198
  public_training_data=None,
199
- framework=["PyTorch"],
199
+ framework=["PyTorch", "safetensors"],
200
200
  reference="https://huggingface.co/microsoft/LLM2CLIP-Openai-L-14-336",
201
201
  similarity_fn_name=ScoringFunction.COSINE,
202
202
  use_instructions=True,
@@ -206,7 +206,7 @@ llm2clip_openai_l_14_336 = ModelMeta(
206
206
 
207
207
  # NOTE: https://huggingface.co/microsoft/LLM2CLIP-Openai-L-14-224/discussions/1
208
208
  llm2clip_openai_l_14_224 = ModelMeta(
209
- loader=llm2clip_loader, # type: ignore
209
+ loader=llm2clip_loader,
210
210
  name="microsoft/LLM2CLIP-Openai-L-14-224",
211
211
  model_type=["dense"],
212
212
  languages=["eng-Latn"],
@@ -221,7 +221,7 @@ llm2clip_openai_l_14_224 = ModelMeta(
221
221
  open_weights=True,
222
222
  public_training_code="https://github.com/microsoft/LLM2CLIP",
223
223
  public_training_data=None,
224
- framework=["PyTorch"],
224
+ framework=["PyTorch", "safetensors"],
225
225
  reference="https://huggingface.co/microsoft/LLM2CLIP-Openai-L-14-224",
226
226
  similarity_fn_name=ScoringFunction.COSINE,
227
227
  use_instructions=True,
@@ -230,7 +230,7 @@ llm2clip_openai_l_14_224 = ModelMeta(
230
230
  )
231
231
 
232
232
  llm2clip_openai_b_16 = ModelMeta(
233
- loader=llm2clip_loader, # type: ignore
233
+ loader=llm2clip_loader,
234
234
  name="microsoft/LLM2CLIP-Openai-B-16",
235
235
  model_type=["dense"],
236
236
  languages=["eng-Latn"],
@@ -245,7 +245,7 @@ llm2clip_openai_b_16 = ModelMeta(
245
245
  open_weights=True,
246
246
  public_training_code="https://github.com/microsoft/LLM2CLIP",
247
247
  public_training_data=None,
248
- framework=["PyTorch"],
248
+ framework=["PyTorch", "safetensors"],
249
249
  reference="https://huggingface.co/microsoft/LLM2CLIP-Openai-B-16",
250
250
  similarity_fn_name=ScoringFunction.COSINE,
251
251
  use_instructions=True,
@@ -145,7 +145,7 @@ llm2vec_llama3_8b_supervised = ModelMeta(
145
145
  license="mit",
146
146
  reference="https://huggingface.co/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
147
147
  similarity_fn_name=ScoringFunction.COSINE,
148
- framework=["LLM2Vec", "PyTorch"],
148
+ framework=["LLM2Vec", "PyTorch", "safetensors"],
149
149
  use_instructions=True,
150
150
  public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
151
151
  training_datasets=llm2vec_supervised_training_data,
@@ -174,7 +174,7 @@ llm2vec_llama3_8b_unsupervised = ModelMeta(
174
174
  license="mit",
175
175
  reference="https://huggingface.co/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse",
176
176
  similarity_fn_name=ScoringFunction.COSINE,
177
- framework=["LLM2Vec", "PyTorch"],
177
+ framework=["LLM2Vec", "PyTorch", "safetensors"],
178
178
  use_instructions=True,
179
179
  citation=LLM2VEC_CITATION,
180
180
  public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
@@ -203,7 +203,7 @@ llm2vec_mistral7b_supervised = ModelMeta(
203
203
  license="mit",
204
204
  reference="https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised",
205
205
  similarity_fn_name=ScoringFunction.COSINE,
206
- framework=["LLM2Vec", "PyTorch"],
206
+ framework=["LLM2Vec", "PyTorch", "safetensors"],
207
207
  use_instructions=True,
208
208
  citation=LLM2VEC_CITATION,
209
209
  public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
@@ -232,7 +232,7 @@ llm2vec_mistral7b_unsupervised = ModelMeta(
232
232
  license="mit",
233
233
  reference="https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse",
234
234
  similarity_fn_name=ScoringFunction.COSINE,
235
- framework=["LLM2Vec", "PyTorch"],
235
+ framework=["LLM2Vec", "PyTorch", "safetensors"],
236
236
  use_instructions=True,
237
237
  citation=LLM2VEC_CITATION,
238
238
  public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
@@ -261,7 +261,7 @@ llm2vec_llama2_7b_supervised = ModelMeta(
261
261
  license="mit",
262
262
  reference="https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised",
263
263
  similarity_fn_name=ScoringFunction.COSINE,
264
- framework=["LLM2Vec", "PyTorch"],
264
+ framework=["LLM2Vec", "PyTorch", "safetensors"],
265
265
  use_instructions=True,
266
266
  citation=LLM2VEC_CITATION,
267
267
  public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
@@ -290,7 +290,7 @@ llm2vec_llama2_7b_unsupervised = ModelMeta(
290
290
  license="mit",
291
291
  reference="https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse",
292
292
  similarity_fn_name=ScoringFunction.COSINE,
293
- framework=["LLM2Vec", "PyTorch"],
293
+ framework=["LLM2Vec", "PyTorch", "safetensors"],
294
294
  use_instructions=True,
295
295
  public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
296
296
  training_datasets=set(),
@@ -319,7 +319,7 @@ llm2vec_sheared_llama_supervised = ModelMeta(
319
319
  license="mit",
320
320
  reference="https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised",
321
321
  similarity_fn_name=ScoringFunction.COSINE,
322
- framework=["LLM2Vec", "PyTorch"],
322
+ framework=["LLM2Vec", "PyTorch", "safetensors"],
323
323
  use_instructions=True,
324
324
  citation=LLM2VEC_CITATION,
325
325
  public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
@@ -348,7 +348,7 @@ llm2vec_sheared_llama_unsupervised = ModelMeta(
348
348
  license="mit",
349
349
  reference="https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse",
350
350
  similarity_fn_name=ScoringFunction.COSINE,
351
- framework=["LLM2Vec", "PyTorch"],
351
+ framework=["LLM2Vec", "PyTorch", "safetensors"],
352
352
  use_instructions=True,
353
353
  citation=LLM2VEC_CITATION,
354
354
  public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import os
3
3
  import time
4
+ import warnings
4
5
  from typing import Any
5
6
 
6
7
  import numpy as np
@@ -246,7 +247,9 @@ class HakimModelWrapper(AbsEncoder):
246
247
  task_prompt, task_id = DATASET_TASKS.get(task_name, (None, None))
247
248
 
248
249
  if not task_prompt:
249
- logger.warning(f"Unknown dataset: {task_name}, no preprocessing applied.")
250
+ msg = f"Unknown dataset: {task_name}, no preprocessing applied."
251
+ logger.warning(msg)
252
+ warnings.warn(msg)
250
253
  return sample
251
254
 
252
255
  task_prompt = f"مسئله : {task_prompt}"
@@ -1,5 +1,7 @@
1
1
  from mteb.models.model_implementations.arctic_models import arctic_v1_training_datasets
2
- from mteb.models.model_implementations.mxbai_models import mixedbread_training_data
2
+ from mteb.models.model_implementations.mixedbread_ai_models import (
3
+ mixedbread_training_data,
4
+ )
3
5
  from mteb.models.model_meta import ModelMeta
4
6
  from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
5
7
 
@@ -35,7 +37,13 @@ mdbr_leaf_ir = ModelMeta(
35
37
  release_date="2025-08-27",
36
38
  languages=["eng-Latn"],
37
39
  open_weights=True,
38
- framework=["Sentence Transformers", "PyTorch"],
40
+ framework=[
41
+ "Sentence Transformers",
42
+ "PyTorch",
43
+ "ONNX",
44
+ "safetensors",
45
+ "Transformers",
46
+ ],
39
47
  n_parameters=22_861_056,
40
48
  memory_usage_mb=86,
41
49
  max_tokens=512,
@@ -63,7 +71,13 @@ mdbr_leaf_mt = ModelMeta(
63
71
  release_date="2025-08-27",
64
72
  languages=["eng-Latn"],
65
73
  open_weights=True,
66
- framework=["Sentence Transformers", "PyTorch"],
74
+ framework=[
75
+ "Sentence Transformers",
76
+ "PyTorch",
77
+ "ONNX",
78
+ "safetensors",
79
+ "Transformers",
80
+ ],
67
81
  n_parameters=22_958_592,
68
82
  memory_usage_mb=86,
69
83
  max_tokens=512,