mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (486) hide show
  1. mteb/_create_dataloaders.py +63 -14
  2. mteb/_evaluators/any_sts_evaluator.py +12 -5
  3. mteb/_evaluators/clustering_evaluator.py +12 -4
  4. mteb/_evaluators/evaluator.py +11 -5
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +14 -5
  6. mteb/_evaluators/pair_classification_evaluator.py +13 -5
  7. mteb/_evaluators/retrieval_evaluator.py +22 -13
  8. mteb/_evaluators/retrieval_metrics.py +9 -3
  9. mteb/_evaluators/sklearn_evaluator.py +20 -11
  10. mteb/_evaluators/text/bitext_mining_evaluator.py +10 -3
  11. mteb/_evaluators/text/summarization_evaluator.py +10 -4
  12. mteb/_evaluators/zeroshot_classification_evaluator.py +12 -3
  13. mteb/_helpful_enum.py +5 -1
  14. mteb/abstasks/_data_filter/filters.py +8 -2
  15. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  16. mteb/abstasks/_statistics_calculation.py +6 -4
  17. mteb/abstasks/abstask.py +48 -21
  18. mteb/abstasks/aggregate_task_metadata.py +20 -9
  19. mteb/abstasks/aggregated_task.py +15 -8
  20. mteb/abstasks/classification.py +25 -9
  21. mteb/abstasks/clustering.py +23 -10
  22. mteb/abstasks/clustering_legacy.py +22 -8
  23. mteb/abstasks/image/image_text_pair_classification.py +23 -9
  24. mteb/abstasks/multilabel_classification.py +13 -5
  25. mteb/abstasks/pair_classification.py +27 -11
  26. mteb/abstasks/regression.py +14 -6
  27. mteb/abstasks/retrieval.py +56 -30
  28. mteb/abstasks/retrieval_dataset_loaders.py +48 -37
  29. mteb/abstasks/sts.py +29 -13
  30. mteb/abstasks/task_metadata.py +17 -8
  31. mteb/abstasks/text/bitext_mining.py +23 -12
  32. mteb/abstasks/text/reranking.py +2 -2
  33. mteb/abstasks/text/summarization.py +19 -8
  34. mteb/abstasks/zeroshot_classification.py +23 -9
  35. mteb/benchmarks/_create_table.py +13 -7
  36. mteb/benchmarks/benchmark.py +11 -1
  37. mteb/benchmarks/benchmarks/__init__.py +2 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +41 -2
  39. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  40. mteb/cache.py +10 -5
  41. mteb/cli/_display_tasks.py +9 -3
  42. mteb/cli/build_cli.py +5 -2
  43. mteb/cli/generate_model_card.py +9 -2
  44. mteb/deprecated_evaluator.py +16 -12
  45. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  46. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  47. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  48. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  49. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  50. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  51. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  52. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  65. mteb/evaluate.py +33 -20
  66. mteb/filter_tasks.py +12 -7
  67. mteb/get_tasks.py +9 -4
  68. mteb/languages/language_scripts.py +8 -3
  69. mteb/leaderboard/app.py +11 -4
  70. mteb/leaderboard/table.py +7 -2
  71. mteb/load_results.py +9 -3
  72. mteb/models/abs_encoder.py +22 -12
  73. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  74. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  75. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  76. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  77. mteb/models/get_model_meta.py +32 -6
  78. mteb/models/instruct_wrapper.py +13 -5
  79. mteb/models/model_implementations/align_models.py +10 -4
  80. mteb/models/model_implementations/amazon_models.py +1 -0
  81. mteb/models/model_implementations/andersborges.py +2 -0
  82. mteb/models/model_implementations/ara_models.py +1 -0
  83. mteb/models/model_implementations/arctic_models.py +8 -0
  84. mteb/models/model_implementations/b1ade_models.py +1 -0
  85. mteb/models/model_implementations/bedrock_models.py +20 -6
  86. mteb/models/model_implementations/bge_models.py +40 -1
  87. mteb/models/model_implementations/bica_model.py +1 -0
  88. mteb/models/model_implementations/blip2_models.py +11 -4
  89. mteb/models/model_implementations/blip_models.py +17 -4
  90. mteb/models/model_implementations/bm25.py +24 -14
  91. mteb/models/model_implementations/bmretriever_models.py +10 -2
  92. mteb/models/model_implementations/cadet_models.py +1 -0
  93. mteb/models/model_implementations/cde_models.py +11 -5
  94. mteb/models/model_implementations/clip_models.py +12 -4
  95. mteb/models/model_implementations/clips_models.py +3 -0
  96. mteb/models/model_implementations/codefuse_models.py +5 -0
  97. mteb/models/model_implementations/codesage_models.py +3 -0
  98. mteb/models/model_implementations/cohere_models.py +14 -4
  99. mteb/models/model_implementations/cohere_v.py +14 -4
  100. mteb/models/model_implementations/colpali_models.py +7 -3
  101. mteb/models/model_implementations/colqwen_models.py +17 -31
  102. mteb/models/model_implementations/colsmol_models.py +3 -1
  103. mteb/models/model_implementations/conan_models.py +11 -4
  104. mteb/models/model_implementations/dino_models.py +28 -4
  105. mteb/models/model_implementations/e5_instruct.py +4 -0
  106. mteb/models/model_implementations/e5_models.py +9 -0
  107. mteb/models/model_implementations/e5_v.py +10 -4
  108. mteb/models/model_implementations/eagerworks_models.py +11 -4
  109. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  110. mteb/models/model_implementations/en_code_retriever.py +1 -0
  111. mteb/models/model_implementations/euler_models.py +1 -0
  112. mteb/models/model_implementations/evaclip_models.py +13 -4
  113. mteb/models/model_implementations/fa_models.py +9 -0
  114. mteb/models/model_implementations/facebookai.py +2 -0
  115. mteb/models/model_implementations/geogpt_models.py +1 -0
  116. mteb/models/model_implementations/gme_v_models.py +7 -3
  117. mteb/models/model_implementations/google_models.py +15 -4
  118. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
  119. mteb/models/model_implementations/gritlm_models.py +3 -0
  120. mteb/models/model_implementations/gte_models.py +9 -0
  121. mteb/models/model_implementations/hinvec_models.py +6 -1
  122. mteb/models/model_implementations/human.py +1 -0
  123. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  124. mteb/models/model_implementations/inf_models.py +2 -0
  125. mteb/models/model_implementations/jasper_models.py +14 -5
  126. mteb/models/model_implementations/jina_clip.py +10 -4
  127. mteb/models/model_implementations/jina_models.py +17 -5
  128. mteb/models/model_implementations/kalm_models.py +24 -12
  129. mteb/models/model_implementations/kblab.py +1 -0
  130. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  131. mteb/models/model_implementations/kfst.py +1 -0
  132. mteb/models/model_implementations/kowshik24_models.py +1 -0
  133. mteb/models/model_implementations/lens_models.py +2 -0
  134. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  135. mteb/models/model_implementations/linq_models.py +7 -1
  136. mteb/models/model_implementations/listconranker.py +10 -4
  137. mteb/models/model_implementations/llm2clip_models.py +12 -4
  138. mteb/models/model_implementations/llm2vec_models.py +20 -6
  139. mteb/models/model_implementations/mcinext_models.py +8 -2
  140. mteb/models/model_implementations/mdbr_models.py +2 -0
  141. mteb/models/model_implementations/misc_models.py +63 -0
  142. mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
  143. mteb/models/model_implementations/mme5_models.py +2 -1
  144. mteb/models/model_implementations/moco_models.py +11 -4
  145. mteb/models/model_implementations/mod_models.py +2 -1
  146. mteb/models/model_implementations/model2vec_models.py +23 -4
  147. mteb/models/model_implementations/moka_models.py +3 -0
  148. mteb/models/model_implementations/nbailab.py +3 -0
  149. mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
  150. mteb/models/model_implementations/nomic_models.py +17 -4
  151. mteb/models/model_implementations/nomic_models_vision.py +5 -3
  152. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
  153. mteb/models/model_implementations/nvidia_models.py +15 -4
  154. mteb/models/model_implementations/octen_models.py +3 -1
  155. mteb/models/model_implementations/openai_models.py +14 -4
  156. mteb/models/model_implementations/openclip_models.py +17 -4
  157. mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
  158. mteb/models/model_implementations/ops_moa_models.py +9 -2
  159. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
  160. mteb/models/model_implementations/pawan_models.py +1 -0
  161. mteb/models/model_implementations/piccolo_models.py +2 -0
  162. mteb/models/model_implementations/promptriever_models.py +16 -6
  163. mteb/models/model_implementations/pylate_models.py +32 -13
  164. mteb/models/model_implementations/qodo_models.py +2 -0
  165. mteb/models/model_implementations/qtack_models.py +1 -0
  166. mteb/models/model_implementations/qwen3_models.py +11 -1
  167. mteb/models/model_implementations/qzhou_models.py +2 -0
  168. mteb/models/model_implementations/random_baseline.py +4 -3
  169. mteb/models/model_implementations/rasgaard_models.py +1 -0
  170. mteb/models/model_implementations/reasonir_model.py +65 -0
  171. mteb/models/model_implementations/repllama_models.py +15 -6
  172. mteb/models/model_implementations/rerankers_custom.py +13 -4
  173. mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
  174. mteb/models/model_implementations/richinfoai_models.py +1 -0
  175. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  176. mteb/models/model_implementations/ruri_models.py +10 -0
  177. mteb/models/model_implementations/salesforce_models.py +10 -1
  178. mteb/models/model_implementations/samilpwc_models.py +1 -0
  179. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  180. mteb/models/model_implementations/searchmap_models.py +1 -0
  181. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  182. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
  183. mteb/models/model_implementations/seed_models.py +2 -1
  184. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  185. mteb/models/model_implementations/shuu_model.py +1 -0
  186. mteb/models/model_implementations/siglip_models.py +19 -4
  187. mteb/models/model_implementations/slm_models.py +7 -4
  188. mteb/models/model_implementations/sonar_models.py +2 -1
  189. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  190. mteb/models/model_implementations/stella_models.py +6 -0
  191. mteb/models/model_implementations/tarka_models.py +2 -0
  192. mteb/models/model_implementations/text2vec_models.py +3 -0
  193. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  194. mteb/models/model_implementations/uae_models.py +10 -4
  195. mteb/models/model_implementations/vdr_models.py +8 -1
  196. mteb/models/model_implementations/vi_vn_models.py +6 -0
  197. mteb/models/model_implementations/vista_models.py +11 -4
  198. mteb/models/model_implementations/vlm2vec_models.py +11 -4
  199. mteb/models/model_implementations/voyage_models.py +52 -4
  200. mteb/models/model_implementations/voyage_v.py +11 -6
  201. mteb/models/model_implementations/xyz_models.py +1 -0
  202. mteb/models/model_implementations/youtu_models.py +1 -0
  203. mteb/models/model_implementations/yuan_models.py +1 -0
  204. mteb/models/model_implementations/yuan_models_en.py +2 -1
  205. mteb/models/model_meta.py +47 -9
  206. mteb/models/models_protocols.py +23 -18
  207. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  208. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  209. mteb/models/search_wrappers.py +31 -12
  210. mteb/models/sentence_transformer_wrapper.py +4 -3
  211. mteb/models/vllm_wrapper.py +8 -6
  212. mteb/results/benchmark_results.py +22 -17
  213. mteb/results/model_result.py +21 -15
  214. mteb/results/task_result.py +32 -16
  215. mteb/similarity_functions.py +8 -2
  216. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  217. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  218. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  219. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  220. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  221. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  222. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  223. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  224. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  225. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  226. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  227. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  228. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  229. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  230. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  231. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  232. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  233. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  234. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  235. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  236. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  237. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  238. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  239. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  240. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  241. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  242. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  243. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  244. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  245. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  246. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  247. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  248. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  249. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  250. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  251. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  252. mteb/tasks/classification/est/estonian_valence.py +1 -1
  253. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  254. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  257. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  260. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  261. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  262. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  263. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  264. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  265. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  266. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  267. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  268. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  269. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  270. mteb/tasks/classification/kor/klue_tc.py +2 -2
  271. mteb/tasks/classification/kor/kor_fin.py +1 -1
  272. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  274. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  275. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  276. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  277. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  278. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  279. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  280. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  281. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  282. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  283. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  284. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  285. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  286. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  287. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  288. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  289. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  290. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  291. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  292. mteb/tasks/classification/ron/moroco.py +1 -1
  293. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  294. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  295. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  296. mteb/tasks/classification/rus/headline_classification.py +2 -2
  297. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  298. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  299. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  300. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  301. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  302. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  303. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  304. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  305. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  306. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  307. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  308. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  309. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  310. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  311. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  312. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  313. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  314. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  315. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  316. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  317. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  318. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  319. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  320. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  321. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  322. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  323. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  324. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  325. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  326. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  327. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  328. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  329. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  330. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  331. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  332. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  333. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  334. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  335. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  336. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  337. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  338. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  341. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  342. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  343. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  344. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  345. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  346. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  347. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  348. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  349. mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
  350. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  351. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  352. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  353. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  354. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  355. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  356. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  357. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  358. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  359. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  360. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  361. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  362. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  363. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  364. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  365. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  366. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  367. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  368. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  369. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  370. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  371. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  372. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  373. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  374. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  375. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  376. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  377. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  378. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  379. mteb/tasks/pair_classification/rus/terra.py +2 -2
  380. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  381. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  382. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  383. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  384. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  385. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  386. mteb/tasks/retrieval/code/code_rag.py +4 -4
  387. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  388. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  389. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  390. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  391. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  392. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  393. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  394. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  395. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
  396. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
  397. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  398. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  399. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  400. mteb/tasks/retrieval/eng/__init__.py +42 -0
  401. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  402. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  403. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  404. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  405. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  406. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  407. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  408. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  409. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  410. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  411. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  412. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  413. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  414. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  415. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  416. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  417. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  418. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  419. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  420. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  421. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  422. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  423. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  424. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  425. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  426. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  428. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  435. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  438. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  439. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  440. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  441. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  442. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  443. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  444. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  445. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  446. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  447. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  448. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  449. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  450. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  451. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  452. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  453. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  454. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  455. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  456. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  457. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  458. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  459. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  460. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  461. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  462. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  463. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  464. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  465. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  466. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  467. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  468. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  469. mteb/tasks/retrieval/nob/norquad.py +1 -1
  470. mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
  471. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  472. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  473. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  474. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  475. mteb/tasks/sts/kor/klue_sts.py +1 -1
  476. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  477. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  478. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  479. mteb/types/_encoder_io.py +1 -1
  480. mteb/types/statistics.py +9 -2
  481. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/METADATA +1 -1
  482. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/RECORD +486 -465
  483. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  484. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  485. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  486. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,18 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
  from tqdm.auto import tqdm
6
7
 
7
- from mteb.abstasks.task_metadata import TaskMetadata
8
8
  from mteb.models.abs_encoder import AbsEncoder
9
9
  from mteb.models.model_meta import ModelMeta, ScoringFunction
10
- from mteb.types import Array, BatchedInput, PromptType
10
+
11
+ if TYPE_CHECKING:
12
+ from torch.utils.data import DataLoader
13
+
14
+ from mteb.abstasks.task_metadata import TaskMetadata
15
+ from mteb.types import Array, BatchedInput, PromptType
11
16
 
12
17
  SIGLIP_CITATION = """@misc{zhai2023sigmoid,
13
18
  title={Sigmoid Loss for Language Image Pre-Training},
@@ -131,6 +136,7 @@ siglip_so400m_patch14_224 = ModelMeta(
131
136
  release_date="2024-01-08",
132
137
  modalities=["image", "text"],
133
138
  n_parameters=877_000_000,
139
+ n_embedding_parameters=None,
134
140
  memory_usage_mb=3347,
135
141
  max_tokens=16,
136
142
  embed_dim=1152,
@@ -155,6 +161,7 @@ siglip_so400m_patch14_384 = ModelMeta(
155
161
  release_date="2024-01-08",
156
162
  modalities=["image", "text"],
157
163
  n_parameters=878_000_000,
164
+ n_embedding_parameters=None,
158
165
  memory_usage_mb=3349,
159
166
  max_tokens=64,
160
167
  embed_dim=1152,
@@ -179,6 +186,7 @@ siglip_so400m_patch16_256_i18n = ModelMeta(
179
186
  release_date="2024-01-08",
180
187
  modalities=["image", "text"],
181
188
  n_parameters=1_130_000_000,
189
+ n_embedding_parameters=None,
182
190
  memory_usage_mb=4306,
183
191
  max_tokens=64,
184
192
  embed_dim=1152,
@@ -203,6 +211,7 @@ siglip_base_patch16_256_multilingual = ModelMeta(
203
211
  release_date="2024-01-08",
204
212
  modalities=["image", "text"],
205
213
  n_parameters=371_000_000,
214
+ n_embedding_parameters=None,
206
215
  memory_usage_mb=1414,
207
216
  max_tokens=64,
208
217
  embed_dim=768,
@@ -227,6 +236,7 @@ siglip_base_patch16_256 = ModelMeta(
227
236
  release_date="2024-01-08",
228
237
  modalities=["image", "text"],
229
238
  n_parameters=203_000_000,
239
+ n_embedding_parameters=None,
230
240
  memory_usage_mb=775,
231
241
  max_tokens=64,
232
242
  embed_dim=768,
@@ -251,6 +261,7 @@ siglip_base_patch16_512 = ModelMeta(
251
261
  release_date="2024-01-08",
252
262
  modalities=["image", "text"],
253
263
  n_parameters=204_000_000,
264
+ n_embedding_parameters=None,
254
265
  memory_usage_mb=777,
255
266
  max_tokens=64,
256
267
  embed_dim=768,
@@ -275,6 +286,7 @@ siglip_base_patch16_384 = ModelMeta(
275
286
  release_date="2024-01-08",
276
287
  modalities=["image", "text"],
277
288
  n_parameters=203_000_000,
289
+ n_embedding_parameters=None,
278
290
  memory_usage_mb=776,
279
291
  max_tokens=64,
280
292
  embed_dim=768,
@@ -299,6 +311,7 @@ siglip_base_patch16_224 = ModelMeta(
299
311
  release_date="2024-01-08",
300
312
  modalities=["image", "text"],
301
313
  n_parameters=203_000_000,
314
+ n_embedding_parameters=None,
302
315
  memory_usage_mb=775,
303
316
  max_tokens=64,
304
317
  embed_dim=768,
@@ -323,6 +336,7 @@ siglip_large_patch16_256 = ModelMeta(
323
336
  release_date="2024-01-08",
324
337
  modalities=["image", "text"],
325
338
  n_parameters=652_000_000,
339
+ n_embedding_parameters=None,
326
340
  memory_usage_mb=2488,
327
341
  max_tokens=64,
328
342
  embed_dim=1024,
@@ -347,6 +361,7 @@ siglip_large_patch16_384 = ModelMeta(
347
361
  release_date="2024-01-08",
348
362
  modalities=["image", "text"],
349
363
  n_parameters=652_000_000,
364
+ n_embedding_parameters=None,
350
365
  memory_usage_mb=2489,
351
366
  max_tokens=64,
352
367
  embed_dim=1024,
@@ -13,24 +13,27 @@ Based on:
13
13
  from __future__ import annotations
14
14
 
15
15
  import logging
16
- from typing import Any
16
+ from typing import TYPE_CHECKING, Any
17
17
 
18
18
  import torch
19
- from torch.utils.data import DataLoader
20
19
  from tqdm.auto import tqdm
21
20
 
22
21
  from mteb._requires_package import (
23
22
  requires_image_dependencies,
24
23
  requires_package,
25
24
  )
26
- from mteb.abstasks.task_metadata import TaskMetadata
27
25
  from mteb.models.abs_encoder import AbsEncoder
28
26
  from mteb.models.model_implementations.colpali_models import (
29
27
  COLPALI_CITATION,
30
28
  COLPALI_TRAINING_DATA,
31
29
  )
32
30
  from mteb.models.model_meta import ModelMeta, ScoringFunction
33
- from mteb.types import Array, BatchedInput, PromptType
31
+
32
+ if TYPE_CHECKING:
33
+ from torch.utils.data import DataLoader
34
+
35
+ from mteb.abstasks.task_metadata import TaskMetadata
36
+ from mteb.types import Array, BatchedInput, PromptType
34
37
 
35
38
  logger = logging.getLogger(__name__)
36
39
 
@@ -224,7 +224,8 @@ sonar = ModelMeta(
224
224
  use_instructions=False, # it does take a language code as input
225
225
  revision="a551c586dcf4a49c8fd847de369412d556a7f2f2",
226
226
  release_date="2021-05-21",
227
- n_parameters=None, # it is really multiple models so not sure how to calculate this
227
+ n_parameters=None,
228
+ n_embedding_parameters=None, # it is really multiple models so not sure how to calculate this
228
229
  max_tokens=512, # https://github.com/facebookresearch/SONAR/blob/549d287466443bd8720f938047882630c1c5c3f7/sonar/models/sonar_text/builder.py#L139
229
230
  embed_dim=1024,
230
231
  license="mit",
@@ -12,6 +12,7 @@ spartan8806_atles_champion_embedding = ModelMeta(
12
12
  revision="d4c74d7000bbd25f3597fc0f2dcde59ef1386e8f",
13
13
  release_date="2025-11-15",
14
14
  n_parameters=110_000_000,
15
+ n_embedding_parameters=23_444_736,
15
16
  memory_usage_mb=420,
16
17
  max_tokens=512,
17
18
  embed_dim=768,
@@ -66,6 +66,7 @@ stella_en_400m = ModelMeta(
66
66
  revision="1bb50bc7bb726810eac2140e62155b88b0df198f",
67
67
  release_date="2024-07-12",
68
68
  n_parameters=435_000_000,
69
+ n_embedding_parameters=None,
69
70
  memory_usage_mb=1660,
70
71
  max_tokens=8192,
71
72
  embed_dim=4096,
@@ -101,6 +102,7 @@ stella_en_1_5b = ModelMeta(
101
102
  revision="d03be74b361d4eb24f42a2fe5bd2e29917df4604",
102
103
  release_date="2024-07-12",
103
104
  n_parameters=1_540_000_000,
105
+ n_embedding_parameters=232_928_256,
104
106
  memory_usage_mb=5887,
105
107
  max_tokens=131072,
106
108
  embed_dim=8960,
@@ -130,6 +132,7 @@ stella_large_zh_v3_1792d = ModelMeta(
130
132
  revision="d5d39eb8cd11c80a63df53314e59997074469f09",
131
133
  release_date="2024-02-17",
132
134
  n_parameters=None,
135
+ n_embedding_parameters=21_635_072,
133
136
  memory_usage_mb=None, # can't see on model card
134
137
  embed_dim=1792,
135
138
  license="not specified",
@@ -157,6 +160,7 @@ stella_base_zh_v3_1792d = ModelMeta(
157
160
  revision="82254892a0fba125aa2abf3a4800d2dd12821343",
158
161
  release_date="2024-02-17",
159
162
  n_parameters=None,
163
+ n_embedding_parameters=16_226_304,
160
164
  memory_usage_mb=None, # can't see on model card
161
165
  embed_dim=1792,
162
166
  license="mit",
@@ -185,6 +189,7 @@ stella_mrl_large_zh_v3_5_1792d = ModelMeta(
185
189
  revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe",
186
190
  release_date="2024-02-27",
187
191
  n_parameters=int(326 * 1e6),
192
+ n_embedding_parameters=21_635_072,
188
193
  memory_usage_mb=1242,
189
194
  embed_dim=1792,
190
195
  license="mit",
@@ -209,6 +214,7 @@ zpoint_large_embedding_zh = ModelMeta(
209
214
  revision="b1075144f440ab4409c05622c1179130ebd57d03",
210
215
  release_date="2024-06-04",
211
216
  n_parameters=int(326 * 1e6),
217
+ n_embedding_parameters=21_635_072,
212
218
  memory_usage_mb=1242,
213
219
  embed_dim=1792,
214
220
  license="mit",
@@ -327,6 +327,7 @@ tarka_embedding_150m_v1 = ModelMeta(
327
327
  revision="b0ffecc4ef0d873e517507ed080e43b88b2704b9",
328
328
  release_date="2025-11-04",
329
329
  n_parameters=155_714_304,
330
+ n_embedding_parameters=None,
330
331
  embed_dim=768,
331
332
  max_tokens=2048,
332
333
  license="gemma",
@@ -361,6 +362,7 @@ tarka_embedding_350m_v1 = ModelMeta(
361
362
  revision="a850d6a329145474727424fed6b12b62096b8ba3",
362
363
  release_date="2025-11-11",
363
364
  n_parameters=354_483_968,
365
+ n_embedding_parameters=None,
364
366
  memory_usage_mb=676,
365
367
  embed_dim=1024,
366
368
  max_tokens=128000,
@@ -22,6 +22,7 @@ text2vec_base_chinese = ModelMeta(
22
22
  revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e",
23
23
  release_date="2022-01-23",
24
24
  n_parameters=int(102 * 1e6),
25
+ n_embedding_parameters=16_226_304,
25
26
  embed_dim=768,
26
27
  license="apache-2.0",
27
28
  max_tokens=512,
@@ -51,6 +52,7 @@ text2vec_base_chinese_paraphrase = ModelMeta(
51
52
  revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd",
52
53
  release_date="2023-06-19",
53
54
  n_parameters=118 * 1e6,
55
+ n_embedding_parameters=30_720_000,
54
56
  memory_usage_mb=450,
55
57
  embed_dim=768,
56
58
  license="apache-2.0",
@@ -95,6 +97,7 @@ text2vec_base_multilingual = ModelMeta(
95
97
  # So probably best not to.
96
98
  loader=sentence_transformers_loader,
97
99
  n_parameters=117654272,
100
+ n_embedding_parameters=96_014_208,
98
101
  memory_usage_mb=449,
99
102
  embed_dim=384,
100
103
  license="apache-2.0",
@@ -8,6 +8,7 @@ xlm_roberta_ua_distilled = ModelMeta(
8
8
  model_type=["dense"],
9
9
  loader=sentence_transformers_loader,
10
10
  n_parameters=278_000_000,
11
+ n_embedding_parameters=192_001_536,
11
12
  memory_usage_mb=1061,
12
13
  max_tokens=512,
13
14
  embed_dim=768,
@@ -1,13 +1,18 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from torch.utils.data import DataLoader
6
7
 
7
- from mteb.abstasks.task_metadata import TaskMetadata
8
8
  from mteb.models.model_meta import ModelMeta, ScoringFunction
9
9
  from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
10
- from mteb.types import Array, BatchedInput, PromptType
10
+
11
+ if TYPE_CHECKING:
12
+ from torch.utils.data import DataLoader
13
+
14
+ from mteb.abstasks.task_metadata import TaskMetadata
15
+ from mteb.types import Array, BatchedInput, PromptType
11
16
 
12
17
  logger = logging.getLogger(__name__)
13
18
 
@@ -67,6 +72,7 @@ uae_large_v1 = ModelMeta(
67
72
  revision="369c368f70f16a613f19f5598d4f12d9f44235d4",
68
73
  release_date="2023-12-04", # initial commit of hf model.
69
74
  n_parameters=int(335 * 1e6),
75
+ n_embedding_parameters=31_254_528,
70
76
  memory_usage_mb=1278,
71
77
  max_tokens=512,
72
78
  embed_dim=1024,
@@ -1,6 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
1
5
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
2
6
  from mteb.models.model_meta import ModelMeta, ScoringFunction
3
- from mteb.types import PromptType
7
+
8
+ if TYPE_CHECKING:
9
+ from mteb.types import PromptType
4
10
 
5
11
 
6
12
  def instruction_template(
@@ -32,6 +38,7 @@ vdr_2b_multi_v1 = ModelMeta(
32
38
  release_date="2024-01-08",
33
39
  modalities=["text"], # TODO: integrate with image
34
40
  n_parameters=2_000_000_000,
41
+ n_embedding_parameters=233_373_696,
35
42
  memory_usage_mb=4213,
36
43
  max_tokens=32768,
37
44
  embed_dim=1536,
@@ -16,6 +16,7 @@ greennode_embedding_large_vn_v1 = ModelMeta(
16
16
  loader=sentence_transformers_loader,
17
17
  open_weights=True,
18
18
  n_parameters=568_000_000,
19
+ n_embedding_parameters=256_002_048,
19
20
  memory_usage_mb=2167,
20
21
  embed_dim=1024,
21
22
  license="cc-by-4.0",
@@ -41,6 +42,7 @@ greennode_embedding_large_vn_mixed_v1 = ModelMeta(
41
42
  loader=sentence_transformers_loader,
42
43
  open_weights=True,
43
44
  n_parameters=568_000_000,
45
+ n_embedding_parameters=256_002_048,
44
46
  memory_usage_mb=2167,
45
47
  embed_dim=1024,
46
48
  license="cc-by-4.0",
@@ -66,6 +68,7 @@ aiteamvn_vietnamese_embeddings = ModelMeta(
66
68
  loader=sentence_transformers_loader,
67
69
  open_weights=True,
68
70
  n_parameters=568_000_000,
71
+ n_embedding_parameters=256_002_048,
69
72
  memory_usage_mb=2166,
70
73
  embed_dim=1024,
71
74
  license="cc-by-4.0",
@@ -98,6 +101,7 @@ hiieu_halong_embedding = ModelMeta(
98
101
  use_instructions=False,
99
102
  open_weights=True,
100
103
  n_parameters=278_000_000,
104
+ n_embedding_parameters=192_001_536,
101
105
  memory_usage_mb=1061,
102
106
  embed_dim=768,
103
107
  license="apache-2.0",
@@ -129,6 +133,7 @@ sup_simcse_vietnamese_phobert_base_ = ModelMeta(
129
133
  use_instructions=False,
130
134
  open_weights=True,
131
135
  n_parameters=135_000_000,
136
+ n_embedding_parameters=49_152_768,
132
137
  memory_usage_mb=517,
133
138
  max_tokens=256,
134
139
  embed_dim=768,
@@ -167,6 +172,7 @@ bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
167
172
  use_instructions=False,
168
173
  open_weights=True,
169
174
  n_parameters=135_000_000,
175
+ n_embedding_parameters=49_152_768,
170
176
  memory_usage_mb=515,
171
177
  max_tokens=256,
172
178
  embed_dim=768,
@@ -1,14 +1,19 @@
1
- from typing import Any, Literal
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, Literal
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
  from tqdm.auto import tqdm
6
7
 
7
8
  from mteb._requires_package import requires_image_dependencies
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
- from mteb.types import Array, BatchedInput, PromptType
11
+
12
+ if TYPE_CHECKING:
13
+ from torch.utils.data import DataLoader
14
+
15
+ from mteb.abstasks.task_metadata import TaskMetadata
16
+ from mteb.types import Array, BatchedInput, PromptType
12
17
 
13
18
  VISTA_CITATION = """@article{zhou2024vista,
14
19
  title={VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval},
@@ -253,6 +258,7 @@ visualized_bge_base = ModelMeta(
253
258
  release_date="2024-06-06",
254
259
  modalities=["image", "text"],
255
260
  n_parameters=196_000_000,
261
+ n_embedding_parameters=None,
256
262
  memory_usage_mb=1631,
257
263
  max_tokens=512,
258
264
  embed_dim=768,
@@ -281,6 +287,7 @@ visualized_bge_m3 = ModelMeta(
281
287
  release_date="2024-06-06",
282
288
  modalities=["image", "text"],
283
289
  n_parameters=872_909_505,
290
+ n_embedding_parameters=None,
284
291
  memory_usage_mb=4263,
285
292
  max_tokens=8192,
286
293
  embed_dim=1024,
@@ -1,8 +1,9 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from torch.utils.data import DataLoader
6
7
  from tqdm.auto import tqdm
7
8
 
8
9
  from mteb._requires_package import (
@@ -10,10 +11,14 @@ from mteb._requires_package import (
10
11
  requires_package,
11
12
  suggest_package,
12
13
  )
13
- from mteb.abstasks.task_metadata import TaskMetadata
14
14
  from mteb.models.abs_encoder import AbsEncoder
15
15
  from mteb.models.model_meta import ModelMeta, ScoringFunction
16
- from mteb.types import Array, BatchedInput, PromptType
16
+
17
+ if TYPE_CHECKING:
18
+ from torch.utils.data import DataLoader
19
+
20
+ from mteb.abstasks.task_metadata import TaskMetadata
21
+ from mteb.types import Array, BatchedInput, PromptType
17
22
 
18
23
  logger = logging.getLogger(__name__)
19
24
 
@@ -275,6 +280,7 @@ vlm2vec_lora = ModelMeta(
275
280
  release_date="2024-10-08",
276
281
  modalities=["image", "text"],
277
282
  n_parameters=None,
283
+ n_embedding_parameters=None,
278
284
  memory_usage_mb=None,
279
285
  max_tokens=131072,
280
286
  embed_dim=3072,
@@ -299,6 +305,7 @@ vlm2vec_full = ModelMeta(
299
305
  release_date="2024-10-08",
300
306
  modalities=["image", "text"],
301
307
  n_parameters=4_150_000_000,
308
+ n_embedding_parameters=None,
302
309
  memory_usage_mb=7909,
303
310
  max_tokens=131072,
304
311
  embed_dim=3072,
@@ -1,16 +1,22 @@
1
+ from __future__ import annotations
2
+
1
3
  import time
2
4
  from functools import wraps
3
- from typing import Any, Literal
5
+ from typing import TYPE_CHECKING, Any, Literal
4
6
 
5
7
  import numpy as np
6
- from torch.utils.data import DataLoader
7
8
  from tqdm.auto import tqdm
8
9
 
9
10
  from mteb._requires_package import requires_package
10
- from mteb.abstasks.task_metadata import TaskMetadata
11
11
  from mteb.models.abs_encoder import AbsEncoder
12
12
  from mteb.models.model_meta import ModelMeta, ScoringFunction
13
- from mteb.types import Array, BatchedInput, PromptType
13
+ from mteb.types import PromptType
14
+
15
+ if TYPE_CHECKING:
16
+ from torch.utils.data import DataLoader
17
+
18
+ from mteb.abstasks.task_metadata import TaskMetadata
19
+ from mteb.types import Array, BatchedInput
14
20
 
15
21
  VOYAGE_TRAINING_DATA = set(
16
22
  # Self-reported (message from VoyageAI member)
@@ -176,6 +182,7 @@ class VoyageModel(AbsEncoder):
176
182
  model=self._model_name,
177
183
  input_type=input_type,
178
184
  output_dtype=output_dtype,
185
+ output_dimension=self.mteb_model_meta.embed_dim,
179
186
  ).embeddings
180
187
  )
181
188
  pbar.update(len(batch))
@@ -209,6 +216,32 @@ model_prompts = {
209
216
  PromptType.document.value: "document",
210
217
  }
211
218
 
219
+ voyage_4_large_2048d = ModelMeta(
220
+ name="voyageai/voyage-4-large (embed_dim=2048)",
221
+ model_type=["dense"],
222
+ revision="1",
223
+ release_date="2026-01-15",
224
+ languages=None, # supported languages not specified
225
+ loader=VoyageModel,
226
+ loader_kwargs=dict(
227
+ max_tokens=32000,
228
+ model_prompts=model_prompts,
229
+ ),
230
+ max_tokens=32000,
231
+ embed_dim=2048,
232
+ open_weights=False,
233
+ n_parameters=None,
234
+ memory_usage_mb=None,
235
+ license=None,
236
+ reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
237
+ similarity_fn_name="cosine",
238
+ framework=["API"],
239
+ use_instructions=True,
240
+ training_datasets=VOYAGE_TRAINING_DATA,
241
+ public_training_code=None,
242
+ public_training_data=None,
243
+ )
244
+
212
245
  voyage_4 = ModelMeta(
213
246
  name="voyageai/voyage-4",
214
247
  model_type=["dense"],
@@ -302,6 +335,7 @@ voyage_3_large = ModelMeta(
302
335
  embed_dim=1024,
303
336
  open_weights=False,
304
337
  n_parameters=None,
338
+ n_embedding_parameters=None,
305
339
  memory_usage_mb=None,
306
340
  license=None,
307
341
  reference="https://blog.voyageai.com/2025/01/07/voyage-3-large/",
@@ -330,6 +364,7 @@ voyage_3_5 = ModelMeta(
330
364
  embed_dim=1024,
331
365
  open_weights=False,
332
366
  n_parameters=None,
367
+ n_embedding_parameters=None,
333
368
  memory_usage_mb=None,
334
369
  license=None,
335
370
  reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
@@ -357,6 +392,7 @@ voyage_3_5_int8 = ModelMeta(
357
392
  embed_dim=1024,
358
393
  open_weights=False,
359
394
  n_parameters=None,
395
+ n_embedding_parameters=None,
360
396
  memory_usage_mb=None,
361
397
  license=None,
362
398
  reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
@@ -384,6 +420,7 @@ voyage_3_5_binary = ModelMeta(
384
420
  embed_dim=1024, # Same as original after unpacking from bits
385
421
  open_weights=False,
386
422
  n_parameters=None,
423
+ n_embedding_parameters=None,
387
424
  memory_usage_mb=None,
388
425
  license=None,
389
426
  reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
@@ -411,6 +448,7 @@ voyage_large_2_instruct = ModelMeta(
411
448
  embed_dim=1024,
412
449
  open_weights=False,
413
450
  n_parameters=None,
451
+ n_embedding_parameters=None,
414
452
  memory_usage_mb=None,
415
453
  license=None,
416
454
  reference="https://blog.voyageai.com/2024/05/05/voyage-large-2-instruct-instruction-tuned-and-rank-1-on-mteb/",
@@ -437,6 +475,7 @@ voyage_finance_2 = ModelMeta(
437
475
  embed_dim=1024,
438
476
  open_weights=False,
439
477
  n_parameters=None,
478
+ n_embedding_parameters=None,
440
479
  memory_usage_mb=None,
441
480
  license=None,
442
481
  reference="https://blog.voyageai.com/2024/06/03/domain-specific-embeddings-finance-edition-voyage-finance-2/",
@@ -463,6 +502,7 @@ voyage_law_2 = ModelMeta(
463
502
  embed_dim=1024,
464
503
  open_weights=False,
465
504
  n_parameters=None,
505
+ n_embedding_parameters=None,
466
506
  memory_usage_mb=None,
467
507
  license=None,
468
508
  reference="https://blog.voyageai.com/2024/04/15/domain-specific-embeddings-and-retrieval-legal-edition-voyage-law-2/",
@@ -489,6 +529,7 @@ voyage_code_2 = ModelMeta(
489
529
  embed_dim=1536,
490
530
  open_weights=False,
491
531
  n_parameters=None,
532
+ n_embedding_parameters=None,
492
533
  memory_usage_mb=None,
493
534
  license=None,
494
535
  reference="https://blog.voyageai.com/2024/01/23/voyage-code-2-elevate-your-code-retrieval/",
@@ -515,6 +556,7 @@ voyage_code_3 = ModelMeta(
515
556
  embed_dim=1024,
516
557
  open_weights=False,
517
558
  n_parameters=None,
559
+ n_embedding_parameters=None,
518
560
  memory_usage_mb=None,
519
561
  license=None,
520
562
  reference="https://blog.voyageai.com/2024/12/04/voyage-code-3/",
@@ -542,6 +584,7 @@ voyage_large_2 = ModelMeta(
542
584
  embed_dim=1536,
543
585
  open_weights=False,
544
586
  n_parameters=None,
587
+ n_embedding_parameters=None,
545
588
  memory_usage_mb=None,
546
589
  license=None,
547
590
  reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
@@ -568,6 +611,7 @@ voyage_2 = ModelMeta(
568
611
  embed_dim=1024,
569
612
  open_weights=False,
570
613
  n_parameters=None,
614
+ n_embedding_parameters=None,
571
615
  memory_usage_mb=None,
572
616
  license=None,
573
617
  reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
@@ -593,6 +637,7 @@ voyage_multilingual_2 = ModelMeta(
593
637
  embed_dim=1024,
594
638
  open_weights=False,
595
639
  n_parameters=None,
640
+ n_embedding_parameters=None,
596
641
  memory_usage_mb=None,
597
642
  license=None,
598
643
  reference="https://blog.voyageai.com/2024/06/10/voyage-multilingual-2-multilingual-embedding-model/",
@@ -619,6 +664,7 @@ voyage_3 = ModelMeta(
619
664
  embed_dim=1024,
620
665
  open_weights=False,
621
666
  n_parameters=None,
667
+ n_embedding_parameters=None,
622
668
  memory_usage_mb=None,
623
669
  license=None,
624
670
  reference="https://blog.voyageai.com/2024/09/18/voyage-3/",
@@ -645,6 +691,7 @@ voyage_3_lite = ModelMeta(
645
691
  embed_dim=512,
646
692
  open_weights=False,
647
693
  n_parameters=None,
694
+ n_embedding_parameters=None,
648
695
  memory_usage_mb=None,
649
696
  license=None,
650
697
  reference="https://blog.voyageai.com/2024/09/18/voyage-3/",
@@ -673,6 +720,7 @@ voyage_3_exp = ModelMeta(
673
720
  open_weights=False,
674
721
  # from their card https://huggingface.co/voyageai/voyage-3-m-exp#model-information
675
722
  n_parameters=int(6918 * 1e6),
723
+ n_embedding_parameters=None,
676
724
  memory_usage_mb=None,
677
725
  license=None,
678
726
  reference="https://huggingface.co/voyageai/voyage-3-m-exp",
@@ -4,17 +4,19 @@ import logging
4
4
  from typing import TYPE_CHECKING, Any, Literal
5
5
 
6
6
  import torch
7
- from torch.utils.data import DataLoader
8
7
  from tqdm.auto import tqdm
9
8
 
10
9
  from mteb._requires_package import requires_image_dependencies, requires_package
11
- from mteb.abstasks.task_metadata import TaskMetadata
12
10
  from mteb.models.abs_encoder import AbsEncoder
13
11
  from mteb.models.model_meta import ModelMeta, ScoringFunction
14
- from mteb.types import Array, BatchedInput, PromptType
12
+ from mteb.types import PromptType
15
13
 
16
14
  if TYPE_CHECKING:
17
15
  from PIL import Image
16
+ from torch.utils.data import DataLoader
17
+
18
+ from mteb.abstasks.task_metadata import TaskMetadata
19
+ from mteb.types import Array, BatchedInput
18
20
 
19
21
  logger = logging.getLogger(__name__)
20
22
 
@@ -27,6 +29,8 @@ def _downsample_image(
27
29
  Returns:
28
30
  The downsampled image.
29
31
  """
32
+ from PIL.Image import Resampling
33
+
30
34
  width, height = image.size
31
35
  pixels = width * height
32
36
 
@@ -42,15 +46,15 @@ def _downsample_image(
42
46
  logger.info(
43
47
  f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
44
48
  )
45
- return image.resize(new_size, Image.LANCZOS)
49
+ return image.resize(new_size, Resampling.LANCZOS)
46
50
  if width > height:
47
51
  if width > 10000:
48
52
  logger.error("Processing extremely wide images.")
49
- return image.resize((10000, height), Image.LANCZOS)
53
+ return image.resize((10000, height), Resampling.LANCZOS)
50
54
  else:
51
55
  if height > 10000:
52
56
  logger.error("Processing extremely high images.")
53
- return image.resize((width, 10000), Image.LANCZOS)
57
+ return image.resize((width, 10000), Resampling.LANCZOS)
54
58
  return image
55
59
 
56
60
 
@@ -211,6 +215,7 @@ voyage_v = ModelMeta(
211
215
  revision="1",
212
216
  release_date="2024-11-10",
213
217
  n_parameters=None,
218
+ n_embedding_parameters=None,
214
219
  memory_usage_mb=None,
215
220
  max_tokens=32768,
216
221
  embed_dim=1024,
@@ -31,6 +31,7 @@ xyz_embedding = ModelMeta(
31
31
  revision="4004120220b99baea764a1d3508427248ac3bccf",
32
32
  release_date="2024-09-13",
33
33
  n_parameters=326000000,
34
+ n_embedding_parameters=21_635_072,
34
35
  memory_usage_mb=1242,
35
36
  max_tokens=512,
36
37
  embed_dim=768,