mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (486) hide show
  1. mteb/_create_dataloaders.py +63 -14
  2. mteb/_evaluators/any_sts_evaluator.py +12 -5
  3. mteb/_evaluators/clustering_evaluator.py +12 -4
  4. mteb/_evaluators/evaluator.py +11 -5
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +14 -5
  6. mteb/_evaluators/pair_classification_evaluator.py +13 -5
  7. mteb/_evaluators/retrieval_evaluator.py +22 -13
  8. mteb/_evaluators/retrieval_metrics.py +9 -3
  9. mteb/_evaluators/sklearn_evaluator.py +20 -11
  10. mteb/_evaluators/text/bitext_mining_evaluator.py +10 -3
  11. mteb/_evaluators/text/summarization_evaluator.py +10 -4
  12. mteb/_evaluators/zeroshot_classification_evaluator.py +12 -3
  13. mteb/_helpful_enum.py +5 -1
  14. mteb/abstasks/_data_filter/filters.py +8 -2
  15. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  16. mteb/abstasks/_statistics_calculation.py +6 -4
  17. mteb/abstasks/abstask.py +48 -21
  18. mteb/abstasks/aggregate_task_metadata.py +20 -9
  19. mteb/abstasks/aggregated_task.py +15 -8
  20. mteb/abstasks/classification.py +25 -9
  21. mteb/abstasks/clustering.py +23 -10
  22. mteb/abstasks/clustering_legacy.py +22 -8
  23. mteb/abstasks/image/image_text_pair_classification.py +23 -9
  24. mteb/abstasks/multilabel_classification.py +13 -5
  25. mteb/abstasks/pair_classification.py +27 -11
  26. mteb/abstasks/regression.py +14 -6
  27. mteb/abstasks/retrieval.py +56 -30
  28. mteb/abstasks/retrieval_dataset_loaders.py +48 -37
  29. mteb/abstasks/sts.py +29 -13
  30. mteb/abstasks/task_metadata.py +17 -8
  31. mteb/abstasks/text/bitext_mining.py +23 -12
  32. mteb/abstasks/text/reranking.py +2 -2
  33. mteb/abstasks/text/summarization.py +19 -8
  34. mteb/abstasks/zeroshot_classification.py +23 -9
  35. mteb/benchmarks/_create_table.py +13 -7
  36. mteb/benchmarks/benchmark.py +11 -1
  37. mteb/benchmarks/benchmarks/__init__.py +2 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +41 -2
  39. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  40. mteb/cache.py +10 -5
  41. mteb/cli/_display_tasks.py +9 -3
  42. mteb/cli/build_cli.py +5 -2
  43. mteb/cli/generate_model_card.py +9 -2
  44. mteb/deprecated_evaluator.py +16 -12
  45. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  46. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  47. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  48. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  49. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  50. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  51. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  52. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  65. mteb/evaluate.py +33 -20
  66. mteb/filter_tasks.py +12 -7
  67. mteb/get_tasks.py +9 -4
  68. mteb/languages/language_scripts.py +8 -3
  69. mteb/leaderboard/app.py +11 -4
  70. mteb/leaderboard/table.py +7 -2
  71. mteb/load_results.py +9 -3
  72. mteb/models/abs_encoder.py +22 -12
  73. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  74. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  75. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  76. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  77. mteb/models/get_model_meta.py +32 -6
  78. mteb/models/instruct_wrapper.py +13 -5
  79. mteb/models/model_implementations/align_models.py +10 -4
  80. mteb/models/model_implementations/amazon_models.py +1 -0
  81. mteb/models/model_implementations/andersborges.py +2 -0
  82. mteb/models/model_implementations/ara_models.py +1 -0
  83. mteb/models/model_implementations/arctic_models.py +8 -0
  84. mteb/models/model_implementations/b1ade_models.py +1 -0
  85. mteb/models/model_implementations/bedrock_models.py +20 -6
  86. mteb/models/model_implementations/bge_models.py +40 -1
  87. mteb/models/model_implementations/bica_model.py +1 -0
  88. mteb/models/model_implementations/blip2_models.py +11 -4
  89. mteb/models/model_implementations/blip_models.py +17 -4
  90. mteb/models/model_implementations/bm25.py +24 -14
  91. mteb/models/model_implementations/bmretriever_models.py +10 -2
  92. mteb/models/model_implementations/cadet_models.py +1 -0
  93. mteb/models/model_implementations/cde_models.py +11 -5
  94. mteb/models/model_implementations/clip_models.py +12 -4
  95. mteb/models/model_implementations/clips_models.py +3 -0
  96. mteb/models/model_implementations/codefuse_models.py +5 -0
  97. mteb/models/model_implementations/codesage_models.py +3 -0
  98. mteb/models/model_implementations/cohere_models.py +14 -4
  99. mteb/models/model_implementations/cohere_v.py +14 -4
  100. mteb/models/model_implementations/colpali_models.py +7 -3
  101. mteb/models/model_implementations/colqwen_models.py +17 -31
  102. mteb/models/model_implementations/colsmol_models.py +3 -1
  103. mteb/models/model_implementations/conan_models.py +11 -4
  104. mteb/models/model_implementations/dino_models.py +28 -4
  105. mteb/models/model_implementations/e5_instruct.py +4 -0
  106. mteb/models/model_implementations/e5_models.py +9 -0
  107. mteb/models/model_implementations/e5_v.py +10 -4
  108. mteb/models/model_implementations/eagerworks_models.py +11 -4
  109. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  110. mteb/models/model_implementations/en_code_retriever.py +1 -0
  111. mteb/models/model_implementations/euler_models.py +1 -0
  112. mteb/models/model_implementations/evaclip_models.py +13 -4
  113. mteb/models/model_implementations/fa_models.py +9 -0
  114. mteb/models/model_implementations/facebookai.py +2 -0
  115. mteb/models/model_implementations/geogpt_models.py +1 -0
  116. mteb/models/model_implementations/gme_v_models.py +7 -3
  117. mteb/models/model_implementations/google_models.py +15 -4
  118. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
  119. mteb/models/model_implementations/gritlm_models.py +3 -0
  120. mteb/models/model_implementations/gte_models.py +9 -0
  121. mteb/models/model_implementations/hinvec_models.py +6 -1
  122. mteb/models/model_implementations/human.py +1 -0
  123. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  124. mteb/models/model_implementations/inf_models.py +2 -0
  125. mteb/models/model_implementations/jasper_models.py +14 -5
  126. mteb/models/model_implementations/jina_clip.py +10 -4
  127. mteb/models/model_implementations/jina_models.py +17 -5
  128. mteb/models/model_implementations/kalm_models.py +24 -12
  129. mteb/models/model_implementations/kblab.py +1 -0
  130. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  131. mteb/models/model_implementations/kfst.py +1 -0
  132. mteb/models/model_implementations/kowshik24_models.py +1 -0
  133. mteb/models/model_implementations/lens_models.py +2 -0
  134. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  135. mteb/models/model_implementations/linq_models.py +7 -1
  136. mteb/models/model_implementations/listconranker.py +10 -4
  137. mteb/models/model_implementations/llm2clip_models.py +12 -4
  138. mteb/models/model_implementations/llm2vec_models.py +20 -6
  139. mteb/models/model_implementations/mcinext_models.py +8 -2
  140. mteb/models/model_implementations/mdbr_models.py +2 -0
  141. mteb/models/model_implementations/misc_models.py +63 -0
  142. mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
  143. mteb/models/model_implementations/mme5_models.py +2 -1
  144. mteb/models/model_implementations/moco_models.py +11 -4
  145. mteb/models/model_implementations/mod_models.py +2 -1
  146. mteb/models/model_implementations/model2vec_models.py +23 -4
  147. mteb/models/model_implementations/moka_models.py +3 -0
  148. mteb/models/model_implementations/nbailab.py +3 -0
  149. mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
  150. mteb/models/model_implementations/nomic_models.py +17 -4
  151. mteb/models/model_implementations/nomic_models_vision.py +5 -3
  152. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
  153. mteb/models/model_implementations/nvidia_models.py +15 -4
  154. mteb/models/model_implementations/octen_models.py +3 -1
  155. mteb/models/model_implementations/openai_models.py +14 -4
  156. mteb/models/model_implementations/openclip_models.py +17 -4
  157. mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
  158. mteb/models/model_implementations/ops_moa_models.py +9 -2
  159. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
  160. mteb/models/model_implementations/pawan_models.py +1 -0
  161. mteb/models/model_implementations/piccolo_models.py +2 -0
  162. mteb/models/model_implementations/promptriever_models.py +16 -6
  163. mteb/models/model_implementations/pylate_models.py +32 -13
  164. mteb/models/model_implementations/qodo_models.py +2 -0
  165. mteb/models/model_implementations/qtack_models.py +1 -0
  166. mteb/models/model_implementations/qwen3_models.py +11 -1
  167. mteb/models/model_implementations/qzhou_models.py +2 -0
  168. mteb/models/model_implementations/random_baseline.py +4 -3
  169. mteb/models/model_implementations/rasgaard_models.py +1 -0
  170. mteb/models/model_implementations/reasonir_model.py +65 -0
  171. mteb/models/model_implementations/repllama_models.py +15 -6
  172. mteb/models/model_implementations/rerankers_custom.py +13 -4
  173. mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
  174. mteb/models/model_implementations/richinfoai_models.py +1 -0
  175. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  176. mteb/models/model_implementations/ruri_models.py +10 -0
  177. mteb/models/model_implementations/salesforce_models.py +10 -1
  178. mteb/models/model_implementations/samilpwc_models.py +1 -0
  179. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  180. mteb/models/model_implementations/searchmap_models.py +1 -0
  181. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  182. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
  183. mteb/models/model_implementations/seed_models.py +2 -1
  184. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  185. mteb/models/model_implementations/shuu_model.py +1 -0
  186. mteb/models/model_implementations/siglip_models.py +19 -4
  187. mteb/models/model_implementations/slm_models.py +7 -4
  188. mteb/models/model_implementations/sonar_models.py +2 -1
  189. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  190. mteb/models/model_implementations/stella_models.py +6 -0
  191. mteb/models/model_implementations/tarka_models.py +2 -0
  192. mteb/models/model_implementations/text2vec_models.py +3 -0
  193. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  194. mteb/models/model_implementations/uae_models.py +10 -4
  195. mteb/models/model_implementations/vdr_models.py +8 -1
  196. mteb/models/model_implementations/vi_vn_models.py +6 -0
  197. mteb/models/model_implementations/vista_models.py +11 -4
  198. mteb/models/model_implementations/vlm2vec_models.py +11 -4
  199. mteb/models/model_implementations/voyage_models.py +52 -4
  200. mteb/models/model_implementations/voyage_v.py +11 -6
  201. mteb/models/model_implementations/xyz_models.py +1 -0
  202. mteb/models/model_implementations/youtu_models.py +1 -0
  203. mteb/models/model_implementations/yuan_models.py +1 -0
  204. mteb/models/model_implementations/yuan_models_en.py +2 -1
  205. mteb/models/model_meta.py +47 -9
  206. mteb/models/models_protocols.py +23 -18
  207. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  208. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  209. mteb/models/search_wrappers.py +31 -12
  210. mteb/models/sentence_transformer_wrapper.py +4 -3
  211. mteb/models/vllm_wrapper.py +8 -6
  212. mteb/results/benchmark_results.py +22 -17
  213. mteb/results/model_result.py +21 -15
  214. mteb/results/task_result.py +32 -16
  215. mteb/similarity_functions.py +8 -2
  216. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  217. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  218. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  219. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  220. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  221. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  222. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  223. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  224. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  225. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  226. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  227. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  228. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  229. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  230. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  231. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  232. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  233. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  234. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  235. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  236. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  237. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  238. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  239. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  240. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  241. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  242. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  243. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  244. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  245. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  246. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  247. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  248. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  249. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  250. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  251. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  252. mteb/tasks/classification/est/estonian_valence.py +1 -1
  253. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  254. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  257. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  260. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  261. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  262. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  263. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  264. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  265. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  266. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  267. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  268. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  269. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  270. mteb/tasks/classification/kor/klue_tc.py +2 -2
  271. mteb/tasks/classification/kor/kor_fin.py +1 -1
  272. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  274. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  275. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  276. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  277. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  278. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  279. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  280. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  281. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  282. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  283. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  284. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  285. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  286. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  287. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  288. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  289. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  290. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  291. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  292. mteb/tasks/classification/ron/moroco.py +1 -1
  293. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  294. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  295. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  296. mteb/tasks/classification/rus/headline_classification.py +2 -2
  297. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  298. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  299. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  300. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  301. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  302. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  303. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  304. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  305. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  306. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  307. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  308. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  309. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  310. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  311. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  312. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  313. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  314. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  315. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  316. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  317. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  318. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  319. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  320. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  321. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  322. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  323. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  324. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  325. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  326. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  327. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  328. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  329. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  330. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  331. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  332. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  333. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  334. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  335. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  336. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  337. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  338. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  341. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  342. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  343. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  344. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  345. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  346. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  347. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  348. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  349. mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
  350. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  351. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  352. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  353. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  354. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  355. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  356. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  357. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  358. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  359. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  360. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  361. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  362. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  363. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  364. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  365. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  366. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  367. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  368. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  369. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  370. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  371. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  372. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  373. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  374. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  375. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  376. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  377. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  378. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  379. mteb/tasks/pair_classification/rus/terra.py +2 -2
  380. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  381. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  382. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  383. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  384. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  385. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  386. mteb/tasks/retrieval/code/code_rag.py +4 -4
  387. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  388. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  389. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  390. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  391. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  392. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  393. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  394. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  395. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
  396. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
  397. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  398. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  399. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  400. mteb/tasks/retrieval/eng/__init__.py +42 -0
  401. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  402. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  403. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  404. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  405. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  406. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  407. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  408. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  409. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  410. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  411. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  412. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  413. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  414. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  415. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  416. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  417. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  418. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  419. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  420. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  421. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  422. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  423. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  424. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  425. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  426. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  428. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  435. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  438. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  439. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  440. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  441. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  442. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  443. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  444. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  445. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  446. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  447. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  448. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  449. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  450. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  451. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  452. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  453. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  454. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  455. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  456. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  457. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  458. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  459. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  460. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  461. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  462. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  463. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  464. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  465. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  466. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  467. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  468. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  469. mteb/tasks/retrieval/nob/norquad.py +1 -1
  470. mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
  471. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  472. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  473. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  474. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  475. mteb/tasks/sts/kor/klue_sts.py +1 -1
  476. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  477. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  478. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  479. mteb/types/_encoder_io.py +1 -1
  480. mteb/types/statistics.py +9 -2
  481. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/METADATA +1 -1
  482. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/RECORD +486 -465
  483. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  484. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  485. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  486. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,18 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
  from tqdm.auto import tqdm
6
7
 
7
- from mteb.abstasks.task_metadata import TaskMetadata
8
8
  from mteb.models.abs_encoder import AbsEncoder
9
9
  from mteb.models.model_meta import ModelMeta, ScoringFunction
10
- from mteb.types import Array, BatchedInput, PromptType
10
+
11
+ if TYPE_CHECKING:
12
+ from torch.utils.data import DataLoader
13
+
14
+ from mteb.abstasks.task_metadata import TaskMetadata
15
+ from mteb.types import Array, BatchedInput, PromptType
11
16
 
12
17
 
13
18
  class ALIGNModel(AbsEncoder):
@@ -111,6 +116,7 @@ align_base = ModelMeta(
111
116
  release_date="2023-02-24",
112
117
  modalities=["image", "text"],
113
118
  n_parameters=176_000_000,
119
+ n_embedding_parameters=None,
114
120
  memory_usage_mb=671,
115
121
  max_tokens=64,
116
122
  embed_dim=768,
@@ -8,6 +8,7 @@ amazon_titan_text_embeddings_v2 = ModelMeta(
8
8
  release_date="2024-04-30",
9
9
  languages=["eng-Latn"],
10
10
  n_parameters=None,
11
+ n_embedding_parameters=None,
11
12
  memory_usage_mb=None,
12
13
  max_tokens=None,
13
14
  embed_dim=None,
@@ -12,6 +12,7 @@ model2vecdk = ModelMeta(
12
12
  revision="cb576c78dcc1b729e4612645f61db59929d69e61",
13
13
  release_date="2025-11-21",
14
14
  n_parameters=48042496,
15
+ n_embedding_parameters=None,
15
16
  memory_usage_mb=183,
16
17
  max_tokens=np.inf,
17
18
  embed_dim=256,
@@ -43,6 +44,7 @@ model2vecdk_stem = ModelMeta(
43
44
  revision="cb576c78dcc1b729e4612645f61db59929d69e61",
44
45
  release_date="2025-11-21",
45
46
  n_parameters=48578560,
47
+ n_embedding_parameters=None,
46
48
  memory_usage_mb=185,
47
49
  max_tokens=np.inf,
48
50
  embed_dim=256,
@@ -10,6 +10,7 @@ arabic_triplet_matryoshka = ModelMeta(
10
10
  revision="ed357f222f0b6ea6670d2c9b5a1cb93950d34200",
11
11
  release_date="2024-07-28",
12
12
  n_parameters=135_000_000,
13
+ n_embedding_parameters=49_152_000,
13
14
  memory_usage_mb=516,
14
15
  embed_dim=768,
15
16
  license="apache-2.0",
@@ -147,6 +147,7 @@ arctic_embed_xs = ModelMeta(
147
147
  open_weights=True,
148
148
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
149
149
  n_parameters=22_600_000,
150
+ n_embedding_parameters=11_720_448,
150
151
  memory_usage_mb=86,
151
152
  max_tokens=512,
152
153
  embed_dim=384,
@@ -173,6 +174,7 @@ arctic_embed_s = ModelMeta(
173
174
  open_weights=True,
174
175
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
175
176
  n_parameters=32_200_000,
177
+ n_embedding_parameters=11_720_448,
176
178
  memory_usage_mb=127,
177
179
  max_tokens=512,
178
180
  embed_dim=384,
@@ -199,6 +201,7 @@ arctic_embed_m = ModelMeta(
199
201
  open_weights=True,
200
202
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
201
203
  n_parameters=109_000_000,
204
+ n_embedding_parameters=23_440_896,
202
205
  memory_usage_mb=415,
203
206
  max_tokens=512,
204
207
  embed_dim=768,
@@ -225,6 +228,7 @@ arctic_embed_m_long = ModelMeta(
225
228
  open_weights=True,
226
229
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
227
230
  n_parameters=137_000_000,
231
+ n_embedding_parameters=None,
228
232
  memory_usage_mb=522,
229
233
  max_tokens=2048,
230
234
  embed_dim=768,
@@ -250,6 +254,7 @@ arctic_embed_l = ModelMeta(
250
254
  open_weights=True,
251
255
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
252
256
  n_parameters=335_000_000,
257
+ n_embedding_parameters=31_254_528,
253
258
  memory_usage_mb=1274,
254
259
  max_tokens=512,
255
260
  embed_dim=1024,
@@ -280,6 +285,7 @@ arctic_embed_m_v1_5 = ModelMeta(
280
285
  open_weights=True,
281
286
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors", "GGUF"],
282
287
  n_parameters=109_000_000,
288
+ n_embedding_parameters=23_440_896,
283
289
  memory_usage_mb=415,
284
290
  max_tokens=512,
285
291
  embed_dim=768,
@@ -306,6 +312,7 @@ arctic_embed_m_v2_0 = ModelMeta(
306
312
  open_weights=True,
307
313
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
308
314
  n_parameters=305_000_000,
315
+ n_embedding_parameters=None,
309
316
  memory_usage_mb=1165,
310
317
  max_tokens=8192,
311
318
  embed_dim=768,
@@ -331,6 +338,7 @@ arctic_embed_l_v2_0 = ModelMeta(
331
338
  open_weights=True,
332
339
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
333
340
  n_parameters=568_000_000,
341
+ n_embedding_parameters=256_002_048,
334
342
  memory_usage_mb=2166,
335
343
  max_tokens=8192,
336
344
  embed_dim=1024,
@@ -16,6 +16,7 @@ b1ade_embed = ModelMeta(
16
16
  open_weights=True,
17
17
  release_date="2025-03-10",
18
18
  n_parameters=335_000_000,
19
+ n_embedding_parameters=31_254_528,
19
20
  memory_usage_mb=1278,
20
21
  embed_dim=1024,
21
22
  license="mit",
@@ -1,20 +1,30 @@
1
+ from __future__ import annotations
2
+
1
3
  import json
2
4
  import logging
3
5
  import re
4
- from typing import Any
6
+ from typing import TYPE_CHECKING, Any
5
7
 
6
8
  import numpy as np
7
- from torch.utils.data import DataLoader
8
9
  from tqdm.auto import tqdm
9
10
 
10
11
  from mteb._requires_package import requires_package
11
- from mteb.abstasks.task_metadata import TaskMetadata
12
12
  from mteb.models.abs_encoder import AbsEncoder
13
13
  from mteb.models.model_meta import ModelMeta, ScoringFunction
14
- from mteb.types import Array, BatchedInput, PromptType
15
14
 
16
- from .cohere_models import model_prompts as cohere_model_prompts
17
- from .cohere_models import supported_languages as cohere_supported_languages
15
+ from .cohere_models import (
16
+ model_prompts as cohere_model_prompts,
17
+ )
18
+ from .cohere_models import (
19
+ supported_languages as cohere_supported_languages,
20
+ )
21
+
22
+ if TYPE_CHECKING:
23
+ from torch.utils.data import DataLoader
24
+
25
+ from mteb.abstasks.task_metadata import TaskMetadata
26
+ from mteb.types import Array, BatchedInput, PromptType
27
+
18
28
 
19
29
  logger = logging.getLogger(__name__)
20
30
 
@@ -169,6 +179,7 @@ amazon_titan_embed_text_v1 = ModelMeta(
169
179
  embed_dim=1536,
170
180
  open_weights=False,
171
181
  n_parameters=None,
182
+ n_embedding_parameters=None,
172
183
  memory_usage_mb=None,
173
184
  public_training_code=None,
174
185
  public_training_data=None, # assumed
@@ -196,6 +207,7 @@ amazon_titan_embed_text_v2 = ModelMeta(
196
207
  embed_dim=1024,
197
208
  open_weights=False,
198
209
  n_parameters=None,
210
+ n_embedding_parameters=None,
199
211
  memory_usage_mb=None,
200
212
  public_training_code=None,
201
213
  public_training_data=None, # assumed
@@ -225,6 +237,7 @@ cohere_embed_english_v3 = ModelMeta(
225
237
  revision="1",
226
238
  release_date="2023-11-02",
227
239
  n_parameters=None,
240
+ n_embedding_parameters=None,
228
241
  memory_usage_mb=None,
229
242
  public_training_code=None,
230
243
  public_training_data=None, # assumed
@@ -253,6 +266,7 @@ cohere_embed_multilingual_v3 = ModelMeta(
253
266
  revision="1",
254
267
  release_date="2023-11-02",
255
268
  n_parameters=None,
269
+ n_embedding_parameters=None,
256
270
  memory_usage_mb=None,
257
271
  public_training_code=None,
258
272
  public_training_data=None, # assumed
@@ -6,7 +6,29 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
6
6
 
7
7
  from .e5_instruct import E5_MISTRAL_TRAINING_DATA
8
8
 
9
- model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
9
+ model_prompts = {
10
+ "query": "Represent this sentence for searching relevant passages: ",
11
+ "BrightBiologyRetrieval-query": "Represent this biology post for searching relevant passages: ",
12
+ "BrightEarthScienceRetrieval-query": "Represent this earth_science post for searching relevant passages: ",
13
+ "BrightEconomicsRetrieval-query": "Represent this economics post for searching relevant passages: ",
14
+ "BrightPsychologyRetrieval-query": "Represent this psychology post for searching relevant passages: ",
15
+ "BrightRoboticsRetrieval-query": "Represent this robotics post for searching relevant passages: ",
16
+ "BrightStackoverflowRetrieval-query": "Represent this stackoverflow post for searching relevant passages: ",
17
+ "BrightSustainableLivingRetrieval-query": "Represent this sustainable_living post for searching relevant passages: ",
18
+ "BrightPonyRetrieval-query": "Represent this Pony question for searching relevant passages: ",
19
+ "BrightLeetcodeRetrieval-query": "Represent this Coding problem for searching relevant examples: ",
20
+ "BrightAopsRetrieval-query": "Represent this Math problem for searching relevant examples: ",
21
+ "BrightTheoremQATheoremsRetrieval-query": "Represent this Math problem for searching relevant theorems: ",
22
+ "BrightTheoremQAQuestionsRetrieval-query": "Represent this Math problem for searching relevant examples: ",
23
+ "BrightBiologyLongRetrieval-query": "Represent this biology post for searching relevant documents: ",
24
+ "BrightEarthScienceLongRetrieval-query": "Represent this earth_science post for searching relevant documents: ",
25
+ "BrightEconomicsLongRetrieval-query": "Represent this economics post for searching relevant documents: ",
26
+ "BrightPsychologyLongRetrieval-query": "Represent this psychology post for searching relevant documents: ",
27
+ "BrightRoboticsLongRetrieval-query": "Represent this robotics post for searching relevant document: ",
28
+ "BrightStackoverflowLongRetrieval-query": "Represent this stackoverflow post for searching relevant document: ",
29
+ "BrightSustainableLivingLongRetrieval-query": "Represent this sustainable_living post for searching relevant documents: ",
30
+ "BrightPonyLongRetrieval-query": "Represent this Pony question for searching relevant documents: ",
31
+ }
10
32
  BGE_15_CITATION = """@misc{bge_embedding,
11
33
  title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
12
34
  author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
@@ -325,6 +347,7 @@ bge_small_en_v1_5 = ModelMeta(
325
347
  revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a",
326
348
  release_date="2023-09-12", # initial commit of hf model.
327
349
  n_parameters=33_400_000,
350
+ n_embedding_parameters=11_720_448,
328
351
  memory_usage_mb=127,
329
352
  embed_dim=512,
330
353
  license="mit",
@@ -357,6 +380,7 @@ bge_base_en_v1_5 = ModelMeta(
357
380
  revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",
358
381
  release_date="2023-09-11", # initial commit of hf model.
359
382
  n_parameters=109_000_000,
383
+ n_embedding_parameters=23_440_896,
360
384
  memory_usage_mb=390,
361
385
  embed_dim=768,
362
386
  license="mit",
@@ -389,6 +413,7 @@ bge_large_en_v1_5 = ModelMeta(
389
413
  revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09",
390
414
  release_date="2023-09-12", # initial commit of hf model.
391
415
  n_parameters=335_000_000,
416
+ n_embedding_parameters=31_254_528,
392
417
  memory_usage_mb=1242,
393
418
  embed_dim=1024,
394
419
  license="mit",
@@ -421,6 +446,7 @@ bge_small_zh = ModelMeta(
421
446
  revision="1d2363c5de6ce9ba9c890c8e23a4c72dce540ca8",
422
447
  release_date="2023-08-05", # initial commit of hf model.
423
448
  n_parameters=33_400_000,
449
+ n_embedding_parameters=10_817_536,
424
450
  memory_usage_mb=127,
425
451
  embed_dim=512,
426
452
  license="mit",
@@ -448,6 +474,7 @@ bge_base_zh = ModelMeta(
448
474
  revision="0e5f83d4895db7955e4cb9ed37ab73f7ded339b6",
449
475
  release_date="2023-08-05", # initial commit of hf model.
450
476
  n_parameters=109_000_000,
477
+ n_embedding_parameters=16_226_304,
451
478
  memory_usage_mb=390,
452
479
  embed_dim=768,
453
480
  license="mit",
@@ -475,6 +502,7 @@ bge_large_zh = ModelMeta(
475
502
  revision="b5d9f5c027e87b6f0b6fa4b614f8f9cdc45ce0e8",
476
503
  release_date="2023-08-02", # initial commit of hf model.
477
504
  n_parameters=335_000_000,
505
+ n_embedding_parameters=21_635_072,
478
506
  memory_usage_mb=1242,
479
507
  embed_dim=1024,
480
508
  license="mit",
@@ -502,6 +530,7 @@ bge_small_en = ModelMeta(
502
530
  revision="4778d71a06863076696b03fd2777eb118712cad8",
503
531
  release_date="2023-08-05", # initial commit of hf model.
504
532
  n_parameters=33_400_000,
533
+ n_embedding_parameters=11_720_448,
505
534
  memory_usage_mb=127,
506
535
  embed_dim=512,
507
536
  license="mit",
@@ -529,6 +558,7 @@ bge_base_en = ModelMeta(
529
558
  revision="b737bf5dcc6ee8bdc530531266b4804a5d77b5d8",
530
559
  release_date="2023-08-05", # initial commit of hf model.
531
560
  n_parameters=109_000_000,
561
+ n_embedding_parameters=23_440_896,
532
562
  memory_usage_mb=390,
533
563
  embed_dim=768,
534
564
  license="mit",
@@ -562,6 +592,7 @@ bge_large_en = ModelMeta(
562
592
  revision="abe7d9d814b775ca171121fb03f394dc42974275",
563
593
  release_date="2023-08-05", # initial commit of hf model.
564
594
  n_parameters=335_000_000,
595
+ n_embedding_parameters=31_254_528,
565
596
  memory_usage_mb=1242,
566
597
  embed_dim=1024,
567
598
  license="mit",
@@ -590,6 +621,7 @@ bge_small_zh_v1_5 = ModelMeta(
590
621
  revision="7999e1d3359715c523056ef9478215996d62a620",
591
622
  release_date="2023-09-12", # initial commit of hf model.
592
623
  n_parameters=33_400_000,
624
+ n_embedding_parameters=10_817_536,
593
625
  memory_usage_mb=91,
594
626
  embed_dim=512,
595
627
  license="mit",
@@ -616,6 +648,7 @@ bge_base_zh_v1_5 = ModelMeta(
616
648
  revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65",
617
649
  release_date="2023-09-11", # initial commit of hf model.
618
650
  n_parameters=109_000_000,
651
+ n_embedding_parameters=16_226_304,
619
652
  memory_usage_mb=416,
620
653
  embed_dim=768,
621
654
  license="mit",
@@ -642,6 +675,7 @@ bge_large_zh_v1_5 = ModelMeta(
642
675
  revision="79e7739b6ab944e86d6171e44d24c997fc1e0116",
643
676
  release_date="2023-09-12", # initial commit of hf model.
644
677
  n_parameters=335_000_000,
678
+ n_embedding_parameters=21_635_072,
645
679
  memory_usage_mb=1278,
646
680
  embed_dim=1024,
647
681
  license="mit",
@@ -665,6 +699,7 @@ bge_m3 = ModelMeta(
665
699
  revision="5617a9f61b028005a4858fdac845db406aefb181",
666
700
  release_date="2024-06-28",
667
701
  n_parameters=568_000_000,
702
+ n_embedding_parameters=256_002_048,
668
703
  memory_usage_mb=2167,
669
704
  embed_dim=1024,
670
705
  license="mit",
@@ -761,6 +796,7 @@ bge_multilingual_gemma2 = ModelMeta(
761
796
  revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
762
797
  release_date="2024-07-25", # initial commit of hf model.
763
798
  n_parameters=int(9.24 * 1e9),
799
+ n_embedding_parameters=917_511_168,
764
800
  memory_usage_mb=35254,
765
801
  embed_dim=3584, # from old C-MTEB leaderboard
766
802
  license="https://ai.google.dev/gemma/terms",
@@ -808,6 +844,7 @@ bge_en_icl = ModelMeta(
808
844
  revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5",
809
845
  release_date="2024-07-25", # initial commit of hf model.
810
846
  n_parameters=int(7.11 * 1e9),
847
+ n_embedding_parameters=131_084_288,
811
848
  memory_usage_mb=27125,
812
849
  embed_dim=4096,
813
850
  license="apache-2.0",
@@ -842,6 +879,7 @@ bge_m3_unsupervised = ModelMeta(
842
879
  revision="46f03bc86361cf88102b0b517b36c8259f2946b1",
843
880
  release_date="2024-01-30", # January 30, 2024 - BGE-M3 release date
844
881
  n_parameters=568_000_000,
882
+ n_embedding_parameters=256_002_048,
845
883
  memory_usage_mb=2167,
846
884
  embed_dim=1024,
847
885
  license="mit",
@@ -871,6 +909,7 @@ manu__bge_m3_custom_fr = ModelMeta(
871
909
  languages=None,
872
910
  loader=sentence_transformers_loader,
873
911
  n_parameters=567754752,
912
+ n_embedding_parameters=256_002_048,
874
913
  memory_usage_mb=2166,
875
914
  max_tokens=8194.0,
876
915
  embed_dim=1024,
@@ -9,6 +9,7 @@ bica_base = ModelMeta(
9
9
  revision="31237a836e5ae908c308a256573e5f0986498574",
10
10
  release_date="2025-11-14",
11
11
  n_parameters=110_000_000,
12
+ n_embedding_parameters=23_440_896,
12
13
  memory_usage_mb=418,
13
14
  embed_dim=768,
14
15
  license="mit",
@@ -1,14 +1,19 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
  from tqdm.auto import tqdm
6
7
 
7
8
  from mteb._requires_package import requires_package
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
- from mteb.types import Array, BatchedInput, PromptType
11
+
12
+ if TYPE_CHECKING:
13
+ from torch.utils.data import DataLoader
14
+
15
+ from mteb.abstasks.task_metadata import TaskMetadata
16
+ from mteb.types import Array, BatchedInput, PromptType
12
17
 
13
18
  BLIP2_CITATION = """@inproceedings{li2023blip2,
14
19
  title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
@@ -172,6 +177,7 @@ blip2_opt_2_7b = ModelMeta(
172
177
  release_date="2024-03-22",
173
178
  modalities=["image", "text"],
174
179
  n_parameters=3_740_000_000,
180
+ n_embedding_parameters=None,
175
181
  memory_usage_mb=14285,
176
182
  max_tokens=None,
177
183
  embed_dim=768,
@@ -196,6 +202,7 @@ blip2_opt_6_7b_coco = ModelMeta(
196
202
  release_date="2024-03-31",
197
203
  modalities=["image", "text"],
198
204
  n_parameters=7_750_000_000,
205
+ n_embedding_parameters=None,
199
206
  memory_usage_mb=29577,
200
207
  max_tokens=None,
201
208
  embed_dim=768,
@@ -1,14 +1,19 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
6
  from torch.nn.functional import normalize
5
- from torch.utils.data import DataLoader
6
7
  from tqdm.auto import tqdm
7
8
 
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
- from mteb.types import Array, BatchedInput, PromptType
11
+
12
+ if TYPE_CHECKING:
13
+ from torch.utils.data import DataLoader
14
+
15
+ from mteb.abstasks.task_metadata import TaskMetadata
16
+ from mteb.types import Array, BatchedInput, PromptType
12
17
 
13
18
  BLIP_CITATION = """@misc{https://doi.org/10.48550/arxiv.2201.12086,
14
19
  doi = {10.48550/ARXIV.2201.12086},
@@ -136,6 +141,7 @@ blip_image_captioning_large = ModelMeta(
136
141
  release_date="2023-12-07",
137
142
  modalities=["image", "text"],
138
143
  n_parameters=470_000_000,
144
+ n_embedding_parameters=23_442_432,
139
145
  memory_usage_mb=1792,
140
146
  max_tokens=512,
141
147
  embed_dim=768,
@@ -164,6 +170,7 @@ blip_image_captioning_base = ModelMeta(
164
170
  release_date="2023-08-01",
165
171
  modalities=["image", "text"],
166
172
  n_parameters=247_000_000,
173
+ n_embedding_parameters=23_442_432,
167
174
  memory_usage_mb=942,
168
175
  max_tokens=512,
169
176
  embed_dim=768,
@@ -193,6 +200,7 @@ blip_vqa_base = ModelMeta(
193
200
  release_date="2023-12-07",
194
201
  modalities=["image", "text"],
195
202
  n_parameters=247_000_000,
203
+ n_embedding_parameters=23_442_432,
196
204
  memory_usage_mb=1467,
197
205
  max_tokens=512,
198
206
  embed_dim=768,
@@ -220,6 +228,7 @@ blip_vqa_capfilt_large = ModelMeta(
220
228
  release_date="2023-01-22",
221
229
  modalities=["image", "text"],
222
230
  n_parameters=247_000_000,
231
+ n_embedding_parameters=23_442_432,
223
232
  memory_usage_mb=942,
224
233
  max_tokens=512,
225
234
  embed_dim=768,
@@ -247,6 +256,7 @@ blip_itm_base_coco = ModelMeta(
247
256
  release_date="2023-08-01",
248
257
  modalities=["image", "text"],
249
258
  n_parameters=247_000_000,
259
+ n_embedding_parameters=23_442_432,
250
260
  memory_usage_mb=942,
251
261
  max_tokens=512,
252
262
  embed_dim=768,
@@ -274,6 +284,7 @@ blip_itm_large_coco = ModelMeta(
274
284
  release_date="2023-08-01",
275
285
  modalities=["image", "text"],
276
286
  n_parameters=470_000_000,
287
+ n_embedding_parameters=23_442_432,
277
288
  memory_usage_mb=1793,
278
289
  max_tokens=512,
279
290
  embed_dim=768,
@@ -302,6 +313,7 @@ blip_itm_base_flickr = ModelMeta(
302
313
  release_date="2023-08-01",
303
314
  modalities=["image", "text"],
304
315
  n_parameters=247_000_000,
316
+ n_embedding_parameters=23_442_432,
305
317
  memory_usage_mb=942,
306
318
  max_tokens=512,
307
319
  embed_dim=768,
@@ -330,6 +342,7 @@ blip_itm_large_flickr = ModelMeta(
330
342
  release_date="2023-08-01",
331
343
  modalities=["image", "text"],
332
344
  n_parameters=470_000_000,
345
+ n_embedding_parameters=23_442_432,
333
346
  memory_usage_mb=1793,
334
347
  max_tokens=512,
335
348
  embed_dim=768,
@@ -1,18 +1,22 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
4
+ from typing import TYPE_CHECKING
2
5
 
3
6
  from mteb._create_dataloaders import _create_text_queries_dataloader
4
7
  from mteb._requires_package import requires_package
5
- from mteb.abstasks.task_metadata import TaskMetadata
6
8
  from mteb.models.model_meta import ModelMeta
7
- from mteb.models.models_protocols import SearchProtocol
8
- from mteb.types import (
9
- CorpusDatasetType,
10
- EncodeKwargs,
11
- InstructionDatasetType,
12
- QueryDatasetType,
13
- RetrievalOutputType,
14
- TopRankedDocumentsType,
15
- )
9
+
10
+ if TYPE_CHECKING:
11
+ from mteb.abstasks.task_metadata import TaskMetadata
12
+ from mteb.models.models_protocols import SearchProtocol
13
+ from mteb.types import (
14
+ CorpusDatasetType,
15
+ EncodeKwargs,
16
+ QueryDatasetType,
17
+ RetrievalOutputType,
18
+ TopRankedDocumentsType,
19
+ )
16
20
 
17
21
  logger = logging.getLogger(__name__)
18
22
 
@@ -50,6 +54,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
50
54
  hf_split: str,
51
55
  hf_subset: str,
52
56
  encode_kwargs: EncodeKwargs,
57
+ num_proc: int = 1,
53
58
  ) -> None:
54
59
  logger.info("Encoding Corpus...")
55
60
  corpus_texts = [
@@ -75,8 +80,8 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
75
80
  hf_subset: str,
76
81
  top_k: int,
77
82
  encode_kwargs: EncodeKwargs,
78
- instructions: InstructionDatasetType | None = None,
79
83
  top_ranked: TopRankedDocumentsType | None = None,
84
+ num_proc: int = 1,
80
85
  ) -> RetrievalOutputType:
81
86
  logger.info("Encoding Queries...")
82
87
  query_ids = list(queries["id"])
@@ -98,13 +103,17 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
98
103
  query_results = queries_results[qi]
99
104
  scores = queries_scores[qi]
100
105
  doc_id_to_score = {}
106
+ query_documents = (
107
+ top_ranked[qid] if top_ranked and qid in top_ranked else None
108
+ )
101
109
 
102
110
  # Iterate over results
103
- for ri in range(len(query_results)):
104
- doc_idx = query_results[ri]
105
- score = scores[ri]
111
+ for doc_idx, score in zip(query_results, scores):
106
112
  doc_id = self.corpus_idx_to_id[doc_idx]
107
113
 
114
+ # handle reranking with a filtered set of documents
115
+ if query_documents is not None and doc_id not in query_documents:
116
+ continue
108
117
  doc_id_to_score[doc_id] = float(score)
109
118
 
110
119
  results[qid] = doc_id_to_score
@@ -127,6 +136,7 @@ bm25_s = ModelMeta(
127
136
  revision="0_1_10",
128
137
  release_date="2024-07-10", # release of version 0.1.10
129
138
  n_parameters=None,
139
+ n_embedding_parameters=None,
130
140
  memory_usage_mb=None,
131
141
  embed_dim=None,
132
142
  license=None,
@@ -1,5 +1,6 @@
1
- from collections.abc import Callable
2
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
3
4
 
4
5
  import torch
5
6
  from sentence_transformers import SentenceTransformer
@@ -9,6 +10,9 @@ from mteb.models import ModelMeta
9
10
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
10
11
  from mteb.types import PromptType
11
12
 
13
+ if TYPE_CHECKING:
14
+ from collections.abc import Callable
15
+
12
16
 
13
17
  def instruction_template(
14
18
  instruction: str, prompt_type: PromptType | None = None
@@ -99,6 +103,7 @@ BMRetriever_410M = ModelMeta(
99
103
  release_date="2024-04-29",
100
104
  embed_dim=1024,
101
105
  n_parameters=353_822_720,
106
+ n_embedding_parameters=51_511_296,
102
107
  memory_usage_mb=1349,
103
108
  max_tokens=2048,
104
109
  license="mit",
@@ -129,6 +134,7 @@ BMRetriever_1B = ModelMeta(
129
134
  release_date="2024-04-29",
130
135
  embed_dim=2048,
131
136
  n_parameters=908_759_040,
137
+ n_embedding_parameters=103_022_592,
132
138
  memory_usage_mb=3466,
133
139
  max_tokens=2048,
134
140
  license="mit",
@@ -159,6 +165,7 @@ BMRetriever_2B = ModelMeta(
159
165
  release_date="2024-04-29",
160
166
  embed_dim=2048,
161
167
  n_parameters=2_506_172_416,
168
+ n_embedding_parameters=524_288_000,
162
169
  memory_usage_mb=9560,
163
170
  max_tokens=8192,
164
171
  license="mit",
@@ -189,6 +196,7 @@ BMRetriever_7B = ModelMeta(
189
196
  release_date="2024-04-29",
190
197
  embed_dim=4096,
191
198
  n_parameters=7_110_660_096,
199
+ n_embedding_parameters=131_072_000,
192
200
  memory_usage_mb=27124,
193
201
  max_tokens=32768,
194
202
  license="mit",
@@ -41,6 +41,7 @@ cadet_embed = ModelMeta(
41
41
  open_weights=True,
42
42
  release_date="2025-05-11",
43
43
  n_parameters=109_000_000,
44
+ n_embedding_parameters=23_440_896,
44
45
  memory_usage_mb=418,
45
46
  embed_dim=768,
46
47
  license="apache-2.0",