mteb-2.5.2-py3-none-any.whl → mteb-2.7.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (529)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +78 -30
  3. mteb/_evaluators/any_sts_evaluator.py +13 -6
  4. mteb/_evaluators/clustering_evaluator.py +13 -5
  5. mteb/_evaluators/evaluator.py +12 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
  7. mteb/_evaluators/pair_classification_evaluator.py +17 -7
  8. mteb/_evaluators/retrieval_evaluator.py +23 -14
  9. mteb/_evaluators/retrieval_metrics.py +26 -19
  10. mteb/_evaluators/sklearn_evaluator.py +27 -17
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
  12. mteb/_evaluators/text/summarization_evaluator.py +31 -20
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +9 -3
  16. mteb/abstasks/_data_filter/task_pipelines.py +10 -2
  17. mteb/abstasks/_statistics_calculation.py +21 -11
  18. mteb/abstasks/_stratification.py +18 -18
  19. mteb/abstasks/abstask.py +78 -44
  20. mteb/abstasks/aggregate_task_metadata.py +21 -18
  21. mteb/abstasks/aggregated_task.py +23 -35
  22. mteb/abstasks/classification.py +39 -18
  23. mteb/abstasks/clustering.py +37 -20
  24. mteb/abstasks/clustering_legacy.py +30 -16
  25. mteb/abstasks/image/image_text_pair_classification.py +26 -9
  26. mteb/abstasks/multilabel_classification.py +33 -21
  27. mteb/abstasks/pair_classification.py +44 -19
  28. mteb/abstasks/regression.py +18 -10
  29. mteb/abstasks/retrieval.py +82 -52
  30. mteb/abstasks/retrieval_dataset_loaders.py +50 -39
  31. mteb/abstasks/sts.py +34 -15
  32. mteb/abstasks/task_metadata.py +44 -37
  33. mteb/abstasks/text/bitext_mining.py +57 -35
  34. mteb/abstasks/text/reranking.py +10 -8
  35. mteb/abstasks/text/summarization.py +26 -10
  36. mteb/abstasks/zeroshot_classification.py +27 -9
  37. mteb/benchmarks/_create_table.py +13 -7
  38. mteb/benchmarks/benchmark.py +15 -3
  39. mteb/benchmarks/benchmarks/__init__.py +6 -0
  40. mteb/benchmarks/benchmarks/benchmarks.py +153 -13
  41. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  42. mteb/benchmarks/get_benchmark.py +14 -55
  43. mteb/cache.py +189 -31
  44. mteb/cli/_display_tasks.py +10 -4
  45. mteb/cli/build_cli.py +112 -13
  46. mteb/cli/generate_model_card.py +50 -23
  47. mteb/deprecated_evaluator.py +72 -54
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  49. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  50. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  52. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  65. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  66. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  67. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  68. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  69. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  70. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  71. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  72. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  73. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  74. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  75. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  81. mteb/evaluate.py +71 -47
  82. mteb/filter_tasks.py +36 -32
  83. mteb/get_tasks.py +37 -33
  84. mteb/languages/language_scripts.py +11 -4
  85. mteb/leaderboard/app.py +172 -37
  86. mteb/leaderboard/table.py +7 -2
  87. mteb/load_results.py +20 -14
  88. mteb/models/abs_encoder.py +30 -16
  89. mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
  90. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
  91. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
  92. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  93. mteb/models/cache_wrappers/cache_wrapper.py +16 -11
  94. mteb/models/get_model_meta.py +53 -9
  95. mteb/models/instruct_wrapper.py +41 -13
  96. mteb/models/model_implementations/align_models.py +11 -5
  97. mteb/models/model_implementations/amazon_models.py +1 -0
  98. mteb/models/model_implementations/andersborges.py +6 -4
  99. mteb/models/model_implementations/ara_models.py +2 -1
  100. mteb/models/model_implementations/arctic_models.py +16 -8
  101. mteb/models/model_implementations/b1ade_models.py +2 -1
  102. mteb/models/model_implementations/bedrock_models.py +20 -6
  103. mteb/models/model_implementations/bge_models.py +85 -22
  104. mteb/models/model_implementations/bica_model.py +4 -3
  105. mteb/models/model_implementations/blip2_models.py +13 -6
  106. mteb/models/model_implementations/blip_models.py +33 -20
  107. mteb/models/model_implementations/bm25.py +27 -17
  108. mteb/models/model_implementations/bmretriever_models.py +16 -6
  109. mteb/models/model_implementations/cadet_models.py +2 -1
  110. mteb/models/model_implementations/cde_models.py +22 -9
  111. mteb/models/model_implementations/clip_models.py +18 -10
  112. mteb/models/model_implementations/clips_models.py +6 -3
  113. mteb/models/model_implementations/codefuse_models.py +10 -5
  114. mteb/models/model_implementations/codesage_models.py +6 -3
  115. mteb/models/model_implementations/cohere_models.py +19 -9
  116. mteb/models/model_implementations/cohere_v.py +16 -6
  117. mteb/models/model_implementations/colpali_models.py +10 -6
  118. mteb/models/model_implementations/colqwen_models.py +24 -38
  119. mteb/models/model_implementations/colsmol_models.py +5 -3
  120. mteb/models/model_implementations/conan_models.py +12 -5
  121. mteb/models/model_implementations/dino_models.py +70 -46
  122. mteb/models/model_implementations/e5_instruct.py +27 -4
  123. mteb/models/model_implementations/e5_models.py +18 -9
  124. mteb/models/model_implementations/e5_v.py +16 -10
  125. mteb/models/model_implementations/eagerworks_models.py +12 -5
  126. mteb/models/model_implementations/emillykkejensen_models.py +9 -6
  127. mteb/models/model_implementations/en_code_retriever.py +2 -1
  128. mteb/models/model_implementations/euler_models.py +3 -2
  129. mteb/models/model_implementations/evaclip_models.py +13 -4
  130. mteb/models/model_implementations/fa_models.py +18 -9
  131. mteb/models/model_implementations/facebookai.py +16 -2
  132. mteb/models/model_implementations/geogpt_models.py +2 -1
  133. mteb/models/model_implementations/gme_v_models.py +13 -8
  134. mteb/models/model_implementations/google_models.py +16 -5
  135. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
  136. mteb/models/model_implementations/gritlm_models.py +5 -2
  137. mteb/models/model_implementations/gte_models.py +34 -13
  138. mteb/models/model_implementations/hinvec_models.py +7 -2
  139. mteb/models/model_implementations/human.py +1 -0
  140. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  141. mteb/models/model_implementations/inf_models.py +4 -2
  142. mteb/models/model_implementations/jasper_models.py +16 -7
  143. mteb/models/model_implementations/jina_clip.py +58 -14
  144. mteb/models/model_implementations/jina_models.py +35 -16
  145. mteb/models/model_implementations/kalm_models.py +24 -12
  146. mteb/models/model_implementations/kblab.py +13 -6
  147. mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
  148. mteb/models/model_implementations/kfst.py +2 -1
  149. mteb/models/model_implementations/kowshik24_models.py +2 -1
  150. mteb/models/model_implementations/lens_models.py +2 -0
  151. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  152. mteb/models/model_implementations/linq_models.py +8 -2
  153. mteb/models/model_implementations/listconranker.py +11 -5
  154. mteb/models/model_implementations/llm2clip_models.py +18 -10
  155. mteb/models/model_implementations/llm2vec_models.py +28 -14
  156. mteb/models/model_implementations/mcinext_models.py +12 -3
  157. mteb/models/model_implementations/mdbr_models.py +19 -3
  158. mteb/models/model_implementations/misc_models.py +131 -68
  159. mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
  160. mteb/models/model_implementations/mme5_models.py +3 -2
  161. mteb/models/model_implementations/moco_models.py +15 -8
  162. mteb/models/model_implementations/mod_models.py +3 -2
  163. mteb/models/model_implementations/model2vec_models.py +37 -18
  164. mteb/models/model_implementations/moka_models.py +4 -1
  165. mteb/models/model_implementations/nbailab.py +6 -3
  166. mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
  167. mteb/models/model_implementations/nomic_models.py +47 -19
  168. mteb/models/model_implementations/nomic_models_vision.py +6 -4
  169. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
  170. mteb/models/model_implementations/nvidia_models.py +165 -22
  171. mteb/models/model_implementations/octen_models.py +64 -3
  172. mteb/models/model_implementations/openai_models.py +14 -4
  173. mteb/models/model_implementations/openclip_models.py +30 -17
  174. mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
  175. mteb/models/model_implementations/ops_moa_models.py +10 -3
  176. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
  177. mteb/models/model_implementations/pawan_models.py +2 -1
  178. mteb/models/model_implementations/piccolo_models.py +3 -1
  179. mteb/models/model_implementations/pixie_models.py +56 -0
  180. mteb/models/model_implementations/promptriever_models.py +20 -10
  181. mteb/models/model_implementations/pylate_models.py +41 -21
  182. mteb/models/model_implementations/qodo_models.py +4 -2
  183. mteb/models/model_implementations/qtack_models.py +2 -1
  184. mteb/models/model_implementations/qwen3_models.py +14 -4
  185. mteb/models/model_implementations/qzhou_models.py +4 -2
  186. mteb/models/model_implementations/random_baseline.py +7 -6
  187. mteb/models/model_implementations/rasgaard_models.py +3 -2
  188. mteb/models/model_implementations/reasonir_model.py +66 -1
  189. mteb/models/model_implementations/repllama_models.py +18 -9
  190. mteb/models/model_implementations/rerankers_custom.py +25 -10
  191. mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
  192. mteb/models/model_implementations/richinfoai_models.py +2 -1
  193. mteb/models/model_implementations/ru_sentence_models.py +40 -20
  194. mteb/models/model_implementations/ruri_models.py +20 -10
  195. mteb/models/model_implementations/salesforce_models.py +13 -4
  196. mteb/models/model_implementations/samilpwc_models.py +2 -1
  197. mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
  198. mteb/models/model_implementations/searchmap_models.py +2 -1
  199. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  200. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
  201. mteb/models/model_implementations/seed_models.py +2 -1
  202. mteb/models/model_implementations/sentence_transformers_models.py +142 -22
  203. mteb/models/model_implementations/shuu_model.py +2 -1
  204. mteb/models/model_implementations/siglip_models.py +39 -24
  205. mteb/models/model_implementations/slm_models.py +419 -0
  206. mteb/models/model_implementations/sonar_models.py +2 -1
  207. mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
  208. mteb/models/model_implementations/stella_models.py +23 -4
  209. mteb/models/model_implementations/tarka_models.py +4 -2
  210. mteb/models/model_implementations/text2vec_models.py +12 -3
  211. mteb/models/model_implementations/ua_sentence_models.py +2 -1
  212. mteb/models/model_implementations/uae_models.py +17 -5
  213. mteb/models/model_implementations/vdr_models.py +9 -2
  214. mteb/models/model_implementations/vi_vn_models.py +12 -6
  215. mteb/models/model_implementations/vista_models.py +11 -4
  216. mteb/models/model_implementations/vlm2vec_models.py +14 -7
  217. mteb/models/model_implementations/voyage_models.py +136 -4
  218. mteb/models/model_implementations/voyage_v.py +17 -10
  219. mteb/models/model_implementations/xyz_models.py +1 -0
  220. mteb/models/model_implementations/youtu_models.py +2 -1
  221. mteb/models/model_implementations/yuan_models.py +2 -1
  222. mteb/models/model_implementations/yuan_models_en.py +3 -2
  223. mteb/models/model_meta.py +127 -40
  224. mteb/models/models_protocols.py +43 -22
  225. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  226. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
  227. mteb/models/search_wrappers.py +63 -29
  228. mteb/models/sentence_transformer_wrapper.py +52 -26
  229. mteb/models/vllm_wrapper.py +329 -0
  230. mteb/py.typed +0 -0
  231. mteb/results/benchmark_results.py +48 -35
  232. mteb/results/model_result.py +68 -32
  233. mteb/results/task_result.py +110 -72
  234. mteb/similarity_functions.py +19 -9
  235. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  236. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  237. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  238. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  239. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  240. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  241. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  242. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  243. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  244. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  245. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  246. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  247. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  248. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  249. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  251. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  252. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  253. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  256. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  257. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  258. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  259. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  260. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  261. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  262. mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
  263. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  264. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  265. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  266. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  267. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  268. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  269. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  270. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  271. mteb/tasks/classification/est/estonian_valence.py +2 -2
  272. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  273. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  274. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  275. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  276. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  277. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  278. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  279. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  280. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  281. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  282. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  283. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  284. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  285. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  286. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  287. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  288. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  289. mteb/tasks/classification/kor/klue_tc.py +2 -2
  290. mteb/tasks/classification/kor/kor_fin.py +1 -1
  291. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  292. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  293. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  294. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  295. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  296. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  297. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  298. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  299. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  300. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  301. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  302. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  303. mteb/tasks/classification/multilingual/scala_classification.py +2 -2
  304. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  305. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  306. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  307. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  308. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  309. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  310. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  311. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  312. mteb/tasks/classification/ron/moroco.py +1 -1
  313. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  314. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  315. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  316. mteb/tasks/classification/rus/headline_classification.py +2 -2
  317. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  318. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  319. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  320. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  321. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  322. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  323. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  324. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  325. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  326. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  327. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  328. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  329. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  330. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  331. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  332. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  333. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  334. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  335. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  336. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  337. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  338. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  341. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  342. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  343. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  344. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  345. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  346. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  347. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  348. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  349. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  350. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  351. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  352. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  353. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  354. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  355. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  356. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  357. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  358. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  359. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  360. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  361. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  362. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  363. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  364. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  365. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  366. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  367. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  368. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  369. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  370. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  371. mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
  372. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  373. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
  374. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  375. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  376. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  377. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  378. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  379. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  380. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  381. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  382. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  383. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  384. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  385. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  386. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  387. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  388. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  389. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  390. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  391. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  392. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  393. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  394. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  395. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  396. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  397. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  398. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  399. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  400. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  401. mteb/tasks/pair_classification/rus/terra.py +2 -2
  402. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  403. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  404. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  405. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  406. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  407. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  408. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  409. mteb/tasks/retrieval/code/code_rag.py +16 -16
  410. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  411. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  412. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  413. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  414. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  415. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  416. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  417. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
  418. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
  419. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
  420. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  421. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  422. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  423. mteb/tasks/retrieval/eng/__init__.py +44 -0
  424. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  425. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  426. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  428. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  435. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  438. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  439. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  440. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  441. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  442. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  443. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  445. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  446. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  447. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  448. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  449. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  450. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  451. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  452. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  453. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  454. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  455. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  456. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  457. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  458. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  459. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  460. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  461. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  462. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  463. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  464. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  465. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  466. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  467. mteb/tasks/retrieval/kor/__init__.py +15 -1
  468. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  469. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  470. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  471. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  472. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  473. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  474. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  475. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  476. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  477. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  478. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  479. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  480. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  481. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  482. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  483. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  484. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  485. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  486. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  487. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  488. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  489. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  490. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  491. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  492. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  493. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  494. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  495. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  496. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  497. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  498. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  499. mteb/tasks/retrieval/nob/norquad.py +3 -3
  500. mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
  501. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  502. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  503. mteb/tasks/retrieval/vie/__init__.py +14 -6
  504. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  505. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  506. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  507. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  508. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  509. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  510. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  511. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  512. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  513. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  514. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  515. mteb/tasks/sts/kor/klue_sts.py +1 -1
  516. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  517. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  518. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  519. mteb/types/__init__.py +2 -0
  520. mteb/types/_encoder_io.py +13 -1
  521. mteb/types/_result.py +2 -1
  522. mteb/types/statistics.py +18 -5
  523. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
  524. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
  525. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  526. mteb/models/model_implementations/mxbai_models.py +0 -111
  527. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  528. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  529. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,33 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
4
+ import warnings
2
5
  from pathlib import Path
6
+ from typing import TYPE_CHECKING
3
7
 
4
8
  from huggingface_hub import ModelCard, ModelCardData, repo_exists
5
9
 
6
- from mteb import BenchmarkResults
7
10
  from mteb.abstasks.abstask import AbsTask
8
11
  from mteb.cache import ResultCache
9
12
 
13
+ if TYPE_CHECKING:
14
+ from collections.abc import Sequence
15
+
16
+ from mteb.abstasks.abstask import AbsTask
17
+ from mteb.benchmarks.benchmark import Benchmark
18
+
10
19
  logger = logging.getLogger(__name__)
11
20
 
12
21
 
13
22
  def generate_model_card(
14
23
  model_name: str,
15
- tasks: list[AbsTask] | None = None,
24
+ tasks: Sequence[AbsTask] | None = None,
25
+ benchmarks: Sequence[Benchmark] | None = None,
16
26
  existing_model_card_id_or_path: str | Path | None = None,
17
27
  results_cache: ResultCache = ResultCache(),
18
28
  output_path: Path = Path("model_card.md"),
19
29
  add_table_to_model_card: bool = False,
20
- models_to_compare: list[str] | None = None,
30
+ models_to_compare: Sequence[str] | None = None,
21
31
  token: str | None = None,
22
32
  push_to_hub: bool = False,
23
33
  ) -> None:
@@ -26,6 +36,7 @@ def generate_model_card(
26
36
  Args:
27
37
  model_name: Name of the model.
28
38
  tasks: List of tasks to generate results for.
39
+ benchmarks: A Benchmark or list of benchmarks to generate results for.
29
40
  existing_model_card_id_or_path: Path or ID of an existing model card to update.
30
41
  results_cache: Instance of ResultCache to load results from.
31
42
  output_path: Path to save the generated model card.
@@ -39,16 +50,24 @@ def generate_model_card(
39
50
  if existing_model_card_id_or_path:
40
51
  existing_model_card = ModelCard.load(existing_model_card_id_or_path)
41
52
 
53
+ all_tasks: list[AbsTask] = []
54
+ if tasks is not None:
55
+ all_tasks.extend(tasks)
56
+
57
+ if benchmarks is not None:
58
+ for b in benchmarks:
59
+ all_tasks.extend(b.tasks)
60
+
42
61
  benchmark_results = results_cache.load_results(
43
- [model_name], tasks, only_main_score=True
62
+ [model_name], all_tasks if all_tasks else None, only_main_score=True
44
63
  )
45
64
  eval_results = []
46
65
  for models_results in benchmark_results.model_results:
47
66
  for task_result in models_results.task_results:
48
67
  eval_results.extend(task_result.get_hf_eval_results())
49
68
 
50
- existing_model_card_data = (
51
- existing_model_card.data if existing_model_card else ModelCardData()
69
+ existing_model_card_data: ModelCardData = (
70
+ existing_model_card.data if existing_model_card else ModelCardData() # type: ignore[assignment]
52
71
  )
53
72
 
54
73
  if existing_model_card_data.eval_results is None:
@@ -78,35 +97,43 @@ def generate_model_card(
78
97
  card_data=existing_model_card_data
79
98
  )
80
99
 
81
- if models_to_compare:
82
- benchmark_results = results_cache.load_results(
83
- [model_name, *models_to_compare], tasks, only_main_score=True
84
- )
85
-
86
100
  if add_table_to_model_card:
87
101
  existing_model_card = _add_table_to_model_card(
88
- benchmark_results, existing_model_card
102
+ results_cache,
103
+ existing_model_card,
104
+ (model_name, *models_to_compare) if models_to_compare else (model_name,),
105
+ benchmarks or [],
89
106
  )
90
107
 
91
- if push_to_hub:
108
+ if push_to_hub and existing_model_card_id_or_path:
109
+ existing_model_card_id_or_path = str(existing_model_card_id_or_path)
92
110
  if repo_exists(existing_model_card_id_or_path):
93
111
  existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
94
112
  else:
95
- logger.warning(
96
- f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
97
- )
113
+ msg = f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
114
+ logger.warning(msg)
115
+ warnings.warn(msg)
98
116
  existing_model_card.save(output_path)
99
117
 
100
118
 
101
119
  def _add_table_to_model_card(
102
- results: BenchmarkResults, model_card: ModelCard
120
+ results_cache: ResultCache,
121
+ model_card: ModelCard,
122
+ models: Sequence[str],
123
+ benchmarks: Sequence[Benchmark],
103
124
  ) -> ModelCard:
104
125
  original_content = model_card.content
105
- results_df = results.to_dataframe()
106
- results_df = results_df.set_index("task_name")
107
- mteb_content = f"""
108
- # MTEB results
109
- {results_df.to_markdown()}
110
- """
126
+ mteb_content = "# MTEB Results\n\n"
127
+
128
+ for benchmark in benchmarks:
129
+ mteb_content += f"## Benchmark: {benchmark.name}\n\n"
130
+ benchmark_results = results_cache.load_results(
131
+ tasks=benchmark,
132
+ models=models,
133
+ only_main_score=True,
134
+ )
135
+ df_results = benchmark_results.get_benchmark_result()
136
+ mteb_content += df_results.to_markdown(index=True) + "\n\n"
137
+
111
138
  model_card.content = original_content + "\n\n" + mteb_content
112
139
  return model_card
@@ -5,38 +5,43 @@ import logging
5
5
  import os
6
6
  import sys
7
7
  import traceback
8
- from collections.abc import Iterable
8
+ import warnings
9
9
  from copy import deepcopy
10
10
  from datetime import datetime
11
11
  from itertools import chain
12
12
  from pathlib import Path
13
13
  from time import time
14
- from typing import TYPE_CHECKING, Any
14
+ from typing import TYPE_CHECKING, Any, cast
15
15
 
16
16
  import datasets
17
17
 
18
18
  import mteb
19
19
  from mteb.abstasks import AbsTask
20
- from mteb.abstasks.task_metadata import TaskCategory, TaskType
21
20
  from mteb.benchmarks import Benchmark
22
21
  from mteb.models import (
23
22
  CrossEncoderWrapper,
24
- EncoderProtocol,
25
23
  ModelMeta,
26
- MTEBModels,
27
24
  SentenceTransformerEncoderWrapper,
28
25
  )
29
26
  from mteb.results import TaskResult
30
- from mteb.types import ScoresDict
27
+
28
+ if TYPE_CHECKING:
29
+ from collections.abc import Iterable, Sequence
30
+
31
+ from sentence_transformers import CrossEncoder, SentenceTransformer
32
+
33
+ from mteb.abstasks.aggregated_task import AbsTaskAggregate
34
+ from mteb.abstasks.task_metadata import TaskCategory, TaskType
35
+ from mteb.models import (
36
+ MTEBModels,
37
+ )
38
+ from mteb.types import EncodeKwargs, ScoresDict
31
39
 
32
40
  if sys.version_info >= (3, 13):
33
41
  from warnings import deprecated
34
42
  else:
35
43
  from typing_extensions import deprecated
36
44
 
37
- if TYPE_CHECKING:
38
- from sentence_transformers import CrossEncoder, SentenceTransformer
39
-
40
45
  logger = logging.getLogger(__name__)
41
46
 
42
47
 
@@ -52,7 +57,7 @@ class MTEB:
52
57
  )
53
58
  def __init__(
54
59
  self,
55
- tasks: Iterable[AbsTask | Benchmark],
60
+ tasks: Iterable[AbsTask] | Iterable[Benchmark],
56
61
  *,
57
62
  err_logs_path: str = "error_logs.txt",
58
63
  ) -> None:
@@ -63,15 +68,14 @@ class MTEB:
63
68
  `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)").
64
69
  err_logs_path: Path to save error logs.
65
70
  """
66
- from mteb.benchmarks import Benchmark
67
-
68
- self.tasks = list(tasks)
69
- if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
71
+ if isinstance(next(iter(tasks)), Benchmark):
70
72
  self.benchmarks = tasks
71
- self.tasks = list(chain.from_iterable(self.tasks))
73
+ self.tasks = list(chain.from_iterable(cast("Iterable[Benchmark]", tasks)))
74
+ elif isinstance(next(iter(tasks)), AbsTask):
75
+ self.tasks = list(cast("Iterable[AbsTask]", tasks))
72
76
 
73
77
  self.err_logs_path = Path(err_logs_path)
74
- self.last_evaluated_splits = {}
78
+ self._last_evaluated_splits: dict[str, list[str]] = {}
75
79
 
76
80
  @property
77
81
  def available_tasks(self) -> list[str]:
@@ -84,7 +88,7 @@ class MTEB:
84
88
  return sorted({x.metadata.type for x in self.tasks})
85
89
 
86
90
  @property
87
- def available_task_categories(self) -> set[TaskCategory]:
91
+ def available_task_categories(self) -> set[TaskCategory | None]:
88
92
  """Set of available task categories."""
89
93
  return {x.metadata.category for x in self.tasks}
90
94
 
@@ -174,7 +178,7 @@ class MTEB:
174
178
  split: str,
175
179
  subsets_to_run: list[str] | None = None,
176
180
  *,
177
- encode_kwargs: dict[str, Any],
181
+ encode_kwargs: EncodeKwargs,
178
182
  **kwargs: Any,
179
183
  ):
180
184
  tick = time()
@@ -231,13 +235,14 @@ class MTEB:
231
235
  merged_kg_co2_emissions = None
232
236
  if existing_kg_co2_emissions and new_kg_co2_emissions:
233
237
  merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions
238
+ existing_evaluation_time = existing_results.evaluation_time or 0
239
+ new_evaluation_time = new_results.evaluation_time or 0
234
240
  merged_results = TaskResult(
235
241
  dataset_revision=new_results.dataset_revision,
236
242
  task_name=new_results.task_name,
237
243
  mteb_version=new_results.mteb_version,
238
244
  scores=merged_scores,
239
- evaluation_time=existing_results.evaluation_time
240
- + new_results.evaluation_time,
245
+ evaluation_time=existing_evaluation_time + new_evaluation_time,
241
246
  kg_co2_emissions=merged_kg_co2_emissions,
242
247
  )
243
248
 
@@ -262,7 +267,7 @@ class MTEB:
262
267
  overwrite_results: bool = False,
263
268
  raise_error: bool = True,
264
269
  co2_tracker: bool = False,
265
- encode_kwargs: dict[str, Any] | None = None,
270
+ encode_kwargs: EncodeKwargs | None = None,
266
271
  **kwargs,
267
272
  ) -> list[TaskResult]:
268
273
  """Run the evaluation pipeline on the selected tasks.
@@ -306,13 +311,16 @@ class MTEB:
306
311
  elif verbosity == 3:
307
312
  datasets.logging.set_verbosity(logging.DEBUG)
308
313
 
309
- meta = self.create_model_meta(model)
310
- output_path = self._create_output_folder(meta, output_folder)
311
-
314
+ mteb_model: MTEBModels
312
315
  if isinstance(model, SentenceTransformer):
313
- model = SentenceTransformerEncoderWrapper(model)
316
+ mteb_model = SentenceTransformerEncoderWrapper(model)
314
317
  elif isinstance(model, CrossEncoder):
315
- model = CrossEncoderWrapper(model)
318
+ mteb_model = CrossEncoderWrapper(model)
319
+ else:
320
+ mteb_model = cast("MTEBModels", model)
321
+
322
+ meta = self.create_model_meta(mteb_model)
323
+ output_path = self._create_output_folder(meta, output_folder)
316
324
 
317
325
  # Disable co2_tracker for API models
318
326
  if "API" in meta.framework:
@@ -333,7 +341,7 @@ class MTEB:
333
341
  ) # save them in case we re-use the object (e.g. for reranking)
334
342
 
335
343
  # To evaluate missing splits, we keep track of the task name and the corresponding splits.
336
- self.last_evaluated_splits = {}
344
+ self._last_evaluated_splits = {}
337
345
 
338
346
  while len(self.tasks) > 0:
339
347
  task = self.tasks[0]
@@ -342,9 +350,10 @@ class MTEB:
342
350
  )
343
351
 
344
352
  if task.is_aggregate:
345
- self_ = MTEB(tasks=task.metadata.tasks)
346
- task_results = self_.run(
347
- model,
353
+ aggregated_task = cast("AbsTaskAggregate", task)
354
+ self_ = MTEB(tasks=aggregated_task.metadata.tasks)
355
+ aggregated_task_results = self_.run(
356
+ mteb_model,
348
357
  verbosity=verbosity - 1,
349
358
  output_folder=output_folder,
350
359
  eval_splits=eval_splits,
@@ -355,12 +364,15 @@ class MTEB:
355
364
  encode_kwargs=encode_kwargs,
356
365
  **kwargs,
357
366
  )
358
- new_results = task.combine_task_results(task_results)
367
+ new_results = aggregated_task.combine_task_results(
368
+ aggregated_task_results
369
+ )
359
370
  evaluation_results.append(new_results)
360
371
 
361
372
  if output_path:
362
- save_path = output_path / f"{task.metadata.name}.json"
363
- new_results.to_disk(save_path)
373
+ new_results.to_disk(
374
+ output_path / f"{aggregated_task.metadata.name}.json"
375
+ )
364
376
  del self.tasks[0]
365
377
  continue
366
378
 
@@ -382,7 +394,7 @@ class MTEB:
382
394
  task_subsets = task.hf_subsets
383
395
 
384
396
  existing_results = None
385
- save_path = None
397
+ save_path: Path | None = None
386
398
  final_splits_to_run = task_eval_splits
387
399
  missing_evaluations = self._get_missing_evaluations(
388
400
  existing_results,
@@ -432,7 +444,7 @@ class MTEB:
432
444
  logger.info(
433
445
  f"No splits to evaluate for {task.metadata.name}. Skipping evaluation."
434
446
  )
435
- self.last_evaluated_splits[task.metadata.name] = []
447
+ self._last_evaluated_splits[task.metadata.name] = []
436
448
  del self.tasks[0]
437
449
  continue
438
450
 
@@ -440,11 +452,11 @@ class MTEB:
440
452
  task.check_if_dataset_is_superseded()
441
453
  task.load_data()
442
454
 
443
- task_results = {}
455
+ task_results: dict[str, dict[str, dict[str, Any]]] = {}
444
456
  evaluation_time = 0
445
457
  kg_co2_emissions: int | None = 0 if co2_tracker else None
446
458
 
447
- self.last_evaluated_splits[task.metadata.name] = []
459
+ self._last_evaluated_splits[task.metadata.name] = []
448
460
 
449
461
  for split in final_splits_to_run:
450
462
  info = missing_evaluations[split]
@@ -465,14 +477,16 @@ class MTEB:
465
477
 
466
478
  if co2_tracker:
467
479
  try:
468
- from codecarbon import EmissionsTracker
480
+ from codecarbon import ( # type: ignore[import-not-found,import-untyped]
481
+ EmissionsTracker,
482
+ )
469
483
  except ImportError:
470
484
  raise ImportError(
471
485
  "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions."
472
486
  )
473
- logger.warning(
474
- "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
475
- )
487
+ msg = "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
488
+ logger.warning(msg)
489
+ warnings.warn(msg)
476
490
  with EmissionsTracker(
477
491
  save_to_file=False,
478
492
  save_to_api=False,
@@ -481,7 +495,7 @@ class MTEB:
481
495
  ) as tracker:
482
496
  results, tick, tock = self._run_eval(
483
497
  task,
484
- model,
498
+ mteb_model,
485
499
  split,
486
500
  encode_kwargs=encode_kwargs,
487
501
  subsets_to_run=subsets_to_run,
@@ -494,7 +508,7 @@ class MTEB:
494
508
  else:
495
509
  results, tick, tock = self._run_eval(
496
510
  task,
497
- model,
511
+ mteb_model,
498
512
  split,
499
513
  subsets_to_run=subsets_to_run,
500
514
  encode_kwargs=encode_kwargs,
@@ -510,25 +524,25 @@ class MTEB:
510
524
  if verbosity >= 1:
511
525
  logger.info(f"Scores: {task_results[split]}")
512
526
 
513
- self.last_evaluated_splits[task.metadata.name].append(split)
527
+ self._last_evaluated_splits[task.metadata.name].append(split)
514
528
 
515
529
  # Create new TaskResult
516
530
  new_results = TaskResult.from_task_results(
517
531
  task,
518
- task_results,
532
+ task_results, # type: ignore[arg-type]
519
533
  evaluation_time=evaluation_time,
520
534
  kg_co2_emissions=kg_co2_emissions,
521
535
  )
522
536
 
523
537
  # Merge with existing if needed
524
- if output_path and save_path.exists():
538
+ if output_path and save_path and save_path.exists():
525
539
  existing_results = TaskResult.from_disk(save_path)
526
540
  if existing_results:
527
541
  merged_results = self._merge_results(existing_results, new_results)
528
542
  else:
529
543
  merged_results = new_results
530
544
 
531
- if output_path:
545
+ if output_path and save_path:
532
546
  merged_results.to_disk(save_path)
533
547
 
534
548
  evaluation_results.append(merged_results)
@@ -555,7 +569,7 @@ class MTEB:
555
569
  def create_model_meta(model: MTEBModels) -> ModelMeta:
556
570
  """Create a ModelMeta object for the given model."""
557
571
  if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None:
558
- meta = model.mteb_model_meta # type: ignore
572
+ meta = model.mteb_model_meta
559
573
  else:
560
574
  meta = MTEB._get_model_meta(model)
561
575
 
@@ -581,7 +595,11 @@ class MTEB:
581
595
  if output_folder is None:
582
596
  return None
583
597
 
584
- model_revision: str = model_meta.revision # type: ignore
598
+ model_revision: str = (
599
+ model_meta.revision
600
+ if model_meta.revision is not None
601
+ else "no_revision_available"
602
+ )
585
603
  model_path_name = model_meta.model_name_as_path()
586
604
 
587
605
  output_path = Path(output_folder) / model_path_name / model_revision
@@ -603,15 +621,15 @@ class MTEB:
603
621
  Tasks with empty lists indicate that results already existed and no splits were evaluated.
604
622
  """
605
623
  return deepcopy(
606
- {task: list(splits) for task, splits in self.last_evaluated_splits.items()}
624
+ {task: list(splits) for task, splits in self._last_evaluated_splits.items()}
607
625
  )
608
626
 
609
627
  @staticmethod
610
628
  def _get_missing_evaluations(
611
629
  existing_results: TaskResult | None,
612
- task_eval_splits: list[str],
613
- task_eval_langs: list[str],
614
- eval_subsets: list[str] | None,
630
+ task_eval_splits: Sequence[str],
631
+ task_eval_langs: Sequence[str],
632
+ eval_subsets: Sequence[str] | None,
615
633
  ) -> dict[str, dict[str, Any]]:
616
634
  """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing."""
617
635
  missing_evaluations = {
@@ -660,7 +678,7 @@ class MTEB:
660
678
  return missing_evaluations
661
679
 
662
680
  @staticmethod
663
- def _get_model_meta(model: EncoderProtocol) -> ModelMeta:
681
+ def _get_model_meta(model: MTEBModels) -> ModelMeta:
664
682
  from sentence_transformers import CrossEncoder, SentenceTransformer
665
683
 
666
684
  if isinstance(model, CrossEncoder):
@@ -0,0 +1,32 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1299,
4
+ "number_of_characters": 9254,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 2245,
8
+ "average_image_width": 2370.324347826087,
9
+ "max_image_width": 3508,
10
+ "min_image_height": 2481,
11
+ "average_image_height": 3289.8060869565215,
12
+ "max_image_height": 3580,
13
+ "unique_images": 1132
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 9254,
17
+ "min_text_length": 15,
18
+ "average_text_length": 62.10738255033557,
19
+ "max_text_length": 108,
20
+ "unique_texts": 149
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 409,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 2.7449664429530203,
27
+ "max_relevant_docs_per_query": 7,
28
+ "unique_relevant_docs": 316
29
+ },
30
+ "top_ranked_statistics": null
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1640,
4
+ "number_of_characters": 8331,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 2313,
8
+ "average_image_width": 2347.5321597833445,
9
+ "max_image_width": 2481,
10
+ "min_image_height": 3138,
11
+ "average_image_height": 3214.301963439404,
12
+ "max_image_height": 3508,
13
+ "unique_images": 1442
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 8331,
17
+ "min_text_length": 23,
18
+ "average_text_length": 51.11042944785276,
19
+ "max_text_length": 110,
20
+ "unique_texts": 163
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 413,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 2.5337423312883436,
27
+ "max_relevant_docs_per_query": 6,
28
+ "unique_relevant_docs": 349
29
+ },
30
+ "top_ranked_statistics": null
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 2166,
4
+ "number_of_characters": 9764,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 2221,
8
+ "average_image_width": 2339.4957350727545,
9
+ "max_image_width": 2480,
10
+ "min_image_height": 3036,
11
+ "average_image_height": 3242.8138484696437,
12
+ "max_image_height": 3508,
13
+ "unique_images": 1974
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 9764,
17
+ "min_text_length": 22,
18
+ "average_text_length": 56.4393063583815,
19
+ "max_text_length": 103,
20
+ "unique_texts": 173
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 525,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 3.0346820809248554,
27
+ "max_relevant_docs_per_query": 7,
28
+ "unique_relevant_docs": 442
29
+ },
30
+ "top_ranked_statistics": null
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 2330,
4
+ "number_of_characters": 13131,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 1949,
8
+ "average_image_width": 2430.1152204836417,
9
+ "max_image_width": 3505,
10
+ "min_image_height": 2480,
11
+ "average_image_height": 3350.3921289710765,
12
+ "max_image_height": 3626,
13
+ "unique_images": 2096
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 13131,
17
+ "min_text_length": 21,
18
+ "average_text_length": 59.41628959276018,
19
+ "max_text_length": 112,
20
+ "unique_texts": 221
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 726,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 3.2850678733031673,
27
+ "max_relevant_docs_per_query": 7,
28
+ "unique_relevant_docs": 575
29
+ },
30
+ "top_ranked_statistics": null
31
+ }
32
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "standard": {
3
+ "num_samples": 188113,
4
+ "number_of_characters": 141769714,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 141734227,
7
+ "min_text_length": 58,
8
+ "average_text_length": 753.8974425803981,
9
+ "max_text_length": 7334,
10
+ "unique_texts": 176508
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 35487,
15
+ "min_text_length": 85,
16
+ "average_text_length": 319.7027027027027,
17
+ "max_text_length": 1167,
18
+ "unique_texts": 111
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 524,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 4.7207207207207205,
25
+ "max_relevant_docs_per_query": 8,
26
+ "unique_relevant_docs": 111
27
+ },
28
+ "top_ranked_statistics": {
29
+ "num_top_ranked": 20264921,
30
+ "min_top_ranked_per_query": 176954,
31
+ "average_top_ranked_per_query": 182566.85585585586,
32
+ "max_top_ranked_per_query": 186176
33
+ }
34
+ }
35
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "long": {
3
+ "num_samples": 627,
4
+ "number_of_characters": 19398082,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 19344209,
7
+ "min_text_length": 142,
8
+ "average_text_length": 36916.42938931298,
9
+ "max_text_length": 1324201,
10
+ "unique_texts": 498
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 53873,
15
+ "min_text_length": 89,
16
+ "average_text_length": 523.0388349514564,
17
+ "max_text_length": 2195,
18
+ "unique_texts": 103
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 134,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.3009708737864079,
25
+ "max_relevant_docs_per_query": 4,
26
+ "unique_relevant_docs": 134
27
+ },
28
+ "top_ranked_statistics": {
29
+ "num_top_ranked": 53972,
30
+ "min_top_ranked_per_query": 524,
31
+ "average_top_ranked_per_query": 524.0,
32
+ "max_top_ranked_per_query": 524
33
+ }
34
+ }
35
+ }