mteb-2.5.2-py3-none-any.whl → mteb-2.7.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (529):
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +78 -30
  3. mteb/_evaluators/any_sts_evaluator.py +13 -6
  4. mteb/_evaluators/clustering_evaluator.py +13 -5
  5. mteb/_evaluators/evaluator.py +12 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
  7. mteb/_evaluators/pair_classification_evaluator.py +17 -7
  8. mteb/_evaluators/retrieval_evaluator.py +23 -14
  9. mteb/_evaluators/retrieval_metrics.py +26 -19
  10. mteb/_evaluators/sklearn_evaluator.py +27 -17
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
  12. mteb/_evaluators/text/summarization_evaluator.py +31 -20
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +9 -3
  16. mteb/abstasks/_data_filter/task_pipelines.py +10 -2
  17. mteb/abstasks/_statistics_calculation.py +21 -11
  18. mteb/abstasks/_stratification.py +18 -18
  19. mteb/abstasks/abstask.py +78 -44
  20. mteb/abstasks/aggregate_task_metadata.py +21 -18
  21. mteb/abstasks/aggregated_task.py +23 -35
  22. mteb/abstasks/classification.py +39 -18
  23. mteb/abstasks/clustering.py +37 -20
  24. mteb/abstasks/clustering_legacy.py +30 -16
  25. mteb/abstasks/image/image_text_pair_classification.py +26 -9
  26. mteb/abstasks/multilabel_classification.py +33 -21
  27. mteb/abstasks/pair_classification.py +44 -19
  28. mteb/abstasks/regression.py +18 -10
  29. mteb/abstasks/retrieval.py +82 -52
  30. mteb/abstasks/retrieval_dataset_loaders.py +50 -39
  31. mteb/abstasks/sts.py +34 -15
  32. mteb/abstasks/task_metadata.py +44 -37
  33. mteb/abstasks/text/bitext_mining.py +57 -35
  34. mteb/abstasks/text/reranking.py +10 -8
  35. mteb/abstasks/text/summarization.py +26 -10
  36. mteb/abstasks/zeroshot_classification.py +27 -9
  37. mteb/benchmarks/_create_table.py +13 -7
  38. mteb/benchmarks/benchmark.py +15 -3
  39. mteb/benchmarks/benchmarks/__init__.py +6 -0
  40. mteb/benchmarks/benchmarks/benchmarks.py +153 -13
  41. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  42. mteb/benchmarks/get_benchmark.py +14 -55
  43. mteb/cache.py +189 -31
  44. mteb/cli/_display_tasks.py +10 -4
  45. mteb/cli/build_cli.py +112 -13
  46. mteb/cli/generate_model_card.py +50 -23
  47. mteb/deprecated_evaluator.py +72 -54
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  49. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  50. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  52. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  65. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  66. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  67. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  68. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  69. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  70. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  71. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  72. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  73. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  74. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  75. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  81. mteb/evaluate.py +71 -47
  82. mteb/filter_tasks.py +36 -32
  83. mteb/get_tasks.py +37 -33
  84. mteb/languages/language_scripts.py +11 -4
  85. mteb/leaderboard/app.py +172 -37
  86. mteb/leaderboard/table.py +7 -2
  87. mteb/load_results.py +20 -14
  88. mteb/models/abs_encoder.py +30 -16
  89. mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
  90. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
  91. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
  92. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  93. mteb/models/cache_wrappers/cache_wrapper.py +16 -11
  94. mteb/models/get_model_meta.py +53 -9
  95. mteb/models/instruct_wrapper.py +41 -13
  96. mteb/models/model_implementations/align_models.py +11 -5
  97. mteb/models/model_implementations/amazon_models.py +1 -0
  98. mteb/models/model_implementations/andersborges.py +6 -4
  99. mteb/models/model_implementations/ara_models.py +2 -1
  100. mteb/models/model_implementations/arctic_models.py +16 -8
  101. mteb/models/model_implementations/b1ade_models.py +2 -1
  102. mteb/models/model_implementations/bedrock_models.py +20 -6
  103. mteb/models/model_implementations/bge_models.py +85 -22
  104. mteb/models/model_implementations/bica_model.py +4 -3
  105. mteb/models/model_implementations/blip2_models.py +13 -6
  106. mteb/models/model_implementations/blip_models.py +33 -20
  107. mteb/models/model_implementations/bm25.py +27 -17
  108. mteb/models/model_implementations/bmretriever_models.py +16 -6
  109. mteb/models/model_implementations/cadet_models.py +2 -1
  110. mteb/models/model_implementations/cde_models.py +22 -9
  111. mteb/models/model_implementations/clip_models.py +18 -10
  112. mteb/models/model_implementations/clips_models.py +6 -3
  113. mteb/models/model_implementations/codefuse_models.py +10 -5
  114. mteb/models/model_implementations/codesage_models.py +6 -3
  115. mteb/models/model_implementations/cohere_models.py +19 -9
  116. mteb/models/model_implementations/cohere_v.py +16 -6
  117. mteb/models/model_implementations/colpali_models.py +10 -6
  118. mteb/models/model_implementations/colqwen_models.py +24 -38
  119. mteb/models/model_implementations/colsmol_models.py +5 -3
  120. mteb/models/model_implementations/conan_models.py +12 -5
  121. mteb/models/model_implementations/dino_models.py +70 -46
  122. mteb/models/model_implementations/e5_instruct.py +27 -4
  123. mteb/models/model_implementations/e5_models.py +18 -9
  124. mteb/models/model_implementations/e5_v.py +16 -10
  125. mteb/models/model_implementations/eagerworks_models.py +12 -5
  126. mteb/models/model_implementations/emillykkejensen_models.py +9 -6
  127. mteb/models/model_implementations/en_code_retriever.py +2 -1
  128. mteb/models/model_implementations/euler_models.py +3 -2
  129. mteb/models/model_implementations/evaclip_models.py +13 -4
  130. mteb/models/model_implementations/fa_models.py +18 -9
  131. mteb/models/model_implementations/facebookai.py +16 -2
  132. mteb/models/model_implementations/geogpt_models.py +2 -1
  133. mteb/models/model_implementations/gme_v_models.py +13 -8
  134. mteb/models/model_implementations/google_models.py +16 -5
  135. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
  136. mteb/models/model_implementations/gritlm_models.py +5 -2
  137. mteb/models/model_implementations/gte_models.py +34 -13
  138. mteb/models/model_implementations/hinvec_models.py +7 -2
  139. mteb/models/model_implementations/human.py +1 -0
  140. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  141. mteb/models/model_implementations/inf_models.py +4 -2
  142. mteb/models/model_implementations/jasper_models.py +16 -7
  143. mteb/models/model_implementations/jina_clip.py +58 -14
  144. mteb/models/model_implementations/jina_models.py +35 -16
  145. mteb/models/model_implementations/kalm_models.py +24 -12
  146. mteb/models/model_implementations/kblab.py +13 -6
  147. mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
  148. mteb/models/model_implementations/kfst.py +2 -1
  149. mteb/models/model_implementations/kowshik24_models.py +2 -1
  150. mteb/models/model_implementations/lens_models.py +2 -0
  151. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  152. mteb/models/model_implementations/linq_models.py +8 -2
  153. mteb/models/model_implementations/listconranker.py +11 -5
  154. mteb/models/model_implementations/llm2clip_models.py +18 -10
  155. mteb/models/model_implementations/llm2vec_models.py +28 -14
  156. mteb/models/model_implementations/mcinext_models.py +12 -3
  157. mteb/models/model_implementations/mdbr_models.py +19 -3
  158. mteb/models/model_implementations/misc_models.py +131 -68
  159. mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
  160. mteb/models/model_implementations/mme5_models.py +3 -2
  161. mteb/models/model_implementations/moco_models.py +15 -8
  162. mteb/models/model_implementations/mod_models.py +3 -2
  163. mteb/models/model_implementations/model2vec_models.py +37 -18
  164. mteb/models/model_implementations/moka_models.py +4 -1
  165. mteb/models/model_implementations/nbailab.py +6 -3
  166. mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
  167. mteb/models/model_implementations/nomic_models.py +47 -19
  168. mteb/models/model_implementations/nomic_models_vision.py +6 -4
  169. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
  170. mteb/models/model_implementations/nvidia_models.py +165 -22
  171. mteb/models/model_implementations/octen_models.py +64 -3
  172. mteb/models/model_implementations/openai_models.py +14 -4
  173. mteb/models/model_implementations/openclip_models.py +30 -17
  174. mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
  175. mteb/models/model_implementations/ops_moa_models.py +10 -3
  176. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
  177. mteb/models/model_implementations/pawan_models.py +2 -1
  178. mteb/models/model_implementations/piccolo_models.py +3 -1
  179. mteb/models/model_implementations/pixie_models.py +56 -0
  180. mteb/models/model_implementations/promptriever_models.py +20 -10
  181. mteb/models/model_implementations/pylate_models.py +41 -21
  182. mteb/models/model_implementations/qodo_models.py +4 -2
  183. mteb/models/model_implementations/qtack_models.py +2 -1
  184. mteb/models/model_implementations/qwen3_models.py +14 -4
  185. mteb/models/model_implementations/qzhou_models.py +4 -2
  186. mteb/models/model_implementations/random_baseline.py +7 -6
  187. mteb/models/model_implementations/rasgaard_models.py +3 -2
  188. mteb/models/model_implementations/reasonir_model.py +66 -1
  189. mteb/models/model_implementations/repllama_models.py +18 -9
  190. mteb/models/model_implementations/rerankers_custom.py +25 -10
  191. mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
  192. mteb/models/model_implementations/richinfoai_models.py +2 -1
  193. mteb/models/model_implementations/ru_sentence_models.py +40 -20
  194. mteb/models/model_implementations/ruri_models.py +20 -10
  195. mteb/models/model_implementations/salesforce_models.py +13 -4
  196. mteb/models/model_implementations/samilpwc_models.py +2 -1
  197. mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
  198. mteb/models/model_implementations/searchmap_models.py +2 -1
  199. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  200. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
  201. mteb/models/model_implementations/seed_models.py +2 -1
  202. mteb/models/model_implementations/sentence_transformers_models.py +142 -22
  203. mteb/models/model_implementations/shuu_model.py +2 -1
  204. mteb/models/model_implementations/siglip_models.py +39 -24
  205. mteb/models/model_implementations/slm_models.py +419 -0
  206. mteb/models/model_implementations/sonar_models.py +2 -1
  207. mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
  208. mteb/models/model_implementations/stella_models.py +23 -4
  209. mteb/models/model_implementations/tarka_models.py +4 -2
  210. mteb/models/model_implementations/text2vec_models.py +12 -3
  211. mteb/models/model_implementations/ua_sentence_models.py +2 -1
  212. mteb/models/model_implementations/uae_models.py +17 -5
  213. mteb/models/model_implementations/vdr_models.py +9 -2
  214. mteb/models/model_implementations/vi_vn_models.py +12 -6
  215. mteb/models/model_implementations/vista_models.py +11 -4
  216. mteb/models/model_implementations/vlm2vec_models.py +14 -7
  217. mteb/models/model_implementations/voyage_models.py +136 -4
  218. mteb/models/model_implementations/voyage_v.py +17 -10
  219. mteb/models/model_implementations/xyz_models.py +1 -0
  220. mteb/models/model_implementations/youtu_models.py +2 -1
  221. mteb/models/model_implementations/yuan_models.py +2 -1
  222. mteb/models/model_implementations/yuan_models_en.py +3 -2
  223. mteb/models/model_meta.py +127 -40
  224. mteb/models/models_protocols.py +43 -22
  225. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  226. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
  227. mteb/models/search_wrappers.py +63 -29
  228. mteb/models/sentence_transformer_wrapper.py +52 -26
  229. mteb/models/vllm_wrapper.py +329 -0
  230. mteb/py.typed +0 -0
  231. mteb/results/benchmark_results.py +48 -35
  232. mteb/results/model_result.py +68 -32
  233. mteb/results/task_result.py +110 -72
  234. mteb/similarity_functions.py +19 -9
  235. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  236. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  237. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  238. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  239. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  240. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  241. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  242. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  243. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  244. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  245. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  246. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  247. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  248. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  249. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  251. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  252. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  253. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  256. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  257. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  258. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  259. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  260. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  261. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  262. mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
  263. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  264. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  265. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  266. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  267. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  268. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  269. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  270. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  271. mteb/tasks/classification/est/estonian_valence.py +2 -2
  272. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  273. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  274. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  275. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  276. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  277. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  278. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  279. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  280. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  281. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  282. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  283. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  284. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  285. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  286. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  287. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  288. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  289. mteb/tasks/classification/kor/klue_tc.py +2 -2
  290. mteb/tasks/classification/kor/kor_fin.py +1 -1
  291. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  292. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  293. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  294. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  295. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  296. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  297. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  298. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  299. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  300. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  301. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  302. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  303. mteb/tasks/classification/multilingual/scala_classification.py +2 -2
  304. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  305. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  306. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  307. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  308. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  309. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  310. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  311. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  312. mteb/tasks/classification/ron/moroco.py +1 -1
  313. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  314. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  315. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  316. mteb/tasks/classification/rus/headline_classification.py +2 -2
  317. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  318. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  319. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  320. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  321. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  322. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  323. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  324. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  325. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  326. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  327. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  328. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  329. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  330. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  331. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  332. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  333. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  334. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  335. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  336. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  337. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  338. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  341. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  342. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  343. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  344. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  345. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  346. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  347. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  348. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  349. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  350. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  351. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  352. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  353. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  354. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  355. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  356. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  357. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  358. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  359. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  360. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  361. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  362. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  363. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  364. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  365. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  366. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  367. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  368. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  369. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  370. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  371. mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
  372. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  373. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
  374. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  375. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  376. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  377. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  378. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  379. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  380. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  381. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  382. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  383. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  384. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  385. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  386. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  387. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  388. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  389. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  390. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  391. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  392. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  393. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  394. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  395. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  396. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  397. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  398. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  399. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  400. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  401. mteb/tasks/pair_classification/rus/terra.py +2 -2
  402. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  403. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  404. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  405. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  406. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  407. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  408. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  409. mteb/tasks/retrieval/code/code_rag.py +16 -16
  410. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  411. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  412. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  413. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  414. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  415. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  416. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  417. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
  418. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
  419. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
  420. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  421. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  422. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  423. mteb/tasks/retrieval/eng/__init__.py +44 -0
  424. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  425. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  426. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  428. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  435. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  438. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  439. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  440. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  441. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  442. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  443. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  445. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  446. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  447. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  448. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  449. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  450. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  451. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  452. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  453. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  454. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  455. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  456. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  457. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  458. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  459. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  460. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  461. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  462. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  463. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  464. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  465. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  466. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  467. mteb/tasks/retrieval/kor/__init__.py +15 -1
  468. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  469. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  470. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  471. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  472. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  473. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  474. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  475. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  476. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  477. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  478. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  479. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  480. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  481. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  482. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  483. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  484. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  485. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  486. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  487. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  488. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  489. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  490. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  491. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  492. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  493. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  494. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  495. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  496. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  497. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  498. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  499. mteb/tasks/retrieval/nob/norquad.py +3 -3
  500. mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
  501. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  502. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  503. mteb/tasks/retrieval/vie/__init__.py +14 -6
  504. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  505. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  506. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  507. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  508. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  509. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  510. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  511. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  512. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  513. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  514. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  515. mteb/tasks/sts/kor/klue_sts.py +1 -1
  516. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  517. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  518. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  519. mteb/types/__init__.py +2 -0
  520. mteb/types/_encoder_io.py +13 -1
  521. mteb/types/_result.py +2 -1
  522. mteb/types/statistics.py +18 -5
  523. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
  524. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
  525. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  526. mteb/models/model_implementations/mxbai_models.py +0 -111
  527. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  528. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  529. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
mteb/filter_tasks.py CHANGED
@@ -1,19 +1,24 @@
1
1
  """This script contains functions that are used to get an overview of the MTEB benchmark."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import logging
4
- from collections.abc import Sequence
5
- from typing import overload
6
+ from typing import TYPE_CHECKING, overload
6
7
 
7
- from mteb.abstasks import (
8
- AbsTask,
9
- )
10
8
  from mteb.abstasks.aggregated_task import AbsTaskAggregate
11
- from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
12
9
  from mteb.languages import (
13
10
  ISO_TO_LANGUAGE,
14
11
  ISO_TO_SCRIPT,
15
12
  )
16
- from mteb.types import Modalities
13
+
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Iterable, Sequence
16
+
17
+ from mteb.abstasks import (
18
+ AbsTask,
19
+ )
20
+ from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
21
+ from mteb.types import Modalities
17
22
 
18
23
  logger = logging.getLogger(__name__)
19
24
 
@@ -34,14 +39,14 @@ def _check_is_valid_language(lang: str) -> None:
34
39
 
35
40
  @overload
36
41
  def filter_tasks(
37
- tasks: Sequence[AbsTask],
42
+ tasks: Iterable[AbsTask],
38
43
  *,
39
- languages: list[str] | None = None,
40
- script: list[str] | None = None,
41
- domains: list[TaskDomain] | None = None,
42
- task_types: list[TaskType] | None = None, # type: ignore
43
- categories: list[TaskCategory] | None = None,
44
- modalities: list[Modalities] | None = None,
44
+ languages: Sequence[str] | None = None,
45
+ script: Sequence[str] | None = None,
46
+ domains: Iterable[TaskDomain] | None = None,
47
+ task_types: Iterable[TaskType] | None = None,
48
+ categories: Iterable[TaskCategory] | None = None,
49
+ modalities: Iterable[Modalities] | None = None,
45
50
  exclusive_modality_filter: bool = False,
46
51
  exclude_superseded: bool = False,
47
52
  exclude_aggregate: bool = False,
@@ -51,14 +56,14 @@ def filter_tasks(
51
56
 
52
57
  @overload
53
58
  def filter_tasks(
54
- tasks: Sequence[type[AbsTask]],
59
+ tasks: Iterable[type[AbsTask]],
55
60
  *,
56
- languages: list[str] | None = None,
57
- script: list[str] | None = None,
58
- domains: list[TaskDomain] | None = None,
59
- task_types: list[TaskType] | None = None, # type: ignore
60
- categories: list[TaskCategory] | None = None,
61
- modalities: list[Modalities] | None = None,
61
+ languages: Sequence[str] | None = None,
62
+ script: Sequence[str] | None = None,
63
+ domains: Iterable[TaskDomain] | None = None,
64
+ task_types: Iterable[TaskType] | None = None,
65
+ categories: Iterable[TaskCategory] | None = None,
66
+ modalities: Iterable[Modalities] | None = None,
62
67
  exclusive_modality_filter: bool = False,
63
68
  exclude_superseded: bool = False,
64
69
  exclude_aggregate: bool = False,
@@ -67,14 +72,14 @@ def filter_tasks(
67
72
 
68
73
 
69
74
  def filter_tasks(
70
- tasks: Sequence[AbsTask] | Sequence[type[AbsTask]],
75
+ tasks: Iterable[AbsTask] | Iterable[type[AbsTask]],
71
76
  *,
72
- languages: list[str] | None = None,
73
- script: list[str] | None = None,
74
- domains: list[TaskDomain] | None = None,
75
- task_types: list[TaskType] | None = None, # type: ignore
76
- categories: list[TaskCategory] | None = None,
77
- modalities: list[Modalities] | None = None,
77
+ languages: Sequence[str] | None = None,
78
+ script: Sequence[str] | None = None,
79
+ domains: Iterable[TaskDomain] | None = None,
80
+ task_types: Iterable[TaskType] | None = None,
81
+ categories: Iterable[TaskCategory] | None = None,
82
+ modalities: Iterable[Modalities] | None = None,
78
83
  exclusive_modality_filter: bool = False,
79
84
  exclude_superseded: bool = False,
80
85
  exclude_aggregate: bool = False,
@@ -92,7 +97,6 @@ def filter_tasks(
92
97
  task_types: A string specifying the type of task e.g. "Classification" or "Retrieval". If None, all tasks are included.
93
98
  categories: A list of task categories these include "t2t" (text to text), "t2i" (text to image). See TaskMetadata for the full list.
94
99
  exclude_superseded: A boolean flag to exclude datasets which are superseded by another.
95
- eval_splits: A list of evaluation splits to include. If None, all splits are included.
96
100
  modalities: A list of modalities to include. If None, all modalities are included.
97
101
  exclusive_modality_filter: If True, only keep tasks where _all_ filter modalities are included in the
98
102
  task's modalities and ALL task modalities are in filter modalities (exact match).
@@ -113,12 +117,12 @@ def filter_tasks(
113
117
  """
114
118
  langs_to_keep = None
115
119
  if languages:
116
- [_check_is_valid_language(lang) for lang in languages]
120
+ [_check_is_valid_language(lang) for lang in languages] # type: ignore[func-returns-value]
117
121
  langs_to_keep = set(languages)
118
122
 
119
123
  script_to_keep = None
120
124
  if script:
121
- [_check_is_valid_script(s) for s in script]
125
+ [_check_is_valid_script(s) for s in script] # type: ignore[func-returns-value]
122
126
  script_to_keep = set(script)
123
127
 
124
128
  domains_to_keep = None
@@ -178,4 +182,4 @@ def filter_tasks(
178
182
 
179
183
  _tasks.append(t)
180
184
 
181
- return _tasks
185
+ return _tasks # type: ignore[return-value] # type checker cannot infer the overload return type
mteb/get_tasks.py CHANGED
@@ -1,19 +1,25 @@
1
1
  """This script contains functions that are used to get an overview of the MTEB benchmark."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import difflib
4
6
  import logging
7
+ import warnings
5
8
  from collections import Counter, defaultdict
6
- from collections.abc import Sequence
7
- from typing import Any
9
+ from typing import TYPE_CHECKING, Any
8
10
 
9
11
  import pandas as pd
10
12
 
11
13
  from mteb.abstasks import (
12
14
  AbsTask,
13
15
  )
14
- from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
15
16
  from mteb.filter_tasks import filter_tasks
16
- from mteb.types import Modalities
17
+
18
+ if TYPE_CHECKING:
19
+ from collections.abc import Iterable, Sequence
20
+
21
+ from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
22
+ from mteb.types import Modalities
17
23
 
18
24
  logger = logging.getLogger(__name__)
19
25
 
@@ -22,12 +28,11 @@ logger = logging.getLogger(__name__)
22
28
  def _gather_tasks() -> tuple[type[AbsTask], ...]:
23
29
  import mteb.tasks as tasks
24
30
 
25
- tasks = [
31
+ return tuple(
26
32
  t
27
33
  for t in tasks.__dict__.values()
28
34
  if isinstance(t, type) and issubclass(t, AbsTask)
29
- ]
30
- return tuple(tasks)
35
+ )
31
36
 
32
37
 
33
38
  def _create_name_to_task_mapping(
@@ -43,7 +48,7 @@ def _create_name_to_task_mapping(
43
48
  return metadata_names
44
49
 
45
50
 
46
- def _create_similar_tasks(tasks: Sequence[type[AbsTask]]) -> dict[str, list[str]]:
51
+ def _create_similar_tasks(tasks: Iterable[type[AbsTask]]) -> dict[str, list[str]]:
47
52
  """Create a dictionary of similar tasks.
48
53
 
49
54
  Returns:
@@ -194,9 +199,8 @@ class MTEBTasks(tuple[AbsTask]):
194
199
  string with a LaTeX table.
195
200
  """
196
201
  if include_citation_in_name and "name" in properties:
197
- properties += ["intext_citation"]
198
- df = self.to_dataframe(properties)
199
- df["name"] = df["name"] + " " + df["intext_citation"]
202
+ df = self.to_dataframe(tuple(properties) + ("intext_citation",))
203
+ df["name"] = df["name"] + " " + df["intext_citation"] # type: ignore[operator]
200
204
  df = df.drop(columns=["intext_citation"])
201
205
  else:
202
206
  df = self.to_dataframe(properties)
@@ -221,17 +225,17 @@ class MTEBTasks(tuple[AbsTask]):
221
225
 
222
226
 
223
227
  def get_tasks(
224
- tasks: list[str] | None = None,
228
+ tasks: Sequence[str] | None = None,
225
229
  *,
226
- languages: list[str] | None = None,
227
- script: list[str] | None = None,
228
- domains: list[TaskDomain] | None = None,
229
- task_types: list[TaskType] | None = None, # type: ignore
230
- categories: list[TaskCategory] | None = None,
230
+ languages: Sequence[str] | None = None,
231
+ script: Sequence[str] | None = None,
232
+ domains: Sequence[TaskDomain] | None = None,
233
+ task_types: Sequence[TaskType] | None = None,
234
+ categories: Sequence[TaskCategory] | None = None,
231
235
  exclude_superseded: bool = True,
232
- eval_splits: list[str] | None = None,
236
+ eval_splits: Sequence[str] | None = None,
233
237
  exclusive_language_filter: bool = False,
234
- modalities: list[Modalities] | None = None,
238
+ modalities: Sequence[Modalities] | None = None,
235
239
  exclusive_modality_filter: bool = False,
236
240
  exclude_aggregate: bool = False,
237
241
  exclude_private: bool = True,
@@ -287,7 +291,7 @@ def get_tasks(
287
291
  ]
288
292
  return MTEBTasks(_tasks)
289
293
 
290
- _tasks = filter_tasks(
294
+ tasks_: Sequence[type[AbsTask]] = filter_tasks(
291
295
  TASK_LIST,
292
296
  languages=languages,
293
297
  script=script,
@@ -300,12 +304,12 @@ def get_tasks(
300
304
  exclude_aggregate=exclude_aggregate,
301
305
  exclude_private=exclude_private,
302
306
  )
303
- _tasks = [
304
- cls().filter_languages(languages, script).filter_eval_splits(eval_splits)
305
- for cls in _tasks
306
- ]
307
-
308
- return MTEBTasks(_tasks)
307
+ return MTEBTasks(
308
+ [
309
+ cls().filter_languages(languages, script).filter_eval_splits(eval_splits)
310
+ for cls in tasks_
311
+ ]
312
+ )
309
313
 
310
314
 
311
315
  _TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"}
@@ -313,10 +317,10 @@ _TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"}
313
317
 
314
318
  def get_task(
315
319
  task_name: str,
316
- languages: list[str] | None = None,
317
- script: list[str] | None = None,
318
- eval_splits: list[str] | None = None,
319
- hf_subsets: list[str] | None = None,
320
+ languages: Sequence[str] | None = None,
321
+ script: Sequence[str] | None = None,
322
+ eval_splits: Sequence[str] | None = None,
323
+ hf_subsets: Sequence[str] | None = None,
320
324
  exclusive_language_filter: bool = False,
321
325
  ) -> AbsTask:
322
326
  """Get a task by name.
@@ -340,9 +344,9 @@ def get_task(
340
344
  """
341
345
  if task_name in _TASK_RENAMES:
342
346
  _task_name = _TASK_RENAMES[task_name]
343
- logger.warning(
344
- f"The task with the given name '{task_name}' has been renamed to '{_task_name}'. To prevent this warning use the new name."
345
- )
347
+ msg = f"The task with the given name '{task_name}' has been renamed to '{_task_name}'. To prevent this warning use the new name."
348
+ logger.warning(msg)
349
+ warnings.warn(msg)
346
350
 
347
351
  if task_name not in _TASKS_REGISTRY:
348
352
  close_matches = difflib.get_close_matches(task_name, _TASKS_REGISTRY.keys())
mteb/languages/language_scripts.py CHANGED
@@ -1,9 +1,14 @@
1
- from collections.abc import Iterable
1
+ from __future__ import annotations
2
+
2
3
  from dataclasses import dataclass
4
+ from typing import TYPE_CHECKING
5
+
6
+ from mteb.languages.check_language_code import check_language_code
3
7
 
4
- from typing_extensions import Self
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Iterable, Sequence
5
10
 
6
- from mteb.languages import check_language_code
11
+ from typing_extensions import Self
7
12
 
8
13
 
9
14
  @dataclass
@@ -25,7 +30,9 @@ class LanguageScripts:
25
30
 
26
31
  @classmethod
27
32
  def from_languages_and_scripts(
28
- cls, languages: list[str] | None = None, scripts: list[str] | None = None
33
+ cls,
34
+ languages: Sequence[str] | None = None,
35
+ scripts: Sequence[str] | None = None,
29
36
  ) -> Self:
30
37
  """Create a LanguageScripts object from lists of languages and scripts.
31
38
 
mteb/leaderboard/app.py CHANGED
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import itertools
2
4
  import json
3
5
  import logging
@@ -5,15 +7,14 @@ import tempfile
5
7
  import time
6
8
  import warnings
7
9
  from pathlib import Path
8
- from typing import Literal
10
+ from typing import TYPE_CHECKING, Literal, get_args
9
11
  from urllib.parse import urlencode
10
12
 
11
13
  import cachetools
12
14
  import gradio as gr
13
- import pandas as pd
15
+ import pandas as pd # noqa: TC002 # gradio tries to validate typehints
14
16
 
15
17
  import mteb
16
- from mteb import BenchmarkResults
17
18
  from mteb.benchmarks.benchmark import RtebBenchmark
18
19
  from mteb.cache import ResultCache
19
20
  from mteb.leaderboard.benchmark_selector import (
@@ -29,40 +30,118 @@ from mteb.leaderboard.table import (
29
30
  apply_summary_styling_from_benchmark,
30
31
  )
31
32
  from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
33
+ from mteb.models.model_meta import MODEL_TYPES
34
+
35
+ if TYPE_CHECKING:
36
+ from mteb import BenchmarkResults
32
37
 
33
38
  logger = logging.getLogger(__name__)
34
39
 
40
+
35
41
  LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.languages})
42
+ MODEL_TYPE_CHOICES = list(get_args(MODEL_TYPES))
36
43
 
37
44
 
38
45
  def _load_results(cache: ResultCache) -> BenchmarkResults:
46
+ """Load benchmark results using an optimized caching strategy.
47
+
48
+ This function implements a two-tier caching strategy for faster leaderboard startup:
49
+
50
+ 1. **Primary Strategy (Fast)**: Download pre-computed cached results from the
51
+ 'cached-data' branch as a compressed JSON file (~2MB vs ~200MB full repo).
52
+ This avoids the need to clone the entire results repository and provides
53
+ near-instantaneous loading for most users.
54
+
55
+ 2. **Fallback Strategy (Slower)**: If the cached download fails, fall back to
56
+ the original approach of downloading the full results repository and
57
+ building the cache from scratch.
58
+
59
+ The cached results file contains pre-aggregated benchmark data that eliminates
60
+ the need for expensive operations like task selection and revision joining
61
+ during app startup.
62
+
63
+ Args:
64
+ cache: ResultCache instance used for both optimized and fallback operations
65
+
66
+ Returns:
67
+ BenchmarkResults: Complete benchmark results ready for leaderboard display
68
+
69
+ Raises:
70
+ Various exceptions related to network issues, file I/O, or data validation
71
+ are logged and may cause fallback to the slower repository-based approach.
72
+ """
39
73
  start_time = time.time()
40
74
  results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
75
+
41
76
  if not results_cache_path.exists():
42
- logger.info("Cached results not found, downloading from remote...")
43
- cache.download_from_remote()
44
- download_time = time.time() - start_time
45
- logger.info(f"Downloaded remote results in {download_time:.2f}s")
46
-
47
- load_start = time.time()
48
- all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
49
-
50
- all_results = cache.load_results(
51
- models=all_model_names,
52
- only_main_score=True,
53
- require_model_meta=False,
54
- include_remote=True,
77
+ # First try to download the cached results file from the cached-data branch
78
+ # This is faster than cloning the entire results repository
79
+ logger.info(
80
+ "Cached results not found, trying to download from cached-data branch..."
55
81
  )
56
- load_time = time.time() - load_start
57
- logger.info(f"Loaded results from cache in {load_time:.2f}s")
58
- return all_results
59
- else:
60
- logger.info("Loading cached results from disk...")
82
+
83
+ try:
84
+ # Use ResultCache's optimized download method
85
+ # Default saves to mteb/leaderboard/__cached_results.json
86
+ results_cache_path = cache._download_cached_results_from_branch()
87
+ download_time = time.time() - start_time
88
+ logger.info(
89
+ f"Downloaded cached results from cached-data branch in {download_time:.2f}s"
90
+ )
91
+
92
+ except Exception as e:
93
+ logger.error(
94
+ f"Failed to download from cached-data branch: {type(e).__name__}: {e}"
95
+ )
96
+ logger.info("Falling back to downloading full remote repository...")
97
+
98
+ # Fall back to the original approach: clone the full repo
99
+ cache.download_from_remote()
100
+ download_time = time.time() - start_time
101
+ logger.info(f"Downloaded remote results in {download_time:.2f}s")
102
+
103
+ load_start = time.time()
104
+ all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
105
+
106
+ all_results = cache.load_results(
107
+ models=all_model_names,
108
+ only_main_score=True,
109
+ require_model_meta=False,
110
+ include_remote=True,
111
+ )
112
+ load_time = time.time() - load_start
113
+ logger.info(f"Loaded results from cache in {load_time:.2f}s")
114
+ return all_results
115
+
116
+ # Load the cached results file (either pre-existing or just downloaded)
117
+ logger.info("Loading cached results from disk...")
118
+ try:
119
+ logger.info(f"Opening file: {results_cache_path}")
120
+
121
+ file_size = results_cache_path.stat().st_size
122
+ logger.info(f"File exists, size: {file_size} bytes")
123
+
61
124
  with results_cache_path.open() as cache_file:
62
- results = mteb.BenchmarkResults.from_validated(**json.load(cache_file))
63
- total_time = time.time() - start_time
64
- logger.info(f"Loaded cached results in {total_time:.2f}s")
65
- return results
125
+ logger.info("File opened successfully, attempting JSON parse...")
126
+ json_data = json.load(cache_file)
127
+ logger.info(
128
+ f"JSON parsed successfully, keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'not a dict'}"
129
+ )
130
+
131
+ logger.info("Attempting BenchmarkResults.from_validated...")
132
+ results = mteb.BenchmarkResults.from_validated(**json_data)
133
+ logger.info("BenchmarkResults.from_validated successful")
134
+
135
+ except Exception as e:
136
+ # TODO: Handle the case when we fail to load cached results from disk.
137
+ logger.error(
138
+ f"Failed to load cached results from disk: {type(e).__name__}: {e}"
139
+ )
140
+ raise
141
+
142
+ total_time = time.time() - start_time
143
+ logger.info(f"Loaded cached results in {total_time:.2f}s")
144
+ return results
66
145
 
67
146
 
68
147
  def _produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str:
@@ -169,7 +248,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
169
248
  df = df.drop(columns="reference")
170
249
  return gr.DataFrame(
171
250
  df,
172
- datatype=["markdown"] + ["str"] * (len(df.columns) - 1), # type: ignore
251
+ datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
173
252
  buttons=["copy", "fullscreen"],
174
253
  show_search="filter",
175
254
  )
@@ -187,6 +266,7 @@ def _filter_models(
187
266
  instructions: bool | None,
188
267
  max_model_size: int,
189
268
  zero_shot_setting: Literal["only_zero_shot", "allow_all", "remove_unknown"],
269
+ model_types: list[str] | None,
190
270
  ):
191
271
  lower, upper = 0, max_model_size
192
272
  # Setting to None, when the user doesn't specify anything
@@ -205,6 +285,7 @@ def _filter_models(
205
285
  use_instructions=instructions,
206
286
  frameworks=compatibility,
207
287
  n_parameters_range=(lower, upper),
288
+ model_types=model_types,
208
289
  )
209
290
 
210
291
  models_to_keep = set()
@@ -269,6 +350,7 @@ def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
269
350
  instructions=None,
270
351
  max_model_size=MAX_MODEL_SIZE,
271
352
  zero_shot_setting="allow_all",
353
+ model_types=MODEL_TYPE_CHOICES,
272
354
  )
273
355
  # Sort to ensure consistency with update_models
274
356
  initial_models = sorted(initial_models)
@@ -387,6 +469,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
387
469
  instructions=None,
388
470
  max_model_size=MAX_MODEL_SIZE,
389
471
  zero_shot_setting="allow_all",
472
+ model_types=MODEL_TYPE_CHOICES,
390
473
  )
391
474
  default_filtered_scores = [
392
475
  entry for entry in default_scores if entry["model_name"] in filtered_models
@@ -467,7 +550,10 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
467
550
 
468
551
  logger.info("Step 7/7: Building Gradio interface and callbacks...")
469
552
  interface_start = time.time()
470
- with gr.Blocks(fill_width=True) as demo:
553
+ with gr.Blocks(
554
+ title="MTEB Leaderboard",
555
+ fill_width=True,
556
+ ) as demo:
471
557
  with gr.Sidebar(
472
558
  position="left",
473
559
  label="Benchmark Selection and Customization",
@@ -583,6 +669,12 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
583
669
  label="Model Parameters",
584
670
  interactive=True,
585
671
  )
672
+ with gr.Column():
673
+ model_type_select = gr.CheckboxGroup(
674
+ MODEL_TYPE_CHOICES,
675
+ value=MODEL_TYPE_CHOICES,
676
+ label="Model Type",
677
+ )
586
678
 
587
679
  with gr.Tab("Summary"):
588
680
  summary_table.render()
@@ -755,7 +847,8 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
755
847
  compatibility,
756
848
  instructions,
757
849
  max_model_size,
758
- zero_shot: hash(
850
+ zero_shot,
851
+ model_type_select: hash(
759
852
  (
760
853
  id(scores),
761
854
  hash(tuple(tasks)),
@@ -764,6 +857,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
764
857
  hash(instructions),
765
858
  hash(max_model_size),
766
859
  hash(zero_shot),
860
+ hash(tuple(model_type_select)),
767
861
  )
768
862
  ),
769
863
  )
@@ -775,6 +869,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
775
869
  instructions: bool | None,
776
870
  max_model_size: int,
777
871
  zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"],
872
+ model_type_select: list[str],
778
873
  ):
779
874
  start_time = time.time()
780
875
  model_names = list({entry["model_name"] for entry in scores})
@@ -786,6 +881,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
786
881
  instructions,
787
882
  max_model_size,
788
883
  zero_shot_setting=zero_shot,
884
+ model_types=model_type_select,
789
885
  )
790
886
  elapsed = time.time() - start_time
791
887
  logger.debug(f"update_models callback: {elapsed}s")
@@ -803,6 +899,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
803
899
  instructions,
804
900
  max_model_size,
805
901
  zero_shot,
902
+ model_type_select,
806
903
  ],
807
904
  outputs=[models],
808
905
  )
@@ -817,6 +914,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
817
914
  instructions,
818
915
  max_model_size,
819
916
  zero_shot,
917
+ model_type_select,
820
918
  ],
821
919
  outputs=[models],
822
920
  )
@@ -830,6 +928,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
830
928
  instructions,
831
929
  max_model_size,
832
930
  zero_shot,
931
+ model_type_select,
833
932
  ],
834
933
  outputs=[models],
835
934
  )
@@ -843,6 +942,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
843
942
  instructions,
844
943
  max_model_size,
845
944
  zero_shot,
945
+ model_type_select,
846
946
  ],
847
947
  outputs=[models],
848
948
  )
@@ -856,6 +956,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
856
956
  instructions,
857
957
  max_model_size,
858
958
  zero_shot,
959
+ model_type_select,
859
960
  ],
860
961
  outputs=[models],
861
962
  )
@@ -869,6 +970,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
869
970
  instructions,
870
971
  max_model_size,
871
972
  zero_shot,
973
+ model_type_select,
872
974
  ],
873
975
  outputs=[models],
874
976
  )
@@ -882,6 +984,21 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
882
984
  instructions,
883
985
  max_model_size,
884
986
  zero_shot,
987
+ model_type_select,
988
+ ],
989
+ outputs=[models],
990
+ )
991
+ model_type_select.change(
992
+ update_models,
993
+ inputs=[
994
+ scores,
995
+ task_select,
996
+ availability,
997
+ compatibility,
998
+ instructions,
999
+ max_model_size,
1000
+ zero_shot,
1001
+ model_type_select,
885
1002
  ],
886
1003
  outputs=[models],
887
1004
  )
@@ -1023,16 +1140,34 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
1023
1140
 
1024
1141
 
1025
1142
  if __name__ == "__main__":
1026
- logging.getLogger("mteb.load_results.task_results").setLevel(
1027
- logging.ERROR
1028
- ) # Warnings related to task split
1029
- logging.getLogger("mteb.model_meta").setLevel(
1030
- logging.ERROR
1031
- ) # Warning related to model metadata (fetch_from_hf=False)
1032
- logging.getLogger("mteb.load_results.benchmark_results").setLevel(
1033
- logging.ERROR
1034
- ) # Warning related to model metadata (fetch_from_hf=False)
1143
+ import os
1144
+
1145
+ # Add process ID to logging for multiprocessing debugging
1146
+ logging.basicConfig(
1147
+ level=logging.INFO,
1148
+ format="%(asctime)s - PID:%(process)d - %(name)s - %(levelname)s - %(message)s",
1149
+ force=True, # Override any existing handlers
1150
+ )
1151
+
1152
+ # Flush log handlers immediately (helpful for multiprocessing)
1153
+ for handler in logging.root.handlers:
1154
+ handler.flush()
1155
+
1156
+ logger.info(f"Starting leaderboard app in process {os.getpid()}")
1157
+
1158
+ # Suppress specific WARNING messages while keeping INFO level for the app
1159
+ logging.getLogger("mteb.results.task_result").setLevel(logging.ERROR)
1160
+ logging.getLogger("mteb.models.model_meta").setLevel(logging.ERROR)
1161
+ logging.getLogger("mteb.results.benchmark_results").setLevel(logging.ERROR)
1162
+
1035
1163
  warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
1164
+ warnings.filterwarnings("ignore", message="Could not get source model: .*")
1165
+ warnings.filterwarnings(
1166
+ "ignore", message="No scores data available. Returning empty DataFrame."
1167
+ )
1168
+ warnings.filterwarnings("ignore", message="Main score .* not found in scores")
1169
+ warnings.filterwarnings("ignore", message=".*: Missing subsets .* for split .*")
1170
+ warnings.filterwarnings("ignore", message=".*: Missing splits .*")
1036
1171
 
1037
1172
  app = get_leaderboard_app()
1038
1173