mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458)
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
mteb/leaderboard/app.py CHANGED
@@ -5,7 +5,7 @@ import tempfile
5
5
  import time
6
6
  import warnings
7
7
  from pathlib import Path
8
- from typing import Literal, get_args
8
+ from typing import Literal
9
9
  from urllib.parse import urlencode
10
10
 
11
11
  import cachetools
@@ -14,7 +14,6 @@ import pandas as pd
14
14
 
15
15
  import mteb
16
16
  from mteb import BenchmarkResults
17
- from mteb.abstasks.task_metadata import TaskDomain, TaskType
18
17
  from mteb.benchmarks.benchmark import RtebBenchmark
19
18
  from mteb.cache import ResultCache
20
19
  from mteb.leaderboard.benchmark_selector import (
@@ -25,11 +24,11 @@ from mteb.leaderboard.benchmark_selector import (
25
24
  )
26
25
  from mteb.leaderboard.figures import _performance_size_plot, _radar_chart
27
26
  from mteb.leaderboard.table import (
27
+ apply_per_language_styling_from_benchmark,
28
28
  apply_per_task_styling_from_benchmark,
29
29
  apply_summary_styling_from_benchmark,
30
30
  )
31
31
  from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
32
- from mteb.types import Modalities
33
32
 
34
33
  logger = logging.getLogger(__name__)
35
34
 
@@ -37,9 +36,15 @@ LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.lang
37
36
 
38
37
 
39
38
  def _load_results(cache: ResultCache) -> BenchmarkResults:
39
+ start_time = time.time()
40
40
  results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
41
41
  if not results_cache_path.exists():
42
+ logger.info("Cached results not found, downloading from remote...")
42
43
  cache.download_from_remote()
44
+ download_time = time.time() - start_time
45
+ logger.info(f"Downloaded remote results in {download_time:.2f}s")
46
+
47
+ load_start = time.time()
43
48
  all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
44
49
 
45
50
  all_results = cache.load_results(
@@ -48,10 +53,16 @@ def _load_results(cache: ResultCache) -> BenchmarkResults:
48
53
  require_model_meta=False,
49
54
  include_remote=True,
50
55
  )
56
+ load_time = time.time() - load_start
57
+ logger.info(f"Loaded results from cache in {load_time:.2f}s")
51
58
  return all_results
52
59
  else:
60
+ logger.info("Loading cached results from disk...")
53
61
  with results_cache_path.open() as cache_file:
54
- return mteb.BenchmarkResults.from_validated(**json.load(cache_file))
62
+ results = mteb.BenchmarkResults.from_validated(**json.load(cache_file))
63
+ total_time = time.time() - start_time
64
+ logger.info(f"Loaded cached results in {total_time:.2f}s")
65
+ return results
55
66
 
56
67
 
57
68
  def _produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str:
@@ -107,7 +118,9 @@ def _update_description(
107
118
  description += f" - **Number of task types**: {n_task_types}\n"
108
119
  description += f" - **Number of domains**: {n_domains}\n"
109
120
  if benchmark.reference is not None:
110
- description += f"\n[Click for More Info]({benchmark.reference})"
121
+ description += (
122
+ f'\n<a href="{benchmark.reference}" target="_blank">Click for More Info</a>'
123
+ )
111
124
 
112
125
  return description
113
126
 
@@ -137,7 +150,10 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
137
150
  df["languages"] = df["languages"].map(_format_list)
138
151
  df = df.sort_values("name")
139
152
  df["domains"] = df["domains"].map(_format_list)
140
- df["name"] = "[" + df["name"] + "](" + df["reference"] + ")"
153
+ df["name"] = df.apply(
154
+ lambda row: f'<a href="{row["reference"]}" target="_blank">{row["name"]}</a>',
155
+ axis=1,
156
+ )
141
157
  df["modalities"] = df["modalities"].map(_format_list)
142
158
  df = df.rename(
143
159
  columns={
@@ -153,9 +169,8 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
153
169
  df = df.drop(columns="reference")
154
170
  return gr.DataFrame(
155
171
  df,
156
- datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
157
- show_copy_button=True,
158
- show_fullscreen_button=True,
172
+ datatype=["markdown"] + ["str"] * (len(df.columns) - 1), # type: ignore
173
+ buttons=["copy", "fullscreen"],
159
174
  show_search="filter",
160
175
  )
161
176
 
@@ -213,21 +228,154 @@ def _should_show_zero_shot_filter(benchmark_name: str) -> bool:
213
228
  return True
214
229
 
215
230
 
231
+ @cachetools.cached(
232
+ cache={},
233
+ key=lambda benchmark_name, all_benchmark_results: hash(benchmark_name),
234
+ )
235
+ def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
236
+ start_time = time.time()
237
+ benchmark = mteb.get_benchmark(benchmark_name)
238
+ languages = [task.languages for task in benchmark.tasks if task.languages]
239
+ languages = set(itertools.chain.from_iterable(languages))
240
+ languages = sorted(languages)
241
+ domains = [
242
+ task.metadata.domains for task in benchmark.tasks if task.metadata.domains
243
+ ]
244
+ domains = set(itertools.chain.from_iterable(domains))
245
+ types = {task.metadata.type for task in benchmark.tasks if task.metadata.type}
246
+ modalities = set()
247
+ for task in benchmark.tasks:
248
+ modalities.update(task.metadata.modalities)
249
+ languages, domains, types, modalities = (
250
+ sorted(languages),
251
+ sorted(domains),
252
+ sorted(types),
253
+ sorted(modalities),
254
+ )
255
+ elapsed = time.time() - start_time
256
+ benchmark_results = all_benchmark_results[benchmark_name]
257
+ scores = benchmark_results._get_scores(format="long")
258
+ logger.debug(f"on_benchmark_select callback: {elapsed}s")
259
+ show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
260
+
261
+ # Calculate initial models for this benchmark to avoid race conditions
262
+ benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
263
+ all_models_in_scores = list({entry["model_name"] for entry in scores})
264
+ initial_models = _filter_models(
265
+ all_models_in_scores,
266
+ benchmark_tasks,
267
+ availability=None,
268
+ compatibility=[],
269
+ instructions=None,
270
+ max_model_size=MAX_MODEL_SIZE,
271
+ zero_shot_setting="allow_all",
272
+ )
273
+ # Sort to ensure consistency with update_models
274
+ initial_models = sorted(initial_models)
275
+
276
+ return (
277
+ languages,
278
+ domains,
279
+ types,
280
+ modalities,
281
+ benchmark_tasks,
282
+ scores,
283
+ show_zero_shot,
284
+ initial_models,
285
+ )
286
+
287
+
288
+ @cachetools.cached(
289
+ cache={},
290
+ key=lambda benchmark_name,
291
+ type_select,
292
+ domain_select,
293
+ lang_select,
294
+ modality_select: hash(
295
+ (
296
+ hash(benchmark_name),
297
+ hash(tuple(type_select)),
298
+ hash(tuple(domain_select)),
299
+ hash(tuple(lang_select)),
300
+ hash(tuple(modality_select)),
301
+ )
302
+ ),
303
+ )
304
+ def _cache_update_task_list(
305
+ benchmark_name, type_select, domain_select, lang_select, modality_select
306
+ ):
307
+ if not len(lang_select):
308
+ return []
309
+ start_time = time.time()
310
+ benchmark_tasks = []
311
+ tasks_to_keep = []
312
+ for task in mteb.get_benchmark(benchmark_name).tasks:
313
+ benchmark_tasks.append(task.metadata.name)
314
+ if task.metadata.type not in type_select:
315
+ continue
316
+ if task.metadata.domains and not (
317
+ set(task.metadata.domains) & set(domain_select)
318
+ ):
319
+ continue
320
+ if task.languages and not (set(task.languages) & set(lang_select)):
321
+ continue
322
+ if task.metadata.modalities and not (
323
+ set(task.metadata.modalities) & set(modality_select)
324
+ ):
325
+ continue
326
+ tasks_to_keep.append(task.metadata.name)
327
+ benchmark_tasks.sort()
328
+ tasks_to_keep.sort()
329
+ elapsed = time.time() - start_time
330
+ logger.debug(f"update_task_list callback: {elapsed}s")
331
+
332
+ return benchmark_tasks, tasks_to_keep
333
+
334
+
216
335
  def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
217
336
  """Returns a Gradio Blocks app for the MTEB leaderboard."""
218
- logger.info("Loading all benchmark results")
337
+ app_start = time.time()
338
+ logger.info("=== Starting leaderboard app initialization ===")
339
+
340
+ logger.info("Step 1/7: Loading all benchmark results...")
341
+ load_start = time.time()
219
342
  all_results = _load_results(cache)
343
+ load_time = time.time() - load_start
344
+ logger.info(f"Step 1/7 complete: Loaded results in {load_time:.2f}s")
220
345
 
346
+ logger.info("Step 2/7: Fetching benchmarks...")
347
+ bench_start = time.time()
221
348
  benchmarks = sorted(
222
349
  mteb.get_benchmarks(display_on_leaderboard=True), key=lambda x: x.name
223
350
  )
351
+ bench_time = time.time() - bench_start
352
+ logger.info(
353
+ f"Step 2/7 complete: Fetched {len(benchmarks)} benchmarks in {bench_time:.2f}s"
354
+ )
355
+
356
+ logger.info(
357
+ "Step 3/7: Processing all benchmarks (select_tasks + join_revisions)..."
358
+ )
359
+ process_start = time.time()
224
360
  all_benchmark_results = {
225
361
  benchmark.name: all_results.select_tasks(benchmark.tasks).join_revisions()
226
362
  for benchmark in benchmarks
227
363
  }
364
+ process_time = time.time() - process_start
365
+ if len(benchmarks) > 0:
366
+ logger.info(
367
+ f"Step 3/7 complete: Processed {len(benchmarks)} benchmarks in {process_time:.2f}s (avg {process_time / len(benchmarks):.2f}s/benchmark)"
368
+ )
369
+ else:
370
+ logger.info(
371
+ f"Step 3/7 complete: Processed 0 benchmarks in {process_time:.2f}s (avg N/A)"
372
+ )
373
+
228
374
  default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME)
229
375
  default_results = all_benchmark_results[default_benchmark.name]
230
- logger.info("Benchmark results loaded")
376
+
377
+ logger.info("Step 4/7: Filtering models...")
378
+ filter_start = time.time()
231
379
 
232
380
  default_scores = default_results._get_scores(format="long")
233
381
  all_models = list({entry["model_name"] for entry in default_scores})
@@ -247,63 +395,79 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
247
395
  # Filter BenchmarkResults based on default filtered models (as required by Kenneth)
248
396
  filtered_model_names = [entry["model_name"] for entry in default_filtered_scores]
249
397
  filtered_benchmark_results = default_results.select_models(filtered_model_names)
398
+ filter_time = time.time() - filter_start
399
+ logger.info(
400
+ f"Step 4/7 complete: Filtered {len(filtered_model_names)} models in {filter_time:.2f}s"
401
+ )
250
402
 
403
+ logger.info("Step 5/7: Generating tables...")
404
+ table_start = time.time()
251
405
  summary_table = apply_summary_styling_from_benchmark(
252
406
  default_benchmark, filtered_benchmark_results
253
407
  )
254
408
  per_task_table = apply_per_task_styling_from_benchmark(
255
409
  default_benchmark, filtered_benchmark_results
256
410
  )
411
+ per_language_table = apply_per_language_styling_from_benchmark(
412
+ default_benchmark,
413
+ filtered_benchmark_results,
414
+ )
415
+ table_time = time.time() - table_start
416
+ logger.info(f"Step 5/7 complete: Generated tables in {table_time:.2f}s")
257
417
 
258
- lang_select = gr.Dropdown(
259
- LANGUAGE,
418
+ # Check if this benchmark displays per-language results
419
+ display_language_table = len(default_benchmark.language_view) > 0
420
+
421
+ logger.info("Step 6/7: Creating Gradio components...")
422
+ component_start = time.time()
423
+ lang_select = gr.CheckboxGroup(
424
+ sorted(default_results.languages),
260
425
  value=sorted(default_results.languages),
261
- allow_custom_value=True,
262
- multiselect=True,
426
+ show_label=True,
427
+ show_select_all=True,
263
428
  label="Language",
264
429
  info="Select languages to include.",
265
430
  )
266
- type_select = gr.Dropdown(
267
- sorted(get_args(TaskType)),
431
+ type_select = gr.CheckboxGroup(
432
+ sorted(default_results.task_types),
268
433
  value=sorted(default_results.task_types),
269
- multiselect=True,
434
+ show_label=True,
435
+ show_select_all=True,
270
436
  label="Task Type",
271
437
  info="Select task types to include.",
272
438
  )
273
- domain_select = gr.Dropdown(
274
- sorted(get_args(TaskDomain)),
439
+ domain_select = gr.CheckboxGroup(
440
+ sorted(default_results.domains),
275
441
  value=sorted(default_results.domains),
276
- multiselect=True,
442
+ show_label=True,
443
+ show_select_all=True,
277
444
  label="Domain",
278
445
  info="Select domains to include.",
279
446
  )
280
- task_select = gr.Dropdown(
281
- sorted(all_results.task_names),
447
+ task_select = gr.CheckboxGroup(
448
+ sorted(default_results.task_names),
282
449
  value=sorted(default_results.task_names),
283
- allow_custom_value=True,
284
- multiselect=True,
450
+ show_label=True,
451
+ show_select_all=True,
285
452
  label="Task",
286
453
  info="Select specific tasks to include",
287
454
  )
288
- modality_select = gr.Dropdown(
289
- sorted(get_args(Modalities)),
455
+ modality_select = gr.CheckboxGroup(
456
+ sorted(default_results.modalities),
290
457
  value=sorted(default_results.modalities),
291
- multiselect=True,
458
+ show_label=True,
459
+ show_select_all=True,
292
460
  label="Modality",
293
461
  info="Select modalities to include.",
294
462
  )
463
+ component_time = time.time() - component_start
464
+ logger.info(
465
+ f"Step 6/7 complete: Created Gradio components in {component_time:.2f}s"
466
+ )
295
467
 
296
- head = """
297
- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
298
- """
299
-
300
- with gr.Blocks(
301
- fill_width=True,
302
- theme=gr.themes.Soft(
303
- font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
304
- ),
305
- head=head,
306
- ) as demo:
468
+ logger.info("Step 7/7: Building Gradio interface and callbacks...")
469
+ interface_start = time.time()
470
+ with gr.Blocks(fill_width=True) as demo:
307
471
  with gr.Sidebar(
308
472
  position="left",
309
473
  label="Benchmark Selection and Customization",
@@ -435,9 +599,6 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
435
599
 
436
600
  with gr.Tab("Performance per Model Size") as plot_tab:
437
601
  plot = gr.Plot(_performance_size_plot, inputs=[summary_table])
438
- gr.Markdown(
439
- "*We only display TOP 5 models that have been run on all tasks in the benchmark*"
440
- )
441
602
  plot_tab.select(
442
603
  _performance_size_plot, inputs=[summary_table], outputs=[plot]
443
604
  )
@@ -457,68 +618,41 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
457
618
  download_per_task.click(
458
619
  _download_table, inputs=[per_task_table], outputs=[download_per_task]
459
620
  )
621
+ with gr.Tab(
622
+ "Performance per language", visible=display_language_table
623
+ ) as language_tab:
624
+ per_language_table.render()
625
+ download_per_language = gr.DownloadButton("Download Table")
626
+ download_per_language.click(
627
+ _download_table,
628
+ inputs=[per_language_table],
629
+ outputs=[download_per_language],
630
+ )
460
631
  with gr.Tab("Task information"):
461
632
  task_info_table = gr.DataFrame(_update_task_info, inputs=[task_select]) # noqa: F841
462
633
 
463
634
  # This sets the benchmark from the URL query parameters
464
635
  demo.load(_set_benchmark_on_load, inputs=[], outputs=[benchmark_select])
465
636
 
466
- @cachetools.cached(
467
- cache={},
468
- key=lambda benchmark_name: hash(benchmark_name),
469
- )
470
637
  def on_benchmark_select(benchmark_name):
471
- start_time = time.time()
472
- benchmark = mteb.get_benchmark(benchmark_name)
473
- languages = [task.languages for task in benchmark.tasks if task.languages]
474
- languages = set(itertools.chain.from_iterable(languages))
475
- languages = sorted(languages)
476
- domains = [
477
- task.metadata.domains
478
- for task in benchmark.tasks
479
- if task.metadata.domains
480
- ]
481
- domains = set(itertools.chain.from_iterable(domains))
482
- types = {
483
- task.metadata.type for task in benchmark.tasks if task.metadata.type
484
- }
485
- modalities = set()
486
- for task in benchmark.tasks:
487
- modalities.update(task.metadata.modalities)
488
- languages, domains, types, modalities = (
489
- sorted(languages),
490
- sorted(domains),
491
- sorted(types),
492
- sorted(modalities),
493
- )
494
- elapsed = time.time() - start_time
495
- benchmark_results = all_benchmark_results[benchmark_name]
496
- scores = benchmark_results._get_scores(format="long")
497
- logger.debug(f"on_benchmark_select callback: {elapsed}s")
498
- show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
499
-
500
- # Calculate initial models for this benchmark to avoid race conditions
501
- benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
502
- all_models_in_scores = list({entry["model_name"] for entry in scores})
503
- initial_models = _filter_models(
504
- all_models_in_scores,
505
- benchmark_tasks,
506
- availability=None,
507
- compatibility=[],
508
- instructions=None,
509
- max_model_size=MAX_MODEL_SIZE,
510
- zero_shot_setting="allow_all",
511
- )
512
- # Sort to ensure consistency with update_models
513
- initial_models = sorted(initial_models)
514
-
515
- return (
638
+ (
516
639
  languages,
517
640
  domains,
518
641
  types,
519
642
  modalities,
520
643
  benchmark_tasks,
521
644
  scores,
645
+ show_zero_shot,
646
+ initial_models,
647
+ ) = _cache_on_benchmark_select(benchmark_name, all_benchmark_results)
648
+
649
+ return (
650
+ gr.update(choices=languages, value=languages),
651
+ gr.update(choices=domains, value=domains),
652
+ gr.update(choices=types, value=types),
653
+ gr.update(choices=modalities, value=modalities),
654
+ gr.update(choices=benchmark_tasks, value=benchmark_tasks),
655
+ scores,
522
656
  gr.update(visible=show_zero_shot),
523
657
  initial_models,
524
658
  )
@@ -560,48 +694,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
560
694
  outputs=[scores],
561
695
  )
562
696
 
563
- @cachetools.cached(
564
- cache={},
565
- key=lambda benchmark_name,
566
- type_select,
567
- domain_select,
568
- lang_select,
569
- modality_select: hash(
570
- (
571
- hash(benchmark_name),
572
- hash(tuple(type_select)),
573
- hash(tuple(domain_select)),
574
- hash(tuple(lang_select)),
575
- hash(tuple(modality_select)),
576
- )
577
- ),
578
- )
579
697
  def update_task_list(
580
698
  benchmark_name, type_select, domain_select, lang_select, modality_select
581
699
  ):
582
- if not len(lang_select):
583
- return []
584
- start_time = time.time()
585
- tasks_to_keep = []
586
- for task in mteb.get_benchmark(benchmark_name).tasks:
587
- if task.metadata.type not in type_select:
588
- continue
589
- if task.metadata.domains is not None and not (
590
- set(task.metadata.domains) & set(domain_select)
591
- ):
592
- continue
593
- if task.languages is not None and not (
594
- set(task.languages) & set(lang_select)
595
- ):
596
- continue
597
- if task.metadata.modalities and not (
598
- set(task.metadata.modalities) & set(modality_select)
599
- ):
600
- continue
601
- tasks_to_keep.append(task.metadata.name)
602
- elapsed = time.time() - start_time
603
- logger.debug(f"update_task_list callback: {elapsed}s")
604
- return sorted(tasks_to_keep)
700
+ benchmark_tasks, tasks_to_keep = _cache_update_task_list(
701
+ benchmark_name, type_select, domain_select, lang_select, modality_select
702
+ )
703
+ return gr.update(choices=benchmark_tasks, value=tasks_to_keep)
605
704
 
606
705
  type_select.input(
607
706
  update_task_list,
@@ -854,9 +953,18 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
854
953
  per_task = apply_per_task_styling_from_benchmark(
855
954
  benchmark, filtered_benchmark_results
856
955
  )
956
+ per_language = apply_per_language_styling_from_benchmark(
957
+ benchmark,
958
+ filtered_benchmark_results,
959
+ )
857
960
  elapsed = time.time() - start_time
858
961
  logger.debug(f"update_tables callback: {elapsed}s")
859
- return summary, per_task
962
+ return (
963
+ summary,
964
+ per_task,
965
+ per_language,
966
+ gr.update(visible=len(benchmark.language_view) > 0),
967
+ )
860
968
 
861
969
  # Only update tables when models change, not when scores/tasks change directly
862
970
  # This avoids redundant updates since scores/tasks changes trigger update_models
@@ -865,11 +973,20 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
865
973
  item.change(
866
974
  update_tables,
867
975
  inputs=[scores, task_select, models, benchmark_select],
868
- outputs=[summary_table, per_task_table],
976
+ outputs=[
977
+ summary_table,
978
+ per_task_table,
979
+ per_language_table,
980
+ language_tab,
981
+ ],
869
982
  )
870
983
 
871
984
  gr.Markdown(ACKNOWLEDGEMENT, elem_id="ack_markdown")
985
+ interface_time = time.time() - interface_start
986
+ logger.info(f"Step 7/7 complete: Built Gradio interface in {interface_time:.2f}s")
872
987
 
988
+ logger.info("Starting prerun on all benchmarks to populate caches...")
989
+ prerun_start = time.time()
873
990
  # Prerun on all benchmarks, so that results of callbacks get cached
874
991
  for benchmark in benchmarks:
875
992
  (
@@ -895,6 +1012,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
895
1012
  update_tables(
896
1013
  bench_scores, filtered_tasks, bench_initial_models, benchmark.name
897
1014
  )
1015
+ prerun_time = time.time() - prerun_start
1016
+ logger.info(
1017
+ f"Prerun complete: Processed {len(benchmarks)} benchmarks in {prerun_time:.2f}s"
1018
+ )
1019
+
1020
+ total_time = time.time() - app_start
1021
+ logger.info(f"=== Leaderboard app initialization complete in {total_time:.2f}s ===")
898
1022
  return demo
899
1023
 
900
1024
 
@@ -911,4 +1035,15 @@ if __name__ == "__main__":
911
1035
  warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
912
1036
 
913
1037
  app = get_leaderboard_app()
914
- app.launch(server_name="0.0.0.0", server_port=7860)
1038
+
1039
+ head = """
1040
+ <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
1041
+ """
1042
+ app.launch(
1043
+ server_name="0.0.0.0",
1044
+ server_port=7860,
1045
+ theme=gr.themes.Soft(
1046
+ font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
1047
+ ),
1048
+ head=head,
1049
+ )
@@ -71,18 +71,26 @@ GP_BENCHMARK_ENTRIES = [
71
71
  "MTEB(cmn, v1)",
72
72
  "MTEB(deu, v1)",
73
73
  "MTEB(fra, v1)",
74
- "MTEB(jpn, v1)",
74
+ "JMTEB(v2)",
75
75
  "MTEB(kor, v1)",
76
76
  "MTEB(nld, v1)",
77
77
  "MTEB(pol, v1)",
78
- "MTEB(rus, v1)",
78
+ "MTEB(rus, v1.1)",
79
79
  "MTEB(fas, v2)",
80
80
  "VN-MTEB (vie, v1)",
81
81
  ]
82
82
  )
83
83
  + [
84
84
  MenuEntry(
85
- "Other", mteb.get_benchmarks(["MTEB(eng, v1)", "MTEB(fas, v1)"])
85
+ "Other",
86
+ mteb.get_benchmarks(
87
+ [
88
+ "MTEB(eng, v1)",
89
+ "MTEB(fas, v1)",
90
+ "MTEB(rus, v1)",
91
+ "MTEB(jpn, v1)",
92
+ ]
93
+ ),
86
94
  )
87
95
  ],
88
96
  ),
@@ -110,10 +118,11 @@ R_BENCHMARK_ENTRIES = [
110
118
  MenuEntry(
111
119
  "Image",
112
120
  description=None,
113
- open=False,
121
+ open=True,
114
122
  benchmarks=[
115
- mteb.get_benchmark("VisualDocumentRetrieval"),
123
+ mteb.get_benchmark("ViDoRe(v3)"),
116
124
  mteb.get_benchmark("JinaVDR"),
125
+ MenuEntry("Other", [mteb.get_benchmark("ViDoRe(v1&v2)")]),
117
126
  ],
118
127
  ),
119
128
  MenuEntry(