mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527) hide show
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/leaderboard/app.py CHANGED
@@ -14,7 +14,6 @@ import pandas as pd
14
14
 
15
15
  import mteb
16
16
  from mteb import BenchmarkResults
17
- from mteb.abstasks.task_metadata import TaskDomain, TaskType
18
17
  from mteb.benchmarks.benchmark import RtebBenchmark
19
18
  from mteb.cache import ResultCache
20
19
  from mteb.leaderboard.benchmark_selector import (
@@ -25,33 +24,120 @@ from mteb.leaderboard.benchmark_selector import (
25
24
  )
26
25
  from mteb.leaderboard.figures import _performance_size_plot, _radar_chart
27
26
  from mteb.leaderboard.table import (
27
+ apply_per_language_styling_from_benchmark,
28
28
  apply_per_task_styling_from_benchmark,
29
29
  apply_summary_styling_from_benchmark,
30
30
  )
31
31
  from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
32
- from mteb.types import Modalities
32
+ from mteb.models.model_meta import MODEL_TYPES
33
33
 
34
34
  logger = logging.getLogger(__name__)
35
35
 
36
+
36
37
  LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.languages})
38
+ MODEL_TYPE_CHOICES = list(get_args(MODEL_TYPES))
37
39
 
38
40
 
39
41
  def _load_results(cache: ResultCache) -> BenchmarkResults:
42
+ """Load benchmark results using an optimized caching strategy.
43
+
44
+ This function implements a two-tier caching strategy for faster leaderboard startup:
45
+
46
+ 1. **Primary Strategy (Fast)**: Download pre-computed cached results from the
47
+ 'cached-data' branch as a compressed JSON file (~2MB vs ~200MB full repo).
48
+ This avoids the need to clone the entire results repository and provides
49
+ near-instantaneous loading for most users.
50
+
51
+ 2. **Fallback Strategy (Slower)**: If the cached download fails, fall back to
52
+ the original approach of downloading the full results repository and
53
+ building the cache from scratch.
54
+
55
+ The cached results file contains pre-aggregated benchmark data that eliminates
56
+ the need for expensive operations like task selection and revision joining
57
+ during app startup.
58
+
59
+ Args:
60
+ cache: ResultCache instance used for both optimized and fallback operations
61
+
62
+ Returns:
63
+ BenchmarkResults: Complete benchmark results ready for leaderboard display
64
+
65
+ Raises:
66
+ Various exceptions related to network issues, file I/O, or data validation
67
+ are logged and may cause fallback to the slower repository-based approach.
68
+ """
69
+ start_time = time.time()
40
70
  results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
71
+
41
72
  if not results_cache_path.exists():
42
- cache.download_from_remote()
43
- all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
44
-
45
- all_results = cache.load_results(
46
- models=all_model_names,
47
- only_main_score=True,
48
- require_model_meta=False,
49
- include_remote=True,
73
+ # First try to download the cached results file from the cached-data branch
74
+ # This is faster than cloning the entire results repository
75
+ logger.info(
76
+ "Cached results not found, trying to download from cached-data branch..."
50
77
  )
51
- return all_results
52
- else:
78
+
79
+ try:
80
+ # Use ResultCache's optimized download method
81
+ # Default saves to mteb/leaderboard/__cached_results.json
82
+ results_cache_path = cache._download_cached_results_from_branch()
83
+ download_time = time.time() - start_time
84
+ logger.info(
85
+ f"Downloaded cached results from cached-data branch in {download_time:.2f}s"
86
+ )
87
+
88
+ except Exception as e:
89
+ logger.error(
90
+ f"Failed to download from cached-data branch: {type(e).__name__}: {e}"
91
+ )
92
+ logger.info("Falling back to downloading full remote repository...")
93
+
94
+ # Fall back to the original approach: clone the full repo
95
+ cache.download_from_remote()
96
+ download_time = time.time() - start_time
97
+ logger.info(f"Downloaded remote results in {download_time:.2f}s")
98
+
99
+ load_start = time.time()
100
+ all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
101
+
102
+ all_results = cache.load_results(
103
+ models=all_model_names,
104
+ only_main_score=True,
105
+ require_model_meta=False,
106
+ include_remote=True,
107
+ )
108
+ load_time = time.time() - load_start
109
+ logger.info(f"Loaded results from cache in {load_time:.2f}s")
110
+ return all_results
111
+
112
+ # Load the cached results file (either pre-existing or just downloaded)
113
+ logger.info("Loading cached results from disk...")
114
+ try:
115
+ logger.info(f"Opening file: {results_cache_path}")
116
+
117
+ file_size = results_cache_path.stat().st_size
118
+ logger.info(f"File exists, size: {file_size} bytes")
119
+
53
120
  with results_cache_path.open() as cache_file:
54
- return mteb.BenchmarkResults.from_validated(**json.load(cache_file))
121
+ logger.info("File opened successfully, attempting JSON parse...")
122
+ json_data = json.load(cache_file)
123
+ logger.info(
124
+ f"JSON parsed successfully, keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'not a dict'}"
125
+ )
126
+
127
+ logger.info("Attempting BenchmarkResults.from_validated...")
128
+ results = mteb.BenchmarkResults.from_validated(**json_data)
129
+ logger.info("BenchmarkResults.from_validated successful")
130
+
131
+ except Exception as e:
132
+ # TODO: Handle the case when we fail to load cached results from disk.
133
+ logger.error(
134
+ f"Failed to load cached results from disk: {type(e).__name__}: {e}"
135
+ )
136
+ raise
137
+
138
+ total_time = time.time() - start_time
139
+ logger.info(f"Loaded cached results in {total_time:.2f}s")
140
+ return results
55
141
 
56
142
 
57
143
  def _produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str:
@@ -107,7 +193,9 @@ def _update_description(
107
193
  description += f" - **Number of task types**: {n_task_types}\n"
108
194
  description += f" - **Number of domains**: {n_domains}\n"
109
195
  if benchmark.reference is not None:
110
- description += f"\n[Click for More Info]({benchmark.reference})"
196
+ description += (
197
+ f'\n<a href="{benchmark.reference}" target="_blank">Click for More Info</a>'
198
+ )
111
199
 
112
200
  return description
113
201
 
@@ -137,7 +225,10 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
137
225
  df["languages"] = df["languages"].map(_format_list)
138
226
  df = df.sort_values("name")
139
227
  df["domains"] = df["domains"].map(_format_list)
140
- df["name"] = "[" + df["name"] + "](" + df["reference"] + ")"
228
+ df["name"] = df.apply(
229
+ lambda row: f'<a href="{row["reference"]}" target="_blank">{row["name"]}</a>',
230
+ axis=1,
231
+ )
141
232
  df["modalities"] = df["modalities"].map(_format_list)
142
233
  df = df.rename(
143
234
  columns={
@@ -154,8 +245,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
154
245
  return gr.DataFrame(
155
246
  df,
156
247
  datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
157
- show_copy_button=True,
158
- show_fullscreen_button=True,
248
+ buttons=["copy", "fullscreen"],
159
249
  show_search="filter",
160
250
  )
161
251
 
@@ -172,6 +262,7 @@ def _filter_models(
172
262
  instructions: bool | None,
173
263
  max_model_size: int,
174
264
  zero_shot_setting: Literal["only_zero_shot", "allow_all", "remove_unknown"],
265
+ model_types: list[str] | None,
175
266
  ):
176
267
  lower, upper = 0, max_model_size
177
268
  # Setting to None, when the user doesn't specify anything
@@ -190,6 +281,7 @@ def _filter_models(
190
281
  use_instructions=instructions,
191
282
  frameworks=compatibility,
192
283
  n_parameters_range=(lower, upper),
284
+ model_types=model_types,
193
285
  )
194
286
 
195
287
  models_to_keep = set()
@@ -213,21 +305,155 @@ def _should_show_zero_shot_filter(benchmark_name: str) -> bool:
213
305
  return True
214
306
 
215
307
 
308
+ @cachetools.cached(
309
+ cache={},
310
+ key=lambda benchmark_name, all_benchmark_results: hash(benchmark_name),
311
+ )
312
+ def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
313
+ start_time = time.time()
314
+ benchmark = mteb.get_benchmark(benchmark_name)
315
+ languages = [task.languages for task in benchmark.tasks if task.languages]
316
+ languages = set(itertools.chain.from_iterable(languages))
317
+ languages = sorted(languages)
318
+ domains = [
319
+ task.metadata.domains for task in benchmark.tasks if task.metadata.domains
320
+ ]
321
+ domains = set(itertools.chain.from_iterable(domains))
322
+ types = {task.metadata.type for task in benchmark.tasks if task.metadata.type}
323
+ modalities = set()
324
+ for task in benchmark.tasks:
325
+ modalities.update(task.metadata.modalities)
326
+ languages, domains, types, modalities = (
327
+ sorted(languages),
328
+ sorted(domains),
329
+ sorted(types),
330
+ sorted(modalities),
331
+ )
332
+ elapsed = time.time() - start_time
333
+ benchmark_results = all_benchmark_results[benchmark_name]
334
+ scores = benchmark_results._get_scores(format="long")
335
+ logger.debug(f"on_benchmark_select callback: {elapsed}s")
336
+ show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
337
+
338
+ # Calculate initial models for this benchmark to avoid race conditions
339
+ benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
340
+ all_models_in_scores = list({entry["model_name"] for entry in scores})
341
+ initial_models = _filter_models(
342
+ all_models_in_scores,
343
+ benchmark_tasks,
344
+ availability=None,
345
+ compatibility=[],
346
+ instructions=None,
347
+ max_model_size=MAX_MODEL_SIZE,
348
+ zero_shot_setting="allow_all",
349
+ model_types=MODEL_TYPE_CHOICES,
350
+ )
351
+ # Sort to ensure consistency with update_models
352
+ initial_models = sorted(initial_models)
353
+
354
+ return (
355
+ languages,
356
+ domains,
357
+ types,
358
+ modalities,
359
+ benchmark_tasks,
360
+ scores,
361
+ show_zero_shot,
362
+ initial_models,
363
+ )
364
+
365
+
366
+ @cachetools.cached(
367
+ cache={},
368
+ key=lambda benchmark_name,
369
+ type_select,
370
+ domain_select,
371
+ lang_select,
372
+ modality_select: hash(
373
+ (
374
+ hash(benchmark_name),
375
+ hash(tuple(type_select)),
376
+ hash(tuple(domain_select)),
377
+ hash(tuple(lang_select)),
378
+ hash(tuple(modality_select)),
379
+ )
380
+ ),
381
+ )
382
+ def _cache_update_task_list(
383
+ benchmark_name, type_select, domain_select, lang_select, modality_select
384
+ ):
385
+ if not len(lang_select):
386
+ return []
387
+ start_time = time.time()
388
+ benchmark_tasks = []
389
+ tasks_to_keep = []
390
+ for task in mteb.get_benchmark(benchmark_name).tasks:
391
+ benchmark_tasks.append(task.metadata.name)
392
+ if task.metadata.type not in type_select:
393
+ continue
394
+ if task.metadata.domains and not (
395
+ set(task.metadata.domains) & set(domain_select)
396
+ ):
397
+ continue
398
+ if task.languages and not (set(task.languages) & set(lang_select)):
399
+ continue
400
+ if task.metadata.modalities and not (
401
+ set(task.metadata.modalities) & set(modality_select)
402
+ ):
403
+ continue
404
+ tasks_to_keep.append(task.metadata.name)
405
+ benchmark_tasks.sort()
406
+ tasks_to_keep.sort()
407
+ elapsed = time.time() - start_time
408
+ logger.debug(f"update_task_list callback: {elapsed}s")
409
+
410
+ return benchmark_tasks, tasks_to_keep
411
+
412
+
216
413
  def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
217
414
  """Returns a Gradio Blocks app for the MTEB leaderboard."""
218
- logger.info("Loading all benchmark results")
415
+ app_start = time.time()
416
+ logger.info("=== Starting leaderboard app initialization ===")
417
+
418
+ logger.info("Step 1/7: Loading all benchmark results...")
419
+ load_start = time.time()
219
420
  all_results = _load_results(cache)
421
+ load_time = time.time() - load_start
422
+ logger.info(f"Step 1/7 complete: Loaded results in {load_time:.2f}s")
220
423
 
424
+ logger.info("Step 2/7: Fetching benchmarks...")
425
+ bench_start = time.time()
221
426
  benchmarks = sorted(
222
427
  mteb.get_benchmarks(display_on_leaderboard=True), key=lambda x: x.name
223
428
  )
429
+ bench_time = time.time() - bench_start
430
+ logger.info(
431
+ f"Step 2/7 complete: Fetched {len(benchmarks)} benchmarks in {bench_time:.2f}s"
432
+ )
433
+
434
+ logger.info(
435
+ "Step 3/7: Processing all benchmarks (select_tasks + join_revisions)..."
436
+ )
437
+ process_start = time.time()
224
438
  all_benchmark_results = {
225
439
  benchmark.name: all_results.select_tasks(benchmark.tasks).join_revisions()
226
440
  for benchmark in benchmarks
227
441
  }
442
+ process_time = time.time() - process_start
443
+ if len(benchmarks) > 0:
444
+ logger.info(
445
+ f"Step 3/7 complete: Processed {len(benchmarks)} benchmarks in {process_time:.2f}s (avg {process_time / len(benchmarks):.2f}s/benchmark)"
446
+ )
447
+ else:
448
+ logger.info(
449
+ f"Step 3/7 complete: Processed 0 benchmarks in {process_time:.2f}s (avg N/A)"
450
+ )
451
+
228
452
  default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME)
229
453
  default_results = all_benchmark_results[default_benchmark.name]
230
- logger.info("Benchmark results loaded")
454
+
455
+ logger.info("Step 4/7: Filtering models...")
456
+ filter_start = time.time()
231
457
 
232
458
  default_scores = default_results._get_scores(format="long")
233
459
  all_models = list({entry["model_name"] for entry in default_scores})
@@ -239,6 +465,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
239
465
  instructions=None,
240
466
  max_model_size=MAX_MODEL_SIZE,
241
467
  zero_shot_setting="allow_all",
468
+ model_types=MODEL_TYPE_CHOICES,
242
469
  )
243
470
  default_filtered_scores = [
244
471
  entry for entry in default_scores if entry["model_name"] in filtered_models
@@ -247,63 +474,79 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
247
474
  # Filter BenchmarkResults based on default filtered models (as required by Kenneth)
248
475
  filtered_model_names = [entry["model_name"] for entry in default_filtered_scores]
249
476
  filtered_benchmark_results = default_results.select_models(filtered_model_names)
477
+ filter_time = time.time() - filter_start
478
+ logger.info(
479
+ f"Step 4/7 complete: Filtered {len(filtered_model_names)} models in {filter_time:.2f}s"
480
+ )
250
481
 
482
+ logger.info("Step 5/7: Generating tables...")
483
+ table_start = time.time()
251
484
  summary_table = apply_summary_styling_from_benchmark(
252
485
  default_benchmark, filtered_benchmark_results
253
486
  )
254
487
  per_task_table = apply_per_task_styling_from_benchmark(
255
488
  default_benchmark, filtered_benchmark_results
256
489
  )
490
+ per_language_table = apply_per_language_styling_from_benchmark(
491
+ default_benchmark,
492
+ filtered_benchmark_results,
493
+ )
494
+ table_time = time.time() - table_start
495
+ logger.info(f"Step 5/7 complete: Generated tables in {table_time:.2f}s")
257
496
 
258
- lang_select = gr.Dropdown(
259
- LANGUAGE,
497
+ # Check if this benchmark displays per-language results
498
+ display_language_table = len(default_benchmark.language_view) > 0
499
+
500
+ logger.info("Step 6/7: Creating Gradio components...")
501
+ component_start = time.time()
502
+ lang_select = gr.CheckboxGroup(
503
+ sorted(default_results.languages),
260
504
  value=sorted(default_results.languages),
261
- allow_custom_value=True,
262
- multiselect=True,
505
+ show_label=True,
506
+ show_select_all=True,
263
507
  label="Language",
264
508
  info="Select languages to include.",
265
509
  )
266
- type_select = gr.Dropdown(
267
- sorted(get_args(TaskType)),
510
+ type_select = gr.CheckboxGroup(
511
+ sorted(default_results.task_types),
268
512
  value=sorted(default_results.task_types),
269
- multiselect=True,
513
+ show_label=True,
514
+ show_select_all=True,
270
515
  label="Task Type",
271
516
  info="Select task types to include.",
272
517
  )
273
- domain_select = gr.Dropdown(
274
- sorted(get_args(TaskDomain)),
518
+ domain_select = gr.CheckboxGroup(
519
+ sorted(default_results.domains),
275
520
  value=sorted(default_results.domains),
276
- multiselect=True,
521
+ show_label=True,
522
+ show_select_all=True,
277
523
  label="Domain",
278
524
  info="Select domains to include.",
279
525
  )
280
- task_select = gr.Dropdown(
281
- sorted(all_results.task_names),
526
+ task_select = gr.CheckboxGroup(
527
+ sorted(default_results.task_names),
282
528
  value=sorted(default_results.task_names),
283
- allow_custom_value=True,
284
- multiselect=True,
529
+ show_label=True,
530
+ show_select_all=True,
285
531
  label="Task",
286
532
  info="Select specific tasks to include",
287
533
  )
288
- modality_select = gr.Dropdown(
289
- sorted(get_args(Modalities)),
534
+ modality_select = gr.CheckboxGroup(
535
+ sorted(default_results.modalities),
290
536
  value=sorted(default_results.modalities),
291
- multiselect=True,
537
+ show_label=True,
538
+ show_select_all=True,
292
539
  label="Modality",
293
540
  info="Select modalities to include.",
294
541
  )
542
+ component_time = time.time() - component_start
543
+ logger.info(
544
+ f"Step 6/7 complete: Created Gradio components in {component_time:.2f}s"
545
+ )
295
546
 
296
- head = """
297
- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
298
- """
299
-
300
- with gr.Blocks(
301
- fill_width=True,
302
- theme=gr.themes.Soft(
303
- font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
304
- ),
305
- head=head,
306
- ) as demo:
547
+ logger.info("Step 7/7: Building Gradio interface and callbacks...")
548
+ interface_start = time.time()
549
+ with gr.Blocks(fill_width=True) as demo:
307
550
  with gr.Sidebar(
308
551
  position="left",
309
552
  label="Benchmark Selection and Customization",
@@ -419,6 +662,12 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
419
662
  label="Model Parameters",
420
663
  interactive=True,
421
664
  )
665
+ with gr.Column():
666
+ model_type_select = gr.CheckboxGroup(
667
+ MODEL_TYPE_CHOICES,
668
+ value=MODEL_TYPE_CHOICES,
669
+ label="Model Type",
670
+ )
422
671
 
423
672
  with gr.Tab("Summary"):
424
673
  summary_table.render()
@@ -435,9 +684,6 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
435
684
 
436
685
  with gr.Tab("Performance per Model Size") as plot_tab:
437
686
  plot = gr.Plot(_performance_size_plot, inputs=[summary_table])
438
- gr.Markdown(
439
- "*We only display TOP 5 models that have been run on all tasks in the benchmark*"
440
- )
441
687
  plot_tab.select(
442
688
  _performance_size_plot, inputs=[summary_table], outputs=[plot]
443
689
  )
@@ -457,68 +703,41 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
457
703
  download_per_task.click(
458
704
  _download_table, inputs=[per_task_table], outputs=[download_per_task]
459
705
  )
706
+ with gr.Tab(
707
+ "Performance per language", visible=display_language_table
708
+ ) as language_tab:
709
+ per_language_table.render()
710
+ download_per_language = gr.DownloadButton("Download Table")
711
+ download_per_language.click(
712
+ _download_table,
713
+ inputs=[per_language_table],
714
+ outputs=[download_per_language],
715
+ )
460
716
  with gr.Tab("Task information"):
461
717
  task_info_table = gr.DataFrame(_update_task_info, inputs=[task_select]) # noqa: F841
462
718
 
463
719
  # This sets the benchmark from the URL query parameters
464
720
  demo.load(_set_benchmark_on_load, inputs=[], outputs=[benchmark_select])
465
721
 
466
- @cachetools.cached(
467
- cache={},
468
- key=lambda benchmark_name: hash(benchmark_name),
469
- )
470
722
  def on_benchmark_select(benchmark_name):
471
- start_time = time.time()
472
- benchmark = mteb.get_benchmark(benchmark_name)
473
- languages = [task.languages for task in benchmark.tasks if task.languages]
474
- languages = set(itertools.chain.from_iterable(languages))
475
- languages = sorted(languages)
476
- domains = [
477
- task.metadata.domains
478
- for task in benchmark.tasks
479
- if task.metadata.domains
480
- ]
481
- domains = set(itertools.chain.from_iterable(domains))
482
- types = {
483
- task.metadata.type for task in benchmark.tasks if task.metadata.type
484
- }
485
- modalities = set()
486
- for task in benchmark.tasks:
487
- modalities.update(task.metadata.modalities)
488
- languages, domains, types, modalities = (
489
- sorted(languages),
490
- sorted(domains),
491
- sorted(types),
492
- sorted(modalities),
493
- )
494
- elapsed = time.time() - start_time
495
- benchmark_results = all_benchmark_results[benchmark_name]
496
- scores = benchmark_results._get_scores(format="long")
497
- logger.debug(f"on_benchmark_select callback: {elapsed}s")
498
- show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
499
-
500
- # Calculate initial models for this benchmark to avoid race conditions
501
- benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
502
- all_models_in_scores = list({entry["model_name"] for entry in scores})
503
- initial_models = _filter_models(
504
- all_models_in_scores,
505
- benchmark_tasks,
506
- availability=None,
507
- compatibility=[],
508
- instructions=None,
509
- max_model_size=MAX_MODEL_SIZE,
510
- zero_shot_setting="allow_all",
511
- )
512
- # Sort to ensure consistency with update_models
513
- initial_models = sorted(initial_models)
514
-
515
- return (
723
+ (
516
724
  languages,
517
725
  domains,
518
726
  types,
519
727
  modalities,
520
728
  benchmark_tasks,
521
729
  scores,
730
+ show_zero_shot,
731
+ initial_models,
732
+ ) = _cache_on_benchmark_select(benchmark_name, all_benchmark_results)
733
+
734
+ return (
735
+ gr.update(choices=languages, value=languages),
736
+ gr.update(choices=domains, value=domains),
737
+ gr.update(choices=types, value=types),
738
+ gr.update(choices=modalities, value=modalities),
739
+ gr.update(choices=benchmark_tasks, value=benchmark_tasks),
740
+ scores,
522
741
  gr.update(visible=show_zero_shot),
523
742
  initial_models,
524
743
  )
@@ -560,48 +779,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
560
779
  outputs=[scores],
561
780
  )
562
781
 
563
- @cachetools.cached(
564
- cache={},
565
- key=lambda benchmark_name,
566
- type_select,
567
- domain_select,
568
- lang_select,
569
- modality_select: hash(
570
- (
571
- hash(benchmark_name),
572
- hash(tuple(type_select)),
573
- hash(tuple(domain_select)),
574
- hash(tuple(lang_select)),
575
- hash(tuple(modality_select)),
576
- )
577
- ),
578
- )
579
782
  def update_task_list(
580
783
  benchmark_name, type_select, domain_select, lang_select, modality_select
581
784
  ):
582
- if not len(lang_select):
583
- return []
584
- start_time = time.time()
585
- tasks_to_keep = []
586
- for task in mteb.get_benchmark(benchmark_name).tasks:
587
- if task.metadata.type not in type_select:
588
- continue
589
- if task.metadata.domains is not None and not (
590
- set(task.metadata.domains) & set(domain_select)
591
- ):
592
- continue
593
- if task.languages is not None and not (
594
- set(task.languages) & set(lang_select)
595
- ):
596
- continue
597
- if task.metadata.modalities and not (
598
- set(task.metadata.modalities) & set(modality_select)
599
- ):
600
- continue
601
- tasks_to_keep.append(task.metadata.name)
602
- elapsed = time.time() - start_time
603
- logger.debug(f"update_task_list callback: {elapsed}s")
604
- return sorted(tasks_to_keep)
785
+ benchmark_tasks, tasks_to_keep = _cache_update_task_list(
786
+ benchmark_name, type_select, domain_select, lang_select, modality_select
787
+ )
788
+ return gr.update(choices=benchmark_tasks, value=tasks_to_keep)
605
789
 
606
790
  type_select.input(
607
791
  update_task_list,
@@ -656,7 +840,8 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
656
840
  compatibility,
657
841
  instructions,
658
842
  max_model_size,
659
- zero_shot: hash(
843
+ zero_shot,
844
+ model_type_select: hash(
660
845
  (
661
846
  id(scores),
662
847
  hash(tuple(tasks)),
@@ -665,6 +850,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
665
850
  hash(instructions),
666
851
  hash(max_model_size),
667
852
  hash(zero_shot),
853
+ hash(tuple(model_type_select)),
668
854
  )
669
855
  ),
670
856
  )
@@ -676,6 +862,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
676
862
  instructions: bool | None,
677
863
  max_model_size: int,
678
864
  zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"],
865
+ model_type_select: list[str],
679
866
  ):
680
867
  start_time = time.time()
681
868
  model_names = list({entry["model_name"] for entry in scores})
@@ -687,6 +874,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
687
874
  instructions,
688
875
  max_model_size,
689
876
  zero_shot_setting=zero_shot,
877
+ model_types=model_type_select,
690
878
  )
691
879
  elapsed = time.time() - start_time
692
880
  logger.debug(f"update_models callback: {elapsed}s")
@@ -704,6 +892,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
704
892
  instructions,
705
893
  max_model_size,
706
894
  zero_shot,
895
+ model_type_select,
707
896
  ],
708
897
  outputs=[models],
709
898
  )
@@ -718,6 +907,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
718
907
  instructions,
719
908
  max_model_size,
720
909
  zero_shot,
910
+ model_type_select,
721
911
  ],
722
912
  outputs=[models],
723
913
  )
@@ -731,6 +921,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
731
921
  instructions,
732
922
  max_model_size,
733
923
  zero_shot,
924
+ model_type_select,
734
925
  ],
735
926
  outputs=[models],
736
927
  )
@@ -744,6 +935,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
744
935
  instructions,
745
936
  max_model_size,
746
937
  zero_shot,
938
+ model_type_select,
747
939
  ],
748
940
  outputs=[models],
749
941
  )
@@ -757,6 +949,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
757
949
  instructions,
758
950
  max_model_size,
759
951
  zero_shot,
952
+ model_type_select,
760
953
  ],
761
954
  outputs=[models],
762
955
  )
@@ -770,6 +963,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
770
963
  instructions,
771
964
  max_model_size,
772
965
  zero_shot,
966
+ model_type_select,
773
967
  ],
774
968
  outputs=[models],
775
969
  )
@@ -783,6 +977,21 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
783
977
  instructions,
784
978
  max_model_size,
785
979
  zero_shot,
980
+ model_type_select,
981
+ ],
982
+ outputs=[models],
983
+ )
984
+ model_type_select.change(
985
+ update_models,
986
+ inputs=[
987
+ scores,
988
+ task_select,
989
+ availability,
990
+ compatibility,
991
+ instructions,
992
+ max_model_size,
993
+ zero_shot,
994
+ model_type_select,
786
995
  ],
787
996
  outputs=[models],
788
997
  )
@@ -854,9 +1063,18 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
854
1063
  per_task = apply_per_task_styling_from_benchmark(
855
1064
  benchmark, filtered_benchmark_results
856
1065
  )
1066
+ per_language = apply_per_language_styling_from_benchmark(
1067
+ benchmark,
1068
+ filtered_benchmark_results,
1069
+ )
857
1070
  elapsed = time.time() - start_time
858
1071
  logger.debug(f"update_tables callback: {elapsed}s")
859
- return summary, per_task
1072
+ return (
1073
+ summary,
1074
+ per_task,
1075
+ per_language,
1076
+ gr.update(visible=len(benchmark.language_view) > 0),
1077
+ )
860
1078
 
861
1079
  # Only update tables when models change, not when scores/tasks change directly
862
1080
  # This avoids redundant updates since scores/tasks changes trigger update_models
@@ -865,11 +1083,20 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
865
1083
  item.change(
866
1084
  update_tables,
867
1085
  inputs=[scores, task_select, models, benchmark_select],
868
- outputs=[summary_table, per_task_table],
1086
+ outputs=[
1087
+ summary_table,
1088
+ per_task_table,
1089
+ per_language_table,
1090
+ language_tab,
1091
+ ],
869
1092
  )
870
1093
 
871
1094
  gr.Markdown(ACKNOWLEDGEMENT, elem_id="ack_markdown")
1095
+ interface_time = time.time() - interface_start
1096
+ logger.info(f"Step 7/7 complete: Built Gradio interface in {interface_time:.2f}s")
872
1097
 
1098
+ logger.info("Starting prerun on all benchmarks to populate caches...")
1099
+ prerun_start = time.time()
873
1100
  # Prerun on all benchmarks, so that results of callbacks get cached
874
1101
  for benchmark in benchmarks:
875
1102
  (
@@ -895,20 +1122,56 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
895
1122
  update_tables(
896
1123
  bench_scores, filtered_tasks, bench_initial_models, benchmark.name
897
1124
  )
1125
+ prerun_time = time.time() - prerun_start
1126
+ logger.info(
1127
+ f"Prerun complete: Processed {len(benchmarks)} benchmarks in {prerun_time:.2f}s"
1128
+ )
1129
+
1130
+ total_time = time.time() - app_start
1131
+ logger.info(f"=== Leaderboard app initialization complete in {total_time:.2f}s ===")
898
1132
  return demo
899
1133
 
900
1134
 
901
1135
  if __name__ == "__main__":
902
- logging.getLogger("mteb.load_results.task_results").setLevel(
903
- logging.ERROR
904
- ) # Warnings related to task split
905
- logging.getLogger("mteb.model_meta").setLevel(
906
- logging.ERROR
907
- ) # Warning related to model metadata (fetch_from_hf=False)
908
- logging.getLogger("mteb.load_results.benchmark_results").setLevel(
909
- logging.ERROR
910
- ) # Warning related to model metadata (fetch_from_hf=False)
1136
+ import os
1137
+
1138
+ # Add process ID to logging for multiprocessing debugging
1139
+ logging.basicConfig(
1140
+ level=logging.INFO,
1141
+ format="%(asctime)s - PID:%(process)d - %(name)s - %(levelname)s - %(message)s",
1142
+ force=True, # Override any existing handlers
1143
+ )
1144
+
1145
+ # Flush log handlers immediately (helpful for multiprocessing)
1146
+ for handler in logging.root.handlers:
1147
+ handler.flush()
1148
+
1149
+ logger.info(f"Starting leaderboard app in process {os.getpid()}")
1150
+
1151
+ # Suppress specific WARNING messages while keeping INFO level for the app
1152
+ logging.getLogger("mteb.results.task_result").setLevel(logging.ERROR)
1153
+ logging.getLogger("mteb.models.model_meta").setLevel(logging.ERROR)
1154
+ logging.getLogger("mteb.results.benchmark_results").setLevel(logging.ERROR)
1155
+
911
1156
  warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
1157
+ warnings.filterwarnings("ignore", message="Could not get source model: .*")
1158
+ warnings.filterwarnings(
1159
+ "ignore", message="No scores data available. Returning empty DataFrame."
1160
+ )
1161
+ warnings.filterwarnings("ignore", message="Main score .* not found in scores")
1162
+ warnings.filterwarnings("ignore", message=".*: Missing subsets .* for split .*")
1163
+ warnings.filterwarnings("ignore", message=".*: Missing splits .*")
912
1164
 
913
1165
  app = get_leaderboard_app()
914
- app.launch(server_name="0.0.0.0", server_port=7860)
1166
+
1167
+ head = """
1168
+ <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
1169
+ """
1170
+ app.launch(
1171
+ server_name="0.0.0.0",
1172
+ server_port=7860,
1173
+ theme=gr.themes.Soft(
1174
+ font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
1175
+ ),
1176
+ head=head,
1177
+ )