mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527) hide show
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -71,18 +71,26 @@ GP_BENCHMARK_ENTRIES = [
71
71
  "MTEB(cmn, v1)",
72
72
  "MTEB(deu, v1)",
73
73
  "MTEB(fra, v1)",
74
- "MTEB(jpn, v1)",
74
+ "JMTEB(v2)",
75
75
  "MTEB(kor, v1)",
76
76
  "MTEB(nld, v1)",
77
77
  "MTEB(pol, v1)",
78
- "MTEB(rus, v1)",
78
+ "MTEB(rus, v1.1)",
79
79
  "MTEB(fas, v2)",
80
80
  "VN-MTEB (vie, v1)",
81
81
  ]
82
82
  )
83
83
  + [
84
84
  MenuEntry(
85
- "Other", mteb.get_benchmarks(["MTEB(eng, v1)", "MTEB(fas, v1)"])
85
+ "Other",
86
+ mteb.get_benchmarks(
87
+ [
88
+ "MTEB(eng, v1)",
89
+ "MTEB(fas, v1)",
90
+ "MTEB(rus, v1)",
91
+ "MTEB(jpn, v1)",
92
+ ]
93
+ ),
86
94
  )
87
95
  ],
88
96
  ),
@@ -110,10 +118,11 @@ R_BENCHMARK_ENTRIES = [
110
118
  MenuEntry(
111
119
  "Image",
112
120
  description=None,
113
- open=False,
121
+ open=True,
114
122
  benchmarks=[
115
- mteb.get_benchmark("VisualDocumentRetrieval"),
123
+ mteb.get_benchmark("ViDoRe(v3)"),
116
124
  mteb.get_benchmark("JinaVDR"),
125
+ MenuEntry("Other", [mteb.get_benchmark("ViDoRe(v1&v2)")]),
117
126
  ],
118
127
  ),
119
128
  MenuEntry(
@@ -1,3 +1,4 @@
1
+ import logging
1
2
  from typing import get_args
2
3
 
3
4
  import numpy as np
@@ -7,6 +8,8 @@ import plotly.graph_objects as go
7
8
 
8
9
  from mteb.abstasks.task_metadata import TaskType
9
10
 
11
+ logger = logging.getLogger(__name__)
12
+
10
13
 
11
14
  def _text_plot(text: str):
12
15
  """Returns empty scatter plot with text added, this can be great for error messages."""
@@ -29,16 +32,17 @@ def _failsafe_plot(fun):
29
32
  try:
30
33
  return fun(*args, **kwargs)
31
34
  except Exception as e:
35
+ logger.error(f"Plot generation failed: {e}")
32
36
  return _text_plot(f"Couldn't produce plot. Reason: {e}")
33
37
 
34
38
  return wrapper
35
39
 
36
40
 
37
- def _parse_n_params(text: str) -> int:
38
- if text.endswith("M"):
39
- return float(text[:-1]) * 1e6
40
- if text.endswith("B"):
41
- return float(text[:-1]) * 1e9
41
+ def _parse_n_params(params: float | None) -> int | float:
42
+ """Specified in billions."""
43
+ if params is None or np.isnan(params):
44
+ return None
45
+ return int(params * 1e9)
42
46
 
43
47
 
44
48
  def _parse_model_name(name: str) -> str:
@@ -51,20 +55,14 @@ def _parse_model_name(name: str) -> str:
51
55
 
52
56
 
53
57
  def _parse_float(value) -> float:
54
- try:
55
- if value == "Infinite":
56
- return np.inf
57
- else:
58
- return float(value)
59
- except ValueError:
58
+ if value is None or np.isnan(value):
60
59
  return np.nan
60
+ return float(value)
61
61
 
62
62
 
63
63
  def _process_max_tokens(x):
64
- if pd.isna(x):
64
+ if pd.isna(x) or x is None or np.isinf(x):
65
65
  return "Unknown"
66
- if np.isinf(x):
67
- return "Infinite"
68
66
  return str(int(x))
69
67
 
70
68
 
@@ -112,7 +110,7 @@ def _add_size_guide(fig: go.Figure):
112
110
  @_failsafe_plot
113
111
  def _performance_size_plot(df: pd.DataFrame) -> go.Figure:
114
112
  df = df.copy()
115
- df["Number of Parameters"] = df["Number of Parameters"].map(_parse_n_params)
113
+ df["Number of Parameters"] = df["Number of Parameters (B)"].map(_parse_n_params)
116
114
  df["Model"] = df["Model"].map(_parse_model_name)
117
115
  df["model_text"] = df["Model"].where(df["Model"].isin(models_to_annotate), "")
118
116
  df["Embedding Dimensions"] = df["Embedding Dimensions"].map(_parse_float)
mteb/leaderboard/table.py CHANGED
@@ -26,16 +26,6 @@ def _format_scores(score: float) -> float:
26
26
  return round(score * 100, 2)
27
27
 
28
28
 
29
- def _get_column_types(df: pd.DataFrame) -> list[str]:
30
- types = []
31
- for column_name in df.columns:
32
- if is_numeric_dtype(df[column_name]):
33
- types.append("number")
34
- else:
35
- types.append("str")
36
- return types
37
-
38
-
39
29
  def _get_column_widths(df: pd.DataFrame) -> list[str]:
40
30
  # Please do not remove this function when refactoring.
41
31
  # Column width calculation seemingly changes regularly with Gradio releases,
@@ -120,6 +110,39 @@ def apply_per_task_styling_from_benchmark(
120
110
  return _apply_per_task_table_styling(per_task_df)
121
111
 
122
112
 
113
+ def apply_per_language_styling_from_benchmark(
114
+ benchmark_instance: Benchmark, benchmark_results: BenchmarkResults
115
+ ) -> gr.DataFrame:
116
+ """Apply styling to per-language table created by the benchmark instance's _create_per_language_table method.
117
+
118
+ This supports polymorphism - different benchmark classes can have different table generation logic.
119
+
120
+ Args:
121
+ benchmark_instance: The benchmark instance
122
+ benchmark_results: BenchmarkResults object containing model results (may be pre-filtered)
123
+
124
+ Returns:
125
+ Styled gr.DataFrame ready for display in the leaderboard
126
+ """
127
+ # Use the instance method to support polymorphism
128
+ per_language_df = benchmark_instance._create_per_language_table(benchmark_results)
129
+
130
+ # If it's a no-results DataFrame, return it as-is
131
+ if "No results" in per_language_df.columns:
132
+ return gr.DataFrame(per_language_df)
133
+
134
+ # Apply the styling
135
+ return _apply_per_language_table_styling(per_language_df)
136
+
137
+
138
+ def _style_number_of_parameters(num_params: float) -> str:
139
+ """Anything bigger than 1B is shown in billions with 1 decimal (e.g. 1.712 > 1.7) while anything smaller as 0.xxx B (e.g. 0.345 remains 0.345)"""
140
+ if num_params >= 1:
141
+ return f"{num_params:.1f}"
142
+ else:
143
+ return f"{num_params:.3f}"
144
+
145
+
123
146
  def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
124
147
  """Apply styling to a raw summary DataFrame
125
148
 
@@ -130,7 +153,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
130
153
  "Rank (Borda)",
131
154
  "Rank",
132
155
  "Model",
133
- "Number of Parameters",
156
+ "Number of Parameters (B)",
134
157
  "Embedding Dimensions",
135
158
  "Max Tokens",
136
159
  "Memory Usage (MB)",
@@ -156,7 +179,14 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
156
179
  joint_table[score_columns] = joint_table[score_columns].map(_format_scores)
157
180
 
158
181
  joint_table_style = joint_table.style.format(
159
- {**dict.fromkeys(score_columns, "{:.2f}"), "Rank (Borda)": "{:.0f}"},
182
+ {
183
+ **dict.fromkeys(score_columns, "{:.2f}"),
184
+ "Rank (Borda)": "{:.0f}",
185
+ "Memory Usage (MB)": "{:.0f}",
186
+ "Embedding Dimensions": "{:.0f}",
187
+ "Max Tokens": "{:.0f}",
188
+ "Number of Parameters (B)": lambda x: _style_number_of_parameters(x),
189
+ },
160
190
  na_rep="",
161
191
  )
162
192
  joint_table_style = joint_table_style.highlight_min(
@@ -186,7 +216,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
186
216
  gmap=gmap_values.loc[mask],
187
217
  )
188
218
 
189
- column_types = _get_column_types(joint_table_style.data)
219
+ column_types = ["auto" for _ in joint_table_style.data.columns]
190
220
  # setting model name column to markdown
191
221
  if len(column_types) > 1:
192
222
  column_types[1] = "markdown"
@@ -204,8 +234,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
204
234
  pinned_columns=2,
205
235
  column_widths=column_widths,
206
236
  wrap=True,
207
- show_fullscreen_button=True,
208
- show_copy_button=True,
237
+ buttons=["copy", "fullscreen"],
209
238
  show_search="filter",
210
239
  )
211
240
 
@@ -223,11 +252,47 @@ def _apply_per_task_table_styling(per_task: pd.DataFrame) -> gr.DataFrame:
223
252
  "{:.2f}", subset=task_score_columns, na_rep=""
224
253
  ).highlight_max(subset=task_score_columns, props="font-weight: bold")
225
254
 
255
+ # setting task name column width to 250px
256
+ column_widths = _get_column_widths(per_task_style.data)
257
+ if len(column_widths) > 0:
258
+ column_widths[0] = "250px"
259
+
226
260
  return gr.DataFrame(
227
261
  per_task_style,
228
262
  interactive=False,
229
263
  pinned_columns=1,
230
- show_fullscreen_button=True,
231
- show_copy_button=True,
264
+ column_widths=column_widths,
265
+ buttons=["copy", "fullscreen"],
266
+ show_search="filter",
267
+ )
268
+
269
+
270
+ def _apply_per_language_table_styling(per_language: pd.DataFrame) -> gr.DataFrame:
271
+ """Apply styling to a raw per-language DataFrame
272
+
273
+ Returns:
274
+ Styled gr.DataFrame ready for display in the leaderboard
275
+ """
276
+ language_score_columns = per_language.select_dtypes("number").columns
277
+ per_language[language_score_columns] *= 100
278
+
279
+ if len(per_language.columns) > 100: # Avoid gradio error on very wide tables
280
+ per_language_style = per_language.round(2)
281
+ else:
282
+ per_language_style = per_language.style.format(
283
+ "{:.2f}", subset=language_score_columns, na_rep=""
284
+ ).highlight_max(subset=language_score_columns, props="font-weight: bold")
285
+
286
+ # setting task name column width to 250px
287
+ column_widths = _get_column_widths(per_language_style.data)
288
+ if len(column_widths) > 0:
289
+ column_widths[0] = "250px"
290
+
291
+ return gr.DataFrame(
292
+ per_language_style,
293
+ interactive=False,
294
+ pinned_columns=1,
295
+ column_widths=column_widths,
296
+ buttons=["copy", "fullscreen"],
232
297
  show_search="filter",
233
298
  )
mteb/load_results.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import json
2
2
  import logging
3
3
  import sys
4
- from collections.abc import Sequence
4
+ from collections.abc import Iterable, Sequence
5
5
  from pathlib import Path
6
6
 
7
7
  from mteb.abstasks.abstask import AbsTask
@@ -45,8 +45,8 @@ def _model_name_and_revision(
45
45
  def load_results(
46
46
  results_repo: str = "https://github.com/embeddings-benchmark/results",
47
47
  download_latest: bool = True,
48
- models: Sequence[ModelMeta] | Sequence[str] | None = None,
49
- tasks: Sequence[AbsTask] | Sequence[str] | None = None,
48
+ models: Iterable[ModelMeta] | Sequence[str] | None = None,
49
+ tasks: Iterable[AbsTask] | Sequence[str] | None = None,
50
50
  validate_and_filter: bool = True,
51
51
  require_model_meta: bool = True,
52
52
  only_main_score: bool = False,
@@ -83,21 +83,21 @@ def load_results(
83
83
 
84
84
  if models is not None:
85
85
  models_to_keep = {}
86
- for model_path in models:
87
- if isinstance(model_path, ModelMeta):
88
- models_to_keep[model_path.name] = model_path.revision
86
+ for model in models:
87
+ if isinstance(model, ModelMeta):
88
+ models_to_keep[model.name] = model.revision
89
89
  else:
90
- models_to_keep[model_path] = None
90
+ models_to_keep[model] = None
91
91
  else:
92
92
  models_to_keep = None
93
93
 
94
- task_names = {}
94
+ task_names: dict[str, AbsTask | None] = {}
95
95
  if tasks is not None:
96
- for task in tasks:
97
- if isinstance(task, AbsTask):
98
- task_names[task.metadata.name] = task
96
+ for task_ in tasks:
97
+ if isinstance(task_, AbsTask):
98
+ task_names[task_.metadata.name] = task_
99
99
  else:
100
- task_names[task] = None
100
+ task_names[task_] = None
101
101
 
102
102
  model_results = []
103
103
  for model_path in model_paths:
mteb/models/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from .cache_wrappers import CachedEmbeddingWrapper
1
+ from .cache_wrappers import CacheBackendProtocol, CachedEmbeddingWrapper
2
2
  from .model_meta import ModelMeta
3
3
  from .models_protocols import (
4
4
  CrossEncoderProtocol,
@@ -6,6 +6,7 @@ from .models_protocols import (
6
6
  MTEBModels,
7
7
  SearchProtocol,
8
8
  )
9
+ from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol
9
10
  from .search_wrappers import SearchCrossEncoderWrapper, SearchEncoderWrapper
10
11
  from .sentence_transformer_wrapper import (
11
12
  CrossEncoderWrapper,
@@ -14,10 +15,12 @@ from .sentence_transformer_wrapper import (
14
15
  )
15
16
 
16
17
  __all__ = [
18
+ "CacheBackendProtocol",
17
19
  "CachedEmbeddingWrapper",
18
20
  "CrossEncoderProtocol",
19
21
  "CrossEncoderWrapper",
20
22
  "EncoderProtocol",
23
+ "IndexEncoderSearchProtocol",
21
24
  "MTEBModels",
22
25
  "ModelMeta",
23
26
  "SearchCrossEncoderWrapper",
@@ -1,9 +1,11 @@
1
1
  import logging
2
+ import warnings
2
3
  from abc import ABC, abstractmethod
3
4
  from collections.abc import Callable, Sequence
4
5
  from typing import Any, Literal, cast, get_args, overload
5
6
 
6
7
  from torch.utils.data import DataLoader
8
+ from typing_extensions import Unpack
7
9
 
8
10
  import mteb
9
11
  from mteb.abstasks.task_metadata import TaskMetadata, TaskType
@@ -18,6 +20,7 @@ from mteb.similarity_functions import (
18
20
  from mteb.types import (
19
21
  Array,
20
22
  BatchedInput,
23
+ EncodeKwargs,
21
24
  PromptType,
22
25
  )
23
26
 
@@ -43,7 +46,7 @@ class AbsEncoder(ABC):
43
46
  model: Any
44
47
  mteb_model_meta: ModelMeta | None = None
45
48
  model_prompts: dict[str, str] | None = None
46
- instruction_template: str | Callable[[str, PromptType], str] | None = None
49
+ instruction_template: str | Callable[[str, PromptType | None], str] | None = None
47
50
  prompts_dict: dict[str, str] | None = None
48
51
 
49
52
  def get_prompt_name(
@@ -54,11 +57,11 @@ class AbsEncoder(ABC):
54
57
  """A wrapper function around the model.encode method that handles the prompt_name argument and standardizes the output to a numpy array.
55
58
 
56
59
  The order of priorities for prompt selection are:
57
- 1. Composed prompt of task name + prompt type (query or passage)
60
+ 1. Composed prompt of task name + prompt type
58
61
  2. Specific task prompt
59
- 3. Composed prompt of task type + prompt type (query or passage)
62
+ 3. Composed prompt of task type + prompt type
60
63
  4. Specific task type prompt
61
- 5. Specific prompt type (query or passage)
64
+ 5. Specific prompt type
62
65
 
63
66
  Args:
64
67
  task_metadata: The task name to use for building the encoding prompt
@@ -105,12 +108,12 @@ class AbsEncoder(ABC):
105
108
 
106
109
  Args:
107
110
  task_metadata: The metadata of the task.
108
- prompt_type: The name type of prompt. (query or passage)
111
+ prompt_type: The name type of prompt.
109
112
  """
110
113
  if not self.model_prompts:
111
114
  return None
112
115
  prompt_name = self.get_prompt_name(task_metadata, prompt_type)
113
- return self.model_prompts.get(prompt_name)
116
+ return self.model_prompts.get(prompt_name) if prompt_name else None
114
117
 
115
118
  @staticmethod
116
119
  @overload
@@ -187,6 +190,7 @@ class AbsEncoder(ABC):
187
190
  except KeyError:
188
191
  msg = f"Task name {task_name} is not valid. {valid_keys_msg}"
189
192
  logger.warning(msg)
193
+ warnings.warn(msg)
190
194
  invalid_task_messages.add(msg)
191
195
  invalid_keys.add(task_key)
192
196
 
@@ -210,13 +214,11 @@ class AbsEncoder(ABC):
210
214
  task_metadata: The metadata of the task. Sentence-transformers uses this to
211
215
  determine which prompt to use from a specified dictionary.
212
216
  The order of priorities for prompt selection are:
213
- 1. Composed prompt of task name + prompt type (query or passage)
214
- 2. Specific task prompt
215
- 3. Composed prompt of task type + prompt type (query or passage)
216
- 4. Specific task type prompt
217
- 5. Specific prompt type (query or passage)
218
- 6. Default prompt from the task definition
219
- prompt_type: The name type of prompt. (query or passage)
217
+ 1. Specific task prompt
218
+ 2. Specific task type prompt
219
+ 3. Specific prompt type
220
+ 4. Default prompt from the task definition
221
+ prompt_type: The name type of prompt.
220
222
 
221
223
  Returns:
222
224
  The instruction/prompt to be used for encoding sentences.
@@ -224,13 +226,19 @@ class AbsEncoder(ABC):
224
226
  prompt = task_metadata.prompt
225
227
  if self.prompts_dict and task_metadata.name in self.prompts_dict:
226
228
  prompt = self.prompts_dict[task_metadata.name]
229
+ elif self.prompts_dict and task_metadata.type in self.prompts_dict:
230
+ prompt = self.prompts_dict[task_metadata.type]
231
+ elif (
232
+ self.prompts_dict and prompt_type and prompt_type.value in self.prompts_dict
233
+ ):
234
+ prompt = self.prompts_dict[prompt_type.value]
227
235
 
228
236
  if isinstance(prompt, dict) and prompt_type:
229
237
  if prompt.get(prompt_type.value):
230
238
  return prompt[prompt_type.value]
231
- logger.warning(
232
- f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
233
- )
239
+ msg = f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
240
+ logger.warning(msg)
241
+ warnings.warn(msg)
234
242
  return ""
235
243
 
236
244
  if prompt:
@@ -246,7 +254,7 @@ class AbsEncoder(ABC):
246
254
 
247
255
  Args:
248
256
  instruction: The instruction to be formatted.
249
- prompt_type: The name type of prompt. (query or passage)
257
+ prompt_type: The name type of prompt.
250
258
  """
251
259
  if self.instruction_template is None:
252
260
  raise ValueError(
@@ -269,7 +277,7 @@ class AbsEncoder(ABC):
269
277
 
270
278
  Args:
271
279
  task_metadata: The metadata of the task
272
- prompt_type: The name type of prompt. (query or passage)
280
+ prompt_type: The name type of prompt.
273
281
 
274
282
  Returns:
275
283
  The instruction to be used for encoding sentences.
@@ -364,7 +372,7 @@ class AbsEncoder(ABC):
364
372
  hf_split: str,
365
373
  hf_subset: str,
366
374
  prompt_type: PromptType | None = None,
367
- **kwargs: Any,
375
+ **kwargs: Unpack[EncodeKwargs],
368
376
  ) -> Array:
369
377
  """Encodes the given sentences using the encoder.
370
378
 
@@ -373,14 +381,14 @@ class AbsEncoder(ABC):
373
381
  task_metadata: The metadata of the task. Sentence-transformers uses this to
374
382
  determine which prompt to use from a specified dictionary.
375
383
  The order of priorities for prompt selection are:
376
- 1. Composed prompt of task name + prompt type (query or passage)
384
+ 1. Composed prompt of task name + prompt type
377
385
  2. Specific task prompt
378
- 3. Composed prompt of task type + prompt type (query or passage)
386
+ 3. Composed prompt of task type + prompt type
379
387
  4. Specific task type prompt
380
- 5. Specific prompt type (query or passage)
388
+ 5. Specific prompt type
381
389
  hf_split: Split of current task
382
390
  hf_subset: Subset of current task
383
- prompt_type: The name type of prompt. (query or passage)
391
+ prompt_type: The name type of prompt.
384
392
  **kwargs: Additional arguments to pass to the encoder.
385
393
 
386
394
  Returns:
@@ -1,3 +1,4 @@
1
+ from .cache_backend_protocol import CacheBackendProtocol
1
2
  from .cache_wrapper import CachedEmbeddingWrapper
2
3
 
3
- __all__ = ["CachedEmbeddingWrapper"]
4
+ __all__ = ["CacheBackendProtocol", "CachedEmbeddingWrapper"]
@@ -5,8 +5,6 @@ from typing import Any, Protocol, runtime_checkable
5
5
 
6
6
  import numpy as np
7
7
 
8
- from mteb.types import BatchedInput
9
-
10
8
 
11
9
  @runtime_checkable
12
10
  class CacheBackendProtocol(Protocol):
@@ -26,7 +24,7 @@ class CacheBackendProtocol(Protocol):
26
24
  **kwargs: Additional backend-specific arguments.
27
25
  """
28
26
 
29
- def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None:
27
+ def add(self, item: list[dict[str, Any]], vectors: np.ndarray) -> None:
30
28
  """Add a vector to the cache.
31
29
 
32
30
  Args:
@@ -34,7 +32,7 @@ class CacheBackendProtocol(Protocol):
34
32
  vectors: Embedding vector of shape (dim,) or (1, dim).
35
33
  """
36
34
 
37
- def get_vector(self, item: BatchedInput) -> np.ndarray | None:
35
+ def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
38
36
  """Retrieve the cached vector for the given item.
39
37
 
40
38
  Args:
@@ -53,5 +51,5 @@ class CacheBackendProtocol(Protocol):
53
51
  def close(self) -> None:
54
52
  """Release resources or flush data."""
55
53
 
56
- def __contains__(self, item: BatchedInput) -> bool:
54
+ def __contains__(self, item: dict[str, Any]) -> bool:
57
55
  """Check whether the cache contains an item."""
@@ -1,16 +1,17 @@
1
1
  import hashlib
2
+ from collections.abc import Mapping
3
+ from typing import Any
2
4
 
3
- from PIL import Image
4
5
 
5
- from mteb.types import BatchedInput
6
-
7
-
8
- def _hash_item(item: BatchedInput) -> str:
6
+ def _hash_item(item: Mapping[str, Any]) -> str:
9
7
  item_hash = ""
10
8
  if "text" in item:
11
- item_hash = hashlib.sha256(item["text"].encode()).hexdigest()
9
+ item_text: str = item["text"]
10
+ item_hash = hashlib.sha256(item_text.encode()).hexdigest()
12
11
 
13
12
  if "image" in item:
13
+ from PIL import Image
14
+
14
15
  image: Image.Image = item["image"]
15
16
  item_hash += hashlib.sha256(image.tobytes()).hexdigest()
16
17
 
@@ -1,6 +1,8 @@
1
1
  import json
2
2
  import logging
3
+ import warnings
3
4
  from pathlib import Path
5
+ from typing import Any
4
6
 
5
7
  import numpy as np
6
8
 
@@ -36,7 +38,7 @@ class FaissCache:
36
38
  logger.info(f"Initialized FAISS VectorCacheMap in {self.directory}")
37
39
  self.load()
38
40
 
39
- def add(self, items: list[BatchedInput], vectors: np.ndarray) -> None:
41
+ def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
40
42
  """Add vector to FAISS index."""
41
43
  import faiss
42
44
 
@@ -71,7 +73,9 @@ class FaissCache:
71
73
  try:
72
74
  return self.index.reconstruct(idx)
73
75
  except Exception:
74
- logger.warning(f"Vector id {idx} missing for hash {item_hash}")
76
+ msg = f"Vector id {idx} missing for hash {item_hash}"
77
+ logger.warning(msg)
78
+ warnings.warn(msg)
75
79
  return None
76
80
 
77
81
  def save(self) -> None: