mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (529) hide show
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +78 -30
  3. mteb/_evaluators/any_sts_evaluator.py +13 -6
  4. mteb/_evaluators/clustering_evaluator.py +13 -5
  5. mteb/_evaluators/evaluator.py +12 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
  7. mteb/_evaluators/pair_classification_evaluator.py +17 -7
  8. mteb/_evaluators/retrieval_evaluator.py +23 -14
  9. mteb/_evaluators/retrieval_metrics.py +26 -19
  10. mteb/_evaluators/sklearn_evaluator.py +27 -17
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
  12. mteb/_evaluators/text/summarization_evaluator.py +31 -20
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +9 -3
  16. mteb/abstasks/_data_filter/task_pipelines.py +10 -2
  17. mteb/abstasks/_statistics_calculation.py +21 -11
  18. mteb/abstasks/_stratification.py +18 -18
  19. mteb/abstasks/abstask.py +78 -44
  20. mteb/abstasks/aggregate_task_metadata.py +21 -18
  21. mteb/abstasks/aggregated_task.py +23 -35
  22. mteb/abstasks/classification.py +39 -18
  23. mteb/abstasks/clustering.py +37 -20
  24. mteb/abstasks/clustering_legacy.py +30 -16
  25. mteb/abstasks/image/image_text_pair_classification.py +26 -9
  26. mteb/abstasks/multilabel_classification.py +33 -21
  27. mteb/abstasks/pair_classification.py +44 -19
  28. mteb/abstasks/regression.py +18 -10
  29. mteb/abstasks/retrieval.py +82 -52
  30. mteb/abstasks/retrieval_dataset_loaders.py +50 -39
  31. mteb/abstasks/sts.py +34 -15
  32. mteb/abstasks/task_metadata.py +44 -37
  33. mteb/abstasks/text/bitext_mining.py +57 -35
  34. mteb/abstasks/text/reranking.py +10 -8
  35. mteb/abstasks/text/summarization.py +26 -10
  36. mteb/abstasks/zeroshot_classification.py +27 -9
  37. mteb/benchmarks/_create_table.py +13 -7
  38. mteb/benchmarks/benchmark.py +15 -3
  39. mteb/benchmarks/benchmarks/__init__.py +6 -0
  40. mteb/benchmarks/benchmarks/benchmarks.py +153 -13
  41. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  42. mteb/benchmarks/get_benchmark.py +14 -55
  43. mteb/cache.py +189 -31
  44. mteb/cli/_display_tasks.py +10 -4
  45. mteb/cli/build_cli.py +112 -13
  46. mteb/cli/generate_model_card.py +50 -23
  47. mteb/deprecated_evaluator.py +72 -54
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  49. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  50. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  52. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  65. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  66. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  67. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  68. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  69. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  70. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  71. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  72. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  73. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  74. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  75. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  81. mteb/evaluate.py +71 -47
  82. mteb/filter_tasks.py +36 -32
  83. mteb/get_tasks.py +37 -33
  84. mteb/languages/language_scripts.py +11 -4
  85. mteb/leaderboard/app.py +172 -37
  86. mteb/leaderboard/table.py +7 -2
  87. mteb/load_results.py +20 -14
  88. mteb/models/abs_encoder.py +30 -16
  89. mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
  90. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
  91. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
  92. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  93. mteb/models/cache_wrappers/cache_wrapper.py +16 -11
  94. mteb/models/get_model_meta.py +53 -9
  95. mteb/models/instruct_wrapper.py +41 -13
  96. mteb/models/model_implementations/align_models.py +11 -5
  97. mteb/models/model_implementations/amazon_models.py +1 -0
  98. mteb/models/model_implementations/andersborges.py +6 -4
  99. mteb/models/model_implementations/ara_models.py +2 -1
  100. mteb/models/model_implementations/arctic_models.py +16 -8
  101. mteb/models/model_implementations/b1ade_models.py +2 -1
  102. mteb/models/model_implementations/bedrock_models.py +20 -6
  103. mteb/models/model_implementations/bge_models.py +85 -22
  104. mteb/models/model_implementations/bica_model.py +4 -3
  105. mteb/models/model_implementations/blip2_models.py +13 -6
  106. mteb/models/model_implementations/blip_models.py +33 -20
  107. mteb/models/model_implementations/bm25.py +27 -17
  108. mteb/models/model_implementations/bmretriever_models.py +16 -6
  109. mteb/models/model_implementations/cadet_models.py +2 -1
  110. mteb/models/model_implementations/cde_models.py +22 -9
  111. mteb/models/model_implementations/clip_models.py +18 -10
  112. mteb/models/model_implementations/clips_models.py +6 -3
  113. mteb/models/model_implementations/codefuse_models.py +10 -5
  114. mteb/models/model_implementations/codesage_models.py +6 -3
  115. mteb/models/model_implementations/cohere_models.py +19 -9
  116. mteb/models/model_implementations/cohere_v.py +16 -6
  117. mteb/models/model_implementations/colpali_models.py +10 -6
  118. mteb/models/model_implementations/colqwen_models.py +24 -38
  119. mteb/models/model_implementations/colsmol_models.py +5 -3
  120. mteb/models/model_implementations/conan_models.py +12 -5
  121. mteb/models/model_implementations/dino_models.py +70 -46
  122. mteb/models/model_implementations/e5_instruct.py +27 -4
  123. mteb/models/model_implementations/e5_models.py +18 -9
  124. mteb/models/model_implementations/e5_v.py +16 -10
  125. mteb/models/model_implementations/eagerworks_models.py +12 -5
  126. mteb/models/model_implementations/emillykkejensen_models.py +9 -6
  127. mteb/models/model_implementations/en_code_retriever.py +2 -1
  128. mteb/models/model_implementations/euler_models.py +3 -2
  129. mteb/models/model_implementations/evaclip_models.py +13 -4
  130. mteb/models/model_implementations/fa_models.py +18 -9
  131. mteb/models/model_implementations/facebookai.py +16 -2
  132. mteb/models/model_implementations/geogpt_models.py +2 -1
  133. mteb/models/model_implementations/gme_v_models.py +13 -8
  134. mteb/models/model_implementations/google_models.py +16 -5
  135. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
  136. mteb/models/model_implementations/gritlm_models.py +5 -2
  137. mteb/models/model_implementations/gte_models.py +34 -13
  138. mteb/models/model_implementations/hinvec_models.py +7 -2
  139. mteb/models/model_implementations/human.py +1 -0
  140. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  141. mteb/models/model_implementations/inf_models.py +4 -2
  142. mteb/models/model_implementations/jasper_models.py +16 -7
  143. mteb/models/model_implementations/jina_clip.py +58 -14
  144. mteb/models/model_implementations/jina_models.py +35 -16
  145. mteb/models/model_implementations/kalm_models.py +24 -12
  146. mteb/models/model_implementations/kblab.py +13 -6
  147. mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
  148. mteb/models/model_implementations/kfst.py +2 -1
  149. mteb/models/model_implementations/kowshik24_models.py +2 -1
  150. mteb/models/model_implementations/lens_models.py +2 -0
  151. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  152. mteb/models/model_implementations/linq_models.py +8 -2
  153. mteb/models/model_implementations/listconranker.py +11 -5
  154. mteb/models/model_implementations/llm2clip_models.py +18 -10
  155. mteb/models/model_implementations/llm2vec_models.py +28 -14
  156. mteb/models/model_implementations/mcinext_models.py +12 -3
  157. mteb/models/model_implementations/mdbr_models.py +19 -3
  158. mteb/models/model_implementations/misc_models.py +131 -68
  159. mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
  160. mteb/models/model_implementations/mme5_models.py +3 -2
  161. mteb/models/model_implementations/moco_models.py +15 -8
  162. mteb/models/model_implementations/mod_models.py +3 -2
  163. mteb/models/model_implementations/model2vec_models.py +37 -18
  164. mteb/models/model_implementations/moka_models.py +4 -1
  165. mteb/models/model_implementations/nbailab.py +6 -3
  166. mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
  167. mteb/models/model_implementations/nomic_models.py +47 -19
  168. mteb/models/model_implementations/nomic_models_vision.py +6 -4
  169. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
  170. mteb/models/model_implementations/nvidia_models.py +165 -22
  171. mteb/models/model_implementations/octen_models.py +64 -3
  172. mteb/models/model_implementations/openai_models.py +14 -4
  173. mteb/models/model_implementations/openclip_models.py +30 -17
  174. mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
  175. mteb/models/model_implementations/ops_moa_models.py +10 -3
  176. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
  177. mteb/models/model_implementations/pawan_models.py +2 -1
  178. mteb/models/model_implementations/piccolo_models.py +3 -1
  179. mteb/models/model_implementations/pixie_models.py +56 -0
  180. mteb/models/model_implementations/promptriever_models.py +20 -10
  181. mteb/models/model_implementations/pylate_models.py +41 -21
  182. mteb/models/model_implementations/qodo_models.py +4 -2
  183. mteb/models/model_implementations/qtack_models.py +2 -1
  184. mteb/models/model_implementations/qwen3_models.py +14 -4
  185. mteb/models/model_implementations/qzhou_models.py +4 -2
  186. mteb/models/model_implementations/random_baseline.py +7 -6
  187. mteb/models/model_implementations/rasgaard_models.py +3 -2
  188. mteb/models/model_implementations/reasonir_model.py +66 -1
  189. mteb/models/model_implementations/repllama_models.py +18 -9
  190. mteb/models/model_implementations/rerankers_custom.py +25 -10
  191. mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
  192. mteb/models/model_implementations/richinfoai_models.py +2 -1
  193. mteb/models/model_implementations/ru_sentence_models.py +40 -20
  194. mteb/models/model_implementations/ruri_models.py +20 -10
  195. mteb/models/model_implementations/salesforce_models.py +13 -4
  196. mteb/models/model_implementations/samilpwc_models.py +2 -1
  197. mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
  198. mteb/models/model_implementations/searchmap_models.py +2 -1
  199. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  200. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
  201. mteb/models/model_implementations/seed_models.py +2 -1
  202. mteb/models/model_implementations/sentence_transformers_models.py +142 -22
  203. mteb/models/model_implementations/shuu_model.py +2 -1
  204. mteb/models/model_implementations/siglip_models.py +39 -24
  205. mteb/models/model_implementations/slm_models.py +419 -0
  206. mteb/models/model_implementations/sonar_models.py +2 -1
  207. mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
  208. mteb/models/model_implementations/stella_models.py +23 -4
  209. mteb/models/model_implementations/tarka_models.py +4 -2
  210. mteb/models/model_implementations/text2vec_models.py +12 -3
  211. mteb/models/model_implementations/ua_sentence_models.py +2 -1
  212. mteb/models/model_implementations/uae_models.py +17 -5
  213. mteb/models/model_implementations/vdr_models.py +9 -2
  214. mteb/models/model_implementations/vi_vn_models.py +12 -6
  215. mteb/models/model_implementations/vista_models.py +11 -4
  216. mteb/models/model_implementations/vlm2vec_models.py +14 -7
  217. mteb/models/model_implementations/voyage_models.py +136 -4
  218. mteb/models/model_implementations/voyage_v.py +17 -10
  219. mteb/models/model_implementations/xyz_models.py +1 -0
  220. mteb/models/model_implementations/youtu_models.py +2 -1
  221. mteb/models/model_implementations/yuan_models.py +2 -1
  222. mteb/models/model_implementations/yuan_models_en.py +3 -2
  223. mteb/models/model_meta.py +127 -40
  224. mteb/models/models_protocols.py +43 -22
  225. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  226. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
  227. mteb/models/search_wrappers.py +63 -29
  228. mteb/models/sentence_transformer_wrapper.py +52 -26
  229. mteb/models/vllm_wrapper.py +329 -0
  230. mteb/py.typed +0 -0
  231. mteb/results/benchmark_results.py +48 -35
  232. mteb/results/model_result.py +68 -32
  233. mteb/results/task_result.py +110 -72
  234. mteb/similarity_functions.py +19 -9
  235. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  236. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  237. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  238. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  239. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  240. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  241. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  242. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  243. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  244. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  245. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  246. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  247. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  248. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  249. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  251. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  252. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  253. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  256. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  257. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  258. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  259. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  260. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  261. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  262. mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
  263. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  264. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  265. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  266. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  267. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  268. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  269. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  270. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  271. mteb/tasks/classification/est/estonian_valence.py +2 -2
  272. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  273. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  274. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  275. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  276. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  277. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  278. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  279. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  280. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  281. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  282. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  283. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  284. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  285. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  286. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  287. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  288. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  289. mteb/tasks/classification/kor/klue_tc.py +2 -2
  290. mteb/tasks/classification/kor/kor_fin.py +1 -1
  291. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  292. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  293. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  294. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  295. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  296. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  297. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  298. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  299. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  300. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  301. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  302. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  303. mteb/tasks/classification/multilingual/scala_classification.py +2 -2
  304. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  305. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  306. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  307. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  308. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  309. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  310. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  311. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  312. mteb/tasks/classification/ron/moroco.py +1 -1
  313. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  314. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  315. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  316. mteb/tasks/classification/rus/headline_classification.py +2 -2
  317. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  318. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  319. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  320. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  321. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  322. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  323. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  324. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  325. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  326. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  327. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  328. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  329. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  330. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  331. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  332. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  333. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  334. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  335. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  336. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  337. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  338. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  341. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  342. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  343. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  344. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  345. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  346. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  347. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  348. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  349. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  350. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  351. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  352. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  353. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  354. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  355. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  356. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  357. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  358. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  359. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  360. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  361. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  362. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  363. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  364. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  365. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  366. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  367. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  368. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  369. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  370. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  371. mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
  372. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  373. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
  374. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  375. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  376. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  377. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  378. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  379. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  380. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  381. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  382. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  383. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  384. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  385. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  386. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  387. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  388. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  389. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  390. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  391. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  392. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  393. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  394. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  395. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  396. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  397. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  398. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  399. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  400. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  401. mteb/tasks/pair_classification/rus/terra.py +2 -2
  402. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  403. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  404. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  405. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  406. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  407. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  408. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  409. mteb/tasks/retrieval/code/code_rag.py +16 -16
  410. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  411. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  412. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  413. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  414. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  415. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  416. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  417. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
  418. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
  419. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
  420. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  421. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  422. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  423. mteb/tasks/retrieval/eng/__init__.py +44 -0
  424. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  425. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  426. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  428. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  435. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  438. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  439. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  440. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  441. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  442. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  443. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  445. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  446. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  447. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  448. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  449. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  450. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  451. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  452. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  453. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  454. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  455. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  456. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  457. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  458. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  459. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  460. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  461. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  462. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  463. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  464. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  465. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  466. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  467. mteb/tasks/retrieval/kor/__init__.py +15 -1
  468. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  469. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  470. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  471. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  472. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  473. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  474. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  475. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  476. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  477. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  478. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  479. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  480. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  481. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  482. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  483. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  484. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  485. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  486. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  487. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  488. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  489. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  490. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  491. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  492. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  493. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  494. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  495. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  496. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  497. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  498. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  499. mteb/tasks/retrieval/nob/norquad.py +3 -3
  500. mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
  501. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  502. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  503. mteb/tasks/retrieval/vie/__init__.py +14 -6
  504. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  505. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  506. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  507. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  508. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  509. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  510. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  511. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  512. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  513. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  514. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  515. mteb/tasks/sts/kor/klue_sts.py +1 -1
  516. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  517. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  518. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  519. mteb/types/__init__.py +2 -0
  520. mteb/types/_encoder_io.py +13 -1
  521. mteb/types/_result.py +2 -1
  522. mteb/types/statistics.py +18 -5
  523. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
  524. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
  525. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  526. mteb/models/model_implementations/mxbai_models.py +0 -111
  527. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  528. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  529. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,37 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  import warnings
3
- from collections.abc import Callable, Iterable, Sequence
4
- from typing import Any, Literal
5
+ from typing import TYPE_CHECKING, Any, Literal, cast
5
6
 
6
7
  import numpy as np
7
8
  import pandas as pd
8
9
  from pydantic import BaseModel, ConfigDict, Field
9
- from typing_extensions import Self
10
+ from typing_extensions import overload
10
11
 
11
- from mteb.abstasks.abstask import AbsTask
12
- from mteb.abstasks.task_metadata import (
13
- TaskDomain,
14
- TaskType,
15
- )
16
12
  from mteb.types import (
17
- ISOLanguage,
18
- ISOLanguageScript,
19
13
  Modalities,
20
- Score,
21
- ScoresDict,
22
- SplitName,
23
14
  )
24
15
 
25
16
  from .task_result import TaskError, TaskResult
26
17
 
18
+ if TYPE_CHECKING:
19
+ from collections.abc import Callable, Iterable
20
+
21
+ from mteb.abstasks.abstask import AbsTask
22
+ from mteb.abstasks.task_metadata import (
23
+ TaskDomain,
24
+ TaskType,
25
+ )
26
+ from mteb.types import (
27
+ ISOLanguage,
28
+ ISOLanguageScript,
29
+ Score,
30
+ ScoresDict,
31
+ SplitName,
32
+ )
33
+
34
+
27
35
  logger = logging.getLogger(__name__)
28
36
 
29
37
 
@@ -58,7 +66,7 @@ def _aggregate_and_pivot(
58
66
  index=index_columns,
59
67
  columns=columns,
60
68
  values="score",
61
- aggfunc=aggregation_fn,
69
+ aggfunc=aggregation_fn, # type: ignore[arg-type]
62
70
  ).reset_index()
63
71
  elif format == "long":
64
72
  return (
@@ -81,7 +89,7 @@ class ModelResult(BaseModel):
81
89
  model_revision: str | None
82
90
  task_results: list[TaskResult]
83
91
  default_modalities: list[Modalities] = Field(
84
- default_factory=lambda: ["text"], alias="modalities"
92
+ default_factory=lambda: [cast("Modalities", "text")], alias="modalities"
85
93
  )
86
94
  model_config = (
87
95
  ConfigDict( # to free up the name model_* which is otherwise protected
@@ -95,16 +103,17 @@ class ModelResult(BaseModel):
95
103
  return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))"
96
104
 
97
105
  @classmethod
98
- def from_validated(cls, **data: dict[str, Any]) -> Self:
106
+ def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
99
107
  """Create a ModelResult from validated data.
100
108
 
101
109
  Args:
102
110
  data: The validated data.
103
111
  """
104
- data["task_results"] = [
105
- TaskResult.from_validated(**res) for res in data["task_results"]
112
+ data["task_results"] = [ # type: ignore[assignment]
113
+ TaskResult.from_validated(**res) # type: ignore[arg-type]
114
+ for res in data["task_results"]
106
115
  ]
107
- return cls.model_construct(**data)
116
+ return cls.model_construct(**data) # type: ignore[arg-type]
108
117
 
109
118
  def _filter_tasks(
110
119
  self,
@@ -114,7 +123,7 @@ class ModelResult(BaseModel):
114
123
  task_types: list[TaskType] | None = None,
115
124
  modalities: list[Modalities] | None = None,
116
125
  is_public: bool | None = None,
117
- ) -> Self:
126
+ ) -> ModelResult:
118
127
  new_task_results = []
119
128
  for task_result in self.task_results:
120
129
  if (task_names is not None) and (task_result.task_name not in task_names):
@@ -142,7 +151,7 @@ class ModelResult(BaseModel):
142
151
  task_results=new_task_results,
143
152
  )
144
153
 
145
- def select_tasks(self, tasks: Sequence[AbsTask]) -> Self:
154
+ def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
146
155
  """Select tasks from the ModelResult based on a list of AbsTask objects.
147
156
 
148
157
  Args:
@@ -160,6 +169,28 @@ class ModelResult(BaseModel):
160
169
  task_results=new_task_results,
161
170
  )
162
171
 
172
+ @overload
173
+ def _get_scores(
174
+ self,
175
+ splits: list[SplitName] | None = None,
176
+ languages: list[ISOLanguage | ISOLanguageScript] | None = None,
177
+ scripts: list[ISOLanguageScript] | None = None,
178
+ getter: Callable[[ScoresDict], Score] | None = None,
179
+ aggregation: Callable[[list[Score]], Any] | None = None,
180
+ format: Literal["wide"] = "wide",
181
+ ) -> dict: ...
182
+
183
+ @overload
184
+ def _get_scores(
185
+ self,
186
+ splits: list[SplitName] | None = None,
187
+ languages: list[ISOLanguage | ISOLanguageScript] | None = None,
188
+ scripts: list[ISOLanguageScript] | None = None,
189
+ getter: Callable[[ScoresDict], Score] | None = None,
190
+ aggregation: Callable[[list[Score]], Any] | None = None,
191
+ format: Literal["long"] = "long",
192
+ ) -> list: ...
193
+
163
194
  def _get_scores(
164
195
  self,
165
196
  splits: list[SplitName] | None = None,
@@ -177,21 +208,24 @@ class ModelResult(BaseModel):
177
208
  aggregation = aggregation if aggregation is not None else np.mean
178
209
  else:
179
210
  use_fast = True
211
+ aggregation = cast("Callable[[list[Score]], Any]", aggregation)
212
+ getter = cast("Callable[[ScoresDict], Score]", getter)
213
+
180
214
  if format == "wide":
181
215
  scores = {}
182
216
  for res in self.task_results:
183
217
  try:
184
218
  if use_fast:
185
219
  scores[res.task_name] = res._get_score_fast(
186
- splits=splits, # type: ignore
187
- languages=languages, # type: ignore
220
+ splits=splits,
221
+ languages=languages,
188
222
  )
189
223
  else:
190
224
  scores[res.task_name] = res.get_score(
191
225
  splits=splits,
192
226
  languages=languages,
193
- aggregation=aggregation, # type: ignore
194
- getter=getter, # type: ignore
227
+ aggregation=aggregation,
228
+ getter=getter,
195
229
  scripts=scripts,
196
230
  )
197
231
  except Exception as e:
@@ -206,14 +240,14 @@ class ModelResult(BaseModel):
206
240
  if use_fast:
207
241
  score = task_res._get_score_fast(
208
242
  splits=splits,
209
- languages=languages, # type: ignore
243
+ languages=languages,
210
244
  )
211
245
  else:
212
246
  score = task_res.get_score(
213
247
  splits=splits,
214
248
  languages=languages,
215
- aggregation=aggregation, # type: ignore
216
- getter=getter, # type: ignore
249
+ aggregation=aggregation,
250
+ getter=getter,
217
251
  scripts=scripts,
218
252
  )
219
253
  entry = dict(
@@ -292,7 +326,9 @@ class ModelResult(BaseModel):
292
326
  scores_data = self._get_score_for_table()
293
327
 
294
328
  if not scores_data:
295
- logger.warning("No scores data available. Returning empty DataFrame.")
329
+ msg = "No scores data available. Returning empty DataFrame."
330
+ logger.warning(msg)
331
+ warnings.warn(msg)
296
332
  return pd.DataFrame()
297
333
 
298
334
  # Create DataFrame
@@ -315,7 +351,7 @@ class ModelResult(BaseModel):
315
351
  def __hash__(self) -> int:
316
352
  return id(self)
317
353
 
318
- def __iter__(self) -> Iterable[TaskResult]:
354
+ def __iter__(self) -> Iterable[TaskResult]: # type: ignore[override]
319
355
  return iter(self.task_results)
320
356
 
321
357
  def __getitem__(self, index) -> TaskResult:
@@ -368,13 +404,13 @@ class ModelResult(BaseModel):
368
404
  return [task_res.task_name for task_res in self.task_results]
369
405
 
370
406
  @property
371
- def modalities(self) -> list[str]:
407
+ def modalities(self) -> list[Modalities]:
372
408
  """Get all modalities in the task results.
373
409
 
374
410
  Returns:
375
411
  A list of modalities in the task results.
376
412
  """
377
- mods = []
413
+ mods: list[Modalities] = []
378
414
  for task_res in self.task_results:
379
415
  task_modalities = getattr(task_res, "modalities", [])
380
416
  mods.extend(task_modalities)
@@ -2,33 +2,42 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  import logging
5
- from argparse import Namespace
5
+ import warnings
6
6
  from collections import defaultdict
7
- from collections.abc import Callable, Iterable
8
7
  from functools import cached_property
9
8
  from importlib.metadata import version
10
- from pathlib import Path
11
- from typing import Any
9
+ from typing import TYPE_CHECKING, Any
12
10
 
13
11
  import numpy as np
14
12
  from huggingface_hub import EvalResult
15
13
  from packaging.version import Version
16
14
  from pydantic import BaseModel, field_validator
17
- from typing_extensions import Self
18
15
 
16
+ from mteb import TaskMetadata
19
17
  from mteb._helpful_enum import HelpfulStrEnum
18
+ from mteb.abstasks import AbsTaskClassification
20
19
  from mteb.abstasks.abstask import AbsTask
21
20
  from mteb.languages import LanguageScripts
22
21
  from mteb.models.model_meta import ScoringFunction
23
22
  from mteb.types import (
24
- HFSubset,
25
- ISOLanguage,
26
- ISOLanguageScript,
27
- Score,
28
23
  ScoresDict,
29
24
  SplitName,
30
25
  )
31
26
 
27
+ if TYPE_CHECKING:
28
+ from collections.abc import Callable, Iterable, Mapping
29
+ from pathlib import Path
30
+
31
+ from typing_extensions import Self
32
+
33
+ from mteb.abstasks.task_metadata import TaskDomain
34
+ from mteb.types import (
35
+ HFSubset,
36
+ ISOLanguage,
37
+ ISOLanguageScript,
38
+ Score,
39
+ )
40
+
32
41
  logger = logging.getLogger(__name__)
33
42
 
34
43
 
@@ -39,67 +48,59 @@ class Criteria(HelpfulStrEnum):
39
48
  DATASET_REVISION = "dataset_revision"
40
49
 
41
50
 
42
- class ScalaNbClassificationDummy:
51
+ class ScalaNbClassificationDummy(AbsTaskClassification):
43
52
  """A dummy task for loading historic results from before v1.11.0"""
44
53
 
45
- metadata = Namespace( # type: ignore
54
+ metadata = TaskMetadata(
46
55
  name="ScalaNbClassification",
56
+ description="A dummy",
47
57
  main_score="accuracy",
48
58
  type="Classification",
49
- hf_subsets_to_langscripts={
50
- "default": ["nob-Latn"],
51
- },
52
- dataset={"revision": "revision_not_applicable"},
53
- revision="revision_not_applicable",
59
+ eval_langs=["nob-Latn"],
60
+ dataset={"path": "not/exists", "revision": "revision_not_applicable"},
54
61
  )
55
62
 
56
63
 
57
- class ScalaNnClassificationDummy:
64
+ class ScalaNnClassificationDummy(AbsTaskClassification):
58
65
  """A dummy task for loading historic results from before v1.11.0"""
59
66
 
60
- metadata = Namespace( # type: ignore
67
+ metadata = TaskMetadata(
61
68
  name="ScalaNnClassification",
69
+ description="A dummy",
62
70
  main_score="accuracy",
63
71
  type="Classification",
64
- hf_subsets_to_langscripts={
65
- "default": ["nno-Latn"],
66
- },
67
- dataset={"revision": "revision_not_applicable"},
68
- revision="revision_not_applicable",
72
+ eval_langs=["nob-Latn"],
73
+ dataset={"path": "not/exists", "revision": "revision_not_applicable"},
69
74
  )
70
75
 
71
76
 
72
- class ScalaDaClassificationDummy:
77
+ class ScalaDaClassificationDummy(AbsTaskClassification):
73
78
  """A dummy task for loading historic results from before v1.11.0"""
74
79
 
75
- metadata = Namespace( # type: ignore
80
+ metadata = TaskMetadata(
76
81
  name="ScalaDaClassification",
82
+ description="A dummy",
77
83
  main_score="accuracy",
78
84
  type="Classification",
79
- hf_subsets_to_langscripts={
80
- "default": ["dan-Latn"],
81
- },
82
- dataset={"revision": "revision_not_applicable"},
83
- revision="revision_not_applicable",
85
+ eval_langs=["dan-Latn"],
86
+ dataset={"path": "not/exists", "revision": "revision_not_applicable"},
84
87
  )
85
88
 
86
89
 
87
- class ScalaSvClassificationDummy:
90
+ class ScalaSvClassificationDummy(AbsTaskClassification):
88
91
  """A dummy task for loading historic results from before v1.11.0"""
89
92
 
90
- metadata = Namespace( # type: ignore
93
+ metadata = TaskMetadata(
91
94
  name="ScalaSvClassification",
95
+ description="A dummy",
92
96
  main_score="accuracy",
93
97
  type="Classification",
94
- hf_subsets_to_langscripts={
95
- "default": ["swe-Latn"],
96
- },
97
- dataset={"revision": "revision_not_applicable"},
98
- revision="revision_not_applicable",
98
+ eval_langs=["swe-Latn"],
99
+ dataset={"path": "not/exists", "revision": "revision_not_applicable"},
99
100
  )
100
101
 
101
102
 
102
- outdated_tasks = {
103
+ outdated_tasks: dict[str, type[AbsTask]] = {
103
104
  "ScalaNbClassification": ScalaNbClassificationDummy,
104
105
  "ScalaNnClassification": ScalaNnClassificationDummy,
105
106
  "ScalaDaClassification": ScalaDaClassificationDummy,
@@ -166,10 +167,10 @@ class TaskResult(BaseModel):
166
167
  def from_task_results(
167
168
  cls,
168
169
  task: AbsTask | type[AbsTask],
169
- scores: dict[SplitName, dict[HFSubset, ScoresDict]],
170
+ scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
170
171
  evaluation_time: float,
171
172
  kg_co2_emissions: float | None = None,
172
- ) -> Self:
173
+ ) -> TaskResult:
173
174
  """Create a TaskResult from the task and scores.
174
175
 
175
176
  Args:
@@ -246,12 +247,12 @@ class TaskResult(BaseModel):
246
247
  return get_task(self.task_name)
247
248
 
248
249
  @property
249
- def domains(self) -> list[str]:
250
+ def domains(self) -> list[TaskDomain]:
250
251
  """Get the domains of the task."""
251
252
  doms = self.task.metadata.domains
252
253
  if doms is None:
253
254
  doms = []
254
- return doms # type: ignore
255
+ return doms
255
256
 
256
257
  @property
257
258
  def task_type(self) -> str:
@@ -307,7 +308,7 @@ class TaskResult(BaseModel):
307
308
  if isinstance(v, dict):
308
309
  self._round_scores(v, n)
309
310
  elif isinstance(v, float):
310
- value[i] = round(v, n)
311
+ value[i] = round(v, n) # type: ignore[call-overload]
311
312
 
312
313
  elif isinstance(value, float):
313
314
  scores[key] = round(value, n)
@@ -325,7 +326,7 @@ class TaskResult(BaseModel):
325
326
  json.dump(json_obj, f, indent=2)
326
327
 
327
328
  @classmethod
328
- def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self: # type: ignore
329
+ def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
329
330
  """Load TaskResult from disk.
330
331
 
331
332
  Args:
@@ -356,7 +357,7 @@ class TaskResult(BaseModel):
356
357
  ) # assume it is before 1.11.0 if the version is not present
357
358
 
358
359
  try:
359
- obj = cls.model_validate(data)
360
+ obj: TaskResult = cls.model_validate(data)
360
361
  except Exception as e:
361
362
  if not pre_1_11_load:
362
363
  raise e
@@ -381,6 +382,7 @@ class TaskResult(BaseModel):
381
382
  from mteb import get_task
382
383
 
383
384
  task_name = obj.task_name
385
+ task: AbsTask | type[AbsTask]
384
386
  if task_name in outdated_tasks:
385
387
  task = outdated_tasks[task_name]
386
388
  else:
@@ -393,11 +395,11 @@ class TaskResult(BaseModel):
393
395
  for key in list(hf_subset_scores.keys()):
394
396
  if isinstance(hf_subset_scores[key], dict):
395
397
  for k, v in hf_subset_scores[key].items():
396
- hf_subset_scores[f"{key}_{k}"] = v
397
- hf_subset_scores.pop(key)
398
+ hf_subset_scores[f"{key}_{k}"] = v # type: ignore[index]
399
+ hf_subset_scores.pop(key) # type: ignore[attr-defined]
398
400
 
399
401
  @classmethod
400
- def _convert_from_before_v1_11_0(cls, data: dict) -> Self:
402
+ def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
401
403
  from mteb.get_tasks import _TASKS_REGISTRY
402
404
 
403
405
  # in case the task name is not found in the registry, try to find a lower case version
@@ -462,7 +464,9 @@ class TaskResult(BaseModel):
462
464
  if main_score in hf_subset_scores:
463
465
  hf_subset_scores["main_score"] = hf_subset_scores[main_score]
464
466
  else:
465
- logger.warning(f"Main score {main_score} not found in scores")
467
+ msg = f"Main score {main_score} not found in scores"
468
+ logger.warning(msg)
469
+ warnings.warn(msg)
466
470
  hf_subset_scores["main_score"] = None
467
471
 
468
472
  # specific fixes:
@@ -481,7 +485,7 @@ class TaskResult(BaseModel):
481
485
  scores["test"]["fra-fra"] = scores["test"].pop("fr")
482
486
 
483
487
  result: TaskResult = TaskResult.from_task_results(
484
- task, # type: ignore
488
+ task,
485
489
  scores,
486
490
  evaluation_time,
487
491
  kg_co2_emissions=None,
@@ -532,7 +536,7 @@ class TaskResult(BaseModel):
532
536
  def _get_score_fast(
533
537
  self,
534
538
  splits: Iterable[str] | None = None,
535
- languages: str | None = None,
539
+ languages: list[ISOLanguage | ISOLanguageScript] | None = None,
536
540
  subsets: Iterable[str] | None = None,
537
541
  ) -> float:
538
542
  """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.
@@ -581,7 +585,7 @@ class TaskResult(BaseModel):
581
585
  return val_sum / n_val
582
586
 
583
587
  @classmethod
584
- def from_validated(cls, **data) -> Self:
588
+ def from_validated(cls, **data) -> TaskResult:
585
589
  """Create a TaskResult from validated data.
586
590
 
587
591
  Returns:
@@ -592,13 +596,13 @@ class TaskResult(BaseModel):
592
596
  def __repr__(self) -> str:
593
597
  return f"TaskResult(task_name={self.task_name}, scores=...)"
594
598
 
595
- def only_main_score(self) -> Self:
599
+ def only_main_score(self) -> TaskResult:
596
600
  """Return a new TaskResult object with only the main score.
597
601
 
598
602
  Returns:
599
603
  A new TaskResult object with only the main score.
600
604
  """
601
- new_scores = {}
605
+ new_scores: dict[str, list[Score]] = {}
602
606
  for split in self.scores:
603
607
  new_scores[split] = []
604
608
  for subset_scores in self.scores[split]:
@@ -610,10 +614,12 @@ class TaskResult(BaseModel):
610
614
  }
611
615
  )
612
616
  new_res = {**self.to_dict(), "scores": new_scores}
613
- new_res = TaskResult.from_validated(**new_res)
614
- return new_res
617
+ return TaskResult.from_validated(**new_res)
615
618
 
616
- def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self:
619
+ def validate_and_filter_scores(
620
+ self,
621
+ task: AbsTask | None = None,
622
+ ) -> TaskResult:
617
623
  """Validate and filter the scores against the task metadata.
618
624
 
619
625
  This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata.
@@ -635,22 +641,32 @@ class TaskResult(BaseModel):
635
641
  splits = task.eval_splits
636
642
  hf_subsets = set(task.hf_subsets) # Convert to set once
637
643
 
638
- new_scores = {}
644
+ new_scores: dict[str, list[Score]] = {}
639
645
  seen_splits = set()
640
646
  for split in self.scores:
641
647
  if split not in splits:
642
648
  continue
643
649
  seen_subsets = set()
644
- # Use list comprehension for better performance
645
- new_scores[split] = [
646
- _scores
647
- for _scores in self.scores[split]
648
- if _scores["hf_subset"] in hf_subsets
649
- ]
650
+ if task.is_aggregate:
651
+ # aggregate tasks only have the default subset, but in metadata can be multiple
652
+ new_scores[split] = [
653
+ _scores
654
+ for _scores in self.scores[split]
655
+ if _scores["hf_subset"] == "default"
656
+ ]
657
+ seen_subsets = {"default"}
658
+ else:
659
+ new_scores[split] = [
660
+ _scores
661
+ for _scores in self.scores[split]
662
+ if _scores["hf_subset"] in hf_subsets
663
+ ]
650
664
  for _scores in new_scores[split]:
651
665
  seen_subsets.add(_scores["hf_subset"])
652
666
 
653
- if seen_subsets != hf_subsets:
667
+ if seen_subsets != hf_subsets and not (
668
+ task.is_aggregate and "default" in seen_subsets
669
+ ):
654
670
  missing_subsets = hf_subsets - seen_subsets
655
671
  if len(missing_subsets) > 2:
656
672
  subset1, subset2 = list(missing_subsets)[:2]
@@ -658,14 +674,36 @@ class TaskResult(BaseModel):
658
674
  else:
659
675
  missing_subsets_str = str(missing_subsets)
660
676
 
661
- logger.warning(
662
- f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
663
- )
677
+ msg = f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
678
+ logger.warning(msg)
679
+ warnings.warn(msg)
680
+ for missing_subset in missing_subsets:
681
+ new_scores[split].append(
682
+ {
683
+ "hf_subset": missing_subset,
684
+ "main_score": np.nan,
685
+ "languages": task.metadata.hf_subsets_to_langscripts.get(
686
+ missing_subset, []
687
+ ),
688
+ }
689
+ )
664
690
  seen_splits.add(split)
665
691
  if seen_splits != set(splits):
666
- logger.warning(
667
- f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
668
- )
692
+ msg = f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
693
+ logger.warning(msg)
694
+ warnings.warn(msg)
695
+ for missing_split in set(splits) - seen_splits:
696
+ new_scores[missing_split] = []
697
+ for missing_subset in hf_subsets:
698
+ new_scores[missing_split].append(
699
+ {
700
+ "hf_subset": missing_subset,
701
+ "main_score": np.nan,
702
+ "languages": task.metadata.hf_subsets_to_langscripts.get(
703
+ missing_subset, []
704
+ ),
705
+ }
706
+ )
669
707
  data = self.model_dump()
670
708
  data["scores"] = new_scores
671
709
  return type(self).model_construct(**data)
@@ -736,7 +774,7 @@ class TaskResult(BaseModel):
736
774
  "mteb_version",
737
775
  "dataset_revision",
738
776
  ],
739
- ) -> Self:
777
+ ) -> TaskResult:
740
778
  """Merges two TaskResult objects.
741
779
 
742
780
  Args:
@@ -1,8 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
1
5
  import torch
2
6
 
3
- from mteb.models import EncoderProtocol
4
7
  from mteb.models.model_meta import ScoringFunction
5
- from mteb.types import Array
8
+
9
+ if TYPE_CHECKING:
10
+ from mteb.models import EncoderProtocol
11
+ from mteb.types import Array
6
12
 
7
13
 
8
14
  def _use_torch_compile():
@@ -186,7 +192,7 @@ def max_sim(a: Array, b: Array) -> torch.Tensor:
186
192
  b,
187
193
  )
188
194
 
189
- return scores.max(axis=-1).values.sum(axis=-1)
195
+ return scores.max(axis=-1).values.sum(axis=-1) # type: ignore[call-overload]
190
196
 
191
197
 
192
198
  # https://github.com/lightonai/pylate/blob/2d094a724866d6e15701781528368438081c0157/pylate/scores/scores.py#L67C1-L122C38
@@ -217,7 +223,7 @@ def pairwise_max_sim(
217
223
  document_embedding,
218
224
  )
219
225
 
220
- scores.append(query_document_score.max(axis=-1).values.sum())
226
+ scores.append(query_document_score.max(axis=-1).values.sum()) # type: ignore[call-overload]
221
227
 
222
228
  return torch.stack(scores, dim=0)
223
229
 
@@ -317,11 +323,15 @@ def similarity(text_embeddings: Array, input_embeddings: Array) -> Array:
317
323
  Returns:
318
324
  Matrix with similarities
319
325
  """
320
- text_embeddings = _convert_to_tensor(text_embeddings)
321
- input_embeddings = _convert_to_tensor(input_embeddings)
326
+ text_embeddings_tensor = _convert_to_tensor(text_embeddings)
327
+ input_embeddings_tensor = _convert_to_tensor(input_embeddings)
322
328
 
323
- text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
324
- input_embeddings = input_embeddings / input_embeddings.norm(dim=-1, keepdim=True)
325
- logits = torch.matmul(input_embeddings, text_embeddings.T)
329
+ text_embeddings_tensor = text_embeddings_tensor / text_embeddings_tensor.norm(
330
+ dim=-1, keepdim=True
331
+ )
332
+ input_embeddings_tensor = input_embeddings_tensor / input_embeddings_tensor.norm(
333
+ dim=-1, keepdim=True
334
+ )
335
+ logits = torch.matmul(input_embeddings_tensor, text_embeddings_tensor.T)
326
336
  probs = (logits * 100).softmax(dim=-1)
327
337
  return probs
@@ -1,5 +1,5 @@
1
- from mteb.abstasks import AbsTask
2
- from mteb.abstasks.aggregated_task import AbsTaskAggregate, AggregateTaskMetadata
1
+ from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
2
+ from mteb.abstasks.aggregated_task import AbsTaskAggregate
3
3
  from mteb.tasks.retrieval import (
4
4
  CQADupstackAndroidRetrieval,
5
5
  CQADupstackEnglishRetrieval,
@@ -15,7 +15,7 @@ from mteb.tasks.retrieval import (
15
15
  CQADupstackWordpressRetrieval,
16
16
  )
17
17
 
18
- task_list_cqa: list[AbsTask] = [
18
+ task_list_cqa = [
19
19
  CQADupstackAndroidRetrieval(),
20
20
  CQADupstackEnglishRetrieval(),
21
21
  CQADupstackGamingRetrieval(),
@@ -1,10 +1,10 @@
1
- from mteb.abstasks.abstask import AbsTask
2
- from mteb.abstasks.aggregated_task import AbsTaskAggregate, AggregateTaskMetadata
1
+ from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
2
+ from mteb.abstasks.aggregated_task import AbsTaskAggregate
3
3
  from mteb.tasks.sts.multilingual.sts17_multilingual_visual_sts import (
4
4
  STS17MultilingualVisualSTS,
5
5
  )
6
6
 
7
- task_list_sts17: list[AbsTask] = [
7
+ task_list_sts17 = [
8
8
  STS17MultilingualVisualSTS().filter_languages(
9
9
  languages=["eng"], hf_subsets=["en-en"]
10
10
  )