mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527)
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,14 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  import warnings
3
- from collections.abc import Callable, Iterable, Sequence
4
- from typing import Any, Literal
5
+ from collections.abc import Callable, Iterable
6
+ from typing import Any, Literal, cast
5
7
 
6
8
  import numpy as np
7
9
  import pandas as pd
8
10
  from pydantic import BaseModel, ConfigDict, Field
9
- from typing_extensions import Self
11
+ from typing_extensions import overload
10
12
 
11
13
  from mteb.abstasks.abstask import AbsTask
12
14
  from mteb.abstasks.task_metadata import (
@@ -22,7 +24,7 @@ from mteb.types import (
22
24
  SplitName,
23
25
  )
24
26
 
25
- from .task_result import TaskResult
27
+ from .task_result import TaskError, TaskResult
26
28
 
27
29
  logger = logging.getLogger(__name__)
28
30
 
@@ -30,7 +32,7 @@ logger = logging.getLogger(__name__)
30
32
  def _aggregate_and_pivot(
31
33
  df: pd.DataFrame,
32
34
  columns: list[str],
33
- aggregation_level: Literal["subset", "split", "task"],
35
+ aggregation_level: Literal["subset", "split", "task", "language"],
34
36
  format: Literal["wide", "long"],
35
37
  aggregation_fn: Callable[[list[Score]], Any] | None,
36
38
  ) -> pd.DataFrame:
@@ -43,6 +45,12 @@ def _aggregate_and_pivot(
43
45
  elif aggregation_level == "task":
44
46
  index_columns = ["task_name"]
45
47
 
48
+ elif aggregation_level == "language":
49
+ index_columns = ["language"]
50
+ df = df.explode("language").reset_index(
51
+ drop=True
52
+ ) # each language in its own row before aggregation
53
+
46
54
  # perform aggregation
47
55
  if aggregation_fn is None:
48
56
  aggregation_fn = np.mean
@@ -52,7 +60,7 @@ def _aggregate_and_pivot(
52
60
  index=index_columns,
53
61
  columns=columns,
54
62
  values="score",
55
- aggfunc=aggregation_fn,
63
+ aggfunc=aggregation_fn, # type: ignore[arg-type]
56
64
  ).reset_index()
57
65
  elif format == "long":
58
66
  return (
@@ -75,29 +83,31 @@ class ModelResult(BaseModel):
75
83
  model_revision: str | None
76
84
  task_results: list[TaskResult]
77
85
  default_modalities: list[Modalities] = Field(
78
- default_factory=lambda: ["text"], alias="modalities"
86
+ default_factory=lambda: [cast(Modalities, "text")], alias="modalities"
79
87
  )
80
88
  model_config = (
81
89
  ConfigDict( # to free up the name model_* which is otherwise protected
82
90
  protected_namespaces=(),
83
91
  )
84
92
  )
93
+ exceptions: list[TaskError] | None = None
85
94
 
86
95
  def __repr__(self) -> str:
87
96
  n_entries = len(self.task_results)
88
97
  return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))"
89
98
 
90
99
  @classmethod
91
- def from_validated(cls, **data: dict[str, Any]) -> Self:
100
+ def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
92
101
  """Create a ModelResult from validated data.
93
102
 
94
103
  Args:
95
104
  data: The validated data.
96
105
  """
97
- data["task_results"] = [
98
- TaskResult.from_validated(**res) for res in data["task_results"]
106
+ data["task_results"] = [ # type: ignore[assignment]
107
+ TaskResult.from_validated(**res) # type: ignore[arg-type]
108
+ for res in data["task_results"]
99
109
  ]
100
- return cls.model_construct(**data)
110
+ return cls.model_construct(**data) # type: ignore[arg-type]
101
111
 
102
112
  def _filter_tasks(
103
113
  self,
@@ -107,7 +117,7 @@ class ModelResult(BaseModel):
107
117
  task_types: list[TaskType] | None = None,
108
118
  modalities: list[Modalities] | None = None,
109
119
  is_public: bool | None = None,
110
- ) -> Self:
120
+ ) -> ModelResult:
111
121
  new_task_results = []
112
122
  for task_result in self.task_results:
113
123
  if (task_names is not None) and (task_result.task_name not in task_names):
@@ -135,7 +145,7 @@ class ModelResult(BaseModel):
135
145
  task_results=new_task_results,
136
146
  )
137
147
 
138
- def select_tasks(self, tasks: Sequence[AbsTask]) -> Self:
148
+ def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
139
149
  """Select tasks from the ModelResult based on a list of AbsTask objects.
140
150
 
141
151
  Args:
@@ -153,6 +163,28 @@ class ModelResult(BaseModel):
153
163
  task_results=new_task_results,
154
164
  )
155
165
 
166
+ @overload
167
+ def _get_scores(
168
+ self,
169
+ splits: list[SplitName] | None = None,
170
+ languages: list[ISOLanguage | ISOLanguageScript] | None = None,
171
+ scripts: list[ISOLanguageScript] | None = None,
172
+ getter: Callable[[ScoresDict], Score] | None = None,
173
+ aggregation: Callable[[list[Score]], Any] | None = None,
174
+ format: Literal["wide"] = "wide",
175
+ ) -> dict: ...
176
+
177
+ @overload
178
+ def _get_scores(
179
+ self,
180
+ splits: list[SplitName] | None = None,
181
+ languages: list[ISOLanguage | ISOLanguageScript] | None = None,
182
+ scripts: list[ISOLanguageScript] | None = None,
183
+ getter: Callable[[ScoresDict], Score] | None = None,
184
+ aggregation: Callable[[list[Score]], Any] | None = None,
185
+ format: Literal["long"] = "long",
186
+ ) -> list: ...
187
+
156
188
  def _get_scores(
157
189
  self,
158
190
  splits: list[SplitName] | None = None,
@@ -170,21 +202,24 @@ class ModelResult(BaseModel):
170
202
  aggregation = aggregation if aggregation is not None else np.mean
171
203
  else:
172
204
  use_fast = True
205
+ aggregation = cast(Callable[[list[Score]], Any], aggregation)
206
+ getter = cast(Callable[[ScoresDict], Score], getter)
207
+
173
208
  if format == "wide":
174
209
  scores = {}
175
210
  for res in self.task_results:
176
211
  try:
177
212
  if use_fast:
178
213
  scores[res.task_name] = res._get_score_fast(
179
- splits=splits, # type: ignore
180
- languages=languages, # type: ignore
214
+ splits=splits,
215
+ languages=languages,
181
216
  )
182
217
  else:
183
218
  scores[res.task_name] = res.get_score(
184
219
  splits=splits,
185
220
  languages=languages,
186
- aggregation=aggregation, # type: ignore
187
- getter=getter, # type: ignore
221
+ aggregation=aggregation,
222
+ getter=getter,
188
223
  scripts=scripts,
189
224
  )
190
225
  except Exception as e:
@@ -199,14 +234,14 @@ class ModelResult(BaseModel):
199
234
  if use_fast:
200
235
  score = task_res._get_score_fast(
201
236
  splits=splits,
202
- languages=languages, # type: ignore
237
+ languages=languages,
203
238
  )
204
239
  else:
205
240
  score = task_res.get_score(
206
241
  splits=splits,
207
242
  languages=languages,
208
- aggregation=aggregation, # type: ignore
209
- getter=getter, # type: ignore
243
+ aggregation=aggregation,
244
+ getter=getter,
210
245
  scripts=scripts,
211
246
  )
212
247
  entry = dict(
@@ -226,7 +261,7 @@ class ModelResult(BaseModel):
226
261
  )
227
262
  return entries
228
263
 
229
- def _get_score_for_table(self) -> list[dict[str, str | float]]:
264
+ def _get_score_for_table(self) -> list[dict[str, str | float | list[str]]]:
230
265
  scores_data = []
231
266
  model_name = self.model_name
232
267
  for task_result in self.task_results:
@@ -238,10 +273,10 @@ class ModelResult(BaseModel):
238
273
  "model_revision": self.model_revision,
239
274
  "task_name": task_name,
240
275
  "split": split,
276
+ "language": score_item.get("languages", ["Unknown"]),
241
277
  "subset": score_item.get("hf_subset", "default"),
242
278
  "score": score_item.get("main_score", None),
243
279
  }
244
-
245
280
  scores_data.append(row)
246
281
 
247
282
  return scores_data
@@ -285,7 +320,9 @@ class ModelResult(BaseModel):
285
320
  scores_data = self._get_score_for_table()
286
321
 
287
322
  if not scores_data:
288
- logger.warning("No scores data available. Returning empty DataFrame.")
323
+ msg = "No scores data available. Returning empty DataFrame."
324
+ logger.warning(msg)
325
+ warnings.warn(msg)
289
326
  return pd.DataFrame()
290
327
 
291
328
  # Create DataFrame
@@ -308,7 +345,7 @@ class ModelResult(BaseModel):
308
345
  def __hash__(self) -> int:
309
346
  return id(self)
310
347
 
311
- def __iter__(self) -> Iterable[TaskResult]:
348
+ def __iter__(self) -> Iterable[TaskResult]: # type: ignore[override]
312
349
  return iter(self.task_results)
313
350
 
314
351
  def __getitem__(self, index) -> TaskResult:
@@ -361,13 +398,13 @@ class ModelResult(BaseModel):
361
398
  return [task_res.task_name for task_res in self.task_results]
362
399
 
363
400
  @property
364
- def modalities(self) -> list[str]:
401
+ def modalities(self) -> list[Modalities]:
365
402
  """Get all modalities in the task results.
366
403
 
367
404
  Returns:
368
405
  A list of modalities in the task results.
369
406
  """
370
- mods = []
407
+ mods: list[Modalities] = []
371
408
  for task_res in self.task_results:
372
409
  task_modalities = getattr(task_res, "modalities", [])
373
410
  mods.extend(task_modalities)
@@ -2,9 +2,9 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  import logging
5
- from argparse import Namespace
5
+ import warnings
6
6
  from collections import defaultdict
7
- from collections.abc import Callable, Iterable
7
+ from collections.abc import Callable, Iterable, Mapping
8
8
  from functools import cached_property
9
9
  from importlib.metadata import version
10
10
  from pathlib import Path
@@ -16,8 +16,11 @@ from packaging.version import Version
16
16
  from pydantic import BaseModel, field_validator
17
17
  from typing_extensions import Self
18
18
 
19
+ from mteb import TaskMetadata
19
20
  from mteb._helpful_enum import HelpfulStrEnum
21
+ from mteb.abstasks import AbsTaskClassification
20
22
  from mteb.abstasks.abstask import AbsTask
23
+ from mteb.abstasks.task_metadata import TaskDomain
21
24
  from mteb.languages import LanguageScripts
22
25
  from mteb.models.model_meta import ScoringFunction
23
26
  from mteb.types import (
@@ -39,67 +42,59 @@ class Criteria(HelpfulStrEnum):
39
42
  DATASET_REVISION = "dataset_revision"
40
43
 
41
44
 
42
- class ScalaNbClassificationDummy:
45
+ class ScalaNbClassificationDummy(AbsTaskClassification):
43
46
  """A dummy task for loading historic results from before v1.11.0"""
44
47
 
45
- metadata = Namespace( # type: ignore
48
+ metadata = TaskMetadata(
46
49
  name="ScalaNbClassification",
50
+ description="A dummy",
47
51
  main_score="accuracy",
48
52
  type="Classification",
49
- hf_subsets_to_langscripts={
50
- "default": ["nob-Latn"],
51
- },
52
- dataset={"revision": "revision_not_applicable"},
53
- revision="revision_not_applicable",
53
+ eval_langs=["nob-Latn"],
54
+ dataset={"path": "not/exists", "revision": "revision_not_applicable"},
54
55
  )
55
56
 
56
57
 
57
- class ScalaNnClassificationDummy:
58
+ class ScalaNnClassificationDummy(AbsTaskClassification):
58
59
  """A dummy task for loading historic results from before v1.11.0"""
59
60
 
60
- metadata = Namespace( # type: ignore
61
+ metadata = TaskMetadata(
61
62
  name="ScalaNnClassification",
63
+ description="A dummy",
62
64
  main_score="accuracy",
63
65
  type="Classification",
64
- hf_subsets_to_langscripts={
65
- "default": ["nno-Latn"],
66
- },
67
- dataset={"revision": "revision_not_applicable"},
68
- revision="revision_not_applicable",
66
+ eval_langs=["nob-Latn"],
67
+ dataset={"path": "not/exists", "revision": "revision_not_applicable"},
69
68
  )
70
69
 
71
70
 
72
- class ScalaDaClassificationDummy:
71
+ class ScalaDaClassificationDummy(AbsTaskClassification):
73
72
  """A dummy task for loading historic results from before v1.11.0"""
74
73
 
75
- metadata = Namespace( # type: ignore
74
+ metadata = TaskMetadata(
76
75
  name="ScalaDaClassification",
76
+ description="A dummy",
77
77
  main_score="accuracy",
78
78
  type="Classification",
79
- hf_subsets_to_langscripts={
80
- "default": ["dan-Latn"],
81
- },
82
- dataset={"revision": "revision_not_applicable"},
83
- revision="revision_not_applicable",
79
+ eval_langs=["dan-Latn"],
80
+ dataset={"path": "not/exists", "revision": "revision_not_applicable"},
84
81
  )
85
82
 
86
83
 
87
- class ScalaSvClassificationDummy:
84
+ class ScalaSvClassificationDummy(AbsTaskClassification):
88
85
  """A dummy task for loading historic results from before v1.11.0"""
89
86
 
90
- metadata = Namespace( # type: ignore
87
+ metadata = TaskMetadata(
91
88
  name="ScalaSvClassification",
89
+ description="A dummy",
92
90
  main_score="accuracy",
93
91
  type="Classification",
94
- hf_subsets_to_langscripts={
95
- "default": ["swe-Latn"],
96
- },
97
- dataset={"revision": "revision_not_applicable"},
98
- revision="revision_not_applicable",
92
+ eval_langs=["swe-Latn"],
93
+ dataset={"path": "not/exists", "revision": "revision_not_applicable"},
99
94
  )
100
95
 
101
96
 
102
- outdated_tasks = {
97
+ outdated_tasks: dict[str, type[AbsTask]] = {
103
98
  "ScalaNbClassification": ScalaNbClassificationDummy,
104
99
  "ScalaNnClassification": ScalaNnClassificationDummy,
105
100
  "ScalaDaClassification": ScalaDaClassificationDummy,
@@ -166,10 +161,10 @@ class TaskResult(BaseModel):
166
161
  def from_task_results(
167
162
  cls,
168
163
  task: AbsTask | type[AbsTask],
169
- scores: dict[SplitName, dict[HFSubset, ScoresDict]],
164
+ scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
170
165
  evaluation_time: float,
171
166
  kg_co2_emissions: float | None = None,
172
- ) -> Self:
167
+ ) -> TaskResult:
173
168
  """Create a TaskResult from the task and scores.
174
169
 
175
170
  Args:
@@ -246,12 +241,12 @@ class TaskResult(BaseModel):
246
241
  return get_task(self.task_name)
247
242
 
248
243
  @property
249
- def domains(self) -> list[str]:
244
+ def domains(self) -> list[TaskDomain]:
250
245
  """Get the domains of the task."""
251
246
  doms = self.task.metadata.domains
252
247
  if doms is None:
253
248
  doms = []
254
- return doms # type: ignore
249
+ return doms
255
250
 
256
251
  @property
257
252
  def task_type(self) -> str:
@@ -307,7 +302,7 @@ class TaskResult(BaseModel):
307
302
  if isinstance(v, dict):
308
303
  self._round_scores(v, n)
309
304
  elif isinstance(v, float):
310
- value[i] = round(v, n)
305
+ value[i] = round(v, n) # type: ignore[call-overload]
311
306
 
312
307
  elif isinstance(value, float):
313
308
  scores[key] = round(value, n)
@@ -325,7 +320,7 @@ class TaskResult(BaseModel):
325
320
  json.dump(json_obj, f, indent=2)
326
321
 
327
322
  @classmethod
328
- def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self: # type: ignore
323
+ def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
329
324
  """Load TaskResult from disk.
330
325
 
331
326
  Args:
@@ -356,7 +351,7 @@ class TaskResult(BaseModel):
356
351
  ) # assume it is before 1.11.0 if the version is not present
357
352
 
358
353
  try:
359
- obj = cls.model_validate(data)
354
+ obj: TaskResult = cls.model_validate(data)
360
355
  except Exception as e:
361
356
  if not pre_1_11_load:
362
357
  raise e
@@ -381,6 +376,7 @@ class TaskResult(BaseModel):
381
376
  from mteb import get_task
382
377
 
383
378
  task_name = obj.task_name
379
+ task: AbsTask | type[AbsTask]
384
380
  if task_name in outdated_tasks:
385
381
  task = outdated_tasks[task_name]
386
382
  else:
@@ -393,11 +389,11 @@ class TaskResult(BaseModel):
393
389
  for key in list(hf_subset_scores.keys()):
394
390
  if isinstance(hf_subset_scores[key], dict):
395
391
  for k, v in hf_subset_scores[key].items():
396
- hf_subset_scores[f"{key}_{k}"] = v
397
- hf_subset_scores.pop(key)
392
+ hf_subset_scores[f"{key}_{k}"] = v # type: ignore[index]
393
+ hf_subset_scores.pop(key) # type: ignore[attr-defined]
398
394
 
399
395
  @classmethod
400
- def _convert_from_before_v1_11_0(cls, data: dict) -> Self:
396
+ def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
401
397
  from mteb.get_tasks import _TASKS_REGISTRY
402
398
 
403
399
  # in case the task name is not found in the registry, try to find a lower case version
@@ -462,7 +458,9 @@ class TaskResult(BaseModel):
462
458
  if main_score in hf_subset_scores:
463
459
  hf_subset_scores["main_score"] = hf_subset_scores[main_score]
464
460
  else:
465
- logger.warning(f"Main score {main_score} not found in scores")
461
+ msg = f"Main score {main_score} not found in scores"
462
+ logger.warning(msg)
463
+ warnings.warn(msg)
466
464
  hf_subset_scores["main_score"] = None
467
465
 
468
466
  # specific fixes:
@@ -481,7 +479,7 @@ class TaskResult(BaseModel):
481
479
  scores["test"]["fra-fra"] = scores["test"].pop("fr")
482
480
 
483
481
  result: TaskResult = TaskResult.from_task_results(
484
- task, # type: ignore
482
+ task,
485
483
  scores,
486
484
  evaluation_time,
487
485
  kg_co2_emissions=None,
@@ -532,7 +530,7 @@ class TaskResult(BaseModel):
532
530
  def _get_score_fast(
533
531
  self,
534
532
  splits: Iterable[str] | None = None,
535
- languages: str | None = None,
533
+ languages: list[ISOLanguage | ISOLanguageScript] | None = None,
536
534
  subsets: Iterable[str] | None = None,
537
535
  ) -> float:
538
536
  """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.
@@ -581,7 +579,7 @@ class TaskResult(BaseModel):
581
579
  return val_sum / n_val
582
580
 
583
581
  @classmethod
584
- def from_validated(cls, **data) -> Self:
582
+ def from_validated(cls, **data) -> TaskResult:
585
583
  """Create a TaskResult from validated data.
586
584
 
587
585
  Returns:
@@ -592,13 +590,13 @@ class TaskResult(BaseModel):
592
590
  def __repr__(self) -> str:
593
591
  return f"TaskResult(task_name={self.task_name}, scores=...)"
594
592
 
595
- def only_main_score(self) -> Self:
593
+ def only_main_score(self) -> TaskResult:
596
594
  """Return a new TaskResult object with only the main score.
597
595
 
598
596
  Returns:
599
597
  A new TaskResult object with only the main score.
600
598
  """
601
- new_scores = {}
599
+ new_scores: dict[str, list[Score]] = {}
602
600
  for split in self.scores:
603
601
  new_scores[split] = []
604
602
  for subset_scores in self.scores[split]:
@@ -610,10 +608,12 @@ class TaskResult(BaseModel):
610
608
  }
611
609
  )
612
610
  new_res = {**self.to_dict(), "scores": new_scores}
613
- new_res = TaskResult.from_validated(**new_res)
614
- return new_res
611
+ return TaskResult.from_validated(**new_res)
615
612
 
616
- def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self:
613
+ def validate_and_filter_scores(
614
+ self,
615
+ task: AbsTask | None = None,
616
+ ) -> TaskResult:
617
617
  """Validate and filter the scores against the task metadata.
618
618
 
619
619
  This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata.
@@ -633,21 +633,23 @@ class TaskResult(BaseModel):
633
633
  task = get_task(self.task_name)
634
634
 
635
635
  splits = task.eval_splits
636
- hf_subsets = task.hf_subsets
637
- hf_subsets = set(hf_subsets)
636
+ hf_subsets = set(task.hf_subsets) # Convert to set once
638
637
 
639
- new_scores = {}
638
+ new_scores: dict[str, list[Score]] = {}
640
639
  seen_splits = set()
641
640
  for split in self.scores:
642
641
  if split not in splits:
643
642
  continue
644
- new_scores[split] = []
645
643
  seen_subsets = set()
646
- for _scores in self.scores[split]:
647
- if _scores["hf_subset"] not in hf_subsets:
648
- continue
649
- new_scores[split].append(_scores)
644
+ # Use list comprehension for better performance
645
+ new_scores[split] = [
646
+ _scores
647
+ for _scores in self.scores[split]
648
+ if _scores["hf_subset"] in hf_subsets
649
+ ]
650
+ for _scores in new_scores[split]:
650
651
  seen_subsets.add(_scores["hf_subset"])
652
+
651
653
  if seen_subsets != hf_subsets:
652
654
  missing_subsets = hf_subsets - seen_subsets
653
655
  if len(missing_subsets) > 2:
@@ -656,17 +658,39 @@ class TaskResult(BaseModel):
656
658
  else:
657
659
  missing_subsets_str = str(missing_subsets)
658
660
 
659
- logger.warning(
660
- f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
661
- )
661
+ msg = f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
662
+ logger.warning(msg)
663
+ warnings.warn(msg)
664
+ for missing_subset in missing_subsets:
665
+ new_scores[split].append(
666
+ {
667
+ "hf_subset": missing_subset,
668
+ "main_score": np.nan,
669
+ "languages": task.metadata.hf_subsets_to_langscripts.get(
670
+ missing_subset, []
671
+ ),
672
+ }
673
+ )
662
674
  seen_splits.add(split)
663
675
  if seen_splits != set(splits):
664
- logger.warning(
665
- f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
666
- )
667
- new_res = {**self.to_dict(), "scores": new_scores}
668
- new_res = TaskResult.from_validated(**new_res)
669
- return new_res
676
+ msg = f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
677
+ logger.warning(msg)
678
+ warnings.warn(msg)
679
+ for missing_split in set(splits) - seen_splits:
680
+ new_scores[missing_split] = []
681
+ for missing_subset in hf_subsets:
682
+ new_scores[missing_split].append(
683
+ {
684
+ "hf_subset": missing_subset,
685
+ "main_score": np.nan,
686
+ "languages": task.metadata.hf_subsets_to_langscripts.get(
687
+ missing_subset, []
688
+ ),
689
+ }
690
+ )
691
+ data = self.model_dump()
692
+ data["scores"] = new_scores
693
+ return type(self).model_construct(**data)
670
694
 
671
695
  def is_mergeable(
672
696
  self,
@@ -698,27 +722,31 @@ class TaskResult(BaseModel):
698
722
  name = result.metadata.name
699
723
  revision = result.metadata.revision
700
724
  else:
725
+ msg = "result must be a TaskResult or AbsTask object"
726
+ if raise_error:
727
+ raise ValueError(msg)
728
+ logger.debug(msg)
701
729
  return False
702
730
 
703
731
  if self.task_name != name:
732
+ msg = f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
704
733
  if raise_error:
705
- raise ValueError(
706
- f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
707
- )
734
+ raise ValueError(msg)
735
+ logger.debug(msg)
708
736
  return False
709
737
 
710
738
  if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
739
+ msg = f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
711
740
  if raise_error:
712
- raise ValueError(
713
- f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} and {mteb_version})"
714
- )
741
+ raise ValueError(msg)
742
+ logger.debug(msg)
715
743
  return False
716
744
 
717
745
  if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
746
+ msg = f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
718
747
  if raise_error:
719
- raise ValueError(
720
- f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
721
- )
748
+ raise ValueError(msg)
749
+ logger.debug(msg)
722
750
  return False
723
751
 
724
752
  return True
@@ -730,7 +758,7 @@ class TaskResult(BaseModel):
730
758
  "mteb_version",
731
759
  "dataset_revision",
732
760
  ],
733
- ) -> Self:
761
+ ) -> TaskResult:
734
762
  """Merges two TaskResult objects.
735
763
 
736
764
  Args:
@@ -836,3 +864,15 @@ class TaskResult(BaseModel):
836
864
  )
837
865
  )
838
866
  return results
867
+
868
+
869
+ class TaskError(BaseModel):
870
+ """A class to represent an error that occurred during the evaluation of a task.
871
+
872
+ Attributes:
873
+ task_name: The name of the MTEB task.
874
+ exception: The error message that occurred during the evaluation.
875
+ """
876
+
877
+ task_name: str
878
+ exception: str