mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527):
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/evaluate.py CHANGED
@@ -1,23 +1,23 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
+ import warnings
4
5
  from collections.abc import Iterable
5
- from copy import deepcopy
6
6
  from pathlib import Path
7
7
  from time import time
8
- from typing import TYPE_CHECKING, Any, cast
8
+ from typing import TYPE_CHECKING, cast
9
9
 
10
+ from datasets.exceptions import DatasetNotFoundError
10
11
  from tqdm.auto import tqdm
11
12
 
12
13
  from mteb._helpful_enum import HelpfulStrEnum
13
14
  from mteb.abstasks import AbsTaskRetrieval
14
15
  from mteb.abstasks.abstask import AbsTask
15
16
  from mteb.abstasks.aggregated_task import AbsTaskAggregate
17
+ from mteb.benchmarks.benchmark import Benchmark
16
18
  from mteb.cache import ResultCache
17
19
  from mteb.models.model_meta import ModelMeta
18
20
  from mteb.models.models_protocols import (
19
- CrossEncoderProtocol,
20
- EncoderProtocol,
21
21
  MTEBModels,
22
22
  )
23
23
  from mteb.models.sentence_transformer_wrapper import (
@@ -25,7 +25,9 @@ from mteb.models.sentence_transformer_wrapper import (
25
25
  SentenceTransformerEncoderWrapper,
26
26
  )
27
27
  from mteb.results import ModelResult, TaskResult
28
+ from mteb.results.task_result import TaskError
28
29
  from mteb.types import HFSubset, PromptType, SplitName
30
+ from mteb.types._encoder_io import EncodeKwargs
29
31
  from mteb.types._metadata import ModelName, Revision
30
32
 
31
33
  if TYPE_CHECKING:
@@ -51,62 +53,31 @@ class OverwriteStrategy(HelpfulStrEnum):
51
53
  ONLY_CACHE = "only-cache"
52
54
 
53
55
 
54
- _empty_model_meta = ModelMeta(
55
- loader=None,
56
- name=None,
57
- revision=None,
58
- release_date=None,
59
- languages=None,
60
- framework=[],
61
- similarity_fn_name=None,
62
- n_parameters=None,
63
- memory_usage_mb=None,
64
- max_tokens=None,
65
- embed_dim=None,
66
- license=None,
67
- open_weights=None,
68
- public_training_code=None,
69
- public_training_data=None,
70
- use_instructions=None,
71
- training_datasets=None,
72
- modalities=[],
73
- )
74
-
75
-
76
- def _create_empty_model_meta() -> ModelMeta:
77
- logger.warning("Model metadata is missing. Using empty metadata.")
78
- meta = deepcopy(_empty_model_meta)
79
- meta.revision = "no_revision_available"
80
- meta.name = "no_model_name_available"
81
- return meta
82
-
83
-
84
56
  def _sanitize_model(
85
57
  model: ModelMeta | MTEBModels | SentenceTransformer | CrossEncoder,
86
58
  ) -> tuple[MTEBModels | ModelMeta, ModelMeta, ModelName, Revision]:
87
59
  from sentence_transformers import CrossEncoder, SentenceTransformer
88
60
 
61
+ wrapped_model: MTEBModels | ModelMeta
89
62
  if isinstance(model, SentenceTransformer):
90
- _mdl = SentenceTransformerEncoderWrapper(model)
91
- meta = _mdl.mteb_model_meta
92
- _mdl = cast(EncoderProtocol, _mdl)
93
- model = _mdl
63
+ wrapped_model = SentenceTransformerEncoderWrapper(model)
64
+ meta = wrapped_model.mteb_model_meta
94
65
  elif isinstance(model, CrossEncoder):
95
- _mdl = CrossEncoderWrapper(model)
96
- _mdl = cast(CrossEncoderProtocol, _mdl)
97
- meta = _mdl.mteb_model_meta
98
- model = _mdl
66
+ wrapped_model = CrossEncoderWrapper(model)
67
+ meta = wrapped_model.mteb_model_meta
99
68
  elif hasattr(model, "mteb_model_meta"):
100
- meta = model.mteb_model_meta # type: ignore[attr-defined]
69
+ meta = getattr(model, "mteb_model_meta")
101
70
  if not isinstance(meta, ModelMeta):
102
- meta = _create_empty_model_meta()
71
+ meta = ModelMeta._from_hub(None)
72
+ wrapped_model = cast(MTEBModels | ModelMeta, model)
103
73
  else:
104
- meta = _create_empty_model_meta() if not isinstance(model, ModelMeta) else model
74
+ meta = ModelMeta._from_hub(None) if not isinstance(model, ModelMeta) else model
75
+ wrapped_model = meta
105
76
 
106
77
  model_name = cast(str, meta.name)
107
78
  model_revision = cast(str, meta.revision)
108
79
 
109
- return model, meta, model_name, model_revision
80
+ return wrapped_model, meta, model_name, model_revision
110
81
 
111
82
 
112
83
  def _evaluate_task(
@@ -115,9 +86,11 @@ def _evaluate_task(
115
86
  *,
116
87
  splits: dict[SplitName, list[HFSubset]],
117
88
  co2_tracker: bool | None,
118
- encode_kwargs: dict[str, Any],
89
+ encode_kwargs: EncodeKwargs,
119
90
  prediction_folder: Path | None,
120
- ) -> TaskResult:
91
+ public_only: bool | None,
92
+ num_proc: int = 1,
93
+ ) -> TaskResult | TaskError:
121
94
  """The core logic to run a model on a given task. See `evaluate` for more details.
122
95
 
123
96
  Returns:
@@ -149,8 +122,10 @@ def _evaluate_task(
149
122
  encode_kwargs=encode_kwargs,
150
123
  co2_tracker=False,
151
124
  prediction_folder=prediction_folder,
125
+ public_only=public_only,
152
126
  )
153
- result.kg_co2_emissions = tracker.final_emissions
127
+ if isinstance(result, TaskResult):
128
+ result.kg_co2_emissions = tracker.final_emissions
154
129
  return result
155
130
 
156
131
  task_results = {}
@@ -159,9 +134,24 @@ def _evaluate_task(
159
134
 
160
135
  data_loaded = task.data_loaded
161
136
  if not data_loaded:
162
- task.load_data()
137
+ try:
138
+ task.load_data()
139
+ except DatasetNotFoundError as e:
140
+ if not task.metadata.is_public and public_only is None:
141
+ msg = (
142
+ f"Dataset for private task '{task.metadata.name}' not found. "
143
+ "Make sure you have access to the dataset and that you have set up the authentication correctly. To disable this warning set `public_only=False`"
144
+ )
145
+ logger.warning(msg)
146
+ warnings.warn(msg)
147
+ return TaskError(
148
+ task_name=task.metadata.name,
149
+ exception=str(e),
150
+ )
151
+ if public_only is False:
152
+ raise e
163
153
 
164
- evaluation_time = 0
154
+ evaluation_time = 0.0
165
155
 
166
156
  for split, hf_subsets in splits.items():
167
157
  tick = time()
@@ -208,12 +198,18 @@ def _check_model_modalities(
208
198
  return
209
199
 
210
200
  model_modalities = set(model.modalities)
201
+ check_tasks: Iterable[AbsTask] = []
211
202
  if isinstance(tasks, AbsTask):
212
- tasks = [tasks]
203
+ check_tasks = [tasks]
204
+ elif isinstance(tasks, Benchmark):
205
+ benchmark = cast(Benchmark, tasks)
206
+ check_tasks = benchmark.tasks
207
+ else:
208
+ check_tasks = cast(Iterable[AbsTask], tasks)
213
209
 
214
210
  warnings, errors = [], []
215
211
 
216
- for task in tasks:
212
+ for task in check_tasks:
217
213
  # only retrieval tasks have different modalities for query and document and can be run with partial overlaps
218
214
  if isinstance(task, AbsTaskRetrieval):
219
215
  query_mods = set(task.metadata.get_modalities(PromptType.query))
@@ -256,17 +252,32 @@ def _check_model_modalities(
256
252
  logger.warning(msg)
257
253
 
258
254
 
255
+ def _requires_merge(task: AbsTask, existing_results: TaskResult) -> bool:
256
+ """Check if the existing results require merging with new results."""
257
+ # If the task has multiple eval splits and existing results cover only a subset, we need to merge
258
+ required_evals = dict.fromkeys(task.eval_splits, task.hf_subsets)
259
+ for split, subsets in required_evals.items():
260
+ res = existing_results.scores.get(split, None)
261
+ if res is None:
262
+ return True
263
+ hf_subsets = [r["hf_subset"] for r in res]
264
+ if not set(subsets).issubset(set(hf_subsets)):
265
+ return True
266
+ return False
267
+
268
+
259
269
  def evaluate(
260
270
  model: ModelMeta | MTEBModels | SentenceTransformer | CrossEncoder,
261
271
  tasks: AbsTask | Iterable[AbsTask],
262
272
  *,
263
273
  co2_tracker: bool | None = None,
264
274
  raise_error: bool = True,
265
- encode_kwargs: dict[str, Any] | None = None,
275
+ encode_kwargs: EncodeKwargs | None = None,
266
276
  cache: ResultCache | None = ResultCache(),
267
277
  overwrite_strategy: str | OverwriteStrategy = "only-missing",
268
278
  prediction_folder: Path | str | None = None,
269
279
  show_progress_bar: bool = True,
280
+ public_only: bool | None = None,
270
281
  ) -> ModelResult:
271
282
  """This function runs a model on a given task and returns the results.
272
283
 
@@ -290,6 +301,7 @@ def evaluate(
290
301
  prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be saved in `prediction_folder/{task_name}_predictions.json`
291
302
  show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
292
303
  `encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
304
+ public_only: Run only public tasks. If None, it will attempt to run the private task.
293
305
 
294
306
  Returns:
295
307
  The results of the evaluation.
@@ -330,10 +342,10 @@ def evaluate(
330
342
 
331
343
  # AbsTaskAggregate is a special case where we have to run multiple tasks and combine the results
332
344
  if isinstance(tasks, AbsTaskAggregate):
333
- task = cast(AbsTaskAggregate, tasks)
345
+ aggregated_task = cast(AbsTaskAggregate, tasks)
334
346
  results = evaluate(
335
347
  model,
336
- task.metadata.tasks,
348
+ aggregated_task.metadata.tasks,
337
349
  co2_tracker=co2_tracker,
338
350
  raise_error=raise_error,
339
351
  encode_kwargs=encode_kwargs,
@@ -341,18 +353,21 @@ def evaluate(
341
353
  overwrite_strategy=overwrite_strategy,
342
354
  prediction_folder=prediction_folder,
343
355
  show_progress_bar=show_progress_bar,
356
+ public_only=public_only,
344
357
  )
345
- result = task.combine_task_results(results.task_results)
358
+ combined_results = aggregated_task.combine_task_results(results.task_results)
346
359
  return ModelResult(
347
360
  model_name=results.model_name,
348
361
  model_revision=results.model_revision,
349
- task_results=[result],
362
+ task_results=[combined_results],
350
363
  )
351
364
 
352
365
  if isinstance(tasks, AbsTask):
353
366
  task = tasks
354
367
  else:
355
- results = []
368
+ tasks = cast(Iterable[AbsTask], tasks)
369
+ evaluate_results = []
370
+ exceptions = []
356
371
  tasks_tqdm = tqdm(
357
372
  tasks,
358
373
  desc="Evaluating tasks",
@@ -370,31 +385,40 @@ def evaluate(
370
385
  overwrite_strategy=overwrite_strategy,
371
386
  prediction_folder=prediction_folder,
372
387
  show_progress_bar=False,
388
+ public_only=public_only,
373
389
  )
374
- results.extend(_res.task_results)
390
+ evaluate_results.extend(_res.task_results)
391
+ if _res.exceptions:
392
+ exceptions.extend(_res.exceptions)
375
393
  return ModelResult(
376
394
  model_name=_res.model_name,
377
395
  model_revision=_res.model_revision,
378
- task_results=results,
396
+ task_results=evaluate_results,
397
+ exceptions=exceptions,
379
398
  )
380
399
 
381
400
  overwrite_strategy = OverwriteStrategy.from_str(overwrite_strategy)
382
401
 
383
- existing_results = None
402
+ existing_results: TaskResult | None = None
384
403
  if cache and overwrite_strategy != OverwriteStrategy.ALWAYS:
385
- results = cache.load_task_result(task.metadata.name, meta)
386
- if results:
387
- existing_results = results
404
+ cache_results = cache.load_task_result(task.metadata.name, meta)
405
+ if cache_results:
406
+ existing_results = cache_results
388
407
 
389
408
  if (
390
409
  existing_results
391
- and overwrite_strategy == "only-missing"
392
- and overwrite_strategy == OverwriteStrategy.ONLY_MISSING
393
- and existing_results.is_mergeable(task)
410
+ and overwrite_strategy
411
+ not in (OverwriteStrategy.ALWAYS, OverwriteStrategy.NEVER)
412
+ and (
413
+ not _requires_merge(task, existing_results)
414
+ or existing_results.is_mergeable(task)
415
+ )
394
416
  ):
395
417
  missing_eval = existing_results.get_missing_evaluations(task)
396
418
  else:
397
419
  missing_eval = dict.fromkeys(task.eval_splits, task.hf_subsets)
420
+ # Will be fully recomputed so we set it to None to avoid merging:
421
+ existing_results = None
398
422
 
399
423
  if (
400
424
  existing_results
@@ -415,12 +439,13 @@ def evaluate(
415
439
  OverwriteStrategy.ONLY_CACHE,
416
440
  ]:
417
441
  raise ValueError(
418
- f"overwrite_strategy is set to '{overwrite_strategy.value}' and the results file exists. However there are the following missing splits (and subsets): {missing_eval}. To rerun these set overwrite_strategy to 'only-missing'."
442
+ f"overwrite_strategy is set to '{overwrite_strategy.value}' and the results file exists for task {task.metadata.name}. "
443
+ + f"However there are the following missing splits (and subsets): {missing_eval}. To rerun these set overwrite_strategy to 'only-missing'."
419
444
  )
420
445
 
421
446
  if existing_results:
422
447
  logger.info(
423
- f"Found existing results for {task.metadata.name}, only running missing splits: {list(missing_eval.keys())}"
448
+ f"Found existing results for {task.metadata.name}, only running missing splits (subsets): {missing_eval}"
424
449
  )
425
450
 
426
451
  if isinstance(model, ModelMeta):
@@ -439,16 +464,13 @@ def evaluate(
439
464
  co2_tracker=co2_tracker,
440
465
  encode_kwargs=encode_kwargs,
441
466
  prediction_folder=prediction_folder,
467
+ public_only=public_only,
442
468
  )
443
469
  except Exception as e:
444
470
  logger.error(
445
471
  f"Error while running task {task.metadata.name} on splits {list(missing_eval.keys())}: {e}"
446
472
  )
447
- return ModelResult(
448
- model_name=model_name,
449
- model_revision=model_revision,
450
- task_results=[],
451
- )
473
+ result = TaskError(task_name=task.metadata.name, exception=str(e))
452
474
  else:
453
475
  result = _evaluate_task(
454
476
  model=model,
@@ -457,9 +479,18 @@ def evaluate(
457
479
  co2_tracker=False,
458
480
  encode_kwargs=encode_kwargs,
459
481
  prediction_folder=prediction_folder,
482
+ public_only=public_only,
460
483
  )
461
484
  logger.info(f"✓ Finished evaluation for {task.metadata.name}")
462
485
 
486
+ if isinstance(result, TaskError):
487
+ return ModelResult(
488
+ model_name=model_name,
489
+ model_revision=model_revision,
490
+ task_results=[],
491
+ exceptions=[result],
492
+ )
493
+
463
494
  if existing_results:
464
495
  result = result.merge(existing_results)
465
496
 
mteb/filter_tasks.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """This script contains functions that are used to get an overview of the MTEB benchmark."""
2
2
 
3
3
  import logging
4
- from collections.abc import Sequence
4
+ from collections.abc import Iterable, Sequence
5
5
  from typing import overload
6
6
 
7
7
  from mteb.abstasks import (
@@ -34,14 +34,14 @@ def _check_is_valid_language(lang: str) -> None:
34
34
 
35
35
  @overload
36
36
  def filter_tasks(
37
- tasks: Sequence[AbsTask],
37
+ tasks: Iterable[AbsTask],
38
38
  *,
39
- languages: list[str] | None = None,
40
- script: list[str] | None = None,
41
- domains: list[TaskDomain] | None = None,
42
- task_types: list[TaskType] | None = None, # type: ignore
43
- categories: list[TaskCategory] | None = None,
44
- modalities: list[Modalities] | None = None,
39
+ languages: Sequence[str] | None = None,
40
+ script: Sequence[str] | None = None,
41
+ domains: Iterable[TaskDomain] | None = None,
42
+ task_types: Iterable[TaskType] | None = None,
43
+ categories: Iterable[TaskCategory] | None = None,
44
+ modalities: Iterable[Modalities] | None = None,
45
45
  exclusive_modality_filter: bool = False,
46
46
  exclude_superseded: bool = False,
47
47
  exclude_aggregate: bool = False,
@@ -51,14 +51,14 @@ def filter_tasks(
51
51
 
52
52
  @overload
53
53
  def filter_tasks(
54
- tasks: Sequence[type[AbsTask]],
54
+ tasks: Iterable[type[AbsTask]],
55
55
  *,
56
- languages: list[str] | None = None,
57
- script: list[str] | None = None,
58
- domains: list[TaskDomain] | None = None,
59
- task_types: list[TaskType] | None = None, # type: ignore
60
- categories: list[TaskCategory] | None = None,
61
- modalities: list[Modalities] | None = None,
56
+ languages: Sequence[str] | None = None,
57
+ script: Sequence[str] | None = None,
58
+ domains: Iterable[TaskDomain] | None = None,
59
+ task_types: Iterable[TaskType] | None = None,
60
+ categories: Iterable[TaskCategory] | None = None,
61
+ modalities: Iterable[Modalities] | None = None,
62
62
  exclusive_modality_filter: bool = False,
63
63
  exclude_superseded: bool = False,
64
64
  exclude_aggregate: bool = False,
@@ -67,14 +67,14 @@ def filter_tasks(
67
67
 
68
68
 
69
69
  def filter_tasks(
70
- tasks: Sequence[AbsTask] | Sequence[type[AbsTask]],
70
+ tasks: Iterable[AbsTask] | Iterable[type[AbsTask]],
71
71
  *,
72
- languages: list[str] | None = None,
73
- script: list[str] | None = None,
74
- domains: list[TaskDomain] | None = None,
75
- task_types: list[TaskType] | None = None, # type: ignore
76
- categories: list[TaskCategory] | None = None,
77
- modalities: list[Modalities] | None = None,
72
+ languages: Sequence[str] | None = None,
73
+ script: Sequence[str] | None = None,
74
+ domains: Iterable[TaskDomain] | None = None,
75
+ task_types: Iterable[TaskType] | None = None,
76
+ categories: Iterable[TaskCategory] | None = None,
77
+ modalities: Iterable[Modalities] | None = None,
78
78
  exclusive_modality_filter: bool = False,
79
79
  exclude_superseded: bool = False,
80
80
  exclude_aggregate: bool = False,
@@ -92,7 +92,6 @@ def filter_tasks(
92
92
  task_types: A string specifying the type of task e.g. "Classification" or "Retrieval". If None, all tasks are included.
93
93
  categories: A list of task categories these include "t2t" (text to text), "t2i" (text to image). See TaskMetadata for the full list.
94
94
  exclude_superseded: A boolean flag to exclude datasets which are superseded by another.
95
- eval_splits: A list of evaluation splits to include. If None, all splits are included.
96
95
  modalities: A list of modalities to include. If None, all modalities are included.
97
96
  exclusive_modality_filter: If True, only keep tasks where _all_ filter modalities are included in the
98
97
  task's modalities and ALL task modalities are in filter modalities (exact match).
@@ -113,12 +112,12 @@ def filter_tasks(
113
112
  """
114
113
  langs_to_keep = None
115
114
  if languages:
116
- [_check_is_valid_language(lang) for lang in languages]
115
+ [_check_is_valid_language(lang) for lang in languages] # type: ignore[func-returns-value]
117
116
  langs_to_keep = set(languages)
118
117
 
119
118
  script_to_keep = None
120
119
  if script:
121
- [_check_is_valid_script(s) for s in script]
120
+ [_check_is_valid_script(s) for s in script] # type: ignore[func-returns-value]
122
121
  script_to_keep = set(script)
123
122
 
124
123
  domains_to_keep = None
@@ -178,4 +177,4 @@ def filter_tasks(
178
177
 
179
178
  _tasks.append(t)
180
179
 
181
- return _tasks
180
+ return _tasks # type: ignore[return-value] # type checker cannot infer the overload return type
mteb/get_tasks.py CHANGED
@@ -2,8 +2,9 @@
2
2
 
3
3
  import difflib
4
4
  import logging
5
+ import warnings
5
6
  from collections import Counter, defaultdict
6
- from collections.abc import Sequence
7
+ from collections.abc import Iterable, Sequence
7
8
  from typing import Any
8
9
 
9
10
  import pandas as pd
@@ -22,12 +23,11 @@ logger = logging.getLogger(__name__)
22
23
  def _gather_tasks() -> tuple[type[AbsTask], ...]:
23
24
  import mteb.tasks as tasks
24
25
 
25
- tasks = [
26
+ return tuple(
26
27
  t
27
28
  for t in tasks.__dict__.values()
28
29
  if isinstance(t, type) and issubclass(t, AbsTask)
29
- ]
30
- return tuple(tasks)
30
+ )
31
31
 
32
32
 
33
33
  def _create_name_to_task_mapping(
@@ -43,7 +43,7 @@ def _create_name_to_task_mapping(
43
43
  return metadata_names
44
44
 
45
45
 
46
- def _create_similar_tasks(tasks: Sequence[type[AbsTask]]) -> dict[str, list[str]]:
46
+ def _create_similar_tasks(tasks: Iterable[type[AbsTask]]) -> dict[str, list[str]]:
47
47
  """Create a dictionary of similar tasks.
48
48
 
49
49
  Returns:
@@ -194,9 +194,8 @@ class MTEBTasks(tuple[AbsTask]):
194
194
  string with a LaTeX table.
195
195
  """
196
196
  if include_citation_in_name and "name" in properties:
197
- properties += ["intext_citation"]
198
- df = self.to_dataframe(properties)
199
- df["name"] = df["name"] + " " + df["intext_citation"]
197
+ df = self.to_dataframe(tuple(properties) + ("intext_citation",))
198
+ df["name"] = df["name"] + " " + df["intext_citation"] # type: ignore[operator]
200
199
  df = df.drop(columns=["intext_citation"])
201
200
  else:
202
201
  df = self.to_dataframe(properties)
@@ -221,17 +220,17 @@ class MTEBTasks(tuple[AbsTask]):
221
220
 
222
221
 
223
222
  def get_tasks(
224
- tasks: list[str] | None = None,
223
+ tasks: Sequence[str] | None = None,
225
224
  *,
226
- languages: list[str] | None = None,
227
- script: list[str] | None = None,
228
- domains: list[TaskDomain] | None = None,
229
- task_types: list[TaskType] | None = None, # type: ignore
230
- categories: list[TaskCategory] | None = None,
225
+ languages: Sequence[str] | None = None,
226
+ script: Sequence[str] | None = None,
227
+ domains: Sequence[TaskDomain] | None = None,
228
+ task_types: Sequence[TaskType] | None = None,
229
+ categories: Sequence[TaskCategory] | None = None,
231
230
  exclude_superseded: bool = True,
232
- eval_splits: list[str] | None = None,
231
+ eval_splits: Sequence[str] | None = None,
233
232
  exclusive_language_filter: bool = False,
234
- modalities: list[Modalities] | None = None,
233
+ modalities: Sequence[Modalities] | None = None,
235
234
  exclusive_modality_filter: bool = False,
236
235
  exclude_aggregate: bool = False,
237
236
  exclude_private: bool = True,
@@ -287,7 +286,7 @@ def get_tasks(
287
286
  ]
288
287
  return MTEBTasks(_tasks)
289
288
 
290
- _tasks = filter_tasks(
289
+ tasks_: Sequence[type[AbsTask]] = filter_tasks(
291
290
  TASK_LIST,
292
291
  languages=languages,
293
292
  script=script,
@@ -300,12 +299,12 @@ def get_tasks(
300
299
  exclude_aggregate=exclude_aggregate,
301
300
  exclude_private=exclude_private,
302
301
  )
303
- _tasks = [
304
- cls().filter_languages(languages, script).filter_eval_splits(eval_splits)
305
- for cls in _tasks
306
- ]
307
-
308
- return MTEBTasks(_tasks)
302
+ return MTEBTasks(
303
+ [
304
+ cls().filter_languages(languages, script).filter_eval_splits(eval_splits)
305
+ for cls in tasks_
306
+ ]
307
+ )
309
308
 
310
309
 
311
310
  _TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"}
@@ -313,10 +312,10 @@ _TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"}
313
312
 
314
313
  def get_task(
315
314
  task_name: str,
316
- languages: list[str] | None = None,
317
- script: list[str] | None = None,
318
- eval_splits: list[str] | None = None,
319
- hf_subsets: list[str] | None = None,
315
+ languages: Sequence[str] | None = None,
316
+ script: Sequence[str] | None = None,
317
+ eval_splits: Sequence[str] | None = None,
318
+ hf_subsets: Sequence[str] | None = None,
320
319
  exclusive_language_filter: bool = False,
321
320
  ) -> AbsTask:
322
321
  """Get a task by name.
@@ -340,9 +339,9 @@ def get_task(
340
339
  """
341
340
  if task_name in _TASK_RENAMES:
342
341
  _task_name = _TASK_RENAMES[task_name]
343
- logger.warning(
344
- f"The task with the given name '{task_name}' has been renamed to '{_task_name}'. To prevent this warning use the new name."
345
- )
342
+ msg = f"The task with the given name '{task_name}' has been renamed to '{_task_name}'. To prevent this warning use the new name."
343
+ logger.warning(msg)
344
+ warnings.warn(msg)
346
345
 
347
346
  if task_name not in _TASKS_REGISTRY:
348
347
  close_matches = difflib.get_close_matches(task_name, _TASKS_REGISTRY.keys())
@@ -1,9 +1,9 @@
1
- from collections.abc import Iterable
1
+ from collections.abc import Iterable, Sequence
2
2
  from dataclasses import dataclass
3
3
 
4
4
  from typing_extensions import Self
5
5
 
6
- from mteb.languages import check_language_code
6
+ from mteb.languages.check_language_code import check_language_code
7
7
 
8
8
 
9
9
  @dataclass
@@ -25,7 +25,9 @@ class LanguageScripts:
25
25
 
26
26
  @classmethod
27
27
  def from_languages_and_scripts(
28
- cls, languages: list[str] | None = None, scripts: list[str] | None = None
28
+ cls,
29
+ languages: Sequence[str] | None = None,
30
+ scripts: Sequence[str] | None = None,
29
31
  ) -> Self:
30
32
  """Create a LanguageScripts object from lists of languages and scripts.
31
33