mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (529)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +78 -30
  3. mteb/_evaluators/any_sts_evaluator.py +13 -6
  4. mteb/_evaluators/clustering_evaluator.py +13 -5
  5. mteb/_evaluators/evaluator.py +12 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
  7. mteb/_evaluators/pair_classification_evaluator.py +17 -7
  8. mteb/_evaluators/retrieval_evaluator.py +23 -14
  9. mteb/_evaluators/retrieval_metrics.py +26 -19
  10. mteb/_evaluators/sklearn_evaluator.py +27 -17
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
  12. mteb/_evaluators/text/summarization_evaluator.py +31 -20
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +9 -3
  16. mteb/abstasks/_data_filter/task_pipelines.py +10 -2
  17. mteb/abstasks/_statistics_calculation.py +21 -11
  18. mteb/abstasks/_stratification.py +18 -18
  19. mteb/abstasks/abstask.py +78 -44
  20. mteb/abstasks/aggregate_task_metadata.py +21 -18
  21. mteb/abstasks/aggregated_task.py +23 -35
  22. mteb/abstasks/classification.py +39 -18
  23. mteb/abstasks/clustering.py +37 -20
  24. mteb/abstasks/clustering_legacy.py +30 -16
  25. mteb/abstasks/image/image_text_pair_classification.py +26 -9
  26. mteb/abstasks/multilabel_classification.py +33 -21
  27. mteb/abstasks/pair_classification.py +44 -19
  28. mteb/abstasks/regression.py +18 -10
  29. mteb/abstasks/retrieval.py +82 -52
  30. mteb/abstasks/retrieval_dataset_loaders.py +50 -39
  31. mteb/abstasks/sts.py +34 -15
  32. mteb/abstasks/task_metadata.py +44 -37
  33. mteb/abstasks/text/bitext_mining.py +57 -35
  34. mteb/abstasks/text/reranking.py +10 -8
  35. mteb/abstasks/text/summarization.py +26 -10
  36. mteb/abstasks/zeroshot_classification.py +27 -9
  37. mteb/benchmarks/_create_table.py +13 -7
  38. mteb/benchmarks/benchmark.py +15 -3
  39. mteb/benchmarks/benchmarks/__init__.py +6 -0
  40. mteb/benchmarks/benchmarks/benchmarks.py +153 -13
  41. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  42. mteb/benchmarks/get_benchmark.py +14 -55
  43. mteb/cache.py +189 -31
  44. mteb/cli/_display_tasks.py +10 -4
  45. mteb/cli/build_cli.py +112 -13
  46. mteb/cli/generate_model_card.py +50 -23
  47. mteb/deprecated_evaluator.py +72 -54
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  49. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  50. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  52. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  65. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  66. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  67. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  68. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  69. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  70. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  71. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  72. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  73. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  74. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  75. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  81. mteb/evaluate.py +71 -47
  82. mteb/filter_tasks.py +36 -32
  83. mteb/get_tasks.py +37 -33
  84. mteb/languages/language_scripts.py +11 -4
  85. mteb/leaderboard/app.py +172 -37
  86. mteb/leaderboard/table.py +7 -2
  87. mteb/load_results.py +20 -14
  88. mteb/models/abs_encoder.py +30 -16
  89. mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
  90. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
  91. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
  92. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  93. mteb/models/cache_wrappers/cache_wrapper.py +16 -11
  94. mteb/models/get_model_meta.py +53 -9
  95. mteb/models/instruct_wrapper.py +41 -13
  96. mteb/models/model_implementations/align_models.py +11 -5
  97. mteb/models/model_implementations/amazon_models.py +1 -0
  98. mteb/models/model_implementations/andersborges.py +6 -4
  99. mteb/models/model_implementations/ara_models.py +2 -1
  100. mteb/models/model_implementations/arctic_models.py +16 -8
  101. mteb/models/model_implementations/b1ade_models.py +2 -1
  102. mteb/models/model_implementations/bedrock_models.py +20 -6
  103. mteb/models/model_implementations/bge_models.py +85 -22
  104. mteb/models/model_implementations/bica_model.py +4 -3
  105. mteb/models/model_implementations/blip2_models.py +13 -6
  106. mteb/models/model_implementations/blip_models.py +33 -20
  107. mteb/models/model_implementations/bm25.py +27 -17
  108. mteb/models/model_implementations/bmretriever_models.py +16 -6
  109. mteb/models/model_implementations/cadet_models.py +2 -1
  110. mteb/models/model_implementations/cde_models.py +22 -9
  111. mteb/models/model_implementations/clip_models.py +18 -10
  112. mteb/models/model_implementations/clips_models.py +6 -3
  113. mteb/models/model_implementations/codefuse_models.py +10 -5
  114. mteb/models/model_implementations/codesage_models.py +6 -3
  115. mteb/models/model_implementations/cohere_models.py +19 -9
  116. mteb/models/model_implementations/cohere_v.py +16 -6
  117. mteb/models/model_implementations/colpali_models.py +10 -6
  118. mteb/models/model_implementations/colqwen_models.py +24 -38
  119. mteb/models/model_implementations/colsmol_models.py +5 -3
  120. mteb/models/model_implementations/conan_models.py +12 -5
  121. mteb/models/model_implementations/dino_models.py +70 -46
  122. mteb/models/model_implementations/e5_instruct.py +27 -4
  123. mteb/models/model_implementations/e5_models.py +18 -9
  124. mteb/models/model_implementations/e5_v.py +16 -10
  125. mteb/models/model_implementations/eagerworks_models.py +12 -5
  126. mteb/models/model_implementations/emillykkejensen_models.py +9 -6
  127. mteb/models/model_implementations/en_code_retriever.py +2 -1
  128. mteb/models/model_implementations/euler_models.py +3 -2
  129. mteb/models/model_implementations/evaclip_models.py +13 -4
  130. mteb/models/model_implementations/fa_models.py +18 -9
  131. mteb/models/model_implementations/facebookai.py +16 -2
  132. mteb/models/model_implementations/geogpt_models.py +2 -1
  133. mteb/models/model_implementations/gme_v_models.py +13 -8
  134. mteb/models/model_implementations/google_models.py +16 -5
  135. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
  136. mteb/models/model_implementations/gritlm_models.py +5 -2
  137. mteb/models/model_implementations/gte_models.py +34 -13
  138. mteb/models/model_implementations/hinvec_models.py +7 -2
  139. mteb/models/model_implementations/human.py +1 -0
  140. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  141. mteb/models/model_implementations/inf_models.py +4 -2
  142. mteb/models/model_implementations/jasper_models.py +16 -7
  143. mteb/models/model_implementations/jina_clip.py +58 -14
  144. mteb/models/model_implementations/jina_models.py +35 -16
  145. mteb/models/model_implementations/kalm_models.py +24 -12
  146. mteb/models/model_implementations/kblab.py +13 -6
  147. mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
  148. mteb/models/model_implementations/kfst.py +2 -1
  149. mteb/models/model_implementations/kowshik24_models.py +2 -1
  150. mteb/models/model_implementations/lens_models.py +2 -0
  151. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  152. mteb/models/model_implementations/linq_models.py +8 -2
  153. mteb/models/model_implementations/listconranker.py +11 -5
  154. mteb/models/model_implementations/llm2clip_models.py +18 -10
  155. mteb/models/model_implementations/llm2vec_models.py +28 -14
  156. mteb/models/model_implementations/mcinext_models.py +12 -3
  157. mteb/models/model_implementations/mdbr_models.py +19 -3
  158. mteb/models/model_implementations/misc_models.py +131 -68
  159. mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
  160. mteb/models/model_implementations/mme5_models.py +3 -2
  161. mteb/models/model_implementations/moco_models.py +15 -8
  162. mteb/models/model_implementations/mod_models.py +3 -2
  163. mteb/models/model_implementations/model2vec_models.py +37 -18
  164. mteb/models/model_implementations/moka_models.py +4 -1
  165. mteb/models/model_implementations/nbailab.py +6 -3
  166. mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
  167. mteb/models/model_implementations/nomic_models.py +47 -19
  168. mteb/models/model_implementations/nomic_models_vision.py +6 -4
  169. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
  170. mteb/models/model_implementations/nvidia_models.py +165 -22
  171. mteb/models/model_implementations/octen_models.py +64 -3
  172. mteb/models/model_implementations/openai_models.py +14 -4
  173. mteb/models/model_implementations/openclip_models.py +30 -17
  174. mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
  175. mteb/models/model_implementations/ops_moa_models.py +10 -3
  176. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
  177. mteb/models/model_implementations/pawan_models.py +2 -1
  178. mteb/models/model_implementations/piccolo_models.py +3 -1
  179. mteb/models/model_implementations/pixie_models.py +56 -0
  180. mteb/models/model_implementations/promptriever_models.py +20 -10
  181. mteb/models/model_implementations/pylate_models.py +41 -21
  182. mteb/models/model_implementations/qodo_models.py +4 -2
  183. mteb/models/model_implementations/qtack_models.py +2 -1
  184. mteb/models/model_implementations/qwen3_models.py +14 -4
  185. mteb/models/model_implementations/qzhou_models.py +4 -2
  186. mteb/models/model_implementations/random_baseline.py +7 -6
  187. mteb/models/model_implementations/rasgaard_models.py +3 -2
  188. mteb/models/model_implementations/reasonir_model.py +66 -1
  189. mteb/models/model_implementations/repllama_models.py +18 -9
  190. mteb/models/model_implementations/rerankers_custom.py +25 -10
  191. mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
  192. mteb/models/model_implementations/richinfoai_models.py +2 -1
  193. mteb/models/model_implementations/ru_sentence_models.py +40 -20
  194. mteb/models/model_implementations/ruri_models.py +20 -10
  195. mteb/models/model_implementations/salesforce_models.py +13 -4
  196. mteb/models/model_implementations/samilpwc_models.py +2 -1
  197. mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
  198. mteb/models/model_implementations/searchmap_models.py +2 -1
  199. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  200. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
  201. mteb/models/model_implementations/seed_models.py +2 -1
  202. mteb/models/model_implementations/sentence_transformers_models.py +142 -22
  203. mteb/models/model_implementations/shuu_model.py +2 -1
  204. mteb/models/model_implementations/siglip_models.py +39 -24
  205. mteb/models/model_implementations/slm_models.py +419 -0
  206. mteb/models/model_implementations/sonar_models.py +2 -1
  207. mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
  208. mteb/models/model_implementations/stella_models.py +23 -4
  209. mteb/models/model_implementations/tarka_models.py +4 -2
  210. mteb/models/model_implementations/text2vec_models.py +12 -3
  211. mteb/models/model_implementations/ua_sentence_models.py +2 -1
  212. mteb/models/model_implementations/uae_models.py +17 -5
  213. mteb/models/model_implementations/vdr_models.py +9 -2
  214. mteb/models/model_implementations/vi_vn_models.py +12 -6
  215. mteb/models/model_implementations/vista_models.py +11 -4
  216. mteb/models/model_implementations/vlm2vec_models.py +14 -7
  217. mteb/models/model_implementations/voyage_models.py +136 -4
  218. mteb/models/model_implementations/voyage_v.py +17 -10
  219. mteb/models/model_implementations/xyz_models.py +1 -0
  220. mteb/models/model_implementations/youtu_models.py +2 -1
  221. mteb/models/model_implementations/yuan_models.py +2 -1
  222. mteb/models/model_implementations/yuan_models_en.py +3 -2
  223. mteb/models/model_meta.py +127 -40
  224. mteb/models/models_protocols.py +43 -22
  225. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  226. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
  227. mteb/models/search_wrappers.py +63 -29
  228. mteb/models/sentence_transformer_wrapper.py +52 -26
  229. mteb/models/vllm_wrapper.py +329 -0
  230. mteb/py.typed +0 -0
  231. mteb/results/benchmark_results.py +48 -35
  232. mteb/results/model_result.py +68 -32
  233. mteb/results/task_result.py +110 -72
  234. mteb/similarity_functions.py +19 -9
  235. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  236. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  237. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  238. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  239. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  240. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  241. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  242. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  243. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  244. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  245. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  246. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  247. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  248. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  249. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  251. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  252. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  253. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  256. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  257. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  258. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  259. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  260. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  261. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  262. mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
  263. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  264. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  265. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  266. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  267. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  268. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  269. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  270. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  271. mteb/tasks/classification/est/estonian_valence.py +2 -2
  272. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  273. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  274. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  275. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  276. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  277. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  278. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  279. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  280. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  281. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  282. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  283. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  284. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  285. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  286. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  287. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  288. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  289. mteb/tasks/classification/kor/klue_tc.py +2 -2
  290. mteb/tasks/classification/kor/kor_fin.py +1 -1
  291. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  292. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  293. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  294. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  295. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  296. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  297. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  298. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  299. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  300. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  301. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  302. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  303. mteb/tasks/classification/multilingual/scala_classification.py +2 -2
  304. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  305. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  306. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  307. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  308. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  309. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  310. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  311. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  312. mteb/tasks/classification/ron/moroco.py +1 -1
  313. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  314. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  315. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  316. mteb/tasks/classification/rus/headline_classification.py +2 -2
  317. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  318. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  319. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  320. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  321. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  322. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  323. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  324. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  325. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  326. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  327. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  328. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  329. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  330. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  331. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  332. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  333. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  334. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  335. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  336. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  337. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  338. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  341. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  342. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  343. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  344. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  345. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  346. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  347. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  348. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  349. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  350. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  351. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  352. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  353. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  354. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  355. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  356. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  357. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  358. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  359. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  360. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  361. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  362. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  363. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  364. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  365. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  366. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  367. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  368. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  369. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  370. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  371. mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
  372. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  373. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
  374. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  375. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  376. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  377. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  378. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  379. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  380. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  381. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  382. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  383. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  384. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  385. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  386. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  387. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  388. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  389. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  390. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  391. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  392. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  393. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  394. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  395. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  396. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  397. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  398. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  399. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  400. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  401. mteb/tasks/pair_classification/rus/terra.py +2 -2
  402. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  403. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  404. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  405. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  406. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  407. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  408. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  409. mteb/tasks/retrieval/code/code_rag.py +16 -16
  410. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  411. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  412. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  413. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  414. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  415. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  416. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  417. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
  418. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
  419. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
  420. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  421. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  422. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  423. mteb/tasks/retrieval/eng/__init__.py +44 -0
  424. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  425. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  426. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  428. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  435. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  438. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  439. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  440. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  441. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  442. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  443. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  445. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  446. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  447. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  448. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  449. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  450. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  451. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  452. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  453. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  454. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  455. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  456. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  457. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  458. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  459. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  460. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  461. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  462. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  463. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  464. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  465. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  466. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  467. mteb/tasks/retrieval/kor/__init__.py +15 -1
  468. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  469. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  470. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  471. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  472. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  473. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  474. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  475. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  476. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  477. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  478. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  479. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  480. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  481. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  482. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  483. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  484. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  485. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  486. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  487. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  488. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  489. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  490. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  491. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  492. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  493. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  494. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  495. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  496. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  497. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  498. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  499. mteb/tasks/retrieval/nob/norquad.py +3 -3
  500. mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
  501. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  502. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  503. mteb/tasks/retrieval/vie/__init__.py +14 -6
  504. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  505. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  506. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  507. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  508. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  509. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  510. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  511. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  512. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  513. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  514. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  515. mteb/tasks/sts/kor/klue_sts.py +1 -1
  516. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  517. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  518. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  519. mteb/types/__init__.py +2 -0
  520. mteb/types/_encoder_io.py +13 -1
  521. mteb/types/_result.py +2 -1
  522. mteb/types/statistics.py +18 -5
  523. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
  524. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
  525. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  526. mteb/models/model_implementations/mxbai_models.py +0 -111
  527. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  528. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  529. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,29 @@
1
- import logging
2
- from collections.abc import Sequence
3
- from typing import Any
1
+ from __future__ import annotations
4
2
 
5
- from mteb.abstasks.task_metadata import TaskMetadata
6
- from mteb.models import SearchProtocol
7
- from mteb.types import (
8
- CorpusDatasetType,
9
- QueryDatasetType,
10
- RelevantDocumentsType,
11
- RetrievalEvaluationResult,
12
- RetrievalOutputType,
13
- TopRankedDocumentsType,
14
- )
3
+ import logging
4
+ from typing import TYPE_CHECKING
15
5
 
16
6
  from .evaluator import Evaluator
17
7
  from .retrieval_metrics import (
18
8
  calculate_retrieval_scores,
19
9
  )
20
10
 
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Sequence
13
+
14
+ from mteb.abstasks.task_metadata import TaskMetadata
15
+ from mteb.models import SearchProtocol
16
+ from mteb.types import (
17
+ CorpusDatasetType,
18
+ EncodeKwargs,
19
+ QueryDatasetType,
20
+ RelevantDocumentsType,
21
+ RetrievalEvaluationResult,
22
+ RetrievalOutputType,
23
+ TopRankedDocumentsType,
24
+ )
25
+
26
+
21
27
  logger = logging.getLogger(__name__)
22
28
 
23
29
 
@@ -48,7 +54,8 @@ class RetrievalEvaluator(Evaluator):
48
54
  def __call__( # type: ignore[override]
49
55
  self,
50
56
  search_model: SearchProtocol,
51
- encode_kwargs: dict[str, Any],
57
+ encode_kwargs: EncodeKwargs,
58
+ num_proc: int = 1,
52
59
  ) -> RetrievalOutputType:
53
60
  logger.info("Running retrieval task - Indexing corpus...")
54
61
  search_model.index(
@@ -57,6 +64,7 @@ class RetrievalEvaluator(Evaluator):
57
64
  hf_split=self.hf_split,
58
65
  hf_subset=self.hf_subset,
59
66
  encode_kwargs=encode_kwargs,
67
+ num_proc=num_proc,
60
68
  )
61
69
  logger.info("Running retrieval task - Searching queries...")
62
70
  return search_model.search(
@@ -67,6 +75,7 @@ class RetrievalEvaluator(Evaluator):
67
75
  hf_subset=self.hf_subset,
68
76
  encode_kwargs=encode_kwargs,
69
77
  top_ranked=self.top_ranked,
78
+ num_proc=num_proc,
70
79
  )
71
80
 
72
81
  def evaluate(
@@ -1,6 +1,8 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  from collections import defaultdict
3
- from typing import Any
5
+ from typing import TYPE_CHECKING, Any
4
6
 
5
7
  import numpy as np
6
8
  import pandas as pd
@@ -8,14 +10,19 @@ import pytrec_eval
8
10
  from packaging.version import Version
9
11
  from sklearn.metrics import auc
10
12
 
11
- from mteb.types import RelevantDocumentsType, RetrievalEvaluationResult
13
+ from mteb.types import RetrievalEvaluationResult
14
+
15
+ if TYPE_CHECKING:
16
+ from collections.abc import Mapping
17
+
18
+ from mteb.types import RelevantDocumentsType
12
19
 
13
20
  logger = logging.getLogger(__name__)
14
21
 
15
22
 
16
23
  def mrr(
17
24
  qrels: RelevantDocumentsType,
18
- results: dict[str, dict[str, float]],
25
+ results: Mapping[str, Mapping[str, float]],
19
26
  k_values: list[int],
20
27
  ) -> dict[str, list[float]]:
21
28
  mrr_metrics = defaultdict(list)
@@ -32,7 +39,7 @@ def mrr(
32
39
  doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0
33
40
  }
34
41
  for k in k_values:
35
- rr = 0
42
+ rr = 0.0
36
43
  for rank, hit in enumerate(top_hits[query_id][0:k]):
37
44
  if hit[0] in query_relevant_docs:
38
45
  rr = 1.0 / (rank + 1)
@@ -45,8 +52,8 @@ def recall_cap(
45
52
  qrels: RelevantDocumentsType,
46
53
  results: dict[str, dict[str, float]],
47
54
  k_values: list[int],
48
- ) -> dict[str, list[float]]:
49
- capped_recall = defaultdict(list)
55
+ ) -> dict[str, list[float | None]]:
56
+ capped_recall: dict[str, list[float | None]] = defaultdict(list)
50
57
 
51
58
  k_max = max(k_values)
52
59
 
@@ -139,7 +146,7 @@ def calculate_pmrr(original_run, new_run, changed_qrels):
139
146
  changes = []
140
147
  for qid in changed_qrels.keys():
141
148
  if qid + "-og" not in original_run or qid + "-changed" not in new_run:
142
- logging.warning(f"Query {qid} not found in the runs for calculating p-MRR")
149
+ logger.warning(f"Query {qid} not found in the runs for calculating p-MRR")
143
150
  continue
144
151
  original_qid_run = original_run[qid + "-og"]
145
152
  new_qid_run = new_run[qid + "-changed"]
@@ -188,7 +195,7 @@ def evaluate_p_mrr_change(
188
195
  Returns:
189
196
  A dictionary with the scores, including "p-MRR", "og" and "changed" keys.
190
197
  """
191
- followir_scores = defaultdict(dict)
198
+ followir_scores: dict[str, float | dict[str, float]] = defaultdict(dict)
192
199
 
193
200
  qrels_sep = {
194
201
  "og": {k: v for k, v in qrels.items() if k.endswith("-og")},
@@ -227,7 +234,7 @@ def evaluate_p_mrr_change(
227
234
  ndcg, _map, recall, precision, naucs, avg_mrr, naucs_mrr, cv_recall, {}
228
235
  )
229
236
  for key, value in scores_dict.items():
230
- followir_scores[name][key] = value
237
+ followir_scores[name][key] = value # type: ignore[index]
231
238
 
232
239
  return followir_scores
233
240
 
@@ -254,8 +261,8 @@ def confidence_scores(sim_scores: list[float]) -> dict[str, float]:
254
261
  sim_scores_sorted = sorted(sim_scores)[::-1]
255
262
 
256
263
  cs_max = sim_scores_sorted[0]
257
- cs_std = np.std(sim_scores)
258
- cs_diff1 = None
264
+ cs_std = float(np.std(sim_scores))
265
+ cs_diff1 = 0.0
259
266
  if len(sim_scores) > 1:
260
267
  cs_diff1 = sim_scores_sorted[0] - sim_scores_sorted[1]
261
268
  elif len(sim_scores) == 1:
@@ -410,7 +417,7 @@ def make_score_dict(
410
417
  cv_recall: dict[str, float],
411
418
  task_scores: dict[str, float],
412
419
  previous_results_model_meta: dict[str, Any] | None = None,
413
- ) -> dict[str, float]:
420
+ ) -> dict[str, Any]:
414
421
  return {
415
422
  **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
416
423
  **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
@@ -528,7 +535,7 @@ def max_over_subqueries(
528
535
 
529
536
 
530
537
  def calculate_retrieval_scores(
531
- results: dict[str, dict[str, float]],
538
+ results: Mapping[str, Mapping[str, float]],
532
539
  qrels: RelevantDocumentsType,
533
540
  k_values: list[int],
534
541
  skip_first_result: bool = False,
@@ -576,7 +583,7 @@ def calculate_retrieval_scores(
576
583
 
577
584
 
578
585
  def evaluate_abstention(
579
- results: dict[str, dict[str, float]],
586
+ results: Mapping[str, Mapping[str, float]],
580
587
  metric_scores: dict[str, list[float]],
581
588
  ) -> dict[str, float]:
582
589
  """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997
@@ -591,21 +598,21 @@ def evaluate_abstention(
591
598
  all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())]
592
599
  all_conf_scores = [confidence_scores(sim_scores) for sim_scores in all_sim_scores]
593
600
  conf_fcts = list(all_conf_scores[0].keys())
594
- all_conf_scores = {
601
+ all_conf_scores_ = {
595
602
  fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts
596
603
  }
597
- metric_scores = {k: np.array(v) for k, v in metric_scores.items()}
604
+ metric_scores_ = {k: np.array(v) for k, v in metric_scores.items()}
598
605
  naucs = {}
599
606
 
600
- for metric_name, scores in metric_scores.items():
601
- for fct, conf_scores in all_conf_scores.items():
607
+ for metric_name, scores in metric_scores_.items():
608
+ for fct, conf_scores in all_conf_scores_.items():
602
609
  naucs[f"nAUC_{metric_name}_{fct}"] = nauc(conf_scores, scores)
603
610
 
604
611
  return naucs
605
612
 
606
613
 
607
614
  def calculate_cv_recall(
608
- results: dict[str, dict[str, float]],
615
+ results: Mapping[str, Mapping[str, float]],
609
616
  qrels: RelevantDocumentsType,
610
617
  k_values: list[int],
611
618
  skip_first_result: bool = False,
@@ -1,27 +1,31 @@
1
- import logging
2
- from typing import Any, Protocol
1
+ from __future__ import annotations
3
2
 
4
- import numpy as np
5
- from datasets import Dataset
6
- from torch.utils.data import DataLoader
7
- from typing_extensions import Self
3
+ import logging
4
+ from typing import TYPE_CHECKING, Any, Protocol, cast
8
5
 
9
6
  from mteb._create_dataloaders import create_dataloader
10
- from mteb.abstasks.task_metadata import TaskMetadata
11
- from mteb.models import EncoderProtocol
12
- from mteb.types import BatchedInput
13
7
 
14
8
  from .evaluator import Evaluator
15
9
 
10
+ if TYPE_CHECKING:
11
+ import numpy as np
12
+ from datasets import Dataset
13
+ from torch.utils.data import DataLoader
14
+ from typing_extensions import Self
15
+
16
+ from mteb.abstasks.task_metadata import TaskMetadata
17
+ from mteb.models import EncoderProtocol
18
+ from mteb.types import Array, BatchedInput, EncodeKwargs
19
+
16
20
  logger = logging.getLogger(__name__)
17
21
 
18
22
 
19
23
  class SklearnModelProtocol(Protocol):
20
- def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None: ... # noqa: N803
21
- def predict(self, X: np.ndarray) -> np.ndarray: ... # noqa: N803
24
+ def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ... # noqa: N803
25
+ def predict(self, X: Array) -> np.ndarray: ... # noqa: N803
22
26
  def get_params(self) -> dict[str, Any]: ...
23
- def set_params(self, **kwargs: dict[str, Any]) -> Self: ...
24
- def score(self, X: np.ndarray, y: np.ndarray | list[int]) -> float: ... # noqa: N803
27
+ def set_params(self, random_state: int, **kwargs: dict[str, Any]) -> Self: ...
28
+ def score(self, X: Array, y: np.ndarray | list[int]) -> float: ... # noqa: N803
25
29
 
26
30
 
27
31
  class SklearnEvaluator(Evaluator):
@@ -50,18 +54,20 @@ class SklearnEvaluator(Evaluator):
50
54
  self.evaluator_model = evaluator_model
51
55
 
52
56
  def create_dataloaders(
53
- self, encode_kwargs: dict[str, Any]
57
+ self, encode_kwargs: EncodeKwargs, num_proc: int
54
58
  ) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
55
59
  dataloader_train = create_dataloader(
56
60
  self.train_dataset,
57
61
  self.task_metadata,
58
62
  input_column=self.values_column_name,
63
+ num_proc=num_proc,
59
64
  **encode_kwargs,
60
65
  )
61
66
  dataloader_test = create_dataloader(
62
67
  self.eval_dataset,
63
68
  self.task_metadata,
64
69
  input_column=self.values_column_name,
70
+ num_proc=num_proc,
65
71
  **encode_kwargs,
66
72
  )
67
73
  return dataloader_train, dataloader_test
@@ -70,15 +76,17 @@ class SklearnEvaluator(Evaluator):
70
76
  self,
71
77
  model: EncoderProtocol,
72
78
  *,
73
- encode_kwargs: dict[str, Any],
74
- test_cache: np.ndarray | None = None,
75
- ) -> tuple[np.ndarray, np.ndarray]:
79
+ encode_kwargs: EncodeKwargs,
80
+ test_cache: Array | None = None,
81
+ num_proc: int = 1,
82
+ ) -> tuple[np.ndarray, Array]:
76
83
  """Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set.
77
84
 
78
85
  Args:
79
86
  model: Encoder
80
87
  encode_kwargs: encode kwargs
81
88
  test_cache: embeddings of the test set, if already computed
89
+ num_proc: number of processes to use
82
90
 
83
91
  Returns:
84
92
  Tuple of test predictions and embeddings
@@ -86,6 +94,7 @@ class SklearnEvaluator(Evaluator):
86
94
  """
87
95
  dataloader_train, dataloader_test = self.create_dataloaders(
88
96
  encode_kwargs=encode_kwargs,
97
+ num_proc=num_proc,
89
98
  )
90
99
 
91
100
  logger.info("Running - Encoding samples...")
@@ -104,6 +113,7 @@ class SklearnEvaluator(Evaluator):
104
113
  hf_subset=self.hf_subset,
105
114
  **encode_kwargs,
106
115
  )
116
+ test_cache = cast("Array", test_cache)
107
117
 
108
118
  logger.info("Running - Fitting classifier...")
109
119
  y_train = self.train_dataset[self.label_column_name]
@@ -1,15 +1,19 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING
3
5
 
4
- import numpy as np
5
6
  import torch
6
7
  from datasets import Dataset
7
8
  from tqdm.auto import tqdm
8
9
 
9
10
  from mteb._create_dataloaders import _create_dataloader_from_texts
10
11
  from mteb._evaluators.evaluator import Evaluator
11
- from mteb.abstasks.task_metadata import TaskMetadata
12
- from mteb.models import EncoderProtocol
12
+
13
+ if TYPE_CHECKING:
14
+ from mteb.abstasks.task_metadata import TaskMetadata
15
+ from mteb.models import EncoderProtocol
16
+ from mteb.types import Array, EncodeKwargs
13
17
 
14
18
  logger = logging.getLogger(__name__)
15
19
 
@@ -33,7 +37,11 @@ class BitextMiningEvaluator(Evaluator):
33
37
  self.task_metadata = task_metadata
34
38
 
35
39
  def __call__(
36
- self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
40
+ self,
41
+ model: EncoderProtocol,
42
+ *,
43
+ encode_kwargs: EncodeKwargs,
44
+ num_proc: int = 1,
37
45
  ) -> dict[str, list[dict[str, float]]]:
38
46
  pair_elements = {p for pair in self.pairs for p in pair}
39
47
  if isinstance(self.sentences, Dataset):
@@ -48,6 +56,7 @@ class BitextMiningEvaluator(Evaluator):
48
56
  for sub in tqdm(subsets):
49
57
  dataloader = _create_dataloader_from_texts(
50
58
  self.sentences[sub],
59
+ num_proc=num_proc,
51
60
  **encode_kwargs,
52
61
  )
53
62
  embeddings[sub] = model.encode(
@@ -69,11 +78,11 @@ class BitextMiningEvaluator(Evaluator):
69
78
 
70
79
  def _similarity_search(
71
80
  self,
72
- query_embeddings: np.ndarray,
73
- corpus_embeddings: np.ndarray,
81
+ query_embeddings: Array,
82
+ corpus_embeddings: Array,
74
83
  model: EncoderProtocol,
75
84
  query_chunk_size: int = 100,
76
- corpus_chunk_size: int = 500000,
85
+ corpus_chunk_size: int = 500_000,
77
86
  ) -> list[dict[str, float]]:
78
87
  """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.
79
88
 
@@ -104,13 +113,15 @@ class BitextMiningEvaluator(Evaluator):
104
113
  ):
105
114
  query_embeddings = query_embeddings.to(corpus_embeddings.device)
106
115
 
107
- queries_result_list = [[] for _ in range(len(query_embeddings))]
116
+ queries_result_list: list[list[dict[str, float]]] = [
117
+ [] for _ in range(len(query_embeddings))
118
+ ]
108
119
 
109
120
  for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
110
121
  # Iterate over chunks of the corpus
111
122
  for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
112
123
  # Compute cosine similarities
113
- similarity_scores = model.similarity( # type: ignore
124
+ similarity_scores = model.similarity(
114
125
  query_embeddings[
115
126
  query_start_idx : query_start_idx + query_chunk_size
116
127
  ],
@@ -120,15 +131,17 @@ class BitextMiningEvaluator(Evaluator):
120
131
  )
121
132
 
122
133
  # Get top-k scores
123
- cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
124
- torch.tensor(similarity_scores),
125
- 1,
126
- dim=1,
127
- largest=True,
128
- sorted=False,
134
+ cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = (
135
+ torch.topk(
136
+ torch.tensor(similarity_scores),
137
+ 1,
138
+ dim=1,
139
+ largest=True,
140
+ sorted=False,
141
+ )
129
142
  )
130
- cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
131
- cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
143
+ cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()
144
+ cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()
132
145
 
133
146
  for query_itr in range(len(similarity_scores)):
134
147
  for sub_corpus_id, score in zip(
@@ -141,11 +154,14 @@ class BitextMiningEvaluator(Evaluator):
141
154
  {"corpus_id": corpus_id, "score": score}
142
155
  )
143
156
 
157
+ result_queries_list: list[dict[str, float]] = [
158
+ {} for _ in range(len(query_embeddings))
159
+ ]
144
160
  # Sort and strip to top_k results
145
161
  for idx in range(len(queries_result_list)):
146
162
  queries_result_list[idx] = sorted(
147
163
  queries_result_list[idx], key=lambda x: x["score"], reverse=True
148
164
  )
149
- queries_result_list[idx] = queries_result_list[idx][0]
165
+ result_queries_list[idx] = queries_result_list[idx][0]
150
166
 
151
- return queries_result_list
167
+ return result_queries_list
@@ -1,6 +1,8 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  import sys
3
- from typing import Any, TypedDict
5
+ from typing import TYPE_CHECKING, TypedDict
4
6
 
5
7
  import numpy as np
6
8
  import torch
@@ -9,10 +11,13 @@ from tqdm.auto import tqdm
9
11
 
10
12
  from mteb._create_dataloaders import _create_dataloader_from_texts
11
13
  from mteb._evaluators.evaluator import Evaluator
12
- from mteb.abstasks.task_metadata import TaskMetadata
13
- from mteb.models import EncoderProtocol
14
14
  from mteb.similarity_functions import cos_sim, dot_score
15
15
 
16
+ if TYPE_CHECKING:
17
+ from mteb.abstasks.task_metadata import TaskMetadata
18
+ from mteb.models import EncoderProtocol
19
+ from mteb.types import EncodeKwargs
20
+
16
21
  # if later than python 3.13 use typing module
17
22
  if sys.version_info >= (3, 13):
18
23
  from warnings import deprecated
@@ -94,7 +99,8 @@ class SummarizationEvaluator(Evaluator):
94
99
  self,
95
100
  model: EncoderProtocol,
96
101
  *,
97
- encode_kwargs: dict[str, Any],
102
+ encode_kwargs: EncodeKwargs,
103
+ num_proc: int = 1,
98
104
  ) -> SummarizationDistances:
99
105
  # Get the human & machine summaries for the text in one go for all
100
106
  human_lens = [len(human_summaries) for human_summaries in self.human_summaries]
@@ -110,6 +116,7 @@ class SummarizationEvaluator(Evaluator):
110
116
  for human_summaries in self.human_summaries
111
117
  for summary in human_summaries
112
118
  ],
119
+ num_proc=num_proc,
113
120
  **encode_kwargs,
114
121
  ),
115
122
  task_metadata=self.task_metadata,
@@ -135,10 +142,10 @@ class SummarizationEvaluator(Evaluator):
135
142
  )
136
143
 
137
144
  # Split the embeddings into the original human & machine summaries
138
- embs_human_summaries_all = np.split(
145
+ embs_human_summaries_all_split = np.split(
139
146
  embs_human_summaries_all, np.cumsum(human_lens)[:-1]
140
147
  )
141
- embs_machine_summaries_all = np.split(
148
+ embs_machine_summaries_all_split = np.split(
142
149
  embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]
143
150
  )
144
151
 
@@ -148,7 +155,9 @@ class SummarizationEvaluator(Evaluator):
148
155
  all_human_scores = []
149
156
 
150
157
  for i, (embs_human_summaries, embs_machine_summaries) in tqdm(
151
- enumerate(zip(embs_human_summaries_all, embs_machine_summaries_all)),
158
+ enumerate(
159
+ zip(embs_human_summaries_all_split, embs_machine_summaries_all_split)
160
+ ),
152
161
  desc="Scoring",
153
162
  total=len(self.human_summaries),
154
163
  ):
@@ -164,7 +173,7 @@ class SummarizationEvaluator(Evaluator):
164
173
  dot_scores = dot_score(emb_machine_summary, embs_human_summaries)
165
174
 
166
175
  _sim_score = [
167
- float(model.similarity(emb_machine_summary, emb_human_summary)) # type: ignore
176
+ float(model.similarity(emb_machine_summary, emb_human_summary))
168
177
  for emb_human_summary in embs_human_summaries
169
178
  ]
170
179
  sim_score = torch.tensor(_sim_score)
@@ -216,17 +225,19 @@ class SummarizationEvaluator(Evaluator):
216
225
  strict=True,
217
226
  ):
218
227
  cosine_spearman_scores.append(
219
- spearmanr(human_scores, cosine_pred_scores).statistic
228
+ float(spearmanr(human_scores, cosine_pred_scores).statistic)
220
229
  )
221
230
  cosine_pearson_scores.append(
222
- pearsonr(human_scores, cosine_pred_scores).statistic
231
+ float(pearsonr(human_scores, cosine_pred_scores).statistic)
223
232
  )
224
233
  dot_spearman_scores.append(
225
- spearmanr(human_scores, dot_pred_scores).statistic
234
+ float(spearmanr(human_scores, dot_pred_scores).statistic)
226
235
  )
227
- dot_pearson_scores.append(pearsonr(human_scores, dot_pred_scores).statistic)
228
- spearman_scores.append(spearmanr(human_scores, sim_scores).statistic)
229
- pearson_scores.append(pearsonr(human_scores, sim_scores).statistic)
236
+ dot_pearson_scores.append(
237
+ float(pearsonr(human_scores, dot_pred_scores).statistic)
238
+ )
239
+ spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic))
240
+ pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic))
230
241
 
231
242
  return SummarizationMetrics(
232
243
  pearson=float(np.mean(pearson_scores)),
@@ -273,10 +284,10 @@ class DeprecatedSummarizationEvaluator(SummarizationEvaluator):
273
284
  pearson_scores.append(pearsonr(human_scores, sim_scores))
274
285
 
275
286
  return SummarizationMetrics(
276
- pearson=float(np.mean(pearson_scores)),
277
- spearman=float(np.mean(spearman_scores)),
278
- cosine_spearman=float(np.mean(cosine_spearman_scores)),
279
- cosine_pearson=float(np.mean(cosine_pearson_scores)),
280
- dot_pearson=float(np.mean(dot_pearson_scores)),
281
- dot_spearman=float(np.mean(dot_spearman_scores)),
287
+ pearson=float(np.mean(pearson_scores)), # type: ignore[arg-type]
288
+ spearman=float(np.mean(spearman_scores)), # type: ignore[arg-type]
289
+ cosine_spearman=float(np.mean(cosine_spearman_scores)), # type: ignore[arg-type]
290
+ cosine_pearson=float(np.mean(cosine_pearson_scores)), # type: ignore[arg-type]
291
+ dot_pearson=float(np.mean(dot_pearson_scores)), # type: ignore[arg-type]
292
+ dot_spearman=float(np.mean(dot_spearman_scores)), # type: ignore[arg-type]
282
293
  )
@@ -1,5 +1,7 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING
3
5
 
4
6
  from datasets import Dataset
5
7
 
@@ -7,13 +9,17 @@ from mteb._create_dataloaders import (
7
9
  _create_dataloader_from_texts,
8
10
  create_dataloader,
9
11
  )
10
- from mteb.abstasks.task_metadata import TaskMetadata
11
- from mteb.models import EncoderProtocol
12
12
  from mteb.similarity_functions import similarity
13
- from mteb.types import Array
14
13
 
15
14
  from .evaluator import Evaluator
16
15
 
16
+ if TYPE_CHECKING:
17
+ from datasets import Dataset
18
+
19
+ from mteb.abstasks.task_metadata import TaskMetadata
20
+ from mteb.models import EncoderProtocol
21
+ from mteb.types import Array, EncodeKwargs
22
+
17
23
  logger = logging.getLogger(__name__)
18
24
 
19
25
 
@@ -38,12 +44,17 @@ class ZeroShotClassificationEvaluator(Evaluator):
38
44
  self.hf_subset = hf_subset
39
45
 
40
46
  def __call__(
41
- self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
47
+ self,
48
+ model: EncoderProtocol,
49
+ *,
50
+ encode_kwargs: EncodeKwargs,
51
+ num_proc: int = 1,
42
52
  ) -> Array:
43
53
  dataloader = create_dataloader(
44
54
  self.dataset,
45
55
  input_column=self.input_column_name,
46
56
  task_metadata=self.task_metadata,
57
+ num_proc=num_proc,
47
58
  **encode_kwargs,
48
59
  )
49
60
 
mteb/_helpful_enum.py CHANGED
@@ -1,6 +1,10 @@
1
+ from __future__ import annotations
2
+
1
3
  from enum import Enum
4
+ from typing import TYPE_CHECKING
2
5
 
3
- from typing_extensions import Self
6
+ if TYPE_CHECKING:
7
+ from typing_extensions import Self
4
8
 
5
9
 
6
10
  class HelpfulStrEnum(str, Enum):
@@ -1,12 +1,18 @@
1
1
  """Simplified version of https://gist.github.com/AlexeyVatolin/ea3adc21aa7a767603ff393b22085adc from https://github.com/embeddings-benchmark/mteb/pull/2900"""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import logging
6
+ from typing import TYPE_CHECKING
4
7
 
5
8
  import datasets
6
9
  import pandas as pd
7
- from datasets import Dataset, DatasetDict
10
+ from datasets import DatasetDict
11
+
12
+ if TYPE_CHECKING:
13
+ from datasets import Dataset
8
14
 
9
- from mteb import TaskMetadata
15
+ from mteb import TaskMetadata
10
16
 
11
17
  logger = logging.getLogger(__name__)
12
18
 
@@ -61,7 +67,7 @@ def filter_unclear_label(
61
67
  for text, label in zip(ds[input_column], ds[label_column]):
62
68
  key = text.strip().lower()
63
69
  normalized.setdefault(key, set()).add(
64
- label if isinstance(label, (str, int, float)) else tuple(label)
70
+ label if isinstance(label, (str, int, float)) else tuple(label) # type: ignore[arg-type]
65
71
  )
66
72
 
67
73
  bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
@@ -1,9 +1,10 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
4
+ from typing import TYPE_CHECKING
2
5
 
3
6
  from datasets import DatasetDict
4
7
 
5
- from mteb import TaskMetadata
6
- from mteb.abstasks import AbsTaskClassification
7
8
  from mteb.abstasks._data_filter.filters import (
8
9
  deduplicate,
9
10
  filter_empty,
@@ -13,6 +14,10 @@ from mteb.abstasks._data_filter.filters import (
13
14
  split_train_test,
14
15
  )
15
16
 
17
+ if TYPE_CHECKING:
18
+ from mteb import TaskMetadata
19
+ from mteb.abstasks import AbsTaskClassification
20
+
16
21
  logger = logging.getLogger(__name__)
17
22
 
18
23
 
@@ -89,6 +94,9 @@ def process_classification(
89
94
  subset=None,
90
95
  )
91
96
 
97
+ if task.dataset is None:
98
+ raise ValueError("Task dataset is None.")
99
+
92
100
  new_ds = {}
93
101
  for subset in task.dataset:
94
102
  new_ds[subset] = clean_dataset(