mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (529)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +78 -30
  3. mteb/_evaluators/any_sts_evaluator.py +13 -6
  4. mteb/_evaluators/clustering_evaluator.py +13 -5
  5. mteb/_evaluators/evaluator.py +12 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
  7. mteb/_evaluators/pair_classification_evaluator.py +17 -7
  8. mteb/_evaluators/retrieval_evaluator.py +23 -14
  9. mteb/_evaluators/retrieval_metrics.py +26 -19
  10. mteb/_evaluators/sklearn_evaluator.py +27 -17
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
  12. mteb/_evaluators/text/summarization_evaluator.py +31 -20
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +9 -3
  16. mteb/abstasks/_data_filter/task_pipelines.py +10 -2
  17. mteb/abstasks/_statistics_calculation.py +21 -11
  18. mteb/abstasks/_stratification.py +18 -18
  19. mteb/abstasks/abstask.py +78 -44
  20. mteb/abstasks/aggregate_task_metadata.py +21 -18
  21. mteb/abstasks/aggregated_task.py +23 -35
  22. mteb/abstasks/classification.py +39 -18
  23. mteb/abstasks/clustering.py +37 -20
  24. mteb/abstasks/clustering_legacy.py +30 -16
  25. mteb/abstasks/image/image_text_pair_classification.py +26 -9
  26. mteb/abstasks/multilabel_classification.py +33 -21
  27. mteb/abstasks/pair_classification.py +44 -19
  28. mteb/abstasks/regression.py +18 -10
  29. mteb/abstasks/retrieval.py +82 -52
  30. mteb/abstasks/retrieval_dataset_loaders.py +50 -39
  31. mteb/abstasks/sts.py +34 -15
  32. mteb/abstasks/task_metadata.py +44 -37
  33. mteb/abstasks/text/bitext_mining.py +57 -35
  34. mteb/abstasks/text/reranking.py +10 -8
  35. mteb/abstasks/text/summarization.py +26 -10
  36. mteb/abstasks/zeroshot_classification.py +27 -9
  37. mteb/benchmarks/_create_table.py +13 -7
  38. mteb/benchmarks/benchmark.py +15 -3
  39. mteb/benchmarks/benchmarks/__init__.py +6 -0
  40. mteb/benchmarks/benchmarks/benchmarks.py +153 -13
  41. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  42. mteb/benchmarks/get_benchmark.py +14 -55
  43. mteb/cache.py +189 -31
  44. mteb/cli/_display_tasks.py +10 -4
  45. mteb/cli/build_cli.py +112 -13
  46. mteb/cli/generate_model_card.py +50 -23
  47. mteb/deprecated_evaluator.py +72 -54
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  49. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  50. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  52. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  65. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  66. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  67. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  68. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  69. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  70. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  71. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  72. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  73. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  74. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  75. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  81. mteb/evaluate.py +71 -47
  82. mteb/filter_tasks.py +36 -32
  83. mteb/get_tasks.py +37 -33
  84. mteb/languages/language_scripts.py +11 -4
  85. mteb/leaderboard/app.py +172 -37
  86. mteb/leaderboard/table.py +7 -2
  87. mteb/load_results.py +20 -14
  88. mteb/models/abs_encoder.py +30 -16
  89. mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
  90. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
  91. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
  92. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  93. mteb/models/cache_wrappers/cache_wrapper.py +16 -11
  94. mteb/models/get_model_meta.py +53 -9
  95. mteb/models/instruct_wrapper.py +41 -13
  96. mteb/models/model_implementations/align_models.py +11 -5
  97. mteb/models/model_implementations/amazon_models.py +1 -0
  98. mteb/models/model_implementations/andersborges.py +6 -4
  99. mteb/models/model_implementations/ara_models.py +2 -1
  100. mteb/models/model_implementations/arctic_models.py +16 -8
  101. mteb/models/model_implementations/b1ade_models.py +2 -1
  102. mteb/models/model_implementations/bedrock_models.py +20 -6
  103. mteb/models/model_implementations/bge_models.py +85 -22
  104. mteb/models/model_implementations/bica_model.py +4 -3
  105. mteb/models/model_implementations/blip2_models.py +13 -6
  106. mteb/models/model_implementations/blip_models.py +33 -20
  107. mteb/models/model_implementations/bm25.py +27 -17
  108. mteb/models/model_implementations/bmretriever_models.py +16 -6
  109. mteb/models/model_implementations/cadet_models.py +2 -1
  110. mteb/models/model_implementations/cde_models.py +22 -9
  111. mteb/models/model_implementations/clip_models.py +18 -10
  112. mteb/models/model_implementations/clips_models.py +6 -3
  113. mteb/models/model_implementations/codefuse_models.py +10 -5
  114. mteb/models/model_implementations/codesage_models.py +6 -3
  115. mteb/models/model_implementations/cohere_models.py +19 -9
  116. mteb/models/model_implementations/cohere_v.py +16 -6
  117. mteb/models/model_implementations/colpali_models.py +10 -6
  118. mteb/models/model_implementations/colqwen_models.py +24 -38
  119. mteb/models/model_implementations/colsmol_models.py +5 -3
  120. mteb/models/model_implementations/conan_models.py +12 -5
  121. mteb/models/model_implementations/dino_models.py +70 -46
  122. mteb/models/model_implementations/e5_instruct.py +27 -4
  123. mteb/models/model_implementations/e5_models.py +18 -9
  124. mteb/models/model_implementations/e5_v.py +16 -10
  125. mteb/models/model_implementations/eagerworks_models.py +12 -5
  126. mteb/models/model_implementations/emillykkejensen_models.py +9 -6
  127. mteb/models/model_implementations/en_code_retriever.py +2 -1
  128. mteb/models/model_implementations/euler_models.py +3 -2
  129. mteb/models/model_implementations/evaclip_models.py +13 -4
  130. mteb/models/model_implementations/fa_models.py +18 -9
  131. mteb/models/model_implementations/facebookai.py +16 -2
  132. mteb/models/model_implementations/geogpt_models.py +2 -1
  133. mteb/models/model_implementations/gme_v_models.py +13 -8
  134. mteb/models/model_implementations/google_models.py +16 -5
  135. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
  136. mteb/models/model_implementations/gritlm_models.py +5 -2
  137. mteb/models/model_implementations/gte_models.py +34 -13
  138. mteb/models/model_implementations/hinvec_models.py +7 -2
  139. mteb/models/model_implementations/human.py +1 -0
  140. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  141. mteb/models/model_implementations/inf_models.py +4 -2
  142. mteb/models/model_implementations/jasper_models.py +16 -7
  143. mteb/models/model_implementations/jina_clip.py +58 -14
  144. mteb/models/model_implementations/jina_models.py +35 -16
  145. mteb/models/model_implementations/kalm_models.py +24 -12
  146. mteb/models/model_implementations/kblab.py +13 -6
  147. mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
  148. mteb/models/model_implementations/kfst.py +2 -1
  149. mteb/models/model_implementations/kowshik24_models.py +2 -1
  150. mteb/models/model_implementations/lens_models.py +2 -0
  151. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  152. mteb/models/model_implementations/linq_models.py +8 -2
  153. mteb/models/model_implementations/listconranker.py +11 -5
  154. mteb/models/model_implementations/llm2clip_models.py +18 -10
  155. mteb/models/model_implementations/llm2vec_models.py +28 -14
  156. mteb/models/model_implementations/mcinext_models.py +12 -3
  157. mteb/models/model_implementations/mdbr_models.py +19 -3
  158. mteb/models/model_implementations/misc_models.py +131 -68
  159. mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
  160. mteb/models/model_implementations/mme5_models.py +3 -2
  161. mteb/models/model_implementations/moco_models.py +15 -8
  162. mteb/models/model_implementations/mod_models.py +3 -2
  163. mteb/models/model_implementations/model2vec_models.py +37 -18
  164. mteb/models/model_implementations/moka_models.py +4 -1
  165. mteb/models/model_implementations/nbailab.py +6 -3
  166. mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
  167. mteb/models/model_implementations/nomic_models.py +47 -19
  168. mteb/models/model_implementations/nomic_models_vision.py +6 -4
  169. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
  170. mteb/models/model_implementations/nvidia_models.py +165 -22
  171. mteb/models/model_implementations/octen_models.py +64 -3
  172. mteb/models/model_implementations/openai_models.py +14 -4
  173. mteb/models/model_implementations/openclip_models.py +30 -17
  174. mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
  175. mteb/models/model_implementations/ops_moa_models.py +10 -3
  176. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
  177. mteb/models/model_implementations/pawan_models.py +2 -1
  178. mteb/models/model_implementations/piccolo_models.py +3 -1
  179. mteb/models/model_implementations/pixie_models.py +56 -0
  180. mteb/models/model_implementations/promptriever_models.py +20 -10
  181. mteb/models/model_implementations/pylate_models.py +41 -21
  182. mteb/models/model_implementations/qodo_models.py +4 -2
  183. mteb/models/model_implementations/qtack_models.py +2 -1
  184. mteb/models/model_implementations/qwen3_models.py +14 -4
  185. mteb/models/model_implementations/qzhou_models.py +4 -2
  186. mteb/models/model_implementations/random_baseline.py +7 -6
  187. mteb/models/model_implementations/rasgaard_models.py +3 -2
  188. mteb/models/model_implementations/reasonir_model.py +66 -1
  189. mteb/models/model_implementations/repllama_models.py +18 -9
  190. mteb/models/model_implementations/rerankers_custom.py +25 -10
  191. mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
  192. mteb/models/model_implementations/richinfoai_models.py +2 -1
  193. mteb/models/model_implementations/ru_sentence_models.py +40 -20
  194. mteb/models/model_implementations/ruri_models.py +20 -10
  195. mteb/models/model_implementations/salesforce_models.py +13 -4
  196. mteb/models/model_implementations/samilpwc_models.py +2 -1
  197. mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
  198. mteb/models/model_implementations/searchmap_models.py +2 -1
  199. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  200. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
  201. mteb/models/model_implementations/seed_models.py +2 -1
  202. mteb/models/model_implementations/sentence_transformers_models.py +142 -22
  203. mteb/models/model_implementations/shuu_model.py +2 -1
  204. mteb/models/model_implementations/siglip_models.py +39 -24
  205. mteb/models/model_implementations/slm_models.py +419 -0
  206. mteb/models/model_implementations/sonar_models.py +2 -1
  207. mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
  208. mteb/models/model_implementations/stella_models.py +23 -4
  209. mteb/models/model_implementations/tarka_models.py +4 -2
  210. mteb/models/model_implementations/text2vec_models.py +12 -3
  211. mteb/models/model_implementations/ua_sentence_models.py +2 -1
  212. mteb/models/model_implementations/uae_models.py +17 -5
  213. mteb/models/model_implementations/vdr_models.py +9 -2
  214. mteb/models/model_implementations/vi_vn_models.py +12 -6
  215. mteb/models/model_implementations/vista_models.py +11 -4
  216. mteb/models/model_implementations/vlm2vec_models.py +14 -7
  217. mteb/models/model_implementations/voyage_models.py +136 -4
  218. mteb/models/model_implementations/voyage_v.py +17 -10
  219. mteb/models/model_implementations/xyz_models.py +1 -0
  220. mteb/models/model_implementations/youtu_models.py +2 -1
  221. mteb/models/model_implementations/yuan_models.py +2 -1
  222. mteb/models/model_implementations/yuan_models_en.py +3 -2
  223. mteb/models/model_meta.py +127 -40
  224. mteb/models/models_protocols.py +43 -22
  225. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  226. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
  227. mteb/models/search_wrappers.py +63 -29
  228. mteb/models/sentence_transformer_wrapper.py +52 -26
  229. mteb/models/vllm_wrapper.py +329 -0
  230. mteb/py.typed +0 -0
  231. mteb/results/benchmark_results.py +48 -35
  232. mteb/results/model_result.py +68 -32
  233. mteb/results/task_result.py +110 -72
  234. mteb/similarity_functions.py +19 -9
  235. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  236. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  237. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  238. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  239. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  240. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  241. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  242. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  243. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  244. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  245. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  246. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  247. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  248. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  249. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  251. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  252. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  253. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  256. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  257. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  258. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  259. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  260. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  261. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  262. mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
  263. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  264. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  265. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  266. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  267. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  268. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  269. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  270. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  271. mteb/tasks/classification/est/estonian_valence.py +2 -2
  272. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  273. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  274. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  275. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  276. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  277. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  278. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  279. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  280. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  281. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  282. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  283. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  284. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  285. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  286. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  287. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  288. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  289. mteb/tasks/classification/kor/klue_tc.py +2 -2
  290. mteb/tasks/classification/kor/kor_fin.py +1 -1
  291. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  292. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  293. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  294. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  295. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  296. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  297. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  298. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  299. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  300. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  301. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  302. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  303. mteb/tasks/classification/multilingual/scala_classification.py +2 -2
  304. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  305. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  306. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  307. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  308. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  309. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  310. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  311. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  312. mteb/tasks/classification/ron/moroco.py +1 -1
  313. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  314. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  315. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  316. mteb/tasks/classification/rus/headline_classification.py +2 -2
  317. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  318. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  319. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  320. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  321. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  322. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  323. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  324. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  325. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  326. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  327. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  328. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  329. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  330. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  331. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  332. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  333. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  334. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  335. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  336. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  337. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  338. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  341. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  342. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  343. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  344. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  345. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  346. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  347. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  348. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  349. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  350. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  351. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  352. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  353. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  354. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  355. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  356. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  357. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  358. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  359. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  360. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  361. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  362. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  363. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  364. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  365. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  366. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  367. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  368. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  369. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  370. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  371. mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
  372. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  373. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
  374. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  375. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  376. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  377. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  378. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  379. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  380. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  381. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  382. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  383. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  384. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  385. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  386. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  387. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  388. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  389. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  390. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  391. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  392. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  393. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  394. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  395. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  396. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  397. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  398. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  399. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  400. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  401. mteb/tasks/pair_classification/rus/terra.py +2 -2
  402. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  403. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  404. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  405. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  406. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  407. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  408. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  409. mteb/tasks/retrieval/code/code_rag.py +16 -16
  410. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  411. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  412. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  413. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  414. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  415. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  416. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  417. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
  418. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
  419. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
  420. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  421. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  422. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  423. mteb/tasks/retrieval/eng/__init__.py +44 -0
  424. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  425. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  426. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  428. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  435. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  438. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  439. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  440. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  441. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  442. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  443. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  445. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  446. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  447. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  448. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  449. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  450. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  451. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  452. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  453. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  454. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  455. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  456. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  457. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  458. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  459. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  460. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  461. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  462. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  463. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  464. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  465. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  466. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  467. mteb/tasks/retrieval/kor/__init__.py +15 -1
  468. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  469. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  470. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  471. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  472. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  473. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  474. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  475. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  476. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  477. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  478. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  479. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  480. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  481. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  482. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  483. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  484. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  485. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  486. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  487. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  488. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  489. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  490. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  491. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  492. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  493. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  494. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  495. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  496. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  497. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  498. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  499. mteb/tasks/retrieval/nob/norquad.py +3 -3
  500. mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
  501. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  502. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  503. mteb/tasks/retrieval/vie/__init__.py +14 -6
  504. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  505. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  506. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  507. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  508. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  509. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  510. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  511. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  512. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  513. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  514. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  515. mteb/tasks/sts/kor/klue_sts.py +1 -1
  516. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  517. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  518. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  519. mteb/types/__init__.py +2 -0
  520. mteb/types/_encoder_io.py +13 -1
  521. mteb/types/_result.py +2 -1
  522. mteb/types/statistics.py +18 -5
  523. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
  524. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
  525. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  526. mteb/models/model_implementations/mxbai_models.py +0 -111
  527. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  528. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  529. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,15 @@
1
+ from __future__ import annotations
2
+
1
3
  import hashlib
2
4
  import logging
3
5
  from collections import defaultdict
4
- from pathlib import Path
6
+ from typing import TYPE_CHECKING
5
7
 
6
8
  import numpy as np
7
9
  from datasets import Dataset
8
10
  from sklearn.metrics import average_precision_score
9
11
 
10
12
  from mteb._evaluators import PairClassificationEvaluator
11
- from mteb._evaluators.pair_classification_evaluator import (
12
- PairClassificationDistances,
13
- )
14
13
  from mteb.abstasks._statistics_calculation import (
15
14
  calculate_image_statistics,
16
15
  calculate_label_statistics,
@@ -19,14 +18,25 @@ from mteb.abstasks._statistics_calculation import (
19
18
  from mteb.abstasks.abstask import AbsTask
20
19
  from mteb.models.model_meta import ScoringFunction
21
20
  from mteb.models.models_protocols import EncoderProtocol
22
- from mteb.types import PromptType
23
21
  from mteb.types.statistics import (
24
- ImageStatistics,
25
- LabelStatistics,
26
22
  SplitDescriptiveStatistics,
27
- TextStatistics,
28
23
  )
29
24
 
25
+ if TYPE_CHECKING:
26
+ from pathlib import Path
27
+
28
+ from mteb._evaluators.pair_classification_evaluator import (
29
+ PairClassificationDistances,
30
+ )
31
+ from mteb.models.models_protocols import MTEBModels
32
+ from mteb.types import EncodeKwargs, PromptType
33
+ from mteb.types.statistics import (
34
+ ImageStatistics,
35
+ LabelStatistics,
36
+ TextStatistics,
37
+ )
38
+
39
+
30
40
  logger = logging.getLogger(__name__)
31
41
 
32
42
 
@@ -44,8 +54,8 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
44
54
  """
45
55
 
46
56
  num_samples: int
47
- number_of_characters: int
48
- unique_pairs: int
57
+ number_of_characters: int | None
58
+ unique_pairs: int | None
49
59
 
50
60
  text1_statistics: TextStatistics | None
51
61
  image1_statistics: ImageStatistics | None
@@ -79,15 +89,19 @@ class AbsTaskPairClassification(AbsTask):
79
89
 
80
90
  def _evaluate_subset(
81
91
  self,
82
- model: EncoderProtocol,
92
+ model: MTEBModels,
83
93
  data_split: Dataset,
84
94
  *,
85
95
  hf_split: str,
86
96
  hf_subset: str,
87
- encode_kwargs: dict[str, str],
97
+ encode_kwargs: EncodeKwargs,
88
98
  prediction_folder: Path | None = None,
99
+ num_proc: int = 1,
89
100
  **kwargs,
90
101
  ) -> dict[str, float]:
102
+ if not isinstance(model, EncoderProtocol):
103
+ raise TypeError("Expected model to be an instance of EncoderProtocol")
104
+
91
105
  if self.metadata.modalities == ["text"]:
92
106
  # for compatibility with v1 version where datasets were stored in a single row
93
107
  data_split = data_split[0] if len(data_split) == 1 else data_split
@@ -102,7 +116,11 @@ class AbsTaskPairClassification(AbsTask):
102
116
  input2_prompt_type=self.input2_prompt_type,
103
117
  **kwargs,
104
118
  )
105
- similarity_scores = evaluator(model, encode_kwargs=encode_kwargs)
119
+ similarity_scores = evaluator(
120
+ model,
121
+ encode_kwargs=encode_kwargs,
122
+ num_proc=num_proc,
123
+ )
106
124
 
107
125
  if prediction_folder:
108
126
  self._save_task_predictions(
@@ -120,7 +138,7 @@ class AbsTaskPairClassification(AbsTask):
120
138
  self, similarity_scores: PairClassificationDistances, labels: list[int]
121
139
  ) -> dict[str, float]:
122
140
  logger.info("Computing metrics...")
123
- labels = np.asarray(labels)
141
+ np_labels = np.asarray(labels)
124
142
  output_scores = {}
125
143
  max_scores = defaultdict(list)
126
144
  for short_name, scores, reverse in [
@@ -142,7 +160,7 @@ class AbsTaskPairClassification(AbsTask):
142
160
  ],
143
161
  [ScoringFunction.DOT_PRODUCT.value, similarity_scores["dot_scores"], True],
144
162
  ]:
145
- metrics = self._compute_metrics_values(scores, labels, reverse)
163
+ metrics = self._compute_metrics_values(scores, np_labels, reverse) # type: ignore[arg-type]
146
164
  for metric_name, metric_value in metrics.items():
147
165
  output_scores[f"{short_name}_{metric_name}"] = metric_value
148
166
  max_scores[metric_name].append(metric_value)
@@ -235,8 +253,14 @@ class AbsTaskPairClassification(AbsTask):
235
253
  labels_statistics=calculate_label_statistics(labels),
236
254
  )
237
255
 
238
- def _push_dataset_to_hub(self, repo_name: str) -> None:
256
+ def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
239
257
  # previously pair classification datasets were stored in a single row
258
+ if self.dataset is None:
259
+ # overall this shouldn't happen as we check for dataset before pushing to hub
260
+ # added here for type checking purposes
261
+ raise RuntimeError(
262
+ "Dataset not loaded. To load dataset run `task.load_data()`."
263
+ )
240
264
  if self.metadata.is_multilingual:
241
265
  for subset in self.dataset:
242
266
  for split in self.dataset[subset]:
@@ -253,6 +277,7 @@ class AbsTaskPairClassification(AbsTask):
253
277
  self.input2_column_name,
254
278
  self.label_column_name,
255
279
  ],
280
+ num_proc=num_proc,
256
281
  )
257
282
 
258
283
  def _compute_metrics_values(
@@ -290,13 +315,13 @@ class AbsTaskPairClassification(AbsTask):
290
315
  )
291
316
 
292
317
  def _find_best_acc_and_threshold(
293
- self, scores: np.ndarray, labels: np.ndarray, high_score_more_similar: bool
318
+ self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool
294
319
  ) -> tuple[float, float]:
295
320
  rows = list(zip(scores, labels))
296
321
  rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
297
322
 
298
323
  max_acc = 0
299
- best_threshold = -1
324
+ best_threshold = -1.0
300
325
  positive_so_far = 0
301
326
  remaining_negatives = sum(np.array(labels) == 0)
302
327
 
@@ -323,7 +348,7 @@ class AbsTaskPairClassification(AbsTask):
323
348
 
324
349
  rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
325
350
 
326
- best_f1 = best_precision = best_recall = 0
351
+ best_f1 = best_precision = best_recall = 0.0
327
352
  threshold = 0
328
353
  nextract = 0
329
354
  ncorrect = 0
@@ -1,29 +1,37 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import TypedDict
4
+ from typing import TYPE_CHECKING, TypedDict
3
5
 
4
6
  import datasets
5
7
  import numpy as np
6
8
  import pandas as pd
7
- from datasets import Dataset
8
9
  from scipy.stats import kendalltau
9
10
  from sklearn.linear_model import LinearRegression
10
11
  from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
11
12
 
12
- from mteb._evaluators.sklearn_evaluator import SklearnEvaluator, SklearnModelProtocol
13
+ from mteb._evaluators.sklearn_evaluator import SklearnEvaluator
13
14
  from mteb.abstasks._statistics_calculation import (
14
15
  calculate_image_statistics,
15
16
  calculate_score_statistics,
16
17
  calculate_text_statistics,
17
18
  )
18
19
  from mteb.types.statistics import (
19
- ImageStatistics,
20
- ScoreStatistics,
21
20
  SplitDescriptiveStatistics,
22
- TextStatistics,
23
21
  )
24
22
 
25
23
  from .classification import AbsTaskClassification
26
24
 
25
+ if TYPE_CHECKING:
26
+ from datasets import Dataset
27
+
28
+ from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
29
+ from mteb.types.statistics import (
30
+ ImageStatistics,
31
+ ScoreStatistics,
32
+ TextStatistics,
33
+ )
34
+
27
35
  logger = logging.getLogger(__name__)
28
36
 
29
37
 
@@ -84,10 +92,10 @@ class AbsTaskRegression(AbsTaskClassification):
84
92
  n_samples: Number of samples to use for training the regression model. If the dataset has fewer samples than n_samples, all samples are used.
85
93
  abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
86
94
  evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LinearRegression`.
87
- Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
95
+
88
96
  """
89
97
 
90
- evaluator: type[SklearnModelProtocol] = SklearnEvaluator
98
+ evaluator: type[SklearnEvaluator] = SklearnEvaluator
91
99
  evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1)
92
100
 
93
101
  train_split: str = "train"
@@ -113,7 +121,7 @@ class AbsTaskRegression(AbsTaskClassification):
113
121
  )["train"]
114
122
  return train_split_sampled, []
115
123
 
116
- def _calculate_scores(
124
+ def _calculate_scores( # type: ignore[override]
117
125
  self,
118
126
  y_test: np.ndarray | list[int],
119
127
  y_pred: np.ndarray,
@@ -183,7 +191,7 @@ class AbsTaskRegression(AbsTaskClassification):
183
191
 
184
192
  return dataset_dict
185
193
 
186
- def _calculate_descriptive_statistics_from_split(
194
+ def _calculate_descriptive_statistics_from_split( # type: ignore[override]
187
195
  self, split: str, hf_subset: str | None = None, compute_overall: bool = False
188
196
  ) -> RegressionDescriptiveStatistics:
189
197
  train_text = []
@@ -1,13 +1,13 @@
1
+ from __future__ import annotations
2
+
1
3
  import json
2
4
  import logging
3
5
  from collections import defaultdict
4
- from collections.abc import Callable, Sequence
5
6
  from pathlib import Path
6
7
  from time import time
7
- from typing import Any, Literal
8
+ from typing import TYPE_CHECKING, Any, Literal
8
9
 
9
10
  from datasets import Dataset, DatasetDict, concatenate_datasets
10
- from typing_extensions import Self
11
11
 
12
12
  from mteb._create_dataloaders import (
13
13
  _combine_queries_with_instruction_text,
@@ -19,24 +19,12 @@ from mteb._evaluators.retrieval_metrics import make_score_dict
19
19
  from mteb.models import (
20
20
  CrossEncoderProtocol,
21
21
  EncoderProtocol,
22
- MTEBModels,
23
22
  SearchCrossEncoderWrapper,
24
23
  SearchEncoderWrapper,
25
24
  SearchProtocol,
26
25
  )
27
- from mteb.types import (
28
- HFSubset,
29
- QueryDatasetType,
30
- RelevantDocumentsType,
31
- RetrievalOutputType,
32
- ScoresDict,
33
- )
34
26
  from mteb.types.statistics import (
35
- ImageStatistics,
36
- RelevantDocsStatistics,
37
27
  SplitDescriptiveStatistics,
38
- TextStatistics,
39
- TopRankedStatistics,
40
28
  )
41
29
 
42
30
  from ._statistics_calculation import (
@@ -52,6 +40,30 @@ from .retrieval_dataset_loaders import (
52
40
  _combine_queries_with_instructions_datasets,
53
41
  )
54
42
 
43
+ if TYPE_CHECKING:
44
+ from collections.abc import Callable, Mapping, Sequence
45
+
46
+ from typing_extensions import Self
47
+
48
+ from mteb.models import (
49
+ MTEBModels,
50
+ )
51
+ from mteb.types import (
52
+ EncodeKwargs,
53
+ HFSubset,
54
+ QueryDatasetType,
55
+ RelevantDocumentsType,
56
+ RetrievalOutputType,
57
+ ScoresDict,
58
+ )
59
+ from mteb.types.statistics import (
60
+ ImageStatistics,
61
+ RelevantDocsStatistics,
62
+ TextStatistics,
63
+ TopRankedStatistics,
64
+ )
65
+
66
+
55
67
  logger = logging.getLogger(__name__)
56
68
 
57
69
 
@@ -136,7 +148,7 @@ class AbsTaskRetrieval(AbsTask):
136
148
  )
137
149
  )
138
150
 
139
- def convert_v1_dataset_format_to_v2(self):
151
+ def convert_v1_dataset_format_to_v2(self, num_proc: int) -> None:
140
152
  """Convert dataset from v1 (from `self.queries`, `self.document`) format to v2 format (`self.dotaset`)."""
141
153
  # check if dataset is `v1` version
142
154
  if not hasattr(self, "queries"):
@@ -184,17 +196,17 @@ class AbsTaskRetrieval(AbsTask):
184
196
  return queries, corpus
185
197
 
186
198
  if self.metadata.is_multilingual:
187
- for subset in self.queries:
188
- for split in self.queries[subset]:
189
- queries = self.queries[subset][split]
190
- corpus = self.corpus[subset][split]
199
+ for subset in self.queries: # type: ignore[attr-defined]
200
+ for split in self.queries[subset]: # type: ignore[attr-defined]
201
+ queries = self.queries[subset][split] # type: ignore[attr-defined]
202
+ corpus = self.corpus[subset][split] # type: ignore[attr-defined]
191
203
 
192
204
  (
193
205
  self.dataset[subset][split]["queries"],
194
206
  self.dataset[subset][split]["corpus"],
195
207
  ) = _process_split(queries, corpus)
196
208
 
197
- self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
209
+ self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[ # type: ignore[attr-defined]
198
210
  subset
199
211
  ][split]
200
212
  if hasattr(self, "instructions"):
@@ -203,6 +215,7 @@ class AbsTaskRetrieval(AbsTask):
203
215
  _combine_queries_with_instructions_datasets(
204
216
  self.dataset[subset][split]["queries"],
205
217
  instructions,
218
+ num_proc,
206
219
  )
207
220
  )
208
221
  if hasattr(self, "top_ranked"):
@@ -211,15 +224,15 @@ class AbsTaskRetrieval(AbsTask):
211
224
  ][split]
212
225
  else:
213
226
  subset = "default"
214
- for split in self.queries:
215
- queries = self.queries[split]
216
- corpus = self.corpus[split]
227
+ for split in self.queries: # type: ignore[attr-defined]
228
+ queries = self.queries[split] # type: ignore[attr-defined]
229
+ corpus = self.corpus[split] # type: ignore[attr-defined]
217
230
  (
218
231
  self.dataset[subset][split]["queries"],
219
232
  self.dataset[subset][split]["corpus"],
220
233
  ) = _process_split(queries, corpus)
221
234
 
222
- self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
235
+ self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[ # type: ignore[attr-defined]
223
236
  split
224
237
  ].copy()
225
238
  if hasattr(self, "instructions"):
@@ -228,28 +241,29 @@ class AbsTaskRetrieval(AbsTask):
228
241
  _combine_queries_with_instructions_datasets(
229
242
  self.dataset[subset][split]["queries"],
230
243
  instructions,
244
+ num_proc,
231
245
  )
232
246
  )
233
- if hasattr(self, "top_ranked"):
247
+ if hasattr(self, "top_ranked") and self.top_ranked:
234
248
  self.dataset[subset][split]["top_ranked"] = self.top_ranked[
235
249
  split
236
250
  ].copy()
237
251
 
238
- del self.queries
239
- del self.corpus
240
- del self.relevant_docs
252
+ del self.queries # type: ignore[attr-defined]
253
+ del self.corpus # type: ignore[attr-defined]
254
+ del self.relevant_docs # type: ignore[attr-defined]
241
255
  if hasattr(self, "instructions"):
242
256
  del self.instructions
243
257
  if hasattr(self, "top_ranked"):
244
258
  del self.top_ranked
245
259
 
246
- def load_data(self) -> None:
260
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
247
261
  """Load the dataset for the retrieval task."""
248
262
  if self.data_loaded:
249
263
  return
250
264
 
251
265
  dataset_path = self.metadata.dataset["path"]
252
- eval_splits = self.metadata.eval_splits
266
+ eval_splits = self.eval_splits
253
267
  trust_remote_code = self.metadata.dataset.get("trust_remote_code", False)
254
268
  revision = self.metadata.dataset["revision"]
255
269
 
@@ -265,16 +279,18 @@ class AbsTaskRetrieval(AbsTask):
265
279
  trust_remote_code=trust_remote_code,
266
280
  split=split,
267
281
  config=hf_subset,
268
- ).load()
282
+ ).load(
283
+ num_proc=num_proc,
284
+ )
269
285
 
270
286
  if self.metadata.is_multilingual:
271
- for lang in self.metadata.eval_langs:
287
+ for lang in self.hf_subsets:
272
288
  for split in eval_splits:
273
289
  _process_data(split, lang)
274
290
  else:
275
291
  for split in eval_splits:
276
292
  _process_data(split)
277
- self.dataset_transform()
293
+ self.dataset_transform(num_proc=num_proc)
278
294
  self.data_loaded = True
279
295
 
280
296
  def evaluate(
@@ -283,10 +299,11 @@ class AbsTaskRetrieval(AbsTask):
283
299
  split: str = "test",
284
300
  subsets_to_run: list[HFSubset] | None = None,
285
301
  *,
286
- encode_kwargs: dict[str, Any],
302
+ encode_kwargs: EncodeKwargs,
287
303
  prediction_folder: Path | None = None,
288
- **kwargs,
289
- ) -> dict[HFSubset, ScoresDict]:
304
+ num_proc: int = 1,
305
+ **kwargs: Any,
306
+ ) -> Mapping[HFSubset, ScoresDict]:
290
307
  """Evaluate the model on the retrieval task.
291
308
 
292
309
  Args:
@@ -296,16 +313,16 @@ class AbsTaskRetrieval(AbsTask):
296
313
  subsets_to_run: Optional list of subsets to evaluate on
297
314
  encode_kwargs: Keyword arguments passed to the encoder
298
315
  prediction_folder: Folder to save model predictions
316
+ num_proc: Number of processes to use
299
317
  **kwargs: Additional keyword arguments passed to the evaluator
300
318
 
301
-
302
319
  Returns:
303
320
  Dictionary mapping subsets to their evaluation scores
304
321
  """
305
322
  if not self.data_loaded:
306
- self.load_data()
323
+ self.load_data(num_proc=num_proc)
307
324
  # TODO: convert all tasks directly https://github.com/embeddings-benchmark/mteb/issues/2030
308
- self.convert_v1_dataset_format_to_v2()
325
+ self.convert_v1_dataset_format_to_v2(num_proc=num_proc)
309
326
 
310
327
  return super().evaluate(
311
328
  model,
@@ -313,6 +330,7 @@ class AbsTaskRetrieval(AbsTask):
313
330
  subsets_to_run,
314
331
  encode_kwargs=encode_kwargs,
315
332
  prediction_folder=prediction_folder,
333
+ num_proc=num_proc,
316
334
  **kwargs,
317
335
  )
318
336
 
@@ -320,10 +338,11 @@ class AbsTaskRetrieval(AbsTask):
320
338
  self,
321
339
  model: MTEBModels,
322
340
  data_split: RetrievalSplitData,
323
- encode_kwargs: dict[str, Any],
341
+ encode_kwargs: EncodeKwargs,
324
342
  hf_split: str,
325
343
  hf_subset: str,
326
344
  prediction_folder: Path | None = None,
345
+ num_proc: int = 1,
327
346
  **kwargs,
328
347
  ) -> ScoresDict:
329
348
  """Evaluate a model on a specific subset of the data.
@@ -335,6 +354,7 @@ class AbsTaskRetrieval(AbsTask):
335
354
  hf_split: Split to evaluate on
336
355
  hf_subset: Subset to evaluate on
337
356
  prediction_folder: Folder with results prediction
357
+ num_proc: Number of processes to use
338
358
  **kwargs: Additional keyword arguments passed to the evaluator
339
359
 
340
360
  Returns:
@@ -357,6 +377,8 @@ class AbsTaskRetrieval(AbsTask):
357
377
  **kwargs,
358
378
  )
359
379
 
380
+ search_model: SearchProtocol
381
+
360
382
  if isinstance(model, EncoderProtocol) and not isinstance(model, SearchProtocol):
361
383
  search_model = SearchEncoderWrapper(model)
362
384
  elif isinstance(model, CrossEncoderProtocol):
@@ -372,6 +394,7 @@ class AbsTaskRetrieval(AbsTask):
372
394
  results = retriever(
373
395
  search_model,
374
396
  encode_kwargs=encode_kwargs,
397
+ num_proc=num_proc,
375
398
  )
376
399
  end_time = time()
377
400
  logger.debug(
@@ -446,9 +469,13 @@ class AbsTaskRetrieval(AbsTask):
446
469
  return {}
447
470
 
448
471
  def _calculate_descriptive_statistics_from_split(
449
- self, split: str, hf_subset: str | None = None, compute_overall: bool = False
472
+ self,
473
+ split: str,
474
+ hf_subset: str | None = None,
475
+ compute_overall: bool = False,
476
+ num_proc: int = 1,
450
477
  ) -> RetrievalDescriptiveStatistics:
451
- self.convert_v1_dataset_format_to_v2()
478
+ self.convert_v1_dataset_format_to_v2(num_proc)
452
479
  if hf_subset and hf_subset in self.dataset:
453
480
  split_data = self.dataset[hf_subset][split]
454
481
  queries = split_data["queries"]
@@ -553,8 +580,8 @@ class AbsTaskRetrieval(AbsTask):
553
580
  top_ranked_statistics=top_ranked_statistics,
554
581
  )
555
582
 
556
- def _push_dataset_to_hub(self, repo_name: str) -> None:
557
- self.convert_v1_dataset_format_to_v2()
583
+ def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
584
+ self.convert_v1_dataset_format_to_v2(num_proc)
558
585
 
559
586
  def _push_section(
560
587
  data: dict[str, RetrievalSplitData],
@@ -578,11 +605,12 @@ class AbsTaskRetrieval(AbsTask):
578
605
  if isinstance(data[split][subset_item], Dataset):
579
606
  sections[split] = data[split][subset_item]
580
607
  elif converter is not None:
608
+ subset_data = data[split][subset_item]
609
+ if subset_data is None:
610
+ continue
611
+
581
612
  sections[split] = Dataset.from_list(
582
- [
583
- converter(idx, item)
584
- for idx, item in data[split][subset_item].items()
585
- ]
613
+ [converter(idx, item) for idx, item in subset_data.items()]
586
614
  )
587
615
  else:
588
616
  raise ValueError(
@@ -593,6 +621,7 @@ class AbsTaskRetrieval(AbsTask):
593
621
  repo_name,
594
622
  hf_subset_name,
595
623
  commit_message=f"Add {hf_subset_name}-{subset_item}",
624
+ num_proc=num_proc,
596
625
  )
597
626
 
598
627
  for subset in self.dataset:
@@ -626,6 +655,7 @@ class AbsTaskRetrieval(AbsTask):
626
655
  repo_name,
627
656
  f"{subset}-qrels" if subset != "default" else "qrels",
628
657
  commit_message=f"Add {subset}-qrels",
658
+ num_proc=num_proc,
629
659
  )
630
660
 
631
661
  _push_section(
@@ -680,7 +710,7 @@ class AbsTaskRetrieval(AbsTask):
680
710
 
681
711
  top_k_sorted = defaultdict(list)
682
712
  for query_id, values in top_ranked.items():
683
- sorted_keys = sorted(values, key=values.get, reverse=True)
713
+ sorted_keys = sorted(values, key=lambda k: values[k], reverse=True)
684
714
  top_k_sorted[query_id] = sorted_keys[: self._top_k]
685
715
 
686
716
  self.dataset[subset][split]["top_ranked"] = top_k_sorted
@@ -688,10 +718,10 @@ class AbsTaskRetrieval(AbsTask):
688
718
 
689
719
 
690
720
  def _process_relevant_docs(
691
- collection: dict[str, dict[str, float]],
721
+ collection: Mapping[str, Mapping[str, int]],
692
722
  hf_subset: str,
693
723
  split: str,
694
- ) -> dict[str, dict[str, float]]:
724
+ ) -> dict[str, dict[str, int]]:
695
725
  """Collections can contain overlapping ids in different splits. Prepend split and subset to avoid this
696
726
 
697
727
  Returns: