mteb 2.7.3__py3-none-any.whl → 2.7.5__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (434)
  1. mteb/_create_dataloaders.py +47 -5
  2. mteb/_evaluators/any_sts_evaluator.py +2 -0
  3. mteb/_evaluators/clustering_evaluator.py +2 -0
  4. mteb/_evaluators/evaluator.py +2 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -1
  6. mteb/_evaluators/pair_classification_evaluator.py +3 -0
  7. mteb/_evaluators/retrieval_evaluator.py +3 -0
  8. mteb/_evaluators/sklearn_evaluator.py +6 -1
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +2 -0
  10. mteb/_evaluators/text/summarization_evaluator.py +2 -0
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -0
  12. mteb/abstasks/abstask.py +31 -12
  13. mteb/abstasks/classification.py +10 -3
  14. mteb/abstasks/clustering.py +6 -2
  15. mteb/abstasks/clustering_legacy.py +8 -2
  16. mteb/abstasks/image/image_text_pair_classification.py +6 -2
  17. mteb/abstasks/multilabel_classification.py +2 -0
  18. mteb/abstasks/pair_classification.py +8 -2
  19. mteb/abstasks/retrieval.py +27 -12
  20. mteb/abstasks/retrieval_dataset_loaders.py +29 -19
  21. mteb/abstasks/sts.py +10 -3
  22. mteb/abstasks/text/bitext_mining.py +9 -5
  23. mteb/abstasks/text/reranking.py +2 -2
  24. mteb/abstasks/text/summarization.py +2 -1
  25. mteb/abstasks/zeroshot_classification.py +8 -2
  26. mteb/benchmarks/benchmarks/__init__.py +2 -0
  27. mteb/benchmarks/benchmarks/benchmarks.py +41 -2
  28. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  29. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  30. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  31. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  32. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  33. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  34. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  35. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  36. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  37. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  38. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  39. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  40. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  41. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  42. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  43. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  44. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  45. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  46. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  47. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  48. mteb/evaluate.py +10 -2
  49. mteb/models/model_implementations/align_models.py +1 -0
  50. mteb/models/model_implementations/amazon_models.py +1 -0
  51. mteb/models/model_implementations/andersborges.py +2 -0
  52. mteb/models/model_implementations/ara_models.py +1 -0
  53. mteb/models/model_implementations/arctic_models.py +8 -0
  54. mteb/models/model_implementations/b1ade_models.py +1 -0
  55. mteb/models/model_implementations/bedrock_models.py +4 -0
  56. mteb/models/model_implementations/bge_models.py +40 -1
  57. mteb/models/model_implementations/bica_model.py +1 -0
  58. mteb/models/model_implementations/blip2_models.py +2 -0
  59. mteb/models/model_implementations/blip_models.py +8 -0
  60. mteb/models/model_implementations/bm25.py +10 -5
  61. mteb/models/model_implementations/bmretriever_models.py +4 -0
  62. mteb/models/model_implementations/cadet_models.py +1 -0
  63. mteb/models/model_implementations/cde_models.py +2 -0
  64. mteb/models/model_implementations/clip_models.py +3 -0
  65. mteb/models/model_implementations/clips_models.py +3 -0
  66. mteb/models/model_implementations/codefuse_models.py +5 -0
  67. mteb/models/model_implementations/codesage_models.py +3 -0
  68. mteb/models/model_implementations/cohere_models.py +4 -0
  69. mteb/models/model_implementations/cohere_v.py +5 -0
  70. mteb/models/model_implementations/colpali_models.py +3 -0
  71. mteb/models/model_implementations/colqwen_models.py +7 -0
  72. mteb/models/model_implementations/colsmol_models.py +2 -0
  73. mteb/models/model_implementations/conan_models.py +1 -0
  74. mteb/models/model_implementations/dino_models.py +19 -0
  75. mteb/models/model_implementations/e5_instruct.py +4 -0
  76. mteb/models/model_implementations/e5_models.py +9 -0
  77. mteb/models/model_implementations/e5_v.py +1 -0
  78. mteb/models/model_implementations/eagerworks_models.py +1 -0
  79. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  80. mteb/models/model_implementations/en_code_retriever.py +1 -0
  81. mteb/models/model_implementations/euler_models.py +1 -0
  82. mteb/models/model_implementations/evaclip_models.py +4 -0
  83. mteb/models/model_implementations/fa_models.py +9 -0
  84. mteb/models/model_implementations/facebookai.py +2 -0
  85. mteb/models/model_implementations/geogpt_models.py +1 -0
  86. mteb/models/model_implementations/gme_v_models.py +2 -0
  87. mteb/models/model_implementations/google_models.py +5 -0
  88. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -0
  89. mteb/models/model_implementations/gritlm_models.py +2 -0
  90. mteb/models/model_implementations/gte_models.py +9 -0
  91. mteb/models/model_implementations/hinvec_models.py +1 -0
  92. mteb/models/model_implementations/human.py +1 -0
  93. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  94. mteb/models/model_implementations/inf_models.py +2 -0
  95. mteb/models/model_implementations/jasper_models.py +2 -0
  96. mteb/models/model_implementations/jina_clip.py +1 -0
  97. mteb/models/model_implementations/jina_models.py +7 -0
  98. mteb/models/model_implementations/kalm_models.py +6 -0
  99. mteb/models/model_implementations/kblab.py +1 -0
  100. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  101. mteb/models/model_implementations/kfst.py +1 -0
  102. mteb/models/model_implementations/kowshik24_models.py +1 -0
  103. mteb/models/model_implementations/lens_models.py +2 -0
  104. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  105. mteb/models/model_implementations/linq_models.py +1 -0
  106. mteb/models/model_implementations/listconranker.py +1 -0
  107. mteb/models/model_implementations/llm2clip_models.py +3 -0
  108. mteb/models/model_implementations/llm2vec_models.py +8 -0
  109. mteb/models/model_implementations/mcinext_models.py +3 -0
  110. mteb/models/model_implementations/mdbr_models.py +2 -0
  111. mteb/models/model_implementations/misc_models.py +63 -0
  112. mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
  113. mteb/models/model_implementations/mme5_models.py +2 -1
  114. mteb/models/model_implementations/moco_models.py +2 -0
  115. mteb/models/model_implementations/mod_models.py +1 -0
  116. mteb/models/model_implementations/model2vec_models.py +13 -0
  117. mteb/models/model_implementations/moka_models.py +3 -0
  118. mteb/models/model_implementations/nbailab.py +3 -0
  119. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  120. mteb/models/model_implementations/nomic_models.py +6 -0
  121. mteb/models/model_implementations/nomic_models_vision.py +1 -0
  122. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -0
  123. mteb/models/model_implementations/nvidia_models.py +3 -0
  124. mteb/models/model_implementations/octen_models.py +2 -0
  125. mteb/models/model_implementations/openai_models.py +5 -0
  126. mteb/models/model_implementations/openclip_models.py +8 -0
  127. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  128. mteb/models/model_implementations/ops_moa_models.py +2 -0
  129. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
  130. mteb/models/model_implementations/pawan_models.py +1 -0
  131. mteb/models/model_implementations/piccolo_models.py +2 -0
  132. mteb/models/model_implementations/promptriever_models.py +4 -0
  133. mteb/models/model_implementations/pylate_models.py +13 -0
  134. mteb/models/model_implementations/qodo_models.py +2 -0
  135. mteb/models/model_implementations/qtack_models.py +1 -0
  136. mteb/models/model_implementations/qwen3_models.py +3 -0
  137. mteb/models/model_implementations/qzhou_models.py +2 -0
  138. mteb/models/model_implementations/rasgaard_models.py +1 -0
  139. mteb/models/model_implementations/reasonir_model.py +65 -0
  140. mteb/models/model_implementations/repllama_models.py +2 -0
  141. mteb/models/model_implementations/rerankers_custom.py +3 -0
  142. mteb/models/model_implementations/rerankers_monot5_based.py +14 -0
  143. mteb/models/model_implementations/richinfoai_models.py +1 -0
  144. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  145. mteb/models/model_implementations/ruri_models.py +10 -0
  146. mteb/models/model_implementations/salesforce_models.py +3 -0
  147. mteb/models/model_implementations/samilpwc_models.py +1 -0
  148. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  149. mteb/models/model_implementations/searchmap_models.py +1 -0
  150. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -0
  151. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +1 -0
  152. mteb/models/model_implementations/seed_models.py +1 -0
  153. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  154. mteb/models/model_implementations/shuu_model.py +1 -0
  155. mteb/models/model_implementations/siglip_models.py +10 -0
  156. mteb/models/model_implementations/sonar_models.py +2 -1
  157. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  158. mteb/models/model_implementations/stella_models.py +6 -0
  159. mteb/models/model_implementations/tarka_models.py +2 -0
  160. mteb/models/model_implementations/text2vec_models.py +3 -0
  161. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  162. mteb/models/model_implementations/uae_models.py +1 -0
  163. mteb/models/model_implementations/vdr_models.py +1 -0
  164. mteb/models/model_implementations/vi_vn_models.py +6 -0
  165. mteb/models/model_implementations/vista_models.py +2 -0
  166. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  167. mteb/models/model_implementations/voyage_models.py +15 -0
  168. mteb/models/model_implementations/voyage_v.py +1 -0
  169. mteb/models/model_implementations/xyz_models.py +1 -0
  170. mteb/models/model_implementations/youtu_models.py +1 -0
  171. mteb/models/model_implementations/yuan_models.py +1 -0
  172. mteb/models/model_implementations/yuan_models_en.py +1 -0
  173. mteb/models/model_meta.py +35 -2
  174. mteb/models/models_protocols.py +4 -0
  175. mteb/models/search_wrappers.py +12 -0
  176. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  177. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  178. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  179. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  180. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  181. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  182. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  183. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  184. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  185. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  186. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  187. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  188. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  189. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  190. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  191. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  192. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  193. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  194. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  195. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  196. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  197. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  198. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  199. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  200. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  201. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  202. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  203. mteb/tasks/classification/est/estonian_valence.py +1 -1
  204. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  205. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  206. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  207. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  208. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  209. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  210. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  211. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  212. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  213. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  214. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  215. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  216. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  217. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  218. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  219. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  220. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  221. mteb/tasks/classification/kor/klue_tc.py +2 -2
  222. mteb/tasks/classification/kor/kor_fin.py +1 -1
  223. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  224. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  225. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  226. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  227. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  228. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  229. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  230. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  231. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  232. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  233. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  234. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  235. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  236. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  237. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  238. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  239. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  240. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  241. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  242. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  243. mteb/tasks/classification/ron/moroco.py +1 -1
  244. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  245. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  246. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  247. mteb/tasks/classification/rus/headline_classification.py +2 -2
  248. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  249. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  250. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  251. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  252. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  253. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  254. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  255. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  256. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  257. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  258. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  259. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  260. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  261. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  262. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  263. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  264. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  265. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  266. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  267. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  268. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  269. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  270. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  271. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  272. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  273. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  274. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  275. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  276. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  277. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  278. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  279. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  280. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  281. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  282. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  283. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  284. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  285. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  286. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  287. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  288. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  289. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  290. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  291. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  292. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  293. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  294. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  295. mteb/tasks/clustering/nob/snl_clustering.py +1 -1
  296. mteb/tasks/clustering/nob/vg_clustering.py +1 -1
  297. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  298. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  299. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  300. mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
  301. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  302. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  303. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  304. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  305. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  306. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  307. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  308. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  309. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  310. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  311. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  312. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  313. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  314. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
  315. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  316. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  317. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  318. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  319. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  320. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  321. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  322. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  323. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  324. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  325. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  326. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  327. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  328. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  329. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  330. mteb/tasks/pair_classification/rus/terra.py +2 -2
  331. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  332. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  333. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  334. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  335. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  336. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  337. mteb/tasks/retrieval/code/code_rag.py +4 -4
  338. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  339. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  340. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  341. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  342. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  343. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  344. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  345. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  346. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
  347. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
  348. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  349. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  350. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  351. mteb/tasks/retrieval/eng/__init__.py +42 -0
  352. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  353. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  354. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  355. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  356. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  357. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  358. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  359. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  360. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  361. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  362. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  363. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  364. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  365. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  366. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  367. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  368. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  369. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  370. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  371. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  372. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  373. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  374. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  375. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  376. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  377. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  378. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  379. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  380. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  381. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  382. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  383. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  384. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  385. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  386. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  387. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  388. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  389. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  390. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  391. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  392. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  393. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  394. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  395. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  396. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  397. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  398. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  399. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
  400. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  401. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  402. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  403. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  404. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  405. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  406. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  407. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  408. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  409. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  410. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  411. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  412. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  413. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  414. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  415. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  416. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  417. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  418. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  419. mteb/tasks/retrieval/nob/norquad.py +1 -1
  420. mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
  421. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  422. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  423. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  424. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  425. mteb/tasks/sts/kor/klue_sts.py +1 -1
  426. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  427. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  428. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  429. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/METADATA +1 -1
  430. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/RECORD +434 -413
  431. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/WHEEL +0 -0
  432. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/entry_points.txt +0 -0
  433. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/licenses/LICENSE +0 -0
  434. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/top_level.txt +0 -0
@@ -38,7 +38,7 @@ class SiswatiNewsClassification(AbsTaskClassification):
38
38
  superseded_by="SiswatiNewsClassification.v2",
39
39
  )
40
40
 
41
- def dataset_transform(self):
41
+ def dataset_transform(self, num_proc: int = 1):
42
42
  self.dataset = self.dataset.rename_columns({"title": "text"})
43
43
 
44
44
 
@@ -35,7 +35,7 @@ class TamilNewsClassification(AbsTaskClassification):
35
35
  superseded_by="TamilNewsClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self):
38
+ def dataset_transform(self, num_proc: int = 1):
39
39
  self.dataset = self.dataset.rename_columns(
40
40
  {"NewsInTamil": "text", "Category": "label"}
41
41
  )
@@ -75,5 +75,5 @@ class TamilNewsClassificationV2(AbsTaskClassification):
75
75
  adapted_from=["TamilNewsClassification"],
76
76
  )
77
77
 
78
- def dataset_transform(self):
78
+ def dataset_transform(self, num_proc: int = 1):
79
79
  self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
@@ -28,7 +28,7 @@ class TeluguAndhraJyotiNewsClassification(AbsTaskClassification):
28
28
  superseded_by="TeluguAndhraJyotiNewsClassification.v2",
29
29
  )
30
30
 
31
- def dataset_transform(self):
31
+ def dataset_transform(self, num_proc: int = 1):
32
32
  self.dataset = self.dataset.rename_columns({"body": "text", "topic": "label"})
33
33
  self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
34
34
 
@@ -59,5 +59,5 @@ class TeluguAndhraJyotiNewsClassificationV2(AbsTaskClassification):
59
59
  adapted_from=["TeluguAndhraJyotiNewsClassification"],
60
60
  )
61
61
 
62
- def dataset_transform(self):
62
+ def dataset_transform(self, num_proc: int = 1):
63
63
  self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
@@ -38,7 +38,7 @@ class WongnaiReviewsClassification(AbsTaskClassification):
38
38
  """,
39
39
  )
40
40
 
41
- def dataset_transform(self):
41
+ def dataset_transform(self, num_proc: int = 1):
42
42
  self.dataset = self.dataset.rename_columns(
43
43
  {"review_body": "text", "star_rating": "label"}
44
44
  )
@@ -36,7 +36,7 @@ class TurkishMovieSentimentClassification(AbsTaskClassification):
36
36
  superseded_by="TurkishMovieSentimentClassification.v2",
37
37
  )
38
38
 
39
- def dataset_transform(self):
39
+ def dataset_transform(self, num_proc: int = 1):
40
40
  self.dataset = self.stratified_subsampling(
41
41
  self.dataset, seed=self.seed, splits=["test"]
42
42
  )
@@ -76,7 +76,7 @@ class TurkishMovieSentimentClassificationV2(AbsTaskClassification):
76
76
  adapted_from=["TurkishMovieSentimentClassification"],
77
77
  )
78
78
 
79
- def dataset_transform(self):
79
+ def dataset_transform(self, num_proc: int = 1):
80
80
  self.dataset = self.stratified_subsampling(
81
81
  self.dataset, seed=self.seed, splits=["test"]
82
82
  )
@@ -39,7 +39,7 @@ Tetreault, Joel},
39
39
  superseded_by="UkrFormalityClassification.v2",
40
40
  )
41
41
 
42
- def dataset_transform(self):
42
+ def dataset_transform(self, num_proc: int = 1):
43
43
  self.dataset = self.dataset.rename_column("labels", "label")
44
44
  self.dataset = self.dataset.class_encode_column("label")
45
45
  self.dataset = self.stratified_subsampling(
@@ -84,7 +84,7 @@ Tetreault, Joel},
84
84
  adapted_from=["UkrFormalityClassification"],
85
85
  )
86
86
 
87
- def dataset_transform(self):
87
+ def dataset_transform(self, num_proc: int = 1):
88
88
  self.dataset = self.stratified_subsampling(
89
89
  self.dataset, seed=self.seed, splits=["train", "test"]
90
90
  )
@@ -39,7 +39,7 @@ class ToxicConversationsVNClassification(AbsTaskClassification):
39
39
  adapted_from=["ToxicConversationsClassification"],
40
40
  )
41
41
 
42
- def dataset_transform(self):
42
+ def dataset_transform(self, num_proc: int = 1):
43
43
  self.dataset = self.stratified_subsampling(
44
44
  self.dataset, seed=self.seed, splits=["test"]
45
45
  )
@@ -79,7 +79,7 @@ class VieStudentFeedbackClassificationV2(AbsTaskClassification):
79
79
  adapted_from=["VieStudentFeedbackClassification"],
80
80
  )
81
81
 
82
- def dataset_transform(self):
82
+ def dataset_transform(self, num_proc: int = 1):
83
83
  self.dataset = self.stratified_subsampling(
84
84
  self.dataset, seed=self.seed, splits=["test"]
85
85
  )
@@ -39,7 +39,7 @@ class YueOpenriceReviewClassification(AbsTaskClassification):
39
39
 
40
40
  samples_per_label = 32
41
41
 
42
- def dataset_transform(self):
42
+ def dataset_transform(self, num_proc: int = 1):
43
43
  self.dataset = self.stratified_subsampling(
44
44
  self.dataset, seed=self.seed, splits=["test"]
45
45
  )
@@ -82,7 +82,7 @@ class YueOpenriceReviewClassificationV2(AbsTaskClassification):
82
82
 
83
83
  samples_per_label = 32
84
84
 
85
- def dataset_transform(self):
85
+ def dataset_transform(self, num_proc: int = 1):
86
86
  self.dataset = self.stratified_subsampling(
87
87
  self.dataset, seed=self.seed, splits=["test"]
88
88
  )
@@ -38,7 +38,7 @@ class IsiZuluNewsClassification(AbsTaskClassification):
38
38
  superseded_by="IsiZuluNewsClassification.v2",
39
39
  )
40
40
 
41
- def dataset_transform(self):
41
+ def dataset_transform(self, num_proc: int = 1):
42
42
  self.dataset = self.dataset.rename_columns({"title": "text"})
43
43
 
44
44
 
@@ -82,7 +82,7 @@ class BlurbsClusteringP2PFast(AbsTaskClustering):
82
82
  adapted_from=["BlurbsClusteringP2P"],
83
83
  )
84
84
 
85
- def dataset_transform(self):
85
+ def dataset_transform(self, num_proc: int = 1):
86
86
  self.dataset = _convert_to_fast(
87
87
  self.dataset, self.input_column_name, self.label_column_name, self.seed
88
88
  )
@@ -91,7 +91,7 @@ class BlurbsClusteringS2SFast(AbsTaskClustering):
91
91
  adapted_from=["BlurbsClusteringS2S"],
92
92
  )
93
93
 
94
- def dataset_transform(self):
94
+ def dataset_transform(self, num_proc: int = 1):
95
95
  ds = {}
96
96
  for split in self.metadata.eval_splits:
97
97
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -82,7 +82,7 @@ class ArxivClusteringP2PFast(AbsTaskClusteringLegacy):
82
82
  # simply downsample each cluster.
83
83
  )
84
84
 
85
- def dataset_transform(self):
85
+ def dataset_transform(self, num_proc: int = 1):
86
86
  rng_state = random.Random(self.seed)
87
87
 
88
88
  ds = {}
@@ -38,7 +38,7 @@ class ArXivHierarchicalClusteringP2P(AbsTaskClustering):
38
38
  bibtex_citation="",
39
39
  )
40
40
 
41
- def dataset_transform(self):
41
+ def dataset_transform(self, num_proc: int = 1):
42
42
  ds = {}
43
43
  for split in self.metadata.eval_splits:
44
44
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -78,7 +78,7 @@ class ArXivHierarchicalClusteringS2S(AbsTaskClustering):
78
78
  bibtex_citation="",
79
79
  )
80
80
 
81
- def dataset_transform(self):
81
+ def dataset_transform(self, num_proc: int = 1):
82
82
  ds = {}
83
83
  for split in self.metadata.eval_splits:
84
84
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -104,7 +104,7 @@ Summarization},
104
104
  adapted_from=["BigPatentClustering"],
105
105
  )
106
106
 
107
- def dataset_transform(self):
107
+ def dataset_transform(self, num_proc: int = 1):
108
108
  for split in self.metadata.eval_splits:
109
109
  _check_label_distribution(self.dataset[split])
110
110
  self.dataset = self.stratified_subsampling(
@@ -33,7 +33,7 @@ class BiorxivClusteringP2PFast(AbsTaskClustering):
33
33
  adapted_from=["BiorxivClusteringP2P"],
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  for split in self.metadata.eval_splits:
38
38
  _check_label_distribution(self.dataset[split])
39
39
 
@@ -33,7 +33,7 @@ class BiorxivClusteringS2SFast(AbsTaskClustering):
33
33
  adapted_from=["BiorxivClusteringS2S"],
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  for split in self.metadata.eval_splits:
38
38
  _check_label_distribution(self.dataset[split])
39
39
 
@@ -37,7 +37,7 @@ class MedrxivClusteringP2PFast(AbsTaskClustering):
37
37
  adapted_from=["MedrxivClusteringP2P"],
38
38
  )
39
39
 
40
- def dataset_transform(self):
40
+ def dataset_transform(self, num_proc: int = 1):
41
41
  ds = {}
42
42
  for split in self.metadata.eval_splits:
43
43
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -37,7 +37,7 @@ class MedrxivClusteringS2SFast(AbsTaskClustering):
37
37
  adapted_from=["MedrxivClusteringS2S"],
38
38
  )
39
39
 
40
- def dataset_transform(self):
40
+ def dataset_transform(self, num_proc: int = 1):
41
41
  ds = {}
42
42
  for split in self.metadata.eval_splits:
43
43
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -51,7 +51,7 @@ Iryna Gurevych},
51
51
  adapted_from=["RedditClustering"],
52
52
  )
53
53
 
54
- def dataset_transform(self):
54
+ def dataset_transform(self, num_proc: int = 1):
55
55
  ds = {}
56
56
  for split in self.metadata.eval_splits:
57
57
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -94,7 +94,7 @@ Iryna Gurevych},
94
94
  adapted_from=["RedditClusteringP2P"],
95
95
  )
96
96
 
97
- def dataset_transform(self):
97
+ def dataset_transform(self, num_proc: int = 1):
98
98
  ds = {}
99
99
  for split in self.metadata.eval_splits:
100
100
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -51,7 +51,7 @@ Iryna Gurevych},
51
51
  adapted_from=["StackExchangeClustering"],
52
52
  )
53
53
 
54
- def dataset_transform(self):
54
+ def dataset_transform(self, num_proc: int = 1):
55
55
  ds = {}
56
56
  for split in self.metadata.eval_splits:
57
57
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -52,7 +52,7 @@ Iryna Gurevych},
52
52
  adapted_from=["StackExchangeClusteringP2P"],
53
53
  )
54
54
 
55
- def dataset_transform(self):
55
+ def dataset_transform(self, num_proc: int = 1):
56
56
  ds = {}
57
57
  for split in self.metadata.eval_splits:
58
58
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -93,7 +93,7 @@ class TwentyNewsgroupsClusteringFast(AbsTaskClustering):
93
93
  adapted_from=["TwentyNewsgroupsClustering"],
94
94
  )
95
95
 
96
- def dataset_transform(self):
96
+ def dataset_transform(self, num_proc: int = 1):
97
97
  ds = {}
98
98
  for split in self.metadata.eval_splits:
99
99
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -33,7 +33,7 @@ class BeytooteClustering(AbsTaskClustering):
33
33
  bibtex_citation=""" """,
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  self.dataset = self.stratified_subsampling(
38
38
  self.dataset,
39
39
  seed=self.seed,
@@ -93,7 +93,7 @@ class HamshahriClustring(AbsTaskClustering):
93
93
  bibtex_citation=""" """,
94
94
  )
95
95
 
96
- def dataset_transform(self):
96
+ def dataset_transform(self, num_proc: int = 1):
97
97
  self.dataset = self.dataset.map(
98
98
  lambda x: {"sentences": f"{x['title']}\n: {x['summary']}"}
99
99
  )
@@ -151,7 +151,7 @@ class NLPTwitterAnalysisClustering(AbsTaskClustering):
151
151
  bibtex_citation=""" """,
152
152
  )
153
153
 
154
- def dataset_transform(self):
154
+ def dataset_transform(self, num_proc: int = 1):
155
155
  self.dataset = self.dataset.rename_column("tweet", "sentences")
156
156
  self.dataset = self.dataset.rename_column("label", "labels")
157
157
  self.dataset = self.stratified_subsampling(
@@ -187,7 +187,7 @@ class SIDClustring(AbsTaskClustering):
187
187
  bibtex_citation=""" """,
188
188
  )
189
189
 
190
- def dataset_transform(self):
190
+ def dataset_transform(self, num_proc: int = 1):
191
191
  self.dataset = self.stratified_subsampling(
192
192
  self.dataset,
193
193
  seed=self.seed,
@@ -48,7 +48,7 @@ class HALClusteringS2S(AbsTaskClusteringLegacy):
48
48
  superseded_by="HALClusteringS2S.v2",
49
49
  )
50
50
 
51
- def dataset_transform(self):
51
+ def dataset_transform(self, num_proc: int = 1):
52
52
  """Convert to standard format"""
53
53
  self.dataset = self.dataset.remove_columns("hal_id")
54
54
  titles = self.dataset["test"]["title"]
@@ -98,7 +98,7 @@ class HALClusteringS2SFast(AbsTaskClustering):
98
98
  adapted_from=["HALClusteringS2S"],
99
99
  )
100
100
 
101
- def dataset_transform(self):
101
+ def dataset_transform(self, num_proc: int = 1):
102
102
  """Convert to standard format"""
103
103
  self.dataset["test"] = self.dataset["test"].remove_columns("hal_id")
104
104
  self.dataset["test"] = self.dataset["test"].rename_columns(
@@ -51,7 +51,7 @@ class MLSUMClusteringP2P(AbsTaskClusteringLegacy):
51
51
  superseded_by="MLSUMClusteringP2P.v2",
52
52
  )
53
53
 
54
- def load_data(self) -> None:
54
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
55
55
  """Load dataset from HuggingFace hub and convert it to the standard format."""
56
56
  if self.data_loaded:
57
57
  return
@@ -124,7 +124,7 @@ class MLSUMClusteringP2PFast(AbsTaskClustering):
124
124
  adapted_from=["MLSUMClusteringP2P"],
125
125
  )
126
126
 
127
- def load_data(self) -> None:
127
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
128
128
  """Load dataset from HuggingFace hub and convert it to the standard format."""
129
129
  if self.data_loaded:
130
130
  return
@@ -51,7 +51,7 @@ class MLSUMClusteringS2S(AbsTaskClusteringLegacy):
51
51
  superseded_by="MLSUMClusteringS2S.v2",
52
52
  )
53
53
 
54
- def load_data(self) -> None:
54
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
55
55
  """Load dataset from HuggingFace hub and convert it to the standard format."""
56
56
  if self.data_loaded:
57
57
  return
@@ -119,7 +119,7 @@ class MLSUMClusteringS2SFast(AbsTaskClustering):
119
119
  adapted_from=["MLSUMClusteringS2S"],
120
120
  )
121
121
 
122
- def load_data(self) -> None:
122
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
123
123
  """Load dataset from HuggingFace hub and convert it to the standard format."""
124
124
  if self.data_loaded:
125
125
  return
@@ -239,7 +239,7 @@ class SIB200ClusteringFast(AbsTaskClustering):
239
239
  """, # combined train, validation, and test into test.
240
240
  )
241
241
 
242
- def dataset_transform(self):
242
+ def dataset_transform(self, num_proc: int = 1):
243
243
  ds = {}
244
244
  for lang in self.hf_subsets:
245
245
  labels = []
@@ -81,7 +81,7 @@ class WikiClusteringFastP2P(AbsTaskClustering):
81
81
  adapted_from=["WikiClusteringP2P"],
82
82
  )
83
83
 
84
- def dataset_transform(self):
84
+ def dataset_transform(self, num_proc: int = 1):
85
85
  ds = {}
86
86
  for lang in self.hf_subsets:
87
87
  labels = []
@@ -33,7 +33,7 @@ class DutchNewsArticlesClusteringP2P(AbsTaskClustering):
33
33
  },
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  for split in self.dataset:
38
38
  self.dataset[split] = self.dataset[split].rename_columns(
39
39
  {"label": "labels", "text": "sentences"}
@@ -33,7 +33,7 @@ class DutchNewsArticlesClusteringS2S(AbsTaskClustering):
33
33
  },
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  for split in self.dataset:
38
38
  self.dataset[split] = self.dataset[split].rename_columns(
39
39
  {"label": "labels", "title": "sentences"}
@@ -43,7 +43,7 @@ class IconclassClusteringS2S(AbsTaskClustering):
43
43
  },
44
44
  )
45
45
 
46
- def dataset_transform(self):
46
+ def dataset_transform(self, num_proc: int = 1):
47
47
  for split in self.dataset:
48
48
  self.dataset[split] = self.dataset[split].map(
49
49
  lambda ex: {"labels": ex["label"], "sentences": ex["text"]}
@@ -43,7 +43,7 @@ class OpenTenderClusteringP2P(AbsTaskClustering):
43
43
  },
44
44
  )
45
45
 
46
- def dataset_transform(self):
46
+ def dataset_transform(self, num_proc: int = 1):
47
47
  # reuse the dataset for classification
48
48
  for split in self.dataset:
49
49
  self.dataset[split] = self.dataset[split].map(
@@ -44,7 +44,7 @@ class VABBClusteringP2P(AbsTaskClustering):
44
44
  },
45
45
  )
46
46
 
47
- def dataset_transform(self):
47
+ def dataset_transform(self, num_proc: int = 1):
48
48
  for split in self.dataset:
49
49
  self.dataset[split] = self.dataset[split].map(
50
50
  lambda ex: {
@@ -44,7 +44,7 @@ class VABBClusteringS2S(AbsTaskClustering):
44
44
  },
45
45
  )
46
46
 
47
- def dataset_transform(self):
47
+ def dataset_transform(self, num_proc: int = 1):
48
48
  for split in self.dataset:
49
49
  self.dataset[split] = self.dataset[split].rename_columns(
50
50
  {"title": "sentences"}
@@ -58,7 +58,7 @@ class SNLClustering(AbsTaskClusteringLegacy):
58
58
  superseded_by="SNLHierarchicalClusteringP2P",
59
59
  )
60
60
 
61
- def dataset_transform(self):
61
+ def dataset_transform(self, num_proc: int = 1):
62
62
  splits = self.metadata.eval_splits
63
63
 
64
64
  documents: list = []
@@ -58,7 +58,7 @@ class VGClustering(AbsTaskClusteringLegacy):
58
58
  superseded_by="VGHierarchicalClusteringP2P",
59
59
  )
60
60
 
61
- def dataset_transform(self):
61
+ def dataset_transform(self, num_proc: int = 1):
62
62
  splits = self.metadata.eval_splits
63
63
 
64
64
  documents: list = []
@@ -131,7 +131,7 @@ Piperidis, Stelios},
131
131
  adapted_from=["EightTagsClustering"],
132
132
  )
133
133
 
134
- def dataset_transform(self):
134
+ def dataset_transform(self, num_proc: int = 1):
135
135
  ds = {}
136
136
  for split in self.metadata.eval_splits:
137
137
  labels = list(chain.from_iterable(self.dataset[split]["labels"]))
@@ -204,7 +204,7 @@ class PlscClusteringS2SFast(AbsTaskClustering):
204
204
  adapted_from=["PlscClusteringS2S"],
205
205
  )
206
206
 
207
- def dataset_transform(self):
207
+ def dataset_transform(self, num_proc: int = 1):
208
208
  ds = {}
209
209
  for split in self.metadata.eval_splits:
210
210
  labels = self.dataset[split]["labels"]
@@ -286,7 +286,7 @@ class PlscClusteringP2PFast(AbsTaskClustering):
286
286
  adapted_from=["PlscClusteringP2P"],
287
287
  )
288
288
 
289
- def dataset_transform(self):
289
+ def dataset_transform(self, num_proc: int = 1):
290
290
  ds = {}
291
291
  for split in self.metadata.eval_splits:
292
292
  labels = self.dataset[split]["labels"]
@@ -32,7 +32,7 @@ class RuSciBenchGRNTIClusteringP2P(AbsTaskClustering):
32
32
  prompt="Identify the category of scientific papers based on the titles and abstracts",
33
33
  )
34
34
 
35
- def dataset_transform(self):
35
+ def dataset_transform(self, num_proc: int = 1):
36
36
  self.dataset = self.dataset.rename_columns(
37
37
  {"label": "labels", "text": "sentences"}
38
38
  )
@@ -32,7 +32,7 @@ class RuSciBenchOECDClusteringP2P(AbsTaskClustering):
32
32
  prompt="Identify the category of scientific papers based on the titles and abstracts",
33
33
  )
34
34
 
35
- def dataset_transform(self):
35
+ def dataset_transform(self, num_proc: int = 1):
36
36
  self.dataset = self.dataset.rename_columns(
37
37
  {"label": "labels", "text": "sentences"}
38
38
  )
@@ -51,7 +51,7 @@ class CLSClusteringFastS2S(AbsTaskClustering):
51
51
  adapted_from=["CLSClusteringS2S"],
52
52
  )
53
53
 
54
- def dataset_transform(self):
54
+ def dataset_transform(self, num_proc: int = 1):
55
55
  ds = {}
56
56
  for split in self.metadata.eval_splits:
57
57
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -110,7 +110,7 @@ class CLSClusteringFastP2P(AbsTaskClustering):
110
110
  adapted_from=["CLSClusteringP2P"],
111
111
  )
112
112
 
113
- def dataset_transform(self):
113
+ def dataset_transform(self, num_proc: int = 1):
114
114
  ds = {}
115
115
  for split in self.metadata.eval_splits:
116
116
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -239,7 +239,7 @@ class ThuNewsClusteringFastS2S(AbsTaskClustering):
239
239
  adapted_from=["ThuNewsClusteringS2S"],
240
240
  )
241
241
 
242
- def dataset_transform(self):
242
+ def dataset_transform(self, num_proc: int = 1):
243
243
  ds = {}
244
244
  for split in self.metadata.eval_splits:
245
245
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -298,7 +298,7 @@ class ThuNewsClusteringFastP2P(AbsTaskClustering):
298
298
  adapted_from=["ThuNewsClusteringP2P"],
299
299
  )
300
300
 
301
- def dataset_transform(self):
301
+ def dataset_transform(self, num_proc: int = 1):
302
302
  ds = {}
303
303
  for split in self.metadata.eval_splits:
304
304
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -53,7 +53,7 @@ class ImageCoDe(AbsTaskImageTextPairClassification):
53
53
  """,
54
54
  )
55
55
 
56
- def load_data(self) -> None:
56
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
57
57
  if self.data_loaded:
58
58
  return
59
59
 
@@ -45,7 +45,7 @@ class SugarCrepe(AbsTaskImageTextPairClassification):
45
45
  """,
46
46
  )
47
47
 
48
- def load_data(self) -> None:
48
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
49
49
  """Load dataset from HuggingFace hub"""
50
50
  if self.data_loaded:
51
51
  return
@@ -175,7 +175,7 @@ class mFollowIRCrossLingual(AbsTaskRetrieval): # noqa: N801
175
175
  """,
176
176
  )
177
177
 
178
- def load_data(self) -> None:
178
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
179
179
  if self.data_loaded:
180
180
  return
181
181
 
@@ -243,7 +243,7 @@ class mFollowIR(AbsTaskRetrieval): # noqa: N801
243
243
  """,
244
244
  )
245
245
 
246
- def load_data(self) -> None:
246
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
247
247
  if self.data_loaded:
248
248
  return
249
249
 
@@ -123,7 +123,7 @@ class CVBenchCount(AbsTaskRetrieval):
123
123
  """,
124
124
  )
125
125
 
126
- def load_data(self) -> None:
126
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
127
127
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
128
128
  path=self.metadata.dataset["path"],
129
129
  splits=self.metadata.eval_splits,
@@ -165,7 +165,7 @@ class CVBenchRelation(AbsTaskRetrieval):
165
165
  """,
166
166
  )
167
167
 
168
- def load_data(self) -> None:
168
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
169
169
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
170
170
  path=self.metadata.dataset["path"],
171
171
  splits=self.metadata.eval_splits,
@@ -207,7 +207,7 @@ class CVBenchDepth(AbsTaskRetrieval):
207
207
  """,
208
208
  )
209
209
 
210
- def load_data(self) -> None:
210
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
211
211
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
212
212
  path=self.metadata.dataset["path"],
213
213
  splits=self.metadata.eval_splits,
@@ -249,7 +249,7 @@ class CVBenchDistance(AbsTaskRetrieval):
249
249
  """,
250
250
  )
251
251
 
252
- def load_data(self) -> None:
252
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
253
253
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
254
254
  path=self.metadata.dataset["path"],
255
255
  splits=self.metadata.eval_splits,