mteb 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (287) hide show
  1. mteb/_create_dataloaders.py +47 -5
  2. mteb/_evaluators/any_sts_evaluator.py +2 -0
  3. mteb/_evaluators/clustering_evaluator.py +2 -0
  4. mteb/_evaluators/evaluator.py +2 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -1
  6. mteb/_evaluators/pair_classification_evaluator.py +3 -0
  7. mteb/_evaluators/retrieval_evaluator.py +3 -0
  8. mteb/_evaluators/sklearn_evaluator.py +6 -1
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +2 -0
  10. mteb/_evaluators/text/summarization_evaluator.py +2 -0
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -0
  12. mteb/abstasks/abstask.py +31 -12
  13. mteb/abstasks/classification.py +10 -3
  14. mteb/abstasks/clustering.py +6 -2
  15. mteb/abstasks/clustering_legacy.py +8 -2
  16. mteb/abstasks/image/image_text_pair_classification.py +6 -2
  17. mteb/abstasks/multilabel_classification.py +2 -0
  18. mteb/abstasks/pair_classification.py +8 -2
  19. mteb/abstasks/retrieval.py +26 -11
  20. mteb/abstasks/retrieval_dataset_loaders.py +29 -19
  21. mteb/abstasks/sts.py +10 -3
  22. mteb/abstasks/text/bitext_mining.py +9 -5
  23. mteb/abstasks/text/reranking.py +2 -2
  24. mteb/abstasks/text/summarization.py +2 -1
  25. mteb/abstasks/zeroshot_classification.py +8 -2
  26. mteb/evaluate.py +10 -2
  27. mteb/models/model_implementations/bm25.py +2 -0
  28. mteb/models/model_implementations/pylate_models.py +10 -0
  29. mteb/models/models_protocols.py +4 -0
  30. mteb/models/search_wrappers.py +12 -0
  31. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  32. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  33. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  34. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  35. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  36. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  37. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  38. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  39. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  40. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  41. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  42. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  43. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  44. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  45. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  46. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  47. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  48. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  49. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  50. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  51. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  52. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  53. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  54. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  55. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  56. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  57. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  58. mteb/tasks/classification/est/estonian_valence.py +1 -1
  59. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  60. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  61. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  62. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  63. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  64. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  65. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  66. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  67. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  68. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  69. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  70. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  71. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  72. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  73. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  74. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  75. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  76. mteb/tasks/classification/kor/klue_tc.py +2 -2
  77. mteb/tasks/classification/kor/kor_fin.py +1 -1
  78. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  79. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  80. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  81. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  82. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  83. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  84. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  85. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  86. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  87. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  88. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  89. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  90. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  91. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  92. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  93. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  94. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  95. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  96. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  97. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  98. mteb/tasks/classification/ron/moroco.py +1 -1
  99. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  100. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  101. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  102. mteb/tasks/classification/rus/headline_classification.py +2 -2
  103. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  104. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  105. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  106. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  107. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  108. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  109. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  110. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  111. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  112. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  113. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  114. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  115. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  116. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  117. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  118. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  119. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  120. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  121. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  122. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  123. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  124. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  125. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  126. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  127. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  128. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  129. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  130. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  131. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  132. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  133. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  134. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  135. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  136. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  137. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  138. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  139. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  140. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  141. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  142. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  143. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  144. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  145. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  146. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  147. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  148. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  149. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  150. mteb/tasks/clustering/nob/snl_clustering.py +1 -1
  151. mteb/tasks/clustering/nob/vg_clustering.py +1 -1
  152. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  153. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  154. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  155. mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
  156. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  157. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  158. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  159. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  160. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  161. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  162. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  163. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  164. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  165. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  166. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  167. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  168. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  169. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
  170. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  171. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  172. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  173. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  174. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  175. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  176. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  177. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  178. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  179. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  180. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  181. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  182. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  183. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  184. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  185. mteb/tasks/pair_classification/rus/terra.py +2 -2
  186. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  187. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  188. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  189. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  190. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  191. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  192. mteb/tasks/retrieval/code/code_rag.py +4 -4
  193. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  194. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  195. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  196. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  197. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  198. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  199. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  200. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  201. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
  202. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
  203. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  204. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  205. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  206. mteb/tasks/retrieval/eng/bright_retrieval.py +1 -1
  207. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  208. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  209. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  210. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  211. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  212. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  213. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  214. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  215. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  216. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  217. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  218. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  219. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  220. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  221. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  222. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  223. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  224. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  225. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  226. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  227. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  228. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  229. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  230. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  231. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  232. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  233. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  234. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  235. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  236. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  237. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  238. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  239. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  240. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  241. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  242. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  243. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  244. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  245. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  246. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  247. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  248. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  249. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  250. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  251. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  252. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
  253. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  254. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  255. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  256. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  257. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  258. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  259. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  260. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  261. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  262. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  263. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  264. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  265. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  266. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  267. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  268. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  269. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  270. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  271. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  272. mteb/tasks/retrieval/nob/norquad.py +1 -1
  273. mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
  274. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  275. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  276. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  277. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  278. mteb/tasks/sts/kor/klue_sts.py +1 -1
  279. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  280. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  281. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  282. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/METADATA +1 -1
  283. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/RECORD +287 -287
  284. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/WHEEL +0 -0
  285. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/entry_points.txt +0 -0
  286. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/licenses/LICENSE +0 -0
  287. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/top_level.txt +0 -0
@@ -29,7 +29,7 @@ class RuSciBenchGRNTIClassification(AbsTaskClassification):
29
29
  superseded_by="RuSciBenchGRNTIClassification.v2",
30
30
  )
31
31
 
32
- def dataset_transform(self):
32
+ def dataset_transform(self, num_proc: int = 1):
33
33
  self.dataset = self.stratified_subsampling(
34
34
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
35
35
  )
@@ -29,7 +29,7 @@ class RuSciBenchOECDClassification(AbsTaskClassification):
29
29
  superseded_by="RuSciBenchOECDClassification.v2",
30
30
  )
31
31
 
32
- def dataset_transform(self):
32
+ def dataset_transform(self, num_proc: int = 1):
33
33
  self.dataset = self.stratified_subsampling(
34
34
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
35
35
  )
@@ -28,7 +28,7 @@ class RuToxicOKMLCUPClassification(AbsTaskClassification):
28
28
  superseded_by="RuToxicOKMLCUPClassification.v2",
29
29
  )
30
30
 
31
- def dataset_transform(self):
31
+ def dataset_transform(self, num_proc: int = 1):
32
32
  self.dataset = self.dataset.rename_column("toxic", "label")
33
33
 
34
34
 
@@ -46,5 +46,5 @@ Tan, Liling},
46
46
  """,
47
47
  )
48
48
 
49
- def dataset_transform(self):
49
+ def dataset_transform(self, num_proc: int = 1):
50
50
  self.dataset = self.dataset.rename_columns({"Sloka": "text", "Class": "label"})
@@ -42,7 +42,7 @@ class SinhalaNewsClassification(AbsTaskClassification):
42
42
  superseded_by="SinhalaNewsClassification.v2",
43
43
  )
44
44
 
45
- def dataset_transform(self):
45
+ def dataset_transform(self, num_proc: int = 1):
46
46
  self.dataset = self.dataset.rename_columns(
47
47
  {"comments": "text", "labels": "label"}
48
48
  )
@@ -91,7 +91,7 @@ class SinhalaNewsClassificationV2(AbsTaskClassification):
91
91
  adapted_from=["SinhalaNewsClassification"],
92
92
  )
93
93
 
94
- def dataset_transform(self):
94
+ def dataset_transform(self, num_proc: int = 1):
95
95
  self.dataset = self.stratified_subsampling(
96
96
  self.dataset, seed=self.seed, splits=["train"]
97
97
  )
@@ -35,7 +35,7 @@ class SinhalaNewsSourceClassification(AbsTaskClassification):
35
35
  superseded_by="SinhalaNewsSourceClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self):
38
+ def dataset_transform(self, num_proc: int = 1):
39
39
  self.dataset = self.dataset.rename_column("comment", "text")
40
40
  self.dataset = self.stratified_subsampling(
41
41
  self.dataset, seed=self.seed, splits=["train"]
@@ -75,7 +75,7 @@ class SinhalaNewsSourceClassificationV2(AbsTaskClassification):
75
75
  adapted_from=["SinhalaNewsSourceClassification"],
76
76
  )
77
77
 
78
- def dataset_transform(self):
78
+ def dataset_transform(self, num_proc: int = 1):
79
79
  self.dataset = self.stratified_subsampling(
80
80
  self.dataset, seed=self.seed, splits=["train"]
81
81
  )
@@ -42,7 +42,7 @@ class CSFDSKMovieReviewSentimentClassification(AbsTaskClassification):
42
42
  # Increase the samples_per_label in order to improve baseline performance
43
43
  samples_per_label = 20
44
44
 
45
- def dataset_transform(self):
45
+ def dataset_transform(self, num_proc: int = 1):
46
46
  self.dataset = self.dataset.rename_columns(
47
47
  {"comment": "text", "rating_int": "label"}
48
48
  )
@@ -89,7 +89,7 @@ class CSFDSKMovieReviewSentimentClassificationV2(AbsTaskClassification):
89
89
  # Increase the samples_per_label in order to improve baseline performance
90
90
  samples_per_label = 20
91
91
 
92
- def dataset_transform(self):
92
+ def dataset_transform(self, num_proc: int = 1):
93
93
  self.dataset = self.stratified_subsampling(
94
94
  self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES
95
95
  )
@@ -75,7 +75,7 @@ class FrenkSlClassificationV2(AbsTaskClassification):
75
75
  adapted_from=["FrenkSlClassification"],
76
76
  )
77
77
 
78
- def dataset_transform(self):
78
+ def dataset_transform(self, num_proc: int = 1):
79
79
  self.dataset = self.stratified_subsampling(
80
80
  self.dataset, seed=self.seed, splits=["test"]
81
81
  )
@@ -29,7 +29,7 @@ class SpanishNewsClassification(AbsTaskClassification):
29
29
  superseded_by="SpanishNewsClassification.v2",
30
30
  )
31
31
 
32
- def dataset_transform(self):
32
+ def dataset_transform(self, num_proc: int = 1):
33
33
  self.dataset = self.dataset.rename_columns({"category": "label"})
34
34
  self.dataset = self.stratified_subsampling(
35
35
  self.dataset, seed=self.seed, splits=["train"]
@@ -63,7 +63,7 @@ class SpanishNewsClassificationV2(AbsTaskClassification):
63
63
  adapted_from=["SpanishNewsClassification"],
64
64
  )
65
65
 
66
- def dataset_transform(self):
66
+ def dataset_transform(self, num_proc: int = 1):
67
67
  self.dataset = self.stratified_subsampling(
68
68
  self.dataset, seed=self.seed, splits=["train"]
69
69
  )
@@ -38,7 +38,7 @@ class SiswatiNewsClassification(AbsTaskClassification):
38
38
  superseded_by="SiswatiNewsClassification.v2",
39
39
  )
40
40
 
41
- def dataset_transform(self):
41
+ def dataset_transform(self, num_proc: int = 1):
42
42
  self.dataset = self.dataset.rename_columns({"title": "text"})
43
43
 
44
44
 
@@ -35,7 +35,7 @@ class TamilNewsClassification(AbsTaskClassification):
35
35
  superseded_by="TamilNewsClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self):
38
+ def dataset_transform(self, num_proc: int = 1):
39
39
  self.dataset = self.dataset.rename_columns(
40
40
  {"NewsInTamil": "text", "Category": "label"}
41
41
  )
@@ -75,5 +75,5 @@ class TamilNewsClassificationV2(AbsTaskClassification):
75
75
  adapted_from=["TamilNewsClassification"],
76
76
  )
77
77
 
78
- def dataset_transform(self):
78
+ def dataset_transform(self, num_proc: int = 1):
79
79
  self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
@@ -28,7 +28,7 @@ class TeluguAndhraJyotiNewsClassification(AbsTaskClassification):
28
28
  superseded_by="TeluguAndhraJyotiNewsClassification.v2",
29
29
  )
30
30
 
31
- def dataset_transform(self):
31
+ def dataset_transform(self, num_proc: int = 1):
32
32
  self.dataset = self.dataset.rename_columns({"body": "text", "topic": "label"})
33
33
  self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
34
34
 
@@ -59,5 +59,5 @@ class TeluguAndhraJyotiNewsClassificationV2(AbsTaskClassification):
59
59
  adapted_from=["TeluguAndhraJyotiNewsClassification"],
60
60
  )
61
61
 
62
- def dataset_transform(self):
62
+ def dataset_transform(self, num_proc: int = 1):
63
63
  self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
@@ -38,7 +38,7 @@ class WongnaiReviewsClassification(AbsTaskClassification):
38
38
  """,
39
39
  )
40
40
 
41
- def dataset_transform(self):
41
+ def dataset_transform(self, num_proc: int = 1):
42
42
  self.dataset = self.dataset.rename_columns(
43
43
  {"review_body": "text", "star_rating": "label"}
44
44
  )
@@ -36,7 +36,7 @@ class TurkishMovieSentimentClassification(AbsTaskClassification):
36
36
  superseded_by="TurkishMovieSentimentClassification.v2",
37
37
  )
38
38
 
39
- def dataset_transform(self):
39
+ def dataset_transform(self, num_proc: int = 1):
40
40
  self.dataset = self.stratified_subsampling(
41
41
  self.dataset, seed=self.seed, splits=["test"]
42
42
  )
@@ -76,7 +76,7 @@ class TurkishMovieSentimentClassificationV2(AbsTaskClassification):
76
76
  adapted_from=["TurkishMovieSentimentClassification"],
77
77
  )
78
78
 
79
- def dataset_transform(self):
79
+ def dataset_transform(self, num_proc: int = 1):
80
80
  self.dataset = self.stratified_subsampling(
81
81
  self.dataset, seed=self.seed, splits=["test"]
82
82
  )
@@ -39,7 +39,7 @@ Tetreault, Joel},
39
39
  superseded_by="UkrFormalityClassification.v2",
40
40
  )
41
41
 
42
- def dataset_transform(self):
42
+ def dataset_transform(self, num_proc: int = 1):
43
43
  self.dataset = self.dataset.rename_column("labels", "label")
44
44
  self.dataset = self.dataset.class_encode_column("label")
45
45
  self.dataset = self.stratified_subsampling(
@@ -84,7 +84,7 @@ Tetreault, Joel},
84
84
  adapted_from=["UkrFormalityClassification"],
85
85
  )
86
86
 
87
- def dataset_transform(self):
87
+ def dataset_transform(self, num_proc: int = 1):
88
88
  self.dataset = self.stratified_subsampling(
89
89
  self.dataset, seed=self.seed, splits=["train", "test"]
90
90
  )
@@ -39,7 +39,7 @@ class ToxicConversationsVNClassification(AbsTaskClassification):
39
39
  adapted_from=["ToxicConversationsClassification"],
40
40
  )
41
41
 
42
- def dataset_transform(self):
42
+ def dataset_transform(self, num_proc: int = 1):
43
43
  self.dataset = self.stratified_subsampling(
44
44
  self.dataset, seed=self.seed, splits=["test"]
45
45
  )
@@ -79,7 +79,7 @@ class VieStudentFeedbackClassificationV2(AbsTaskClassification):
79
79
  adapted_from=["VieStudentFeedbackClassification"],
80
80
  )
81
81
 
82
- def dataset_transform(self):
82
+ def dataset_transform(self, num_proc: int = 1):
83
83
  self.dataset = self.stratified_subsampling(
84
84
  self.dataset, seed=self.seed, splits=["test"]
85
85
  )
@@ -39,7 +39,7 @@ class YueOpenriceReviewClassification(AbsTaskClassification):
39
39
 
40
40
  samples_per_label = 32
41
41
 
42
- def dataset_transform(self):
42
+ def dataset_transform(self, num_proc: int = 1):
43
43
  self.dataset = self.stratified_subsampling(
44
44
  self.dataset, seed=self.seed, splits=["test"]
45
45
  )
@@ -82,7 +82,7 @@ class YueOpenriceReviewClassificationV2(AbsTaskClassification):
82
82
 
83
83
  samples_per_label = 32
84
84
 
85
- def dataset_transform(self):
85
+ def dataset_transform(self, num_proc: int = 1):
86
86
  self.dataset = self.stratified_subsampling(
87
87
  self.dataset, seed=self.seed, splits=["test"]
88
88
  )
@@ -38,7 +38,7 @@ class IsiZuluNewsClassification(AbsTaskClassification):
38
38
  superseded_by="IsiZuluNewsClassification.v2",
39
39
  )
40
40
 
41
- def dataset_transform(self):
41
+ def dataset_transform(self, num_proc: int = 1):
42
42
  self.dataset = self.dataset.rename_columns({"title": "text"})
43
43
 
44
44
 
@@ -82,7 +82,7 @@ class BlurbsClusteringP2PFast(AbsTaskClustering):
82
82
  adapted_from=["BlurbsClusteringP2P"],
83
83
  )
84
84
 
85
- def dataset_transform(self):
85
+ def dataset_transform(self, num_proc: int = 1):
86
86
  self.dataset = _convert_to_fast(
87
87
  self.dataset, self.input_column_name, self.label_column_name, self.seed
88
88
  )
@@ -91,7 +91,7 @@ class BlurbsClusteringS2SFast(AbsTaskClustering):
91
91
  adapted_from=["BlurbsClusteringS2S"],
92
92
  )
93
93
 
94
- def dataset_transform(self):
94
+ def dataset_transform(self, num_proc: int = 1):
95
95
  ds = {}
96
96
  for split in self.metadata.eval_splits:
97
97
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -82,7 +82,7 @@ class ArxivClusteringP2PFast(AbsTaskClusteringLegacy):
82
82
  # simply downsample each cluster.
83
83
  )
84
84
 
85
- def dataset_transform(self):
85
+ def dataset_transform(self, num_proc: int = 1):
86
86
  rng_state = random.Random(self.seed)
87
87
 
88
88
  ds = {}
@@ -38,7 +38,7 @@ class ArXivHierarchicalClusteringP2P(AbsTaskClustering):
38
38
  bibtex_citation="",
39
39
  )
40
40
 
41
- def dataset_transform(self):
41
+ def dataset_transform(self, num_proc: int = 1):
42
42
  ds = {}
43
43
  for split in self.metadata.eval_splits:
44
44
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -78,7 +78,7 @@ class ArXivHierarchicalClusteringS2S(AbsTaskClustering):
78
78
  bibtex_citation="",
79
79
  )
80
80
 
81
- def dataset_transform(self):
81
+ def dataset_transform(self, num_proc: int = 1):
82
82
  ds = {}
83
83
  for split in self.metadata.eval_splits:
84
84
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -104,7 +104,7 @@ Summarization},
104
104
  adapted_from=["BigPatentClustering"],
105
105
  )
106
106
 
107
- def dataset_transform(self):
107
+ def dataset_transform(self, num_proc: int = 1):
108
108
  for split in self.metadata.eval_splits:
109
109
  _check_label_distribution(self.dataset[split])
110
110
  self.dataset = self.stratified_subsampling(
@@ -33,7 +33,7 @@ class BiorxivClusteringP2PFast(AbsTaskClustering):
33
33
  adapted_from=["BiorxivClusteringP2P"],
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  for split in self.metadata.eval_splits:
38
38
  _check_label_distribution(self.dataset[split])
39
39
 
@@ -33,7 +33,7 @@ class BiorxivClusteringS2SFast(AbsTaskClustering):
33
33
  adapted_from=["BiorxivClusteringS2S"],
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  for split in self.metadata.eval_splits:
38
38
  _check_label_distribution(self.dataset[split])
39
39
 
@@ -37,7 +37,7 @@ class MedrxivClusteringP2PFast(AbsTaskClustering):
37
37
  adapted_from=["MedrxivClusteringP2P"],
38
38
  )
39
39
 
40
- def dataset_transform(self):
40
+ def dataset_transform(self, num_proc: int = 1):
41
41
  ds = {}
42
42
  for split in self.metadata.eval_splits:
43
43
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -37,7 +37,7 @@ class MedrxivClusteringS2SFast(AbsTaskClustering):
37
37
  adapted_from=["MedrxivClusteringS2S"],
38
38
  )
39
39
 
40
- def dataset_transform(self):
40
+ def dataset_transform(self, num_proc: int = 1):
41
41
  ds = {}
42
42
  for split in self.metadata.eval_splits:
43
43
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -51,7 +51,7 @@ Iryna Gurevych},
51
51
  adapted_from=["RedditClustering"],
52
52
  )
53
53
 
54
- def dataset_transform(self):
54
+ def dataset_transform(self, num_proc: int = 1):
55
55
  ds = {}
56
56
  for split in self.metadata.eval_splits:
57
57
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -94,7 +94,7 @@ Iryna Gurevych},
94
94
  adapted_from=["RedditClusteringP2P"],
95
95
  )
96
96
 
97
- def dataset_transform(self):
97
+ def dataset_transform(self, num_proc: int = 1):
98
98
  ds = {}
99
99
  for split in self.metadata.eval_splits:
100
100
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -51,7 +51,7 @@ Iryna Gurevych},
51
51
  adapted_from=["StackExchangeClustering"],
52
52
  )
53
53
 
54
- def dataset_transform(self):
54
+ def dataset_transform(self, num_proc: int = 1):
55
55
  ds = {}
56
56
  for split in self.metadata.eval_splits:
57
57
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -52,7 +52,7 @@ Iryna Gurevych},
52
52
  adapted_from=["StackExchangeClusteringP2P"],
53
53
  )
54
54
 
55
- def dataset_transform(self):
55
+ def dataset_transform(self, num_proc: int = 1):
56
56
  ds = {}
57
57
  for split in self.metadata.eval_splits:
58
58
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -93,7 +93,7 @@ class TwentyNewsgroupsClusteringFast(AbsTaskClustering):
93
93
  adapted_from=["TwentyNewsgroupsClustering"],
94
94
  )
95
95
 
96
- def dataset_transform(self):
96
+ def dataset_transform(self, num_proc: int = 1):
97
97
  ds = {}
98
98
  for split in self.metadata.eval_splits:
99
99
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -33,7 +33,7 @@ class BeytooteClustering(AbsTaskClustering):
33
33
  bibtex_citation=""" """,
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  self.dataset = self.stratified_subsampling(
38
38
  self.dataset,
39
39
  seed=self.seed,
@@ -93,7 +93,7 @@ class HamshahriClustring(AbsTaskClustering):
93
93
  bibtex_citation=""" """,
94
94
  )
95
95
 
96
- def dataset_transform(self):
96
+ def dataset_transform(self, num_proc: int = 1):
97
97
  self.dataset = self.dataset.map(
98
98
  lambda x: {"sentences": f"{x['title']}\n: {x['summary']}"}
99
99
  )
@@ -151,7 +151,7 @@ class NLPTwitterAnalysisClustering(AbsTaskClustering):
151
151
  bibtex_citation=""" """,
152
152
  )
153
153
 
154
- def dataset_transform(self):
154
+ def dataset_transform(self, num_proc: int = 1):
155
155
  self.dataset = self.dataset.rename_column("tweet", "sentences")
156
156
  self.dataset = self.dataset.rename_column("label", "labels")
157
157
  self.dataset = self.stratified_subsampling(
@@ -187,7 +187,7 @@ class SIDClustring(AbsTaskClustering):
187
187
  bibtex_citation=""" """,
188
188
  )
189
189
 
190
- def dataset_transform(self):
190
+ def dataset_transform(self, num_proc: int = 1):
191
191
  self.dataset = self.stratified_subsampling(
192
192
  self.dataset,
193
193
  seed=self.seed,
@@ -48,7 +48,7 @@ class HALClusteringS2S(AbsTaskClusteringLegacy):
48
48
  superseded_by="HALClusteringS2S.v2",
49
49
  )
50
50
 
51
- def dataset_transform(self):
51
+ def dataset_transform(self, num_proc: int = 1):
52
52
  """Convert to standard format"""
53
53
  self.dataset = self.dataset.remove_columns("hal_id")
54
54
  titles = self.dataset["test"]["title"]
@@ -98,7 +98,7 @@ class HALClusteringS2SFast(AbsTaskClustering):
98
98
  adapted_from=["HALClusteringS2S"],
99
99
  )
100
100
 
101
- def dataset_transform(self):
101
+ def dataset_transform(self, num_proc: int = 1):
102
102
  """Convert to standard format"""
103
103
  self.dataset["test"] = self.dataset["test"].remove_columns("hal_id")
104
104
  self.dataset["test"] = self.dataset["test"].rename_columns(
@@ -51,7 +51,7 @@ class MLSUMClusteringP2P(AbsTaskClusteringLegacy):
51
51
  superseded_by="MLSUMClusteringP2P.v2",
52
52
  )
53
53
 
54
- def load_data(self) -> None:
54
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
55
55
  """Load dataset from HuggingFace hub and convert it to the standard format."""
56
56
  if self.data_loaded:
57
57
  return
@@ -124,7 +124,7 @@ class MLSUMClusteringP2PFast(AbsTaskClustering):
124
124
  adapted_from=["MLSUMClusteringP2P"],
125
125
  )
126
126
 
127
- def load_data(self) -> None:
127
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
128
128
  """Load dataset from HuggingFace hub and convert it to the standard format."""
129
129
  if self.data_loaded:
130
130
  return
@@ -51,7 +51,7 @@ class MLSUMClusteringS2S(AbsTaskClusteringLegacy):
51
51
  superseded_by="MLSUMClusteringS2S.v2",
52
52
  )
53
53
 
54
- def load_data(self) -> None:
54
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
55
55
  """Load dataset from HuggingFace hub and convert it to the standard format."""
56
56
  if self.data_loaded:
57
57
  return
@@ -119,7 +119,7 @@ class MLSUMClusteringS2SFast(AbsTaskClustering):
119
119
  adapted_from=["MLSUMClusteringS2S"],
120
120
  )
121
121
 
122
- def load_data(self) -> None:
122
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
123
123
  """Load dataset from HuggingFace hub and convert it to the standard format."""
124
124
  if self.data_loaded:
125
125
  return
@@ -239,7 +239,7 @@ class SIB200ClusteringFast(AbsTaskClustering):
239
239
  """, # combined train, validation, and test into test.
240
240
  )
241
241
 
242
- def dataset_transform(self):
242
+ def dataset_transform(self, num_proc: int = 1):
243
243
  ds = {}
244
244
  for lang in self.hf_subsets:
245
245
  labels = []
@@ -81,7 +81,7 @@ class WikiClusteringFastP2P(AbsTaskClustering):
81
81
  adapted_from=["WikiClusteringP2P"],
82
82
  )
83
83
 
84
- def dataset_transform(self):
84
+ def dataset_transform(self, num_proc: int = 1):
85
85
  ds = {}
86
86
  for lang in self.hf_subsets:
87
87
  labels = []
@@ -33,7 +33,7 @@ class DutchNewsArticlesClusteringP2P(AbsTaskClustering):
33
33
  },
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  for split in self.dataset:
38
38
  self.dataset[split] = self.dataset[split].rename_columns(
39
39
  {"label": "labels", "text": "sentences"}
@@ -33,7 +33,7 @@ class DutchNewsArticlesClusteringS2S(AbsTaskClustering):
33
33
  },
34
34
  )
35
35
 
36
- def dataset_transform(self):
36
+ def dataset_transform(self, num_proc: int = 1):
37
37
  for split in self.dataset:
38
38
  self.dataset[split] = self.dataset[split].rename_columns(
39
39
  {"label": "labels", "title": "sentences"}
@@ -43,7 +43,7 @@ class IconclassClusteringS2S(AbsTaskClustering):
43
43
  },
44
44
  )
45
45
 
46
- def dataset_transform(self):
46
+ def dataset_transform(self, num_proc: int = 1):
47
47
  for split in self.dataset:
48
48
  self.dataset[split] = self.dataset[split].map(
49
49
  lambda ex: {"labels": ex["label"], "sentences": ex["text"]}
@@ -43,7 +43,7 @@ class OpenTenderClusteringP2P(AbsTaskClustering):
43
43
  },
44
44
  )
45
45
 
46
- def dataset_transform(self):
46
+ def dataset_transform(self, num_proc: int = 1):
47
47
  # reuse the dataset for classification
48
48
  for split in self.dataset:
49
49
  self.dataset[split] = self.dataset[split].map(
@@ -44,7 +44,7 @@ class VABBClusteringP2P(AbsTaskClustering):
44
44
  },
45
45
  )
46
46
 
47
- def dataset_transform(self):
47
+ def dataset_transform(self, num_proc: int = 1):
48
48
  for split in self.dataset:
49
49
  self.dataset[split] = self.dataset[split].map(
50
50
  lambda ex: {
@@ -44,7 +44,7 @@ class VABBClusteringS2S(AbsTaskClustering):
44
44
  },
45
45
  )
46
46
 
47
- def dataset_transform(self):
47
+ def dataset_transform(self, num_proc: int = 1):
48
48
  for split in self.dataset:
49
49
  self.dataset[split] = self.dataset[split].rename_columns(
50
50
  {"title": "sentences"}
@@ -58,7 +58,7 @@ class SNLClustering(AbsTaskClusteringLegacy):
58
58
  superseded_by="SNLHierarchicalClusteringP2P",
59
59
  )
60
60
 
61
- def dataset_transform(self):
61
+ def dataset_transform(self, num_proc: int = 1):
62
62
  splits = self.metadata.eval_splits
63
63
 
64
64
  documents: list = []
@@ -58,7 +58,7 @@ class VGClustering(AbsTaskClusteringLegacy):
58
58
  superseded_by="VGHierarchicalClusteringP2P",
59
59
  )
60
60
 
61
- def dataset_transform(self):
61
+ def dataset_transform(self, num_proc: int = 1):
62
62
  splits = self.metadata.eval_splits
63
63
 
64
64
  documents: list = []
@@ -131,7 +131,7 @@ Piperidis, Stelios},
131
131
  adapted_from=["EightTagsClustering"],
132
132
  )
133
133
 
134
- def dataset_transform(self):
134
+ def dataset_transform(self, num_proc: int = 1):
135
135
  ds = {}
136
136
  for split in self.metadata.eval_splits:
137
137
  labels = list(chain.from_iterable(self.dataset[split]["labels"]))
@@ -204,7 +204,7 @@ class PlscClusteringS2SFast(AbsTaskClustering):
204
204
  adapted_from=["PlscClusteringS2S"],
205
205
  )
206
206
 
207
- def dataset_transform(self):
207
+ def dataset_transform(self, num_proc: int = 1):
208
208
  ds = {}
209
209
  for split in self.metadata.eval_splits:
210
210
  labels = self.dataset[split]["labels"]
@@ -286,7 +286,7 @@ class PlscClusteringP2PFast(AbsTaskClustering):
286
286
  adapted_from=["PlscClusteringP2P"],
287
287
  )
288
288
 
289
- def dataset_transform(self):
289
+ def dataset_transform(self, num_proc: int = 1):
290
290
  ds = {}
291
291
  for split in self.metadata.eval_splits:
292
292
  labels = self.dataset[split]["labels"]