mteb 2.7.4-py3-none-any.whl → 2.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (287)
  1. mteb/_create_dataloaders.py +47 -5
  2. mteb/_evaluators/any_sts_evaluator.py +2 -0
  3. mteb/_evaluators/clustering_evaluator.py +2 -0
  4. mteb/_evaluators/evaluator.py +2 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -1
  6. mteb/_evaluators/pair_classification_evaluator.py +3 -0
  7. mteb/_evaluators/retrieval_evaluator.py +3 -0
  8. mteb/_evaluators/sklearn_evaluator.py +6 -1
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +2 -0
  10. mteb/_evaluators/text/summarization_evaluator.py +2 -0
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -0
  12. mteb/abstasks/abstask.py +31 -12
  13. mteb/abstasks/classification.py +10 -3
  14. mteb/abstasks/clustering.py +6 -2
  15. mteb/abstasks/clustering_legacy.py +8 -2
  16. mteb/abstasks/image/image_text_pair_classification.py +6 -2
  17. mteb/abstasks/multilabel_classification.py +2 -0
  18. mteb/abstasks/pair_classification.py +8 -2
  19. mteb/abstasks/retrieval.py +26 -11
  20. mteb/abstasks/retrieval_dataset_loaders.py +29 -19
  21. mteb/abstasks/sts.py +10 -3
  22. mteb/abstasks/text/bitext_mining.py +9 -5
  23. mteb/abstasks/text/reranking.py +2 -2
  24. mteb/abstasks/text/summarization.py +2 -1
  25. mteb/abstasks/zeroshot_classification.py +8 -2
  26. mteb/evaluate.py +13 -2
  27. mteb/models/model_implementations/bm25.py +2 -0
  28. mteb/models/model_implementations/pylate_models.py +10 -0
  29. mteb/models/models_protocols.py +4 -0
  30. mteb/models/search_wrappers.py +12 -0
  31. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  32. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  33. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  34. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  35. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  36. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  37. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  38. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  39. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  40. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  41. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  42. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  43. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  44. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  45. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  46. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  47. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  48. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  49. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  50. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  51. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  52. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  53. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  54. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  55. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  56. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  57. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  58. mteb/tasks/classification/est/estonian_valence.py +1 -1
  59. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  60. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  61. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  62. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  63. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  64. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  65. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  66. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  67. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  68. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  69. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  70. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  71. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  72. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  73. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  74. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  75. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  76. mteb/tasks/classification/kor/klue_tc.py +2 -2
  77. mteb/tasks/classification/kor/kor_fin.py +1 -1
  78. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  79. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  80. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  81. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  82. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  83. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  84. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  85. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  86. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  87. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  88. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  89. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  90. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  91. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  92. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  93. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  94. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  95. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  96. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  97. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  98. mteb/tasks/classification/ron/moroco.py +1 -1
  99. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  100. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  101. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  102. mteb/tasks/classification/rus/headline_classification.py +2 -2
  103. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  104. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  105. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  106. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  107. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  108. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  109. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  110. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  111. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  112. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  113. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  114. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  115. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  116. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  117. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  118. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  119. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  120. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  121. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  122. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  123. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  124. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  125. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  126. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  127. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  128. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  129. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  130. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  131. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  132. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  133. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  134. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  135. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  136. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  137. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  138. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  139. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  140. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  141. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  142. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  143. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  144. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  145. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  146. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  147. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  148. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  149. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  150. mteb/tasks/clustering/nob/snl_clustering.py +1 -1
  151. mteb/tasks/clustering/nob/vg_clustering.py +1 -1
  152. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  153. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  154. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  155. mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
  156. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  157. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  158. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  159. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  160. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  161. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  162. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  163. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  164. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  165. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  166. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  167. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  168. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  169. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
  170. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  171. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  172. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  173. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  174. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  175. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  176. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  177. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  178. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  179. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  180. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  181. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  182. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  183. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  184. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  185. mteb/tasks/pair_classification/rus/terra.py +2 -2
  186. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  187. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  188. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  189. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  190. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  191. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  192. mteb/tasks/retrieval/code/code_rag.py +4 -4
  193. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  194. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  195. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  196. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  197. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  198. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  199. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  200. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  201. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
  202. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
  203. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  204. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  205. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  206. mteb/tasks/retrieval/eng/bright_retrieval.py +1 -1
  207. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  208. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  209. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  210. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  211. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  212. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  213. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  214. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  215. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  216. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  217. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  218. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  219. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  220. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  221. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  222. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  223. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  224. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  225. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  226. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  227. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  228. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  229. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  230. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  231. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  232. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  233. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  234. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  235. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  236. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  237. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  238. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  239. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  240. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  241. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  242. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  243. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  244. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  245. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  246. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  247. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  248. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  249. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  250. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  251. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  252. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
  253. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  254. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  255. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  256. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  257. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  258. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  259. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  260. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  261. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  262. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  263. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  264. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  265. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  266. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  267. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  268. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  269. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  270. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  271. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  272. mteb/tasks/retrieval/nob/norquad.py +1 -1
  273. mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
  274. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  275. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  276. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  277. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  278. mteb/tasks/sts/kor/klue_sts.py +1 -1
  279. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  280. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  281. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  282. {mteb-2.7.4.dist-info → mteb-2.7.6.dist-info}/METADATA +1 -1
  283. {mteb-2.7.4.dist-info → mteb-2.7.6.dist-info}/RECORD +287 -287
  284. {mteb-2.7.4.dist-info → mteb-2.7.6.dist-info}/WHEEL +0 -0
  285. {mteb-2.7.4.dist-info → mteb-2.7.6.dist-info}/entry_points.txt +0 -0
  286. {mteb-2.7.4.dist-info → mteb-2.7.6.dist-info}/licenses/LICENSE +0 -0
  287. {mteb-2.7.4.dist-info → mteb-2.7.6.dist-info}/top_level.txt +0 -0
@@ -127,6 +127,7 @@ class AbsTaskZeroShotClassification(AbsTask):
  hf_subset: str,
  encode_kwargs: EncodeKwargs,
  prediction_folder: Path | None = None,
+ num_proc: int = 1,
  **kwargs,
  ) -> ZeroShotClassificationMetrics:
  if not isinstance(model, EncoderProtocol):
@@ -145,7 +146,11 @@ class AbsTaskZeroShotClassification(AbsTask):
  hf_subset=hf_subset,
  **kwargs,
  )
- probs = evaluator(model, encode_kwargs=encode_kwargs)
+ probs = evaluator(
+ model,
+ encode_kwargs=encode_kwargs,
+ num_proc=num_proc,
+ )

  if prediction_folder:
  self._save_task_predictions(
@@ -170,13 +175,14 @@ class AbsTaskZeroShotClassification(AbsTask):
  accuracy=metrics.accuracy_score(labels, predictions),
  )

- def _push_dataset_to_hub(self, repo_name: str) -> None:
+ def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
  self._upload_dataset_to_hub(
  repo_name,
  [
  self.input_column_name,
  self.label_column_name,
  ],
+ num_proc=num_proc,
  )
  labels_dataset = Dataset.from_dict({"labels": self.get_candidate_labels()})
  labels_dataset.push_to_hub(repo_name, config_name="labels")
mteb/evaluate.py CHANGED
@@ -125,6 +125,7 @@ def _evaluate_task(
  co2_tracker=False,
  prediction_folder=prediction_folder,
  public_only=public_only,
+ num_proc=num_proc,
  )
  if isinstance(result, TaskResult):
  result.kg_co2_emissions = tracker.final_emissions
@@ -137,7 +138,7 @@
  data_preloaded = task.data_loaded
  if not data_preloaded:
  try:
- task.load_data()
+ task.load_data(num_proc=num_proc)
  except DatasetNotFoundError as e:
  if not task.metadata.is_public and public_only is None:
  msg = (
@@ -163,6 +164,7 @@
  subsets_to_run=hf_subsets,
  encode_kwargs=encode_kwargs,
  prediction_folder=prediction_folder,
+ num_proc=num_proc,
  )
  tock = time()

@@ -280,6 +282,7 @@ def evaluate(
  prediction_folder: Path | str | None = None,
  show_progress_bar: bool = True,
  public_only: bool | None = None,
+ num_proc: int = 1,
  ) -> ModelResult:
  """This function runs a model on a given task and returns the results.

@@ -288,7 +291,7 @@
  tasks: A task to run.
  co2_tracker: If True, track the CO₂ emissions of the evaluation, required codecarbon to be installed, which can be installed using
  `pip install mteb[codecarbon]`. If none is passed co2 tracking will only be run if codecarbon is installed.
- encode_kwargs: Additional keyword arguments passed to the models `encode` method.
+ encode_kwargs: Additional keyword arguments passed to the models `encode` and `load_data` methods;
  raise_error: If True, raise an error if the task fails. If False, return an empty list.
  cache: The cache to use for loading the results. If None, then no cache will be used. The default cache saved the cache in the
  `~/.cache/mteb` directory. It can be overridden by setting the `MTEB_CACHE` environment variable to a different directory or by directly
@@ -304,6 +307,7 @@
  show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
  `encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
  public_only: Run only public tasks. If None, it will attempt to run the private task.
+ num_proc: Number of processes to use during data loading and transformation. Defaults to 1.

  Returns:
  The results of the evaluation.
@@ -356,8 +360,12 @@
  prediction_folder=prediction_folder,
  show_progress_bar=show_progress_bar,
  public_only=public_only,
+ num_proc=num_proc,
  )
  combined_results = aggregated_task.combine_task_results(results.task_results)
+ if cache:
+ cache.save_to_cache(combined_results, meta)
+
  return ModelResult(
  model_name=results.model_name,
  model_revision=results.model_revision,
@@ -388,6 +396,7 @@ def evaluate(
  prediction_folder=prediction_folder,
  show_progress_bar=False,
  public_only=public_only,
+ num_proc=num_proc,
  )
  evaluate_results.extend(_res.task_results)
  if _res.exceptions:
@@ -467,6 +476,7 @@
  encode_kwargs=encode_kwargs,
  prediction_folder=prediction_folder,
  public_only=public_only,
+ num_proc=num_proc,
  )
  except Exception as e:
  logger.error(
@@ -482,6 +492,7 @@
  encode_kwargs=encode_kwargs,
  prediction_folder=prediction_folder,
  public_only=public_only,
+ num_proc=num_proc,
  )
  logger.info(f"✓ Finished evaluation for {task.metadata.name}")

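For context, a minimal usage sketch of the new argument. This is a sketch only: it assumes the `mteb.get_model`/`mteb.get_tasks` helpers and the `mteb.evaluate(model, tasks, ...)` calling convention; the model and task names below are just examples, not part of this diff.

import mteb

# Illustrative model and task names; any supported ones would do.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
tasks = mteb.get_tasks(tasks=["NanoNQRetrieval"])

results = mteb.evaluate(
    model,
    tasks,
    num_proc=4,  # new in 2.7.6: worker processes for data loading and transformation
)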
@@ -54,6 +54,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
  hf_split: str,
  hf_subset: str,
  encode_kwargs: EncodeKwargs,
+ num_proc: int = 1,
  ) -> None:
  logger.info("Encoding Corpus...")
  corpus_texts = [
@@ -80,6 +81,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
  top_k: int,
  encode_kwargs: EncodeKwargs,
  top_ranked: TopRankedDocumentsType | None = None,
+ num_proc: int = 1,
  ) -> RetrievalOutputType:
  logger.info("Encoding Queries...")
  query_ids = list(queries["id"])
@@ -53,6 +53,7 @@ class PylateSearchEncoder:
  hf_split: str,
  hf_subset: str,
  encode_kwargs: EncodeKwargs,
+ num_proc: int,
  ) -> None:
  """Index the corpus for retrieval.

@@ -62,6 +63,7 @@ class PylateSearchEncoder:
  hf_split: Split of current task, allows to know some additional information about current split.
  hf_subset: Subset of current task. Similar to `hf_split` to get more information
  encode_kwargs: Additional arguments to pass to the encoder during indexing.
+ num_proc: Number of processes to use for indexing.
  """
  self.task_corpus = corpus

@@ -87,12 +89,14 @@ class PylateSearchEncoder:
  top_k: int,
  encode_kwargs: EncodeKwargs,
  top_ranked: TopRankedDocumentsType | None = None,
+ num_proc: int,
  ) -> RetrievalOutputType:
  queries_dataloader = create_dataloader(
  queries,
  task_metadata,
  prompt_type=PromptType.query,
  batch_size=encode_kwargs.get("batch_size", 32),
+ num_proc=num_proc,
  )

  query_embeddings = self.encode(
@@ -116,6 +120,7 @@ class PylateSearchEncoder:
  hf_subset=hf_subset,
  hf_split=hf_split,
  encode_kwargs=encode_kwargs,
+ num_proc=num_proc,
  )
  else:
  result_heaps = self._pylate_full_corpus_search(
@@ -126,6 +131,7 @@ class PylateSearchEncoder:
  hf_subset=hf_subset,
  hf_split=hf_split,
  encode_kwargs=encode_kwargs,
+ num_proc=num_proc,
  )

  results = {qid: {} for qid in query_idx_to_id.values()}
@@ -144,6 +150,7 @@ class PylateSearchEncoder:
  hf_split: str,
  top_k: int,
  encode_kwargs: EncodeKwargs,
+ num_proc: int,
  ) -> dict[str, list[tuple[float, str]]]:
  from pylate import indexes, retrieve

@@ -170,6 +177,7 @@ class PylateSearchEncoder:
  task_metadata,
  prompt_type=PromptType.document,
  batch_size=encode_kwargs.get("batch_size", 32),
+ num_proc=num_proc,
  )
  documents_embeddings = self.encode(
  documents_loader,
@@ -208,6 +216,7 @@ class PylateSearchEncoder:
  hf_subset: str,
  hf_split: str,
  encode_kwargs: EncodeKwargs,
+ num_proc: int = 1,
  ) -> dict[str, list[tuple[float, str]]]:
  """Rerank with PyLate's rank.rerank using per-query candidates.

@@ -230,6 +239,7 @@ class PylateSearchEncoder:
  task_metadata,
  prompt_type=PromptType.document,
  batch_size=encode_kwargs.get("batch_size", 32),
+ num_proc=num_proc,
  ),
  task_metadata=task_metadata,
  hf_split=hf_split,
@@ -32,6 +32,7 @@ class SearchProtocol(Protocol):
  hf_split: str,
  hf_subset: str,
  encode_kwargs: EncodeKwargs,
+ num_proc: int,
  ) -> None:
  """Index the corpus for retrieval.

@@ -41,6 +42,7 @@ class SearchProtocol(Protocol):
  hf_split: Split of current task, allows to know some additional information about current split.
  hf_subset: Subset of current task. Similar to `hf_split` to get more information
  encode_kwargs: Additional arguments to pass to the encoder during indexing.
+ num_proc: Number of processes to use for dataloading.
  """
  ...

@@ -54,6 +56,7 @@ class SearchProtocol(Protocol):
  top_k: int,
  encode_kwargs: EncodeKwargs,
  top_ranked: TopRankedDocumentsType | None = None,
+ num_proc: int,
  ) -> RetrievalOutputType:
  """Search the corpus using the given queries.

@@ -66,6 +69,7 @@ class SearchProtocol(Protocol):
  Passed only from Reranking tasks.
  top_k: Number of top documents to return for each query.
  encode_kwargs: Additional arguments to pass to the encoder during indexing.
+ num_proc: Number of processes to use for dataloading.

  Returns:
  Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.
@@ -59,6 +59,7 @@ class SearchEncoderWrapper:
  hf_split: str,
  hf_subset: str,
  encode_kwargs: EncodeKwargs,
+ num_proc: int = 1,
  ) -> None:
  """Index the corpus for retrieval.

@@ -68,6 +69,7 @@ class SearchEncoderWrapper:
  hf_split: Split of current task, allows to know some additional information about current split.
  hf_subset: Subset of current task. Similar to `hf_split` to get more information
  encode_kwargs: Additional arguments to pass to the encoder during indexing.
+ num_proc: Number of processes to use for dataloading.
  """
  # Always retain corpus for potential reranking or fallback flows
  self.task_corpus = corpus
@@ -77,6 +79,7 @@ class SearchEncoderWrapper:
  corpus,
  task_metadata,
  prompt_type=PromptType.document,
+ num_proc=num_proc,
  **encode_kwargs,
  ),
  task_metadata=task_metadata,
@@ -98,6 +101,7 @@ class SearchEncoderWrapper:
  top_k: int,
  encode_kwargs: EncodeKwargs,
  top_ranked: TopRankedDocumentsType | None = None,
+ num_proc: int = 1,
  ) -> RetrievalOutputType:
  """Search the corpus for the given queries.

@@ -110,6 +114,7 @@ class SearchEncoderWrapper:
  Passed only from Reranking tasks.
  top_k: Number of top documents to return for each query.
  encode_kwargs: Additional arguments to pass to the encoder during indexing.
+ num_proc: Number of processes to use for dataloading.

  Returns:
  Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.
@@ -121,6 +126,7 @@ class SearchEncoderWrapper:
  queries,
  task_metadata,
  prompt_type=PromptType.query,
+ num_proc=num_proc,
  **encode_kwargs,
  )

@@ -479,6 +485,7 @@ class SearchCrossEncoderWrapper:
  hf_split: str,
  hf_subset: str,
  encode_kwargs: EncodeKwargs,
+ num_proc: int = 1,
  ) -> None:
  """Index the corpus for retrieval.

@@ -488,6 +495,7 @@ class SearchCrossEncoderWrapper:
  hf_split: Split of current task, allows to know some additional information about current split.
  hf_subset: Subset of current task. Similar to `hf_split` to get more information
  encode_kwargs: Additional arguments to pass to the encoder during indexing.
+ num_proc: Number of processes to use.
  """
  self.task_corpus = corpus

@@ -501,6 +509,7 @@ class SearchCrossEncoderWrapper:
  top_k: int,
  encode_kwargs: EncodeKwargs,
  top_ranked: TopRankedDocumentsType | None = None,
+ num_proc: int = 1,
  ) -> RetrievalOutputType:
  """Search the corpus using the given queries.

@@ -513,6 +522,7 @@ class SearchCrossEncoderWrapper:
  Passed only from Reranking tasks.
  top_k: Number of top documents to return for each query.
  encode_kwargs: Additional arguments to pass to the encoder during indexing.
+ num_proc: Number of processes to use.

  Returns:
  Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.
@@ -546,12 +556,14 @@ class SearchCrossEncoderWrapper:
  Dataset.from_list(total_queries),
  task_metadata,
  prompt_type=PromptType.document,
+ num_proc=num_proc,
  **encode_kwargs,
  )
  corpus_loader = create_dataloader(
  Dataset.from_list(total_docs),
  task_metadata,
  prompt_type=PromptType.document,
+ num_proc=num_proc,
  **encode_kwargs,
  )
  predictions = self.model.predict(
@@ -59,7 +59,7 @@ class PubChemSMILESBitextMining(AbsTaskBitextMining):
  """,
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  for subset in self.hf_subsets:
  self.dataset[subset] = self.dataset[subset].rename_columns(
  COL_MAPPING[subset]
@@ -27,7 +27,7 @@ class SAMSumFa(AbsTaskBitextMining):
  bibtex_citation="",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns(
  {"text": "sentence1", "summary": "sentence2"}
  )
@@ -58,7 +58,7 @@ class SynPerChatbotSumSRetrieval(AbsTaskBitextMining):
  bibtex_citation=""" """,
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns(
  {"text": "sentence1", "summary": "sentence2"}
  )
@@ -89,7 +89,7 @@ class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining):
  bibtex_citation=""" """,
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns(
  {"text": "sentence1", "summary": "sentence2"}
  )
@@ -60,7 +60,7 @@ Rapp, Reinhard},
  superseded_by="BUCC.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  dataset = {}
  for lang in self.dataset:
  dataset[lang] = {}
@@ -265,7 +265,7 @@ class FloresBitextMining(AbsTaskBitextMining):
  """,
  )

- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return

@@ -99,7 +99,7 @@ class IN22ConvBitextMining(AbsTaskBitextMining):
  """,
  )

- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return

@@ -93,7 +93,7 @@ class IN22GenBitextMining(AbsTaskBitextMining):
  """,
  )

- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return

@@ -35,7 +35,7 @@ class NorwegianCourtsBitextMining(AbsTaskBitextMining):
  prompt="Retrieve parallel sentences in Norwegian Bokmål and Nynorsk",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  # Convert to standard format
  self.dataset = self.dataset.rename_column("nb", "sentence1")
  self.dataset = self.dataset.rename_column("nn", "sentence2")
@@ -280,7 +280,7 @@ class NTREXBitextMining(AbsTaskBitextMining):
  """,
  )

- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return

@@ -32,7 +32,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
  bibtex_citation="",
  )

- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  """Load dataset from HuggingFace hub and convert it to the standard format."""
  if self.data_loaded:
  return
@@ -44,7 +44,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
  self.dataset_transform()
  self.data_loaded = True

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  for lang in self.hf_subsets:
  self.dataset[lang] = self.dataset[lang].rename_columns(
  {"romani": "sentence1", "hungarian": "sentence2"}
@@ -230,7 +230,7 @@ class WebFAQBitextMiningQuestions(AbsTaskBitextMining):
  """,
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  dataset = {}
  for langs in self.dataset:
  dataset[langs] = {}
@@ -284,7 +284,7 @@ class WebFAQBitextMiningQAs(AbsTaskBitextMining):
  """,
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  dataset = {}
  for langs in self.dataset:
  dataset[langs] = {}
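The task hunks above and the classification hunks that follow all make the same change: the `load_data` and `dataset_transform` hooks gain a `num_proc` keyword. A minimal, self-contained sketch of that pattern follows; it is not mteb code, and the class name, dataset repo, and column names are illustrative, assuming only the Hugging Face `datasets` API.

from datasets import DatasetDict, load_dataset


class ParallelCorpusTask:
    """Stand-in for an mteb task subclass; only the two updated hooks are shown."""

    dataset: DatasetDict
    data_loaded: bool = False

    def load_data(self, num_proc: int = 1, **kwargs) -> None:
        if self.data_loaded:
            return
        # num_proc lets `datasets` download and prepare splits with several worker processes.
        self.dataset = load_dataset("user/illustrative-parallel-corpus", num_proc=num_proc)
        self.dataset_transform(num_proc=num_proc)
        self.data_loaded = True

    def dataset_transform(self, num_proc: int = 1) -> None:
        # rename_columns is metadata-only; heavier transforms could forward
        # num_proc to Dataset.map for parallel execution.
        self.dataset = self.dataset.rename_columns(
            {"src": "sentence1", "tgt": "sentence2"}
        )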
@@ -28,7 +28,7 @@ class OnlineStoreReviewSentimentClassification(AbsTaskClassification):
  superseded_by="OnlineStoreReviewSentimentClassification.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["train"]
  )
@@ -37,7 +37,7 @@ class RestaurantReviewSentimentClassification(AbsTaskClassification):
  superseded_by="RestaurantReviewSentimentClassification.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  # labels: 0 negative, 1 positive
  self.dataset = self.dataset.rename_column("polarity", "label")
  self.dataset = self.stratified_subsampling(
@@ -48,7 +48,7 @@ Mubarak, Hamdy},
  superseded_by="TweetSarcasmClassification.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  # labels: 0 non-sarcastic, 1 sarcastic
  self.dataset = self.dataset.rename_columns(
  {"tweet": "text", "sarcasm": "label"}
@@ -36,7 +36,7 @@ class BengaliHateSpeechClassification(AbsTaskClassification):
  superseded_by="BengaliHateSpeechClassification.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["train"]
  )
@@ -36,7 +36,7 @@ class BengaliSentimentAnalysis(AbsTaskClassification):
  superseded_by="BengaliSentimentAnalysis.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["train"]
  )
@@ -37,7 +37,7 @@ class BulgarianStoreReviewSentimentClassfication(AbsTaskClassification):
  """,
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns(
  {"Review": "text", "Category": "label"}
  )
@@ -39,7 +39,7 @@ class CSFDCZMovieReviewSentimentClassification(AbsTaskClassification):
  # Increase the samples_per_label in order to improve baseline performance
  samples_per_label = 20

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns(
  {"comment": "text", "rating_int": "label"}
  )
@@ -85,7 +85,7 @@ class CSFDCZMovieReviewSentimentClassificationV2(AbsTaskClassification):
  # Increase the samples_per_label in order to improve baseline performance
  samples_per_label = 20

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["test"], n_samples=2048
  )
@@ -56,7 +56,7 @@ Piperidis, Stelios},
  superseded_by="Ddisco.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns({"rating": "label"}).remove_columns(
  ["domain"]
  )
@@ -60,7 +60,7 @@ Piperidis, Stelios},

  samples_per_label = 16

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  # convert label to a 0/1 label
  labels = self.dataset["train"]["label"]
  lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
@@ -49,7 +49,7 @@ Zesch, Torsten},
  superseded_by="GermanPoliticiansTwitterSentimentClassification.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("majority_sentiment", "label")


@@ -42,7 +42,7 @@ class GreekLegalCodeClassification(AbsTaskClassification):
  """,
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset["validation"] = (
  self.dataset["validation"]
  .shuffle(seed=self.seed)
@@ -40,7 +40,7 @@ class DBpediaClassification(AbsTaskClassification):
  superseded_by="DBpediaClassification.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("content", "text")
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["train", "test"]
@@ -85,7 +85,7 @@ class DBpediaClassificationV2(AbsTaskClassification):
  adapted_from=["DBpediaClassification"],
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["train", "test"]
  )
@@ -40,7 +40,7 @@ class ToxicChatClassification(AbsTaskClassification):
  superseded_by="ToxicChatClassification.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  keep_cols = ["user_input", "toxicity"]
  rename_dict = dict(zip(keep_cols, ["text", "label"]))
  remove_cols = [
@@ -93,7 +93,7 @@ class ToxicChatClassificationV2(AbsTaskClassification):
  adapted_from=["ToxicChatClassification"],
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["test"]
  )
@@ -42,7 +42,7 @@ class ToxicConversationsClassification(AbsTaskClassification):

  samples_per_label = 16

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["test"]
  )
@@ -88,7 +88,7 @@ class ToxicConversationsClassificationV2(AbsTaskClassification):

  samples_per_label = 16

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["test"]
  )
@@ -43,7 +43,7 @@ Barbieri, Francesco},
  superseded_by="TweetTopicSingleClassification.v2",
  )

- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset["train"] = self.dataset["train_2021"]
