mteb 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (287)
  1. mteb/_create_dataloaders.py +47 -5
  2. mteb/_evaluators/any_sts_evaluator.py +2 -0
  3. mteb/_evaluators/clustering_evaluator.py +2 -0
  4. mteb/_evaluators/evaluator.py +2 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -1
  6. mteb/_evaluators/pair_classification_evaluator.py +3 -0
  7. mteb/_evaluators/retrieval_evaluator.py +3 -0
  8. mteb/_evaluators/sklearn_evaluator.py +6 -1
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +2 -0
  10. mteb/_evaluators/text/summarization_evaluator.py +2 -0
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -0
  12. mteb/abstasks/abstask.py +31 -12
  13. mteb/abstasks/classification.py +10 -3
  14. mteb/abstasks/clustering.py +6 -2
  15. mteb/abstasks/clustering_legacy.py +8 -2
  16. mteb/abstasks/image/image_text_pair_classification.py +6 -2
  17. mteb/abstasks/multilabel_classification.py +2 -0
  18. mteb/abstasks/pair_classification.py +8 -2
  19. mteb/abstasks/retrieval.py +26 -11
  20. mteb/abstasks/retrieval_dataset_loaders.py +29 -19
  21. mteb/abstasks/sts.py +10 -3
  22. mteb/abstasks/text/bitext_mining.py +9 -5
  23. mteb/abstasks/text/reranking.py +2 -2
  24. mteb/abstasks/text/summarization.py +2 -1
  25. mteb/abstasks/zeroshot_classification.py +8 -2
  26. mteb/evaluate.py +10 -2
  27. mteb/models/model_implementations/bm25.py +2 -0
  28. mteb/models/model_implementations/pylate_models.py +10 -0
  29. mteb/models/models_protocols.py +4 -0
  30. mteb/models/search_wrappers.py +12 -0
  31. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  32. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  33. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  34. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  35. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  36. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  37. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  38. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  39. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  40. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  41. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  42. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  43. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  44. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  45. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  46. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  47. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  48. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  49. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  50. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  51. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  52. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  53. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  54. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  55. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  56. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  57. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  58. mteb/tasks/classification/est/estonian_valence.py +1 -1
  59. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  60. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  61. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  62. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  63. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  64. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  65. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  66. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  67. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  68. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  69. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  70. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  71. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  72. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  73. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  74. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  75. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  76. mteb/tasks/classification/kor/klue_tc.py +2 -2
  77. mteb/tasks/classification/kor/kor_fin.py +1 -1
  78. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  79. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  80. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  81. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  82. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  83. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  84. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  85. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  86. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  87. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  88. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  89. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  90. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  91. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  92. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  93. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  94. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  95. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  96. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  97. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  98. mteb/tasks/classification/ron/moroco.py +1 -1
  99. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  100. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  101. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  102. mteb/tasks/classification/rus/headline_classification.py +2 -2
  103. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  104. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  105. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  106. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  107. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  108. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  109. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  110. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  111. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  112. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  113. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  114. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  115. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  116. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  117. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  118. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  119. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  120. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  121. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  122. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  123. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  124. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  125. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  126. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  127. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  128. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  129. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  130. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  131. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  132. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  133. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  134. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  135. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  136. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  137. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  138. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  139. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  140. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  141. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  142. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  143. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  144. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  145. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  146. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  147. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  148. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  149. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  150. mteb/tasks/clustering/nob/snl_clustering.py +1 -1
  151. mteb/tasks/clustering/nob/vg_clustering.py +1 -1
  152. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  153. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  154. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  155. mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
  156. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  157. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  158. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  159. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  160. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  161. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  162. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  163. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  164. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  165. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  166. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  167. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  168. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  169. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
  170. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  171. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  172. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  173. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  174. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  175. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  176. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  177. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  178. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  179. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  180. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  181. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  182. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  183. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  184. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  185. mteb/tasks/pair_classification/rus/terra.py +2 -2
  186. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  187. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  188. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  189. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  190. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  191. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  192. mteb/tasks/retrieval/code/code_rag.py +4 -4
  193. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  194. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  195. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  196. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  197. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  198. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  199. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  200. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  201. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
  202. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
  203. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  204. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  205. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  206. mteb/tasks/retrieval/eng/bright_retrieval.py +1 -1
  207. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  208. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  209. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  210. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  211. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  212. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  213. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  214. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  215. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  216. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  217. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  218. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  219. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  220. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  221. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  222. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  223. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  224. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  225. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  226. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  227. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  228. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  229. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  230. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  231. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  232. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  233. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  234. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  235. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  236. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  237. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  238. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  239. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  240. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  241. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  242. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  243. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  244. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  245. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  246. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  247. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  248. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  249. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  250. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  251. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  252. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
  253. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  254. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  255. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  256. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  257. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  258. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  259. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  260. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  261. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  262. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  263. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  264. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  265. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  266. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  267. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  268. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  269. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  270. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  271. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  272. mteb/tasks/retrieval/nob/norquad.py +1 -1
  273. mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
  274. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  275. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  276. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  277. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  278. mteb/tasks/sts/kor/klue_sts.py +1 -1
  279. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  280. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  281. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  282. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/METADATA +1 -1
  283. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/RECORD +287 -287
  284. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/WHEEL +0 -0
  285. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/entry_points.txt +0 -0
  286. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/licenses/LICENSE +0 -0
  287. {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/top_level.txt +0 -0
@@ -32,7 +32,7 @@ class RuSciBenchGRNTIClusteringP2P(AbsTaskClustering):
  prompt="Identify the category of scientific papers based on the titles and abstracts",
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns(
  {"label": "labels", "text": "sentences"}
  )
@@ -32,7 +32,7 @@ class RuSciBenchOECDClusteringP2P(AbsTaskClustering):
  prompt="Identify the category of scientific papers based on the titles and abstracts",
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns(
  {"label": "labels", "text": "sentences"}
  )
@@ -51,7 +51,7 @@ class CLSClusteringFastS2S(AbsTaskClustering):
  adapted_from=["CLSClusteringS2S"],
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -110,7 +110,7 @@ class CLSClusteringFastP2P(AbsTaskClustering):
  adapted_from=["CLSClusteringP2P"],
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -239,7 +239,7 @@ class ThuNewsClusteringFastS2S(AbsTaskClustering):
  adapted_from=["ThuNewsClusteringS2S"],
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -298,7 +298,7 @@ class ThuNewsClusteringFastP2P(AbsTaskClustering):
  adapted_from=["ThuNewsClusteringP2P"],
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
@@ -53,7 +53,7 @@ class ImageCoDe(AbsTaskImageTextPairClassification):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -45,7 +45,7 @@ class SugarCrepe(AbsTaskImageTextPairClassification):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  """Load dataset from HuggingFace hub"""
  if self.data_loaded:
  return
@@ -175,7 +175,7 @@ class mFollowIRCrossLingual(AbsTaskRetrieval): # noqa: N801
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -243,7 +243,7 @@ class mFollowIR(AbsTaskRetrieval): # noqa: N801
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -123,7 +123,7 @@ class CVBenchCount(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
  path=self.metadata.dataset["path"],
  splits=self.metadata.eval_splits,
@@ -165,7 +165,7 @@ class CVBenchRelation(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
  path=self.metadata.dataset["path"],
  splits=self.metadata.eval_splits,
@@ -207,7 +207,7 @@ class CVBenchDepth(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
  path=self.metadata.dataset["path"],
  splits=self.metadata.eval_splits,
@@ -249,7 +249,7 @@ class CVBenchDistance(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
  path=self.metadata.dataset["path"],
  splits=self.metadata.eval_splits,
@@ -39,7 +39,7 @@ class EmitClassification(AbsTaskMultilabelClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns({"emotion_labels": "label"})
  unused_cols = [
  col
@@ -40,7 +40,7 @@ Borg, Claudia},
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_columns({"labels": "label"})
  remove_cols = [
  col
@@ -29,5 +29,5 @@ class RuToxicOKMLCUPMultilabelClassification(AbsTaskMultilabelClassification):
  bibtex_citation="""""",
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("labels", "label")
@@ -42,7 +42,7 @@ class SwedishPatentCPCGroupClassification(AbsTaskMultilabelClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["train"], n_samples=8192
  )
@@ -42,7 +42,7 @@ class SwedishPatentCPCSubclassClassification(AbsTaskMultilabelClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset, seed=self.seed, splits=["train"], n_samples=8192
  )
@@ -39,7 +39,7 @@ class ArEntail(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for split in self.metadata.eval_splits:
  _dataset[split] = [
@@ -38,5 +38,5 @@ class TalemaaderPC(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("label", "labels")
@@ -34,7 +34,7 @@ class FalseFriendsDeEnPC(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for split in self.metadata.eval_splits:
  hf_dataset = self.dataset[split]
@@ -45,7 +45,7 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for split in self.metadata.eval_splits:
  hf_dataset = self.dataset[split]
@@ -106,7 +106,7 @@ class PubChemSMILESPC(AbsTaskPairClassification):
  self.dataset_transform()
  self.data_loaded = True
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.stratified_subsampling(
  self.dataset,
  seed=self.seed,
@@ -45,7 +45,7 @@ class PubChemSynonymPC(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
 
  for split in self.metadata.eval_splits:
@@ -45,7 +45,7 @@ class PubChemWikiParagraphsPC(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for split in self.metadata.eval_splits:
  hf_dataset = self.dataset[split]
@@ -52,6 +52,6 @@ Tsujii, Jun{'}ichi},
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -47,6 +47,6 @@ Jurgens, David},
  prompt="Retrieve tweets that are semantically similar to the given tweet",
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -47,6 +47,6 @@ Riedel, Sebastian},
  prompt="Retrieve tweets that are semantically similar to the given tweet",
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -39,7 +39,7 @@ class CExaPPC(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  self.dataset = self.dataset.map(
  lambda example: {"label": 1 if example["label"] == "paraphrase" else 0}
@@ -80,7 +80,7 @@ class SynPerChatbotRAGFAQPC(AbsTaskPairClassification):
  bibtex_citation=""" """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for split in self.metadata.eval_splits:
  _dataset[split] = [
@@ -118,7 +118,7 @@ class FarsiParaphraseDetection(AbsTaskPairClassification):
  bibtex_citation=""" """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for split in self.metadata.eval_splits:
  _dataset[split] = [
@@ -156,7 +156,7 @@ class SynPerTextKeywordsPC(AbsTaskPairClassification):
  bibtex_citation=""" """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for split in self.metadata.eval_splits:
  _dataset[split] = [
@@ -194,7 +194,7 @@ class SynPerQAPC(AbsTaskPairClassification):
  bibtex_citation=""" """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for split in self.metadata.eval_splits:
  _dataset[split] = [
@@ -38,7 +38,7 @@ class FarsTail(AbsTaskPairClassification):
  """, # after removing neutral
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
  path = self.metadata.dataset["path"]
@@ -52,7 +52,7 @@ class FarsTail(AbsTaskPairClassification):
  self.dataset_transform()
  self.data_loaded = True
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  self.dataset = self.dataset.filter(lambda x: x["label"] != "n")
  self.dataset = self.dataset.map(
@@ -36,6 +36,6 @@ class ArmenianParaphrasePC(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -37,7 +37,7 @@ class DisCoTexPairClassification(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.remove_columns(["id", "source"])
  self.dataset = self.dataset.map(
  lambda x: {
@@ -37,7 +37,7 @@ class KlueNLI(AbsTaskPairClassification):
  """, # 3000 - neutral samples
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for split in self.metadata.eval_splits:
  # keep labels 0=entailment and 2=contradiction, and map them as 1 and 0 for binary classification
@@ -52,7 +52,7 @@ Dolan, Bill},
  # sum of 4 languages after neutral filtering
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  """Load dataset from HuggingFace hub"""
  if self.data_loaded:
  return
@@ -62,7 +62,7 @@ Dolan, Bill},
  self.dataset_transform()
  self.data_loaded = True
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for lang in self.hf_subsets:
  _dataset[lang] = {}
@@ -60,7 +60,7 @@ in Natural Language Processing},
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  for lang in self.hf_subsets:
  _dataset[lang] = {}
@@ -57,7 +57,7 @@ Piperidis, Stelios},
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
 
@@ -105,7 +105,7 @@ class PpcPC(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
 
@@ -151,7 +151,7 @@ Kan, Min-Yen},
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
 
@@ -203,6 +203,6 @@ Piperidis, Stelios},
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -36,7 +36,7 @@ class Assin2RTE(AbsTaskPairClassification):
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
  self.dataset = self.stratified_subsampling(
  self.dataset,
@@ -47,7 +47,7 @@ and de Paiva, Valeria},
  """,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  _dataset = {}
 
  # Do not process the subsets we won't use
@@ -50,7 +50,7 @@ class TERRa(AbsTaskPairClassification):
  **_terra_metadata,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
 
@@ -71,6 +71,6 @@ class TERRaV2(AbsTaskPairClassification):
  **_terra_metadata,
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -37,6 +37,6 @@ class SprintDuplicateQuestionsPCVN(AbsTaskPairClassification):
  adapted_from=["SprintDuplicateQuestions"],
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -37,6 +37,6 @@ class TwitterSemEval2015PCVN(AbsTaskPairClassification):
  adapted_from=["TwitterSemEval2015"],
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -37,6 +37,6 @@ class TwitterURLCorpusPC(AbsTaskPairClassification):
  adapted_from=["TwitterURLCorpus"],
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -37,7 +37,7 @@ class Ocnli(AbsTaskPairClassification):
  prompt="Retrieve semantically similar text.",
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
 
@@ -112,6 +112,6 @@ Lan, Zhenzhong},
  prompt="Retrieve semantically similar text.",
  )
 
- def dataset_transform(self):
+ def dataset_transform(self, num_proc: int = 1):
  self.dataset = self.dataset.rename_column("sent1", "sentence1")
  self.dataset = self.dataset.rename_column("sent2", "sentence2")
@@ -37,7 +37,7 @@ class SadeemQuestionRetrieval(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -53,7 +53,7 @@ class CodeEditSearchRetrieval(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -51,7 +51,7 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
  **common_args,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  """Load dataset from HuggingFace hub"""
  if self.data_loaded:
  return
@@ -108,7 +108,7 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
  **common_args,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  """Load dataset from HuggingFace hub"""
  if self.data_loaded:
  return
@@ -168,7 +168,7 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
  **common_args,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  """Load dataset from HuggingFace hub"""
  if self.data_loaded:
  return
@@ -225,7 +225,7 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
  **common_args,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  """Load dataset from HuggingFace hub"""
  if self.data_loaded:
  return
@@ -99,7 +99,7 @@ class CodeSearchNetCCRetrieval(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -97,7 +97,7 @@ class COIRCodeSearchNetRetrieval(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -34,7 +34,7 @@ class DS1000Retrieval(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -37,7 +37,7 @@ class FreshStackRetrieval(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -34,7 +34,7 @@ class HumanEvalRetrieval(AbsTaskRetrieval):
  }""",
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -34,7 +34,7 @@ class MBPPRetrieval(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
@@ -36,7 +36,7 @@ class WikiSQLRetrieval(AbsTaskRetrieval):
  """,
  )
 
- def load_data(self) -> None:
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
  if self.data_loaded:
  return
 
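
The recurring change across these hunks is that the task hooks now accept parallelism arguments: dataset_transform(self) becomes dataset_transform(self, num_proc: int = 1), and load_data(self) -> None becomes load_data(self, num_proc: int = 1, **kwargs) -> None. The following is a minimal, hypothetical sketch of a task subclass written against the new signatures; the class name, the import path (taken from mteb/abstasks/pair_classification.py in the file list above), and the idea of forwarding num_proc to Hugging Face datasets operations such as Dataset.map are illustrative assumptions, not code shipped in this release.

# Hypothetical example only; not part of mteb 2.7.5.
from mteb.abstasks.pair_classification import AbsTaskPairClassification


class ExamplePairClassification(AbsTaskPairClassification):
    def load_data(self, num_proc: int = 1, **kwargs) -> None:
        if self.data_loaded:
            return
        # ... populate self.dataset here (e.g. via datasets.load_dataset) ...
        self.dataset_transform(num_proc=num_proc)
        self.data_loaded = True

    def dataset_transform(self, num_proc: int = 1):
        # num_proc could be forwarded to datasets operations that accept it,
        # e.g. self.dataset.map(..., num_proc=num_proc); here only columns
        # are renamed, mirroring the tasks shown in the diff.
        self.dataset = self.dataset.rename_column("sent1", "sentence1")
        self.dataset = self.dataset.rename_column("sent2", "sentence2")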