mteb 2.7.3__py3-none-any.whl → 2.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (434)
  1. mteb/_create_dataloaders.py +47 -5
  2. mteb/_evaluators/any_sts_evaluator.py +2 -0
  3. mteb/_evaluators/clustering_evaluator.py +2 -0
  4. mteb/_evaluators/evaluator.py +2 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -1
  6. mteb/_evaluators/pair_classification_evaluator.py +3 -0
  7. mteb/_evaluators/retrieval_evaluator.py +3 -0
  8. mteb/_evaluators/sklearn_evaluator.py +6 -1
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +2 -0
  10. mteb/_evaluators/text/summarization_evaluator.py +2 -0
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -0
  12. mteb/abstasks/abstask.py +31 -12
  13. mteb/abstasks/classification.py +10 -3
  14. mteb/abstasks/clustering.py +6 -2
  15. mteb/abstasks/clustering_legacy.py +8 -2
  16. mteb/abstasks/image/image_text_pair_classification.py +6 -2
  17. mteb/abstasks/multilabel_classification.py +2 -0
  18. mteb/abstasks/pair_classification.py +8 -2
  19. mteb/abstasks/retrieval.py +27 -12
  20. mteb/abstasks/retrieval_dataset_loaders.py +29 -19
  21. mteb/abstasks/sts.py +10 -3
  22. mteb/abstasks/text/bitext_mining.py +9 -5
  23. mteb/abstasks/text/reranking.py +2 -2
  24. mteb/abstasks/text/summarization.py +2 -1
  25. mteb/abstasks/zeroshot_classification.py +8 -2
  26. mteb/benchmarks/benchmarks/__init__.py +2 -0
  27. mteb/benchmarks/benchmarks/benchmarks.py +41 -2
  28. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  29. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  30. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  31. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  32. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  33. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  34. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  35. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  36. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  37. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  38. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  39. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  40. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  41. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  42. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  43. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  44. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  45. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  46. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  47. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  48. mteb/evaluate.py +10 -2
  49. mteb/models/model_implementations/align_models.py +1 -0
  50. mteb/models/model_implementations/amazon_models.py +1 -0
  51. mteb/models/model_implementations/andersborges.py +2 -0
  52. mteb/models/model_implementations/ara_models.py +1 -0
  53. mteb/models/model_implementations/arctic_models.py +8 -0
  54. mteb/models/model_implementations/b1ade_models.py +1 -0
  55. mteb/models/model_implementations/bedrock_models.py +4 -0
  56. mteb/models/model_implementations/bge_models.py +40 -1
  57. mteb/models/model_implementations/bica_model.py +1 -0
  58. mteb/models/model_implementations/blip2_models.py +2 -0
  59. mteb/models/model_implementations/blip_models.py +8 -0
  60. mteb/models/model_implementations/bm25.py +10 -5
  61. mteb/models/model_implementations/bmretriever_models.py +4 -0
  62. mteb/models/model_implementations/cadet_models.py +1 -0
  63. mteb/models/model_implementations/cde_models.py +2 -0
  64. mteb/models/model_implementations/clip_models.py +3 -0
  65. mteb/models/model_implementations/clips_models.py +3 -0
  66. mteb/models/model_implementations/codefuse_models.py +5 -0
  67. mteb/models/model_implementations/codesage_models.py +3 -0
  68. mteb/models/model_implementations/cohere_models.py +4 -0
  69. mteb/models/model_implementations/cohere_v.py +5 -0
  70. mteb/models/model_implementations/colpali_models.py +3 -0
  71. mteb/models/model_implementations/colqwen_models.py +7 -0
  72. mteb/models/model_implementations/colsmol_models.py +2 -0
  73. mteb/models/model_implementations/conan_models.py +1 -0
  74. mteb/models/model_implementations/dino_models.py +19 -0
  75. mteb/models/model_implementations/e5_instruct.py +4 -0
  76. mteb/models/model_implementations/e5_models.py +9 -0
  77. mteb/models/model_implementations/e5_v.py +1 -0
  78. mteb/models/model_implementations/eagerworks_models.py +1 -0
  79. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  80. mteb/models/model_implementations/en_code_retriever.py +1 -0
  81. mteb/models/model_implementations/euler_models.py +1 -0
  82. mteb/models/model_implementations/evaclip_models.py +4 -0
  83. mteb/models/model_implementations/fa_models.py +9 -0
  84. mteb/models/model_implementations/facebookai.py +2 -0
  85. mteb/models/model_implementations/geogpt_models.py +1 -0
  86. mteb/models/model_implementations/gme_v_models.py +2 -0
  87. mteb/models/model_implementations/google_models.py +5 -0
  88. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -0
  89. mteb/models/model_implementations/gritlm_models.py +2 -0
  90. mteb/models/model_implementations/gte_models.py +9 -0
  91. mteb/models/model_implementations/hinvec_models.py +1 -0
  92. mteb/models/model_implementations/human.py +1 -0
  93. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  94. mteb/models/model_implementations/inf_models.py +2 -0
  95. mteb/models/model_implementations/jasper_models.py +2 -0
  96. mteb/models/model_implementations/jina_clip.py +1 -0
  97. mteb/models/model_implementations/jina_models.py +7 -0
  98. mteb/models/model_implementations/kalm_models.py +6 -0
  99. mteb/models/model_implementations/kblab.py +1 -0
  100. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  101. mteb/models/model_implementations/kfst.py +1 -0
  102. mteb/models/model_implementations/kowshik24_models.py +1 -0
  103. mteb/models/model_implementations/lens_models.py +2 -0
  104. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  105. mteb/models/model_implementations/linq_models.py +1 -0
  106. mteb/models/model_implementations/listconranker.py +1 -0
  107. mteb/models/model_implementations/llm2clip_models.py +3 -0
  108. mteb/models/model_implementations/llm2vec_models.py +8 -0
  109. mteb/models/model_implementations/mcinext_models.py +3 -0
  110. mteb/models/model_implementations/mdbr_models.py +2 -0
  111. mteb/models/model_implementations/misc_models.py +63 -0
  112. mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
  113. mteb/models/model_implementations/mme5_models.py +2 -1
  114. mteb/models/model_implementations/moco_models.py +2 -0
  115. mteb/models/model_implementations/mod_models.py +1 -0
  116. mteb/models/model_implementations/model2vec_models.py +13 -0
  117. mteb/models/model_implementations/moka_models.py +3 -0
  118. mteb/models/model_implementations/nbailab.py +3 -0
  119. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  120. mteb/models/model_implementations/nomic_models.py +6 -0
  121. mteb/models/model_implementations/nomic_models_vision.py +1 -0
  122. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -0
  123. mteb/models/model_implementations/nvidia_models.py +3 -0
  124. mteb/models/model_implementations/octen_models.py +2 -0
  125. mteb/models/model_implementations/openai_models.py +5 -0
  126. mteb/models/model_implementations/openclip_models.py +8 -0
  127. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  128. mteb/models/model_implementations/ops_moa_models.py +2 -0
  129. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
  130. mteb/models/model_implementations/pawan_models.py +1 -0
  131. mteb/models/model_implementations/piccolo_models.py +2 -0
  132. mteb/models/model_implementations/promptriever_models.py +4 -0
  133. mteb/models/model_implementations/pylate_models.py +13 -0
  134. mteb/models/model_implementations/qodo_models.py +2 -0
  135. mteb/models/model_implementations/qtack_models.py +1 -0
  136. mteb/models/model_implementations/qwen3_models.py +3 -0
  137. mteb/models/model_implementations/qzhou_models.py +2 -0
  138. mteb/models/model_implementations/rasgaard_models.py +1 -0
  139. mteb/models/model_implementations/reasonir_model.py +65 -0
  140. mteb/models/model_implementations/repllama_models.py +2 -0
  141. mteb/models/model_implementations/rerankers_custom.py +3 -0
  142. mteb/models/model_implementations/rerankers_monot5_based.py +14 -0
  143. mteb/models/model_implementations/richinfoai_models.py +1 -0
  144. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  145. mteb/models/model_implementations/ruri_models.py +10 -0
  146. mteb/models/model_implementations/salesforce_models.py +3 -0
  147. mteb/models/model_implementations/samilpwc_models.py +1 -0
  148. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  149. mteb/models/model_implementations/searchmap_models.py +1 -0
  150. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -0
  151. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +1 -0
  152. mteb/models/model_implementations/seed_models.py +1 -0
  153. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  154. mteb/models/model_implementations/shuu_model.py +1 -0
  155. mteb/models/model_implementations/siglip_models.py +10 -0
  156. mteb/models/model_implementations/sonar_models.py +2 -1
  157. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  158. mteb/models/model_implementations/stella_models.py +6 -0
  159. mteb/models/model_implementations/tarka_models.py +2 -0
  160. mteb/models/model_implementations/text2vec_models.py +3 -0
  161. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  162. mteb/models/model_implementations/uae_models.py +1 -0
  163. mteb/models/model_implementations/vdr_models.py +1 -0
  164. mteb/models/model_implementations/vi_vn_models.py +6 -0
  165. mteb/models/model_implementations/vista_models.py +2 -0
  166. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  167. mteb/models/model_implementations/voyage_models.py +15 -0
  168. mteb/models/model_implementations/voyage_v.py +1 -0
  169. mteb/models/model_implementations/xyz_models.py +1 -0
  170. mteb/models/model_implementations/youtu_models.py +1 -0
  171. mteb/models/model_implementations/yuan_models.py +1 -0
  172. mteb/models/model_implementations/yuan_models_en.py +1 -0
  173. mteb/models/model_meta.py +35 -2
  174. mteb/models/models_protocols.py +4 -0
  175. mteb/models/search_wrappers.py +12 -0
  176. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  177. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  178. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  179. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  180. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  181. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  182. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  183. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  184. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  185. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  186. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  187. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  188. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  189. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  190. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  191. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  192. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  193. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  194. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  195. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  196. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  197. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  198. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  199. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  200. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  201. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  202. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  203. mteb/tasks/classification/est/estonian_valence.py +1 -1
  204. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  205. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  206. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  207. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  208. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  209. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  210. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  211. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  212. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  213. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  214. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  215. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  216. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  217. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  218. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  219. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  220. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  221. mteb/tasks/classification/kor/klue_tc.py +2 -2
  222. mteb/tasks/classification/kor/kor_fin.py +1 -1
  223. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  224. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  225. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  226. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  227. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  228. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  229. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  230. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  231. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  232. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  233. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  234. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  235. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  236. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  237. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  238. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  239. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  240. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  241. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  242. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  243. mteb/tasks/classification/ron/moroco.py +1 -1
  244. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  245. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  246. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  247. mteb/tasks/classification/rus/headline_classification.py +2 -2
  248. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  249. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  250. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  251. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  252. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  253. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  254. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  255. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  256. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  257. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  258. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  259. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  260. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  261. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  262. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  263. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  264. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  265. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  266. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  267. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  268. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  269. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  270. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  271. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  272. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  273. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  274. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  275. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  276. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  277. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  278. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  279. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  280. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  281. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  282. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  283. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  284. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  285. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  286. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  287. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  288. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  289. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  290. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  291. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  292. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  293. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  294. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  295. mteb/tasks/clustering/nob/snl_clustering.py +1 -1
  296. mteb/tasks/clustering/nob/vg_clustering.py +1 -1
  297. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  298. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  299. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  300. mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
  301. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  302. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  303. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  304. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  305. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  306. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  307. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  308. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  309. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  310. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  311. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  312. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  313. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  314. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
  315. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  316. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  317. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  318. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  319. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  320. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  321. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  322. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  323. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  324. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  325. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  326. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  327. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  328. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  329. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  330. mteb/tasks/pair_classification/rus/terra.py +2 -2
  331. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  332. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  333. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  334. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  335. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  336. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  337. mteb/tasks/retrieval/code/code_rag.py +4 -4
  338. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  339. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  340. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  341. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  342. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  343. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  344. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  345. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  346. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
  347. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
  348. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  349. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  350. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  351. mteb/tasks/retrieval/eng/__init__.py +42 -0
  352. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  353. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  354. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  355. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  356. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  357. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  358. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  359. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  360. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  361. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  362. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  363. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  364. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  365. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  366. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  367. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  368. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  369. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  370. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  371. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  372. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  373. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  374. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  375. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  376. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  377. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  378. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  379. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  380. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  381. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  382. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  383. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  384. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  385. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  386. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  387. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  388. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  389. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  390. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  391. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  392. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  393. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  394. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  395. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  396. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  397. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  398. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  399. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
  400. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  401. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  402. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  403. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  404. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  405. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  406. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  407. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  408. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  409. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  410. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  411. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  412. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  413. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  414. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  415. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  416. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  417. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  418. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  419. mteb/tasks/retrieval/nob/norquad.py +1 -1
  420. mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
  421. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  422. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  423. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  424. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  425. mteb/tasks/sts/kor/klue_sts.py +1 -1
  426. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  427. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  428. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  429. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/METADATA +1 -1
  430. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/RECORD +434 -413
  431. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/WHEEL +0 -0
  432. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/entry_points.txt +0 -0
  433. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/licenses/LICENSE +0 -0
  434. {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/top_level.txt +0 -0
@@ -29,7 +29,7 @@ class FrenchBookReviews(AbsTaskClassification):
29
29
  superseded_by="FrenchBookReviews.v2",
30
30
  )
31
31
 
32
- def dataset_transform(self):
32
+ def dataset_transform(self, num_proc: int = 1):
33
33
  self.dataset = self.dataset.rename_columns({"reader_review": "text"})
34
34
  self.dataset = self.stratified_subsampling(
35
35
  self.dataset, seed=self.seed, splits=["train"]
@@ -63,7 +63,7 @@ class FrenchBookReviewsV2(AbsTaskClassification):
63
63
  adapted_from=["FrenchBookReviews"],
64
64
  )
65
65
 
66
- def dataset_transform(self):
66
+ def dataset_transform(self, num_proc: int = 1):
67
67
  self.dataset = self.stratified_subsampling(
68
68
  self.dataset, seed=self.seed, splits=["train"]
69
69
  )
@@ -35,7 +35,7 @@ class MovieReviewSentimentClassification(AbsTaskClassification):
35
35
  superseded_by="MovieReviewSentimentClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self):
38
+ def dataset_transform(self, num_proc: int = 1):
39
39
  self.dataset = self.dataset.rename_column("review", "text")
40
40
  self.dataset = self.stratified_subsampling(
41
41
  self.dataset, seed=self.seed, splits=["validation", "test"]
@@ -75,7 +75,7 @@ class MovieReviewSentimentClassificationV2(AbsTaskClassification):
75
75
  adapted_from=["MovieReviewSentimentClassification"],
76
76
  )
77
77
 
78
- def dataset_transform(self):
78
+ def dataset_transform(self, num_proc: int = 1):
79
79
  self.dataset = self.stratified_subsampling(
80
80
  self.dataset, seed=self.seed, splits=["validation", "test"]
81
81
  )
@@ -28,7 +28,7 @@ class GujaratiNewsClassification(AbsTaskClassification):
28
28
  superseded_by="GujaratiNewsClassification.v2",
29
29
  )
30
30
 
31
- def dataset_transform(self):
31
+ def dataset_transform(self, num_proc: int = 1):
32
32
  self.dataset = self.dataset.rename_column("headline", "text")
33
33
 
34
34
 
@@ -101,7 +101,7 @@ Stent, Amanda},
101
101
  adapted_from=["HindiDiscourseClassification"],
102
102
  )
103
103
 
104
- def dataset_transform(self):
104
+ def dataset_transform(self, num_proc: int = 1):
105
105
  self.dataset = self.stratified_subsampling(
106
106
  self.dataset, seed=self.seed, splits=["train"]
107
107
  )
@@ -37,7 +37,7 @@ class SentimentAnalysisHindi(AbsTaskClassification):
37
37
  superseded_by="SentimentAnalysisHindi.v2",
38
38
  )
39
39
 
40
- def dataset_transform(self):
40
+ def dataset_transform(self, num_proc: int = 1):
41
41
  self.dataset = self.stratified_subsampling(
42
42
  self.dataset, seed=self.seed, splits=["train"]
43
43
  )
@@ -41,7 +41,7 @@ class IndonesianIdClickbaitClassification(AbsTaskClassification):
41
41
  superseded_by="IndonesianIdClickbaitClassification.v2",
42
42
  )
43
43
 
44
- def dataset_transform(self):
44
+ def dataset_transform(self, num_proc: int = 1):
45
45
  self.dataset = self.dataset.remove_columns(["label"]).rename_columns(
46
46
  {"title": "text", "label_score": "label"}
47
47
  )
@@ -89,7 +89,7 @@ class IndonesianIdClickbaitClassificationV2(AbsTaskClassification):
89
89
  adapted_from=["IndonesianIdClickbaitClassification"],
90
90
  )
91
91
 
92
- def dataset_transform(self):
92
+ def dataset_transform(self, num_proc: int = 1):
93
93
  self.dataset = self.stratified_subsampling(
94
94
  self.dataset, seed=self.seed, splits=["train"]
95
95
  )
@@ -55,7 +55,7 @@ Purwarianti, Ayu},
55
55
  superseded_by="IndonesianMongabayConservationClassification.v2",
56
56
  )
57
57
 
58
- def dataset_transform(self):
58
+ def dataset_transform(self, num_proc: int = 1):
59
59
  splits = self.metadata.eval_splits
60
60
  class_labels = ["positif", "netral", "negatif"]
61
61
 
@@ -36,7 +36,7 @@ class DadoEvalCoarseClassification(AbsTaskClassification):
36
36
  """,
37
37
  )
38
38
 
39
- def dataset_transform(self):
39
+ def dataset_transform(self, num_proc: int = 1):
40
40
  self.dataset = self.dataset.rename_column("class", "label")
41
41
  unused_cols = [
42
42
  col
@@ -44,7 +44,7 @@ class ItaCaseholdClassification(AbsTaskClassification):
44
44
  """,
45
45
  )
46
46
 
47
- def dataset_transform(self):
47
+ def dataset_transform(self, num_proc: int = 1):
48
48
  self.dataset = self.dataset.rename_columns(
49
49
  {"summary": "text", "materia": "label"}
50
50
  )
@@ -36,7 +36,7 @@ class SardiStanceClassification(AbsTaskClassification):
36
36
  """,
37
37
  )
38
38
 
39
- def dataset_transform(self):
39
+ def dataset_transform(self, num_proc: int = 1):
40
40
  unused_cols = [
41
41
  col
42
42
  for col in self.dataset["test"].column_names
@@ -73,7 +73,7 @@ class JavaneseIMDBClassificationV2(AbsTaskClassification):
73
73
  adapted_from=["JavaneseIMDBClassification"],
74
74
  )
75
75
 
76
- def dataset_transform(self):
76
+ def dataset_transform(self, num_proc: int = 1):
77
77
  self.dataset = self.stratified_subsampling(
78
78
  self.dataset, seed=self.seed, splits=["test"]
79
79
  )
@@ -108,7 +108,7 @@ Zhou, Yichao},
108
108
  adapted_from=["WRIMEClassification"],
109
109
  )
110
110
 
111
- def dataset_transform(self):
111
+ def dataset_transform(self, num_proc: int = 1):
112
112
  self.dataset = self.stratified_subsampling(
113
113
  self.dataset, seed=self.seed, splits=["test"]
114
114
  )
@@ -35,7 +35,7 @@ class KannadaNewsClassification(AbsTaskClassification):
35
35
  superseded_by="KannadaNewsClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self):
38
+ def dataset_transform(self, num_proc: int = 1):
39
39
  self.dataset = self.dataset.rename_column("headline", "text")
40
40
  self.dataset = self.stratified_subsampling(
41
41
  self.dataset, seed=self.seed, splits=["train"]
@@ -75,7 +75,7 @@ class KannadaNewsClassificationV2(AbsTaskClassification):
75
75
  adapted_from=["KannadaNewsClassification"],
76
76
  )
77
77
 
78
- def dataset_transform(self):
78
+ def dataset_transform(self, num_proc: int = 1):
79
79
  self.dataset = self.stratified_subsampling(
80
80
  self.dataset, seed=self.seed, splits=["train"]
81
81
  )
@@ -38,7 +38,7 @@ class KlueTC(AbsTaskClassification):
38
38
  superseded_by="KLUE-TC.v2",
39
39
  )
40
40
 
41
- def dataset_transform(self):
41
+ def dataset_transform(self, num_proc: int = 1):
42
42
  def id2str(example):
43
43
  return {"label": label_feature.int2str(example["label_id"])}
44
44
 
@@ -90,7 +90,7 @@ class KlueTCV2(AbsTaskClassification):
90
90
  adapted_from=["KlueTC"],
91
91
  )
92
92
 
93
- def dataset_transform(self):
93
+ def dataset_transform(self, num_proc: int = 1):
94
94
  self.dataset = self.stratified_subsampling(
95
95
  self.dataset, seed=self.seed, splits=["validation"]
96
96
  )
@@ -37,7 +37,7 @@ class KorFin(AbsTaskClassification):
37
37
  """,
38
38
  )
39
39
 
40
- def dataset_transform(self):
40
+ def dataset_transform(self, num_proc: int = 1):
41
41
  self.dataset = self.dataset.rename_columns(
42
42
  {"SRC": "text", "SENTIMENT": "label"}
43
43
  ).remove_columns(["SID", "TYPE", "ASPECT"])
@@ -73,7 +73,7 @@ class KorHateClassificationV2(AbsTaskClassification):
73
73
  adapted_from=["KorHateClassification"],
74
74
  )
75
75
 
76
- def dataset_transform(self):
76
+ def dataset_transform(self, num_proc: int = 1):
77
77
  self.dataset = self.stratified_subsampling(
78
78
  self.dataset, seed=self.seed, splits=["train"]
79
79
  )
@@ -73,7 +73,7 @@ class KorSarcasmClassificationV2(AbsTaskClassification):
73
73
  adapted_from=["KorSarcasmClassification"],
74
74
  )
75
75
 
76
- def dataset_transform(self):
76
+ def dataset_transform(self, num_proc: int = 1):
77
77
  self.dataset = self.stratified_subsampling(
78
78
  self.dataset, seed=self.seed, splits=["train"]
79
79
  )
@@ -35,7 +35,7 @@ class MalayalamNewsClassification(AbsTaskClassification):
35
35
  superseded_by="MalayalamNewsClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self):
38
+ def dataset_transform(self, num_proc: int = 1):
39
39
  self.dataset = self.dataset.rename_columns({"headings": "text"})
40
40
 
41
41
 
@@ -35,7 +35,7 @@ class MarathiNewsClassification(AbsTaskClassification):
35
35
  superseded_by="MarathiNewsClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self):
38
+ def dataset_transform(self, num_proc: int = 1):
39
39
  self.dataset = self.dataset.rename_columns({"headline": "text"})
40
40
  self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
41
41
 
@@ -43,7 +43,7 @@ class AfriSentiLangClassification(AbsTaskClassification):
43
43
 
44
44
  samples_per_label = 32
45
45
 
46
- def dataset_transform(self):
46
+ def dataset_transform(self, num_proc: int = 1):
47
47
  self.dataset = self.dataset.rename_column("tweet", "text")
48
48
  self.dataset = self.stratified_subsampling(
49
49
  self.dataset, seed=self.seed, splits=["test"]
@@ -60,7 +60,7 @@ Piperidis, Stelios},
60
60
  """,
61
61
  )
62
62
 
63
- def dataset_transform(self):
63
+ def dataset_transform(self, num_proc: int = 1):
64
64
  for lang in self.dataset.keys():
65
65
  self.dataset[lang] = self.dataset[lang].rename_columns(
66
66
  {"TWEET": "text", "LABEL": "label"}
@@ -44,7 +44,7 @@ class CyrillicTurkicLangClassification(AbsTaskClassification):
44
44
  """,
45
45
  )
46
46
 
47
- def dataset_transform(self):
47
+ def dataset_transform(self, num_proc: int = 1):
48
48
  self.dataset = self.stratified_subsampling(
49
49
  self.dataset, seed=self.seed, splits=["test"]
50
50
  )
@@ -45,7 +45,7 @@ class IndicNLPNewsClassification(AbsTaskClassification):
45
45
  """,
46
46
  )
47
47
 
48
- def dataset_transform(self):
48
+ def dataset_transform(self, num_proc: int = 1):
49
49
  for lang in self.hf_subsets:
50
50
  self.dataset[lang] = self.dataset[lang].rename_columns(
51
51
  {"news": "text", "class": "label"}
@@ -55,7 +55,7 @@ class MasakhaNEWSClassification(AbsTaskClassification):
55
55
  """,
56
56
  )
57
57
 
58
- def dataset_transform(self):
58
+ def dataset_transform(self, num_proc: int = 1):
59
59
  for lang in self.dataset.keys():
60
60
  self.dataset[lang] = self.dataset[lang].rename_columns(
61
61
  {"category": "label"}
@@ -86,7 +86,7 @@ Talat, Zeerak},
86
86
  """,
87
87
  )
88
88
 
89
- def dataset_transform(self):
89
+ def dataset_transform(self, num_proc: int = 1):
90
90
  # for each language perform some transforms
91
91
  for lang in self.dataset.keys():
92
92
  _dataset = self.dataset[lang]
@@ -89,7 +89,7 @@ Vylomova, Ekaterina},
89
89
  """,
90
90
  )
91
91
 
92
- def dataset_transform(self):
92
+ def dataset_transform(self, num_proc: int = 1):
93
93
  # create a train set from the test set for Welsh language (cym)
94
94
  lang = "cym"
95
95
  if lang in self.dataset.keys():
@@ -54,7 +54,7 @@ Fishel, Mark},
54
54
 
55
55
  samples_per_label = 32
56
56
 
57
- def dataset_transform(self):
57
+ def dataset_transform(self, num_proc: int = 1):
58
58
  for lang in self.dataset.keys():
59
59
  # convert label to a 0/1 label
60
60
  labels = self.dataset[lang]["train"]["label"]
@@ -234,7 +234,7 @@ class SIB200Classification(AbsTaskClassification):
234
234
  """,
235
235
  )
236
236
 
237
- def dataset_transform(self):
237
+ def dataset_transform(self, num_proc: int = 1):
238
238
  for lang in self.dataset.keys():
239
239
  self.dataset[lang] = self.dataset[lang].class_encode_column("category")
240
240
  self.dataset[lang] = self.dataset[lang].rename_columns(
@@ -49,7 +49,7 @@ class TurkicClassification(AbsTaskClassification):
49
49
  )
50
50
  return dataset_lang["train"]
51
51
 
52
- def load_data(self) -> None:
52
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
53
53
  """Load dataset from HuggingFace hub"""
54
54
  if self.data_loaded:
55
55
  return
@@ -53,7 +53,7 @@ Camacho-Collados, Jose},
53
53
  """,
54
54
  )
55
55
 
56
- def dataset_transform(self):
56
+ def dataset_transform(self, num_proc: int = 1):
57
57
  for lang in self.hf_subsets:
58
58
  self.dataset[lang] = self.stratified_subsampling(
59
59
  self.dataset[lang], n_samples=256, seed=self.seed, splits=["test"]
@@ -47,7 +47,7 @@ Tan, Liling},
47
47
  superseded_by="NepaliNewsClassification.v2",
48
48
  )
49
49
 
50
- def dataset_transform(self):
50
+ def dataset_transform(self, num_proc: int = 1):
51
51
  self.dataset = self.dataset.rename_column("paras", "text")
52
52
  self.dataset = self.stratified_subsampling(
53
53
  self.dataset, seed=self.seed, splits=["train"]
@@ -99,7 +99,7 @@ Tan, Liling},
99
99
  adapted_from=["NepaliNewsClassification"],
100
100
  )
101
101
 
102
- def dataset_transform(self):
102
+ def dataset_transform(self, num_proc: int = 1):
103
103
  self.dataset = self.stratified_subsampling(
104
104
  self.dataset, seed=self.seed, splits=["train"]
105
105
  )
@@ -32,7 +32,7 @@ class DutchSarcasticHeadlinesClassification(AbsTaskClassification):
32
32
  },
33
33
  )
34
34
 
35
- def dataset_transform(self):
35
+ def dataset_transform(self, num_proc: int = 1):
36
36
  for split in self.dataset:
37
37
  self.dataset[split] = self.dataset[split].rename_columns(
38
38
  {"headline": "text", "is_sarcastic": "label"}
@@ -42,7 +42,7 @@ class VaccinChatNLClassification(AbsTaskClassification):
42
42
  },
43
43
  )
44
44
 
45
- def dataset_transform(self):
45
+ def dataset_transform(self, num_proc: int = 1):
46
46
  for split in self.dataset:
47
47
  self.dataset[split] = self.dataset[split].rename_columns(
48
48
  {"sentence1": "text"}
@@ -35,7 +35,7 @@ class OdiaNewsClassification(AbsTaskClassification):
35
35
  superseded_by="OdiaNewsClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self):
38
+ def dataset_transform(self, num_proc: int = 1):
39
39
  self.dataset = self.dataset.rename_columns({"headings": "text"})
40
40
  self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
41
41
 
@@ -73,5 +73,5 @@ class OdiaNewsClassificationV2(AbsTaskClassification):
73
73
  adapted_from=["OdiaNewsClassification"],
74
74
  )
75
75
 
76
- def dataset_transform(self):
76
+ def dataset_transform(self, num_proc: int = 1):
77
77
  self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
@@ -34,7 +34,7 @@ class PunjabiNewsClassification(AbsTaskClassification):
34
34
  """,
35
35
  )
36
36
 
37
- def dataset_transform(self):
37
+ def dataset_transform(self, num_proc: int = 1):
38
38
  self.dataset = self.dataset.rename_columns(
39
39
  {"article": "text", "is_about_politics": "label"}
40
40
  )
@@ -77,7 +77,7 @@ class MorocoV2(AbsTaskClassification):
77
77
  adapted_from=["Moroco"],
78
78
  )
79
79
 
80
- def dataset_transform(self):
80
+ def dataset_transform(self, num_proc: int = 1):
81
81
  self.dataset = self.stratified_subsampling(
82
82
  self.dataset, seed=self.seed, splits=["test"]
83
83
  )
@@ -69,7 +69,7 @@ class RomanianReviewsSentimentV2(AbsTaskClassification):
69
69
  adapted_from=["RomanianReviewsSentiment"],
70
70
  )
71
71
 
72
- def dataset_transform(self):
72
+ def dataset_transform(self, num_proc: int = 1):
73
73
  self.dataset = self.stratified_subsampling(
74
74
  self.dataset, seed=self.seed, splits=["test"]
75
75
  )
@@ -71,7 +71,7 @@ class RomanianSentimentClassificationV2(AbsTaskClassification):
71
71
  adapted_from=["RomanianSentimentClassification"],
72
72
  )
73
73
 
74
- def dataset_transform(self):
74
+ def dataset_transform(self, num_proc: int = 1):
75
75
  self.dataset = self.stratified_subsampling(
76
76
  self.dataset, seed=self.seed, splits=["test"]
77
77
  )
@@ -57,7 +57,7 @@ class GeoreviewClassificationV2(AbsTaskClassification):
57
57
  adapted_from=["GeoreviewClassification"],
58
58
  )
59
59
 
60
- def dataset_transform(self):
60
+ def dataset_transform(self, num_proc: int = 1):
61
61
  self.dataset = self.stratified_subsampling(
62
62
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
63
63
  )
@@ -53,7 +53,7 @@ Oda, Yusuke},
53
53
  superseded_by="HeadlineClassification.v2",
54
54
  )
55
55
 
56
- def dataset_transform(self):
56
+ def dataset_transform(self, num_proc: int = 1):
57
57
  self.dataset = self.stratified_subsampling(
58
58
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
59
59
  )
@@ -110,7 +110,7 @@ Oda, Yusuke},
110
110
  adapted_from=["HeadlineClassification"],
111
111
  )
112
112
 
113
- def dataset_transform(self):
113
+ def dataset_transform(self, num_proc: int = 1):
114
114
  self.dataset = self.stratified_subsampling(
115
115
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
116
116
  )
@@ -57,7 +57,7 @@ Robnik-{\v{S}}ikonja, Marko},
57
57
  superseded_by="InappropriatenessClassification.v2",
58
58
  )
59
59
 
60
- def dataset_transform(self):
60
+ def dataset_transform(self, num_proc: int = 1):
61
61
  self.dataset = self.stratified_subsampling(
62
62
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
63
63
  )
@@ -118,7 +118,7 @@ Robnik-{\v{S}}ikonja, Marko},
118
118
  adapted_from=["InappropriatenessClassification"],
119
119
  )
120
120
 
121
- def dataset_transform(self):
121
+ def dataset_transform(self, num_proc: int = 1):
122
122
  self.dataset = self.stratified_subsampling(
123
123
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
124
124
  )
@@ -42,7 +42,7 @@ class RuReviewsClassification(AbsTaskClassification):
42
42
  superseded_by="RuReviewsClassification.v2",
43
43
  )
44
44
 
45
- def dataset_transform(self):
45
+ def dataset_transform(self, num_proc: int = 1):
46
46
  self.dataset = self.stratified_subsampling(
47
47
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
48
48
  )
@@ -88,7 +88,7 @@ class RuReviewsClassificationV2(AbsTaskClassification):
88
88
  adapted_from=["RuReviewsClassification"],
89
89
  )
90
90
 
91
- def dataset_transform(self):
91
+ def dataset_transform(self, num_proc: int = 1):
92
92
  self.dataset = self.stratified_subsampling(
93
93
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
94
94
  )
@@ -29,7 +29,7 @@ class RuSciBenchGRNTIClassification(AbsTaskClassification):
29
29
  superseded_by="RuSciBenchGRNTIClassification.v2",
30
30
  )
31
31
 
32
- def dataset_transform(self):
32
+ def dataset_transform(self, num_proc: int = 1):
33
33
  self.dataset = self.stratified_subsampling(
34
34
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
35
35
  )
@@ -29,7 +29,7 @@ class RuSciBenchOECDClassification(AbsTaskClassification):
29
29
  superseded_by="RuSciBenchOECDClassification.v2",
30
30
  )
31
31
 
32
- def dataset_transform(self):
32
+ def dataset_transform(self, num_proc: int = 1):
33
33
  self.dataset = self.stratified_subsampling(
34
34
  self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
35
35
  )
@@ -28,7 +28,7 @@ class RuToxicOKMLCUPClassification(AbsTaskClassification):
28
28
  superseded_by="RuToxicOKMLCUPClassification.v2",
29
29
  )
30
30
 
31
- def dataset_transform(self):
31
+ def dataset_transform(self, num_proc: int = 1):
32
32
  self.dataset = self.dataset.rename_column("toxic", "label")
33
33
 
34
34
 
@@ -46,5 +46,5 @@ Tan, Liling},
46
46
  """,
47
47
  )
48
48
 
49
- def dataset_transform(self):
49
+ def dataset_transform(self, num_proc: int = 1):
50
50
  self.dataset = self.dataset.rename_columns({"Sloka": "text", "Class": "label"})
@@ -42,7 +42,7 @@ class SinhalaNewsClassification(AbsTaskClassification):
42
42
  superseded_by="SinhalaNewsClassification.v2",
43
43
  )
44
44
 
45
- def dataset_transform(self):
45
+ def dataset_transform(self, num_proc: int = 1):
46
46
  self.dataset = self.dataset.rename_columns(
47
47
  {"comments": "text", "labels": "label"}
48
48
  )
@@ -91,7 +91,7 @@ class SinhalaNewsClassificationV2(AbsTaskClassification):
91
91
  adapted_from=["SinhalaNewsClassification"],
92
92
  )
93
93
 
94
- def dataset_transform(self):
94
+ def dataset_transform(self, num_proc: int = 1):
95
95
  self.dataset = self.stratified_subsampling(
96
96
  self.dataset, seed=self.seed, splits=["train"]
97
97
  )
@@ -35,7 +35,7 @@ class SinhalaNewsSourceClassification(AbsTaskClassification):
35
35
  superseded_by="SinhalaNewsSourceClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self):
38
+ def dataset_transform(self, num_proc: int = 1):
39
39
  self.dataset = self.dataset.rename_column("comment", "text")
40
40
  self.dataset = self.stratified_subsampling(
41
41
  self.dataset, seed=self.seed, splits=["train"]
@@ -75,7 +75,7 @@ class SinhalaNewsSourceClassificationV2(AbsTaskClassification):
75
75
  adapted_from=["SinhalaNewsSourceClassification"],
76
76
  )
77
77
 
78
- def dataset_transform(self):
78
+ def dataset_transform(self, num_proc: int = 1):
79
79
  self.dataset = self.stratified_subsampling(
80
80
  self.dataset, seed=self.seed, splits=["train"]
81
81
  )
@@ -42,7 +42,7 @@ class CSFDSKMovieReviewSentimentClassification(AbsTaskClassification):
42
42
  # Increase the samples_per_label in order to improve baseline performance
43
43
  samples_per_label = 20
44
44
 
45
- def dataset_transform(self):
45
+ def dataset_transform(self, num_proc: int = 1):
46
46
  self.dataset = self.dataset.rename_columns(
47
47
  {"comment": "text", "rating_int": "label"}
48
48
  )
@@ -89,7 +89,7 @@ class CSFDSKMovieReviewSentimentClassificationV2(AbsTaskClassification):
89
89
  # Increase the samples_per_label in order to improve baseline performance
90
90
  samples_per_label = 20
91
91
 
92
- def dataset_transform(self):
92
+ def dataset_transform(self, num_proc: int = 1):
93
93
  self.dataset = self.stratified_subsampling(
94
94
  self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES
95
95
  )
@@ -75,7 +75,7 @@ class FrenkSlClassificationV2(AbsTaskClassification):
75
75
  adapted_from=["FrenkSlClassification"],
76
76
  )
77
77
 
78
- def dataset_transform(self):
78
+ def dataset_transform(self, num_proc: int = 1):
79
79
  self.dataset = self.stratified_subsampling(
80
80
  self.dataset, seed=self.seed, splits=["test"]
81
81
  )
@@ -29,7 +29,7 @@ class SpanishNewsClassification(AbsTaskClassification):
29
29
  superseded_by="SpanishNewsClassification.v2",
30
30
  )
31
31
 
32
- def dataset_transform(self):
32
+ def dataset_transform(self, num_proc: int = 1):
33
33
  self.dataset = self.dataset.rename_columns({"category": "label"})
34
34
  self.dataset = self.stratified_subsampling(
35
35
  self.dataset, seed=self.seed, splits=["train"]
@@ -63,7 +63,7 @@ class SpanishNewsClassificationV2(AbsTaskClassification):
63
63
  adapted_from=["SpanishNewsClassification"],
64
64
  )
65
65
 
66
- def dataset_transform(self):
66
+ def dataset_transform(self, num_proc: int = 1):
67
67
  self.dataset = self.stratified_subsampling(
68
68
  self.dataset, seed=self.seed, splits=["train"]
69
69
  )