mteb 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +47 -5
- mteb/_evaluators/any_sts_evaluator.py +2 -0
- mteb/_evaluators/clustering_evaluator.py +2 -0
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -1
- mteb/_evaluators/pair_classification_evaluator.py +3 -0
- mteb/_evaluators/retrieval_evaluator.py +3 -0
- mteb/_evaluators/sklearn_evaluator.py +6 -1
- mteb/_evaluators/text/bitext_mining_evaluator.py +2 -0
- mteb/_evaluators/text/summarization_evaluator.py +2 -0
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -0
- mteb/abstasks/abstask.py +31 -12
- mteb/abstasks/classification.py +10 -3
- mteb/abstasks/clustering.py +6 -2
- mteb/abstasks/clustering_legacy.py +8 -2
- mteb/abstasks/image/image_text_pair_classification.py +6 -2
- mteb/abstasks/multilabel_classification.py +2 -0
- mteb/abstasks/pair_classification.py +8 -2
- mteb/abstasks/retrieval.py +26 -11
- mteb/abstasks/retrieval_dataset_loaders.py +29 -19
- mteb/abstasks/sts.py +10 -3
- mteb/abstasks/text/bitext_mining.py +9 -5
- mteb/abstasks/text/reranking.py +2 -2
- mteb/abstasks/text/summarization.py +2 -1
- mteb/abstasks/zeroshot_classification.py +8 -2
- mteb/evaluate.py +10 -2
- mteb/models/model_implementations/bm25.py +2 -0
- mteb/models/model_implementations/pylate_models.py +10 -0
- mteb/models/models_protocols.py +4 -0
- mteb/models/search_wrappers.py +12 -0
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +1 -1
- mteb/tasks/clustering/nob/vg_clustering.py +1 -1
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/bright_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/METADATA +1 -1
- {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/RECORD +287 -287
- {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/WHEEL +0 -0
- {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.4.dist-info → mteb-2.7.5.dist-info}/top_level.txt +0 -0
|
@@ -29,7 +29,7 @@ class RuSciBenchGRNTIClassification(AbsTaskClassification):
|
|
|
29
29
|
superseded_by="RuSciBenchGRNTIClassification.v2",
|
|
30
30
|
)
|
|
31
31
|
|
|
32
|
-
def dataset_transform(self):
|
|
32
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
33
33
|
self.dataset = self.stratified_subsampling(
|
|
34
34
|
self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
|
|
35
35
|
)
|
|
@@ -29,7 +29,7 @@ class RuSciBenchOECDClassification(AbsTaskClassification):
|
|
|
29
29
|
superseded_by="RuSciBenchOECDClassification.v2",
|
|
30
30
|
)
|
|
31
31
|
|
|
32
|
-
def dataset_transform(self):
|
|
32
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
33
33
|
self.dataset = self.stratified_subsampling(
|
|
34
34
|
self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
|
|
35
35
|
)
|
|
@@ -28,7 +28,7 @@ class RuToxicOKMLCUPClassification(AbsTaskClassification):
|
|
|
28
28
|
superseded_by="RuToxicOKMLCUPClassification.v2",
|
|
29
29
|
)
|
|
30
30
|
|
|
31
|
-
def dataset_transform(self):
|
|
31
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
32
32
|
self.dataset = self.dataset.rename_column("toxic", "label")
|
|
33
33
|
|
|
34
34
|
|
|
@@ -42,7 +42,7 @@ class SinhalaNewsClassification(AbsTaskClassification):
|
|
|
42
42
|
superseded_by="SinhalaNewsClassification.v2",
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def dataset_transform(self):
|
|
45
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
46
46
|
self.dataset = self.dataset.rename_columns(
|
|
47
47
|
{"comments": "text", "labels": "label"}
|
|
48
48
|
)
|
|
@@ -91,7 +91,7 @@ class SinhalaNewsClassificationV2(AbsTaskClassification):
|
|
|
91
91
|
adapted_from=["SinhalaNewsClassification"],
|
|
92
92
|
)
|
|
93
93
|
|
|
94
|
-
def dataset_transform(self):
|
|
94
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
95
95
|
self.dataset = self.stratified_subsampling(
|
|
96
96
|
self.dataset, seed=self.seed, splits=["train"]
|
|
97
97
|
)
|
|
@@ -35,7 +35,7 @@ class SinhalaNewsSourceClassification(AbsTaskClassification):
|
|
|
35
35
|
superseded_by="SinhalaNewsSourceClassification.v2",
|
|
36
36
|
)
|
|
37
37
|
|
|
38
|
-
def dataset_transform(self):
|
|
38
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
39
39
|
self.dataset = self.dataset.rename_column("comment", "text")
|
|
40
40
|
self.dataset = self.stratified_subsampling(
|
|
41
41
|
self.dataset, seed=self.seed, splits=["train"]
|
|
@@ -75,7 +75,7 @@ class SinhalaNewsSourceClassificationV2(AbsTaskClassification):
|
|
|
75
75
|
adapted_from=["SinhalaNewsSourceClassification"],
|
|
76
76
|
)
|
|
77
77
|
|
|
78
|
-
def dataset_transform(self):
|
|
78
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
79
79
|
self.dataset = self.stratified_subsampling(
|
|
80
80
|
self.dataset, seed=self.seed, splits=["train"]
|
|
81
81
|
)
|
|
@@ -42,7 +42,7 @@ class CSFDSKMovieReviewSentimentClassification(AbsTaskClassification):
|
|
|
42
42
|
# Increase the samples_per_label in order to improve baseline performance
|
|
43
43
|
samples_per_label = 20
|
|
44
44
|
|
|
45
|
-
def dataset_transform(self):
|
|
45
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
46
46
|
self.dataset = self.dataset.rename_columns(
|
|
47
47
|
{"comment": "text", "rating_int": "label"}
|
|
48
48
|
)
|
|
@@ -89,7 +89,7 @@ class CSFDSKMovieReviewSentimentClassificationV2(AbsTaskClassification):
|
|
|
89
89
|
# Increase the samples_per_label in order to improve baseline performance
|
|
90
90
|
samples_per_label = 20
|
|
91
91
|
|
|
92
|
-
def dataset_transform(self):
|
|
92
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
93
93
|
self.dataset = self.stratified_subsampling(
|
|
94
94
|
self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES
|
|
95
95
|
)
|
|
@@ -75,7 +75,7 @@ class FrenkSlClassificationV2(AbsTaskClassification):
|
|
|
75
75
|
adapted_from=["FrenkSlClassification"],
|
|
76
76
|
)
|
|
77
77
|
|
|
78
|
-
def dataset_transform(self):
|
|
78
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
79
79
|
self.dataset = self.stratified_subsampling(
|
|
80
80
|
self.dataset, seed=self.seed, splits=["test"]
|
|
81
81
|
)
|
|
@@ -29,7 +29,7 @@ class SpanishNewsClassification(AbsTaskClassification):
|
|
|
29
29
|
superseded_by="SpanishNewsClassification.v2",
|
|
30
30
|
)
|
|
31
31
|
|
|
32
|
-
def dataset_transform(self):
|
|
32
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
33
33
|
self.dataset = self.dataset.rename_columns({"category": "label"})
|
|
34
34
|
self.dataset = self.stratified_subsampling(
|
|
35
35
|
self.dataset, seed=self.seed, splits=["train"]
|
|
@@ -63,7 +63,7 @@ class SpanishNewsClassificationV2(AbsTaskClassification):
|
|
|
63
63
|
adapted_from=["SpanishNewsClassification"],
|
|
64
64
|
)
|
|
65
65
|
|
|
66
|
-
def dataset_transform(self):
|
|
66
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
67
67
|
self.dataset = self.stratified_subsampling(
|
|
68
68
|
self.dataset, seed=self.seed, splits=["train"]
|
|
69
69
|
)
|
|
@@ -38,7 +38,7 @@ class SiswatiNewsClassification(AbsTaskClassification):
|
|
|
38
38
|
superseded_by="SiswatiNewsClassification.v2",
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
-
def dataset_transform(self):
|
|
41
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
42
42
|
self.dataset = self.dataset.rename_columns({"title": "text"})
|
|
43
43
|
|
|
44
44
|
|
|
@@ -35,7 +35,7 @@ class TamilNewsClassification(AbsTaskClassification):
|
|
|
35
35
|
superseded_by="TamilNewsClassification.v2",
|
|
36
36
|
)
|
|
37
37
|
|
|
38
|
-
def dataset_transform(self):
|
|
38
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
39
39
|
self.dataset = self.dataset.rename_columns(
|
|
40
40
|
{"NewsInTamil": "text", "Category": "label"}
|
|
41
41
|
)
|
|
@@ -75,5 +75,5 @@ class TamilNewsClassificationV2(AbsTaskClassification):
|
|
|
75
75
|
adapted_from=["TamilNewsClassification"],
|
|
76
76
|
)
|
|
77
77
|
|
|
78
|
-
def dataset_transform(self):
|
|
78
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
79
79
|
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
|
|
@@ -28,7 +28,7 @@ class TeluguAndhraJyotiNewsClassification(AbsTaskClassification):
|
|
|
28
28
|
superseded_by="TeluguAndhraJyotiNewsClassification.v2",
|
|
29
29
|
)
|
|
30
30
|
|
|
31
|
-
def dataset_transform(self):
|
|
31
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
32
32
|
self.dataset = self.dataset.rename_columns({"body": "text", "topic": "label"})
|
|
33
33
|
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
|
|
34
34
|
|
|
@@ -59,5 +59,5 @@ class TeluguAndhraJyotiNewsClassificationV2(AbsTaskClassification):
|
|
|
59
59
|
adapted_from=["TeluguAndhraJyotiNewsClassification"],
|
|
60
60
|
)
|
|
61
61
|
|
|
62
|
-
def dataset_transform(self):
|
|
62
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
63
63
|
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
|
|
@@ -38,7 +38,7 @@ class WongnaiReviewsClassification(AbsTaskClassification):
|
|
|
38
38
|
""",
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
-
def dataset_transform(self):
|
|
41
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
42
42
|
self.dataset = self.dataset.rename_columns(
|
|
43
43
|
{"review_body": "text", "star_rating": "label"}
|
|
44
44
|
)
|
|
@@ -36,7 +36,7 @@ class TurkishMovieSentimentClassification(AbsTaskClassification):
|
|
|
36
36
|
superseded_by="TurkishMovieSentimentClassification.v2",
|
|
37
37
|
)
|
|
38
38
|
|
|
39
|
-
def dataset_transform(self):
|
|
39
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
40
40
|
self.dataset = self.stratified_subsampling(
|
|
41
41
|
self.dataset, seed=self.seed, splits=["test"]
|
|
42
42
|
)
|
|
@@ -76,7 +76,7 @@ class TurkishMovieSentimentClassificationV2(AbsTaskClassification):
|
|
|
76
76
|
adapted_from=["TurkishMovieSentimentClassification"],
|
|
77
77
|
)
|
|
78
78
|
|
|
79
|
-
def dataset_transform(self):
|
|
79
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
80
80
|
self.dataset = self.stratified_subsampling(
|
|
81
81
|
self.dataset, seed=self.seed, splits=["test"]
|
|
82
82
|
)
|
|
@@ -39,7 +39,7 @@ Tetreault, Joel},
|
|
|
39
39
|
superseded_by="UkrFormalityClassification.v2",
|
|
40
40
|
)
|
|
41
41
|
|
|
42
|
-
def dataset_transform(self):
|
|
42
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
43
43
|
self.dataset = self.dataset.rename_column("labels", "label")
|
|
44
44
|
self.dataset = self.dataset.class_encode_column("label")
|
|
45
45
|
self.dataset = self.stratified_subsampling(
|
|
@@ -84,7 +84,7 @@ Tetreault, Joel},
|
|
|
84
84
|
adapted_from=["UkrFormalityClassification"],
|
|
85
85
|
)
|
|
86
86
|
|
|
87
|
-
def dataset_transform(self):
|
|
87
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
88
88
|
self.dataset = self.stratified_subsampling(
|
|
89
89
|
self.dataset, seed=self.seed, splits=["train", "test"]
|
|
90
90
|
)
|
|
@@ -39,7 +39,7 @@ class ToxicConversationsVNClassification(AbsTaskClassification):
|
|
|
39
39
|
adapted_from=["ToxicConversationsClassification"],
|
|
40
40
|
)
|
|
41
41
|
|
|
42
|
-
def dataset_transform(self):
|
|
42
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
43
43
|
self.dataset = self.stratified_subsampling(
|
|
44
44
|
self.dataset, seed=self.seed, splits=["test"]
|
|
45
45
|
)
|
|
@@ -79,7 +79,7 @@ class VieStudentFeedbackClassificationV2(AbsTaskClassification):
|
|
|
79
79
|
adapted_from=["VieStudentFeedbackClassification"],
|
|
80
80
|
)
|
|
81
81
|
|
|
82
|
-
def dataset_transform(self):
|
|
82
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
83
83
|
self.dataset = self.stratified_subsampling(
|
|
84
84
|
self.dataset, seed=self.seed, splits=["test"]
|
|
85
85
|
)
|
|
@@ -39,7 +39,7 @@ class YueOpenriceReviewClassification(AbsTaskClassification):
|
|
|
39
39
|
|
|
40
40
|
samples_per_label = 32
|
|
41
41
|
|
|
42
|
-
def dataset_transform(self):
|
|
42
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
43
43
|
self.dataset = self.stratified_subsampling(
|
|
44
44
|
self.dataset, seed=self.seed, splits=["test"]
|
|
45
45
|
)
|
|
@@ -82,7 +82,7 @@ class YueOpenriceReviewClassificationV2(AbsTaskClassification):
|
|
|
82
82
|
|
|
83
83
|
samples_per_label = 32
|
|
84
84
|
|
|
85
|
-
def dataset_transform(self):
|
|
85
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
86
86
|
self.dataset = self.stratified_subsampling(
|
|
87
87
|
self.dataset, seed=self.seed, splits=["test"]
|
|
88
88
|
)
|
|
@@ -38,7 +38,7 @@ class IsiZuluNewsClassification(AbsTaskClassification):
|
|
|
38
38
|
superseded_by="IsiZuluNewsClassification.v2",
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
-
def dataset_transform(self):
|
|
41
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
42
42
|
self.dataset = self.dataset.rename_columns({"title": "text"})
|
|
43
43
|
|
|
44
44
|
|
|
@@ -82,7 +82,7 @@ class BlurbsClusteringP2PFast(AbsTaskClustering):
|
|
|
82
82
|
adapted_from=["BlurbsClusteringP2P"],
|
|
83
83
|
)
|
|
84
84
|
|
|
85
|
-
def dataset_transform(self):
|
|
85
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
86
86
|
self.dataset = _convert_to_fast(
|
|
87
87
|
self.dataset, self.input_column_name, self.label_column_name, self.seed
|
|
88
88
|
)
|
|
@@ -91,7 +91,7 @@ class BlurbsClusteringS2SFast(AbsTaskClustering):
|
|
|
91
91
|
adapted_from=["BlurbsClusteringS2S"],
|
|
92
92
|
)
|
|
93
93
|
|
|
94
|
-
def dataset_transform(self):
|
|
94
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
95
95
|
ds = {}
|
|
96
96
|
for split in self.metadata.eval_splits:
|
|
97
97
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -38,7 +38,7 @@ class ArXivHierarchicalClusteringP2P(AbsTaskClustering):
|
|
|
38
38
|
bibtex_citation="",
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
-
def dataset_transform(self):
|
|
41
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
42
42
|
ds = {}
|
|
43
43
|
for split in self.metadata.eval_splits:
|
|
44
44
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -78,7 +78,7 @@ class ArXivHierarchicalClusteringS2S(AbsTaskClustering):
|
|
|
78
78
|
bibtex_citation="",
|
|
79
79
|
)
|
|
80
80
|
|
|
81
|
-
def dataset_transform(self):
|
|
81
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
82
82
|
ds = {}
|
|
83
83
|
for split in self.metadata.eval_splits:
|
|
84
84
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -104,7 +104,7 @@ Summarization},
|
|
|
104
104
|
adapted_from=["BigPatentClustering"],
|
|
105
105
|
)
|
|
106
106
|
|
|
107
|
-
def dataset_transform(self):
|
|
107
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
108
108
|
for split in self.metadata.eval_splits:
|
|
109
109
|
_check_label_distribution(self.dataset[split])
|
|
110
110
|
self.dataset = self.stratified_subsampling(
|
|
@@ -33,7 +33,7 @@ class BiorxivClusteringP2PFast(AbsTaskClustering):
|
|
|
33
33
|
adapted_from=["BiorxivClusteringP2P"],
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
for split in self.metadata.eval_splits:
|
|
38
38
|
_check_label_distribution(self.dataset[split])
|
|
39
39
|
|
|
@@ -33,7 +33,7 @@ class BiorxivClusteringS2SFast(AbsTaskClustering):
|
|
|
33
33
|
adapted_from=["BiorxivClusteringS2S"],
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
for split in self.metadata.eval_splits:
|
|
38
38
|
_check_label_distribution(self.dataset[split])
|
|
39
39
|
|
|
@@ -37,7 +37,7 @@ class MedrxivClusteringP2PFast(AbsTaskClustering):
|
|
|
37
37
|
adapted_from=["MedrxivClusteringP2P"],
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def dataset_transform(self):
|
|
40
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
41
41
|
ds = {}
|
|
42
42
|
for split in self.metadata.eval_splits:
|
|
43
43
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -37,7 +37,7 @@ class MedrxivClusteringS2SFast(AbsTaskClustering):
|
|
|
37
37
|
adapted_from=["MedrxivClusteringS2S"],
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def dataset_transform(self):
|
|
40
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
41
41
|
ds = {}
|
|
42
42
|
for split in self.metadata.eval_splits:
|
|
43
43
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -51,7 +51,7 @@ Iryna Gurevych},
|
|
|
51
51
|
adapted_from=["RedditClustering"],
|
|
52
52
|
)
|
|
53
53
|
|
|
54
|
-
def dataset_transform(self):
|
|
54
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
55
55
|
ds = {}
|
|
56
56
|
for split in self.metadata.eval_splits:
|
|
57
57
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -94,7 +94,7 @@ Iryna Gurevych},
|
|
|
94
94
|
adapted_from=["RedditClusteringP2P"],
|
|
95
95
|
)
|
|
96
96
|
|
|
97
|
-
def dataset_transform(self):
|
|
97
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
98
98
|
ds = {}
|
|
99
99
|
for split in self.metadata.eval_splits:
|
|
100
100
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -51,7 +51,7 @@ Iryna Gurevych},
|
|
|
51
51
|
adapted_from=["StackExchangeClustering"],
|
|
52
52
|
)
|
|
53
53
|
|
|
54
|
-
def dataset_transform(self):
|
|
54
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
55
55
|
ds = {}
|
|
56
56
|
for split in self.metadata.eval_splits:
|
|
57
57
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -52,7 +52,7 @@ Iryna Gurevych},
|
|
|
52
52
|
adapted_from=["StackExchangeClusteringP2P"],
|
|
53
53
|
)
|
|
54
54
|
|
|
55
|
-
def dataset_transform(self):
|
|
55
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
56
56
|
ds = {}
|
|
57
57
|
for split in self.metadata.eval_splits:
|
|
58
58
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -93,7 +93,7 @@ class TwentyNewsgroupsClusteringFast(AbsTaskClustering):
|
|
|
93
93
|
adapted_from=["TwentyNewsgroupsClustering"],
|
|
94
94
|
)
|
|
95
95
|
|
|
96
|
-
def dataset_transform(self):
|
|
96
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
97
97
|
ds = {}
|
|
98
98
|
for split in self.metadata.eval_splits:
|
|
99
99
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -33,7 +33,7 @@ class BeytooteClustering(AbsTaskClustering):
|
|
|
33
33
|
bibtex_citation=""" """,
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
self.dataset = self.stratified_subsampling(
|
|
38
38
|
self.dataset,
|
|
39
39
|
seed=self.seed,
|
|
@@ -93,7 +93,7 @@ class HamshahriClustring(AbsTaskClustering):
|
|
|
93
93
|
bibtex_citation=""" """,
|
|
94
94
|
)
|
|
95
95
|
|
|
96
|
-
def dataset_transform(self):
|
|
96
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
97
97
|
self.dataset = self.dataset.map(
|
|
98
98
|
lambda x: {"sentences": f"{x['title']}\n: {x['summary']}"}
|
|
99
99
|
)
|
|
@@ -151,7 +151,7 @@ class NLPTwitterAnalysisClustering(AbsTaskClustering):
|
|
|
151
151
|
bibtex_citation=""" """,
|
|
152
152
|
)
|
|
153
153
|
|
|
154
|
-
def dataset_transform(self):
|
|
154
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
155
155
|
self.dataset = self.dataset.rename_column("tweet", "sentences")
|
|
156
156
|
self.dataset = self.dataset.rename_column("label", "labels")
|
|
157
157
|
self.dataset = self.stratified_subsampling(
|
|
@@ -187,7 +187,7 @@ class SIDClustring(AbsTaskClustering):
|
|
|
187
187
|
bibtex_citation=""" """,
|
|
188
188
|
)
|
|
189
189
|
|
|
190
|
-
def dataset_transform(self):
|
|
190
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
191
191
|
self.dataset = self.stratified_subsampling(
|
|
192
192
|
self.dataset,
|
|
193
193
|
seed=self.seed,
|
|
@@ -48,7 +48,7 @@ class HALClusteringS2S(AbsTaskClusteringLegacy):
|
|
|
48
48
|
superseded_by="HALClusteringS2S.v2",
|
|
49
49
|
)
|
|
50
50
|
|
|
51
|
-
def dataset_transform(self):
|
|
51
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
52
52
|
"""Convert to standard format"""
|
|
53
53
|
self.dataset = self.dataset.remove_columns("hal_id")
|
|
54
54
|
titles = self.dataset["test"]["title"]
|
|
@@ -98,7 +98,7 @@ class HALClusteringS2SFast(AbsTaskClustering):
|
|
|
98
98
|
adapted_from=["HALClusteringS2S"],
|
|
99
99
|
)
|
|
100
100
|
|
|
101
|
-
def dataset_transform(self):
|
|
101
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
102
102
|
"""Convert to standard format"""
|
|
103
103
|
self.dataset["test"] = self.dataset["test"].remove_columns("hal_id")
|
|
104
104
|
self.dataset["test"] = self.dataset["test"].rename_columns(
|
|
@@ -51,7 +51,7 @@ class MLSUMClusteringP2P(AbsTaskClusteringLegacy):
|
|
|
51
51
|
superseded_by="MLSUMClusteringP2P.v2",
|
|
52
52
|
)
|
|
53
53
|
|
|
54
|
-
def load_data(self) -> None:
|
|
54
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
55
55
|
"""Load dataset from HuggingFace hub and convert it to the standard format."""
|
|
56
56
|
if self.data_loaded:
|
|
57
57
|
return
|
|
@@ -124,7 +124,7 @@ class MLSUMClusteringP2PFast(AbsTaskClustering):
|
|
|
124
124
|
adapted_from=["MLSUMClusteringP2P"],
|
|
125
125
|
)
|
|
126
126
|
|
|
127
|
-
def load_data(self) -> None:
|
|
127
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
128
128
|
"""Load dataset from HuggingFace hub and convert it to the standard format."""
|
|
129
129
|
if self.data_loaded:
|
|
130
130
|
return
|
|
@@ -51,7 +51,7 @@ class MLSUMClusteringS2S(AbsTaskClusteringLegacy):
|
|
|
51
51
|
superseded_by="MLSUMClusteringS2S.v2",
|
|
52
52
|
)
|
|
53
53
|
|
|
54
|
-
def load_data(self) -> None:
|
|
54
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
55
55
|
"""Load dataset from HuggingFace hub and convert it to the standard format."""
|
|
56
56
|
if self.data_loaded:
|
|
57
57
|
return
|
|
@@ -119,7 +119,7 @@ class MLSUMClusteringS2SFast(AbsTaskClustering):
|
|
|
119
119
|
adapted_from=["MLSUMClusteringS2S"],
|
|
120
120
|
)
|
|
121
121
|
|
|
122
|
-
def load_data(self) -> None:
|
|
122
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
123
123
|
"""Load dataset from HuggingFace hub and convert it to the standard format."""
|
|
124
124
|
if self.data_loaded:
|
|
125
125
|
return
|
|
@@ -239,7 +239,7 @@ class SIB200ClusteringFast(AbsTaskClustering):
|
|
|
239
239
|
""", # combined train, validation, and test into test.
|
|
240
240
|
)
|
|
241
241
|
|
|
242
|
-
def dataset_transform(self):
|
|
242
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
243
243
|
ds = {}
|
|
244
244
|
for lang in self.hf_subsets:
|
|
245
245
|
labels = []
|
|
@@ -33,7 +33,7 @@ class DutchNewsArticlesClusteringP2P(AbsTaskClustering):
|
|
|
33
33
|
},
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
for split in self.dataset:
|
|
38
38
|
self.dataset[split] = self.dataset[split].rename_columns(
|
|
39
39
|
{"label": "labels", "text": "sentences"}
|
|
@@ -33,7 +33,7 @@ class DutchNewsArticlesClusteringS2S(AbsTaskClustering):
|
|
|
33
33
|
},
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
for split in self.dataset:
|
|
38
38
|
self.dataset[split] = self.dataset[split].rename_columns(
|
|
39
39
|
{"label": "labels", "title": "sentences"}
|
|
@@ -43,7 +43,7 @@ class IconclassClusteringS2S(AbsTaskClustering):
|
|
|
43
43
|
},
|
|
44
44
|
)
|
|
45
45
|
|
|
46
|
-
def dataset_transform(self):
|
|
46
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
47
47
|
for split in self.dataset:
|
|
48
48
|
self.dataset[split] = self.dataset[split].map(
|
|
49
49
|
lambda ex: {"labels": ex["label"], "sentences": ex["text"]}
|
|
@@ -43,7 +43,7 @@ class OpenTenderClusteringP2P(AbsTaskClustering):
|
|
|
43
43
|
},
|
|
44
44
|
)
|
|
45
45
|
|
|
46
|
-
def dataset_transform(self):
|
|
46
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
47
47
|
# reuse the dataset for classification
|
|
48
48
|
for split in self.dataset:
|
|
49
49
|
self.dataset[split] = self.dataset[split].map(
|
|
@@ -131,7 +131,7 @@ Piperidis, Stelios},
|
|
|
131
131
|
adapted_from=["EightTagsClustering"],
|
|
132
132
|
)
|
|
133
133
|
|
|
134
|
-
def dataset_transform(self):
|
|
134
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
135
135
|
ds = {}
|
|
136
136
|
for split in self.metadata.eval_splits:
|
|
137
137
|
labels = list(chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -204,7 +204,7 @@ class PlscClusteringS2SFast(AbsTaskClustering):
|
|
|
204
204
|
adapted_from=["PlscClusteringS2S"],
|
|
205
205
|
)
|
|
206
206
|
|
|
207
|
-
def dataset_transform(self):
|
|
207
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
208
208
|
ds = {}
|
|
209
209
|
for split in self.metadata.eval_splits:
|
|
210
210
|
labels = self.dataset[split]["labels"]
|
|
@@ -286,7 +286,7 @@ class PlscClusteringP2PFast(AbsTaskClustering):
|
|
|
286
286
|
adapted_from=["PlscClusteringP2P"],
|
|
287
287
|
)
|
|
288
288
|
|
|
289
|
-
def dataset_transform(self):
|
|
289
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
290
290
|
ds = {}
|
|
291
291
|
for split in self.metadata.eval_splits:
|
|
292
292
|
labels = self.dataset[split]["labels"]
|