mteb 2.7.3__py3-none-any.whl → 2.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +47 -5
- mteb/_evaluators/any_sts_evaluator.py +2 -0
- mteb/_evaluators/clustering_evaluator.py +2 -0
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -1
- mteb/_evaluators/pair_classification_evaluator.py +3 -0
- mteb/_evaluators/retrieval_evaluator.py +3 -0
- mteb/_evaluators/sklearn_evaluator.py +6 -1
- mteb/_evaluators/text/bitext_mining_evaluator.py +2 -0
- mteb/_evaluators/text/summarization_evaluator.py +2 -0
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -0
- mteb/abstasks/abstask.py +31 -12
- mteb/abstasks/classification.py +10 -3
- mteb/abstasks/clustering.py +6 -2
- mteb/abstasks/clustering_legacy.py +8 -2
- mteb/abstasks/image/image_text_pair_classification.py +6 -2
- mteb/abstasks/multilabel_classification.py +2 -0
- mteb/abstasks/pair_classification.py +8 -2
- mteb/abstasks/retrieval.py +27 -12
- mteb/abstasks/retrieval_dataset_loaders.py +29 -19
- mteb/abstasks/sts.py +10 -3
- mteb/abstasks/text/bitext_mining.py +9 -5
- mteb/abstasks/text/reranking.py +2 -2
- mteb/abstasks/text/summarization.py +2 -1
- mteb/abstasks/zeroshot_classification.py +8 -2
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +10 -2
- mteb/models/model_implementations/align_models.py +1 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +2 -0
- mteb/models/model_implementations/blip_models.py +8 -0
- mteb/models/model_implementations/bm25.py +10 -5
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +2 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +4 -0
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +3 -0
- mteb/models/model_implementations/colqwen_models.py +7 -0
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +19 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +1 -0
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +2 -0
- mteb/models/model_implementations/google_models.py +5 -0
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -0
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +2 -0
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +7 -0
- mteb/models/model_implementations/kalm_models.py +6 -0
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +1 -0
- mteb/models/model_implementations/listconranker.py +1 -0
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +3 -0
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +2 -0
- mteb/models/model_implementations/mod_models.py +1 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +6 -0
- mteb/models/model_implementations/nomic_models_vision.py +1 -0
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -0
- mteb/models/model_implementations/nvidia_models.py +3 -0
- mteb/models/model_implementations/octen_models.py +2 -0
- mteb/models/model_implementations/openai_models.py +5 -0
- mteb/models/model_implementations/openclip_models.py +8 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +2 -0
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +4 -0
- mteb/models/model_implementations/pylate_models.py +13 -0
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +3 -0
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -0
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -0
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +1 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +1 -0
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +1 -0
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +1 -0
- mteb/models/model_meta.py +35 -2
- mteb/models/models_protocols.py +4 -0
- mteb/models/search_wrappers.py +12 -0
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +1 -1
- mteb/tasks/clustering/nob/vg_clustering.py +1 -1
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/METADATA +1 -1
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/RECORD +434 -413
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/WHEEL +0 -0
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/top_level.txt +0 -0
|
@@ -38,7 +38,7 @@ class SiswatiNewsClassification(AbsTaskClassification):
|
|
|
38
38
|
superseded_by="SiswatiNewsClassification.v2",
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
-
def dataset_transform(self):
|
|
41
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
42
42
|
self.dataset = self.dataset.rename_columns({"title": "text"})
|
|
43
43
|
|
|
44
44
|
|
|
@@ -35,7 +35,7 @@ class TamilNewsClassification(AbsTaskClassification):
|
|
|
35
35
|
superseded_by="TamilNewsClassification.v2",
|
|
36
36
|
)
|
|
37
37
|
|
|
38
|
-
def dataset_transform(self):
|
|
38
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
39
39
|
self.dataset = self.dataset.rename_columns(
|
|
40
40
|
{"NewsInTamil": "text", "Category": "label"}
|
|
41
41
|
)
|
|
@@ -75,5 +75,5 @@ class TamilNewsClassificationV2(AbsTaskClassification):
|
|
|
75
75
|
adapted_from=["TamilNewsClassification"],
|
|
76
76
|
)
|
|
77
77
|
|
|
78
|
-
def dataset_transform(self):
|
|
78
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
79
79
|
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
|
|
@@ -28,7 +28,7 @@ class TeluguAndhraJyotiNewsClassification(AbsTaskClassification):
|
|
|
28
28
|
superseded_by="TeluguAndhraJyotiNewsClassification.v2",
|
|
29
29
|
)
|
|
30
30
|
|
|
31
|
-
def dataset_transform(self):
|
|
31
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
32
32
|
self.dataset = self.dataset.rename_columns({"body": "text", "topic": "label"})
|
|
33
33
|
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
|
|
34
34
|
|
|
@@ -59,5 +59,5 @@ class TeluguAndhraJyotiNewsClassificationV2(AbsTaskClassification):
|
|
|
59
59
|
adapted_from=["TeluguAndhraJyotiNewsClassification"],
|
|
60
60
|
)
|
|
61
61
|
|
|
62
|
-
def dataset_transform(self):
|
|
62
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
63
63
|
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
|
|
@@ -38,7 +38,7 @@ class WongnaiReviewsClassification(AbsTaskClassification):
|
|
|
38
38
|
""",
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
-
def dataset_transform(self):
|
|
41
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
42
42
|
self.dataset = self.dataset.rename_columns(
|
|
43
43
|
{"review_body": "text", "star_rating": "label"}
|
|
44
44
|
)
|
|
@@ -36,7 +36,7 @@ class TurkishMovieSentimentClassification(AbsTaskClassification):
|
|
|
36
36
|
superseded_by="TurkishMovieSentimentClassification.v2",
|
|
37
37
|
)
|
|
38
38
|
|
|
39
|
-
def dataset_transform(self):
|
|
39
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
40
40
|
self.dataset = self.stratified_subsampling(
|
|
41
41
|
self.dataset, seed=self.seed, splits=["test"]
|
|
42
42
|
)
|
|
@@ -76,7 +76,7 @@ class TurkishMovieSentimentClassificationV2(AbsTaskClassification):
|
|
|
76
76
|
adapted_from=["TurkishMovieSentimentClassification"],
|
|
77
77
|
)
|
|
78
78
|
|
|
79
|
-
def dataset_transform(self):
|
|
79
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
80
80
|
self.dataset = self.stratified_subsampling(
|
|
81
81
|
self.dataset, seed=self.seed, splits=["test"]
|
|
82
82
|
)
|
|
@@ -39,7 +39,7 @@ Tetreault, Joel},
|
|
|
39
39
|
superseded_by="UkrFormalityClassification.v2",
|
|
40
40
|
)
|
|
41
41
|
|
|
42
|
-
def dataset_transform(self):
|
|
42
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
43
43
|
self.dataset = self.dataset.rename_column("labels", "label")
|
|
44
44
|
self.dataset = self.dataset.class_encode_column("label")
|
|
45
45
|
self.dataset = self.stratified_subsampling(
|
|
@@ -84,7 +84,7 @@ Tetreault, Joel},
|
|
|
84
84
|
adapted_from=["UkrFormalityClassification"],
|
|
85
85
|
)
|
|
86
86
|
|
|
87
|
-
def dataset_transform(self):
|
|
87
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
88
88
|
self.dataset = self.stratified_subsampling(
|
|
89
89
|
self.dataset, seed=self.seed, splits=["train", "test"]
|
|
90
90
|
)
|
|
@@ -39,7 +39,7 @@ class ToxicConversationsVNClassification(AbsTaskClassification):
|
|
|
39
39
|
adapted_from=["ToxicConversationsClassification"],
|
|
40
40
|
)
|
|
41
41
|
|
|
42
|
-
def dataset_transform(self):
|
|
42
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
43
43
|
self.dataset = self.stratified_subsampling(
|
|
44
44
|
self.dataset, seed=self.seed, splits=["test"]
|
|
45
45
|
)
|
|
@@ -79,7 +79,7 @@ class VieStudentFeedbackClassificationV2(AbsTaskClassification):
|
|
|
79
79
|
adapted_from=["VieStudentFeedbackClassification"],
|
|
80
80
|
)
|
|
81
81
|
|
|
82
|
-
def dataset_transform(self):
|
|
82
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
83
83
|
self.dataset = self.stratified_subsampling(
|
|
84
84
|
self.dataset, seed=self.seed, splits=["test"]
|
|
85
85
|
)
|
|
@@ -39,7 +39,7 @@ class YueOpenriceReviewClassification(AbsTaskClassification):
|
|
|
39
39
|
|
|
40
40
|
samples_per_label = 32
|
|
41
41
|
|
|
42
|
-
def dataset_transform(self):
|
|
42
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
43
43
|
self.dataset = self.stratified_subsampling(
|
|
44
44
|
self.dataset, seed=self.seed, splits=["test"]
|
|
45
45
|
)
|
|
@@ -82,7 +82,7 @@ class YueOpenriceReviewClassificationV2(AbsTaskClassification):
|
|
|
82
82
|
|
|
83
83
|
samples_per_label = 32
|
|
84
84
|
|
|
85
|
-
def dataset_transform(self):
|
|
85
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
86
86
|
self.dataset = self.stratified_subsampling(
|
|
87
87
|
self.dataset, seed=self.seed, splits=["test"]
|
|
88
88
|
)
|
|
@@ -38,7 +38,7 @@ class IsiZuluNewsClassification(AbsTaskClassification):
|
|
|
38
38
|
superseded_by="IsiZuluNewsClassification.v2",
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
-
def dataset_transform(self):
|
|
41
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
42
42
|
self.dataset = self.dataset.rename_columns({"title": "text"})
|
|
43
43
|
|
|
44
44
|
|
|
@@ -82,7 +82,7 @@ class BlurbsClusteringP2PFast(AbsTaskClustering):
|
|
|
82
82
|
adapted_from=["BlurbsClusteringP2P"],
|
|
83
83
|
)
|
|
84
84
|
|
|
85
|
-
def dataset_transform(self):
|
|
85
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
86
86
|
self.dataset = _convert_to_fast(
|
|
87
87
|
self.dataset, self.input_column_name, self.label_column_name, self.seed
|
|
88
88
|
)
|
|
@@ -91,7 +91,7 @@ class BlurbsClusteringS2SFast(AbsTaskClustering):
|
|
|
91
91
|
adapted_from=["BlurbsClusteringS2S"],
|
|
92
92
|
)
|
|
93
93
|
|
|
94
|
-
def dataset_transform(self):
|
|
94
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
95
95
|
ds = {}
|
|
96
96
|
for split in self.metadata.eval_splits:
|
|
97
97
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -38,7 +38,7 @@ class ArXivHierarchicalClusteringP2P(AbsTaskClustering):
|
|
|
38
38
|
bibtex_citation="",
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
-
def dataset_transform(self):
|
|
41
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
42
42
|
ds = {}
|
|
43
43
|
for split in self.metadata.eval_splits:
|
|
44
44
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -78,7 +78,7 @@ class ArXivHierarchicalClusteringS2S(AbsTaskClustering):
|
|
|
78
78
|
bibtex_citation="",
|
|
79
79
|
)
|
|
80
80
|
|
|
81
|
-
def dataset_transform(self):
|
|
81
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
82
82
|
ds = {}
|
|
83
83
|
for split in self.metadata.eval_splits:
|
|
84
84
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -104,7 +104,7 @@ Summarization},
|
|
|
104
104
|
adapted_from=["BigPatentClustering"],
|
|
105
105
|
)
|
|
106
106
|
|
|
107
|
-
def dataset_transform(self):
|
|
107
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
108
108
|
for split in self.metadata.eval_splits:
|
|
109
109
|
_check_label_distribution(self.dataset[split])
|
|
110
110
|
self.dataset = self.stratified_subsampling(
|
|
@@ -33,7 +33,7 @@ class BiorxivClusteringP2PFast(AbsTaskClustering):
|
|
|
33
33
|
adapted_from=["BiorxivClusteringP2P"],
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
for split in self.metadata.eval_splits:
|
|
38
38
|
_check_label_distribution(self.dataset[split])
|
|
39
39
|
|
|
@@ -33,7 +33,7 @@ class BiorxivClusteringS2SFast(AbsTaskClustering):
|
|
|
33
33
|
adapted_from=["BiorxivClusteringS2S"],
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
for split in self.metadata.eval_splits:
|
|
38
38
|
_check_label_distribution(self.dataset[split])
|
|
39
39
|
|
|
@@ -37,7 +37,7 @@ class MedrxivClusteringP2PFast(AbsTaskClustering):
|
|
|
37
37
|
adapted_from=["MedrxivClusteringP2P"],
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def dataset_transform(self):
|
|
40
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
41
41
|
ds = {}
|
|
42
42
|
for split in self.metadata.eval_splits:
|
|
43
43
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -37,7 +37,7 @@ class MedrxivClusteringS2SFast(AbsTaskClustering):
|
|
|
37
37
|
adapted_from=["MedrxivClusteringS2S"],
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def dataset_transform(self):
|
|
40
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
41
41
|
ds = {}
|
|
42
42
|
for split in self.metadata.eval_splits:
|
|
43
43
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -51,7 +51,7 @@ Iryna Gurevych},
|
|
|
51
51
|
adapted_from=["RedditClustering"],
|
|
52
52
|
)
|
|
53
53
|
|
|
54
|
-
def dataset_transform(self):
|
|
54
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
55
55
|
ds = {}
|
|
56
56
|
for split in self.metadata.eval_splits:
|
|
57
57
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -94,7 +94,7 @@ Iryna Gurevych},
|
|
|
94
94
|
adapted_from=["RedditClusteringP2P"],
|
|
95
95
|
)
|
|
96
96
|
|
|
97
|
-
def dataset_transform(self):
|
|
97
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
98
98
|
ds = {}
|
|
99
99
|
for split in self.metadata.eval_splits:
|
|
100
100
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -51,7 +51,7 @@ Iryna Gurevych},
|
|
|
51
51
|
adapted_from=["StackExchangeClustering"],
|
|
52
52
|
)
|
|
53
53
|
|
|
54
|
-
def dataset_transform(self):
|
|
54
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
55
55
|
ds = {}
|
|
56
56
|
for split in self.metadata.eval_splits:
|
|
57
57
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -52,7 +52,7 @@ Iryna Gurevych},
|
|
|
52
52
|
adapted_from=["StackExchangeClusteringP2P"],
|
|
53
53
|
)
|
|
54
54
|
|
|
55
|
-
def dataset_transform(self):
|
|
55
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
56
56
|
ds = {}
|
|
57
57
|
for split in self.metadata.eval_splits:
|
|
58
58
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -93,7 +93,7 @@ class TwentyNewsgroupsClusteringFast(AbsTaskClustering):
|
|
|
93
93
|
adapted_from=["TwentyNewsgroupsClustering"],
|
|
94
94
|
)
|
|
95
95
|
|
|
96
|
-
def dataset_transform(self):
|
|
96
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
97
97
|
ds = {}
|
|
98
98
|
for split in self.metadata.eval_splits:
|
|
99
99
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -33,7 +33,7 @@ class BeytooteClustering(AbsTaskClustering):
|
|
|
33
33
|
bibtex_citation=""" """,
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
self.dataset = self.stratified_subsampling(
|
|
38
38
|
self.dataset,
|
|
39
39
|
seed=self.seed,
|
|
@@ -93,7 +93,7 @@ class HamshahriClustring(AbsTaskClustering):
|
|
|
93
93
|
bibtex_citation=""" """,
|
|
94
94
|
)
|
|
95
95
|
|
|
96
|
-
def dataset_transform(self):
|
|
96
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
97
97
|
self.dataset = self.dataset.map(
|
|
98
98
|
lambda x: {"sentences": f"{x['title']}\n: {x['summary']}"}
|
|
99
99
|
)
|
|
@@ -151,7 +151,7 @@ class NLPTwitterAnalysisClustering(AbsTaskClustering):
|
|
|
151
151
|
bibtex_citation=""" """,
|
|
152
152
|
)
|
|
153
153
|
|
|
154
|
-
def dataset_transform(self):
|
|
154
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
155
155
|
self.dataset = self.dataset.rename_column("tweet", "sentences")
|
|
156
156
|
self.dataset = self.dataset.rename_column("label", "labels")
|
|
157
157
|
self.dataset = self.stratified_subsampling(
|
|
@@ -187,7 +187,7 @@ class SIDClustring(AbsTaskClustering):
|
|
|
187
187
|
bibtex_citation=""" """,
|
|
188
188
|
)
|
|
189
189
|
|
|
190
|
-
def dataset_transform(self):
|
|
190
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
191
191
|
self.dataset = self.stratified_subsampling(
|
|
192
192
|
self.dataset,
|
|
193
193
|
seed=self.seed,
|
|
@@ -48,7 +48,7 @@ class HALClusteringS2S(AbsTaskClusteringLegacy):
|
|
|
48
48
|
superseded_by="HALClusteringS2S.v2",
|
|
49
49
|
)
|
|
50
50
|
|
|
51
|
-
def dataset_transform(self):
|
|
51
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
52
52
|
"""Convert to standard format"""
|
|
53
53
|
self.dataset = self.dataset.remove_columns("hal_id")
|
|
54
54
|
titles = self.dataset["test"]["title"]
|
|
@@ -98,7 +98,7 @@ class HALClusteringS2SFast(AbsTaskClustering):
|
|
|
98
98
|
adapted_from=["HALClusteringS2S"],
|
|
99
99
|
)
|
|
100
100
|
|
|
101
|
-
def dataset_transform(self):
|
|
101
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
102
102
|
"""Convert to standard format"""
|
|
103
103
|
self.dataset["test"] = self.dataset["test"].remove_columns("hal_id")
|
|
104
104
|
self.dataset["test"] = self.dataset["test"].rename_columns(
|
|
@@ -51,7 +51,7 @@ class MLSUMClusteringP2P(AbsTaskClusteringLegacy):
|
|
|
51
51
|
superseded_by="MLSUMClusteringP2P.v2",
|
|
52
52
|
)
|
|
53
53
|
|
|
54
|
-
def load_data(self) -> None:
|
|
54
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
55
55
|
"""Load dataset from HuggingFace hub and convert it to the standard format."""
|
|
56
56
|
if self.data_loaded:
|
|
57
57
|
return
|
|
@@ -124,7 +124,7 @@ class MLSUMClusteringP2PFast(AbsTaskClustering):
|
|
|
124
124
|
adapted_from=["MLSUMClusteringP2P"],
|
|
125
125
|
)
|
|
126
126
|
|
|
127
|
-
def load_data(self) -> None:
|
|
127
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
128
128
|
"""Load dataset from HuggingFace hub and convert it to the standard format."""
|
|
129
129
|
if self.data_loaded:
|
|
130
130
|
return
|
|
@@ -51,7 +51,7 @@ class MLSUMClusteringS2S(AbsTaskClusteringLegacy):
|
|
|
51
51
|
superseded_by="MLSUMClusteringS2S.v2",
|
|
52
52
|
)
|
|
53
53
|
|
|
54
|
-
def load_data(self) -> None:
|
|
54
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
55
55
|
"""Load dataset from HuggingFace hub and convert it to the standard format."""
|
|
56
56
|
if self.data_loaded:
|
|
57
57
|
return
|
|
@@ -119,7 +119,7 @@ class MLSUMClusteringS2SFast(AbsTaskClustering):
|
|
|
119
119
|
adapted_from=["MLSUMClusteringS2S"],
|
|
120
120
|
)
|
|
121
121
|
|
|
122
|
-
def load_data(self) -> None:
|
|
122
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
123
123
|
"""Load dataset from HuggingFace hub and convert it to the standard format."""
|
|
124
124
|
if self.data_loaded:
|
|
125
125
|
return
|
|
@@ -239,7 +239,7 @@ class SIB200ClusteringFast(AbsTaskClustering):
|
|
|
239
239
|
""", # combined train, validation, and test into test.
|
|
240
240
|
)
|
|
241
241
|
|
|
242
|
-
def dataset_transform(self):
|
|
242
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
243
243
|
ds = {}
|
|
244
244
|
for lang in self.hf_subsets:
|
|
245
245
|
labels = []
|
|
@@ -33,7 +33,7 @@ class DutchNewsArticlesClusteringP2P(AbsTaskClustering):
|
|
|
33
33
|
},
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
for split in self.dataset:
|
|
38
38
|
self.dataset[split] = self.dataset[split].rename_columns(
|
|
39
39
|
{"label": "labels", "text": "sentences"}
|
|
@@ -33,7 +33,7 @@ class DutchNewsArticlesClusteringS2S(AbsTaskClustering):
|
|
|
33
33
|
},
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
-
def dataset_transform(self):
|
|
36
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
37
37
|
for split in self.dataset:
|
|
38
38
|
self.dataset[split] = self.dataset[split].rename_columns(
|
|
39
39
|
{"label": "labels", "title": "sentences"}
|
|
@@ -43,7 +43,7 @@ class IconclassClusteringS2S(AbsTaskClustering):
|
|
|
43
43
|
},
|
|
44
44
|
)
|
|
45
45
|
|
|
46
|
-
def dataset_transform(self):
|
|
46
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
47
47
|
for split in self.dataset:
|
|
48
48
|
self.dataset[split] = self.dataset[split].map(
|
|
49
49
|
lambda ex: {"labels": ex["label"], "sentences": ex["text"]}
|
|
@@ -43,7 +43,7 @@ class OpenTenderClusteringP2P(AbsTaskClustering):
|
|
|
43
43
|
},
|
|
44
44
|
)
|
|
45
45
|
|
|
46
|
-
def dataset_transform(self):
|
|
46
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
47
47
|
# reuse the dataset for classification
|
|
48
48
|
for split in self.dataset:
|
|
49
49
|
self.dataset[split] = self.dataset[split].map(
|
|
@@ -131,7 +131,7 @@ Piperidis, Stelios},
|
|
|
131
131
|
adapted_from=["EightTagsClustering"],
|
|
132
132
|
)
|
|
133
133
|
|
|
134
|
-
def dataset_transform(self):
|
|
134
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
135
135
|
ds = {}
|
|
136
136
|
for split in self.metadata.eval_splits:
|
|
137
137
|
labels = list(chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -204,7 +204,7 @@ class PlscClusteringS2SFast(AbsTaskClustering):
|
|
|
204
204
|
adapted_from=["PlscClusteringS2S"],
|
|
205
205
|
)
|
|
206
206
|
|
|
207
|
-
def dataset_transform(self):
|
|
207
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
208
208
|
ds = {}
|
|
209
209
|
for split in self.metadata.eval_splits:
|
|
210
210
|
labels = self.dataset[split]["labels"]
|
|
@@ -286,7 +286,7 @@ class PlscClusteringP2PFast(AbsTaskClustering):
|
|
|
286
286
|
adapted_from=["PlscClusteringP2P"],
|
|
287
287
|
)
|
|
288
288
|
|
|
289
|
-
def dataset_transform(self):
|
|
289
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
290
290
|
ds = {}
|
|
291
291
|
for split in self.metadata.eval_splits:
|
|
292
292
|
labels = self.dataset[split]["labels"]
|
|
@@ -32,7 +32,7 @@ class RuSciBenchGRNTIClusteringP2P(AbsTaskClustering):
|
|
|
32
32
|
prompt="Identify the category of scientific papers based on the titles and abstracts",
|
|
33
33
|
)
|
|
34
34
|
|
|
35
|
-
def dataset_transform(self):
|
|
35
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
36
36
|
self.dataset = self.dataset.rename_columns(
|
|
37
37
|
{"label": "labels", "text": "sentences"}
|
|
38
38
|
)
|
|
@@ -32,7 +32,7 @@ class RuSciBenchOECDClusteringP2P(AbsTaskClustering):
|
|
|
32
32
|
prompt="Identify the category of scientific papers based on the titles and abstracts",
|
|
33
33
|
)
|
|
34
34
|
|
|
35
|
-
def dataset_transform(self):
|
|
35
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
36
36
|
self.dataset = self.dataset.rename_columns(
|
|
37
37
|
{"label": "labels", "text": "sentences"}
|
|
38
38
|
)
|
|
@@ -51,7 +51,7 @@ class CLSClusteringFastS2S(AbsTaskClustering):
|
|
|
51
51
|
adapted_from=["CLSClusteringS2S"],
|
|
52
52
|
)
|
|
53
53
|
|
|
54
|
-
def dataset_transform(self):
|
|
54
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
55
55
|
ds = {}
|
|
56
56
|
for split in self.metadata.eval_splits:
|
|
57
57
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -110,7 +110,7 @@ class CLSClusteringFastP2P(AbsTaskClustering):
|
|
|
110
110
|
adapted_from=["CLSClusteringP2P"],
|
|
111
111
|
)
|
|
112
112
|
|
|
113
|
-
def dataset_transform(self):
|
|
113
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
114
114
|
ds = {}
|
|
115
115
|
for split in self.metadata.eval_splits:
|
|
116
116
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -239,7 +239,7 @@ class ThuNewsClusteringFastS2S(AbsTaskClustering):
|
|
|
239
239
|
adapted_from=["ThuNewsClusteringS2S"],
|
|
240
240
|
)
|
|
241
241
|
|
|
242
|
-
def dataset_transform(self):
|
|
242
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
243
243
|
ds = {}
|
|
244
244
|
for split in self.metadata.eval_splits:
|
|
245
245
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -298,7 +298,7 @@ class ThuNewsClusteringFastP2P(AbsTaskClustering):
|
|
|
298
298
|
adapted_from=["ThuNewsClusteringP2P"],
|
|
299
299
|
)
|
|
300
300
|
|
|
301
|
-
def dataset_transform(self):
|
|
301
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
302
302
|
ds = {}
|
|
303
303
|
for split in self.metadata.eval_splits:
|
|
304
304
|
labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
|
|
@@ -175,7 +175,7 @@ class mFollowIRCrossLingual(AbsTaskRetrieval): # noqa: N801
|
|
|
175
175
|
""",
|
|
176
176
|
)
|
|
177
177
|
|
|
178
|
-
def load_data(self) -> None:
|
|
178
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
179
179
|
if self.data_loaded:
|
|
180
180
|
return
|
|
181
181
|
|
|
@@ -243,7 +243,7 @@ class mFollowIR(AbsTaskRetrieval): # noqa: N801
|
|
|
243
243
|
""",
|
|
244
244
|
)
|
|
245
245
|
|
|
246
|
-
def load_data(self) -> None:
|
|
246
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
247
247
|
if self.data_loaded:
|
|
248
248
|
return
|
|
249
249
|
|
|
@@ -123,7 +123,7 @@ class CVBenchCount(AbsTaskRetrieval):
|
|
|
123
123
|
""",
|
|
124
124
|
)
|
|
125
125
|
|
|
126
|
-
def load_data(self) -> None:
|
|
126
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
127
127
|
self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
|
|
128
128
|
path=self.metadata.dataset["path"],
|
|
129
129
|
splits=self.metadata.eval_splits,
|
|
@@ -165,7 +165,7 @@ class CVBenchRelation(AbsTaskRetrieval):
|
|
|
165
165
|
""",
|
|
166
166
|
)
|
|
167
167
|
|
|
168
|
-
def load_data(self) -> None:
|
|
168
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
169
169
|
self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
|
|
170
170
|
path=self.metadata.dataset["path"],
|
|
171
171
|
splits=self.metadata.eval_splits,
|
|
@@ -207,7 +207,7 @@ class CVBenchDepth(AbsTaskRetrieval):
|
|
|
207
207
|
""",
|
|
208
208
|
)
|
|
209
209
|
|
|
210
|
-
def load_data(self) -> None:
|
|
210
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
211
211
|
self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
|
|
212
212
|
path=self.metadata.dataset["path"],
|
|
213
213
|
splits=self.metadata.eval_splits,
|
|
@@ -249,7 +249,7 @@ class CVBenchDistance(AbsTaskRetrieval):
|
|
|
249
249
|
""",
|
|
250
250
|
)
|
|
251
251
|
|
|
252
|
-
def load_data(self) -> None:
|
|
252
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
253
253
|
self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
|
|
254
254
|
path=self.metadata.dataset["path"],
|
|
255
255
|
splits=self.metadata.eval_splits,
|